feat: add Deepgram STT provider and cloud-first config
- New DeepgramSTT class using Deepgram nova-3 via REST API
- Factory function create_stt_engine() for provider switching
- faster-whisper import now optional (graceful fallback)
- Config defaults to cloud providers (deepgram STT + venice TTS)
- .env.example updated with DEEPGRAM_API_KEY and VENICE_API_KEY
- requirements.txt adds deepgram-sdk, marks faster-whisper as optional
- Zero GPU required for default configuration
This commit is contained in:
parent
3eea942772
commit
f0458b9b40
4 changed files with 213 additions and 16 deletions
|
|
@ -18,6 +18,10 @@ OPENCLAW_BASE_URL=ws://192.168.50.9:18789
|
|||
OPENCLAW_AUTH_TOKEN=your_openclaw_gateway_token
|
||||
OPENCLAW_AGENT_ID=main # Agent ID for session keys (jarvis or main)
|
||||
|
||||
# Cloud STT/TTS API Keys (for GPU-less deployment)
|
||||
DEEPGRAM_API_KEY=your_deepgram_api_key
|
||||
VENICE_API_KEY=your_venice_api_key
|
||||
|
||||
# ============================================================================
|
||||
# FastAPI Server
|
||||
# ============================================================================
|
||||
|
|
|
|||
32
config.yaml
32
config.yaml
|
|
@ -108,20 +108,19 @@ pipeline:
|
|||
# Using v3.2 GPU model for best performance with RTX 5090
|
||||
model_path: "smart-turn-v3.2-gpu.onnx"
|
||||
|
||||
# Speech-to-Text (faster-whisper)
|
||||
# Speech-to-Text
|
||||
stt:
|
||||
# Model size: tiny, base, small, medium, large-v3
|
||||
# Using "small" for faster transcription (was "medium")
|
||||
# Provider: "deepgram" (cloud, no GPU) or "local" (faster-whisper, requires GPU)
|
||||
provider: "deepgram"
|
||||
|
||||
# Deepgram settings (used when provider is "deepgram")
|
||||
model: "nova-3"
|
||||
language: "en"
|
||||
|
||||
# Local faster-whisper settings (used when provider is "local")
|
||||
model_size: "small"
|
||||
|
||||
# Device: cuda or cpu
|
||||
device: "cuda"
|
||||
|
||||
# Compute type: float16, float32, int8
|
||||
compute_type: "float16"
|
||||
|
||||
# Beam size for decoding (higher = more accurate, slower)
|
||||
# Optimized for voice chat: beam_size=1 is 3-5x faster with minimal quality loss
|
||||
beam_size: 1
|
||||
|
||||
# Language hint (null = auto-detect)
|
||||
|
|
@ -165,10 +164,17 @@ pipeline:
|
|||
|
||||
# Text-to-Speech
|
||||
tts:
|
||||
# TTS engine: chatterbox, coqui, piper
|
||||
engine: "coqui"
|
||||
# Provider: "venice" (cloud, no GPU) or "local" (chatterbox, requires GPU)
|
||||
provider: "venice"
|
||||
|
||||
# Device: cuda or cpu
|
||||
# Venice settings (used when provider is "venice")
|
||||
venice:
|
||||
voice: "am_liam"
|
||||
base_url: "https://api.venice.ai/api/v1"
|
||||
# API key from env: VENICE_API_KEY
|
||||
|
||||
# Local settings (used when provider is "local")
|
||||
engine: "chatterbox"
|
||||
device: "cuda"
|
||||
|
||||
# Streaming: generate and play audio in chunks
|
||||
|
|
|
|||
|
|
@ -22,7 +22,8 @@ resampy>=0.4.2 # High-quality audio resampling
|
|||
# ============================================================================
|
||||
torch>=2.1.0
|
||||
torchaudio>=2.1.0
|
||||
faster-whisper>=1.0.0 # GPU-accelerated STT
|
||||
faster-whisper>=1.0.0 # GPU-accelerated STT (optional, for local provider)
|
||||
deepgram-sdk>=3.0.0 # Deepgram cloud STT
|
||||
silero-vad>=4.0.0 # Voice activity detection
|
||||
onnxruntime>=1.16.0 # Smart Turn model inference
|
||||
|
||||
|
|
|
|||
188
server/stt.py
188
server/stt.py
|
|
@ -1,15 +1,23 @@
|
|||
"""Speech-to-Text using faster-whisper.
|
||||
"""Speech-to-Text using faster-whisper and Deepgram cloud API.
|
||||
|
||||
GPU-accelerated transcription with support for multiple model sizes.
|
||||
Cloud transcription via Deepgram for GPU-less deployments.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
import httpx
|
||||
import numpy as np
|
||||
|
||||
try:
|
||||
from faster_whisper import WhisperModel
|
||||
HAS_FASTER_WHISPER = True
|
||||
except ImportError:
|
||||
HAS_FASTER_WHISPER = False
|
||||
|
||||
from utils.logging import get_logger, log_latency
|
||||
|
||||
|
|
@ -51,6 +59,184 @@ class TranscriptionResult:
|
|||
return len(self.segments)
|
||||
|
||||
|
||||
class DeepgramSTT:
    """
    Deepgram cloud STT engine.

    Transcribes pre-recorded audio via Deepgram's REST API.
    No GPU required — audio is wrapped in a 16-bit mono WAV container and
    sent over HTTP.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "nova-3",
        language: Optional[str] = None,
        sample_rate: int = 16000,
    ):
        """
        Store connection settings and reset the usage counters.

        Args:
            api_key: Deepgram API key, sent as ``Authorization: Token <key>``.
            model: Deepgram model name passed as the ``model`` query parameter.
            language: Default language code; ``None`` makes transcription
                fall back to ``"en"`` unless a language is given per call.
            sample_rate: Sample rate (Hz) of the audio handed to this engine.
        """
        self.api_key = api_key
        self.model = model
        self.language = language
        self.sample_rate = sample_rate
        self.base_url = "https://api.deepgram.com/v1"

        logger.info(f"Initialized Deepgram STT (model: {model})")

        # Stats
        self.transcription_count = 0
        self.total_audio_duration = 0.0
        self.total_processing_time = 0.0

    def _audio_to_wav_bytes(self, audio: np.ndarray) -> bytes:
        """
        Convert float PCM audio to an in-memory 16-bit mono WAV file.

        Args:
            audio: Samples as a 1-D array, or a 2-D array that is averaged
                over axis 1 to produce mono.

        Returns:
            A 44-byte RIFF/WAVE header followed by little-endian int16 samples.
        """
        import struct

        # Ensure float32 mono
        samples = np.asarray(audio, dtype=np.float32)
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        # Scale to int16. "<i2" pins little-endian byte order so the payload
        # matches the little-endian WAV header regardless of the host CPU.
        pcm = (samples * 32767).clip(-32768, 32767).astype("<i2")

        bytes_per_sample = 2
        data_size = pcm.size * bytes_per_sample

        header = struct.pack(
            "<4sI4s4sIHHIIHH4sI",
            b"RIFF",
            36 + data_size,                       # RIFF chunk size
            b"WAVE",
            b"fmt ",
            16,                                   # fmt chunk size
            1,                                    # PCM
            1,                                    # mono
            self.sample_rate,
            self.sample_rate * bytes_per_sample,  # byte rate (16-bit mono)
            bytes_per_sample,                     # block align
            16,                                   # bits per sample
            b"data",
            data_size,
        )
        return header + pcm.tobytes()

    @staticmethod
    def _parse_response(result: dict) -> tuple:
        """
        Extract ``(text, segments)`` from a decoded Deepgram JSON response.

        Only the first alternative of the first channel is used. Missing,
        null or empty ``results`` / ``channels`` / ``alternatives`` yield an
        empty transcript instead of raising.
        """
        channels = (result.get("results") or {}).get("channels") or []
        if not channels:
            return "", []

        alternatives = channels[0].get("alternatives") or []
        if not alternatives:
            return "", []

        best = alternatives[0]
        text = (best.get("transcript") or "").strip()

        # One segment per recognized word.
        segments = [
            TranscriptSegment(
                text=word.get("word", ""),
                start=word.get("start", 0.0),
                end=word.get("end", 0.0),
                confidence=word.get("confidence", 1.0),
            )
            for word in (best.get("words") or [])
        ]
        return text, segments

    async def transcribe_async(
        self,
        audio: np.ndarray,
        language: Optional[str] = None,
        beam_size: Optional[int] = None,
        vad_filter: bool = False,
    ) -> "TranscriptionResult":
        """
        Transcribe audio via Deepgram API.

        Args:
            audio: PCM samples at ``self.sample_rate``.
            language: Per-call language code; falls back to the instance
                language and then to ``"en"``.
            beam_size: Accepted but not used by this engine
                (presumably kept for parity with the local engine — confirm).
            vad_filter: Accepted but not used by this engine.

        Returns:
            TranscriptionResult with the transcript, per-word segments, the
            language that was requested, and the audio duration in seconds.

        Raises:
            httpx.HTTPStatusError: If Deepgram answers with a 4xx/5xx status.
        """
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        duration = len(audio) / self.sample_rate
        lang = language or self.language or "en"

        # Nothing to transcribe: skip the network round-trip entirely.
        if audio.size == 0:
            return TranscriptionResult(
                text="",
                segments=[],
                language=lang,
                duration=duration,
            )

        # We are inside a coroutine, so a running loop is guaranteed.
        loop = asyncio.get_running_loop()
        start_time = loop.time()
        wav_bytes = self._audio_to_wav_bytes(audio)

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                f"{self.base_url}/listen",
                content=wav_bytes,
                headers={
                    "Authorization": f"Token {self.api_key}",
                    "Content-Type": "audio/wav",
                },
                params={
                    "model": self.model,
                    "language": lang,
                    "sample_rate": self.sample_rate,
                    "smart_format": "true",
                },
            )
            response.raise_for_status()
            result = response.json()

        # Parse Deepgram response
        text, segments = self._parse_response(result)

        processing_time = loop.time() - start_time

        # Update stats
        self.transcription_count += 1
        self.total_audio_duration += duration
        self.total_processing_time += processing_time

        # Only show an ellipsis when the preview is actually truncated.
        preview = text if len(text) <= 50 else f"{text[:50]}..."
        logger.info(
            f"Deepgram transcribed {duration:.2f}s audio: "
            f'"{preview}" ({processing_time:.2f}s)'
        )

        return TranscriptionResult(
            text=text,
            segments=segments,
            language=lang,
            duration=duration,
        )

    def get_stats(self) -> dict:
        """
        Return cumulative usage statistics.

        ``real_time_factor`` is average processing time divided by average
        audio duration; all averages are 0.0 before the first transcription.
        """
        count = self.transcription_count
        avg_duration = self.total_audio_duration / count if count > 0 else 0.0
        avg_processing = self.total_processing_time / count if count > 0 else 0.0
        rtf = avg_processing / avg_duration if avg_duration > 0 else 0.0
        return {
            "model": self.model,
            "provider": "deepgram",
            "transcription_count": count,
            "total_audio_duration": self.total_audio_duration,
            "avg_processing_time": avg_processing,
            "real_time_factor": rtf,
        }

    def get_model_info(self) -> dict:
        """
        Return static information about the configured model.

        NOTE(review): ``language`` is reported as ``"auto"`` when unset, but
        ``transcribe_async`` falls back to ``"en"`` in that case.
        """
        return {
            "model": self.model,
            "provider": "deepgram",
            "language": self.language or "auto",
            "loaded": True,
        }
|
||||
|
||||
|
||||
def create_stt_engine(provider: str, **kwargs):
    """
    Build the STT engine selected by ``provider``.

    Args:
        provider: ``"deepgram"`` for the cloud engine or ``"local"`` for
            faster-whisper.
        **kwargs: Forwarded unchanged to the chosen engine's constructor.

    Raises:
        RuntimeError: ``"local"`` was requested but faster-whisper could not
            be imported.
        ValueError: ``provider`` is not a recognized name.
    """
    if provider == "deepgram":
        return DeepgramSTT(**kwargs)

    if provider != "local":
        raise ValueError(f"Unknown STT provider: {provider}. Choose 'deepgram' or 'local'.")

    # Local engine: only usable when the optional dependency is installed.
    if not HAS_FASTER_WHISPER:
        raise RuntimeError("faster-whisper not installed. Install with: pip install faster-whisper")
    return FasterWhisperSTT(**kwargs)
|
||||
|
||||
|
||||
class FasterWhisperSTT:
|
||||
"""
|
||||
Faster-whisper STT engine.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue