From f0458b9b40a1520ae5ec0ea91ca019da93510d84 Mon Sep 17 00:00:00 2001 From: Jezza Hehn Date: Fri, 10 Apr 2026 00:33:57 +0000 Subject: [PATCH] feat: add Deepgram STT provider and cloud-first config - New DeepgramSTT class using Deepgram nova-3 via REST API - Factory function create_stt_engine() for provider switching - faster-whisper import now optional (graceful fallback) - Config defaults to cloud providers (deepgram STT + venice TTS) - .env.example updated with DEEPGRAM_API_KEY and VENICE_API_KEY - requirements.txt adds deepgram-sdk, marks faster-whisper as optional - Zero GPU required for default configuration --- .env.example | 4 + config.yaml | 32 ++++---- requirements.txt | 3 +- server/stt.py | 190 ++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 213 insertions(+), 16 deletions(-) diff --git a/.env.example b/.env.example index c5005e7..12ace12 100644 --- a/.env.example +++ b/.env.example @@ -18,6 +18,10 @@ OPENCLAW_BASE_URL=ws://192.168.50.9:18789 OPENCLAW_AUTH_TOKEN=your_openclaw_gateway_token OPENCLAW_AGENT_ID=main # Agent ID for session keys (jarvis or main) +# Cloud STT/TTS API Keys (for GPU-less deployment) +DEEPGRAM_API_KEY=your_deepgram_api_key +VENICE_API_KEY=your_venice_api_key + # ============================================================================ # FastAPI Server # ============================================================================ diff --git a/config.yaml b/config.yaml index acc36bf..a51c66f 100644 --- a/config.yaml +++ b/config.yaml @@ -108,20 +108,19 @@ pipeline: # Using v3.2 GPU model for best performance with RTX 5090 model_path: "smart-turn-v3.2-gpu.onnx" - # Speech-to-Text (faster-whisper) + # Speech-to-Text stt: - # Model size: tiny, base, small, medium, large-v3 - # Using "small" for faster transcription (was "medium") + # Provider: "deepgram" (cloud, no GPU) or "local" (faster-whisper, requires GPU) + provider: "deepgram" + + # Deepgram settings (used when provider is "deepgram") + model: "nova-3" + language: "en" + + # Local faster-whisper settings (used when provider is "local") model_size: "small" - - # Device: cuda or cpu device: "cuda" - - # Compute type: float16, float32, int8 compute_type: "float16" - - # Beam size for decoding (higher = more accurate, slower) - # Optimized for voice chat: beam_size=1 is 3-5x faster with minimal quality loss beam_size: 1 # Language hint (null = auto-detect) @@ -165,10 +164,17 @@ pipeline: # Text-to-Speech tts: - # TTS engine: chatterbox, coqui, piper - engine: "coqui" + # Provider: "venice" (cloud, no GPU) or "local" (chatterbox, requires GPU) + provider: "venice" - # Device: cuda or cpu + # Venice settings (used when provider is "venice") + venice: + voice: "am_liam" + base_url: "https://api.venice.ai/api/v1" + # API key from env: VENICE_API_KEY + + # Local settings (used when provider is "local") + engine: "chatterbox" device: "cuda" # Streaming: generate and play audio in chunks diff --git a/requirements.txt b/requirements.txt index 2576eb6..0ee4467 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,8 @@ resampy>=0.4.2 # High-quality audio resampling # ============================================================================ torch>=2.1.0 torchaudio>=2.1.0 -faster-whisper>=1.0.0 # GPU-accelerated STT +faster-whisper>=1.0.0 # GPU-accelerated STT (optional, for local provider) +deepgram-sdk>=3.0.0 # Deepgram cloud STT silero-vad>=4.0.0 # Voice activity detection onnxruntime>=1.16.0 # Smart Turn model inference diff --git a/server/stt.py b/server/stt.py index af57dac..19f07a4 100644 --- a/server/stt.py +++ b/server/stt.py @@ -1,15 +1,23 @@ -"""Speech-to-Text using faster-whisper. +"""Speech-to-Text using faster-whisper and Deepgram cloud API. GPU-accelerated transcription with support for multiple model sizes. +Cloud transcription via Deepgram for GPU-less deployments. """ import asyncio +import io from dataclasses import dataclass from pathlib import Path from typing import List, Optional +import httpx import numpy as np -from faster_whisper import WhisperModel + +try: + from faster_whisper import WhisperModel + HAS_FASTER_WHISPER = True +except ImportError: + HAS_FASTER_WHISPER = False from utils.logging import get_logger, log_latency @@ -51,6 +59,184 @@ class TranscriptionResult: return len(self.segments) +class DeepgramSTT: + """ + Deepgram cloud STT engine. + + Transcribes pre-recorded audio via Deepgram's REST API. + No GPU required — sends PCM audio over HTTP. + """ + + def __init__( + self, + api_key: str, + model: str = "nova-3", + language: Optional[str] = None, + sample_rate: int = 16000, + ): + self.api_key = api_key + self.model = model + self.language = language + self.sample_rate = sample_rate + self.base_url = "https://api.deepgram.com/v1" + + logger.info(f"Initialized Deepgram STT (model: {model})") + + # Stats + self.transcription_count = 0 + self.total_audio_duration = 0.0 + self.total_processing_time = 0.0 + + def _audio_to_wav_bytes(self, audio: np.ndarray) -> bytes: + """Convert float32 PCM audio to WAV bytes.""" + import struct + + # Ensure float32 mono + audio = audio.astype(np.float32) + if audio.ndim > 1: + audio = audio.mean(axis=1) + + # Convert to int16 + pcm = (audio * 32767).clip(-32768, 32767).astype(np.int16) + + # Write WAV header manually + buf = io.BytesIO() + num_samples = len(pcm) + byte_rate = self.sample_rate * 2 # 16-bit mono + data_size = num_samples * 2 + + buf.write(b'RIFF') + buf.write(struct.pack(' "TranscriptionResult": + """Transcribe audio via Deepgram API.""" + if audio.dtype != np.float32: + audio = audio.astype(np.float32) + + start_time = asyncio.get_event_loop().time() + wav_bytes = self._audio_to_wav_bytes(audio) + duration = len(audio) / self.sample_rate + + lang = language or self.language or "en" + + async with httpx.AsyncClient(timeout=30.0) as client: + response = await client.post( + f"{self.base_url}/listen", + content=wav_bytes, + headers={ + "Authorization": f"Token {self.api_key}", + "Content-Type": "audio/wav", + }, + params={ + "model": self.model, + "language": lang, + "sample_rate": self.sample_rate, + "smart_format": "true", + }, + ) + response.raise_for_status() + result = response.json() + + # Parse Deepgram response + channel = result.get("results", {}).get("channels", [{}])[0] + alternatives = channel.get("alternatives", []) + + if not alternatives: + text = "" + segments = [] + else: + alt = alternatives[0] + text = alt.get("transcript", "").strip() + words = alt.get("words", []) + + segments = [] + for i, word in enumerate(words): + segments.append(TranscriptSegment( + text=word.get("word", ""), + start=word.get("start", 0.0), + end=word.get("end", 0.0), + confidence=word.get("confidence", 1.0), + )) + + processing_time = asyncio.get_event_loop().time() - start_time + + # Update stats + self.transcription_count += 1 + self.total_audio_duration += duration + self.total_processing_time += processing_time + + logger.info( + f"Deepgram transcribed {duration:.2f}s audio: " + f'"{text[:50]}..." ({processing_time:.2f}s)' + ) + + return TranscriptionResult( + text=text, + segments=segments, + language=lang, + duration=duration, + ) + + def get_stats(self) -> dict: + avg_duration = ( + self.total_audio_duration / self.transcription_count + if self.transcription_count > 0 else 0.0 + ) + avg_processing = ( + self.total_processing_time / self.transcription_count + if self.transcription_count > 0 else 0.0 + ) + rtf = avg_processing / avg_duration if avg_duration > 0 else 0.0 + return { + "model": self.model, + "provider": "deepgram", + "transcription_count": self.transcription_count, + "total_audio_duration": self.total_audio_duration, + "avg_processing_time": avg_processing, + "real_time_factor": rtf, + } + + def get_model_info(self) -> dict: + return { + "model": self.model, + "provider": "deepgram", + "language": self.language or "auto", + "loaded": True, + } + + +def create_stt_engine(provider: str, **kwargs): + """Factory to create STT engine by provider name.""" + if provider == "deepgram": + return DeepgramSTT(**kwargs) + elif provider == "local": + if not HAS_FASTER_WHISPER: + raise RuntimeError("faster-whisper not installed. Install with: pip install faster-whisper") + return FasterWhisperSTT(**kwargs) + else: + raise ValueError(f"Unknown STT provider: {provider}. Choose 'deepgram' or 'local'.") + + class FasterWhisperSTT: """ Faster-whisper STT engine.