feat: add Deepgram STT provider and cloud-first config

- New DeepgramSTT class using Deepgram nova-3 via REST API
- Factory function create_stt_engine() for provider switching
- faster-whisper import is now optional (graceful fallback)
- Config defaults to cloud providers (Deepgram STT + Venice TTS)
- .env.example updated with DEEPGRAM_API_KEY and VENICE_API_KEY
- requirements.txt adds deepgram-sdk, marks faster-whisper as optional
- Zero GPU required for default configuration
2026-04-10 00:33:57 +00:00 · 2026-04-10 00:33:57 +00:00 · f0458b9b40
commit f0458b9b40
parent 3eea942772
4 changed files with 213 additions and 16 deletions
--- a/.env.example
+++ b/.env.example
@ -18,6 +18,10 @@ OPENCLAW_BASE_URL=ws://192.168.50.9:18789
 OPENCLAW_AUTH_TOKEN=your_openclaw_gateway_token
 OPENCLAW_AGENT_ID=main  # Agent ID for session keys (jarvis or main)
 # Cloud STT/TTS API Keys (for GPU-less deployment)
 DEEPGRAM_API_KEY=your_deepgram_api_key
 VENICE_API_KEY=your_venice_api_key
 # ============================================================================
 # FastAPI Server
 # ============================================================================
--- a/config.yaml
+++ b/config.yaml
@ -108,20 +108,19 @@ pipeline:
    # Using v3.2 GPU model for best performance with RTX 5090
    model_path: "smart-turn-v3.2-gpu.onnx"
-  # Speech-to-Text (faster-whisper)
+  # Speech-to-Text
  stt:
-    # Model size: tiny, base, small, medium, large-v3
+    # Provider: "deepgram" (cloud, no GPU) or "local" (faster-whisper, requires GPU)
-    # Using "small" for faster transcription (was "medium")
+    provider: "deepgram"
    # Deepgram settings (used when provider is "deepgram")
    model: "nova-3"
    language: "en"
    # Local faster-whisper settings (used when provider is "local")
    model_size: "small"
    # Device: cuda or cpu
    device: "cuda"
    # Compute type: float16, float32, int8
    compute_type: "float16"
    # Beam size for decoding (higher = more accurate, slower)
    # Optimized for voice chat: beam_size=1 is 3-5x faster with minimal quality loss
    beam_size: 1
    # Language hint (null = auto-detect)
@ -165,10 +164,17 @@ pipeline:
  # Text-to-Speech
  tts:
-    # TTS engine: chatterbox, coqui, piper
+    # Provider: "venice" (cloud, no GPU) or "local" (chatterbox, requires GPU)
-    engine: "coqui"
+    provider: "venice"
-    # Device: cuda or cpu
+    # Venice settings (used when provider is "venice")
    venice:
      voice: "am_liam"
      base_url: "https://api.venice.ai/api/v1"
      # API key from env: VENICE_API_KEY
    # Local settings (used when provider is "local")
    engine: "chatterbox"
    device: "cuda"
    # Streaming: generate and play audio in chunks
--- a/requirements.txt
+++ b/requirements.txt
@ -22,7 +22,8 @@ resampy>=0.4.2  # High-quality audio resampling
 # ============================================================================
 torch>=2.1.0
 torchaudio>=2.1.0
-faster-whisper>=1.0.0  # GPU-accelerated STT
+faster-whisper>=1.0.0  # GPU-accelerated STT (optional, for local provider)
 deepgram-sdk>=3.0.0  # Deepgram cloud STT
 silero-vad>=4.0.0  # Voice activity detection
 onnxruntime>=1.16.0  # Smart Turn model inference
--- a/server/stt.py
+++ b/server/stt.py
@ -1,15 +1,23 @@
-"""Speech-to-Text using faster-whisper.
+"""Speech-to-Text using faster-whisper and Deepgram cloud API.
 GPU-accelerated transcription with support for multiple model sizes.
 Cloud transcription via Deepgram for GPU-less deployments.
 """
 import asyncio
 import io
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Optional
 import httpx
 import numpy as np
-from faster_whisper import WhisperModel
+
 try:
    from faster_whisper import WhisperModel
    HAS_FASTER_WHISPER = True
 except ImportError:
    HAS_FASTER_WHISPER = False
 from utils.logging import get_logger, log_latency
@ -51,6 +59,184 @@ class TranscriptionResult:
        return len(self.segments)
 class DeepgramSTT:
    """
    Deepgram cloud STT engine.

    Transcribes pre-recorded audio by POSTing an in-memory WAV payload to
    Deepgram's REST ``/listen`` endpoint. No local model is loaded, so no
    GPU is required.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "nova-3",
        language: Optional[str] = None,
        sample_rate: int = 16000,
    ):
        """
        Store connection settings and reset the usage counters.

        Args:
            api_key: Deepgram API key, sent as ``Authorization: Token <key>``.
            model: Deepgram model name passed as the ``model`` query parameter.
            language: Default language code; ``None`` means "not configured".
            sample_rate: Sample rate (Hz) of the audio passed to
                ``transcribe_async``; written into the WAV header and used
                to compute durations.
        """
        self.api_key = api_key
        self.model = model
        self.language = language
        self.sample_rate = sample_rate
        self.base_url = "https://api.deepgram.com/v1"

        logger.info(f"Initialized Deepgram STT (model: {model})")

        # Running totals reported by get_stats()
        self.transcription_count = 0
        self.total_audio_duration = 0.0
        self.total_processing_time = 0.0

    def _audio_to_wav_bytes(self, audio: np.ndarray) -> bytes:
        """
        Encode float PCM samples as a 16-bit mono PCM WAV file in memory.

        Samples are scaled by 32767 and clipped to the int16 range. Arrays
        with more than one dimension are averaged over axis 1 first.

        Args:
            audio: Sample array; it is cast to float32 before scaling.

        Returns:
            Complete WAV file contents (header followed by sample data).
        """
        # Local import, like the original local ``import struct``: keeps the
        # block self-contained without touching the file's import section.
        import wave

        samples = audio.astype(np.float32)
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        pcm = (samples * 32767).clip(-32768, 32767).astype(np.int16)

        buf = io.BytesIO()
        # The stdlib writer emits the canonical 44-byte PCM header and also
        # takes care of byte order, which a manual tobytes() dump does not.
        with wave.open(buf, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)  # 16-bit samples
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(pcm.tobytes())
        # wave does not close a file object it did not open itself, so the
        # buffer is still readable here.
        return buf.getvalue()

    @staticmethod
    def _parse_response(result: dict) -> tuple:
        """
        Extract ``(transcript, segments)`` from a Deepgram JSON response.

        Only the first alternative of the first channel is read. Missing,
        ``null`` or empty ``results`` / ``channels`` / ``alternatives``
        entries yield ``("", [])`` instead of raising.
        """
        channels = (result.get("results") or {}).get("channels") or [{}]
        alternatives = channels[0].get("alternatives") or []
        if not alternatives:
            return "", []

        first = alternatives[0]
        text = (first.get("transcript") or "").strip()
        # One segment per recognised word.
        segments = [
            TranscriptSegment(
                text=word.get("word", ""),
                start=word.get("start", 0.0),
                end=word.get("end", 0.0),
                confidence=word.get("confidence", 1.0),
            )
            for word in (first.get("words") or [])
        ]
        return text, segments

    async def transcribe_async(
        self,
        audio: np.ndarray,
        language: Optional[str] = None,
        beam_size: Optional[int] = None,
        vad_filter: bool = False,
    ) -> "TranscriptionResult":
        """
        Transcribe audio via the Deepgram API.

        Args:
            audio: Samples at ``self.sample_rate``; cast to float32 if needed.
            language: Per-call language override. Falls back to the
                instance default, then to ``"en"``.
            beam_size: Accepted but not used by this engine.
            vad_filter: Accepted but not used by this engine.

        Returns:
            TranscriptionResult with the transcript, one segment per word,
            the language that was requested, and the audio duration in
            seconds.

        Raises:
            httpx.HTTPStatusError: If Deepgram answers with an error status.
        """
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        lang = language or self.language or "en"

        # Nothing to transcribe: skip the network round trip (and leave the
        # stats untouched) rather than uploading a header-only WAV file.
        if audio.size == 0:
            logger.debug("Deepgram transcription skipped: empty audio")
            return TranscriptionResult(
                text="",
                segments=[],
                language=lang,
                duration=0.0,
            )

        # We are inside a coroutine, so a running loop always exists.
        loop = asyncio.get_running_loop()
        start_time = loop.time()

        wav_bytes = self._audio_to_wav_bytes(audio)
        duration = len(audio) / self.sample_rate

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                f"{self.base_url}/listen",
                content=wav_bytes,
                headers={
                    "Authorization": f"Token {self.api_key}",
                    "Content-Type": "audio/wav",
                },
                params={
                    "model": self.model,
                    "language": lang,
                    "sample_rate": self.sample_rate,
                    "smart_format": "true",
                },
            )
            response.raise_for_status()
            result = response.json()

        text, segments = self._parse_response(result)

        processing_time = loop.time() - start_time

        # Update stats
        self.transcription_count += 1
        self.total_audio_duration += duration
        self.total_processing_time += processing_time

        logger.info(
            f"Deepgram transcribed {duration:.2f}s audio: "
            f'"{text[:50]}..." ({processing_time:.2f}s)'
        )

        return TranscriptionResult(
            text=text,
            segments=segments,
            language=lang,
            duration=duration,
        )

    def get_stats(self) -> dict:
        """
        Return cumulative usage statistics for this engine instance.

        ``real_time_factor`` is average processing time divided by average
        audio duration; every average is 0.0 before the first transcription.
        """
        count = self.transcription_count
        avg_duration = self.total_audio_duration / count if count > 0 else 0.0
        avg_processing = self.total_processing_time / count if count > 0 else 0.0
        rtf = avg_processing / avg_duration if avg_duration > 0 else 0.0

        return {
            "model": self.model,
            "provider": "deepgram",
            "transcription_count": self.transcription_count,
            "total_audio_duration": self.total_audio_duration,
            "avg_processing_time": avg_processing,
            "real_time_factor": rtf,
        }

    def get_model_info(self) -> dict:
        """
        Return static information about the configured model.

        ``language`` is reported as ``"auto"`` when no default language was
        configured. NOTE: ``transcribe_async`` falls back to ``"en"`` in
        that case.
        """
        return {
            "model": self.model,
            "provider": "deepgram",
            "language": self.language or "auto",
            "loaded": True,
        }
 def create_stt_engine(provider: str, **kwargs):
    """
    Build the STT engine selected by ``provider``.

    Args:
        provider: ``"deepgram"`` for the cloud engine or ``"local"`` for
            the faster-whisper engine.
        **kwargs: Forwarded unchanged to the chosen engine's constructor.

    Returns:
        A ``DeepgramSTT`` or ``FasterWhisperSTT`` instance.

    Raises:
        RuntimeError: ``"local"`` was requested but faster-whisper could
            not be imported.
        ValueError: ``provider`` is not a recognised name.
    """
    if provider == "deepgram":
        return DeepgramSTT(**kwargs)

    if provider != "local":
        raise ValueError(f"Unknown STT provider: {provider}. Choose 'deepgram' or 'local'.")

    # Local engine: only usable when the optional import succeeded.
    if HAS_FASTER_WHISPER:
        return FasterWhisperSTT(**kwargs)
    raise RuntimeError("faster-whisper not installed. Install with: pip install faster-whisper")
 class FasterWhisperSTT:
    """
    Faster-whisper STT engine.