feat: add Deepgram STT provider and cloud-first config

- New DeepgramSTT class using Deepgram nova-3 via REST API
- Factory function create_stt_engine() for provider switching
- faster-whisper import now optional (graceful fallback)
- Config defaults to cloud providers (deepgram STT + venice TTS)
- .env.example updated with DEEPGRAM_API_KEY and VENICE_API_KEY
- requirements.txt adds deepgram-sdk, marks faster-whisper as optional
- Zero GPU required for default configuration
2026-04-10 00:33:57 +00:00 · 2026-04-10 00:33:57 +00:00 · f0458b9b40
commit f0458b9b40
parent 3eea942772
4 changed files with 213 additions and 16 deletions
--- a/.env.example
+++ b/.env.example
@ -18,6 +18,10 @@ OPENCLAW_BASE_URL=ws://192.168.50.9:18789
 OPENCLAW_AUTH_TOKEN=your_openclaw_gateway_token
 OPENCLAW_AGENT_ID=main  # Agent ID for session keys (jarvis or main)

+# Cloud STT/TTS API Keys (for GPU-less deployment)
+DEEPGRAM_API_KEY=your_deepgram_api_key
+VENICE_API_KEY=your_venice_api_key
+
 # ============================================================================
 # FastAPI Server
 # ============================================================================
--- a/config.yaml
+++ b/config.yaml
@ -108,20 +108,19 @@ pipeline:
    # Using v3.2 GPU model for best performance with RTX 5090
    model_path: "smart-turn-v3.2-gpu.onnx"

-  # Speech-to-Text (faster-whisper)
+  # Speech-to-Text
  stt:
-    # Model size: tiny, base, small, medium, large-v3
-    # Using "small" for faster transcription (was "medium")
+    # Provider: "deepgram" (cloud, no GPU) or "local" (faster-whisper, requires GPU)
+    provider: "deepgram"
+
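+    # Deepgram settings (used when provider is "deepgram").
+    # NOTE(review): the existing "Language hint (null = auto-detect)" entry further down
+    # appears to define `language` as well — confirm the key is not duplicated in this mapping.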
+    model: "nova-3"
+    language: "en"
+
+    # Local faster-whisper settings (used when provider is "local")
    model_size: "small"
-
-    # Device: cuda or cpu
    device: "cuda"
-
-    # Compute type: float16, float32, int8
    compute_type: "float16"
-
-    # Beam size for decoding (higher = more accurate, slower)
-    # Optimized for voice chat: beam_size=1 is 3-5x faster with minimal quality loss
    beam_size: 1

    # Language hint (null = auto-detect)
@ -165,10 +164,17 @@ pipeline:

  # Text-to-Speech
  tts:
-    # TTS engine: chatterbox, coqui, piper
-    engine: "coqui"
+    # Provider: "venice" (cloud, no GPU) or "local" (chatterbox, requires GPU)
+    provider: "venice"

-    # Device: cuda or cpu
+    # Venice settings (used when provider is "venice")
+    venice:
+      voice: "am_liam"
+      base_url: "https://api.venice.ai/api/v1"
+      # API key from env: VENICE_API_KEY
+
+    # Local settings (used when provider is "local")
+    engine: "chatterbox"
    device: "cuda"

    # Streaming: generate and play audio in chunks
--- a/requirements.txt
+++ b/requirements.txt
@ -22,7 +22,8 @@ resampy>=0.4.2  # High-quality audio resampling
 # ============================================================================
 torch>=2.1.0
 torchaudio>=2.1.0
-faster-whisper>=1.0.0  # GPU-accelerated STT
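+faster-whisper>=1.0.0  # GPU-accelerated STT; only imported by the "local" provider, but still installed unconditionally from this file
+deepgram-sdk>=3.0.0  # Deepgram cloud STT. NOTE(review): server/stt.py calls the REST API through httpx and does not import this SDK — confirm it is needed and that httpx is listed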
 silero-vad>=4.0.0  # Voice activity detection
 onnxruntime>=1.16.0  # Smart Turn model inference

--- a/server/stt.py
+++ b/server/stt.py
@ -1,15 +1,23 @@
-"""Speech-to-Text using faster-whisper.
+"""Speech-to-Text using faster-whisper and Deepgram cloud API.

 GPU-accelerated transcription with support for multiple model sizes.
+Cloud transcription via Deepgram for GPU-less deployments.
 """

 import asyncio
+import io
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Optional

+import httpx
 import numpy as np
-from faster_whisper import WhisperModel
+
+try:
+    from faster_whisper import WhisperModel
+    HAS_FASTER_WHISPER = True
+except ImportError:
+    HAS_FASTER_WHISPER = False

 from utils.logging import get_logger, log_latency

@ -51,6 +59,184 @@ class TranscriptionResult:
        return len(self.segments)


+class DeepgramSTT:
+    """
+    Deepgram cloud STT engine.
+
+    Transcribes pre-recorded audio by POSTing an in-memory 16-bit mono WAV
+    payload to the ``/listen`` endpoint under ``base_url``. No local model
+    is loaded, so no GPU is required.
+
+    Running totals (number of calls, audio seconds, wall-clock seconds) are
+    kept on the instance and exposed through ``get_stats()``.
+    """
+
+    def __init__(
+        self,
+        api_key: str,
+        model: str = "nova-3",
+        language: Optional[str] = None,
+        sample_rate: int = 16000,
+    ):
+        """
+        Store connection settings and reset the statistics counters.
+
+        Args:
+            api_key: Deepgram API key, sent as ``Authorization: Token <key>``.
+            model: Value of the ``model`` query parameter.
+            language: Default language code; ``None`` falls back to ``"en"``
+                at request time.
+            sample_rate: Sample rate (Hz) of the audio passed to
+                ``transcribe_async``; written into the WAV header.
+        """
+        self.api_key = api_key
+        self.model = model
+        self.language = language
+        self.sample_rate = sample_rate
+        self.base_url = "https://api.deepgram.com/v1"
+
+        logger.info(f"Initialized Deepgram STT (model: {model})")
+
+        # Stats
+        self.transcription_count = 0
+        self.total_audio_duration = 0.0
+        self.total_processing_time = 0.0
+
+    def _audio_to_wav_bytes(self, audio: np.ndarray) -> bytes:
+        """
+        Encode float PCM samples as a 16-bit mono WAV file held in memory.
+
+        Args:
+            audio: Samples scaled to roughly [-1.0, 1.0]. Arrays with more
+                than one dimension are averaged over axis 1.
+
+        Returns:
+            A complete RIFF/WAVE byte string (44-byte header plus samples).
+        """
+        # Function-scope import: the stdlib writer replaces a hand-packed
+        # header and always emits little-endian samples.
+        import wave
+
+        samples = audio.astype(np.float32)
+        if samples.ndim > 1:
+            # Down-mix; assumes a (samples, channels) layout — TODO confirm.
+            samples = samples.mean(axis=1)
+
+        # Scale to the int16 range, clipping anything outside [-1, 1].
+        pcm = (samples * 32767).clip(-32768, 32767).astype(np.int16)
+
+        buf = io.BytesIO()
+        with wave.open(buf, "wb") as wav_file:
+            wav_file.setnchannels(1)
+            wav_file.setsampwidth(2)  # 16-bit
+            wav_file.setframerate(self.sample_rate)
+            wav_file.writeframes(pcm.tobytes())
+
+        return buf.getvalue()
+
+    def _parse_response(self, result: dict) -> tuple:
+        """
+        Extract the transcript and per-word segments from a response body.
+
+        Only the first alternative of the first channel is read. A missing,
+        null or empty ``results`` / ``channels`` / ``alternatives`` entry
+        yields an empty transcript instead of raising.
+
+        Args:
+            result: Decoded JSON body of the ``/listen`` response.
+
+        Returns:
+            ``(text, segments)`` where ``segments`` holds one
+            ``TranscriptSegment`` per entry of the alternative's ``words``.
+        """
+        channels = (result.get("results") or {}).get("channels") or [{}]
+        alternatives = channels[0].get("alternatives") or []
+        if not alternatives:
+            return "", []
+
+        best = alternatives[0]
+        text = best.get("transcript", "").strip()
+        segments = [
+            TranscriptSegment(
+                text=word.get("word", ""),
+                start=word.get("start", 0.0),
+                end=word.get("end", 0.0),
+                confidence=word.get("confidence", 1.0),
+            )
+            for word in best.get("words", [])
+        ]
+        return text, segments
+
+    async def transcribe_async(
+        self,
+        audio: np.ndarray,
+        language: Optional[str] = None,
+        beam_size: Optional[int] = None,
+        vad_filter: bool = False,
+    ) -> "TranscriptionResult":
+        """
+        Transcribe audio via Deepgram API.
+
+        Args:
+            audio: PCM samples at ``self.sample_rate``; cast to float32.
+            language: Per-call language code. Falls back to the instance
+                default, then to ``"en"``.
+            beam_size: Accepted but not used by this engine (presumably kept
+                for signature parity with the local engine — verify).
+            vad_filter: Accepted but not used by this engine.
+
+        Returns:
+            TranscriptionResult with the transcript, one segment per word,
+            the language code that was sent, and the audio duration in
+            seconds.
+
+        Raises:
+            httpx.HTTPStatusError: If the API answers with a 4xx/5xx status.
+        """
+        if audio.dtype != np.float32:
+            audio = audio.astype(np.float32)
+
+        # Inside a coroutine the running loop is the one whose clock we want.
+        loop = asyncio.get_running_loop()
+        start_time = loop.time()
+        wav_bytes = self._audio_to_wav_bytes(audio)
+        duration = len(audio) / self.sample_rate
+
+        lang = language or self.language or "en"
+
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            response = await client.post(
+                f"{self.base_url}/listen",
+                content=wav_bytes,
+                headers={
+                    "Authorization": f"Token {self.api_key}",
+                    "Content-Type": "audio/wav",
+                },
+                params={
+                    "model": self.model,
+                    "language": lang,
+                    "sample_rate": self.sample_rate,
+                    "smart_format": "true",
+                },
+            )
+            response.raise_for_status()
+            result = response.json()
+
+        text, segments = self._parse_response(result)
+
+        processing_time = loop.time() - start_time
+
+        # Update stats
+        self.transcription_count += 1
+        self.total_audio_duration += duration
+        self.total_processing_time += processing_time
+
+        # NOTE(review): the ellipsis is appended even when the transcript is
+        # shorter than 50 characters.
+        logger.info(
+            f"Deepgram transcribed {duration:.2f}s audio: "
+            f'"{text[:50]}..." ({processing_time:.2f}s)'
+        )
+
+        return TranscriptionResult(
+            text=text,
+            segments=segments,
+            language=lang,
+            duration=duration,
+        )
+
+    def get_stats(self) -> dict:
+        """
+        Return cumulative usage statistics for this engine instance.
+
+        ``real_time_factor`` is average processing time divided by average
+        audio duration; every average is 0.0 before the first transcription.
+        """
+        count = self.transcription_count
+        avg_duration = self.total_audio_duration / count if count > 0 else 0.0
+        avg_processing = self.total_processing_time / count if count > 0 else 0.0
+        rtf = avg_processing / avg_duration if avg_duration > 0 else 0.0
+        return {
+            "model": self.model,
+            "provider": "deepgram",
+            "transcription_count": count,
+            "total_audio_duration": self.total_audio_duration,
+            "avg_processing_time": avg_processing,
+            "real_time_factor": rtf,
+        }
+
+    def get_model_info(self) -> dict:
+        """Return static information describing the configured engine."""
+        return {
+            "model": self.model,
+            "provider": "deepgram",
+            # NOTE(review): reports "auto" when no language is configured,
+            # while transcribe_async sends "en" in that case.
+            "language": self.language or "auto",
+            "loaded": True,
+        }
+
+
+def create_stt_engine(provider: str, **kwargs):
+    """Build the STT engine selected by ``provider``.
+
+    Args:
+        provider: ``"deepgram"`` for the cloud engine or ``"local"`` for
+            faster-whisper.
+        **kwargs: Forwarded unchanged to the chosen engine's constructor.
+
+    Returns:
+        A ``DeepgramSTT`` or ``FasterWhisperSTT`` instance.
+
+    Raises:
+        ValueError: If ``provider`` is neither ``"deepgram"`` nor ``"local"``.
+        RuntimeError: If ``"local"`` is requested but faster-whisper could
+            not be imported.
+    """
+    if provider == "deepgram":
+        return DeepgramSTT(**kwargs)
+
+    if provider != "local":
+        raise ValueError(f"Unknown STT provider: {provider}. Choose 'deepgram' or 'local'.")
+
+    # Local engine: fail early with an install hint when the optional
+    # dependency is missing.
+    if HAS_FASTER_WHISPER:
+        return FasterWhisperSTT(**kwargs)
+    raise RuntimeError("faster-whisper not installed. Install with: pip install faster-whisper")
+
+
 class FasterWhisperSTT:
    """
    Faster-whisper STT engine.