From f0458b9b40a1520ae5ec0ea91ca019da93510d84 Mon Sep 17 00:00:00 2001
From: Jezza Hehn <jezza.hehn@wikitribune.com>
Date: Fri, 10 Apr 2026 00:33:57 +0000
Subject: [PATCH] feat: add Deepgram STT provider and cloud-first config

- New DeepgramSTT class using Deepgram nova-3 via REST API
- Factory function create_stt_engine() for provider switching
- faster-whisper import now optional (graceful fallback)
- Config defaults to cloud providers (deepgram STT + venice TTS)
- .env.example updated with DEEPGRAM_API_KEY and VENICE_API_KEY
- requirements.txt adds deepgram-sdk, marks faster-whisper as optional
- Zero GPU required for default configuration
---
 .env.example     |   4 +
 config.yaml      |  32 ++++----
 requirements.txt |   3 +-
 server/stt.py    | 190 ++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 213 insertions(+), 16 deletions(-)

diff --git a/.env.example b/.env.example
index c5005e7..12ace12 100644
--- a/.env.example
+++ b/.env.example
@@ -18,6 +18,10 @@ OPENCLAW_BASE_URL=ws://192.168.50.9:18789
 OPENCLAW_AUTH_TOKEN=your_openclaw_gateway_token
 OPENCLAW_AGENT_ID=main  # Agent ID for session keys (jarvis or main)
 
+# Cloud STT/TTS API Keys (for GPU-less deployment)
+DEEPGRAM_API_KEY=your_deepgram_api_key
+VENICE_API_KEY=your_venice_api_key
+
 # ============================================================================
 # FastAPI Server
 # ============================================================================
diff --git a/config.yaml b/config.yaml
index acc36bf..a51c66f 100644
--- a/config.yaml
+++ b/config.yaml
@@ -108,20 +108,19 @@ pipeline:
     # Using v3.2 GPU model for best performance with RTX 5090
     model_path: "smart-turn-v3.2-gpu.onnx"
 
-  # Speech-to-Text (faster-whisper)
+  # Speech-to-Text
   stt:
-    # Model size: tiny, base, small, medium, large-v3
-    # Using "small" for faster transcription (was "medium")
+    # Provider: "deepgram" (cloud, no GPU) or "local" (faster-whisper; device: cuda or cpu)
+    provider: "deepgram"
+
+    # Deepgram settings (used when provider is "deepgram")
+    model: "nova-3"
+    language: "en"
+
+    # Local faster-whisper settings (used when provider is "local")
     model_size: "small"
-
-    # Device: cuda or cpu
     device: "cuda"
-
-    # Compute type: float16, float32, int8
     compute_type: "float16"
-
-    # Beam size for decoding (higher = more accurate, slower)
-    # Optimized for voice chat: beam_size=1 is 3-5x faster with minimal quality loss
     beam_size: 1
 
     # Language hint (null = auto-detect)
@@ -165,10 +164,17 @@ pipeline:
 
   # Text-to-Speech
   tts:
-    # TTS engine: chatterbox, coqui, piper
-    engine: "coqui"
+    # Provider: "venice" (cloud, no GPU) or "local" (chatterbox, requires GPU)
+    provider: "venice"
 
-    # Device: cuda or cpu
+    # Venice settings (used when provider is "venice")
+    venice:
+      voice: "am_liam"
+      base_url: "https://api.venice.ai/api/v1"
+      # API key from env: VENICE_API_KEY
+
+    # Local settings (used when provider is "local")
+    engine: "chatterbox"
     device: "cuda"
 
     # Streaming: generate and play audio in chunks
diff --git a/requirements.txt b/requirements.txt
index 2576eb6..0ee4467 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,7 +22,8 @@ resampy>=0.4.2  # High-quality audio resampling
 # ============================================================================
 torch>=2.1.0
 torchaudio>=2.1.0
-faster-whisper>=1.0.0  # GPU-accelerated STT
+faster-whisper>=1.0.0  # GPU-accelerated STT (optional, for local provider)
+deepgram-sdk>=3.0.0  # Deepgram cloud STT
 silero-vad>=4.0.0  # Voice activity detection
 onnxruntime>=1.16.0  # Smart Turn model inference
 
diff --git a/server/stt.py b/server/stt.py
index af57dac..19f07a4 100644
--- a/server/stt.py
+++ b/server/stt.py
@@ -1,15 +1,23 @@
-"""Speech-to-Text using faster-whisper.
+"""Speech-to-Text using faster-whisper and Deepgram cloud API.
 
 GPU-accelerated transcription with support for multiple model sizes.
+Cloud transcription via Deepgram for GPU-less deployments.
 """
 
 import asyncio
+import io
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List, Optional
 
+import httpx
 import numpy as np
-from faster_whisper import WhisperModel
+
+try:
+    from faster_whisper import WhisperModel
+    HAS_FASTER_WHISPER = True
+except ImportError:
+    HAS_FASTER_WHISPER = False
 
 from utils.logging import get_logger, log_latency
 
@@ -51,6 +59,184 @@ class TranscriptionResult:
         return len(self.segments)
 
 
+class DeepgramSTT:
+    """
+    Deepgram cloud STT engine.
+
+    Transcribes pre-recorded audio via Deepgram's REST API.
+    No GPU required — audio is uploaded as a 16-bit mono WAV over HTTP.
+    """
+
+    def __init__(
+        self,
+        api_key: str,
+        model: str = "nova-3",
+        language: Optional[str] = None,
+        sample_rate: int = 16000,
+    ):
+        """Store the connection settings and zero the usage counters."""
+        self.api_key = api_key
+        self.model = model
+        self.language = language
+        self.sample_rate = sample_rate
+        self.base_url = "https://api.deepgram.com/v1"
+
+        # Stats
+        self.transcription_count = 0
+        self.total_audio_duration = 0.0
+        self.total_processing_time = 0.0
+
+        logger.info(f"Initialized Deepgram STT (model: {model})")
+
+    def _audio_to_wav_bytes(self, audio: np.ndarray) -> bytes:
+        """Encode float samples (expected in [-1, 1]) as a 16-bit mono WAV file."""
+        import wave
+
+        # Ensure float32 mono
+        audio = audio.astype(np.float32)
+        if audio.ndim > 1:
+            audio = audio.mean(axis=1)
+
+        # Scale to int16, clipping out-of-range samples instead of wrapping
+        pcm = (audio * 32767).clip(-32768, 32767).astype(np.int16)
+
+        # The stdlib writer emits the RIFF/fmt/data header for us
+        buf = io.BytesIO()
+        with wave.open(buf, "wb") as wav:
+            wav.setnchannels(1)
+            wav.setsampwidth(2)  # 16-bit
+            wav.setframerate(self.sample_rate)
+            wav.writeframes(pcm.tobytes())
+
+        return buf.getvalue()
+
+    async def transcribe_async(
+        self,
+        audio: np.ndarray,
+        language: Optional[str] = None,
+        beam_size: Optional[int] = None,
+        vad_filter: bool = False,
+    ) -> "TranscriptionResult":
+        """
+        Transcribe audio via Deepgram API.
+
+        Args:
+            audio: PCM samples recorded at self.sample_rate.
+            language: Per-call language code; falls back to self.language, then "en".
+            beam_size, vad_filter: Unused. NOTE(review): presumably accepted for
+                signature parity with the local engine — confirm.
+
+        Raises:
+            httpx.HTTPStatusError: Deepgram answered with a 4xx/5xx status.
+        """
+        if audio.dtype != np.float32:
+            audio = audio.astype(np.float32)
+
+        # Inside a coroutine the running loop is the clock to time against
+        loop = asyncio.get_running_loop()
+        start_time = loop.time()
+        wav_bytes = self._audio_to_wav_bytes(audio)
+        duration = len(audio) / self.sample_rate
+
+        lang = language or self.language or "en"
+
+        async with httpx.AsyncClient(timeout=30.0) as client:
+            response = await client.post(
+                f"{self.base_url}/listen",
+                content=wav_bytes,
+                headers={
+                    "Authorization": f"Token {self.api_key}",
+                    "Content-Type": "audio/wav",
+                },
+                params={
+                    "model": self.model,
+                    "language": lang,
+                    "sample_rate": self.sample_rate,
+                    "smart_format": "true",
+                },
+            )
+            response.raise_for_status()
+            result = response.json()
+
+        # Parse Deepgram response; `or` also covers a null or empty list,
+        # so a reply without channels yields an empty transcript
+        channels = result.get("results", {}).get("channels") or [{}]
+        alternatives = channels[0].get("alternatives") or []
+
+        text = ""
+        segments = []
+        if alternatives:
+            alt = alternatives[0]
+            text = (alt.get("transcript") or "").strip()
+            segments = [
+                TranscriptSegment(
+                    text=word.get("word", ""),
+                    start=word.get("start", 0.0),
+                    end=word.get("end", 0.0),
+                    confidence=word.get("confidence", 1.0),
+                )
+                for word in alt.get("words") or []
+            ]
+
+        processing_time = loop.time() - start_time
+
+        # Update stats
+        self.transcription_count += 1
+        self.total_audio_duration += duration
+        self.total_processing_time += processing_time
+
+        # Only mark the preview as truncated when text was actually cut
+        preview = text if len(text) <= 50 else f"{text[:50]}..."
+        logger.info(
+            f"Deepgram transcribed {duration:.2f}s audio: "
+            f'"{preview}" ({processing_time:.2f}s)'
+        )
+
+        return TranscriptionResult(
+            text=text,
+            segments=segments,
+            language=lang,
+            duration=duration,
+        )
+
+    def get_stats(self) -> dict:
+        """Return cumulative usage counters and the average real-time factor."""
+        count = self.transcription_count
+        avg_duration = self.total_audio_duration / count if count else 0.0
+        avg_processing = self.total_processing_time / count if count else 0.0
+        # RTF < 1 means transcription finished faster than the audio's length
+        rtf = avg_processing / avg_duration if avg_duration > 0 else 0.0
+        return {
+            "model": self.model,
+            "provider": "deepgram",
+            "transcription_count": self.transcription_count,
+            "total_audio_duration": self.total_audio_duration,
+            "avg_processing_time": avg_processing,
+            "real_time_factor": rtf,
+        }
+
+    def get_model_info(self) -> dict:
+        """Describe the engine; `language` is what requests default to."""
+        return {
+            "model": self.model,
+            "provider": "deepgram",
+            "language": self.language or "en",
+            "loaded": True,
+        }
+
+
+def create_stt_engine(provider: str, **kwargs):
+    """Build the STT engine for *provider*, passing **kwargs to its constructor."""
+    if provider == "deepgram":
+        return DeepgramSTT(**kwargs)
+    if provider != "local":
+        raise ValueError(f"Unknown STT provider: {provider}. Choose 'deepgram' or 'local'.")
+    # Local engine: only usable when the optional faster-whisper import worked
+    if not HAS_FASTER_WHISPER:
+        raise RuntimeError("faster-whisper not installed. Install with: pip install faster-whisper")
+    return FasterWhisperSTT(**kwargs)
+
+
 class FasterWhisperSTT:
     """
     Faster-whisper STT engine.