feat: add Deepgram STT provider and cloud-first config

- New DeepgramSTT class using Deepgram nova-3 via REST API
- Factory function create_stt_engine() for provider switching
- faster-whisper import now optional (graceful fallback)
- Config defaults to cloud providers (deepgram STT + venice TTS)
- .env.example updated with DEEPGRAM_API_KEY and VENICE_API_KEY
- requirements.txt adds deepgram-sdk, marks faster-whisper as optional
- Zero GPU required for default configuration
2026-04-10 00:33:57 +00:00 · 2026-04-10 00:33:57 +00:00 · f0458b9b40
commit f0458b9b40
parent 3eea942772
4 changed files with 213 additions and 16 deletions
--- a/config.yaml
+++ b/config.yaml
@@ -108,20 +108,19 @@ pipeline:
    # Using v3.2 GPU model for best performance with RTX 5090
    model_path: "smart-turn-v3.2-gpu.onnx"

-  # Speech-to-Text (faster-whisper)
+  # Speech-to-Text
  stt:
-    # Model size: tiny, base, small, medium, large-v3
-    # Using "small" for faster transcription (was "medium")
+    # Provider: "deepgram" (cloud, no GPU) or "local" (faster-whisper, requires GPU)
+    provider: "deepgram"
+
+    # Deepgram settings (used when provider is "deepgram")
+    model: "nova-3"
+    language: "en"
+
+    # Local faster-whisper settings (used when provider is "local")
    model_size: "small"
-
-    # Device: cuda or cpu
    device: "cuda"
-
-    # Compute type: float16, float32, int8
    compute_type: "float16"
-
-    # Beam size for decoding (higher = more accurate, slower)
-    # Optimized for voice chat: beam_size=1 is 3-5x faster with minimal quality loss
    beam_size: 1

    # Language hint (null = auto-detect)
@@ -165,10 +164,17 @@ pipeline:

  # Text-to-Speech
  tts:
-    # TTS engine: chatterbox, coqui, piper
-    engine: "coqui"
+    # Provider: "venice" (cloud, no GPU) or "local" (chatterbox, requires GPU)
+    provider: "venice"

-    # Device: cuda or cpu
+    # Venice settings (used when provider is "venice")
+    venice:
+      voice: "am_liam"
+      base_url: "https://api.venice.ai/api/v1"
+      # API key from env: VENICE_API_KEY
+
+    # Local settings (used when provider is "local")
+    engine: "chatterbox"
    device: "cuda"

    # Streaming: generate and play audio in chunks