feat: add Deepgram STT provider and cloud-first config

- New DeepgramSTT class using Deepgram nova-3 via REST API
- Factory function create_stt_engine() for provider switching
- faster-whisper import now optional (graceful fallback)
- Config defaults to cloud providers (deepgram STT + venice TTS)
- .env.example updated with DEEPGRAM_API_KEY and VENICE_API_KEY
- requirements.txt adds deepgram-sdk, marks faster-whisper as optional
- Zero GPU required for default configuration
This commit is contained in:
Jezza Hehn 2026-04-10 00:33:57 +00:00
parent 3eea942772
commit f0458b9b40
4 changed files with 213 additions and 16 deletions

View file

@@ -108,20 +108,19 @@ pipeline:
# Using v3.2 GPU model for best performance with RTX 5090
model_path: "smart-turn-v3.2-gpu.onnx"
# Speech-to-Text (faster-whisper)
# Speech-to-Text
stt:
# Model size: tiny, base, small, medium, large-v3
# Using "small" for faster transcription (was "medium")
# Provider: "deepgram" (cloud, no GPU) or "local" (faster-whisper; GPU recommended, see "device" below)
provider: "deepgram"
# Deepgram settings (used when provider is "deepgram")
model: "nova-3"
language: "en"
# Local faster-whisper settings (used when provider is "local")
model_size: "small"
# Device: cuda or cpu
device: "cuda"
# Compute type: float16, float32, int8
compute_type: "float16"
# Beam size for decoding (higher = more accurate, slower)
# Optimized for voice chat: beam_size=1 is 3-5x faster with minimal quality loss
beam_size: 1
# Language hint (null = auto-detect)
@@ -165,10 +164,17 @@ pipeline:
# Text-to-Speech
tts:
# TTS engine: chatterbox, coqui, piper
engine: "coqui"
# Provider: "venice" (cloud, no GPU) or "local" (chatterbox, requires GPU)
provider: "venice"
# Device: cuda or cpu
# Venice settings (used when provider is "venice")
venice:
voice: "am_liam"
base_url: "https://api.venice.ai/api/v1"
# The API key is read from the VENICE_API_KEY environment variable (not stored in this file)
# Local settings (used when provider is "local")
engine: "chatterbox"
device: "cuda"
# Streaming: generate and play audio in chunks