feat: add Deepgram STT provider and cloud-first config

- New DeepgramSTT class using Deepgram nova-3 via REST API
- Factory function create_stt_engine() for provider switching
- faster-whisper import now optional (graceful fallback)
- Config defaults to cloud providers (deepgram STT + venice TTS)
- .env.example updated with DEEPGRAM_API_KEY and VENICE_API_KEY
- requirements.txt adds deepgram-sdk, marks faster-whisper as optional
- Zero GPU required for default configuration
This commit is contained in:
Jezza Hehn 2026-04-10 00:33:57 +00:00
parent 3eea942772
commit f0458b9b40
4 changed files with 213 additions and 16 deletions

View file

@ -18,6 +18,10 @@ OPENCLAW_BASE_URL=ws://192.168.50.9:18789
OPENCLAW_AUTH_TOKEN=your_openclaw_gateway_token
OPENCLAW_AGENT_ID=main # Agent ID for session keys (jarvis or main)
# Cloud STT/TTS API Keys (for GPU-less deployment)
DEEPGRAM_API_KEY=your_deepgram_api_key
VENICE_API_KEY=your_venice_api_key
# ============================================================================
# FastAPI Server
# ============================================================================

View file

@ -108,20 +108,19 @@ pipeline:
# Using v3.2 GPU model for best performance with RTX 5090
model_path: "smart-turn-v3.2-gpu.onnx"
# Speech-to-Text (faster-whisper)
# Speech-to-Text
stt:
# Model size: tiny, base, small, medium, large-v3
# Using "small" for faster transcription (was "medium")
# Provider: "deepgram" (cloud, no GPU) or "local" (faster-whisper, requires GPU)
provider: "deepgram"
# Deepgram settings (used when provider is "deepgram")
model: "nova-3"
language: "en"
# Local faster-whisper settings (used when provider is "local")
model_size: "small"
# Device: cuda or cpu
device: "cuda"
# Compute type: float16, float32, int8
compute_type: "float16"
# Beam size for decoding (higher = more accurate, slower)
# Optimized for voice chat: beam_size=1 is 3-5x faster with minimal quality loss
beam_size: 1
# Language hint (null = auto-detect)
@ -165,10 +164,17 @@ pipeline:
# Text-to-Speech
tts:
# TTS engine: chatterbox, coqui, piper
engine: "coqui"
# Provider: "venice" (cloud, no GPU) or "local" (chatterbox, requires GPU)
provider: "venice"
# Device: cuda or cpu
# Venice settings (used when provider is "venice")
venice:
voice: "am_liam"
base_url: "https://api.venice.ai/api/v1"
# API key from env: VENICE_API_KEY
# Local settings (used when provider is "local")
engine: "chatterbox"
device: "cuda"
# Streaming: generate and play audio in chunks

View file

@ -22,7 +22,8 @@ resampy>=0.4.2 # High-quality audio resampling
# ============================================================================
torch>=2.1.0
torchaudio>=2.1.0
faster-whisper>=1.0.0 # GPU-accelerated STT
faster-whisper>=1.0.0 # GPU-accelerated STT (optional, for local provider)
deepgram-sdk>=3.0.0 # Deepgram cloud STT
silero-vad>=4.0.0 # Voice activity detection
onnxruntime>=1.16.0 # Smart Turn model inference

View file

@ -1,15 +1,23 @@
"""Speech-to-Text using faster-whisper.
"""Speech-to-Text using faster-whisper and Deepgram cloud API.
GPU-accelerated transcription with support for multiple model sizes.
Cloud transcription via Deepgram for GPU-less deployments.
"""
import asyncio
import io
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional
import httpx
import numpy as np
from faster_whisper import WhisperModel
try:
from faster_whisper import WhisperModel
HAS_FASTER_WHISPER = True
except ImportError:
HAS_FASTER_WHISPER = False
from utils.logging import get_logger, log_latency
@ -51,6 +59,184 @@ class TranscriptionResult:
return len(self.segments)
class DeepgramSTT:
    """
    Deepgram cloud STT engine.

    Transcribes pre-recorded audio via Deepgram's REST API.
    No GPU required: audio is wrapped in a 16-bit mono WAV container and
    sent over HTTP.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "nova-3",
        language: Optional[str] = None,
        sample_rate: int = 16000,
    ):
        """
        Store connection settings and reset the usage counters.

        Args:
            api_key: Deepgram API key, sent as a ``Token`` authorization header.
            model: Deepgram model name passed as the ``model`` query parameter.
            language: Default language code; ``transcribe_async`` falls back
                to "en" when neither this nor its own argument is set.
            sample_rate: Sample rate (Hz) of the audio handed to this engine.
        """
        self.api_key = api_key
        self.model = model
        self.language = language
        self.sample_rate = sample_rate
        self.base_url = "https://api.deepgram.com/v1"

        logger.info(f"Initialized Deepgram STT (model: {model})")

        # Stats
        self.transcription_count = 0
        self.total_audio_duration = 0.0
        self.total_processing_time = 0.0

    def _audio_to_wav_bytes(self, audio: np.ndarray) -> bytes:
        """
        Convert float PCM audio to the bytes of a 16-bit mono WAV file.

        Args:
            audio: Samples scaled to [-1.0, 1.0]. Arrays with more than one
                dimension are averaged over axis 1 (assumes a
                (samples, channels) layout -- TODO confirm with callers).

        Returns:
            A complete WAV file (44-byte header + little-endian int16 data).
        """
        # Local import: the module-level import block may not include `wave`.
        import wave

        # Ensure float32 mono
        samples = audio.astype(np.float32)
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        # Scale to int16, clipping anything outside the representable range.
        pcm = (samples * 32767).clip(-32768, 32767).astype(np.int16)

        # The stdlib writer emits the canonical RIFF/WAVE header and takes
        # care of byte order, so no manual struct packing is needed.
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(pcm.tobytes())
        return buf.getvalue()

    @staticmethod
    def _parse_response(result: dict) -> tuple:
        """
        Extract ``(text, segments)`` from a decoded Deepgram JSON response.

        A missing/null ``results`` object, an empty ``channels`` list or an
        empty ``alternatives`` list all yield ``("", [])`` instead of raising.
        Each entry of the first alternative's ``words`` list becomes one
        ``TranscriptSegment``.
        """
        channels = (result.get("results") or {}).get("channels") or []
        alternatives = channels[0].get("alternatives", []) if channels else []
        if not alternatives:
            return "", []

        best = alternatives[0]
        text = best.get("transcript", "").strip()
        segments = [
            TranscriptSegment(
                text=word.get("word", ""),
                start=word.get("start", 0.0),
                end=word.get("end", 0.0),
                confidence=word.get("confidence", 1.0),
            )
            for word in best.get("words", [])
        ]
        return text, segments

    async def transcribe_async(
        self,
        audio: np.ndarray,
        language: Optional[str] = None,
        beam_size: Optional[int] = None,
        vad_filter: bool = False,
    ) -> "TranscriptionResult":
        """
        Transcribe audio via Deepgram API.

        Args:
            audio: Audio samples at ``self.sample_rate``; converted to
                float32 if needed.
            language: Language code for this call; falls back to the
                instance default, then to "en".
            beam_size: Unused by this engine (presumably kept so the
                signature matches the local engine -- verify against callers).
            vad_filter: Unused by this engine (same note as ``beam_size``).

        Returns:
            TranscriptionResult with the transcript, one segment per word,
            the language used and the audio duration in seconds.

        Raises:
            httpx.HTTPStatusError: If Deepgram answers with an error status.
        """
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        lang = language or self.language or "en"

        # Nothing to transcribe: skip the network round-trip entirely rather
        # than uploading a header-only WAV file.
        if audio.size == 0:
            return TranscriptionResult(
                text="",
                segments=[],
                language=lang,
                duration=0.0,
            )

        # We are inside a coroutine, so a running loop is guaranteed.
        loop = asyncio.get_running_loop()
        start_time = loop.time()

        wav_bytes = self._audio_to_wav_bytes(audio)
        duration = len(audio) / self.sample_rate

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                f"{self.base_url}/listen",
                content=wav_bytes,
                headers={
                    "Authorization": f"Token {self.api_key}",
                    "Content-Type": "audio/wav",
                },
                params={
                    "model": self.model,
                    "language": lang,
                    "sample_rate": self.sample_rate,
                    "smart_format": "true",
                },
            )
            response.raise_for_status()
            result = response.json()

        # Parse Deepgram response
        text, segments = self._parse_response(result)

        processing_time = loop.time() - start_time

        # Update stats
        self.transcription_count += 1
        self.total_audio_duration += duration
        self.total_processing_time += processing_time

        logger.info(
            f"Deepgram transcribed {duration:.2f}s audio: "
            f'"{text[:50]}..." ({processing_time:.2f}s)'
        )

        return TranscriptionResult(
            text=text,
            segments=segments,
            language=lang,
            duration=duration,
        )

    def get_stats(self) -> dict:
        """
        Return cumulative usage statistics for this engine instance.

        ``real_time_factor`` is average processing time divided by average
        audio duration; all averages are 0.0 before the first transcription.
        """
        count = self.transcription_count
        avg_duration = self.total_audio_duration / count if count > 0 else 0.0
        avg_processing = self.total_processing_time / count if count > 0 else 0.0
        rtf = avg_processing / avg_duration if avg_duration > 0 else 0.0

        return {
            "model": self.model,
            "provider": "deepgram",
            "transcription_count": self.transcription_count,
            "total_audio_duration": self.total_audio_duration,
            "avg_processing_time": avg_processing,
            "real_time_factor": rtf,
        }

    def get_model_info(self) -> dict:
        """
        Return static information about the configured model.

        ``language`` reports "auto" when no default language was configured;
        note that ``transcribe_async`` itself falls back to "en" in that case.
        """
        return {
            "model": self.model,
            "provider": "deepgram",
            "language": self.language or "auto",
            "loaded": True,
        }
def create_stt_engine(provider: str, **kwargs):
    """
    Build the STT engine selected by ``provider``.

    Args:
        provider: "deepgram" for the cloud engine or "local" for the
            faster-whisper engine.
        **kwargs: Forwarded unchanged to the chosen engine's constructor.

    Raises:
        ValueError: If ``provider`` is neither "deepgram" nor "local".
        RuntimeError: If "local" is requested but faster-whisper could not
            be imported.
    """
    if provider == "deepgram":
        return DeepgramSTT(**kwargs)

    # Anything other than the two known providers is rejected up front.
    if provider != "local":
        raise ValueError(f"Unknown STT provider: {provider}. Choose 'deepgram' or 'local'.")

    # The local engine is only usable when the optional dependency imported.
    if HAS_FASTER_WHISPER:
        return FasterWhisperSTT(**kwargs)
    raise RuntimeError("faster-whisper not installed. Install with: pip install faster-whisper")
class FasterWhisperSTT:
"""
Faster-whisper STT engine.