feat: add Deepgram STT provider and cloud-first config

- New DeepgramSTT class using Deepgram nova-3 via REST API
- Factory function create_stt_engine() for provider switching
- faster-whisper import now optional (graceful fallback)
- Config defaults to cloud providers (deepgram STT + venice TTS)
- .env.example updated with DEEPGRAM_API_KEY and VENICE_API_KEY
- requirements.txt adds deepgram-sdk, marks faster-whisper as optional
- Zero GPU required for default configuration
This commit is contained in:
Jezza Hehn 2026-04-10 00:33:57 +00:00
parent 3eea942772
commit f0458b9b40
4 changed files with 213 additions and 16 deletions

View file

@ -18,6 +18,10 @@ OPENCLAW_BASE_URL=ws://192.168.50.9:18789
OPENCLAW_AUTH_TOKEN=your_openclaw_gateway_token OPENCLAW_AUTH_TOKEN=your_openclaw_gateway_token
OPENCLAW_AGENT_ID=main # Agent ID for session keys (jarvis or main) OPENCLAW_AGENT_ID=main # Agent ID for session keys (jarvis or main)
# Cloud STT/TTS API Keys (for GPU-less deployment)
DEEPGRAM_API_KEY=your_deepgram_api_key
VENICE_API_KEY=your_venice_api_key
# ============================================================================ # ============================================================================
# FastAPI Server # FastAPI Server
# ============================================================================ # ============================================================================

View file

@ -108,20 +108,19 @@ pipeline:
# Using v3.2 GPU model for best performance with RTX 5090 # Using v3.2 GPU model for best performance with RTX 5090
model_path: "smart-turn-v3.2-gpu.onnx" model_path: "smart-turn-v3.2-gpu.onnx"
# Speech-to-Text (faster-whisper) # Speech-to-Text
stt: stt:
# Model size: tiny, base, small, medium, large-v3 # Provider: "deepgram" (cloud, no GPU) or "local" (faster-whisper, requires GPU)
# Using "small" for faster transcription (was "medium") provider: "deepgram"
# Deepgram settings (used when provider is "deepgram")
model: "nova-3"
language: "en"
# Local faster-whisper settings (used when provider is "local")
model_size: "small" model_size: "small"
# Device: cuda or cpu
device: "cuda" device: "cuda"
# Compute type: float16, float32, int8
compute_type: "float16" compute_type: "float16"
# Beam size for decoding (higher = more accurate, slower)
# Optimized for voice chat: beam_size=1 is 3-5x faster with minimal quality loss
beam_size: 1 beam_size: 1
# Language hint (null = auto-detect) # Language hint (null = auto-detect)
@ -165,10 +164,17 @@ pipeline:
# Text-to-Speech # Text-to-Speech
tts: tts:
# TTS engine: chatterbox, coqui, piper # Provider: "venice" (cloud, no GPU) or "local" (chatterbox, requires GPU)
engine: "coqui" provider: "venice"
# Device: cuda or cpu # Venice settings (used when provider is "venice")
venice:
voice: "am_liam"
base_url: "https://api.venice.ai/api/v1"
# API key from env: VENICE_API_KEY
# Local settings (used when provider is "local")
engine: "chatterbox"
device: "cuda" device: "cuda"
# Streaming: generate and play audio in chunks # Streaming: generate and play audio in chunks

View file

@ -22,7 +22,8 @@ resampy>=0.4.2 # High-quality audio resampling
# ============================================================================ # ============================================================================
torch>=2.1.0 torch>=2.1.0
torchaudio>=2.1.0 torchaudio>=2.1.0
faster-whisper>=1.0.0 # GPU-accelerated STT faster-whisper>=1.0.0 # GPU-accelerated STT (optional, for local provider)
deepgram-sdk>=3.0.0 # Deepgram cloud STT
silero-vad>=4.0.0 # Voice activity detection silero-vad>=4.0.0 # Voice activity detection
onnxruntime>=1.16.0 # Smart Turn model inference onnxruntime>=1.16.0 # Smart Turn model inference

View file

@ -1,15 +1,23 @@
"""Speech-to-Text using faster-whisper. """Speech-to-Text using faster-whisper and Deepgram cloud API.
GPU-accelerated transcription with support for multiple model sizes. GPU-accelerated transcription with support for multiple model sizes.
Cloud transcription via Deepgram for GPU-less deployments.
""" """
import asyncio import asyncio
import io
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import List, Optional from typing import List, Optional
import httpx
import numpy as np import numpy as np
from faster_whisper import WhisperModel
try:
from faster_whisper import WhisperModel
HAS_FASTER_WHISPER = True
except ImportError:
HAS_FASTER_WHISPER = False
from utils.logging import get_logger, log_latency from utils.logging import get_logger, log_latency
@ -51,6 +59,184 @@ class TranscriptionResult:
return len(self.segments) return len(self.segments)
class DeepgramSTT:
    """
    Deepgram cloud STT engine.

    Transcribes pre-recorded audio via Deepgram's REST API.
    No GPU required: audio is encoded as 16-bit mono WAV and sent over HTTP.
    """

    def __init__(
        self,
        api_key: str,
        model: str = "nova-3",
        language: Optional[str] = None,
        sample_rate: int = 16000,
    ):
        """
        Store connection settings and reset the running statistics.

        Args:
            api_key: Deepgram API key, sent in the Authorization header.
            model: Deepgram model name passed as the "model" query parameter.
            language: Default language code; transcribe_async falls back to
                "en" when neither this nor its own argument is set.
            sample_rate: Sample rate (Hz) of the audio handed to this engine.
        """
        self.api_key = api_key
        self.model = model
        self.language = language
        self.sample_rate = sample_rate
        self.base_url = "https://api.deepgram.com/v1"

        logger.info(f"Initialized Deepgram STT (model: {model})")

        # Stats
        self.transcription_count = 0
        self.total_audio_duration = 0.0
        self.total_processing_time = 0.0

    def _audio_to_wav_bytes(self, audio: np.ndarray) -> bytes:
        """
        Convert float PCM audio to an in-memory 16-bit mono WAV file.

        Args:
            audio: Samples scaled to [-1.0, 1.0]. Arrays with more than one
                dimension are averaged over axis 1 to produce mono.

        Returns:
            Complete WAV file contents (RIFF header followed by PCM data).
        """
        # Local import, mirroring the original's function-scope import.
        import wave

        # Ensure float32 mono
        samples = np.asarray(audio, dtype=np.float32)
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        # Scale to the int16 range; clip so out-of-range input cannot wrap.
        pcm = (samples * 32767).clip(-32768, 32767).astype(np.int16)

        # The stdlib writer emits the canonical 44-byte PCM header and
        # handles byte order, replacing the hand-packed struct header.
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wav_file:
            wav_file.setnchannels(1)   # mono
            wav_file.setsampwidth(2)   # 16-bit samples
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(pcm.tobytes())
        return buf.getvalue()

    async def transcribe_async(
        self,
        audio: np.ndarray,
        language: Optional[str] = None,
        beam_size: Optional[int] = None,
        vad_filter: bool = False,
    ) -> "TranscriptionResult":
        """
        Transcribe audio via the Deepgram API.

        Args:
            audio: PCM samples; converted to float32 if needed.
            language: Language code for this call; falls back to the
                instance default, then to "en".
            beam_size: Accepted but not used by this engine.
            vad_filter: Accepted but not used by this engine.

        Returns:
            TranscriptionResult with the transcript text and one segment
            per word reported by Deepgram.

        Raises:
            httpx.HTTPStatusError: If Deepgram responds with an error status.
        """
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        lang = language or self.language or "en"

        # Nothing to transcribe: skip the network round trip entirely.
        if audio.size == 0:
            return TranscriptionResult(
                text="",
                segments=[],
                language=lang,
                duration=0.0,
            )

        # Inside a coroutine the running loop is the correct clock source.
        loop = asyncio.get_running_loop()
        start_time = loop.time()

        wav_bytes = self._audio_to_wav_bytes(audio)
        duration = len(audio) / self.sample_rate

        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                f"{self.base_url}/listen",
                content=wav_bytes,
                headers={
                    "Authorization": f"Token {self.api_key}",
                    "Content-Type": "audio/wav",
                },
                params={
                    "model": self.model,
                    "language": lang,
                    "sample_rate": self.sample_rate,
                    "smart_format": "true",
                },
            )
            response.raise_for_status()
            result = response.json()

        # Parse Deepgram response. `or` fallbacks cover both missing keys
        # and empty/null values, so an empty "channels" list cannot raise.
        channels = (result.get("results") or {}).get("channels") or [{}]
        alternatives = channels[0].get("alternatives") or []

        if alternatives:
            best = alternatives[0]
            text = (best.get("transcript") or "").strip()
            # One segment per word, carrying Deepgram's word-level timing.
            segments = [
                TranscriptSegment(
                    text=word.get("word", ""),
                    start=word.get("start", 0.0),
                    end=word.get("end", 0.0),
                    confidence=word.get("confidence", 1.0),
                )
                for word in (best.get("words") or [])
            ]
        else:
            text = ""
            segments = []

        processing_time = loop.time() - start_time

        # Update stats
        self.transcription_count += 1
        self.total_audio_duration += duration
        self.total_processing_time += processing_time

        logger.info(
            f"Deepgram transcribed {duration:.2f}s audio: "
            f'"{text[:50]}..." ({processing_time:.2f}s)'
        )

        return TranscriptionResult(
            text=text,
            segments=segments,
            language=lang,
            duration=duration,
        )

    def get_stats(self) -> dict:
        """
        Return cumulative transcription statistics.

        Averages and the real-time factor (average processing time divided
        by average audio duration) are 0.0 until audio has been processed.
        """
        avg_duration = (
            self.total_audio_duration / self.transcription_count
            if self.transcription_count > 0 else 0.0
        )
        avg_processing = (
            self.total_processing_time / self.transcription_count
            if self.transcription_count > 0 else 0.0
        )
        rtf = avg_processing / avg_duration if avg_duration > 0 else 0.0

        return {
            "model": self.model,
            "provider": "deepgram",
            "transcription_count": self.transcription_count,
            "total_audio_duration": self.total_audio_duration,
            "avg_processing_time": avg_processing,
            "real_time_factor": rtf,
        }

    def get_model_info(self) -> dict:
        """Return static information about the configured engine."""
        # NOTE(review): "auto" is reported when no language is configured,
        # but transcribe_async actually falls back to "en" - confirm which
        # value consumers of this dict expect.
        return {
            "model": self.model,
            "provider": "deepgram",
            "language": self.language or "auto",
            "loaded": True,
        }
def create_stt_engine(provider: str, **kwargs):
    """
    Build the STT engine selected by *provider*.

    Args:
        provider: "deepgram" for the cloud engine, "local" for faster-whisper.
        **kwargs: Forwarded unchanged to the chosen engine's constructor.

    Raises:
        ValueError: If *provider* is not a known provider name.
        RuntimeError: If "local" is requested but faster-whisper is missing.
    """
    # Reject unknown names up front so the happy paths read straight through.
    if provider not in ("deepgram", "local"):
        raise ValueError(f"Unknown STT provider: {provider}. Choose 'deepgram' or 'local'.")

    if provider == "deepgram":
        return DeepgramSTT(**kwargs)

    # Only "local" remains; it needs the optional faster-whisper package.
    if HAS_FASTER_WHISPER:
        return FasterWhisperSTT(**kwargs)
    raise RuntimeError("faster-whisper not installed. Install with: pip install faster-whisper")
class FasterWhisperSTT: class FasterWhisperSTT:
""" """
Faster-whisper STT engine. Faster-whisper STT engine.