feat: add Deepgram STT provider and cloud-first config
- New DeepgramSTT class using Deepgram nova-3 via REST API - Factory function create_stt_engine() for provider switching - faster-whisper import now optional (graceful fallback) - Config defaults to cloud providers (deepgram STT + venice TTS) - .env.example updated with DEEPGRAM_API_KEY and VENICE_API_KEY - requirements.txt adds deepgram-sdk, marks faster-whisper as optional - Zero GPU required for default configuration
This commit is contained in:
parent
3eea942772
commit
f0458b9b40
4 changed files with 213 additions and 16 deletions
|
|
@ -18,6 +18,10 @@ OPENCLAW_BASE_URL=ws://192.168.50.9:18789
|
||||||
OPENCLAW_AUTH_TOKEN=your_openclaw_gateway_token
|
OPENCLAW_AUTH_TOKEN=your_openclaw_gateway_token
|
||||||
OPENCLAW_AGENT_ID=main # Agent ID for session keys (jarvis or main)
|
OPENCLAW_AGENT_ID=main # Agent ID for session keys (jarvis or main)
|
||||||
|
|
||||||
|
# Cloud STT/TTS API Keys (for GPU-less deployment)
|
||||||
|
DEEPGRAM_API_KEY=your_deepgram_api_key
|
||||||
|
VENICE_API_KEY=your_venice_api_key
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# FastAPI Server
|
# FastAPI Server
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
|
||||||
32
config.yaml
32
config.yaml
|
|
@ -108,20 +108,19 @@ pipeline:
|
||||||
# Using v3.2 GPU model for best performance with RTX 5090
|
# Using v3.2 GPU model for best performance with RTX 5090
|
||||||
model_path: "smart-turn-v3.2-gpu.onnx"
|
model_path: "smart-turn-v3.2-gpu.onnx"
|
||||||
|
|
||||||
# Speech-to-Text (faster-whisper)
|
# Speech-to-Text
|
||||||
stt:
|
stt:
|
||||||
# Model size: tiny, base, small, medium, large-v3
|
# Provider: "deepgram" (cloud, no GPU) or "local" (faster-whisper, requires GPU)
|
||||||
# Using "small" for faster transcription (was "medium")
|
provider: "deepgram"
|
||||||
|
|
||||||
|
# Deepgram settings (used when provider is "deepgram")
|
||||||
|
model: "nova-3"
|
||||||
|
language: "en"
|
||||||
|
|
||||||
|
# Local faster-whisper settings (used when provider is "local")
|
||||||
model_size: "small"
|
model_size: "small"
|
||||||
|
|
||||||
# Device: cuda or cpu
|
|
||||||
device: "cuda"
|
device: "cuda"
|
||||||
|
|
||||||
# Compute type: float16, float32, int8
|
|
||||||
compute_type: "float16"
|
compute_type: "float16"
|
||||||
|
|
||||||
# Beam size for decoding (higher = more accurate, slower)
|
|
||||||
# Optimized for voice chat: beam_size=1 is 3-5x faster with minimal quality loss
|
|
||||||
beam_size: 1
|
beam_size: 1
|
||||||
|
|
||||||
# Language hint (null = auto-detect)
|
# Language hint (null = auto-detect)
|
||||||
|
|
@ -165,10 +164,17 @@ pipeline:
|
||||||
|
|
||||||
# Text-to-Speech
|
# Text-to-Speech
|
||||||
tts:
|
tts:
|
||||||
# TTS engine: chatterbox, coqui, piper
|
# Provider: "venice" (cloud, no GPU) or "local" (chatterbox, requires GPU)
|
||||||
engine: "coqui"
|
provider: "venice"
|
||||||
|
|
||||||
# Device: cuda or cpu
|
# Venice settings (used when provider is "venice")
|
||||||
|
venice:
|
||||||
|
voice: "am_liam"
|
||||||
|
base_url: "https://api.venice.ai/api/v1"
|
||||||
|
# API key from env: VENICE_API_KEY
|
||||||
|
|
||||||
|
# Local settings (used when provider is "local")
|
||||||
|
engine: "chatterbox"
|
||||||
device: "cuda"
|
device: "cuda"
|
||||||
|
|
||||||
# Streaming: generate and play audio in chunks
|
# Streaming: generate and play audio in chunks
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,8 @@ resampy>=0.4.2 # High-quality audio resampling
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
torch>=2.1.0
|
torch>=2.1.0
|
||||||
torchaudio>=2.1.0
|
torchaudio>=2.1.0
|
||||||
faster-whisper>=1.0.0 # GPU-accelerated STT
|
faster-whisper>=1.0.0 # GPU-accelerated STT (optional, for local provider)
|
||||||
|
deepgram-sdk>=3.0.0 # Deepgram cloud STT
|
||||||
silero-vad>=4.0.0 # Voice activity detection
|
silero-vad>=4.0.0 # Voice activity detection
|
||||||
onnxruntime>=1.16.0 # Smart Turn model inference
|
onnxruntime>=1.16.0 # Smart Turn model inference
|
||||||
|
|
||||||
|
|
|
||||||
188
server/stt.py
188
server/stt.py
|
|
@ -1,15 +1,23 @@
|
||||||
"""Speech-to-Text using faster-whisper.
|
"""Speech-to-Text using faster-whisper and Deepgram cloud API.
|
||||||
|
|
||||||
GPU-accelerated transcription with support for multiple model sizes.
|
GPU-accelerated transcription with support for multiple model sizes.
|
||||||
|
Cloud transcription via Deepgram for GPU-less deployments.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import io
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import httpx
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
try:
|
||||||
from faster_whisper import WhisperModel
|
from faster_whisper import WhisperModel
|
||||||
|
HAS_FASTER_WHISPER = True
|
||||||
|
except ImportError:
|
||||||
|
HAS_FASTER_WHISPER = False
|
||||||
|
|
||||||
from utils.logging import get_logger, log_latency
|
from utils.logging import get_logger, log_latency
|
||||||
|
|
||||||
|
|
@ -51,6 +59,184 @@ class TranscriptionResult:
|
||||||
return len(self.segments)
|
return len(self.segments)
|
||||||
|
|
||||||
|
|
||||||
|
class DeepgramSTT:
    """
    Deepgram cloud STT engine.

    Transcribes pre-recorded audio via Deepgram's REST API.
    No GPU required — audio is uploaded over HTTP as a 16-bit mono WAV.

    Running totals (call count, audio seconds, processing seconds) are kept
    on the instance and exposed through get_stats().
    """

    def __init__(
        self,
        api_key: str,
        model: str = "nova-3",
        language: Optional[str] = None,
        sample_rate: int = 16000,
    ):
        """
        Store connection settings and reset the usage counters.

        Args:
            api_key: Deepgram API key, sent as "Authorization: Token <key>".
            model: Deepgram model name passed as the "model" query parameter.
            language: Default language code; None falls back to "en" at
                request time.
            sample_rate: Sample rate in Hz of the audio handed to
                transcribe_async; used for the WAV header and for duration.
        """
        self.api_key = api_key
        self.model = model
        self.language = language
        self.sample_rate = sample_rate
        self.base_url = "https://api.deepgram.com/v1"

        logger.info(f"Initialized Deepgram STT (model: {model})")

        # Stats
        self.transcription_count = 0
        self.total_audio_duration = 0.0
        self.total_processing_time = 0.0

    def _audio_to_wav_bytes(self, audio: np.ndarray) -> bytes:
        """
        Convert float PCM audio to an in-memory 16-bit mono WAV file.

        Args:
            audio: Samples scaled so that +/-1.0 is full scale. Arrays with
                more than one dimension are averaged over axis 1
                (assumes a (frames, channels) layout — TODO confirm callers).

        Returns:
            Complete WAV file contents (44-byte header followed by the
            int16 sample data).
        """
        # Local import, mirroring how this helper previously pulled in struct.
        import wave

        # Ensure float32 mono
        samples = np.asarray(audio, dtype=np.float32)
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        # Scale to int16; clip so out-of-range input saturates instead of
        # wrapping around.
        pcm = (samples * 32767).clip(-32768, 32767).astype(np.int16)

        # The wave module emits the canonical PCM header and takes care of
        # byte order, replacing the hand-packed header.
        buf = io.BytesIO()
        with wave.open(buf, "wb") as wav_file:
            wav_file.setnchannels(1)
            wav_file.setsampwidth(2)  # 16-bit samples
            wav_file.setframerate(self.sample_rate)
            wav_file.writeframes(pcm.tobytes())

        return buf.getvalue()

    async def transcribe_async(
        self,
        audio: np.ndarray,
        language: Optional[str] = None,
        beam_size: Optional[int] = None,
        vad_filter: bool = False,
    ) -> "TranscriptionResult":
        """
        Transcribe audio via Deepgram API.

        Args:
            audio: Audio samples at self.sample_rate.
            language: Per-call language override; falls back to
                self.language, then "en".
            beam_size: Ignored by this engine.
            vad_filter: Ignored by this engine.
                NOTE(review): both presumably exist so the signature matches
                the local engine — confirm.

        Returns:
            TranscriptionResult with the transcript text, one segment per
            word returned by Deepgram, the language used and the audio
            duration in seconds.

        Raises:
            httpx.HTTPStatusError: If Deepgram answers with a 4xx/5xx status.
        """
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        lang = language or self.language or "en"

        # Nothing to transcribe: skip the network round-trip entirely.
        if audio.size == 0:
            return TranscriptionResult(
                text="",
                segments=[],
                language=lang,
                duration=0.0,
            )

        # We are inside a coroutine, so a running loop is guaranteed.
        loop = asyncio.get_running_loop()
        start_time = loop.time()

        wav_bytes = self._audio_to_wav_bytes(audio)
        duration = len(audio) / self.sample_rate

        # NOTE(review): a fresh client per call pays connection setup each
        # time; a shared client would need explicit lifecycle management.
        async with httpx.AsyncClient(timeout=30.0) as client:
            response = await client.post(
                f"{self.base_url}/listen",
                content=wav_bytes,
                headers={
                    "Authorization": f"Token {self.api_key}",
                    "Content-Type": "audio/wav",
                },
                params={
                    "model": self.model,
                    "language": lang,
                    "sample_rate": self.sample_rate,
                    "smart_format": "true",
                },
            )
            response.raise_for_status()
            result = response.json()

        # Parse Deepgram response. Every level may be missing, null or empty,
        # so fall back step by step instead of indexing blindly.
        channels = (result.get("results") or {}).get("channels") or [{}]
        alternatives = channels[0].get("alternatives") or []

        if not alternatives:
            text = ""
            segments = []
        else:
            alt = alternatives[0]
            text = alt.get("transcript", "").strip()
            # One segment per word, carrying Deepgram's word-level timing.
            segments = [
                TranscriptSegment(
                    text=word.get("word", ""),
                    start=word.get("start", 0.0),
                    end=word.get("end", 0.0),
                    confidence=word.get("confidence", 1.0),
                )
                for word in alt.get("words", [])
            ]

        processing_time = loop.time() - start_time

        # Update stats
        self.transcription_count += 1
        self.total_audio_duration += duration
        self.total_processing_time += processing_time

        logger.info(
            f"Deepgram transcribed {duration:.2f}s audio: "
            f'"{text[:50]}..." ({processing_time:.2f}s)'
        )

        return TranscriptionResult(
            text=text,
            segments=segments,
            language=lang,
            duration=duration,
        )

    def get_stats(self) -> dict:
        """
        Return usage statistics accumulated since construction.

        Returns:
            Dict with the model name, provider tag, number of transcriptions,
            total audio seconds, mean processing seconds per call, and the
            real-time factor (mean processing time / mean audio duration;
            0.0 when nothing has been transcribed yet).
        """
        count = self.transcription_count
        avg_duration = self.total_audio_duration / count if count > 0 else 0.0
        avg_processing = self.total_processing_time / count if count > 0 else 0.0
        rtf = avg_processing / avg_duration if avg_duration > 0 else 0.0
        return {
            "model": self.model,
            "provider": "deepgram",
            "transcription_count": count,
            "total_audio_duration": self.total_audio_duration,
            "avg_processing_time": avg_processing,
            "real_time_factor": rtf,
        }

    def get_model_info(self) -> dict:
        """
        Describe the configured engine.

        Returns:
            Dict with the model name, provider tag, configured default
            language ("auto" when none was set) and a "loaded" flag that is
            always True because there is no local model to load.
        """
        return {
            "model": self.model,
            "provider": "deepgram",
            "language": self.language or "auto",
            "loaded": True,
        }
|
||||||
|
|
||||||
|
|
||||||
|
def create_stt_engine(provider: str, **kwargs):
    """
    Factory to create STT engine by provider name.

    Args:
        provider: "deepgram" for the cloud engine or "local" for
            faster-whisper. Matching ignores case and surrounding
            whitespace, so values such as "Deepgram" are accepted.
        **kwargs: Forwarded unchanged to the selected engine's constructor.

    Returns:
        A DeepgramSTT or FasterWhisperSTT instance.

    Raises:
        RuntimeError: If "local" is requested but faster-whisper could not
            be imported.
        ValueError: If the provider is not recognised.
    """
    # Tolerate casing/whitespace differences in the configured value;
    # non-string input is left untouched and ends in the ValueError below.
    key = provider.strip().lower() if isinstance(provider, str) else provider

    if key == "deepgram":
        return DeepgramSTT(**kwargs)

    if key == "local":
        if not HAS_FASTER_WHISPER:
            raise RuntimeError("faster-whisper not installed. Install with: pip install faster-whisper")
        return FasterWhisperSTT(**kwargs)

    raise ValueError(f"Unknown STT provider: {provider}. Choose 'deepgram' or 'local'.")
|
||||||
|
|
||||||
|
|
||||||
class FasterWhisperSTT:
|
class FasterWhisperSTT:
|
||||||
"""
|
"""
|
||||||
Faster-whisper STT engine.
|
Faster-whisper STT engine.
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue