Wire cloud STT/TTS providers into pipeline

- Add provider field to STTConfig ("deepgram" or "local") and TTSConfig ("venice" or "local")
- Add VeniceTTSConfig model for venice voice/base_url settings
- Add CloudTTSSynthesizer adapter wrapping VeniceKokoroTTS
- Loosen STTTranscriber type hint to accept any engine with transcribe_async
- Update run.py to use create_stt_engine/create_tts_engine factories
- Provider-based init: reads config.pipeline.stt.provider and .tts.provider
- Fix duplicate language key in config.yaml
- Remove duplicate language field from STT config

Cloud-only path: VAD (local) -> Deepgram STT -> OpenClaw -> Venice TTS -> Discord
This commit is contained in:
Jezza Hehn 2026-04-10 00:44:03 +00:00
parent f0458b9b40
commit 7d3e13a3ca
5 changed files with 115 additions and 48 deletions

View file

@ -123,9 +123,6 @@ pipeline:
compute_type: "float16"
beam_size: 1
# Language hint (null = auto-detect)
language: "en"
# VAD filter (use built-in VAD in whisper)
vad_filter: false

70
run.py
View file

@ -84,10 +84,10 @@ async def main():
logger.info(f" Default Agent: {config.agents.default}")
logger.info(f" OpenClaw Gateway: {config.openclaw.base_url}")
logger.info(f" OpenClaw Agent ID: {config.openclaw.agent_id}")
logger.info(f" STT Model: {config.pipeline.stt.model_size}")
logger.info(f" STT Device: {config.pipeline.stt.device}")
logger.info(f" TTS Engine: {config.pipeline.tts.engine}")
logger.info(f" TTS Device: {config.pipeline.tts.device}")
logger.info(f" STT Provider: {config.pipeline.stt.provider}")
logger.info(f" TTS Provider: {config.pipeline.tts.provider}")
if config.pipeline.tts.provider == "venice":
logger.info(f" TTS Voice: {config.pipeline.tts.venice.voice}")
logger.info(f" Server Port: {config.server.port}")
logger.info(f" Latency Tracking: {config.logging.track_latency}")
logger.info("")
@ -95,38 +95,60 @@ async def main():
# Initialize shared TTS and STT engines
logger.info("Initializing TTS and STT engines...")
from server.stt import create_transcriber
from server.tts import create_tts_synthesizer
from server.stt import create_stt_engine, STTTranscriber
from server.tts import create_tts_engine, CloudTTSSynthesizer
import os
# Create voice references map
# --- TTS ---
tts_provider = config.pipeline.tts.provider
if tts_provider == "venice":
tts_engine = create_tts_engine("venice", {
"api_key": os.getenv("VENICE_API_KEY", ""),
"voice": config.pipeline.tts.venice.voice,
"base_url": config.pipeline.tts.venice.base_url,
})
tts_synthesizer = CloudTTSSynthesizer(tts_engine)
logger.info(f"✓ TTS engine initialized (Venice Kokoro, voice={config.pipeline.tts.venice.voice})")
else:
# Local Chatterbox (requires GPU + voice files)
voice_refs = {
"jarvis": str(jarvis_voice),
"sage": str(sage_voice),
}
tts_engine = create_tts_engine("chatterbox", {
"device": config.pipeline.tts.device,
"sample_rate": 24000,
})
from server.tts import TTSSynthesizer
voice_map = {a: Path(p) for a, p in voice_refs.items()}
tts_synthesizer = TTSSynthesizer(engine=tts_engine, voice_map=voice_map)
logger.info(f"✓ TTS engine initialized (Chatterbox on {config.pipeline.tts.device})")
# Initialize TTS synthesizer (shared between Discord and API)
tts_synthesizer = await create_tts_synthesizer(
voice_refs=voice_refs,
device=config.pipeline.tts.device,
sample_rate=24000, # Default sample rate for Chatterbox TTS
)
logger.info(f"✓ TTS engine initialized ({config.pipeline.tts.device})")
# Warmup TTS and cache common phrases
logger.info("Warming up TTS engine and caching common phrases...")
# Warmup TTS
logger.info("Warming up TTS engine...")
await tts_synthesizer.warmup()
logger.info(f"✓ TTS warmup complete ({len(tts_synthesizer.phrase_cache)} phrases cached)")
logger.info("✓ TTS warmup complete")
# Initialize STT transcriber (shared between Discord and API)
stt_transcriber = await create_transcriber(
# --- STT ---
stt_provider = config.pipeline.stt.provider
if stt_provider == "deepgram":
stt_engine = create_stt_engine("deepgram",
api_key=os.getenv("DEEPGRAM_API_KEY", ""),
model=config.pipeline.stt.model,
language=config.pipeline.stt.language,
)
stt_transcriber = STTTranscriber(engine=stt_engine, max_concurrent=3)
logger.info(f"✓ STT engine initialized (Deepgram {config.pipeline.stt.model})")
else:
stt_engine = create_stt_engine("local",
model_size=config.pipeline.stt.model_size,
device=config.pipeline.stt.device,
compute_type=config.pipeline.stt.compute_type,
beam_size=config.pipeline.stt.beam_size,
language=config.pipeline.stt.language,
)
logger.info(
f"✓ STT engine initialized "
f"({config.pipeline.stt.model_size} on {config.pipeline.stt.device})"
)
stt_transcriber = STTTranscriber(engine=stt_engine, max_concurrent=1)
logger.info(f"✓ STT engine initialized (faster-whisper {config.pipeline.stt.model_size} on {config.pipeline.stt.device})")
# Initialize OpenClaw Gateway client
logger.info("Initializing OpenClaw Gateway client...")

View file

@ -481,27 +481,17 @@ class STTTranscriber:
Pipeline stage for speech-to-text transcription.
Handles queueing and concurrent transcription requests.
Accepts any engine with a transcribe_async() method.
"""
def __init__(
self,
engine: FasterWhisperSTT,
engine, # FasterWhisperSTT or DeepgramSTT
max_concurrent: int = 1,
):
"""
Initialize transcriber.
Args:
engine: STT engine instance
max_concurrent: Max concurrent transcriptions (default 1 for single GPU)
"""
self.engine = engine
self.max_concurrent = max_concurrent
# Semaphore for concurrency control
self._semaphore = asyncio.Semaphore(max_concurrent)
# Queue for pending requests
self._queue_size = 0
async def transcribe(

View file

@ -915,3 +915,48 @@ def create_tts_engine(provider: str, config: dict) -> ChatterboxTTS | VeniceKoko
),
voice_references={},
)
class CloudTTSSynthesizer:
    """
    Adapter that exposes a cloud TTS engine (Venice Kokoro) as a synthesizer.

    Offers the same ``synthesize(agent, text)`` entry point as TTSSynthesizer,
    but simply forwards the text to the wrapped engine: no per-agent voice
    reference is supplied and nothing is cached here. Success and failure
    counts are tracked for ``get_stats()``.
    """

    def __init__(self, engine: VeniceKokoroTTS):
        # Counters start at zero and are only ever incremented by synthesize().
        self.total_failures = 0
        self.total_syntheses = 0
        self.engine = engine

    async def synthesize(
        self,
        agent: str,
        text: str,
        emotion_exaggeration: Optional[float] = None,
    ) -> Optional[np.ndarray]:
        """
        Generate audio for ``text`` through the wrapped engine.

        Args:
            agent: Accepted only for interface compatibility; not used.
            text: Text handed to the engine's ``generate_async``.
            emotion_exaggeration: Forwarded unchanged to the engine.

        Returns:
            Whatever the engine returns on success, or None if the engine
            raised (the error is logged and counted as a failure).
        """
        try:
            # voice_ref_path is always None: this wrapper holds no voice references.
            waveform = await self.engine.generate_async(
                text=text,
                voice_ref_path=None,
                emotion_exaggeration=emotion_exaggeration,
            )
        except Exception as e:
            logger.error(f"Cloud TTS synthesis failed: {e}")
            self.total_failures += 1
            return None
        else:
            self.total_syntheses += 1
            return waveform

    async def warmup(self) -> None:
        """Do nothing; kept so callers can await warmup() on any synthesizer."""
        return None

    def get_stats(self) -> dict:
        """Return the engine label together with the success/failure counters."""
        stats = {"engine": "venice-kokoro"}
        stats["total_syntheses"] = self.total_syntheses
        stats["total_failures"] = self.total_failures
        return stats

View file

@ -99,9 +99,20 @@ class TurnDetectionConfig(BaseModel):
model_path: str = "smart_turn_v3.onnx"
class VeniceTTSConfig(BaseModel):
    """Settings for the Venice Kokoro text-to-speech provider."""

    # Voice name used when none is configured.
    voice: str = "am_liam"

    # API base URL used when none is configured.
    base_url: str = "https://api.venice.ai/api/v1"
class STTConfig(BaseModel):
"""Speech-to-text configuration."""
provider: str = "deepgram" # "deepgram" or "local"
# Deepgram settings
model: str = "nova-3"
# Local faster-whisper settings
model_size: str = "medium"
device: str = "cuda"
compute_type: str = "float16"
@ -148,10 +159,12 @@ class CoquiTTSConfig(BaseModel):
class TTSConfig(BaseModel):
"""Text-to-speech configuration."""
engine: str = "coqui"
provider: str = "venice" # "venice" or "local"
engine: str = "chatterbox"
device: str = "cuda"
streaming: bool = True
chunk_duration: float = 0.5
venice: VeniceTTSConfig = VeniceTTSConfig()
coqui: CoquiTTSConfig