Wire cloud STT/TTS providers into pipeline

- Add provider field to STTConfig and TTSConfig (e.g. deepgram for STT, venice for TTS)
- Add VeniceTTSConfig model for venice voice/base_url settings
- Add CloudTTSSynthesizer adapter wrapping VeniceKokoroTTS
- Loosen STTTranscriber type hint to accept any engine with transcribe_async
- Update run.py to use create_stt_engine/create_tts_engine factories
- Provider-based init: reads config.pipeline.stt.provider and config.pipeline.tts.provider
- Fix duplicate language key in config.yaml
- Remove duplicate language field from STT config

Cloud-only path: VAD (local) -> Deepgram STT -> OpenClaw -> Venice TTS -> Discord
This commit is contained in:
Jezza Hehn 2026-04-10 00:44:03 +00:00
parent f0458b9b40
commit 7d3e13a3ca
5 changed files with 115 additions and 48 deletions

86
run.py
View file

@ -84,10 +84,10 @@ async def main():
logger.info(f" Default Agent: {config.agents.default}")
logger.info(f" OpenClaw Gateway: {config.openclaw.base_url}")
logger.info(f" OpenClaw Agent ID: {config.openclaw.agent_id}")
logger.info(f" STT Model: {config.pipeline.stt.model_size}")
logger.info(f" STT Device: {config.pipeline.stt.device}")
logger.info(f" TTS Engine: {config.pipeline.tts.engine}")
logger.info(f" TTS Device: {config.pipeline.tts.device}")
logger.info(f" STT Provider: {config.pipeline.stt.provider}")
logger.info(f" TTS Provider: {config.pipeline.tts.provider}")
if config.pipeline.tts.provider == "venice":
logger.info(f" TTS Voice: {config.pipeline.tts.venice.voice}")
logger.info(f" Server Port: {config.server.port}")
logger.info(f" Latency Tracking: {config.logging.track_latency}")
logger.info("")
@ -95,38 +95,60 @@ async def main():
# Initialize shared TTS and STT engines
logger.info("Initializing TTS and STT engines...")
from server.stt import create_transcriber
from server.tts import create_tts_synthesizer
from server.stt import create_stt_engine, STTTranscriber
from server.tts import create_tts_engine, CloudTTSSynthesizer
import os
# Create voice references map
voice_refs = {
"jarvis": str(jarvis_voice),
"sage": str(sage_voice),
}
# --- TTS ---
tts_provider = config.pipeline.tts.provider
if tts_provider == "venice":
tts_engine = create_tts_engine("venice", {
"api_key": os.getenv("VENICE_API_KEY", ""),
"voice": config.pipeline.tts.venice.voice,
"base_url": config.pipeline.tts.venice.base_url,
})
tts_synthesizer = CloudTTSSynthesizer(tts_engine)
logger.info(f"✓ TTS engine initialized (Venice Kokoro, voice={config.pipeline.tts.venice.voice})")
else:
# Local Chatterbox (requires GPU + voice files)
voice_refs = {
"jarvis": str(jarvis_voice),
"sage": str(sage_voice),
}
tts_engine = create_tts_engine("chatterbox", {
"device": config.pipeline.tts.device,
"sample_rate": 24000,
})
from server.tts import TTSSynthesizer
voice_map = {a: Path(p) for a, p in voice_refs.items()}
tts_synthesizer = TTSSynthesizer(engine=tts_engine, voice_map=voice_map)
logger.info(f"✓ TTS engine initialized (Chatterbox on {config.pipeline.tts.device})")
# Initialize TTS synthesizer (shared between Discord and API)
tts_synthesizer = await create_tts_synthesizer(
voice_refs=voice_refs,
device=config.pipeline.tts.device,
sample_rate=24000, # Default sample rate for Chatterbox TTS
)
logger.info(f"✓ TTS engine initialized ({config.pipeline.tts.device})")
# Warmup TTS and cache common phrases
logger.info("Warming up TTS engine and caching common phrases...")
# Warmup TTS
logger.info("Warming up TTS engine...")
await tts_synthesizer.warmup()
logger.info(f"✓ TTS warmup complete ({len(tts_synthesizer.phrase_cache)} phrases cached)")
logger.info("✓ TTS warmup complete")
# Initialize STT transcriber (shared between Discord and API)
stt_transcriber = await create_transcriber(
model_size=config.pipeline.stt.model_size,
device=config.pipeline.stt.device,
compute_type=config.pipeline.stt.compute_type,
)
logger.info(
f"✓ STT engine initialized "
f"({config.pipeline.stt.model_size} on {config.pipeline.stt.device})"
)
# --- STT ---
stt_provider = config.pipeline.stt.provider
if stt_provider == "deepgram":
stt_engine = create_stt_engine("deepgram",
api_key=os.getenv("DEEPGRAM_API_KEY", ""),
model=config.pipeline.stt.model,
language=config.pipeline.stt.language,
)
stt_transcriber = STTTranscriber(engine=stt_engine, max_concurrent=3)
logger.info(f"✓ STT engine initialized (Deepgram {config.pipeline.stt.model})")
else:
stt_engine = create_stt_engine("local",
model_size=config.pipeline.stt.model_size,
device=config.pipeline.stt.device,
compute_type=config.pipeline.stt.compute_type,
beam_size=config.pipeline.stt.beam_size,
language=config.pipeline.stt.language,
)
stt_transcriber = STTTranscriber(engine=stt_engine, max_concurrent=1)
logger.info(f"✓ STT engine initialized (faster-whisper {config.pipeline.stt.model_size} on {config.pipeline.stt.device})")
# Initialize OpenClaw Gateway client
logger.info("Initializing OpenClaw Gateway client...")