From 7d3e13a3cac7fb27f0786a4325b7f17fcdfe7553 Mon Sep 17 00:00:00 2001 From: Jezza Hehn Date: Fri, 10 Apr 2026 00:44:03 +0000 Subject: [PATCH] Wire cloud STT/TTS providers into pipeline - Add provider field to STTConfig and TTSConfig (deepgram/venice) - Add VeniceTTSConfig model for venice voice/base_url settings - Add CloudTTSSynthesizer adapter wrapping VeniceKokoroTTS - Loosen STTTranscriber type hint to accept any engine with transcribe_async - Update run.py to use create_stt_engine/create_tts_engine factories - Provider-based init: reads config.pipeline.stt.provider and .tts.provider - Fix duplicate language key in config.yaml - Remove duplicate language field from STT config Cloud-only path: VAD (local) -> Deepgram STT -> OpenClaw -> Venice TTS -> Discord --- config.yaml | 3 -- run.py | 86 +++++++++++++++++++++++++++++++------------------ server/stt.py | 14 ++------ server/tts.py | 45 ++++++++++++++++++++++++++ utils/config.py | 15 ++++++++- 5 files changed, 115 insertions(+), 48 deletions(-) diff --git a/config.yaml b/config.yaml index a51c66f..c301754 100644 --- a/config.yaml +++ b/config.yaml @@ -123,9 +123,6 @@ pipeline: compute_type: "float16" beam_size: 1 - # Language hint (null = auto-detect) - language: "en" - # VAD filter (use built-in VAD in whisper) vad_filter: false diff --git a/run.py b/run.py index e9a2ccd..ec343f7 100644 --- a/run.py +++ b/run.py @@ -84,10 +84,10 @@ async def main(): logger.info(f" Default Agent: {config.agents.default}") logger.info(f" OpenClaw Gateway: {config.openclaw.base_url}") logger.info(f" OpenClaw Agent ID: {config.openclaw.agent_id}") - logger.info(f" STT Model: {config.pipeline.stt.model_size}") - logger.info(f" STT Device: {config.pipeline.stt.device}") - logger.info(f" TTS Engine: {config.pipeline.tts.engine}") - logger.info(f" TTS Device: {config.pipeline.tts.device}") + logger.info(f" STT Provider: {config.pipeline.stt.provider}") + logger.info(f" TTS Provider: {config.pipeline.tts.provider}") + if config.pipeline.tts.provider == "venice": + logger.info(f" TTS Voice: {config.pipeline.tts.venice.voice}") logger.info(f" Server Port: {config.server.port}") logger.info(f" Latency Tracking: {config.logging.track_latency}") logger.info("") @@ -95,38 +95,60 @@ async def main(): # Initialize shared TTS and STT engines logger.info("Initializing TTS and STT engines...") - from server.stt import create_transcriber - from server.tts import create_tts_synthesizer + from server.stt import create_stt_engine, STTTranscriber + from server.tts import create_tts_engine, CloudTTSSynthesizer + import os - # Create voice references map - voice_refs = { - "jarvis": str(jarvis_voice), - "sage": str(sage_voice), - } + # --- TTS --- + tts_provider = config.pipeline.tts.provider + if tts_provider == "venice": + tts_engine = create_tts_engine("venice", { + "api_key": os.getenv("VENICE_API_KEY", ""), + "voice": config.pipeline.tts.venice.voice, + "base_url": config.pipeline.tts.venice.base_url, + }) + tts_synthesizer = CloudTTSSynthesizer(tts_engine) + logger.info(f"✓ TTS engine initialized (Venice Kokoro, voice={config.pipeline.tts.venice.voice})") + else: + # Local Chatterbox (requires GPU + voice files) + voice_refs = { + "jarvis": str(jarvis_voice), + "sage": str(sage_voice), + } + tts_engine = create_tts_engine("chatterbox", { + "device": config.pipeline.tts.device, + "sample_rate": 24000, + }) + from server.tts import TTSSynthesizer + voice_map = {a: Path(p) for a, p in voice_refs.items()} + tts_synthesizer = TTSSynthesizer(engine=tts_engine, voice_map=voice_map) + logger.info(f"✓ TTS engine initialized (Chatterbox on {config.pipeline.tts.device})") - # Initialize TTS synthesizer (shared between Discord and API) - tts_synthesizer = await create_tts_synthesizer( - voice_refs=voice_refs, - device=config.pipeline.tts.device, - sample_rate=24000, # Default sample rate for Chatterbox TTS - ) - logger.info(f"✓ TTS engine initialized ({config.pipeline.tts.device})") - - # Warmup TTS and cache common phrases - logger.info("Warming up TTS engine and caching common phrases...") + # Warmup TTS + logger.info("Warming up TTS engine...") await tts_synthesizer.warmup() - logger.info(f"✓ TTS warmup complete ({len(tts_synthesizer.phrase_cache)} phrases cached)") + logger.info("✓ TTS warmup complete") - # Initialize STT transcriber (shared between Discord and API) - stt_transcriber = await create_transcriber( - model_size=config.pipeline.stt.model_size, - device=config.pipeline.stt.device, - compute_type=config.pipeline.stt.compute_type, - ) - logger.info( - f"✓ STT engine initialized " - f"({config.pipeline.stt.model_size} on {config.pipeline.stt.device})" - ) + # --- STT --- + stt_provider = config.pipeline.stt.provider + if stt_provider == "deepgram": + stt_engine = create_stt_engine("deepgram", + api_key=os.getenv("DEEPGRAM_API_KEY", ""), + model=config.pipeline.stt.model, + language=config.pipeline.stt.language, + ) + stt_transcriber = STTTranscriber(engine=stt_engine, max_concurrent=3) + logger.info(f"✓ STT engine initialized (Deepgram {config.pipeline.stt.model})") + else: + stt_engine = create_stt_engine("local", + model_size=config.pipeline.stt.model_size, + device=config.pipeline.stt.device, + compute_type=config.pipeline.stt.compute_type, + beam_size=config.pipeline.stt.beam_size, + language=config.pipeline.stt.language, + ) + stt_transcriber = STTTranscriber(engine=stt_engine, max_concurrent=1) + logger.info(f"✓ STT engine initialized (faster-whisper {config.pipeline.stt.model_size} on {config.pipeline.stt.device})") # Initialize OpenClaw Gateway client logger.info("Initializing OpenClaw Gateway client...") diff --git a/server/stt.py b/server/stt.py index 19f07a4..337db47 100644 --- a/server/stt.py +++ b/server/stt.py @@ -481,27 +481,17 @@ class STTTranscriber: Pipeline stage for speech-to-text transcription. Handles queueing and concurrent transcription requests. + Accepts any engine with a transcribe_async() method. """ def __init__( self, - engine: FasterWhisperSTT, + engine, # FasterWhisperSTT or DeepgramSTT max_concurrent: int = 1, ): - """ - Initialize transcriber. - - Args: - engine: STT engine instance - max_concurrent: Max concurrent transcriptions (default 1 for single GPU) - """ self.engine = engine self.max_concurrent = max_concurrent - - # Semaphore for concurrency control self._semaphore = asyncio.Semaphore(max_concurrent) - - # Queue for pending requests self._queue_size = 0 async def transcribe( diff --git a/server/tts.py b/server/tts.py index 8f69c51..97a801d 100644 --- a/server/tts.py +++ b/server/tts.py @@ -915,3 +915,48 @@ def create_tts_engine(provider: str, config: dict) -> ChatterboxTTS | VeniceKoko ), voice_references={}, ) + + +class CloudTTSSynthesizer: + """ + TTS synthesizer wrapper for cloud providers (Venice Kokoro). + + Provides the same synthesize(agent, text) interface as TTSSynthesizer + but delegates to a stateless cloud engine (no voice refs, no caching). + """ + + def __init__(self, engine: VeniceKokoroTTS): + self.engine = engine + self.total_syntheses = 0 + self.total_failures = 0 + + async def synthesize( + self, + agent: str, + text: str, + emotion_exaggeration: Optional[float] = None, + ) -> Optional[np.ndarray]: + """Synthesize speech (agent param accepted for interface compat, ignored).""" + try: + audio = await self.engine.generate_async( + text=text, + voice_ref_path=None, + emotion_exaggeration=emotion_exaggeration, + ) + self.total_syntheses += 1 + return audio + except Exception as e: + logger.error(f"Cloud TTS synthesis failed: {e}") + self.total_failures += 1 + return None + + async def warmup(self) -> None: + """No warmup needed for cloud TTS.""" + pass + + def get_stats(self) -> dict: + return { + "engine": "venice-kokoro", + "total_syntheses": self.total_syntheses, + "total_failures": self.total_failures, + } diff --git a/utils/config.py b/utils/config.py index 39d9067..bd69b93 100644 --- a/utils/config.py +++ b/utils/config.py @@ -99,9 +99,20 @@ class TurnDetectionConfig(BaseModel): model_path: str = "smart_turn_v3.onnx" +class VeniceTTSConfig(BaseModel): + """Venice Kokoro TTS configuration.""" + + voice: str = "am_liam" + base_url: str = "https://api.venice.ai/api/v1" + + class STTConfig(BaseModel): """Speech-to-text configuration.""" + provider: str = "deepgram" # "deepgram" or "local" + # Deepgram settings + model: str = "nova-3" + # Local faster-whisper settings model_size: str = "medium" device: str = "cuda" compute_type: str = "float16" @@ -148,10 +159,12 @@ class CoquiTTSConfig(BaseModel): class TTSConfig(BaseModel): """Text-to-speech configuration.""" - engine: str = "coqui" + provider: str = "venice" # "venice" or "local" + engine: str = "chatterbox" device: str = "cuda" streaming: bool = True chunk_duration: float = 0.5 + venice: VeniceTTSConfig = VeniceTTSConfig() coqui: CoquiTTSConfig