Wire cloud STT/TTS providers into pipeline
- Add provider field to STTConfig and TTSConfig (deepgram/venice)
- Add VeniceTTSConfig model for venice voice/base_url settings
- Add CloudTTSSynthesizer adapter wrapping VeniceKokoroTTS
- Loosen STTTranscriber type hint to accept any engine with transcribe_async
- Update run.py to use create_stt_engine/create_tts_engine factories
- Provider-based init: reads config.pipeline.stt.provider and .tts.provider
- Fix duplicate language key in config.yaml
- Remove duplicate language field from STT config

Cloud-only path: VAD (local) -> Deepgram STT -> OpenClaw -> Venice TTS -> Discord
This commit is contained in:
parent
f0458b9b40
commit
7d3e13a3ca
5 changed files with 115 additions and 48 deletions
|
|
@ -123,9 +123,6 @@ pipeline:
|
|||
compute_type: "float16"
|
||||
beam_size: 1
|
||||
|
||||
# Language hint (null = auto-detect)
|
||||
language: "en"
|
||||
|
||||
# VAD filter (use built-in VAD in whisper)
|
||||
vad_filter: false
|
||||
|
||||
|
|
|
|||
70
run.py
70
run.py
|
|
@ -84,10 +84,10 @@ async def main():
|
|||
logger.info(f" Default Agent: {config.agents.default}")
|
||||
logger.info(f" OpenClaw Gateway: {config.openclaw.base_url}")
|
||||
logger.info(f" OpenClaw Agent ID: {config.openclaw.agent_id}")
|
||||
logger.info(f" STT Model: {config.pipeline.stt.model_size}")
|
||||
logger.info(f" STT Device: {config.pipeline.stt.device}")
|
||||
logger.info(f" TTS Engine: {config.pipeline.tts.engine}")
|
||||
logger.info(f" TTS Device: {config.pipeline.tts.device}")
|
||||
logger.info(f" STT Provider: {config.pipeline.stt.provider}")
|
||||
logger.info(f" TTS Provider: {config.pipeline.tts.provider}")
|
||||
if config.pipeline.tts.provider == "venice":
|
||||
logger.info(f" TTS Voice: {config.pipeline.tts.venice.voice}")
|
||||
logger.info(f" Server Port: {config.server.port}")
|
||||
logger.info(f" Latency Tracking: {config.logging.track_latency}")
|
||||
logger.info("")
|
||||
|
|
@ -95,38 +95,60 @@ async def main():
|
|||
# Initialize shared TTS and STT engines
|
||||
logger.info("Initializing TTS and STT engines...")
|
||||
|
||||
from server.stt import create_transcriber
|
||||
from server.tts import create_tts_synthesizer
|
||||
from server.stt import create_stt_engine, STTTranscriber
|
||||
from server.tts import create_tts_engine, CloudTTSSynthesizer
|
||||
import os
|
||||
|
||||
# Create voice references map
|
||||
# --- TTS ---
|
||||
tts_provider = config.pipeline.tts.provider
|
||||
if tts_provider == "venice":
|
||||
tts_engine = create_tts_engine("venice", {
|
||||
"api_key": os.getenv("VENICE_API_KEY", ""),
|
||||
"voice": config.pipeline.tts.venice.voice,
|
||||
"base_url": config.pipeline.tts.venice.base_url,
|
||||
})
|
||||
tts_synthesizer = CloudTTSSynthesizer(tts_engine)
|
||||
logger.info(f"✓ TTS engine initialized (Venice Kokoro, voice={config.pipeline.tts.venice.voice})")
|
||||
else:
|
||||
# Local Chatterbox (requires GPU + voice files)
|
||||
voice_refs = {
|
||||
"jarvis": str(jarvis_voice),
|
||||
"sage": str(sage_voice),
|
||||
}
|
||||
tts_engine = create_tts_engine("chatterbox", {
|
||||
"device": config.pipeline.tts.device,
|
||||
"sample_rate": 24000,
|
||||
})
|
||||
from server.tts import TTSSynthesizer
|
||||
voice_map = {a: Path(p) for a, p in voice_refs.items()}
|
||||
tts_synthesizer = TTSSynthesizer(engine=tts_engine, voice_map=voice_map)
|
||||
logger.info(f"✓ TTS engine initialized (Chatterbox on {config.pipeline.tts.device})")
|
||||
|
||||
# Initialize TTS synthesizer (shared between Discord and API)
|
||||
tts_synthesizer = await create_tts_synthesizer(
|
||||
voice_refs=voice_refs,
|
||||
device=config.pipeline.tts.device,
|
||||
sample_rate=24000, # Default sample rate for Chatterbox TTS
|
||||
)
|
||||
logger.info(f"✓ TTS engine initialized ({config.pipeline.tts.device})")
|
||||
|
||||
# Warmup TTS and cache common phrases
|
||||
logger.info("Warming up TTS engine and caching common phrases...")
|
||||
# Warmup TTS
|
||||
logger.info("Warming up TTS engine...")
|
||||
await tts_synthesizer.warmup()
|
||||
logger.info(f"✓ TTS warmup complete ({len(tts_synthesizer.phrase_cache)} phrases cached)")
|
||||
logger.info("✓ TTS warmup complete")
|
||||
|
||||
# Initialize STT transcriber (shared between Discord and API)
|
||||
stt_transcriber = await create_transcriber(
|
||||
# --- STT ---
|
||||
stt_provider = config.pipeline.stt.provider
|
||||
if stt_provider == "deepgram":
|
||||
stt_engine = create_stt_engine("deepgram",
|
||||
api_key=os.getenv("DEEPGRAM_API_KEY", ""),
|
||||
model=config.pipeline.stt.model,
|
||||
language=config.pipeline.stt.language,
|
||||
)
|
||||
stt_transcriber = STTTranscriber(engine=stt_engine, max_concurrent=3)
|
||||
logger.info(f"✓ STT engine initialized (Deepgram {config.pipeline.stt.model})")
|
||||
else:
|
||||
stt_engine = create_stt_engine("local",
|
||||
model_size=config.pipeline.stt.model_size,
|
||||
device=config.pipeline.stt.device,
|
||||
compute_type=config.pipeline.stt.compute_type,
|
||||
beam_size=config.pipeline.stt.beam_size,
|
||||
language=config.pipeline.stt.language,
|
||||
)
|
||||
logger.info(
|
||||
f"✓ STT engine initialized "
|
||||
f"({config.pipeline.stt.model_size} on {config.pipeline.stt.device})"
|
||||
)
|
||||
stt_transcriber = STTTranscriber(engine=stt_engine, max_concurrent=1)
|
||||
logger.info(f"✓ STT engine initialized (faster-whisper {config.pipeline.stt.model_size} on {config.pipeline.stt.device})")
|
||||
|
||||
# Initialize OpenClaw Gateway client
|
||||
logger.info("Initializing OpenClaw Gateway client...")
|
||||
|
|
|
|||
|
|
@ -481,27 +481,17 @@ class STTTranscriber:
|
|||
Pipeline stage for speech-to-text transcription.
|
||||
|
||||
Handles queueing and concurrent transcription requests.
|
||||
Accepts any engine with a transcribe_async() method.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
engine: FasterWhisperSTT,
|
||||
engine, # FasterWhisperSTT or DeepgramSTT
|
||||
max_concurrent: int = 1,
|
||||
):
|
||||
"""
|
||||
Initialize transcriber.
|
||||
|
||||
Args:
|
||||
engine: STT engine instance
|
||||
max_concurrent: Max concurrent transcriptions (default 1 for single GPU)
|
||||
"""
|
||||
self.engine = engine
|
||||
self.max_concurrent = max_concurrent
|
||||
|
||||
# Semaphore for concurrency control
|
||||
self._semaphore = asyncio.Semaphore(max_concurrent)
|
||||
|
||||
# Queue for pending requests
|
||||
self._queue_size = 0
|
||||
|
||||
async def transcribe(
|
||||
|
|
|
|||
|
|
@ -915,3 +915,48 @@ def create_tts_engine(provider: str, config: dict) -> ChatterboxTTS | VeniceKoko
|
|||
),
|
||||
voice_references={},
|
||||
)
|
||||
|
||||
|
||||
class CloudTTSSynthesizer:
    """
    TTS synthesizer wrapper for cloud providers (Venice Kokoro).

    Provides the same synthesize(agent, text) interface as TTSSynthesizer
    but delegates to a stateless cloud engine (no voice refs, no caching).

    Attributes:
        engine: Cloud engine exposing an awaitable ``generate_async``.
        total_syntheses: Number of engine calls that returned without raising.
        total_failures: Number of engine calls that raised.
    """

    # Forward reference: the annotation is never evaluated at definition time.
    def __init__(self, engine: "VeniceKokoroTTS"):
        """
        Wrap a cloud TTS engine and reset the counters.

        Args:
            engine: Engine whose ``generate_async`` performs the synthesis.
        """
        self.engine = engine
        self.total_syntheses = 0
        self.total_failures = 0

    async def synthesize(
        self,
        agent: str,
        text: str,
        emotion_exaggeration: Optional[float] = None,
    ) -> Optional[np.ndarray]:
        """
        Synthesize speech for ``text`` through the cloud engine.

        Args:
            agent: Accepted for interface compatibility with TTSSynthesizer;
                ignored.
            text: Text to speak.
            emotion_exaggeration: Forwarded to the engine unchanged.

        Returns:
            Whatever the engine returns on success. None when ``text`` is
            empty or whitespace-only, or when the engine raises.
        """
        if not text or not text.strip():
            # Nothing to speak: skip the network round-trip instead of
            # sending an empty request. Not counted as a synthesis or failure.
            return None

        # Only the engine call is guarded; bookkeeping stays outside the try.
        try:
            audio = await self.engine.generate_async(
                text=text,
                voice_ref_path=None,  # cloud engine takes no voice reference
                emotion_exaggeration=emotion_exaggeration,
            )
        except Exception as e:
            logger.error(f"Cloud TTS synthesis failed: {e}")
            self.total_failures += 1
            return None

        self.total_syntheses += 1
        return audio

    async def warmup(self) -> None:
        """No warmup needed for cloud TTS."""

    def get_stats(self) -> dict:
        """Return the engine label and the success/failure counters."""
        return {
            "engine": "venice-kokoro",
            "total_syntheses": self.total_syntheses,
            "total_failures": self.total_failures,
        }
|
||||
|
|
|
|||
|
|
@ -99,9 +99,20 @@ class TurnDetectionConfig(BaseModel):
|
|||
model_path: str = "smart_turn_v3.onnx"
|
||||
|
||||
|
||||
class VeniceTTSConfig(BaseModel):
    """Settings for the Venice Kokoro cloud TTS provider."""

    # Voice identifier passed to the Venice engine as "voice".
    voice: str = "am_liam"
    # API root passed to the Venice engine as "base_url".
    base_url: str = "https://api.venice.ai/api/v1"
|
||||
|
||||
|
||||
class STTConfig(BaseModel):
|
||||
"""Speech-to-text configuration."""
|
||||
|
||||
provider: str = "deepgram" # "deepgram" or "local"
|
||||
# Deepgram settings
|
||||
model: str = "nova-3"
|
||||
# Local faster-whisper settings
|
||||
model_size: str = "medium"
|
||||
device: str = "cuda"
|
||||
compute_type: str = "float16"
|
||||
|
|
@ -148,10 +159,12 @@ class CoquiTTSConfig(BaseModel):
|
|||
class TTSConfig(BaseModel):
    """Text-to-speech configuration."""

    # Which backend run.py builds: "venice" selects the cloud path,
    # anything else falls through to the local Chatterbox path.
    provider: str = "venice"  # "venice" or "local"
    # Declared once; the stale "coqui" default was dead (later binding won).
    engine: str = "chatterbox"
    # Device handed to the local engine.
    device: str = "cuda"
    streaming: bool = True
    # NOTE(review): presumably seconds per streamed chunk; confirm with consumer.
    chunk_duration: float = 0.5
    # Venice settings; defaults apply when the section is omitted.
    venice: VeniceTTSConfig = VeniceTTSConfig()
    # No default, so this section is required.
    coqui: CoquiTTSConfig
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue