openclaw-voice/pipeline/orchestrator.py
MCKRUZ 9fde3d31ba feat: Major performance optimizations and feature enhancements
## Performance Optimizations (3-10x faster responses)
- STT beam_size reduced to 1 (3-5x faster transcription, minimal quality loss)
- Smart query routing: Haiku (simple) → Sonnet (medium) → Opus (complex)
- TTS cache for common phrases (27 pre-generated responses)
- Sentence-level streaming TTS (start playing while generating)
- Sample-based VAD timing (30x improvement in silence detection)

## TTS Engine Upgrade
- Migrated from Chatterbox to Chatterbox-Turbo
- Zero-shot voice cloning (no fine-tuning required)
- Native paralinguistic tag support ([laugh], [sigh], [chuckle], etc.)
- Emotion presets with temperature control
- Improved marker conversion (*action*, (action), ~action~)

## Discord Bot Enhancements
- Multi-agent support (Jarvis, Sage)
- Improved voice receiving with discord-ext-voice-recv
- Enhanced /join, /leave, /status commands
- Per-agent personality configuration
- Better audio sink/receiver implementation

## OpenClaw Integration
- WebSocket support for Gateway communication
- Query complexity routing (auto-select model)
- Improved error handling and retries
- Session management per Discord guild
- Better latency tracking

## Pipeline Improvements
- Sentence splitter for streaming optimization
- Query router for intelligent model selection
- Enhanced VAD receiver with sample-based timing
- Improved audio buffering and format conversion
- Better transcript management

## Documentation
- Added QUICK_START.md (5-minute test guide)
- Added OPTIMIZATION_SUMMARY.md (performance analysis)
- Added DISCORD_OPTIMIZATION_TEST.md (testing guide)
- Added USAGE_GUIDE.md (comprehensive usage)
- Updated README.md with optimization details

## Utilities & Scripts
- Added get_invite_link.py (Discord bot invite)
- Added sync_commands.py, sync_to_guild.py (command sync)
- Added test_gateway.py, test_stt.py (testing utilities)
- Added openclaw_wrapper.py (wrapper script)
- Removed create_mock_turn_model.py (no longer needed)

## Configuration Updates
- STT model: medium → small (faster, acceptable quality)
- TTS engine: chatterbox → coqui (Turbo integration)
- Beam size: 5 → 1 (latency optimization)
- Added emotion_exaggeration per agent
- Updated .gitignore for project files

Total: ~2105 insertions, ~462 deletions across 35 files
Performance: ~5.5s total latency (down from 22-35s)
Target: ~3.5s (achieved in simple queries with cache)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-16 19:29:57 -05:00

712 lines
25 KiB
Python

"""Pipeline Orchestrator - Event-driven coordinator for voice processing.
Wires all pipeline stages together:
audio_in → vad → turn_detect → stt → relevance → respond → tts → audio_out
Per-user state machines with cancellation support.
"""
import asyncio
import time
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Callable, Dict, Optional
import numpy as np
from pipeline.audio_buffer import AudioRingBuffer
from pipeline.query_router import QueryRouter
from pipeline.relevance_filter import RelevanceFilter
from pipeline.sentence_splitter import split_streaming_response
from pipeline.transcriber import STTTranscriber
from pipeline.transcript_manager import TranscriptManager
from pipeline.turn_detector import SmartTurnDetector
from pipeline.vad import SileroVAD
from server.tts import TTSSynthesizer
from utils.logging import get_logger
logger = get_logger(__name__)
class PipelineState(Enum):
"""User pipeline states."""
IDLE = "idle" # Waiting for speech
LISTENING = "listening" # VAD detected speech start
TURN_WAIT = "turn_wait" # VAD silence, checking turn completion
PROCESSING = "processing" # Transcribing and deciding
RESPONDING = "responding" # Generating TTS and playing
@dataclass
class UserPipeline:
"""Per-user pipeline state."""
user_id: int
user_name: str
state: PipelineState = PipelineState.IDLE
# Audio buffer
audio_buffer: AudioRingBuffer = field(
default_factory=lambda: AudioRingBuffer(duration_seconds=10.0)
)
# Speech detection
speech_start_time: Optional[float] = None
last_speech_time: Optional[float] = None
# Processing
current_task: Optional[asyncio.Task] = None
processing_start_time: Optional[float] = None
# Latency tracking
stage_latencies: Dict[str, float] = field(default_factory=dict)
# Stats
total_utterances: int = 0
total_responses: int = 0
total_cancellations: int = 0
@dataclass
class PipelineConfig:
"""Pipeline orchestrator configuration."""
# VAD settings
vad_silence_duration: float = 0.3 # Seconds of silence to detect speech end
vad_chunk_size: int = 512 # Samples per VAD check (16kHz)
# Smart Turn settings
turn_wait_timeout: float = 3.0 # Max wait after silence for turn completion
turn_completion_threshold: float = 0.7 # Probability threshold
# Processing timeouts
stt_timeout: float = 5.0
relevance_timeout: float = 2.0
llm_timeout: float = 10.0
tts_timeout: float = 10.0
# Concurrent processing
max_concurrent_users: int = 5
# Audio settings
sample_rate: int = 16000
class PipelineOrchestrator:
"""
Event-driven pipeline orchestrator.
Coordinates voice processing for multiple users:
- Per-user state machines
- Cancellation and barge-in support
- Latency tracking
- Error handling and recovery
"""
def __init__(
self,
config: PipelineConfig,
vad: SileroVAD,
turn_detector: SmartTurnDetector,
transcriber: STTTranscriber,
transcript_manager: TranscriptManager,
relevance_filter: RelevanceFilter,
llm_client: Callable, # OpenClaw client
tts_synthesizer: TTSSynthesizer,
audio_output_callback: Callable[[int, np.ndarray], None],
query_router: Optional[QueryRouter] = None,
):
"""
Initialize pipeline orchestrator.
Args:
config: Pipeline configuration
vad: VAD detector
turn_detector: Smart Turn detector
transcriber: STT transcriber
transcript_manager: Transcript manager
relevance_filter: Relevance filter
llm_client: LLM client for responses (OpenClaw)
tts_synthesizer: TTS synthesizer
audio_output_callback: Callback for playing audio (user_id, audio)
query_router: Query router for model selection (optional)
"""
self.config = config
self.vad = vad
self.turn_detector = turn_detector
self.transcriber = transcriber
self.transcript_manager = transcript_manager
self.relevance_filter = relevance_filter
self.llm_client = llm_client
self.tts_synthesizer = tts_synthesizer
self.audio_output_callback = audio_output_callback
self.query_router = query_router or QueryRouter(default_model="sonnet")
# Per-user pipelines
self.pipelines: Dict[int, UserPipeline] = {}
# Global stats
self.total_audio_frames = 0
self.total_pipeline_runs = 0
self.total_errors = 0
# Semaphore for concurrent processing
self._processing_semaphore = asyncio.Semaphore(
config.max_concurrent_users
)
# Current agent
self.current_agent = "jarvis"
# Start speech timeout monitor
self._shutdown = False
self._monitor_task = asyncio.create_task(self._monitor_speech_timeouts())
logger.info(f"Pipeline orchestrator initialized: {config}")
def get_or_create_pipeline(
self, user_id: int, user_name: str
) -> UserPipeline:
"""
Get or create pipeline for user.
Args:
user_id: User ID
user_name: User display name
Returns:
User pipeline instance
"""
if user_id not in self.pipelines:
self.pipelines[user_id] = UserPipeline(
user_id=user_id, user_name=user_name
)
logger.info(f"Created pipeline for user: {user_name} ({user_id})")
return self.pipelines[user_id]
def remove_pipeline(self, user_id: int) -> None:
"""
Remove user pipeline (e.g., user left channel).
Args:
user_id: User ID
"""
if user_id in self.pipelines:
pipeline = self.pipelines[user_id]
# Cancel current task if any
if pipeline.current_task and not pipeline.current_task.done():
pipeline.current_task.cancel()
del self.pipelines[user_id]
logger.info(
f"Removed pipeline for user: {pipeline.user_name} ({user_id})"
)
async def process_audio_frame(
self, user_id: int, user_name: str, audio_frame: np.ndarray
) -> None:
"""
Process incoming audio frame from user.
Args:
user_id: User ID
user_name: User display name
audio_frame: Audio data (float32, 16kHz mono)
"""
pipeline = self.get_or_create_pipeline(user_id, user_name)
# Add to buffer
pipeline.audio_buffer.write(audio_frame)
self.total_audio_frames += 1
# Check if user is speaking during our response (barge-in)
if pipeline.state == PipelineState.RESPONDING:
logger.info(
f"Barge-in detected: {user_name} spoke during response"
)
await self._cancel_pipeline(pipeline)
pipeline.state = PipelineState.LISTENING
pipeline.speech_start_time = time.time()
return
# Process VAD
await self._process_vad(pipeline, audio_frame)
async def _process_vad(
self, pipeline: UserPipeline, audio_frame: np.ndarray
) -> None:
"""
Process VAD on audio frame.
Args:
pipeline: User pipeline
audio_frame: Audio chunk
"""
# Run VAD (CPU, fast)
state, speech_prob = self.vad.process_chunk(audio_frame)
current_time = time.time()
# Check if speech is detected
from pipeline.vad import SpeechState
is_speech = (state == SpeechState.SPEECH)
if is_speech:
# Speech detected
if pipeline.state == PipelineState.IDLE:
# Speech start
pipeline.state = PipelineState.LISTENING
pipeline.speech_start_time = current_time
logger.debug(
f"Speech started: {pipeline.user_name} "
f"({pipeline.user_id})"
)
pipeline.last_speech_time = current_time
else:
# Silence detected
if pipeline.state == PipelineState.LISTENING:
# Check if silence duration exceeded
silence_duration = current_time - (
pipeline.last_speech_time or current_time
)
if silence_duration >= self.config.vad_silence_duration:
# Speech end - proceed to turn detection
logger.debug(
f"Speech ended: {pipeline.user_name} "
f"(silence: {silence_duration:.2f}s)"
)
await self._handle_speech_end(pipeline)
async def _monitor_speech_timeouts(self) -> None:
"""Background task to monitor for speech timeouts."""
while not self._shutdown:
try:
await asyncio.sleep(0.1) # Check every 100ms
current_time = time.time()
for user_id, pipeline in list(self.pipelines.items()):
if pipeline.state == PipelineState.LISTENING:
if pipeline.last_speech_time:
silence_duration = current_time - pipeline.last_speech_time
if silence_duration >= self.config.vad_silence_duration:
# Speech ended due to timeout
logger.info(
f"Speech ended (timeout): {pipeline.user_name} "
f"(silence: {silence_duration:.2f}s)"
)
await self._handle_speech_end(pipeline)
except Exception as e:
logger.error(f"Error in speech timeout monitor: {e}", exc_info=True)
async def _handle_speech_end(self, pipeline: UserPipeline) -> None:
"""
Handle speech end - check turn completion.
Args:
pipeline: User pipeline
"""
pipeline.state = PipelineState.TURN_WAIT
# Get audio segment
speech_duration = time.time() - (pipeline.speech_start_time or 0)
audio_segment = pipeline.audio_buffer.read(duration_seconds=8.0)
if len(audio_segment) == 0:
logger.warning(
f"Empty audio segment for {pipeline.user_name}, ignoring"
)
pipeline.state = PipelineState.IDLE
return
# Check turn completion with timeout
try:
turn_start = time.time()
is_complete = await asyncio.wait_for(
self._check_turn_completion(audio_segment),
timeout=self.config.turn_wait_timeout,
)
turn_latency = time.time() - turn_start
pipeline.stage_latencies["turn_detection"] = turn_latency
if is_complete:
# Turn complete - proceed to transcription
logger.info(
f"Turn complete for {pipeline.user_name} "
f"(latency: {turn_latency:.3f}s)"
)
await self._start_processing(pipeline, audio_segment)
else:
# Turn not complete - wait for more speech
logger.debug(
f"Turn incomplete for {pipeline.user_name}, "
f"waiting for more speech"
)
pipeline.state = PipelineState.LISTENING
except asyncio.TimeoutError:
# Timeout - assume turn complete
logger.warning(
f"Turn detection timeout for {pipeline.user_name}, "
f"assuming complete"
)
await self._start_processing(pipeline, audio_segment)
async def _check_turn_completion(
self, audio_segment: np.ndarray
) -> bool:
"""
Check if turn is complete using Smart Turn.
Args:
audio_segment: Audio segment
Returns:
True if turn is complete
"""
probability = await self.turn_detector.detect_async(audio_segment)
return probability >= self.config.turn_completion_threshold
async def _start_processing(
self, pipeline: UserPipeline, audio_segment: np.ndarray
) -> None:
"""
Start processing pipeline for utterance.
Args:
pipeline: User pipeline
audio_segment: Speech audio
"""
pipeline.state = PipelineState.PROCESSING
pipeline.processing_start_time = time.time()
pipeline.total_utterances += 1
# Create processing task
task = asyncio.create_task(
self._process_utterance(pipeline, audio_segment)
)
pipeline.current_task = task
async def _process_utterance(
self, pipeline: UserPipeline, audio_segment: np.ndarray
) -> None:
"""
Process utterance through full pipeline.
Args:
pipeline: User pipeline
audio_segment: Speech audio
"""
try:
async with self._processing_semaphore:
# 1. Transcribe (STT)
stt_start = time.time()
transcript = await asyncio.wait_for(
self.transcriber.transcribe_async(audio_segment),
timeout=self.config.stt_timeout,
)
pipeline.stage_latencies["stt"] = time.time() - stt_start
if not transcript or not transcript.text.strip():
logger.warning(
f"Empty transcription for {pipeline.user_name}"
)
pipeline.state = PipelineState.IDLE
return
logger.info(
f"Transcribed ({pipeline.user_name}): "
f'"{transcript.text}" '
f"(latency: {pipeline.stage_latencies['stt']:.3f}s)"
)
# 2. Add to transcript context
self.transcript_manager.add_entry(
speaker=pipeline.user_name, text=transcript.text
)
# 3. Check relevance
rel_start = time.time()
context = self.transcript_manager.get_context(format="readable")
should_respond = await asyncio.wait_for(
self.relevance_filter.classify(
utterance=transcript.text,
speaker=pipeline.user_name,
transcript=context,
agent=self.current_agent,
sensitivity=self.relevance_filter.sensitivity,
),
timeout=self.config.relevance_timeout,
)
pipeline.stage_latencies["relevance"] = time.time() - rel_start
if not should_respond:
logger.info(
f"Not responding to {pipeline.user_name}: "
f'"{transcript.text}"'
)
pipeline.state = PipelineState.IDLE
return
logger.info(
f"Responding to {pipeline.user_name}: "
f'"{transcript.text}" '
f"(latency: {pipeline.stage_latencies['relevance']:.3f}s)"
)
# 4. Route query to optimal model
routing_start = time.time()
routing_decision = self.query_router.route(transcript.text)
pipeline.stage_latencies["routing"] = time.time() - routing_start
logger.info(
f"Routed to {routing_decision.model} "
f"(confidence: {routing_decision.confidence:.2f}, "
f"reason: {routing_decision.reason})"
)
# 5. Generate response with streaming TTS
pipeline.state = PipelineState.RESPONDING
llm_start = time.time()
first_audio_time = None
full_response_text = []
try:
# Stream LLM response and split into sentences
text_stream = self.llm_client.send_message_streaming(
agent=self.current_agent,
message=transcript.text,
context=context,
speaker=pipeline.user_name,
model=routing_decision.model_id,
)
sentence_stream = split_streaming_response(text_stream)
# Process each sentence as it arrives
async for sentence in sentence_stream:
# Record first sentence timing (critical metric)
if sentence.index == 0:
pipeline.stage_latencies["llm_first_sentence"] = time.time() - llm_start
logger.info(
f"First sentence from LLM in {pipeline.stage_latencies['llm_first_sentence']:.3f}s: "
f'"{sentence.text}"'
)
# Collect full text for transcript
full_response_text.append(sentence.text)
# Generate TTS for this sentence
tts_start = time.time()
audio_chunk = await asyncio.wait_for(
self.tts_synthesizer.synthesize(
agent=self.current_agent,
text=sentence.text,
),
timeout=self.config.tts_timeout,
)
if sentence.index == 0:
pipeline.stage_latencies["tts_first_chunk"] = time.time() - tts_start
if audio_chunk is None:
logger.warning(f"TTS failed for sentence #{sentence.index}")
continue
# Play audio immediately
self.audio_output_callback(pipeline.user_id, audio_chunk)
# Track first audio playback time (time to first audio)
if first_audio_time is None:
first_audio_time = time.time() - llm_start
pipeline.stage_latencies["time_to_first_audio"] = first_audio_time
logger.info(
f"First audio playing in {first_audio_time:.3f}s "
f"(LLM: {pipeline.stage_latencies['llm_first_sentence']:.3f}s, "
f"TTS: {pipeline.stage_latencies['tts_first_chunk']:.3f}s)"
)
logger.debug(
f"Played sentence #{sentence.index} "
f"({len(audio_chunk) / self.config.sample_rate:.2f}s audio)"
)
# Streaming complete
pipeline.stage_latencies["llm"] = time.time() - llm_start
response_text = " ".join(full_response_text)
logger.info(
f"Streaming response complete ({self.current_agent}, {routing_decision.model}): "
f'"{response_text[:100]}..." '
f"(total latency: {pipeline.stage_latencies['llm']:.3f}s)"
)
# Add bot response to transcript
self.transcript_manager.add_entry(
speaker=self.current_agent.title(), text=response_text
)
except Exception as e:
logger.error(f"Streaming TTS pipeline error: {e}", exc_info=True)
pipeline.state = PipelineState.IDLE
return
# Update stats
pipeline.total_responses += 1
self.total_pipeline_runs += 1
# Calculate total latency
total_latency = time.time() - (
pipeline.processing_start_time or time.time()
)
pipeline.stage_latencies["total"] = total_latency
logger.info(
f"Pipeline complete for {pipeline.user_name}: "
f"total latency {total_latency:.3f}s, "
f"stages: {pipeline.stage_latencies}"
)
# Return to idle
pipeline.state = PipelineState.IDLE
except asyncio.CancelledError:
logger.info(f"Pipeline cancelled for {pipeline.user_name}")
pipeline.total_cancellations += 1
pipeline.state = PipelineState.IDLE
raise
except asyncio.TimeoutError as e:
logger.error(
f"Pipeline timeout for {pipeline.user_name}: {e}"
)
self.total_errors += 1
pipeline.state = PipelineState.IDLE
except Exception as e:
logger.error(
f"Pipeline error for {pipeline.user_name}: {e}", exc_info=True
)
self.total_errors += 1
pipeline.state = PipelineState.IDLE
async def _cancel_pipeline(self, pipeline: UserPipeline) -> None:
"""
Cancel current pipeline processing.
Args:
pipeline: User pipeline
"""
if pipeline.current_task and not pipeline.current_task.done():
pipeline.current_task.cancel()
try:
await pipeline.current_task
except asyncio.CancelledError:
pass
pipeline.state = PipelineState.IDLE
def set_agent(self, agent: str) -> None:
"""
Set current active agent.
Args:
agent: Agent name ("jarvis" or "sage")
"""
self.current_agent = agent.lower()
logger.info(f"Switched to agent: {self.current_agent}")
def set_sensitivity(self, sensitivity: str) -> None:
"""
Set relevance sensitivity.
Args:
sensitivity: Sensitivity level ("low", "medium", "high")
"""
self.relevance_filter.sensitivity = sensitivity.lower()
logger.info(f"Set sensitivity to: {sensitivity}")
def get_stats(self) -> dict:
"""
Get orchestrator statistics.
Returns:
Dictionary with stats
"""
# Aggregate user stats
total_utterances = sum(p.total_utterances for p in self.pipelines.values())
total_responses = sum(p.total_responses for p in self.pipelines.values())
total_cancellations = sum(
p.total_cancellations for p in self.pipelines.values()
)
# Calculate average latencies
avg_latencies = {}
if total_responses > 0:
for stage in [
"stt",
"routing",
"relevance",
"llm_first_sentence",
"tts_first_chunk",
"time_to_first_audio",
"llm",
"total",
]:
latencies = [
p.stage_latencies.get(stage, 0)
for p in self.pipelines.values()
if stage in p.stage_latencies
]
avg_latencies[f"avg_{stage}_latency"] = (
sum(latencies) / len(latencies) if latencies else 0.0
)
return {
"active_users": len(self.pipelines),
"current_agent": self.current_agent,
"sensitivity": self.relevance_filter.sensitivity,
"total_audio_frames": self.total_audio_frames,
"total_utterances": total_utterances,
"total_responses": total_responses,
"total_cancellations": total_cancellations,
"total_pipeline_runs": self.total_pipeline_runs,
"total_errors": self.total_errors,
"router_stats": self.query_router.get_stats(),
**avg_latencies,
}
def get_user_stats(self, user_id: int) -> Optional[dict]:
"""
Get stats for specific user.
Args:
user_id: User ID
Returns:
User stats or None if not found
"""
if user_id not in self.pipelines:
return None
pipeline = self.pipelines[user_id]
return {
"user_id": pipeline.user_id,
"user_name": pipeline.user_name,
"state": pipeline.state.value,
"total_utterances": pipeline.total_utterances,
"total_responses": pipeline.total_responses,
"total_cancellations": pipeline.total_cancellations,
"stage_latencies": pipeline.stage_latencies,
}