openclaw-voice/pipeline/vad.py
MCKRUZ 9fde3d31ba feat: Major performance optimizations and feature enhancements
## Performance Optimizations (3-10x faster responses)
- STT beam_size reduced to 1 (3-5x faster transcription, minimal quality loss)
- Smart query routing: Haiku (simple) → Sonnet (medium) → Opus (complex)
- TTS cache for common phrases (27 pre-generated responses)
- Sentence-level streaming TTS (start playing while generating)
- Sample-based VAD timing (30x improvement in silence detection)

## TTS Engine Upgrade
- Migrated from Chatterbox to Chatterbox-Turbo
- Zero-shot voice cloning (no fine-tuning required)
- Native paralinguistic tag support ([laugh], [sigh], [chuckle], etc.)
- Emotion presets with temperature control
- Improved marker conversion (*action*, (action), ~action~)

## Discord Bot Enhancements
- Multi-agent support (Jarvis, Sage)
- Improved voice receiving with discord-ext-voice-recv
- Enhanced /join, /leave, /status commands
- Per-agent personality configuration
- Better audio sink/receiver implementation

## OpenClaw Integration
- WebSocket support for Gateway communication
- Query complexity routing (auto-select model)
- Improved error handling and retries
- Session management per Discord guild
- Better latency tracking

## Pipeline Improvements
- Sentence splitter for streaming optimization
- Query router for intelligent model selection
- Enhanced VAD receiver with sample-based timing
- Improved audio buffering and format conversion
- Better transcript management

## Documentation
- Added QUICK_START.md (5-minute test guide)
- Added OPTIMIZATION_SUMMARY.md (performance analysis)
- Added DISCORD_OPTIMIZATION_TEST.md (testing guide)
- Added USAGE_GUIDE.md (comprehensive usage)
- Updated README.md with optimization details

## Utilities & Scripts
- Added get_invite_link.py (Discord bot invite)
- Added sync_commands.py, sync_to_guild.py (command sync)
- Added test_gateway.py, test_stt.py (testing utilities)
- Added openclaw_wrapper.py (wrapper script)
- Removed create_mock_turn_model.py (no longer needed)

## Configuration Updates
- STT model: medium → small (faster, acceptable quality)
- TTS engine: chatterbox → coqui (Turbo integration)
- Beam size: 5 → 1 (latency optimization)
- Added emotion_exaggeration per agent
- Updated .gitignore for project files

Total: ~2105 insertions, ~462 deletions across 35 files
Performance: ~5.5s total latency (down from 22-35s)
Target: ~3.5s (achieved in simple queries with cache)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-16 19:29:57 -05:00

425 lines
13 KiB
Python

"""Voice Activity Detection using Silero VAD.
Detects speech start/end in audio streams for turn-taking and transcription.
"""
import asyncio
import inspect
from dataclasses import dataclass
from enum import Enum
from typing import Awaitable, Callable, Optional

import numpy as np
import torch

from utils.logging import get_logger
logger = get_logger(__name__)
class SpeechState(Enum):
    """Speech detection states reported by the VAD.

    The string values are the lowercase form of each member name.
    """

    # No speech is currently detected.
    SILENCE = "silence"
    # Speech is currently detected.
    SPEECH = "speech"
    # State has not been determined.
    UNKNOWN = "unknown"
@dataclass
class SpeechSegment:
    """One detected utterance together with its timing metadata.

    Times are expressed in seconds relative to the start of the stream the
    segment was cut from.
    """

    # Audio samples (float32).
    audio: np.ndarray
    # Start time in seconds (relative to stream).
    start_time: float
    # End time in seconds.
    end_time: float
    # Duration in seconds.
    duration: float
    # ID of the user who spoke.
    user_id: int

    @property
    def sample_count(self) -> int:
        """Return how many audio samples the segment holds."""
        samples = self.audio
        return len(samples)
class SileroVAD:
    """
    Silero VAD wrapper for speech detection.

    Silero VAD is a lightweight, fast voice activity detector that runs on CPU.

    The wrapper classifies each incoming chunk as speech or silence and runs a
    small state machine on top of that to cut the stream into
    ``SpeechSegment`` objects. All timing is derived from the running sample
    count, not from wall-clock time.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        silence_threshold: float = 0.3,
        speech_threshold: float = 0.5,
        min_speech_duration: float = 0.25,
        min_silence_duration: float = 0.3,
    ):
        """
        Initialize Silero VAD.

        Args:
            sample_rate: Audio sample rate (must be 8000 or 16000)
            silence_threshold: Silence threshold after speech (seconds).
                Stored on the instance; the methods of this class do not
                consult it (``min_silence_duration`` drives segment ends).
            speech_threshold: VAD confidence threshold (0.0-1.0)
            min_speech_duration: Minimum speech duration to trigger (seconds).
                Stored on the instance; not enforced by the methods of this
                class.
            min_silence_duration: Minimum silence after speech to end segment
                (seconds)

        Raises:
            ValueError: If ``sample_rate`` is not 8000 or 16000.
            Exception: Whatever ``torch.hub.load`` raises when the model
                cannot be loaded (logged, then re-raised).
        """
        if sample_rate not in [8000, 16000]:
            raise ValueError(
                f"Silero VAD only supports 8000 or 16000 Hz, got {sample_rate}"
            )

        self.sample_rate = sample_rate
        self.silence_threshold = silence_threshold
        self.speech_threshold = speech_threshold
        self.min_speech_duration = min_speech_duration
        self.min_silence_duration = min_silence_duration

        # Load Silero VAD model
        self.model = None
        self._load_model()

        # State tracking (all positions are sample indices into the stream)
        self.current_state = SpeechState.SILENCE
        self.speech_start_sample = 0
        self.last_speech_sample = 0
        self.accumulated_audio: list[np.ndarray] = []
        self.total_samples_processed = 0

    def _load_model(self) -> None:
        """Load Silero VAD model from torch hub and put it in eval mode.

        Raises:
            Exception: Any error from ``torch.hub.load`` is logged and
                re-raised unchanged.
        """
        try:
            logger.info("Loading Silero VAD model...")

            # torch.hub returns (model, utils); the helper utilities are not
            # needed because chunks are fed to the model directly.
            self.model, _ = torch.hub.load(
                repo_or_dir="snakers4/silero-vad",
                model="silero_vad",
                force_reload=False,
                onnx=False,
            )

            self.model.eval()
            logger.info("Silero VAD model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load Silero VAD model: {e}")
            raise

    def process_chunk(self, audio: np.ndarray) -> tuple[SpeechState, Optional[float]]:
        """
        Process an audio chunk and detect speech.

        Args:
            audio: Audio chunk (float32; expected to be mono at
                ``self.sample_rate``). NOTE(review): the chunk length is not
                validated here -- confirm callers feed a window size the
                loaded model accepts.

        Returns:
            Tuple of (current_state, speech_probability)

        Raises:
            ValueError: If ``audio`` is not float32.
        """
        if audio.dtype != np.float32:
            raise ValueError(f"Expected float32 audio, got {audio.dtype}")

        # Convert to torch tensor (shares memory with the numpy array)
        audio_tensor = torch.from_numpy(audio)

        # Run VAD
        with torch.no_grad():
            speech_prob = self.model(audio_tensor, self.sample_rate).item()

        # Debug logging - log speech probability when it's above a minimal
        # threshold. This runs once per chunk, so it is kept at DEBUG level to
        # avoid flooding INFO logs.
        if speech_prob > 0.1:
            logger.debug(f"VAD: speech_prob={speech_prob:.3f}, threshold={self.speech_threshold:.3f}")

        # Determine state based on threshold
        if speech_prob >= self.speech_threshold:
            new_state = SpeechState.SPEECH
            logger.debug(f"SPEECH DETECTED! probability={speech_prob:.3f}")
        else:
            new_state = SpeechState.SILENCE

        return new_state, speech_prob

    def process_stream(
        self, audio: np.ndarray
    ) -> tuple[SpeechState, Optional[SpeechSegment]]:
        """
        Process streaming audio and detect speech segments.

        A segment starts with the first chunk classified as speech and ends
        once the samples seen since the last speech chunk add up to at least
        ``min_silence_duration`` seconds.

        Args:
            audio: Audio chunk to process (float32, mono)

        Returns:
            Tuple of (current_state, speech_segment_if_complete)
        """
        # Process chunk to get speech probability
        state, speech_prob = self.process_chunk(audio)

        # Update total samples
        self.total_samples_processed += len(audio)

        # State machine for speech detection
        if self.current_state == SpeechState.SILENCE:
            if state == SpeechState.SPEECH:
                # Speech started: the segment begins at the start of this chunk
                self.current_state = SpeechState.SPEECH
                self.speech_start_sample = self.total_samples_processed - len(audio)
                self.last_speech_sample = self.total_samples_processed
                self.accumulated_audio = [audio.copy()]
                logger.debug(
                    f"Speech started at sample {self.speech_start_sample} "
                    f"(prob: {speech_prob:.3f})"
                )
        elif self.current_state == SpeechState.SPEECH:
            # Accumulate audio (silent chunks inside a segment are kept too)
            self.accumulated_audio.append(audio.copy())

            if state == SpeechState.SPEECH:
                # Speech continuing
                self.last_speech_sample = self.total_samples_processed
            else:
                # Potential silence: measure it in samples since last speech
                silence_duration = (
                    self.total_samples_processed - self.last_speech_sample
                ) / self.sample_rate

                if silence_duration >= self.min_silence_duration:
                    # Speech ended - create segment
                    segment = self._create_segment()

                    # Reset state
                    self.current_state = SpeechState.SILENCE
                    self.accumulated_audio = []

                    logger.debug(
                        f"Speech ended after {segment.duration:.2f}s "
                        f"(silence: {silence_duration:.2f}s)"
                    )
                    return self.current_state, segment

        return self.current_state, None

    def _create_segment(self) -> SpeechSegment:
        """
        Create a speech segment from accumulated audio.

        ``end_time`` is taken from the last chunk classified as speech, so
        ``duration`` excludes trailing silence even though the returned audio
        still contains the silent chunks accumulated after it.

        Returns:
            SpeechSegment
        """
        # Concatenate accumulated audio
        audio = np.concatenate(self.accumulated_audio)

        # Calculate times from sample positions
        start_time = self.speech_start_sample / self.sample_rate
        end_time = self.last_speech_sample / self.sample_rate

        return SpeechSegment(
            audio=audio,
            start_time=start_time,
            end_time=end_time,
            duration=end_time - start_time,
            user_id=0,  # Will be set by caller
        )

    def reset(self) -> None:
        """Reset VAD state (for new stream or user)."""
        self.current_state = SpeechState.SILENCE
        self.speech_start_sample = 0
        self.last_speech_sample = 0
        self.accumulated_audio = []
        self.total_samples_processed = 0

        # Per the upstream Silero docs the model carries internal state from
        # chunk to chunk and exposes reset_states() to clear it. Without this
        # a "new" stream would start with context from the previous one. The
        # lookup is guarded so a model object without that method still works.
        reset_states = getattr(self.model, "reset_states", None)
        if callable(reset_states):
            reset_states()

        logger.debug("VAD state reset")

    def force_end_speech(self) -> Optional[SpeechSegment]:
        """
        Force end current speech segment (if any).

        Useful when user leaves or stream ends.

        Returns:
            SpeechSegment if speech was active, None otherwise
        """
        if self.current_state != SpeechState.SPEECH:
            return None

        segment = self._create_segment()
        self.current_state = SpeechState.SILENCE
        self.accumulated_audio = []
        logger.debug(f"Forced speech end after {segment.duration:.2f}s")
        return segment

    def get_state(self) -> SpeechState:
        """Get current speech detection state."""
        return self.current_state

    def is_speech_active(self) -> bool:
        """Check if speech is currently being detected."""
        return self.current_state == SpeechState.SPEECH
class PerUserVAD:
    """
    Manages VAD instances for multiple users.

    Maintains separate VAD state for each user in a voice channel. Access to
    the user -> VAD mapping is serialized with an ``asyncio.Lock``; the speech
    callback is always invoked with that lock released, so it may call back
    into this manager.
    """

    def __init__(
        self,
        sample_rate: int = 16000,
        silence_threshold: float = 0.3,
        speech_threshold: float = 0.5,
        min_speech_duration: float = 0.25,
        speech_callback: Optional[
            Callable[[int, SpeechSegment], Optional[Awaitable[None]]]
        ] = None,
        min_silence_duration: Optional[float] = None,
    ):
        """
        Initialize per-user VAD manager.

        Args:
            sample_rate: Audio sample rate
            silence_threshold: Silence duration threshold (forwarded to each
                ``SileroVAD`` as ``silence_threshold``)
            speech_threshold: VAD confidence threshold
            min_speech_duration: Minimum speech duration
            speech_callback: Callback invoked as ``callback(user_id, segment)``
                when a speech segment is detected. May be a coroutine function
                or a plain function; an awaitable result is awaited.
            min_silence_duration: Optional silence (seconds) that ends a
                segment, forwarded to each ``SileroVAD``. When ``None`` the
                ``SileroVAD`` default is used.
        """
        self.sample_rate = sample_rate
        self.silence_threshold = silence_threshold
        self.speech_threshold = speech_threshold
        self.min_speech_duration = min_speech_duration
        self.min_silence_duration = min_silence_duration
        self.speech_callback = speech_callback

        self._vad_instances: dict[int, SileroVAD] = {}
        self._lock = asyncio.Lock()

    async def _emit_segment(self, user_id: int, segment: SpeechSegment) -> None:
        """Stamp ``segment`` with ``user_id`` and hand it to the callback."""
        segment.user_id = user_id
        if not self.speech_callback:
            return

        # Accept both sync and async callbacks: only await when the call
        # actually produced something awaitable.
        outcome = self.speech_callback(user_id, segment)
        if inspect.isawaitable(outcome):
            await outcome

    async def get_or_create_vad(self, user_id: int) -> SileroVAD:
        """
        Get VAD instance for a user, creating if necessary.

        NOTE: constructing a ``SileroVAD`` loads the model synchronously, so
        the first call for a new user blocks the event loop while it runs.

        Args:
            user_id: User ID

        Returns:
            SileroVAD instance
        """
        async with self._lock:
            vad = self._vad_instances.get(user_id)
            if vad is None:
                # Only forward min_silence_duration when it was configured so
                # SileroVAD's own default stays in effect otherwise.
                extra = {}
                if self.min_silence_duration is not None:
                    extra["min_silence_duration"] = self.min_silence_duration

                vad = SileroVAD(
                    sample_rate=self.sample_rate,
                    silence_threshold=self.silence_threshold,
                    speech_threshold=self.speech_threshold,
                    min_speech_duration=self.min_speech_duration,
                    **extra,
                )
                self._vad_instances[user_id] = vad
                logger.debug(f"Created VAD instance for user {user_id}")

            return vad

    async def process_audio(
        self, user_id: int, audio: np.ndarray
    ) -> Optional[SpeechSegment]:
        """
        Process audio for a user and detect speech.

        Args:
            user_id: User ID
            audio: Audio chunk (float32, mono)

        Returns:
            SpeechSegment if speech segment completed, None otherwise
        """
        vad = await self.get_or_create_vad(user_id)

        # Process audio
        _, segment = vad.process_stream(audio)

        # If segment completed, set user_id and invoke callback
        if segment is not None:
            await self._emit_segment(user_id, segment)

        return segment

    async def reset_user(self, user_id: int) -> None:
        """
        Reset VAD state for a user.

        Does nothing when the user has no VAD instance.

        Args:
            user_id: User ID
        """
        async with self._lock:
            vad = self._vad_instances.get(user_id)
            if vad is not None:
                vad.reset()

    async def remove_user(self, user_id: int) -> None:
        """
        Remove VAD instance for a user.

        Any speech still in progress is force-ended and delivered to the
        callback. The instance is detached from the mapping first, so it is
        removed even if the callback raises, and the callback runs without
        the lock held.

        Args:
            user_id: User ID
        """
        async with self._lock:
            vad = self._vad_instances.pop(user_id, None)

        if vad is None:
            return

        logger.debug(f"Removed VAD instance for user {user_id}")

        # Force end any active speech
        segment = vad.force_end_speech()
        if segment is not None:
            await self._emit_segment(user_id, segment)

    async def get_active_users(self) -> list[int]:
        """
        Get list of users with active VAD instances.

        Returns:
            List of user IDs
        """
        async with self._lock:
            return list(self._vad_instances)

    async def get_speaking_users(self) -> list[int]:
        """
        Get list of users currently speaking.

        Returns:
            List of user IDs
        """
        async with self._lock:
            return [
                user_id
                for user_id, vad in self._vad_instances.items()
                if vad.is_speech_active()
            ]

    async def remove_all(self) -> None:
        """Remove all VAD instances.

        Instances are dropped as-is: speech in progress is not force-ended and
        the callback is not invoked.
        """
        async with self._lock:
            self._vad_instances.clear()
            logger.debug("Removed all VAD instances")

    def __len__(self) -> int:
        """Get number of VAD instances."""
        return len(self._vad_instances)

    def __repr__(self) -> str:
        """String representation."""
        return f"PerUserVAD(users={len(self._vad_instances)})"