openclaw-voice/discord_bot/vad_receiver.py
MCKRUZ 9fde3d31ba feat: Major performance optimizations and feature enhancements
## Performance Optimizations (3-10x faster responses)
- STT beam_size reduced to 1 (3-5x faster transcription, minimal quality loss)
- Smart query routing: Haiku (simple) → Sonnet (medium) → Opus (complex)
- TTS cache for common phrases (27 pre-generated responses)
- Sentence-level streaming TTS (start playing while generating)
- Sample-based VAD timing (30x improvement in silence detection)

## TTS Engine Upgrade
- Migrated from Chatterbox to Chatterbox-Turbo
- Zero-shot voice cloning (no fine-tuning required)
- Native paralinguistic tag support ([laugh], [sigh], [chuckle], etc.)
- Emotion presets with temperature control
- Improved marker conversion (*action*, (action), ~action~)

## Discord Bot Enhancements
- Multi-agent support (Jarvis, Sage)
- Improved voice receiving with discord-ext-voice-recv
- Enhanced /join, /leave, /status commands
- Per-agent personality configuration
- Better audio sink/receiver implementation

## OpenClaw Integration
- WebSocket support for Gateway communication
- Query complexity routing (auto-select model)
- Improved error handling and retries
- Session management per Discord guild
- Better latency tracking

## Pipeline Improvements
- Sentence splitter for streaming optimization
- Query router for intelligent model selection
- Enhanced VAD receiver with sample-based timing
- Improved audio buffering and format conversion
- Better transcript management

## Documentation
- Added QUICK_START.md (5-minute test guide)
- Added OPTIMIZATION_SUMMARY.md (performance analysis)
- Added DISCORD_OPTIMIZATION_TEST.md (testing guide)
- Added USAGE_GUIDE.md (comprehensive usage)
- Updated README.md with optimization details

## Utilities & Scripts
- Added get_invite_link.py (Discord bot invite)
- Added sync_commands.py, sync_to_guild.py (command sync)
- Added test_gateway.py, test_stt.py (testing utilities)
- Added openclaw_wrapper.py (wrapper script)
- Removed create_mock_turn_model.py (no longer needed)

## Configuration Updates
- STT model: medium → small (faster, acceptable quality)
- TTS engine: chatterbox → coqui (Turbo integration)
- Beam size: 5 → 1 (latency optimization)
- Added emotion_exaggeration per agent
- Updated .gitignore for project files

Total: ~2105 insertions, ~462 deletions across 35 files
Performance: ~5.5s total latency (down from 22-35s)
Target: ~3.5s (achieved in simple queries with cache)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-16 19:29:57 -05:00

241 lines
9.1 KiB
Python

"""VAD-based audio receiver for Discord with sample-based timing.
Processes audio with Silero VAD in the callback thread using sample-based timing
(not wall-clock) for accurate silence detection. Accumulates speech+silence and
triggers processing when silence threshold is exceeded.
Key features:
- Sample-based timing for accurate silence detection (avoids processing delays)
- Per-user audio buffers with independent VAD state
- LSTM state management for switching between users
- Configurable silence threshold and minimum speech duration
"""
import asyncio
import logging
import threading
from typing import Callable, Optional
import numpy as np
import torch
logger = logging.getLogger(__name__)

# --- Audio format constants -------------------------------------------------
# Sample rates in Hz: what Discord delivers vs. what the pipeline works at.
DISCORD_SAMPLE_RATE = 48_000
TARGET_SAMPLE_RATE = 16_000

# Integer decimation ratio between the two rates (48 kHz -> 16 kHz).
DOWNSAMPLE_FACTOR = DISCORD_SAMPLE_RATE // TARGET_SAMPLE_RATE

# Number of 16 kHz samples handed to Silero VAD per inference call.
VAD_CHUNK_SAMPLES = 512
class UserAudioBuffer:
    """Audio accumulator and VAD bookkeeping for a single user.

    Positions are tracked as sample counts rather than wall-clock time.
    """

    def __init__(self, user_id: int, user_name: str):
        self.user_id = user_id
        self.user_name = user_name
        # Chunks collected for the utterance in progress (16kHz mono float32).
        self.audio_chunks: list[np.ndarray] = []
        self._restore_defaults()

    def _restore_defaults(self) -> None:
        """Set every tracking field (except the chunk list) to its initial value."""
        # Samples waiting until enough have arrived to form a full VAD chunk.
        self.vad_buffer = np.empty(0, dtype=np.float32)
        # Speech state machine; all positions are sample indices.
        self.is_speaking = False
        self.total_samples_processed = 0
        self.speech_start_sample = 0
        self.silence_start_sample: Optional[int] = None

    def reset(self) -> None:
        """Drop accumulated audio and return to the initial, not-speaking state."""
        # Emptied in place so the list object itself stays the same.
        del self.audio_chunks[:]
        self._restore_defaults()

    def get_speech_audio(self) -> np.ndarray:
        """Return all accumulated chunks joined into one array.

        Returns an empty float32 array when nothing has been accumulated.
        """
        chunks = self.audio_chunks
        return np.concatenate(chunks) if chunks else np.empty(0, dtype=np.float32)
class VADAudioReceiver:
    """
    VAD-based audio receiver for Discord.

    Each incoming PCM packet is converted to 16 kHz mono float32, run through
    the VAD model in fixed-size chunks on the calling thread, and accumulated
    into one utterance per user. When enough trailing silence has been seen,
    ``on_speech_complete`` is scheduled on the event loop.

    All timing is derived from the number of samples processed, not from the
    wall clock, so slow inference cannot distort silence measurement.

    NOTE(review): one VAD model (and so one recurrent state) is shared by all
    users. The state is reset whenever the speaking user changes, so packets
    interleaved from simultaneous speakers reset it on every switch.
    """

    def __init__(
        self,
        vad_model,
        vad_threshold: float = 0.5,
        silence_duration_ms: float = 300,
        min_speech_duration_s: float = 0.3,
        on_speech_complete: Optional[Callable] = None,
        loop: Optional[asyncio.AbstractEventLoop] = None,
    ):
        """
        Initialize VAD audio receiver.

        Args:
            vad_model: Silero VAD model. Must provide ``eval()``,
                ``reset_states()`` and be callable as
                ``vad_model(tensor, sample_rate)`` returning a value with
                ``.item()``.
            vad_threshold: VAD confidence threshold (0.0-1.0)
            silence_duration_ms: Silence duration to end speech (milliseconds)
            min_speech_duration_s: Minimum speech duration to process
                (seconds), measured from the first speech chunk to the start
                of the trailing silence.
            on_speech_complete: Async callback(user_id, user_name, audio_array)
            loop: Event loop for running callbacks; defaults to
                ``asyncio.get_event_loop()`` at construction time.
        """
        self.vad_model = vad_model
        self.vad_model.eval()
        self.vad_threshold = vad_threshold
        self.silence_duration_ms = silence_duration_ms
        self.min_speech_duration_s = min_speech_duration_s
        self.on_speech_complete = on_speech_complete
        self.loop = loop or asyncio.get_event_loop()

        # Per-user buffers, guarded by _lock.
        self._buffers: dict[int, UserAudioBuffer] = {}
        self._lock = threading.Lock()

        # User whose audio last went through the VAD model; a change of user
        # triggers a reset of the model's recurrent state.
        self._last_vad_user: Optional[int] = None

        logger.info(
            f"VAD audio receiver initialized "
            f"(threshold={vad_threshold}, silence={silence_duration_ms}ms)"
        )

    def _get_buffer(self, user_id: int, user_name: str) -> UserAudioBuffer:
        """Get or create buffer for user. Caller must hold ``_lock``."""
        if user_id not in self._buffers:
            self._buffers[user_id] = UserAudioBuffer(user_id, user_name)
            logger.debug(f"Created audio buffer for {user_name} ({user_id})")
        return self._buffers[user_id]

    @staticmethod
    def _pcm_to_mono_16k(pcm_data: bytes) -> np.ndarray:
        """Convert 48 kHz stereo int16 PCM bytes to 16 kHz mono float32.

        A trailing partial stereo frame (fewer than 4 bytes) is dropped, so a
        truncated packet neither raises nor gets misread as mono audio.
        """
        frame_bytes = 4  # two int16 samples: left + right
        usable_bytes = len(pcm_data) - (len(pcm_data) % frame_bytes)
        if usable_bytes == 0:
            return np.empty(0, dtype=np.float32)

        stereo = np.frombuffer(
            pcm_data, dtype=np.int16, count=usable_bytes // 2
        ).reshape(-1, 2)
        # Average the channels, then scale int16 range to [-1.0, 1.0).
        mono = stereo.mean(axis=1).astype(np.float32) / 32768.0
        # Downsample 48kHz -> 16kHz by keeping every 3rd sample (no filtering).
        return mono[::DOWNSAMPLE_FACTOR]

    def on_audio(self, user_id: int, user_name: str, pcm_data: bytes) -> None:
        """
        Process incoming audio from Discord.

        Called from Discord's audio thread - keep it fast!

        Args:
            user_id: Discord user ID
            user_name: User display name
            pcm_data: Raw PCM audio (48kHz stereo int16)
        """
        downsampled = self._pcm_to_mono_16k(pcm_data)

        with self._lock:
            buf = self._get_buffer(user_id, user_name)
            buf.vad_buffer = np.concatenate([buf.vad_buffer, downsampled])

            # Reset VAD recurrent state when switching between users.
            if self._last_vad_user != user_id:
                self.vad_model.reset_states()
                self._last_vad_user = user_id
                logger.debug(f"Reset VAD state for {user_name}")

            # Feed the model every complete chunk currently available.
            while len(buf.vad_buffer) >= VAD_CHUNK_SAMPLES:
                chunk = buf.vad_buffer[:VAD_CHUNK_SAMPLES]
                buf.vad_buffer = buf.vad_buffer[VAD_CHUNK_SAMPLES:]
                utterance = self._process_chunk(buf, chunk, user_name)
                if utterance is not None:
                    self._dispatch(user_id, user_name, utterance)

    def _process_chunk(
        self, buf: UserAudioBuffer, chunk: np.ndarray, user_name: str
    ) -> Optional[np.ndarray]:
        """Run VAD on one chunk and advance the user's speech state.

        Returns the finished utterance (speech plus trailing silence) when
        this chunk completes one that is long enough, otherwise ``None``.
        """
        # Sample-based clock: positions refer to audio time, not wall-clock.
        chunk_start = buf.total_samples_processed
        buf.total_samples_processed = chunk_start + VAD_CHUNK_SAMPLES

        with torch.no_grad():
            speech_prob = self.vad_model(
                torch.from_numpy(chunk), TARGET_SAMPLE_RATE
            ).item()

        if speech_prob >= self.vad_threshold:
            buf.silence_start_sample = None
            if not buf.is_speaking:
                buf.is_speaking = True
                # Speech begins at the start of this chunk.
                buf.speech_start_sample = chunk_start
                buf.audio_chunks.clear()
                logger.info(f"Speech started: {user_name} (prob={speech_prob:.3f})")
            buf.audio_chunks.append(chunk.copy())
            return None

        if not buf.is_speaking:
            # Silence outside an utterance: nothing to accumulate.
            return None

        # Silence during speech - keep accumulating.
        buf.audio_chunks.append(chunk.copy())
        if buf.silence_start_sample is None:
            # Silence begins at the start of this chunk, so this chunk itself
            # counts toward the silence duration.
            buf.silence_start_sample = chunk_start
            logger.debug(f"Silence started for {user_name}")

        silence_samples = buf.total_samples_processed - buf.silence_start_sample
        silence_duration_ms = (silence_samples / TARGET_SAMPLE_RATE) * 1000
        if silence_duration_ms < self.silence_duration_ms:
            return None

        # Speech complete!
        audio = buf.get_speech_audio()
        duration_s = len(audio) / TARGET_SAMPLE_RATE
        # Length of the spoken part only; the trailing silence must not count
        # toward the minimum, or short blips would always pass the filter.
        speech_s = (
            buf.silence_start_sample - buf.speech_start_sample
        ) / TARGET_SAMPLE_RATE
        logger.info(
            f"Speech complete: {user_name} "
            f"({duration_s:.2f}s, "
            f"silence: {silence_duration_ms:.0f}ms)"
        )

        # Reset utterance state but keep samples not yet run through VAD;
        # they belong to whatever the user says next.
        pending = buf.vad_buffer
        buf.reset()
        buf.vad_buffer = pending

        if speech_s < self.min_speech_duration_s:
            logger.debug(f"Ignoring short speech: {user_name} ({speech_s:.2f}s)")
            return None
        return audio

    def _dispatch(self, user_id: int, user_name: str, audio: np.ndarray) -> None:
        """Schedule ``on_speech_complete`` on the event loop, if one is set."""
        if not self.on_speech_complete:
            return
        future = asyncio.run_coroutine_threadsafe(
            self.on_speech_complete(user_id, user_name, audio),
            self.loop,
        )
        # Nobody awaits this future, so surface callback failures in the log.
        future.add_done_callback(self._log_callback_failure)

    @staticmethod
    def _log_callback_failure(future) -> None:
        """Log an exception raised by a scheduled ``on_speech_complete`` call."""
        if future.cancelled():
            return
        exc = future.exception()
        if exc is not None:
            logger.error("on_speech_complete callback failed", exc_info=exc)

    def clear_user(self, user_id: int) -> None:
        """Clear buffer for user (when they leave)."""
        with self._lock:
            if user_id in self._buffers:
                user_name = self._buffers[user_id].user_name
                del self._buffers[user_id]
                logger.info(f"Cleared audio buffer for {user_name} ({user_id})")
            if self._last_vad_user == user_id:
                # Force a VAD state reset if this user sends audio again.
                self._last_vad_user = None

    def clear_all(self) -> None:
        """Clear all user buffers."""
        with self._lock:
            self._buffers.clear()
            # Force a VAD state reset on the next packet from anyone.
            self._last_vad_user = None
            logger.info("Cleared all audio buffers")