openclaw-voice/discord_bot/audio_bridge.py
MCKRUZ 9fde3d31ba feat: Major performance optimizations and feature enhancements
## Performance Optimizations (3-10x faster responses)
- STT beam_size reduced to 1 (3-5x faster transcription, minimal quality loss)
- Smart query routing: Haiku (simple) → Sonnet (medium) → Opus (complex)
- TTS cache for common phrases (27 pre-generated responses)
- Sentence-level streaming TTS (start playing while generating)
- Sample-based VAD timing (30x improvement in silence detection)

## TTS Engine Upgrade
- Migrated from Chatterbox to Chatterbox-Turbo
- Zero-shot voice cloning (no fine-tuning required)
- Native paralinguistic tag support ([laugh], [sigh], [chuckle], etc.)
- Emotion presets with temperature control
- Improved marker conversion (*action*, (action), ~action~)

## Discord Bot Enhancements
- Multi-agent support (Jarvis, Sage)
- Improved voice receiving with discord-ext-voice-recv
- Enhanced /join, /leave, /status commands
- Per-agent personality configuration
- Better audio sink/receiver implementation

## OpenClaw Integration
- WebSocket support for Gateway communication
- Query complexity routing (auto-select model)
- Improved error handling and retries
- Session management per Discord guild
- Better latency tracking

## Pipeline Improvements
- Sentence splitter for streaming optimization
- Query router for intelligent model selection
- Enhanced VAD receiver with sample-based timing
- Improved audio buffering and format conversion
- Better transcript management

## Documentation
- Added QUICK_START.md (5-minute test guide)
- Added OPTIMIZATION_SUMMARY.md (performance analysis)
- Added DISCORD_OPTIMIZATION_TEST.md (testing guide)
- Added USAGE_GUIDE.md (comprehensive usage)
- Updated README.md with optimization details

## Utilities & Scripts
- Added get_invite_link.py (Discord bot invite)
- Added sync_commands.py, sync_to_guild.py (command sync)
- Added test_gateway.py, test_stt.py (testing utilities)
- Added openclaw_wrapper.py (wrapper script)
- Removed create_mock_turn_model.py (no longer needed)

## Configuration Updates
- STT model: medium → small (faster, acceptable quality)
- TTS engine: chatterbox → coqui (Turbo integration)
- Beam size: 5 → 1 (latency optimization)
- Added emotion_exaggeration per agent
- Updated .gitignore for project files

Total: ~2105 insertions, ~462 deletions across 35 files
Performance: ~5.5s total latency (down from 22-35s)
Target: ~3.5s (achieved in simple queries with cache)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-16 19:29:57 -05:00

255 lines
7.5 KiB
Python

"""Audio bridge between Discord and processing pipeline.
Handles:
- Receiving per-user audio from Discord (delegated to AudioReceiver)
- Sending TTS audio back to Discord
"""
import asyncio
import queue
import threading
from typing import Callable, Optional

import discord
import numpy as np

from utils import audio
from utils.logging import get_logger
logger = get_logger(__name__)
class PipelineAudioSource(discord.AudioSource):
    """
    Audio source that sends TTS audio to Discord.

    Converts processing format (16kHz mono float32) to Discord format
    (48kHz stereo int16) and provides it as 20ms PCM frames (not opus,
    see ``is_opus``).

    Frames are produced by the coroutines ``write_audio``/``finish`` and
    consumed by ``read``, which runs on a synchronous thread. Because the
    producer and consumer live on different threads, the hand-off uses a
    thread-safe ``queue.Queue`` rather than ``asyncio.Queue``.
    """

    # 20ms @ 48kHz stereo int16: 960 samples * 2 channels * 2 bytes.
    _SILENCE_FRAME = b"\x00" * (960 * 2 * 2)

    def __init__(self):
        """Initialize audio source with an empty, unbounded frame queue."""
        # ``None`` is the end-of-stream sentinel queued by finish().
        self._queue: queue.Queue[Optional[bytes]] = queue.Queue()
        self._is_done = False

    def read(self) -> bytes:
        """
        Called by Discord to get next audio frame (runs on sync thread).

        Returns:
            The next queued PCM frame, a 20ms frame of silence when no
            frame is queued yet, or empty bytes once the end-of-stream
            sentinel has been consumed.
        """
        # Stay finished once the sentinel was seen; otherwise a later call
        # would find the queue empty and report silence instead of "done".
        if self._is_done:
            return b""

        try:
            frame = self._queue.get_nowait()
        except queue.Empty:
            # Nothing queued yet and finish() has not been reached:
            # keep the stream alive with silence.
            return self._SILENCE_FRAME

        if frame is None:
            # Sentinel value means we're done
            self._is_done = True
            return b""
        return frame

    async def write_audio(self, audio_data: np.ndarray) -> None:
        """
        Write processing audio to be played in Discord.

        Conversion or framing errors are logged and swallowed, so a
        failure results in no (or partial) audio rather than an exception.

        Args:
            audio_data: Processing format audio (16kHz mono float32)
        """
        try:
            # Convert to Discord format, then split into 20ms frames
            pcm_bytes = audio.processing_to_discord(audio_data)
            for frame in audio.split_into_frames(pcm_bytes):
                # The queue is unbounded, so this never blocks.
                self._queue.put_nowait(frame)
        except Exception as e:
            logger.error(f"Error writing audio: {e}", exc_info=True)

    async def finish(self) -> None:
        """Signal that no more audio will be written."""
        self._queue.put_nowait(None)

    def is_opus(self) -> bool:
        """We provide PCM, not opus."""
        return False

    @property
    def is_done(self) -> bool:
        """Check if playback is complete (end-of-stream sentinel was read)."""
        return self._is_done
class AudioBridge:
    """
    Manages audio flow between Discord and processing pipeline.

    Handles:
    - Per-user audio reception from Discord (one AudioReceiver per guild)
    - Audio callbacks to pipeline
    - TTS audio playback in Discord (one PipelineAudioSource per guild)
    """

    def __init__(self, loop: asyncio.AbstractEventLoop):
        """
        Initialize audio bridge.

        Args:
            loop: Asyncio event loop (handed to each AudioReceiver)
        """
        self.loop = loop
        # Both registries are keyed by guild ID.
        self._audio_sources: dict[int, PipelineAudioSource] = {}
        self._audio_receivers: dict[int, "AudioReceiver"] = {}  # type: ignore
        self._audio_callback: Optional[Callable[[int, int, bytes], None]] = None

    def set_audio_callback(
        self, callback: Callable[[int, int, bytes], None]
    ) -> None:
        """
        Set callback for received audio.

        The callback is only stored here and passed to receivers created
        by later ``start_receiving`` calls.

        Args:
            callback: Function(guild_id, user_id, pcm_data). How it is
                invoked (sync vs. async) is decided by AudioReceiver,
                which is not defined in this module.
        """
        self._audio_callback = callback

    async def start_receiving(
        self, guild_id: int, voice_client: discord.VoiceClient
    ) -> None:
        """
        Start receiving audio from Discord voice channel.

        Errors are logged and swallowed.

        Args:
            guild_id: Discord guild ID
            voice_client: Connected voice client
        """
        try:
            from .audio_receiver import AudioReceiver

            # Stop a receiver left over from an earlier call; overwriting
            # the registry entry would make it impossible to stop later.
            previous = self._audio_receivers.pop(guild_id, None)
            if previous is not None:
                previous.stop()

            # Create and start audio receiver
            receiver = AudioReceiver(
                guild_id=guild_id,
                voice_client=voice_client,
                callback=self._audio_callback,
                loop=self.loop,
            )
            receiver.start()
            self._audio_receivers[guild_id] = receiver
            logger.info(f"Started receiving audio for guild {guild_id}")
        except Exception as e:
            logger.error(
                f"Error starting audio receiving for guild {guild_id}: {e}",
                exc_info=True,
            )

    async def stop_receiving(
        self, guild_id: int, voice_client: Optional[discord.VoiceClient] = None
    ) -> None:
        """
        Stop receiving audio from Discord voice channel.

        Does nothing when no receiver is registered for the guild. Errors
        are logged and swallowed.

        Args:
            guild_id: Discord guild ID
            voice_client: Connected voice client (optional, currently unused)
        """
        try:
            receiver = self._audio_receivers.pop(guild_id, None)
            if receiver is not None:
                receiver.stop()
                logger.info(f"Stopped receiving audio for guild {guild_id}")
        except Exception as e:
            logger.error(
                f"Error stopping audio receiving for guild {guild_id}: {e}",
                exc_info=True,
            )

    async def play_audio(
        self,
        guild_id: int,
        voice_client: discord.VoiceClient,
        audio_data: np.ndarray,
    ) -> None:
        """
        Play TTS audio in Discord voice channel.

        Any playback already running on ``voice_client`` is stopped first.
        Errors are logged and swallowed.

        Args:
            guild_id: Discord guild ID
            voice_client: Connected voice client
            audio_data: Processing format audio (16kHz mono float32)
        """
        source: Optional[PipelineAudioSource] = None
        try:
            # Stop any currently playing audio
            if voice_client.is_playing():
                voice_client.stop()

            # Create audio source
            source = PipelineAudioSource()
            self._audio_sources[guild_id] = source

            # Write audio data, then mark end-of-stream
            await source.write_audio(audio_data)
            await source.finish()

            # Start playback. The source is handed to the finish callback
            # so a callback belonging to an earlier, stopped playback
            # cannot unregister this newer source.
            voice_client.play(
                source,
                after=lambda error: self._playback_finished_callback(
                    guild_id, error, source
                ),
            )
            logger.info(
                f"Started playback for guild {guild_id} "
                f"({len(audio_data)} samples)"
            )
        except Exception as e:
            logger.error(
                f"Error playing audio for guild {guild_id}: {e}", exc_info=True
            )
            # Do not keep a source registered for a playback that failed.
            if source is not None and self._audio_sources.get(guild_id) is source:
                self._audio_sources.pop(guild_id, None)

    async def stop_playback(
        self, guild_id: int, voice_client: discord.VoiceClient
    ) -> None:
        """
        Stop TTS playback (for barge-in).

        Args:
            guild_id: Discord guild ID
            voice_client: Connected voice client
        """
        if voice_client.is_playing():
            voice_client.stop()
            logger.info(f"Stopped playback for guild {guild_id} (barge-in)")
        # Clean up source
        self._audio_sources.pop(guild_id, None)

    def _playback_finished_callback(
        self,
        guild_id: int,
        error: Optional[Exception],
        source: Optional[PipelineAudioSource] = None,
    ) -> None:
        """
        Called when playback finishes.

        Args:
            guild_id: Discord guild ID
            error: Exception reported by the player, or None on success
            source: The source whose playback finished. When given, the
                registry entry is removed only if it still refers to this
                source; when omitted, the entry is removed unconditionally.
        """
        if error:
            logger.error(f"Playback error for guild {guild_id}: {error}")
        else:
            logger.debug(f"Playback finished for guild {guild_id}")

        # Clean up source, but never evict a newer source that replaced
        # the one this callback belongs to.
        if source is None or self._audio_sources.get(guild_id) is source:
            self._audio_sources.pop(guild_id, None)

    async def cleanup(self) -> None:
        """Clean up all audio bridges: stop every receiver, drop all sources."""
        logger.info("Cleaning up audio bridges")

        # Stop all receivers; one failing receiver must not prevent the
        # remaining ones from being stopped.
        for guild_id, receiver in list(self._audio_receivers.items()):
            try:
                receiver.stop()
            except Exception as e:
                logger.error(
                    f"Error stopping audio receiving for guild {guild_id}: {e}",
                    exc_info=True,
                )
        self._audio_receivers.clear()

        # Clear sources
        self._audio_sources.clear()