## Performance Optimizations (3-10x faster responses)
- STT beam_size reduced to 1 (3-5x faster transcription, minimal quality loss)
- Smart query routing: Haiku (simple) → Sonnet (medium) → Opus (complex)
- TTS cache for common phrases (27 pre-generated responses)
- Sentence-level streaming TTS (start playing while generating)
- Sample-based VAD timing (30x improvement in silence detection)

## TTS Engine Upgrade
- Migrated from Chatterbox to Chatterbox-Turbo
- Zero-shot voice cloning (no fine-tuning required)
- Native paralinguistic tag support ([laugh], [sigh], [chuckle], etc.)
- Emotion presets with temperature control
- Improved marker conversion (*action*, (action), ~action~)

## Discord Bot Enhancements
- Multi-agent support (Jarvis, Sage)
- Improved voice receiving with discord-ext-voice-recv
- Enhanced /join, /leave, /status commands
- Per-agent personality configuration
- Better audio sink/receiver implementation

## OpenClaw Integration
- WebSocket support for Gateway communication
- Query complexity routing (auto-select model)
- Improved error handling and retries
- Session management per Discord guild
- Better latency tracking

## Pipeline Improvements
- Sentence splitter for streaming optimization
- Query router for intelligent model selection
- Enhanced VAD receiver with sample-based timing
- Improved audio buffering and format conversion
- Better transcript management

## Documentation
- Added QUICK_START.md (5-minute test guide)
- Added OPTIMIZATION_SUMMARY.md (performance analysis)
- Added DISCORD_OPTIMIZATION_TEST.md (testing guide)
- Added USAGE_GUIDE.md (comprehensive usage)
- Updated README.md with optimization details

## Utilities & Scripts
- Added get_invite_link.py (Discord bot invite)
- Added sync_commands.py, sync_to_guild.py (command sync)
- Added test_gateway.py, test_stt.py (testing utilities)
- Added openclaw_wrapper.py (wrapper script)
- Removed create_mock_turn_model.py (no longer needed)

## Configuration Updates
- STT model: medium → small (faster, acceptable quality)
- TTS engine: chatterbox → coqui (Turbo integration)
- Beam size: 5 → 1 (latency optimization)
- Added emotion_exaggeration per agent
- Updated .gitignore for project files

Total: ~2105 insertions, ~462 deletions across 35 files
Performance: ~5.5s total latency (down from 22-35s)
Target: ~3.5s (achieved in simple queries with cache)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
255 lines
7.5 KiB
Python
255 lines
7.5 KiB
Python
"""Audio bridge between Discord and processing pipeline.

Handles:
- Receiving per-user audio from Discord (delegated to AudioReceiver)
- Sending TTS audio back to Discord
"""
|
|
|
|
import asyncio
|
|
import threading
|
|
from typing import Callable, Optional
|
|
|
|
import discord
|
|
import numpy as np
|
|
|
|
from utils import audio
|
|
from utils.logging import get_logger
|
|
|
|
# Module-level logger, obtained from the project's get_logger helper with this
# module's __name__; used by every class below for status and error reporting.
logger = get_logger(__name__)
|
|
|
|
|
|
class PipelineAudioSource(discord.AudioSource):
    """
    Audio source that sends TTS audio to Discord.

    Converts processing format (16kHz mono float32) to Discord format
    (48kHz stereo int16) and provides it as 20ms PCM frames (``is_opus``
    returns False, so Discord performs the opus encoding itself).

    Threading: ``write_audio`` / ``finish`` are coroutines called from the
    event loop, while ``read`` is called by Discord from a synchronous
    thread. ``asyncio.Queue`` is not thread-safe, so every queue access goes
    through the non-blocking ``*_nowait`` methods while holding ``_lock``.
    """

    # 20ms @ 48kHz = 960 samples, x 2 channels (stereo) x 2 bytes (int16).
    FRAME_BYTES = 960 * 2 * 2

    def __init__(self):
        """Initialize audio source."""
        # Unbounded FIFO of PCM frames; a None entry is the end-of-stream
        # sentinel placed by finish().
        self._queue: asyncio.Queue[Optional[bytes]] = asyncio.Queue()
        # Serializes queue access between the event loop and the reader thread.
        self._lock = threading.Lock()
        self._is_done = False

    def read(self) -> bytes:
        """
        Called by Discord to get next audio frame (runs on sync thread).

        Returns:
            20ms of PCM audio (48kHz stereo int16), a frame of silence if no
            audio is queued yet, or empty bytes once the stream is finished
            (or if an unexpected error occurs).
        """
        if self._is_done:
            # The sentinel was already consumed: keep signalling end-of-stream
            # rather than falling back to silence.
            return b""

        try:
            with self._lock:
                frame = self._queue.get_nowait()
        except asyncio.QueueEmpty:
            # No data available, return silence so playback keeps running.
            return b"\x00" * self.FRAME_BYTES
        except Exception as e:
            logger.error(f"Error reading audio: {e}", exc_info=True)
            return b""

        if frame is None:
            # Sentinel value means we're done.
            self._is_done = True
            return b""

        # split_into_frames is defined elsewhere; in case it leaves a short
        # trailing frame, pad it with silence because a PCM source is expected
        # to hand Discord full 20ms frames. Full-size frames pass unchanged.
        return frame.ljust(self.FRAME_BYTES, b"\x00")

    async def write_audio(self, audio_data: np.ndarray) -> None:
        """
        Write processing audio to be played in Discord.

        Errors are logged and swallowed (best effort); nothing is raised to
        the caller.

        Args:
            audio_data: Processing format audio (16kHz mono float32)
        """
        try:
            # Convert to Discord format, then split into 20ms frames.
            pcm_bytes = audio.processing_to_discord(audio_data)
            frames = audio.split_into_frames(pcm_bytes)

            # The queue is unbounded, so put_nowait never blocks or raises
            # QueueFull; the lock is taken per frame to keep the reader
            # thread's wait minimal.
            for frame in frames:
                with self._lock:
                    self._queue.put_nowait(frame)

        except Exception as e:
            logger.error(f"Error writing audio: {e}", exc_info=True)

    async def finish(self) -> None:
        """Signal that no more audio will be written."""
        with self._lock:
            self._queue.put_nowait(None)

    def is_opus(self) -> bool:
        """We provide PCM, not opus."""
        return False

    @property
    def is_done(self) -> bool:
        """Check if playback is complete (end-of-stream sentinel was read)."""
        return self._is_done
|
|
|
|
|
|
class AudioBridge:
    """
    Manages audio flow between Discord and processing pipeline.

    Handles:
    - Per-user audio reception from Discord (delegated to AudioReceiver)
    - Audio callbacks to pipeline
    - TTS audio playback in Discord
    """

    def __init__(self, loop: asyncio.AbstractEventLoop):
        """
        Initialize audio bridge.

        Args:
            loop: Asyncio event loop
        """
        self.loop = loop
        # Currently playing source per guild ID.
        self._audio_sources: dict[int, PipelineAudioSource] = {}
        # Active receiver per guild ID.
        self._audio_receivers: dict[int, "AudioReceiver"] = {}  # type: ignore
        self._audio_callback: Optional[Callable[[int, int, bytes], None]] = None
        # Guards _audio_sources: the playback-finished callback is expected to
        # run on Discord's player thread, not on the event loop.
        self._sources_lock = threading.Lock()

    def set_audio_callback(
        self, callback: Callable[[int, int, bytes], None]
    ) -> None:
        """
        Set callback for received audio.

        The callback is handed to each AudioReceiver created by
        ``start_receiving``; receivers that are already running are not
        updated.

        Args:
            callback: Function(guild_id, user_id, pcm_data).
                NOTE(review): earlier documentation called this an async
                function while the annotation says it returns None - confirm
                against AudioReceiver which form is expected.
        """
        self._audio_callback = callback

    async def start_receiving(
        self, guild_id: int, voice_client: discord.VoiceClient
    ) -> None:
        """
        Start receiving audio from Discord voice channel.

        If a receiver is already registered for the guild it is stopped first
        so it is not leaked. Errors are logged, not raised.

        Args:
            guild_id: Discord guild ID
            voice_client: Connected voice client
        """
        try:
            # Imported lazily, as in the original code (presumably to avoid
            # an import cycle or optional dependency at module load).
            from .audio_receiver import AudioReceiver

            # Stop a previously registered receiver instead of silently
            # overwriting (and thereby orphaning) it.
            previous = self._audio_receivers.pop(guild_id, None)
            if previous:
                previous.stop()

            # Create and start audio receiver
            receiver = AudioReceiver(
                guild_id=guild_id,
                voice_client=voice_client,
                callback=self._audio_callback,
                loop=self.loop,
            )
            receiver.start()
            self._audio_receivers[guild_id] = receiver

            logger.info(f"Started receiving audio for guild {guild_id}")

        except Exception as e:
            logger.error(
                f"Error starting audio receiving for guild {guild_id}: {e}",
                exc_info=True,
            )

    async def stop_receiving(
        self,
        guild_id: int,
        voice_client: Optional[discord.VoiceClient] = None,
    ) -> None:
        """
        Stop receiving audio from Discord voice channel.

        Does nothing if no receiver is registered for the guild. Errors are
        logged, not raised.

        Args:
            guild_id: Discord guild ID
            voice_client: Connected voice client (optional, currently unused)
        """
        try:
            receiver = self._audio_receivers.pop(guild_id, None)
            if receiver:
                receiver.stop()
                logger.info(f"Stopped receiving audio for guild {guild_id}")
        except Exception as e:
            logger.error(
                f"Error stopping audio receiving for guild {guild_id}: {e}",
                exc_info=True,
            )

    async def play_audio(
        self,
        guild_id: int,
        voice_client: discord.VoiceClient,
        audio_data: np.ndarray,
    ) -> None:
        """
        Play TTS audio in Discord voice channel.

        Any audio currently playing on ``voice_client`` is stopped first.
        Errors are logged, not raised.

        Args:
            guild_id: Discord guild ID
            voice_client: Connected voice client
            audio_data: Processing format audio (16kHz mono float32)
        """
        try:
            # Stop any currently playing audio
            if voice_client.is_playing():
                voice_client.stop()

            # Create audio source and register it for this guild
            source = PipelineAudioSource()
            with self._sources_lock:
                self._audio_sources[guild_id] = source

            # Write audio data, then mark end-of-stream
            await source.write_audio(audio_data)
            await source.finish()

            # Start playback. The source is passed to the callback so that a
            # late callback from the playback stopped above cannot remove
            # this newer source's registration.
            voice_client.play(
                source,
                after=lambda error: self._playback_finished_callback(
                    guild_id, error, source
                ),
            )

            logger.info(
                f"Started playback for guild {guild_id} "
                f"({len(audio_data)} samples)"
            )

        except Exception as e:
            logger.error(
                f"Error playing audio for guild {guild_id}: {e}",
                exc_info=True,
            )

    async def stop_playback(
        self, guild_id: int, voice_client: discord.VoiceClient
    ) -> None:
        """
        Stop TTS playback (for barge-in).

        Args:
            guild_id: Discord guild ID
            voice_client: Connected voice client
        """
        if voice_client.is_playing():
            voice_client.stop()
            logger.info(f"Stopped playback for guild {guild_id} (barge-in)")

        # Clean up source
        with self._sources_lock:
            self._audio_sources.pop(guild_id, None)

    def _playback_finished_callback(
        self,
        guild_id: int,
        error: Optional[Exception],
        source: Optional[PipelineAudioSource] = None,
    ) -> None:
        """
        Called when playback finishes.

        Args:
            guild_id: Discord guild ID
            error: Exception reported by the player, or None on success
            source: The source whose playback finished. When given, the
                guild's registration is removed only if it still points at
                this source; when omitted, the entry is removed
                unconditionally (the previous behaviour).
        """
        if error:
            logger.error(f"Playback error for guild {guild_id}: {error}")
        else:
            logger.debug(f"Playback finished for guild {guild_id}")

        # Clean up source, without clobbering a newer playback's entry.
        with self._sources_lock:
            if source is None or self._audio_sources.get(guild_id) is source:
                self._audio_sources.pop(guild_id, None)

    async def cleanup(self) -> None:
        """Clean up all audio bridges."""
        logger.info("Cleaning up audio bridges")

        # Stop all receivers; one failing receiver must not prevent the
        # remaining ones from being stopped or the registries from clearing.
        for guild_id, receiver in list(self._audio_receivers.items()):
            try:
                receiver.stop()
            except Exception as e:
                logger.error(
                    f"Error stopping audio receiving for guild {guild_id}: {e}",
                    exc_info=True,
                )
        self._audio_receivers.clear()

        # Clear sources
        with self._sources_lock:
            self._audio_sources.clear()
|