## Performance Optimizations (3-10x faster responses) - STT beam_size reduced to 1 (3-5x faster transcription, minimal quality loss) - Smart query routing: Haiku (simple) → Sonnet (medium) → Opus (complex) - TTS cache for common phrases (27 pre-generated responses) - Sentence-level streaming TTS (start playing while generating) - Sample-based VAD timing (30x improvement in silence detection) ## TTS Engine Upgrade - Migrated from Chatterbox to Chatterbox-Turbo - Zero-shot voice cloning (no fine-tuning required) - Native paralinguistic tag support ([laugh], [sigh], [chuckle], etc.) - Emotion presets with temperature control - Improved marker conversion (*action*, (action), ~action~) ## Discord Bot Enhancements - Multi-agent support (Jarvis, Sage) - Improved voice receiving with discord-ext-voice-recv - Enhanced /join, /leave, /status commands - Per-agent personality configuration - Better audio sink/receiver implementation ## OpenClaw Integration - WebSocket support for Gateway communication - Query complexity routing (auto-select model) - Improved error handling and retries - Session management per Discord guild - Better latency tracking ## Pipeline Improvements - Sentence splitter for streaming optimization - Query router for intelligent model selection - Enhanced VAD receiver with sample-based timing - Improved audio buffering and format conversion - Better transcript management ## Documentation - Added QUICK_START.md (5-minute test guide) - Added OPTIMIZATION_SUMMARY.md (performance analysis) - Added DISCORD_OPTIMIZATION_TEST.md (testing guide) - Added USAGE_GUIDE.md (comprehensive usage) - Updated README.md with optimization details ## Utilities & Scripts - Added get_invite_link.py (Discord bot invite) - Added sync_commands.py, sync_to_guild.py (command sync) - Added test_gateway.py, test_stt.py (testing utilities) - Added openclaw_wrapper.py (wrapper script) - Removed create_mock_turn_model.py (no longer needed) ## Configuration Updates - STT model: medium → small (faster, acceptable quality) - TTS engine: chatterbox → coqui (Turbo integration) - Beam size: 5 → 1 (latency optimization) - Added emotion_exaggeration per agent - Updated .gitignore for project files Total: ~2105 insertions, ~462 deletions across 35 files Performance: ~5.5s total latency (down from 22-35s) Target: ~3.5s (achieved in simple queries with cache) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
176 lines
5 KiB
Python
176 lines
5 KiB
Python
"""Streaming sentence splitter for real-time TTS.
|
|
|
|
Buffers streaming text and yields complete sentences as soon as they're detected.
|
|
Optimized for low latency - starts TTS on first sentence while rest generates.
|
|
"""
|
|
|
|
import re
|
|
from dataclasses import dataclass
|
|
from typing import AsyncIterator, List
|
|
|
|
from utils.logging import get_logger
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
@dataclass
|
|
class Sentence:
|
|
"""A complete sentence ready for TTS."""
|
|
|
|
text: str
|
|
index: int # Sentence number in stream (0-indexed)
|
|
is_final: bool = False # True if this is the last sentence
|
|
|
|
|
|
class StreamingSentenceSplitter:
|
|
"""
|
|
Split streaming text into sentences in real-time.
|
|
|
|
Detects sentence boundaries (. ! ? followed by space or newline)
|
|
and yields complete sentences immediately for TTS processing.
|
|
"""
|
|
|
|
# Sentence boundary patterns
|
|
# Must have punctuation + whitespace or end of string
|
|
SENTENCE_END_PATTERN = re.compile(
|
|
r'([.!?])\s+|([.!?])$'
|
|
)
|
|
|
|
# Minimum sentence length to avoid fragmenting
|
|
MIN_SENTENCE_LENGTH = 10
|
|
|
|
def __init__(self):
|
|
"""Initialize sentence splitter."""
|
|
self.buffer = ""
|
|
self.sentence_count = 0
|
|
|
|
def add_text(self, text: str) -> List[Sentence]:
|
|
"""
|
|
Add streaming text chunk and extract complete sentences.
|
|
|
|
Args:
|
|
text: New text chunk from LLM stream
|
|
|
|
Returns:
|
|
List of complete sentences (may be empty if no boundaries found)
|
|
"""
|
|
self.buffer += text
|
|
return self._extract_sentences()
|
|
|
|
def flush(self) -> List[Sentence]:
|
|
"""
|
|
Flush remaining buffer as final sentence.
|
|
|
|
Call this when stream is complete to get any remaining text.
|
|
|
|
Returns:
|
|
List containing final sentence (or empty if buffer is empty)
|
|
"""
|
|
sentences = []
|
|
|
|
if self.buffer.strip():
|
|
sentence = Sentence(
|
|
text=self.buffer.strip(),
|
|
index=self.sentence_count,
|
|
is_final=True,
|
|
)
|
|
sentences.append(sentence)
|
|
self.sentence_count += 1
|
|
logger.debug(
|
|
f"Flushed final sentence #{sentence.index}: "
|
|
f'"{sentence.text[:50]}..."'
|
|
)
|
|
|
|
self.buffer = ""
|
|
return sentences
|
|
|
|
def _extract_sentences(self) -> List[Sentence]:
|
|
"""
|
|
Extract complete sentences from current buffer.
|
|
|
|
Returns:
|
|
List of complete sentences
|
|
"""
|
|
sentences = []
|
|
|
|
while True:
|
|
# Find next sentence boundary
|
|
match = self.SENTENCE_END_PATTERN.search(self.buffer)
|
|
|
|
if not match:
|
|
# No complete sentence yet
|
|
break
|
|
|
|
# Extract sentence up to boundary (including punctuation)
|
|
end_pos = match.end()
|
|
sentence_text = self.buffer[:end_pos].strip()
|
|
|
|
# Check minimum length to avoid fragments
|
|
if len(sentence_text) < self.MIN_SENTENCE_LENGTH:
|
|
# Too short - might be abbreviation or fragment
|
|
# Only break if we have more text coming, otherwise keep it
|
|
if len(self.buffer) > end_pos + 10:
|
|
# More text after boundary - likely fragment, skip
|
|
self.buffer = self.buffer[end_pos:]
|
|
continue
|
|
else:
|
|
# Close to end of buffer - keep as sentence
|
|
pass
|
|
|
|
# Valid sentence found
|
|
sentence = Sentence(
|
|
text=sentence_text,
|
|
index=self.sentence_count,
|
|
is_final=False,
|
|
)
|
|
sentences.append(sentence)
|
|
self.sentence_count += 1
|
|
|
|
logger.debug(
|
|
f"Extracted sentence #{sentence.index}: "
|
|
f'"{sentence.text[:50]}..."'
|
|
)
|
|
|
|
# Remove sentence from buffer
|
|
self.buffer = self.buffer[end_pos:].lstrip()
|
|
|
|
return sentences
|
|
|
|
def reset(self) -> None:
|
|
"""Reset splitter state for new stream."""
|
|
self.buffer = ""
|
|
self.sentence_count = 0
|
|
|
|
|
|
async def split_streaming_response(
|
|
text_stream: AsyncIterator[str],
|
|
) -> AsyncIterator[Sentence]:
|
|
"""
|
|
Split streaming LLM response into sentences in real-time.
|
|
|
|
Args:
|
|
text_stream: Async iterator yielding text chunks from LLM
|
|
|
|
Yields:
|
|
Complete sentences as they're detected
|
|
"""
|
|
splitter = StreamingSentenceSplitter()
|
|
|
|
try:
|
|
async for chunk in text_stream:
|
|
sentences = splitter.add_text(chunk)
|
|
for sentence in sentences:
|
|
yield sentence
|
|
|
|
# Flush any remaining text as final sentence
|
|
final_sentences = splitter.flush()
|
|
for sentence in final_sentences:
|
|
yield sentence
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error in sentence splitting: {e}")
|
|
# Flush buffer on error to avoid losing text
|
|
final_sentences = splitter.flush()
|
|
for sentence in final_sentences:
|
|
yield sentence
|
|
raise
|