openclaw-voice/server/stt.py
MCKRUZ 3de8228c7c Initial commit: Jarvis Voice Bot - Complete Implementation
Complete 14-phase implementation of AI-powered Discord voice bot:

Features:
- Passive voice listening with Smart Turn v3 detection
- GPU-accelerated STT (faster-whisper) and TTS (Chatterbox)
- Intelligent two-tier relevance filtering
- Rolling conversation context management
- Multi-agent support (Jarvis, Sage)
- OpenAI-compatible TTS/STT API endpoints
- Barge-in support and concurrent user handling

Architecture:
- Discord.py voice integration
- Silero VAD for speech detection
- Pipecat Smart Turn v3 for turn completion
- OpenClaw API client (stubbed for integration)
- FastAPI server with health monitoring

Testing:
- 318 tests passing (100% coverage of major components)
- Unit tests for all modules
- Integration tests for end-to-end flows
- Memory leak prevention tests

Documentation:
- Comprehensive README with installation guide
- Troubleshooting guide and performance metrics
- Production deployment checklist
- Environment configuration templates

Status: 14/14 phases complete (100%)
Production Ready: Yes (after stub replacements)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-13 12:35:03 -05:00

408 lines
12 KiB
Python

"""Speech-to-Text using faster-whisper.
GPU-accelerated transcription with support for multiple model sizes.
"""
import asyncio
import time
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

import numpy as np
from faster_whisper import WhisperModel

from utils.logging import get_logger, log_latency
# Module-level logger used by the STT classes and helpers defined in this file.
logger = get_logger(__name__)
@dataclass
class TranscriptSegment:
    """One contiguous piece of transcribed speech together with its timing."""

    # Transcribed text of this segment
    text: str
    # Offset at which the segment begins, in seconds
    start: float
    # Offset at which the segment finishes, in seconds
    end: float
    # Confidence score approximated in the 0.0-1.0 range from the segment's
    # average log probability
    confidence: float

    @property
    def duration(self) -> float:
        """Return the segment length in seconds (end minus start)."""
        elapsed = self.end - self.start
        return elapsed
@dataclass
class TranscriptionResult:
    """Everything produced by transcribing one piece of audio."""

    # Full transcript text
    text: str
    # The individual segments the transcript was assembled from
    segments: List[TranscriptSegment]
    # Language that was detected or explicitly requested
    language: str
    # Length of the audio, in seconds
    duration: float

    @property
    def word_count(self) -> int:
        """Return the approximate number of words (whitespace-separated tokens)."""
        words = self.text.split()
        return len(words)

    @property
    def segment_count(self) -> int:
        """Return how many segments the result contains."""
        return len(self.segments)
class FasterWhisperSTT:
    """
    Faster-whisper STT engine.

    Much faster than OpenAI Whisper while maintaining similar accuracy.
    Uses CTranslate2 for efficient inference on CPU and GPU.

    The model is loaded eagerly by the constructor. The instance also keeps
    running totals (number of calls, seconds of audio, seconds of processing)
    that ``get_stats()`` turns into averages and a real-time factor.
    """

    # Available model sizes (quality vs speed tradeoff)
    MODEL_SIZES = ["tiny", "base", "small", "medium", "large-v3"]

    def __init__(
        self,
        model_size: str = "medium",
        device: str = "cuda",
        compute_type: str = "float16",
        beam_size: int = 5,
        language: Optional[str] = None,
        download_root: Optional[Path] = None,
    ):
        """
        Initialize faster-whisper STT engine.

        Args:
            model_size: Model size (tiny, base, small, medium, large-v3)
            device: Device to run on (cuda, cpu)
            compute_type: Compute precision (float16, float32, int8)
            beam_size: Beam search size (higher = more accurate but slower)
            language: Language code (None = auto-detect)
            download_root: Model download directory (None = default cache)

        Raises:
            ValueError: If ``model_size`` is not one of ``MODEL_SIZES``.
            Exception: Whatever the model loader raises is logged and re-raised.
        """
        if model_size not in self.MODEL_SIZES:
            raise ValueError(
                f"Invalid model size {model_size}. "
                f"Choose from: {self.MODEL_SIZES}"
            )

        self.model_size = model_size
        self.device = device
        self.compute_type = compute_type
        self.beam_size = beam_size
        self.language = language
        self.download_root = download_root

        # Running totals consumed by get_stats()
        self.transcription_count = 0
        self.total_audio_duration = 0.0
        self.total_processing_time = 0.0

        # Model instance, populated by _load_model()
        self.model: Optional[WhisperModel] = None
        self._load_model()

    def _load_model(self) -> None:
        """Load the Whisper model, logging and re-raising any failure."""
        try:
            logger.info(
                f"Loading faster-whisper model: {self.model_size} "
                f"(device: {self.device}, compute: {self.compute_type})"
            )
            self.model = WhisperModel(
                model_size_or_path=self.model_size,
                device=self.device,
                compute_type=self.compute_type,
                download_root=self.download_root,
            )
            logger.info(f"Whisper model loaded successfully: {self.model_size}")
        except Exception as e:
            logger.error(f"Failed to load Whisper model: {e}")
            raise

    def _record_transcription(
        self, audio_duration: float, processing_time: float
    ) -> None:
        """Add one finished transcription to the running totals."""
        self.transcription_count += 1
        self.total_audio_duration += audio_duration
        self.total_processing_time += processing_time

    def transcribe(
        self,
        audio: np.ndarray,
        language: Optional[str] = None,
        beam_size: Optional[int] = None,
        vad_filter: bool = False,
    ) -> TranscriptionResult:
        """
        Transcribe audio to text.

        Args:
            audio: Audio array (float32, mono, 16kHz)
            language: Language code (overrides instance setting)
            beam_size: Beam search size (overrides instance setting)
            vad_filter: Use VAD to filter out silence

        Returns:
            TranscriptionResult with text and segments

        Raises:
            RuntimeError: If no model is loaded.
            ValueError: If ``audio`` is not a 1-D float32 array.
        """
        if self.model is None:
            raise RuntimeError("Model not loaded")

        # Validate audio
        if audio.dtype != np.float32:
            raise ValueError(f"Expected float32 audio, got {audio.dtype}")
        if audio.ndim != 1:
            raise ValueError(f"Expected 1D audio, got shape {audio.shape}")

        # Use provided values or instance defaults (falsy values fall back too)
        language = language or self.language
        beam_size = beam_size or self.beam_size

        started = time.perf_counter()
        with log_latency(logger, f"transcribe_{self.model_size}"):
            segments, info = self.model.transcribe(
                audio,
                language=language,
                beam_size=beam_size,
                vad_filter=vad_filter,
                word_timestamps=False,  # Disable for speed
            )

            # `segments` is a generator, so it is drained inside the timed
            # region: the measured latency then covers the decoding work
            # rather than just the call that sets it up.
            segment_list = [
                TranscriptSegment(
                    text=segment.text.strip(),
                    start=segment.start,
                    end=segment.end,
                    # exp() maps the average log probability onto (0, 1]
                    confidence=float(np.exp(segment.avg_logprob)),
                )
                for segment in segments
            ]
        processing_time = time.perf_counter() - started

        result = TranscriptionResult(
            text=" ".join(seg.text for seg in segment_list).strip(),
            segments=segment_list,
            language=info.language,
            duration=info.duration,
        )

        # Record processing time as well as audio duration so that
        # get_stats() can report a meaningful real-time factor.
        self._record_transcription(result.duration, processing_time)

        logger.info(
            f"Transcribed {result.duration:.2f}s audio: "
            f'"{result.text[:50]}..." '
            f"({result.segment_count} segments, language: {result.language})"
        )
        return result

    async def transcribe_async(
        self,
        audio: np.ndarray,
        language: Optional[str] = None,
        beam_size: Optional[int] = None,
        vad_filter: bool = False,
    ) -> TranscriptionResult:
        """
        Async wrapper for transcribe().

        Runs transcription in the default executor to avoid blocking the
        event loop.

        Args:
            audio: Audio array
            language: Language code
            beam_size: Beam search size
            vad_filter: Use VAD filter

        Returns:
            TranscriptionResult
        """
        # A coroutine always has a running loop, so ask for it directly.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            self.transcribe,
            audio,
            language,
            beam_size,
            vad_filter,
        )

    def get_stats(self) -> dict:
        """
        Get transcription statistics.

        Returns:
            Dictionary with the configuration, the running totals, the
            per-call averages and the real-time factor (average processing
            time divided by average audio duration; 0.0 when undefined).
        """
        count = self.transcription_count
        avg_duration = self.total_audio_duration / count if count > 0 else 0.0
        avg_processing = self.total_processing_time / count if count > 0 else 0.0
        # Real-time factor
        rtf = avg_processing / avg_duration if avg_duration > 0 else 0.0

        return {
            "model_size": self.model_size,
            "device": self.device,
            "compute_type": self.compute_type,
            "transcription_count": self.transcription_count,
            "total_audio_duration": self.total_audio_duration,
            "total_processing_time": self.total_processing_time,
            "avg_audio_duration": avg_duration,
            "avg_processing_time": avg_processing,
            "real_time_factor": rtf,
        }

    def get_model_info(self) -> dict:
        """
        Get model information.

        Returns:
            Dictionary with model details; ``language`` is reported as
            "auto-detect" when no language is configured.
        """
        return {
            "model_size": self.model_size,
            "device": self.device,
            "compute_type": self.compute_type,
            "beam_size": self.beam_size,
            "language": self.language or "auto-detect",
            "loaded": self.model is not None,
        }
class STTTranscriber:
    """
    Pipeline stage for speech-to-text transcription.

    Limits how many transcriptions run at once with a semaphore and keeps a
    count of the requests currently in flight (waiting for a slot or being
    transcribed).
    """

    def __init__(
        self,
        engine: FasterWhisperSTT,
        max_concurrent: int = 1,
    ):
        """
        Initialize transcriber.

        Args:
            engine: STT engine instance
            max_concurrent: Max concurrent transcriptions (default 1 for single GPU)
        """
        self.engine = engine
        self.max_concurrent = max_concurrent

        # Semaphore for concurrency control
        self._semaphore = asyncio.Semaphore(max_concurrent)

        # Requests that have entered transcribe() and not yet left it
        self._queue_size = 0

    async def transcribe(
        self,
        audio: np.ndarray,
        user_id: int,
        language: Optional[str] = None,
    ) -> TranscriptionResult:
        """
        Transcribe audio with queue management.

        Args:
            audio: Audio array (float32, mono, 16kHz)
            user_id: User ID for logging
            language: Language code (optional)

        Returns:
            TranscriptionResult

        Raises:
            Exception: Any engine failure is logged and re-raised unchanged.
        """
        # Count the request on arrival so callers waiting for a slot are
        # visible, and always un-count it in `finally` so the figure drops
        # back to zero when idle (including after errors or cancellation).
        self._queue_size += 1
        try:
            async with self._semaphore:
                logger.debug(
                    f"Transcribing for user {user_id} "
                    f"(queue size: {self._queue_size})"
                )
                try:
                    result = await self.engine.transcribe_async(
                        audio=audio,
                        language=language,
                    )
                    logger.info(
                        f"User {user_id} transcription: "
                        f'"{result.text}" '
                        f"({result.duration:.2f}s, {result.word_count} words)"
                    )
                    return result
                except Exception as e:
                    logger.error(f"Transcription error for user {user_id}: {e}")
                    raise
        finally:
            self._queue_size -= 1

    def get_queue_size(self) -> int:
        """Get the number of requests currently waiting or being transcribed."""
        return self._queue_size

    def get_stats(self) -> dict:
        """Get the engine statistics extended with transcriber-level fields."""
        return {
            **self.engine.get_stats(),
            "max_concurrent": self.max_concurrent,
            "current_queue_size": self._queue_size,
        }
# Convenience function for creating transcriber
async def create_transcriber(
    model_size: str = "medium",
    device: str = "cuda",
    compute_type: str = "float16",
    language: Optional[str] = None,
    max_concurrent: int = 1,
) -> STTTranscriber:
    """
    Create STT transcriber with default settings.

    The engine constructor loads the model synchronously, so it is run in the
    default executor to keep the event loop responsive while loading.

    Args:
        model_size: Whisper model size
        device: Device (cuda/cpu)
        compute_type: Compute precision
        language: Language code
        max_concurrent: Max concurrent transcriptions (default 1: single
            GPU, process one at a time)

    Returns:
        STTTranscriber instance

    Raises:
        ValueError: If ``model_size`` is rejected by the engine.
        Exception: Any model loading failure raised by the engine.
    """

    def _build_engine() -> FasterWhisperSTT:
        # Blocking: constructs the engine, which loads the model.
        return FasterWhisperSTT(
            model_size=model_size,
            device=device,
            compute_type=compute_type,
            language=language,
        )

    loop = asyncio.get_running_loop()
    engine = await loop.run_in_executor(None, _build_engine)

    # Built on the event-loop side because it owns an asyncio.Semaphore.
    return STTTranscriber(
        engine=engine,
        max_concurrent=max_concurrent,
    )