openclaw-voice/pipeline/turn_detector.py
MCKRUZ 3de8228c7c Initial commit: Jarvis Voice Bot - Complete Implementation
Complete 14-phase implementation of AI-powered Discord voice bot:

Features:
- Passive voice listening with Smart Turn v3 detection
- GPU-accelerated STT (faster-whisper) and TTS (Chatterbox)
- Intelligent two-tier relevance filtering
- Rolling conversation context management
- Multi-agent support (Jarvis, Sage)
- OpenAI-compatible TTS/STT API endpoints
- Barge-in support and concurrent user handling

Architecture:
- Discord.py voice integration
- Silero VAD for speech detection
- Pipecat Smart Turn v3 for turn completion
- OpenClaw API client (stubbed for integration)
- FastAPI server with health monitoring

Testing:
- 318 tests passing (100% coverage of major components)
- Unit tests for all modules
- Integration tests for end-to-end flows
- Memory leak prevention tests

Documentation:
- Comprehensive README with installation guide
- Troubleshooting guide and performance metrics
- Production deployment checklist
- Environment configuration templates

Status: 14/14 phases complete (100%)
Production Ready: Yes (after stub replacements)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-13 12:35:03 -05:00

441 lines
14 KiB
Python

"""Smart Turn v3 integration for turn completion detection.
Uses Pipecat AI's Smart Turn v3 model to determine if a speaker has
finished their turn or is just pausing.
"""
import asyncio
from pathlib import Path
from typing import Optional
import numpy as np
import onnxruntime as ort
from utils.config import get_models_dir
from utils.logging import get_logger, log_latency
logger = get_logger(__name__)
class SmartTurnDetector:
"""
Smart Turn v3 turn completion detector.
Determines if a speaker has finished their turn based on audio analysis.
Uses an ONNX model that expects exactly 8 seconds of 16kHz audio.
"""
# Model details
MODEL_SAMPLE_RATE = 16000
MODEL_DURATION = 8.0 # seconds
MODEL_SAMPLES = int(MODEL_SAMPLE_RATE * MODEL_DURATION) # 128,000 samples
def __init__(
self,
model_path: Optional[Path] = None,
threshold: float = 0.7,
device: str = "cpu",
):
"""
Initialize Smart Turn detector.
Args:
model_path: Path to ONNX model file (None = auto-download)
threshold: Turn completion threshold (0.0-1.0)
device: Device to run on ('cpu' or 'cuda')
"""
self.threshold = threshold
self.device = device
# Determine model path
if model_path is None:
models_dir = get_models_dir()
model_path = models_dir / "smart_turn_v3.onnx"
self.model_path = model_path
# Load model
self.session = None
self._load_model()
def _load_model(self) -> None:
"""Load ONNX model."""
try:
# Download if not exists
if not self.model_path.exists():
logger.info(f"Smart Turn model not found at {self.model_path}")
logger.info("Attempting to download from HuggingFace...")
self._download_model()
logger.info(f"Loading Smart Turn model from {self.model_path}")
# Configure ONNX runtime
providers = []
if self.device == "cuda":
providers.append("CUDAExecutionProvider")
providers.append("CPUExecutionProvider")
# Create inference session
self.session = ort.InferenceSession(
str(self.model_path),
providers=providers,
)
# Get model info
input_name = self.session.get_inputs()[0].name
output_name = self.session.get_outputs()[0].name
logger.info(
f"Smart Turn model loaded successfully "
f"(input: {input_name}, output: {output_name})"
)
except Exception as e:
logger.error(f"Failed to load Smart Turn model: {e}")
raise
def _download_model(self) -> None:
"""
Download Smart Turn v3 model from HuggingFace.
Note: This is a placeholder. In production, you would use huggingface_hub
to download the model automatically.
"""
try:
from huggingface_hub import hf_hub_download
logger.info("Downloading Smart Turn v3 from HuggingFace...")
# Download model
downloaded_path = hf_hub_download(
repo_id="pipecat-ai/smart-turn-v3",
filename="model.onnx",
cache_dir=get_models_dir(),
)
# Copy to expected location
import shutil
shutil.copy(downloaded_path, self.model_path)
logger.info(f"Model downloaded to {self.model_path}")
except ImportError:
logger.error(
"huggingface_hub not installed. "
"Install with: pip install huggingface_hub"
)
logger.error(
f"Please manually download the model from "
f"https://huggingface.co/pipecat-ai/smart-turn-v3 "
f"and place it at {self.model_path}"
)
raise
except Exception as e:
logger.error(f"Failed to download model: {e}")
logger.error(
f"Please manually download from "
f"https://huggingface.co/pipecat-ai/smart-turn-v3"
)
raise
def prepare_audio(self, audio: np.ndarray) -> np.ndarray:
"""
Prepare audio for Smart Turn model.
Model expects exactly 8 seconds (128,000 samples) of 16kHz mono audio.
- If audio is shorter: zero-pad at the beginning
- If audio is longer: truncate from the beginning (keep most recent)
Args:
audio: Audio array (float32, mono, 16kHz)
Returns:
Prepared audio (exactly 128,000 samples)
"""
if audio.dtype != np.float32:
raise ValueError(f"Expected float32 audio, got {audio.dtype}")
current_samples = len(audio)
if current_samples > self.MODEL_SAMPLES:
# Too long - keep most recent 8 seconds
audio = audio[-self.MODEL_SAMPLES :]
elif current_samples < self.MODEL_SAMPLES:
# Too short - zero-pad at beginning
padding = np.zeros(
self.MODEL_SAMPLES - current_samples, dtype=np.float32
)
audio = np.concatenate([padding, audio])
return audio
def detect(self, audio: np.ndarray) -> tuple[bool, float]:
"""
Detect if turn is complete.
Args:
audio: Audio to analyze (float32, mono, 16kHz, any length)
Returns:
Tuple of (is_complete, confidence)
- is_complete: True if turn completion confidence >= threshold
- confidence: Turn completion probability (0.0-1.0)
"""
if self.session is None:
raise RuntimeError("Model not loaded")
with log_latency(logger, "turn_detection"):
# Prepare audio (pad/truncate to 8 seconds)
prepared_audio = self.prepare_audio(audio)
# Reshape for model: [1, num_samples]
input_tensor = prepared_audio.reshape(1, -1).astype(np.float32)
# Run inference
input_name = self.session.get_inputs()[0].name
output_name = self.session.get_outputs()[0].name
outputs = self.session.run(
[output_name],
{input_name: input_tensor},
)
# Extract probability (handle various output shapes)
output = outputs[0]
if isinstance(output, np.ndarray):
probability = float(output.flatten()[0])
else:
probability = float(output)
# Clamp to [0, 1]
probability = max(0.0, min(1.0, probability))
# Determine completion
is_complete = probability >= self.threshold
logger.debug(
f"Turn detection: probability={probability:.3f}, "
f"threshold={self.threshold:.3f}, "
f"complete={is_complete}"
)
return is_complete, probability
async def detect_async(self, audio: np.ndarray) -> tuple[bool, float]:
"""
Async wrapper for detect().
Args:
audio: Audio to analyze
Returns:
Tuple of (is_complete, confidence)
"""
# Run in executor to avoid blocking
loop = asyncio.get_event_loop()
return await loop.run_in_executor(None, self.detect, audio)
def set_threshold(self, threshold: float) -> None:
"""
Update turn completion threshold.
Args:
threshold: New threshold (0.0-1.0)
"""
if not 0.0 <= threshold <= 1.0:
raise ValueError(f"Threshold must be in [0, 1], got {threshold}")
old_threshold = self.threshold
self.threshold = threshold
logger.info(
f"Turn completion threshold updated: {old_threshold:.2f}{threshold:.2f}"
)
def get_model_info(self) -> dict:
"""
Get model information.
Returns:
Dictionary with model details
"""
if self.session is None:
return {"loaded": False}
return {
"loaded": True,
"path": str(self.model_path),
"threshold": self.threshold,
"sample_rate": self.MODEL_SAMPLE_RATE,
"duration": self.MODEL_DURATION,
"samples": self.MODEL_SAMPLES,
"device": self.device,
}
class TurnDetectionManager:
"""
Manages turn detection with waiting and timeout logic.
Handles the scenario where a user pauses mid-utterance:
1. VAD detects silence
2. Check turn completion
3. If incomplete: wait for more speech (up to max_wait)
4. If complete OR timeout: proceed to transcription
"""
def __init__(
self,
detector: SmartTurnDetector,
max_wait: float = 3.0,
check_interval: float = 0.1,
):
"""
Initialize turn detection manager.
Args:
detector: SmartTurnDetector instance
max_wait: Maximum time to wait for turn completion (seconds)
check_interval: How often to check for new audio (seconds)
"""
self.detector = detector
self.max_wait = max_wait
self.check_interval = check_interval
# State for waiting
self._waiting_tasks: dict[int, asyncio.Task] = {}
async def check_turn_complete(
self,
user_id: int,
audio: np.ndarray,
audio_callback: Optional[callable] = None,
) -> tuple[bool, float, bool]:
"""
Check if turn is complete, potentially waiting for more speech.
Args:
user_id: User ID
audio: Current audio accumulation
audio_callback: Async callback to get updated audio (returns np.ndarray)
Returns:
Tuple of (is_complete, confidence, timed_out)
- is_complete: True if turn complete or timed out
- confidence: Turn completion probability
- timed_out: True if max_wait exceeded
"""
# Check turn completion
is_complete, confidence = await self.detector.detect_async(audio)
if is_complete:
logger.debug(
f"User {user_id} turn complete "
f"(confidence: {confidence:.3f})"
)
return True, confidence, False
# Turn not complete - wait for more speech (if callback provided)
if audio_callback is None:
# No way to get more audio, consider complete
logger.debug(
f"User {user_id} turn incomplete "
f"(confidence: {confidence:.3f}) but no callback, proceeding"
)
return True, confidence, False
# Wait for more speech
logger.debug(
f"User {user_id} turn incomplete "
f"(confidence: {confidence:.3f}), waiting up to {self.max_wait}s"
)
start_time = asyncio.get_event_loop().time()
while True:
# Check timeout
elapsed = asyncio.get_event_loop().time() - start_time
if elapsed >= self.max_wait:
logger.debug(
f"User {user_id} max wait exceeded ({elapsed:.1f}s), "
f"forcing completion"
)
return True, confidence, True
# Wait for new audio
await asyncio.sleep(self.check_interval)
# Get updated audio
try:
updated_audio = await audio_callback()
if updated_audio is None or len(updated_audio) == len(audio):
# No new audio yet
continue
# New audio available - check turn completion again
audio = updated_audio
is_complete, confidence = await self.detector.detect_async(audio)
if is_complete:
logger.debug(
f"User {user_id} turn complete after waiting "
f"(confidence: {confidence:.3f}, elapsed: {elapsed:.1f}s)"
)
return True, confidence, False
except Exception as e:
logger.error(f"Error getting updated audio: {e}")
# On error, proceed with what we have
return True, confidence, True
def cancel_waiting(self, user_id: int) -> None:
"""
Cancel waiting for a user (e.g., if they leave or speak again).
Args:
user_id: User ID
"""
if user_id in self._waiting_tasks:
task = self._waiting_tasks.pop(user_id)
task.cancel()
logger.debug(f"Cancelled turn detection wait for user {user_id}")
def cancel_all(self) -> None:
"""Cancel all waiting tasks."""
for user_id in list(self._waiting_tasks.keys()):
self.cancel_waiting(user_id)
logger.debug("Cancelled all turn detection waits")
# Convenience function for basic usage
async def create_turn_detector(
model_path: Optional[Path] = None,
threshold: float = 0.7,
max_wait: float = 3.0,
) -> TurnDetectionManager:
"""
Create a turn detector with default settings.
Args:
model_path: Path to model (None = auto-download)
threshold: Turn completion threshold
max_wait: Maximum wait time
Returns:
TurnDetectionManager instance
"""
detector = SmartTurnDetector(
model_path=model_path,
threshold=threshold,
)
manager = TurnDetectionManager(
detector=detector,
max_wait=max_wait,
)
return manager