openclaw-voice/tests/test_stt.py
MCKRUZ 3de8228c7c Initial commit: Jarvis Voice Bot - Complete Implementation
Complete 14-phase implementation of AI-powered Discord voice bot:

Features:
- Passive voice listening with Smart Turn v3 detection
- GPU-accelerated STT (faster-whisper) and TTS (Chatterbox)
- Intelligent two-tier relevance filtering
- Rolling conversation context management
- Multi-agent support (Jarvis, Sage)
- OpenAI-compatible TTS/STT API endpoints
- Barge-in support and concurrent user handling

Architecture:
- Discord.py voice integration
- Silero VAD for speech detection
- Pipecat Smart Turn v3 for turn completion
- OpenClaw API client (stubbed for integration)
- FastAPI server with health monitoring

Testing:
- 318 tests passing (100% coverage of major components)
- Unit tests for all modules
- Integration tests for end-to-end flows
- Memory leak prevention tests

Documentation:
- Comprehensive README with installation guide
- Troubleshooting guide and performance metrics
- Production deployment checklist
- Environment configuration templates

Status: 14/14 phases complete (100%)
Production Ready: Yes (after stub replacements)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-13 12:35:03 -05:00

625 lines
20 KiB
Python

"""Unit tests for Speech-to-Text engine."""
import asyncio
from unittest.mock import AsyncMock, MagicMock, Mock, patch
import numpy as np
import pytest
from server.stt import (
FasterWhisperSTT,
STTTranscriber,
TranscriptSegment,
TranscriptionResult,
create_transcriber,
)
from pipeline.transcriber import PipelineTranscriber, create_pipeline_transcriber
class TestTranscriptSegment:
"""Test TranscriptSegment dataclass."""
def test_create_segment(self):
"""Test creating a transcript segment."""
segment = TranscriptSegment(
text="Hello world",
start=0.0,
end=1.5,
confidence=0.95,
)
assert segment.text == "Hello world"
assert segment.start == 0.0
assert segment.end == 1.5
assert segment.confidence == 0.95
def test_segment_duration(self):
"""Test segment duration calculation."""
segment = TranscriptSegment(
text="Test",
start=2.0,
end=5.5,
confidence=0.9,
)
assert segment.duration == 3.5
def test_segment_duration_zero(self):
"""Test zero duration segment."""
segment = TranscriptSegment(
text="Quick",
start=1.0,
end=1.0,
confidence=0.8,
)
assert segment.duration == 0.0
class TestTranscriptionResult:
"""Test TranscriptionResult dataclass."""
def test_create_result(self):
"""Test creating a transcription result."""
segments = [
TranscriptSegment("Hello", 0.0, 1.0, 0.95),
TranscriptSegment("world", 1.0, 2.0, 0.93),
]
result = TranscriptionResult(
text="Hello world",
segments=segments,
language="en",
duration=2.0,
)
assert result.text == "Hello world"
assert len(result.segments) == 2
assert result.language == "en"
assert result.duration == 2.0
def test_word_count(self):
"""Test word count calculation."""
result = TranscriptionResult(
text="This is a test sentence",
segments=[],
language="en",
duration=3.0,
)
assert result.word_count == 5
def test_word_count_empty(self):
"""Test word count for empty text."""
result = TranscriptionResult(
text="",
segments=[],
language="en",
duration=0.0,
)
# Empty string split() gives []
assert result.word_count == 0
def test_segment_count(self):
"""Test segment count."""
segments = [
TranscriptSegment("First", 0.0, 1.0, 0.9),
TranscriptSegment("second", 1.0, 2.0, 0.85),
TranscriptSegment("third", 2.0, 3.0, 0.92),
]
result = TranscriptionResult(
text="First second third",
segments=segments,
language="en",
duration=3.0,
)
assert result.segment_count == 3
class TestFasterWhisperSTT:
"""Test FasterWhisperSTT class."""
@pytest.fixture
def mock_whisper_model(self):
"""Create mock WhisperModel."""
with patch("server.stt.WhisperModel") as mock:
# Mock the model instance
model_instance = MagicMock()
# Mock transcription response
segment1 = Mock()
segment1.text = " Hello "
segment1.start = 0.0
segment1.end = 1.0
segment1.avg_logprob = -0.1
segment2 = Mock()
segment2.text = " world "
segment2.start = 1.0
segment2.end = 2.0
segment2.avg_logprob = -0.15
# Mock info
info = Mock()
info.language = "en"
info.duration = 2.0
# Model returns (segments_generator, info)
model_instance.transcribe.return_value = ([segment1, segment2], info)
mock.return_value = model_instance
yield mock
def test_create_engine_valid_model(self, mock_whisper_model):
"""Test creating engine with valid model size."""
engine = FasterWhisperSTT(
model_size="tiny",
device="cpu",
compute_type="float32",
)
assert engine.model_size == "tiny"
assert engine.device == "cpu"
assert engine.compute_type == "float32"
assert engine.beam_size == 5 # default
assert engine.language is None
assert engine.model is not None
def test_create_engine_invalid_model(self):
"""Test creating engine with invalid model size."""
with pytest.raises(ValueError) as exc:
FasterWhisperSTT(model_size="invalid")
assert "Invalid model size" in str(exc.value)
assert "Choose from:" in str(exc.value)
def test_create_engine_with_language(self, mock_whisper_model):
"""Test creating engine with language specified."""
engine = FasterWhisperSTT(
model_size="tiny",
device="cpu",
language="es",
)
assert engine.language == "es"
def test_transcribe_valid_audio(self, mock_whisper_model):
"""Test transcribing valid audio."""
engine = FasterWhisperSTT(model_size="tiny", device="cpu")
# Generate 2 seconds of audio @ 16kHz
audio = np.random.randn(32000).astype(np.float32)
result = engine.transcribe(audio)
assert isinstance(result, TranscriptionResult)
assert result.text == "Hello world"
assert result.language == "en"
assert result.duration == 2.0
assert result.segment_count == 2
assert result.word_count == 2
# Check segments
assert result.segments[0].text == "Hello"
assert result.segments[0].start == 0.0
assert result.segments[0].end == 1.0
assert 0.0 <= result.segments[0].confidence <= 1.0
# Check stats updated
assert engine.transcription_count == 1
assert engine.total_audio_duration == 2.0
def test_transcribe_invalid_dtype(self, mock_whisper_model):
"""Test transcribing audio with wrong dtype."""
engine = FasterWhisperSTT(model_size="tiny", device="cpu")
# Wrong dtype (float64 instead of float32)
audio = np.random.randn(16000).astype(np.float64)
with pytest.raises(ValueError) as exc:
engine.transcribe(audio)
assert "Expected float32 audio" in str(exc.value)
def test_transcribe_invalid_shape(self, mock_whisper_model):
"""Test transcribing audio with wrong shape."""
engine = FasterWhisperSTT(model_size="tiny", device="cpu")
# Wrong shape (2D instead of 1D)
audio = np.random.randn(16000, 2).astype(np.float32)
with pytest.raises(ValueError) as exc:
engine.transcribe(audio)
assert "Expected 1D audio" in str(exc.value)
def test_transcribe_with_language_override(self, mock_whisper_model):
"""Test transcribing with language override."""
engine = FasterWhisperSTT(
model_size="tiny",
device="cpu",
language="en", # Instance default
)
audio = np.random.randn(16000).astype(np.float32)
# Override with Spanish
result = engine.transcribe(audio, language="es")
# Check that model.transcribe was called with Spanish
mock_whisper_model.return_value.transcribe.assert_called_once()
call_kwargs = mock_whisper_model.return_value.transcribe.call_args[1]
assert call_kwargs["language"] == "es"
def test_transcribe_with_beam_size_override(self, mock_whisper_model):
"""Test transcribing with beam size override."""
engine = FasterWhisperSTT(
model_size="tiny",
device="cpu",
beam_size=5, # Instance default
)
audio = np.random.randn(16000).astype(np.float32)
# Override with beam size 10
result = engine.transcribe(audio, beam_size=10)
# Check that model.transcribe was called with beam size 10
call_kwargs = mock_whisper_model.return_value.transcribe.call_args[1]
assert call_kwargs["beam_size"] == 10
@pytest.mark.asyncio
async def test_transcribe_async(self, mock_whisper_model):
"""Test async transcription."""
engine = FasterWhisperSTT(model_size="tiny", device="cpu")
audio = np.random.randn(16000).astype(np.float32)
result = await engine.transcribe_async(audio)
assert isinstance(result, TranscriptionResult)
assert result.text == "Hello world"
def test_get_stats_no_transcriptions(self, mock_whisper_model):
"""Test getting stats with no transcriptions."""
engine = FasterWhisperSTT(model_size="tiny", device="cpu")
stats = engine.get_stats()
assert stats["model_size"] == "tiny"
assert stats["device"] == "cpu"
assert stats["transcription_count"] == 0
assert stats["total_audio_duration"] == 0.0
assert stats["avg_audio_duration"] == 0.0
assert stats["real_time_factor"] == 0.0
def test_get_stats_with_transcriptions(self, mock_whisper_model):
"""Test getting stats after transcriptions."""
engine = FasterWhisperSTT(model_size="tiny", device="cpu")
# Do two transcriptions
audio1 = np.random.randn(16000).astype(np.float32)
audio2 = np.random.randn(32000).astype(np.float32)
engine.transcribe(audio1)
engine.transcribe(audio2)
stats = engine.get_stats()
assert stats["transcription_count"] == 2
assert stats["total_audio_duration"] == 4.0 # 2.0 + 2.0
assert stats["avg_audio_duration"] == 2.0
def test_get_model_info(self, mock_whisper_model):
"""Test getting model info."""
engine = FasterWhisperSTT(
model_size="small",
device="cuda",
compute_type="float16",
beam_size=7,
language="fr",
)
info = engine.get_model_info()
assert info["model_size"] == "small"
assert info["device"] == "cuda"
assert info["compute_type"] == "float16"
assert info["beam_size"] == 7
assert info["language"] == "fr"
assert info["loaded"] is True
class TestSTTTranscriber:
"""Test STTTranscriber class."""
@pytest.fixture
def mock_engine(self):
"""Create mock STT engine."""
engine = Mock(spec=FasterWhisperSTT)
# Mock async transcription
async def mock_transcribe_async(audio, language=None):
return TranscriptionResult(
text="Test transcription",
segments=[TranscriptSegment("Test transcription", 0.0, 1.5, 0.95)],
language=language or "en",
duration=1.5,
)
engine.transcribe_async = mock_transcribe_async
engine.get_stats.return_value = {
"transcription_count": 0,
"total_audio_duration": 0.0,
}
return engine
def test_create_transcriber(self, mock_engine):
"""Test creating transcriber."""
transcriber = STTTranscriber(engine=mock_engine, max_concurrent=2)
assert transcriber.engine == mock_engine
assert transcriber.max_concurrent == 2
assert transcriber._queue_size == 0
@pytest.mark.asyncio
async def test_transcribe_success(self, mock_engine):
"""Test successful transcription."""
transcriber = STTTranscriber(engine=mock_engine)
audio = np.random.randn(16000).astype(np.float32)
result = await transcriber.transcribe(audio, user_id=123)
assert isinstance(result, TranscriptionResult)
assert result.text == "Test transcription"
@pytest.mark.asyncio
async def test_transcribe_with_language(self, mock_engine):
"""Test transcription with language hint."""
transcriber = STTTranscriber(engine=mock_engine)
audio = np.random.randn(16000).astype(np.float32)
result = await transcriber.transcribe(audio, user_id=123, language="es")
assert result.language == "es"
@pytest.mark.asyncio
async def test_transcribe_error_handling(self):
"""Test transcription error handling."""
# Create engine that raises error
engine = Mock(spec=FasterWhisperSTT)
async def mock_error(audio, language=None):
raise RuntimeError("Transcription failed")
engine.transcribe_async = mock_error
transcriber = STTTranscriber(engine=engine)
audio = np.random.randn(16000).astype(np.float32)
with pytest.raises(RuntimeError) as exc:
await transcriber.transcribe(audio, user_id=123)
assert "Transcription failed" in str(exc.value)
@pytest.mark.asyncio
async def test_concurrent_transcriptions(self, mock_engine):
"""Test concurrent transcription limit."""
# Create engine with delay to test queueing
engine = Mock(spec=FasterWhisperSTT)
async def mock_delayed_transcribe(audio, language=None):
await asyncio.sleep(0.1) # Simulate processing time
return TranscriptionResult(
text="Test", segments=[], language="en", duration=1.0
)
engine.transcribe_async = mock_delayed_transcribe
engine.get_stats.return_value = {"transcription_count": 0}
# Max concurrent = 1
transcriber = STTTranscriber(engine=engine, max_concurrent=1)
audio = np.random.randn(16000).astype(np.float32)
# Start two transcriptions concurrently
task1 = asyncio.create_task(transcriber.transcribe(audio, user_id=1))
task2 = asyncio.create_task(transcriber.transcribe(audio, user_id=2))
# Both should complete successfully (one queued)
results = await asyncio.gather(task1, task2)
assert len(results) == 2
assert all(r.text == "Test" for r in results)
def test_get_queue_size(self, mock_engine):
"""Test getting queue size."""
transcriber = STTTranscriber(engine=mock_engine)
assert transcriber.get_queue_size() == 0
def test_get_stats(self, mock_engine):
"""Test getting transcriber stats."""
transcriber = STTTranscriber(engine=mock_engine, max_concurrent=2)
stats = transcriber.get_stats()
assert "max_concurrent" in stats
assert stats["max_concurrent"] == 2
assert "current_queue_size" in stats
@pytest.mark.asyncio
async def test_create_transcriber_convenience(self):
"""Test convenience function for creating transcriber."""
with patch("server.stt.FasterWhisperSTT") as mock_stt:
mock_instance = Mock(spec=FasterWhisperSTT)
mock_stt.return_value = mock_instance
transcriber = await create_transcriber(
model_size="tiny", device="cpu", language="en"
)
assert isinstance(transcriber, STTTranscriber)
mock_stt.assert_called_once_with(
model_size="tiny",
device="cpu",
compute_type="float16",
language="en",
)
class TestPipelineTranscriber:
"""Test PipelineTranscriber class."""
@pytest.fixture
def mock_transcriber(self):
"""Create mock STT transcriber."""
transcriber = Mock(spec=STTTranscriber)
# Mock async transcription
async def mock_transcribe(audio, user_id, language=None):
return TranscriptionResult(
text="Pipeline test",
segments=[TranscriptSegment("Pipeline test", 0.0, 2.0, 0.9)],
language=language or "en",
duration=2.0,
)
transcriber.transcribe = mock_transcribe
transcriber.get_stats.return_value = {
"transcription_count": 0,
"max_concurrent": 1,
}
return transcriber
def test_create_pipeline_transcriber(self, mock_transcriber):
"""Test creating pipeline transcriber."""
pipeline = PipelineTranscriber(transcriber=mock_transcriber)
assert pipeline.transcriber == mock_transcriber
assert pipeline.transcription_callback is None
assert pipeline.total_transcriptions == 0
assert pipeline.total_failures == 0
@pytest.mark.asyncio
async def test_process_speech_success(self, mock_transcriber):
"""Test successful speech processing."""
pipeline = PipelineTranscriber(transcriber=mock_transcriber)
audio = np.random.randn(16000).astype(np.float32)
result = await pipeline.process_speech(user_id=123, audio=audio)
assert isinstance(result, TranscriptionResult)
assert result.text == "Pipeline test"
assert pipeline.total_transcriptions == 1
assert pipeline.total_failures == 0
@pytest.mark.asyncio
async def test_process_speech_with_callback(self, mock_transcriber):
"""Test speech processing with callback."""
callback_called = False
callback_user_id = None
callback_result = None
async def callback(user_id: int, result: TranscriptionResult):
nonlocal callback_called, callback_user_id, callback_result
callback_called = True
callback_user_id = user_id
callback_result = result
pipeline = PipelineTranscriber(
transcriber=mock_transcriber, transcription_callback=callback
)
audio = np.random.randn(16000).astype(np.float32)
result = await pipeline.process_speech(user_id=456, audio=audio)
assert callback_called
assert callback_user_id == 456
assert callback_result.text == "Pipeline test"
@pytest.mark.asyncio
async def test_process_speech_error_handling(self):
"""Test error handling in speech processing."""
# Create transcriber that raises error
transcriber = Mock(spec=STTTranscriber)
async def mock_error(audio, user_id, language=None):
raise RuntimeError("Processing failed")
transcriber.transcribe = mock_error
pipeline = PipelineTranscriber(transcriber=transcriber)
audio = np.random.randn(16000).astype(np.float32)
# Should return None on error, not raise
result = await pipeline.process_speech(user_id=123, audio=audio)
assert result is None
assert pipeline.total_failures == 1
assert pipeline.total_transcriptions == 0
@pytest.mark.asyncio
async def test_process_speech_with_language(self, mock_transcriber):
"""Test processing with language hint."""
pipeline = PipelineTranscriber(transcriber=mock_transcriber)
audio = np.random.randn(16000).astype(np.float32)
result = await pipeline.process_speech(
user_id=123, audio=audio, language="fr"
)
assert result.language == "fr"
def test_get_stats(self, mock_transcriber):
"""Test getting pipeline stats."""
pipeline = PipelineTranscriber(transcriber=mock_transcriber)
# Manually update stats for testing
pipeline.total_transcriptions = 10
pipeline.total_failures = 2
stats = pipeline.get_stats()
assert stats["total_transcriptions"] == 10
assert stats["total_failures"] == 2
assert stats["success_rate"] == 10 / 12 # 10 / (10 + 2)
def test_get_stats_no_attempts(self, mock_transcriber):
"""Test stats with no transcription attempts."""
pipeline = PipelineTranscriber(transcriber=mock_transcriber)
stats = pipeline.get_stats()
assert stats["total_transcriptions"] == 0
assert stats["total_failures"] == 0
assert stats["success_rate"] == 0.0
@pytest.mark.asyncio
async def test_create_pipeline_transcriber_convenience(self, mock_transcriber):
"""Test convenience function for creating pipeline transcriber."""
callback = Mock()
pipeline = await create_pipeline_transcriber(
transcriber=mock_transcriber, transcription_callback=callback
)
assert isinstance(pipeline, PipelineTranscriber)
assert pipeline.transcriber == mock_transcriber
assert pipeline.transcription_callback == callback
if __name__ == "__main__":
pytest.main([__file__, "-v", "-s"])