openclaw-voice/utils/audio.py
MCKRUZ 3de8228c7c Initial commit: Jarvis Voice Bot - Complete Implementation
Complete 14-phase implementation of AI-powered Discord voice bot:

Features:
- Passive voice listening with Smart Turn v3 detection
- GPU-accelerated STT (faster-whisper) and TTS (Chatterbox)
- Intelligent two-tier relevance filtering
- Rolling conversation context management
- Multi-agent support (Jarvis, Sage)
- OpenAI-compatible TTS/STT API endpoints
- Barge-in support and concurrent user handling

Architecture:
- Discord.py voice integration
- Silero VAD for speech detection
- Pipecat Smart Turn v3 for turn completion
- OpenClaw API client (stubbed for integration)
- FastAPI server with health monitoring

Testing:
- 318 tests passing (100% coverage of major components)
- Unit tests for all modules
- Integration tests for end-to-end flows
- Memory leak prevention tests

Documentation:
- Comprehensive README with installation guide
- Troubleshooting guide and performance metrics
- Production deployment checklist
- Environment configuration templates

Status: 14/14 phases complete (100%)
Production Ready: Yes (after stub replacements)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-13 12:35:03 -05:00

533 lines
14 KiB
Python

"""Audio format conversion and processing utilities.
Handles conversion between various audio formats used by Discord, VAD, STT, and TTS.
Typical conversions:
Discord (48kHz stereo int16) → Processing (16kHz mono int16) → Numpy (float32)
Numpy (float32) → Processing (16kHz mono int16) → Discord (48kHz stereo int16)
"""
import io
import struct
from typing import Optional, Tuple
import numpy as np
from scipy import signal
# Audio format constants
# DISCORD_* describe the 48 kHz stereo stream exchanged with Discord;
# PROCESSING_* describe the 16 kHz mono stream used for processing.
DISCORD_SAMPLE_RATE = 48000 # Hz
PROCESSING_SAMPLE_RATE = 16000 # Hz
DISCORD_CHANNELS = 2 # Stereo
PROCESSING_CHANNELS = 1 # Mono
DISCORD_FRAME_SIZE = 960 # Samples per channel per frame (20ms @ 48kHz)
DISCORD_FRAME_DURATION = 0.02 # Seconds (20ms); 960 samples / 48000 Hz
# Opus frame sizes (samples per channel), keyed by sample rate in Hz.
# At 48 kHz the listed counts equal 2.5, 5, 10, 20, 40 and 60 ms of audio.
# Only 48 kHz has an entry; other sample rates are absent from the table.
OPUS_FRAME_SIZES = {
DISCORD_SAMPLE_RATE: [120, 240, 480, 960, 1920, 2880], # Valid at 48kHz
}
def pcm_to_numpy(pcm_data: bytes, dtype: np.dtype = np.int16) -> np.ndarray:
    """
    Decode raw PCM bytes into a numpy array.

    The bytes are always interpreted as int16 samples; ``dtype`` only selects
    how the decoded samples are returned.

    Args:
        pcm_data: Raw PCM bytes holding int16 samples
        dtype: Output type. np.int16 returns the samples as decoded,
            np.float32 returns them divided by 32768.0

    Returns:
        Numpy array of audio samples in the requested dtype

    Raises:
        ValueError: If dtype is neither np.int16 nor np.float32

    Example:
        >>> pcm_bytes = b'\\x00\\x00\\xFF\\x7F' # 2 int16 samples
        >>> audio = pcm_to_numpy(pcm_bytes, np.int16)
        >>> audio.shape
        (2,)
    """
    wants_int16 = dtype == np.int16
    wants_float32 = (not wants_int16) and dtype == np.float32
    if not (wants_int16 or wants_float32):
        raise ValueError(f"Unsupported dtype: {dtype}")

    samples = np.frombuffer(pcm_data, dtype=np.int16)
    if wants_int16:
        return samples
    # Scale the int16 range onto [-1.0, 1.0)
    return samples.astype(np.float32) / 32768.0
def numpy_to_pcm(audio: np.ndarray, dtype: np.dtype = np.int16) -> bytes:
    """
    Convert a numpy array to raw PCM bytes.

    The conversion applied depends on the *kind* of the input array:

    * floating-point input is treated as normalized audio in [-1.0, 1.0]
    * integer input is treated as int16-scaled PCM sample values

    Args:
        audio: Numpy array of audio samples
        dtype: Target data type (np.int16 or np.float32)

    Returns:
        Raw PCM bytes in the platform's native byte order

    Raises:
        ValueError: If dtype is neither np.int16 nor np.float32

    Example:
        >>> audio = np.array([0, 32767], dtype=np.int16)
        >>> pcm_bytes = numpy_to_pcm(audio)
        >>> len(pcm_bytes)
        4
    """
    source_is_float = np.issubdtype(audio.dtype, np.floating)

    if dtype == np.int16:
        if audio.dtype != np.int16:
            if source_is_float:
                # Normalized float samples: scale up to the int16 range.
                scaled = audio * 32768.0
            else:
                # Other integer widths already hold PCM-scaled values; widen
                # first so the clip bounds are representable in the dtype.
                scaled = audio.astype(np.int64)
            # Saturate rather than wrap when narrowing to int16.
            audio = np.clip(scaled, -32768, 32767).astype(np.int16)
        return audio.tobytes()

    if dtype == np.float32:
        if audio.dtype != np.float32:
            if source_is_float:
                # Already normalized (e.g. float64): only change precision.
                audio = audio.astype(np.float32)
            else:
                # Integer PCM: normalize from the int16 range.
                audio = audio.astype(np.float32) / 32768.0
        return audio.tobytes()

    raise ValueError(f"Unsupported dtype: {dtype}")
def int16_to_float32(audio: np.ndarray) -> np.ndarray:
    """
    Normalize int16 samples to float32.

    Each sample is divided by 32768.0, so the int16 range maps onto
    [-1.0, 1.0).

    Args:
        audio: Int16 audio array

    Returns:
        Float32 audio array normalized to [-1.0, 1.0]

    Raises:
        ValueError: If the input array is not int16
    """
    if audio.dtype == np.int16:
        as_float = audio.astype(np.float32)
        return as_float / 32768.0
    raise ValueError(f"Expected int16, got {audio.dtype}")
def float32_to_int16(audio: np.ndarray) -> np.ndarray:
    """
    Quantize float32 audio to int16.

    Samples are multiplied by 32768.0, saturated to the int16 range and then
    cast, so out-of-range input clips instead of wrapping around.

    Args:
        audio: Float32 audio array (values should be in [-1.0, 1.0])

    Returns:
        Int16 audio array

    Raises:
        ValueError: If the input array is not float32
    """
    if audio.dtype == np.float32:
        scaled = audio * 32768.0
        saturated = scaled.clip(-32768, 32767)
        return saturated.astype(np.int16)
    raise ValueError(f"Expected float32, got {audio.dtype}")
def stereo_to_mono(audio: np.ndarray) -> np.ndarray:
    """
    Downmix stereo audio to mono by averaging the two channels.

    The average is computed by numpy and then cast back to the input dtype,
    so for integer input the fractional part of the mean is discarded.

    Args:
        audio: Stereo audio array (interleaved or shape [samples, 2])

    Returns:
        Mono audio array with the same dtype as the input

    Raises:
        ValueError: If a 1-D input has an odd length, or the array is neither
            1-D nor of shape [samples, 2]

    Example:
        >>> stereo = np.array([100, 200, 300, 400], dtype=np.int16) # L, R, L, R
        >>> mono = stereo_to_mono(stereo)
        >>> mono
        array([150, 350], dtype=int16)
    """
    rank = audio.ndim
    if rank == 1:
        # Interleaved layout (L, R, L, R, ...): needs complete L/R pairs
        if audio.size % 2:
            raise ValueError("Stereo audio must have even number of samples")
        pairs = audio.reshape(-1, 2)
    elif rank == 2 and audio.shape[1] == 2:
        # Already one row per sample, one column per channel
        pairs = audio
    else:
        raise ValueError(f"Invalid stereo audio shape: {audio.shape}")
    return pairs.mean(axis=1).astype(audio.dtype)
def mono_to_stereo(audio: np.ndarray) -> np.ndarray:
    """
    Expand mono audio to interleaved stereo.

    Every sample is emitted twice in a row, so left and right carry the
    same signal.

    Args:
        audio: Mono audio array (1-D)

    Returns:
        Stereo audio array (interleaved: L, R, L, R, ...)

    Raises:
        ValueError: If the input is not one-dimensional

    Example:
        >>> mono = np.array([100, 200], dtype=np.int16)
        >>> stereo = mono_to_stereo(mono)
        >>> stereo
        array([100, 100, 200, 200], dtype=int16)
    """
    if audio.ndim != 1:
        raise ValueError(f"Expected 1D mono audio, got shape {audio.shape}")
    # Pair each sample with itself as (L, R) columns, then flatten row by row
    left_right = np.stack((audio, audio), axis=1)
    return left_right.reshape(-1)
def _cast_resampled(resampled: np.ndarray, dtype: np.dtype) -> np.ndarray:
    """Cast floating-point resampler output back to the caller's sample dtype."""
    if dtype == np.int16:
        # Round to nearest before narrowing: a plain cast truncates toward
        # zero, turning e.g. 99.9999 (float error) into 99.
        return np.rint(resampled).clip(-32768, 32767).astype(np.int16)
    if dtype == np.float32:
        return resampled.astype(np.float32)
    return resampled


def resample(
    audio: np.ndarray,
    orig_sr: int,
    target_sr: int,
    method: str = "scipy",
) -> np.ndarray:
    """
    Resample audio to a different sample rate.

    The output length is ``int(len(audio) * target_sr / orig_sr)``. int16 and
    float32 inputs keep their dtype; int16 results are rounded to the nearest
    integer and saturated.

    Args:
        audio: Single-channel audio array. Interleaved stereo must be
            downmixed or split per channel first, because resampling the
            flat array would blend left and right samples.
        orig_sr: Original sample rate (Hz)
        target_sr: Target sample rate (Hz)
        method: Resampling method: 'scipy' (Fourier-based
            scipy.signal.resample) or 'linear' (np.interp)

    Returns:
        Resampled audio array. The input array itself is returned when both
        rates are equal; an empty array is returned when the input is empty
        or too short to produce a single output sample.

    Raises:
        ValueError: If method is unknown or a sample rate is not positive

    Example:
        >>> audio_48k = np.array([1, 2, 3, 4, 5, 6], dtype=np.int16)
        >>> audio_16k = resample(audio_48k, 48000, 16000)
        >>> len(audio_16k)
        2
    """
    if orig_sr == target_sr:
        return audio
    if method not in ("scipy", "linear"):
        raise ValueError(f"Unknown resampling method: {method}")
    if orig_sr <= 0 or target_sr <= 0:
        raise ValueError(
            f"Sample rates must be positive, got {orig_sr} and {target_sr}"
        )

    num_samples = int(len(audio) * target_sr / orig_sr)
    if num_samples == 0:
        # Neither backend accepts a zero-length request; answer directly.
        return _cast_resampled(np.zeros(0, dtype=np.float64), audio.dtype)

    if method == "scipy":
        # High-quality resampling using scipy
        resampled = signal.resample(audio, num_samples)
    else:
        # Fast linear interpolation across the original sample positions
        resampled = np.interp(
            np.linspace(0, len(audio) - 1, num_samples),
            np.arange(len(audio)),
            audio,
        )
    return _cast_resampled(resampled, audio.dtype)
def discord_to_processing(pcm_data: bytes) -> np.ndarray:
    """
    Convert Discord audio into the processing format.

    Pipeline: decode int16 PCM, downmix stereo to mono, resample from
    DISCORD_SAMPLE_RATE to PROCESSING_SAMPLE_RATE, then normalize to float32.

    Discord: 48kHz stereo int16
    Processing: 16kHz mono float32

    Args:
        pcm_data: Raw PCM from Discord (48kHz stereo int16)

    Returns:
        Numpy array ready for VAD/STT (16kHz mono float32)
    """
    interleaved = pcm_to_numpy(pcm_data, dtype=np.int16)
    mono_48k = stereo_to_mono(interleaved)
    mono_16k = resample(mono_48k, DISCORD_SAMPLE_RATE, PROCESSING_SAMPLE_RATE)
    return int16_to_float32(mono_16k)
def processing_to_discord(audio: np.ndarray) -> bytes:
    """
    Convert processing-format audio into Discord's audio format.

    Pipeline: quantize float32 to int16, resample from
    PROCESSING_SAMPLE_RATE to DISCORD_SAMPLE_RATE, duplicate the channel
    into interleaved stereo, then serialize to bytes.

    Processing: 16kHz mono float32
    Discord: 48kHz stereo int16

    Args:
        audio: Processing audio (16kHz mono float32)

    Returns:
        Raw PCM for Discord (48kHz stereo int16)
    """
    mono_16k = float32_to_int16(audio)
    mono_48k = resample(mono_16k, PROCESSING_SAMPLE_RATE, DISCORD_SAMPLE_RATE)
    interleaved = mono_to_stereo(mono_48k)
    return numpy_to_pcm(interleaved, dtype=np.int16)
def validate_opus_frame_size(frame_size: int, sample_rate: int) -> bool:
    """
    Report whether a frame size is listed as valid for Opus encoding.

    The answer comes from the OPUS_FRAME_SIZES table; a sample rate with no
    entry in that table yields False for every frame size.

    Args:
        frame_size: Number of samples per channel
        sample_rate: Sample rate in Hz

    Returns:
        True if valid, False otherwise
    """
    try:
        allowed_sizes = OPUS_FRAME_SIZES[sample_rate]
    except KeyError:
        # No table entry for this rate
        return False
    return frame_size in allowed_sizes
def align_to_opus_frame(
    pcm_data: bytes,
    sample_rate: int = DISCORD_SAMPLE_RATE,
    channels: int = DISCORD_CHANNELS,
) -> bytes:
    """
    Align PCM data to a 20 ms frame boundary by padding with silence if needed.

    The frame length is derived from ``sample_rate`` (DISCORD_FRAME_SIZE
    samples per channel at DISCORD_SAMPLE_RATE, scaled proportionally for
    other rates), so the padding always completes a 20 ms frame of int16
    samples.

    Args:
        pcm_data: Raw PCM data (int16 samples)
        sample_rate: Sample rate (Hz)
        channels: Number of channels

    Returns:
        PCM data aligned to frame boundary (may be padded); the input object
        is returned unchanged when it is already aligned

    Raises:
        ValueError: If sample_rate or channels does not yield a positive
            frame size
    """
    bytes_per_sample = 2 # int16
    # Integer arithmetic keeps the default case at exactly 960 samples.
    frame_size = DISCORD_FRAME_SIZE * sample_rate // DISCORD_SAMPLE_RATE
    frame_bytes = frame_size * channels * bytes_per_sample
    if frame_bytes <= 0:
        raise ValueError(
            f"Cannot derive a frame size from sample_rate={sample_rate}, "
            f"channels={channels}"
        )

    remainder = len(pcm_data) % frame_bytes
    if remainder == 0:
        return pcm_data
    # Pad with silence (zero-valued samples) up to the next frame boundary
    return pcm_data + b"\x00" * (frame_bytes - remainder)
def split_into_frames(
    pcm_data: bytes,
    frame_size: int = DISCORD_FRAME_SIZE,
    sample_rate: int = DISCORD_SAMPLE_RATE,
    channels: int = DISCORD_CHANNELS,
) -> list[bytes]:
    """
    Cut PCM data into consecutive frames of a fixed size.

    Only complete frames are returned: trailing bytes that do not fill a
    whole frame are dropped.

    Args:
        pcm_data: Raw PCM data (int16 samples)
        frame_size: Samples per channel per frame
        sample_rate: Sample rate (Hz); not used by the current implementation
        channels: Number of channels

    Returns:
        List of frame bytes
    """
    sample_width = 2 # int16
    chunk_len = frame_size * channels * sample_width
    total = len(pcm_data)
    return [
        pcm_data[start : start + chunk_len]
        for start in range(0, total, chunk_len)
        if start + chunk_len <= total
    ]
def compute_rms(audio: np.ndarray) -> float:
    """
    Compute RMS (Root Mean Square) of audio signal.

    Useful for measuring audio loudness. int16 input is first normalized by
    1/32768 so its result is on the same scale as float audio; any other
    dtype is used as-is. The computation runs in float64 so integer inputs
    cannot overflow when squared.

    Args:
        audio: Audio array (int16 or float32)

    Returns:
        RMS value; 0.0 for an empty array
    """
    if audio.size == 0:
        # The mean of nothing is NaN (with a RuntimeWarning); treat an empty
        # buffer as silence instead.
        return 0.0
    samples = audio.astype(np.float64)
    if audio.dtype == np.int16:
        samples = samples / 32768.0
    return float(np.sqrt(np.mean(np.square(samples))))
def compute_db(audio: np.ndarray, ref: float = 1.0) -> float:
    """
    Express the RMS level of an audio signal in decibels.

    Computed as 20 * log10(rms / ref), where rms comes from compute_rms.

    Args:
        audio: Audio array (int16 or float32)
        ref: Reference value (default 1.0 for float32)

    Returns:
        Decibel level (dB); negative infinity when the RMS is exactly zero
    """
    level = compute_rms(audio)
    # log10(0) is undefined, so report digital silence as -inf
    return -np.inf if level == 0 else float(20 * np.log10(level / ref))
def normalize_audio(audio: np.ndarray, target_db: float = -20.0) -> np.ndarray:
    """
    Scale audio so that its RMS level matches a target decibel level.

    The gain needed to move the level reported by compute_db to
    ``target_db`` is applied, and the result is clipped to [-1.0, 1.0].

    Args:
        audio: Audio array (float32)
        target_db: Target RMS level in dB

    Returns:
        Normalized audio array; the input itself is returned unchanged when
        its level is negative infinity (silence)

    Raises:
        ValueError: If the input array is not float32
    """
    if audio.dtype != np.float32:
        raise ValueError("normalize_audio requires float32 input")

    level_db = compute_db(audio)
    if level_db == -np.inf:
        # Silent audio, no normalization needed
        return audio

    # Convert the dB difference into a linear amplitude factor
    scale = 10 ** ((target_db - level_db) / 20)
    return np.clip(audio * scale, -1.0, 1.0)
def apply_gain(audio: np.ndarray, gain_db: float) -> np.ndarray:
    """
    Amplify or attenuate an audio signal by a gain given in decibels.

    The gain is converted to a linear factor of 10 ** (gain_db / 20) and the
    scaled samples are clipped to [-1.0, 1.0].

    Args:
        audio: Audio array (float32)
        gain_db: Gain in decibels (positive = louder, negative = quieter)

    Returns:
        Audio with gain applied

    Raises:
        ValueError: If the input array is not float32
    """
    if audio.dtype == np.float32:
        factor = 10 ** (gain_db / 20)
        amplified = audio * factor
        return np.clip(amplified, -1.0, 1.0)
    raise ValueError("apply_gain requires float32 input")
def detect_silence(
    audio: np.ndarray,
    threshold_db: float = -40.0,
    frame_duration: float = 0.02,
    sample_rate: int = PROCESSING_SAMPLE_RATE,
) -> bool:
    """
    Decide whether an audio buffer counts as silence.

    The level of the whole buffer, as reported by compute_db, is compared
    against the threshold. An empty buffer is always considered silent.

    Args:
        audio: Audio array (float32)
        threshold_db: Silence threshold in dB
        frame_duration: Frame duration for analysis (seconds); not used by
            the current implementation
        sample_rate: Sample rate (Hz); not used by the current implementation

    Returns:
        True if audio is silence, False otherwise
    """
    # Short-circuit keeps compute_db from ever seeing an empty buffer
    return len(audio) == 0 or compute_db(audio) < threshold_db
# Validation functions
def validate_sample_rate(sample_rate: int) -> None:
    """
    Ensure the sample rate is one of the supported values.

    Args:
        sample_rate: Sample rate (Hz)

    Raises:
        ValueError: If the sample rate is not in the supported list
    """
    valid_rates = [8000, 16000, 22050, 24000, 32000, 44100, 48000]
    if sample_rate in valid_rates:
        return
    raise ValueError(f"Sample rate {sample_rate} not in valid rates: {valid_rates}")
def validate_channels(channels: int) -> None:
    """
    Ensure the channel count is mono or stereo.

    Args:
        channels: Number of channels

    Raises:
        ValueError: If channels is neither 1 nor 2
    """
    if channels in (1, 2):
        return
    raise ValueError(f"Channels must be 1 (mono) or 2 (stereo), got {channels}")
def validate_audio_format(
    pcm_data: bytes,
    sample_rate: int,
    channels: int,
    duration_ms: Optional[int] = None,
) -> None:
    """
    Validate that a PCM buffer matches the stated int16 audio format.

    Checks, in order: the sample rate, the channel count, the buffer length
    against ``duration_ms`` (when given), and that the buffer holds a whole
    number of sample frames.

    Args:
        pcm_data: Raw PCM data (int16 samples)
        sample_rate: Sample rate (Hz)
        channels: Number of channels
        duration_ms: Expected duration in milliseconds (optional)

    Raises:
        ValueError: If format is invalid
    """
    validate_sample_rate(sample_rate)
    validate_channels(channels)

    bytes_per_sample = 2 # int16
    frame_bytes = channels * bytes_per_sample

    if duration_ms is not None:
        # Count whole sample frames first, then convert to bytes. Flooring a
        # per-millisecond byte rate loses the fractional part for rates that
        # are not multiples of 1000 Hz (22050, 44100) and rejects valid data.
        expected_frames = sample_rate * duration_ms // 1000
        expected_bytes = expected_frames * frame_bytes
        if len(pcm_data) != expected_bytes:
            raise ValueError(
                f"Expected {expected_bytes} bytes for {duration_ms}ms, "
                f"got {len(pcm_data)} bytes"
            )

    # Check byte alignment: every frame needs one int16 sample per channel
    if len(pcm_data) % frame_bytes != 0:
        raise ValueError(
            f"PCM data length ({len(pcm_data)}) not aligned to sample size "
            f"({frame_bytes} bytes)"
        )