Add browser-based voice portal (WebSocket + mic → STT → LLM → TTS)

This commit is contained in:
Jezza Hehn 2026-04-10 02:30:23 +00:00
parent a2099e9d81
commit bc580861dd
3 changed files with 706 additions and 0 deletions

View file

@ -0,0 +1,31 @@
// AudioWorklet processor that forwards raw microphone PCM to the main thread.
// Each render quantum of channel 0 is posted as a Float32Array; the sample
// rate is whatever the owning AudioContext runs at (the page asks for 16 kHz).
class VoiceProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    // No inbound commands exist yet; the hook is installed so one can be added.
    this.port.onmessage = (event) => this.handleMessage(event);
  }

  // Placeholder for main-thread -> worklet messages. Capture itself needs no
  // configuration: it happens unconditionally in process().
  handleMessage(_event) {
  }

  // Called by the audio engine for every render quantum.
  // Returns true so the processor is kept alive for the next quantum.
  process(inputs, _outputs, _parameters) {
    const [firstInput] = inputs;
    const hasChannel = Boolean(firstInput) && firstInput.length > 0;
    if (hasChannel) {
      // Mono capture: only channel 0 is forwarded.
      this.port.postMessage({ type: 'audio', audio: firstInput[0] });
    }
    return true;
  }
}

// Name used by the main thread when it instantiates the worklet node.
registerProcessor('voice-processor', VoiceProcessor);

436
server/static/voice.html Normal file
View file

@ -0,0 +1,436 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>MoltMic Voice Portal</title>
<style>
* {
margin: 0;
padding: 0;
box-sizing: border-box;
}
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
min-height: 100vh;
display: flex;
flex-direction: column;
align-items: center;
padding: 20px;
}
.container {
max-width: 600px;
width: 100%;
text-align: center;
}
h1 {
color: #fff;
margin-bottom: 20px;
font-size: 2rem;
}
.status {
display: inline-flex;
align-items: center;
gap: 8px;
padding: 8px 16px;
border-radius: 20px;
font-size: 14px;
font-weight: 500;
margin-bottom: 20px;
}
.status.connected {
background: #4ade80;
color: #1a1a2e;
}
.status.disconnected {
background: #ef4444;
color: white;
}
.status.connecting {
background: #f59e0b;
color: white;
}
.status-dot {
width: 10px;
height: 10px;
border-radius: 50%;
background: currentColor;
animation: pulse 2s infinite;
}
@keyframes pulse {
0%, 100% { opacity: 1; }
50% { opacity: 0.5; }
}
.transcript {
background: rgba(255, 255, 255, 0.1);
border-radius: 12px;
padding: 20px;
margin: 20px 0;
min-height: 120px;
max-height: 300px;
overflow-y: auto;
text-align: left;
}
.transcript-label {
color: #9ca3af;
font-size: 12px;
margin-bottom: 10px;
text-transform: uppercase;
letter-spacing: 1px;
}
.transcript-item {
padding: 10px 0;
border-bottom: 1px solid rgba(255, 255, 255, 0.1);
}
.transcript-item:last-child {
border-bottom: none;
}
.transcript-transcript {
color: #e5e7eb;
font-size: 14px;
margin-bottom: 4px;
}
.transcript-response {
color: #a5b4fc;
font-size: 13px;
}
.controls {
display: flex;
gap: 16px;
justify-content: center;
margin-bottom: 30px;
}
button {
padding: 16px 32px;
font-size: 16px;
font-weight: 600;
border: none;
border-radius: 12px;
cursor: pointer;
transition: all 0.2s;
}
button:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.connect-btn {
background: #6366f1;
color: white;
}
.connect-btn:hover:not(:disabled) {
background: #4f46e5;
transform: translateY(-2px);
}
.disconnect-btn {
background: #ef4444;
color: white;
}
.disconnect-btn:hover:not(:disabled) {
background: #dc2626;
transform: translateY(-2px);
}
.retry-btn {
background: #10b981;
color: white;
}
.retry-btn:hover:not(:disabled) {
background: #059669;
transform: translateY(-2px);
}
.error {
background: rgba(239, 68, 68, 0.2);
color: #fca5a5;
padding: 12px 16px;
border-radius: 8px;
margin: 10px 0;
font-size: 14px;
}
.info {
color: #9ca3af;
font-size: 14px;
margin-top: 20px;
}
</style>
</head>
<body>
<div class="container">
<h1>🎙️ MoltMic Voice</h1>
<div id="status" class="status disconnected">
<span class="status-dot"></span>
<span id="status-text">Disconnected</span>
</div>
<div id="transcript" class="transcript" style="display: none;">
<div class="transcript-label">Transcript</div>
<div id="transcript-content"></div>
</div>
<div class="controls">
<button id="connect-btn" class="connect-btn">Connect</button>
<button id="disconnect-btn" class="disconnect-btn" disabled>Disconnect</button>
</div>
<div id="error" class="error" style="display: none;"></div>
<p class="info">Say something and the bot will respond. Auto-reconnects on disconnect.</p>
</div>
<script>
// --- Page-level state -------------------------------------------------------
// Session identifier comes from the ?session=... query parameter (null when
// the parameter is absent).
const sessionId = new URLSearchParams(window.location.search).get('session');
// Follow the page's own scheme: https pages must use wss:// (mixed content is
// blocked), while plain-http pages (e.g. local development) can only reach a
// non-TLS server over ws://. The id is escaped so it cannot alter the path.
const wsUrl = `${window.location.protocol === 'https:' ? 'wss' : 'ws'}://${window.location.host}/ws/voice/${encodeURIComponent(sessionId)}`;
// Live connection / audio graph handles (null while idle).
let ws = null;
let audioContext = null;
let microphone = null;
let scriptProcessor = null;
// True only between a successful open and the next close/disconnect.
let isConnected = false;
// Number of automatic reconnect attempts made since the last successful open.
let reconnectAttempts = 0;
const maxReconnectAttempts = 5;
// Cached DOM references.
const statusEl = document.getElementById('status');
const statusTextEl = document.getElementById('status-text');
const connectBtn = document.getElementById('connect-btn');
const disconnectBtn = document.getElementById('disconnect-btn');
const transcriptEl = document.getElementById('transcript');
const transcriptContentEl = document.getElementById('transcript-content');
const errorEl = document.getElementById('error');
// Reflect the connection state in the status badge.
//   status - CSS modifier class: 'connected' | 'disconnected' | 'connecting'
//   text   - label shown next to the status dot
function updateStatus(status, text) {
  // Style the badge element. `status` is only the modifier string; assigning
  // className on it (as the code previously did) is a silent no-op.
  statusEl.className = `status ${status}`;
  statusTextEl.textContent = text;
}
// Put `message` into the error banner and make the banner visible.
function showError(message) {
  const banner = errorEl;
  banner.textContent = message;
  banner.style.display = 'block';
}
// Hide the error banner; its text is left in place.
function hideError() {
  const { style } = errorEl;
  style.display = 'none';
}
// Open the voice WebSocket and, once it is up, start microphone capture.
// Safe to call from the Connect button and from the auto-reconnect timer.
async function connect() {
  // Ignore the call while a session is live or a socket is still being set up
  // (handshaking, or open but waiting for microphone initialisation).
  const busy = ws && (ws.readyState === WebSocket.CONNECTING ||
                      ws.readyState === WebSocket.OPEN);
  if (isConnected || busy) return;

  updateStatus('connecting', 'Connecting...');
  hideError();
  connectBtn.disabled = true;

  // Return the controls to the idle state after a failed attempt.
  const resetUi = () => {
    updateStatus('disconnected', 'Disconnected');
    connectBtn.disabled = false;
    disconnectBtn.disabled = true;
  };

  let socket;
  try {
    socket = new WebSocket(wsUrl);
  } catch (error) {
    // The constructor throws synchronously on e.g. a malformed URL.
    console.error('Connection error:', error);
    showError('Failed to connect: ' + error.message);
    resetUi();
    return;
  }
  ws = socket;

  socket.onopen = async () => {
    console.log('WebSocket connected');
    try {
      await initAudio();
    } catch (error) {
      // Typically a denied microphone permission. Without this handler the
      // rejection was unhandled and the page stayed on "Connecting...".
      console.error('Audio initialization error:', error);
      showError('Microphone unavailable: ' + error.message);
      socket.close(); // onclose below resets the controls
      return;
    }
    // The socket may have dropped (or been replaced) while the user was
    // answering the permission prompt; do not mark a dead socket as live.
    if (ws !== socket || socket.readyState !== WebSocket.OPEN) {
      disconnectAudio();
      return;
    }
    isConnected = true;
    reconnectAttempts = 0;
    updateStatus('connected', 'Connected');
    connectBtn.disabled = true;
    disconnectBtn.disabled = false;
  };

  socket.onmessage = (event) => {
    // Only text frames carry JSON control messages.
    if (typeof event.data !== 'string') return;
    let data;
    try {
      data = JSON.parse(event.data);
    } catch (error) {
      console.warn('Ignoring non-JSON message from server:', error);
      return;
    }
    if (data && data.type === 'welcome') {
      console.log('Server greeting:', data.message);
    }
  };

  socket.onclose = () => {
    console.log('WebSocket disconnected');
    // A close event from a socket that has already been replaced must not
    // tear down the newer connection.
    if (ws !== socket) return;
    // Never-established attempts are not handled by handleDisconnect's
    // established-session path, so restore the controls here first.
    if (!isConnected) resetUi();
    handleDisconnect();
  };

  socket.onerror = (error) => {
    console.error('WebSocket error:', error);
    showError('Connection error. Please try again.');
  };
}
// User-initiated teardown: close the socket, release audio, reset the controls.
// Clearing isConnected *before* close() is what tells the close handler that
// this was deliberate and must not trigger an automatic reconnect.
async function disconnect() {
  const socket = ws;
  if (socket) {
    isConnected = false;
    socket.close();
    disconnectAudio();
    updateStatus('disconnected', 'Disconnected');
    connectBtn.disabled = false;
    disconnectBtn.disabled = true;
  }
}
// React to a socket close: release audio and schedule a reconnect with
// exponential backoff (1s, 2s, 4s ... capped at 30s), up to maxReconnectAttempts.
async function handleDisconnect() {
  // Two situations arrive here with isConnected === false:
  //  * the user called disconnect() on a live session. reconnectAttempts is 0
  //    then, because every successful open resets it, so nothing should happen;
  //  * an automatic retry failed before opening. reconnectAttempts is >= 1
  //    and the backoff chain has to continue. Returning here unconditionally
  //    (as before) ended the chain after the very first failed retry.
  if (!isConnected && reconnectAttempts === 0) return;

  isConnected = false;
  disconnectAudio();
  disconnectBtn.disabled = true;

  if (reconnectAttempts >= maxReconnectAttempts) {
    // Out of retries: hand control back to the user and say so.
    reconnectAttempts = 0;
    updateStatus('disconnected', 'Disconnected');
    connectBtn.disabled = false;
    showError('Connection lost. Please reconnect manually.');
    return;
  }

  const delay = Math.min(1000 * Math.pow(2, reconnectAttempts), 30000);
  console.log(`Reconnecting in ${delay}ms...`);
  updateStatus('connecting', `Reconnecting (${reconnectAttempts + 1}/${maxReconnectAttempts})...`);
  // Keep Connect disabled while a retry is pending so a click cannot race the
  // timer into opening a second socket.
  connectBtn.disabled = true;
  setTimeout(() => {
    reconnectAttempts++;
    connect();
  }, delay);
}
// Build the capture graph: 16 kHz AudioContext -> microphone source ->
// AudioWorklet (preferred) or ScriptProcessor (fallback).
// Rejects with the underlying error (e.g. permission denied); everything that
// was acquired before the failure is released first.
async function initAudio() {
  let stream = null;
  try {
    // mediaDevices is undefined outside secure contexts and on old browsers;
    // fail with a readable message instead of a TypeError.
    if (!navigator.mediaDevices || !navigator.mediaDevices.getUserMedia) {
      throw new Error('Microphone capture is not available (HTTPS and a supporting browser are required)');
    }
    audioContext = new (window.AudioContext || window.webkitAudioContext)({
      sampleRate: 16000
    });
    // Get microphone
    stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        sampleRate: 16000,
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true
      }
    });
    microphone = audioContext.createMediaStreamSource(stream);
    // Use AudioWorklet or ScriptProcessor as fallback
    if (audioContext.audioWorklet) {
      try {
        await initAudioWorklet();
      } catch (error) {
        console.warn('AudioWorklet failed, falling back to ScriptProcessor:', error);
        initScriptProcessor();
      }
    } else {
      initScriptProcessor();
    }
  } catch (error) {
    console.error('Audio initialization error:', error);
    // Do not leak the microphone or the context when setup fails half-way.
    if (stream) {
      stream.getTracks().forEach((track) => track.stop());
    }
    if (microphone) {
      microphone.disconnect();
      microphone = null;
    }
    if (audioContext && audioContext.state !== 'closed') {
      Promise.resolve(audioContext.close()).catch(() => {});
    }
    audioContext = null;
    throw error;
  }
}
// Load the worklet module and route the microphone through it.
// Rejects if the module cannot be loaded or the node cannot be created; the
// caller then falls back to ScriptProcessor.
async function initAudioWorklet() {
  // Load worklet module
  const workletUrl = `${window.location.origin}/static/voice-worklet.js`;
  await audioContext.audioWorklet.addModule(workletUrl);
  // The main thread creates an AudioWorkletNode, referring to the processor by
  // the name it was registered under ('voice-processor').
  // AudioWorkletProcessor itself only exists inside the worklet scope, so
  // constructing it here always threw and forced the fallback path.
  const workletNode = new AudioWorkletNode(audioContext, 'voice-processor', {
    numberOfInputs: 1,
    numberOfOutputs: 1,
    outputChannelCount: [1]
  });
  // Install the handler before audio starts flowing so no frame is missed.
  workletNode.port.onmessage = (event) => {
    if (event.data.type === 'audio') {
      sendAudio(event.data.audio);
    }
  };
  microphone.connect(workletNode);
}
// Fallback capture path for browsers without AudioWorklet: a ScriptProcessor
// node with 4096-frame buffers, one input channel and one output channel.
function initScriptProcessor() {
  const node = audioContext.createScriptProcessor(4096, 1, 1);
  scriptProcessor = node;
  microphone.connect(node);
  // The node is wired through to the destination as well as fed by the mic.
  node.connect(audioContext.destination);
  node.onaudioprocess = ({ inputBuffer }) => {
    sendAudio(inputBuffer.getChannelData(0));
  };
}
// Tear down the capture graph and release the microphone.
// Safe to call repeatedly and when nothing was ever initialised.
function disconnectAudio() {
  if (microphone) {
    // Disconnecting the node alone leaves the capture tracks running (the
    // browser keeps showing the recording indicator); stop them explicitly.
    if (microphone.mediaStream) {
      microphone.mediaStream.getTracks().forEach((track) => track.stop());
    }
    microphone.disconnect();
    microphone = null;
  }
  if (scriptProcessor) {
    scriptProcessor.onaudioprocess = null;
    scriptProcessor.disconnect();
    scriptProcessor = null;
  }
  if (audioContext) {
    if (audioContext.state !== 'closed') {
      // close() is asynchronous; swallow a late rejection instead of leaving
      // it unhandled.
      Promise.resolve(audioContext.close()).catch(() => {});
    }
    // Drop the reference so a closed context is never reused.
    audioContext = null;
  }
}
// Ship one block of captured samples to the server as 16-bit signed PCM.
//   audioData - Float32Array of samples, nominally in [-1, 1]
// Nothing is sent unless the socket is currently open.
function sendAudio(audioData) {
  const socketReady = Boolean(ws) && ws.readyState === WebSocket.OPEN;
  if (!socketReady) return;
  // Clamp, then scale asymmetrically so -1 maps to -32768 and +1 to 32767.
  const pcm16 = Int16Array.from(audioData, (value) => {
    const clamped = Math.min(1, Math.max(-1, value));
    return clamped < 0 ? clamped * 0x8000 : clamped * 0x7FFF;
  });
  ws.send(pcm16.buffer);
}
// Wire the two buttons to their actions.
connectBtn.addEventListener('click', connect);
disconnectBtn.addEventListener('click', disconnect);
// Drop the session as soon as the tab is hidden while a session is live.
document.addEventListener('visibilitychange', () => {
  const shouldDrop = document.hidden && isConnected;
  if (!shouldDrop) return;
  disconnect();
});
</script>
</body>
</html>

239
server/voice_ws.py Normal file
View file

@ -0,0 +1,239 @@
"""WebSocket voice endpoint for browser-based speech-to-text and text-to-speech.
Accepts binary PCM audio from browser, transcribes via Deepgram, sends to OpenClaw Gateway,
and streams TTS audio back to browser.
"""
import asyncio
import json
import logging
import os
import random
import secrets
import string
import time
from pathlib import Path
from typing import Optional

import numpy as np
from fastapi import WebSocket, WebSocketDisconnect
from pydantic import BaseModel

from server.stt import DeepgramSTT
from server.tts import VeniceKokoroTTS
from openclaw_client.client import OpenClawClient, OpenClawConfig
logger = logging.getLogger(__name__)
class VoiceSession:
    """Manage one browser voice session: buffer audio, transcribe, query the agent, log.

    Incoming audio is expected as 16-bit signed little-endian mono PCM at
    ``sample_rate`` Hz, which is what the browser client in this commit sends
    (it converts Float32 samples to Int16 before transmission). Audio is
    accumulated until ``MIN_TRANSCRIBE_SECONDS`` is buffered, then passed to
    the STT engine; non-empty transcripts are forwarded to OpenClaw and each
    exchange is appended to ``logs/voice/<session>.jsonl``.
    """

    # Seconds of audio to accumulate before a batch is handed to the STT engine.
    MIN_TRANSCRIBE_SECONDS = 0.8

    def __init__(self, session_id: str):
        """Create the session state and make sure the transcript directory exists.

        Args:
            session_id: Identifier taken from the WebSocket URL.
        """
        self.session_id = session_id
        # session_id comes straight from the request URL, so it must not be
        # able to influence where the transcript is written: anything other
        # than letters, digits, "-" and "_" is replaced in the file name.
        safe_name = "".join(
            ch if ch.isalnum() or ch in "-_" else "_" for ch in session_id
        ) or "session"
        self.transcript_file = Path("logs/voice") / f"{safe_name}.jsonl"
        self.transcript_file.parent.mkdir(parents=True, exist_ok=True)

        # Audio buffering
        self.audio_buffer = bytearray()
        self.buffer_duration = 0.0  # Seconds currently held in audio_buffer
        self._buffer_lock = asyncio.Lock()

        # Wire format of the incoming audio (matches the browser client).
        self.sample_rate = 16000
        self.channel_count = 1
        self.bits_per_sample = 16

        # Engines (self-contained, don't share with run.py)
        self.stt = None
        self.tts = None
        self.openclaw = None

        # Session state
        self.connected = False
        self.transcript = []
        logger.info(f"Created voice session {session_id}")

    @property
    def _frame_bytes(self) -> int:
        """Bytes occupied by one sample across all channels."""
        return (self.bits_per_sample // 8) * self.channel_count

    async def initialize(self):
        """Create the STT, TTS and OpenClaw clients and connect to OpenClaw.

        Reads DEEPGRAM_API_KEY, VENICE_API_KEY, OPENCLAW_BASE_URL and
        OPENCLAW_AUTH_TOKEN from the environment.

        Raises:
            ValueError: If the Deepgram or Venice API key is not set.
        """
        deepgram_key = os.getenv("DEEPGRAM_API_KEY")
        venice_key = os.getenv("VENICE_API_KEY")
        openclaw_url = os.getenv("OPENCLAW_BASE_URL", "ws://192.168.50.9:18789")
        openclaw_token = os.getenv("OPENCLAW_AUTH_TOKEN")
        if not deepgram_key or not venice_key:
            raise ValueError("Missing required API keys")

        # Initialize STT
        self.stt = DeepgramSTT(
            api_key=deepgram_key,
            model="nova-3",
            language="en",
            sample_rate=self.sample_rate,
        )
        # Initialize TTS
        self.tts = VeniceKokoroTTS(
            api_key=venice_key,
            voice="am_liam",
            base_url="https://api.venice.ai/api/v1",
        )
        # Initialize OpenClaw client
        self.openclaw = OpenClawClient(
            config=OpenClawConfig(
                base_url=openclaw_url,
                auth_token=openclaw_token,
                timeout=30.0,
                agent_id="main",
            )
        )
        await self.openclaw.connect()
        logger.info(f"Voice session {self.session_id} initialized")

    async def close(self):
        """Mark the session as disconnected and disconnect the OpenClaw client, if any."""
        self.connected = False
        if self.openclaw:
            await self.openclaw.disconnect()
        logger.info(f"Voice session {self.session_id} closed")

    def _new_id(self) -> str:
        """Return a random 8-character alphanumeric ID from a CSPRNG."""
        alphabet = string.ascii_letters + string.digits
        return "".join(secrets.choice(alphabet) for _ in range(8))

    async def process_audio_chunk(self, data: bytes):
        """Append a PCM chunk and transcribe once enough audio is buffered.

        Args:
            data: Raw 16-bit PCM bytes as received from the WebSocket.
        """
        async with self._buffer_lock:
            self.audio_buffer.extend(data)
            self.buffer_duration = len(self.audio_buffer) / (
                self.sample_rate * self._frame_bytes
            )
            ready = self.buffer_duration >= self.MIN_TRANSCRIBE_SECONDS
        # asyncio.Lock is not reentrant: the transcription step takes the lock
        # itself, so it has to run after the lock is released. Calling it
        # while still holding the lock blocked forever on the first full buffer.
        if ready:
            await self._transcribe_buffered_audio()

    async def _transcribe_buffered_audio(self):
        """Transcribe the buffered audio, send the text to OpenClaw, log the exchange.

        The buffer is drained before any network call, so a failed
        transcription cannot make the same audio pile up and be retried
        forever. Errors are logged, not raised.
        """
        async with self._buffer_lock:
            frame = self._frame_bytes
            # Only whole samples can be decoded; a trailing partial sample
            # stays in the buffer for the next chunk.
            usable = len(self.audio_buffer) - len(self.audio_buffer) % frame
            if not usable:
                return
            pcm = bytes(self.audio_buffer[:usable])
            del self.audio_buffer[:usable]
            self.buffer_duration = len(self.audio_buffer) / (self.sample_rate * frame)

        # Decode int16 PCM and scale to float32 in [-1, 1).
        # NOTE(review): assumes the STT wrapper expects normalised float32
        # samples, since the previous code handed it a float32 array - confirm
        # against server.stt.DeepgramSTT.
        audio_data = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0

        try:
            result = await self.stt.transcribe_async(audio_data)
            if not result.text.strip():
                return
            # Send to OpenClaw
            response = await self.openclaw.send_message(
                agent="main",
                message=result.text,
                speaker="voice_user",
            )
            entry = {
                # Wall-clock seconds since the epoch; the event loop clock is
                # monotonic and meaningless once written to a log file.
                "timestamp": time.time(),
                "session_id": self.session_id,
                "transcript": result.text,
                "response": response,
            }
            self.transcript.append(entry)
            # ensure_ascii=False emits raw non-ASCII text, so the file
            # encoding must be pinned instead of depending on the locale.
            with open(self.transcript_file, "a", encoding="utf-8") as f:
                f.write(json.dumps(entry, ensure_ascii=False) + "\n")
            logger.info(
                f"Session {self.session_id}: "
                f'"{result.text[:50]}..." -> "{str(response)[:50]}..."'
            )
        except Exception as e:
            # Best-effort pipeline: one bad batch must not end the session.
            logger.exception(f"Transcription error: {e}")

    async def synthesize_response(self, text: str):
        """Synthesize speech for ``text`` with the TTS engine.

        Returns:
            Whatever the TTS engine returns, or None if synthesis failed.
        """
        try:
            audio = await self.tts.generate_async(
                text=text,
                voice_ref_path=None,
                emotion_exaggeration=0.8,
            )
            return audio
        except Exception as e:
            logger.error(f"TTS synthesis error: {e}")
            return None

    def get_transcript(self) -> list:
        """Return the in-memory list of transcript entries for this session."""
        return self.transcript
async def handle_voice_websocket(websocket: WebSocket, session_id: str):
    """Serve one voice WebSocket connection from accept to teardown.

    Accepts the socket, initialises a VoiceSession, sends a JSON welcome
    message and then feeds every binary frame to the session until the client
    disconnects or an error occurs. The session is always closed on exit.

    NOTE(review): apart from the welcome message nothing is sent back to the
    client here - transcripts, agent replies and TTS audio are not forwarded.

    Args:
        websocket: The incoming (not yet accepted) WebSocket connection.
        session_id: Session identifier taken from the URL.
    """

    async def _close_quietly(reason: str) -> None:
        # A close reason must fit in a control frame (123 bytes of UTF-8), and
        # the peer may already be gone, in which case close() raises. Neither
        # problem should mask the error that got us here.
        trimmed = reason.encode("utf-8")[:123].decode("utf-8", "ignore")
        try:
            await websocket.close(code=1011, reason=trimmed)
        except Exception as close_error:
            logger.debug(f"WebSocket close failed: {close_error}")

    session = VoiceSession(session_id)
    await websocket.accept()
    session.connected = True
    logger.info(f"WebSocket connected for session {session_id}")
    try:
        await session.initialize()
        # Send welcome message
        await websocket.send_json({
            "type": "welcome",
            "message": "Connected to voice portal",
        })
        # Receive and process audio
        while session.connected:
            try:
                data = await websocket.receive_bytes()
                await session.process_audio_chunk(data)
            except WebSocketDisconnect:
                session.connected = False
                logger.info(f"WebSocket disconnected for session {session_id}")
                break
            except Exception as e:
                logger.error(f"WebSocket error: {e}")
                session.connected = False
                # Tell the client the session is over instead of leaving the
                # socket open after the receive loop has stopped.
                await _close_quietly("Internal error")
                break
    except Exception as e:
        logger.error(f"Session initialization error: {e}")
        await _close_quietly(str(e))
    finally:
        await session.close()
def create_session_id(length: int = 8) -> str:
    """Return a random alphanumeric session ID.

    Session IDs end up in URLs and select a session, so they are drawn from
    the operating system's CSPRNG (``secrets``), not the predictable
    ``random`` generator.

    Args:
        length: Number of characters to generate (default 8).
    """
    alphabet = string.ascii_letters + string.digits
    return "".join(secrets.choice(alphabet) for _ in range(length))