Add browser-based voice portal (WebSocket + mic → STT → LLM → TTS)

2026-04-10 02:30:23 +00:00 · 2026-04-10 02:30:23 +00:00 · bc580861dd
commit bc580861dd
parent a2099e9d81
3 changed files with 706 additions and 0 deletions
--- a/server/static/voice-worklet.js
+++ b/server/static/voice-worklet.js
@ -0,0 +1,31 @@
+// AudioWorklet processor for capturing raw PCM audio
+// Captures audio at 16kHz mono float32 as specified by getUserMedia
+
+class VoiceProcessor extends AudioWorkletProcessor {
+  constructor() {
+    super();
+    this.port.onmessage = this.handleMessage.bind(this);
+  }
+
+  handleMessage(event) {
+    // No message handling needed - audio is captured automatically
+    // in onaudioprocess
+  }
+
+  process(inputs, outputs, parameters) {
+    // Get input audio
+    const input = inputs[0];
+    if (input && input.length > 0) {
+      // Get mono channel (channel 0)
+      const channelData = input[0];
+
+      // Send audio data to main thread
+      this.port.postMessage({ type: 'audio', audio: channelData });
+    }
+
+    // Keep processor alive
+    return true;
+  }
+}
+
+registerProcessor('voice-processor', VoiceProcessor);
--- a/server/static/voice.html
+++ b/server/static/voice.html
@ -0,0 +1,436 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>MoltMic Voice Portal</title>
+    <style>
+        * {
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }
+
+        body {
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+            background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
+            min-height: 100vh;
+            display: flex;
+            flex-direction: column;
+            align-items: center;
+            padding: 20px;
+        }
+
+        .container {
+            max-width: 600px;
+            width: 100%;
+            text-align: center;
+        }
+
+        h1 {
+            color: #fff;
+            margin-bottom: 20px;
+            font-size: 2rem;
+        }
+
+        .status {
+            display: inline-flex;
+            align-items: center;
+            gap: 8px;
+            padding: 8px 16px;
+            border-radius: 20px;
+            font-size: 14px;
+            font-weight: 500;
+            margin-bottom: 20px;
+        }
+
+        .status.connected {
+            background: #4ade80;
+            color: #1a1a2e;
+        }
+
+        .status.disconnected {
+            background: #ef4444;
+            color: white;
+        }
+
+        .status.connecting {
+            background: #f59e0b;
+            color: white;
+        }
+
+        .status-dot {
+            width: 10px;
+            height: 10px;
+            border-radius: 50%;
+            background: currentColor;
+            animation: pulse 2s infinite;
+        }
+
+        @keyframes pulse {
+            0%, 100% { opacity: 1; }
+            50% { opacity: 0.5; }
+        }
+
+        .transcript {
+            background: rgba(255, 255, 255, 0.1);
+            border-radius: 12px;
+            padding: 20px;
+            margin: 20px 0;
+            min-height: 120px;
+            max-height: 300px;
+            overflow-y: auto;
+            text-align: left;
+        }
+
+        .transcript-label {
+            color: #9ca3af;
+            font-size: 12px;
+            margin-bottom: 10px;
+            text-transform: uppercase;
+            letter-spacing: 1px;
+        }
+
+        .transcript-item {
+            padding: 10px 0;
+            border-bottom: 1px solid rgba(255, 255, 255, 0.1);
+        }
+
+        .transcript-item:last-child {
+            border-bottom: none;
+        }
+
+        .transcript-transcript {
+            color: #e5e7eb;
+            font-size: 14px;
+            margin-bottom: 4px;
+        }
+
+        .transcript-response {
+            color: #a5b4fc;
+            font-size: 13px;
+        }
+
+        .controls {
+            display: flex;
+            gap: 16px;
+            justify-content: center;
+            margin-bottom: 30px;
+        }
+
+        button {
+            padding: 16px 32px;
+            font-size: 16px;
+            font-weight: 600;
+            border: none;
+            border-radius: 12px;
+            cursor: pointer;
+            transition: all 0.2s;
+        }
+
+        button:disabled {
+            opacity: 0.5;
+            cursor: not-allowed;
+        }
+
+        .connect-btn {
+            background: #6366f1;
+            color: white;
+        }
+
+        .connect-btn:hover:not(:disabled) {
+            background: #4f46e5;
+            transform: translateY(-2px);
+        }
+
+        .disconnect-btn {
+            background: #ef4444;
+            color: white;
+        }
+
+        .disconnect-btn:hover:not(:disabled) {
+            background: #dc2626;
+            transform: translateY(-2px);
+        }
+
+        .retry-btn {
+            background: #10b981;
+            color: white;
+        }
+
+        .retry-btn:hover:not(:disabled) {
+            background: #059669;
+            transform: translateY(-2px);
+        }
+
+        .error {
+            background: rgba(239, 68, 68, 0.2);
+            color: #fca5a5;
+            padding: 12px 16px;
+            border-radius: 8px;
+            margin: 10px 0;
+            font-size: 14px;
+        }
+
+        .info {
+            color: #9ca3af;
+            font-size: 14px;
+            margin-top: 20px;
+        }
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>🎙️ MoltMic Voice</h1>
+
+        <div id="status" class="status disconnected">
+            <span class="status-dot"></span>
+            <span id="status-text">Disconnected</span>
+        </div>
+
+        <div id="transcript" class="transcript" style="display: none;">
+            <div class="transcript-label">Transcript</div>
+            <div id="transcript-content"></div>
+        </div>
+
+        <div class="controls">
+            <button id="connect-btn" class="connect-btn">Connect</button>
+            <button id="disconnect-btn" class="disconnect-btn" disabled>Disconnect</button>
+        </div>
+
+        <div id="error" class="error" style="display: none;"></div>
+
+        <p class="info">Say something and the bot will respond. Auto-reconnects on disconnect.</p>
+    </div>
+
+    <script>
+        const sessionId = new URLSearchParams(window.location.search).get('session');
+        const wsUrl = `wss://${window.location.host}/ws/voice/${sessionId}`;
+
+        let ws = null;
+        let audioContext = null;
+        let microphone = null;
+        let scriptProcessor = null;
+        let isConnected = false;
+        let reconnectAttempts = 0;
+        const maxReconnectAttempts = 5;
+
+        const statusEl = document.getElementById('status');
+        const statusTextEl = document.getElementById('status-text');
+        const connectBtn = document.getElementById('connect-btn');
+        const disconnectBtn = document.getElementById('disconnect-btn');
+        const transcriptEl = document.getElementById('transcript');
+        const transcriptContentEl = document.getElementById('transcript-content');
+        const errorEl = document.getElementById('error');
+
+        function updateStatus(status, text) {
+            status.className = `status ${status}`;
+            statusTextEl.textContent = text;
+        }
+
+        function showError(message) {
+            errorEl.textContent = message;
+            errorEl.style.display = 'block';
+        }
+
+        function hideError() {
+            errorEl.style.display = 'none';
+        }
+
+        async function connect() {
+            if (isConnected) return;
+
+            updateStatus('connecting', 'Connecting...');
+            hideError();
+            connectBtn.disabled = true;
+
+            try {
+                // Open WebSocket
+                ws = new WebSocket(wsUrl);
+
+                ws.onopen = async () => {
+                    console.log('WebSocket connected');
+
+                    // Initialize audio
+                    await initAudio();
+
+                    isConnected = true;
+                    reconnectAttempts = 0;
+                    updateStatus('connected', 'Connected');
+                    connectBtn.disabled = true;
+                    disconnectBtn.disabled = false;
+                };
+
+                ws.onmessage = (event) => {
+                    const data = JSON.parse(event.data);
+
+                    if (data.type === 'welcome') {
+                        console.log('Server greeting:', data.message);
+                    }
+                };
+
+                ws.onclose = () => {
+                    console.log('WebSocket disconnected');
+                    handleDisconnect();
+                };
+
+                ws.onerror = (error) => {
+                    console.error('WebSocket error:', error);
+                    showError('Connection error. Please try again.');
+                };
+
+            } catch (error) {
+                console.error('Connection error:', error);
+                showError('Failed to connect: ' + error.message);
+                updateStatus('disconnected', 'Disconnected');
+                connectBtn.disabled = false;
+            }
+        }
+
+        async function disconnect() {
+            if (!ws) return;
+
+            isConnected = false;
+            ws.close();
+            disconnectAudio();
+
+            updateStatus('disconnected', 'Disconnected');
+            connectBtn.disabled = false;
+            disconnectBtn.disabled = true;
+        }
+
+        async function handleDisconnect() {
+            if (!isConnected) return;
+
+            isConnected = false;
+            disconnectAudio();
+
+            updateStatus('disconnected', 'Disconnected');
+            connectBtn.disabled = false;
+            disconnectBtn.disabled = true;
+
+            // Auto-reconnect
+            if (reconnectAttempts < maxReconnectAttempts) {
+                const delay = Math.min(1000 * Math.pow(2, reconnectAttempts), 30000);
+                console.log(`Reconnecting in ${delay}ms...`);
+                updateStatus('connecting', `Reconnecting (${reconnectAttempts + 1}/${maxReconnectAttempts})...`);
+
+                setTimeout(() => {
+                    reconnectAttempts++;
+                    connect();
+                }, delay);
+            }
+        }
+
+        async function initAudio() {
+            try {
+                audioContext = new (window.AudioContext || window.webkitAudioContext)({
+                    sampleRate: 16000
+                });
+
+                // Get microphone
+                const stream = await navigator.mediaDevices.getUserMedia({
+                    audio: {
+                        sampleRate: 16000,
+                        channelCount: 1,
+                        echoCancellation: true,
+                        noiseSuppression: true,
+                        autoGainControl: true
+                    }
+                });
+
+                microphone = audioContext.createMediaStreamSource(stream);
+
+                // Use AudioWorklet or ScriptProcessor as fallback
+                if (audioContext.audioWorklet) {
+                    try {
+                        await initAudioWorklet();
+                    } catch (error) {
+                        console.warn('AudioWorklet failed, falling back to ScriptProcessor:', error);
+                        initScriptProcessor();
+                    }
+                } else {
+                    initScriptProcessor();
+                }
+
+            } catch (error) {
+                console.error('Audio initialization error:', error);
+                throw error;
+            }
+        }
+
+        async function initAudioWorklet() {
+            // Load worklet module
+            const workletUrl = `${window.location.origin}/static/voice-worklet.js`;
+
+            await audioContext.audioWorklet.addModule(workletUrl);
+
+            const processor = new AudioWorkletProcessor(audioContext, {
+                numberOfInputs: 1,
+                numberOfOutputs: 1,
+                outputChannelCount: [1]
+            });
+
+            microphone.connect(processor);
+
+            processor.port.onmessage = (event) => {
+                if (event.data.type === 'audio') {
+                    sendAudio(event.data.audio);
+                }
+            };
+        }
+
+        function initScriptProcessor() {
+            scriptProcessor = audioContext.createScriptProcessor(4096, 1, 1);
+
+            microphone.connect(scriptProcessor);
+            scriptProcessor.connect(audioContext.destination);
+
+            scriptProcessor.onaudioprocess = (event) => {
+                const inputData = event.inputBuffer.getChannelData(0);
+                sendAudio(inputData);
+            };
+        }
+
+        function disconnectAudio() {
+            if (microphone) {
+                microphone.disconnect();
+                microphone = null;
+            }
+
+            if (scriptProcessor) {
+                scriptProcessor.disconnect();
+                scriptProcessor = null;
+            }
+
+            if (audioContext && audioContext.state !== 'closed') {
+                audioContext.close();
+            }
+        }
+
+        function sendAudio(audioData) {
+            if (!ws || ws.readyState !== WebSocket.OPEN) return;
+
+            // Convert Float32 to Int16 for transmission
+            const int16Data = new Int16Array(audioData.length);
+            for (let i = 0; i < audioData.length; i++) {
+                const sample = Math.max(-1, Math.min(1, audioData[i]));
+                int16Data[i] = sample < 0 ? sample * 0x8000 : sample * 0x7FFF;
+            }
+
+            ws.send(int16Data.buffer);
+        }
+
+        // Event listeners
+        connectBtn.addEventListener('click', connect);
+        disconnectBtn.addEventListener('click', disconnect);
+
+        // Handle page visibility
+        document.addEventListener('visibilitychange', () => {
+            if (document.hidden && isConnected) {
+                disconnect();
+            }
+        });
+    </script>
+</body>
+</html>
--- a/server/voice_ws.py
+++ b/server/voice_ws.py
@ -0,0 +1,239 @@
+"""WebSocket voice endpoint for browser-based speech-to-text and text-to-speech.
+
+Accepts binary PCM audio from browser, transcribes via Deepgram, sends to OpenClaw Gateway,
+and streams TTS audio back to browser.
+"""
+
+import asyncio
+import json
+import logging
+import os
+import random
+import secrets
+import string
+import time
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+from fastapi import WebSocket, WebSocketDisconnect
+from pydantic import BaseModel
+
+from server.stt import DeepgramSTT
+from server.tts import VeniceKokoroTTS
+from openclaw_client.client import OpenClawClient, OpenClawConfig
+
+logger = logging.getLogger(__name__)
+
+
+class VoiceSession:
+    """Manages a single voice session."""
+
+    def __init__(self, session_id: str):
+        self.session_id = session_id
+        self.transcript_file = Path("logs/voice") / f"{session_id}.jsonl"
+        self.transcript_file.parent.mkdir(parents=True, exist_ok=True)
+
+        # Audio buffering
+        self.audio_buffer = bytearray()
+        self.buffer_duration = 0.0  # Seconds
+        self._buffer_lock = asyncio.Lock()
+
+        # Audio processing
+        self.sample_rate = 16000
+        self.channel_count = 1
+        self.bits_per_sample = 32
+
+        # Engines (self-contained, don't share with run.py)
+        self.stt = None
+        self.tts = None
+        self.openclaw = None
+
+        # Session state
+        self.connected = False
+        self.transcript = []
+
+        logger.info(f"Created voice session {session_id}")
+
+    async def initialize(self):
+        """Initialize STT, TTS, and OpenClaw client."""
+        # Load env vars
+        deepgram_key = os.getenv("DEEPGRAM_API_KEY")
+        venice_key = os.getenv("VENICE_API_KEY")
+        openclaw_url = os.getenv("OPENCLAW_BASE_URL", "ws://192.168.50.9:18789")
+        openclaw_token = os.getenv("OPENCLAW_AUTH_TOKEN")
+
+        if not deepgram_key or not venice_key:
+            raise ValueError("Missing required API keys")
+
+        # Initialize STT
+        self.stt = DeepgramSTT(
+            api_key=deepgram_key,
+            model="nova-3",
+            language="en",
+            sample_rate=self.sample_rate,
+        )
+
+        # Initialize TTS
+        self.tts = VeniceKokoroTTS(
+            api_key=venice_key,
+            voice="am_liam",
+            base_url="https://api.venice.ai/api/v1",
+        )
+
+        # Initialize OpenClaw client
+        self.openclaw = OpenClawClient(
+            config=OpenClawConfig(
+                base_url=openclaw_url,
+                auth_token=openclaw_token,
+                timeout=30.0,
+                agent_id="main",
+            )
+        )
+
+        await self.openclaw.connect()
+
+        logger.info(f"Voice session {self.session_id} initialized")
+
+    async def close(self):
+        """Clean up resources."""
+        self.connected = False
+
+        if self.openclaw:
+            await self.openclaw.disconnect()
+
+        logger.info(f"Voice session {self.session_id} closed")
+
+    def _new_id(self) -> str:
+        """Generate random session ID."""
+        return "".join(random.choices(string.ascii_letters + string.digits, k=8))
+
+    async def process_audio_chunk(self, data: bytes):
+        """Process incoming audio chunk."""
+        async with self._buffer_lock:
+            self.audio_buffer.extend(data)
+
+            # Calculate duration
+            chunk_size = len(data)
+            chunk_duration = chunk_size / (self.sample_rate * self.channel_count * 4)
+
+            self.buffer_duration += chunk_duration
+
+            # Buffer until ~1 second
+            if self.buffer_duration >= 0.8:  # Slightly less than 1 second
+                await self._transcribe_buffered_audio()
+
+    async def _transcribe_buffered_audio(self):
+        """Transcribe accumulated audio and send to OpenClaw."""
+        async with self._buffer_lock:
+            if not self.audio_buffer:
+                return
+
+            # Convert bytearray to numpy array
+            audio_data = np.frombuffer(bytes(self.audio_buffer), dtype=np.float32)
+
+            # Transcribe
+            try:
+                result = await self.stt.transcribe_async(audio_data)
+
+                if result.text.strip():
+                    # Send to OpenClaw
+                    response = await self.openclaw.send_message(
+                        agent="main",
+                        message=result.text,
+                        speaker="voice_user",
+                    )
+
+                    # Log transcript
+                    timestamp = asyncio.get_event_loop().time()
+                    entry = {
+                        "timestamp": timestamp,
+                        "session_id": self.session_id,
+                        "transcript": result.text,
+                        "response": response,
+                    }
+
+                    self.transcript.append(entry)
+
+                    # Write to file
+                    with open(self.transcript_file, "a") as f:
+                        f.write(json.dumps(entry, ensure_ascii=False) + "\n")
+
+                    logger.info(
+                        f"Session {self.session_id}: "
+                        f'"{result.text[:50]}..." -> "{response[:50]}..."'
+                    )
+
+                    # Clear buffer
+                    self.audio_buffer.clear()
+                    self.buffer_duration = 0.0
+
+            except Exception as e:
+                logger.error(f"Transcription error: {e}")
+
+    async def synthesize_response(self, text: str):
+        """Synthesize TTS audio from response text."""
+        try:
+            audio = await self.tts.generate_async(
+                text=text,
+                voice_ref_path=None,
+                emotion_exaggeration=0.8,
+            )
+
+            return audio
+
+        except Exception as e:
+            logger.error(f"TTS synthesis error: {e}")
+            return None
+
+    def get_transcript(self) -> list:
+        """Get transcript history."""
+        return self.transcript
+
+
+async def handle_voice_websocket(websocket: WebSocket, session_id: str):
+    """Handle WebSocket connection for voice session."""
+    session = VoiceSession(session_id)
+
+    await websocket.accept()
+    session.connected = True
+
+    logger.info(f"WebSocket connected for session {session_id}")
+
+    # Initialize session
+    try:
+        await session.initialize()
+
+        # Send welcome message
+        await websocket.send_json({
+            "type": "welcome",
+            "message": "Connected to voice portal",
+        })
+
+        # Receive and process audio
+        while session.connected:
+            try:
+                data = await websocket.receive_bytes()
+
+                # Process audio chunk
+                await session.process_audio_chunk(data)
+
+            except WebSocketDisconnect:
+                session.connected = False
+                logger.info(f"WebSocket disconnected for session {session_id}")
+                break
+
+            except Exception as e:
+                logger.error(f"WebSocket error: {e}")
+                session.connected = False
+                break
+
+    except Exception as e:
+        logger.error(f"Session initialization error: {e}")
+        await websocket.close(code=1011, reason=str(e))
+
+    finally:
+        await session.close()
+
+
+def create_session_id() -> str:
+    """Generate a random session ID."""
+    return "".join(random.choices(string.ascii_letters + string.digits, k=8))