diff --git a/server/static/voice-worklet.js b/server/static/voice-worklet.js new file mode 100644 index 0000000..437a1d4 --- /dev/null +++ b/server/static/voice-worklet.js @@ -0,0 +1,31 @@ +// AudioWorklet processor for capturing raw PCM audio +// Captures audio at 16kHz mono float32 as specified by getUserMedia + +class VoiceProcessor extends AudioWorkletProcessor { + constructor() { + super(); + this.port.onmessage = this.handleMessage.bind(this); + } + + handleMessage(event) { + // No message handling needed - audio is captured automatically + // in onaudioprocess + } + + process(inputs, outputs, parameters) { + // Get input audio + const input = inputs[0]; + if (input && input.length > 0) { + // Get mono channel (channel 0) + const channelData = input[0]; + + // Send audio data to main thread + this.port.postMessage({ type: 'audio', audio: channelData }); + } + + // Keep processor alive + return true; + } +} + +registerProcessor('voice-processor', VoiceProcessor); diff --git a/server/static/voice.html b/server/static/voice.html new file mode 100644 index 0000000..87e4071 --- /dev/null +++ b/server/static/voice.html @@ -0,0 +1,436 @@ + + + + + + MoltMic Voice Portal + + + +
+

🎙️ MoltMic Voice

+ +
+ + Disconnected +
+ + + +
+ + +
+ + + +

Say something and the bot will respond. Auto-reconnects on disconnect.

+
+ + + + diff --git a/server/voice_ws.py b/server/voice_ws.py new file mode 100644 index 0000000..ebac6ae --- /dev/null +++ b/server/voice_ws.py @@ -0,0 +1,239 @@ +"""WebSocket voice endpoint for browser-based speech-to-text and text-to-speech. + +Accepts binary PCM audio from browser, transcribes via Deepgram, sends to OpenClaw Gateway, +and streams TTS audio back to browser. +""" + +import asyncio +import json +import logging +import os +import random +import string +from pathlib import Path +from typing import Optional + +import numpy as np +from fastapi import WebSocket, WebSocketDisconnect +from pydantic import BaseModel + +from server.stt import DeepgramSTT +from server.tts import VeniceKokoroTTS +from openclaw_client.client import OpenClawClient, OpenClawConfig + +logger = logging.getLogger(__name__) + + +class VoiceSession: + """Manages a single voice session.""" + + def __init__(self, session_id: str): + self.session_id = session_id + self.transcript_file = Path("logs/voice") / f"{session_id}.jsonl" + self.transcript_file.parent.mkdir(parents=True, exist_ok=True) + + # Audio buffering + self.audio_buffer = bytearray() + self.buffer_duration = 0.0 # Seconds + self._buffer_lock = asyncio.Lock() + + # Audio processing + self.sample_rate = 16000 + self.channel_count = 1 + self.bits_per_sample = 32 + + # Engines (self-contained, don't share with run.py) + self.stt = None + self.tts = None + self.openclaw = None + + # Session state + self.connected = False + self.transcript = [] + + logger.info(f"Created voice session {session_id}") + + async def initialize(self): + """Initialize STT, TTS, and OpenClaw client.""" + # Load env vars + deepgram_key = os.getenv("DEEPGRAM_API_KEY") + venice_key = os.getenv("VENICE_API_KEY") + openclaw_url = os.getenv("OPENCLAW_BASE_URL", "ws://192.168.50.9:18789") + openclaw_token = os.getenv("OPENCLAW_AUTH_TOKEN") + + if not deepgram_key or not venice_key: + raise ValueError("Missing required API keys") + + # Initialize STT + self.stt = DeepgramSTT( + api_key=deepgram_key, + model="nova-3", + language="en", + sample_rate=self.sample_rate, + ) + + # Initialize TTS + self.tts = VeniceKokoroTTS( + api_key=venice_key, + voice="am_liam", + base_url="https://api.venice.ai/api/v1", + ) + + # Initialize OpenClaw client + self.openclaw = OpenClawClient( + config=OpenClawConfig( + base_url=openclaw_url, + auth_token=openclaw_token, + timeout=30.0, + agent_id="main", + ) + ) + + await self.openclaw.connect() + + logger.info(f"Voice session {self.session_id} initialized") + + async def close(self): + """Clean up resources.""" + self.connected = False + + if self.openclaw: + await self.openclaw.disconnect() + + logger.info(f"Voice session {self.session_id} closed") + + def _new_id(self) -> str: + """Generate random session ID.""" + return "".join(random.choices(string.ascii_letters + string.digits, k=8)) + + async def process_audio_chunk(self, data: bytes): + """Process incoming audio chunk.""" + async with self._buffer_lock: + self.audio_buffer.extend(data) + + # Calculate duration + chunk_size = len(data) + chunk_duration = chunk_size / (self.sample_rate * self.channel_count * 4) + + self.buffer_duration += chunk_duration + + # Buffer until ~1 second + if self.buffer_duration >= 0.8: # Slightly less than 1 second + await self._transcribe_buffered_audio() + + async def _transcribe_buffered_audio(self): + """Transcribe accumulated audio and send to OpenClaw.""" + async with self._buffer_lock: + if not self.audio_buffer: + return + + # Convert bytearray to numpy array + audio_data = np.frombuffer(bytes(self.audio_buffer), dtype=np.float32) + + # Transcribe + try: + result = await self.stt.transcribe_async(audio_data) + + if result.text.strip(): + # Send to OpenClaw + response = await self.openclaw.send_message( + agent="main", + message=result.text, + speaker="voice_user", + ) + + # Log transcript + timestamp = asyncio.get_event_loop().time() + entry = { + "timestamp": timestamp, + "session_id": self.session_id, + "transcript": result.text, + "response": response, + } + + self.transcript.append(entry) + + # Write to file + with open(self.transcript_file, "a") as f: + f.write(json.dumps(entry, ensure_ascii=False) + "\n") + + logger.info( + f"Session {self.session_id}: " + f'"{result.text[:50]}..." -> "{response[:50]}..."' + ) + + # Clear buffer + self.audio_buffer.clear() + self.buffer_duration = 0.0 + + except Exception as e: + logger.error(f"Transcription error: {e}") + + async def synthesize_response(self, text: str): + """Synthesize TTS audio from response text.""" + try: + audio = await self.tts.generate_async( + text=text, + voice_ref_path=None, + emotion_exaggeration=0.8, + ) + + return audio + + except Exception as e: + logger.error(f"TTS synthesis error: {e}") + return None + + def get_transcript(self) -> list: + """Get transcript history.""" + return self.transcript + + +async def handle_voice_websocket(websocket: WebSocket, session_id: str): + """Handle WebSocket connection for voice session.""" + session = VoiceSession(session_id) + + await websocket.accept() + session.connected = True + + logger.info(f"WebSocket connected for session {session_id}") + + # Initialize session + try: + await session.initialize() + + # Send welcome message + await websocket.send_json({ + "type": "welcome", + "message": "Connected to voice portal", + }) + + # Receive and process audio + while session.connected: + try: + data = await websocket.receive_bytes() + + # Process audio chunk + await session.process_audio_chunk(data) + + except WebSocketDisconnect: + session.connected = False + logger.info(f"WebSocket disconnected for session {session_id}") + break + + except Exception as e: + logger.error(f"WebSocket error: {e}") + session.connected = False + break + + except Exception as e: + logger.error(f"Session initialization error: {e}") + await websocket.close(code=1011, reason=str(e)) + + finally: + await session.close() + + +def create_session_id() -> str: + """Generate a random session ID.""" + return "".join(random.choices(string.ascii_letters + string.digits, k=8))