Fix voice portal: WebSocket routing, Caddy keepalive, audio pipeline

- Fix app.py: @app.get -> @app.websocket for /ws/voice route (was returning 403)
- Fix app.py: create static_dir before mounting it (AttributeError on startup)
- Fix voice.html: AudioWorkletNode constructor (was AudioWorkletProcessor)
- Fix voice.html: use ScriptProcessor directly (more reliable)
- Fix voice.html: send Float32 directly (server expects float32, was sending Int16)
- Fix voice.html: auto-detect ws/wss protocol from page URL
- Add server-side WebSocket keepalive pings every 15s so the Caddy reverse proxy does not time out idle connections
- Add detailed message type logging in WebSocket receive loop
- Strip Jarvis/Sage personas, rename bot to MoltMic
- Add /moltmic voice slash command for portal URL
- Update portal URL to https://voice.jezzahehn.com
This commit is contained in:
Jezza Hehn 2026-04-10 04:47:31 +00:00
parent bc580861dd
commit 3450e57ca6
6 changed files with 122 additions and 41 deletions

View file

@ -1,18 +1,18 @@
"""Jarvis Voice Bot - Discord Integration"""
"""MoltMic - OpenClaw Voice Bot"""
from .bot import JarvisVoiceBot, create_bot, run_bot
from .bot import MoltMicBot, create_bot, run_bot
from .voice_session import VoiceSession, VoiceSessionManager
from .audio_bridge import AudioBridge, PipelineAudioSource
from .commands import VoiceBotCommands, setup_commands
from .commands import MoltMicCommands, setup_commands
__all__ = [
"JarvisVoiceBot",
"MoltMicBot",
"create_bot",
"run_bot",
"VoiceSession",
"VoiceSessionManager",
"AudioBridge",
"PipelineAudioSource",
"VoiceBotCommands",
"MoltMicCommands",
"setup_commands",
]

View file

@ -20,8 +20,8 @@ from .vad_receiver import VADAudioReceiver
logger = get_logger(__name__)
class JarvisVoiceBot(discord.Client):
"""Discord bot for voice interaction with AI agents."""
class MoltMicBot(discord.Client):
"""MoltMic - Discord voice bot for OpenClaw."""
def __init__(
self,
@ -479,7 +479,7 @@ async def create_bot(
stt_transcriber=None,
orchestrator=None,
audio_output_callbacks=None,
) -> JarvisVoiceBot:
) -> MoltMicBot:
"""
Create and initialize the Discord bot.
@ -494,7 +494,7 @@ async def create_bot(
Returns:
Initialized bot instance
"""
bot = JarvisVoiceBot(
bot = MoltMicBot(
config=config,
openclaw_config=openclaw_config,
tts_synthesizer=tts_synthesizer,

View file

@ -94,6 +94,43 @@ class MoltMicCommands(app_commands.Group):
logger.exception(f"Status error: {e}")
await interaction.followup.send("❌ Error.", ephemeral=True)
@app_commands.command(name="voice", description="Open voice portal in browser")
async def voice(self, interaction: discord.Interaction):
    """Reply with an embed linking to a freshly created browser voice portal session.

    The interaction is deferred first, then a session id is created, the
    portal link is built from it and sent as a follow-up embed. Any failure
    is logged with its traceback and reported to the user as a short
    error follow-up.
    """
    await interaction.response.defer(thinking=True)

    try:
        # Imported lazily: a module-level import would create a circular dependency.
        from server.voice_ws import create_session_id

        session_id = create_session_id()
        portal_url = f"https://voice.jezzahehn.com/voice?session={session_id}"

        link_markdown = f"[Open Voice Portal]({portal_url})"
        steps = (
            "1. Click the link above\n"
            "2. Allow microphone access\n"
            "3. Start talking! The bot will listen and respond."
        )

        card = discord.Embed(
            title="🎙️ Voice Portal",
            description="Click below to open the voice portal in your browser",
            color=discord.Color.blue(),
        )
        card.add_field(name="Portal URL", value=link_markdown, inline=False)
        card.add_field(name="Instructions", value=steps, inline=False)
        card.set_footer(text="The bot will start listening when you connect")

        await interaction.followup.send(embed=card)
        logger.info(f"Voice portal created for session {session_id}")
    except Exception as e:
        logger.exception(f"Voice portal error: {e}")
        await interaction.followup.send("❌ Failed to create voice portal.", ephemeral=True)
async def setup_commands(bot):
"""Register slash commands."""

View file

@ -4,10 +4,12 @@ Provides HTTP endpoints for:
- Text-to-Speech (OpenAI /v1/audio/speech compatible)
- Speech-to-Text (OpenAI /v1/audio/transcriptions compatible)
- Health checks and status
- WebSocket voice endpoint for browser-based speech
Shares STT and TTS engines with Discord bot for efficiency.
"""
import asyncio
import io
import tempfile
import time
@ -16,13 +18,15 @@ from typing import Literal, Optional
import numpy as np
import soundfile as sf
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi import FastAPI, File, Form, HTTPException, UploadFile, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import Response, StreamingResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field
from server.stt import FasterWhisperSTT, STTTranscriber
from server.tts import ChatterboxTTS, TTSSynthesizer
from server.voice_ws import handle_voice_websocket, create_session_id
from utils.logging import get_logger
logger = get_logger(__name__)
@ -111,6 +115,13 @@ class VoiceAPIServer:
allow_headers=["*"],
)
# Create static files directory
self.static_dir = Path("server/static")
self.static_dir.mkdir(parents=True, exist_ok=True)
# Mount static files
self.app.mount("/static", StaticFiles(directory=str(self.static_dir)), name="static")
# Register routes
self._register_routes()
@ -129,6 +140,19 @@ class VoiceAPIServer:
"""Health check endpoint."""
return await self._health_check()
@self.app.get("/voice")
async def get_voice_page():
    """Serve the voice portal HTML page.

    Returns:
        A ``text/html`` response whose body is the content of
        ``voice.html`` inside the server's static directory.

    Raises:
        HTTPException: 404 when ``voice.html`` does not exist.
    """
    page_path = self.static_dir / "voice.html"
    try:
        # Decode explicitly as UTF-8: the default is the process locale
        # encoding, which can fail on or garble non-ASCII page content.
        html = page_path.read_text(encoding="utf-8")
    except FileNotFoundError:
        # Reading first (instead of exists()-then-read) avoids a race if
        # the file disappears between the check and the read.
        raise HTTPException(status_code=404, detail="Voice page not found") from None
    return Response(content=html, media_type="text/html")
@self.app.websocket("/ws/voice/{session_id}")
async def voice_websocket(session_id: str, websocket: WebSocket):
    """Hand an incoming voice WebSocket and its session id to the voice handler."""
    await handle_voice_websocket(websocket=websocket, session_id=session_id)
@self.app.post("/v1/audio/speech")
async def create_speech(request: TTSRequest):
"""

View file

@ -205,7 +205,8 @@
<script>
const sessionId = new URLSearchParams(window.location.search).get('session');
const wsUrl = `wss://${window.location.host}/ws/voice/${sessionId}`;
const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
const wsUrl = `${wsProtocol}//${window.location.host}/ws/voice/${sessionId}`;
let ws = null;
let audioContext = null;
@ -339,19 +340,12 @@
}
});
console.log('Microphone acquired, stream tracks:', stream.getTracks().length);
microphone = audioContext.createMediaStreamSource(stream);
console.log('MediaStreamSource created, sample rate:', audioContext.sampleRate);
// Use AudioWorklet or ScriptProcessor as fallback
if (audioContext.audioWorklet) {
try {
await initAudioWorklet();
} catch (error) {
console.warn('AudioWorklet failed, falling back to ScriptProcessor:', error);
initScriptProcessor();
}
} else {
initScriptProcessor();
}
// Use ScriptProcessor for reliable audio capture
initScriptProcessor();
} catch (error) {
console.error('Audio initialization error:', error);
@ -365,11 +359,7 @@
await audioContext.audioWorklet.addModule(workletUrl);
const processor = new AudioWorkletProcessor(audioContext, {
numberOfInputs: 1,
numberOfOutputs: 1,
outputChannelCount: [1]
});
const processor = new AudioWorkletNode(audioContext, 'voice-processor');
microphone.connect(processor);
@ -411,14 +401,9 @@
/**
 * Send one chunk of captured audio to the server as raw Float32 samples.
 *
 * The chunk is dropped silently when the WebSocket is missing or not open.
 *
 * @param {Float32Array} audioData - Samples for one captured audio chunk.
 */
function sendAudio(audioData) {
    if (!ws || ws.readyState !== WebSocket.OPEN) return;

    // Transmit each chunk exactly once, as Float32 (the format the server
    // expects); no Int16 conversion and no second send of the same chunk.
    const samples = audioData instanceof Float32Array
        ? audioData
        : new Float32Array(audioData);

    // Pass the typed-array view rather than `.buffer`: a view may cover only
    // part of its backing ArrayBuffer, and sending the buffer would leak the
    // bytes outside the view into the stream.
    ws.send(samples);
    console.log('Sent audio chunk:', samples.length, 'samples');
}
// Event listeners

View file

@ -208,13 +208,43 @@ async def handle_voice_websocket(websocket: WebSocket, session_id: str):
"message": "Connected to voice portal",
})
# Background task: send periodic pings to keep connection alive through Caddy
async def keepalive():
    """Send a ``{"type": "ping"}`` JSON message every 15 seconds while the session is connected.

    The loop ends as soon as the session is no longer connected, or when
    sleeping or sending raises an exception (best-effort: the error is
    not reported).
    """
    while True:
        if not session.connected:
            return
        try:
            await asyncio.sleep(15)
            # The session may have disconnected while sleeping; re-check
            # before writing to the socket.
            if session.connected:
                await websocket.send_json({"type": "ping"})
        except Exception:
            return
keepalive_task = asyncio.create_task(keepalive())
# Receive and process audio
chunk_count = 0
while session.connected:
try:
data = await websocket.receive_bytes()
msg = await websocket.receive()
msg_type = msg.get("type", "unknown")
# Process audio chunk
await session.process_audio_chunk(data)
if msg_type == "websocket.disconnect":
session.connected = False
logger.info(f"WebSocket disconnected for session {session_id}")
break
elif msg_type == "websocket.receive":
if "bytes" in msg:
chunk_count += 1
if chunk_count <= 5 or chunk_count % 100 == 0:
logger.info(f"Audio chunk #{chunk_count}: {len(msg['bytes'])} bytes")
await session.process_audio_chunk(msg["bytes"])
elif "text" in msg:
pass
else:
logger.warning(f"Unknown receive msg: {msg}")
else:
logger.warning(f"Unknown WebSocket msg type: {msg_type}: {msg}")
except WebSocketDisconnect:
session.connected = False
@ -222,13 +252,18 @@ async def handle_voice_websocket(websocket: WebSocket, session_id: str):
break
except Exception as e:
logger.error(f"WebSocket error: {e}")
logger.error(f"WebSocket error in receive loop: {e}", exc_info=True)
session.connected = False
break
keepalive_task.cancel()
except Exception as e:
logger.error(f"Session initialization error: {e}")
await websocket.close(code=1011, reason=str(e))
logger.error(f"Session error: {e}", exc_info=True)
try:
await websocket.close(code=1011, reason=str(e))
except Exception:
pass
finally:
await session.close()