Fix voice portal: WebSocket routing, Caddy keepalive, audio pipeline

- Fix app.py: @app.get -> @app.websocket for /ws/voice route (was returning 403)
- Fix app.py: create static_dir before mounting it (AttributeError on startup)
- Fix voice.html: AudioWorkletNode constructor (was AudioWorkletProcessor)
- Fix voice.html: always capture audio with ScriptProcessor instead of trying AudioWorklet first (more reliable)
- Fix voice.html: send Float32 directly (server expects float32, was sending Int16)
- Fix voice.html: auto-detect ws/wss protocol from page URL
- Add server-side WebSocket keepalive pings ({"type": "ping"} every 15s) so the Caddy reverse proxy does not time out the connection
- Add detailed message type logging in WebSocket receive loop
- Strip Jarvis/Sage personas, rename bot to MoltMic
- Add /moltmic voice slash command that replies with a link to the voice portal
- Update portal URL to https://voice.jezzahehn.com
This commit is contained in:
Jezza Hehn 2026-04-10 04:47:31 +00:00
parent bc580861dd
commit 3450e57ca6
6 changed files with 122 additions and 41 deletions

View file

@ -1,18 +1,18 @@
"""Jarvis Voice Bot - Discord Integration""" """MoltMic - OpenClaw Voice Bot"""
from .bot import JarvisVoiceBot, create_bot, run_bot from .bot import MoltMicBot, create_bot, run_bot
from .voice_session import VoiceSession, VoiceSessionManager from .voice_session import VoiceSession, VoiceSessionManager
from .audio_bridge import AudioBridge, PipelineAudioSource from .audio_bridge import AudioBridge, PipelineAudioSource
from .commands import VoiceBotCommands, setup_commands from .commands import MoltMicCommands, setup_commands
__all__ = [ __all__ = [
"JarvisVoiceBot", "MoltMicBot",
"create_bot", "create_bot",
"run_bot", "run_bot",
"VoiceSession", "VoiceSession",
"VoiceSessionManager", "VoiceSessionManager",
"AudioBridge", "AudioBridge",
"PipelineAudioSource", "PipelineAudioSource",
"VoiceBotCommands", "MoltMicCommands",
"setup_commands", "setup_commands",
] ]

View file

@ -20,8 +20,8 @@ from .vad_receiver import VADAudioReceiver
logger = get_logger(__name__) logger = get_logger(__name__)
class JarvisVoiceBot(discord.Client): class MoltMicBot(discord.Client):
"""Discord bot for voice interaction with AI agents.""" """MoltMic - Discord voice bot for OpenClaw."""
def __init__( def __init__(
self, self,
@ -479,7 +479,7 @@ async def create_bot(
stt_transcriber=None, stt_transcriber=None,
orchestrator=None, orchestrator=None,
audio_output_callbacks=None, audio_output_callbacks=None,
) -> JarvisVoiceBot: ) -> MoltMicBot:
""" """
Create and initialize the Discord bot. Create and initialize the Discord bot.
@ -494,7 +494,7 @@ async def create_bot(
Returns: Returns:
Initialized bot instance Initialized bot instance
""" """
bot = JarvisVoiceBot( bot = MoltMicBot(
config=config, config=config,
openclaw_config=openclaw_config, openclaw_config=openclaw_config,
tts_synthesizer=tts_synthesizer, tts_synthesizer=tts_synthesizer,

View file

@ -94,6 +94,43 @@ class MoltMicCommands(app_commands.Group):
logger.exception(f"Status error: {e}") logger.exception(f"Status error: {e}")
await interaction.followup.send("❌ Error.", ephemeral=True) await interaction.followup.send("❌ Error.", ephemeral=True)
@app_commands.command(name="voice", description="Open voice portal in browser")
async def voice(self, interaction: discord.Interaction):
    """Reply with an embed linking to a new browser voice-portal session.

    The interaction is deferred first, then a fresh session id is created and
    placed in the portal URL's ``session`` query parameter. Any failure after
    the defer is logged and reported to the user with a short error message.
    """
    await interaction.response.defer(thinking=True)

    try:
        # Deferred import: importing at module level would be circular.
        from server.voice_ws import create_session_id

        session_id = create_session_id()
        portal_url = f"https://voice.jezzahehn.com/voice?session={session_id}"

        instructions = "1. Click the link above\n2. Allow microphone access\n3. Start talking! The bot will listen and respond."
        embed = (
            discord.Embed(
                title="🎙️ Voice Portal",
                description="Click below to open the voice portal in your browser",
                color=discord.Color.blue(),
            )
            .add_field(
                name="Portal URL",
                value=f"[Open Voice Portal]({portal_url})",
                inline=False,
            )
            .add_field(name="Instructions", value=instructions, inline=False)
            .set_footer(text="The bot will start listening when you connect")
        )

        await interaction.followup.send(embed=embed)
        logger.info(f"Voice portal created for session {session_id}")
    except Exception as e:
        logger.exception(f"Voice portal error: {e}")
        await interaction.followup.send("❌ Failed to create voice portal.", ephemeral=True)
async def setup_commands(bot): async def setup_commands(bot):
"""Register slash commands.""" """Register slash commands."""

View file

@ -4,10 +4,12 @@ Provides HTTP endpoints for:
- Text-to-Speech (OpenAI /v1/audio/speech compatible) - Text-to-Speech (OpenAI /v1/audio/speech compatible)
- Speech-to-Text (OpenAI /v1/audio/transcriptions compatible) - Speech-to-Text (OpenAI /v1/audio/transcriptions compatible)
- Health checks and status - Health checks and status
- WebSocket voice endpoint for browser-based speech
Shares STT and TTS engines with Discord bot for efficiency. Shares STT and TTS engines with Discord bot for efficiency.
""" """
import asyncio
import io import io
import tempfile import tempfile
import time import time
@ -16,13 +18,15 @@ from typing import Literal, Optional
import numpy as np import numpy as np
import soundfile as sf import soundfile as sf
from fastapi import FastAPI, File, Form, HTTPException, UploadFile from fastapi import FastAPI, File, Form, HTTPException, UploadFile, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import Response, StreamingResponse from fastapi.responses import Response, StreamingResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from server.stt import FasterWhisperSTT, STTTranscriber from server.stt import FasterWhisperSTT, STTTranscriber
from server.tts import ChatterboxTTS, TTSSynthesizer from server.tts import ChatterboxTTS, TTSSynthesizer
from server.voice_ws import handle_voice_websocket, create_session_id
from utils.logging import get_logger from utils.logging import get_logger
logger = get_logger(__name__) logger = get_logger(__name__)
@ -111,6 +115,13 @@ class VoiceAPIServer:
allow_headers=["*"], allow_headers=["*"],
) )
# Create static files directory
self.static_dir = Path("server/static")
self.static_dir.mkdir(parents=True, exist_ok=True)
# Mount static files
self.app.mount("/static", StaticFiles(directory=str(self.static_dir)), name="static")
# Register routes # Register routes
self._register_routes() self._register_routes()
@ -129,6 +140,19 @@ class VoiceAPIServer:
"""Health check endpoint.""" """Health check endpoint."""
return await self._health_check() return await self._health_check()
@self.app.get("/voice")
async def get_voice_page():
    """Serve the voice portal HTML page.

    Returns:
        A ``text/html`` response containing ``voice.html`` from the
        server's static directory.

    Raises:
        HTTPException: 404 when ``voice.html`` is not present.
    """
    static_file = self.static_dir / "voice.html"
    try:
        # Decode explicitly as UTF-8 instead of the locale default, so the
        # page content does not depend on the host's locale settings.
        html = static_file.read_text(encoding="utf-8")
    except FileNotFoundError as exc:
        # Reading first and mapping the failure avoids the window between
        # an exists() check and the actual read.
        raise HTTPException(status_code=404, detail="Voice page not found") from exc
    return Response(content=html, media_type="text/html")
# WebSocket route for the browser voice portal; the session id is taken from the URL path.
@self.app.websocket("/ws/voice/{session_id}")
async def voice_websocket(session_id: str, websocket: WebSocket):
"""WebSocket endpoint for a voice session.

Passes the connection and the ``session_id`` path parameter straight to
``handle_voice_websocket``; no message handling is done in this wrapper.
"""
await handle_voice_websocket(websocket, session_id)
@self.app.post("/v1/audio/speech") @self.app.post("/v1/audio/speech")
async def create_speech(request: TTSRequest): async def create_speech(request: TTSRequest):
""" """

View file

@ -205,7 +205,8 @@
<script> <script>
const sessionId = new URLSearchParams(window.location.search).get('session'); const sessionId = new URLSearchParams(window.location.search).get('session');
const wsUrl = `wss://${window.location.host}/ws/voice/${sessionId}`; const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
const wsUrl = `${wsProtocol}//${window.location.host}/ws/voice/${sessionId}`;
let ws = null; let ws = null;
let audioContext = null; let audioContext = null;
@ -339,19 +340,12 @@
} }
}); });
console.log('Microphone acquired, stream tracks:', stream.getTracks().length);
microphone = audioContext.createMediaStreamSource(stream); microphone = audioContext.createMediaStreamSource(stream);
console.log('MediaStreamSource created, sample rate:', audioContext.sampleRate);
// Use AudioWorklet or ScriptProcessor as fallback // Use ScriptProcessor for reliable audio capture
if (audioContext.audioWorklet) {
try {
await initAudioWorklet();
} catch (error) {
console.warn('AudioWorklet failed, falling back to ScriptProcessor:', error);
initScriptProcessor(); initScriptProcessor();
}
} else {
initScriptProcessor();
}
} catch (error) { } catch (error) {
console.error('Audio initialization error:', error); console.error('Audio initialization error:', error);
@ -365,11 +359,7 @@
await audioContext.audioWorklet.addModule(workletUrl); await audioContext.audioWorklet.addModule(workletUrl);
const processor = new AudioWorkletProcessor(audioContext, { const processor = new AudioWorkletNode(audioContext, 'voice-processor');
numberOfInputs: 1,
numberOfOutputs: 1,
outputChannelCount: [1]
});
microphone.connect(processor); microphone.connect(processor);
@ -411,14 +401,9 @@
function sendAudio(audioData) { function sendAudio(audioData) {
if (!ws || ws.readyState !== WebSocket.OPEN) return; if (!ws || ws.readyState !== WebSocket.OPEN) return;
// Convert Float32 to Int16 for transmission // Send as Float32Array directly
const int16Data = new Int16Array(audioData.length); ws.send(audioData.buffer);
for (let i = 0; i < audioData.length; i++) { console.log('Sent audio chunk:', audioData.length, 'samples');
const sample = Math.max(-1, Math.min(1, audioData[i]));
int16Data[i] = sample < 0 ? sample * 0x8000 : sample * 0x7FFF;
}
ws.send(int16Data.buffer);
} }
// Event listeners // Event listeners

View file

@ -208,13 +208,43 @@ async def handle_voice_websocket(websocket: WebSocket, session_id: str):
"message": "Connected to voice portal", "message": "Connected to voice portal",
}) })
# Receive and process audio # Background task: send periodic pings to keep connection alive through Caddy
async def keepalive():
while session.connected: while session.connected:
try: try:
data = await websocket.receive_bytes() await asyncio.sleep(15)
if session.connected:
await websocket.send_json({"type": "ping"})
except Exception:
break
# Process audio chunk keepalive_task = asyncio.create_task(keepalive())
await session.process_audio_chunk(data)
# Receive and process audio
chunk_count = 0
while session.connected:
try:
msg = await websocket.receive()
msg_type = msg.get("type", "unknown")
if msg_type == "websocket.disconnect":
session.connected = False
logger.info(f"WebSocket disconnected for session {session_id}")
break
elif msg_type == "websocket.receive":
if "bytes" in msg:
chunk_count += 1
if chunk_count <= 5 or chunk_count % 100 == 0:
logger.info(f"Audio chunk #{chunk_count}: {len(msg['bytes'])} bytes")
await session.process_audio_chunk(msg["bytes"])
elif "text" in msg:
pass
else:
logger.warning(f"Unknown receive msg: {msg}")
else:
logger.warning(f"Unknown WebSocket msg type: {msg_type}: {msg}")
except WebSocketDisconnect: except WebSocketDisconnect:
session.connected = False session.connected = False
@ -222,13 +252,18 @@ async def handle_voice_websocket(websocket: WebSocket, session_id: str):
break break
except Exception as e: except Exception as e:
logger.error(f"WebSocket error: {e}") logger.error(f"WebSocket error in receive loop: {e}", exc_info=True)
session.connected = False session.connected = False
break break
keepalive_task.cancel()
except Exception as e: except Exception as e:
logger.error(f"Session initialization error: {e}") logger.error(f"Session error: {e}", exc_info=True)
try:
await websocket.close(code=1011, reason=str(e)) await websocket.close(code=1011, reason=str(e))
except Exception:
pass
finally: finally:
await session.close() await session.close()