Fix voice portal: WebSocket routing, Caddy keepalive, audio pipeline

- Fix app.py: @app.get -> @app.websocket for /ws/voice route (was returning 403)
- Fix app.py: create static_dir before mounting it (AttributeError on startup)
- Fix voice.html: AudioWorkletNode constructor (was AudioWorkletProcessor)
- Fix voice.html: use ScriptProcessor directly (more reliable)
- Fix voice.html: send Float32 directly (server expects float32, was sending Int16)
- Fix voice.html: auto-detect ws/wss protocol from page URL
- Add keepalive pings every 15s so the WebSocket is not timed out by the Caddy reverse proxy
- Add detailed message type logging in WebSocket receive loop
- Strip Jarvis/Sage personas, rename bot to MoltMic
- Add /moltmic voice slash command for portal URL
- Update portal URL to https://voice.jezzahehn.com
2026-04-10 04:47:31 +00:00 · 2026-04-10 04:47:31 +00:00 · 3450e57ca6
commit 3450e57ca6
parent bc580861dd
6 changed files with 122 additions and 41 deletions
--- a/discord_bot/__init__.py
+++ b/discord_bot/__init__.py
@@ -1,18 +1,18 @@
-"""Jarvis Voice Bot - Discord Integration"""
+"""MoltMic - OpenClaw Voice Bot"""

-from .bot import JarvisVoiceBot, create_bot, run_bot
+from .bot import MoltMicBot, create_bot, run_bot
 from .voice_session import VoiceSession, VoiceSessionManager
 from .audio_bridge import AudioBridge, PipelineAudioSource
-from .commands import VoiceBotCommands, setup_commands
+from .commands import MoltMicCommands, setup_commands

 __all__ = [
-    "JarvisVoiceBot",
+    "MoltMicBot",
    "create_bot",
    "run_bot",
    "VoiceSession",
    "VoiceSessionManager",
    "AudioBridge",
    "PipelineAudioSource",
-    "VoiceBotCommands",
+    "MoltMicCommands",
    "setup_commands",
 ]
--- a/discord_bot/bot.py
+++ b/discord_bot/bot.py
@@ -20,8 +20,8 @@ from .vad_receiver import VADAudioReceiver
 logger = get_logger(__name__)


-class JarvisVoiceBot(discord.Client):
-    """Discord bot for voice interaction with AI agents."""
+class MoltMicBot(discord.Client):
+    """MoltMic - Discord voice bot for OpenClaw."""

    def __init__(
        self,
@@ -479,7 +479,7 @@ async def create_bot(
    stt_transcriber=None,
    orchestrator=None,
    audio_output_callbacks=None,
-) -> JarvisVoiceBot:
+) -> MoltMicBot:
    """
    Create and initialize the Discord bot.

@@ -494,7 +494,7 @@ async def create_bot(
    Returns:
        Initialized bot instance
    """
-    bot = JarvisVoiceBot(
+    bot = MoltMicBot(
        config=config,
        openclaw_config=openclaw_config,
        tts_synthesizer=tts_synthesizer,
--- a/discord_bot/commands.py
+++ b/discord_bot/commands.py
@@ -94,6 +94,43 @@ class MoltMicCommands(app_commands.Group):
            logger.exception(f"Status error: {e}")
            await interaction.followup.send("❌ Error.", ephemeral=True)

+    @app_commands.command(name="voice", description="Open voice portal in browser")
+    async def voice(self, interaction: discord.Interaction):
+        """Generate a voice portal URL for browser-based speech."""
+        await interaction.response.defer(thinking=True)
+
+        try:
+            # Import here to avoid circular dependency
+            from server.voice_ws import create_session_id
+
+            session_id = create_session_id()
+            portal_url = f"https://voice.jezzahehn.com/voice?session={session_id}"
+
+            embed = discord.Embed(
+                title="🎙️ Voice Portal",
+                description="Click below to open the voice portal in your browser",
+                color=discord.Color.blue()
+            )
+            embed.add_field(
+                name="Portal URL",
+                value=f"[Open Voice Portal]({portal_url})",
+                inline=False
+            )
+            embed.add_field(
+                name="Instructions",
+                value="1. Click the link above\n2. Allow microphone access\n3. Start talking! The bot will listen and respond.",
+                inline=False
+            )
+            embed.set_footer(text="The bot will start listening when you connect")
+
+            await interaction.followup.send(embed=embed)
+
+            logger.info(f"Voice portal created for session {session_id}")
+
+        except Exception as e:
+            logger.exception(f"Voice portal error: {e}")
+            await interaction.followup.send("❌ Failed to create voice portal.", ephemeral=True)
+

 async def setup_commands(bot):
    """Register slash commands."""
--- a/server/app.py
+++ b/server/app.py
@@ -4,10 +4,12 @@ Provides HTTP endpoints for:
 - Text-to-Speech (OpenAI /v1/audio/speech compatible)
 - Speech-to-Text (OpenAI /v1/audio/transcriptions compatible)
 - Health checks and status
+- WebSocket voice endpoint for browser-based speech

 Shares STT and TTS engines with Discord bot for efficiency.
 """

+import asyncio
 import io
 import tempfile
 import time
@@ -16,13 +18,15 @@ from typing import Literal, Optional

 import numpy as np
 import soundfile as sf
-from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile, WebSocket, WebSocketDisconnect
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import Response, StreamingResponse
+from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel, Field

 from server.stt import FasterWhisperSTT, STTTranscriber
 from server.tts import ChatterboxTTS, TTSSynthesizer
+from server.voice_ws import handle_voice_websocket, create_session_id
 from utils.logging import get_logger

 logger = get_logger(__name__)
@@ -111,6 +115,13 @@ class VoiceAPIServer:
            allow_headers=["*"],
        )

+        # Create static files directory
+        self.static_dir = Path("server/static")
+        self.static_dir.mkdir(parents=True, exist_ok=True)
+
+        # Mount static files
+        self.app.mount("/static", StaticFiles(directory=str(self.static_dir)), name="static")
+
        # Register routes
        self._register_routes()

@@ -129,6 +140,19 @@ class VoiceAPIServer:
            """Health check endpoint."""
            return await self._health_check()

+        @self.app.get("/voice")
+        async def get_voice_page():
+            """Serve voice portal HTML page."""
+            static_file = self.static_dir / "voice.html"
+            if static_file.exists():
+                return Response(content=static_file.read_text(), media_type="text/html")
+            raise HTTPException(status_code=404, detail="Voice page not found")
+
+        @self.app.websocket("/ws/voice/{session_id}")
+        async def voice_websocket(session_id: str, websocket: WebSocket):
+            """WebSocket endpoint for voice session."""
+            await handle_voice_websocket(websocket, session_id)
+
        @self.app.post("/v1/audio/speech")
        async def create_speech(request: TTSRequest):
            """
--- a/server/static/voice.html
+++ b/server/static/voice.html
@@ -205,7 +205,8 @@

    <script>
        const sessionId = new URLSearchParams(window.location.search).get('session');
-        const wsUrl = `wss://${window.location.host}/ws/voice/${sessionId}`;
+        const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
+        const wsUrl = `${wsProtocol}//${window.location.host}/ws/voice/${sessionId}`;

        let ws = null;
        let audioContext = null;
@@ -339,19 +340,12 @@
                    }
                });

+                console.log('Microphone acquired, stream tracks:', stream.getTracks().length);
                microphone = audioContext.createMediaStreamSource(stream);
+                console.log('MediaStreamSource created, sample rate:', audioContext.sampleRate);

-                // Use AudioWorklet or ScriptProcessor as fallback
-                if (audioContext.audioWorklet) {
-                    try {
-                        await initAudioWorklet();
-                    } catch (error) {
-                        console.warn('AudioWorklet failed, falling back to ScriptProcessor:', error);
-                        initScriptProcessor();
-                    }
-                } else {
-                    initScriptProcessor();
-                }
+                // Use ScriptProcessor for reliable audio capture
+                initScriptProcessor();

            } catch (error) {
                console.error('Audio initialization error:', error);
@@ -365,11 +359,7 @@

            await audioContext.audioWorklet.addModule(workletUrl);

-            const processor = new AudioWorkletProcessor(audioContext, {
-                numberOfInputs: 1,
-                numberOfOutputs: 1,
-                outputChannelCount: [1]
-            });
+            const processor = new AudioWorkletNode(audioContext, 'voice-processor');

            microphone.connect(processor);

@@ -411,14 +401,9 @@
        function sendAudio(audioData) {
            if (!ws || ws.readyState !== WebSocket.OPEN) return;

-            // Convert Float32 to Int16 for transmission
-            const int16Data = new Int16Array(audioData.length);
-            for (let i = 0; i < audioData.length; i++) {
-                const sample = Math.max(-1, Math.min(1, audioData[i]));
-                int16Data[i] = sample < 0 ? sample * 0x8000 : sample * 0x7FFF;
-            }
-
-            ws.send(int16Data.buffer);
+            // Send as Float32Array directly
+            ws.send(audioData.buffer);
+            console.log('Sent audio chunk:', audioData.length, 'samples');
        }

        // Event listeners
--- a/server/voice_ws.py
+++ b/server/voice_ws.py
@@ -208,13 +208,43 @@ async def handle_voice_websocket(websocket: WebSocket, session_id: str):
            "message": "Connected to voice portal",
        })

+        # Background task: send periodic pings to keep connection alive through Caddy
+        async def keepalive():
+            while session.connected:
+                try:
+                    await asyncio.sleep(15)
+                    if session.connected:
+                        await websocket.send_json({"type": "ping"})
+                except Exception:
+                    break
+
+        keepalive_task = asyncio.create_task(keepalive())
+
        # Receive and process audio
+        chunk_count = 0
        while session.connected:
            try:
-                data = await websocket.receive_bytes()
+                msg = await websocket.receive()
+                msg_type = msg.get("type", "unknown")

-                # Process audio chunk
-                await session.process_audio_chunk(data)
+                if msg_type == "websocket.disconnect":
+                    session.connected = False
+                    logger.info(f"WebSocket disconnected for session {session_id}")
+                    break
+
+                elif msg_type == "websocket.receive":
+                    if "bytes" in msg:
+                        chunk_count += 1
+                        if chunk_count <= 5 or chunk_count % 100 == 0:
+                            logger.info(f"Audio chunk #{chunk_count}: {len(msg['bytes'])} bytes")
+                        await session.process_audio_chunk(msg["bytes"])
+                    elif "text" in msg:
+                        pass
+                    else:
+                        logger.warning(f"Unknown receive msg: {msg}")
+
+                else:
+                    logger.warning(f"Unknown WebSocket msg type: {msg_type}: {msg}")

            except WebSocketDisconnect:
                session.connected = False
@@ -222,13 +252,18 @@ async def handle_voice_websocket(websocket: WebSocket, session_id: str):
                break

            except Exception as e:
-                logger.error(f"WebSocket error: {e}")
+                logger.error(f"WebSocket error in receive loop: {e}", exc_info=True)
                session.connected = False
                break

+        keepalive_task.cancel()
+
    except Exception as e:
-        logger.error(f"Session initialization error: {e}")
-        await websocket.close(code=1011, reason=str(e))
+        logger.error(f"Session error: {e}", exc_info=True)
+        try:
+            await websocket.close(code=1011, reason=str(e))
+        except Exception:
+            pass

    finally:
        await session.close()