From 3450e57ca6fa34ffe44db65de9730d7931b4c18f Mon Sep 17 00:00:00 2001
From: Jezza Hehn <jezza.hehn@wikitribune.com>
Date: Fri, 10 Apr 2026 04:47:31 +0000
Subject: [PATCH] Fix voice portal: WebSocket routing, Caddy keepalive, audio
 pipeline

- Fix app.py: @app.get -> @app.websocket for /ws/voice route (was returning 403)
- Fix app.py: create static_dir before mounting it (AttributeError on startup)
- Fix voice.html: AudioWorkletNode constructor (was AudioWorkletProcessor)
- Fix voice.html: use ScriptProcessor directly (more reliable)
- Fix voice.html: send Float32 directly (server expects float32, was sending Int16)
- Fix voice.html: auto-detect ws/wss protocol from page URL
- Add server-side WebSocket keepalive pings (JSON, every 15s) so Caddy does not time out idle connections
- Add detailed message type logging in WebSocket receive loop
- Strip Jarvis/Sage personas, rename bot to MoltMic
- Add /moltmic voice slash command for portal URL
- Update portal URL to https://voice.jezzahehn.com
---
 discord_bot/__init__.py  | 10 ++++-----
 discord_bot/bot.py       |  8 +++----
 discord_bot/commands.py  | 37 +++++++++++++++++++++++++++++++
 server/app.py            | 26 +++++++++++++++++++++-
 server/static/voice.html | 35 +++++++++---------------------
 server/voice_ws.py       | 47 +++++++++++++++++++++++++++++++++++-----
 6 files changed, 122 insertions(+), 41 deletions(-)

diff --git a/discord_bot/__init__.py b/discord_bot/__init__.py
index 7662387..4125ef4 100644
--- a/discord_bot/__init__.py
+++ b/discord_bot/__init__.py
@@ -1,18 +1,18 @@
-"""Jarvis Voice Bot - Discord Integration"""
+"""MoltMic - OpenClaw Voice Bot"""
 
-from .bot import JarvisVoiceBot, create_bot, run_bot
+from .bot import MoltMicBot, create_bot, run_bot
 from .voice_session import VoiceSession, VoiceSessionManager
 from .audio_bridge import AudioBridge, PipelineAudioSource
-from .commands import VoiceBotCommands, setup_commands
+from .commands import MoltMicCommands, setup_commands
 
 __all__ = [
-    "JarvisVoiceBot",
+    "MoltMicBot",
     "create_bot",
     "run_bot",
     "VoiceSession",
     "VoiceSessionManager",
     "AudioBridge",
     "PipelineAudioSource",
-    "VoiceBotCommands",
+    "MoltMicCommands",
     "setup_commands",
 ]
diff --git a/discord_bot/bot.py b/discord_bot/bot.py
index 18bdbd7..f430b39 100644
--- a/discord_bot/bot.py
+++ b/discord_bot/bot.py
@@ -20,8 +20,8 @@ from .vad_receiver import VADAudioReceiver
 logger = get_logger(__name__)
 
 
-class JarvisVoiceBot(discord.Client):
-    """Discord bot for voice interaction with AI agents."""
+class MoltMicBot(discord.Client):
+    """MoltMic - Discord voice bot for OpenClaw."""
 
     def __init__(
         self,
@@ -479,7 +479,7 @@ async def create_bot(
     stt_transcriber=None,
     orchestrator=None,
     audio_output_callbacks=None,
-) -> JarvisVoiceBot:
+) -> MoltMicBot:
     """
     Create and initialize the Discord bot.
 
@@ -494,7 +494,7 @@ async def create_bot(
     Returns:
         Initialized bot instance
     """
-    bot = JarvisVoiceBot(
+    bot = MoltMicBot(
         config=config,
         openclaw_config=openclaw_config,
         tts_synthesizer=tts_synthesizer,
diff --git a/discord_bot/commands.py b/discord_bot/commands.py
index 816ab54..957e969 100644
--- a/discord_bot/commands.py
+++ b/discord_bot/commands.py
@@ -94,6 +94,43 @@ class MoltMicCommands(app_commands.Group):
             logger.exception(f"Status error: {e}")
             await interaction.followup.send("❌ Error.", ephemeral=True)
 
+    @app_commands.command(name="voice", description="Open voice portal in browser")
+    async def voice(self, interaction: discord.Interaction):
+        """Post an embed linking to a newly created browser voice-portal session."""
+        await interaction.response.defer(thinking=True)
+
+        try:
+            # Imported lazily: a module-level import would create a circular dependency
+            from server.voice_ws import create_session_id
+
+            session_id = create_session_id()
+            portal_url = f"https://voice.jezzahehn.com/voice?session={session_id}"
+
+            # (name, value) pairs, added to the embed in display order
+            fields = (
+                ("Portal URL", f"[Open Voice Portal]({portal_url})"),
+                (
+                    "Instructions",
+                    "1. Click the link above\n2. Allow microphone access\n3. Start talking! The bot will listen and respond.",
+                ),
+            )
+
+            embed = discord.Embed(
+                title="🎙️ Voice Portal",
+                description="Click below to open the voice portal in your browser",
+                color=discord.Color.blue(),
+            )
+            for field_name, field_value in fields:
+                embed.add_field(name=field_name, value=field_value, inline=False)
+            embed.set_footer(text="The bot will start listening when you connect")
+
+            await interaction.followup.send(embed=embed)
+            logger.info(f"Voice portal created for session {session_id}")
+
+        except Exception as e:
+            logger.exception(f"Voice portal error: {e}")
+            await interaction.followup.send("❌ Failed to create voice portal.", ephemeral=True)
+
 
 async def setup_commands(bot):
     """Register slash commands."""
diff --git a/server/app.py b/server/app.py
index 12aa38c..ccf54ca 100644
--- a/server/app.py
+++ b/server/app.py
@@ -4,10 +4,12 @@ Provides HTTP endpoints for:
 - Text-to-Speech (OpenAI /v1/audio/speech compatible)
 - Speech-to-Text (OpenAI /v1/audio/transcriptions compatible)
 - Health checks and status
+- WebSocket voice endpoint for browser-based speech
 
 Shares STT and TTS engines with Discord bot for efficiency.
 """
 
+import asyncio
 import io
 import tempfile
 import time
@@ -16,13 +18,15 @@ from typing import Literal, Optional
 
 import numpy as np
 import soundfile as sf
-from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile, WebSocket, WebSocketDisconnect
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import Response, StreamingResponse
+from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel, Field
 
 from server.stt import FasterWhisperSTT, STTTranscriber
 from server.tts import ChatterboxTTS, TTSSynthesizer
+from server.voice_ws import handle_voice_websocket, create_session_id
 from utils.logging import get_logger
 
 logger = get_logger(__name__)
@@ -111,6 +115,13 @@ class VoiceAPIServer:
             allow_headers=["*"],
         )
 
+        # Create static files directory
+        self.static_dir = Path("server/static")
+        self.static_dir.mkdir(parents=True, exist_ok=True)
+
+        # Mount static files
+        self.app.mount("/static", StaticFiles(directory=str(self.static_dir)), name="static")
+
         # Register routes
         self._register_routes()
 
@@ -129,6 +140,19 @@ class VoiceAPIServer:
             """Health check endpoint."""
             return await self._health_check()
 
+        @self.app.get("/voice")
+        async def get_voice_page():
+            """Serve voice.html from static_dir, read as UTF-8; respond 404 if it is missing."""
+            static_file = self.static_dir / "voice.html"
+            if not static_file.exists():
+                raise HTTPException(status_code=404, detail="Voice page not found")
+            return Response(content=static_file.read_text(encoding="utf-8"), media_type="text/html")
+
+        @self.app.websocket("/ws/voice/{session_id}")
+        async def voice_websocket(session_id: str, websocket: WebSocket):
+            """WebSocket endpoint: pass the connection and the path's session_id to handle_voice_websocket."""
+            await handle_voice_websocket(websocket, session_id)
+
         @self.app.post("/v1/audio/speech")
         async def create_speech(request: TTSRequest):
             """
diff --git a/server/static/voice.html b/server/static/voice.html
index 87e4071..a316727 100644
--- a/server/static/voice.html
+++ b/server/static/voice.html
@@ -205,7 +205,8 @@
 
     <script>
         const sessionId = new URLSearchParams(window.location.search).get('session');
-        const wsUrl = `wss://${window.location.host}/ws/voice/${sessionId}`;
+        const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
+        const wsUrl = `${wsProtocol}//${window.location.host}/ws/voice/${sessionId}`;
 
         let ws = null;
         let audioContext = null;
@@ -339,19 +340,12 @@
                     }
                 });
 
+                console.log('Microphone acquired, stream tracks:', stream.getTracks().length);
                 microphone = audioContext.createMediaStreamSource(stream);
+                console.log('MediaStreamSource created, sample rate:', audioContext.sampleRate);
 
-                // Use AudioWorklet or ScriptProcessor as fallback
-                if (audioContext.audioWorklet) {
-                    try {
-                        await initAudioWorklet();
-                    } catch (error) {
-                        console.warn('AudioWorklet failed, falling back to ScriptProcessor:', error);
-                        initScriptProcessor();
-                    }
-                } else {
-                    initScriptProcessor();
-                }
+                // Use ScriptProcessor for reliable audio capture
+                initScriptProcessor();
 
             } catch (error) {
                 console.error('Audio initialization error:', error);
@@ -365,11 +359,7 @@
 
             await audioContext.audioWorklet.addModule(workletUrl);
 
-            const processor = new AudioWorkletProcessor(audioContext, {
-                numberOfInputs: 1,
-                numberOfOutputs: 1,
-                outputChannelCount: [1]
-            });
+            const processor = new AudioWorkletNode(audioContext, 'voice-processor');
 
             microphone.connect(processor);
 
@@ -411,14 +401,9 @@
         function sendAudio(audioData) {
             if (!ws || ws.readyState !== WebSocket.OPEN) return;
 
-            // Convert Float32 to Int16 for transmission
-            const int16Data = new Int16Array(audioData.length);
-            for (let i = 0; i < audioData.length; i++) {
-                const sample = Math.max(-1, Math.min(1, audioData[i]));
-                int16Data[i] = sample < 0 ? sample * 0x8000 : sample * 0x7FFF;
-            }
-
-            ws.send(int16Data.buffer);
+            // Send as Float32Array directly
+            ws.send(audioData.buffer);
+            console.log('Sent audio chunk:', audioData.length, 'samples');
         }
 
         // Event listeners
diff --git a/server/voice_ws.py b/server/voice_ws.py
index ebac6ae..e0a866a 100644
--- a/server/voice_ws.py
+++ b/server/voice_ws.py
@@ -208,13 +208,43 @@ async def handle_voice_websocket(websocket: WebSocket, session_id: str):
             "message": "Connected to voice portal",
         })
 
+        # Background task: send a JSON ping every 15s to keep the connection alive through Caddy
+        async def keepalive():
+            try:
+                while session.connected:
+                    await asyncio.sleep(15)
+                    if session.connected:
+                        await websocket.send_json({"type": "ping"})
+            except Exception as e:  # send failed, socket is likely gone: stop pinging but say why
+                logger.debug(f"Keepalive stopped for session {session_id}: {e}")
+
+        keepalive_task = asyncio.create_task(keepalive())
+
         # Receive and process audio
+        chunk_count = 0
         while session.connected:
             try:
-                data = await websocket.receive_bytes()
+                msg = await websocket.receive()
+                msg_type = msg.get("type", "unknown")
 
-                # Process audio chunk
-                await session.process_audio_chunk(data)
+                if msg_type == "websocket.disconnect":
+                    session.connected = False
+                    logger.info(f"WebSocket disconnected for session {session_id}")
+                    break
+
+                elif msg_type == "websocket.receive":
+                    if "bytes" in msg:
+                        chunk_count += 1
+                        if chunk_count <= 5 or chunk_count % 100 == 0:
+                            logger.info(f"Audio chunk #{chunk_count}: {len(msg['bytes'])} bytes")
+                        await session.process_audio_chunk(msg["bytes"])
+                    elif "text" in msg:
+                        pass
+                    else:
+                        logger.warning(f"Unknown receive msg: {msg}")
+
+                else:
+                    logger.warning(f"Unknown WebSocket msg type: {msg_type}: {msg}")
 
             except WebSocketDisconnect:
                 session.connected = False
@@ -222,13 +252,18 @@ async def handle_voice_websocket(websocket: WebSocket, session_id: str):
                 break
 
             except Exception as e:
-                logger.error(f"WebSocket error: {e}")
+                logger.error(f"WebSocket error in receive loop: {e}", exc_info=True)
                 session.connected = False
                 break
 
+        keepalive_task.cancel()
+
     except Exception as e:
-        logger.error(f"Session initialization error: {e}")
-        await websocket.close(code=1011, reason=str(e))
+        logger.error(f"Session error: {e}", exc_info=True)
+        try:
+            await websocket.close(code=1011, reason=str(e))
+        except Exception:
+            pass
 
     finally:
         await session.close()