Fix voice portal: WebSocket routing, Caddy keepalive, audio pipeline
- Fix app.py: @app.get -> @app.websocket for /ws/voice route (was returning 403)
- Fix app.py: create static_dir before mounting it (AttributeError on startup)
- Fix voice.html: construct AudioWorkletNode (was incorrectly instantiating AudioWorkletProcessor)
- Fix voice.html: use ScriptProcessor directly (more reliable)
- Fix voice.html: send Float32 directly (server expects float32, was sending Int16)
- Fix voice.html: auto-detect ws/wss protocol from page URL
- Add server-side keepalive pings every 15s so the Caddy reverse proxy does not time out the WebSocket connection
- Add detailed message type logging in WebSocket receive loop
- Strip Jarvis/Sage personas, rename bot to MoltMic
- Add /moltmic voice slash command for portal URL
- Update portal URL to https://voice.jezzahehn.com
This commit is contained in:
parent
bc580861dd
commit
3450e57ca6
6 changed files with 122 additions and 41 deletions
|
|
@ -1,18 +1,18 @@
|
|||
"""Jarvis Voice Bot - Discord Integration"""
|
||||
"""MoltMic - OpenClaw Voice Bot"""
|
||||
|
||||
from .bot import JarvisVoiceBot, create_bot, run_bot
|
||||
from .bot import MoltMicBot, create_bot, run_bot
|
||||
from .voice_session import VoiceSession, VoiceSessionManager
|
||||
from .audio_bridge import AudioBridge, PipelineAudioSource
|
||||
from .commands import VoiceBotCommands, setup_commands
|
||||
from .commands import MoltMicCommands, setup_commands
|
||||
|
||||
__all__ = [
|
||||
"JarvisVoiceBot",
|
||||
"MoltMicBot",
|
||||
"create_bot",
|
||||
"run_bot",
|
||||
"VoiceSession",
|
||||
"VoiceSessionManager",
|
||||
"AudioBridge",
|
||||
"PipelineAudioSource",
|
||||
"VoiceBotCommands",
|
||||
"MoltMicCommands",
|
||||
"setup_commands",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -20,8 +20,8 @@ from .vad_receiver import VADAudioReceiver
|
|||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class JarvisVoiceBot(discord.Client):
|
||||
"""Discord bot for voice interaction with AI agents."""
|
||||
class MoltMicBot(discord.Client):
|
||||
"""MoltMic - Discord voice bot for OpenClaw."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
|
@ -479,7 +479,7 @@ async def create_bot(
|
|||
stt_transcriber=None,
|
||||
orchestrator=None,
|
||||
audio_output_callbacks=None,
|
||||
) -> JarvisVoiceBot:
|
||||
) -> MoltMicBot:
|
||||
"""
|
||||
Create and initialize the Discord bot.
|
||||
|
||||
|
|
@ -494,7 +494,7 @@ async def create_bot(
|
|||
Returns:
|
||||
Initialized bot instance
|
||||
"""
|
||||
bot = JarvisVoiceBot(
|
||||
bot = MoltMicBot(
|
||||
config=config,
|
||||
openclaw_config=openclaw_config,
|
||||
tts_synthesizer=tts_synthesizer,
|
||||
|
|
|
|||
|
|
@ -94,6 +94,43 @@ class MoltMicCommands(app_commands.Group):
|
|||
logger.exception(f"Status error: {e}")
|
||||
await interaction.followup.send("❌ Error.", ephemeral=True)
|
||||
|
||||
@app_commands.command(name="voice", description="Open voice portal in browser")
|
||||
async def voice(self, interaction: discord.Interaction):
|
||||
"""Generate a voice portal URL for browser-based speech."""
|
||||
await interaction.response.defer(thinking=True)
|
||||
|
||||
try:
|
||||
# Import here to avoid circular dependency
|
||||
from server.voice_ws import create_session_id
|
||||
|
||||
session_id = create_session_id()
|
||||
portal_url = f"https://voice.jezzahehn.com/voice?session={session_id}"
|
||||
|
||||
embed = discord.Embed(
|
||||
title="🎙️ Voice Portal",
|
||||
description="Click below to open the voice portal in your browser",
|
||||
color=discord.Color.blue()
|
||||
)
|
||||
embed.add_field(
|
||||
name="Portal URL",
|
||||
value=f"[Open Voice Portal]({portal_url})",
|
||||
inline=False
|
||||
)
|
||||
embed.add_field(
|
||||
name="Instructions",
|
||||
value="1. Click the link above\n2. Allow microphone access\n3. Start talking! The bot will listen and respond.",
|
||||
inline=False
|
||||
)
|
||||
embed.set_footer(text="The bot will start listening when you connect")
|
||||
|
||||
await interaction.followup.send(embed=embed)
|
||||
|
||||
logger.info(f"Voice portal created for session {session_id}")
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Voice portal error: {e}")
|
||||
await interaction.followup.send("❌ Failed to create voice portal.", ephemeral=True)
|
||||
|
||||
|
||||
async def setup_commands(bot):
|
||||
"""Register slash commands."""
|
||||
|
|
|
|||
|
|
@ -4,10 +4,12 @@ Provides HTTP endpoints for:
|
|||
- Text-to-Speech (OpenAI /v1/audio/speech compatible)
|
||||
- Speech-to-Text (OpenAI /v1/audio/transcriptions compatible)
|
||||
- Health checks and status
|
||||
- WebSocket voice endpoint for browser-based speech
|
||||
|
||||
Shares STT and TTS engines with Discord bot for efficiency.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import tempfile
|
||||
import time
|
||||
|
|
@ -16,13 +18,15 @@ from typing import Literal, Optional
|
|||
|
||||
import numpy as np
|
||||
import soundfile as sf
|
||||
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
|
||||
from fastapi import FastAPI, File, Form, HTTPException, UploadFile, WebSocket, WebSocketDisconnect
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import Response, StreamingResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from server.stt import FasterWhisperSTT, STTTranscriber
|
||||
from server.tts import ChatterboxTTS, TTSSynthesizer
|
||||
from server.voice_ws import handle_voice_websocket, create_session_id
|
||||
from utils.logging import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
|
@ -111,6 +115,13 @@ class VoiceAPIServer:
|
|||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
# Create static files directory
|
||||
self.static_dir = Path("server/static")
|
||||
self.static_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Mount static files
|
||||
self.app.mount("/static", StaticFiles(directory=str(self.static_dir)), name="static")
|
||||
|
||||
# Register routes
|
||||
self._register_routes()
|
||||
|
||||
|
|
@ -129,6 +140,19 @@ class VoiceAPIServer:
|
|||
"""Health check endpoint."""
|
||||
return await self._health_check()
|
||||
|
||||
@self.app.get("/voice")
|
||||
async def get_voice_page():
|
||||
"""Serve voice portal HTML page."""
|
||||
static_file = self.static_dir / "voice.html"
|
||||
if static_file.exists():
|
||||
return Response(content=static_file.read_text(), media_type="text/html")
|
||||
raise HTTPException(status_code=404, detail="Voice page not found")
|
||||
|
||||
@self.app.websocket("/ws/voice/{session_id}")
|
||||
async def voice_websocket(session_id: str, websocket: WebSocket):
|
||||
"""WebSocket endpoint for voice session."""
|
||||
await handle_voice_websocket(websocket, session_id)
|
||||
|
||||
@self.app.post("/v1/audio/speech")
|
||||
async def create_speech(request: TTSRequest):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -205,7 +205,8 @@
|
|||
|
||||
<script>
|
||||
const sessionId = new URLSearchParams(window.location.search).get('session');
|
||||
const wsUrl = `wss://${window.location.host}/ws/voice/${sessionId}`;
|
||||
const wsProtocol = window.location.protocol === 'https:' ? 'wss:' : 'ws:';
|
||||
const wsUrl = `${wsProtocol}//${window.location.host}/ws/voice/${sessionId}`;
|
||||
|
||||
let ws = null;
|
||||
let audioContext = null;
|
||||
|
|
@ -339,19 +340,12 @@
|
|||
}
|
||||
});
|
||||
|
||||
console.log('Microphone acquired, stream tracks:', stream.getTracks().length);
|
||||
microphone = audioContext.createMediaStreamSource(stream);
|
||||
console.log('MediaStreamSource created, sample rate:', audioContext.sampleRate);
|
||||
|
||||
// Use AudioWorklet or ScriptProcessor as fallback
|
||||
if (audioContext.audioWorklet) {
|
||||
try {
|
||||
await initAudioWorklet();
|
||||
} catch (error) {
|
||||
console.warn('AudioWorklet failed, falling back to ScriptProcessor:', error);
|
||||
initScriptProcessor();
|
||||
}
|
||||
} else {
|
||||
initScriptProcessor();
|
||||
}
|
||||
// Use ScriptProcessor for reliable audio capture
|
||||
initScriptProcessor();
|
||||
|
||||
} catch (error) {
|
||||
console.error('Audio initialization error:', error);
|
||||
|
|
@ -365,11 +359,7 @@
|
|||
|
||||
await audioContext.audioWorklet.addModule(workletUrl);
|
||||
|
||||
const processor = new AudioWorkletProcessor(audioContext, {
|
||||
numberOfInputs: 1,
|
||||
numberOfOutputs: 1,
|
||||
outputChannelCount: [1]
|
||||
});
|
||||
const processor = new AudioWorkletNode(audioContext, 'voice-processor');
|
||||
|
||||
microphone.connect(processor);
|
||||
|
||||
|
|
@ -411,14 +401,9 @@
|
|||
function sendAudio(audioData) {
|
||||
if (!ws || ws.readyState !== WebSocket.OPEN) return;
|
||||
|
||||
// Convert Float32 to Int16 for transmission
|
||||
const int16Data = new Int16Array(audioData.length);
|
||||
for (let i = 0; i < audioData.length; i++) {
|
||||
const sample = Math.max(-1, Math.min(1, audioData[i]));
|
||||
int16Data[i] = sample < 0 ? sample * 0x8000 : sample * 0x7FFF;
|
||||
}
|
||||
|
||||
ws.send(int16Data.buffer);
|
||||
// Send as Float32Array directly
|
||||
ws.send(audioData.buffer);
|
||||
console.log('Sent audio chunk:', audioData.length, 'samples');
|
||||
}
|
||||
|
||||
// Event listeners
|
||||
|
|
|
|||
|
|
@ -208,13 +208,43 @@ async def handle_voice_websocket(websocket: WebSocket, session_id: str):
|
|||
"message": "Connected to voice portal",
|
||||
})
|
||||
|
||||
# Background task: send periodic pings to keep connection alive through Caddy
|
||||
async def keepalive():
|
||||
while session.connected:
|
||||
try:
|
||||
await asyncio.sleep(15)
|
||||
if session.connected:
|
||||
await websocket.send_json({"type": "ping"})
|
||||
except Exception:
|
||||
break
|
||||
|
||||
keepalive_task = asyncio.create_task(keepalive())
|
||||
|
||||
# Receive and process audio
|
||||
chunk_count = 0
|
||||
while session.connected:
|
||||
try:
|
||||
data = await websocket.receive_bytes()
|
||||
msg = await websocket.receive()
|
||||
msg_type = msg.get("type", "unknown")
|
||||
|
||||
# Process audio chunk
|
||||
await session.process_audio_chunk(data)
|
||||
if msg_type == "websocket.disconnect":
|
||||
session.connected = False
|
||||
logger.info(f"WebSocket disconnected for session {session_id}")
|
||||
break
|
||||
|
||||
elif msg_type == "websocket.receive":
|
||||
if "bytes" in msg:
|
||||
chunk_count += 1
|
||||
if chunk_count <= 5 or chunk_count % 100 == 0:
|
||||
logger.info(f"Audio chunk #{chunk_count}: {len(msg['bytes'])} bytes")
|
||||
await session.process_audio_chunk(msg["bytes"])
|
||||
elif "text" in msg:
|
||||
pass
|
||||
else:
|
||||
logger.warning(f"Unknown receive msg: {msg}")
|
||||
|
||||
else:
|
||||
logger.warning(f"Unknown WebSocket msg type: {msg_type}: {msg}")
|
||||
|
||||
except WebSocketDisconnect:
|
||||
session.connected = False
|
||||
|
|
@ -222,13 +252,18 @@ async def handle_voice_websocket(websocket: WebSocket, session_id: str):
|
|||
break
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"WebSocket error: {e}")
|
||||
logger.error(f"WebSocket error in receive loop: {e}", exc_info=True)
|
||||
session.connected = False
|
||||
break
|
||||
|
||||
keepalive_task.cancel()
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Session initialization error: {e}")
|
||||
await websocket.close(code=1011, reason=str(e))
|
||||
logger.error(f"Session error: {e}", exc_info=True)
|
||||
try:
|
||||
await websocket.close(code=1011, reason=str(e))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
finally:
|
||||
await session.close()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue