- Replace /jarvis and /sage command groups with /moltmic join|leave|status - Remove AgentVoiceConfig, AgentsConfig now just has default agent - Remove voice file checks from run.py (cloud TTS doesn't need them) - Remove agent-to-voice mapping in bot.py on_speech_complete - Rename from 'Jarvis Voice Bot' to 'MoltMic' throughout
359 lines
13 KiB
Python
359 lines
13 KiB
Python
"""
MoltMic - OpenClaw Voice Bot

This script starts both the Discord bot and the FastAPI server.
"""
|
|
|
|
import asyncio
|
|
import signal
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from utils.config import load_config
|
|
from utils.logging import get_logger, setup_logging
|
|
|
|
|
|
# Global shutdown event. signal_handler() sets it when one of the signals
# registered in the __main__ block is delivered.
# NOTE(review): the event is constructed at import time, i.e. before
# asyncio.run() creates the event loop that main() runs on. On Python < 3.10
# an asyncio.Event is tied to the loop that is current at construction, so
# confirm the minimum supported Python version before awaiting .wait() on it.
shutdown_event = asyncio.Event()
|
|
|
|
|
|
def signal_handler(signum, frame):
    """Announce the shutdown request and raise the global shutdown flag.

    Args:
        signum: Number of the delivered signal (not used).
        frame: Stack frame that was interrupted by the signal (not used).
    """
    notice = "\n\nShutdown signal received. Cleaning up...\n"
    print(notice)
    shutdown_event.set()
|
|
|
|
|
|
async def _wait_for_shutdown(poll_interval: float = 0.25) -> None:
    """Return once the module-level ``shutdown_event`` has been set.

    The flag is polled with ``is_set()`` instead of awaited with ``wait()``
    for two reasons: the event is created at import time, before
    ``asyncio.run()`` creates the running loop, and it is set from a plain
    ``signal.signal`` handler, which does not wake the event loop by itself.

    Args:
        poll_interval: Seconds to sleep between checks of the flag.
    """
    while not shutdown_event.is_set():
        await asyncio.sleep(poll_interval)


async def main() -> int:
    """Main application entry point.

    Loads and validates the configuration, builds the TTS/STT engines, the
    OpenClaw Gateway client and the voice pipeline, then runs the Discord bot
    and the API server side by side until one of them stops or a shutdown
    signal is received.

    Returns:
        0 after a clean shutdown; 1 when configuration is missing or invalid,
        the OpenClaw Gateway cannot be reached, a service task ends with an
        exception, or any other error is raised during startup.
    """
    logger = None

    try:
        # Load configuration
        print("Loading configuration...")
        config = load_config()

        # Setup logging
        setup_logging(config.logging)
        logger = get_logger(__name__)

        logger.info("=" * 70)
        logger.info("MoltMic Starting")
        logger.info("=" * 70)

        # Validate required configuration
        logger.info("Validating configuration...")

        if not config.discord.token:
            logger.error("Discord token not configured!")
            logger.error("Set DISCORD_TOKEN environment variable in .env file")
            return 1

        logger.info("✓ Discord token configured")

        # Validate OpenClaw Gateway configuration
        if not config.openclaw.base_url:
            logger.error("OpenClaw Gateway URL not configured!")
            logger.error("Set OPENCLAW_BASE_URL environment variable in .env file")
            return 1

        if not config.openclaw.token:
            logger.error("OpenClaw Gateway token not configured!")
            logger.error("Set OPENCLAW_AUTH_TOKEN environment variable in .env file")
            return 1

        logger.info("✓ OpenClaw Gateway configured")

        # Display configuration summary
        logger.info("")
        logger.info("Configuration Summary:")
        logger.info(f" Default Agent: {config.agents.default}")
        logger.info(f" OpenClaw Gateway: {config.openclaw.base_url}")
        logger.info(f" OpenClaw Agent ID: {config.openclaw.agent_id}")
        logger.info(f" STT Provider: {config.pipeline.stt.provider}")
        logger.info(f" TTS Provider: {config.pipeline.tts.provider}")
        if config.pipeline.tts.provider == "venice":
            logger.info(f" TTS Voice: {config.pipeline.tts.venice.voice}")
        logger.info(f" Server Port: {config.server.port}")
        logger.info(f" Latency Tracking: {config.logging.track_latency}")
        logger.info("")

        # Initialize shared TTS and STT engines
        logger.info("Initializing TTS and STT engines...")

        # Imported here rather than at module level, as in the original
        # layout, so these modules are only loaded once configuration has
        # been validated.
        from server.stt import create_stt_engine, STTTranscriber
        from server.tts import create_tts_engine, CloudTTSSynthesizer
        import os

        # --- TTS ---
        tts_provider = config.pipeline.tts.provider
        if tts_provider == "venice":
            tts_engine = create_tts_engine("venice", {
                "api_key": os.getenv("VENICE_API_KEY", ""),
                "voice": config.pipeline.tts.venice.voice,
                "base_url": config.pipeline.tts.venice.base_url,
            })
            tts_synthesizer = CloudTTSSynthesizer(tts_engine)
            logger.info(f"✓ TTS engine initialized (Venice Kokoro, voice={config.pipeline.tts.venice.voice})")
        else:
            # The local Chatterbox path needs one reference voice file per
            # agent. The variables it used for them (jarvis_voice and
            # sage_voice) are not defined anywhere in this module, so this
            # branch could only ever fail with a NameError. Raise an
            # actionable configuration error instead; the ValueError handler
            # below logs it and returns 1.
            raise ValueError(
                f"Unsupported TTS provider '{tts_provider}': only 'venice' is "
                "available because no voice reference files are configured "
                "for local Chatterbox synthesis"
            )

        # Warmup TTS
        logger.info("Warming up TTS engine...")
        await tts_synthesizer.warmup()
        logger.info("✓ TTS warmup complete")

        # --- STT ---
        stt_provider = config.pipeline.stt.provider
        if stt_provider == "deepgram":
            stt_engine = create_stt_engine(
                "deepgram",
                api_key=os.getenv("DEEPGRAM_API_KEY", ""),
                model=config.pipeline.stt.model,
                language=config.pipeline.stt.language,
            )
            stt_transcriber = STTTranscriber(engine=stt_engine, max_concurrent=3)
            logger.info(f"✓ STT engine initialized (Deepgram {config.pipeline.stt.model})")
        else:
            # Any provider other than "deepgram" falls back to the local engine.
            stt_engine = create_stt_engine(
                "local",
                model_size=config.pipeline.stt.model_size,
                device=config.pipeline.stt.device,
                compute_type=config.pipeline.stt.compute_type,
                beam_size=config.pipeline.stt.beam_size,
                language=config.pipeline.stt.language,
            )
            stt_transcriber = STTTranscriber(engine=stt_engine, max_concurrent=1)
            logger.info(f"✓ STT engine initialized (faster-whisper {config.pipeline.stt.model_size} on {config.pipeline.stt.device})")

        # Initialize OpenClaw Gateway client
        logger.info("Initializing OpenClaw Gateway client...")
        from openclaw_client import OpenClawClient, OpenClawConfig

        openclaw_config = OpenClawConfig(
            base_url=config.openclaw.base_url,
            auth_token=config.openclaw.token,
            timeout=config.openclaw.timeout,
            retry_timeout=config.openclaw.retry_timeout,
            agent_id=config.openclaw.agent_id,
            session_scope=config.openclaw.session_scope,
        )
        logger.info(f"✓ OpenClaw Gateway client initialized ({config.openclaw.base_url})")

        # Initialize Pipeline Components
        logger.info("Initializing voice processing pipeline...")

        from pipeline import (
            SileroVAD,
            SmartTurnDetector,
            PipelineTranscriber,
            TranscriptManager,
            RelevanceFilter,
            PipelineOrchestrator,
            PipelineConfig,
            QueryRouter,
        )

        # Create pipeline components
        vad = SileroVAD()
        logger.info("✓ VAD initialized (Silero)")

        turn_detector = None
        try:
            turn_detector = SmartTurnDetector(
                model_path=Path("models") / config.pipeline.turn_detection.model_path,
                threshold=config.pipeline.turn_detection.threshold,
            )
            logger.info("✓ Smart Turn v3 detector initialized")
        except Exception as e:
            logger.warning(f"Smart Turn model unavailable, using simple fallback: {e}")

            # Create a simple fallback that always returns 1.0 (trust VAD silence)
            class SimpleTurnFallback:
                async def detect_async(self, audio):
                    return 1.0  # Always say turn is complete

            turn_detector = SimpleTurnFallback()
            logger.info("✓ Using simple turn detection (VAD silence = turn complete)")

        stt_pipeline = PipelineTranscriber(
            transcriber=stt_transcriber,
        )
        logger.info("✓ STT pipeline wrapped")

        transcript_manager = TranscriptManager(
            max_age_seconds=config.pipeline.transcript.window_duration,
            max_entries=config.pipeline.transcript.max_turns,
        )
        logger.info("✓ Transcript manager initialized")

        relevance_filter = RelevanceFilter(
            agent_name=config.agents.default,
            sensitivity=config.pipeline.relevance.default_sensitivity,
        )
        logger.info("✓ Relevance filter initialized")

        query_router = QueryRouter(default_model="sonnet")
        logger.info("✓ Query router initialized")

        # Create OpenClaw client instance for pipeline
        openclaw_client = OpenClawClient(openclaw_config)

        # Per-user output callbacks. The dict starts empty and is handed to
        # run_bot() below, which is expected to fill it in.
        audio_output_callbacks = {}

        def audio_output_callback(user_id: int, audio_data):
            """Route audio output to appropriate callback."""
            # Audio for a user without a registered callback is dropped.
            if user_id in audio_output_callbacks:
                audio_output_callbacks[user_id](audio_data)

        # Create pipeline orchestrator
        pipeline_config = PipelineConfig(
            vad_silence_duration=config.pipeline.vad.silence_threshold,
            turn_completion_threshold=config.pipeline.turn_detection.threshold,
            turn_wait_timeout=config.pipeline.turn_detection.max_wait,
            stt_timeout=5.0,
            relevance_timeout=2.0,
            llm_timeout=10.0,
            tts_timeout=10.0,
            sample_rate=16000,
        )

        orchestrator = PipelineOrchestrator(
            config=pipeline_config,
            vad=vad,
            turn_detector=turn_detector,
            transcriber=stt_pipeline,
            transcript_manager=transcript_manager,
            relevance_filter=relevance_filter,
            llm_client=openclaw_client,
            tts_synthesizer=tts_synthesizer,
            audio_output_callback=audio_output_callback,
            query_router=query_router,
        )

        logger.info("✓ Pipeline orchestrator initialized with all optimizations")
        logger.info(" - STT beam_size=1 optimization active")
        logger.info(" - Smart model router active (Haiku/Sonnet/Opus)")
        logger.info(" - Sentence-level streaming TTS active")
        logger.info(" - TTS phrase cache active")

        # Test OpenClaw Gateway connection
        logger.info("Testing OpenClaw Gateway connection...")
        try:
            await openclaw_client.connect()
            logger.info(f"✓ Connected to OpenClaw Gateway ({config.openclaw.base_url})")
        except Exception as e:
            logger.error(f"✗ Failed to connect to OpenClaw Gateway: {e}")
            logger.error("Check OPENCLAW_BASE_URL and OPENCLAW_AUTH_TOKEN in .env")
            logger.error("Ensure OpenClaw Gateway is running on Synology NAS")
            return 1

        # Initialize FastAPI server
        logger.info("Initializing API server...")
        from server.app import create_api_server
        import uvicorn

        api_server = create_api_server(
            tts_synthesizer=tts_synthesizer,
            stt_transcriber=stt_transcriber,
        )
        logger.info(
            f"✓ API server initialized (port {config.server.port})"
        )

        # Initialize Discord bot
        logger.info("Initializing Discord bot...")
        from discord_bot.bot import run_bot

        logger.info("")
        logger.info("=" * 70)
        logger.info("Starting services...")
        logger.info("=" * 70)
        logger.info("")

        # Create tasks for both servers
        discord_task = asyncio.create_task(
            run_bot(
                config=config,
                openclaw_config=openclaw_config,
                tts_synthesizer=tts_synthesizer,
                stt_transcriber=stt_transcriber,
                orchestrator=orchestrator,
                audio_output_callbacks=audio_output_callbacks,
            ),
            name="discord_bot",
        )
        logger.info("✓ Discord bot started")

        # Create uvicorn server config
        uvicorn_config = uvicorn.Config(
            api_server.app,
            host=config.server.host,
            port=config.server.port,
            log_level="info",
        )
        uvicorn_server = uvicorn.Server(uvicorn_config)
        api_task = asyncio.create_task(
            uvicorn_server.serve(), name="api_server"
        )
        logger.info(
            f"✓ API server started on {config.server.host}:{config.server.port}"
        )

        logger.info("")
        logger.info("All services running. Press Ctrl+C to stop.")
        logger.info("")

        # Run both services until one of them stops or a shutdown is requested.
        shutdown_task = asyncio.create_task(
            _wait_for_shutdown(), name="shutdown_watch"
        )
        service_tasks = (discord_task, api_task)
        await asyncio.wait(
            {*service_tasks, shutdown_task},
            return_when=asyncio.FIRST_COMPLETED,
        )

        if shutdown_event.is_set():
            logger.info("Shutdown requested, stopping services...")

        # The bot and the API server are started and stopped as a unit: once
        # one of them has ended, cancel whatever is still running so the
        # process can exit instead of waiting on the survivor indefinitely.
        for task in (*service_tasks, shutdown_task):
            if not task.done():
                task.cancel()
        await asyncio.gather(*service_tasks, shutdown_task, return_exceptions=True)

        # Report service failures instead of discarding them.
        exit_code = 0
        for task in service_tasks:
            if task.cancelled():
                continue
            error = task.exception()
            if error is not None:
                logger.error(
                    f"Service '{task.get_name()}' stopped with an error: {error!r}"
                )
                exit_code = 1

        return exit_code

    except FileNotFoundError as e:
        if logger:
            logger.error(f"Configuration error: {e}")
        else:
            print(f"Error: {e}", file=sys.stderr)
        return 1

    except ValueError as e:
        if logger:
            logger.error(f"Configuration validation error: {e}")
        else:
            print(f"Error: {e}", file=sys.stderr)
        return 1

    except KeyboardInterrupt:
        if logger:
            logger.info("Keyboard interrupt received")
        return 0

    except Exception as e:
        # Last-resort boundary: log with traceback when logging is available.
        if logger:
            logger.exception(f"Unexpected error: {e}")
        else:
            print(f"Unexpected error: {e}", file=sys.stderr)
        return 1

    finally:
        if logger:
            logger.info("Shutdown complete")
|
|
|
|
|
|
if __name__ == "__main__":
    # Route both termination signals through the same handler.
    for _sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(_sig, signal_handler)

    # Drive main() to completion and use its result as the process exit status.
    sys.exit(asyncio.run(main()))
|