openclaw-voice/config.yaml
Jezza Hehn a2099e9d81 Strip Jarvis/Sage personas, simplify to MoltMic pipe
- Replace /jarvis and /sage command groups with /moltmic join|leave|status
- Remove AgentVoiceConfig, AgentsConfig now just has default agent
- Remove voice file checks from run.py (cloud TTS doesn't need them)
- Remove agent-to-voice mapping in bot.py on_speech_complete
- Rename from 'Jarvis Voice Bot' to 'MoltMic' throughout
2026-04-10 01:43:02 +00:00

233 lines
6 KiB
YAML

# MoltMic Voice Bot Configuration
# Environment variables in .env override these values
# ============================================================================
# Discord Settings
# ============================================================================
discord:
  # Bot token from the Discord Developer Portal.
  # REQUIRED: leave null here and set it via the DISCORD_TOKEN environment
  # variable.
  token: null

  # Command prefix for text commands (if needed)
  command_prefix: "/"

  # Bot status message
  status_message: "Listening in voice channels"

  # Auto-join voice channel on bot start (if user is in voice)
  auto_join: false
# ============================================================================
# Agent Configuration
# ============================================================================
agents:
  # Name of the default agent
  default: "main"
# ============================================================================
# OpenClaw Gateway
# ============================================================================
openclaw:
  # WebSocket URL for the OpenClaw Gateway.
  # REQUIRED: leave null here and set it via the OPENCLAW_BASE_URL
  # environment variable.
  # Format: ws://IP:PORT (default port: 18789)
  base_url: null

  # Authentication token.
  # REQUIRED: leave null here and set it via the OPENCLAW_AUTH_TOKEN
  # environment variable.
  token: null

  # Request timeout (seconds)
  timeout: 8.0

  # Retry timeout (seconds)
  retry_timeout: 15.0

  # Retry attempts on failure
  max_retries: 1

  # Model/agent selection
  model: "claude-sonnet-4"

  # Agent ID for session keys.
  # NOTE(review): still "jarvis" while agents.default is "main"; confirm
  # whether this value is intentional or a leftover persona name.
  agent_id: "jarvis"

  # Session scope: per-peer or shared
  session_scope: "per-peer"
# ============================================================================
# Pipeline Configuration
# ============================================================================
pipeline:
  # Voice Activity Detection (Silero VAD)
  vad:
    # Silence duration to consider speech ended (seconds)
    silence_threshold: 0.3
    # Minimum speech duration to process (seconds)
    min_speech_duration: 0.5
    # VAD confidence threshold (0.0-1.0)
    speech_threshold: 0.5

  # Smart Turn v3 Configuration
  turn_detection:
    # Turn completion confidence threshold (0.0-1.0)
    # Higher = more certain turn is complete before proceeding
    threshold: 0.7
    # Maximum wait time after silence before forcing completion (seconds)
    max_wait: 3.0
    # Model path (relative to models/ directory)
    # Using v3.2 GPU model for best performance with RTX 5090
    model_path: "smart-turn-v3.2-gpu.onnx"

  # Speech-to-Text
  stt:
    # Provider: "deepgram" (cloud, no GPU) or "local" (faster-whisper,
    # requires GPU)
    provider: "deepgram"

    # Deepgram settings (used when provider is "deepgram")
    model: "nova-3"
    language: "en"

    # Local faster-whisper settings (used when provider is "local")
    model_size: "small"
    device: "cuda"
    compute_type: "float16"
    beam_size: 1
    # VAD filter (use built-in VAD in whisper)
    vad_filter: false

  # Relevance Filter
  relevance:
    # Default sensitivity: low, medium, high
    default_sensitivity: "medium"

    # Sensitivity thresholds (LLM confidence 0.0-1.0)
    thresholds:
      low: 1.0  # Only fast path (name mentions)
      medium: 0.75  # Fast path + LLM with 75% confidence
      high: 0.5  # Fast path + LLM with 50% confidence

    # LLM backend used for classification.
    # Can be: openai, anthropic, local, openclaw
    classifier: "openclaw"

    # Classification timeout (seconds)
    timeout: 2.0

    # Cache classifications (avoid re-classifying similar utterances)
    enable_cache: true
    cache_ttl: 300  # seconds

  # Transcript Management
  transcript:
    # Rolling window duration (seconds)
    window_duration: 90
    # Maximum number of turns to keep
    max_turns: 20
    # Timezone for timestamp display
    timezone: "America/Los_Angeles"

  # Text-to-Speech
  tts:
    # Provider: "venice" (cloud, no GPU) or "local" (chatterbox, requires GPU)
    provider: "venice"

    # Venice settings (used when provider is "venice")
    venice:
      voice: "am_liam"
      base_url: "https://api.venice.ai/api/v1"
      # API key from env: VENICE_API_KEY

    # Local settings (used when provider is "local")
    engine: "chatterbox"
    device: "cuda"

    # Streaming: generate and play audio in chunks
    streaming: true
    # Chunk duration for streaming (seconds)
    chunk_duration: 0.5

    # Voice cloning settings (for Coqui XTTS)
    coqui:
      model_name: "tts_models/multilingual/multi-dataset/xtts_v2"
      language: "en"
      temperature: 0.75
      length_penalty: 1.0
      repetition_penalty: 5.0
      top_k: 50
      top_p: 0.85

  # Audio Buffering
  audio:
    # Buffer duration per user (seconds)
    buffer_duration: 10.0
    # Sample rate for processing (Hz)
    processing_sample_rate: 16000
    # Discord audio sample rate (Hz)
    discord_sample_rate: 48000
# ============================================================================
# FastAPI Server
# ============================================================================
server:
  # Server host.
  # NOTE(review): "0.0.0.0" conventionally means "listen on all interfaces".
  # Combined with api_key: null and the wildcard CORS settings below, confirm
  # the server is only reachable from a trusted network.
  host: "0.0.0.0"

  # Server port
  port: 8880

  # Enable TTS endpoint
  enable_tts: true

  # Enable STT endpoint
  enable_stt: true

  # API key for authentication (optional).
  # Leave null here and set it via the SERVER_API_KEY environment variable.
  api_key: null

  # CORS settings
  cors:
    enabled: true
    allowed_origins: ["*"]
    allowed_methods: ["*"]
    allowed_headers: ["*"]
# ============================================================================
# Logging
# ============================================================================
logging:
  # Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
  level: "INFO"

  # Log format
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

  # Enable latency tracking
  track_latency: true

  # Per-module log levels (override global level)
  modules:
    discord_bot: "INFO"
    pipeline: "INFO"
    server: "INFO"
    openclaw_client: "DEBUG"

  # Log file (optional, null = console only)
  file: null

  # Rotate logs
  rotation:
    enabled: false
    max_bytes: 10485760  # 10MB
    backup_count: 5