---
# Jarvis Voice Bot Configuration
# Environment variables in .env override these values
#
# Layout: one top-level mapping per subsystem (discord, agents, openclaw,
# pipeline, server, logging). Values set to null marked REQUIRED below are
# expected to be supplied through the named environment variable.

# ============================================================================
# Discord Settings
# ============================================================================
discord:
  # Bot token from Discord Developer Portal
  # REQUIRED: Set via DISCORD_TOKEN environment variable
  token: null

  # Command prefix for text commands (if needed)
  command_prefix: "/"

  # Bot status message
  status_message: "Listening in voice channels"

  # Auto-join voice channel on bot start (if user is in voice)
  auto_join: false

# ============================================================================
# Agent Configuration
# ============================================================================
agents:
  default: "main"

# ============================================================================
# OpenClaw Gateway
# ============================================================================
openclaw:
  # WebSocket URL for OpenClaw Gateway
  # REQUIRED: Set via OPENCLAW_BASE_URL environment variable
  # Format: ws://IP:PORT (default port: 18789)
  base_url: null

  # Authentication token
  # REQUIRED: Set via OPENCLAW_AUTH_TOKEN environment variable
  token: null

  # Request timeout (seconds)
  timeout: 8.0

  # Retry timeout (seconds)
  retry_timeout: 15.0

  # Retry attempts on failure
  max_retries: 1

  # Model/agent selection
  model: "claude-sonnet-4"

  # Agent ID for session keys
  agent_id: "jarvis"

  # Session scope: per-peer or shared
  session_scope: "per-peer"

# ============================================================================
# Pipeline Configuration
# ============================================================================
pipeline:
  # Voice Activity Detection (Silero VAD)
  vad:
    # Silence duration to consider speech ended (seconds)
    silence_threshold: 0.3

    # Minimum speech duration to process (seconds)
    min_speech_duration: 0.5

    # VAD confidence threshold (0.0-1.0)
    speech_threshold: 0.5

  # Smart Turn v3 Configuration
  turn_detection:
    # Turn completion confidence threshold (0.0-1.0)
    # Higher = more certain turn is complete before proceeding
    threshold: 0.7

    # Maximum wait time after silence before forcing completion (seconds)
    max_wait: 3.0

    # Model path (relative to models/ directory)
    # Using v3.2 GPU model for best performance with RTX 5090
    model_path: "smart-turn-v3.2-gpu.onnx"

  # Speech-to-Text
  stt:
    # Provider: "deepgram" (cloud, no GPU) or "local" (faster-whisper, requires GPU)
    provider: "deepgram"

    # Deepgram settings (used when provider is "deepgram")
    model: "nova-3"
    language: "en"

    # Local faster-whisper settings (used when provider is "local")
    model_size: "small"
    device: "cuda"
    compute_type: "float16"
    beam_size: 1

    # VAD filter (use built-in VAD in whisper)
    vad_filter: false

  # Relevance Filter
  relevance:
    # Default sensitivity: low, medium, high
    default_sensitivity: "medium"

    # Sensitivity thresholds (LLM confidence 0.0-1.0)
    thresholds:
      low: 1.0      # Only fast path (name mentions)
      medium: 0.75  # Fast path + LLM with 75% confidence
      high: 0.5     # Fast path + LLM with 50% confidence

    # LLM for classification (if not using OpenClaw)
    # Can be: openai, anthropic, local, openclaw
    classifier: "openclaw"

    # Classification timeout (seconds)
    timeout: 2.0

    # Cache classifications (avoid re-classifying similar utterances)
    enable_cache: true
    cache_ttl: 300  # seconds

  # Transcript Management
  transcript:
    # Rolling window duration (seconds)
    window_duration: 90

    # Maximum number of turns to keep
    max_turns: 20

    # Timezone for timestamp display
    timezone: "America/Los_Angeles"

  # Text-to-Speech
  tts:
    # Provider: "venice" (cloud, no GPU) or "local" (chatterbox, requires GPU)
    provider: "venice"

    # Venice settings (used when provider is "venice")
    venice:
      voice: "am_liam"
      base_url: "https://api.venice.ai/api/v1"
      # API key from env: VENICE_API_KEY

    # Local settings (used when provider is "local")
    engine: "chatterbox"
    device: "cuda"

    # Streaming: generate and play audio in chunks
    streaming: true

    # Chunk duration for streaming (seconds)
    chunk_duration: 0.5

    # Voice cloning settings (for Coqui XTTS)
    # NOTE(review): the provider comment above only lists "venice" and
    # "local" (chatterbox); confirm which provider/engine value actually
    # reads this section.
    coqui:
      model_name: "tts_models/multilingual/multi-dataset/xtts_v2"
      language: "en"
      temperature: 0.75
      length_penalty: 1.0
      repetition_penalty: 5.0
      top_k: 50
      top_p: 0.85

  # Audio Buffering
  audio:
    # Buffer duration per user (seconds)
    buffer_duration: 10.0

    # Sample rate for processing (Hz)
    processing_sample_rate: 16000

    # Discord audio sample rate (Hz)
    discord_sample_rate: 48000

# ============================================================================
# FastAPI Server
# ============================================================================
# NOTE(review): as shipped, the server binds every interface (0.0.0.0), has no
# API key (null) and allows any CORS origin/method/header. Confirm this is
# intended outside local development; otherwise set SERVER_API_KEY and narrow
# cors.allowed_origins.
server:
  # Server host
  host: "0.0.0.0"

  # Server port
  port: 8880

  # Enable TTS endpoint
  enable_tts: true

  # Enable STT endpoint
  enable_stt: true

  # API key for authentication (optional)
  # Set via SERVER_API_KEY environment variable
  api_key: null

  # CORS settings
  cors:
    enabled: true
    allowed_origins: ["*"]
    allowed_methods: ["*"]
    allowed_headers: ["*"]

# ============================================================================
# Logging
# ============================================================================
logging:
  # Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
  level: "INFO"

  # Log format
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

  # Enable latency tracking
  track_latency: true

  # Per-module log levels (override global level)
  modules:
    discord_bot: "INFO"
    pipeline: "INFO"
    server: "INFO"
    openclaw_client: "DEBUG"

  # Log file (optional, null = console only)
  file: null

  # Rotate logs
  rotation:
    enabled: false
    max_bytes: 10485760  # 10 MiB (10 * 1024 * 1024)
    backup_count: 5