# openclaw-voice/config.yaml
# Jezza Hehn f0458b9b40 feat: add Deepgram STT provider and cloud-first config
# - New DeepgramSTT class using Deepgram nova-3 via REST API
# - Factory function create_stt_engine() for provider switching
# - faster-whisper import now optional (graceful fallback)
# - Config defaults to cloud providers (deepgram STT + venice TTS)
# - .env.example updated with DEEPGRAM_API_KEY and VENICE_API_KEY
# - requirements.txt adds deepgram-sdk, marks faster-whisper as optional
# - Zero GPU required for default configuration
# 2026-04-10 00:33:57 +00:00
#
# 261 lines
# 7 KiB
# YAML

# Jarvis Voice Bot Configuration
# Environment variables in .env override these values
# ============================================================================
# Discord Settings
# ============================================================================
discord:
  # Bot token from the Discord Developer Portal.
  # REQUIRED: leave null here and set it via the DISCORD_TOKEN environment variable.
  token: null
  # Command prefix for text commands (if needed)
  command_prefix: "/"
  # Bot status message
  status_message: "Listening in voice channels"
  # Auto-join voice channel on bot start (if user is in voice)
  auto_join: false
# ============================================================================
# Agent Configuration
# ============================================================================
agents:
  # Default agent (jarvis or sage); must name one of the per-agent mappings below
  default: "jarvis"

  # Per-agent settings
  jarvis:
    # TTS voice reference file (relative to server/voices/)
    voice_file: "jarvis.mp3"
    # Agent personality for LLM context.
    # Literal block scalar: line breaks are preserved and one trailing newline is kept.
    personality: |
      You are Jarvis, an intelligent, witty, and helpful AI assistant.
      You speak naturally and conversationally, with subtle British sophistication.
      You provide accurate information and thoughtful insights without being
      verbose. You have a dry sense of humor but know when to be serious.
    # TTS emotion exaggeration (0.0 = none, 1.0 = full)
    emotion_exaggeration: 0.3

  sage:
    # TTS voice reference file (relative to server/voices/)
    voice_file: "sage.wav"
    # Agent personality for LLM context
    personality: |
      You are Sage, a wise, calm, and philosophical AI assistant.
      You speak thoughtfully and deliberately, offering deep insights and
      perspectives. You are patient, empathetic, and help people think through
      complex problems. Your tone is warm and encouraging.
    # TTS emotion exaggeration (0.0 = none, 1.0 = full)
    emotion_exaggeration: 0.2
# ============================================================================
# OpenClaw Gateway
# ============================================================================
openclaw:
  # WebSocket URL for OpenClaw Gateway
  # REQUIRED: Set via OPENCLAW_BASE_URL environment variable
  # Format: ws://IP:PORT (default port: 18789)
  base_url: null
  # Authentication token
  # REQUIRED: Set via OPENCLAW_AUTH_TOKEN environment variable
  token: null
  # Request timeout (seconds)
  timeout: 8.0
  # Retry timeout (seconds)
  retry_timeout: 15.0
  # Retry attempts on failure
  max_retries: 1
  # Model/agent selection
  model: "claude-sonnet-4"
  # Agent ID for session keys
  agent_id: "jarvis"
  # Session scope: per-peer or shared
  session_scope: "per-peer"
# ============================================================================
# Pipeline Configuration
# ============================================================================
pipeline:
  # Voice Activity Detection (Silero VAD)
  vad:
    # Silence duration to consider speech ended (seconds)
    silence_threshold: 0.3
    # Minimum speech duration to process (seconds)
    min_speech_duration: 0.5
    # VAD confidence threshold (0.0-1.0)
    speech_threshold: 0.5

  # Smart Turn v3 Configuration
  turn_detection:
    # Turn completion confidence threshold (0.0-1.0)
    # Higher = more certain turn is complete before proceeding
    threshold: 0.7
    # Maximum wait time after silence before forcing completion (seconds)
    max_wait: 3.0
    # Model path (relative to models/ directory)
    # Using v3.2 GPU model for best performance with RTX 5090
    # NOTE(review): the stt/tts defaults below are cloud providers described as
    # needing no GPU, yet this default is a GPU build - confirm it loads on
    # CPU-only hosts or document a CPU model to use instead.
    model_path: "smart-turn-v3.2-gpu.onnx"

  # Speech-to-Text
  stt:
    # Provider: "deepgram" (cloud, no GPU) or "local" (faster-whisper, requires GPU)
    provider: "deepgram"
    # Deepgram settings (used when provider is "deepgram")
    model: "nova-3"
    # Language hint (null = auto-detect, as documented for the local provider).
    # Declared exactly once in this mapping: duplicate mapping keys are invalid
    # YAML and most parsers silently keep only the last occurrence.
    language: "en"
    # Local faster-whisper settings (used when provider is "local")
    model_size: "small"
    device: "cuda"
    compute_type: "float16"
    beam_size: 1
    # VAD filter (use built-in VAD in whisper)
    vad_filter: false

  # Relevance Filter
  relevance:
    # Default sensitivity: low, medium, high
    default_sensitivity: "medium"
    # Sensitivity thresholds (LLM confidence 0.0-1.0)
    thresholds:
      low: 1.0  # Only fast path (name mentions)
      medium: 0.75  # Fast path + LLM with 75% confidence
      high: 0.5  # Fast path + LLM with 50% confidence
    # LLM for classification (if not using OpenClaw)
    # Can be: openai, anthropic, local, openclaw
    classifier: "openclaw"
    # Classification timeout (seconds)
    timeout: 2.0
    # Cache classifications (avoid re-classifying similar utterances)
    enable_cache: true
    cache_ttl: 300  # seconds

  # Transcript Management
  transcript:
    # Rolling window duration (seconds)
    window_duration: 90
    # Maximum number of turns to keep
    max_turns: 20
    # Timezone for timestamp display
    timezone: "America/Los_Angeles"

  # Text-to-Speech
  tts:
    # Provider: "venice" (cloud, no GPU) or "local" (chatterbox, requires GPU)
    provider: "venice"
    # Venice settings (used when provider is "venice")
    venice:
      voice: "am_liam"
      base_url: "https://api.venice.ai/api/v1"
      # API key from env: VENICE_API_KEY
    # Local settings (used when provider is "local")
    engine: "chatterbox"
    device: "cuda"
    # Streaming: generate and play audio in chunks
    streaming: true
    # Chunk duration for streaming (seconds)
    chunk_duration: 0.5
    # Voice cloning settings (for Coqui XTTS)
    # NOTE(review): the local engine above is "chatterbox"; presumably these
    # values are only read when a Coqui engine is selected - confirm.
    coqui:
      model_name: "tts_models/multilingual/multi-dataset/xtts_v2"
      language: "en"
      temperature: 0.75
      length_penalty: 1.0
      repetition_penalty: 5.0
      top_k: 50
      top_p: 0.85

  # Audio Buffering
  audio:
    # Buffer duration per user (seconds)
    buffer_duration: 10.0
    # Sample rate for processing (Hz)
    processing_sample_rate: 16000
    # Discord audio sample rate (Hz)
    discord_sample_rate: 48000
# ============================================================================
# FastAPI Server
# ============================================================================
server:
  # Server host ("0.0.0.0" listens on every network interface)
  host: "0.0.0.0"
  # Server port
  port: 8880
  # Enable TTS endpoint
  enable_tts: true
  # Enable STT endpoint
  enable_stt: true
  # API key for authentication (optional)
  # Set via SERVER_API_KEY environment variable
  # NOTE(review): with host "0.0.0.0", a null api_key and the wildcard CORS
  # policy below, the endpoints look reachable without credentials from any
  # origin - confirm this is intended outside local development.
  api_key: null
  # CORS settings
  cors:
    enabled: true
    # "*" must stay quoted: a bare * is YAML alias syntax
    allowed_origins: ["*"]
    allowed_methods: ["*"]
    allowed_headers: ["*"]
# ============================================================================
# Logging
# ============================================================================
logging:
  # Log level: DEBUG, INFO, WARNING, ERROR, CRITICAL
  level: "INFO"
  # Log format (must stay quoted: a plain scalar cannot start with %)
  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
  # Enable latency tracking
  track_latency: true
  # Per-module log levels (override global level)
  modules:
    discord_bot: "INFO"
    pipeline: "INFO"
    server: "INFO"
    openclaw_client: "DEBUG"
  # Log file (optional, null = console only)
  file: null
  # Rotate logs
  rotation:
    enabled: false
    max_bytes: 10485760  # 10 MiB (10 * 1024 * 1024 bytes)
    backup_count: 5