From 2f17d4847df4f9c914a89d13dbe4f08d120602c8 Mon Sep 17 00:00:00 2001
From: MCKRUZ <kruz79@gmail.com>
Date: Mon, 16 Feb 2026 19:53:52 -0500
Subject: [PATCH] docs: Add Kani-TTS-2 evaluation and RTX 5090 compatibility
 analysis
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Kani-TTS-2 Research
- Evaluated Kani-TTS-2 as potential TTS upgrade (3-4x faster, RTF 0.2)
- Documented benefits: zero-shot voice cloning, Apache 2.0 license, 3GB VRAM
- Identified Windows compatibility issues (pynini compilation failures)
- Created test script for future evaluation when Windows support improves

## RTX 5090 Critical Finding
- Discovered RTX 5090 (Blackwell sm_120) not supported by PyTorch
- Tested stable (2.6.0) and nightly (2.7.0.dev) - both lack sm_120 support
- Documented impact: GPU acceleration unavailable for STT/TTS
- Performance degradation: 3.5s target → 10-15s actual (CPU-only)

## Files Added
- KANI_TTS_EVALUATION.md - Comprehensive Kani-TTS-2 analysis
- RTX_5090_BLOCKER.md - GPU compatibility report with solutions
- test_kani_tts.py - Benchmark script for future testing
- fix_pytorch_cuda.bat - GPU setup script (for when support lands)

## Recommendations
- Wait 1-3 months for PyTorch sm_120 support
- Monitor PyTorch releases weekly
- Alternative: Cloud GPU (RTX 4090) or different local GPU
- Current: CPU-only mode functional but slow

## Next Steps
- Monitor: https://github.com/pytorch/pytorch/releases
- Test when available: pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
- Re-evaluate Kani-TTS-2 after GPU support

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
---
 KANI_TTS_EVALUATION.md | 252 +++++++++++++++++++++++++++++++++++++++++
 RTX_5090_BLOCKER.md    | 251 ++++++++++++++++++++++++++++++++++++++++
 fix_pytorch_cuda.bat   |  43 +++++++
 test_kani_tts.py       | 171 ++++++++++++++++++++++++++++
 4 files changed, 717 insertions(+)
 create mode 100644 KANI_TTS_EVALUATION.md
 create mode 100644 RTX_5090_BLOCKER.md
 create mode 100644 fix_pytorch_cuda.bat
 create mode 100644 test_kani_tts.py

diff --git a/KANI_TTS_EVALUATION.md b/KANI_TTS_EVALUATION.md
new file mode 100644
index 0000000..fb66f54
--- /dev/null
+++ b/KANI_TTS_EVALUATION.md
@@ -0,0 +1,252 @@
+# Kani-TTS-2 Evaluation Report
+
+**Date:** February 16, 2026
+**System:** Windows 11, RTX 5090 (32GB VRAM)
+
+---
+
+## Summary
+
+**Status:** ❌ **Cannot test Kani-TTS-2 on Windows** (compilation issues)
+
+The attempted installation of Kani-TTS-2 failed with critical dependency compilation errors on Windows. Additionally, the current environment has a CPU-only PyTorch installation despite the RTX 5090 being present.
+
+---
+
+## Issues Discovered
+
+### 1. PyTorch CPU-Only Installation
+
+**Current Status:**
+```
+PyTorch: 2.10.0+cpu
+CUDA available: False
+CUDA version: N/A
+```
+
+**Impact:**
+- Current TTS (Coqui XTTS v2) may not be using GPU acceleration
+- Kani-TTS-2 requires CUDA-enabled PyTorch
+- STT (faster-whisper) may not be using GPU acceleration
+
+**Required:** PyTorch built against CUDA 12.8 or newer (earlier CUDA 12.x builds carry no sm_120 kernels for the RTX 5090)
+
+### 2. Kani-TTS-2 Installation Failure
+
+**Error:**
+```
+Failed building wheel for pynini
+error: command 'cl.exe' failed with exit code 2
+```
+
+**Root Cause:**
+- `nemo-toolkit` dependency requires `pynini`
+- `pynini` compilation uses GCC/Clang flags (`-Wno-register`) incompatible with MSVC compiler
+- No pre-built Windows wheels available for `pynini==2.1.6.post1`
+
+**Dependency Chain:**
+```
+kani-tts-2 → nemo-toolkit[tts]==2.4.0 → pynini → [COMPILATION FAILED]
+```
+
+---
+
+## Kani-TTS-2 Pros & Cons (Based on Documentation)
+
+### Potential Benefits
+
+✅ **3-4x faster generation** - RTF of 0.2 vs current 0.78
+✅ **Zero-shot voice cloning** - No fine-tuning needed
+✅ **Comparable VRAM usage** - 3GB vs current 2-3GB (similar)
+✅ **Simple API** - Clean Python interface
+✅ **Commercial license** - Apache 2.0
+✅ **Fast training** - 10k hours in 6 hours on 8x H100
+
+### Challenges
+
+❌ **Windows compatibility** - Compilation issues with dependencies
+❌ **Requires nemo-toolkit** - Heavy dependency with C++ compilation
+❌ **English-only** - Current version limited to English
+❓ **Quality unknown** - Cannot test without successful installation
+❓ **Streaming support** - Not documented, unclear if supported
+
+---
+
+## Alternative Solutions
+
+### Option 1: Fix PyTorch CUDA Installation (Recommended)
+
+**Goal:** Get current system using GPU properly + enable future testing
+
+**Steps:**
+1. Uninstall CPU PyTorch:
+   ```bash
+   pip uninstall torch torchaudio torchvision
+   ```
+
+2. Install CUDA PyTorch:
+   ```bash
+   pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+   ```
+
+3. Verify:
+   ```python
+   import torch
+   print(torch.cuda.is_available())  # Should be True
+   print(torch.cuda.get_device_name(0))  # Should show RTX 5090
+   ```
+
+**Impact:**
+- Current Coqui XTTS v2 will use GPU (faster)
+- faster-whisper STT will use GPU (faster)
+- Enables future Kani-TTS-2 testing
+
+### Option 2: Use WSL2 or Docker (Linux Environment)
+
+**Goal:** Run Kani-TTS-2 in Linux where dependencies compile properly
+
+**Setup WSL2:**
+```bash
+# Install WSL2 with Ubuntu
+wsl --install -d Ubuntu-24.04
+
+# Install CUDA in WSL
+# Follow: https://docs.nvidia.com/cuda/wsl-user-guide/
+
+# Clone repo and test in WSL
+cd /mnt/c/Users/kruz7/...
+python test_kani_tts.py
+```
+
+**Pros:**
+- Native Linux environment, better compatibility
+- Access to Windows GPU via WSL-CUDA
+- Can test Kani-TTS-2 properly
+
+**Cons:**
+- Additional setup complexity
+- Need to manage two environments
+
+### Option 3: Wait for Windows Support
+
+**Goal:** Wait for Kani-TTS-2 to release Windows pre-built wheels
+
+**Timeline:**
+- Kani-TTS-2 is very new (Feb 2026)
+- Windows wheels may be released in future versions
+- Monitor: https://pypi.org/project/kani-tts-2/
+
+**Meanwhile:**
+- Stick with current Coqui XTTS v2
+- Focus on other optimizations (query routing, caching, streaming)
+
+### Option 4: Alternative TTS Engines
+
+Consider other fast TTS options with better Windows support:
+
+**A. Piper TTS**
+- Very fast (RTF ~0.1)
+- Lightweight, runs on CPU
+- Pre-built Windows binaries
+- Good quality
+- Con: Limited voice cloning
+
+**B. Bark**
+- High quality
+- Good voice cloning
+- Con: Slower than current setup
+
+**C. StyleTTS2**
+- Excellent quality
+- Zero-shot voice cloning
+- Con: Slower, complex setup
+
+---
+
+## Recommendation
+
+### Immediate Action: Fix PyTorch CUDA
+
+**Priority: HIGH** - This affects current system performance
+
+```bash
+# From project root with venv activated
+pip uninstall torch torchaudio torchvision -y
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+```
+
+**Verify:**
+```bash
+python -c "import torch; print(f'CUDA: {torch.cuda.is_available()}')"
+```
+
+**Expected Improvement:**
+- Current TTS latency: 1.63s → ~0.8-1.0s (using GPU)
+- STT latency: 0.55s → ~0.3-0.4s (faster on GPU)
+- Total: ~5.5s → ~4.5s (closer to 3.5s target)
+
+### Kani-TTS-2 Strategy
+
+**Short-term (Next Week):**
+- Focus on optimizing current Coqui XTTS v2 with GPU
+- Implement additional TTS caching
+- Optimize streaming chunk size
+
+**Medium-term (Next Month):**
+- Monitor Kani-TTS-2 for Windows wheel releases
+- Test in WSL2 if critical for evaluation
+- Evaluate Piper TTS as alternative
+
+**Long-term (Next Quarter):**
+- Revisit Kani-TTS-2 when Windows support matures
+- Consider migration to Linux host if TTS performance critical
+
+---
+
+## Current Performance Baseline
+
+Based on README.md:
+
+| Stage | Current | Target | Status |
+|-------|---------|--------|--------|
+| VAD silence detection | 800ms | 800ms | ✅ |
+| STT (medium) | 550ms | 300ms | ⚠️ (CPU-only) |
+| OpenClaw/LLM | 2470ms | 2000ms | ⚠️ |
+| TTS first chunk | 1630ms | 300ms | ❌ (CPU-only?) |
+| **Total** | **~5.5s** | **~3.5s** | ⚠️ |
+
+**With GPU PyTorch (estimated):**
+
+| Stage | With CUDA | Improvement |
+|-------|-----------|-------------|
+| STT | ~350ms | 1.6x faster |
+| TTS | ~900ms | 1.8x faster |
+| **Total** | **~4.5s** | **1.2x faster** |
+
+Still short of 3.5s target, but closer. Kani-TTS-2 could bridge the gap if Windows support improves.
+
+---
+
+## Next Steps
+
+1. ✅ **Fix PyTorch CUDA** (see Option 1 above)
+2. 🔄 **Re-benchmark current system** with GPU acceleration
+3. 📊 **Measure actual improvement** in TTS latency
+4. 🔍 **Evaluate if ~4.5s total latency** is acceptable
+5. 🕐 **Monitor Kani-TTS-2** for Windows support
+6. 🧪 **Test Piper TTS** as lightweight alternative
+
+---
+
+## References
+
+- [Kani-TTS-2 GitHub](https://github.com/nineninesix-ai/kani-tts-2)
+- [Kani-TTS-2 HuggingFace](https://huggingface.co/nineninesix/kani-tts-2-en)
+- [PyTorch CUDA Installation](https://pytorch.org/get-started/locally/)
+- [WSL CUDA Setup](https://docs.nvidia.com/cuda/wsl-user-guide/)
+- [Piper TTS](https://github.com/rhasspy/piper)
+- [StyleTTS2](https://github.com/yl4579/StyleTTS2)
+
+---
+
+**Conclusion:** Kani-TTS-2 shows promise (3-4x faster) but Windows compatibility issues prevent testing. **Immediate priority should be fixing PyTorch CUDA** to improve current system performance, then revisit Kani-TTS-2 when Windows support improves or via WSL2.
diff --git a/RTX_5090_BLOCKER.md b/RTX_5090_BLOCKER.md
new file mode 100644
index 0000000..177007c
--- /dev/null
+++ b/RTX_5090_BLOCKER.md
@@ -0,0 +1,251 @@
+# RTX 5090 Compatibility Blocker
+
+**Date:** February 16, 2026
+**GPU:** NVIDIA GeForce RTX 5090 (32GB VRAM, Blackwell sm_120)
+**Status:** ❌ **BLOCKED - No PyTorch Support**
+
+---
+
+## Critical Finding
+
+The **RTX 5090 is too new** for current PyTorch builds. Both stable and nightly releases fail with:
+
+```
+RuntimeError: CUDA error: no kernel image is available for execution on the device
+
+NVIDIA GeForce RTX 5090 with CUDA capability sm_120 is not compatible with the current PyTorch installation.
+The current PyTorch install supports CUDA capabilities sm_50 sm_60 sm_61 sm_70 sm_75 sm_80 sm_86 sm_90.
+```
+
+**Tested Versions:**
+- ❌ PyTorch 2.6.0+cu124 (Stable) - No sm_120 support
+- ❌ PyTorch 2.7.0.dev20250310+cu124 (Nightly) - No sm_120 support
+
+---
+
+## Impact on Your Voice Bot
+
+### Currently Affected
+
+GPU acceleration is **unavailable** for every GPU-dependent component (CPU fallback only):
+
+| Component | Current Status | Impact |
+|-----------|---------------|--------|
+| **faster-whisper STT** | CPU-only | 3-5x slower (550ms → ~2s) |
+| **Coqui XTTS v2 TTS** | CPU-only | 2-3x slower (1.6s → ~4-5s) |
+| **Kani-TTS-2 testing** | Blocked | Cannot evaluate |
+| **Total latency** | ~10-15s | vs target 3.5s ❌ |
+
+### What Still Works
+
+- ✅ Discord bot (voice receiving/sending)
+- ✅ OpenClaw Gateway (LLM inference)
+- ✅ VAD (Silero, CPU-based)
+- ✅ Smart Turn v3 (ONNX, CPU-based)
+- ⚠️ STT/TTS (fallback to CPU, very slow)
+
+---
+
+## Solutions
+
+### Option 1: Wait for PyTorch Support (Recommended)
+
+**Timeline:** 1-3 months (estimated)
+
+**Reason:** RTX 5090 released Jan 2025, PyTorch typically adds new GPU support within 2-4 months.
+
+**Monitor:**
+- [PyTorch Releases](https://github.com/pytorch/pytorch/releases)
+- [PyTorch CUDA Support](https://pytorch.org/get-started/locally/)
+
+**Action:**
+- Check weekly for PyTorch updates
+- Subscribe to PyTorch announcements
+- Test with: `pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128`
+
+### Option 2: Build PyTorch from Source (Advanced)
+
+**Difficulty:** High
+**Time:** 4-8 hours
+**Risk:** May not work if CUDA Toolkit doesn't support sm_120
+
+**Steps:**
+1. Install CUDA Toolkit 12.8+ (if available with sm_120 support)
+2. Clone PyTorch:
+   ```bash
+   git clone --recursive https://github.com/pytorch/pytorch
+   cd pytorch
+   ```
+3. Build with sm_120:
+   ```bash
+   export TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0;12.0"
+   python setup.py install
+   ```
+4. Test
+
+**Resources:**
+- [Building PyTorch from Source](https://github.com/pytorch/pytorch#from-source)
+
+### Option 3: Use Different GPU
+
+**If available**, use older GPU for development:
+
+| GPU | CUDA Capability | PyTorch Support | Recommendation |
+|-----|-----------------|-----------------|----------------|
+| RTX 4090 | sm_89 | ✅ Full support | ✅ Ideal for development |
+| RTX 4080 | sm_89 | ✅ Full support | ✅ Good alternative |
+| RTX 4070 Ti | sm_89 | ✅ Full support | ✅ Sufficient for voice bot |
+| RTX 3090 | sm_86 | ✅ Full support | ✅ Works well |
+
+**Action:**
+- Check if you have access to RTX 40-series or 30-series GPU
+- Use for development until RTX 5090 support lands
+
+### Option 4: Run in Cloud with Supported GPU
+
+**Platforms:**
+- **RunPod** - RTX 4090 @ $0.79/hr
+- **Vast.ai** - RTX 4090 @ $0.40-0.60/hr
+- **Google Colab Pro** - A100/V100 @ $10/month
+
+**Pros:**
+- Immediate GPU access
+- Supported hardware
+- Test optimizations quickly
+
+**Cons:**
+- Ongoing cost
+- Need to upload code/data
+- Network latency for Discord bot
+
+### Option 5: CPU-Only (Temporary Workaround)
+
+**Use case:** Testing logic while waiting for GPU support
+
+**Current setup** (already done):
+```bash
+pip install torch torchvision torchaudio  # CPU version
+```
+
+**Performance:**
+- STT: ~2-3s (vs 0.3s target)
+- TTS: ~4-5s (vs 0.9s target)
+- Total: ~10-15s (vs 3.5s target)
+
+**Acceptable for:**
+- Testing conversation flow
+- Debugging bot logic
+- Development (not production)
+
+---
+
+## Recommended Action Plan
+
+### Immediate (This Week)
+
+1. ✅ **Roll back to CPU PyTorch** for development:
+   ```bash
+   pip install torch torchvision torchaudio
+   ```
+
+2. ✅ **Focus on non-GPU optimizations**:
+   - Query routing (Haiku vs Sonnet vs Opus)
+   - TTS caching
+   - Sentence-level streaming
+   - Response filtering
+
+3. ✅ **Test bot functionality** with CPU (slow but works)
+
+### Short-term (Next 2-4 Weeks)
+
+4. 🔄 **Monitor PyTorch releases** for sm_120 support
+
+5. 🧪 **Evaluate cloud GPU** options:
+   - Test on RunPod/Vast.ai with RTX 4090
+   - Measure actual performance gains
+   - Compare cost vs waiting
+
+6. 📊 **Benchmark CPU baseline** to quantify GPU improvement later
+
+### Long-term (Next 1-3 Months)
+
+7. ⏳ **Wait for PyTorch sm_120 support**
+
+8. 🚀 **Deploy with GPU** when support lands
+
+9. 🔍 **Re-evaluate Kani-TTS-2** once GPU works
+
+---
+
+## Current Bot Configuration
+
+**For now, use CPU-only mode:**
+
+```yaml
+# config.yaml
+pipeline:
+  stt:
+    model_size: "small"  # Smaller = faster on CPU
+    device: "cpu"        # Force CPU
+    beam_size: 1         # Faster decoding
+
+  tts:
+    device: "cpu"        # Force CPU
+```
+
+**.env overrides:**
+```bash
+PIPELINE__STT__DEVICE=cpu
+PIPELINE__STT__MODEL_SIZE=small
+PIPELINE__TTS__DEVICE=cpu
+```
+
+---
+
+## When PyTorch Supports sm_120
+
+**Test with:**
+```bash
+# Uninstall current
+pip uninstall torch torchaudio torchvision -y
+
+# Install latest
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+
+# Verify
+python -c "import torch; print(torch.cuda.is_available()); print(torch.cuda.get_device_name(0))"
+
+# Test computation
+python -c "import torch; x=torch.rand(100,100,device='cuda'); print('GPU OK')"
+```
+
+**Then update config:**
+```yaml
+pipeline:
+  stt:
+    device: "cuda"
+    model_size: "medium"  # Can use larger model on GPU
+    beam_size: 5          # Better quality
+
+  tts:
+    device: "cuda"
+```
+
+**Expected improvement:**
+- STT: ~2s → ~0.35s (6x faster)
+- TTS: ~4-5s → ~0.9s (5x faster)
+- Total: ~10-15s → ~4s (3x faster, near 3.5s target!)
+
+---
+
+## Resources
+
+- [PyTorch GitHub](https://github.com/pytorch/pytorch)
+- [NVIDIA CUDA Compatibility](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities)
+- [RTX 5090 Specs](https://www.nvidia.com/en-us/geforce/graphics-cards/50-series/rtx-5090/)
+- [RunPod Cloud GPU](https://www.runpod.io/)
+- [Vast.ai GPU Marketplace](https://vast.ai/)
+
+---
+
+**Summary:** RTX 5090 support is coming, but not here yet. Use CPU mode for development now, monitor for PyTorch updates, or use cloud GPU for testing in the meantime.
diff --git a/fix_pytorch_cuda.bat b/fix_pytorch_cuda.bat
new file mode 100644
index 0000000..07f0991
--- /dev/null
+++ b/fix_pytorch_cuda.bat
@@ -0,0 +1,43 @@
+@echo off
+echo ======================================================================
+echo Fixing PyTorch CUDA Installation
+echo ======================================================================
+echo.
+echo Current Status:
+call venv\Scripts\activate.bat
+python -c "import torch; print(f'  PyTorch: {torch.__version__}'); print(f'  CUDA: {torch.cuda.is_available()}')"
+echo.
+rem Replaces the venv's CPU-only PyTorch with a CUDA 12.8 build (needed for sm_120) and verifies GPU compute.
+echo ======================================================================
+echo This will:
+echo   1. Uninstall CPU-only PyTorch
+echo   2. Install CUDA 12.8-enabled PyTorch
+echo   3. Verify RTX 5090 is accessible
+echo ======================================================================
+echo.
+
+set /p continue="Continue? (y/n): "
+if /i not "%continue%"=="y" (
+    echo Cancelled.
+    exit /b 1
+)
+rem pip below is expected to resolve to the venv activated at the top of this script.
+echo.
+echo [1/3] Uninstalling CPU PyTorch...
+pip uninstall torch torchaudio torchvision -y
+
+echo.
+echo [2/3] Installing CUDA PyTorch (this may take a few minutes)...
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128 || (echo [ERROR] PyTorch install failed. & pause & exit /b 1)
+rem is_available() alone can pass on an unsupported GPU; run a real CUDA op so a missing kernel image fails here.
+echo.
+echo [3/3] Verifying installation...
+python -c "import torch; print(f'\nPyTorch: {torch.__version__}'); print(f'CUDA Available: {torch.cuda.is_available()}'); print(f'GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"N/A\"}'); print(f'CUDA Version: {torch.version.cuda if torch.cuda.is_available() else \"N/A\"}'); assert torch.cuda.is_available(), 'CUDA not available'; torch.rand(8, device='cuda').sum().item(); print('GPU compute: OK')" || (echo [ERROR] GPU verification failed. & pause & exit /b 1)
+
+echo.
+echo ======================================================================
+echo Done! Your TTS and STT should now use GPU acceleration.
+echo ======================================================================
+echo.
+echo Next: Run the bot and check performance improvement!
+pause
diff --git a/test_kani_tts.py b/test_kani_tts.py
new file mode 100644
index 0000000..b0077ed
--- /dev/null
+++ b/test_kani_tts.py
@@ -0,0 +1,171 @@
+"""
+Kani-TTS-2 Testing Script
+Compare Kani-TTS-2 with current Coqui XTTS v2 implementation
+"""
+
+import time
+import wave
+from pathlib import Path
+import numpy as np
+
+print("=" * 70)
+print("Kani-TTS-2 Testing Script")
+print("=" * 70)
+
+# Test configuration
+TEST_PHRASES = [
+    "Yes, sir. I am at your service.",  # Short, simple (cache test)
+    "The weather today is partly cloudy with a high of 72 degrees.",  # Medium
+    "I've analyzed the data and found several interesting patterns that warrant further investigation.",  # Long
+]
+
+VOICE_FILES = {
+    "jarvis": "server/voices/jarvis.mp3",
+    "sage": "server/voices/sage.wav",
+}
+
+# Step 1: Check dependencies
+print("\n[1/6] Checking dependencies...")
+try:
+    import torch
+    print(f"[OK] PyTorch {torch.__version__} (CUDA: {torch.cuda.is_available()})")
+except ImportError:
+    print("[ERROR] PyTorch not installed")
+    exit(1)
+
+try:
+    from kani_tts import KaniTTS, SpeakerEmbedder
+    print("[OK] Kani-TTS-2 installed")
+except ImportError:
+    print("[WARN] Kani-TTS-2 not installed. Installing now...")
+    import subprocess
+    subprocess.run(["pip", "install", "kani-tts-2"], check=True)
+    subprocess.run(["pip", "install", "-U", "transformers==4.56.0"], check=True)
+    from kani_tts import KaniTTS, SpeakerEmbedder
+    print("[OK] Kani-TTS-2 installed successfully")
+
+# Step 2: Check voice files
+print("\n[2/6] Checking voice reference files...")
+available_voices = {}
+for agent, voice_path in VOICE_FILES.items():
+    if Path(voice_path).exists():
+        print(f"[OK] {agent}: {voice_path}")
+        available_voices[agent] = voice_path
+    else:
+        print(f"[WARN] {agent}: {voice_path} not found")
+
+if not available_voices:
+    print("[ERROR] No voice files found. Please add voice samples to server/voices/")
+    exit(1)
+
+# Step 3: Initialize Kani-TTS-2
+print("\n[3/6] Initializing Kani-TTS-2 model...")
+init_start = time.time()
+try:
+    model = KaniTTS('nineninesix/kani-tts-2-en')
+    embedder = SpeakerEmbedder()
+    init_time = time.time() - init_start
+    print(f"[OK] Model loaded in {init_time:.2f}s")
+except Exception as e:
+    print(f"[ERROR] Failed to load model: {e}")
+    exit(1)
+
+# Step 4: Generate speaker embeddings
+print("\n[4/6] Generating speaker embeddings...")
+speaker_embeddings = {}
+for agent, voice_path in available_voices.items():
+    try:
+        embed_start = time.time()
+        speaker_emb = embedder.embed_audio_file(voice_path)
+        embed_time = time.time() - embed_start
+        speaker_embeddings[agent] = speaker_emb
+        print(f"[OK] {agent}: {speaker_emb.shape} in {embed_time:.2f}s")
+    except Exception as e:
+        print(f"[ERROR] {agent}: {e}")
+
+# Step 5: Run latency benchmarks
+print("\n[5/6] Running latency benchmarks...")
+print("-" * 70)
+
+results = []
+
+for i, text in enumerate(TEST_PHRASES, 1):
+    print(f"\n[Test {i}/3] \"{text[:50]}...\"")
+
+    for agent, speaker_emb in speaker_embeddings.items():
+        try:
+            # Generate audio
+            start = time.time()
+            audio, processed_text = model(
+                text,
+                speaker_emb=speaker_emb,
+                temperature=0.75,
+                top_p=0.85
+            )
+            generation_time = time.time() - start
+
+            # Calculate metrics
+            audio_duration = len(audio) / 22050  # 22kHz sample rate
+            rtf = generation_time / audio_duration
+
+            # Save output
+            output_path = f"test_outputs/kani_{agent}_test{i}.wav"
+            Path("test_outputs").mkdir(exist_ok=True)
+            model.save_audio(audio, output_path)
+
+            print(f"  {agent}:")
+            print(f"    Generation: {generation_time:.2f}s")
+            print(f"    Audio length: {audio_duration:.2f}s")
+            print(f"    RTF: {rtf:.2f}")
+            print(f"    Output: {output_path}")
+
+            results.append({
+                "test": i,
+                "agent": agent,
+                "text_length": len(text),
+                "generation_time": generation_time,
+                "audio_duration": audio_duration,
+                "rtf": rtf,
+                "output": output_path
+            })
+
+        except Exception as e:
+            print(f"  {agent}: [ERROR] {e}")
+
+# Step 6: Generate report
+print("\n[6/6] Performance Summary")
+print("=" * 70)
+
+if results:
+    avg_generation = np.mean([r["generation_time"] for r in results])
+    avg_rtf = np.mean([r["rtf"] for r in results])
+
+    print(f"\nAverage Metrics:")
+    print(f"  Generation Time: {avg_generation:.2f}s")
+    print(f"  RTF: {avg_rtf:.2f}")
+    print(f"  Expected RTF from docs: ~0.2")
+
+    print(f"\nPer-Test Breakdown:")
+    for i in range(1, 4):
+        test_results = [r for r in results if r["test"] == i]
+        if test_results:
+            test_rtf = np.mean([r["rtf"] for r in test_results])
+            test_gen = np.mean([r["generation_time"] for r in test_results])
+            print(f"  Test {i} ('{TEST_PHRASES[i-1][:30]}...')")
+            print(f"    Avg Generation: {test_gen:.2f}s")
+            print(f"    Avg RTF: {test_rtf:.2f}")
+
+    print(f"\nOutput files saved to: test_outputs/")
+    print(f"   Listen to samples and compare quality with current TTS")
+
+    print(f"\n[OK] Testing complete!")
+    print(f"\nNext steps:")
+    print(f"  1. Listen to generated audio samples in test_outputs/")
+    print(f"  2. Compare quality with current Coqui XTTS v2")
+    print(f"  3. If quality is acceptable and RTF < 0.3, consider integration")
+    print(f"  4. See KANI_TTS_INTEGRATION.md for implementation guide")
+
+else:
+    print("[ERROR] No successful tests - check errors above")
+
+print("=" * 70)