docs: Add Kani-TTS-2 evaluation and RTX 5090 compatibility analysis

## Kani-TTS-2 Research

- Evaluated Kani-TTS-2 as potential TTS upgrade (3-4x faster, RTF 0.2)
- Documented benefits: zero-shot voice cloning, Apache 2.0 license, 3GB VRAM
- Identified Windows compatibility issues (pynini compilation failures)
- Created test script for future evaluation when Windows support improves

## RTX 5090 Critical Finding

- Discovered RTX 5090 (Blackwell sm_120) not supported by PyTorch
- Tested stable (2.6.0) and nightly (2.7.0.dev) - both lack sm_120 support
- Documented impact: GPU acceleration unavailable for STT/TTS
- Performance degradation: 3.5s target → 10-15s actual (CPU-only)

## Files Added

- KANI_TTS_EVALUATION.md - Comprehensive Kani-TTS-2 analysis
- RTX_5090_BLOCKER.md - GPU compatibility report with solutions
- test_kani_tts.py - Benchmark script for future testing
- fix_pytorch_cuda.bat - GPU setup script (for when support lands)

## Recommendations

- Wait 1-3 months for PyTorch sm_120 support
- Monitor PyTorch releases weekly
- Alternative: Cloud GPU (RTX 4090) or different local GPU
- Current: CPU-only mode functional but slow

## Next Steps

- Monitor: https://github.com/pytorch/pytorch/releases
- Test when available: pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu124
- Re-evaluate Kani-TTS-2 after GPU support

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-16 19:53:52 -05:00 · 2026-02-16 19:53:52 -05:00 · 2f17d4847d
commit 2f17d4847d
parent 9fde3d31ba
4 changed files with 717 additions and 0 deletions
--- a/test_kani_tts.py
+++ b/test_kani_tts.py
@ -0,0 +1,171 @@
+"""
+Kani-TTS-2 Testing Script
+Compare Kani-TTS-2 with current Coqui XTTS v2 implementation
+"""
+
+import time
+import wave
+from pathlib import Path
+import numpy as np
+
+# Banner: the script title framed by two 70-character rules
+print("=" * 70, "Kani-TTS-2 Testing Script", "=" * 70, sep="\n")
+
+# Test configuration: phrases of increasing length, each synthesized once per voice
+TEST_PHRASES = [
+    # Short, simple (cache test)
+    "Yes, sir. I am at your service.",
+    # Medium
+    "The weather today is partly cloudy with a high of 72 degrees.",
+    # Long
+    "I've analyzed the data and found several interesting patterns that warrant further investigation.",
+]
+
+# Reference audio clip for each agent voice (paths are relative to the working directory)
+VOICE_FILES = dict(
+    jarvis="server/voices/jarvis.mp3",
+    sage="server/voices/sage.wav",
+)
+
+# Step 1: Check dependencies (PyTorch must already be present; Kani-TTS-2 is installed on demand)
+print("\n[1/6] Checking dependencies...")
+try:
+    import torch
+    print(f"[OK] PyTorch {torch.__version__} (CUDA: {torch.cuda.is_available()})")
+except ImportError:
+    print("[ERROR] PyTorch not installed")
+    raise SystemExit(1)  # exit() is a site-module helper for interactive use; SystemExit always works
+
+try:
+    from kani_tts import KaniTTS, SpeakerEmbedder
+    print("[OK] Kani-TTS-2 installed")
+except ImportError:
+    print("[WARN] Kani-TTS-2 not installed. Installing now...")
+    import subprocess
+    import sys  # pip must run under this interpreter, not whichever "pip" happens to be first on PATH
+    subprocess.run([sys.executable, "-m", "pip", "install", "kani-tts-2"], check=True)
+    subprocess.run([sys.executable, "-m", "pip", "install", "-U", "transformers==4.56.0"], check=True)
+    from kani_tts import KaniTTS, SpeakerEmbedder
+    print("[OK] Kani-TTS-2 installed successfully")
+
+# Step 2: Check voice files (keep only the reference clips that are present on disk)
+print("\n[2/6] Checking voice reference files...")
+available_voices = {}
+for agent, voice_path in VOICE_FILES.items():
+    # is_file() rather than exists(): a directory at this path is not a usable clip
+    if Path(voice_path).is_file():
+        print(f"[OK] {agent}: {voice_path}")
+        available_voices[agent] = voice_path
+    else:
+        print(f"[WARN] {agent}: {voice_path} not found")
+
+if not available_voices:
+    print("[ERROR] No voice files found. Please add voice samples to server/voices/")
+    # SystemExit instead of exit(): exit() is injected by the site module for interactive use
+    raise SystemExit(1)
+
+# Step 3: Initialize Kani-TTS-2 (model plus speaker embedder), timing the load
+print("\n[3/6] Initializing Kani-TTS-2 model...")
+# perf_counter() is monotonic and high-resolution; time.time() can jump when the system clock is adjusted
+init_start = time.perf_counter()
+try:
+    model = KaniTTS('nineninesix/kani-tts-2-en')
+    embedder = SpeakerEmbedder()
+    init_time = time.perf_counter() - init_start
+    print(f"[OK] Model loaded in {init_time:.2f}s")
+except Exception as e:
+    print(f"[ERROR] Failed to load model: {e}")
+    # SystemExit instead of exit(): exit() is injected by the site module for interactive use
+    raise SystemExit(1)
+
+# Step 4: Generate speaker embeddings (one per available voice; a failing voice is reported and skipped)
+print("\n[4/6] Generating speaker embeddings...")
+speaker_embeddings = {}
+for agent, voice_path in available_voices.items():
+    try:
+        # perf_counter() is the monotonic clock intended for measuring intervals
+        embed_start = time.perf_counter()
+        speaker_emb = embedder.embed_audio_file(voice_path)
+        embed_time = time.perf_counter() - embed_start
+        speaker_embeddings[agent] = speaker_emb
+        print(f"[OK] {agent}: {speaker_emb.shape} in {embed_time:.2f}s")
+    except Exception as e:
+        print(f"[ERROR] {agent}: {e}")
+
+# Step 5: Run latency benchmarks (every phrase x every voice that produced an embedding)
+print("\n[5/6] Running latency benchmarks...")
+print("-" * 70)
+
+results = []
+
+for i, text in enumerate(TEST_PHRASES, 1):
+    # Total comes from TEST_PHRASES so the label stays correct when phrases are added or removed
+    print(f"\n[Test {i}/{len(TEST_PHRASES)}] \"{text[:50]}...\"")
+
+    for agent, speaker_emb in speaker_embeddings.items():
+        try:
+            # Generate audio; perf_counter() is the monotonic clock intended for measuring intervals
+            start = time.perf_counter()
+            audio, processed_text = model(
+                text,
+                speaker_emb=speaker_emb,
+                temperature=0.75,
+                top_p=0.85
+            )
+            generation_time = time.perf_counter() - start
+
+            # Calculate metrics (len(audio) is taken as the sample count - assumes a 1-D waveform, TODO confirm)
+            audio_duration = len(audio) / 22050  # 22kHz sample rate
+            if audio_duration == 0:
+                # Report a readable error instead of a bare ZeroDivisionError from the RTF division
+                raise ValueError("model returned empty audio")
+            # RTF (real-time factor): synthesis time per second of audio; below 1.0 is faster than real time
+            rtf = generation_time / audio_duration
+
+            # Save output
+            output_path = f"test_outputs/kani_{agent}_test{i}.wav"
+            Path("test_outputs").mkdir(exist_ok=True)
+            model.save_audio(audio, output_path)
+
+            print(f"  {agent}:")
+            print(f"    Generation: {generation_time:.2f}s")
+            print(f"    Audio length: {audio_duration:.2f}s")
+            print(f"    RTF: {rtf:.2f}")
+            print(f"    Output: {output_path}")
+
+            results.append({
+                "test": i,
+                "agent": agent,
+                "text_length": len(text),
+                "generation_time": generation_time,
+                "audio_duration": audio_duration,
+                "rtf": rtf,
+                "output": output_path
+            })
+
+        except Exception as e:
+            # One failing voice/phrase pair must not abort the remaining benchmarks
+            print(f"  {agent}: [ERROR] {e}")
+
+# Step 6: Generate report (overall averages, then a per-phrase breakdown)
+print("\n[6/6] Performance Summary")
+print("=" * 70)
+
+if results:
+    avg_generation = np.mean([r["generation_time"] for r in results])
+    avg_rtf = np.mean([r["rtf"] for r in results])
+
+    print("\nAverage Metrics:")
+    print(f"  Generation Time: {avg_generation:.2f}s")
+    print(f"  RTF: {avg_rtf:.2f}")
+    print("  Expected RTF from docs: ~0.2")
+
+    print("\nPer-Test Breakdown:")
+    # Walk TEST_PHRASES itself so the breakdown tracks the phrase list instead of a fixed 1..3 range
+    for i, phrase in enumerate(TEST_PHRASES, 1):
+        test_results = [r for r in results if r["test"] == i]
+        if test_results:
+            test_rtf = np.mean([r["rtf"] for r in test_results])
+            test_gen = np.mean([r["generation_time"] for r in test_results])
+            print(f"  Test {i} ('{phrase[:30]}...')")
+            print(f"    Avg Generation: {test_gen:.2f}s")
+            print(f"    Avg RTF: {test_rtf:.2f}")
+
+    print("\nOutput files saved to: test_outputs/")
+    print("   Listen to samples and compare quality with current TTS")
+
+    print("\n[OK] Testing complete!")
+    print("\nNext steps:")
+    print("  1. Listen to generated audio samples in test_outputs/")
+    print("  2. Compare quality with current Coqui XTTS v2")
+    print("  3. If quality is acceptable and RTF < 0.3, consider integration")
+    # NOTE(review): the accompanying commit lists KANI_TTS_EVALUATION.md, not KANI_TTS_INTEGRATION.md - confirm the file name
+    print("  4. See KANI_TTS_INTEGRATION.md for implementation guide")
+
+else:
+    print("[ERROR] No successful tests - check errors above")
+
+print("=" * 70)