docs: Add Kani-TTS-2 evaluation and RTX 5090 compatibility analysis

## Kani-TTS-2 Research
- Evaluated Kani-TTS-2 as potential TTS upgrade (3-4x faster, RTF 0.2)
- Documented benefits: zero-shot voice cloning, Apache 2.0 license, 3GB VRAM
- Identified Windows compatibility issues (pynini compilation failures)
- Created test script for future evaluation when Windows support improves

## RTX 5090 Critical Finding
- Discovered RTX 5090 (Blackwell sm_120) not supported by PyTorch
- Tested stable (2.6.0) and nightly (2.7.0.dev) - both lack sm_120 support
- Documented impact: GPU acceleration unavailable for STT/TTS
- Performance degradation: 3.5s target → 10-15s actual (CPU-only)

## Files Added
- KANI_TTS_EVALUATION.md - Comprehensive Kani-TTS-2 analysis
- RTX_5090_BLOCKER.md - GPU compatibility report with solutions
- test_kani_tts.py - Benchmark script for future testing
- fix_pytorch_cuda.bat - GPU setup script (for when support lands)

## Recommendations
- Wait 1-3 months for PyTorch sm_120 support
- Monitor PyTorch releases weekly
- Alternative: Cloud GPU (RTX 4090) or different local GPU
- Current: CPU-only mode functional but slow

## Next Steps
- Monitor: https://github.com/pytorch/pytorch/releases
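- Test when available: pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu124 (note: sm_120 kernels are only built against CUDA 12.8+, so the cu128 nightly index is the likely one to test - verify before relying on it)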
- Re-evaluate Kani-TTS-2 after GPU support

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
MCKRUZ 2026-02-16 19:53:52 -05:00
parent 9fde3d31ba
commit 2f17d4847d
4 changed files with 717 additions and 0 deletions

171
test_kani_tts.py Normal file
View file

@ -0,0 +1,171 @@
"""
Kani-TTS-2 Testing Script
Compare Kani-TTS-2 with current Coqui XTTS v2 implementation
"""
import time
import wave
from pathlib import Path
import numpy as np
# Banner: rule, title, rule - emitted as three lines.
print("=" * 70, "Kani-TTS-2 Testing Script", "=" * 70, sep="\n")

# Test configuration: phrases of increasing length to synthesize.
TEST_PHRASES = [
    # Short, simple (cache test)
    "Yes, sir. I am at your service.",
    # Medium
    "The weather today is partly cloudy with a high of 72 degrees.",
    # Long
    "I've analyzed the data and found several interesting patterns that warrant further investigation.",
]

# Agent name -> path of the reference voice recording for that agent.
VOICE_FILES = dict(
    jarvis="server/voices/jarvis.mp3",
    sage="server/voices/sage.wav",
)
# Step 1: Check dependencies
# PyTorch is required up front; Kani-TTS-2 is installed on demand if missing.
print("\n[1/6] Checking dependencies...")

try:
    import torch
    print(f"[OK] PyTorch {torch.__version__} (CUDA: {torch.cuda.is_available()})")
except ImportError:
    print("[ERROR] PyTorch not installed")
    # SystemExit instead of exit(): exit() is a helper added by the `site`
    # module for interactive sessions and is absent under `python -S`.
    raise SystemExit(1)

try:
    from kani_tts import KaniTTS, SpeakerEmbedder
    print("[OK] Kani-TTS-2 installed")
except ImportError:
    print("[WARN] Kani-TTS-2 not installed. Installing now...")
    import importlib
    import subprocess
    import sys

    # Run pip through the interpreter executing this script so the packages
    # land in the same environment that will import them (a bare "pip" on
    # PATH may belong to a different Python installation).
    pip_install = [sys.executable, "-m", "pip", "install"]
    try:
        subprocess.run([*pip_install, "kani-tts-2"], check=True)
        subprocess.run([*pip_install, "-U", "transformers==4.56.0"], check=True)
    except subprocess.CalledProcessError as e:
        # Report the failed pip command in the script's own style and stop.
        print(f"[ERROR] Failed to install Kani-TTS-2: {e}")
        raise SystemExit(1)

    # Packages installed while the interpreter is running may be missed by
    # the import system's cached directory listings; drop those caches.
    importlib.invalidate_caches()
    from kani_tts import KaniTTS, SpeakerEmbedder
    print("[OK] Kani-TTS-2 installed successfully")
# Step 2: Check voice files
# Keep only the configured voices whose reference file is actually present.
print("\n[2/6] Checking voice reference files...")
available_voices = {}
for agent, voice_path in VOICE_FILES.items():
    # is_file() rather than exists(): a directory at this path is not a
    # usable voice sample and should be reported as missing.
    if Path(voice_path).is_file():
        print(f"[OK] {agent}: {voice_path}")
        available_voices[agent] = voice_path
    else:
        print(f"[WARN] {agent}: {voice_path} not found")

if not available_voices:
    print("[ERROR] No voice files found. Please add voice samples to server/voices/")
    # SystemExit instead of exit(): exit() is a helper added by the `site`
    # module for interactive sessions and is absent under `python -S`.
    raise SystemExit(1)
# Step 3: Initialize Kani-TTS-2
# Load the TTS model and the speaker embedder, reporting how long it took.
print("\n[3/6] Initializing Kani-TTS-2 model...")
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
init_start = time.perf_counter()
try:
    model = KaniTTS('nineninesix/kani-tts-2-en')
    embedder = SpeakerEmbedder()
except Exception as e:
    # Top-level boundary of a test script: report any load failure and stop.
    print(f"[ERROR] Failed to load model: {e}")
    # SystemExit instead of exit(): exit() is a helper added by the `site`
    # module for interactive sessions and is absent under `python -S`.
    raise SystemExit(1)
else:
    init_time = time.perf_counter() - init_start
    print(f"[OK] Model loaded in {init_time:.2f}s")
# Step 4: Generate speaker embeddings
# Build one embedding per available voice. A failure for one voice is
# reported and skipped so the remaining voices can still be benchmarked.
print("\n[4/6] Generating speaker embeddings...")
speaker_embeddings = {}
for agent, voice_path in available_voices.items():
    try:
        # perf_counter() is the monotonic clock meant for interval timing.
        embed_start = time.perf_counter()
        speaker_emb = embedder.embed_audio_file(voice_path)
        embed_time = time.perf_counter() - embed_start
        speaker_embeddings[agent] = speaker_emb
        print(f"[OK] {agent}: {speaker_emb.shape} in {embed_time:.2f}s")
    except Exception as e:
        print(f"[ERROR] {agent}: {e}")
# Step 5: Run latency benchmarks
# Synthesize every test phrase with every speaker embedding, recording the
# generation time, audio duration and real-time factor (RTF) of each run.
print("\n[5/6] Running latency benchmarks...")
print("-" * 70)
results = []
total_tests = len(TEST_PHRASES)
# Prefer a sample rate reported by the model; fall back to the 22050 Hz the
# script originally assumed.
# NOTE(review): `model.sample_rate` is assumed, not confirmed - verify the
# attribute name against the kani_tts package.
sample_rate = getattr(model, "sample_rate", None) or 22050

for i, text in enumerate(TEST_PHRASES, 1):
    # Only mark the preview as truncated when it actually was.
    preview = text if len(text) <= 50 else f"{text[:50]}..."
    print(f"\n[Test {i}/{total_tests}] \"{preview}\"")
    for agent, speaker_emb in speaker_embeddings.items():
        try:
            # Generate audio (perf_counter: monotonic clock for intervals)
            start = time.perf_counter()
            audio, processed_text = model(
                text,
                speaker_emb=speaker_emb,
                temperature=0.75,
                top_p=0.85
            )
            generation_time = time.perf_counter() - start

            # Calculate metrics
            audio_duration = len(audio) / sample_rate
            if audio_duration <= 0:
                # Surface a clear message instead of a bare ZeroDivisionError.
                raise ValueError("model returned empty audio")
            rtf = generation_time / audio_duration

            # Save output (forward-slash path kept as a plain string because
            # it is printed and stored in the results)
            output_path = f"test_outputs/kani_{agent}_test{i}.wav"
            Path("test_outputs").mkdir(exist_ok=True)
            model.save_audio(audio, output_path)

            print(f" {agent}:")
            print(f" Generation: {generation_time:.2f}s")
            print(f" Audio length: {audio_duration:.2f}s")
            print(f" RTF: {rtf:.2f}")
            print(f" Output: {output_path}")
            results.append({
                "test": i,
                "agent": agent,
                "text_length": len(text),
                "generation_time": generation_time,
                "audio_duration": audio_duration,
                "rtf": rtf,
                "output": output_path
            })
        except Exception as e:
            # Best-effort benchmark: report the failure and continue with
            # the next voice/phrase combination.
            print(f" {agent}: [ERROR] {e}")
# Step 6: Generate report
# Print overall and per-phrase averages of the benchmark results, or an
# error line when no run succeeded.
print("\n[6/6] Performance Summary")
print("=" * 70)
if results:
    avg_generation = np.mean([r["generation_time"] for r in results])
    avg_rtf = np.mean([r["rtf"] for r in results])
    print("\nAverage Metrics:")
    print(f" Generation Time: {avg_generation:.2f}s")
    print(f" RTF: {avg_rtf:.2f}")
    print(" Expected RTF from docs: ~0.2")

    print("\nPer-Test Breakdown:")
    # Iterate the phrase list itself so the report stays in step with
    # TEST_PHRASES if phrases are added or removed.
    for i, phrase in enumerate(TEST_PHRASES, 1):
        test_results = [r for r in results if r["test"] == i]
        if not test_results:
            continue
        test_rtf = np.mean([r["rtf"] for r in test_results])
        test_gen = np.mean([r["generation_time"] for r in test_results])
        # Only mark the preview as truncated when it actually was.
        preview = phrase if len(phrase) <= 30 else f"{phrase[:30]}..."
        print(f" Test {i} ('{preview}')")
        print(f" Avg Generation: {test_gen:.2f}s")
        print(f" Avg RTF: {test_rtf:.2f}")

    print("\nOutput files saved to: test_outputs/")
    print(" Listen to samples and compare quality with current TTS")
    print("\n[OK] Testing complete!")
    print("\nNext steps:")
    print(" 1. Listen to generated audio samples in test_outputs/")
    print(" 2. Compare quality with current Coqui XTTS v2")
    print(" 3. If quality is acceptable and RTF < 0.3, consider integration")
    # NOTE(review): the accompanying commit lists KANI_TTS_EVALUATION.md, not
    # KANI_TTS_INTEGRATION.md - confirm this pointer refers to a real file.
    print(" 4. See KANI_TTS_INTEGRATION.md for implementation guide")
else:
    print("[ERROR] No successful tests - check errors above")

print("=" * 70)