# openclaw-voice/server/tts.py

"""Text-to-Speech using Chatterbox TTS (or alternatives).

GPU-accelerated TTS with emotion control and paralinguistic support.
"""

import asyncio
import re
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple

import numpy as np

from utils.logging import get_logger

logger = get_logger(__name__)


@dataclass
class TTSConfig:
    """Settings consumed by the TTS engine.

    Every field has a default, so ``TTSConfig()`` yields a usable
    configuration; override individual fields by keyword.
    """

    # Directory holding voice reference recordings.
    voice_ref_dir: Path = Path("server", "voices")
    # Device identifier handed to the engine.
    device: str = "cuda"
    # Output sample rate in Hz (24 kHz is common for neural TTS).
    sample_rate: int = 24_000
    # Emotion intensity; intended range is 0.0-2.0.
    emotion_exaggeration: float = 1.0
    # Number of samples per streamed chunk (~200 ms at 24 kHz).
    streaming_chunk_size: int = 4_800
    # Timeout for generation (presumably seconds -- TODO confirm).
    max_generation_time: float = 10.0


@dataclass
class EmotionTag:
    """A single bracketed emotion marker located in a piece of text."""

    # Tag name without brackets, e.g. "laugh", "chuckle", "sigh".
    tag: str
    # Character offset at which the marker was found.
    position: int
    # The marker exactly as it appeared, brackets included.
    text: str


class ChatterboxTTS:
    """
    Chatterbox TTS engine wrapper.

    Strips bracketed emotion/paralinguistic tags from the input text and
    returns float32 audio at ``config.sample_rate``. The real Chatterbox
    model is not wired in yet: ``_load_engine`` always selects the stub, and
    every generation returns silence whose length is estimated from the
    length of the cleaned text.
    """

    # Supported emotion tags
    EMOTION_TAGS = {
        "laugh": "laughter",
        "chuckle": "soft laughter",
        "sigh": "exhalation",
        "gasp": "inhalation",
        "whisper": "quiet speech",
        "excited": "high energy",
        "sad": "low energy",
    }

    # Matches one bracketed word such as "[laugh]". Compiled once so that
    # parse_emotion_tags() does not rebuild it on every call.
    _TAG_PATTERN = re.compile(r"\[(\w+)\]")

    # Smallest file size accepted by validate_voice_reference().
    _MIN_VOICE_REF_BYTES = 100_000

    # Parameters used to size the stub's silent output.
    _STUB_CHARS_PER_SECOND = 15.0
    _STUB_MIN_SECONDS = 1.0
    _STUB_MAX_SECONDS = 10.0

    def __init__(
        self,
        config: TTSConfig,
        voice_references: Dict[str, Path],
    ):
        """
        Initialize Chatterbox TTS engine.

        Args:
            config: TTS configuration
            voice_references: Map of agent_name -> reference audio file
        """
        self.config = config
        self.voice_references = voice_references

        # TTS model (stub - to be replaced with actual Chatterbox)
        self.model = None

        # Load engine
        self._load_engine()

        # Stats
        self.total_generations = 0
        self.total_audio_duration = 0.0
        self.total_processing_time = 0.0

    def _load_engine(self) -> None:
        """Load the TTS engine; currently this always selects the stub."""
        try:
            logger.info(
                f"Loading Chatterbox TTS engine "
                f"(device: {self.config.device})"
            )

            # TODO: Replace with actual Chatterbox TTS initialization
            # from chatterbox import ChatterboxModel
            # self.model = ChatterboxModel(
            #     device=self.config.device,
            #     sample_rate=self.config.sample_rate,
            # )

            logger.warning(
                "Chatterbox TTS not available - using stub implementation"
            )
            self.model = "stub"  # Placeholder

        except Exception as e:
            # Engine loading is best-effort: fall back to the stub rather
            # than failing construction.
            logger.error(f"Failed to load Chatterbox TTS: {e}")
            logger.warning("Using stub implementation")
            self.model = "stub"

    def _is_stub(self) -> bool:
        """Return True when no real model is loaded."""
        return self.model is None or self.model == "stub"

    def _stub_audio(self, cleaned_text: str) -> np.ndarray:
        """Return silence sized from the text length (~15 chars/second, 1-10s)."""
        duration = len(cleaned_text) / self._STUB_CHARS_PER_SECOND
        duration = max(
            self._STUB_MIN_SECONDS, min(duration, self._STUB_MAX_SECONDS)
        )
        return np.zeros(
            int(duration * self.config.sample_rate), dtype=np.float32
        )

    def validate_voice_reference(self, voice_ref_path: Path) -> bool:
        """
        Validate voice reference file.

        The path must exist, be a regular file, and be at least
        ``_MIN_VOICE_REF_BYTES`` bytes long. Audio content is not inspected.

        Args:
            voice_ref_path: Path to voice reference audio

        Returns:
            True if valid, False otherwise
        """
        if not voice_ref_path.exists():
            logger.error(f"Voice reference not found: {voice_ref_path}")
            return False

        # A directory "exists" too, but can never be a usable recording.
        if not voice_ref_path.is_file():
            logger.error(f"Voice reference is not a file: {voice_ref_path}")
            return False

        # Check file size (should be at least 100KB for 10s of audio)
        file_size = voice_ref_path.stat().st_size
        if file_size < self._MIN_VOICE_REF_BYTES:
            logger.warning(
                f"Voice reference may be too short: {voice_ref_path} "
                f"({file_size} bytes)"
            )
            return False

        # TODO: Validate audio format, sample rate, duration
        # import soundfile as sf
        # audio, sr = sf.read(voice_ref_path)
        # if len(audio) / sr < 10.0:
        #     logger.error("Voice reference should be at least 10 seconds")
        #     return False

        logger.info(f"Voice reference validated: {voice_ref_path}")
        return True

    def parse_emotion_tags(self, text: str) -> Tuple[str, List[EmotionTag]]:
        """
        Parse emotion tags from text.

        Every bracketed word (``[word]``) is removed from the returned text,
        whether or not it is a recognized tag; only tags listed in
        ``EMOTION_TAGS`` (matched case-insensitively) are reported. Reported
        positions are offsets into the *original* ``text``, not into the
        cleaned text.

        Args:
            text: Text with emotion tags like "Hello [laugh]"

        Returns:
            Tuple of (cleaned_text, emotion_tags)
        """
        emotion_tags: List[EmotionTag] = []

        # Find all emotion tags
        for match in self._TAG_PATTERN.finditer(text):
            tag = match.group(1).lower()
            if tag in self.EMOTION_TAGS:
                emotion_tags.append(
                    EmotionTag(
                        tag=tag,
                        position=match.start(),
                        text=match.group(0),
                    )
                )

        # Remove tags, then collapse the whitespace left behind.
        without_tags = self._TAG_PATTERN.sub("", text)
        cleaned_text = " ".join(without_tags.split())

        return cleaned_text, emotion_tags

    def generate(
        self,
        text: str,
        voice_ref_path: Path,
        emotion_exaggeration: Optional[float] = None,
    ) -> np.ndarray:
        """
        Generate speech from text.

        Until the real model is integrated this returns silence; the voice
        reference and emotion settings are accepted but not yet used.
        Cumulative generation stats are updated on every call.

        Args:
            text: Text to synthesize
            voice_ref_path: Path to voice reference audio
            emotion_exaggeration: Emotion control (0.0-2.0, None = use default)

        Returns:
            Audio array (float32, sample_rate from config)
        """
        # perf_counter is monotonic, so the measured interval cannot be
        # skewed by wall-clock adjustments.
        start_time = time.perf_counter()

        # Parse emotion tags
        cleaned_text, emotion_tags = self.parse_emotion_tags(text)

        if self._is_stub():
            logger.warning("Using stub TTS - returning silence")
        else:
            logger.info(
                f"Generating TTS for: '{cleaned_text[:50]}...' "
                f"({len(emotion_tags)} emotion tags)"
            )

            # TODO: Replace with actual Chatterbox TTS generation
            # audio = self.model.generate(
            #     text=cleaned_text,
            #     voice_ref=voice_ref_path,
            #     emotion_tags=emotion_tags,
            #     # Compare against None so an explicit 0.0 is honored.
            #     emotion_exaggeration=(
            #         self.config.emotion_exaggeration
            #         if emotion_exaggeration is None
            #         else emotion_exaggeration
            #     ),
            # )

        # Stub: generate silence (both paths, until the model is wired in)
        audio = self._stub_audio(cleaned_text)

        # Update stats
        processing_time = time.perf_counter() - start_time
        duration = len(audio) / self.config.sample_rate
        self.total_generations += 1
        self.total_audio_duration += duration
        self.total_processing_time += processing_time

        # Guard the ratio so empty audio cannot raise ZeroDivisionError.
        rtf = processing_time / duration if duration > 0 else 0.0
        logger.info(
            f"Generated {duration:.2f}s audio in {processing_time:.2f}s "
            f"(RTF: {rtf:.2f})"
        )

        return audio

    async def generate_async(
        self,
        text: str,
        voice_ref_path: Path,
        emotion_exaggeration: Optional[float] = None,
    ) -> np.ndarray:
        """
        Async wrapper for generate().

        Runs the blocking ``generate`` call in the event loop's default
        executor so the loop stays responsive.

        NOTE(review): ``config.max_generation_time`` is not applied here.

        Args:
            text: Text to synthesize
            voice_ref_path: Voice reference path
            emotion_exaggeration: Emotion control

        Returns:
            Audio array
        """
        # Inside a coroutine a loop is always running; get_running_loop()
        # is the recommended accessor over get_event_loop().
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None,
            self.generate,
            text,
            voice_ref_path,
            emotion_exaggeration,
        )

    async def generate_streaming(
        self,
        text: str,
        voice_ref_path: Path,
        emotion_exaggeration: Optional[float] = None,
    ) -> List[np.ndarray]:
        """
        Generate speech in streaming chunks.

        Args:
            text: Text to synthesize
            voice_ref_path: Voice reference path
            emotion_exaggeration: Emotion control

        Returns:
            List of audio chunks, each at most
            ``config.streaming_chunk_size`` samples long
        """
        # TODO: Implement actual streaming generation
        # For now, generate full audio and split into chunks
        full_audio = await self.generate_async(
            text, voice_ref_path, emotion_exaggeration
        )

        # Split into chunks; the final chunk may be shorter than the rest.
        chunk_size = self.config.streaming_chunk_size
        chunks = [
            full_audio[start : start + chunk_size]
            for start in range(0, len(full_audio), chunk_size)
        ]

        logger.debug(f"Split audio into {len(chunks)} streaming chunks")
        return chunks

    def get_stats(self) -> dict:
        """
        Get TTS statistics.

        Returns:
            Dictionary with engine info, cumulative totals, per-generation
            averages, and the real-time factor (0.0 when nothing has been
            generated yet)
        """
        generations = self.total_generations

        avg_duration = (
            self.total_audio_duration / generations if generations > 0 else 0.0
        )
        avg_processing = (
            self.total_processing_time / generations
            if generations > 0
            else 0.0
        )

        # Real-time factor
        rtf = avg_processing / avg_duration if avg_duration > 0 else 0.0

        return {
            # Report the stub only while the stub is actually in use.
            "engine": (
                "Chatterbox TTS (stub)" if self._is_stub() else "Chatterbox TTS"
            ),
            "device": self.config.device,
            "sample_rate": self.config.sample_rate,
            "total_generations": self.total_generations,
            "total_audio_duration": self.total_audio_duration,
            "total_processing_time": self.total_processing_time,
            "avg_audio_duration": avg_duration,
            "avg_processing_time": avg_processing,
            "real_time_factor": rtf,
        }


class TTSSynthesizer:
    """
    Pipeline TTS synthesizer.

    Resolves an agent name to its voice reference, delegates generation to
    the engine, and converts any failure into a ``None`` result while
    counting successes and failures.
    """

    def __init__(
        self,
        engine: ChatterboxTTS,
        voice_map: Dict[str, Path],
    ):
        """
        Initialize TTS synthesizer.

        Invalid voice references are logged as warnings but do not prevent
        construction.

        Args:
            engine: TTS engine instance
            voice_map: Map of agent_name -> voice reference path
        """
        self.engine = engine
        self.voice_map = voice_map

        # Validate voice references
        for agent, ref_path in voice_map.items():
            if not self.engine.validate_voice_reference(ref_path):
                logger.warning(
                    f"Invalid voice reference for {agent}: {ref_path}"
                )

        # Stats
        self.total_syntheses = 0
        self.total_failures = 0

    def _resolve_voice(self, agent: str) -> Optional[Path]:
        """Return the voice reference for ``agent`` (case-insensitive), or None."""
        agent_lower = agent.lower()

        # Fast path: key already stored in lower case.
        if agent_lower in self.voice_map:
            return self.voice_map[agent_lower]

        # Keys are stored exactly as supplied by the caller, so a key such
        # as "Alice" would never match the lowered name above. Fall back to
        # a case-insensitive scan.
        for name, ref_path in self.voice_map.items():
            if name.lower() == agent_lower:
                return ref_path

        return None

    async def synthesize(
        self,
        agent: str,
        text: str,
        emotion_exaggeration: Optional[float] = None,
    ) -> Optional[np.ndarray]:
        """
        Synthesize speech for an agent.

        Args:
            agent: Agent name (matched case-insensitively)
            text: Text to synthesize
            emotion_exaggeration: Emotion control

        Returns:
            Audio array if successful, None on error
        """
        try:
            # Get voice reference
            voice_ref = self._resolve_voice(agent)
            if voice_ref is None:
                logger.error(f"No voice reference for agent: {agent}")
                self.total_failures += 1
                return None

            # Generate audio
            audio = await self.engine.generate_async(
                text=text,
                voice_ref_path=voice_ref,
                emotion_exaggeration=emotion_exaggeration,
            )

            self.total_syntheses += 1

            logger.info(
                f"Synthesized {len(audio) / self.engine.config.sample_rate:.2f}s "
                f"for {agent}: '{text[:50]}...'"
            )

            return audio

        except Exception as e:
            # Pipeline boundary: report the failure instead of propagating.
            logger.error(f"TTS synthesis failed for {agent}: {e}")
            self.total_failures += 1
            return None

    async def synthesize_streaming(
        self,
        agent: str,
        text: str,
        emotion_exaggeration: Optional[float] = None,
    ) -> Optional[List[np.ndarray]]:
        """
        Synthesize speech in streaming chunks.

        Args:
            agent: Agent name (matched case-insensitively)
            text: Text to synthesize
            emotion_exaggeration: Emotion control

        Returns:
            List of audio chunks if successful, None on error
        """
        try:
            voice_ref = self._resolve_voice(agent)
            if voice_ref is None:
                logger.error(f"No voice reference for agent: {agent}")
                self.total_failures += 1
                return None

            # Generate streaming chunks
            chunks = await self.engine.generate_streaming(
                text=text,
                voice_ref_path=voice_ref,
                emotion_exaggeration=emotion_exaggeration,
            )

            self.total_syntheses += 1

            return chunks

        except Exception as e:
            # Pipeline boundary: report the failure instead of propagating.
            logger.error(f"Streaming TTS failed for {agent}: {e}")
            self.total_failures += 1
            return None

    def get_stats(self) -> dict:
        """
        Get synthesizer statistics.

        Returns:
            The engine's stats plus ``total_syntheses``, ``total_failures``
            and ``success_rate`` (0.0 when nothing has been attempted)
        """
        engine_stats = self.engine.get_stats()
        attempts = self.total_syntheses + self.total_failures

        return {
            **engine_stats,
            "total_syntheses": self.total_syntheses,
            "total_failures": self.total_failures,
            "success_rate": (
                self.total_syntheses / attempts if attempts > 0 else 0.0
            ),
        }


# Convenience function
async def create_tts_synthesizer(
    voice_refs: Dict[str, str],
    device: str = "cuda",
    sample_rate: int = 24000,
) -> TTSSynthesizer:
    """
    Build a ready-to-use synthesizer from plain string paths.

    All configuration fields other than ``device`` and ``sample_rate`` keep
    their ``TTSConfig`` defaults.

    Args:
        voice_refs: Map of agent_name -> voice reference file path (string)
        device: Device (cuda/cpu)
        sample_rate: Audio sample rate

    Returns:
        TTSSynthesizer instance
    """
    # Turn every string path into a Path; the engine and the synthesizer
    # are handed the same resulting mapping.
    resolved_refs: Dict[str, Path] = {}
    for agent_name, ref in voice_refs.items():
        resolved_refs[agent_name] = Path(ref)

    tts_engine = ChatterboxTTS(
        config=TTSConfig(device=device, sample_rate=sample_rate),
        voice_references=resolved_refs,
    )

    return TTSSynthesizer(engine=tts_engine, voice_map=resolved_refs)