diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 933ecadaf..c40173736 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -1556,7 +1556,7 @@ DEFAULT_CONFIG = { # Each provider supports an optional `max_text_length:` override for the # per-request input-character cap. Omit it to use the provider's documented # limit (OpenAI 4096, xAI 15000, MiniMax 10000, ElevenLabs 5k-40k model-aware, - # Gemini 5000, Edge 5000, Mistral 4000, NeuTTS/KittenTTS 2000). + # Gemini 32000, Edge 5000, Mistral 4000, NeuTTS/KittenTTS 2000). "tts": { "provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "gemini" | "neutts" (local) | "kittentts" (local) | "piper" (local) "edge": { @@ -1572,6 +1572,15 @@ DEFAULT_CONFIG = { "voice": "alloy", # Voices: alloy, echo, fable, onyx, nova, shimmer }, + "gemini": { + "model": "gemini-2.5-flash-preview-tts", + "voice": "Kore", + # Optional local Markdown/text file with Gemini TTS performance + # direction. It may include AUDIO PROFILE, SCENE, DIRECTOR'S NOTES, + # SAMPLE CONTEXT, and either a `{transcript}` placeholder or no + # transcript section; Hermes appends the live transcript when absent. 
+ "persona_prompt_file": "", + }, "xai": { "voice_id": "eve", # or custom voice ID — see https://docs.x.ai/developers/model-capabilities/audio/custom-voices "language": "en", diff --git a/tests/tools/test_tts_gemini.py b/tests/tools/test_tts_gemini.py index 00a028674..6a52a48c0 100644 --- a/tests/tools/test_tts_gemini.py +++ b/tests/tools/test_tts_gemini.py @@ -255,6 +255,63 @@ class TestGenerateGeminiTts: assert mock_post.call_args[0][0].startswith("https://custom-gemini.example.com/v1beta/") + def test_persona_prompt_file_appends_labeled_transcript( + self, tmp_path, monkeypatch, mock_gemini_response + ): + from tools.tts_tool import _generate_gemini_tts + + persona_file = tmp_path / "voice-persona.md" + persona_file.write_text( + "# AUDIO PROFILE: Dry Butler\n\n### DIRECTOR'S NOTES\nStyle: Understated.", + encoding="utf-8", + ) + config = {"gemini": {"persona_prompt_file": str(persona_file)}} + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + + with patch("requests.post", return_value=mock_gemini_response) as mock_post: + _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), config) + + prompt_text = mock_post.call_args[1]["json"]["contents"][0]["parts"][0]["text"] + assert "Synthesize speech from the TRANSCRIPT only" in prompt_text + assert "# AUDIO PROFILE: Dry Butler" in prompt_text + assert "### DIRECTOR'S NOTES\nStyle: Understated." 
in prompt_text + assert "#### TRANSCRIPT\nHi" in prompt_text + + def test_persona_prompt_file_supports_transcript_placeholder( + self, tmp_path, monkeypatch, mock_gemini_response + ): + from tools.tts_tool import _generate_gemini_tts + + persona_file = tmp_path / "voice-persona.md" + persona_file.write_text( + "### DIRECTOR'S NOTES\nPacing: Slow.\n\n#### TRANSCRIPT\n{{ transcript }}", + encoding="utf-8", + ) + config = {"gemini": {"persona_prompt_file": str(persona_file)}} + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + + with patch("requests.post", return_value=mock_gemini_response) as mock_post: + _generate_gemini_tts("Read this.", str(tmp_path / "test.wav"), config) + + prompt_text = mock_post.call_args[1]["json"]["contents"][0]["parts"][0]["text"] + assert "{{ transcript }}" not in prompt_text + assert "#### TRANSCRIPT\nRead this." in prompt_text + + def test_missing_persona_prompt_file_warns_and_continues( + self, tmp_path, monkeypatch, caplog, mock_gemini_response + ): + from tools.tts_tool import _generate_gemini_tts + + config = {"gemini": {"persona_prompt_file": str(tmp_path / "missing.md")}} + monkeypatch.setenv("GEMINI_API_KEY", "test-key") + + with patch("requests.post", return_value=mock_gemini_response) as mock_post: + _generate_gemini_tts("Hi", str(tmp_path / "test.wav"), config) + + prompt_text = mock_post.call_args[1]["json"]["contents"][0]["parts"][0]["text"] + assert prompt_text == "Hi" + assert "persona prompt file unavailable" in caplog.text + class TestGeminiInCheckRequirements: def test_gemini_api_key_satisfies_requirements(self, monkeypatch): diff --git a/tests/tools/test_tts_max_text_length.py b/tests/tools/test_tts_max_text_length.py index 49ae5ca2f..2ea9348bc 100644 --- a/tests/tools/test_tts_max_text_length.py +++ b/tests/tools/test_tts_max_text_length.py @@ -31,8 +31,8 @@ class TestResolveMaxTextLength: def test_mistral_default(self): assert _resolve_max_text_length("mistral", {}) == PROVIDER_MAX_TEXT_LENGTH["mistral"] - def 
test_gemini_default(self): - assert _resolve_max_text_length("gemini", {}) == PROVIDER_MAX_TEXT_LENGTH["gemini"] + def test_gemini_default_is_32000(self): + assert _resolve_max_text_length("gemini", {}) == 32000 def test_unknown_provider_falls_back(self): assert _resolve_max_text_length("does-not-exist", {}) == FALLBACK_MAX_TEXT_LENGTH diff --git a/tools/tts_tool.py b/tools/tts_tool.py index 4d43eef21..8b223da60 100644 --- a/tools/tts_tool.py +++ b/tools/tts_tool.py @@ -204,8 +204,8 @@ DEFAULT_OUTPUT_DIR = _get_default_output_dir() # --------------------------------------------------------------------------- # Per-provider input-character limits (from official provider docs). # A single global cap was wrong: OpenAI is 4096, xAI is 15k, MiniMax is 10k, -# ElevenLabs is model-dependent (5k / 10k / 30k / 40k), Gemini caps at ~8k -# input tokens. Users can override any of these via +# ElevenLabs is model-dependent (5k / 10k / 30k / 40k), Gemini has a 32k-token +# context window. Users can override any of these via # ``tts..max_text_length`` in config.yaml. 
# ---------------------------------------------------------------------------
 PROVIDER_MAX_TEXT_LENGTH: Dict[str, int] = {
@@ -214,7 +214,7 @@ PROVIDER_MAX_TEXT_LENGTH: Dict[str, int] = {
     "xai": 15000,  # https://docs.x.ai/developers/model-capabilities/audio/text-to-speech
     "minimax": 10000,  # https://platform.minimax.io/docs/api-reference/speech-t2a-http (sync)
     "mistral": 4000,  # conservative; no published per-request cap
-    "gemini": 5000,  # Gemini TTS caps at ~8k input tokens / ~655s audio
+    "gemini": 32000,  # Gemini TTS has a 32k-token context window; char cap is conservative
     "elevenlabs": 10000,  # fallback when model-aware lookup can't resolve (multilingual_v2)
     "neutts": 2000,  # local model, quality falls off on long text
     "kittentts": 2000,  # local 25MB model
@@ -1394,6 +1394,65 @@ def _wrap_pcm_as_wav(
     return riff_header + fmt_chunk + data_chunk_header + pcm_bytes
 
 
+def _resolve_gemini_persona_prompt_path(gemini_config: Dict[str, Any]) -> Optional[Path]:
+    """Return the configured persona prompt path (env vars and ``~`` expanded), or None when unset."""
+    raw = gemini_config.get("persona_prompt_file")
+    if not isinstance(raw, str) or not raw.strip():
+        return None
+
+    expanded = os.path.expandvars(raw.strip())
+    path = Path(expanded).expanduser()
+    if not path.is_absolute():
+        try:
+            from hermes_constants import get_hermes_home
+            path = get_hermes_home() / path
+        except Exception:
+            path = Path.cwd() / path  # best-effort fallback: anchor relative paths at the CWD
+    return path
+
+
+def _read_gemini_persona_prompt(gemini_config: Dict[str, Any]) -> str:
+    """Read the persona prompt file; log a warning and return "" when it is missing or unreadable."""
+    path = _resolve_gemini_persona_prompt_path(gemini_config)
+    if path is None:
+        return ""
+    try:
+        return path.read_text(encoding="utf-8").strip()
+    except (OSError, UnicodeDecodeError) as exc:
+        logger.warning(
+            "Gemini TTS persona prompt file unavailable at %s: %s",
+            path,
+            exc,
+        )
+        return ""
+
+
+def _compose_gemini_tts_prompt(text: str, gemini_config: Dict[str, Any]) -> str:
+    """Build the Gemini prompt: persona direction plus the live transcript, inserted verbatim."""
+    transcript = text.strip()
+    persona_prompt = _read_gemini_persona_prompt(gemini_config)
+    if not persona_prompt:
+        return transcript
+
+    preamble = (
+        "Synthesize speech from the TRANSCRIPT only. Treat AUDIO PROFILE, "
+        "SCENE, DIRECTOR'S NOTES, and SAMPLE CONTEXT as performance direction; "
+        "do not speak those sections aloud."
+    )
+
+    placeholder_patterns = (
+        re.compile(r"\{\{\s*transcript\s*\}\}", flags=re.IGNORECASE),
+        re.compile(r"\{\s*transcript\s*\}", flags=re.IGNORECASE),
+    )
+    prompt = persona_prompt
+    for pattern in placeholder_patterns:
+        if pattern.search(prompt):
+            prompt = pattern.sub(lambda _match: transcript, prompt)  # callable repl: no backslash/group-escape processing of user text
+            return f"{preamble}\n\n{prompt}".strip()
+
+    return f"{preamble}\n\n{persona_prompt}\n\n#### TRANSCRIPT\n{transcript}".strip()
+
+
 def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
     """Generate audio using Google Gemini TTS.
 
@@ -1419,7 +1478,8 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]
             "GEMINI_API_KEY not set.
Get one at https://aistudio.google.com/app/apikey" ) - gemini_config = tts_config.get("gemini", {}) + raw_gemini_config = tts_config.get("gemini", {}) + gemini_config = raw_gemini_config if isinstance(raw_gemini_config, dict) else {} model = str(gemini_config.get("model", DEFAULT_GEMINI_TTS_MODEL)).strip() or DEFAULT_GEMINI_TTS_MODEL voice = str(gemini_config.get("voice", DEFAULT_GEMINI_TTS_VOICE)).strip() or DEFAULT_GEMINI_TTS_VOICE base_url = str( @@ -1427,9 +1487,17 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any] or get_env_value("GEMINI_BASE_URL") or DEFAULT_GEMINI_TTS_BASE_URL ).strip().rstrip("/") + prompt_text = _compose_gemini_tts_prompt(text, gemini_config) + max_len = _resolve_max_text_length("gemini", tts_config) + if len(prompt_text) > max_len: + logger.warning( + "Gemini TTS composed prompt too long (%d chars), truncating to %d", + len(prompt_text), max_len, + ) + prompt_text = prompt_text[:max_len] payload: Dict[str, Any] = { - "contents": [{"parts": [{"text": text}]}], + "contents": [{"parts": [{"text": prompt_text}]}], "generationConfig": { "responseModalities": ["AUDIO"], "speechConfig": { diff --git a/website/docs/user-guide/configuration.md b/website/docs/user-guide/configuration.md index 1d6881029..bf91953f6 100644 --- a/website/docs/user-guide/configuration.md +++ b/website/docs/user-guide/configuration.md @@ -1199,6 +1199,7 @@ tts: gemini: model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, etc. 
+ persona_prompt_file: "" # Optional Markdown/text file with Gemini voice direction xai: voice_id: "eve" # xAI TTS voice language: "en" # ISO 639-1 diff --git a/website/docs/user-guide/features/tts.md b/website/docs/user-guide/features/tts.md index 96c33d745..d67efc3e2 100644 --- a/website/docs/user-guide/features/tts.md +++ b/website/docs/user-guide/features/tts.md @@ -68,6 +68,7 @@ tts: gemini: model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, Gacrux, etc. + persona_prompt_file: "" # Optional Markdown/text file with Gemini voice direction xai: voice_id: "eve" # or a custom voice ID — see docs below language: "en" # ISO 639-1 code @@ -97,6 +98,20 @@ tts: **Speed control**: The global `tts.speed` value applies to all providers by default. Each provider can override it with its own `speed` setting (e.g., `tts.openai.speed: 1.5`). Provider-specific speed takes precedence over the global value. Default is `1.0` (normal speed). +### Gemini Persona Prompts + +Gemini TTS can follow natural-language performance direction. Set `tts.gemini.persona_prompt_file` to a local Markdown or text file that describes the voice persona. The file can include Gemini-style sections such as `AUDIO PROFILE`, `SCENE`, `DIRECTOR'S NOTES`, `SAMPLE CONTEXT`, and `TRANSCRIPT`. + +If the file contains `{transcript}` or `{{ transcript }}`, Hermes replaces that placeholder with the live TTS text. Otherwise, Hermes appends a labeled `TRANSCRIPT` section automatically. The persona prompt stays local and is not shown in the chat reply. + +```yaml +tts: + provider: gemini + gemini: + voice: Algieba + persona_prompt_file: ~/.hermes/tts/butler-voice.md +``` + ### Input length limits @@ -109,7 +124,7 @@ Each provider has a documented per-request input-character cap. 
Hermes truncates | xAI | 15000 | | MiniMax | 10000 | | Mistral | 4000 | -| Google Gemini | 5000 | +| Google Gemini | 32000 | | ElevenLabs | Model-aware (see below) | | NeuTTS | 2000 | | KittenTTS | 2000 |