feat(tts): add Gemini persona prompt file

This commit is contained in:
Barron Roth 2026-06-08 20:14:33 -07:00 committed by Teknium
parent af3c8b80b5
commit 5718811de0
6 changed files with 159 additions and 9 deletions

View file

@ -1556,7 +1556,7 @@ DEFAULT_CONFIG = {
# Each provider supports an optional `max_text_length:` override for the
# per-request input-character cap. Omit it to use the provider's documented
# limit (OpenAI 4096, xAI 15000, MiniMax 10000, ElevenLabs 5k-40k model-aware,
# Gemini 5000, Edge 5000, Mistral 4000, NeuTTS/KittenTTS 2000).
# Gemini 32000, Edge 5000, Mistral 4000, NeuTTS/KittenTTS 2000).
"tts": {
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "gemini" | "neutts" (local) | "kittentts" (local) | "piper" (local)
"edge": {
@ -1572,6 +1572,15 @@ DEFAULT_CONFIG = {
"voice": "alloy",
# Voices: alloy, echo, fable, onyx, nova, shimmer
},
"gemini": {
"model": "gemini-2.5-flash-preview-tts",
"voice": "Kore",
# Optional local Markdown/text file with Gemini TTS performance
# direction. It may include AUDIO PROFILE, SCENE, DIRECTOR'S NOTES, and
# SAMPLE CONTEXT sections. Put a `{transcript}` (or `{{ transcript }}`)
# placeholder where the live text should go; if the file has no
# placeholder, Hermes appends a labeled TRANSCRIPT section itself.
"persona_prompt_file": "",
},
"xai": {
"voice_id": "eve", # or custom voice ID — see https://docs.x.ai/developers/model-capabilities/audio/custom-voices
"language": "en",

View file

@ -255,6 +255,63 @@ class TestGenerateGeminiTts:
assert mock_post.call_args[0][0].startswith("https://custom-gemini.example.com/v1beta/")
def test_persona_prompt_file_appends_labeled_transcript(
    self, tmp_path, monkeypatch, mock_gemini_response
):
    """A persona file without a placeholder gets a labeled transcript appended."""
    from tools.tts_tool import _generate_gemini_tts

    monkeypatch.setenv("GEMINI_API_KEY", "test-key")
    persona_path = tmp_path / "voice-persona.md"
    persona_path.write_text(
        "# AUDIO PROFILE: Dry Butler\n\n### DIRECTOR'S NOTES\nStyle: Understated.",
        encoding="utf-8",
    )
    tts_config = {"gemini": {"persona_prompt_file": str(persona_path)}}
    wav_path = str(tmp_path / "test.wav")

    with patch("requests.post", return_value=mock_gemini_response) as post_mock:
        _generate_gemini_tts("Hi", wav_path, tts_config)

    # The composed prompt is the text part of the JSON body sent to the API.
    request_json = post_mock.call_args[1]["json"]
    sent_prompt = request_json["contents"][0]["parts"][0]["text"]
    expected_fragments = (
        "Synthesize speech from the TRANSCRIPT only",
        "# AUDIO PROFILE: Dry Butler",
        "### DIRECTOR'S NOTES\nStyle: Understated.",
        "#### TRANSCRIPT\nHi",
    )
    for fragment in expected_fragments:
        assert fragment in sent_prompt
def test_persona_prompt_file_supports_transcript_placeholder(
    self, tmp_path, monkeypatch, mock_gemini_response
):
    """A ``{{ transcript }}`` placeholder in the persona file is replaced by the live text."""
    from tools.tts_tool import _generate_gemini_tts

    monkeypatch.setenv("GEMINI_API_KEY", "test-key")
    persona_path = tmp_path / "voice-persona.md"
    persona_path.write_text(
        "### DIRECTOR'S NOTES\nPacing: Slow.\n\n#### TRANSCRIPT\n{{ transcript }}",
        encoding="utf-8",
    )
    tts_config = {"gemini": {"persona_prompt_file": str(persona_path)}}
    wav_path = str(tmp_path / "test.wav")

    with patch("requests.post", return_value=mock_gemini_response) as post_mock:
        _generate_gemini_tts("Read this.", wav_path, tts_config)

    # The composed prompt is the text part of the JSON body sent to the API.
    request_json = post_mock.call_args[1]["json"]
    sent_prompt = request_json["contents"][0]["parts"][0]["text"]
    assert "{{ transcript }}" not in sent_prompt
    assert "#### TRANSCRIPT\nRead this." in sent_prompt
def test_missing_persona_prompt_file_warns_and_continues(
    self, tmp_path, monkeypatch, caplog, mock_gemini_response
):
    """A nonexistent persona file logs a warning and the bare text is sent."""
    from tools.tts_tool import _generate_gemini_tts

    monkeypatch.setenv("GEMINI_API_KEY", "test-key")
    absent_path = tmp_path / "missing.md"
    tts_config = {"gemini": {"persona_prompt_file": str(absent_path)}}
    wav_path = str(tmp_path / "test.wav")

    with patch("requests.post", return_value=mock_gemini_response) as post_mock:
        _generate_gemini_tts("Hi", wav_path, tts_config)

    request_json = post_mock.call_args[1]["json"]
    sent_prompt = request_json["contents"][0]["parts"][0]["text"]
    assert sent_prompt == "Hi"
    assert "persona prompt file unavailable" in caplog.text
class TestGeminiInCheckRequirements:
def test_gemini_api_key_satisfies_requirements(self, monkeypatch):

View file

@ -31,8 +31,8 @@ class TestResolveMaxTextLength:
def test_mistral_default(self):
    """With no override, Mistral resolves to its entry in the provider table."""
    expected = PROVIDER_MAX_TEXT_LENGTH["mistral"]
    resolved = _resolve_max_text_length("mistral", {})
    assert resolved == expected
def test_gemini_default(self):
assert _resolve_max_text_length("gemini", {}) == PROVIDER_MAX_TEXT_LENGTH["gemini"]
def test_gemini_default_is_32000(self):
    """With no override, Gemini resolves to a 32000-character cap."""
    resolved = _resolve_max_text_length("gemini", {})
    assert resolved == 32000
def test_unknown_provider_falls_back(self):
    """An unrecognized provider name resolves to the fallback cap."""
    resolved = _resolve_max_text_length("does-not-exist", {})
    assert resolved == FALLBACK_MAX_TEXT_LENGTH

View file

@ -204,8 +204,8 @@ DEFAULT_OUTPUT_DIR = _get_default_output_dir()
# ---------------------------------------------------------------------------
# Per-provider input-character limits (from official provider docs).
# A single global cap was wrong: OpenAI is 4096, xAI is 15k, MiniMax is 10k,
# ElevenLabs is model-dependent (5k / 10k / 30k / 40k), Gemini caps at ~8k
# input tokens. Users can override any of these via
# ElevenLabs is model-dependent (5k / 10k / 30k / 40k), Gemini has a 32k-token
# context window. Users can override any of these via
# ``tts.<provider>.max_text_length`` in config.yaml.
# ---------------------------------------------------------------------------
PROVIDER_MAX_TEXT_LENGTH: Dict[str, int] = {
@ -214,7 +214,7 @@ PROVIDER_MAX_TEXT_LENGTH: Dict[str, int] = {
"xai": 15000, # https://docs.x.ai/developers/model-capabilities/audio/text-to-speech
"minimax": 10000, # https://platform.minimax.io/docs/api-reference/speech-t2a-http (sync)
"mistral": 4000, # conservative; no published per-request cap
"gemini": 5000, # Gemini TTS caps at ~8k input tokens / ~655s audio
"gemini": 32000, # Gemini TTS has a 32k-token context window; char cap is conservative
"elevenlabs": 10000, # fallback when model-aware lookup can't resolve (multilingual_v2)
"neutts": 2000, # local model, quality falls off on long text
"kittentts": 2000, # local 25MB model
@ -1394,6 +1394,65 @@ def _wrap_pcm_as_wav(
return riff_header + fmt_chunk + data_chunk_header + pcm_bytes
def _resolve_gemini_persona_prompt_path(gemini_config: Dict[str, Any]) -> Optional[Path]:
"""Return the configured persona prompt file path, if any."""
raw = gemini_config.get("persona_prompt_file")
if not isinstance(raw, str) or not raw.strip():
return None
expanded = os.path.expandvars(raw.strip())
path = Path(expanded).expanduser()
if not path.is_absolute():
try:
from hermes_constants import get_hermes_home
path = get_hermes_home() / path
except Exception:
path = Path.cwd() / path
return path
def _read_gemini_persona_prompt(gemini_config: Dict[str, Any]) -> str:
    """Load the persona prompt text, tolerating a bad or missing file.

    Args:
        gemini_config: Mapping passed on to
            ``_resolve_gemini_persona_prompt_path`` to find the file.

    Returns:
        The file's UTF-8 contents with surrounding whitespace stripped, or an
        empty string when no path is configured or the file cannot be read or
        decoded (a warning is logged in that case).
    """
    prompt_path = _resolve_gemini_persona_prompt_path(gemini_config)
    if prompt_path is None:
        return ""

    try:
        contents = prompt_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        # Fail soft: a misconfigured persona file must not break speech output.
        logger.warning(
            "Gemini TTS persona prompt file unavailable at %s: %s",
            prompt_path,
            exc,
        )
        return ""
    return contents.strip()
def _compose_gemini_tts_prompt(text: str, gemini_config: Dict[str, Any]) -> str:
    """Build the Gemini prompt from persona direction plus the live transcript.

    Args:
        text: Live text to be spoken; surrounding whitespace is stripped.
        gemini_config: Mapping passed on to ``_read_gemini_persona_prompt``
            to load the optional persona prompt.

    Returns:
        The bare transcript when no persona prompt is available. Otherwise a
        fixed preamble followed by the persona prompt, with the transcript
        either substituted for a ``{{ transcript }}`` / ``{transcript}``
        placeholder (case-insensitive, inner whitespace optional) or appended
        under a ``#### TRANSCRIPT`` heading.
    """
    transcript = text.strip()
    persona_prompt = _read_gemini_persona_prompt(gemini_config)
    if not persona_prompt:
        return transcript

    preamble = (
        "Synthesize speech from the TRANSCRIPT only. Treat AUDIO PROFILE, "
        "SCENE, DIRECTOR'S NOTES, and SAMPLE CONTEXT as performance direction; "
        "do not speak those sections aloud."
    )
    # The double-brace form is tried first so `{{ transcript }}` is consumed
    # whole instead of having its inner `{ transcript }` matched on its own.
    placeholder_patterns = (
        re.compile(r"\{\{\s*transcript\s*\}\}", flags=re.IGNORECASE),
        re.compile(r"\{\s*transcript\s*\}", flags=re.IGNORECASE),
    )
    for pattern in placeholder_patterns:
        # Use a callable replacement so the transcript is inserted verbatim.
        # A replacement *string* is parsed as a template: backslashes in the
        # spoken text (e.g. "C:\dir" or "\1") would raise re.error or be
        # rewritten as escapes / group references.
        prompt, replaced = pattern.subn(lambda _match: transcript, persona_prompt)
        if replaced:
            return f"{preamble}\n\n{prompt}".strip()

    return f"{preamble}\n\n{persona_prompt}\n\n#### TRANSCRIPT\n{transcript}".strip()
def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
"""Generate audio using Google Gemini TTS.
@ -1419,7 +1478,8 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]
"GEMINI_API_KEY not set. Get one at https://aistudio.google.com/app/apikey"
)
gemini_config = tts_config.get("gemini", {})
raw_gemini_config = tts_config.get("gemini", {})
gemini_config = raw_gemini_config if isinstance(raw_gemini_config, dict) else {}
model = str(gemini_config.get("model", DEFAULT_GEMINI_TTS_MODEL)).strip() or DEFAULT_GEMINI_TTS_MODEL
voice = str(gemini_config.get("voice", DEFAULT_GEMINI_TTS_VOICE)).strip() or DEFAULT_GEMINI_TTS_VOICE
base_url = str(
@ -1427,9 +1487,17 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]
or get_env_value("GEMINI_BASE_URL")
or DEFAULT_GEMINI_TTS_BASE_URL
).strip().rstrip("/")
prompt_text = _compose_gemini_tts_prompt(text, gemini_config)
max_len = _resolve_max_text_length("gemini", tts_config)
if len(prompt_text) > max_len:
logger.warning(
"Gemini TTS composed prompt too long (%d chars), truncating to %d",
len(prompt_text), max_len,
)
prompt_text = prompt_text[:max_len]
payload: Dict[str, Any] = {
"contents": [{"parts": [{"text": text}]}],
"contents": [{"parts": [{"text": prompt_text}]}],
"generationConfig": {
"responseModalities": ["AUDIO"],
"speechConfig": {

View file

@ -1199,6 +1199,7 @@ tts:
gemini:
model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts
voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, etc.
persona_prompt_file: "" # Optional Markdown/text file with Gemini voice direction
xai:
voice_id: "eve" # xAI TTS voice
language: "en" # ISO 639-1

View file

@ -68,6 +68,7 @@ tts:
gemini:
model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts
voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, Gacrux, etc.
persona_prompt_file: "" # Optional Markdown/text file with Gemini voice direction
xai:
voice_id: "eve" # or a custom voice ID — see docs below
language: "en" # ISO 639-1 code
@ -97,6 +98,20 @@ tts:
**Speed control**: The global `tts.speed` value applies to all providers by default. Each provider can override it with its own `speed` setting (e.g., `tts.openai.speed: 1.5`). Provider-specific speed takes precedence over the global value. Default is `1.0` (normal speed).
### Gemini Persona Prompts
Gemini TTS can follow natural-language performance direction. Set `tts.gemini.persona_prompt_file` to a local Markdown or text file that describes the voice persona. The file can include Gemini-style sections such as `AUDIO PROFILE`, `SCENE`, `DIRECTOR'S NOTES`, `SAMPLE CONTEXT`, and `TRANSCRIPT`.
If the file contains `{transcript}` or `{{ transcript }}`, Hermes replaces that placeholder with the live TTS text. Otherwise, Hermes appends a labeled `TRANSCRIPT` section automatically. The persona prompt stays local and is not shown in the chat reply.
```yaml
tts:
provider: gemini
gemini:
voice: Algieba
persona_prompt_file: ~/.hermes/tts/butler-voice.md
```
### Input length limits
@ -109,7 +124,7 @@ Each provider has a documented per-request input-character cap. Hermes truncates
| xAI | 15000 |
| MiniMax | 10000 |
| Mistral | 4000 |
| Google Gemini | 5000 |
| Google Gemini | 32000 |
| ElevenLabs | Model-aware (see below) |
| NeuTTS | 2000 |
| KittenTTS | 2000 |