feat(tts): add Gemini persona prompt file
This commit is contained in:
parent
af3c8b80b5
commit
5718811de0
6 changed files with 159 additions and 9 deletions
|
|
@ -1556,7 +1556,7 @@ DEFAULT_CONFIG = {
|
|||
# Each provider supports an optional `max_text_length:` override for the
|
||||
# per-request input-character cap. Omit it to use the provider's documented
|
||||
# limit (OpenAI 4096, xAI 15000, MiniMax 10000, ElevenLabs 5k-40k model-aware,
|
||||
# Gemini 5000, Edge 5000, Mistral 4000, NeuTTS/KittenTTS 2000).
|
||||
# Gemini 32000, Edge 5000, Mistral 4000, NeuTTS/KittenTTS 2000).
|
||||
"tts": {
|
||||
"provider": "edge", # "edge" (free) | "elevenlabs" (premium) | "openai" | "xai" | "minimax" | "mistral" | "gemini" | "neutts" (local) | "kittentts" (local) | "piper" (local)
|
||||
"edge": {
|
||||
|
|
@ -1572,6 +1572,15 @@ DEFAULT_CONFIG = {
|
|||
"voice": "alloy",
|
||||
# Voices: alloy, echo, fable, onyx, nova, shimmer
|
||||
},
|
||||
"gemini": {
|
||||
"model": "gemini-2.5-flash-preview-tts",
|
||||
"voice": "Kore",
|
||||
# Optional local Markdown/text file with Gemini TTS performance
|
||||
# direction. It may include AUDIO PROFILE, SCENE, DIRECTOR'S NOTES,
|
||||
# SAMPLE CONTEXT, and either a `{transcript}` placeholder or no
|
||||
# transcript section; Hermes appends the live transcript when absent.
|
||||
"persona_prompt_file": "",
|
||||
},
|
||||
"xai": {
|
||||
"voice_id": "eve", # or custom voice ID — see https://docs.x.ai/developers/model-capabilities/audio/custom-voices
|
||||
"language": "en",
|
||||
|
|
|
|||
|
|
@ -255,6 +255,63 @@ class TestGenerateGeminiTts:
|
|||
|
||||
assert mock_post.call_args[0][0].startswith("https://custom-gemini.example.com/v1beta/")
|
||||
|
||||
def test_persona_prompt_file_appends_labeled_transcript(
    self, tmp_path, monkeypatch, mock_gemini_response
):
    """A persona file without a placeholder gets a labeled TRANSCRIPT section appended."""
    from tools.tts_tool import _generate_gemini_tts

    monkeypatch.setenv("GEMINI_API_KEY", "test-key")

    # Persona direction only -- deliberately no transcript placeholder.
    direction_path = tmp_path / "voice-persona.md"
    direction_path.write_text(
        "# AUDIO PROFILE: Dry Butler\n\n### DIRECTOR'S NOTES\nStyle: Understated.",
        encoding="utf-8",
    )
    tts_config = {"gemini": {"persona_prompt_file": str(direction_path)}}
    wav_path = str(tmp_path / "test.wav")

    with patch("requests.post", return_value=mock_gemini_response) as post_mock:
        _generate_gemini_tts("Hi", wav_path, tts_config)

    # The prompt is the text of the first part of the first content entry
    # in the JSON body handed to requests.post.
    request_json = post_mock.call_args[1]["json"]
    sent_prompt = request_json["contents"][0]["parts"][0]["text"]

    expected_fragments = (
        "Synthesize speech from the TRANSCRIPT only",
        "# AUDIO PROFILE: Dry Butler",
        "### DIRECTOR'S NOTES\nStyle: Understated.",
        "#### TRANSCRIPT\nHi",
    )
    for fragment in expected_fragments:
        assert fragment in sent_prompt
|
||||
|
||||
def test_persona_prompt_file_supports_transcript_placeholder(
    self, tmp_path, monkeypatch, mock_gemini_response
):
    """A ``{{ transcript }}`` placeholder in the persona file is replaced by the live text."""
    from tools.tts_tool import _generate_gemini_tts

    monkeypatch.setenv("GEMINI_API_KEY", "test-key")

    template_path = tmp_path / "voice-persona.md"
    template_path.write_text(
        "### DIRECTOR'S NOTES\nPacing: Slow.\n\n#### TRANSCRIPT\n{{ transcript }}",
        encoding="utf-8",
    )
    tts_config = {"gemini": {"persona_prompt_file": str(template_path)}}
    wav_path = str(tmp_path / "test.wav")

    with patch("requests.post", return_value=mock_gemini_response) as post_mock:
        _generate_gemini_tts("Read this.", wav_path, tts_config)

    request_json = post_mock.call_args[1]["json"]
    sent_prompt = request_json["contents"][0]["parts"][0]["text"]

    # The placeholder itself must be gone, with the transcript in its place.
    assert "{{ transcript }}" not in sent_prompt
    assert "#### TRANSCRIPT\nRead this." in sent_prompt
|
||||
|
||||
def test_missing_persona_prompt_file_warns_and_continues(
    self, tmp_path, monkeypatch, caplog, mock_gemini_response
):
    """A nonexistent persona file logs a warning and the bare text is sent unchanged."""
    from tools.tts_tool import _generate_gemini_tts

    monkeypatch.setenv("GEMINI_API_KEY", "test-key")

    # Never created on disk, so reading it must fail.
    absent_file = tmp_path / "missing.md"
    tts_config = {"gemini": {"persona_prompt_file": str(absent_file)}}
    wav_path = str(tmp_path / "test.wav")

    with patch("requests.post", return_value=mock_gemini_response) as post_mock:
        _generate_gemini_tts("Hi", wav_path, tts_config)

    sent_parts = post_mock.call_args[1]["json"]["contents"][0]["parts"]
    assert sent_parts[0]["text"] == "Hi"
    assert "persona prompt file unavailable" in caplog.text
|
||||
|
||||
|
||||
class TestGeminiInCheckRequirements:
|
||||
def test_gemini_api_key_satisfies_requirements(self, monkeypatch):
|
||||
|
|
|
|||
|
|
@ -31,8 +31,8 @@ class TestResolveMaxTextLength:
|
|||
def test_mistral_default(self):
    """With no override configured, Mistral resolves to its table entry."""
    expected_cap = PROVIDER_MAX_TEXT_LENGTH["mistral"]
    assert _resolve_max_text_length("mistral", {}) == expected_cap
|
||||
|
||||
def test_gemini_default(self):
    """With no override configured, Gemini resolves to its table entry."""
    expected_cap = PROVIDER_MAX_TEXT_LENGTH["gemini"]
    assert _resolve_max_text_length("gemini", {}) == expected_cap
|
||||
def test_gemini_default_is_32000(self):
    """With no override configured, the Gemini cap resolves to exactly 32000."""
    resolved_cap = _resolve_max_text_length("gemini", {})
    assert resolved_cap == 32000
|
||||
|
||||
def test_unknown_provider_falls_back(self):
    """A provider name with no table entry resolves to the global fallback cap."""
    resolved_cap = _resolve_max_text_length("does-not-exist", {})
    assert resolved_cap == FALLBACK_MAX_TEXT_LENGTH
|
||||
|
|
|
|||
|
|
@ -204,8 +204,8 @@ DEFAULT_OUTPUT_DIR = _get_default_output_dir()
|
|||
# ---------------------------------------------------------------------------
|
||||
# Per-provider input-character limits (from official provider docs).
|
||||
# A single global cap was wrong: OpenAI is 4096, xAI is 15k, MiniMax is 10k,
|
||||
# ElevenLabs is model-dependent (5k / 10k / 30k / 40k), Gemini caps at ~8k
|
||||
# input tokens. Users can override any of these via
|
||||
# ElevenLabs is model-dependent (5k / 10k / 30k / 40k), Gemini has a 32k-token
|
||||
# context window. Users can override any of these via
|
||||
# ``tts.<provider>.max_text_length`` in config.yaml.
|
||||
# ---------------------------------------------------------------------------
|
||||
PROVIDER_MAX_TEXT_LENGTH: Dict[str, int] = {
|
||||
|
|
@ -214,7 +214,7 @@ PROVIDER_MAX_TEXT_LENGTH: Dict[str, int] = {
|
|||
"xai": 15000, # https://docs.x.ai/developers/model-capabilities/audio/text-to-speech
|
||||
"minimax": 10000, # https://platform.minimax.io/docs/api-reference/speech-t2a-http (sync)
|
||||
"mistral": 4000, # conservative; no published per-request cap
|
||||
"gemini": 5000, # Gemini TTS caps at ~8k input tokens / ~655s audio
|
||||
"gemini": 32000, # Gemini TTS has a 32k-token context window; char cap is conservative
|
||||
"elevenlabs": 10000, # fallback when model-aware lookup can't resolve (multilingual_v2)
|
||||
"neutts": 2000, # local model, quality falls off on long text
|
||||
"kittentts": 2000, # local 25MB model
|
||||
|
|
@ -1394,6 +1394,65 @@ def _wrap_pcm_as_wav(
|
|||
return riff_header + fmt_chunk + data_chunk_header + pcm_bytes
|
||||
|
||||
|
||||
def _resolve_gemini_persona_prompt_path(gemini_config: Dict[str, Any]) -> Optional[Path]:
    """Locate the persona prompt file named in the Gemini TTS config.

    Reads ``persona_prompt_file`` from *gemini_config*, expands environment
    variables and a leading ``~``, and anchors a relative path to the Hermes
    home directory (falling back to the current working directory when the
    Hermes home cannot be obtained).

    Returns:
        The resolved path, or ``None`` when the setting is missing, is not a
        string, or is blank. The path is not checked for existence.
    """
    configured = gemini_config.get("persona_prompt_file")
    cleaned = configured.strip() if isinstance(configured, str) else ""
    if not cleaned:
        return None

    candidate = Path(os.path.expandvars(cleaned)).expanduser()
    if candidate.is_absolute():
        return candidate

    try:
        from hermes_constants import get_hermes_home

        return get_hermes_home() / candidate
    except Exception:
        # Best effort: any failure importing or resolving the Hermes home
        # falls back to resolving against the working directory.
        return Path.cwd() / candidate
|
||||
|
||||
|
||||
def _read_gemini_persona_prompt(gemini_config: Dict[str, Any]) -> str:
    """Load the persona prompt text for Gemini TTS, never raising on a bad file.

    Returns the UTF-8 file contents with surrounding whitespace stripped.
    Returns an empty string when no file is configured, or when the file
    cannot be read or decoded; the latter case is logged as a warning.
    """
    prompt_path = _resolve_gemini_persona_prompt_path(gemini_config)
    if prompt_path is None:
        return ""

    try:
        contents = prompt_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as err:
        # A misconfigured path should degrade to "no persona", not break TTS.
        logger.warning(
            "Gemini TTS persona prompt file unavailable at %s: %s",
            prompt_path,
            err,
        )
        return ""
    return contents.strip()
|
||||
|
||||
|
||||
def _compose_gemini_tts_prompt(text: str, gemini_config: Dict[str, Any]) -> str:
    """Build the Gemini prompt from persona direction plus the live transcript.

    Args:
        text: The text to be spoken; surrounding whitespace is stripped.
        gemini_config: The ``tts.gemini`` config mapping, used to look up the
            optional persona prompt file.

    Returns:
        The stripped transcript alone when no persona prompt is available.
        Otherwise a fixed preamble followed by the persona prompt, with every
        ``{transcript}`` / ``{{ transcript }}`` placeholder (case-insensitive,
        inner whitespace optional) replaced by the transcript. If the persona
        prompt has no placeholder, a ``#### TRANSCRIPT`` section holding the
        transcript is appended instead.
    """
    transcript = text.strip()
    persona_prompt = _read_gemini_persona_prompt(gemini_config)
    if not persona_prompt:
        return transcript

    preamble = (
        "Synthesize speech from the TRANSCRIPT only. Treat AUDIO PROFILE, "
        "SCENE, DIRECTOR'S NOTES, and SAMPLE CONTEXT as performance direction; "
        "do not speak those sections aloud."
    )

    # The double-brace alternative comes first so "{{ transcript }}" is
    # consumed whole rather than matching only its inner "{ transcript }".
    # Using one pattern in a single pass also means a transcript that itself
    # contains "{transcript}" is never re-expanded.
    placeholder = re.compile(
        r"\{\{\s*transcript\s*\}\}|\{\s*transcript\s*\}",
        flags=re.IGNORECASE,
    )

    # A callable replacement inserts the transcript verbatim. Passing it as a
    # replacement *string* would make re interpret backslashes in user text
    # ("\n" -> newline, "\1" -> re.error for an invalid group reference).
    body, substitutions = placeholder.subn(lambda _match: transcript, persona_prompt)
    if not substitutions:
        body = f"{persona_prompt}\n\n#### TRANSCRIPT\n{transcript}"

    return f"{preamble}\n\n{body}".strip()
|
||||
|
||||
|
||||
def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
|
||||
"""Generate audio using Google Gemini TTS.
|
||||
|
||||
|
|
@ -1419,7 +1478,8 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]
|
|||
"GEMINI_API_KEY not set. Get one at https://aistudio.google.com/app/apikey"
|
||||
)
|
||||
|
||||
gemini_config = tts_config.get("gemini", {})
|
||||
raw_gemini_config = tts_config.get("gemini", {})
|
||||
gemini_config = raw_gemini_config if isinstance(raw_gemini_config, dict) else {}
|
||||
model = str(gemini_config.get("model", DEFAULT_GEMINI_TTS_MODEL)).strip() or DEFAULT_GEMINI_TTS_MODEL
|
||||
voice = str(gemini_config.get("voice", DEFAULT_GEMINI_TTS_VOICE)).strip() or DEFAULT_GEMINI_TTS_VOICE
|
||||
base_url = str(
|
||||
|
|
@ -1427,9 +1487,17 @@ def _generate_gemini_tts(text: str, output_path: str, tts_config: Dict[str, Any]
|
|||
or get_env_value("GEMINI_BASE_URL")
|
||||
or DEFAULT_GEMINI_TTS_BASE_URL
|
||||
).strip().rstrip("/")
|
||||
prompt_text = _compose_gemini_tts_prompt(text, gemini_config)
|
||||
max_len = _resolve_max_text_length("gemini", tts_config)
|
||||
if len(prompt_text) > max_len:
|
||||
logger.warning(
|
||||
"Gemini TTS composed prompt too long (%d chars), truncating to %d",
|
||||
len(prompt_text), max_len,
|
||||
)
|
||||
prompt_text = prompt_text[:max_len]
|
||||
|
||||
payload: Dict[str, Any] = {
|
||||
"contents": [{"parts": [{"text": text}]}],
|
||||
"contents": [{"parts": [{"text": prompt_text}]}],
|
||||
"generationConfig": {
|
||||
"responseModalities": ["AUDIO"],
|
||||
"speechConfig": {
|
||||
|
|
|
|||
|
|
@ -1199,6 +1199,7 @@ tts:
|
|||
gemini:
|
||||
model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts
|
||||
voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, etc.
|
||||
persona_prompt_file: "" # Optional Markdown/text file with Gemini voice direction
|
||||
xai:
|
||||
voice_id: "eve" # xAI TTS voice
|
||||
language: "en" # ISO 639-1
|
||||
|
|
|
|||
|
|
@ -68,6 +68,7 @@ tts:
|
|||
gemini:
|
||||
model: "gemini-2.5-flash-preview-tts" # or gemini-2.5-pro-preview-tts
|
||||
voice: "Kore" # 30 prebuilt voices: Zephyr, Puck, Kore, Enceladus, Gacrux, etc.
|
||||
persona_prompt_file: "" # Optional Markdown/text file with Gemini voice direction
|
||||
xai:
|
||||
voice_id: "eve" # or a custom voice ID — see docs below
|
||||
language: "en" # ISO 639-1 code
|
||||
|
|
@ -97,6 +98,20 @@ tts:
|
|||
|
||||
**Speed control**: The global `tts.speed` value applies to all providers by default. Each provider can override it with its own `speed` setting (e.g., `tts.openai.speed: 1.5`). Provider-specific speed takes precedence over the global value. Default is `1.0` (normal speed).
|
||||
|
||||
### Gemini Persona Prompts
|
||||
|
||||
Gemini TTS can follow natural-language performance direction. Set `tts.gemini.persona_prompt_file` to a local Markdown or text file that describes the voice persona. The file can include Gemini-style sections such as `AUDIO PROFILE`, `SCENE`, `DIRECTOR'S NOTES`, `SAMPLE CONTEXT`, and `TRANSCRIPT`.
|
||||
|
||||
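If the file contains `{transcript}` or `{{ transcript }}`, Hermes replaces that placeholder with the live TTS text. Otherwise, Hermes appends a labeled `TRANSCRIPT` section automatically. Environment variables and `~` in the path are expanded, and a relative path is resolved against the Hermes home directory. If the file cannot be read, Hermes logs a warning and sends the plain text without persona direction. The persona prompt is sent to Gemini as part of the TTS request, but it is not shown in the chat reply.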
|
||||
|
||||
```yaml
|
||||
tts:
|
||||
provider: gemini
|
||||
gemini:
|
||||
voice: Algieba
|
||||
persona_prompt_file: ~/.hermes/tts/butler-voice.md
|
||||
```
|
||||
|
||||
|
||||
### Input length limits
|
||||
|
||||
|
|
@ -109,7 +124,7 @@ Each provider has a documented per-request input-character cap. Hermes truncates
|
|||
| xAI | 15000 |
|
||||
| MiniMax | 10000 |
|
||||
| Mistral | 4000 |
|
||||
| Google Gemini | 5000 |
|
||||
| Google Gemini | 32000 |
|
||||
| ElevenLabs | Model-aware (see below) |
|
||||
| NeuTTS | 2000 |
|
||||
| KittenTTS | 2000 |
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue