fix(gateway): stop poisoning the LLM prompt with STT-mode chatter
The STT-failure enrichment templates injected setup instructions — "no STT provider is configured", "a direct message has already been sent", and a "hermes-agent-setup" skill mention — into the LLM-visible prompt. That text persists in conversation history, so after one STT failure the model kept volunteering Whisper/Vosk setup advice on every later voice turn, even after transcription started working (observed in prod on gpt-5-nano). The gateway also fired a hardcoded English notice via _stt_adapter.send(), producing a second, wrong-language reply that TTS then spoke aloud.

- Neutralize all enrichment templates: success passes the transcript through as a plain quoted line; every failure branch emits a single [voice message could not be transcribed] marker.
- Move the operator-facing failure cause to logger.info so it stays diagnosable in container logs without leaking into the prompt.
- Remove the hardcoded English _stt_adapter.send() notice; the LLM now produces one coherent reply in the user's language.
- Update the gateway STT tests to assert the neutral contract.

Co-authored-by: Hermes Agent <noreply@nousresearch.com>
This commit is contained in:
parent
cbe397ef45
commit
5582b51a68
3 changed files with 34 additions and 63 deletions
|
|
@ -9443,35 +9443,15 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
logger.debug(
|
||||
"Transcript echo failed (non-fatal): %s", _echo_exc,
|
||||
)
|
||||
_stt_fail_markers = (
|
||||
"No STT provider",
|
||||
"STT is disabled",
|
||||
"can't listen",
|
||||
"VOICE_TOOLS_OPENAI_KEY",
|
||||
)
|
||||
if any(marker in message_text for marker in _stt_fail_markers):
|
||||
_stt_adapter = self.adapters.get(source.platform)
|
||||
_stt_meta = self._thread_metadata_for_source(source, self._reply_anchor_for_event(event))
|
||||
if _stt_adapter:
|
||||
try:
|
||||
_stt_msg = (
|
||||
"🎤 I received your voice message but can't transcribe it — "
|
||||
"no speech-to-text provider is configured.\n\n"
|
||||
"To enable voice: install faster-whisper "
|
||||
"(`uv pip install faster-whisper` in the Hermes venv; "
|
||||
"`pip install faster-whisper` also works if pip is on PATH) "
|
||||
"and set `stt.enabled: true` in config.yaml, "
|
||||
"then /restart the gateway."
|
||||
)
|
||||
if self._has_setup_skill():
|
||||
_stt_msg += "\n\nFor full setup instructions, type: `/skill hermes-agent-setup`"
|
||||
await _stt_adapter.send(
|
||||
source.chat_id,
|
||||
_stt_msg,
|
||||
metadata=_stt_meta,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
# NOTE: Previously, when transcription failed (e.g. no STT
|
||||
# provider configured), the gateway also emitted a hardcoded
|
||||
# English notice via `_stt_adapter.send()`. That bypassed the
|
||||
# LLM and produced two replies — one pre-canned English clip
|
||||
# (which TTS then spoke aloud, in the wrong language) and one
|
||||
# correct, localized LLM reply from the enriched message text.
|
||||
# The enrichment step now leaves a single neutral marker in the
|
||||
# prompt, so the LLM produces one coherent reply in the user's
|
||||
# language. The hardcoded send has therefore been removed.
|
||||
|
||||
if audio_file_paths:
|
||||
from tools.credential_files import to_agent_visible_cache_path as _to_agent_path
|
||||
|
|
@ -13848,41 +13828,28 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
if result["success"]:
|
||||
transcript = result["transcript"]
|
||||
successful_transcripts.append(transcript)
|
||||
enriched_parts.append(
|
||||
f'[The user sent a voice message~ '
|
||||
f'Here\'s what they said: "{transcript}"]'
|
||||
)
|
||||
# Pass the transcript through as a plain quoted line. The
|
||||
# earlier wording ("The user sent a voice message~ Here's
|
||||
# what they said: ...") read as a meta-instruction and made
|
||||
# the LLM volunteer commentary about voice mode rather than
|
||||
# reply to the content.
|
||||
enriched_parts.append(f'"{transcript}"')
|
||||
else:
|
||||
error = result.get("error", "unknown error")
|
||||
if (
|
||||
"No STT provider" in error
|
||||
or error.startswith("Neither VOICE_TOOLS_OPENAI_KEY nor OPENAI_API_KEY is set")
|
||||
):
|
||||
_no_stt_note = (
|
||||
"[The user sent a voice message but I can't listen "
|
||||
"to it right now — no STT provider is configured. "
|
||||
"A direct message has already been sent to the user "
|
||||
"with setup instructions."
|
||||
)
|
||||
if self._has_setup_skill():
|
||||
_no_stt_note += (
|
||||
" You have a skill called hermes-agent-setup "
|
||||
"that can help users configure Hermes features "
|
||||
"including voice, tools, and more."
|
||||
)
|
||||
_no_stt_note += "]"
|
||||
enriched_parts.append(_no_stt_note)
|
||||
else:
|
||||
enriched_parts.append(
|
||||
"[The user sent a voice message but I had trouble "
|
||||
f"transcribing it~ ({error})]"
|
||||
)
|
||||
# All failure branches: a single, minimal, neutral marker.
|
||||
# Do NOT mention "no STT provider configured", "setup
|
||||
# instructions", or the "hermes-agent-setup" skill, and do
|
||||
# NOT claim a direct message was sent — those phrases get
|
||||
# persisted in conversation history and poison every later
|
||||
# turn, so the model keeps volunteering STT-setup advice
|
||||
# even after transcription starts working. The cause is
|
||||
# logged for operator diagnosis but kept out of the
|
||||
# LLM-visible prompt.
|
||||
logger.info("Voice transcription failed for %s: %s", path, error)
|
||||
enriched_parts.append("[voice message could not be transcribed]")
|
||||
except Exception as e:
|
||||
logger.error("Transcription error: %s", e)
|
||||
enriched_parts.append(
|
||||
"[The user sent a voice message but something went wrong "
|
||||
"when I tried to listen to it~ Let them know!]"
|
||||
)
|
||||
enriched_parts.append("[voice message could not be transcribed]")
|
||||
|
||||
if enriched_parts:
|
||||
prefix = "\n\n".join(enriched_parts)
|
||||
|
|
|
|||
|
|
@ -97,7 +97,9 @@ async def test_enrich_message_with_transcription_avoids_bogus_no_provider_messag
|
|||
)
|
||||
|
||||
assert "No STT provider is configured" not in result
|
||||
assert "trouble transcribing" in result
|
||||
assert "[voice message could not be transcribed]" in result
|
||||
# The opaque backend cause must NOT leak into the LLM-visible prompt.
|
||||
assert "VOICE_TOOLS_OPENAI_KEY" not in result
|
||||
assert "caption" in result
|
||||
assert transcripts == []
|
||||
|
||||
|
|
@ -180,5 +182,6 @@ async def test_prepare_inbound_message_text_transcribes_queued_voice_event():
|
|||
)
|
||||
|
||||
assert result is not None
|
||||
# Success path: the transcript passes through as a plain quoted line, with
|
||||
# no "voice message" meta-commentary that the LLM would echo back.
|
||||
assert "queued voice transcript" in result
|
||||
assert "voice message" in result.lower()
|
||||
|
|
|
|||
|
|
@ -75,8 +75,9 @@ async def test_voice_message_still_transcribed():
|
|||
)
|
||||
|
||||
mock_transcribe.assert_called_once_with("/tmp/voice.ogg")
|
||||
# The transcript passes through as a plain quoted line — no "voice message"
|
||||
# meta-commentary in the LLM-visible prompt.
|
||||
assert "hello world" in result
|
||||
assert "voice message" in result.lower()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue