fix(gateway): stop poisoning the LLM prompt with STT-mode chatter

The STT-failure enrichment templates injected setup instructions —
"no STT provider is configured", "a direct message has already been
sent", and a "hermes-agent-setup" skill mention — into the LLM-visible
prompt. That text persists in conversation history, so after one STT
failure the model kept volunteering Whisper/Vosk setup advice on every
later voice turn, even after transcription started working (observed in
prod on gpt-5-nano). The gateway also fired a hardcoded English notice
via _stt_adapter.send(), producing a second, wrong-language reply that
TTS then spoke aloud.

- Neutralize all enrichment templates: success passes the transcript
  through as a plain quoted line; every failure branch emits a single
  [voice message could not be transcribed] marker.
- Move the operator-facing failure cause to logger.info so it stays
  diagnosable in container logs without leaking into the prompt.
- Remove the hardcoded English _stt_adapter.send() notice; the LLM now
  produces one coherent reply in the user's language.
- Update the gateway STT tests to assert the neutral contract.

Co-authored-by: Hermes Agent <noreply@nousresearch.com>
This commit is contained in:
nnnet 2026-06-30 03:18:21 -07:00 committed by Teknium
parent cbe397ef45
commit 5582b51a68
3 changed files with 34 additions and 63 deletions

View file

@ -9443,35 +9443,15 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
logger.debug(
"Transcript echo failed (non-fatal): %s", _echo_exc,
)
_stt_fail_markers = (
"No STT provider",
"STT is disabled",
"can't listen",
"VOICE_TOOLS_OPENAI_KEY",
)
if any(marker in message_text for marker in _stt_fail_markers):
_stt_adapter = self.adapters.get(source.platform)
_stt_meta = self._thread_metadata_for_source(source, self._reply_anchor_for_event(event))
if _stt_adapter:
try:
_stt_msg = (
"🎤 I received your voice message but can't transcribe it — "
"no speech-to-text provider is configured.\n\n"
"To enable voice: install faster-whisper "
"(`uv pip install faster-whisper` in the Hermes venv; "
"`pip install faster-whisper` also works if pip is on PATH) "
"and set `stt.enabled: true` in config.yaml, "
"then /restart the gateway."
)
if self._has_setup_skill():
_stt_msg += "\n\nFor full setup instructions, type: `/skill hermes-agent-setup`"
await _stt_adapter.send(
source.chat_id,
_stt_msg,
metadata=_stt_meta,
)
except Exception:
pass
# NOTE: Previously, when transcription failed (e.g. no STT
# provider configured), the gateway also emitted a hardcoded
# English notice via `_stt_adapter.send()`. That bypassed the
# LLM and produced two replies — one pre-canned English clip
# (which TTS then spoke aloud, in the wrong language) and one
# correct, localized LLM reply from the enriched message text.
# The enrichment step now leaves a single neutral marker in the
# prompt, so the LLM produces one coherent reply in the user's
# language. The hardcoded send has therefore been removed.
if audio_file_paths:
from tools.credential_files import to_agent_visible_cache_path as _to_agent_path
@ -13848,41 +13828,28 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
if result["success"]:
transcript = result["transcript"]
successful_transcripts.append(transcript)
enriched_parts.append(
f'[The user sent a voice message~ '
f'Here\'s what they said: "{transcript}"]'
)
# Pass the transcript through as a plain quoted line. The
# earlier wording ("The user sent a voice message~ Here's
# what they said: ...") read as a meta-instruction and made
# the LLM volunteer commentary about voice mode rather than
# reply to the content.
enriched_parts.append(f'"{transcript}"')
else:
error = result.get("error", "unknown error")
if (
"No STT provider" in error
or error.startswith("Neither VOICE_TOOLS_OPENAI_KEY nor OPENAI_API_KEY is set")
):
_no_stt_note = (
"[The user sent a voice message but I can't listen "
"to it right now — no STT provider is configured. "
"A direct message has already been sent to the user "
"with setup instructions."
)
if self._has_setup_skill():
_no_stt_note += (
" You have a skill called hermes-agent-setup "
"that can help users configure Hermes features "
"including voice, tools, and more."
)
_no_stt_note += "]"
enriched_parts.append(_no_stt_note)
else:
enriched_parts.append(
"[The user sent a voice message but I had trouble "
f"transcribing it~ ({error})]"
)
# All failure branches: a single, minimal, neutral marker.
# Do NOT mention "no STT provider configured", "setup
# instructions", or the "hermes-agent-setup" skill, and do
# NOT claim a direct message was sent — those phrases get
# persisted in conversation history and poison every later
# turn, so the model keeps volunteering STT-setup advice
# even after transcription starts working. The cause is
# logged for operator diagnosis but kept out of the
# LLM-visible prompt.
logger.info("Voice transcription failed for %s: %s", path, error)
enriched_parts.append("[voice message could not be transcribed]")
except Exception as e:
logger.error("Transcription error: %s", e)
enriched_parts.append(
"[The user sent a voice message but something went wrong "
"when I tried to listen to it~ Let them know!]"
)
enriched_parts.append("[voice message could not be transcribed]")
if enriched_parts:
prefix = "\n\n".join(enriched_parts)

View file

@ -97,7 +97,9 @@ async def test_enrich_message_with_transcription_avoids_bogus_no_provider_messag
)
assert "No STT provider is configured" not in result
assert "trouble transcribing" in result
assert "[voice message could not be transcribed]" in result
# The opaque backend cause must NOT leak into the LLM-visible prompt.
assert "VOICE_TOOLS_OPENAI_KEY" not in result
assert "caption" in result
assert transcripts == []
@ -180,5 +182,6 @@ async def test_prepare_inbound_message_text_transcribes_queued_voice_event():
)
assert result is not None
# Success path: the transcript passes through as a plain quoted line, with
# no "voice message" meta-commentary that the LLM would echo back.
assert "queued voice transcript" in result
assert "voice message" in result.lower()

View file

@ -75,8 +75,9 @@ async def test_voice_message_still_transcribed():
)
mock_transcribe.assert_called_once_with("/tmp/voice.ogg")
# The transcript passes through as a plain quoted line — no "voice message"
# meta-commentary in the LLM-visible prompt.
assert "hello world" in result
assert "voice message" in result.lower()
# ---------------------------------------------------------------------------