fix(gateway): stop poisoning the LLM prompt with STT-mode chatter

The STT-failure enrichment templates injected setup instructions — "no STT provider is configured", "a direct message has already been sent", and a "hermes-agent-setup" skill mention — into the LLM-visible prompt. That text persists in conversation history, so after one STT failure the model kept volunteering Whisper/Vosk setup advice on every later voice turn, even after transcription started working (observed in prod on gpt-5-nano). The gateway also fired a hardcoded English notice via _stt_adapter.send(), producing a second, wrong-language reply that TTS then spoke aloud.

- Neutralize all enrichment templates: success passes the transcript through as a plain quoted line; every failure branch emits a single [voice message could not be transcribed] marker.
- Move the operator-facing failure cause to logger.info so it stays diagnosable in container logs without leaking into the prompt.
- Remove the hardcoded English _stt_adapter.send() notice; the LLM now produces one coherent reply in the user's language.
- Update the gateway STT tests to assert the neutral contract.

Co-authored-by: Hermes Agent <noreply@nousresearch.com>
2026-06-30 03:18:21 -07:00 · 2026-06-30 03:18:21 -07:00 · 5582b51a68
commit 5582b51a68
parent cbe397ef45
3 changed files with 34 additions and 63 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -9443,35 +9443,15 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
                                logger.debug(
                                    "Transcript echo failed (non-fatal): %s", _echo_exc,
                                )
-                _stt_fail_markers = (
-                    "No STT provider",
-                    "STT is disabled",
-                    "can't listen",
-                    "VOICE_TOOLS_OPENAI_KEY",
-                )
-                if any(marker in message_text for marker in _stt_fail_markers):
-                    _stt_adapter = self.adapters.get(source.platform)
-                    _stt_meta = self._thread_metadata_for_source(source, self._reply_anchor_for_event(event))
-                    if _stt_adapter:
-                        try:
-                            _stt_msg = (
-                                "🎤 I received your voice message but can't transcribe it — "
-                                "no speech-to-text provider is configured.\n\n"
-                                "To enable voice: install faster-whisper "
-                                "(`uv pip install faster-whisper` in the Hermes venv; "
-                                "`pip install faster-whisper` also works if pip is on PATH) "
-                                "and set `stt.enabled: true` in config.yaml, "
-                                "then /restart the gateway."
-                            )
-                            if self._has_setup_skill():
-                                _stt_msg += "\n\nFor full setup instructions, type: `/skill hermes-agent-setup`"
-                            await _stt_adapter.send(
-                                source.chat_id,
-                                _stt_msg,
-                                metadata=_stt_meta,
-                            )
-                        except Exception:
-                            pass
+                # NOTE: Previously, when transcription failed (e.g. no STT
+                # provider configured), the gateway also emitted a hardcoded
+                # English notice via `_stt_adapter.send()`. That bypassed the
+                # LLM and produced two replies — one pre-canned English clip
+                # (which TTS then spoke aloud, in the wrong language) and one
+                # correct, localized LLM reply from the enriched message text.
+                # The enrichment step now leaves a single neutral marker in the
+                # prompt, so the LLM produces one coherent reply in the user's
+                # language. The hardcoded send has therefore been removed.

        if audio_file_paths:
            from tools.credential_files import to_agent_visible_cache_path as _to_agent_path
@@ -13848,41 +13828,28 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
                if result["success"]:
                    transcript = result["transcript"]
                    successful_transcripts.append(transcript)
-                    enriched_parts.append(
-                        f'[The user sent a voice message~ '
-                        f'Here\'s what they said: "{transcript}"]'
-                    )
+                    # Pass the transcript through as a plain quoted line. The
+                    # earlier wording ("The user sent a voice message~ Here's
+                    # what they said: ...") read as a meta-instruction and made
+                    # the LLM volunteer commentary about voice mode rather than
+                    # reply to the content.
+                    enriched_parts.append(f'"{transcript}"')
                else:
                    error = result.get("error", "unknown error")
-                    if (
-                        "No STT provider" in error
-                        or error.startswith("Neither VOICE_TOOLS_OPENAI_KEY nor OPENAI_API_KEY is set")
-                    ):
-                        _no_stt_note = (
-                            "[The user sent a voice message but I can't listen "
-                            "to it right now — no STT provider is configured. "
-                            "A direct message has already been sent to the user "
-                            "with setup instructions."
-                        )
-                        if self._has_setup_skill():
-                            _no_stt_note += (
-                                " You have a skill called hermes-agent-setup "
-                                "that can help users configure Hermes features "
-                                "including voice, tools, and more."
-                            )
-                        _no_stt_note += "]"
-                        enriched_parts.append(_no_stt_note)
-                    else:
-                        enriched_parts.append(
-                            "[The user sent a voice message but I had trouble "
-                            f"transcribing it~ ({error})]"
-                        )
+                    # All failure branches: a single, minimal, neutral marker.
+                    # Do NOT mention "no STT provider configured", "setup
+                    # instructions", or the "hermes-agent-setup" skill, and do
+                    # NOT claim a direct message was sent — those phrases get
+                    # persisted in conversation history and poison every later
+                    # turn, so the model keeps volunteering STT-setup advice
+                    # even after transcription starts working. The cause is
+                    # logged for operator diagnosis but kept out of the
+                    # LLM-visible prompt.
+                    logger.info("Voice transcription failed for %s: %s", path, error)
+                    enriched_parts.append("[voice message could not be transcribed]")
            except Exception as e:
                logger.error("Transcription error: %s", e)
-                enriched_parts.append(
-                    "[The user sent a voice message but something went wrong "
-                    "when I tried to listen to it~ Let them know!]"
-                )
+                enriched_parts.append("[voice message could not be transcribed]")

        if enriched_parts:
            prefix = "\n\n".join(enriched_parts)
--- a/tests/gateway/test_stt_config.py
+++ b/tests/gateway/test_stt_config.py
@@ -97,7 +97,9 @@ async def test_enrich_message_with_transcription_avoids_bogus_no_provider_messag
        )

    assert "No STT provider is configured" not in result
-    assert "trouble transcribing" in result
+    assert "[voice message could not be transcribed]" in result
+    # The opaque backend cause must NOT leak into the LLM-visible prompt.
+    assert "VOICE_TOOLS_OPENAI_KEY" not in result
    assert "caption" in result
    assert transcripts == []

@@ -180,5 +182,6 @@ async def test_prepare_inbound_message_text_transcribes_queued_voice_event():
        )

    assert result is not None
+    # Success path: the transcript passes through as a plain quoted line, with
+    # no "voice message" meta-commentary that the LLM would echo back.
    assert "queued voice transcript" in result
-    assert "voice message" in result.lower()
--- a/tests/gateway/test_telegram_audio_vs_voice.py
+++ b/tests/gateway/test_telegram_audio_vs_voice.py
@@ -75,8 +75,9 @@ async def test_voice_message_still_transcribed():
        )

    mock_transcribe.assert_called_once_with("/tmp/voice.ogg")
+    # The transcript passes through as a plain quoted line — no "voice message"
+    # meta-commentary in the LLM-visible prompt.
    assert "hello world" in result
-    assert "voice message" in result.lower()


 # ---------------------------------------------------------------------------