diff --git a/gateway/run.py b/gateway/run.py index 90622ef8d..b6737ad16 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -9443,35 +9443,15 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew logger.debug( "Transcript echo failed (non-fatal): %s", _echo_exc, ) - _stt_fail_markers = ( - "No STT provider", - "STT is disabled", - "can't listen", - "VOICE_TOOLS_OPENAI_KEY", - ) - if any(marker in message_text for marker in _stt_fail_markers): - _stt_adapter = self.adapters.get(source.platform) - _stt_meta = self._thread_metadata_for_source(source, self._reply_anchor_for_event(event)) - if _stt_adapter: - try: - _stt_msg = ( - "🎤 I received your voice message but can't transcribe it — " - "no speech-to-text provider is configured.\n\n" - "To enable voice: install faster-whisper " - "(`uv pip install faster-whisper` in the Hermes venv; " - "`pip install faster-whisper` also works if pip is on PATH) " - "and set `stt.enabled: true` in config.yaml, " - "then /restart the gateway." - ) - if self._has_setup_skill(): - _stt_msg += "\n\nFor full setup instructions, type: `/skill hermes-agent-setup`" - await _stt_adapter.send( - source.chat_id, - _stt_msg, - metadata=_stt_meta, - ) - except Exception: - pass + # NOTE: Previously, when transcription failed (e.g. no STT + # provider configured), the gateway also emitted a hardcoded + # English notice via `_stt_adapter.send()`. That bypassed the + # LLM and produced two replies — one pre-canned English clip + # (which TTS then spoke aloud, in the wrong language) and one + # correct, localized LLM reply from the enriched message text. + # The enrichment step now leaves a single neutral marker in the + # prompt, so the LLM produces one coherent reply in the user's + # language. The hardcoded send has therefore been removed. if audio_file_paths: from tools.credential_files import to_agent_visible_cache_path as _to_agent_path @@ -13848,41 +13828,28 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew if result["success"]: transcript = result["transcript"] successful_transcripts.append(transcript) - enriched_parts.append( - f'[The user sent a voice message~ ' - f'Here\'s what they said: "{transcript}"]' - ) + # Pass the transcript through as a plain quoted line. The + # earlier wording ("The user sent a voice message~ Here's + # what they said: ...") read as a meta-instruction and made + # the LLM volunteer commentary about voice mode rather than + # reply to the content. + enriched_parts.append(f'"{transcript}"') else: error = result.get("error", "unknown error") - if ( - "No STT provider" in error - or error.startswith("Neither VOICE_TOOLS_OPENAI_KEY nor OPENAI_API_KEY is set") - ): - _no_stt_note = ( - "[The user sent a voice message but I can't listen " - "to it right now — no STT provider is configured. " - "A direct message has already been sent to the user " - "with setup instructions." - ) - if self._has_setup_skill(): - _no_stt_note += ( - " You have a skill called hermes-agent-setup " - "that can help users configure Hermes features " - "including voice, tools, and more." - ) - _no_stt_note += "]" - enriched_parts.append(_no_stt_note) - else: - enriched_parts.append( - "[The user sent a voice message but I had trouble " - f"transcribing it~ ({error})]" - ) + # All failure branches: a single, minimal, neutral marker. + # Do NOT mention "no STT provider configured", "setup + # instructions", or the "hermes-agent-setup" skill, and do + # NOT claim a direct message was sent — those phrases get + # persisted in conversation history and poison every later + # turn, so the model keeps volunteering STT-setup advice + # even after transcription starts working. The cause is + # logged for operator diagnosis but kept out of the + # LLM-visible prompt. + logger.info("Voice transcription failed for %s: %s", path, error) + enriched_parts.append("[voice message could not be transcribed]") except Exception as e: logger.error("Transcription error: %s", e) - enriched_parts.append( - "[The user sent a voice message but something went wrong " - "when I tried to listen to it~ Let them know!]" - ) + enriched_parts.append("[voice message could not be transcribed]") if enriched_parts: prefix = "\n\n".join(enriched_parts) diff --git a/tests/gateway/test_stt_config.py b/tests/gateway/test_stt_config.py index 6f98a058d..5006eafee 100644 --- a/tests/gateway/test_stt_config.py +++ b/tests/gateway/test_stt_config.py @@ -97,7 +97,9 @@ async def test_enrich_message_with_transcription_avoids_bogus_no_provider_messag ) assert "No STT provider is configured" not in result - assert "trouble transcribing" in result + assert "[voice message could not be transcribed]" in result + # The opaque backend cause must NOT leak into the LLM-visible prompt. + assert "VOICE_TOOLS_OPENAI_KEY" not in result assert "caption" in result assert transcripts == [] @@ -180,5 +182,6 @@ async def test_prepare_inbound_message_text_transcribes_queued_voice_event(): ) assert result is not None + # Success path: the transcript passes through as a plain quoted line, with + # no "voice message" meta-commentary that the LLM would echo back. assert "queued voice transcript" in result - assert "voice message" in result.lower() diff --git a/tests/gateway/test_telegram_audio_vs_voice.py b/tests/gateway/test_telegram_audio_vs_voice.py index 1d1bf0cb7..be9cc3767 100644 --- a/tests/gateway/test_telegram_audio_vs_voice.py +++ b/tests/gateway/test_telegram_audio_vs_voice.py @@ -75,8 +75,9 @@ async def test_voice_message_still_transcribed(): ) mock_transcribe.assert_called_once_with("/tmp/voice.ogg") + # The transcript passes through as a plain quoted line — no "voice message" + # meta-commentary in the LLM-visible prompt. assert "hello world" in result - assert "voice message" in result.lower() # ---------------------------------------------------------------------------