diff --git a/gateway/run.py b/gateway/run.py index 9ac43b5f6..bdc7122cb 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2653,6 +2653,16 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew self._restart_via_service = False self._detached_restart_helper_started = False self._restart_command_source: Optional[SessionSource] = None + # Monotonic-ish wall clock of when this GatewayRunner was constructed. + # Used by the /restart redelivery guard to bound the window in which a + # missing dedup marker is treated as a stale redelivery. + self._startup_time: float = time.time() + # Set True at startup when this process booted as the result of a + # chat-originated /restart (i.e. .restart_notify.json existed on boot). + # A one-shot signal consumed by _is_stale_restart_redelivery so the + # marker-missing fallback only suppresses a /restart when we KNOW we + # just came out of a restart cycle — never on a genuine fresh boot. + self._booted_from_restart: bool = False self._stop_task: Optional[asyncio.Task] = None self._restart_task: Optional[asyncio.Task] = None self._executor_lock = threading.Lock() @@ -6579,6 +6589,13 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew # Notify the chat that initiated /restart that the gateway is back. planned_restart_notification_pending = _planned_restart_notification_pending() + # Capture, before _send_restart_notification() unlinks the marker, + # whether this process booted from a chat-originated /restart. Used as + # a one-shot signal by the /restart redelivery guard so a missing + # dedup marker only suppresses a /restart when we KNOW we just came out + # of a restart cycle (see _is_stale_restart_redelivery). + if _restart_notification_pending() or planned_restart_notification_pending: + self._booted_from_restart = True await self._send_restart_notification() # Broadcast a lightweight "gateway is back" message to configured home @@ -11431,6 +11448,25 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew try: marker_path = _hermes_home / ".restart_last_processed.json" if not marker_path.exists(): + # Belt-and-suspenders for when the dedup marker goes missing + # (manually cleaned up, or the previous cycle's write failed). + # Without a marker the update_id comparison below can't run, so + # a redelivered /restart would sail through and re-restart the + # gateway — an infinite loop (issue #18528). + # + # Suppress ONLY when we can independently confirm we just came + # out of a restart cycle: this process booted from a + # chat-originated /restart (_booted_from_restart) AND is still + # within a short post-boot window. This never swallows a + # genuine first /restart on a fresh boot (no restart marker on + # boot → flag stays False). Consume the flag one-shot so a + # legitimate /restart sent later in the same session is honored. + if ( + getattr(self, "_booted_from_restart", False) + and time.time() - getattr(self, "_startup_time", 0.0) < 60 + ): + self._booted_from_restart = False + return True return False data = json.loads(marker_path.read_text()) except Exception: diff --git a/tests/gateway/test_restart_redelivery_dedup.py b/tests/gateway/test_restart_redelivery_dedup.py index 88cb0223d..7b651d9c8 100644 --- a/tests/gateway/test_restart_redelivery_dedup.py +++ b/tests/gateway/test_restart_redelivery_dedup.py @@ -244,3 +244,74 @@ async def test_different_platform_bypasses_dedup(tmp_path, monkeypatch): assert "Restarting gateway" in result runner.request_restart.assert_called_once() + + +@pytest.mark.asyncio +async def test_marker_missing_but_booted_from_restart_ignores_redelivery(tmp_path, monkeypatch): + """Missing marker + just booted from a /restart + young process → treat as stale. + + Reproduces the infinite-loop scenario (issue #18528): the dedup marker went + missing, so the update_id comparison can't run. Because this process booted + from a chat-originated /restart and is still within the post-boot window, + the redelivered /restart is suppressed instead of re-restarting the gateway. + """ + monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + monkeypatch.delenv("INVOCATION_ID", raising=False) + + runner, _adapter = make_restart_runner() + runner.request_restart = MagicMock(return_value=True) + runner._booted_from_restart = True + runner._startup_time = time.time() + + event = _make_restart_event(update_id=100) + result = await runner._handle_restart_command(event) + + assert result == "" # silently ignored + runner.request_restart.assert_not_called() + # One-shot: the flag is consumed so a later legitimate /restart is honored. + assert runner._booted_from_restart is False + + +@pytest.mark.asyncio +async def test_marker_missing_fresh_boot_allows_restart(tmp_path, monkeypatch): + """Missing marker on a genuine fresh boot (not from /restart) → /restart proceeds. + + The guard must NOT swallow the first /restart a user sends shortly after a + normal (non-restart) startup: _booted_from_restart stays False, so the + fallback returns False and the restart goes through. + """ + monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + monkeypatch.delenv("INVOCATION_ID", raising=False) + + runner, _adapter = make_restart_runner() + runner.request_restart = MagicMock(return_value=True) + runner._booted_from_restart = False + runner._startup_time = time.time() + + event = _make_restart_event(update_id=100) + result = await runner._handle_restart_command(event) + + assert "Restarting gateway" in result + runner.request_restart.assert_called_once() + + +@pytest.mark.asyncio +async def test_marker_missing_booted_from_restart_but_old_process_allows(tmp_path, monkeypatch): + """Missing marker + booted from /restart but past the window → /restart proceeds. + + A /restart arriving long after boot is a genuine user action, not a boot-time + redelivery, so the uptime bound stops the guard from suppressing it forever. + """ + monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path) + monkeypatch.delenv("INVOCATION_ID", raising=False) + + runner, _adapter = make_restart_runner() + runner.request_restart = MagicMock(return_value=True) + runner._booted_from_restart = True + runner._startup_time = time.time() - 120 # well past the 60s window + + event = _make_restart_event(update_id=100) + result = await runner._handle_restart_command(event) + + assert "Restarting gateway" in result + runner.request_restart.assert_called_once()