fix(gateway): guard stale /restart redelivery when dedup marker is missing (#56107)

When .restart_last_processed.json goes missing, a redelivered /restart from
Telegram polling can no longer be caught by the update_id comparison, so it
re-restarts the gateway forever (issue #18528, reported by @dontcallmejames
who hit it in production — gateway restarting every ~2min, zero messages
processed).

Fallback: on marker-missing, suppress the /restart only when we can confirm
we just came out of a restart cycle (_booted_from_restart, captured at startup
from .restart_notify.json before it is unlinked) AND the process is still
within a 60s post-boot window. Consumed one-shot. This closes the loop without
swallowing a genuine first /restart on a fresh boot — the flaw in the original
bare-uptime approach.

Credit to @dontcallmejames for the diagnosis and original patch.
This commit is contained in:
Teknium 2026-07-01 00:11:23 -07:00 committed by GitHub
parent 36f9f50145
commit cdd553945e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 107 additions and 0 deletions

View file

@ -2653,6 +2653,16 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
self._restart_via_service = False
self._detached_restart_helper_started = False
self._restart_command_source: Optional[SessionSource] = None
# Monotonic-ish wall clock of when this GatewayRunner was constructed.
# Used by the /restart redelivery guard to bound the window in which a
# missing dedup marker is treated as a stale redelivery.
self._startup_time: float = time.time()
# Set True at startup when this process booted as the result of a
# chat-originated /restart (i.e. .restart_notify.json existed on boot).
# A one-shot signal consumed by _is_stale_restart_redelivery so the
# marker-missing fallback only suppresses a /restart when we KNOW we
# just came out of a restart cycle — never on a genuine fresh boot.
self._booted_from_restart: bool = False
self._stop_task: Optional[asyncio.Task] = None
self._restart_task: Optional[asyncio.Task] = None
self._executor_lock = threading.Lock()
@ -6579,6 +6589,13 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
# Notify the chat that initiated /restart that the gateway is back.
planned_restart_notification_pending = _planned_restart_notification_pending()
# Capture, before _send_restart_notification() unlinks the marker,
# whether this process booted from a chat-originated /restart. Used as
# a one-shot signal by the /restart redelivery guard so a missing
# dedup marker only suppresses a /restart when we KNOW we just came out
# of a restart cycle (see _is_stale_restart_redelivery).
if _restart_notification_pending() or planned_restart_notification_pending:
self._booted_from_restart = True
await self._send_restart_notification()
# Broadcast a lightweight "gateway is back" message to configured home
@ -11431,6 +11448,25 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
try:
marker_path = _hermes_home / ".restart_last_processed.json"
if not marker_path.exists():
# Belt-and-suspenders for when the dedup marker goes missing
# (manually cleaned up, or the previous cycle's write failed).
# Without a marker the update_id comparison below can't run, so
# a redelivered /restart would sail through and re-restart the
# gateway — an infinite loop (issue #18528).
#
# Suppress ONLY when we can independently confirm we just came
# out of a restart cycle: this process booted from a
# chat-originated /restart (_booted_from_restart) AND is still
# within a short post-boot window. This never swallows a
# genuine first /restart on a fresh boot (no restart marker on
# boot → flag stays False). Consume the flag one-shot so a
# legitimate /restart sent later in the same session is honored.
if (
getattr(self, "_booted_from_restart", False)
and time.time() - getattr(self, "_startup_time", 0.0) < 60
):
self._booted_from_restart = False
return True
return False
data = json.loads(marker_path.read_text())
except Exception:

View file

@ -244,3 +244,74 @@ async def test_different_platform_bypasses_dedup(tmp_path, monkeypatch):
assert "Restarting gateway" in result
runner.request_restart.assert_called_once()
@pytest.mark.asyncio
async def test_marker_missing_but_booted_from_restart_ignores_redelivery(tmp_path, monkeypatch):
"""Missing marker + just booted from a /restart + young process → treat as stale.
Reproduces the infinite-loop scenario (issue #18528): the dedup marker went
missing, so the update_id comparison can't run. Because this process booted
from a chat-originated /restart and is still within the post-boot window,
the redelivered /restart is suppressed instead of re-restarting the gateway.
"""
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
monkeypatch.delenv("INVOCATION_ID", raising=False)
runner, _adapter = make_restart_runner()
runner.request_restart = MagicMock(return_value=True)
runner._booted_from_restart = True
runner._startup_time = time.time()
event = _make_restart_event(update_id=100)
result = await runner._handle_restart_command(event)
assert result == "" # silently ignored
runner.request_restart.assert_not_called()
# One-shot: the flag is consumed so a later legitimate /restart is honored.
assert runner._booted_from_restart is False
@pytest.mark.asyncio
async def test_marker_missing_fresh_boot_allows_restart(tmp_path, monkeypatch):
"""Missing marker on a genuine fresh boot (not from /restart) → /restart proceeds.
The guard must NOT swallow the first /restart a user sends shortly after a
normal (non-restart) startup: _booted_from_restart stays False, so the
fallback returns False and the restart goes through.
"""
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
monkeypatch.delenv("INVOCATION_ID", raising=False)
runner, _adapter = make_restart_runner()
runner.request_restart = MagicMock(return_value=True)
runner._booted_from_restart = False
runner._startup_time = time.time()
event = _make_restart_event(update_id=100)
result = await runner._handle_restart_command(event)
assert "Restarting gateway" in result
runner.request_restart.assert_called_once()
@pytest.mark.asyncio
async def test_marker_missing_booted_from_restart_but_old_process_allows(tmp_path, monkeypatch):
"""Missing marker + booted from /restart but past the window → /restart proceeds.
A /restart arriving long after boot is a genuine user action, not a boot-time
redelivery, so the uptime bound stops the guard from suppressing it forever.
"""
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
monkeypatch.delenv("INVOCATION_ID", raising=False)
runner, _adapter = make_restart_runner()
runner.request_restart = MagicMock(return_value=True)
runner._booted_from_restart = True
runner._startup_time = time.time() - 120 # well past the 60s window
event = _make_restart_event(update_id=100)
result = await runner._handle_restart_command(event)
assert "Restarting gateway" in result
runner.request_restart.assert_called_once()