fix(gateway): guard stale /restart redelivery when dedup marker is missing (#56107)
When .restart_last_processed.json goes missing, a redelivered /restart from Telegram polling can no longer be caught by the update_id comparison, so it re-restarts the gateway forever (issue #18528, reported by @dontcallmejames who hit it in production — gateway restarting every ~2min, zero messages processed). Fallback: on marker-missing, suppress the /restart only when we can confirm we just came out of a restart cycle (_booted_from_restart, captured at startup from .restart_notify.json before it is unlinked) AND the process is still within a 60s post-boot window. Consumed one-shot. This closes the loop without swallowing a genuine first /restart on a fresh boot — the flaw in the original bare-uptime approach. Credit to @dontcallmejames for the diagnosis and original patch.
This commit is contained in:
parent
36f9f50145
commit
cdd553945e
2 changed files with 107 additions and 0 deletions
|
|
@ -2653,6 +2653,16 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
self._restart_via_service = False
|
||||
self._detached_restart_helper_started = False
|
||||
self._restart_command_source: Optional[SessionSource] = None
|
||||
# Monotonic-ish wall clock of when this GatewayRunner was constructed.
|
||||
# Used by the /restart redelivery guard to bound the window in which a
|
||||
# missing dedup marker is treated as a stale redelivery.
|
||||
self._startup_time: float = time.time()
|
||||
# Set True at startup when this process booted as the result of a
|
||||
# chat-originated /restart (i.e. .restart_notify.json existed on boot).
|
||||
# A one-shot signal consumed by _is_stale_restart_redelivery so the
|
||||
# marker-missing fallback only suppresses a /restart when we KNOW we
|
||||
# just came out of a restart cycle — never on a genuine fresh boot.
|
||||
self._booted_from_restart: bool = False
|
||||
self._stop_task: Optional[asyncio.Task] = None
|
||||
self._restart_task: Optional[asyncio.Task] = None
|
||||
self._executor_lock = threading.Lock()
|
||||
|
|
@ -6579,6 +6589,13 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
|
||||
# Notify the chat that initiated /restart that the gateway is back.
|
||||
planned_restart_notification_pending = _planned_restart_notification_pending()
|
||||
# Capture, before _send_restart_notification() unlinks the marker,
|
||||
# whether this process booted from a chat-originated /restart. Used as
|
||||
# a one-shot signal by the /restart redelivery guard so a missing
|
||||
# dedup marker only suppresses a /restart when we KNOW we just came out
|
||||
# of a restart cycle (see _is_stale_restart_redelivery).
|
||||
if _restart_notification_pending() or planned_restart_notification_pending:
|
||||
self._booted_from_restart = True
|
||||
await self._send_restart_notification()
|
||||
|
||||
# Broadcast a lightweight "gateway is back" message to configured home
|
||||
|
|
@ -11431,6 +11448,25 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
try:
|
||||
marker_path = _hermes_home / ".restart_last_processed.json"
|
||||
if not marker_path.exists():
|
||||
# Belt-and-suspenders for when the dedup marker goes missing
|
||||
# (manually cleaned up, or the previous cycle's write failed).
|
||||
# Without a marker the update_id comparison below can't run, so
|
||||
# a redelivered /restart would sail through and re-restart the
|
||||
# gateway — an infinite loop (issue #18528).
|
||||
#
|
||||
# Suppress ONLY when we can independently confirm we just came
|
||||
# out of a restart cycle: this process booted from a
|
||||
# chat-originated /restart (_booted_from_restart) AND is still
|
||||
# within a short post-boot window. This never swallows a
|
||||
# genuine first /restart on a fresh boot (no restart marker on
|
||||
# boot → flag stays False). Consume the flag one-shot so a
|
||||
# legitimate /restart sent later in the same session is honored.
|
||||
if (
|
||||
getattr(self, "_booted_from_restart", False)
|
||||
and time.time() - getattr(self, "_startup_time", 0.0) < 60
|
||||
):
|
||||
self._booted_from_restart = False
|
||||
return True
|
||||
return False
|
||||
data = json.loads(marker_path.read_text())
|
||||
except Exception:
|
||||
|
|
|
|||
|
|
@ -244,3 +244,74 @@ async def test_different_platform_bypasses_dedup(tmp_path, monkeypatch):
|
|||
|
||||
assert "Restarting gateway" in result
|
||||
runner.request_restart.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_marker_missing_but_booted_from_restart_ignores_redelivery(tmp_path, monkeypatch):
|
||||
"""Missing marker + just booted from a /restart + young process → treat as stale.
|
||||
|
||||
Reproduces the infinite-loop scenario (issue #18528): the dedup marker went
|
||||
missing, so the update_id comparison can't run. Because this process booted
|
||||
from a chat-originated /restart and is still within the post-boot window,
|
||||
the redelivered /restart is suppressed instead of re-restarting the gateway.
|
||||
"""
|
||||
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
|
||||
monkeypatch.delenv("INVOCATION_ID", raising=False)
|
||||
|
||||
runner, _adapter = make_restart_runner()
|
||||
runner.request_restart = MagicMock(return_value=True)
|
||||
runner._booted_from_restart = True
|
||||
runner._startup_time = time.time()
|
||||
|
||||
event = _make_restart_event(update_id=100)
|
||||
result = await runner._handle_restart_command(event)
|
||||
|
||||
assert result == "" # silently ignored
|
||||
runner.request_restart.assert_not_called()
|
||||
# One-shot: the flag is consumed so a later legitimate /restart is honored.
|
||||
assert runner._booted_from_restart is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_marker_missing_fresh_boot_allows_restart(tmp_path, monkeypatch):
|
||||
"""Missing marker on a genuine fresh boot (not from /restart) → /restart proceeds.
|
||||
|
||||
The guard must NOT swallow the first /restart a user sends shortly after a
|
||||
normal (non-restart) startup: _booted_from_restart stays False, so the
|
||||
fallback returns False and the restart goes through.
|
||||
"""
|
||||
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
|
||||
monkeypatch.delenv("INVOCATION_ID", raising=False)
|
||||
|
||||
runner, _adapter = make_restart_runner()
|
||||
runner.request_restart = MagicMock(return_value=True)
|
||||
runner._booted_from_restart = False
|
||||
runner._startup_time = time.time()
|
||||
|
||||
event = _make_restart_event(update_id=100)
|
||||
result = await runner._handle_restart_command(event)
|
||||
|
||||
assert "Restarting gateway" in result
|
||||
runner.request_restart.assert_called_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_marker_missing_booted_from_restart_but_old_process_allows(tmp_path, monkeypatch):
|
||||
"""Missing marker + booted from /restart but past the window → /restart proceeds.
|
||||
|
||||
A /restart arriving long after boot is a genuine user action, not a boot-time
|
||||
redelivery, so the uptime bound stops the guard from suppressing it forever.
|
||||
"""
|
||||
monkeypatch.setattr(gateway_run, "_hermes_home", tmp_path)
|
||||
monkeypatch.delenv("INVOCATION_ID", raising=False)
|
||||
|
||||
runner, _adapter = make_restart_runner()
|
||||
runner.request_restart = MagicMock(return_value=True)
|
||||
runner._booted_from_restart = True
|
||||
runner._startup_time = time.time() - 120 # well past the 60s window
|
||||
|
||||
event = _make_restart_event(update_id=100)
|
||||
result = await runner._handle_restart_command(event)
|
||||
|
||||
assert "Restarting gateway" in result
|
||||
runner.request_restart.assert_called_once()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue