From fb8efbb4a8a3734638cb4118ccb64e2d142f46c8 Mon Sep 17 00:00:00 2001 From: joaomarcos Date: Wed, 1 Jul 2026 00:48:04 -0300 Subject: [PATCH] fix(gateway): ignore stale fatal-error notifications from superseded adapters A delayed fatal-error notification from an adapter instance that has already been replaced by a successful reconnect (a different adapter object now owns the platform slot) was still processed: it overwrote the platform's runtime status back to retrying/fatal and could re-queue an already-healthy platform for reconnection. Snapshot the current owner of the platform slot at the top of _handle_adapter_fatal_error and bail out before any side effect when it belongs to a different, already-installed adapter. --- gateway/run.py | 18 +++++++++- tests/gateway/test_runner_fatal_adapter.py | 39 ++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/gateway/run.py b/gateway/run.py index 049706fda..4c4a4b107 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -3654,6 +3654,23 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew If the error is retryable (e.g. network blip, DNS failure), queue the platform for background reconnection instead of giving up permanently. """ + # Snapshot the current owner of this platform slot before doing + # anything else. If it's neither this adapter nor empty, a different + # adapter has already taken over (e.g. this is a delayed notification + # from a background retry chain that raced with, and lost to, a + # reconnect that already succeeded). Acting on a stale notification + # would overwrite an already-healthy platform's runtime status and + # incorrectly re-queue it for reconnection, so bail out before any of + # that happens. + existing = self.adapters.get(adapter.platform) + if existing is not None and existing is not adapter: + logger.debug( + "Ignoring stale fatal error from a superseded %s adapter instance: %s", + adapter.platform.value, + adapter.fatal_error_code or "unknown", + ) + return + logger.error( "Fatal %s adapter error (%s): %s", adapter.platform.value, @@ -3677,7 +3694,6 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew error_message=adapter.fatal_error_message, ) - existing = self.adapters.get(adapter.platform) if existing is adapter: # Claim this adapter for teardown before awaiting disconnect() — # a second fatal-error notification for the same adapter (e.g. diff --git a/tests/gateway/test_runner_fatal_adapter.py b/tests/gateway/test_runner_fatal_adapter.py index dc1462235..7fce3841f 100644 --- a/tests/gateway/test_runner_fatal_adapter.py +++ b/tests/gateway/test_runner_fatal_adapter.py @@ -153,3 +153,42 @@ async def test_concurrent_fatal_notifications_disconnect_same_adapter_once(monke ) assert disconnect_calls == 1 + + +@pytest.mark.asyncio +async def test_stale_fatal_notification_from_superseded_adapter_is_ignored(monkeypatch, tmp_path): + """ + A delayed fatal-error notification from an adapter instance that has + since been replaced by a different, already-installed adapter (e.g. a + background retry chain on the old instance finally giving up after a + reconnect on a new instance already succeeded) must be ignored: it must + not disconnect the new adapter, must not re-queue an already-healthy + platform for reconnection, and must not shut the gateway down. + """ + config = GatewayConfig( + platforms={ + Platform.WHATSAPP: PlatformConfig(enabled=True, token="token") + }, + sessions_dir=tmp_path / "sessions", + ) + runner = GatewayRunner(config) + + old_adapter = _RuntimeRetryableAdapter() + old_adapter._set_fatal_error( + "whatsapp_bridge_exited", + "stale failure from a superseded adapter instance", + retryable=True, + ) + + new_adapter = _RuntimeRetryableAdapter() + new_adapter.disconnect = AsyncMock() + runner.adapters = {Platform.WHATSAPP: new_adapter} + runner.delivery_router.adapters = runner.adapters + runner.stop = AsyncMock() + + await runner._handle_adapter_fatal_error(old_adapter) + + new_adapter.disconnect.assert_not_awaited() + assert runner.adapters[Platform.WHATSAPP] is new_adapter + assert Platform.WHATSAPP not in runner._failed_platforms + runner.stop.assert_not_awaited()