fix(gateway): ignore stale fatal-error notifications from superseded adapters

A delayed fatal-error notification from an adapter instance that has
already been replaced by a successful reconnect (a different adapter
object now owns the platform slot) was still processed: it overwrote
the platform's runtime status back to retrying/fatal and could
re-queue an already-healthy platform for reconnection.

Snapshot the current owner of the platform slot at the top of
_handle_adapter_fatal_error and bail out before any side effect when
the slot is owned by a different, already-installed adapter.
This commit is contained in:
joaomarcos 2026-07-01 00:48:04 -03:00 committed by Teknium
parent a682091044
commit fb8efbb4a8
2 changed files with 56 additions and 1 deletions

View file

@ -3654,6 +3654,23 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
If the error is retryable (e.g. network blip, DNS failure), queue the
platform for background reconnection instead of giving up permanently.
"""
# Snapshot the current owner of this platform slot before doing
# anything else. If it's neither this adapter nor empty, a different
# adapter has already taken over (e.g. this is a delayed notification
# from a background retry chain that raced with, and lost to, a
# reconnect that already succeeded). Acting on a stale notification
# would overwrite an already-healthy platform's runtime status and
# incorrectly re-queue it for reconnection, so bail out before any of
# that happens.
existing = self.adapters.get(adapter.platform)
if existing is not None and existing is not adapter:
logger.debug(
"Ignoring stale fatal error from a superseded %s adapter instance: %s",
adapter.platform.value,
adapter.fatal_error_code or "unknown",
)
return
logger.error(
"Fatal %s adapter error (%s): %s",
adapter.platform.value,
@ -3677,7 +3694,6 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
error_message=adapter.fatal_error_message,
)
existing = self.adapters.get(adapter.platform)
if existing is adapter:
# Claim this adapter for teardown before awaiting disconnect() —
# a second fatal-error notification for the same adapter (e.g.

View file

@ -153,3 +153,42 @@ async def test_concurrent_fatal_notifications_disconnect_same_adapter_once(monke
)
assert disconnect_calls == 1
@pytest.mark.asyncio
async def test_stale_fatal_notification_from_superseded_adapter_is_ignored(monkeypatch, tmp_path):
    """Verify that a fatal error from a superseded adapter is a no-op.

    Scenario: an old adapter instance reports a (retryable) fatal error
    after a different adapter instance has already been installed for the
    same platform. Handling that late notification must leave the
    installed adapter connected and in place, must not mark the platform
    as failed, and must not stop the gateway.
    """
    whatsapp = Platform.WHATSAPP
    runner = GatewayRunner(
        GatewayConfig(
            platforms={whatsapp: PlatformConfig(enabled=True, token="token")},
            sessions_dir=tmp_path / "sessions",
        )
    )

    # Superseded instance: flagged with a retryable fatal error, but it is
    # never installed in runner.adapters.
    stale_adapter = _RuntimeRetryableAdapter()
    stale_adapter._set_fatal_error(
        "whatsapp_bridge_exited",
        "stale failure from a superseded adapter instance",
        retryable=True,
    )

    # Replacement instance: the one registered for the platform. Its
    # disconnect() is mocked so any teardown attempt is observable.
    replacement = _RuntimeRetryableAdapter()
    replacement.disconnect = AsyncMock()
    runner.adapters = {whatsapp: replacement}
    runner.delivery_router.adapters = runner.adapters
    runner.stop = AsyncMock()

    # Deliver the late notification from the superseded instance.
    await runner._handle_adapter_fatal_error(stale_adapter)

    # The installed adapter is untouched, nothing was queued for
    # reconnection, and the gateway keeps running.
    replacement.disconnect.assert_not_awaited()
    assert runner.adapters[whatsapp] is replacement
    assert whatsapp not in runner._failed_platforms
    runner.stop.assert_not_awaited()