fix(gateway): ignore stale fatal-error notifications from superseded adapters
A delayed fatal-error notification from an adapter instance that had already been replaced by a successful reconnect (a different adapter object now owns the platform slot) was still processed. It overwrote the platform's runtime status back to retrying/fatal and could re-queue an already-healthy platform for reconnection. Fix: snapshot the current owner of the platform slot at the top of _handle_adapter_fatal_error and return before any side effect when the slot belongs to a different, already-installed adapter.
This commit is contained in:
parent
a682091044
commit
fb8efbb4a8
2 changed files with 56 additions and 1 deletions
|
|
@ -3654,6 +3654,23 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
If the error is retryable (e.g. network blip, DNS failure), queue the
|
||||
platform for background reconnection instead of giving up permanently.
|
||||
"""
|
||||
# Snapshot the current owner of this platform slot before doing
|
||||
# anything else. If it's neither this adapter nor empty, a different
|
||||
# adapter has already taken over (e.g. this is a delayed notification
|
||||
# from a background retry chain that raced with, and lost to, a
|
||||
# reconnect that already succeeded). Acting on a stale notification
|
||||
# would overwrite an already-healthy platform's runtime status and
|
||||
# incorrectly re-queue it for reconnection, so bail out before any of
|
||||
# that happens.
|
||||
existing = self.adapters.get(adapter.platform)
|
||||
if existing is not None and existing is not adapter:
|
||||
logger.debug(
|
||||
"Ignoring stale fatal error from a superseded %s adapter instance: %s",
|
||||
adapter.platform.value,
|
||||
adapter.fatal_error_code or "unknown",
|
||||
)
|
||||
return
|
||||
|
||||
logger.error(
|
||||
"Fatal %s adapter error (%s): %s",
|
||||
adapter.platform.value,
|
||||
|
|
@ -3677,7 +3694,6 @@ class GatewayRunner(GatewayAuthorizationMixin, GatewayKanbanWatchersMixin, Gatew
|
|||
error_message=adapter.fatal_error_message,
|
||||
)
|
||||
|
||||
existing = self.adapters.get(adapter.platform)
|
||||
if existing is adapter:
|
||||
# Claim this adapter for teardown before awaiting disconnect() —
|
||||
# a second fatal-error notification for the same adapter (e.g.
|
||||
|
|
|
|||
|
|
@ -153,3 +153,42 @@ async def test_concurrent_fatal_notifications_disconnect_same_adapter_once(monke
|
|||
)
|
||||
|
||||
assert disconnect_calls == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_stale_fatal_notification_from_superseded_adapter_is_ignored(monkeypatch, tmp_path):
    """A fatal error reported by a superseded adapter must have no effect.

    Scenario: an old adapter instance reports a (retryable) fatal error
    after a different adapter instance has already been installed in the
    runner for the same platform -- e.g. a background retry chain on the
    old instance finally gives up after a reconnect on a new instance
    already succeeded.

    Expected: the notification is ignored. The installed adapter is not
    disconnected and stays in place, the platform is not queued for
    reconnection, and the gateway is not stopped.
    """
    platform = Platform.WHATSAPP
    runner = GatewayRunner(
        GatewayConfig(
            platforms={platform: PlatformConfig(enabled=True, token="token")},
            sessions_dir=tmp_path / "sessions",
        )
    )

    # The superseded instance: carries a retryable fatal error but is
    # never installed in the runner.
    stale_adapter = _RuntimeRetryableAdapter()
    stale_adapter._set_fatal_error(
        "whatsapp_bridge_exited",
        "stale failure from a superseded adapter instance",
        retryable=True,
    )

    # The instance installed under the WhatsApp slot. Its disconnect() and
    # the runner's stop() are mocked so any call to them can be detected.
    live_adapter = _RuntimeRetryableAdapter()
    live_adapter.disconnect = AsyncMock()
    runner.adapters = {platform: live_adapter}
    runner.delivery_router.adapters = runner.adapters
    runner.stop = AsyncMock()

    await runner._handle_adapter_fatal_error(stale_adapter)

    # The installed adapter is untouched and still installed.
    assert runner.adapters[platform] is live_adapter
    live_adapter.disconnect.assert_not_awaited()
    # No reconnection was queued and the gateway was not shut down.
    assert platform not in runner._failed_platforms
    runner.stop.assert_not_awaited()
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue