fix(ci): make parallel runner's exit-4 retry robust for newly-added test files (#42994)

The per-file test runner re-runs a file once when pytest exits 4 ("file or directory not found") while the file exists on disk — a transient seen on loaded shared CI runners where the planner collects a file (--collect-only counts its tests) but the per-file subprocess fails to stat it moments later. A single immediate retry could land in the same brief high-load window and fail again, and the retry was gated on one Path.exists() check that can itself be a flaky stat under that load — so a freshly-added test file that LPT pins to one shard would deterministically red that shard on every run (no actual test failure; the file just never executes). - Extract the subprocess spawn/communicate/process-tree-kill logic into a shared _spawn_pytest_once() helper (removes ~90 lines of duplication between the primary run and the retry). - Replace the single-shot retry with a bounded backoff loop (_EXIT4_RETRY_ATTEMPTS, escalating sleep) that re-runs while the file is present on disk. - Add _file_present() which re-checks existence across a few spaced stats, so a single flaky negative stat doesn't wrongly conclude the file is missing. A genuinely-missing file (typo/deleted) still fails fast — exit 4 is not swallowed when the file truly does not exist. - Tests: transient-then-pass recovery, genuinely-missing fails fast with no retry, give-up after max attempts, and _file_present transient/missing cases.
2026-06-09 21:39:09 -07:00 · 2026-06-09 21:39:09 -07:00 · f082b4ec5c
commit f082b4ec5c
parent 6b330522e1
2 changed files with 220 additions and 96 deletions
--- a/scripts/run_tests_parallel.py
+++ b/scripts/run_tests_parallel.py
@ -246,6 +246,98 @@ def _kill_tree(proc: "subprocess.Popen", pgid: int | None = None) -> None:
        pass


+def _spawn_pytest_once(
+    cmd: List[str],
+    repo_root: Path,
+    file_timeout: float,
+    *,
+    timeout_note: str = "per-file timeout",
+) -> Tuple[int, str]:
+    """Run one ``pytest`` subprocess to completion and return ``(rc, output)``.
+
+    Spawns the child in its own process group / session so a hung file and
+    its grandchildren (uvicorn servers, async runtimes, etc.) can be SIGKILL'd
+    as a tree on timeout rather than orphaning onto PID 1. Shared by the
+    primary per-file run and the exit-4 retry loop so the lifecycle/cleanup
+    logic lives in exactly one place.
+    """
+    proc = subprocess.Popen(
+        cmd,
+        cwd=repo_root,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.STDOUT,
+        text=True,
+        # POSIX: place the child at the head of its own process group so
+        # _kill_tree can SIGKILL the group atomically.
+        # Windows: this maps to CREATE_NEW_PROCESS_GROUP in CPython 3.12+;
+        # _kill_tree handles the Windows path via taskkill /F /T.
+        start_new_session=True,
+    )
+
+    # Capture the pgid NOW, before the leader can exit and be reaped. Once
+    # the leader is reaped, os.getpgid(proc.pid) raises ProcessLookupError
+    # even though grandchildren in that group are still alive — defeating
+    # the whole cleanup. None on Windows where the pgid concept doesn't apply.
+    pgid: int | None = None
+    if sys.platform != "win32":
+        try:
+            pgid = os.getpgid(proc.pid)
+        except (ProcessLookupError, PermissionError):
+            pgid = None
+
+    try:
+        output, _ = proc.communicate(timeout=file_timeout)
+        rc = proc.returncode
+    except subprocess.TimeoutExpired:
+        _kill_tree(proc, pgid=pgid)
+        try:
+            output, _ = proc.communicate(timeout=10)
+        except subprocess.TimeoutExpired:
+            output = "(file timeout exceeded; output unavailable)"
+        rc = 124  # de facto convention for "killed by timeout".
+        output = (
+            f"({timeout_note}: {file_timeout:.0f}s exceeded; "
+            f"process tree SIGKILL'd)\n{output}"
+        )
+    except BaseException:
+        # KeyboardInterrupt / runner crash — make sure no zombie
+        # grandchildren outlive us.
+        _kill_tree(proc, pgid=pgid)
+        raise
+    else:
+        # Happy path: pytest exited on its own. Kill the group anyway in
+        # case it left grandchildren behind; already-dead is a no-op.
+        _kill_tree(proc, pgid=pgid)
+
+    return rc, output
+
+
+# How many times to re-run a file that exits 4 ("file or directory not found")
+# while the file demonstrably exists on disk. On loaded shared CI runners the
+# planner can enumerate a file (tests counted via --collect-only) but the
+# per-file subprocess fail to stat it moments later — and a SINGLE immediate
+# retry can land in the same brief high-load window and fail again. We retry a
+# few times with a short backoff so transient I/O pressure has time to settle.
+_EXIT4_RETRY_ATTEMPTS = 3
+_EXIT4_RETRY_BACKOFF_SECONDS = 0.5
+
+
+def _file_present(file: Path, *, attempts: int = 3, delay: float = 0.2) -> bool:
+    """Return True if ``file`` exists, re-checking a few times.
+
+    ``Path.exists()`` itself issues a ``stat`` that can transiently fail under
+    the same load that makes pytest report "file or directory not found", so a
+    single negative check is not authoritative. Only conclude the file is
+    genuinely missing if it's absent across several spaced checks.
+    """
+    for i in range(attempts):
+        if file.exists():
+            return True
+        if i < attempts - 1:
+            time.sleep(delay)
+    return False
+
+
 def _run_one_file(
    file: Path,
    pytest_args: List[str],
@ -280,104 +372,28 @@ def _run_one_file(
    """
    cmd = [sys.executable, "-m", "pytest", str(file), *pytest_args]
    subproc_start = time.monotonic()
-    proc = subprocess.Popen(
-        cmd,
-        cwd=repo_root,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        text=True,
-        # POSIX: place the child at the head of its own process group so
-        # _kill_tree can SIGKILL the group atomically.
-        # Windows: this maps to CREATE_NEW_PROCESS_GROUP in CPython 3.12+;
-        # _kill_tree handles the Windows path via taskkill /F /T.
-        start_new_session=True,
-    )
+    rc, output = _spawn_pytest_once(cmd, repo_root, file_timeout)

-    # Capture the pgid NOW, before the leader can exit and be reaped.
-    # Once the leader is reaped, os.getpgid(proc.pid) raises
-    # ProcessLookupError even though grandchildren in that group are
-    # still alive — defeating the whole cleanup. None on Windows where
-    # the pgid concept doesn't apply (taskkill walks ppid chain instead).
-    pgid: int | None = None
-    if sys.platform != "win32":
-        try:
-            pgid = os.getpgid(proc.pid)
-        except (ProcessLookupError, PermissionError):
-            # Astonishingly fast child? Already dead. _kill_tree's
-            # fallback will handle this case as a no-op.
-            pgid = None
-
-    try:
-        output, _ = proc.communicate(timeout=file_timeout)
-        rc = proc.returncode
-    except subprocess.TimeoutExpired:
-        _kill_tree(proc, pgid=pgid)
-        # Drain whatever the child wrote before we killed it so we have
-        # something to surface in the failure dump.
-        try:
-            output, _ = proc.communicate(timeout=10)
-        except subprocess.TimeoutExpired:
-            output = "(file timeout exceeded; output unavailable)"
-        rc = 124  # de facto convention for "killed by timeout".
-        output = (
-            f"(per-file timeout: {file_timeout:.0f}s exceeded; "
-            f"process tree SIGKILL'd)\n{output}"
+    # pytest exit 4 = "file or directory not found" at exec time. On loaded
+    # shared CI runners we have seen the planner enumerate a file (its tests
+    # counted via --collect-only) but the per-file subprocess fail to stat it
+    # moments later — a transient the deterministic LPT slicer otherwise
+    # reproduces on every rerun (same file set → same shard). Re-run the file a
+    # few times with a short backoff so the I/O pressure has time to settle,
+    # but ONLY while the file demonstrably exists on disk. A single immediate
+    # retry (the old behaviour) could land in the same brief high-load window
+    # and fail again; a single Path.exists() check could itself be a flaky stat
+    # under that load, so we re-check existence across spaced attempts.
+    # We do NOT widen the exit-5 rule: exit 4 on a file that genuinely does not
+    # exist must still fail.
+    attempt = 0
+    while rc == 4 and attempt < _EXIT4_RETRY_ATTEMPTS and _file_present(file):
+        attempt += 1
+        time.sleep(_EXIT4_RETRY_BACKOFF_SECONDS * attempt)
+        rc, output = _spawn_pytest_once(
+            cmd, repo_root, file_timeout,
+            timeout_note=f"per-file timeout on exit-4 retry {attempt}",
        )
-    except BaseException:
-        # KeyboardInterrupt / runner crash — make sure no zombie
-        # grandchildren outlive us.
-        _kill_tree(proc, pgid=pgid)
-        raise
-    else:
-        # Happy path: pytest exited on its own. The child process already
-        # cleaned up its grandchildren if it's well-behaved, but
-        # well-behaved is not universal — kill the group anyway. Already-
-        # dead processes are a no-op.
-        _kill_tree(proc, pgid=pgid)
-
-    if rc == 4 and Path(file).exists():
-        # pytest exit 4 = "file or directory not found" at exec time, yet the
-        # file is present on disk now. On loaded shared CI runners we have seen
-        # the planner enumerate a file (its tests counted via --collect-only)
-        # but the per-file subprocess fail to stat it moments later — a
-        # transient the deterministic LPT slicer otherwise reproduces on every
-        # rerun (same file set → same shard). Retry the file ONCE before
-        # surfacing it as a hard failure. We do NOT widen the exit-5 rule:
-        # exit 4 on a file that genuinely does not exist must still fail.
-        retry_proc = subprocess.Popen(
-            cmd,
-            cwd=repo_root,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            text=True,
-            start_new_session=True,
-        )
-        retry_pgid: int | None = None
-        if sys.platform != "win32":
-            try:
-                retry_pgid = os.getpgid(retry_proc.pid)
-            except (ProcessLookupError, PermissionError):
-                retry_pgid = None
-        try:
-            retry_output, _ = retry_proc.communicate(timeout=file_timeout)
-            retry_rc = retry_proc.returncode
-        except subprocess.TimeoutExpired:
-            _kill_tree(retry_proc, pgid=retry_pgid)
-            try:
-                retry_output, _ = retry_proc.communicate(timeout=10)
-            except subprocess.TimeoutExpired:
-                retry_output = "(file timeout exceeded on retry; output unavailable)"
-            retry_rc = 124
-            retry_output = (
-                f"(per-file timeout on exit-4 retry: {file_timeout:.0f}s exceeded; "
-                f"process tree SIGKILL'd)\n{retry_output}"
-            )
-        except BaseException:
-            _kill_tree(retry_proc, pgid=retry_pgid)
-            raise
-        else:
-            _kill_tree(retry_proc, pgid=retry_pgid)
-        rc, output = retry_rc, retry_output

    if rc == 5:
        # No tests collected — every test in the file was filtered out.