fix(tools): stop _strategy_exact emitting overlapping matches (#56211)

_strategy_exact advanced its scan cursor to pos+1 (one character) instead of
pos+len(pattern), so self-overlapping patterns (e.g. "aa" in "aaaa")
matched at overlapping offsets. _apply_replacements works in reverse
order, so the second replacement operated on already-modified content
using stale offsets — corrupting the file and reporting the wrong count
under replace_all=True. Advancing by len(pattern) matches str.replace()
semantics.
This commit is contained in:
Teknium 2026-07-01 02:13:13 -07:00 committed by GitHub
parent ea533e7f41
commit d57a4c197c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 39 additions and 1 deletions

View file

@ -207,6 +207,39 @@ class TestReplaceAll:
assert count == 2
assert new == "ccc bbb ccc"
def test_self_overlapping_pattern_non_overlapping_matches(self):
    """Matches of a self-overlapping pattern must never share characters.

    Regression guard: exact matching used to move its scan cursor forward
    by a single character rather than by len(pattern), so "aa" in "aaaa"
    was found at offsets 0, 1 and 2 (overlapping) instead of 0 and 2.
    Replacements are applied in reverse order, so those stale offsets
    corrupted the result. The expected behaviour mirrors str.replace().
    """
    # (content, pattern, expected replacement count, expected result)
    replace_all_cases = (
        # 2 non-overlapping matches, not 3 overlapping ones
        ("aaaa", "aa", 2, "bb"),
        # a single-char pattern still counts every occurrence
        ("aaa", "a", 3, "bbb"),
        # text around the matches is left untouched
        ("prefix aaaa suffix", "aa", 2, "prefix bb suffix"),
    )
    for content, pattern, expected_count, expected_text in replace_all_cases:
        result, n_replaced, _, error = fuzzy_find_and_replace(
            content, pattern, "b", replace_all=True
        )
        assert error is None
        assert n_replaced == expected_count
        assert result == expected_text

    # Without replace_all the call is rejected as ambiguous, and the error
    # message reports the non-overlapping match count (2, not 3).
    _, n_replaced, _, error = fuzzy_find_and_replace(
        "aaaa", "aa", "b", replace_all=False
    )
    assert n_replaced == 0
    assert "2 matches" in error
class TestUnicodeNormalized:
"""Tests for the unicode_normalized strategy (Bug 5)."""

View file

@ -349,7 +349,12 @@ def _strategy_exact(content: str, pattern: str) -> List[Tuple[int, int]]:
if pos == -1:
break
matches.append((pos, pos + len(pattern)))
start = pos + 1
# Advance past the whole match, not just one char, so self-overlapping
# patterns (e.g. "aa" in "aaaa") produce non-overlapping spans matching
# str.replace() semantics. Advancing by 1 yielded overlapping matches
# that corrupted the file under replace_all=True (reverse-order apply on
# stale offsets).
# NOTE(review): this assumes pattern is non-empty; with len(pattern) == 0
# the cursor would not advance. Confirm callers reject empty patterns.
start = pos + len(pattern)
return matches