fix(patch): preserve file Unicode when unicode_normalized strategy matches

The patch tool's strategy 7 (unicode_normalized) matches ASCII old_string
against a file containing real Unicode (em-dashes, smart quotes, ellipsis,
non-breaking spaces). Writing new_string verbatim silently replaced the
file's Unicode with the LLM's ASCII equivalents.

_preserve_unicode_in_replacement() diffs old_string->new_string and applies
only the actual edits to the file's original Unicode text, preserving
unchanged characters.

Salvaged from #50540 by @aj-nt. Only the Unicode-preservation half is
carried over; the write_file line-number-strip half was dropped (the
existing _looks_like_read_file_line_numbered_content reject guard already
covers its target case, and the strip's looser threshold risks silently
mutating legitimate pipe-delimited content).
This commit is contained in:
AJ 2026-07-01 17:38:28 +05:30 committed by kshitij
parent 2f167a2b84
commit 65a6a36093
2 changed files with 128 additions and 0 deletions

View file

@ -271,6 +271,54 @@ class TestUnicodeNormalized:
assert count == 1
assert strategy == "exact"
def test_unicode_preserved_in_output(self):
    """An em-dash outside the edited span must come through untouched."""
    original = "Hello\u2014world"
    ascii_old = "Hello--world"
    ascii_new = "Hello--there"

    outcome = fuzzy_find_and_replace(original, ascii_old, ascii_new)
    new, match_count, used_strategy, err = outcome

    assert match_count == 1, f"Expected match, got err={err}"
    assert used_strategy == "unicode_normalized"
    # Only "world" -> "there" was requested, so the em-dash has to remain.
    assert new == "Hello\u2014there", f"Got {new!r}"
def test_smart_quotes_preserved(self):
    """Curly quotes stay curly when only the text between them is edited."""
    original = 'He said \u201chello\u201d to her'
    ascii_old = 'He said "hello" to her'
    ascii_new = 'He said "goodbye" to her'

    outcome = fuzzy_find_and_replace(original, ascii_old, ascii_new)
    new, match_count, _used_strategy, err = outcome

    assert match_count == 1, f"Expected match, got err={err}"
    assert new == 'He said \u201cgoodbye\u201d to her', f"Got {new!r}"
def test_ellipsis_preserved(self):
    """A single-character ellipsis is kept when the text after it changes."""
    original = "Wait for it\u2026and done"
    ascii_old = "Wait for it...and done"
    ascii_new = "Wait for it...then done"

    outcome = fuzzy_find_and_replace(original, ascii_old, ascii_new)
    new, match_count, _used_strategy, err = outcome

    assert match_count == 1, f"Expected match, got err={err}"
    assert new == "Wait for it\u2026then done", f"Got {new!r}"
def test_mixed_unicode_multiline(self):
    """Em-dash and curly quotes on untouched lines survive a later-line edit."""
    content = 'Line 1 \u2014 with dash\nLine 2 \u201cquoted\u201d text\nLine 3 plain'
    # The ASCII spelling of the same three lines, then the same block with
    # only the last line altered.
    ascii_before = 'Line 1 -- with dash\nLine 2 "quoted" text\nLine 3 plain'
    ascii_after = 'Line 1 -- with dash\nLine 2 "quoted" text\nLine 3 changed'
    expected = 'Line 1 \u2014 with dash\nLine 2 \u201cquoted\u201d text\nLine 3 changed'

    outcome = fuzzy_find_and_replace(content, ascii_before, ascii_after)
    new, match_count, _used_strategy, err = outcome

    assert match_count == 1, f"Expected match, got err={err}"
    assert new == expected, f"Got {new!r}"
def test_no_unicode_no_change(self):
    """Pure-ASCII content is replaced directly, with nothing to preserve."""
    original = "plain text here"

    outcome = fuzzy_find_and_replace(
        original, "plain text here", "plain text there"
    )
    updated, match_count, _used_strategy, _err = outcome

    assert match_count == 1
    assert updated == "plain text there"
class TestBlockAnchorThreshold:
"""Tests for the raised block_anchor threshold (Bug 4)."""

View file

@ -134,6 +134,18 @@ def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
effective_new = _maybe_unescape_new_string(
new_string, content, matches,
)
# Unicode-preservation guard: when strategy 7 (unicode_normalized)
# matched, the file has Unicode characters (em-dashes, smart quotes,
# ellipsis) but old_string/new_string from the LLM are ASCII
# equivalents. Writing new_string verbatim would silently corrupt
# the file's Unicode — em-dashes become two hyphens, smart quotes
# become straight quotes. Align the replacement with the file's
# actual Unicode so only the LLM's intended changes are applied
# and unchanged portions keep their original characters.
if strategy_name == "unicode_normalized":
effective_new = _preserve_unicode_in_replacement(
content, matches, old_string, effective_new,
)
new_content = _apply_replacements(
content, matches, effective_new,
old_string=old_string if strategy_name != "exact" else None,
@ -304,6 +316,74 @@ def _maybe_unescape_new_string(new_string: str,
return out
def _preserve_unicode_in_replacement(
    content: str, matches: List[Tuple[int, int]],
    old_string: str, new_string: str,
) -> str:
    """Rebuild ``new_string`` so unchanged text keeps the file's Unicode.

    Intended for the ``unicode_normalized`` strategy, where the file holds
    real Unicode (em-dashes, smart quotes, ellipsis, non-breaking spaces)
    while ``old_string``/``new_string`` use ASCII equivalents. Writing
    ``new_string`` verbatim would replace the file's Unicode with ASCII --
    em-dashes become two hyphens, smart quotes become straight quotes.

    The function diffs the normalized ``old_string`` against ``new_string``
    and replays only the real edits onto the file's original text. A file
    character is kept only when its *entire* normalized expansion lies
    inside an unchanged span; if an edit touches part of an expanded
    character (e.g. ``--`` -> ``-`` over an em-dash), the text from
    ``new_string`` is used for that part so the edit is not lost.

    Args:
        content: Full file text that ``matches`` index into.
        matches: ``(start, end)`` slices of ``content`` that were matched.
        old_string: Search text supplied by the caller.
        new_string: Replacement text supplied by the caller.

    Returns:
        The replacement string to write for every match. ``new_string`` is
        returned unchanged when preservation is not applicable: no matches,
        matched regions that differ from one another, or a matched region
        whose normalized form is not equal to the normalized ``old_string``.
    """
    # One replacement string is applied to every match, so the file's text
    # can only be preserved when all matched regions are the same text.
    distinct_regions = {content[start:end] for start, end in matches}
    if len(distinct_regions) != 1:
        return new_string
    file_region = next(iter(distinct_regions))

    # If the normalized forms don't match, the strategy shouldn't have
    # fired -- fall back to direct replacement.
    norm_old = _unicode_normalize(old_string)
    if norm_old != _unicode_normalize(file_region):
        return new_string

    # orig_to_norm[k] is the normalized offset at which original character k
    # starts. Assumes the helper returns len(file_region) + 1 entries (a
    # trailing end sentinel), which the indexing below relies on.
    orig_to_norm = _build_orig_to_norm_map(file_region)
    region_len = len(file_region)

    opcodes = SequenceMatcher(None, norm_old, new_string).get_opcodes()

    parts: List[str] = []
    cursor = 0  # index of the next original character not yet passed
    for tag, i1, i2, j1, j2 in opcodes:
        if tag != "equal":
            # replace/insert contribute the new text; for delete j1 == j2,
            # so the slice is empty and nothing is emitted.
            parts.append(new_string[j1:j2])
            continue

        # Opcodes arrive in increasing order of i1, so a forward-only cursor
        # finds the first original character starting at or after i1.
        while cursor < region_len and orig_to_norm[cursor] < i1:
            cursor += 1

        # Head: the span begins inside an expanded character whose earlier
        # part was edited. Within an equal span norm_old[i1:i2] equals
        # new_string[j1:j2], so take that piece from new_string.
        head_end = min(orig_to_norm[cursor], i2)
        if head_end > i1:
            parts.append(new_string[j1:j1 + (head_end - i1)])

        # Middle: original characters whose whole expansion is unchanged.
        first_whole = cursor
        while (
            cursor < region_len
            and orig_to_norm[cursor] < i2
            and orig_to_norm[cursor + 1] <= i2
        ):
            cursor += 1
        parts.append(file_region[first_whole:cursor])

        # Tail: the span ends inside an expanded character whose later part
        # was edited; again use the text from new_string.
        tail_start = min(orig_to_norm[cursor], i2)
        if tail_start < i2:
            parts.append(new_string[j1 + (tail_start - i1):j2])

    return "".join(parts)
def _apply_replacements(content: str, matches: List[Tuple[int, int]],
new_string: str, old_string: Optional[str] = None) -> str:
"""