From 65a6a3609332501a687c514526cb5c4a27bb027a Mon Sep 17 00:00:00 2001
From: AJ
Date: Wed, 1 Jul 2026 17:38:28 +0530
Subject: [PATCH] fix(patch): preserve file Unicode when unicode_normalized strategy matches

The patch tool's strategy 7 (unicode_normalized) matches ASCII
old_string against a file containing real Unicode (em-dashes, smart
quotes, ellipsis, non-breaking spaces). Writing new_string verbatim
silently replaced the file's Unicode with the LLM's ASCII equivalents.

_preserve_unicode_in_replacement() diffs old_string->new_string and
applies only the actual edits to the file's original Unicode text,
preserving unchanged characters.

An edit that lands inside a multi-character expansion (for example
changing '--' to '-' where the file has an em-dash) is applied
literally: the partially edited character keeps its ASCII spelling
instead of swallowing the edit or duplicating the start of the matched
region. replace_all over identical matched regions also keeps the
file's Unicode.

Salvaged from #50540 by @aj-nt. Only the Unicode-preservation half is
carried over; the write_file line-number-strip half was dropped (the
existing _looks_like_read_file_line_numbered_content reject guard
already covers its target case, and the strip's looser threshold risks
silently mutating legitimate pipe-delimited content).
---
Reviewer note: line breaks and indentation in this patch were rebuilt
from a whitespace-collapsed copy, and the index hashes predate the
review changes. If context lines do not match exactly, apply with
`git apply --ignore-whitespace` (or `git am --ignore-whitespace`).

 tests/tools/test_fuzzy_match.py | 57 +++++++++++++++++++++
 tools/fuzzy_match.py            | 89 +++++++++++++++++++++++++++++++++
 2 files changed, 146 insertions(+)

diff --git a/tests/tools/test_fuzzy_match.py b/tests/tools/test_fuzzy_match.py
index 7a3177002..76250569b 100644
--- a/tests/tools/test_fuzzy_match.py
+++ b/tests/tools/test_fuzzy_match.py
@@ -271,6 +271,63 @@ class TestUnicodeNormalized:
         assert count == 1
         assert strategy == "exact"
 
+    def test_unicode_preserved_in_output(self):
+        """Unicode characters in unchanged portions survive the replacement."""
+        content = "Hello\u2014world"
+        new, count, strategy, err = fuzzy_find_and_replace(
+            content, "Hello--world", "Hello--there"
+        )
+        assert count == 1, f"Expected match, got err={err}"
+        assert strategy == "unicode_normalized"
+        # The em-dash should be preserved; only "world" → "there" should change
+        assert new == "Hello\u2014there", f"Got {new!r}"
+
+    def test_smart_quotes_preserved(self):
+        """Smart quotes survive when only the quoted text changes."""
+        content = 'He said \u201chello\u201d to her'
+        new, count, strategy, err = fuzzy_find_and_replace(
+            content, 'He said "hello" to her', 'He said "goodbye" to her'
+        )
+        assert count == 1, f"Expected match, got err={err}"
+        assert new == 'He said \u201cgoodbye\u201d to her', f"Got {new!r}"
+
+    def test_ellipsis_preserved(self):
+        """Ellipsis survives when surrounding text changes."""
+        content = "Wait for it\u2026and done"
+        new, count, strategy, err = fuzzy_find_and_replace(
+            content, "Wait for it...and done", "Wait for it...then done"
+        )
+        assert count == 1, f"Expected match, got err={err}"
+        assert new == "Wait for it\u2026then done", f"Got {new!r}"
+
+    def test_mixed_unicode_multiline(self):
+        """Multiple Unicode types in a multi-line block all survive."""
+        content = 'Line 1 \u2014 with dash\nLine 2 \u201cquoted\u201d text\nLine 3 plain'
+        old = 'Line 1 -- with dash\nLine 2 "quoted" text\nLine 3 plain'
+        new_str = 'Line 1 -- with dash\nLine 2 "quoted" text\nLine 3 changed'
+        new, count, strategy, err = fuzzy_find_and_replace(content, old, new_str)
+        assert count == 1, f"Expected match, got err={err}"
+        expected = 'Line 1 \u2014 with dash\nLine 2 \u201cquoted\u201d text\nLine 3 changed'
+        assert new == expected, f"Got {new!r}"
+
+    def test_edit_inside_expanded_character_is_applied(self):
+        """Shortening '--' where the file has an em-dash is not swallowed."""
+        content = "x\u2014y \u201cz\u201d"
+        new, count, strategy, err = fuzzy_find_and_replace(
+            content, 'x--y "z"', 'x-y "z"'
+        )
+        assert count == 1, f"Expected match, got err={err}"
+        assert new == "x-y \u201cz\u201d", f"Got {new!r}"
+
+    def test_no_unicode_no_change(self):
+        """When file has no Unicode, replacement is direct (no-op guard)."""
+        content = "plain text here"
+        new, count, strategy, err = fuzzy_find_and_replace(
+            content, "plain text here", "plain text there"
+        )
+        assert count == 1
+        assert new == "plain text there"
+
 
 class TestBlockAnchorThreshold:
     """Tests for the raised block_anchor threshold (Bug 4)."""
diff --git a/tools/fuzzy_match.py b/tools/fuzzy_match.py
index be4fec05c..2865411bf 100644
--- a/tools/fuzzy_match.py
+++ b/tools/fuzzy_match.py
@@ -134,6 +134,18 @@ def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
             effective_new = _maybe_unescape_new_string(
                 new_string, content, matches,
             )
+            # Unicode-preservation guard: when strategy 7 (unicode_normalized)
+            # matched, the file has Unicode characters (em-dashes, smart quotes,
+            # ellipsis) but old_string/new_string from the LLM are ASCII
+            # equivalents. Writing new_string verbatim would silently corrupt
+            # the file's Unicode — em-dashes become two hyphens, smart quotes
+            # become straight quotes. Align the replacement with the file's
+            # actual Unicode so only the LLM's intended changes are applied
+            # and unchanged portions keep their original characters.
+            if strategy_name == "unicode_normalized":
+                effective_new = _preserve_unicode_in_replacement(
+                    content, matches, old_string, effective_new,
+                )
             new_content = _apply_replacements(
                 content, matches, effective_new,
                 old_string=old_string if strategy_name != "exact" else None,
@@ -304,6 +316,83 @@ def _maybe_unescape_new_string(new_string: str,
     return out
 
 
+def _preserve_unicode_in_replacement(
+    content: str, matches: List[Tuple[int, int]],
+    old_string: str, new_string: str,
+) -> str:
+    """Return ``new_string`` with the file's own Unicode kept where unedited.
+
+    The unicode_normalized strategy lets an ASCII ``old_string`` match file
+    text containing typographic characters (em-dashes, smart quotes,
+    ellipsis, non-breaking spaces).  Writing ``new_string`` verbatim would
+    replace those characters with the LLM's ASCII spellings even in spans
+    the edit never touched.
+
+    The normalized ``old_string`` is diffed against ``new_string``.  Spans
+    the diff reports as unchanged are copied from the file; everything else
+    comes from ``new_string``.  When an edit cuts through a character that
+    normalizes to several (em-dash -> '--', ellipsis -> '...'), the
+    surviving fragment keeps its ASCII spelling so the edit is not lost.
+
+    Args:
+        content: Full text of the file being patched.
+        matches: ``(start, end)`` offsets into ``content``, one per match.
+        old_string: Search text supplied by the caller.
+        new_string: Replacement text supplied by the caller.
+
+    Returns:
+        The replacement text to write for every match.  ``new_string`` is
+        returned unchanged when the matched regions differ from one another
+        or do not normalize to the normalized ``old_string``.
+    """
+    from bisect import bisect_left, bisect_right
+
+    # One replacement string is written to every match, so file text can
+    # only be reused when all matched regions are identical.
+    regions = {content[start:end] for start, end in matches}
+    if len(regions) != 1:
+        return new_string
+    file_region = next(iter(regions))
+
+    # If the normalized forms don't match, the strategy shouldn't have
+    # fired — fall back to direct replacement.
+    norm_old = _unicode_normalize(old_string)
+    if norm_old != _unicode_normalize(file_region):
+        return new_string
+
+    # boundaries[k] is the normalized offset where file_region[k] starts,
+    # with one trailing entry for the normalized length.  UNICODE_MAP
+    # replacements can expand characters, so normalized offsets do not
+    # map 1:1 onto file offsets.
+    boundaries = _build_orig_to_norm_map(file_region)
+
+    # Diff norm_old -> new_string to find the actual edits.
+    opcodes = SequenceMatcher(None, norm_old, new_string).get_opcodes()
+
+    result_parts: List[str] = []
+    for tag, i1, i2, j1, j2 in opcodes:
+        if tag == "delete":
+            continue
+        if tag != "equal":
+            # "replace" and "insert" both contribute new_string's text.
+            result_parts.append(new_string[j1:j2])
+            continue
+        # File characters whose whole expansion lies inside [i1, i2).
+        first = bisect_left(boundaries, i1)
+        last = bisect_right(boundaries, i2) - 1
+        if first > last:
+            # The span sits strictly inside one expanded character.
+            result_parts.append(new_string[j1:j2])
+            continue
+        # An expansion cut by either edge of the span was partly edited:
+        # emit that fragment in its normalized spelling.
+        result_parts.append(norm_old[i1:boundaries[first]])
+        result_parts.append(file_region[first:last])
+        result_parts.append(norm_old[boundaries[last]:i2])
+
+    return "".join(result_parts)
+
+
 def _apply_replacements(content: str, matches: List[Tuple[int, int]],
                         new_string: str, old_string: Optional[str] = None) -> str:
     """