fix(patch): preserve file Unicode when unicode_normalized strategy matches

The patch tool's strategy 7 (unicode_normalized) matches ASCII old_string against a file containing real Unicode (em-dashes, smart quotes, ellipsis, non-breaking spaces). Writing new_string verbatim silently replaced the file's Unicode with the LLM's ASCII equivalents. _preserve_unicode_in_replacement() diffs old_string->new_string and applies only the actual edits to the file's original Unicode text, preserving unchanged characters. Salvaged from #50540 by @aj-nt. Only the Unicode-preservation half is carried over; the write_file line-number-strip half was dropped (the existing _looks_like_read_file_line_numbered_content reject guard already covers its target case, and the strip's looser threshold risks silently mutating legitimate pipe-delimited content).
2026-07-01 17:38:28 +05:30 · 2026-07-01 17:38:28 +05:30 · 65a6a36093
commit 65a6a36093
parent 2f167a2b84
2 changed files with 128 additions and 0 deletions
--- a/tests/tools/test_fuzzy_match.py
+++ b/tests/tools/test_fuzzy_match.py
@ -271,6 +271,54 @@ class TestUnicodeNormalized:
        assert count == 1
        assert strategy == "exact"

+    def test_unicode_preserved_in_output(self):
+        """An em-dash outside the edited span is kept in the result."""
+        source = "Hello\u2014world"
+        search = "Hello--world"
+        replacement = "Hello--there"
+        new, hits, used, err = fuzzy_find_and_replace(source, search, replacement)
+        assert hits == 1, f"Expected match, got err={err}"
+        assert used == "unicode_normalized"
+        # Only "world" -> "there" differs, so the em-dash must survive.
+        assert new == "Hello\u2014there", f"Got {new!r}"
+
+    def test_smart_quotes_preserved(self):
+        """Curly quotes stay curly when only the text between them is edited."""
+        source = 'He said \u201chello\u201d to her'
+        search = 'He said "hello" to her'
+        replacement = 'He said "goodbye" to her'
+        new, hits, _strategy, err = fuzzy_find_and_replace(source, search, replacement)
+        assert hits == 1, f"Expected match, got err={err}"
+        assert new == 'He said \u201cgoodbye\u201d to her', f"Got {new!r}"
+
+    def test_ellipsis_preserved(self):
+        """A real ellipsis is retained when the words after it are edited."""
+        source = "Wait for it\u2026and done"
+        search = "Wait for it...and done"
+        replacement = "Wait for it...then done"
+        new, hits, _strategy, err = fuzzy_find_and_replace(source, search, replacement)
+        assert hits == 1, f"Expected match, got err={err}"
+        assert new == "Wait for it\u2026then done", f"Got {new!r}"
+
+    def test_mixed_unicode_multiline(self):
+        """Em-dash and curly quotes on untouched lines survive a later-line edit."""
+        source = 'Line 1 \u2014 with dash\nLine 2 \u201cquoted\u201d text\nLine 3 plain'
+        expected = 'Line 1 \u2014 with dash\nLine 2 \u201cquoted\u201d text\nLine 3 changed'
+        search = 'Line 1 -- with dash\nLine 2 "quoted" text\nLine 3 plain'
+        replacement = 'Line 1 -- with dash\nLine 2 "quoted" text\nLine 3 changed'
+        new, hits, _strategy, err = fuzzy_find_and_replace(source, search, replacement)
+        assert hits == 1, f"Expected match, got err={err}"
+        assert new == expected, f"Got {new!r}"
+
+    def test_no_unicode_no_change(self):
+        """Plain ASCII matches via ``exact``, so Unicode preservation is bypassed."""
+        new, count, strategy, err = fuzzy_find_and_replace(
+            "plain text here", "plain text here", "plain text there"
+        )
+        assert count == 1, f"Expected match, got err={err}"
+        assert strategy == "exact"  # the unicode_normalized path must not run
+        assert new == "plain text there"
+

 class TestBlockAnchorThreshold:
    """Tests for the raised block_anchor threshold (Bug 4)."""
--- a/tools/fuzzy_match.py
+++ b/tools/fuzzy_match.py
@ -134,6 +134,18 @@ def fuzzy_find_and_replace(content: str, old_string: str, new_string: str,
            effective_new = _maybe_unescape_new_string(
                new_string, content, matches,
            )
+            # Unicode-preservation guard: when strategy 7 (unicode_normalized)
+            # matched, the file has Unicode characters (em-dashes, smart quotes,
+            # ellipsis) but old_string/new_string from the LLM are ASCII
+            # equivalents.  Writing new_string verbatim would silently corrupt
+            # the file's Unicode — em-dashes become two hyphens, smart quotes
+            # become straight quotes.  Align the replacement with the file's
+            # actual Unicode so only the LLM's intended changes are applied
+            # and unchanged portions keep their original characters.
+            if strategy_name == "unicode_normalized":
+                effective_new = _preserve_unicode_in_replacement(
+                    content, matches, old_string, effective_new,
+                )
            new_content = _apply_replacements(
                content, matches, effective_new,
                old_string=old_string if strategy_name != "exact" else None,
@ -304,6 +316,74 @@ def _maybe_unescape_new_string(new_string: str,
    return out


+def _preserve_unicode_in_replacement(
+    content: str, matches: List[Tuple[int, int]],
+    old_string: str, new_string: str,
+) -> str:
+    """Rebuild ``new_string`` so unchanged text keeps the file's Unicode.
+
+    Used after a ``unicode_normalized`` match, where the file holds real
+    Unicode (em-dashes, smart quotes, ellipsis, non-breaking spaces) but
+    ``old_string``/``new_string`` carry ASCII equivalents.  Writing
+    ``new_string`` verbatim would replace the file's Unicode with ASCII
+    even where nothing was edited.
+
+    The normalized ``old_string`` is diffed against ``new_string``; spans
+    the diff reports as unchanged are copied from the file's original
+    text, everything else is taken from ``new_string``.  A file character
+    whose normalized form is only partly unchanged (e.g. one hyphen of an
+    em-dash's ``--``) was touched by the edit, so just the surviving
+    ASCII is emitted for it.
+
+    Args:
+        content: Text that ``matches`` index into.
+        matches: ``(start, end)`` offsets into ``content`` of the match(es).
+        old_string: Search text that was matched.
+        new_string: Replacement text.
+
+    Returns:
+        The replacement to write.  ``new_string`` is returned unchanged
+        when the concatenated matched regions do not normalize to the
+        same text as ``old_string``.
+    """
+    file_region = "".join(content[start:end] for start, end in matches)
+    norm_old = _unicode_normalize(old_string)
+    if norm_old != _unicode_normalize(file_region):
+        return new_string
+
+    # bounds[k]:bounds[k + 1] is the normalized-text slice produced by
+    # file_region[k].  Relies on _build_orig_to_norm_map returning
+    # len(file_region) + 1 entries (trailing end sentinel).  Characters can
+    # expand (em-dash -> '--'), so an opcode boundary may fall inside one.
+    bounds = _build_orig_to_norm_map(file_region)
+    region_len = len(file_region)
+
+    result_parts: List[str] = []
+    k = 0  # opcodes are ordered, so a single forward sweep suffices
+    opcodes = SequenceMatcher(None, norm_old, new_string).get_opcodes()
+    for tag, i1, i2, j1, j2 in opcodes:
+        if tag != "equal":
+            # replace/insert supply new text; delete has j1 == j2.
+            result_parts.append(new_string[j1:j2])
+            continue
+        # Skip file characters consumed by earlier replace/delete opcodes.
+        while k < region_len and bounds[k + 1] <= i1:
+            k += 1
+        while k < region_len and bounds[k] < i2:
+            lo, hi = bounds[k], bounds[k + 1]
+            if i1 <= lo and hi <= i2:
+                # Fully unchanged character: keep the file's original form.
+                result_parts.append(file_region[k])
+            else:
+                # Straddles an edit: emit only the unchanged ASCII part.
+                result_parts.append(norm_old[max(lo, i1):min(hi, i2)])
+            if hi > i2:
+                break  # the rest of this character belongs to a later opcode
+            k += 1
+
+    return "".join(result_parts)
+
+
 def _apply_replacements(content: str, matches: List[Tuple[int, int]],
                        new_string: str, old_string: Optional[str] = None) -> str:
    """