Rewrite encoding detection as inverted UTF-8 validator (#7213)

pstreef · web-flow · commit bdb64b50e976 · 2026-04-01T08:53:58.000+02:00
* Fix EncodingDetectingInputStream not rejecting bytes 0xF8-0xFF as non-UTF-8

Bytes in the range 0xF8-0xFF are never valid in any position of a UTF-8
sequence, but the detection logic had no branch for them — they fell
through without setting charset to Windows-1252. This caused ISO-8859-1
files containing characters like ü (0xFC) to be incorrectly detected as
UTF-8, leading to `mod git apply` failures when patch bytes didn't match
file bytes.

Also reject 0xC0 and 0xC1 as UTF-8 lead bytes — these would encode
code points below U+0080 (overlong encodings forbidden by RFC 3629).

* Rewrite guessCharset as inverted UTF-8 validator

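Replace the enumeration-of-bad-bytes approach with a single
remainingContinuationBytes counter. Any byte not explicitly valid
in its position is rejected as non-UTF-8, which eliminates an
entire class of detection gaps by default. Continuation bytes are
checked only against 0x80-0xBF; the lead-specific second-byte limits
of RFC 3629 (E0 A0-BF, ED 80-9F, F0 90-BF, F4 80-8F) are not enforced.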

Also tightens the 4-byte lead range to 0xF0-0xF4 (0xF5-0xF7
would encode code points above U+10FFFF).

* Simplify encoding detection tests with @ParameterizedTest

Use CsvSource-based parameterized tests for byte-level validation,
hex-encoded byte arrays for precise control over test inputs, and
consolidate duplicated test patterns.
diff --git a/rewrite-core/src/main/java/org/openrewrite/internal/EncodingDetectingInputStream.java b/rewrite-core/src/main/java/org/openrewrite/internal/EncodingDetectingInputStream.java
@@ -37,14 +37,10 @@ public class EncodingDetectingInputStream extends InputStream {
     private boolean charsetBomMarked;
 
     /**
-     * Last byte read
+     * Number of UTF-8 continuation bytes (0x80-0xBF) still expected
+     * to complete the current multi-byte sequence. Zero when idle.
      */
-    private int prev;
-    private int prev2;
-
-    boolean maybeTwoByteSequence = false;
-    boolean maybeThreeByteSequence = false;
-    boolean maybeFourByteSequence = false;
+    int remainingContinuationBytes = 0;
 
     public EncodingDetectingInputStream(InputStream inputStream) {
         this(inputStream, null);
@@ -84,7 +80,7 @@ public int read() throws IOException {
         // if we haven't yet determined a charset...
         if (read == -1) {
             if (charset == null) {
-                if (maybeTwoByteSequence || maybeThreeByteSequence || maybeFourByteSequence) {
+                if (remainingContinuationBytes > 0) {
                     charset = WINDOWS_1252;
                 } else {
                     charset = StandardCharsets.UTF_8;
@@ -116,43 +112,25 @@ public int read(byte[] b, int off, int len) throws IOException {
     }
 
     private void guessCharset(int aByte) {
-        if (utf8TwoByteSequence(aByte)) {
-            maybeTwoByteSequence = true;
-        } else if (utf8ThreeByteSequence(aByte)) {
-            maybeThreeByteSequence = true;
-        } else if (utf8FourByteSequence(aByte)) {
-            maybeFourByteSequence = true;
-        } else if (maybeTwoByteSequence) {
-            if (!utf8SequenceEnd(aByte)) {
-                charset = WINDOWS_1252;
+        if (remainingContinuationBytes > 0) {
+            if (aByte >= 0x80 && aByte <= 0xBF) {
+                remainingContinuationBytes--;
             } else {
-                maybeTwoByteSequence = false;
-                prev = -1;
-            }
-        } else if (maybeThreeByteSequence) {
-            if (!utf8SequenceEnd(aByte)) {
                 charset = WINDOWS_1252;
             }
-
-            if (utf8SequenceEnd(prev) && utf8SequenceEnd(aByte)) {
-                maybeThreeByteSequence = false;
-                prev = -1;
-            }
-        } else if (maybeFourByteSequence) {
-            if (utf8SequenceEnd(prev2) && utf8SequenceEnd(prev) && !utf8SequenceEnd(aByte) || utf8SequenceEnd(prev) && !utf8SequenceEnd(aByte) || !utf8SequenceEnd(aByte)) {
-                charset = WINDOWS_1252;
-            }
-
-            if (utf8SequenceEnd(prev2) && utf8SequenceEnd(prev) && utf8SequenceEnd(aByte)) {
-                maybeFourByteSequence = false;
-                prev = -1;
-            }
-        } else if (utf8SequenceEnd(aByte)) {
+        } else if (aByte <= 0x7F) {
+            // ASCII — valid, nothing to track
+        } else if (aByte >= 0xC2 && aByte <= 0xDF) {
+            remainingContinuationBytes = 1;
+        } else if (aByte >= 0xE0 && aByte <= 0xEF) {
+            remainingContinuationBytes = 2;
+        } else if (aByte >= 0xF0 && aByte <= 0xF4) {
+            remainingContinuationBytes = 3;
+        } else {
+            // 0x80-0xBF (bare continuation), 0xC0-0xC1 (overlong lead),
+            // 0xF5-0xF7 (beyond U+10FFFF), 0xF8-0xFF (never valid) — all invalid UTF-8
             charset = WINDOWS_1252;
         }
-
-        prev2 = prev;
-        prev = aByte;
     }
 
     public String readFully() {
@@ -197,25 +175,7 @@ private int checkAndSkipUtf8Bom() throws IOException {
         return -2;
     }
 
-    // The first byte of a UTF-8 two byte sequence is between 0xC0 - 0xDF.
-    private boolean utf8TwoByteSequence(int b) {
-        return 0xC0 <= b && b <= 0xDF;
-    }
-
-    // The first byte of a UTF-8 three byte sequence is between 0xE0 - 0xEF.
-    private boolean utf8ThreeByteSequence(int b) {
-        return 0xE0 <= b && b <= 0xEF;
-    }
-
-    // The first byte of a UTF-8 four byte sequence is between 0xF0 - 0xF7.
-    private boolean utf8FourByteSequence(int b) {
-        return 0xF0 <= b && b <= 0xF7;
-    }
 
-    // A UTF-8 byte sequence must end between 0x80 - 0xBF.
-    private boolean utf8SequenceEnd(int b) {
-        return 0x80 <= b && b <= 0xBF;
-    }
 
     @Override
     public void close() throws IOException {
diff --git a/rewrite-core/src/test/java/org/openrewrite/internal/EncodingDetectingInputStreamTest.java b/rewrite-core/src/test/java/org/openrewrite/internal/EncodingDetectingInputStreamTest.java
@@ -16,10 +16,11 @@
 package org.openrewrite.internal;
 
 import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
 
 import java.io.ByteArrayInputStream;
 import java.nio.charset.Charset;
-import java.util.List;
 
 import static java.nio.charset.StandardCharsets.ISO_8859_1;
 import static java.nio.charset.StandardCharsets.UTF_8;
@@ -74,34 +75,59 @@ void skipUTF8BomKnownEncoding() throws Exception {
         }
     }
 
-    @Test
-    void isUtf8() throws Exception {
-        List<String> accents = List.of("Café", "Lýðræðisríki");
-        for (String accent : accents) {
-            try (EncodingDetectingInputStream is = read(accent, UTF_8)) {
-                assertThat(is.getCharset()).isEqualTo(UTF_8);
-            }
+    @ParameterizedTest
+    @CsvSource({
+            "Café,           UTF-8,        UTF-8",
+            "Lyðræðisríki,   UTF-8,        UTF-8",
+            "ࠀ,              UTF-8,        UTF-8",
+            "世,              UTF-8,        UTF-8",
+            "가,              UTF-8,        UTF-8",
+            "Hello 世界,      UTF-8,        UTF-8",
+            "café 世界 🌟,    UTF-8,        UTF-8",
+            "Café,           Windows-1252, Windows-1252",
+            "Lyðræðisríki,   Windows-1252, Windows-1252",
+    })
+    void detectsCharsetForEncodedStrings(String text, String sourceCharset, String expectedCharset) throws Exception {
+        try (EncodingDetectingInputStream is = read(text, Charset.forName(sourceCharset))) {
+            assertThat(is.getCharset()).isEqualTo(Charset.forName(expectedCharset));
         }
     }
 
-    @Test
-    void isWindows1252() throws Exception {
-        List<String> accents = List.of("Café", "Lýðræðisríki");
-        for (String accent : accents) {
-            try (EncodingDetectingInputStream is = read(accent, WINDOWS_1252)) {
-                assertThat(is.getCharset()).isEqualTo(WINDOWS_1252);
-            }
-        }
+    @ParameterizedTest
+    @CsvSource({
+            "48 65 6C 6C 6F C3 BC,           valid 2-byte sequence (Helloü)",
+            "E4 B8 96,                         valid 3-byte sequence (世)",
+            "F0 9F 98 80,                      valid 4-byte sequence (😀)",
+            "63 61 66 C3 A9 20 E4 B8 96 20 F0 9F 8C 9F, mixed 2/3/4-byte sequences",
+    })
+    void detectsUtf8ForValidByteSequences(String hex, String description) {
+        byte[] bytes = parseHex(hex);
+        EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(bytes));
+        is.readFully();
+        assertThat(is.getCharset()).as(description).isEqualTo(UTF_8);
     }
 
-    @Test
-    void oddPairInWindows1252() throws Exception {
-        // Range 1: 0xC0 - 0xDF == "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß"
-        // Range 2: 0x80 - 0xBF == "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”·–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿"
-        // A character in range 1 followed by a character in range 2 encoded in Windows-1252 will be detected as UTF-8.
-        try (EncodingDetectingInputStream is = read("À€", WINDOWS_1252)) {
-            assertThat(is.getCharset()).isEqualTo(UTF_8);
-        }
+    @ParameterizedTest
+    @CsvSource({
+            "48 65 6C 6C 6F FC,               byte above 0xF7 (ü in ISO-8859-1)",
+            "74 65 73 74 FC 20 E4 20 F6,       multiple high bytes",
+            "C0 80,                            overlong 0xC0 lead byte",
+            "43 61 66 C0,                      overlong 0xC0 at end",
+            "53 61 6F 20 50 61 75 6C 6F C1,    overlong 0xC1 at end",
+            "74 65 73 74 C0 A3,                overlong 0xC0 + continuation byte",
+            "48 65 6C 6C 6F 80,                bare continuation byte",
+            "C3 BC 20 61 6E 64 20 FC,          valid UTF-8 then invalid byte",
+            "E4 B8 96 20 FE,                   invalid byte after valid 3-byte",
+            "61 62 E4 B8,                      truncated 3-byte sequence",
+            "61 F0 9F 98,                      truncated 4-byte sequence",
+            "78 E4 B8 7A,                      broken continuation in 3-byte",
+            "F0 9F 98 78,                      broken continuation in 4-byte",
+    })
+    void detectsWindows1252ForInvalidUtf8Bytes(String hex, String description) {
+        byte[] bytes = parseHex(hex);
+        EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(bytes));
+        is.readFully();
+        assertThat(is.getCharset()).as(description).isEqualTo(WINDOWS_1252);
     }
 
     @Test
@@ -128,7 +154,6 @@ void windows1252SpecialCharacters() throws Exception {
     @Test
     void iso88591() {
         for (int i = 0; i < 255; i++) {
-            // Skip control characters in ISO-8859-1
             if (!(i >= 128 && i <= 159)) {
                 String s = Character.toString((char) i);
                 byte[] win = s.getBytes(WINDOWS_1252);
@@ -138,6 +163,24 @@ void iso88591() {
         }
     }
 
+    @Test
+    void readFullyDecodesIso8859Correctly() {
+        byte[] bytes = new byte[]{0x48, (byte) 0xFC, 0x74, 0x74, 0x65};
+        EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(bytes));
+        String result = is.readFully();
+        assertThat(is.getCharset()).isEqualTo(WINDOWS_1252);
+        assertThat(result).isEqualTo("Hütte");
+    }
+
+    private static byte[] parseHex(String hex) {
+        String[] parts = hex.trim().split("\\s+");
+        byte[] bytes = new byte[parts.length];
+        for (int i = 0; i < parts.length; i++) {
+            bytes[i] = (byte) Integer.parseInt(parts[i], 16);
+        }
+        return bytes;
+    }
+
     private EncodingDetectingInputStream read(String s, Charset charset) {
         EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(s.getBytes(charset)));
         is.readFully();