Skip to content

Commit bdb64b5

Browse files
authored
Rewrite encoding detection as inverted UTF-8 validator (#7213)
* Fix EncodingDetectingInputStream missing bytes 0xF8-0xFF as non-UTF-8

  Bytes in the range 0xF8-0xFF are never valid in any position of a UTF-8 sequence, but the detection logic had no branch for them — they fell through without setting charset to Windows-1252. This caused ISO-8859-1 files containing characters like ü (0xFC) to be incorrectly detected as UTF-8, leading to `mod git apply` failures when patch bytes didn't match file bytes.

  Also reject 0xC0 and 0xC1 as UTF-8 lead bytes — these would encode code points below U+0080 (overlong encodings forbidden by RFC 3629).

* Rewrite guessCharset as inverted UTF-8 validator

  Replace the enumeration-of-bad-bytes approach with a single remainingContinuationBytes counter. Any byte not explicitly valid in its position is rejected as non-UTF-8, which eliminates an entire class of detection gaps by default.

  Also tightens the 4-byte lead range to 0xF0-0xF4 (0xF5-0xF7 would encode code points above U+10FFFF).

* Simplify encoding detection tests with @ParameterizedTest

  Use CsvSource-based parameterized tests for byte-level validation, hex-encoded byte arrays for precise control over test inputs, and consolidate duplicated test patterns.
1 parent 77eea15 commit bdb64b5

2 files changed

Lines changed: 86 additions & 83 deletions

File tree

rewrite-core/src/main/java/org/openrewrite/internal/EncodingDetectingInputStream.java

Lines changed: 18 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,10 @@ public class EncodingDetectingInputStream extends InputStream {
3737
private boolean charsetBomMarked;
3838

3939
/**
40-
* Last byte read
40+
* Number of UTF-8 continuation bytes (0x80-0xBF) still expected
41+
* to complete the current multi-byte sequence. Zero when idle.
4142
*/
42-
private int prev;
43-
private int prev2;
44-
45-
boolean maybeTwoByteSequence = false;
46-
boolean maybeThreeByteSequence = false;
47-
boolean maybeFourByteSequence = false;
43+
int remainingContinuationBytes = 0;
4844

4945
public EncodingDetectingInputStream(InputStream inputStream) {
5046
this(inputStream, null);
@@ -84,7 +80,7 @@ public int read() throws IOException {
8480
// if we haven't yet determined a charset...
8581
if (read == -1) {
8682
if (charset == null) {
87-
if (maybeTwoByteSequence || maybeThreeByteSequence || maybeFourByteSequence) {
83+
if (remainingContinuationBytes > 0) {
8884
charset = WINDOWS_1252;
8985
} else {
9086
charset = StandardCharsets.UTF_8;
@@ -116,43 +112,25 @@ public int read(byte[] b, int off, int len) throws IOException {
116112
}
117113

118114
private void guessCharset(int aByte) {
119-
if (utf8TwoByteSequence(aByte)) {
120-
maybeTwoByteSequence = true;
121-
} else if (utf8ThreeByteSequence(aByte)) {
122-
maybeThreeByteSequence = true;
123-
} else if (utf8FourByteSequence(aByte)) {
124-
maybeFourByteSequence = true;
125-
} else if (maybeTwoByteSequence) {
126-
if (!utf8SequenceEnd(aByte)) {
127-
charset = WINDOWS_1252;
115+
if (remainingContinuationBytes > 0) {
116+
if (aByte >= 0x80 && aByte <= 0xBF) {
117+
remainingContinuationBytes--;
128118
} else {
129-
maybeTwoByteSequence = false;
130-
prev = -1;
131-
}
132-
} else if (maybeThreeByteSequence) {
133-
if (!utf8SequenceEnd(aByte)) {
134119
charset = WINDOWS_1252;
135120
}
136-
137-
if (utf8SequenceEnd(prev) && utf8SequenceEnd(aByte)) {
138-
maybeThreeByteSequence = false;
139-
prev = -1;
140-
}
141-
} else if (maybeFourByteSequence) {
142-
if (utf8SequenceEnd(prev2) && utf8SequenceEnd(prev) && !utf8SequenceEnd(aByte) || utf8SequenceEnd(prev) && !utf8SequenceEnd(aByte) || !utf8SequenceEnd(aByte)) {
143-
charset = WINDOWS_1252;
144-
}
145-
146-
if (utf8SequenceEnd(prev2) && utf8SequenceEnd(prev) && utf8SequenceEnd(aByte)) {
147-
maybeFourByteSequence = false;
148-
prev = -1;
149-
}
150-
} else if (utf8SequenceEnd(aByte)) {
121+
} else if (aByte <= 0x7F) {
122+
// ASCII — valid, nothing to track
123+
} else if (aByte >= 0xC2 && aByte <= 0xDF) {
124+
remainingContinuationBytes = 1;
125+
} else if (aByte >= 0xE0 && aByte <= 0xEF) {
126+
remainingContinuationBytes = 2;
127+
} else if (aByte >= 0xF0 && aByte <= 0xF4) {
128+
remainingContinuationBytes = 3;
129+
} else {
130+
// 0x80-0xBF (bare continuation), 0xC0-0xC1 (overlong),
131+
// 0xF5-0xFF (above max Unicode) — all invalid UTF-8
151132
charset = WINDOWS_1252;
152133
}
153-
154-
prev2 = prev;
155-
prev = aByte;
156134
}
157135

158136
public String readFully() {
@@ -197,25 +175,7 @@ private int checkAndSkipUtf8Bom() throws IOException {
197175
return -2;
198176
}
199177

200-
// The first byte of a UTF-8 two byte sequence is between 0xC0 - 0xDF.
201-
private boolean utf8TwoByteSequence(int b) {
202-
return 0xC0 <= b && b <= 0xDF;
203-
}
204-
205-
// The first byte of a UTF-8 three byte sequence is between 0xE0 - 0xEF.
206-
private boolean utf8ThreeByteSequence(int b) {
207-
return 0xE0 <= b && b <= 0xEF;
208-
}
209-
210-
// The first byte of a UTF-8 four byte sequence is between 0xF0 - 0xF7.
211-
private boolean utf8FourByteSequence(int b) {
212-
return 0xF0 <= b && b <= 0xF7;
213-
}
214178

215-
// A UTF-8 byte sequence must end between 0x80 - 0xBF.
216-
private boolean utf8SequenceEnd(int b) {
217-
return 0x80 <= b && b <= 0xBF;
218-
}
219179

220180
@Override
221181
public void close() throws IOException {

rewrite-core/src/test/java/org/openrewrite/internal/EncodingDetectingInputStreamTest.java

Lines changed: 68 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,11 @@
1616
package org.openrewrite.internal;
1717

1818
import org.junit.jupiter.api.Test;
19+
import org.junit.jupiter.params.ParameterizedTest;
20+
import org.junit.jupiter.params.provider.CsvSource;
1921

2022
import java.io.ByteArrayInputStream;
2123
import java.nio.charset.Charset;
22-
import java.util.List;
2324

2425
import static java.nio.charset.StandardCharsets.ISO_8859_1;
2526
import static java.nio.charset.StandardCharsets.UTF_8;
@@ -74,34 +75,59 @@ void skipUTF8BomKnownEncoding() throws Exception {
7475
}
7576
}
7677

77-
@Test
78-
void isUtf8() throws Exception {
79-
List<String> accents = List.of("Café", "Lýðræðisríki");
80-
for (String accent : accents) {
81-
try (EncodingDetectingInputStream is = read(accent, UTF_8)) {
82-
assertThat(is.getCharset()).isEqualTo(UTF_8);
83-
}
78+
@ParameterizedTest
79+
@CsvSource({
80+
"Café, UTF-8, UTF-8",
81+
"Lyðræðisríki, UTF-8, UTF-8",
82+
"ࠀ, UTF-8, UTF-8",
83+
"世, UTF-8, UTF-8",
84+
"가, UTF-8, UTF-8",
85+
"Hello 世界, UTF-8, UTF-8",
86+
"café 世界 🌟, UTF-8, UTF-8",
87+
"Café, Windows-1252, Windows-1252",
88+
"Lyðræðisríki, Windows-1252, Windows-1252",
89+
})
90+
void detectsCharsetForEncodedStrings(String text, String sourceCharset, String expectedCharset) throws Exception {
91+
try (EncodingDetectingInputStream is = read(text, Charset.forName(sourceCharset))) {
92+
assertThat(is.getCharset()).isEqualTo(Charset.forName(expectedCharset));
8493
}
8594
}
8695

87-
@Test
88-
void isWindows1252() throws Exception {
89-
List<String> accents = List.of("Café", "Lýðræðisríki");
90-
for (String accent : accents) {
91-
try (EncodingDetectingInputStream is = read(accent, WINDOWS_1252)) {
92-
assertThat(is.getCharset()).isEqualTo(WINDOWS_1252);
93-
}
94-
}
96+
@ParameterizedTest
97+
@CsvSource({
98+
"48 65 6C 6C 6F C3 BC, valid 2-byte sequence (Helloü)",
99+
"E4 B8 96, valid 3-byte sequence (世)",
100+
"F0 9F 98 80, valid 4-byte sequence (😀)",
101+
"63 61 66 C3 A9 20 E4 B8 96 20 F0 9F 8C 9F, mixed 2/3/4-byte sequences",
102+
})
103+
void detectsUtf8ForValidByteSequences(String hex, String description) {
104+
byte[] bytes = parseHex(hex);
105+
EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(bytes));
106+
is.readFully();
107+
assertThat(is.getCharset()).as(description).isEqualTo(UTF_8);
95108
}
96109

97-
@Test
98-
void oddPairInWindows1252() throws Exception {
99-
// Range 1: 0xC0 - 0xDF == "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞß"
100-
// Range 2: 0x80 - 0xBF == "€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿"
101-
// A character in range 1 followed by a character in range 2 encoded in Windows-1252 will be detected as UTF-8.
102-
try (EncodingDetectingInputStream is = read("À€", WINDOWS_1252)) {
103-
assertThat(is.getCharset()).isEqualTo(UTF_8);
104-
}
110+
@ParameterizedTest
111+
@CsvSource({
112+
"48 65 6C 6C 6F FC, byte above 0xF7 (ü in ISO-8859-1)",
113+
"74 65 73 74 FC 20 E4 20 F6, multiple high bytes",
114+
"C0 80, overlong 0xC0 lead byte",
115+
"43 61 66 C0, overlong 0xC0 at end",
116+
"53 61 6F 20 50 61 75 6C 6F C1, overlong 0xC1 at end",
117+
"74 65 73 74 C0 A3, overlong 0xC0 + continuation byte",
118+
"48 65 6C 6C 6F 80, bare continuation byte",
119+
"C3 BC 20 61 6E 64 20 FC, valid UTF-8 then invalid byte",
120+
"E4 B8 96 20 FE, invalid byte after valid 3-byte",
121+
"61 62 E4 B8, truncated 3-byte sequence",
122+
"61 F0 9F 98, truncated 4-byte sequence",
123+
"78 E4 B8 7A, broken continuation in 3-byte",
124+
"F0 9F 98 78, broken continuation in 4-byte",
125+
})
126+
void detectsWindows1252ForInvalidUtf8Bytes(String hex, String description) {
127+
byte[] bytes = parseHex(hex);
128+
EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(bytes));
129+
is.readFully();
130+
assertThat(is.getCharset()).as(description).isEqualTo(WINDOWS_1252);
105131
}
106132

107133
@Test
@@ -128,7 +154,6 @@ void windows1252SpecialCharacters() throws Exception {
128154
@Test
129155
void iso88591() {
130156
for (int i = 0; i < 255; i++) {
131-
// Skip control characters in ISO-8859-1
132157
if (!(i >= 128 && i <= 159)) {
133158
String s = Character.toString((char) i);
134159
byte[] win = s.getBytes(WINDOWS_1252);
@@ -138,6 +163,24 @@ void iso88591() {
138163
}
139164
}
140165

166+
@Test
167+
void readFullyDecodesIso8859Correctly() {
168+
byte[] bytes = new byte[]{0x48, (byte) 0xFC, 0x74, 0x74, 0x65};
169+
EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(bytes));
170+
String result = is.readFully();
171+
assertThat(is.getCharset()).isEqualTo(WINDOWS_1252);
172+
assertThat(result).isEqualTo("Hütte");
173+
}
174+
175+
private static byte[] parseHex(String hex) {
176+
String[] parts = hex.trim().split("\\s+");
177+
byte[] bytes = new byte[parts.length];
178+
for (int i = 0; i < parts.length; i++) {
179+
bytes[i] = (byte) Integer.parseInt(parts[i], 16);
180+
}
181+
return bytes;
182+
}
183+
141184
private EncodingDetectingInputStream read(String s, Charset charset) {
142185
EncodingDetectingInputStream is = new EncodingDetectingInputStream(new ByteArrayInputStream(s.getBytes(charset)));
143186
is.readFully();

0 commit comments

Comments
 (0)