Skip to content

Commit 500261f

Browse files
committed
XML: harden parser against malformed input crashes
ANTLR error recovery synthesizes closing-tag tokens when input ends with an unclosed element, which caused `XmlParserVisitor` to throw `IndexOutOfBoundsException` from `advanceCursor` and `NullPointerException` when accessing `Name(1)`. Clamp `advanceCursor` to the source length and tolerate null `OPEN(1)`/`Name(1)`/`CLOSE(1)` so these inputs fall back to a `ParseError` (preserving original text) instead of crashing. See #7554 (comment)
1 parent becae3b commit 500261f

2 files changed

Lines changed: 72 additions & 16 deletions

File tree

rewrite-xml/src/main/java/org/openrewrite/xml/internal/XmlParserVisitor.java

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -336,25 +336,35 @@ public Xml.Tag visitElement(XMLParser.ElementContext ctx) {
336336
beforeTagDelimiterPrefix = prefix(ctx.SLASH_CLOSE());
337337
advanceCursor(ctx.SLASH_CLOSE().getSymbol().getStopIndex() + 1);
338338
} else {
339-
beforeTagDelimiterPrefix = prefix(ctx.CLOSE(0));
340-
advanceCursor(ctx.CLOSE(0).getSymbol().getStopIndex() + 1);
339+
beforeTagDelimiterPrefix = ctx.CLOSE(0) == null ? "" : prefix(ctx.CLOSE(0));
340+
if (ctx.CLOSE(0) != null) {
341+
advanceCursor(ctx.CLOSE(0).getSymbol().getStopIndex() + 1);
342+
}
341343

342344
content = ctx.content().stream()
343345
.map(this::visit)
344346
.map(Content.class::cast)
345347
.collect(toList());
346348

347-
String closeTagPrefix = prefix(ctx.OPEN(1));
348-
advanceCursor(codePointCursor + 2);
349-
350-
closeTag = new Xml.Tag.Closing(
351-
randomId(),
352-
closeTagPrefix,
353-
Markers.EMPTY,
354-
convert(ctx.Name(1), (n, p) -> n.getText()),
355-
prefix(ctx.CLOSE(1))
356-
);
357-
advanceCursor(codePointCursor + 1);
349+
// ANTLR may synthesize missing closing-tag tokens during error recovery
350+
// on malformed input; tolerate any combination of null OPEN/Name/CLOSE.
351+
if (ctx.OPEN(1) != null || ctx.Name(1) != null || ctx.CLOSE(1) != null) {
352+
String closeTagPrefix = ctx.OPEN(1) == null ? "" : prefix(ctx.OPEN(1));
353+
advanceCursor(codePointCursor + 2);
354+
355+
String closeName = ctx.Name(1) == null ? "" :
356+
convert(ctx.Name(1), (n, p) -> n.getText());
357+
String afterCloseName = ctx.CLOSE(1) == null ? "" : prefix(ctx.CLOSE(1));
358+
359+
closeTag = new Xml.Tag.Closing(
360+
randomId(),
361+
closeTagPrefix,
362+
Markers.EMPTY,
363+
closeName,
364+
afterCloseName
365+
);
366+
advanceCursor(codePointCursor + 1);
367+
}
358368
}
359369

360370
return new Xml.Tag(randomId(), prefix, Markers.EMPTY, name, attributes,
@@ -468,15 +478,23 @@ private String prefix(Token token) {
468478

469479

470480
/**
471-
* Advance both the cursor and the code point cursor
481+
* Advance both the cursor and the code point cursor.
482+
* Clamps to the end of source if a synthesized token (from ANTLR error recovery
483+
* on malformed input) would advance past the end of the source string.
472484
*/
473485
@SuppressWarnings("UnusedReturnValue")
474486
private int advanceCursor(int newCodePointIndex) {
475487
if (newCodePointIndex <= codePointCursor) {
476488
return cursor;
477489
}
478-
cursor = source.offsetByCodePoints(cursor, newCodePointIndex - codePointCursor);
479-
codePointCursor = newCodePointIndex;
490+
try {
491+
cursor = source.offsetByCodePoints(cursor, newCodePointIndex - codePointCursor);
492+
codePointCursor = newCodePointIndex;
493+
} catch (IndexOutOfBoundsException e) {
494+
int reachableCodePoints = source.codePointCount(cursor, source.length());
495+
cursor = source.length();
496+
codePointCursor += reachableCodePoints;
497+
}
480498
return cursor;
481499
}
482500

rewrite-xml/src/test/java/org/openrewrite/xml/XmlParserTest.java

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,4 +765,42 @@ void utf8SurrogatePairsSimple() {
765765
)
766766
);
767767
}
768+
769+
@Issue("https://github.com/openrewrite/rewrite/issues/7554")
770+
@Test
771+
void malformedMissingRootCloseDoesNotThrow() {
772+
// Inner element is never closed before EOF. Previously threw IndexOutOfBoundsException
773+
// from advanceCursor when ANTLR error recovery synthesized closing-tag tokens past EOF.
774+
SourceFile parsed = XmlParser.builder().build()
775+
.parse(new InMemoryExecutionContext(t -> {
776+
}), "<root>\n<inner>\n wrong format\n</root>")
777+
.findFirst().orElseThrow();
778+
assertThat(parsed).isInstanceOf(ParseError.class);
779+
assertThat(parsed.printAll()).isEqualTo("<root>\n<inner>\n wrong format\n</root>");
780+
}
781+
782+
@Issue("https://github.com/openrewrite/rewrite/issues/7554")
783+
@Test
784+
void malformedUnterminatedEndTagDoesNotThrow() {
785+
// End-tag missing its '>' before EOF.
786+
SourceFile parsed = XmlParser.builder().build()
787+
.parse(new InMemoryExecutionContext(t -> {
788+
}), "<a></a")
789+
.findFirst().orElseThrow();
790+
assertThat(parsed).isInstanceOf(ParseError.class);
791+
assertThat(parsed.printAll()).isEqualTo("<a></a");
792+
}
793+
794+
@Issue("https://github.com/openrewrite/rewrite/issues/7554")
795+
@Test
796+
void malformedBareAmpersandDoesNotThrow() {
797+
// A bare '&' (not part of an entity reference) is invalid XML; the original
798+
// text is preserved by falling back to a ParseError rather than silently dropping it.
799+
SourceFile parsed = XmlParser.builder().build()
800+
.parse(new InMemoryExecutionContext(t -> {
801+
}), "<a>McFarland & Company</a>")
802+
.findFirst().orElseThrow();
803+
assertThat(parsed).isInstanceOf(ParseError.class);
804+
assertThat(parsed.printAll()).isEqualTo("<a>McFarland & Company</a>");
805+
}
768806
}

0 commit comments

Comments
 (0)