Skip to content

Commit 8844f98

Browse files
committed
Also add Unicode support to Proto2 parser
1 parent a8ebd2f commit 8844f98

2 files changed

Lines changed: 61 additions & 7 deletions

File tree

rewrite-protobuf/src/main/java/org/openrewrite/protobuf/internal/ProtoParserVisitor.java

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,14 @@ public class ProtoParserVisitor extends Protobuf2ParserBaseVisitor<Proto> {
4444
private final Charset charset;
4545
private final boolean charsetBomMarked;
4646

47+
/**
48+
* Track position within the file by character (UTF-16 code units)
49+
*/
4750
private int cursor = 0;
51+
/**
52+
* Track parsing position within the file by Unicode code point
53+
*/
54+
private int codePointCursor = 0;
4855

4956
public ProtoParserVisitor(Path path, @Nullable FileAttributes fileAttributes, String source, Charset charset, boolean charsetBomMarked) {
5057
this.path = path;
@@ -429,12 +436,12 @@ private Space prefix(@Nullable TerminalNode terminalNode) {
429436

430437
private Space prefix(Token token) {
431438
int start = token.getStartIndex();
432-
if (start < cursor) {
439+
if (start < codePointCursor) {
433440
return Space.EMPTY;
434441
}
435-
String prefix = source.substring(cursor, start);
436-
cursor = start;
437-
return Space.format(prefix);
442+
int oldCursor = cursor;
443+
advanceCursor(start);
444+
return Space.format(source.substring(oldCursor, cursor));
438445
}
439446

440447
private <C extends ParserRuleContext, T> @Nullable T convert(C ctx, BiFunction<C, Space, T> conversion) {
@@ -445,15 +452,15 @@ private Space prefix(Token token) {
445452

446453
T t = conversion.apply(ctx, prefix(ctx));
447454
if (ctx.getStop() != null) {
448-
cursor = ctx.getStop().getStopIndex() + (Character.isWhitespace(source.charAt(ctx.getStop().getStopIndex())) ? 0 : 1);
455+
advanceCursor(ctx.getStop().getStopIndex() + (Character.isWhitespace(source.charAt(ctx.getStop().getStopIndex())) ? 0 : 1));
449456
}
450457

451458
return t;
452459
}
453460

454461
private <T> T convert(TerminalNode node, BiFunction<TerminalNode, Space, T> conversion) {
455462
T t = conversion.apply(node, prefix(node));
456-
cursor = node.getSymbol().getStopIndex() + 1;
463+
advanceCursor(node.getSymbol().getStopIndex() + 1);
457464
return t;
458465
}
459466

@@ -469,7 +476,9 @@ private Space sourceBefore(String untilDelim) {
469476
}
470477

471478
String prefix = source.substring(cursor, delimIndex);
472-
cursor += prefix.length() + untilDelim.length(); // advance past the delimiter
479+
int codePointsInPrefix = prefix.codePointCount(0, prefix.length());
480+
// All Protobuf delimiters are ASCII, so length == code point count
481+
advanceCursor(codePointCursor + codePointsInPrefix + untilDelim.length());
473482
return Space.format(prefix);
474483
}
475484

@@ -515,4 +524,17 @@ private int positionOfNext(String untilDelim, @Nullable Character stop) {
515524

516525
return delimIndex > source.length() - untilDelim.length() ? -1 : delimIndex;
517526
}
527+
528+
/**
529+
* Advance both the cursor and the code point cursor
530+
*/
531+
@SuppressWarnings("UnusedReturnValue")
532+
private int advanceCursor(int newCodePointIndex) {
533+
if (newCodePointIndex <= codePointCursor) {
534+
return cursor;
535+
}
536+
cursor = source.offsetByCodePoints(cursor, newCodePointIndex - codePointCursor);
537+
codePointCursor = newCodePointIndex;
538+
return cursor;
539+
}
518540
}

rewrite-protobuf/src/test/java/org/openrewrite/protobuf/ProtoParserTest.java

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,43 @@
2323
import java.util.List;
2424

2525
import static org.assertj.core.api.Assertions.assertThat;
26+
import static org.openrewrite.protobuf.Assertions.proto;
2627

2728
class ProtoParserTest implements RewriteTest {
2829
@Test
2930
void noNullsForProto3Files() {
3031
List<SourceFile> sources = ProtoParser.builder().build().parse("syntax = \"proto3\";").toList();
3132
assertThat(sources).singleElement().isInstanceOf(PlainText.class);
3233
}
34+
35+
@Test
36+
void unicodeInComments() {
37+
String protoSource = """
38+
syntax = "proto2";
39+
40+
// 👇 Problem below
41+
message Person {
42+
required string name /* 👆*/=/* 👆*/ 1; /*👆*/
43+
required string emoji = 2 [default = "👇"];
44+
// 👆 Problem above
45+
}
46+
""";
47+
List<SourceFile> sources = ProtoParser.builder().build().parse(protoSource).toList();
48+
assertThat(sources).hasSize(1);
49+
assertThat(sources.get(0).printAll()).isEqualTo(protoSource);
50+
}
51+
52+
@Test
53+
void moreUnicodeInComments() {
54+
rewriteRun(
55+
proto(
56+
"""
57+
syntax = 'proto2';
58+
service SearchService {
59+
/*👆👆*/ rpc /*👆👆*/ Search /*👆👆*/(/*👆👆*/ SearchRequest ) returns ( SearchResponse );
60+
}
61+
"""
62+
)
63+
);
64+
}
3365
}

0 commit comments

Comments
(0)
⚡