@@ -44,7 +44,14 @@ public class ProtoParserVisitor extends Protobuf2ParserBaseVisitor<Proto> {
4444 private final Charset charset ;
4545 private final boolean charsetBomMarked ;
4646
47+ /**
48+ * Tracks the position within the source text as a char index (UTF-16 code units), as used by String.substring
49+ */
4750 private int cursor = 0 ;
51+ /**
52+ * Tracks the same position counted in Unicode code points, for comparison with token start/stop indexes
53+ */
54+ private int codePointCursor = 0 ;
4855
4956 public ProtoParserVisitor (Path path , @ Nullable FileAttributes fileAttributes , String source , Charset charset , boolean charsetBomMarked ) {
5057 this .path = path ;
@@ -429,12 +436,12 @@ private Space prefix(@Nullable TerminalNode terminalNode) {
429436
430437 private Space prefix (Token token ) {
431438 int start = token .getStartIndex ();
432- if (start < cursor ) {
439+ if (start < codePointCursor ) {
433440 return Space .EMPTY ;
434441 }
435- String prefix = source . substring ( cursor , start ) ;
436- cursor = start ;
437- return Space .format (prefix );
442+ int oldCursor = cursor ;
443+ advanceCursor ( start ) ;
444+ return Space .format (source . substring ( oldCursor , cursor ) );
438445 }
439446
440447 private <C extends ParserRuleContext , T > @ Nullable T convert (C ctx , BiFunction <C , Space , T > conversion ) {
@@ -445,15 +452,15 @@ private Space prefix(Token token) {
445452
446453 T t = conversion .apply (ctx , prefix (ctx ));
447454 if (ctx .getStop () != null ) {
448- cursor = ctx .getStop ().getStopIndex () + (Character .isWhitespace (source .charAt (ctx .getStop ().getStopIndex ())) ? 0 : 1 );
455+ advanceCursor ( ctx .getStop ().getStopIndex () + (Character .isWhitespace (source .charAt (ctx .getStop ().getStopIndex ())) ? 0 : 1 ) );
449456 }
450457
451458 return t ;
452459 }
453460
454461 private <T > T convert (TerminalNode node , BiFunction <TerminalNode , Space , T > conversion ) {
455462 T t = conversion .apply (node , prefix (node ));
456- cursor = node .getSymbol ().getStopIndex () + 1 ;
463+ advanceCursor ( node .getSymbol ().getStopIndex () + 1 ) ;
457464 return t ;
458465 }
459466
@@ -469,7 +476,9 @@ private Space sourceBefore(String untilDelim) {
469476 }
470477
471478 String prefix = source .substring (cursor , delimIndex );
472- cursor += prefix .length () + untilDelim .length (); // advance past the delimiter
479+ int codePointsInPrefix = prefix .codePointCount (0 , prefix .length ());
480+ // All Protobuf delimiters are ASCII, so length == code point count
481+ advanceCursor (codePointCursor + codePointsInPrefix + untilDelim .length ());
473482 return Space .format (prefix );
474483 }
475484
@@ -515,4 +524,17 @@ private int positionOfNext(String untilDelim, @Nullable Character stop) {
515524
516525 return delimIndex > source .length () - untilDelim .length () ? -1 : delimIndex ;
517526 }
527+
528+ /**
529+ * Advance both cursors to the given absolute code point index; a target at or before the current code point position is a no-op
530+ */
531+ @ SuppressWarnings ("UnusedReturnValue" )
532+ private int advanceCursor (int newCodePointIndex ) {
533+ if (newCodePointIndex <= codePointCursor ) {
534+ return cursor ;
535+ }
536+ cursor = source .offsetByCodePoints (cursor , newCodePointIndex - codePointCursor );
537+ codePointCursor = newCodePointIndex ;
538+ return cursor ;
539+ }
518540}
0 commit comments