Skip to content

Commit ce325fc

Browse files
authored
Handle invalid UTF-16 surrogate pairs in JavaScript/TypeScript parser (#6599)
Previously, the parser threw InvalidSurrogatesNotSupportedError when encountering unmatched surrogate pairs, causing ingestion failures. Now, surrogate pairs (both \uXXXX escape sequences and raw characters) are extracted into J.LiteralUnicodeEscape objects, matching the approach used in the Java parser. This allows Jackson to serialize the AST without encountering invalid surrogate pair errors. Fixes moderneinc/customer-requests#829 Fixes moderneinc/customer-requests#830
1 parent f76f274 commit ce325fc

1 file changed

Lines changed: 54 additions & 12 deletions

File tree

  • rewrite-javascript/rewrite/src/javascript

rewrite-javascript/rewrite/src/javascript/parser.ts

Lines changed: 54 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ import {
4343
getPreviousSibling,
4444
hasFlowAnnotation,
4545
isStatement,
46-
isValidSurrogateRange,
4746
TextSpan
4847
} from "./parser-utils";
4948
import {JavaScriptTypeMapping} from "./type-mapping";
@@ -908,19 +907,17 @@ export class JavaScriptParserVisitor {
908907
private mapLiteral(node: ts.LiteralExpression | ts.TrueLiteral | ts.FalseLiteral | ts.NullLiteral | ts.Identifier
909908
| ts.TemplateHead | ts.TemplateMiddle | ts.TemplateTail | ts.JsxText, value: any): J.Literal {
910909

911-
let valueSource = node.getText();
912-
if (!isValidSurrogateRange(valueSource)) {
913-
// TODO: Fix to prevent ingestion failure for invalid surrogate pairs. Should be reworked with J.Literal.UnicodeEscape
914-
throw new InvalidSurrogatesNotSupportedError();
915-
}
910+
const valueSource = node.getText();
911+
const { cleanedSource, unicodeEscapes } = extractSurrogateEscapes(valueSource);
916912

917913
return {
918914
kind: J.Kind.Literal,
919915
id: randomId(),
920916
prefix: this.prefix(node),
921917
markers: emptyMarkers,
922-
value: value,
923-
valueSource: valueSource,
918+
value: unicodeEscapes.length > 0 ? cleanedSource : value,
919+
valueSource: cleanedSource,
920+
unicodeEscapes: unicodeEscapes.length > 0 ? unicodeEscapes : undefined,
924921
type: this.mapPrimitiveType(node)
925922
};
926923
}
@@ -4671,11 +4668,56 @@ class FlowSyntaxNotSupportedError extends SyntaxError {
46714668
}
46724669
}
46734670

4674-
class InvalidSurrogatesNotSupportedError extends SyntaxError {
4675-
constructor(message: string = "String literal contains invalid surrogate pairs, that is not supported") {
4676-
super(message);
4677-
this.name = "InvalidSurrogatesNotSupportedError";
4671+
const SURR_FIRST = 0xD800;
const SURR_LAST = 0xDFFF;

/**
 * Pulls UTF-16 surrogate code units out of a literal's source text so that the
 * remaining text contains no unpaired surrogates.
 *
 * Two forms are extracted:
 *  1. `\uXXXX` escape sequences whose value lies in the surrogate range
 *     (0xD800-0xDFFF). A backslash only starts an escape when it is preceded by
 *     an even number of backslashes; otherwise it is itself the escaped character.
 *  2. Raw surrogate code units that are NOT part of a well-formed pair (a high
 *     surrogate immediately followed by a low surrogate). Well-formed raw pairs,
 *     e.g. a raw emoji, are valid UTF-16 and are kept verbatim.
 *
 * The original comment states that unmatched surrogates cannot be serialized by
 * tools such as Jackson, which is why they are stored separately.
 *
 * @param valueSource the literal's source text, as returned by `node.getText()`
 * @returns `cleanedSource`, the text with the extracted units removed, and
 *          `unicodeEscapes`, one entry per extracted unit in source order. Each
 *          entry holds the four hex digits of the code unit (as written for
 *          escapes, upper-cased for raw characters) and `valueSourceIndex`, the
 *          offset in `cleanedSource` at which the unit originally appeared.
 */
function extractSurrogateEscapes(valueSource: string): { cleanedSource: string, unicodeEscapes: J.LiteralUnicodeEscape[] } {
    // First low (trailing) surrogate; SURR_FIRST..0xDBFF are high (leading) surrogates.
    const LOW_FIRST = 0xDC00;
    const FOUR_HEX_DIGITS = /^[0-9a-fA-F]{4}$/;
    const isSurrogate = (code: number): boolean => code >= SURR_FIRST && code <= SURR_LAST;
    const isHighSurrogate = (code: number): boolean => code >= SURR_FIRST && code < LOW_FIRST;
    const isLowSurrogate = (code: number): boolean => code >= LOW_FIRST && code <= SURR_LAST;

    const unicodeEscapes: J.LiteralUnicodeEscape[] = [];
    let cleanedSource = '';
    // Number of consecutive backslashes immediately before the current position.
    let precedingBackslashes = 0;

    for (let j = 0; j < valueSource.length; j++) {
        const c = valueSource.charAt(j);

        if (c === '\\') {
            // An even run of preceding backslashes means this one begins a new escape sequence.
            if (precedingBackslashes % 2 === 0 && valueSource.charAt(j + 1) === 'u') {
                const codePoint = valueSource.substring(j + 2, j + 6);
                if (FOUR_HEX_DIGITS.test(codePoint) && isSurrogate(parseInt(codePoint, 16))) {
                    unicodeEscapes.push({ valueSourceIndex: cleanedSource.length, codePoint });
                    j += 5; // skip `uXXXX`; the loop increment steps past the last hex digit
                    precedingBackslashes = 0;
                    continue;
                }
            }
            precedingBackslashes++;
            cleanedSource += c;
            continue;
        }
        precedingBackslashes = 0;

        const charCode = valueSource.charCodeAt(j);
        // charCodeAt past the end yields NaN, which fails the range checks.
        if (isHighSurrogate(charCode) && isLowSurrogate(valueSource.charCodeAt(j + 1))) {
            // Well-formed raw pair: keep both code units untouched.
            cleanedSource += c + valueSource.charAt(j + 1);
            j++;
            continue;
        }
        if (isSurrogate(charCode)) {
            // Unpaired raw surrogate: codes in this range always render as four hex digits.
            unicodeEscapes.push({
                valueSourceIndex: cleanedSource.length,
                codePoint: charCode.toString(16).toUpperCase()
            });
            continue;
        }

        cleanedSource += c;
    }

    return { cleanedSource, unicodeEscapes };
}
46804722

46814723
Parsers.registerParser("javascript", JavaScriptParser);

0 commit comments

Comments
 (0)