Handle invalid UTF-16 surrogate pairs in JavaScript/TypeScript parser (#6599)

jkschneider · web-flow · commit ce325fccddf7 · 2026-01-26T13:29:32.000-05:00
Previously, the parser threw InvalidSurrogatesNotSupportedError when encountering unmatched surrogate pairs, causing ingestion failures. Now, surrogate pairs (both \uXXXX escape sequences and raw characters) are extracted into J.LiteralUnicodeEscape objects, matching the approach used in the Java parser. This allows Jackson to serialize the AST without encountering invalid surrogate pair errors. Fixes moderneinc/customer-requests#829 Fixes moderneinc/customer-requests#830
diff --git a/rewrite-javascript/rewrite/src/javascript/parser.ts b/rewrite-javascript/rewrite/src/javascript/parser.ts
@@ -43,7 +43,6 @@ import {
     getPreviousSibling,
     hasFlowAnnotation,
     isStatement,
-    isValidSurrogateRange,
     TextSpan
 } from "./parser-utils";
 import {JavaScriptTypeMapping} from "./type-mapping";
@@ -908,19 +907,17 @@ export class JavaScriptParserVisitor {
     private mapLiteral(node: ts.LiteralExpression | ts.TrueLiteral | ts.FalseLiteral | ts.NullLiteral | ts.Identifier
         | ts.TemplateHead | ts.TemplateMiddle | ts.TemplateTail | ts.JsxText, value: any): J.Literal {
 
-        let valueSource = node.getText();
-        if (!isValidSurrogateRange(valueSource)) {
-            // TODO: Fix to prevent ingestion failure for invalid surrogate pairs. Should be reworked with J.Literal.UnicodeEscape
-            throw new InvalidSurrogatesNotSupportedError();
-        }
+        const valueSource = node.getText();
+        const { cleanedSource, unicodeEscapes } = extractSurrogateEscapes(valueSource);
 
         return {
             kind: J.Kind.Literal,
             id: randomId(),
             prefix: this.prefix(node),
             markers: emptyMarkers,
-            value: value,
-            valueSource: valueSource,
+            value: unicodeEscapes.length > 0 ? cleanedSource : value,
+            valueSource: cleanedSource,
+            unicodeEscapes: unicodeEscapes.length > 0 ? unicodeEscapes : undefined,
             type: this.mapPrimitiveType(node)
         };
     }
@@ -4671,11 +4668,56 @@ class FlowSyntaxNotSupportedError extends SyntaxError {
     }
 }
 
-class InvalidSurrogatesNotSupportedError extends SyntaxError {
-    constructor(message: string = "String literal contains invalid surrogate pairs, that is not supported") {
-        super(message);
-        this.name = "InvalidSurrogatesNotSupportedError";
+const SURR_FIRST = 0xD800;
+const SURR_LAST = 0xDFFF;
+
+/**
+ * Extracts every UTF-16 surrogate code unit (paired or not) from a literal's value source.
+ * Unmatched surrogates (one half of a surrogate pair without its partner) are unserializable
+ * by technologies like Jackson. So we strip each surrogate from the source, record its code point
+ * and its index in the cleaned source, and reconstruct the escape sequence when printing later.
+ * Only code units in the surrogate range (0xD800-0xDFFF) are extracted. Other characters are
+ * generally treated well by tools like Jackson.
+ *
+ * Handles both:
+ * 1. Unicode escape sequences (\uXXXX) where XXXX is in the surrogate range
+ * 2. Raw surrogate characters in the source (character codes 0xD800-0xDFFF)
+ */
+function extractSurrogateEscapes(valueSource: string): { cleanedSource: string, unicodeEscapes: J.LiteralUnicodeEscape[] } {
+    const unicodeEscapes: J.LiteralUnicodeEscape[] = [];
+    let cleanedSource = '';
+    let cleanedIndex = 0;
+
+    for (let j = 0; j < valueSource.length; j++) {
+        const c = valueSource.charAt(j);
+        const charCode = valueSource.charCodeAt(j);
+
+        // Check for a unicode escape sequence: \uXXXX
+        // Ignore it when the previous char is a backslash; only that one char is checked, so \\\uXXXX is not matched
+        if (c === '\\' && j < valueSource.length - 1 && (j === 0 || valueSource.charAt(j - 1) !== '\\')) {
+            if (valueSource.charAt(j + 1) === 'u' && j < valueSource.length - 5) {
+                const codePoint = valueSource.substring(j + 2, j + 6);
+                const codePointNumeric = parseInt(codePoint, 16);
+                if (!isNaN(codePointNumeric) && codePointNumeric >= SURR_FIRST && codePointNumeric <= SURR_LAST) {
+                    unicodeEscapes.push({ valueSourceIndex: cleanedIndex, codePoint });
+                    j += 5; // Skip the \uXXXX sequence (we're already at \, skip u and 4 hex digits)
+                    continue;
+                }
+            }
+        }
+
+        // Check for raw surrogate characters in the source
+        if (charCode >= SURR_FIRST && charCode <= SURR_LAST) {
+            const codePoint = charCode.toString(16).toUpperCase().padStart(4, '0');
+            unicodeEscapes.push({ valueSourceIndex: cleanedIndex, codePoint });
+            continue;
+        }
+
+        cleanedSource += c;
+        cleanedIndex++;
     }
+
+    return { cleanedSource, unicodeEscapes };
 }
 
 Parsers.registerParser("javascript", JavaScriptParser);