@@ -43,7 +43,6 @@ import {
4343 getPreviousSibling ,
4444 hasFlowAnnotation ,
4545 isStatement ,
46- isValidSurrogateRange ,
4746 TextSpan
4847} from "./parser-utils" ;
4948import { JavaScriptTypeMapping } from "./type-mapping" ;
@@ -908,19 +907,17 @@ export class JavaScriptParserVisitor {
/**
 * Builds a `J.Literal` for a literal-like TypeScript node.
 *
 * The node's source text is passed through `extractSurrogateEscapes`. When that
 * yields at least one escape, the literal's `value` is replaced by the cleaned
 * source text and the escapes are attached as `unicodeEscapes`; otherwise the
 * caller-supplied `value` is used as-is and `unicodeEscapes` is `undefined`.
 * `valueSource` is always the cleaned source text.
 *
 * @param node  the node whose text, prefix and primitive type are mapped
 * @param value the already-evaluated value for the literal
 * @returns the assembled literal
 */
private mapLiteral(node: ts.LiteralExpression | ts.TrueLiteral | ts.FalseLiteral | ts.NullLiteral | ts.Identifier
    | ts.TemplateHead | ts.TemplateMiddle | ts.TemplateTail | ts.JsxText, value: any): J.Literal {

    const { cleanedSource, unicodeEscapes } = extractSurrogateEscapes(node.getText());
    const hasSurrogateEscapes = unicodeEscapes.length > 0;

    return {
        kind: J.Kind.Literal,
        id: randomId(),
        prefix: this.prefix(node),
        markers: emptyMarkers,
        value: hasSurrogateEscapes ? cleanedSource : value,
        valueSource: cleanedSource,
        unicodeEscapes: hasSurrogateEscapes ? unicodeEscapes : undefined,
        type: this.mapPrimitiveType(node)
    };
}
@@ -4671,11 +4668,56 @@ class FlowSyntaxNotSupportedError extends SyntaxError {
46714668 }
46724669}
46734670
4674- class InvalidSurrogatesNotSupportedError extends SyntaxError {
4675- constructor ( message : string = "String literal contains invalid surrogate pairs, that is not supported" ) {
4676- super ( message ) ;
4677- this . name = "InvalidSurrogatesNotSupportedError" ;
/** First UTF-16 code unit of the surrogate range (start of the high surrogates). */
const SURR_FIRST = 0xD800;
/** Last UTF-16 code unit of the surrogate range (end of the low surrogates). */
const SURR_LAST = 0xDFFF;

/**
 * Removes UTF-16 surrogate code units from a literal's source text and records
 * them separately so they can be re-inserted as `\uXXXX` escapes later.
 *
 * Rationale (from the original implementation notes): unmatched surrogates are
 * not serializable by tools such as Jackson, so they are kept out of the text
 * and stored as code points instead. Other unicode escapes are left untouched.
 *
 * Two forms are extracted:
 *  1. Escape sequences `\uXXXX` whose four hex digits fall in 0xD800-0xDFFF.
 *     The four digits are stored exactly as written (case preserved).
 *  2. Raw surrogate code units present in the text itself. These are stored as
 *     four upper-case hex digits.
 *
 * A `\uXXXX` sequence is only treated as an escape when its backslash is not
 * itself escaped. Escaping is tracked by parity, so `\\uD800` (escaped backslash
 * followed by plain text) is left alone while `\\\uD800` (escaped backslash
 * followed by a real escape) is extracted.
 *
 * NOTE(review): every surrogate is extracted, including both halves of a valid
 * pair; confirm this is what the printer expects for raw (unescaped) pairs.
 * The braced form `\u{XXXX}` is not recognized.
 *
 * @param valueSource the literal's source text
 * @returns `cleanedSource`, the input without the extracted surrogates, and
 *          `unicodeEscapes`, one entry per extracted code unit in source order.
 *          Each entry's `valueSourceIndex` is the offset into `cleanedSource`
 *          at which the escape has to be re-inserted.
 */
function extractSurrogateEscapes(valueSource: string): { cleanedSource: string, unicodeEscapes: J.LiteralUnicodeEscape[] } {
    const unicodeEscapes: J.LiteralUnicodeEscape[] = [];
    let cleanedSource = '';
    // True when the previous character was a backslash that starts an escape,
    // i.e. the current character is the escaped one and cannot start an escape itself.
    let escaped = false;

    for (let i = 0; i < valueSource.length; i++) {
        const c = valueSource.charAt(i);

        if (c === '\\' && !escaped) {
            // Need `u` plus four more characters after the backslash.
            if (valueSource.charAt(i + 1) === 'u' && i + 5 < valueSource.length) {
                const codePoint = valueSource.substring(i + 2, i + 6);
                // A malformed sequence parses to NaN or a value below the
                // surrogate range, so the range check also rejects it.
                const numeric = parseInt(codePoint, 16);
                if (numeric >= SURR_FIRST && numeric <= SURR_LAST) {
                    unicodeEscapes.push({ valueSourceIndex: cleanedSource.length, codePoint });
                    i += 5; // skip `u` and the four hex digits; the loop increment skips the backslash
                    continue;
                }
            }
            // Any other escape: keep the backslash and remember that the next
            // character belongs to it.
            escaped = true;
            cleanedSource += c;
            continue;
        }
        escaped = false;

        const charCode = valueSource.charCodeAt(i);
        if (charCode >= SURR_FIRST && charCode <= SURR_LAST) {
            // Raw surrogate code unit in the text: store it as four hex digits.
            const codePoint = charCode.toString(16).toUpperCase().padStart(4, '0');
            unicodeEscapes.push({ valueSourceIndex: cleanedSource.length, codePoint });
            continue;
        }

        cleanedSource += c;
    }

    return { cleanedSource, unicodeEscapes };
}
46804722
// Registers JavaScriptParser under the "javascript" key; runs as a top-level statement when this module is evaluated.
46814723Parsers . registerParser ( "javascript" , JavaScriptParser ) ;
0 commit comments