Skip to content

Commit 24f9128

Browse files
authored
decoder2: add support for decoding utf-16 surrogates, produced by some JSON encoder implementations (Python, Java, C#) (#25193)
1 parent ae81347 commit 24f9128

2 files changed

Lines changed: 79 additions & 2 deletions

File tree

vlib/x/json2/decoder2/decode.v

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -412,11 +412,41 @@ fn (mut decoder Decoder) decode_value[T](mut val T) ! {
412412
string_buffer << `\t`
413413
}
414414
`u` {
	// `\uXXXX` escape: parse the four hex digits that follow `\u` as one
	// UTF-16 code unit.
	unicode_point := rune(strconv.parse_uint(decoder.json[
		string_info.position + string_index..string_info.position +
		string_index + 4], 16, 32)!)

	string_index += 4

	if unicode_point < 0xD800 || unicode_point > 0xDFFF {
		// Not a surrogate. The upper bound matters: U+E000..U+FFFF are
		// ordinary BMP code points and must not be mistaken for trail
		// surrogates just because they are >= 0xDC00.
		string_buffer << unicode_point.bytes()
	} else if unicode_point >= 0xDC00 { // 0xDC00..0xDFFF: trail surrogate with no head -> invalid
		decoder.decode_error('Got trail surrogate: ${u32(unicode_point):04X} before head surrogate.')!
	} else { // 0xD800..0xDBFF: head surrogate -> a `\uXXXX` trail surrogate must follow
		// A trail escape needs 6 more characters (`\u` + 4 hex digits).
		// NOTE(review): assumes `string_info.length` is measured in the same
		// units as `string_index` - confirm against the enclosing function.
		if string_index > string_info.length - 6 {
			decoder.decode_error('Expected a trail surrogate after a head surrogate, but got no valid escape sequence.')!
		}
		if decoder.json[string_info.position + string_index..
			string_info.position + string_index + 2] != '\\u' {
			decoder.decode_error('Expected a trail surrogate after a head surrogate, but got no valid escape sequence.')!
		}

		string_index += 2

		unicode_point2 := rune(strconv.parse_uint(decoder.json[
			string_info.position + string_index..string_info.position +
			string_index + 4], 16, 32)!)

		string_index += 4

		// Only 0xDC00..0xDFFF is a trail surrogate; anything else (a second
		// head surrogate, or a plain BMP code point) cannot complete the pair.
		if unicode_point2 < 0xDC00 || unicode_point2 > 0xDFFF {
			// Report the offending second code unit, not the head surrogate.
			decoder.decode_error('Expected a trail surrogate after a head surrogate, but got ${u32(unicode_point2):04X}.')!
		}

		// Combine the pair: the low 10 bits of the head become the high
		// bits, the low 10 bits of the trail the low bits, offset by 0x10000.
		final_unicode_point := (unicode_point2 & 0x3FF) +
			((unicode_point & 0x3FF) << 10) + 0x10000
		string_buffer << final_unicode_point.bytes()
	}
}
421451
else {} // has already been checked
422452
}

vlib/x/json2/decoder2/tests/decode_escaped_string_test.v

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,50 @@ fn test_decode_escaped_string() {
1010

1111
assert escaped_strings == decoded_strings
1212
}
13+
14+
// test_surrogate checks that a UTF-16 surrogate pair written as two `\uXXXX`
// escapes decodes to the single character it encodes, both on its own and
// when followed by ordinary text.
fn test_surrogate() {
	emoji_only := decoder2.decode[string](r'"\ud83d\ude00"')!
	assert emoji_only == '😀'

	emoji_then_text := decoder2.decode[string](r'"\ud83d\ude00 text"')!
	assert emoji_then_text == '😀 text'
}
18+
19+
// expect_surrogate_error asserts that decoding `json` as a string fails with a
// `decoder2.JsonDecodeError` at line 1, character 1 carrying `expected_message`.
// A successful decode, or an error of any other type, fails the test instead
// of being silently accepted.
fn expect_surrogate_error(json string, expected_message string) {
	if _ := decoder2.decode[string](json) {
		assert false, 'decoding ${json} should have failed'
	} else {
		if err is decoder2.JsonDecodeError {
			assert err.line == 1
			assert err.character == 1
			assert err.message == expected_message
		} else {
			assert false, 'expected a JsonDecodeError for ${json}, got: ${err}'
		}
	}
}

// test_invalid_surrogate checks that malformed surrogate escapes are rejected
// with the matching error message.
fn test_invalid_surrogate() {
	// head surrogate at the very end of the string
	expect_surrogate_error(r'"\ud83d"', 'Data: Expected a trail surrogate after a head surrogate, but got no valid escape sequence.')

	// head surrogate followed by escapes that are not `\u`
	expect_surrogate_error(r'"\ud83d\n\n\n\n"', 'Data: Expected a trail surrogate after a head surrogate, but got no valid escape sequence.')

	// head surrogate followed by another head surrogate
	expect_surrogate_error(r'"\ud83d\ud83d"', 'Data: Expected a trail surrogate after a head surrogate, but got D83D.')

	// trail surrogate appearing before any head surrogate
	expect_surrogate_error(r'"\ude00\ud83d"', 'Data: Got trail surrogate: DE00 before head surrogate.')
}

0 commit comments

Comments
 (0)