@@ -14,7 +14,9 @@ type error =
1414 | UnclosedChar (int )
1515 | UnclosedBlockComment (int )
1616 | UnclosedDocComment (int )
17- | IllegalUnicodeCodePoint (string );
17+ | IllegalUnicodeCodePoint (string )
18+ | IllegalByteStringUnicodeChar (string )
19+ | IllegalByteStringUnicodeEscape (string );
1820
1921exception Error ( Location . t, error) ;
2022
@@ -32,6 +34,18 @@ let report_error = (ppf, err) =>
3234 Format . fprintf (ppf, "Unclosed doc comment, opened on line % d " , line)
3335 | IllegalUnicodeCodePoint (cp ) =>
3436 Format . fprintf (ppf, "Illegal unicode code point: % S" , cp)
37+ | IllegalByteStringUnicodeChar (cp ) =>
38+ Format . fprintf (
39+ ppf,
40+ "Byte strings may not contain non-ascii unicode characters: % S" ,
41+ cp,
42+ )
43+ | IllegalByteStringUnicodeEscape (cp ) =>
44+ Format . fprintf (
45+ ppf,
46+ "Byte strings may not contain unicode escapes: % S" ,
47+ cp,
48+ )
3549 };
3650
3751let () =
@@ -42,13 +56,15 @@ let () =
4256 | _ => None ,
4357 );
4458
45- let add_code_point = (buf, str, loc) => {
59+ let add_code_point = (buf, str, unicode , loc) => {
4660 let (esc , numstr ) = (
4761 String . sub(str, 1 , 1 ),
4862 String . sub(str, 2 , String . length(str) - 2 ),
4963 );
5064 let code_point =
5165 switch (esc) {
66+ | "u" when ! unicode =>
67+ raise (Error (loc, IllegalByteStringUnicodeEscape (str)))
5268 | "u" when numstr. [0] == '{' =>
5369 Scanf . sscanf(String . sub(numstr, 1 , String . length(numstr) - 1 ), "% x" , x =>
5470 x
@@ -334,9 +350,12 @@ let rec token = lexbuf => {
334350 positioned(INFIX_50 (Sedlexing . Utf8 . lexeme(lexbuf)))
335351 | "!" => positioned(PREFIX_150 (Sedlexing . Utf8 . lexeme(lexbuf)))
336352 | "@" => positioned(AT )
353+ | "b\" " =>
354+ let (start_p , _ ) = Sedlexing . lexing_positions(lexbuf);
355+ read_str(start_p, Buffer . create(16 ), false , lexbuf);
337356 | '"' =>
338357 let (start_p , _ ) = Sedlexing . lexing_positions(lexbuf);
339- read_str(start_p, Buffer . create(16 ), lexbuf);
358+ read_str(start_p, Buffer . create(16 ), true , lexbuf);
340359 | "'" =>
341360 let (start_p , _ ) = Sedlexing . lexing_positions(lexbuf);
342361 read_char(start_p, Buffer . create(4 ), lexbuf);
@@ -347,42 +366,63 @@ let rec token = lexbuf => {
347366 | _ => raise (Error (lexbuf_loc(lexbuf), UnrecognizedToken ))
348367 };
349368}
350- and read_str = (start_p, buf, lexbuf) => {
369+ and read_str = (start_p, buf, unicode , lexbuf) => {
351370 switch %sedlex (lexbuf) {
352- | ('\\' , newline_char ) => read_str(start_p, buf, lexbuf)
371+ | ('\\' , newline_char ) => read_str(start_p, buf, unicode , lexbuf)
353372 | "\\ b" =>
354373 Buffer . add_char(buf, '\b' );
355- read_str(start_p, buf, lexbuf);
374+ read_str(start_p, buf, unicode , lexbuf);
356375 | "\\ f" =>
357376 Buffer . add_char(buf, '\012' );
358- read_str(start_p, buf, lexbuf);
377+ read_str(start_p, buf, unicode , lexbuf);
359378 | "\\ n" =>
360379 Buffer . add_char(buf, '\n' );
361- read_str(start_p, buf, lexbuf);
380+ read_str(start_p, buf, unicode , lexbuf);
362381 | "\\ r" =>
363382 Buffer . add_char(buf, '\r' );
364- read_str(start_p, buf, lexbuf);
383+ read_str(start_p, buf, unicode , lexbuf);
365384 | "\\ t" =>
366385 Buffer . add_char(buf, '\t' );
367- read_str(start_p, buf, lexbuf);
386+ read_str(start_p, buf, unicode , lexbuf);
368387 | "\\ v" =>
369388 Buffer . add_char(buf, '\011' );
370- read_str(start_p, buf, lexbuf);
389+ read_str(start_p, buf, unicode , lexbuf);
371390 | "\\\" " =>
372391 Buffer . add_char(buf, '"' );
373- read_str(start_p, buf, lexbuf);
392+ read_str(start_p, buf, unicode , lexbuf);
374393 | "\\\\ " =>
375394 Buffer . add_char(buf, '\\' );
376- read_str(start_p, buf, lexbuf);
395+ read_str(start_p, buf, unicode , lexbuf);
377396 | num_esc =>
378- add_code_point(buf, Sedlexing . Utf8 . lexeme(lexbuf), lexbuf_loc(lexbuf));
379- read_str(start_p, buf, lexbuf);
397+ add_code_point(
398+ buf,
399+ Sedlexing . Utf8 . lexeme(lexbuf),
400+ unicode,
401+ lexbuf_loc(lexbuf),
402+ );
403+ read_str(start_p, buf, unicode, lexbuf);
380404 | '"' =>
381405 let (_ , end_p ) = Sedlexing . lexing_positions(lexbuf);
382- (STRING (Buffer . contents(buf)), start_p, end_p);
383- | any =>
406+ if (unicode) {
407+ (STRING (Buffer . contents(buf)), start_p, end_p);
408+ } else {
409+ (BYTES (Buffer . contents(buf)), start_p, end_p);
410+ };
411+ | 0 .. 127 =>
384412 Buffer . add_string(buf, Sedlexing . Utf8 . lexeme(lexbuf));
385- read_str(start_p, buf, lexbuf);
413+ read_str(start_p, buf, unicode, lexbuf);
414+ | any =>
415+ if (unicode) {
416+ Buffer . add_string(buf, Sedlexing . Utf8 . lexeme(lexbuf));
417+ read_str(start_p, buf, unicode, lexbuf);
418+ } else {
419+ raise (
420+ Error (
421+ lexbuf_loc(lexbuf),
422+ IllegalByteStringUnicodeChar (Sedlexing . Utf8 . lexeme(lexbuf)),
423+ ),
424+ );
425+ }
386426 | _ =>
387427 let (_, end_p) = Sedlexing . lexing_positions(lexbuf);
388428 raise (
@@ -420,7 +460,12 @@ and read_char = (start_p, buf, lexbuf) => {
420460 Buffer . add_char(buf, '\\' );
421461 read_char(start_p, buf, lexbuf);
422462 | num_esc =>
423- add_code_point(buf, Sedlexing . Utf8 . lexeme(lexbuf), lexbuf_loc(lexbuf));
463+ add_code_point(
464+ buf,
465+ Sedlexing . Utf8 . lexeme(lexbuf),
466+ true ,
467+ lexbuf_loc(lexbuf),
468+ );
424469 read_char(start_p, buf, lexbuf);
425470 | "'" =>
426471 let (_ , end_p ) = Sedlexing . lexing_positions(lexbuf);
0 commit comments