Skip to content

Commit 3d8e4c5

Browse files
authored
feat(compiler): Bytes literals (#1662)
1 parent 50cd4ed commit 3d8e4c5

File tree

4 files changed

+85
-20
lines changed

4 files changed

+85
-20
lines changed

compiler/src/parsing/ast_helper.rei

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ type str = loc(string);
3333
type loc = Location.t;
3434

3535
module Constant: {
36+
let bytes: string => constant;
3637
let string: string => constant;
3738
let char: string => constant;
3839
let number: number_type => constant;

compiler/src/parsing/lexer.re

Lines changed: 64 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ type error =
1414
| UnclosedChar(int)
1515
| UnclosedBlockComment(int)
1616
| UnclosedDocComment(int)
17-
| IllegalUnicodeCodePoint(string);
17+
| IllegalUnicodeCodePoint(string)
18+
| IllegalByteStringUnicodeChar(string)
19+
| IllegalByteStringUnicodeEscape(string);
1820

1921
exception Error(Location.t, error);
2022

@@ -32,6 +34,18 @@ let report_error = (ppf, err) =>
3234
Format.fprintf(ppf, "Unclosed doc comment, opened on line %d", line)
3335
| IllegalUnicodeCodePoint(cp) =>
3436
Format.fprintf(ppf, "Illegal unicode code point: %S", cp)
37+
| IllegalByteStringUnicodeChar(cp) =>
38+
Format.fprintf(
39+
ppf,
40+
"Byte strings may not contain non-ascii unicode characters: %S",
41+
cp,
42+
)
43+
| IllegalByteStringUnicodeEscape(cp) =>
44+
Format.fprintf(
45+
ppf,
46+
"Byte strings may not contain unicode escapes: %S",
47+
cp,
48+
)
3549
};
3650

3751
let () =
@@ -42,13 +56,15 @@ let () =
4256
| _ => None,
4357
);
4458

45-
let add_code_point = (buf, str, loc) => {
59+
let add_code_point = (buf, str, unicode, loc) => {
4660
let (esc, numstr) = (
4761
String.sub(str, 1, 1),
4862
String.sub(str, 2, String.length(str) - 2),
4963
);
5064
let code_point =
5165
switch (esc) {
66+
| "u" when !unicode =>
67+
raise(Error(loc, IllegalByteStringUnicodeEscape(str)))
5268
| "u" when numstr.[0] == '{' =>
5369
Scanf.sscanf(String.sub(numstr, 1, String.length(numstr) - 1), "%x", x =>
5470
x
@@ -334,9 +350,12 @@ let rec token = lexbuf => {
334350
positioned(INFIX_50(Sedlexing.Utf8.lexeme(lexbuf)))
335351
| "!" => positioned(PREFIX_150(Sedlexing.Utf8.lexeme(lexbuf)))
336352
| "@" => positioned(AT)
353+
| "b\"" =>
354+
let (start_p, _) = Sedlexing.lexing_positions(lexbuf);
355+
read_str(start_p, Buffer.create(16), false, lexbuf);
337356
| '"' =>
338357
let (start_p, _) = Sedlexing.lexing_positions(lexbuf);
339-
read_str(start_p, Buffer.create(16), lexbuf);
358+
read_str(start_p, Buffer.create(16), true, lexbuf);
340359
| "'" =>
341360
let (start_p, _) = Sedlexing.lexing_positions(lexbuf);
342361
read_char(start_p, Buffer.create(4), lexbuf);
@@ -347,42 +366,63 @@ let rec token = lexbuf => {
347366
| _ => raise(Error(lexbuf_loc(lexbuf), UnrecognizedToken))
348367
};
349368
}
350-
and read_str = (start_p, buf, lexbuf) => {
369+
and read_str = (start_p, buf, unicode, lexbuf) => {
351370
switch%sedlex (lexbuf) {
352-
| ('\\', newline_char) => read_str(start_p, buf, lexbuf)
371+
| ('\\', newline_char) => read_str(start_p, buf, unicode, lexbuf)
353372
| "\\b" =>
354373
Buffer.add_char(buf, '\b');
355-
read_str(start_p, buf, lexbuf);
374+
read_str(start_p, buf, unicode, lexbuf);
356375
| "\\f" =>
357376
Buffer.add_char(buf, '\012');
358-
read_str(start_p, buf, lexbuf);
377+
read_str(start_p, buf, unicode, lexbuf);
359378
| "\\n" =>
360379
Buffer.add_char(buf, '\n');
361-
read_str(start_p, buf, lexbuf);
380+
read_str(start_p, buf, unicode, lexbuf);
362381
| "\\r" =>
363382
Buffer.add_char(buf, '\r');
364-
read_str(start_p, buf, lexbuf);
383+
read_str(start_p, buf, unicode, lexbuf);
365384
| "\\t" =>
366385
Buffer.add_char(buf, '\t');
367-
read_str(start_p, buf, lexbuf);
386+
read_str(start_p, buf, unicode, lexbuf);
368387
| "\\v" =>
369388
Buffer.add_char(buf, '\011');
370-
read_str(start_p, buf, lexbuf);
389+
read_str(start_p, buf, unicode, lexbuf);
371390
| "\\\"" =>
372391
Buffer.add_char(buf, '"');
373-
read_str(start_p, buf, lexbuf);
392+
read_str(start_p, buf, unicode, lexbuf);
374393
| "\\\\" =>
375394
Buffer.add_char(buf, '\\');
376-
read_str(start_p, buf, lexbuf);
395+
read_str(start_p, buf, unicode, lexbuf);
377396
| num_esc =>
378-
add_code_point(buf, Sedlexing.Utf8.lexeme(lexbuf), lexbuf_loc(lexbuf));
379-
read_str(start_p, buf, lexbuf);
397+
add_code_point(
398+
buf,
399+
Sedlexing.Utf8.lexeme(lexbuf),
400+
unicode,
401+
lexbuf_loc(lexbuf),
402+
);
403+
read_str(start_p, buf, unicode, lexbuf);
380404
| '"' =>
381405
let (_, end_p) = Sedlexing.lexing_positions(lexbuf);
382-
(STRING(Buffer.contents(buf)), start_p, end_p);
383-
| any =>
406+
if (unicode) {
407+
(STRING(Buffer.contents(buf)), start_p, end_p);
408+
} else {
409+
(BYTES(Buffer.contents(buf)), start_p, end_p);
410+
};
411+
| 0 .. 127 =>
384412
Buffer.add_string(buf, Sedlexing.Utf8.lexeme(lexbuf));
385-
read_str(start_p, buf, lexbuf);
413+
read_str(start_p, buf, unicode, lexbuf);
414+
| any =>
415+
if (unicode) {
416+
Buffer.add_string(buf, Sedlexing.Utf8.lexeme(lexbuf));
417+
read_str(start_p, buf, unicode, lexbuf);
418+
} else {
419+
raise(
420+
Error(
421+
lexbuf_loc(lexbuf),
422+
IllegalByteStringUnicodeChar(Sedlexing.Utf8.lexeme(lexbuf)),
423+
),
424+
);
425+
}
386426
| _ =>
387427
let (_, end_p) = Sedlexing.lexing_positions(lexbuf);
388428
raise(
@@ -420,7 +460,12 @@ and read_char = (start_p, buf, lexbuf) => {
420460
Buffer.add_char(buf, '\\');
421461
read_char(start_p, buf, lexbuf);
422462
| num_esc =>
423-
add_code_point(buf, Sedlexing.Utf8.lexeme(lexbuf), lexbuf_loc(lexbuf));
463+
add_code_point(
464+
buf,
465+
Sedlexing.Utf8.lexeme(lexbuf),
466+
true,
467+
lexbuf_loc(lexbuf),
468+
);
424469
read_char(start_p, buf, lexbuf);
425470
| "'" =>
426471
let (_, end_p) = Sedlexing.lexing_positions(lexbuf);

compiler/src/parsing/parser.mly

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ module Grain_parsing = struct end
1616
%token <string> INT32 INT64 UINT32 UINT64 FLOAT32 FLOAT64 BIGINT
1717
%token <string> WASMI32 WASMI64 WASMF32 WASMF64
1818
%token <string> LIDENT UIDENT
19-
%token <string> STRING CHAR
19+
%token <string> STRING BYTES CHAR
2020
%token LBRACK LBRACKRCARET RBRACK LPAREN RPAREN LBRACE RBRACE LCARET RCARET
2121
%token COMMA SEMI AS
2222
%token THICKARROW ARROW
@@ -212,6 +212,7 @@ const:
212212
| FALSE { Constant.bool false, $loc }
213213
| VOID { Constant.void, $loc }
214214
| STRING { Constant.string $1, $loc }
215+
| BYTES { Constant.bytes $1, $loc }
215216
| CHAR { Constant.char $1, $loc }
216217

217218
expr:

compiler/test/suites/strings.re

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,4 +291,22 @@ bar", 1))|},
291291
{|include "float64"; from Float64 use *; print(div(-1.0d, 0.0d))|},
292292
"-Infinity\n",
293293
);
294+
295+
// Bytes literals
296+
assertRun("bytes_literal", {|print(b"abc")|}, "<bytes: 61 62 63 >\n");
297+
assertCompileError(
298+
"bytes_literal_err1",
299+
{|print(b"abc\u1234")|},
300+
"Byte strings may not contain unicode escapes",
301+
);
302+
assertCompileError(
303+
"bytes_literal_err2",
304+
{|print(b"abc\u{1234}")|},
305+
"Byte strings may not contain unicode escapes",
306+
);
307+
assertCompileError(
308+
"bytes_literal_err3",
309+
{|print(b"abc😂")|},
310+
"Byte strings may not contain non-ascii unicode characters",
311+
);
294312
});

0 commit comments

Comments
 (0)