feat(compiler): Bytes literals (#1662)

ospencer · web-flow · commit 3d8e4c5ee074 · 2023-02-16T17:54:28.000Z
diff --git a/compiler/src/parsing/ast_helper.rei b/compiler/src/parsing/ast_helper.rei
@@ -33,6 +33,7 @@ type str = loc(string);
 type loc = Location.t;
 
 module Constant: {
+  let bytes: string => constant;
   let string: string => constant;
   let char: string => constant;
   let number: number_type => constant;
diff --git a/compiler/src/parsing/lexer.re b/compiler/src/parsing/lexer.re
@@ -14,7 +14,9 @@ type error =
   | UnclosedChar(int)
   | UnclosedBlockComment(int)
   | UnclosedDocComment(int)
-  | IllegalUnicodeCodePoint(string);
+  | IllegalUnicodeCodePoint(string)
+  | IllegalByteStringUnicodeChar(string)
+  | IllegalByteStringUnicodeEscape(string);
 
 exception Error(Location.t, error);
 
@@ -32,6 +34,18 @@ let report_error = (ppf, err) =>
     Format.fprintf(ppf, "Unclosed doc comment, opened on line %d", line)
   | IllegalUnicodeCodePoint(cp) =>
     Format.fprintf(ppf, "Illegal unicode code point: %S", cp)
+  | IllegalByteStringUnicodeChar(cp) =>
+    Format.fprintf(
+      ppf,
+      "Byte strings may not contain non-ascii unicode characters: %S",
+      cp,
+    )
+  | IllegalByteStringUnicodeEscape(cp) =>
+    Format.fprintf(
+      ppf,
+      "Byte strings may not contain unicode escapes: %S",
+      cp,
+    )
   };
 
 let () =
@@ -42,13 +56,15 @@ let () =
     | _ => None,
   );
 
-let add_code_point = (buf, str, loc) => {
+let add_code_point = (buf, str, unicode, loc) => {
   let (esc, numstr) = (
     String.sub(str, 1, 1),
     String.sub(str, 2, String.length(str) - 2),
   );
   let code_point =
     switch (esc) {
+    | "u" when !unicode =>
+      raise(Error(loc, IllegalByteStringUnicodeEscape(str)))
     | "u" when numstr.[0] == '{' =>
       Scanf.sscanf(String.sub(numstr, 1, String.length(numstr) - 1), "%x", x =>
         x
@@ -334,9 +350,12 @@ let rec token = lexbuf => {
     positioned(INFIX_50(Sedlexing.Utf8.lexeme(lexbuf)))
   | "!" => positioned(PREFIX_150(Sedlexing.Utf8.lexeme(lexbuf)))
   | "@" => positioned(AT)
+  | "b\"" =>
+    let (start_p, _) = Sedlexing.lexing_positions(lexbuf);
+    read_str(start_p, Buffer.create(16), false, lexbuf);
   | '"' =>
     let (start_p, _) = Sedlexing.lexing_positions(lexbuf);
-    read_str(start_p, Buffer.create(16), lexbuf);
+    read_str(start_p, Buffer.create(16), true, lexbuf);
   | "'" =>
     let (start_p, _) = Sedlexing.lexing_positions(lexbuf);
     read_char(start_p, Buffer.create(4), lexbuf);
@@ -347,42 +366,63 @@ let rec token = lexbuf => {
   | _ => raise(Error(lexbuf_loc(lexbuf), UnrecognizedToken))
   };
 }
-and read_str = (start_p, buf, lexbuf) => {
+and read_str = (start_p, buf, unicode, lexbuf) => {
   switch%sedlex (lexbuf) {
-  | ('\\', newline_char) => read_str(start_p, buf, lexbuf)
+  | ('\\', newline_char) => read_str(start_p, buf, unicode, lexbuf)
   | "\\b" =>
     Buffer.add_char(buf, '\b');
-    read_str(start_p, buf, lexbuf);
+    read_str(start_p, buf, unicode, lexbuf);
   | "\\f" =>
     Buffer.add_char(buf, '\012');
-    read_str(start_p, buf, lexbuf);
+    read_str(start_p, buf, unicode, lexbuf);
   | "\\n" =>
     Buffer.add_char(buf, '\n');
-    read_str(start_p, buf, lexbuf);
+    read_str(start_p, buf, unicode, lexbuf);
   | "\\r" =>
     Buffer.add_char(buf, '\r');
-    read_str(start_p, buf, lexbuf);
+    read_str(start_p, buf, unicode, lexbuf);
   | "\\t" =>
     Buffer.add_char(buf, '\t');
-    read_str(start_p, buf, lexbuf);
+    read_str(start_p, buf, unicode, lexbuf);
   | "\\v" =>
     Buffer.add_char(buf, '\011');
-    read_str(start_p, buf, lexbuf);
+    read_str(start_p, buf, unicode, lexbuf);
   | "\\\"" =>
     Buffer.add_char(buf, '"');
-    read_str(start_p, buf, lexbuf);
+    read_str(start_p, buf, unicode, lexbuf);
   | "\\\\" =>
     Buffer.add_char(buf, '\\');
-    read_str(start_p, buf, lexbuf);
+    read_str(start_p, buf, unicode, lexbuf);
   | num_esc =>
-    add_code_point(buf, Sedlexing.Utf8.lexeme(lexbuf), lexbuf_loc(lexbuf));
-    read_str(start_p, buf, lexbuf);
+    add_code_point(
+      buf,
+      Sedlexing.Utf8.lexeme(lexbuf),
+      unicode,
+      lexbuf_loc(lexbuf),
+    );
+    read_str(start_p, buf, unicode, lexbuf);
   | '"' =>
     let (_, end_p) = Sedlexing.lexing_positions(lexbuf);
-    (STRING(Buffer.contents(buf)), start_p, end_p);
-  | any =>
+    if (unicode) {
+      (STRING(Buffer.contents(buf)), start_p, end_p);
+    } else {
+      (BYTES(Buffer.contents(buf)), start_p, end_p);
+    };
+  | 0 .. 127 =>
     Buffer.add_string(buf, Sedlexing.Utf8.lexeme(lexbuf));
-    read_str(start_p, buf, lexbuf);
+    read_str(start_p, buf, unicode, lexbuf);
+  | any =>
+    if (unicode) {
+      Buffer.add_string(buf, Sedlexing.Utf8.lexeme(lexbuf));
+      read_str(start_p, buf, unicode, lexbuf);
+    } else {
+      raise(
+        Error(
+          lexbuf_loc(lexbuf),
+          IllegalByteStringUnicodeChar(Sedlexing.Utf8.lexeme(lexbuf)),
+        ),
+      );
+    }
   | _ =>
     let (_, end_p) = Sedlexing.lexing_positions(lexbuf);
     raise(
@@ -420,7 +460,12 @@ and read_char = (start_p, buf, lexbuf) => {
     Buffer.add_char(buf, '\\');
     read_char(start_p, buf, lexbuf);
   | num_esc =>
-    add_code_point(buf, Sedlexing.Utf8.lexeme(lexbuf), lexbuf_loc(lexbuf));
+    add_code_point(
+      buf,
+      Sedlexing.Utf8.lexeme(lexbuf),
+      true,
+      lexbuf_loc(lexbuf),
+    );
     read_char(start_p, buf, lexbuf);
   | "'" =>
     let (_, end_p) = Sedlexing.lexing_positions(lexbuf);
diff --git a/compiler/src/parsing/parser.mly b/compiler/src/parsing/parser.mly
@@ -16,7 +16,7 @@ module Grain_parsing = struct end
 %token <string> INT32 INT64 UINT32 UINT64 FLOAT32 FLOAT64 BIGINT
 %token <string> WASMI32 WASMI64 WASMF32 WASMF64
 %token <string> LIDENT UIDENT
-%token <string> STRING CHAR
+%token <string> STRING BYTES CHAR
 %token LBRACK LBRACKRCARET RBRACK LPAREN RPAREN LBRACE RBRACE LCARET RCARET
 %token COMMA SEMI AS
 %token THICKARROW ARROW
@@ -212,6 +212,7 @@ const:
   | FALSE { Constant.bool false, $loc }
   | VOID { Constant.void, $loc }
   | STRING { Constant.string $1, $loc }
+  | BYTES { Constant.bytes $1, $loc }
   | CHAR { Constant.char $1, $loc }
 
 expr:
diff --git a/compiler/test/suites/strings.re b/compiler/test/suites/strings.re
@@ -291,4 +291,22 @@ bar", 1))|},
     {|include "float64"; from Float64 use *; print(div(-1.0d, 0.0d))|},
     "-Infinity\n",
   );
+
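+  // Bytes literals (b"..."): plain ASCII content is accepted; \u escapes and non-ASCII source characters are compile errors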
+  assertRun("bytes_literal", {|print(b"abc")|}, "<bytes: 61 62 63 >\n");
+  assertCompileError(
+    "bytes_literal_err1",
+    {|print(b"abc\u1234")|},
+    "Byte strings may not contain unicode escapes",
+  );
+  assertCompileError(
+    "bytes_literal_err2",
+    {|print(b"abc\u{1234}")|},
+    "Byte strings may not contain unicode escapes",
+  );
+  assertCompileError(
+    "bytes_literal_err3",
+    {|print(b"abc😂")|},
+    "Byte strings may not contain non-ascii unicode characters",
+  );
 });