Skip to content

Commit 4f19d71

Browse files
authored
fix(compiler)!: Apply correct rules for parsing Unicode whitespace (#1554)
1 parent cce2821 commit 4f19d71

File tree

2 files changed

+162
-10
lines changed

2 files changed

+162
-10
lines changed

compiler/src/parsing/lexer.re

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,9 @@ let collect_comment = (comment_type, source, loc, lexbuf) => {
7979
comments := [comment_type(source, loc), ...comments^];
8080
};
8181

82+
// Grain follows the Unicode properties for programming languages outlined in
83+
// https://unicode.org/reports/tr31/#Pattern_Syntax
84+
8285
let dec_digit = [%sedlex.regexp? '0' .. '9'];
8386
let hex_digit = [%sedlex.regexp? '0' .. '9' | 'A' .. 'F' | 'a' .. 'f'];
8487
let oct_digit = [%sedlex.regexp? '0' .. '7'];
@@ -118,7 +121,9 @@ let dec_float = [%sedlex.regexp?
118121

119122
let unsigned_float = [%sedlex.regexp? dec_float];
120123

121-
let uident = [%sedlex.regexp? (lu, Star(xid_continue))];
124+
let uident = [%sedlex.regexp?
125+
(Intersect(xid_start, lu), Star(xid_continue))
126+
];
122127
let lident = [%sedlex.regexp?
123128
(Sub(xid_start, lu) | '_', Star(xid_continue))
124129
];
@@ -150,22 +155,35 @@ let slash_operator_chars = [%sedlex.regexp?
150155
(Sub(operator_char, '/' | '*'), operator_chars)
151156
];
152157

153-
// Tabs and space separators (https://www.compart.com/en/unicode/category/Zs)
154-
let blank = [%sedlex.regexp? Plus(zs | '\t')];
155-
156158
let unicode_esc = [%sedlex.regexp? ("\\u{", Rep(hex_digit, 1 .. 6), "}")];
157159
let unicode4_esc = [%sedlex.regexp? ("\\u", Rep(hex_digit, 4))];
158160
let hex_esc = [%sedlex.regexp? ("\\x", Rep(hex_digit, 1 .. 2))];
159161
let oct_esc = [%sedlex.regexp? ("\\", Rep(oct_digit, 1 .. 3))];
160162
let num_esc = [%sedlex.regexp? unicode_esc | unicode4_esc | hex_esc | oct_esc];
161163

162-
let newline_char = [%sedlex.regexp? "\r\n" | '\n'];
163-
let newline_chars = [%sedlex.regexp?
164-
(Star(newline_char | blank), newline_char)
164+
// Whitespace follows Unicode's Pattern_White_Space property, though we lex horizontal whitespace (blank) separately from newline characters
165+
// https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=[:Pattern_White_Space=Yes:]
166+
167+
// U+0009 HORIZONTAL TABULATION
168+
// U+000B VERTICAL TABULATION
169+
// U+0020 SPACE
170+
// U+200E LEFT-TO-RIGHT MARK
171+
// U+200F RIGHT-TO-LEFT MARK
172+
let blank = [%sedlex.regexp? Plus(0x09 | 0x0B | 0x20 | 0x200E | 0x200F)];
173+
174+
// U+000A LINE FEED
175+
// U+000C FORM FEED
176+
// U+000D CARRIAGE RETURN
177+
// U+0085 NEXT LINE
178+
// U+2028 LINE SEPARATOR
179+
// U+2029 PARAGRAPH SEPARATOR
180+
let newline_char = [%sedlex.regexp?
181+
0x0A | 0x0C | 0x0D | 0x85 | 0x2028 | 0x2029
165182
];
183+
let newlines = [%sedlex.regexp? (Star(newline_char | blank), newline_char)];
166184

167-
let line_comment = [%sedlex.regexp? ("//", Star(Compl('\r' | '\n')))];
168-
let shebang_comment = [%sedlex.regexp? ("#!", Star(Compl('\r' | '\n')))];
185+
let line_comment = [%sedlex.regexp? ("//", Star(Compl(newline_char)))];
186+
let shebang_comment = [%sedlex.regexp? ("#!", Star(Compl(newline_char)))];
169187

170188
let sub_lexeme = (lexbuf, first, last) => {
171189
// We use this implementation over Sedlexing's sub_lexeme since it supports negative indexing
@@ -205,7 +223,7 @@ let rec token = lexbuf => {
205223
Buffer.add_string(buf, "/**");
206224
read_doc_comment(start_p, buf, lexbuf);
207225
| blank => token(lexbuf)
208-
| newline_chars => positioned(EOL)
226+
| newlines => positioned(EOL)
209227
| (unsigned_float, 'f') => positioned(FLOAT32(sub_lexeme(lexbuf, 0, -1)))
210228
| (unsigned_float, 'd') => positioned(FLOAT64(sub_lexeme(lexbuf, 0, -1)))
211229
| unsigned_float =>

compiler/test/suites/parsing.re

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ describe("parsing", ({test, testSkip}) => {
77
let test_or_skip =
88
Sys.backend_type == Other("js_of_ocaml") ? testSkip : test;
99
let assertParse = makeParseRunner(test);
10+
let assertCompileError = makeCompileErrorRunner(test);
1011
let assertFileRun = makeFileRunner(test_or_skip);
1112

1213
// operators
@@ -237,4 +238,137 @@ describe("parsing", ({test, testSkip}) => {
237238
prog_loc: Location.dummy_loc,
238239
},
239240
);
241+
242+
// Whitespace tests
243+
244+
// Reason does not support OCaml's Unicode escapes, which is why these are
245+
// UTF-8 byte sequences instead of pretty Unicode escapes
246+
247+
assertParse(
248+
"whitespace_1",
249+
// In order,
250+
// HORIZONTAL TABULATION
251+
// VERTICAL TABULATION
252+
// SPACE
253+
// LEFT-TO-RIGHT MARK
254+
// RIGHT-TO-LEFT MARK
255+
// LINE FEED
256+
// FORM FEED
257+
// CARRIAGE RETURN
258+
// NEXT LINE
259+
// LINE SEPARATOR
260+
// PARAGRAPH SEPARATOR
261+
"
262+
module Test
263+
\x09
264+
\x0b
265+
\x20
266+
\xe2\x80\x8e
267+
\xe2\x80\x8f
268+
\x0a
269+
\x0c
270+
\x0d
271+
\xc2\x85
272+
\xe2\x80\xa8
273+
\xe2\x80\xa9
274+
",
275+
{
276+
module_name: Location.mknoloc("Test"),
277+
statements: [],
278+
comments: [],
279+
prog_loc: Location.dummy_loc,
280+
},
281+
);
282+
283+
assertCompileError(
284+
"invalid_whitespace_nbsp",
285+
"\xc2\xa0",
286+
"Grain lexer doesn't recognize this token",
287+
);
288+
assertCompileError(
289+
"invalid_whitespace_emspace",
290+
"\xe2\x80\x83",
291+
"Grain lexer doesn't recognize this token",
292+
);
293+
assertCompileError(
294+
"invalid_whitespace_hairspace",
295+
"\xe2\x80\x8a",
296+
"Grain lexer doesn't recognize this token",
297+
);
298+
assertCompileError(
299+
"invalid_whitespace_ideographicspace",
300+
"\xe3\x80\x80",
301+
"Grain lexer doesn't recognize this token",
302+
);
303+
304+
assertParse(
305+
"end_of_statement_linefeed",
306+
"module Test; a\x0ab",
307+
{
308+
module_name: Location.mknoloc("Test"),
309+
statements: [Toplevel.expr(a), Toplevel.expr(b)],
310+
comments: [],
311+
prog_loc: Location.dummy_loc,
312+
},
313+
);
314+
assertParse(
315+
"end_of_statement_formfeed",
316+
"module Test; a\x0cb",
317+
{
318+
module_name: Location.mknoloc("Test"),
319+
statements: [Toplevel.expr(a), Toplevel.expr(b)],
320+
comments: [],
321+
prog_loc: Location.dummy_loc,
322+
},
323+
);
324+
assertParse(
325+
"end_of_statement_carriagereturn",
326+
"module Test; a\x0db",
327+
{
328+
module_name: Location.mknoloc("Test"),
329+
statements: [Toplevel.expr(a), Toplevel.expr(b)],
330+
comments: [],
331+
prog_loc: Location.dummy_loc,
332+
},
333+
);
334+
assertParse(
335+
"end_of_statement_crlf",
336+
"module Test; a\x0d\x0ab",
337+
{
338+
module_name: Location.mknoloc("Test"),
339+
statements: [Toplevel.expr(a), Toplevel.expr(b)],
340+
comments: [],
341+
prog_loc: Location.dummy_loc,
342+
},
343+
);
344+
assertParse(
345+
"end_of_statement_nextline",
346+
"module Test; a\xc2\x85b",
347+
{
348+
module_name: Location.mknoloc("Test"),
349+
statements: [Toplevel.expr(a), Toplevel.expr(b)],
350+
comments: [],
351+
prog_loc: Location.dummy_loc,
352+
},
353+
);
354+
assertParse(
355+
"end_of_statement_lineseparator",
356+
"module Test; a\xe2\x80\xa8b",
357+
{
358+
module_name: Location.mknoloc("Test"),
359+
statements: [Toplevel.expr(a), Toplevel.expr(b)],
360+
comments: [],
361+
prog_loc: Location.dummy_loc,
362+
},
363+
);
364+
assertParse(
365+
"end_of_statement_paragraphseparator",
366+
"module Test; a\xe2\x80\xa9b",
367+
{
368+
module_name: Location.mknoloc("Test"),
369+
statements: [Toplevel.expr(a), Toplevel.expr(b)],
370+
comments: [],
371+
prog_loc: Location.dummy_loc,
372+
},
373+
);
240374
});

0 commit comments

Comments
 (0)