Skip to content

Commit b92082b

Browse files
implement lenient parser (#2129)
* move query parser to nom * add support for term grouping * initial work on infallible parser * fmt * add tests and fix minor parsing bugs * address review comments * add support for lenient queries in tantivy * make lenient parser report errors * allow mixing occur and bool in query
1 parent c2be660 commit b92082b

File tree

8 files changed

+1662
-509
lines changed

8 files changed

+1662
-509
lines changed

query-grammar/Cargo.toml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,4 @@ keywords = ["search", "information", "retrieval"]
1212
edition = "2021"
1313

1414
[dependencies]
15-
combine = {version="4", default-features=false, features=[] }
16-
once_cell = "1.7.2"
17-
regex ={ version = "1.5.4", default-features = false, features = ["std", "unicode"] }
15+
nom = "7"

query-grammar/src/infallible.rs

Lines changed: 353 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,353 @@
1+
//! nom combinators for infallible operations
2+
3+
use std::convert::Infallible;
4+
5+
use nom::{AsChar, IResult, InputLength, InputTakeAtPosition};
6+
7+
pub(crate) type ErrorList = Vec<LenientErrorInternal>;
8+
pub(crate) type JResult<I, O> = IResult<I, (O, ErrorList), Infallible>;
9+
10+
/// An error, with an end-of-string based offset.
///
/// The combinators in this module fill `pos` with the length of the input that was still
/// remaining where the problem was detected, i.e. a distance from the END of the input.
/// `LenientError::from_internal` converts it with `str_len - pos`.
11+
#[derive(Debug)]
12+
pub(crate) struct LenientErrorInternal {
// Length of the input remaining at the point of the error (distance from the end).
13+
pub pos: usize,
// Human-readable description of the problem.
14+
pub message: String,
15+
}
16+
17+
/// A recoverable error and the position it happened at.
///
/// Public counterpart of `LenientErrorInternal`, built by `LenientError::from_internal`,
/// which computes `pos` as `str_len - internal.pos`.
18+
#[derive(Debug, PartialEq)]
19+
pub struct LenientError {
// Position of the error as computed by `from_internal` (presumably an offset from the start
// of the query string; confirm against the caller that supplies `str_len`).
20+
pub pos: usize,
// Human-readable description of the problem.
21+
pub message: String,
22+
}
23+
24+
impl LenientError {
25+
pub(crate) fn from_internal(internal: LenientErrorInternal, str_len: usize) -> LenientError {
26+
LenientError {
27+
pos: str_len - internal.pos,
28+
message: internal.message,
29+
}
30+
}
31+
}
32+
33+
fn unwrap_infallible<T>(res: Result<T, nom::Err<Infallible>>) -> T {
34+
match res {
35+
Ok(val) => val,
36+
Err(_) => unreachable!(),
37+
}
38+
}
39+
40+
// when rfcs#1733 gets stabilized, this can make things clearer
41+
// trait InfallibleParser<I, O> = nom::Parser<I, (O, ErrorList), std::convert::Infallible>;
42+
43+
/// A variant of the classical `opt` parser, except it returns an infallible error type.
44+
///
45+
/// It's less generic than the original to ease type resolution in the rest of the code.
46+
pub(crate) fn opt_i<I: Clone, O, F>(mut f: F) -> impl FnMut(I) -> JResult<I, Option<O>>
47+
where F: nom::Parser<I, O, nom::error::Error<I>> {
48+
move |input: I| {
49+
let i = input.clone();
50+
match f.parse(input) {
51+
Ok((i, o)) => Ok((i, (Some(o), Vec::new()))),
52+
Err(_) => Ok((i, (None, Vec::new()))),
53+
}
54+
}
55+
}
56+
57+
pub(crate) fn opt_i_err<'a, I: Clone + InputLength, O, F>(
58+
mut f: F,
59+
message: impl ToString + 'a,
60+
) -> impl FnMut(I) -> JResult<I, Option<O>> + 'a
61+
where
62+
F: nom::Parser<I, O, nom::error::Error<I>> + 'a,
63+
{
64+
move |input: I| {
65+
let i = input.clone();
66+
match f.parse(input) {
67+
Ok((i, o)) => Ok((i, (Some(o), Vec::new()))),
68+
Err(_) => {
69+
let errs = vec![LenientErrorInternal {
70+
pos: i.input_len(),
71+
message: message.to_string(),
72+
}];
73+
Ok((i, (None, errs)))
74+
}
75+
}
76+
}
77+
}
78+
79+
pub(crate) fn space0_infallible<T>(input: T) -> JResult<T, T>
80+
where
81+
T: InputTakeAtPosition + Clone,
82+
<T as InputTakeAtPosition>::Item: AsChar + Clone,
83+
{
84+
opt_i(nom::character::complete::space0)(input)
85+
.map(|(left, (spaces, errors))| (left, (spaces.expect("space0 can't fail"), errors)))
86+
}
87+
88+
pub(crate) fn space1_infallible<T>(input: T) -> JResult<T, Option<T>>
89+
where
90+
T: InputTakeAtPosition + Clone + InputLength,
91+
<T as InputTakeAtPosition>::Item: AsChar + Clone,
92+
{
93+
opt_i(nom::character::complete::space1)(input).map(|(left, (spaces, mut errors))| {
94+
if spaces.is_none() {
95+
errors.push(LenientErrorInternal {
96+
pos: left.input_len(),
97+
message: "missing space".to_string(),
98+
})
99+
}
100+
(left, (spaces, errors))
101+
})
102+
}
103+
104+
pub(crate) fn fallible<I, O, E: nom::error::ParseError<I>, F>(
105+
mut f: F,
106+
) -> impl FnMut(I) -> IResult<I, O, E>
107+
where F: nom::Parser<I, (O, ErrorList), Infallible> {
108+
use nom::Err;
109+
move |input: I| match f.parse(input) {
110+
Ok((input, (output, _err))) => Ok((input, output)),
111+
Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
112+
Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
113+
}
114+
}
115+
116+
pub(crate) fn delimited_infallible<I, O1, O2, O3, F, G, H>(
117+
mut first: F,
118+
mut second: G,
119+
mut third: H,
120+
) -> impl FnMut(I) -> JResult<I, O2>
121+
where
122+
F: nom::Parser<I, (O1, ErrorList), Infallible>,
123+
G: nom::Parser<I, (O2, ErrorList), Infallible>,
124+
H: nom::Parser<I, (O3, ErrorList), Infallible>,
125+
{
126+
move |input: I| {
127+
let (input, (_, mut err)) = first.parse(input)?;
128+
let (input, (o2, mut err2)) = second.parse(input)?;
129+
err.append(&mut err2);
130+
let (input, (_, mut err3)) = third.parse(input)?;
131+
err.append(&mut err3);
132+
Ok((input, (o2, err)))
133+
}
134+
}
135+
136+
// Parse nothing. Just a lazy way to not implement terminated/preceded and use delimited instead
137+
pub(crate) fn nothing(i: &str) -> JResult<&str, ()> {
138+
Ok((i, ((), Vec::new())))
139+
}
140+
141+
// Infallible counterpart of nom's tuple helper trait. It is implemented in this file for `()`,
// for `(F,)` and, through the `tuple_trait!` macro, for larger tuples of infallible parsers.
pub(crate) trait TupleInfallible<I, O> {
142+
/// Parses the input and returns a tuple of results of each parser.
///
/// The result is a `JResult`, so the outputs come paired with a list of recoverable errors.
143+
fn parse(&mut self, input: I) -> JResult<I, O>;
144+
}
145+
146+
impl<Input, Output, F: nom::Parser<Input, (Output, ErrorList), Infallible>>
147+
TupleInfallible<Input, (Output,)> for (F,)
148+
{
149+
fn parse(&mut self, input: Input) -> JResult<Input, (Output,)> {
150+
self.0.parse(input).map(|(i, (o, e))| (i, ((o,), e)))
151+
}
152+
}
153+
154+
// these macros are heavily copied from nom, with some minor adaptations for our type
//
// `tuple_trait!` takes a list of `ParserName OutputType` identifier pairs and emits one
// `tuple_trait_impl!` per prefix of that list, starting with the first two pairs and growing by
// one pair per recursion step.
155+
macro_rules! tuple_trait(
156+
// Entry point: seed the accumulator (left of `;`) with the first two pairs.
($name1:ident $ty1:ident, $name2: ident $ty2:ident, $($name:ident $ty:ident),*) => (
157+
tuple_trait!(__impl $name1 $ty1, $name2 $ty2; $($name $ty),*);
158+
);
159+
// Recursive step: emit the impl for the accumulated pairs, then move the next pair into the
// accumulator.
(__impl $($name:ident $ty: ident),+; $name1:ident $ty1:ident, $($name2:ident $ty2:ident),*) => (
160+
tuple_trait_impl!($($name $ty),+);
161+
tuple_trait!(__impl $($name $ty),+ , $name1 $ty1; $($name2 $ty2),*);
162+
);
163+
// Last pair: emit the impl for the accumulated pairs and the one including the last pair.
(__impl $($name:ident $ty: ident),+; $name1:ident $ty1:ident) => (
164+
tuple_trait_impl!($($name $ty),+);
165+
tuple_trait_impl!($($name $ty),+, $name1 $ty1);
166+
);
167+
);
168+
169+
// Emits `impl TupleInfallible<Input, (outputs...)> for (parsers...)` for one tuple arity.
// Each `$name` is a parser type parameter and `$ty` the matching output type parameter.
macro_rules! tuple_trait_impl(
170+
($($name:ident $ty: ident),+) => (
171+
impl<
172+
Input: Clone, $($ty),+ ,
173+
$($name: nom::Parser<Input, ($ty, ErrorList), Infallible>),+
174+
> TupleInfallible<Input, ( $($ty),+ )> for ( $($name),+ ) {
175+
176+
fn parse(&mut self, input: Input) -> JResult<Input, ( $($ty),+ )> {
177+
// Errors of all the parsers are accumulated here by `tuple_trait_inner!`.
let mut error_list = Vec::new();
178+
// Start at tuple index 0 with an empty list of parsed outputs.
tuple_trait_inner!(0, self, input, (), error_list, $($name)+)
179+
}
180+
}
181+
);
182+
);
183+
184+
// Expands to the body of `TupleInfallible::parse`: runs the parser at tuple index `$it` on
// `$input`, appends its errors to `$error_list`, and recurses (through `succ!`, which bumps the
// index) with the remaining parser identifiers. Outputs parsed so far travel in the
// parenthesised token list.
macro_rules! tuple_trait_inner(
185+
// First parser: nothing has been parsed yet, the output list starts with `o`.
($it:tt, $self:expr, $input:expr, (), $error_list:expr, $head:ident $($id:ident)+) => ({
186+
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
187+
$error_list.append(&mut err);
188+
189+
succ!($it, tuple_trait_inner!($self, i, ( o ), $error_list, $($id)+))
190+
});
191+
// Middle parser: push `o` after the outputs already parsed and keep going.
($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident $($id:ident)+) => ({
192+
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
193+
$error_list.append(&mut err);
194+
195+
succ!($it, tuple_trait_inner!($self, i, ($($parsed)* , o), $error_list, $($id)+))
196+
});
197+
// Last parser: build the final tuple of outputs together with the accumulated errors.
($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident) => ({
198+
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
199+
$error_list.append(&mut err);
200+
201+
Ok((i, (($($parsed)* , o), $error_list)))
202+
});
203+
);
204+
205+
// Successor helper for the recursive macros of this file: given a literal index `n` and a macro
// invocation `$submac!(args...)`, expands to `$submac!(n + 1, args...)`. Literal indices 0 to 20
// are supported.
macro_rules! succ (
206+
(0, $submac:ident ! ($($rest:tt)*)) => ($submac!(1, $($rest)*));
207+
(1, $submac:ident ! ($($rest:tt)*)) => ($submac!(2, $($rest)*));
208+
(2, $submac:ident ! ($($rest:tt)*)) => ($submac!(3, $($rest)*));
209+
(3, $submac:ident ! ($($rest:tt)*)) => ($submac!(4, $($rest)*));
210+
(4, $submac:ident ! ($($rest:tt)*)) => ($submac!(5, $($rest)*));
211+
(5, $submac:ident ! ($($rest:tt)*)) => ($submac!(6, $($rest)*));
212+
(6, $submac:ident ! ($($rest:tt)*)) => ($submac!(7, $($rest)*));
213+
(7, $submac:ident ! ($($rest:tt)*)) => ($submac!(8, $($rest)*));
214+
(8, $submac:ident ! ($($rest:tt)*)) => ($submac!(9, $($rest)*));
215+
(9, $submac:ident ! ($($rest:tt)*)) => ($submac!(10, $($rest)*));
216+
(10, $submac:ident ! ($($rest:tt)*)) => ($submac!(11, $($rest)*));
217+
(11, $submac:ident ! ($($rest:tt)*)) => ($submac!(12, $($rest)*));
218+
(12, $submac:ident ! ($($rest:tt)*)) => ($submac!(13, $($rest)*));
219+
(13, $submac:ident ! ($($rest:tt)*)) => ($submac!(14, $($rest)*));
220+
(14, $submac:ident ! ($($rest:tt)*)) => ($submac!(15, $($rest)*));
221+
(15, $submac:ident ! ($($rest:tt)*)) => ($submac!(16, $($rest)*));
222+
(16, $submac:ident ! ($($rest:tt)*)) => ($submac!(17, $($rest)*));
223+
(17, $submac:ident ! ($($rest:tt)*)) => ($submac!(18, $($rest)*));
224+
(18, $submac:ident ! ($($rest:tt)*)) => ($submac!(19, $($rest)*));
225+
(19, $submac:ident ! ($($rest:tt)*)) => ($submac!(20, $($rest)*));
226+
(20, $submac:ident ! ($($rest:tt)*)) => ($submac!(21, $($rest)*));
227+
);
228+
229+
// Generates the `TupleInfallible` impls for tuples of 2 up to 21 parsers (21 pairs are listed).
tuple_trait!(FnA A, FnB B, FnC C, FnD D, FnE E, FnF F, FnG G, FnH H, FnI I, FnJ J, FnK K, FnL L,
230+
FnM M, FnN N, FnO O, FnP P, FnQ Q, FnR R, FnS S, FnT T, FnU U);
231+
232+
// Special case: implement `TupleInfallible` for `()`, the unit type.
233+
// This can come up in macros which accept a variable number of arguments.
234+
// Literally, `()` is an empty tuple, so it should simply parse nothing.
235+
impl<I> TupleInfallible<I, ()> for () {
236+
fn parse(&mut self, input: I) -> JResult<I, ()> {
237+
Ok((input, ((), Vec::new())))
238+
}
239+
}
240+
241+
/// Infallible counterpart of nom's `tuple`: turns a tuple of infallible parsers into a single
/// parser by delegating to its `TupleInfallible::parse` implementation.
pub(crate) fn tuple_infallible<I, O, List: TupleInfallible<I, O>>(
    mut l: List,
) -> impl FnMut(I) -> JResult<I, O> {
    move |input: I| <List as TupleInfallible<I, O>>::parse(&mut l, input)
}
246+
247+
pub(crate) fn separated_list_infallible<I, O, O2, F, G>(
248+
mut sep: G,
249+
mut f: F,
250+
) -> impl FnMut(I) -> JResult<I, Vec<O>>
251+
where
252+
I: Clone + InputLength,
253+
F: nom::Parser<I, (O, ErrorList), Infallible>,
254+
G: nom::Parser<I, (O2, ErrorList), Infallible>,
255+
{
256+
move |i: I| {
257+
let mut res: Vec<O> = Vec::new();
258+
let mut errors: ErrorList = Vec::new();
259+
260+
let (mut i, (o, mut err)) = unwrap_infallible(f.parse(i.clone()));
261+
errors.append(&mut err);
262+
res.push(o);
263+
264+
loop {
265+
let (i_sep_parsed, (_, mut err_sep)) = unwrap_infallible(sep.parse(i.clone()));
266+
let len_before = i_sep_parsed.input_len();
267+
268+
let (i_elem_parsed, (o, mut err_elem)) =
269+
unwrap_infallible(f.parse(i_sep_parsed.clone()));
270+
271+
// infinite loop check: the parser must always consume
272+
// if we consumed nothing here, don't produce an element.
273+
if i_elem_parsed.input_len() == len_before {
274+
return Ok((i, (res, errors)));
275+
}
276+
res.push(o);
277+
errors.append(&mut err_sep);
278+
errors.append(&mut err_elem);
279+
i = i_elem_parsed;
280+
}
281+
}
282+
}
283+
284+
// Helper trait behind `alt_infallible`, implemented by `alt_trait!` for tuples of
// `(condition parser, infallible parser)` pairs.
pub(crate) trait Alt<I, O> {
285+
/// Tests each parser in the tuple and returns the result of the first one that succeeds
///
/// More precisely, in the impls of this file: the condition parsers are tried in order, and
/// for the first one that succeeds, the paired infallible parser is run on the input left by
/// the condition and its result is returned in `Some`. `None` means no condition matched.
286+
fn choice(&mut self, input: I) -> Option<JResult<I, O>>;
287+
}
288+
289+
// `alt_trait!` takes a list of `ConditionType ParserType` identifier pairs and emits one
// `alt_trait_impl!` per prefix of that list, starting with the first pair alone and growing by
// one pair per recursion step.
macro_rules! alt_trait(
290+
// Entry point: seed the accumulator (left of `;`) with the first pair.
($first_cond:ident $first:ident, $($id_cond:ident $id: ident),+) => (
291+
alt_trait!(__impl $first_cond $first; $($id_cond $id),+);
292+
);
293+
// Recursive step: emit the impl for the accumulated pairs, then move the next pair into the
// accumulator.
(__impl $($current_cond:ident $current:ident),*; $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => (
294+
alt_trait_impl!($($current_cond $current),*);
295+
296+
alt_trait!(__impl $($current_cond $current,)* $head_cond $head; $($id_cond $id),+);
297+
);
298+
// Last pair: emit the impl for the accumulated pairs and the one including the last pair.
(__impl $($current_cond:ident $current:ident),*; $head_cond:ident $head:ident) => (
299+
alt_trait_impl!($($current_cond $current),*);
300+
alt_trait_impl!($($current_cond $current,)* $head_cond $head);
301+
);
302+
);
303+
304+
// Emits `impl Alt<Input, Output>` for a tuple of `(condition, parser)` pairs of one given arity.
// Every branch shares the same `Output`; conditions are parsers with `()` output and `()` error.
macro_rules! alt_trait_impl(
305+
($($id_cond:ident $id:ident),+) => (
306+
impl<
307+
Input: Clone, Output,
308+
$(
309+
// () are to make things easier on me, but I'm not entirely sure whether we can do better
310+
// with rule E0207
311+
$id_cond: nom::Parser<Input, (), ()>,
312+
$id: nom::Parser<Input, (Output, ErrorList), Infallible>
313+
),+
314+
> Alt<Input, Output> for ( $(($id_cond, $id),)+ ) {
315+
316+
fn choice(&mut self, input: Input) -> Option<JResult<Input, Output>> {
317+
// Branch 0 is handled here; the remaining branches (index 1 and up) are expanded by
// `alt_trait_inner!`.
match self.0.0.parse(input.clone()) {
318+
Err(_) => alt_trait_inner!(1, self, input, $($id_cond $id),+),
319+
// The condition matched: commit to this branch, parsing what the condition left.
Ok((input_left, _)) => Some(self.0.1.parse(input_left)),
320+
}
321+
}
322+
}
323+
);
324+
);
325+
326+
// Expands to the chain of `match`es testing branches 1, 2, ... of an `Alt` tuple.
//
// It is entered with index 1 but with the FULL list of pairs, so the list is only used as a
// counter that runs one step behind `$it`: each step tests the branch at index `$it`, drops one
// pair and bumps the index through `succ!`. When a single pair is left, every branch has been
// tested and the expansion ends with `None`.
macro_rules! alt_trait_inner(
327+
($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => (
328+
match $self.$it.0.parse($input.clone()) {
329+
Err(_) => succ!($it, alt_trait_inner!($self, $input, $($id_cond $id),+)),
330+
Ok((input_left, _)) => Some($self.$it.1.parse(input_left)),
331+
}
332+
);
333+
// No branch left to test: no condition matched.
($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident) => (
334+
None
335+
);
336+
);
337+
338+
// Generates the `Alt` impls for tuples of 1 up to 21 `(condition, parser)` branches.
alt_trait!(A1 A, B1 B, C1 C, D1 D, E1 E, F1 F, G1 G, H1 H, I1 I, J1 J, K1 K,
339+
L1 L, M1 M, N1 N, O1 O, P1 P, Q1 Q, R1 R, S1 S, T1 T, U1 U);
340+
341+
/// An alt() like combinator. For each branch, it first tries a fallible parser, which commits to
342+
/// this branch, or tells to check next branch, and the execute the infallible parser which follow.
343+
///
344+
/// In case no branch match, the default (fallible) parser is executed.
345+
pub(crate) fn alt_infallible<I: Clone, O, F, List: Alt<I, O>>(
346+
mut l: List,
347+
mut default: F,
348+
) -> impl FnMut(I) -> JResult<I, O>
349+
where
350+
F: nom::Parser<I, (O, ErrorList), Infallible>,
351+
{
352+
move |i: I| l.choice(i.clone()).unwrap_or_else(|| default.parse(i))
353+
}

query-grammar/src/lib.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,26 @@
11
#![allow(clippy::derive_partial_eq_without_eq)]
22

3+
mod infallible;
34
mod occur;
45
mod query_grammar;
56
mod user_input_ast;
6-
use combine::parser::Parser;
77

8+
pub use crate::infallible::LenientError;
89
pub use crate::occur::Occur;
9-
use crate::query_grammar::parse_to_ast;
10+
use crate::query_grammar::{parse_to_ast, parse_to_ast_lenient};
1011
pub use crate::user_input_ast::{
1112
Delimiter, UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral,
1213
};
1314

1415
pub struct Error;
1516

17+
/// Parse a query
///
/// Returns the `UserInputAst` produced by `parse_to_ast`. Any parser error is mapped to the
/// opaque [`Error`], and the remaining input reported by the parser is ignored.
1618
pub fn parse_query(query: &str) -> Result<UserInputAst, Error> {
17-
let (user_input_ast, _remaining) = parse_to_ast().parse(query).map_err(|_| Error)?;
19+
let (_remaining, user_input_ast) = parse_to_ast(query).map_err(|_| Error)?;
1820
Ok(user_input_ast)
1921
}
22+
23+
/// Parse a query, trying to recover from syntax errors, and giving hints toward fixing errors.
///
/// Unlike `parse_query`, this does not return a `Result`: it forwards the AST and the list of
/// [`LenientError`]s returned by `parse_to_ast_lenient`.
24+
pub fn parse_query_lenient(query: &str) -> (UserInputAst, Vec<LenientError>) {
25+
parse_to_ast_lenient(query)
26+
}

0 commit comments

Comments
 (0)