Skip to content

Commit c423054

Browse files
Add a recursion limit to the parser (#24810)
## Summary Partial fix for #22930. Without this malicious or machine generated code could cause a stack overflow with something as simple as `'(' * 5000 + '1' + ')' * 5000`. I decided to do the simplest thing and have a limit that's always applied with a reasonable default. Since: * the overhead of this check will be tiny * it seems inconceivable that anyone will want to have no limit ## Test Plan PR includes tests. --------- Co-authored-by: Charlie Marsh <charlie.r.marsh@gmail.com>
1 parent 4ff86c7 commit c423054

10 files changed

Lines changed: 519 additions & 19 deletions

File tree

crates/ruff_python_parser/src/error.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,9 @@ pub enum ParseErrorType {
200200
TStringError(InterpolatedStringErrorType),
201201
/// Parser encountered an error during lexing.
202202
Lexical(LexicalErrorType),
203+
204+
/// Parser aborted because [`crate::ParseOptions::max_recursion_depth`] was exceeded.
205+
RecursionLimitExceeded,
203206
}
204207

205208
impl ParseErrorType {
@@ -329,6 +332,7 @@ impl std::fmt::Display for ParseErrorType {
329332
ParseErrorType::UnexpectedExpressionToken => {
330333
write!(f, "Unexpected token at the end of an expression")
331334
}
335+
ParseErrorType::RecursionLimitExceeded => f.write_str("Source is too deeply nested"),
332336
}
333337
}
334338
}

crates/ruff_python_parser/src/lexer.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,12 @@ impl<'src> Lexer<'src> {
131131
self.current_range
132132
}
133133

134+
/// Returns the current parenthesis, bracket, and brace nesting level.
135+
#[inline]
136+
pub(crate) const fn nesting(&self) -> u32 {
137+
self.nesting
138+
}
139+
134140
/// Returns the flags for the current token.
135141
pub(crate) fn current_flags(&self) -> TokenFlags {
136142
self.current_flags

crates/ruff_python_parser/src/parser/expression.rs

Lines changed: 118 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,22 @@ impl<'src> Parser<'src> {
300300
BinaryLikeOperator::Binary(bin_op) => {
301301
self.bump(TokenKind::from(bin_op));
302302

303-
let right = self.parse_binary_expression_or_higher(new_precedence, context);
303+
let right = if new_precedence.is_right_associative() {
304+
// For right-associative operators (`**`), the right
305+
// operand recursion is unbounded in `a**a**a**...`,
306+
// and it bypasses the guard in `parse_lhs_expression`
307+
// (that scope is exited once the atom is parsed).
308+
if let Some(right) = self.with_recursion(|parser| {
309+
parser.parse_binary_expression_or_higher(new_precedence, context)
310+
}) {
311+
right
312+
} else {
313+
self.report_recursion_limit_exceeded(self.current_token_range());
314+
self.recursion_recovery_expr()
315+
}
316+
} else {
317+
self.parse_binary_expression_or_higher(new_precedence, context)
318+
};
304319

305320
Expr::BinOp(ast::ExprBinOp {
306321
left: Box::new(left.expr),
@@ -330,8 +345,61 @@ impl<'src> Parser<'src> {
330345
left_precedence: OperatorPrecedence,
331346
context: ExpressionContext,
332347
) -> ParsedExpr {
333-
let start = self.node_start();
334348
let token = self.current_token_kind();
349+
if !Self::token_starts_recursive_lhs(token) {
350+
return self.parse_lhs_expression_inner(left_precedence, context, token);
351+
}
352+
353+
if let Some(result) = self.with_recursion(|parser| {
354+
parser.parse_lhs_expression_inner(left_precedence, context, token)
355+
}) {
356+
result
357+
} else {
358+
self.report_recursion_limit_exceeded(self.current_token_range());
359+
self.recursion_recovery_expr()
360+
}
361+
}
362+
363+
/// Returns whether parsing an expression that starts with `token` can
364+
/// immediately recurse through another expression parse.
365+
#[inline]
366+
fn token_starts_recursive_lhs(token: TokenKind) -> bool {
367+
token.as_unary_operator().is_some()
368+
|| matches!(
369+
token,
370+
TokenKind::Star
371+
| TokenKind::Await
372+
| TokenKind::Lambda
373+
| TokenKind::Yield
374+
| TokenKind::FStringStart
375+
| TokenKind::TStringStart
376+
| TokenKind::Lpar
377+
| TokenKind::Lsqb
378+
| TokenKind::Lbrace
379+
)
380+
}
381+
382+
/// The standard expression-recovery node returned when the recursion
383+
/// limit is exceeded: an empty `Name` with the `Invalid` context.
384+
fn recursion_recovery_expr(&mut self) -> ParsedExpr {
385+
ParsedExpr {
386+
expr: Expr::Name(ast::ExprName {
387+
range: self.missing_node_range(),
388+
id: Name::empty(),
389+
ctx: ExprContext::Invalid,
390+
node_index: AtomicNodeIndex::NONE,
391+
}),
392+
is_parenthesized: false,
393+
}
394+
}
395+
396+
fn parse_lhs_expression_inner(
397+
&mut self,
398+
left_precedence: OperatorPrecedence,
399+
context: ExpressionContext,
400+
token: TokenKind,
401+
) -> ParsedExpr {
402+
let start = self.node_start();
335403

336404
if let Some(unary_op) = token.as_unary_operator() {
337405
let expr = self.parse_unary_expression(unary_op, context);
@@ -365,7 +433,7 @@ impl<'src> Parser<'src> {
365433
return Expr::UnaryOp(expr).into();
366434
}
367435

368-
match self.current_token_kind() {
436+
match token {
369437
TokenKind::Star => {
370438
let starred_expr = self.parse_starred_expression(context);
371439

@@ -665,8 +733,20 @@ impl<'src> Parser<'src> {
665733
pub(super) fn parse_postfix_expression(&mut self, mut lhs: Expr, start: TextSize) -> Expr {
666734
loop {
667735
lhs = match self.current_token_kind() {
668-
TokenKind::Lpar => Expr::Call(self.parse_call_expression(lhs, start)),
669-
TokenKind::Lsqb => Expr::Subscript(self.parse_subscript_expression(lhs, start)),
736+
TokenKind::Lpar => {
737+
if self.tokens.nesting() > self.max_nesting_depth {
738+
self.report_recursion_limit_exceeded(self.current_token_range());
739+
break lhs;
740+
}
741+
Expr::Call(self.parse_call_expression(lhs, start))
742+
}
743+
TokenKind::Lsqb => {
744+
if self.tokens.nesting() > self.max_nesting_depth {
745+
self.report_recursion_limit_exceeded(self.current_token_range());
746+
break lhs;
747+
}
748+
Expr::Subscript(self.parse_subscript_expression(lhs, start))
749+
}
670750
TokenKind::Dot => Expr::Attribute(self.parse_attribute_expression(lhs, start)),
671751
_ => break lhs,
672752
};
@@ -1802,11 +1882,18 @@ impl<'src> Parser<'src> {
18021882

18031883
let format_spec = if self.eat(TokenKind::Colon) {
18041884
let spec_start = self.node_start();
1805-
let elements = self.parse_interpolated_string_elements(
1806-
flags,
1807-
InterpolatedStringElementsKind::FormatSpec(string_kind),
1808-
string_kind,
1809-
);
1885+
let elements = if let Some(elements) = self.with_recursion(|parser| {
1886+
parser.parse_interpolated_string_elements(
1887+
flags,
1888+
InterpolatedStringElementsKind::FormatSpec(string_kind),
1889+
string_kind,
1890+
)
1891+
}) {
1892+
elements
1893+
} else {
1894+
self.report_recursion_limit_exceeded(self.current_token_range());
1895+
ast::InterpolatedStringElements::from(vec![])
1896+
};
18101897
Some(Box::new(ast::InterpolatedStringFormatSpec {
18111898
range: self.node_range(spec_start),
18121899
elements,
@@ -2878,7 +2965,16 @@ impl<'src> Parser<'src> {
28782965
// test_err lambda_body_with_yield_expr
28792966
// lambda x: yield y
28802967
// lambda x: yield from y
2881-
let body = self.parse_conditional_expression_or_higher();
2968+
2969+
// `lambda: lambda: lambda: ...` recurses through the lambda body at
2970+
// the conditional layer, bypassing the `parse_lhs_expression` guard.
2971+
let body =
2972+
if let Some(body) = self.with_recursion(Self::parse_conditional_expression_or_higher) {
2973+
body
2974+
} else {
2975+
self.report_recursion_limit_exceeded(self.current_token_range());
2976+
self.recursion_recovery_expr()
2977+
};
28822978

28832979
ast::ExprLambda {
28842980
body: Box::new(body.expr),
@@ -2902,7 +2998,17 @@ impl<'src> Parser<'src> {
29022998

29032999
self.expect(TokenKind::Else);
29043000

2905-
let orelse = self.parse_conditional_expression_or_higher();
3001+
// `a if b else a if b else ...` recurses through `orelse` at the
3002+
// conditional layer, which is not covered by the `parse_lhs_expression`
3003+
// guard (that scope is released once each atom is parsed). Guard here.
3004+
let orelse = if let Some(orelse) =
3005+
self.with_recursion(Self::parse_conditional_expression_or_higher)
3006+
{
3007+
orelse
3008+
} else {
3009+
self.report_recursion_limit_exceeded(self.current_token_range());
3010+
self.recursion_recovery_expr()
3011+
};
29063012

29073013
ast::ExprIf {
29083014
body: Box::new(body),

crates/ruff_python_parser/src/parser/mod.rs

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
use std::cmp::Ordering;
22

33
use bitflags::bitflags;
4-
54
use ruff_python_ast::token::TokenKind;
65
use ruff_python_ast::{AtomicNodeIndex, Mod, ModExpression, ModModule};
76
use ruff_text_size::{Ranged, TextRange, TextSize};
@@ -56,6 +55,12 @@ pub(crate) struct Parser<'src> {
5655

5756
/// The start offset in the source code from which to start parsing at.
5857
start_offset: TextSize,
58+
59+
/// Current parser recursion depth remaining before the depth limit is exceeded.
60+
depth_remaining: u16,
61+
62+
/// Maximum lexer nesting depth before postfix calls and subscripts should stop recursing.
63+
max_nesting_depth: u32,
5964
}
6065

6166
impl<'src> Parser<'src> {
@@ -71,6 +76,8 @@ impl<'src> Parser<'src> {
7176
options: ParseOptions,
7277
) -> Self {
7378
let tokens = TokenSource::from_source(source, options.mode, start_offset);
79+
let depth_remaining = options.max_recursion_depth;
80+
let max_nesting_depth = u32::from(options.max_recursion_depth.saturating_sub(2));
7481

7582
Parser {
7683
options,
@@ -82,6 +89,38 @@ impl<'src> Parser<'src> {
8289
prev_token_end: TextSize::new(0),
8390
start_offset,
8491
current_token_id: TokenId::default(),
92+
depth_remaining,
93+
max_nesting_depth,
94+
}
95+
}
96+
97+
/// Runs `f` if the recursive parser depth limit has not been hit.
98+
///
99+
/// # Note
100+
///
101+
/// This recursion guard is a temporary fix for #22930.
102+
#[must_use]
103+
#[inline]
104+
fn with_recursion<T>(&mut self, f: impl FnOnce(&mut Self) -> T) -> Option<T> {
105+
if self.depth_remaining == 0 {
106+
return None;
107+
}
108+
109+
self.depth_remaining -= 1;
110+
let result = f(self);
111+
self.depth_remaining += 1;
112+
Some(result)
113+
}
114+
115+
#[cold]
116+
#[inline(never)]
117+
fn report_recursion_limit_exceeded<R: Ranged>(&mut self, ranged: R) {
118+
self.add_error(ParseErrorType::RecursionLimitExceeded, ranged);
119+
// Skip to end-of-file so outer parser frames unwind quickly and our
120+
// `ParserProgress` infinite-loop guards don't fire when they see the
121+
// same `(` / `[` etc. that this frame failed to consume.
122+
while self.current_token_kind() != TokenKind::EndOfFile {
123+
self.bump_any();
85124
}
86125
}
87126

crates/ruff_python_parser/src/parser/options.rs

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,20 @@ use ruff_python_ast::{PySourceType, PythonVersion};
22

33
use crate::{AsMode, Mode};
44

5+
/// The default maximum recursion depth used by the parser.
6+
///
7+
/// Real-world Python rarely nests more than a handful of levels deep; this cap
8+
/// exists to keep the parser from overflowing the stack on adversarial or
9+
/// machine-generated input.
10+
///
11+
/// The default value mirrors CPython's `MAXSTACK` of 200 nested parentheses
12+
/// (`Parser/parser.c`): a one-statement module of the form `((((1))))` at
13+
/// depth 200 must parse, and one at depth 201 must fail. Each nesting level
14+
/// costs one `with_recursion` call, plus two framing calls (one for the
15+
/// surrounding statement and one for the innermost atom), so the cap is set
16+
/// to `200 + 2`.
17+
const DEFAULT_MAX_RECURSION_DEPTH: u16 = 202;
18+
519
/// Options for controlling how a source file is parsed.
620
///
721
/// You can construct a [`ParseOptions`] directly from a [`Mode`]:
@@ -26,6 +40,11 @@ pub struct ParseOptions {
2640
pub(crate) mode: Mode,
2741
/// Target version for detecting version-related syntax errors.
2842
pub(crate) target_version: PythonVersion,
43+
/// Maximum recursion depth for the parser. The parser aborts with a
44+
/// [`crate::ParseErrorType::RecursionLimitExceeded`] error once this many
45+
/// nested expression / statement / pattern nodes are on the parser's call
46+
/// stack. Defaults to [`DEFAULT_MAX_RECURSION_DEPTH`].
47+
pub(crate) max_recursion_depth: u16,
2948
}
3049

3150
impl ParseOptions {
@@ -38,13 +57,25 @@ impl ParseOptions {
3857
pub fn target_version(&self) -> PythonVersion {
3958
self.target_version
4059
}
60+
61+
/// Set the maximum recursion depth for the parser.
62+
#[must_use]
63+
pub fn with_max_recursion_depth(mut self, depth: u16) -> Self {
64+
self.max_recursion_depth = depth;
65+
self
66+
}
67+
68+
pub fn max_recursion_depth(&self) -> u16 {
69+
self.max_recursion_depth
70+
}
4171
}
4272

4373
impl From<Mode> for ParseOptions {
4474
fn from(mode: Mode) -> Self {
4575
Self {
4676
mode,
4777
target_version: PythonVersion::default(),
78+
max_recursion_depth: DEFAULT_MAX_RECURSION_DEPTH,
4879
}
4980
}
5081
}
@@ -54,6 +85,7 @@ impl From<PySourceType> for ParseOptions {
5485
Self {
5586
mode: source_type.as_mode(),
5687
target_version: PythonVersion::default(),
88+
max_recursion_depth: DEFAULT_MAX_RECURSION_DEPTH,
5789
}
5890
}
5991
}

crates/ruff_python_parser/src/parser/pattern.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,28 @@ impl Parser<'_> {
8888
///
8989
/// See: <https://docs.python.org/3/reference/compound_stmts.html#grammar-token-python-grammar-pattern>
9090
fn parse_match_pattern(&mut self, allow_star_pattern: AllowStarPattern) -> Pattern {
91+
if let Some(result) =
92+
self.with_recursion(|parser| parser.parse_match_pattern_inner(allow_star_pattern))
93+
{
94+
result
95+
} else {
96+
let range = self.missing_node_range();
97+
self.report_recursion_limit_exceeded(self.current_token_range());
98+
let invalid_node = Expr::Name(ast::ExprName {
99+
range,
100+
id: Name::empty(),
101+
ctx: ExprContext::Invalid,
102+
node_index: AtomicNodeIndex::NONE,
103+
});
104+
Pattern::MatchValue(ast::PatternMatchValue {
105+
range: invalid_node.range(),
106+
value: Box::new(invalid_node),
107+
node_index: AtomicNodeIndex::NONE,
108+
})
109+
}
110+
}
111+
112+
fn parse_match_pattern_inner(&mut self, allow_star_pattern: AllowStarPattern) -> Pattern {
91113
let start = self.node_start();
92114

93115
// We don't yet know if it's an or pattern or an as pattern, so use whatever

0 commit comments

Comments
 (0)