Skip to content

Commit 45acbd0

Browse files
authored
[ty] Improve support for goto-type, goto-declaration, hover, and highlighting of string annotations (#22878)
I ultimately hunted down the core issue with my previous approach to "if you give string annotation sub-AST nodes any kind of NodeIndex, a bunch of random places in the code will start thinking it's ok to store info about them, which is a problem because they all count up from index 0 which creates conflicts/crashes". Rather than trying to play whackamole I decided to create a scheme to give string annotation nodes a unique NodeIndex by shifting up the parent node's index -- so 0xAB's sub-AST nodes look like 0xAB00...0000, 0xAB00..0001, etc. This scheme avoids any collisions for any reasonable AST (most string annotations are like, a dozen sub-nodes, so they need maybe 4 or 5 bits, which would require hundreds of MB of python code to run out of bits...). As a bonus, this admits an extremely simple implementation of recording and fetching sub-AST types... they just are stored now, and you can just pass in their NodeIndex and get back actual results. * Fixes astral-sh/ty#1640 * Fixes astral-sh/ty#2028
1 parent 7cde863 commit 45acbd0

13 files changed

Lines changed: 1147 additions & 69 deletions

File tree

crates/ruff_db/src/parsed.rs

Lines changed: 78 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,13 @@ use std::sync::Arc;
33

44
use arc_swap::ArcSwapOption;
55
use get_size2::GetSize;
6-
use ruff_python_ast::{AnyRootNodeRef, ModModule, NodeIndex};
7-
use ruff_python_parser::{ParseOptions, Parsed, parse_unchecked};
6+
use ruff_python_ast::{
7+
AnyRootNodeRef, HasNodeIndex, ModExpression, ModModule, NodeIndex, NodeIndexError,
8+
StringLiteral,
9+
};
10+
use ruff_python_parser::{
11+
ParseError, ParseErrorType, ParseOptions, Parsed, parse_string_annotation, parse_unchecked,
12+
};
813

914
use crate::Db;
1015
use crate::files::File;
@@ -45,6 +50,36 @@ pub fn parsed_module_impl(db: &dyn Db, file: File) -> Parsed<ModModule> {
4550
.expect("PySourceType always parses into a module")
4651
}
4752

53+
pub fn parsed_string_annotation(
54+
source: &str,
55+
string: &StringLiteral,
56+
) -> Result<Parsed<ModExpression>, ParseError> {
57+
let expr = parse_string_annotation(source, string)?;
58+
59+
// We need the sub-ast of the string annotation to be indexed
60+
indexed::ensure_indexed(&expr, string.node_index().load()).map_err(|err| {
61+
let message = match err {
62+
NodeIndexError::NoParent => {
63+
"Internal Error: string annotation's parent had no NodeIndex".to_owned()
64+
}
65+
NodeIndexError::OutOfIndices => {
66+
"File too long, ran out of encoding space for string annotations".to_owned()
67+
}
68+
NodeIndexError::OutOfSubIndices => {
69+
"Substring annotation is too complex, ran out of encoding space".to_owned()
70+
}
71+
NodeIndexError::TooNested => "Too many levels of nested string annotations".to_owned(),
72+
};
73+
74+
ParseError {
75+
error: ParseErrorType::OtherError(message),
76+
location: string.range,
77+
}
78+
})?;
79+
80+
Ok(expr)
81+
}
82+
4883
/// A wrapper around a parsed module.
4984
///
5085
/// This type manages instances of the module AST. A particular instance of the AST
@@ -169,13 +204,39 @@ mod indexed {
169204
pub parsed: Parsed<ModModule>,
170205
}
171206

207+
/// Ensure the following sub-AST is indexed, using the parent node's index
208+
/// as a basis for unambiguous AST node indices.
209+
pub fn ensure_indexed(
210+
parsed: &Parsed<ModExpression>,
211+
parent_node_index: NodeIndex,
212+
) -> Result<(), NodeIndexError> {
213+
let parent_index = parent_node_index.as_u32().ok_or(NodeIndexError::NoParent)?;
214+
let (index, max_index) = sub_indices(parent_index)?;
215+
let mut visitor = Visitor {
216+
overflowed: false,
217+
nodes: None,
218+
index,
219+
max_index,
220+
};
221+
222+
AnyNodeRef::from(parsed.syntax()).visit_source_order(&mut visitor);
223+
224+
if visitor.overflowed {
225+
return Err(NodeIndexError::OutOfSubIndices);
226+
}
227+
228+
Ok(())
229+
}
230+
172231
impl IndexedModule {
173232
/// Create a new [`IndexedModule`] from the given AST.
174233
#[allow(clippy::unnecessary_cast)]
175234
pub fn new(parsed: Parsed<ModModule>) -> Arc<Self> {
176235
let mut visitor = Visitor {
177-
nodes: Vec::new(),
236+
nodes: Some(Vec::new()),
178237
index: 0,
238+
max_index: MAX_REAL_INDEX,
239+
overflowed: false,
179240
};
180241

181242
let mut inner = Arc::new(IndexedModule {
@@ -185,7 +246,7 @@ mod indexed {
185246

186247
AnyNodeRef::from(inner.parsed.syntax()).visit_source_order(&mut visitor);
187248

188-
let index: Box<[AnyRootNodeRef<'_>]> = visitor.nodes.into_boxed_slice();
249+
let index: Box<[AnyRootNodeRef<'_>]> = visitor.nodes.unwrap().into_boxed_slice();
189250

190251
// SAFETY: We cast from `Box<[AnyRootNodeRef<'_>]>` to `Box<[AnyRootNodeRef<'static>]>`,
191252
// faking the 'static lifetime to create the self-referential struct. The node references
@@ -214,7 +275,9 @@ mod indexed {
214275
/// A visitor that collects nodes in source order.
215276
pub struct Visitor<'a> {
216277
pub index: u32,
217-
pub nodes: Vec<AnyRootNodeRef<'a>>,
278+
pub max_index: u32,
279+
pub nodes: Option<Vec<AnyRootNodeRef<'a>>>,
280+
pub overflowed: bool,
218281
}
219282

220283
impl<'a> Visitor<'a> {
@@ -223,8 +286,16 @@ mod indexed {
223286
T: HasNodeIndex + std::fmt::Debug,
224287
AnyRootNodeRef<'a>: From<&'a T>,
225288
{
226-
node.node_index().set(NodeIndex::from(self.index));
227-
self.nodes.push(AnyRootNodeRef::from(node));
289+
// Only check on write (the maximum is orders of magnitude less than u32::MAX)
290+
if self.index > self.max_index {
291+
self.overflowed = true;
292+
} else {
293+
node.node_index().set(NodeIndex::from(self.index));
294+
}
295+
296+
if let Some(nodes) = &mut self.nodes {
297+
nodes.push(AnyRootNodeRef::from(node));
298+
}
228299
self.index += 1;
229300
}
230301
}

crates/ruff_python_ast/src/node_index.rs

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,45 @@ where
1717
}
1818

1919
/// A unique index for a node within an AST.
20+
///
21+
/// Our encoding of 32-bit AST node indices is as follows:
22+
///
23+
/// * `u32::MAX` (1111...1) is reserved as a forbidden value (mapped to 0 for `NonZero`)
24+
/// * `u32::MAX - 1` (1111...0) is reserved for `NodeIndex::NONE`
25+
/// * The top two bits encode the sub-AST level:
26+
/// * 00 is top-level AST
27+
/// * 01 is sub-AST (string annotation)
28+
/// * 10 is sub-sub-AST (string annotation in string annotation)
29+
/// * 11 is forbidden (well, it only appears in the above reserved values)
30+
/// * The remaining 30 bits are the real (sub)-AST node index
31+
///
32+
/// To get the first sub-index of a node's sub-AST we:
33+
///
34+
/// * increment the sub-AST level in the high-bits
35+
/// * at level 1, multiply the real index by 256
36+
/// * at level 2, multiply the real index by 8
37+
///
38+
/// The multiplication gives each node a reserved space of 256 nodes for its sub-AST
39+
/// to work with ("should be enough for anybody"), and 8 nodes for a sub-sub-AST
40+
/// (enough for an identifier and maybe some simple unions).
41+
///
42+
/// Here are some implications:
43+
///
44+
/// * We have 2^30 top-level AST nodes (1 billion)
45+
/// * To have a string annotation, the parent node needs to be multiplied by 256 without
46+
/// overflowing 30 bits, so string annotations cannot be used after 2^22 nodes (4 million),
47+
/// which would be like, a million lines of code.
48+
/// * To have a sub-string annotation, the top-level node needs to be multiplied
49+
/// by 256 * 8, so sub-string annotations cannot be used after 2^19 nodes (500 thousand),
50+
/// or about 100k lines of code.
51+
///
52+
/// This feels like a pretty reasonable compromise that will work well in practice,
53+
/// although it creates some very wonky boundary conditions that will be very unpleasant
54+
/// if someone runs into them.
55+
///
56+
/// That said, string annotations are in many regards "legacy" and so new code ideally
57+
/// doesn't have to use them, and there's never a real reason to use sub-annotation
58+
/// let-alone a sub-sub-annotation.
2059
#[derive(PartialEq, Eq, PartialOrd, Ord, Clone, Copy, Hash)]
2160
#[cfg_attr(feature = "get-size", derive(get_size2::GetSize))]
2261
pub struct NodeIndex(NonZeroU32);
@@ -39,6 +78,59 @@ impl NodeIndex {
3978
}
4079
}
4180

81+
pub enum NodeIndexError {
82+
TooNested,
83+
OutOfSubIndices,
84+
NoParent,
85+
OutOfIndices,
86+
}
87+
88+
const MAX_LEVEL: u32 = 2;
89+
const LEVEL_BITS: u32 = 32 - MAX_LEVEL.leading_zeros();
90+
const LEVEL_SHIFT: u32 = 32 - LEVEL_BITS;
91+
const LEVEL_MASK: u32 = ((LEVEL_BITS << 1) - 1) << LEVEL_SHIFT;
92+
const SUB_NODES: u32 = 256;
93+
const SUB_SUB_NODES: u32 = 8;
94+
pub const MAX_REAL_INDEX: u32 = (1 << LEVEL_SHIFT) - 1;
95+
96+
/// sub-AST level is stored in the top two bits
97+
fn sub_ast_level(index: u32) -> u32 {
98+
(index & LEVEL_MASK) >> LEVEL_SHIFT
99+
}
100+
101+
/// Get the first and last index of the sub-AST of the input
102+
pub fn sub_indices(index: u32) -> Result<(u32, u32), NodeIndexError> {
103+
let level = sub_ast_level(index);
104+
if level >= MAX_LEVEL {
105+
return Err(NodeIndexError::TooNested);
106+
}
107+
let next_level = (level + 1) << LEVEL_SHIFT;
108+
let without_level = index & !LEVEL_MASK;
109+
let nodes_in_level = if level == 0 {
110+
SUB_NODES
111+
} else if level == 1 {
112+
SUB_SUB_NODES
113+
} else {
114+
unreachable!(
115+
"Someone made a mistake updating the encoding of node indices: {index:08X} had level {level}"
116+
);
117+
};
118+
119+
// If this overflows the file has hundreds of thousands of lines of code,
120+
// but that *can* happen (we just can't support string annotations that deep)
121+
let sub_index_without_level = without_level
122+
.checked_mul(SUB_NODES)
123+
.ok_or(NodeIndexError::OutOfIndices)?;
124+
if sub_index_without_level > MAX_REAL_INDEX {
125+
return Err(NodeIndexError::OutOfIndices);
126+
}
127+
128+
let first_index = sub_index_without_level | next_level;
129+
// Can't overflow by construction
130+
let last_index = first_index + nodes_in_level - 1;
131+
Ok((first_index, last_index))
132+
}
133+
42134
impl From<u32> for NodeIndex {
43135
fn from(value: u32) -> Self {
44136
match NonZeroU32::new(value + 1).map(NodeIndex) {

crates/ty/docs/rules.md

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)