Skip to content

Commit 1fc1909

Browse files
committed
Align XPath grammar with spec
1 parent 7731bd8 commit 1fc1909

14 files changed

Lines changed: 2601 additions & 1764 deletions

rewrite-xml/src/main/antlr/XPathLexer.g4

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,13 @@ NUMBER : [0-9]+ ('.' [0-9]+)? ;
5555
AND : 'and' ;
5656
OR : 'or' ;
5757

58+
// Node type keywords (must be declared before NCNAME: ANTLR breaks equal-length match ties in favour of the earlier rule)
59+
// These are the only valid node types per XPath 1.0 spec [38]
60+
TEXT : 'text' ;
61+
COMMENT : 'comment' ;
62+
NODE : 'node' ;
63+
PROCESSING_INSTRUCTION : 'processing-instruction' ;
64+
5865
// String literals
5966
STRING_LITERAL
6067
: '\'' (~['])* '\''

rewrite-xml/src/main/antlr/XPathParser.g4

Lines changed: 169 additions & 110 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,8 @@
1515
*/
1616

1717
/**
18-
* XPath parser for a limited subset of XPath expressions.
18+
* XPath parser following the XPath 1.0 specification grammar structure.
19+
* See: https://www.w3.org/TR/1999/REC-xpath-19991116/
1920
*
2021
* Supports:
2122
* - Absolute paths: /root/child
@@ -29,59 +30,131 @@
2930
* - Positional predicates: /root/element[1], /root/element[last()]
3031
* - Parenthesized expressions with predicates: (/root/element)[1], (/root/a)[last()]
3132
* - XPath functions: local-name(), namespace-uri(), contains(), position(), last(), etc.; node type tests such as text()
32-
* - Logical operators in predicates: and, or
33+
* - Logical operators: and, or
34+
* - Comparison operators: =, !=, <, >, <=, >=
3335
* - Multiple predicates: /root/element[@attr='value'][local-name()='element']
34-
* - Top-level function expressions: contains(/root/element, 'value')
35-
* - Boolean expressions: not(contains(...)), string-length(...) > 2
3636
* - Abbreviated syntax: . (self), .. (parent)
37-
* - Parent axis: parent::node(), parent::element
37+
* - Axis steps: parent::node(), parent::element
38+
*
39+
* Not yet implemented:
40+
* - Union operator: |
41+
* - Arithmetic operators: +, -, *, div, mod
42+
* - Variable references: $var
3843
*/
3944
parser grammar XPathParser;
4045

4146
options { tokenVocab=XPathLexer; }
4247

43-
// Entry point for XPath expression
48+
//==============================================================================
49+
// Entry point
50+
//==============================================================================
51+
52+
// Entry rule: delegates to expr, which implements [14] Expr ::= OrExpr
4453
xpathExpression
45-
: booleanExpr
46-
| filterExpr
47-
| absoluteLocationPath
48-
| relativeLocationPath
54+
: expr
4955
;
5056

51-
// Filter expression - parenthesized path with predicates and optional trailing path: (/root/a)[1]/child
52-
filterExpr
53-
: LPAREN (absoluteLocationPath | relativeLocationPath) RPAREN predicate+ (pathSeparator relativeLocationPath)?
57+
// Top-level expression - any expression type
58+
expr
59+
: orExpr
5460
;
5561

56-
// Boolean expression (function calls with optional comparison)
57-
booleanExpr
58-
: functionCall comparisonOp comparand
59-
| functionCall
62+
//==============================================================================
63+
// Expression hierarchy (following XPath 1.0 spec precedence)
64+
//==============================================================================
65+
66+
// [21] OrExpr ::= AndExpr | OrExpr 'or' AndExpr
67+
orExpr
68+
: andExpr (OR andExpr)*
69+
;
70+
71+
// [22] AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr
72+
andExpr
73+
: equalityExpr (AND equalityExpr)*
74+
;
75+
76+
// [23] EqualityExpr ::= RelationalExpr | EqualityExpr '=' RelationalExpr | EqualityExpr '!=' RelationalExpr
77+
equalityExpr
78+
: relationalExpr ((EQUALS | NOT_EQUALS) relationalExpr)*
79+
;
80+
81+
// [24] RelationalExpr ::= AdditiveExpr | RelationalExpr '<' AdditiveExpr | ...
82+
// (Skipping AdditiveExpr/MultiplicativeExpr for now - going directly to UnaryExpr)
83+
relationalExpr
84+
: unaryExpr ((LT | GT | LTE | GTE) unaryExpr)*
6085
;
6186

62-
// Comparison operators
63-
comparisonOp
64-
: EQUALS
65-
| NOT_EQUALS
66-
| LT
67-
| GT
68-
| LTE
69-
| GTE
87+
// [27] UnaryExpr ::= UnionExpr | '-' UnaryExpr
88+
// (Skipping '-' for now since we don't have arithmetic)
89+
unaryExpr
90+
: unionExpr
7091
;
7192

72-
// Value to compare against
73-
comparand
74-
: stringLiteral
93+
// [18] UnionExpr ::= PathExpr | UnionExpr '|' PathExpr
94+
// (Union operator not yet implemented - going directly to PathExpr)
95+
unionExpr
96+
: pathExpr
97+
;
98+
99+
// [19] PathExpr ::= LocationPath | FilterExpr | FilterExpr '/' RelativeLocationPath | FilterExpr '//' RelativeLocationPath
100+
// Restructured to eliminate ambiguity: filterExpr alternatives are made explicit
101+
// The key distinction: function calls require LPAREN after the name, location paths don't
102+
pathExpr
103+
: functionCallExpr (pathSeparator relativeLocationPath)? // func() possibly followed by /path
104+
| bracketedExpr (pathSeparator relativeLocationPath)? // (expr) possibly followed by /path
105+
| literalOrNumber (pathSeparator relativeLocationPath)? // 'string' or 123 possibly followed by /path
106+
| locationPath // /a/b, a/b, //a, etc.
107+
;
108+
109+
// [20] FilterExpr ::= PrimaryExpr | FilterExpr Predicate
110+
// Function call with optional predicates - requires LPAREN after name to disambiguate from locationPath
111+
functionCallExpr
112+
: functionCall predicate*
113+
;
114+
115+
// Bracketed expression with optional predicates
116+
bracketedExpr
117+
: LPAREN expr RPAREN predicate*
118+
;
119+
120+
// Literal or number with optional predicates
121+
literalOrNumber
122+
: literal predicate*
123+
| NUMBER predicate*
124+
;
125+
126+
// Legacy filterExpr for backward compatibility (used in compileFilterExpr)
127+
filterExpr
128+
: functionCallExpr
129+
| bracketedExpr
130+
| literalOrNumber
131+
;
132+
133+
// [15] PrimaryExpr ::= VariableReference | '(' Expr ')' | Literal | Number | FunctionCall
134+
primaryExpr
135+
: LPAREN expr RPAREN
136+
| literal
75137
| NUMBER
138+
| functionCall
76139
;
77140

78-
// Absolute path starting with / or //
141+
//==============================================================================
142+
// Location paths
143+
//==============================================================================
144+
145+
// [1] LocationPath ::= RelativeLocationPath | AbsoluteLocationPath
146+
locationPath
147+
: absoluteLocationPath
148+
| relativeLocationPath
149+
;
150+
151+
// [2] AbsoluteLocationPath ::= '/' RelativeLocationPath? | AbbreviatedAbsoluteLocationPath
79152
absoluteLocationPath
80153
: SLASH relativeLocationPath?
81154
| DOUBLE_SLASH relativeLocationPath
82155
;
83156

84-
// Relative path (series of steps)
157+
// [3] RelativeLocationPath ::= Step | RelativeLocationPath '/' Step | AbbreviatedRelativeLocationPath
85158
relativeLocationPath
86159
: step (pathSeparator step)*
87160
;
@@ -92,123 +165,109 @@ pathSeparator
92165
| DOUBLE_SLASH
93166
;
94167

95-
// A single step in the path
168+
//==============================================================================
169+
// Location steps
170+
//==============================================================================
171+
172+
// [4] Step ::= AxisSpecifier NodeTest Predicate* | AbbreviatedStep
173+
// Restructured to eliminate ambiguity: axis specifier is optional prefix
96174
step
97-
: axisStep predicate*
98-
| nodeTest predicate*
175+
: axisSpecifier? nodeTest predicate*
99176
| attributeStep predicate*
100-
| nodeTypeTest
101177
| abbreviatedStep
102178
;
103179

104-
// Axis step - explicit axis like parent::node()
105-
axisStep
106-
: axisName AXIS_SEP nodeTest
180+
// [5] AxisSpecifier ::= AxisName '::' | AbbreviatedAxisSpecifier
181+
// Kept as a separate rule so that step can mark the whole "AxisName '::'" prefix as optional
182+
axisSpecifier
183+
: axisName AXIS_SEP
107184
;
108185

109-
// Supported axis names (NCName - no namespace prefix)
186+
// [6] AxisName - validated at runtime
110187
axisName
111-
: NCNAME // parent, ancestor, self, child, etc. - validated at runtime
188+
: NCNAME
112189
;
113190

114-
// Abbreviated step - . or ..
191+
// [12] AbbreviatedStep ::= '.' | '..'
115192
abbreviatedStep
116-
: DOTDOT // parent::node()
117-
| DOT // self::node()
118-
;
119-
120-
// Node type test - text(), comment(), node(), processing-instruction()
121-
// Validation of which functions are valid node type tests happens at runtime
122-
nodeTypeTest
123-
: NCNAME LPAREN RPAREN
193+
: DOTDOT
194+
| DOT
124195
;
125196

126-
// Attribute step (@attr, @ns:attr, or @*)
197+
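// Attribute step: '@' followed by a name test, i.e. the abbreviated form of attribute::NameTest (cf. [13] AbbreviatedAxisSpecifier ::= '@'?)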
127198
attributeStep
128199
: AT (QNAME | NCNAME | WILDCARD)
129200
;
130201

131-
// Node test (element name, ns:element, or wildcard)
202+
//==============================================================================
203+
// Node tests
204+
//==============================================================================
205+
206+
// [7] NodeTest ::= NameTest | NodeType '(' ')' | 'processing-instruction' '(' Literal ')'  (the Literal form is not handled by this rule)
132207
nodeTest
133-
: QNAME
134-
| NCNAME
135-
| WILDCARD
208+
: nameTest
209+
| nodeType LPAREN RPAREN
136210
;
137211

138-
// Predicate in square brackets
139-
predicate
140-
: LBRACKET predicateExpr RBRACKET
212+
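// [37] NameTest ::= '*' | NCName ':' '*' | QName  (NOTE(review): names lexed as TEXT/COMMENT/NODE/PROCESSING_INSTRUCTION are not accepted here - confirm that elements named e.g. 'text' or 'node' still match)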
213+
nameTest
214+
: WILDCARD
215+
| QNAME
216+
| NCNAME
141217
;
142218

143-
// Predicate expression (supports and/or)
144-
predicateExpr
145-
: orExpr
219+
// [38] NodeType ::= 'comment' | 'text' | 'processing-instruction' | 'node'
220+
// Uses specific tokens to avoid ambiguity with function calls
221+
nodeType
222+
: TEXT
223+
| COMMENT
224+
| NODE
225+
| PROCESSING_INSTRUCTION
146226
;
147227

148-
// OR expression (lowest precedence)
149-
orExpr
150-
: andExpr (OR andExpr)*
151-
;
228+
//==============================================================================
229+
// Predicates
230+
//==============================================================================
152231

153-
// AND expression (higher precedence than OR)
154-
andExpr
155-
: primaryExpr (AND primaryExpr)*
232+
// [8] Predicate ::= '[' PredicateExpr ']'
233+
predicate
234+
: LBRACKET predicateExpr RBRACKET
156235
;
157236

158-
// Primary expression in a predicate
159-
primaryExpr
160-
: predicateValue comparisonOp comparand // any value expression with comparison
161-
| predicateValue // standalone value (last(), position(), number, boolean)
237+
// [9] PredicateExpr ::= Expr
238+
predicateExpr
239+
: expr
162240
;
163241

164-
// A value-producing expression in a predicate
165-
predicateValue
166-
: functionCall // local-name(), last(), position(), contains(), etc.
167-
| attributeStep // @attr, @*
168-
| relativeLocationPath // bar/baz/text()
169-
| childElementTest // child, *
170-
| NUMBER // positional predicate [1], [2], etc.
171-
;
242+
//==============================================================================
243+
// Functions
244+
//==============================================================================
172245

173-
// XPath function call - unified for both top-level and predicate use
174-
// Function names are NCNames (no namespace prefix in standard XPath 1.0)
175-
// Specific functions like local-name, namespace-uri, contains, etc. are
176-
// validated at runtime in the compiler via getFunctionType()
246+
// [16] FunctionCall ::= FunctionName '(' (Argument (',' Argument)*)? ')'
177247
functionCall
178-
: NCNAME LPAREN functionArgs? RPAREN
248+
: functionName LPAREN (argument (COMMA argument)*)? RPAREN
179249
;
180250

181-
// Function arguments (comma-separated)
182-
functionArgs
183-
: functionArg (COMMA functionArg)*
251+
// [35] FunctionName ::= QName - NodeType
252+
// Deliberate deviation from [35]: node type tokens are also accepted as function names, so text(), comment(), node() and processing-instruction() parse wherever a function call is expected
253+
functionName
254+
: NCNAME
255+
| TEXT
256+
| COMMENT
257+
| NODE
258+
| PROCESSING_INSTRUCTION
184259
;
185260

186-
// A single function argument
187-
// Note: comparisonArg must come first to handle not(path = 'value')
188-
// functionCall must come before relativeLocationPath
189-
// because both can start with QNAME, but we need to check for '(' to distinguish them
190-
functionArg
191-
: comparisonArg
192-
| absoluteLocationPath
193-
| functionCall
194-
| relativeLocationPath
195-
| stringLiteral
196-
| NUMBER
261+
// [17] Argument ::= Expr
262+
argument
263+
: expr
197264
;
198265

199-
// Comparison expression as function argument (for not(x = 'y'), etc.)
200-
comparisonArg
201-
: (functionCall | relativeLocationPath | absoluteLocationPath) comparisonOp comparand
202-
;
203-
204-
// Child element test in predicate (element name, ns:element, or wildcard)
205-
childElementTest
206-
: QNAME
207-
| NCNAME
208-
| WILDCARD
209-
;
266+
//==============================================================================
267+
// Literals
268+
//==============================================================================
210269

211-
// String literal value
212-
stringLiteral
270+
// [29] Literal ::= '"' [^"]* '"' | "'" [^']* "'"
271+
literal
213272
: STRING_LITERAL
214273
;

0 commit comments

Comments
 (0)