1515 */
1616
1717/* *
18- * XPath parser for a limited subset of XPath expressions.
18+ * XPath parser following the XPath 1.0 specification grammar structure.
19+ * See: https://www.w3.org/TR/1999/REC-xpath-19991116/
1920 *
2021 * Supports:
2122 * - Absolute paths: /root/child
2930 * - Positional predicates: /root/element[1], /root/element[last()]
3031 * - Parenthesized expressions with predicates: (/root/element)[1], (/root/a)[last()]
3132 * - XPath functions: local-name(), namespace-uri(), contains(), position(), last(), etc. (text() is a node type test that is also accepted in function-call position)
32- * - Logical operators in predicates: and, or
33+ * - Logical operators: and, or
34+ * - Comparison operators: =, !=, <, >, <=, >=
3335 * - Multiple predicates: /root/element[@attr='value'][local-name()='element']
34- * - Top-level function expressions: contains(/root/element, 'value')
35- * - Boolean expressions: not(contains(...)), string-length(...) > 2
3636 * - Abbreviated syntax: . (self), .. (parent)
37- * - Parent axis: parent::node(), parent::element
37+ * - Axis steps: parent::node(), parent::element
38+ *
39+ * Not yet implemented:
40+ * - Union operator: |
41+ * - Arithmetic operators: +, -, *, div, mod
42+ * - Variable references: $var
3843 */
3944parser grammar XPathParser;
4045
4146options { tokenVocab=XPathLexer; }
4247
43- // Entry point for XPath expression
48+ // ==============================================================================
49+ // Entry point
50+ // ==============================================================================
51+
52+ // [14] Expr ::= OrExpr
4453xpathExpression
45- : booleanExpr
46- | filterExpr
47- | absoluteLocationPath
48- | relativeLocationPath
54+ : expr
4955 ;
5056
51- // Filter expression - parenthesized path with predicates and optional trailing path: (/root/a)[1]/child
52- filterExpr
53- : LPAREN (absoluteLocationPath | relativeLocationPath) RPAREN predicate+ (pathSeparator relativeLocationPath)?
57+ // Top-level expression - any expression type
58+ expr
59+ : orExpr
5460 ;
5561
56- // Boolean expression (function calls with optional comparison)
57- booleanExpr
58- : functionCall comparisonOp comparand
59- | functionCall
62+ // ==============================================================================
63+ // Expression hierarchy (following XPath 1.0 spec precedence)
64+ // ==============================================================================
65+
66+ // [21] OrExpr ::= AndExpr | OrExpr 'or' AndExpr
67+ orExpr
68+ : andExpr (OR andExpr)*
69+ ;
70+
71+ // [22] AndExpr ::= EqualityExpr | AndExpr 'and' EqualityExpr
72+ andExpr
73+ : equalityExpr (AND equalityExpr)*
74+ ;
75+
76+ // [23] EqualityExpr ::= RelationalExpr | EqualityExpr '=' RelationalExpr | EqualityExpr '!=' RelationalExpr
77+ equalityExpr
78+ : relationalExpr ((EQUALS | NOT_EQUALS ) relationalExpr)*
79+ ;
80+
81+ // [24] RelationalExpr ::= AdditiveExpr | RelationalExpr '<' AdditiveExpr | ...
82+ // (Skipping AdditiveExpr/MultiplicativeExpr for now - going directly to UnaryExpr)
83+ relationalExpr
84+ : unaryExpr ((LT | GT | LTE | GTE ) unaryExpr)*
6085 ;
6186
62- // Comparison operators
63- comparisonOp
64- : EQUALS
65- | NOT_EQUALS
66- | LT
67- | GT
68- | LTE
69- | GTE
87+ // [27] UnaryExpr ::= UnionExpr | '-' UnaryExpr
88+ // (Skipping '-' for now since we don't have arithmetic)
89+ unaryExpr
90+ : unionExpr
7091 ;
7192
72- // Value to compare against
73- comparand
74- : stringLiteral
93+ // [18] UnionExpr ::= PathExpr | UnionExpr '|' PathExpr
94+ // (Union operator not yet implemented - going directly to PathExpr)
95+ unionExpr
96+ : pathExpr
97+ ;
98+
99+ // [19] PathExpr ::= LocationPath | FilterExpr | FilterExpr '/' RelativeLocationPath | FilterExpr '//' RelativeLocationPath
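100+ // Restructured to reduce ambiguity: the filterExpr alternatives are made explicit
101+ // The key distinction: function calls require LPAREN after the name, name-test steps don't. Node type tests such as text() still match both functionCallExpr and locationPath; functionCallExpr is listed first, so ANTLR resolves the tie in its favour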
102+ pathExpr
103+ : functionCallExpr (pathSeparator relativeLocationPath)? // func() possibly followed by /path
104+ | bracketedExpr (pathSeparator relativeLocationPath)? // (expr) possibly followed by /path
105+ | literalOrNumber (pathSeparator relativeLocationPath)? // 'string' or 123 possibly followed by /path
106+ | locationPath // /a/b, a/b, //a, etc.
107+ ;
108+
109+ // [20] FilterExpr ::= PrimaryExpr | FilterExpr Predicate
110+ // Function call with optional predicates - requires LPAREN after name to disambiguate from locationPath
111+ functionCallExpr
112+ : functionCall predicate*
113+ ;
114+
115+ // Bracketed expression with optional predicates
116+ bracketedExpr
117+ : LPAREN expr RPAREN predicate*
118+ ;
119+
120+ // Literal or number with optional predicates
121+ literalOrNumber
122+ : literal predicate*
123+ | NUMBER predicate*
124+ ;
125+
126+ // Legacy filterExpr for backward compatibility (used in compileFilterExpr)
127+ filterExpr
128+ : functionCallExpr
129+ | bracketedExpr
130+ | literalOrNumber
131+ ;
132+
133+ // [15] PrimaryExpr ::= VariableReference | '(' Expr ')' | Literal | Number | FunctionCall (VariableReference is not yet implemented)
134+ primaryExpr
135+ : LPAREN expr RPAREN
136+ | literal
75137 | NUMBER
138+ | functionCall
76139 ;
77140
78- // Absolute path starting with / or //
141+ // ==============================================================================
142+ // Location paths
143+ // ==============================================================================
144+
145+ // [1] LocationPath ::= RelativeLocationPath | AbsoluteLocationPath
146+ locationPath
147+ : absoluteLocationPath
148+ | relativeLocationPath
149+ ;
150+
151+ // [2] AbsoluteLocationPath ::= '/' RelativeLocationPath? | AbbreviatedAbsoluteLocationPath
79152absoluteLocationPath
80153 : SLASH relativeLocationPath?
81154 | DOUBLE_SLASH relativeLocationPath
82155 ;
83156
84- // Relative path (series of steps)
157+ // [3] RelativeLocationPath ::= Step | RelativeLocationPath '/' Step | AbbreviatedRelativeLocationPath
85158relativeLocationPath
86159 : step (pathSeparator step)*
87160 ;
@@ -92,123 +165,109 @@ pathSeparator
92165 | DOUBLE_SLASH
93166 ;
94167
95- // A single step in the path
168+ // ==============================================================================
169+ // Location steps
170+ // ==============================================================================
171+
172+ // [4] Step ::= AxisSpecifier NodeTest Predicate* | AbbreviatedStep
173+ // Restructured to eliminate ambiguity: axis specifier is optional prefix
96174step
97- : axisStep predicate*
98- | nodeTest predicate*
175+ : axisSpecifier? nodeTest predicate*
99176 | attributeStep predicate*
100- | nodeTypeTest
101177 | abbreviatedStep
102178 ;
103179
104- // Axis step - explicit axis like parent::node()
105- axisStep
106- : axisName AXIS_SEP nodeTest
180+ // [5] AxisSpecifier ::= AxisName '::' | AbbreviatedAxisSpecifier
181+ // Kept as a separate rule so that step can make it optional; the abbreviated '@' form is handled by attributeStep
182+ axisSpecifier
183+ : axisName AXIS_SEP
107184 ;
108185
109- // Supported axis names (NCName - no namespace prefix)
186+ // [6] AxisName - validated at runtime
110187axisName
111- : NCNAME // parent, ancestor, self, child, etc. - validated at runtime
188+ : NCNAME
112189 ;
113190
114- // Abbreviated step - . or ..
191+ // [12] AbbreviatedStep ::= '.' | '..'
115192abbreviatedStep
116- : DOTDOT // parent::node()
117- | DOT // self::node()
118- ;
119-
120- // Node type test - text(), comment(), node(), processing-instruction()
121- // Validation of which functions are valid node type tests happens at runtime
122- nodeTypeTest
123- : NCNAME LPAREN RPAREN
193+ : DOTDOT
194+ | DOT
124195 ;
125196
126- // Attribute step (@attr, @ns:attr, or @*)
197+ // [13] AbbreviatedAxisSpecifier ::= '@'?
127198attributeStep
128199 : AT (QNAME | NCNAME | WILDCARD )
129200 ;
130201
131- // Node test (element name, ns:element, or wildcard)
202+ // ==============================================================================
203+ // Node tests
204+ // ==============================================================================
205+
206+ // [7] NodeTest ::= NameTest | NodeType '(' ')' | 'processing-instruction' '(' Literal ')' (the Literal form is not accepted by this rule)
132207nodeTest
133- : QNAME
134- | NCNAME
135- | WILDCARD
208+ : nameTest
209+ | nodeType LPAREN RPAREN
136210 ;
137211
138- // Predicate in square brackets
139- predicate
140- : LBRACKET predicateExpr RBRACKET
212+ // [37] NameTest ::= '*' | NCName ':' '*' | QName
213+ nameTest
214+ : WILDCARD
215+ | QNAME
216+ | NCNAME
141217 ;
142218
143- // Predicate expression (supports and/or)
144- predicateExpr
145- : orExpr
219+ // [38] NodeType ::= 'comment' | 'text' | 'processing-instruction' | 'node'
220+ // Uses specific tokens to avoid ambiguity with function calls
221+ nodeType
222+ : TEXT
223+ | COMMENT
224+ | NODE
225+ | PROCESSING_INSTRUCTION
146226 ;
147227
148- // OR expression (lowest precedence)
149- orExpr
150- : andExpr (OR andExpr)*
151- ;
228+ // ==============================================================================
229+ // Predicates
230+ // ==============================================================================
152231
153- // AND expression (higher precedence than OR)
154- andExpr
155- : primaryExpr ( AND primaryExpr)*
232+ // [8] Predicate ::= '[' PredicateExpr ']'
233+ predicate
234+ : LBRACKET predicateExpr RBRACKET
156235 ;
157236
158- // Primary expression in a predicate
159- primaryExpr
160- : predicateValue comparisonOp comparand // any value expression with comparison
161- | predicateValue // standalone value (last(), position(), number, boolean)
237+ // [9] PredicateExpr ::= Expr
238+ predicateExpr
239+ : expr
162240 ;
163241
164- // A value-producing expression in a predicate
165- predicateValue
166- : functionCall // local-name(), last(), position(), contains(), etc.
167- | attributeStep // @attr, @*
168- | relativeLocationPath // bar/baz/text()
169- | childElementTest // child, *
170- | NUMBER // positional predicate [1], [2], etc.
171- ;
242+ // ==============================================================================
243+ // Functions
244+ // ==============================================================================
172245
173- // XPath function call - unified for both top-level and predicate use
174- // Function names are NCNames (no namespace prefix in standard XPath 1.0)
175- // Specific functions like local-name, namespace-uri, contains, etc. are
176- // validated at runtime in the compiler via getFunctionType()
246+ // [16] FunctionCall ::= FunctionName '(' (Argument (',' Argument)*)? ')'
177247functionCall
178- : NCNAME LPAREN functionArgs ? RPAREN
248+ : functionName LPAREN (argument ( COMMA argument)*) ? RPAREN
179249 ;
180250
181- // Function arguments (comma-separated)
182- functionArgs
183- : functionArg (COMMA functionArg)*
251+ // [35] FunctionName ::= QName - NodeType
252+ // Deviation from the spec: node type tokens are also accepted as function names (text(), comment(), node(), processing-instruction()), and prefixed QNAME names are not accepted
253+ functionName
254+ : NCNAME
255+ | TEXT
256+ | COMMENT
257+ | NODE
258+ | PROCESSING_INSTRUCTION
184259 ;
185260
186- // A single function argument
187- // Note: comparisonArg must come first to handle not(path = 'value')
188- // functionCall must come before relativeLocationPath
189- // because both can start with QNAME, but we need to check for '(' to distinguish them
190- functionArg
191- : comparisonArg
192- | absoluteLocationPath
193- | functionCall
194- | relativeLocationPath
195- | stringLiteral
196- | NUMBER
261+ // [17] Argument ::= Expr
262+ argument
263+ : expr
197264 ;
198265
199- // Comparison expression as function argument (for not(x = 'y'), etc.)
200- comparisonArg
201- : (functionCall | relativeLocationPath | absoluteLocationPath) comparisonOp comparand
202- ;
203-
204- // Child element test in predicate (element name, ns:element, or wildcard)
205- childElementTest
206- : QNAME
207- | NCNAME
208- | WILDCARD
209- ;
266+ // ==============================================================================
267+ // Literals
268+ // ==============================================================================
210269
211- // String literal value
212- stringLiteral
270+ // [29] Literal ::= '"' [^"]* '"' | "'" [^']* "'"
271+ literal
213272 : STRING_LITERAL
214273 ;
0 commit comments