Skip to content

Commit d27370a

Browse files
committed
Follow spec so < can start an attribute name
Fixes #1483
1 parent 0ef4b70 commit d27370a

File tree

4 files changed

+22
-10
lines changed

4 files changed

+22
-10
lines changed

CHANGES.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@
3939
created (`html` or `body`). [2204](https://github.com/jhy/jsoup/issues/2204)
4040
* Follow the current HTML specification in the tokenizer to allow `<` as part of a tag name, instead of emitting it as a
4141
character node. [2230](https://github.com/jhy/jsoup/issues/2230)
42+
* Similarly, allow a `<` as the start of an attribute name, instead of creating a new element. The previous behavior was
43+
intended to parse closer to what we anticipated the author's intent to be, but that does not align with the spec or with
44+
how browsers behave. [1483](https://github.com/jhy/jsoup/issues/1483)
4245

4346
## 1.18.1 (2024-Jul-10)
4447

src/main/java/org/jsoup/parser/TokeniserState.java

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -568,10 +568,6 @@ private void anythingElse(Tokeniser t, CharacterReader r) {
568568
case '/':
569569
t.transition(SelfClosingStartTag);
570570
break;
571-
case '<': // NOTE: out of spec, but clear (spec has this as a part of the attribute name)
572-
r.unconsume();
573-
t.error(this);
574-
// intended fall through as if >
575571
case '>':
576572
t.emitTagPending();
577573
t.transition(Data);

src/test/java/org/jsoup/parser/HtmlParserTest.java

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,13 +79,13 @@ private static Stream<Arguments> dupeAttributeData() {
7979

8080
@Test public void parsesQuiteRoughAttributes() {
8181
String html = "<p =a>One<a <p>Something</p>Else";
82-
// this (used to; now gets cleaner) gets a <p> with attr '=a' and an <a tag with an attribute named '<p'; and then auto-recreated
82+
// this gets a <p> with attr '=a' and an <a tag with an attribute named '<p'; and then auto-recreated
8383
Document doc = Jsoup.parse(html);
8484

85-
// NOTE: per spec this should be the test case. but impacts too many ppl
86-
// assertEquals("<p =a>One<a <p>Something</a></p>\n<a <p>Else</a>", doc.body().html());
87-
88-
assertEquals("<p _a>One<a></a></p><p><a>Something</a></p><a>Else</a>", TextUtil.stripNewlines(doc.body().html()));
85+
// =a is output as _a
86+
assertEquals("<p _a>One<a <p>Something</a></p><a <p>Else</a>", TextUtil.stripNewlines(doc.body().html()));
87+
Element p = doc.expectFirst("p");
88+
assertNotNull(p.attribute("=a"));
8989

9090
doc = Jsoup.parse("<p .....>");
9191
assertEquals("<p .....></p>", doc.body().html());
@@ -1939,4 +1939,17 @@ private static void assertMathNamespace(Element el) {
19391939
assertEquals("Hello", ab.text());
19401940
assertEquals("a<b", ab.tag().normalName());
19411941
}
1942+
1943+
@Test void ltInAttrStart() {
1944+
// https://github.com/jhy/jsoup/issues/1483
1945+
String html = "<a before='foo' <junk after='bar'>One</a>";
1946+
Document doc = Jsoup.parse(html);
1947+
assertEquals("<a before=\"foo\" <junk after=\"bar\">One</a>", TextUtil.normalizeSpaces(doc.body().html()));
1948+
1949+
Element el = doc.expectFirst("a");
1950+
Attribute attribute = el.attribute("<junk");
1951+
assertNotNull(attribute);
1952+
assertEquals("", attribute.getValue());
1953+
1954+
}
19421955
}

src/test/java/org/jsoup/parser/TokeniserStateTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -230,7 +230,7 @@ public void testOpeningAngleBracketInsteadOfAttribute() {
230230

231231
Parser.parseFragment(triggeringSnippet, null, "", errorList);
232232

233-
assertEquals(6, errorList.get(0).getPosition());
233+
assertEquals(7, errorList.get(0).getPosition());
234234
}
235235

236236
@Test

0 commit comments

Comments
 (0)