Skip to content
This repository was archived by the owner on Feb 26, 2026. It is now read-only.

Commit 9d89514

Browse files
authored
Merge pull request #721 from alex-rantos/upgrade-htmlparser2-10.1.0
chore: upgrade htmlparser2 from 8.x to 10.1.0
2 parents 311b7f4 + ab34262 commit 9d89514

4 files changed

Lines changed: 23 additions & 7 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
## UNRELEASED
44

55
- Fix unclosed tags (e.g., `<hello`) returning empty string in `escape` and `recursiveEscape` modes. Fixes [#706](https://github.com/apostrophecms/sanitize-html/issues/706).
6+
- Upgrade `htmlparser2` from 8.x to 10.1.0. This improves security by correctly decoding zero-padded numeric character references (e.g., `&#0000001`) that previously bypassed `javascript:` URL detection. Also fixes double-encoding of entities inside raw text elements like `textarea` and `option`.
67

78
## 2.17.0 (2025-05-14)
89

index.js

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,11 @@ function sanitizeHtml(html, options, _recursing) {
541541
// your concern, don't allow them. The same is essentially true for style tags
542542
// which have their own collection of XSS vectors.
543543
result += text;
544+
} else if ((options.disallowedTagsMode === 'discard' || options.disallowedTagsMode === 'completelyDiscard') && (nonTextTagsArray.indexOf(tag) !== -1)) {
545+
// htmlparser2 does not decode entities inside raw text elements like
546+
// textarea and option. The text is already properly encoded, so pass
547+
// it through without additional escaping to avoid double-encoding.
548+
result += text;
544549
} else if (!addedText) {
545550
const escaped = escapeHtml(text, false);
546551
if (options.textFilter) {

package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
"dependencies": {
2626
"deepmerge": "^4.2.2",
2727
"escape-string-regexp": "^4.0.0",
28-
"htmlparser2": "^8.0.0",
28+
"htmlparser2": "^10.1.0",
2929
"is-plain-object": "^5.0.0",
3030
"parse-srcset": "^1.0.2",
3131
"postcss": "^8.3.11"
@@ -41,4 +41,4 @@
4141
"mocha": "^10.2.0",
4242
"sinon": "^9.0.2"
4343
}
44-
}
44+
}

test/test.js

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -179,11 +179,14 @@ describe('sanitizeHtml', function() {
179179
assert.equal(sanitizeHtml('<a href="java\0&#14;\t\r\n script:alert(\'foo\')">Hax</a>'), '<a>Hax</a>');
180180
});
181181
it('should dump character codes 1-32 even when escaped with padding rather than trailing ;', function() {
182-
assert.equal(sanitizeHtml('<a href="java&#0000001script:alert(\'foo\')">Hax</a>'), '<a href="java&amp;#0000001script:alert(\'foo\')">Hax</a>');
183-
// This one is weird, but the browser does not interpret it
184-
// as a scheme, so we're OK. That character is 65535, not null. I
185-
// think it's a limitation of the entities module
186-
assert.equal(sanitizeHtml('<a href="java&#0000000script:alert(\'foo\')">Hax</a>'), '<a href="java&amp;#0000000script:alert(\'foo\')">Hax</a>');
182+
// htmlparser2 10.x correctly decodes zero-padded numeric entities.
183+
// &#0000001 decodes to U+0001, which is stripped as a control char,
184+
// revealing the javascript: scheme, so the href attribute is dropped.
185+
assert.equal(sanitizeHtml('<a href="java&#0000001script:alert(\'foo\')">Hax</a>'), '<a>Hax</a>');
186+
// &#0000000 decodes to U+FFFD (replacement character per HTML spec),
187+
// which is not a control char, so the URL is preserved safely since
188+
// browsers don't interpret java\uFFFDscript: as javascript:
189+
assert.equal(sanitizeHtml('<a href="java&#0000000script:alert(\'foo\')">Hax</a>'), '<a href="java\uFFFDscript:alert(\'foo\')">Hax</a>');
187190
});
188191
it('should still like nice schemes', function() {
189192
assert.equal(sanitizeHtml('<a href="http://google.com/">Hi</a>'), '<a href="http://google.com/">Hi</a>');
@@ -876,6 +879,13 @@ describe('sanitizeHtml', function() {
876879
), '!<textarea>&lt;/textarea&gt;&lt;svg/onload=prompt`xs`&gt;</textarea>!'
877880
);
878881
});
882+
it('should not double-encode entities inside an allowed textarea element', function() {
883+
assert.equal(
884+
sanitizeHtml('<textarea>&lt;div&gt;hello&lt;/div&gt;&amp;amp;</textarea>',
885+
{ allowedTags: [ 'textarea' ] }
886+
), '<textarea>&lt;div&gt;hello&lt;/div&gt;&amp;amp;</textarea>'
887+
);
888+
});
879889
it('should allow protocol relative links by default', function() {
880890
assert.equal(
881891
sanitizeHtml('<a href="//cnn.com/example">test</a>'),

0 commit comments

Comments
 (0)