From 50d9a65e300cf2353e61827d973abb73ca88a3e6 Mon Sep 17 00:00:00 2001 From: Yarchik Date: Tue, 30 Jun 2026 12:48:23 +0100 Subject: [PATCH] fix(tokenizer): treat `<![CDATA[` outside XML / foreign content as a bogus comment that ends at the first `>` (WHATWG HTML §13.2.5.42 Markup declaration open state, §13.2.5.43 Bogus comment state). The tokenizer instead entered the comment-like CDATA path on every full `CDATA[` match, so the section only ended at `]]>` (or EOF) and swallowed all following markup into one inert comment. `stateCDATASequence` now only takes the real-CDATA path when CDATA should be recognized (`xmlMode`, `recognizeCDATA`, or foreign content), mirroring the existing partial-match branch which already routes to a bogus comment in HTML mode. xmlMode, foreign-content, and `recognizeCDATA` behavior is unchanged. Before: `<![CDATA[x><img src=x onerror=alert(1)>` -> comment `[CDATA[x><img src=x onerror=alert(1)>` (img hidden). After: `<![CDATA[x><img src=x onerror=alert(1)>` -> comment `[CDATA[x` + live `<img>`, matching parse5 and browsers. --- src/Tokenizer.ts | 27 ++++++++++++++++++++++++--- src/index.spec.ts | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts index be64a327f..6f679cd7f 100644 --- a/src/Tokenizer.ts +++ b/src/Tokenizer.ts @@ -201,6 +201,7 @@ export default class Tokenizer { private readonly xmlMode: boolean; private readonly decodeEntities: boolean; private readonly recognizeSelfClosing: boolean; + private readonly recognizeCDATA: boolean; private readonly entityDecoder: EntityDecoder; constructor( @@ -208,16 +209,19 @@ export default class Tokenizer { xmlMode = false, decodeEntities = true, recognizeSelfClosing = xmlMode, + recognizeCDATA = false, }: { xmlMode?: boolean; decodeEntities?: boolean; recognizeSelfClosing?: boolean; + recognizeCDATA?: boolean; }, private readonly cbs: Callbacks, ) { this.xmlMode = xmlMode; this.decodeEntities = decodeEntities; this.recognizeSelfClosing = recognizeSelfClosing; + this.recognizeCDATA = recognizeCDATA; this.entityDecoder = new EntityDecoder( xmlMode ? 
xmlDecodeTree : htmlDecodeTree, (cp, consumed) => this.emitCodePoint(cp, consumed), @@ -354,10 +358,19 @@ export default class Tokenizer { private stateCDATASequence(c: number): void { if (c === Sequences.Cdata[this.sequenceIndex]) { if (++this.sequenceIndex === Sequences.Cdata.length) { - this.state = State.InCommentLike; - this.currentSequence = Sequences.CdataEnd; this.sequenceIndex = 0; - this.sectionStart = this.index + 1; + if (this.shouldRecognizeCDATA()) { + this.state = State.InCommentLike; + this.currentSequence = Sequences.CdataEnd; + this.sectionStart = this.index + 1; + } else { + /* + * Outside XML / foreign content `<![CDATA[` is a bogus comment + * that ends at the first `>`, per WHATWG HTML + * §13.2.5.42/§13.2.5.43, leaving following markup live. + */ + this.state = State.InSpecialComment; + } } } else { this.sequenceIndex = 0; @@ -371,6 +384,14 @@ export default class Tokenizer { } } + private shouldRecognizeCDATA(): boolean { + return ( + this.xmlMode || + this.recognizeCDATA || + (this.cbs.isInForeignContext?.() ?? false) + ); + } + /** * When we wait for one specific character, we can speed things up * by skipping through the buffer until we find it. diff --git a/src/index.spec.ts b/src/index.spec.ts index 80e4ccc3b..a6db86527 100644 --- a/src/index.spec.ts +++ b/src/index.spec.ts @@ -31,6 +31,52 @@ describe("Index", () => { expect(dom).toMatchSnapshot(); }); + /* + * In HTML mode, `<![CDATA[` is a bogus comment that ends at the first + * `>` (WHATWG HTML §13.2.5.42/§13.2.5.43), so the markup + * after `>` stays live. parse5 and every browser produce the comment plus a + * live `<img>`; htmlparser2 used to swallow it all into one comment. 
+ */ + it("treats ``", () => { + const dom = parseDocument(""); + + expect(dom.children).toHaveLength(2); + + const [comment, img] = dom.children; + expect(comment.type).toBe("comment"); + expect((comment as { data: string }).data).toBe("[CDATA[x"); + + expect(img.type).toBe("tag"); + expect((img as Element).name).toBe("img"); + expect((img as Element).attribs).toEqual({ + src: "x", + onerror: "alert(1)", + }); + }); + + it("keeps real CDATA in foreign content (regression guard)", () => { + const dom = parseDocument("a<b"); + const svg = dom.children[0] as Element; + expect(svg.name).toBe("svg"); + expect(svg.children).toHaveLength(1); + const text = svg.children[0]; + expect(text.type).toBe("text"); + expect((text as { data: string }).data).toBe("a { + const dom = parseDocument("", { + xmlMode: true, + }); + const root = dom.children[0] as Element; + const cdata = root.children[0]; + expect(cdata.type).toBe("cdata"); + expect((cdata as Element).children[0]).toMatchObject({ + type: "text", + data: "a { let documentStream!: Parser;