From 50d9a65e300cf2353e61827d973abb73ca88a3e6 Mon Sep 17 00:00:00 2001
From: Yarchik <spoko.dev@gmail.com>
Date: Tue, 30 Jun 2026 12:48:23 +0100
Subject: [PATCH] fix(tokenizer): treat `<![CDATA[` as a bogus comment in HTML
 mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In HTML mode `<![CDATA[` outside foreign content must be tokenized as a
bogus comment that ends at the first `>` (WHATWG HTML §13.2.5.42 Markup
declaration open state, §13.2.5.41 Bogus comment state). The tokenizer
instead entered the comment-like CDATA path on every full `CDATA[`
match, so the section only ended at `]]>` (or EOF) and swallowed all
following markup into one inert comment.

`stateCDATASequence` now only takes the real-CDATA path when CDATA
should be recognized (`xmlMode`, `recognizeCDATA`, or foreign content),
mirroring the existing partial-match branch which already routes to a
bogus comment in HTML mode. xmlMode, foreign-content, and
`recognizeCDATA` behavior is unchanged.

Before: `<![CDATA[x><img>` -> comment `[CDATA[x><img>` (img hidden).
After:  `<![CDATA[x><img>` -> comment `[CDATA[x` + live `<img>`,
matching parse5 and browsers.
---
 src/Tokenizer.ts  | 27 ++++++++++++++++++++++++---
 src/index.spec.ts | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 3 deletions(-)
diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index be64a327f..6f679cd7f 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -201,6 +201,7 @@ export default class Tokenizer {
     private readonly xmlMode: boolean;
     private readonly decodeEntities: boolean;
     private readonly recognizeSelfClosing: boolean;
+    private readonly recognizeCDATA: boolean;
     private readonly entityDecoder: EntityDecoder;
 
     constructor(
@@ -208,16 +209,19 @@ export default class Tokenizer {
             xmlMode = false,
             decodeEntities = true,
             recognizeSelfClosing = xmlMode,
+            recognizeCDATA = false,
         }: {
             xmlMode?: boolean;
             decodeEntities?: boolean;
             recognizeSelfClosing?: boolean;
+            recognizeCDATA?: boolean;
         },
         private readonly cbs: Callbacks,
     ) {
         this.xmlMode = xmlMode;
         this.decodeEntities = decodeEntities;
         this.recognizeSelfClosing = recognizeSelfClosing;
+        this.recognizeCDATA = recognizeCDATA;
         this.entityDecoder = new EntityDecoder(
             xmlMode ? xmlDecodeTree : htmlDecodeTree,
             (cp, consumed) => this.emitCodePoint(cp, consumed),
@@ -354,10 +358,19 @@ export default class Tokenizer {
     private stateCDATASequence(c: number): void {
         if (c === Sequences.Cdata[this.sequenceIndex]) {
             if (++this.sequenceIndex === Sequences.Cdata.length) {
-                this.state = State.InCommentLike;
-                this.currentSequence = Sequences.CdataEnd;
                 this.sequenceIndex = 0;
-                this.sectionStart = this.index + 1;
+                if (this.shouldRecognizeCDATA()) {
+                    this.state = State.InCommentLike;
+                    this.currentSequence = Sequences.CdataEnd;
+                    this.sectionStart = this.index + 1;
+                } else {
+                    /*
+                     * Outside XML / foreign content `<![CDATA[` is a bogus
+                     * comment that ends at the first `>`, per WHATWG HTML
+                     * §13.2.5.42/§13.2.5.41, leaving following markup live.
+                     */
+                    this.state = State.InSpecialComment;
+                }
             }
         } else {
             this.sequenceIndex = 0;
@@ -371,6 +384,14 @@ export default class Tokenizer {
         }
     }
 
+    private shouldRecognizeCDATA(): boolean { // true => a full `<![CDATA[` match opens a real CDATA section instead of a bogus comment
+        return (
+            this.xmlMode || // xmlMode alone selects the real-CDATA path
+            this.recognizeCDATA || // so does the explicit `recognizeCDATA` option (defaults to false)
+            (this.cbs.isInForeignContext?.() ?? false) // callback is optional; when it is absent this term is false
+        );
+    }
+
     /**
      * When we wait for one specific character, we can speed things up
      * by skipping through the buffer until we find it.
diff --git a/src/index.spec.ts b/src/index.spec.ts
index 80e4ccc3b..a6db86527 100644
--- a/src/index.spec.ts
+++ b/src/index.spec.ts
@@ -31,6 +31,52 @@ describe("Index", () => {
         expect(dom).toMatchSnapshot();
     });
 
+    /*
+     * In HTML mode, `<![CDATA[` outside foreign content is a bogus comment that
+     * ends at the first `>` (WHATWG HTML §13.2.5.42/§13.2.5.41), so the markup
+     * after `>` stays live. parse5 and every browser produce the comment plus a
+     * live `<img>`; htmlparser2 used to swallow it all into one comment.
+     */
+    it("treats `<![CDATA[` in HTML mode as a bogus comment ending at `>`", () => {
+        const dom = parseDocument("<![CDATA[x><img src=x onerror=alert(1)>");
+
+        expect(dom.children).toHaveLength(2);
+
+        const [comment, img] = dom.children;
+        expect(comment.type).toBe("comment");
+        expect((comment as { data: string }).data).toBe("[CDATA[x");
+
+        expect(img.type).toBe("tag");
+        expect((img as Element).name).toBe("img");
+        expect((img as Element).attribs).toEqual({
+            src: "x",
+            onerror: "alert(1)",
+        });
+    });
+
+    it("keeps real CDATA in foreign content (regression guard)", () => {
+        const dom = parseDocument("<svg><![CDATA[a<b]]></svg>");
+        const svg = dom.children[0] as Element;
+        expect(svg.name).toBe("svg");
+        expect(svg.children).toHaveLength(1);
+        const text = svg.children[0];
+        expect(text.type).toBe("text");
+        expect((text as { data: string }).data).toBe("a<b");
+    });
+
+    it("keeps real CDATA in xmlMode (regression guard)", () => {
+        const dom = parseDocument("<root><![CDATA[a<b]]></root>", {
+            xmlMode: true,
+        });
+        const root = dom.children[0] as Element;
+        const cdata = root.children[0];
+        expect(cdata.type).toBe("cdata");
+        expect((cdata as Element).children[0]).toMatchObject({
+            type: "text",
+            data: "a<b",
+        });
+    });
+
     it("createDocumentStream", () => {
         let documentStream!: Parser;