diff --git a/src/Tokenizer.ts b/src/Tokenizer.ts
index be64a327f..6f679cd7f 100644
--- a/src/Tokenizer.ts
+++ b/src/Tokenizer.ts
@@ -201,6 +201,7 @@ export default class Tokenizer {
private readonly xmlMode: boolean;
private readonly decodeEntities: boolean;
private readonly recognizeSelfClosing: boolean;
+ private readonly recognizeCDATA: boolean;
private readonly entityDecoder: EntityDecoder;
constructor(
@@ -208,16 +209,19 @@ export default class Tokenizer {
xmlMode = false,
decodeEntities = true,
recognizeSelfClosing = xmlMode,
+ recognizeCDATA = false,
}: {
xmlMode?: boolean;
decodeEntities?: boolean;
recognizeSelfClosing?: boolean;
+ recognizeCDATA?: boolean;
},
private readonly cbs: Callbacks,
) {
this.xmlMode = xmlMode;
this.decodeEntities = decodeEntities;
this.recognizeSelfClosing = recognizeSelfClosing;
+ this.recognizeCDATA = recognizeCDATA;
this.entityDecoder = new EntityDecoder(
xmlMode ? xmlDecodeTree : htmlDecodeTree,
(cp, consumed) => this.emitCodePoint(cp, consumed),
@@ -354,10 +358,19 @@ export default class Tokenizer {
private stateCDATASequence(c: number): void {
if (c === Sequences.Cdata[this.sequenceIndex]) {
if (++this.sequenceIndex === Sequences.Cdata.length) {
- this.state = State.InCommentLike;
- this.currentSequence = Sequences.CdataEnd;
this.sequenceIndex = 0;
- this.sectionStart = this.index + 1;
+ if (this.shouldRecognizeCDATA()) {
+ this.state = State.InCommentLike;
+ this.currentSequence = Sequences.CdataEnd;
+ this.sectionStart = this.index + 1;
+ } else {
+ /*
+ * Outside XML / foreign content, `<![CDATA[` is a bogus comment
+ * that ends at the first `>`, per WHATWG HTML
+ * §13.2.5.42/§13.2.5.43, leaving following markup live.
+ */
+ this.state = State.InSpecialComment;
+ }
}
} else {
this.sequenceIndex = 0;
@@ -371,6 +384,14 @@ export default class Tokenizer {
}
}
+ /** True in XML mode, when `recognizeCDATA` is set, or when the optional `cbs.isInForeignContext` callback returns true. */
+ private shouldRecognizeCDATA(): boolean {
+     if (this.xmlMode || this.recognizeCDATA) {
+         return true;
+     }
+     return this.cbs.isInForeignContext?.() ?? false;
+ }
+
/**
* When we wait for one specific character, we can speed things up
* by skipping through the buffer until we find it.
diff --git a/src/index.spec.ts b/src/index.spec.ts
index 80e4ccc3b..a6db86527 100644
--- a/src/index.spec.ts
+++ b/src/index.spec.ts
@@ -31,6 +31,52 @@ describe("Index", () => {
expect(dom).toMatchSnapshot();
});
+ /*
+ * In HTML mode, `<![CDATA[` is a bogus comment that ends at the first `>`
+ * (WHATWG HTML §13.2.5.42/§13.2.5.43), so the markup
+ * after `>` stays live. parse5 and every browser produce the comment plus a
+ * live `<img>`; htmlparser2 used to swallow it all into one comment.
+ */
+ // `<![CDATA[x>` must become a bogus comment, leaving the `<img>` after it as a live tag.
+ it("treats `<![CDATA[` as a bogus comment that ends at the first `>`", () => {
+     const dom = parseDocument("<![CDATA[x><img src=x onerror=alert(1)>");
+
+     expect(dom.children).toHaveLength(2);
+
+     const [comment, img] = dom.children;
+     expect(comment.type).toBe("comment");
+     // The comment data is everything between `<!` and the first `>`.
+     expect((comment as { data: string }).data).toBe("[CDATA[x");
+
+     expect(img.type).toBe("tag");
+     expect((img as Element).name).toBe("img");
+     expect((img as Element).attribs).toEqual({
+         src: "x",
+         onerror: "alert(1)",
+     });
+ });
+
+ it("keeps real CDATA in foreign content (regression guard)", () => {
+ const dom = parseDocument("");
+ const svg = dom.children[0] as Element;
+ expect(svg.name).toBe("svg");
+ expect(svg.children).toHaveLength(1);
+ const text = svg.children[0];
+ expect(text.type).toBe("text");
+ expect((text as { data: string }).data).toBe("a {
+ const dom = parseDocument("", {
+ xmlMode: true,
+ });
+ const root = dom.children[0] as Element;
+ const cdata = root.children[0];
+ expect(cdata.type).toBe("cdata");
+ expect((cdata as Element).children[0]).toMatchObject({
+ type: "text",
+ data: "a {
let documentStream!: Parser;