Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions src/Tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -201,23 +201,27 @@ export default class Tokenizer {
private readonly xmlMode: boolean;
private readonly decodeEntities: boolean;
private readonly recognizeSelfClosing: boolean;
private readonly recognizeCDATA: boolean;
private readonly entityDecoder: EntityDecoder;

constructor(
{
xmlMode = false,
decodeEntities = true,
recognizeSelfClosing = xmlMode,
recognizeCDATA = false,
}: {
xmlMode?: boolean;
decodeEntities?: boolean;
recognizeSelfClosing?: boolean;
recognizeCDATA?: boolean;
},
private readonly cbs: Callbacks,
) {
this.xmlMode = xmlMode;
this.decodeEntities = decodeEntities;
this.recognizeSelfClosing = recognizeSelfClosing;
this.recognizeCDATA = recognizeCDATA;
this.entityDecoder = new EntityDecoder(
xmlMode ? xmlDecodeTree : htmlDecodeTree,
(cp, consumed) => this.emitCodePoint(cp, consumed),
Expand Down Expand Up @@ -354,10 +358,19 @@ export default class Tokenizer {
private stateCDATASequence(c: number): void {
if (c === Sequences.Cdata[this.sequenceIndex]) {
if (++this.sequenceIndex === Sequences.Cdata.length) {
this.state = State.InCommentLike;
this.currentSequence = Sequences.CdataEnd;
this.sequenceIndex = 0;
this.sectionStart = this.index + 1;
if (this.shouldRecognizeCDATA()) {
this.state = State.InCommentLike;
this.currentSequence = Sequences.CdataEnd;
this.sectionStart = this.index + 1;
} else {
/*
* Outside XML / foreign content `<![CDATA[` is a bogus
* comment that ends at the first `>`, per WHATWG HTML
* §13.2.5.42/§13.2.5.43, leaving following markup live.
*/
this.state = State.InSpecialComment;
}
}
} else {
this.sequenceIndex = 0;
Expand All @@ -371,6 +384,14 @@ export default class Tokenizer {
}
}

/**
 * Reports whether CDATA recognition is currently enabled.
 *
 * @returns `true` when `xmlMode` or the `recognizeCDATA` option is set;
 * otherwise whatever the optional `isInForeignContext` callback reports,
 * with a missing callback (or a nullish result) treated as `false`.
 */
private shouldRecognizeCDATA(): boolean {
    // The static options win outright; the callback is only consulted
    // when neither of them is enabled.
    if (this.xmlMode || this.recognizeCDATA) {
        return true;
    }

    return this.cbs.isInForeignContext?.() ?? false;
}

/**
* When we wait for one specific character, we can speed things up
* by skipping through the buffer until we find it.
Expand Down
46 changes: 46 additions & 0 deletions src/index.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,52 @@ describe("Index", () => {
expect(dom).toMatchSnapshot();
});

/*
* In HTML mode, `<![CDATA[` outside foreign content is a bogus comment that
* ends at the first `>` (WHATWG HTML §13.2.5.42/§13.2.5.43), so the markup
* after `>` stays live. parse5 and every browser produce the comment plus a
* live `<img>`; htmlparser2 used to swallow it all into one comment.
*/
it("treats `<![CDATA[` in HTML mode as a bogus comment ending at `>`", () => {
    const { children } = parseDocument(
        "<![CDATA[x><img src=x onerror=alert(1)>",
    );

    // Exactly two top-level nodes are expected: the comment and the image.
    expect(children).toHaveLength(2);

    const bogusComment = children[0];
    const image = children[1];

    // The comment's data is everything between `<!` and the first `>`.
    expect(bogusComment.type).toBe("comment");
    expect((bogusComment as { data: string }).data).toBe("[CDATA[x");

    // The markup after the `>` is parsed as a regular element.
    expect(image.type).toBe("tag");
    expect((image as Element).name).toBe("img");
    expect((image as Element).attribs).toEqual({
        src: "x",
        onerror: "alert(1)",
    });
});

it("keeps real CDATA in foreign content (regression guard)", () => {
    const { children: topLevel } = parseDocument("<svg><![CDATA[a<b]]></svg>");

    const svg = topLevel[0] as Element;
    expect(svg.name).toBe("svg");
    expect(svg.children).toHaveLength(1);

    // The CDATA payload must surface as a single text node, `<` included.
    const onlyChild = svg.children[0];
    expect(onlyChild.type).toBe("text");
    expect((onlyChild as { data: string }).data).toBe("a<b");
});

it("keeps real CDATA in xmlMode (regression guard)", () => {
    const dom = parseDocument("<root><![CDATA[a<b]]></root>", {
        xmlMode: true,
    });

    const root = dom.children[0] as Element;
    const cdata = root.children[0];

    // In XML mode the section is a dedicated `cdata` node...
    expect(cdata.type).toBe("cdata");

    // ...whose first child is a text node carrying the raw payload.
    const payload = (cdata as Element).children[0];
    expect(payload).toMatchObject({
        type: "text",
        data: "a<b",
    });
});

it("createDocumentStream", () => {
let documentStream!: Parser;

Expand Down