Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions packages/cms-admin/src/components/editor/rich-text-editor.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import { TextStyle } from "@tiptap/extension-text-style";
import { Markdown } from "tiptap-markdown";
import { useEffect, useState, useRef, useCallback } from "react";
import { cn } from "@/lib/utils";
import { sanitizeWordPasteHtml } from "@/lib/paste-sanitizer";
import {
Tooltip, TooltipContent, TooltipProvider, TooltipTrigger,
} from "@/components/ui/tooltip";
Expand Down Expand Up @@ -2373,17 +2374,18 @@ const SvgEmbed = TipTapNode.create({

/* ─── Toolbar button ─────────────────────────────────────────── */
function Btn({
tooltip, active, disabled, onClick, children,
tooltip, active, disabled, onClick, children, testId,
}: {
tooltip: string; active?: boolean; disabled?: boolean;
onClick: () => void; children: React.ReactNode;
onClick: () => void; children: React.ReactNode; testId?: string;
}) {
return (
<Tooltip>
<TooltipTrigger
render={
<button
type="button"
data-testid={testId}
onMouseDown={(e) => { e.preventDefault(); onClick(); }}
disabled={disabled}
aria-label={tooltip}
Expand Down Expand Up @@ -2702,6 +2704,9 @@ function RichTextEditorInner({ value, onChange, disabled, stickyOffset = 132, fe
onChange(md);
},
editorProps: {
// F150: strip Word/Office paste cruft before ProseMirror parses, so the
// junk never round-trips into stored Markdown (html:true serialiser).
transformPastedHTML: (html) => sanitizeWordPasteHtml(html),
attributes: {
class: "rte outline-none min-h-[120px]",
// Block browser extensions (Grammarly, spell-checkers) from injecting
Expand Down Expand Up @@ -3154,6 +3159,13 @@ function RichTextEditorInner({ value, onChange, disabled, stickyOffset = 132, fe
<IconSubscript />
</Btn>}

{/* F150: strip all formatting from the selection (inline marks + block type) */}
{(has("bold") || has("italic") || has("strike") || has("code")) && <Btn
tooltip="Clear formatting" testId="clear-formatting-button"
onClick={() => editor.chain().focus().unsetAllMarks().clearNodes().run()}>
<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2" strokeLinecap="round" strokeLinejoin="round"><path d="M4 7V4h16v3"/><path d="M5 20h6"/><path d="M13 4 8 20"/><path d="m15 15 5 5"/><path d="m20 15-5 5"/></svg>
</Btn>}

{(has("bold") || has("italic") || has("strike") || has("code")) && <Sep />}

{has("bulletList") && <Btn tooltip="Bullet list" active={toolbarState?.isBulletList ?? false}
Expand Down
98 changes: 98 additions & 0 deletions packages/cms-admin/src/lib/__tests__/paste-sanitizer.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import { describe, it, expect } from "vitest";
import { sanitizeWordPasteHtml } from "../paste-sanitizer";

// Behavioural suite for the paste sanitizer. The first group checks that each
// kind of Word/Office cruft is removed; the second group ("edge guards")
// checks that intentional markup passes through untouched.
describe("sanitizeWordPasteHtml — F150 Word/Office paste cleanup", () => {
  it("unwraps the sanneandersen span-wrapped markdown, preserving inner markdown", () => {
    // Payload shape taken from production content (products/fordjelsen.json).
    const pasted =
      "<span>*Undervisning hjemme hos dig selv via Zoom.*</span>\n\n" +
      "<span>**<u>1. En sund – og god forløjelse – er grundlaget for al liv.</u>**</span>";
    const cleaned = sanitizeWordPasteHtml(pasted);

    // No span wrapper may remain, opening or closing.
    for (const tag of ["<span>", "</span>"]) {
      expect(cleaned).not.toContain(tag);
    }
    // The inner markdown and the deliberate <u> must come through intact.
    expect(cleaned).toContain("*Undervisning hjemme hos dig selv via Zoom.*");
    expect(cleaned).toContain("**<u>1. En sund");
    expect(cleaned).toContain("</u>**");
  });

  it("removes XML-namespace tags like <o:p>", () => {
    const emptyTagResult = sanitizeWordPasteHtml("<p>hi<o:p></o:p></p>");
    expect(emptyTagResult).toBe("<p>hi</p>");

    const wrappedTextResult = sanitizeWordPasteHtml("<o:p>keep</o:p>");
    expect(wrappedTextResult).toBe("keep");
  });

  it("unwraps <font> tags but keeps their content", () => {
    const cleaned = sanitizeWordPasteHtml('<font face="Arial" size="3">hello</font>');
    expect(cleaned).toBe("hello");
  });

  it("strips Office conditional comments and fragment markers", () => {
    const pasted =
      "<!--[if gte mso 9]><xml><o:OfficeDocumentSettings/></xml><![endif]-->" +
      "<!--StartFragment-->text<!--EndFragment-->";
    const cleaned = sanitizeWordPasteHtml(pasted);
    expect(cleaned).toBe("text");
  });

  it("strips downlevel-revealed conditional comments", () => {
    const cleaned = sanitizeWordPasteHtml("<![if !mso]>x<![endif]>");
    expect(cleaned).toBe("x");
  });

  it("removes mso-* style declarations but keeps real ones", () => {
    const cleaned = sanitizeWordPasteHtml('<p style="margin:0; mso-pagination:none">x</p>');
    expect(cleaned).toBe('<p style="margin:0">x</p>');
  });

  it("drops a style attribute that becomes empty after mso stripping, then unwraps the bare span", () => {
    const cleaned = sanitizeWordPasteHtml('<span style="mso-fareast-language:DA">x</span>');
    expect(cleaned).toBe("x");
  });

  it('removes class="Mso…" attributes', () => {
    const cleaned = sanitizeWordPasteHtml('<p class="MsoNormal">x</p>');
    expect(cleaned).toBe("<p>x</p>");
  });

  it("drops Word <style> and <xml> blocks wholesale", () => {
    const pasted = "<style>p{mso-x:1}</style><xml><w:WordDocument/></xml><p>body</p>";
    const cleaned = sanitizeWordPasteHtml(pasted);
    expect(cleaned).toBe("<p>body</p>");
  });

  // ── edge guards: intentional markup must never be destroyed ──

  it("preserves a deliberate color span (TextStyle/Color)", () => {
    const pasted = '<span style="color:#f00">red</span>';
    const cleaned = sanitizeWordPasteHtml(pasted);
    expect(cleaned).toBe(pasted);
  });

  it("preserves a background-color span", () => {
    const pasted = '<span style="background-color:#ff0">hi</span>';
    const cleaned = sanitizeWordPasteHtml(pasted);
    expect(cleaned).toBe(pasted);
  });

  it("preserves a color span even when it also carried mso noise", () => {
    const cleaned = sanitizeWordPasteHtml('<span style="mso-x:1; color:#0f0">g</span>');
    expect(cleaned).toBe('<span style="color:#0f0">g</span>');
  });

  it("preserves spans with a real class/id/data attribute", () => {
    // Each of these must round-trip byte-for-byte.
    const untouched = ['<span class="badge">x</span>', '<span data-foo="1">x</span>'];
    for (const pasted of untouched) {
      expect(sanitizeWordPasteHtml(pasted)).toBe(pasted);
    }
  });

  it("leaves clean third-party HTML untouched", () => {
    const pasted = '<strong>bold</strong> and <a href="https://x.com">link</a><ul><li>a</li><li>b</li></ul>';
    const cleaned = sanitizeWordPasteHtml(pasted);
    expect(cleaned).toBe(pasted);
  });

  it("returns empty and plain input unchanged", () => {
    for (const pasted of ["", "just text"]) {
      expect(sanitizeWordPasteHtml(pasted)).toBe(pasted);
    }
  });

  it("fully unwraps nested noise-only spans", () => {
    const cleaned = sanitizeWordPasteHtml("<span><span>deep</span></span>");
    expect(cleaned).toBe("deep");
  });

  it("keeps a semantic color span while unwrapping a noise span around it", () => {
    const cleaned = sanitizeWordPasteHtml('<span><span style="color:#f00">red</span></span>');
    expect(cleaned).toBe('<span style="color:#f00">red</span>');
  });
});
109 changes: 109 additions & 0 deletions packages/cms-admin/src/lib/paste-sanitizer.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/**
* F150 — Paste formatting sanitization.
*
* Strips Microsoft Word / Office clipboard cruft from pasted HTML *before*
* ProseMirror parses it, so the junk never reaches the editor schema and never
* round-trips into stored Markdown (the editor serialises with `html: true`,
* which otherwise re-emits leftover inline wrappers as literal `<span>` text on
* public sites — the sanneandersen "fordjelsen" bug).
*
* Pure string transform on purpose: no DOM dependency, so it runs unchanged in
* the browser paste path AND is unit-testable under the repo's `node` vitest env.
*
* What it removes:
* - Office conditional comments + all HTML comments (StartFragment/EndFragment)
* - `<style>` / `<xml>` blocks Word injects
* - XML-namespace tags: `<o:p>`, `<w:…>`, `<m:…>`, `<v:…>`, `<st1:…>`, …
* - `<font>` tags (unwrapped — content kept)
* - `mso-*` declarations inside `style=""` (and the attribute if it empties)
* - `class="Mso…"` attributes
* - noise-only `<span>` wrappers (unwrapped — content kept)
*
* What it preserves:
* - all real content + intentional inline markup (`<u>`, `<strong>`, `<a>`, …)
* - spans that carry a deliberate `color`/`background-color` (TextStyle/Color),
* or a real `class`/`id`/`data-*` attribute
*/

/**
 * Remove `mso-*` declarations from the raw text of a `style` attribute.
 *
 * The input is split into declarations, each is trimmed, empty ones are
 * dropped, and every declaration whose property name starts with `mso-`
 * (case-insensitive) is discarded. Survivors are re-joined with `"; "`, so the
 * result has no trailing semicolon and is `""` when nothing survives.
 *
 * @param css - Raw attribute value, still HTML-encoded as it appeared in the markup.
 * @returns The cleaned declaration list.
 */
function stripMsoFromCss(css: string): string {
  // A `;` that terminates a character reference (`&quot;`, `&#39;`, `&#x27;`)
  // is part of the value, not a declaration separator. Browsers serialise
  // `font-family: "Times New Roman"` as `&quot;Times New Roman&quot;` inside a
  // double-quoted attribute, and a plain split on `;` would cut those in half.
  const declarationSeparator = /(?<!&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*));/i;

  return css
    .split(declarationSeparator)
    .map((declaration) => declaration.trim())
    // After trimming, the property name sits at the very start of the
    // declaration, so a prefix test is enough to recognise `mso-*`.
    .filter((declaration) => declaration.length > 0 && !/^mso-/i.test(declaration))
    .join("; ");
}

/**
 * Decide whether a `<span>` is worth keeping, judged by its attribute text.
 *
 * @param attrs - Everything between `<span` and `>`; empty for a bare span,
 *   otherwise starting with whitespace.
 * @returns `true` when the attributes contain a `color:` declaration (which
 *   also matches `background-color:`) or a `class`, `id` or `data-*` attribute.
 */
function spanIsSemantic(attrs: string): boolean {
  // a deliberate colour (covers `color:` and `background-color:`)
  if (/\bcolor\s*:/i.test(attrs)) return true;
  // a real structural attribute another feature may rely on
  if (/\s(?:class|id|data-[\w-]+)\s*=/i.test(attrs)) return true;
  return false;
}

/**
 * Unwrap noise-only `<span>` wrappers, keeping semantic ones.
 *
 * Works from the innermost spans outwards. Each pass rewrites every span whose
 * content holds no further `<span` opening: a semantic span is parked under a
 * `<keptspan>` placeholder so later passes cannot re-match it, and any other
 * span is replaced by its content. Passes repeat until nothing changes, then
 * the placeholders are turned back into `<span>`.
 *
 * @param html - Markup that may contain `<span>` wrappers.
 * @returns The markup with noise-only spans removed and semantic spans intact.
 */
function unwrapNoiseSpans(html: string): string {
  // Global, so a single pass handles all innermost spans. Replacing one span
  // per pass would let the pass cap below limit how many spans get processed,
  // leaving the rest as literal `<span>` in large pastes.
  const innermost = /<span((?:\s[^>]*)?)>((?:(?!<span)[\s\S])*?)<\/span>/gi;
  // One pass peels one nesting level, so this caps depth rather than count.
  const maxPasses = 100;

  let out = html;
  for (let pass = 0; pass < maxPasses; pass++) {
    const next = out.replace(innermost, (_m, attrs: string, inner: string) =>
      spanIsSemantic(attrs) ? `<keptspan${attrs}>${inner}</keptspan>` : inner,
    );
    if (next === out) break; // stable: nothing left to unwrap
    out = next;
  }

  // restore parked spans
  return out.replace(/<keptspan/gi, "<span").replace(/<\/keptspan>/gi, "</span>");
}

/**
 * Sanitise pasted HTML, removing Word/Office cruft while preserving real
 * content and intentional formatting. Safe to call on any pasted HTML.
 *
 * Empty input, and input without any recognisable marker (namespace tag,
 * `<font>`, `mso-`, `class=Mso…`, `<span>`, comment, conditional, `<style>`,
 * `<xml>`), is returned untouched. Otherwise the steps run in a fixed order,
 * because later steps rely on earlier ones: the span unwrapping in step 7 only
 * sees a span as noise once steps 5 and 6 have removed its Office-only
 * `class` / `style` attributes.
 *
 * @param html - Raw HTML string from the clipboard.
 * @returns The cleaned HTML string.
 */
export function sanitizeWordPasteHtml(html: string): string {
  if (!html) return html;
  // Quick exit: nothing that looks like Office cruft → return untouched.
  // The namespace test mirrors step 3, so any prefix that step would strip
  // (including digit-bearing ones such as `st1:`) also triggers the full pass.
  if (!/<[a-z][a-z0-9]*:|<font\b|mso-|class=["']?Mso|<span\b|<!--|<!\[|<style\b|<xml\b/i.test(html)) {
    return html;
  }

  let out = html;

  // 1. Drop `<style>` and `<xml>` blocks Word injects wholesale.
  out = out.replace(/<style\b[\s\S]*?<\/style>/gi, "");
  out = out.replace(/<xml\b[\s\S]*?<\/xml>/gi, "");

  // 2. Drop all HTML comments (incl. Office conditional `<!--[if …]>…<![endif]-->`
  //    and StartFragment/EndFragment markers) + downlevel-revealed `<![if]>` forms.
  out = out.replace(/<!--[\s\S]*?-->/g, "");
  out = out.replace(/<!\[(?:end)?if[^\]]*\]>/gi, "");

  // 3. Drop XML-namespace tags (`<o:p>`, `<w:…>`, `<st1:…>`, …) — open + close,
  //    keep content. The prefix may contain digits after its first letter.
  out = out.replace(/<\/?[a-z][a-z0-9]*:[^>]*>/gi, "");

  // 4. Unwrap `<font …>` tags — keep their content.
  out = out.replace(/<\/?font\b[^>]*>/gi, "");

  // 5. Drop Office-generated `class="Mso…"` attributes, in quoted form and in
  //    the unquoted `class=MsoNormal` form.
  out = out.replace(/\sclass=(?:("|')Mso[^"']*\1|Mso[^\s>"']*)/gi, "");

  // 6. Strip `mso-*` declarations from style attributes; drop the attr if empty.
  //    The original quote character is reused: a single-quoted value may
  //    contain literal `"` (e.g. `font-family:"Arial"`), which would end a
  //    double-quoted attribute early.
  out = out.replace(/\sstyle=("|')([\s\S]*?)\1/gi, (_m, quote: string, css: string) => {
    const cleaned = stripMsoFromCss(css);
    return cleaned ? ` style=${quote}${cleaned}${quote}` : "";
  });

  // 7. Unwrap noise-only spans (after step 6 mso-only spans are attribute-less).
  out = unwrapNoiseSpans(out);

  return out;
}
Loading