From eaf57b45611f3f4926883e7751641e715f315c77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Morten=20L=C3=B8nskov?= Date: Thu, 11 Jun 2026 10:22:31 +0200 Subject: [PATCH 1/6] Add a generation for sitemap to the doc site --- build-docs.py | 9 +- build_scripts/config_loader.py | 33 +++++ build_scripts/gen_redirects.py | 10 ++ build_scripts/gen_sitemap_index.py | 197 +++++++++++++++++++++++++++++ docfx-template.json | 5 + metadata/build-config.json | 15 ++- 6 files changed, 267 insertions(+), 2 deletions(-) create mode 100644 build_scripts/gen_sitemap_index.py diff --git a/build-docs.py b/build-docs.py index 1b7d517bf..05b0f8b26 100644 --- a/build-docs.py +++ b/build-docs.py @@ -329,7 +329,14 @@ def main() -> int: [sys.executable, "build_scripts/gen_staticwebapp_config.py"], "Generating staticwebapp.config.json" ) - + + # Generate site-wide sitemap index (references each language's sitemap.xml) + # and robots.txt. Runs last so all per-language sitemaps already exist. + run_command( + [sys.executable, "build_scripts/gen_sitemap_index.py"], + "Generating sitemap index and robots.txt" + ) + print(f"\n{'='*60}") print(" Build complete!") print(f"{'='*60}") diff --git a/build_scripts/config_loader.py b/build_scripts/config_loader.py index ec69f3dae..a3bd7c228 100644 --- a/build_scripts/config_loader.py +++ b/build_scripts/config_loader.py @@ -152,6 +152,39 @@ def get_default_language(config: dict | None = None) -> str: return config.get("defaultLanguage", "en") +def get_base_url(config: dict | None = None) -> str: + """Get the canonical base URL of the published site, without trailing slash. + + Used for canonical/hreflang tags, per-language sitemap baseUrls, and the + site-wide sitemap index. Languages are served under /{code}/ beneath this. 
+ """ + if config is None: + config = load_build_config() + return config.get("baseUrl", "https://docs.tabulareditor.com").rstrip("/") + + +def get_sitemap_downrank(config: dict | None = None) -> list[dict]: + """Get sitemap downrank rules: a list of {match, priority} dicts. + + Each rule lowers the <priority> of sitemap URLs whose path contains the + 'match' substring. Rules are evaluated in order; the first match wins. + """ + if config is None: + config = load_build_config() + return config.get("sitemap", {}).get("downrank", []) + + +def get_sitemap_exclude(config: dict | None = None) -> list[dict]: + """Get sitemap exclude rules: a list of {match} dicts. + + Each rule removes sitemap URLs whose path contains the 'match' substring. + Used to drop legacy Tabular Editor 2-only pages from the published sitemap. + """ + if config is None: + config = load_build_config() + return config.get("sitemap", {}).get("exclude", []) + + def compute_file_hash(file_path: Path | str) -> str: """Compute SHA256 hash of a file's contents. diff --git a/build_scripts/gen_redirects.py b/build_scripts/gen_redirects.py index 7f7eaf3e3..1645abb00 100644 --- a/build_scripts/gen_redirects.py +++ b/build_scripts/gen_redirects.py @@ -19,6 +19,8 @@ import sys import traceback +from config_loader import get_base_url, get_default_language + def get_available_languages() -> list[str]: """Scan localizedContent/ folder and return list of language codes (excluding 'en').""" @@ -69,6 +71,10 @@ def generate_localized_config(template: dict, lang: str) -> dict: # Set output destination to language subfolder (relative to project root) # From localizedContent/{lang}/, we go up twice to reach project root build["dest"] = f"../../_site/{lang}" + + # The published sitemap covers the default (English) language only, so + # non-default languages do not emit their own sitemap.xml. 
+ build.pop("sitemap", None) # Update template paths - need to go up two levels to reach project root if "template" in build: @@ -113,6 +119,10 @@ def generate_redirects_config(template: dict) -> dict: # Set English output destination (relative to localizedContent/en/) config["build"]["dest"] = "../../_site/en" + # Point the sitemap at the default language's URL prefix + if "sitemap" in config["build"]: + config["build"]["sitemap"]["baseUrl"] = f"{get_base_url()}/{get_default_language()}" + # Update template paths - need to go up two levels to reach project root if "template" in config["build"]: new_templates = [] diff --git a/build_scripts/gen_sitemap_index.py b/build_scripts/gen_sitemap_index.py new file mode 100644 index 000000000..c438e4bc4 --- /dev/null +++ b/build_scripts/gen_sitemap_index.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Post-process the English sitemap and generate the site-wide entry point. + +DocFX emits a sitemap for the default (English) language at +_site/{default_lang}/sitemap.xml (configured via build.sitemap in the generated +docfx.json). Non-default languages do not emit a sitemap. This script: + + 1. Removes Tabular Editor 2-only URLs from the English sitemap per the + 'exclude' rules in build-config.json. + 2. Downranks selected URLs in the English sitemap (API reference pages) per + the 'downrank' rules in build-config.json. + 3. Writes _site/sitemap.xml: a <sitemapindex> pointing at the English sitemap + (the published sitemap covers English only). + 4. Writes _site/robots.txt: allows all crawlers and advertises the index. + +Run after the English build so _site/{default_lang}/sitemap.xml exists. 
+ +Usage: + python gen_sitemap_index.py # Process _site/ + python gen_sitemap_index.py --dry-run # Preview without writing +""" + +import argparse +import xml.etree.ElementTree as ET +from pathlib import Path + +from config_loader import ( + get_base_url, + get_default_language, + get_sitemap_downrank, + get_sitemap_exclude, +) + + +SITEMAP_NS = "http://www.sitemaps.org/schemas/sitemap/0.9" + + +def apply_exclude(sitemap_path: Path, rules: list[dict], dry_run: bool = False) -> int: + """Remove <url> entries whose <loc> matches an exclude rule. Returns count removed. + + A rule matches when its 'match' substring appears anywhere in the URL's + <loc>. Used to drop Tabular Editor 2-only pages from the published sitemap. + """ + if not rules or not sitemap_path.exists(): + return 0 + + ET.register_namespace("", SITEMAP_NS) + tree = ET.parse(sitemap_path) + root = tree.getroot() + + matches = [m for r in rules if (m := r.get("match"))] + + removed = 0 + for url in list(root.findall(f"{{{SITEMAP_NS}}}url")): + loc_el = url.find(f"{{{SITEMAP_NS}}}loc") + if loc_el is None or not loc_el.text: + continue + if any(m in loc_el.text for m in matches): + root.remove(url) + removed += 1 + + if removed and not dry_run: + tree.write(sitemap_path, encoding="utf-8", xml_declaration=True) + + return removed + + +def apply_downrank(sitemap_path: Path, rules: list[dict], dry_run: bool = False) -> int: + """Lower <priority> for URLs matching downrank rules. Returns count changed. + + A rule matches when its 'match' substring appears anywhere in the URL's + <loc>. Rules are evaluated in order; the first match wins. 
+ """ + if not rules or not sitemap_path.exists(): + return 0 + + ET.register_namespace("", SITEMAP_NS) + tree = ET.parse(sitemap_path) + root = tree.getroot() + + changed = 0 + for url in root.findall(f"{{{SITEMAP_NS}}}url"): + loc_el = url.find(f"{{{SITEMAP_NS}}}loc") + if loc_el is None or not loc_el.text: + continue + loc = loc_el.text + for rule in rules: + if rule["match"] in loc: + priority_el = url.find(f"{{{SITEMAP_NS}}}priority") + if priority_el is None: + priority_el = ET.SubElement(url, f"{{{SITEMAP_NS}}}priority") + priority_el.text = f"{float(rule['priority']):.1f}" + changed += 1 + break + + if changed and not dry_run: + tree.write(sitemap_path, encoding="utf-8", xml_declaration=True) + + return changed + + +def build_index_xml(base_url: str, default_lang: str, site_dir: Path) -> tuple[str, bool]: + """Build the sitemap index XML referencing only the default language. + + Returns (xml_string, included) where included is True if the default + language sitemap exists. + """ + included = (site_dir / default_lang / "sitemap.xml").exists() + + lines = [ + '<?xml version="1.0" encoding="UTF-8"?>', + '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">', + ] + if included: + lines.append("  <sitemap>") + lines.append(f"    <loc>{base_url}/{default_lang}/sitemap.xml</loc>") + lines.append("  </sitemap>") + lines.append("</sitemapindex>") + return "\n".join(lines) + "\n", included + + +def build_robots_txt(base_url: str) -> str: + """Build a robots.txt that allows all crawlers and advertises the sitemap index.""" + return ( + "User-agent: *\n" + "Allow: /\n\n" + f"Sitemap: {base_url}/sitemap.xml\n" + ) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Post-process the English sitemap and generate the site-wide index" + ) + parser.add_argument( + "--site-dir", "-s", + default="_site", + help="Site output directory (default: _site)" + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Preview output without writing files" + ) + + args = parser.parse_args() + + site_dir = Path(args.site_dir) + if not site_dir.exists(): + print(f"Error: site directory 
{site_dir} does not exist") + return 1 + + base_url = get_base_url() + default_lang = get_default_language() + en_sitemap = site_dir / default_lang / "sitemap.xml" + + print(f"Base URL: {base_url}") + print(f"Default language: {default_lang}") + + # 1. Remove Tabular Editor 2-only pages from the English sitemap + exclude_rules = get_sitemap_exclude() + removed = apply_exclude(en_sitemap, exclude_rules, dry_run=args.dry_run) + print(f"Exclude rules: {len(exclude_rules)}; URLs removed: {removed}") + + # 2. Downrank API reference pages in the English sitemap + rules = get_sitemap_downrank() + changed = apply_downrank(en_sitemap, rules, dry_run=args.dry_run) + print(f"Downrank rules: {len(rules)}; URLs adjusted: {changed}") + + # 3. Build the English-only sitemap index + index_xml, included = build_index_xml(base_url, default_lang, site_dir) + if not included: + print(f"Warning: {en_sitemap} not found - sitemap index will be empty. " + "Check that build.sitemap is set in the English docfx.json.") + + # 4. 
robots.txt + robots_txt = build_robots_txt(base_url) + + if args.dry_run: + print("\n--- _site/sitemap.xml ---") + print(index_xml) + print("--- _site/robots.txt ---") + print(robots_txt) + return 0 + + (site_dir / "sitemap.xml").write_text(index_xml, encoding="utf-8") + (site_dir / "robots.txt").write_text(robots_txt, encoding="utf-8") + + print(f"\nGenerated: {site_dir / 'sitemap.xml'} (English only)") + print(f"Generated: {site_dir / 'robots.txt'}") + return 0 + + +if __name__ == "__main__": + exit(main()) diff --git a/docfx-template.json b/docfx-template.json index 7158659a3..c7513a0b5 100644 --- a/docfx-template.json +++ b/docfx-template.json @@ -49,6 +49,11 @@ "_disableContribution": true }, "markdownEngineName": "markdig", + "sitemap": { + "baseUrl": "https://docs.tabulareditor.com", + "changefreq": "weekly", + "priority": 0.5 + }, "dest": "_site", "xrefService": [ "https://xref.docs.microsoft.com/query?uid={uid}" ] } diff --git a/metadata/build-config.json b/metadata/build-config.json index aea98afd8..3cd1e63e4 100644 --- a/metadata/build-config.json +++ b/metadata/build-config.json @@ -1,6 +1,18 @@ { "_comment": "Shared build configuration for all build scripts. Edit this file to add/remove content directories.", "defaultLanguage": "en", + "baseUrl": "https://docs.tabulareditor.com", + "sitemap": { + "_comment": "Published sitemap covers the default language only. 'exclude' removes URLs whose path contains 'match' entirely. 'downrank' lowers <priority> for URLs whose path contains 'match' (first matching rule wins). 
Exclude is applied first.", + "exclude": [ + { "match": "/features/Command-line-Options.html" }, + { "match": "/getting-started/Getting-Started-te2.html" }, + { "match": "/references/user-settings-files-te2.html" } + ], + "downrank": [ + { "match": "/api/", "priority": 0.1 } + ] + }, "contentDirectories": { "_comment": "Directories that contain translatable content (markdown and HTML files)", "directories": [ @@ -12,7 +24,8 @@ "security", "troubleshooting", "tutorials", - "whats-new" + "whats-new", + "includes" ] }, "sharedDirectories": { From 924a094932f91b5472fa96bcc937d9d376387e8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Morten=20L=C3=B8nskov?= Date: Thu, 11 Jun 2026 13:50:20 +0200 Subject: [PATCH 2/6] remove includes from build config --- metadata/build-config.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metadata/build-config.json b/metadata/build-config.json index 3cd1e63e4..77456735f 100644 --- a/metadata/build-config.json +++ b/metadata/build-config.json @@ -24,8 +24,7 @@ "security", "troubleshooting", "tutorials", - "whats-new", - "includes" + "whats-new" ] }, "sharedDirectories": { From 3dc0fdad8dd7e013067cc0b4f2cdc7291d69313c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Morten=20L=C3=B8nskov?= Date: Fri, 19 Jun 2026 14:01:49 +0200 Subject: [PATCH 3/6] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- build-docs.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build-docs.py b/build-docs.py index 05b0f8b26..b4edadab5 100644 --- a/build-docs.py +++ b/build-docs.py @@ -330,9 +330,9 @@ def main() -> int: "Generating staticwebapp.config.json" ) - # Generate site-wide sitemap index (references each language's sitemap.xml) - # and robots.txt. Runs last so all per-language sitemaps already exist. + # Generate site-wide sitemap index (references the default language's sitemap.xml) + # and robots.txt. 
Runs last so the default language sitemap already exists. run_command( [sys.executable, "build_scripts/gen_sitemap_index.py"], "Generating sitemap index and robots.txt" ) From 107f853667a2278ed9f36d0d2822dd4f2948d00f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Morten=20L=C3=B8nskov?= Date: Fri, 19 Jun 2026 14:02:22 +0200 Subject: [PATCH 4/6] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- build_scripts/gen_redirects.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/build_scripts/gen_redirects.py b/build_scripts/gen_redirects.py index 1645abb00..d47f55d9c 100644 --- a/build_scripts/gen_redirects.py +++ b/build_scripts/gen_redirects.py @@ -119,10 +119,9 @@ def generate_redirects_config(template: dict) -> dict: # Set English output destination (relative to localizedContent/en/) config["build"]["dest"] = "../../_site/en" - # Point the sitemap at the default language's URL prefix + # Point the sitemap at the English URL prefix if "sitemap" in config["build"]: - config["build"]["sitemap"]["baseUrl"] = f"{get_base_url()}/{get_default_language()}" - + config["build"]["sitemap"]["baseUrl"] = f"{get_base_url()}/en" # Update template paths - need to go up two levels to reach project root if "template" in config["build"]: new_templates = [] From ee314a6bbe69c4df797f4ee2d009c97e87240cc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Morten=20L=C3=B8nskov?= Date: Fri, 19 Jun 2026 14:02:52 +0200 Subject: [PATCH 5/6] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- build_scripts/gen_sitemap_index.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/build_scripts/gen_sitemap_index.py b/build_scripts/gen_sitemap_index.py index c438e4bc4..a1d70d8d6 100644 --- a/build_scripts/gen_sitemap_index.py +++ b/build_scripts/gen_sitemap_index.py @@ -87,11 +87,15 @@ def apply_downrank(sitemap_path: 
Path, rules: list[dict], dry_run: bool = False) continue loc = loc_el.text for rule in rules: - if rule["match"] in loc: + match = rule.get("match") + priority = rule.get("priority") + if not match or priority is None: + continue + if match in loc: priority_el = url.find(f"{{{SITEMAP_NS}}}priority") if priority_el is None: priority_el = ET.SubElement(url, f"{{{SITEMAP_NS}}}priority") - priority_el.text = f"{float(rule['priority']):.1f}" + priority_el.text = f"{float(priority):.1f}" changed += 1 break From 7980fac4e774f34228ec4b30ed3e80283c4411a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Morten=20L=C3=B8nskov?= Date: Fri, 19 Jun 2026 14:03:31 +0200 Subject: [PATCH 6/6] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- build_scripts/gen_sitemap_index.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/build_scripts/gen_sitemap_index.py b/build_scripts/gen_sitemap_index.py index a1d70d8d6..27bc26a1f 100644 --- a/build_scripts/gen_sitemap_index.py +++ b/build_scripts/gen_sitemap_index.py @@ -198,4 +198,6 @@ def main() -> int: if __name__ == "__main__": - exit(main()) + import sys + + sys.exit(main())