Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 181 additions & 6 deletions generate-railroad.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,78 @@ const { spawnSync } = require("node:child_process");
const path = require("node:path");

// Customization section
const DEFAULT_INPUT_ABNF = "grammar/jsonc.abnf";
const DEFAULT_INPUT_ABNF = "grammar/JSONC.abnf";
const DEFAULT_PROCESSED_ABNF = "grammar/jsonc-processed.abnf";
const DEFAULT_OUTPUT_HTML = "grammar/railroad-diagram.html";
const FORCED_HTML_HEADER = "JSONC GRAMMAR";

// Rules to inline from their %x... definitions as literal ABNF strings.
// Add more rule names here to apply the same transformation.
const INLINE_HEX_RULES = [
"multi-line-comment-start",
"multi-line-comment-end",
"asterisk",
"escape"
"escape",
"single-line-comment-start",
"quotation-mark",
"decimal-point",
"minus",
"plus",
"zero",
];

// Inline selected rule references as quoted literals in specific target rules.
//
// For each entry, processAbnfSource() rewrites the definition line of
// `targetRule` so that every reference to a rule listed in `referencedRules`
// becomes a double-quoted string holding that rule's decoded text, and then
// removes the referenced rules' own definitions from the processed ABNF.
//
// Each referenced rule has to be defined as a %x hex sequence, because its
// text is obtained through getHexRuleLiteral(); an Error is thrown when the
// target rule or a referenced rule cannot be matched.
//
// Add more mappings here to reuse this transformation pattern.
const INLINE_LITERAL_REFS = [
{
// Rule whose definition line is rewritten.
targetRule: "value",
// Rules replaced by their literal text inside `targetRule`, then removed.
referencedRules: ["false", "true", "null"],
},
];

// Move selected rule definitions after another rule in the processed ABNF.
//
// Entries are applied in array order by repositionRulesAfter(): the block of
// `ruleName` (its definition line plus any following lines that start with
// whitespace) is cut out and re-inserted directly after the block of
// `afterRule`. Because each move operates on the result of the previous ones,
// an entry may anchor on a rule that an earlier entry has just moved
// (e.g. "end-array" after "begin-array").
//
// Both rules of an entry must exist in the ABNF, otherwise an Error is thrown.
//
// Add more entries here to control rule ordering in generated output.
const REPOSITION_RULES_AFTER = [
// Array delimiters directly after the rule that uses them.
{
ruleName: "begin-array",
afterRule: "array",
},
{
ruleName: "end-array",
afterRule: "begin-array",
},
// Object delimiters directly after the rule that uses them.
{
ruleName: "begin-object",
afterRule: "object",
},
{
ruleName: "end-object",
afterRule: "begin-object",
},
// Separators after "member" and "value" respectively.
{
ruleName: "name-separator",
afterRule: "member",
},
{
ruleName: "value-separator",
afterRule: "value",
},
// Digit-related rules chained one after another, starting after "unescaped".
{
ruleName: "digit",
afterRule: "unescaped",
},
{
ruleName: "digit1-9",
afterRule: "digit",
},
{
ruleName: "hexdigit",
afterRule: "digit1-9",
},
{
ruleName: "four-hexdigits",
afterRule: "hexdigit",
}
];

function escapeRegExp(value) {
Expand All @@ -36,6 +97,20 @@ function decodeAbnfHexSequence(value) {
return String.fromCodePoint(...bytes);
}

/**
 * Returns the text spelled out by a rule that is defined as a single ABNF
 * hex sequence (e.g. `false = %x66.61.6C.73.65 ; false` yields "false").
 *
 * The definition line must consist of exactly one `%x..` sequence, optionally
 * followed by a `;` comment. Value ranges (`%x30-39`), alternations
 * (`%x65 / %x45`) and concatenations are rejected instead of being decoded
 * from their first element only, which would silently yield a wrong literal.
 * Only the definition line itself is inspected; continuation lines are not.
 *
 * @param {string} source - Full ABNF source text.
 * @param {string} ruleName - Name of the rule to look up.
 * @returns {string} The decoded characters of the hex sequence.
 * @throws {Error} If the rule is not defined, or if its definition is not a
 *   single `%x` hex sequence.
 */
function getHexRuleLiteral(source, ruleName) {
  const escapedRuleName = escapeRegExp(ruleName);

  // Step 1: locate the definition line and capture everything after "=".
  const definitionRegex = new RegExp(
    `^[ \\t]*${escapedRuleName}[ \\t]*=[ \\t]*(.*)$`,
    "m",
  );
  const definitionMatch = source.match(definitionRegex);
  if (!definitionMatch) {
    throw new Error(`Rule ${ruleName} was not found.`);
  }

  // Step 2: the right-hand side must be one hex sequence and nothing else
  // (apart from trailing blanks and an optional comment).
  const literalMatch = definitionMatch[1].match(
    /^(%x[0-9A-Fa-f]+(?:\.[0-9A-Fa-f]+)*)[ \t]*(?:;.*)?$/,
  );
  if (!literalMatch) {
    throw new Error(`Rule ${ruleName} is not defined as a single %x hex sequence.`);
  }

  return decodeAbnfHexSequence(literalMatch[1]);
}

function inlineHexRuleAsLiteral(source, ruleName) {
const escapedRuleName = escapeRegExp(ruleName);
const ruleRegex = new RegExp(
Expand All @@ -50,10 +125,10 @@ function inlineHexRuleAsLiteral(source, ruleName) {
const hexSequence = ruleMatch[1];
const literalChars = decodeAbnfHexSequence(hexSequence);

// For backslash or other problematic characters, keep them as hex format
// ABNF doesn't support backslash escaping in quoted strings
// Keep hex format for characters that cannot be represented safely
// as a single ABNF quoted string literal.
let replacement;
if (literalChars === "\\") {
if (literalChars === "\\" || literalChars === '"') {
replacement = hexSequence;
} else {
// For other characters, escape only double quotes (not backslashes)
Expand Down Expand Up @@ -90,16 +165,105 @@ function inlineHexRuleAsLiteral(source, ruleName) {
.join("\n");
}

/**
 * Replaces references to hex-defined rules inside one target rule with quoted
 * literals (e.g. `value = true / false` becomes `value = "true" / "false"`).
 *
 * Only the definition line of the target rule is rewritten. A reference is
 * matched as a whole rule name, i.e. not when it is directly preceded or
 * followed by a letter, digit or hyphen.
 *
 * @param {string} source - Full ABNF source text.
 * @param {string} targetRule - Name of the rule whose definition is rewritten.
 * @param {string[]} referencedRules - Names of the rules to inline as literals.
 * @returns {string} The ABNF source with the target rule rewritten.
 * @throws {Error} If the target rule is not found; errors raised by
 *   getHexRuleLiteral() for a referenced rule propagate unchanged.
 */
function inlineLiteralRefsInTargetRule(source, targetRule, referencedRules) {
  const escapedTargetRule = escapeRegExp(targetRule);
  const targetRuleRegex = new RegExp(`^(\\s*${escapedTargetRule}\\s*=\\s*)(.*)$`, "m");
  const match = source.match(targetRuleRegex);
  if (!match) {
    throw new Error(`Rule ${targetRule} was not found.`);
  }

  const [, targetRulePrefix, targetRuleRhs] = match;

  let updatedRhs = targetRuleRhs;
  for (const referencedRule of referencedRules) {
    const literalValue = getHexRuleLiteral(source, referencedRule);
    const replacementLiteral = `"${literalValue.replace(/"/g, '\\"')}"`;
    const referencedRuleRegex = new RegExp(
      `(?<![A-Za-z0-9-])${escapeRegExp(referencedRule)}(?![A-Za-z0-9-])`,
      "g",
    );
    // Function replacer: the literal is inserted verbatim, so "$&", "$1" or
    // "$$" inside it are not interpreted as replacement patterns.
    updatedRhs = updatedRhs.replace(referencedRuleRegex, () => replacementLiteral);
  }

  // Same reason as above: the rewritten line may legitimately contain "$".
  return source.replace(targetRuleRegex, () => `${targetRulePrefix}${updatedRhs}`);
}

/**
 * Removes the definitions of the given rules from ABNF source text.
 *
 * A definition is the line on which `name =` appears plus every directly
 * following continuation line (a line that starts with a space or tab and
 * has content). Dropping the continuation lines as well keeps them from
 * being left behind and attaching to the preceding rule.
 *
 * Line endings in the result are normalized to "\n".
 *
 * @param {string} source - Full ABNF source text.
 * @param {Iterable<string>} ruleNames - Names of the rules to remove.
 * @returns {string} The ABNF source without the listed rule definitions.
 */
function removeRuleDefinitions(source, ruleNames) {
  const removalSet = new Set(ruleNames);
  const ruleStartRegex = /^\s*([A-Za-z][A-Za-z0-9-]*)\s*=/;
  const continuationRegex = /^[ \t]+\S/;

  // True while the lines being visited belong to a rule that is being removed.
  let insideRemovedRule = false;

  return source
    .split(/\r?\n/)
    .filter((line) => {
      const match = line.match(ruleStartRegex);
      if (match) {
        insideRemovedRule = removalSet.has(match[1]);
        return !insideRemovedRule;
      }

      if (insideRemovedRule && continuationRegex.test(line)) {
        return false;
      }

      // Blank lines, column-0 comments and continuations of kept rules stay.
      insideRemovedRule = false;
      return true;
    })
    .join("\n");
}

/**
 * Locates the block of lines that makes up one rule definition.
 *
 * The block starts at the first line matching `<ruleName> =` (leading
 * whitespace allowed) and extends over every directly following line that
 * begins with a whitespace character.
 *
 * @param {string[]} lines - ABNF source split into lines.
 * @param {string} ruleName - Name of the rule to locate.
 * @returns {{startIndex: number, endIndex: number, blockLines: string[]}}
 *   Start index (inclusive), end index (exclusive) and a copy of the lines.
 * @throws {Error} If no line defines the rule.
 */
function findRuleBlock(lines, ruleName) {
  const definitionPattern = new RegExp(`^\\s*${escapeRegExp(ruleName)}\\s*=`);
  const startIndex = lines.findIndex((candidate) => definitionPattern.test(candidate));
  if (startIndex < 0) {
    throw new Error(`Rule ${ruleName} was not found.`);
  }

  // The block ends at the first following line that does not start with
  // whitespace, or at the end of the input when there is no such line.
  const bodyStart = startIndex + 1;
  const offsetPastBlock = lines
    .slice(bodyStart)
    .findIndex((candidate) => !/^\s/.test(candidate));
  const endIndex = offsetPastBlock < 0 ? lines.length : bodyStart + offsetPastBlock;

  const blockLines = lines.slice(startIndex, endIndex);
  return { startIndex, endIndex, blockLines };
}

/**
 * Moves rule definitions so that each one directly follows a chosen rule.
 *
 * Reorderings are applied one after another on the same working copy, so a
 * later entry sees the positions produced by the earlier ones. Line endings
 * in the result are normalized to "\n".
 *
 * @param {string} source - Full ABNF source text.
 * @param {Array<{ruleName: string, afterRule: string}>} reorderings -
 *   Moves to perform, in order.
 * @returns {string} The ABNF source with the rules repositioned.
 * @throws {Error} Propagated from findRuleBlock() when a rule is missing.
 */
function repositionRulesAfter(source, reorderings) {
  const workingLines = source.split(/\r?\n/);

  for (const reordering of reorderings) {
    const { ruleName, afterRule } = reordering;

    // Cut the rule out first; the anchor is looked up afterwards so its
    // indices already reflect the removal.
    const moved = findRuleBlock(workingLines, ruleName);
    const movedLength = moved.endIndex - moved.startIndex;
    workingLines.splice(moved.startIndex, movedLength);

    const anchor = findRuleBlock(workingLines, afterRule);
    workingLines.splice(anchor.endIndex, 0, ...moved.blockLines);
  }

  return workingLines.join("\n");
}

/**
 * Runs the configured ABNF transformations over the grammar source.
 *
 * Stages, in order:
 *   1. inline every rule listed in INLINE_HEX_RULES,
 *   2. for every INLINE_LITERAL_REFS entry, inline the referenced rules into
 *      the target rule and then remove the referenced rules' definitions,
 *   3. reorder rules according to REPOSITION_RULES_AFTER.
 *
 * @param {string} source - Original ABNF source text.
 * @returns {string} The processed ABNF source text.
 */
function processAbnfSource(source) {
  const withHexInlined = INLINE_HEX_RULES.reduce(
    (abnf, ruleName) => inlineHexRuleAsLiteral(abnf, ruleName),
    source,
  );

  const withLiteralsInlined = INLINE_LITERAL_REFS.reduce((abnf, mapping) => {
    const { targetRule, referencedRules } = mapping;
    const inlined = inlineLiteralRefsInTargetRule(abnf, targetRule, referencedRules);
    return removeRuleDefinitions(inlined, referencedRules);
  }, withHexInlined);

  return repositionRulesAfter(withLiteralsInlined, REPOSITION_RULES_AFTER);
}

/**
 * Forces the first `<h1>` heading of the generated HTML file to
 * FORCED_HTML_HEADER.
 *
 * Only the first attribute-less `<h1>…</h1>` whose content has no nested tag
 * is replaced. The file is rewritten only when the replacement changed it.
 *
 * @param {string} htmlPath - Path of the HTML file to patch in place.
 * @returns {void}
 * @throws {Error} Any error raised by the underlying file reads or writes.
 */
function postProcessGeneratedHtml(htmlPath) {
  const originalHtml = fs.readFileSync(htmlPath, "utf8");
  const forcedHeading = `<h1>${FORCED_HTML_HEADER}</h1>`;
  const patchedHtml = originalHtml.replace(/<h1>[^<]*<\/h1>/, forcedHeading);

  // Nothing matched (or the heading was already correct): leave the file alone.
  if (patchedHtml === originalHtml) {
    return;
  }

  fs.writeFileSync(htmlPath, patchedHtml, "utf8");
}

const args = process.argv.slice(2);
const titleIndex = args.indexOf("--title");

Expand Down Expand Up @@ -173,4 +337,15 @@ if (result.error) {
process.exit(1);
}

process.exit(result.status === null ? 1 : result.status);
if (result.status !== 0) {
process.exit(result.status === null ? 1 : result.status);
}

try {
postProcessGeneratedHtml(outputPath);
} catch (error) {
console.error(`Failed to post-process generated HTML: ${error.message}`);
process.exit(1);
}

process.exit(0);
55 changes: 26 additions & 29 deletions grammar/jsonc.abnf → grammar/JSONC.abnf
Original file line number Diff line number Diff line change
@@ -1,38 +1,29 @@
; JSONC grammar with comments support (RFC 8259 extended with JavaScript-style comments)
;
; Notes:
; - Rule names and structure follow RFC 8259 ABNF snippets.
; - DIGIT and HEXDIG are core rules from RFC 5234.
; - comments are an extension not in RFC 8259.
; - Rule names and structure follow RFC 8259 ABNF.
; - Comments are an extension not in RFC 8259.
; - Trailing commas are NOT supported in this grammar.

; A JSONC-text is a serialized value surrounded by optional whitespace and comments.
; Comments can appear anywhere insignificant whitespace is allowed in JSON.
JSONC-text = wsc value wsc

; Whitespace with Comments: zero or more whitespace characters or comments
wsc = *(ws-char / comment)
wsc = *(ws-char / comment) ; Whitespace and/or comments

; Single whitespace character (space, tab, line feed, carriage return)
ws-char = %x20 / %x09 / %x0A / %x0D ; space / tab / LF / CR

; Comments: single-line or multi-line
comment = single-line-comment / multi-line-comment

; Source character: any Unicode code point, as per ECMAScript.
source-character = %x00-10FFFF

; Comment terminators and sequences (based on ECMAScript line terminators)
comment-terminator = %x0A / %x0D / %x2028 / %x2029 ; LF / CR / LS / PS
comment-terminator-sequence = %x0D.0A / %x0A / %x0D / %x2028 / %x2029

; Single-line comment: starts with //, continues until line ending
; Terminator is not part of the comment body.
; Note that the single-line-comment-end is optional, allowing comments to end at the end of the file without a line terminator.
single-line-comment-start = %x2F.2F ; // double solidus
single-line-comment-end = comment-terminator-sequence
single-line-comment-end = %x0D.0A / %x0A / %x0D
single-line-comment = single-line-comment-start *single-line-comment-char [ single-line-comment-end ]
single-line-comment-char = %x00-09 / %x0B-0C / %x0E-2027 / %x202A-10FFFF ; Any source character except comment terminators
single-line-comment-char = %x00-09 / %x0B-0C / %x0E-10FFFF ; Any source character except CR and LF (line terminator)

; Multi-line comment: /* ... */
; Cannot be nested. The first */ closes the comment.
Expand All @@ -58,7 +49,7 @@ name-separator = wsc %x3A wsc ; : colon
value-separator = wsc %x2C wsc ; , comma

; Any JSON value
value = false / null / true / object / array / number / string
value = object / array / number / string / true / false / null

; Literal names (boolean values and null)
false = %x66.61.6C.73.65 ; false
Expand All @@ -73,31 +64,37 @@ member = string name-separator value
array = begin-array [ value *( value-separator value ) ] end-array

; Numbers
number = [ minus ] int [ frac ] [ exp ]
number = [ minus ] ( zero / ( digit1-9 *digit ) ) [ decimal-point 1*digit ] [ ( %x65 / %x45 ) [ minus / plus ] 1*digit ]
decimal-point = %x2E ; .
digit = %x30-39 ; 0-9
digit1-9 = %x31-39 ; 1-9
e = %x65 / %x45 ; e E
exp = e [ minus / plus ] 1*DIGIT
frac = decimal-point 1*DIGIT
int = zero / ( digit1-9 *DIGIT )

minus = %x2D ; -
plus = %x2B ; +
zero = %x30 ; 0
hexdigit = digit /
%x41 / %x61 / ; A a
%x42 / %x62 / ; B b
%x43 / %x63 / ; C c
%x44 / %x64 / ; D d
%x45 / %x65 / ; E e
%x46 / %x66 ; F f
four-hexdigits = 4hexdigit

; Strings
string = quotation-mark *char quotation-mark

char = unescaped /
escape (
%x22 / ; " quotation mark U+0022
%x5C / ; \ reverse solidus U+005C
%x2F / ; / solidus U+002F
%x62 / ; b backspace U+0008
%x66 / ; f form feed U+000C
%x6E / ; n line feed U+000A
%x72 / ; r carriage return U+000D
%x74 / ; t tab U+0009
%x75 4HEXDIG ; uXXXX U+XXXX
%x22 / ; " quotation mark U+0022
%x5C / ; \ reverse solidus U+005C
%x2F / ; / solidus U+002F
%x62 / ; b backspace U+0008
%x66 / ; f form feed U+000C
%x6E / ; n line feed U+000A
%x72 / ; r carriage return U+000D
%x74 / ; t tab U+0009
%x75 four-hexdigits ; uXXXX U+XXXX
)

escape = %x5C ; \
Expand Down
8 changes: 4 additions & 4 deletions grammar/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ This directory contains the ABNF grammar for JSONC, along with plans for generat

## Railroad Diagram Generation Plan

Generate railroad diagrams from `grammar/jsonc.abnf` using a simple one-file Node.js script.
Generate railroad diagrams from `grammar/JSONC.abnf` using a simple one-file Node.js script.

Instead of building a custom ABNF parser and converter to Tab Atkins constructor calls, use:

Expand All @@ -18,7 +18,7 @@ The wrapper script should:

1. Accept input ABNF path and optional output HTML path.
2. Default to:
- input: `grammar/jsonc.abnf`
- input: `grammar/JSONC.abnf`
- output: `grammar/railroad-diagram.html`
3. Optionally accept `--title` to set the HTML title.
4. Execute the upstream CLI from our installed dependency.
Expand Down Expand Up @@ -53,13 +53,13 @@ npm run railroad
Generate from a specific input and output:

```bash
npm run railroad -- grammar/jsonc.abnf grammar/railroad-diagram.html
npm run railroad -- grammar/JSONC.abnf grammar/railroad-diagram.html
```

Generate with a custom title:

```bash
npm run railroad -- grammar/jsonc.abnf grammar/railroad-diagram.html --title "JSONC Grammar"
npm run railroad -- grammar/JSONC.abnf grammar/railroad-diagram.html --title "JSONC Grammar"
```

### Notes on EOF for single-line comments
Expand Down
Loading