diff --git a/generate-railroad.js b/generate-railroad.js index 9ba7b55..448b145 100644 --- a/generate-railroad.js +++ b/generate-railroad.js @@ -5,9 +5,10 @@ const { spawnSync } = require("node:child_process"); const path = require("node:path"); // Customization section -const DEFAULT_INPUT_ABNF = "grammar/jsonc.abnf"; +const DEFAULT_INPUT_ABNF = "grammar/JSONC.abnf"; const DEFAULT_PROCESSED_ABNF = "grammar/jsonc-processed.abnf"; const DEFAULT_OUTPUT_HTML = "grammar/railroad-diagram.html"; +const FORCED_HTML_HEADER = "JSONC GRAMMAR"; // Rules to inline from their %x... definitions as literal ABNF strings. // Add more rule names here to apply the same transformation. @@ -15,7 +16,67 @@ const INLINE_HEX_RULES = [ "multi-line-comment-start", "multi-line-comment-end", "asterisk", - "escape" + "escape", + "single-line-comment-start", + "quotation-mark", + "decimal-point", + "minus", + "plus", + "zero", +]; + +// Inline selected rule references as quoted literals in specific target rules. +// Add more mappings here to reuse this transformation pattern. +const INLINE_LITERAL_REFS = [ + { + targetRule: "value", + referencedRules: ["false", "true", "null"], + }, +]; + +// Move selected rule definitions after another rule in the processed ABNF. +// Add more entries here to control rule ordering in generated output. +const REPOSITION_RULES_AFTER = [ + { + ruleName: "begin-array", + afterRule: "array", + }, + { + ruleName: "end-array", + afterRule: "begin-array", + }, + { + ruleName: "begin-object", + afterRule: "object", + }, + { + ruleName: "end-object", + afterRule: "begin-object", + }, + { + ruleName: "name-separator", + afterRule: "member", + }, + { + ruleName: "value-separator", + afterRule: "value", + }, + { + ruleName: "digit", + afterRule: "unescaped", + }, + { + ruleName: "digit1-9", + afterRule: "digit", + }, + { + ruleName: "hexdigit", + afterRule: "digit1-9", + }, + { + ruleName: "four-hexdigits", + afterRule: "hexdigit", + } ]; function escapeRegExp(value) { @@ -36,6 +97,20 @@ function decodeAbnfHexSequence(value) { return String.fromCodePoint(...bytes); } +function getHexRuleLiteral(source, ruleName) { + const escapedRuleName = escapeRegExp(ruleName); + const ruleRegex = new RegExp( + `^\\s*${escapedRuleName}\\s*=\\s*(%x[0-9A-Fa-f]+(?:\\.[0-9A-Fa-f]+)*)\\b.*$`, + "m", + ); + const ruleMatch = source.match(ruleRegex); + if (!ruleMatch) { + throw new Error(`Rule ${ruleName} was not found.`); + } + + return decodeAbnfHexSequence(ruleMatch[1]); +} + function inlineHexRuleAsLiteral(source, ruleName) { const escapedRuleName = escapeRegExp(ruleName); const ruleRegex = new RegExp( @@ -50,10 +125,10 @@ function inlineHexRuleAsLiteral(source, ruleName) { const hexSequence = ruleMatch[1]; const literalChars = decodeAbnfHexSequence(hexSequence); - // For backslash or other problematic characters, keep them as hex format - // ABNF doesn't support backslash escaping in quoted strings + // Keep hex format for characters that cannot be represented safely + // as a single ABNF quoted string literal. let replacement; - if (literalChars === "\\") { + if (literalChars === "\\" || literalChars === '"') { replacement = hexSequence; } else { // For other characters, escape only double quotes (not backslashes) @@ -90,6 +165,79 @@ function inlineHexRuleAsLiteral(source, ruleName) { .join("\n"); } +function inlineLiteralRefsInTargetRule(source, targetRule, referencedRules) { + const escapedTargetRule = escapeRegExp(targetRule); + const targetRuleRegex = new RegExp(`^(\\s*${escapedTargetRule}\\s*=\\s*)(.*)$`, "m"); + const match = source.match(targetRuleRegex); + if (!match) { + throw new Error(`Rule ${targetRule} was not found.`); + } + + const targetRulePrefix = match[1]; + const targetRuleRhs = match[2]; + + let updatedRhs = targetRuleRhs; + for (const referencedRule of referencedRules) { + const literalValue = getHexRuleLiteral(source, referencedRule); + const replacementLiteral = `"${literalValue.replace(/"/g, '\\"')}"`; + const referencedRuleRegex = new RegExp( + `(? { + const match = line.match(/^\s*([A-Za-z][A-Za-z0-9-]*)\s*=/); + if (!match) { + return true; + } + return !removalSet.has(match[1]); + }) + .join("\n"); +} + +function findRuleBlock(lines, ruleName) { + const ruleStartRegex = new RegExp(`^\\s*${escapeRegExp(ruleName)}\\s*=`); + const startIndex = lines.findIndex((line) => ruleStartRegex.test(line)); + if (startIndex === -1) { + throw new Error(`Rule ${ruleName} was not found.`); + } + + let endIndex = startIndex + 1; + while (endIndex < lines.length && /^\s/.test(lines[endIndex])) { + endIndex += 1; + } + + return { + startIndex, + endIndex, + blockLines: lines.slice(startIndex, endIndex), + }; +} + +function repositionRulesAfter(source, reorderings) { + let lines = source.split(/\r?\n/); + + for (const { ruleName, afterRule } of reorderings) { + const ruleBlock = findRuleBlock(lines, ruleName); + lines.splice(ruleBlock.startIndex, ruleBlock.endIndex - ruleBlock.startIndex); + + const afterRuleBlock = findRuleBlock(lines, afterRule); + lines.splice(afterRuleBlock.endIndex, 0, ...ruleBlock.blockLines); + } + + return lines.join("\n"); +} + function processAbnfSource(source) { let processed = source; @@ -97,9 +245,25 @@ function processAbnfSource(source) { processed = inlineHexRuleAsLiteral(processed, ruleName); } + for (const { targetRule, referencedRules } of INLINE_LITERAL_REFS) { + processed = inlineLiteralRefsInTargetRule(processed, targetRule, referencedRules); + processed = removeRuleDefinitions(processed, referencedRules); + } + + processed = repositionRulesAfter(processed, REPOSITION_RULES_AFTER); + return processed; } +function postProcessGeneratedHtml(htmlPath) { + const html = fs.readFileSync(htmlPath, "utf8"); + const updated = html.replace(/
wsc := *(ws-char / comment)
+ wsc := *(ws-char / comment) ; White space and comments
source-character := %x00-10FFFF
- comment-terminator := %x0A / %x0D / %x2028 / %x2029 ; LF / CR / LS / PS
- comment-terminator-sequence := %x0D.0A / %x0A / %x0D / %x2028 / %x2029
- single-line-comment-start := %x2F.2F ; // double solidus
- single-line-comment-end := comment-terminator-sequence
- single-line-comment-end := %x0D.0A / %x0A / %x0D
+ single-line-comment := single-line-comment-start *single-line-comment-char [ single-line-comment-end ]
- single-line-comment := "//" *single-line-comment-char [ single-line-comment-end ]
+ single-line-comment-char := %x00-09 / %x0B-0C / %x0E-2027 / %x202A-10FFFF ; Any source character except comment terminators
- single-line-comment-char := %x00-09 / %x0B-0C / %x0E-10FFFF ; Any source character except CR and LF (line terminator)
+ begin-array := wsc %x5B wsc ; [ left square bracket
- value := object / array / number / string / "true" / "false" / "null"
+ value-separator := wsc %x2C wsc ; , comma
+ object := begin-object [ member *( value-separator member ) ] end-object
+ end-array := wsc %x5D wsc ; ] right square bracket
- name-separator := wsc %x3A wsc ; : colon
- value-separator := wsc %x2C wsc ; , comma
- value := false / null / true / object / array / number / string
- false := %x66.61.6C.73.65 ; false
- true := %x74.72.75.65 ; true
- null := %x6E.75.6C.6C ; null
- object := begin-object [ member *( value-separator member ) ] end-object
- name-separator := wsc %x3A wsc ; : colon
+ number := [ minus ] int [ frac ] [ exp ]
- decimal-point := %x2E ; .
- digit1-9 := %x31-39 ; 1-9
- e := %x65 / %x45 ; e E
- exp := e [ minus / plus ] 1*DIGIT
- frac := decimal-point 1*DIGIT
- int := zero / ( digit1-9 *DIGIT )
- minus := %x2D ; -
- begin-array := wsc %x5B wsc ; [ left square bracket
+ plus := %x2B ; +
- end-array := wsc %x5D wsc ; ] right square bracket
+ zero := %x30 ; 0
- number := [ "-" ] ( "0" / ( digit1-9 *digit ) ) [ "." 1*digit ] [ ( %x65 / %x45 ) [ "-" / "+" ] 1*digit ]
+ string := quotation-mark *char quotation-mark
- string := %x22 *char %x22
+ char := unescaped /
%x5C (
- %x22 / ; " quotation mark U+0022
- %x5C / ; \ reverse solidus U+005C
- %x2F / ; / solidus U+002F
- %x62 / ; b backspace U+0008
- %x66 / ; f form feed U+000C
- %x6E / ; n line feed U+000A
- %x72 / ; r carriage return U+000D
- %x74 / ; t tab U+0009
- %x75 4HEXDIG ; uXXXX U+XXXX
+ %x22 / ; " quotation mark U+0022
+ %x5C / ; \ reverse solidus U+005C
+ %x2F / ; / solidus U+002F
+ %x62 / ; b backspace U+0008
+ %x66 / ; f form feed U+000C
+ %x6E / ; n line feed U+000A
+ %x72 / ; r carriage return U+000D
+ %x74 / ; t tab U+0009
+ %x75 four-hexdigits ; uXXXX U+XXXX
)
- quotation-mark := %x22 ; "
- digit := %x30-39 ; 0-9
+ digit1-9 := %x31-39 ; 1-9
+ hexdigit := digit /
+ %x41 / %x61 / ; A a
+ %x42 / %x62 / ; B b
+ %x43 / %x63 / ; C c
+ %x44 / %x64 / ; D d
+ %x45 / %x65 / ; E e
+ %x46 / %x66 ; F f
+ four-hexdigits := 4hexdigit
+