Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/commands/query.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { Cli, z } from 'incur'
import { createClient, requireApiKey } from '../lib/client'
import { stripTrailingFormatClause } from '../lib/sql'

export const query = Cli.create('query', {
description: 'SQL analytics query commands',
Expand All @@ -8,7 +9,10 @@ export const query = Cli.create('query', {
export function queryRunRun(sql: string) {
requireApiKey()
const client = createClient()
return client.post('/v0/query/', { query: sql })
// The API wraps the query in a paginating subquery with its own
// `FORMAT JSON`. ClickHouse forbids a `FORMAT` clause inside a subquery, so
// strip any trailing `FORMAT`/semicolon before sending to avoid a 400.
return client.post('/v0/query/', { query: stripTrailingFormatClause(sql) })
}

query.command('run', {
Expand Down
159 changes: 159 additions & 0 deletions src/lib/sql.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/**
* SQL helpers for the query command.
*
* The Formo query API wraps the SQL you submit so it can paginate the result
* set and force a machine-readable response:
*
* SELECT * FROM (<your query>) LIMIT <n> FORMAT JSON
*
* ClickHouse does not allow a `FORMAT` clause inside a subquery, so if your
* query ends in its own `FORMAT` clause the wrapped statement becomes:
*
* SELECT * FROM (SELECT ... FORMAT CSV) LIMIT 100 FORMAT JSON
*
* which ClickHouse rejects with a 400. A trailing `FORMAT` (or a trailing
* semicolon) can never take effect through this endpoint anyway — the outer
* `FORMAT JSON` always wins, and output shaping is the CLI's `--format` job —
* so we remove it before sending and let the server wrap a clean query.
*/

/**
* Strip a trailing, top-level `FORMAT <name>` clause and any trailing
* semicolons from a SQL statement.
*
* The scan is aware of string literals, quoted identifiers, and comments, so
* `FORMAT`-looking text inside them is never mistaken for a real clause. The
* match is anchored to the end of the statement, so a `FORMAT` nested inside
* parentheses (a subquery) or part of an identifier/function such as
* `formatDateTime(...)` — or a column aliased `format` — is left untouched.
*
* Returns the original input unchanged when there is nothing to strip.
*/
export function stripTrailingFormatClause(sql: string): string {
if (!sql) return sql

const masked = maskLiterals(sql)
// A genuine trailing FORMAT clause: the keyword `format` preceded by a word
// boundary (so `formatDateTime` or a column aliased `format` is safe),
// followed by exactly one identifier (the format name) and nothing else.
const formatClause = /(^|[^A-Za-z0-9_])format\s+[A-Za-z_][A-Za-z0-9_]*\s*$/i

let end = sql.length
let didStrip = false
// Peel top-level semicolons and a trailing FORMAT clause repeatedly so any
// ordering collapses to the bare query, e.g. `... FORMAT CSV;`,
// `...; FORMAT CSV`, or `... FORMAT CSV ;`. Trailing whitespace and comments
// are only ever skipped to *look* past them — never removed on their own.
for (;;) {
let e = end
while (e > 0 && /\s/.test(masked[e - 1])) e--
if (e === 0) break

if (masked[e - 1] === ';') {
end = e - 1
didStrip = true
continue
}

const match = formatClause.exec(masked.slice(0, e))
if (match) {
// Cut at the `format` keyword, after the leading word-boundary char.
end = match.index + match[1].length
didStrip = true
continue
}

break
}

if (!didStrip) return sql

// Tidy the real whitespace now left dangling where the clause used to be.
// Comments are not whitespace, so any genuine comment is preserved.
const stripped = sql.slice(0, end).replace(/\s+$/, '')
// Never manufacture an empty query from non-empty input (degenerate inputs
// such as a bare `FORMAT JSON`): let the original surface its own error.
return stripped === '' ? sql : stripped
}

/**
* Return a copy of `sql` with the *contents* of string literals, quoted
* identifiers, and comments replaced by spaces, preserving the original length
* so indices stay aligned with the source. Quote delimiters are kept; comments
* are blanked entirely. This lets the clause scanner reason about real SQL code
* without tripping over keywords that merely appear inside literals or comments.
*/
function maskLiterals(sql: string): string {
const out: string[] = []
const n = sql.length
let i = 0

while (i < n) {
const c = sql[i]
const next = i + 1 < n ? sql[i + 1] : ''

// Line comment: -- ... or # ... to end of line. ClickHouse accepts `#`
// and `#!` line comments for MySQL compatibility.
if ((c === '-' && next === '-') || c === '#') {
while (i < n && sql[i] !== '\n') {
out.push(' ')
i++
}
continue
}

// Block comment: /* ... */
if (c === '/' && next === '*') {
out.push(' ', ' ')
i += 2
while (i < n && !(sql[i] === '*' && sql[i + 1] === '/')) {
out.push(' ')
i++
}
if (i < n) {
out.push(' ', ' ')
i += 2
}
continue
}

// String literal or quoted identifier: '...', "...", `...`
if (c === "'" || c === '"' || c === '`') {
const quote = c
out.push(quote)
i++
while (i < n) {
const d = sql[i]
// Backslash escapes are honored inside single-quoted strings.
if (d === '\\' && quote === "'") {
out.push(' ')
i++
if (i < n) {
out.push(' ')
i++
}
continue
}
if (d === quote) {
// A doubled delimiter is an escaped quote, not the terminator.
if (i + 1 < n && sql[i + 1] === quote) {
out.push(' ', ' ')
i += 2
continue
}
out.push(quote)
i++
break
}
out.push(' ')
i++
}
continue
}

out.push(c)
i++
}

return out.join('')
}
168 changes: 168 additions & 0 deletions test/lib/sql.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import { expect } from 'chai';
import { stripTrailingFormatClause } from '../../src/lib/sql';

/**
* Mirror what the API does to a submitted query: wrap it in a paginating
* subquery with a single outer FORMAT JSON. Used to assert the stripped query
* produces a statement with no FORMAT inside the parentheses.
*/
function wrap(inner: string): string {
return `SELECT * FROM (${inner}) LIMIT 100 FORMAT JSON`;
}

/** Count real `FORMAT <name>` clauses (ignores formatDateTime, etc.). */
function countFormatClauses(sql: string): number {
const matches = sql.match(/\bformat\s+[A-Za-z_][A-Za-z0-9_]*/gi);
return matches ? matches.length : 0;
}

describe('lib/sql / stripTrailingFormatClause', function () {
describe('strips a trailing top-level FORMAT clause', function () {
it('removes FORMAT CSV', function () {
expect(stripTrailingFormatClause('SELECT * FROM events FORMAT CSV')).to.equal(
'SELECT * FROM events',
);
});

it('removes FORMAT CSVWithNames', function () {
expect(
stripTrailingFormatClause('SELECT a, b FROM t FORMAT CSVWithNames'),
).to.equal('SELECT a, b FROM t');
});

it('removes FORMAT JSON', function () {
expect(stripTrailingFormatClause('SELECT 1 FORMAT JSON')).to.equal('SELECT 1');
});

it('is case-insensitive on the keyword and name', function () {
expect(stripTrailingFormatClause('SELECT 1 format json')).to.equal('SELECT 1');
expect(stripTrailingFormatClause('SELECT 1 Format JSONEachRow')).to.equal(
'SELECT 1',
);
});

it('handles newlines and extra whitespace before FORMAT', function () {
expect(
stripTrailingFormatClause('SELECT 1\n FROM t\nFORMAT TabSeparated'),
).to.equal('SELECT 1\n FROM t');
});
});

describe('handles trailing semicolons', function () {
it('removes a bare trailing semicolon', function () {
expect(stripTrailingFormatClause('SELECT 1;')).to.equal('SELECT 1');
});

it('removes a semicolon after a FORMAT clause', function () {
expect(stripTrailingFormatClause('SELECT 1 FORMAT CSV;')).to.equal('SELECT 1');
});

it('removes a FORMAT clause that follows a semicolon', function () {
expect(
stripTrailingFormatClause('SELECT x FROM t ORDER BY x; FORMAT CSV'),
).to.equal('SELECT x FROM t ORDER BY x');
});

it('removes whitespace and multiple trailing semicolons around FORMAT', function () {
expect(stripTrailingFormatClause('SELECT 1 FORMAT CSV ; ')).to.equal('SELECT 1');
});
});

describe('does not truncate FORMAT-like identifiers (no false positives)', function () {
it('keeps a formatDateTime(...) call with no trailing clause', function () {
const sql = 'SELECT formatDateTime(ts, \'%Y-%m-%d\') AS day FROM events';
expect(stripTrailingFormatClause(sql)).to.equal(sql);
});

it('strips only the real clause, keeping formatDateTime(...) intact', function () {
expect(
stripTrailingFormatClause(
'SELECT formatDateTime(ts) AS day, count() FROM events GROUP BY day FORMAT CSV',
),
).to.equal('SELECT formatDateTime(ts) AS day, count() FROM events GROUP BY day');
});

it('keeps a column/alias literally named format', function () {
const sql = 'SELECT id AS format FROM t';
expect(stripTrailingFormatClause(sql)).to.equal(sql);
});

it('keeps a real FORMAT clause even when a column is named format', function () {
expect(
stripTrailingFormatClause('SELECT format FROM t FORMAT JSON'),
).to.equal('SELECT format FROM t');
});
});

describe('is aware of quotes, comments, and parentheses', function () {
it('ignores FORMAT inside a string literal', function () {
const sql = "SELECT 'FORMAT CSV' AS note FROM t";
expect(stripTrailingFormatClause(sql)).to.equal(sql);
});

it('ignores FORMAT inside a line comment', function () {
const sql = 'SELECT 1 -- FORMAT CSV';
expect(stripTrailingFormatClause(sql)).to.equal(sql);
});
Comment thread
yosriady marked this conversation as resolved.

it('ignores FORMAT inside a # line comment (MySQL-style)', function () {
const sql = 'SELECT 1 # FORMAT CSV';
expect(stripTrailingFormatClause(sql)).to.equal(sql);
});

it('strips a real FORMAT clause trailed by a # comment', function () {
expect(stripTrailingFormatClause('SELECT 1 FORMAT CSV # note')).to.equal(
'SELECT 1',
);
});

it('ignores FORMAT inside a block comment', function () {
const sql = 'SELECT 1 /* FORMAT CSV */';
expect(stripTrailingFormatClause(sql)).to.equal(sql);
});

it('ignores a FORMAT nested inside a subquery (not trailing)', function () {
const sql = 'SELECT * FROM (SELECT 1 FORMAT CSV) AS x';
expect(stripTrailingFormatClause(sql)).to.equal(sql);
});
});

describe('no-ops', function () {
it('leaves a plain query untouched', function () {
const sql = 'SELECT count(*) FROM events';
expect(stripTrailingFormatClause(sql)).to.equal(sql);
});

it('returns empty input unchanged', function () {
expect(stripTrailingFormatClause('')).to.equal('');
});

it('does not manufacture an empty query from a bare FORMAT clause', function () {
expect(stripTrailingFormatClause('FORMAT JSON')).to.equal('FORMAT JSON');
});
});

describe('produces a query that wraps cleanly (the bug being fixed)', function () {
const cases = [
'SELECT * FROM events FORMAT CSV',
'SELECT a, b FROM t FORMAT CSVWithNames;',
'SELECT x FROM t ORDER BY x; FORMAT JSON',
'SELECT formatDateTime(ts) AS day FROM events FORMAT CSV',
];

cases.forEach(function (sql) {
it(`leaves no FORMAT inside the parens for: ${sql}`, function () {
const wrapped = wrap(stripTrailingFormatClause(sql));
// Exactly one FORMAT clause survives — the outer one the API adds.
expect(countFormatClauses(wrapped)).to.equal(1);
// ...and it sits at the very end, outside the subquery parentheses.
expect(wrapped).to.match(/\)\s+LIMIT\s+100\s+FORMAT\s+JSON$/);
const innerParens = wrapped.slice(
wrapped.indexOf('(') + 1,
wrapped.lastIndexOf(')'),
);
expect(countFormatClauses(innerParens)).to.equal(0);
});
});
});
});