diff --git a/.gitignore b/.gitignore
index 40f2d5c..e3b2633 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ models/generated/extensions/*.md
 # Large data files
 *.parquet
+*.duckdb
 
 # Node / Playwright
 node_modules/
diff --git a/tools/build_fts_index.py b/tools/build_fts_index.py
new file mode 100644
index 0000000..9ff34f6
--- /dev/null
+++ b/tools/build_fts_index.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""
+STATUS: spike artifact — NOT in production pipeline.
+
+Build a DuckDB full-text search index for the iSamples Explorer.
+
+This script was used to evaluate whether DuckDB FTS could replace the
+ILIKE-based search in the Explorer. Findings (PR #95):
+  - Full index (label + description + place_name): 358 MB
+  - Lite index (label + place_name only): 211 MB
+  - ATTACH-over-HTTP works in DuckDB-WASM, but the download is too
+    large for an interactive page.
+The Explorer continues to use ILIKE; this script is preserved so we
+can revisit FTS once we have a smaller index strategy (e.g.
+pre-tokenized inverted index as parquet, or on-demand loading behind
+an "Enhanced Search" toggle).
+
+Usage:
+    python tools/build_fts_index.py
+
+Output:
+    tools/isamples_fts_index.duckdb (NOT currently uploaded anywhere)
+
+Requirements:
+    pip install duckdb
+"""
+
+import duckdb
+from pathlib import Path
+
+PARQUET_URL = "https://data.isamples.org/isamples_202601_wide.parquet"
+OUTPUT_DB = Path(__file__).parent / "isamples_fts_index.duckdb"
+
+# Local fallback for faster builds
+LOCAL_PARQUET = Path.home() / "Data/iSample/pqg_refining/zenodo_wide_2026-01-09.parquet"
+
+
+def build_fts_index():
+    """Build OUTPUT_DB from the wide parquet and print the resulting size."""
+    # Use local file if available, otherwise remote
+    source = str(LOCAL_PARQUET) if LOCAL_PARQUET.exists() else PARQUET_URL
+    print(f"Source: {source}")
+
+    # Remove existing index file so reruns start from a clean database
+    if OUTPUT_DB.exists():
+        OUTPUT_DB.unlink()
+
+    con = duckdb.connect(str(OUTPUT_DB))
+    try:
+        print("Creating samples table from parquet...")
+        # Parameterized (?) so quote characters in a local path cannot break the SQL.
+        con.execute("""
+            CREATE TABLE samples AS
+            SELECT
+                pid,
+                label,
+                COALESCE(description, '') AS description,
+                COALESCE(CAST(place_name AS VARCHAR), '') AS place_name
+            FROM read_parquet(?)
+            WHERE otype = 'MaterialSampleRecord'
+        """, [source])
+
+        row_count = con.execute("SELECT COUNT(*) FROM samples").fetchone()[0]
+        print(f"Loaded {row_count:,} rows")
+
+        print("Installing and loading FTS extension...")
+        con.execute("INSTALL fts")
+        con.execute("LOAD fts")
+
+        print("Building FTS index (this may take a few minutes)...")
+        con.execute("""
+            PRAGMA create_fts_index(
+                'samples', 'pid',
+                'label', 'description', 'place_name',
+                stemmer = 'porter',
+                stopwords = 'english',
+                overwrite = 1
+            )
+        """)
+
+        # Verify the index works
+        test_result = con.execute("""
+            SELECT pid, fts_main_samples.match_bm25(pid, 'pottery') AS score
+            FROM samples
+            WHERE score IS NOT NULL
+            ORDER BY score DESC
+            LIMIT 5
+        """).fetchall()
+        print(f"Test query 'pottery': {len(test_result)} results")
+        for pid, score in test_result:
+            print(f"  {pid[:60]} score={score:.4f}")
+
+        # Keep samples table — FTS macros reference it internally.
+        # The table has only pid + text columns (not the full schema),
+        # so it's much smaller than the full parquet.
+    finally:
+        # Always release the DB file handle, even when a step above raises.
+        con.close()
+
+    size_mb = OUTPUT_DB.stat().st_size / (1024 * 1024)
+    print(f"\nIndex file: {OUTPUT_DB}")
+    print(f"Size: {size_mb:.1f} MB")
+    print("\nNOTE: Index is too large to ship to the browser as-is.")
+    print("      See module docstring for the spike findings.")
+
+
+if __name__ == "__main__":
+    build_fts_index()
diff --git a/tutorials/isamples_explorer.qmd b/tutorials/isamples_explorer.qmd
index 5b7c466..3b76d2a 100644
--- a/tutorials/isamples_explorer.qmd
+++ b/tutorials/isamples_explorer.qmd
@@ -173,7 +173,7 @@ Circle size = log(sample count). Color = dominant data source.
@@ -315,6 +315,39 @@ function sourceFilterSQL(col) {
   return ` AND ${col} IN (${list})`;
 }
 
+// === Text search SQL helpers ===
+function searchTerms(value) {
+  return String(value || '').trim().split(/\s+/).filter(Boolean);
+}
+
+function escapeSqlString(value) {
+  return String(value).replace(/'/g, "''");
+}
+
+function escapeIlikePattern(value) {
+  return escapeSqlString(value).replace(/[\\%_]/g, "\\$&");
+}
+
+function textSearchWhere(terms, columns) {
+  // Neutral predicate for an empty term list so `WHERE ${...}` stays valid SQL.
+  if (!terms.length) return 'TRUE';
+  return terms.map(raw => {
+    const term = escapeIlikePattern(raw);
+    const checks = columns.map(col => `${col} ILIKE '%${term}%' ESCAPE '\\'`);
+    return `(${checks.join(' OR ')})`;
+  }).join(' AND ');
+}
+
+function textSearchScore(terms, weightedColumns) {
+  if (!terms.length) return '0';
+  return terms.map(raw => {
+    const term = escapeIlikePattern(raw);
+    return weightedColumns.map(({ col, weight }) =>
+      `CASE WHEN ${col} ILIKE '%${term}%' ESCAPE '\\' THEN ${weight} ELSE 0 END`
+    ).join(' + ');
+  }).map(score => `(${score})`).join(' + ');
+}
+
 // === Material/Context Filters ===
 function getCheckedValues(containerId) {
   const checks = document.querySelectorAll(`#${containerId} input[type="checkbox"]`);
@@ -1620,26 +1653,40 @@ zoomWatcher = {
     }
     searchResults.textContent = 'Searching...';
     try {
-      const escaped = term.replace(/'/g, "''");
+      const terms = searchTerms(term);
       // Compose with facet filters so search honors the same Material /
       // Sampled Feature / Specimen Type selections that the table and
       // point-mode globe use. Without this, search would surface (and
       // fly to) samples outside the active filters.
       const facetActive = hasFacetFilters();
       const facetSQL = facetActive ? facetFilterSQL() : '';
+      const aliasedSearchWhere = textSearchWhere(terms, ['l.label', 'CAST(l.place_name AS VARCHAR)']);
+      const aliasedScore = textSearchScore(terms, [
+        { col: 'l.label', weight: 3 },
+        { col: 'CAST(l.place_name AS VARCHAR)', weight: 2 },
+      ]);
+      const searchWhere = textSearchWhere(terms, ['label', 'CAST(place_name AS VARCHAR)']);
+      const score = textSearchScore(terms, [
+        { col: 'label', weight: 3 },
+        { col: 'CAST(place_name AS VARCHAR)', weight: 2 },
+      ]);
       const query = facetActive ? `
-        SELECT l.pid, l.label, l.source, l.latitude, l.longitude, l.place_name
+        SELECT l.pid, l.label, l.source, l.latitude, l.longitude, l.place_name,
+               (${aliasedScore}) AS relevance_score
         FROM read_parquet('${lite_url}') l
         JOIN read_parquet('${facets_url}') f ON l.pid = f.pid
-        WHERE l.label ILIKE '%${escaped}%'
+        WHERE ${aliasedSearchWhere}
           ${sourceFilterSQL('l.source')}
           ${facetSQL}
+        ORDER BY relevance_score DESC, l.label
         LIMIT 50
       ` : `
-        SELECT pid, label, source, latitude, longitude, place_name
+        SELECT pid, label, source, latitude, longitude, place_name,
+               (${score}) AS relevance_score
         FROM read_parquet('${lite_url}')
-        WHERE label ILIKE '%${escaped}%'
+        WHERE ${searchWhere}
          ${sourceFilterSQL('source')}
+        ORDER BY relevance_score DESC, label
         LIMIT 50
       `;
       const results = await db.query(query);