From d40dc3c7a0e51fd074de9142353bdd924592a874 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Wed, 8 Apr 2026 17:19:45 -0700 Subject: [PATCH 1/2] Improve Explorer search: multi-term AND, relevance ranking, FTS spike MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Search improvements (immediate): - Multi-term search: "pottery Cyprus" requires BOTH words to match - Relevance ranking: label matches weighted 3x, place name 2x (description is not searched or scored yet) - Results sorted by relevance score when searching (random for browsing) FTS spike (future path, documented): - Added tools/build_fts_index.py to build DuckDB FTS index offline - Tested: 358 MB full index, 211 MB lite — too large for auto-download - BM25 scoring works correctly (Porter stemming, stopwords) - Next step: explore smaller index strategies or on-demand loading Closes #84 (spike complete — findings documented in PR) Co-Authored-By: Claude Opus 4.6 --- .gitignore | 1 + tools/build_fts_index.py | 97 +++++++++++++++++++++++++++++++++ tutorials/isamples_explorer.qmd | 57 +++++++++++++++++-- 3 files changed, 149 insertions(+), 6 deletions(-) create mode 100644 tools/build_fts_index.py diff --git a/.gitignore b/.gitignore index 40f2d5c..e3b2633 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ models/generated/extensions/*.md # Large data files *.parquet +*.duckdb # Node / Playwright node_modules/ diff --git a/tools/build_fts_index.py b/tools/build_fts_index.py new file mode 100644 index 0000000..b0ddd09 --- /dev/null +++ b/tools/build_fts_index.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +""" +Build a DuckDB full-text search index for the iSamples Explorer. + +Creates a .duckdb file containing the FTS index (BM25-scored) that can +be ATTACHed in DuckDB-WASM for ranked text search over 6.7M samples. 
+ +Usage: + python tools/build_fts_index.py + +Output: + tools/isamples_fts_index.duckdb (upload to data.isamples.org) + +Requirements: + pip install duckdb +""" + +import duckdb +import os +import sys +from pathlib import Path + +PARQUET_URL = "https://data.isamples.org/isamples_202601_wide.parquet" +OUTPUT_DB = Path(__file__).parent / "isamples_fts_index.duckdb" + +# Local fallback for faster builds +LOCAL_PARQUET = Path.home() / "Data/iSample/pqg_refining/zenodo_wide_2026-01-09.parquet" + + +def build_fts_index(): + # Use local file if available, otherwise remote + source = str(LOCAL_PARQUET) if LOCAL_PARQUET.exists() else PARQUET_URL + print(f"Source: {source}") + + # Remove existing index file + if OUTPUT_DB.exists(): + OUTPUT_DB.unlink() + + con = duckdb.connect(str(OUTPUT_DB)) + + print("Creating samples table from parquet...") + con.execute(f""" + CREATE TABLE samples AS + SELECT + pid, + label, + COALESCE(description, '') AS description, + COALESCE(CAST(place_name AS VARCHAR), '') AS place_name + FROM read_parquet('{source}') + WHERE otype = 'MaterialSampleRecord' + """) + + row_count = con.execute("SELECT COUNT(*) FROM samples").fetchone()[0] + print(f"Loaded {row_count:,} rows") + + print("Installing and loading FTS extension...") + con.execute("INSTALL fts") + con.execute("LOAD fts") + + print("Building FTS index (this may take a few minutes)...") + con.execute(""" + PRAGMA create_fts_index( + 'samples', 'pid', + 'label', 'description', 'place_name', + stemmer = 'porter', + stopwords = 'english', + overwrite = 1 + ) + """) + + # Verify the index works + test_result = con.execute(""" + SELECT pid, fts_main_samples.match_bm25(pid, 'pottery') AS score + FROM samples + WHERE score IS NOT NULL + ORDER BY score DESC + LIMIT 5 + """).fetchall() + print(f"Test query 'pottery': {len(test_result)} results") + for pid, score in test_result: + print(f" {pid[:60]} score={score:.4f}") + + # Keep samples table — FTS macros reference it internally. 
+ # The table has only pid + text columns (not the full schema), + # so it's much smaller than the full parquet. + + con.close() + + size_mb = OUTPUT_DB.stat().st_size / (1024 * 1024) + print(f"\nIndex file: {OUTPUT_DB}") + print(f"Size: {size_mb:.1f} MB") + print(f"\nUpload to data.isamples.org and ATTACH in DuckDB-WASM:") + print(f" ATTACH 'https://data.isamples.org/isamples_fts_index.duckdb' AS fts_db;") + + +if __name__ == "__main__": + build_fts_index() diff --git a/tutorials/isamples_explorer.qmd b/tutorials/isamples_explorer.qmd index 5b7c466..3b76d2a 100644 --- a/tutorials/isamples_explorer.qmd +++ b/tutorials/isamples_explorer.qmd @@ -173,7 +173,7 @@ Circle size = log(sample count). Color = dominant data source.
@@ -315,6 +315,37 @@ function sourceFilterSQL(col) { return ` AND ${col} IN (${list})`; } +// === Text search SQL helpers === +function searchTerms(value) { + return String(value || '').trim().split(/\s+/).filter(Boolean); +} + +function escapeSqlString(value) { + return String(value).replace(/'/g, "''"); +} + +function escapeIlikePattern(value) { + return escapeSqlString(value).replace(/[\\%_]/g, "\\$&"); +} + +function textSearchWhere(terms, columns) { + return terms.map(raw => { + const term = escapeIlikePattern(raw); + const checks = columns.map(col => `${col} ILIKE '%${term}%' ESCAPE '\\'`); + return `(${checks.join(' OR ')})`; + }).join(' AND '); +} + +function textSearchScore(terms, weightedColumns) { + if (!terms.length) return '0'; + return terms.map(raw => { + const term = escapeIlikePattern(raw); + return weightedColumns.map(({ col, weight }) => + `CASE WHEN ${col} ILIKE '%${term}%' ESCAPE '\\' THEN ${weight} ELSE 0 END` + ).join(' + '); + }).map(score => `(${score})`).join(' + '); +} + // === Material/Context Filters === function getCheckedValues(containerId) { const checks = document.querySelectorAll(`#${containerId} input[type="checkbox"]`); @@ -1620,26 +1651,40 @@ zoomWatcher = { } searchResults.textContent = 'Searching...'; try { - const escaped = term.replace(/'/g, "''"); + const terms = searchTerms(term); // Compose with facet filters so search honors the same Material / // Sampled Feature / Specimen Type selections that the table and // point-mode globe use. Without this, search would surface (and // fly to) samples outside the active filters. const facetActive = hasFacetFilters(); const facetSQL = facetActive ? 
facetFilterSQL() : ''; + const aliasedSearchWhere = textSearchWhere(terms, ['l.label', 'CAST(l.place_name AS VARCHAR)']); + const aliasedScore = textSearchScore(terms, [ + { col: 'l.label', weight: 3 }, + { col: 'CAST(l.place_name AS VARCHAR)', weight: 2 }, + ]); + const searchWhere = textSearchWhere(terms, ['label', 'CAST(place_name AS VARCHAR)']); + const score = textSearchScore(terms, [ + { col: 'label', weight: 3 }, + { col: 'CAST(place_name AS VARCHAR)', weight: 2 }, + ]); const query = facetActive ? ` - SELECT l.pid, l.label, l.source, l.latitude, l.longitude, l.place_name + SELECT l.pid, l.label, l.source, l.latitude, l.longitude, l.place_name, + (${aliasedScore}) AS relevance_score FROM read_parquet('${lite_url}') l JOIN read_parquet('${facets_url}') f ON l.pid = f.pid - WHERE l.label ILIKE '%${escaped}%' + WHERE ${aliasedSearchWhere} ${sourceFilterSQL('l.source')} ${facetSQL} + ORDER BY relevance_score DESC, l.label LIMIT 50 ` : ` - SELECT pid, label, source, latitude, longitude, place_name + SELECT pid, label, source, latitude, longitude, place_name, + (${score}) AS relevance_score FROM read_parquet('${lite_url}') - WHERE label ILIKE '%${escaped}%' + WHERE ${searchWhere} ${sourceFilterSQL('source')} + ORDER BY relevance_score DESC, label LIMIT 50 `; const results = await db.query(query); From 6a31a972b64a6dea30dfb61cc8328b736b33cfa7 Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Tue, 28 Apr 2026 13:45:59 -0700 Subject: [PATCH 2/2] Escape ILIKE wildcards in search; clarify FTS spike script status Search input was passed into ILIKE patterns with only single-quote escaping, so a literal "%" or "_" in the query (e.g. "100%", "co_op") silently turned into wildcards. Escape % _ \ and add ESCAPE '\' in both whereClause and the relevance-score expression. Also reframe tools/build_fts_index.py as a spike artifact: the docstring told readers to upload the index to data.isamples.org, but per PR #95 findings the 200-358 MB result is too large to ship. 
Mark the script as NOT part of the production pipeline and drop the misleading upload instructions. Smoke-tested locally with /tmp/explorer_smoke_test.py (multi-term "pottery cyprus" + wildcard "100%"): 0 JS exceptions, 0 console errors, 0 failed requests. Co-Authored-By: Claude Opus 4.7 (1M context) --- tools/build_fts_index.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/tools/build_fts_index.py b/tools/build_fts_index.py index b0ddd09..9ff34f6 100644 --- a/tools/build_fts_index.py +++ b/tools/build_fts_index.py @@ -1,15 +1,25 @@ #!/usr/bin/env python3 """ +STATUS: spike artifact — NOT in production pipeline. + Build a DuckDB full-text search index for the iSamples Explorer. -Creates a .duckdb file containing the FTS index (BM25-scored) that can -be ATTACHed in DuckDB-WASM for ranked text search over 6.7M samples. +This script was used to evaluate whether DuckDB FTS could replace the +ILIKE-based search in the Explorer. Findings (PR #95): + - Full index (label + description + place_name): 358 MB + - Lite index (label + place_name only): 211 MB + - ATTACH-over-HTTP works in DuckDB-WASM, but the download is too + large for an interactive page. +The Explorer continues to use ILIKE; this script is preserved so we +can revisit FTS once we have a smaller index strategy (e.g. +pre-tokenized inverted index as parquet, or on-demand loading behind +an "Enhanced Search" toggle). 
Usage: python tools/build_fts_index.py Output: - tools/isamples_fts_index.duckdb (upload to data.isamples.org) + tools/isamples_fts_index.duckdb (NOT currently uploaded anywhere) Requirements: pip install duckdb @@ -89,8 +99,8 @@ def build_fts_index(): size_mb = OUTPUT_DB.stat().st_size / (1024 * 1024) print(f"\nIndex file: {OUTPUT_DB}") print(f"Size: {size_mb:.1f} MB") - print(f"\nUpload to data.isamples.org and ATTACH in DuckDB-WASM:") - print(f" ATTACH 'https://data.isamples.org/isamples_fts_index.duckdb' AS fts_db;") + print(f"\nNOTE: Index is too large to ship to the browser as-is.") + print(f" See module docstring for the spike findings.") if __name__ == "__main__":