Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ models/generated/extensions/*.md

# Large data files
*.parquet
*.duckdb

# Node / Playwright
node_modules/
Expand Down
107 changes: 107 additions & 0 deletions tools/build_fts_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
"""
STATUS: spike artifact — NOT in production pipeline.

Build a DuckDB full-text search index for the iSamples Explorer.

This script was used to evaluate whether DuckDB FTS could replace the
ILIKE-based search in the Explorer. Findings (PR #95):
- Full index (label + description + place_name): 358 MB
- Lite index (label + place_name only): 211 MB
- ATTACH-over-HTTP works in DuckDB-WASM, but the download is too
large for an interactive page.
The Explorer continues to use ILIKE; this script is preserved so we
can revisit FTS once we have a smaller index strategy (e.g.
pre-tokenized inverted index as parquet, or on-demand loading behind
an "Enhanced Search" toggle).

Usage:
python tools/build_fts_index.py

Output:
tools/isamples_fts_index.duckdb (NOT currently uploaded anywhere)

Requirements:
pip install duckdb
"""

import duckdb
import os
import sys
from pathlib import Path

# Canonical remote wide-table export (used when no local copy is present).
PARQUET_URL = "https://data.isamples.org/isamples_202601_wide.parquet"
# Index is written next to this script; see module docstring — it is a
# spike artifact and is NOT uploaded anywhere.
OUTPUT_DB = Path(__file__).parent / "isamples_fts_index.duckdb"

# Local fallback for faster builds
LOCAL_PARQUET = Path.home() / "Data/iSample/pqg_refining/zenodo_wide_2026-01-09.parquet"


def build_fts_index():
    """Build a DuckDB database with an FTS index over sample text columns.

    Reads the wide parquet export (local copy if present, remote URL
    otherwise), keeps only ``MaterialSampleRecord`` rows trimmed to
    pid + searchable text columns, builds a porter-stemmed FTS index,
    smoke-tests it with one query, and prints the resulting file size.

    Side effects: deletes and recreates ``OUTPUT_DB`` on disk.
    """
    # Use local file if available, otherwise remote
    source = str(LOCAL_PARQUET) if LOCAL_PARQUET.exists() else PARQUET_URL
    print(f"Source: {source}")

    # Remove existing index file so every run builds from scratch
    if OUTPUT_DB.exists():
        OUTPUT_DB.unlink()

    con = duckdb.connect(str(OUTPUT_DB))
    try:
        print("Creating samples table from parquet...")
        # NOTE: `source` is interpolated into SQL; it only ever comes from
        # the two module constants above, so injection is not a concern.
        con.execute(f"""
            CREATE TABLE samples AS
            SELECT
                pid,
                label,
                COALESCE(description, '') AS description,
                COALESCE(CAST(place_name AS VARCHAR), '') AS place_name
            FROM read_parquet('{source}')
            WHERE otype = 'MaterialSampleRecord'
        """)

        row_count = con.execute("SELECT COUNT(*) FROM samples").fetchone()[0]
        print(f"Loaded {row_count:,} rows")

        print("Installing and loading FTS extension...")
        con.execute("INSTALL fts")
        con.execute("LOAD fts")

        print("Building FTS index (this may take a few minutes)...")
        con.execute("""
            PRAGMA create_fts_index(
                'samples', 'pid',
                'label', 'description', 'place_name',
                stemmer = 'porter',
                stopwords = 'english',
                overwrite = 1
            )
        """)

        # Verify the index works before reporting success.
        test_result = con.execute("""
            SELECT pid, fts_main_samples.match_bm25(pid, 'pottery') AS score
            FROM samples
            WHERE score IS NOT NULL
            ORDER BY score DESC
            LIMIT 5
        """).fetchall()
        print(f"Test query 'pottery': {len(test_result)} results")
        for pid, score in test_result:
            # pid is truncated for display only; assumes pid is a string.
            print(f"  {pid[:60]} score={score:.4f}")

        # Keep samples table — FTS macros reference it internally.
        # The table has only pid + text columns (not the full schema),
        # so it's much smaller than the full parquet.
    finally:
        # Close even if a step above raised, so the db file isn't left locked.
        con.close()

    size_mb = OUTPUT_DB.stat().st_size / (1024 * 1024)
    print(f"\nIndex file: {OUTPUT_DB}")
    print(f"Size: {size_mb:.1f} MB")
    # Plain strings: these messages have no placeholders (were f-strings).
    print("\nNOTE: Index is too large to ship to the browser as-is.")
    print("      See module docstring for the spike findings.")


# Script entry point: build the index when run directly (not on import).
if __name__ == "__main__":
    build_fts_index()
57 changes: 51 additions & 6 deletions tutorials/isamples_explorer.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ Circle size = log(sample count). Color = dominant data source.

<!-- Static layout: globe + side panel. Updated via DOM, not OJS reactivity. -->
<div class="search-bar">
<input type="text" id="sampleSearch" placeholder="Search samples (e.g., basalt, pottery, coral...)" />
<input type="text" id="sampleSearch" placeholder="Search samples - multiple words narrow results (e.g., pottery Cyprus)" />
<button id="searchBtn">Search</button>
</div>
<div id="searchResults" class="search-results"></div>
Expand Down Expand Up @@ -315,6 +315,37 @@ function sourceFilterSQL(col) {
return ` AND ${col} IN (${list})`;
}

// === Text search SQL helpers ===
// Tokenize the raw search-box value into non-empty, whitespace-separated
// terms. Falsy input (null, undefined, '') yields an empty array.
function searchTerms(value) {
  const text = String(value || '').trim();
  if (!text) return [];
  // Splitting a trimmed string on runs of whitespace never yields empties.
  return text.split(/\s+/);
}

// Double every single quote so the value is safe inside a SQL string literal.
function escapeSqlString(value) {
  return String(value).split("'").join("''");
}

// Make a value safe to embed inside an ILIKE '%...%' pattern: quote-escape
// it for the SQL literal, then neutralize pattern metacharacters.
function escapeIlikePattern(value) {
  const quoted = escapeSqlString(value);
  // Escape backslash, percent, and underscore in a single pass so the
  // backslashes we insert are never re-escaped (queries use ESCAPE '\').
  return quoted.replace(/[\\%_]/g, (ch) => '\\' + ch);
}

// Build a WHERE expression requiring every term to match at least one of
// the given columns (case-insensitive substring via ILIKE).
// Returns 'TRUE' for an empty term list so the result is always valid SQL
// after WHERE — previously an empty list produced '' and broke the query.
// (Mirrors textSearchScore, which returns '0' for the empty case.)
function textSearchWhere(terms, columns) {
  if (!terms.length) return 'TRUE';
  return terms.map(raw => {
    const term = escapeIlikePattern(raw);
    const checks = columns.map(col => `${col} ILIKE '%${term}%' ESCAPE '\\'`);
    return `(${checks.join(' OR ')})`;
  }).join(' AND ');
}

// Build a relevance expression: each (term, column) ILIKE hit contributes
// that column's weight; the grand total is used to order results.
// Returns '0' when there are no terms so the expression is always valid SQL.
function textSearchScore(terms, weightedColumns) {
  if (!terms.length) return '0';
  const perTermSums = terms.map(raw => {
    const term = escapeIlikePattern(raw);
    const contributions = weightedColumns.map(({ col, weight }) =>
      `CASE WHEN ${col} ILIKE '%${term}%' ESCAPE '\\' THEN ${weight} ELSE 0 END`
    );
    return `(${contributions.join(' + ')})`;
  });
  return perTermSums.join(' + ');
}

// === Material/Context Filters ===
function getCheckedValues(containerId) {
const checks = document.querySelectorAll(`#${containerId} input[type="checkbox"]`);
Expand Down Expand Up @@ -1620,26 +1651,40 @@ zoomWatcher = {
}
searchResults.textContent = 'Searching...';
try {
const escaped = term.replace(/'/g, "''");
const terms = searchTerms(term);
// Compose with facet filters so search honors the same Material /
// Sampled Feature / Specimen Type selections that the table and
// point-mode globe use. Without this, search would surface (and
// fly to) samples outside the active filters.
const facetActive = hasFacetFilters();
const facetSQL = facetActive ? facetFilterSQL() : '';
const aliasedSearchWhere = textSearchWhere(terms, ['l.label', 'CAST(l.place_name AS VARCHAR)']);
const aliasedScore = textSearchScore(terms, [
{ col: 'l.label', weight: 3 },
{ col: 'CAST(l.place_name AS VARCHAR)', weight: 2 },
]);
const searchWhere = textSearchWhere(terms, ['label', 'CAST(place_name AS VARCHAR)']);
const score = textSearchScore(terms, [
{ col: 'label', weight: 3 },
{ col: 'CAST(place_name AS VARCHAR)', weight: 2 },
]);
const query = facetActive ? `
SELECT l.pid, l.label, l.source, l.latitude, l.longitude, l.place_name
SELECT l.pid, l.label, l.source, l.latitude, l.longitude, l.place_name,
(${aliasedScore}) AS relevance_score
FROM read_parquet('${lite_url}') l
JOIN read_parquet('${facets_url}') f ON l.pid = f.pid
WHERE l.label ILIKE '%${escaped}%'
WHERE ${aliasedSearchWhere}
${sourceFilterSQL('l.source')}
${facetSQL}
ORDER BY relevance_score DESC, l.label
LIMIT 50
` : `
SELECT pid, label, source, latitude, longitude, place_name
SELECT pid, label, source, latitude, longitude, place_name,
(${score}) AS relevance_score
FROM read_parquet('${lite_url}')
WHERE label ILIKE '%${escaped}%'
WHERE ${searchWhere}
${sourceFilterSQL('source')}
ORDER BY relevance_score DESC, label
LIMIT 50
`;
const results = await db.query(query);
Expand Down
Loading