Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ models/generated/extensions/*.md

# Large data files
*.parquet
*.duckdb

# Node / Playwright
node_modules/
Expand Down
107 changes: 107 additions & 0 deletions tools/build_fts_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/usr/bin/env python3
"""
STATUS: spike artifact — NOT in production pipeline.

Build a DuckDB full-text search index for the iSamples Explorer.

This script was used to evaluate whether DuckDB FTS could replace the
ILIKE-based search in the Explorer. Findings (PR #95):
- Full index (label + description + place_name): 358 MB
- Lite index (label + place_name only): 211 MB
- ATTACH-over-HTTP works in DuckDB-WASM, but the download is too
large for an interactive page.
The Explorer continues to use ILIKE; this script is preserved so we
can revisit FTS once we have a smaller index strategy (e.g.
pre-tokenized inverted index as parquet, or on-demand loading behind
an "Enhanced Search" toggle).

Usage:
python tools/build_fts_index.py

Output:
tools/isamples_fts_index.duckdb (NOT currently uploaded anywhere)

Requirements:
pip install duckdb
"""

import duckdb
import os
import sys
from pathlib import Path

# Canonical remote wide-table export (used when no local copy is present).
PARQUET_URL = "https://data.isamples.org/isamples_202601_wide.parquet"
# Index is written next to this script; see module docstring — it is a
# spike artifact and is NOT uploaded anywhere.
OUTPUT_DB = Path(__file__).parent / "isamples_fts_index.duckdb"

# Local fallback for faster builds
LOCAL_PARQUET = Path.home() / "Data/iSample/pqg_refining/zenodo_wide_2026-01-09.parquet"


def build_fts_index():
    """Build a DuckDB database with an FTS index over sample text columns.

    Reads the wide parquet export (local copy if present, remote URL
    otherwise), keeps only ``MaterialSampleRecord`` rows trimmed to
    pid + searchable text columns, builds a porter-stemmed FTS index,
    smoke-tests it with one query, and prints the resulting file size.

    Side effects: deletes and recreates ``OUTPUT_DB`` on disk.
    """
    # Use local file if available, otherwise remote
    source = str(LOCAL_PARQUET) if LOCAL_PARQUET.exists() else PARQUET_URL
    print(f"Source: {source}")

    # Remove existing index file so every run builds from scratch
    if OUTPUT_DB.exists():
        OUTPUT_DB.unlink()

    con = duckdb.connect(str(OUTPUT_DB))
    try:
        print("Creating samples table from parquet...")
        # NOTE: `source` is interpolated into SQL; it only ever comes from
        # the two module constants above, so injection is not a concern.
        con.execute(f"""
            CREATE TABLE samples AS
            SELECT
                pid,
                label,
                COALESCE(description, '') AS description,
                COALESCE(CAST(place_name AS VARCHAR), '') AS place_name
            FROM read_parquet('{source}')
            WHERE otype = 'MaterialSampleRecord'
        """)

        row_count = con.execute("SELECT COUNT(*) FROM samples").fetchone()[0]
        print(f"Loaded {row_count:,} rows")

        print("Installing and loading FTS extension...")
        con.execute("INSTALL fts")
        con.execute("LOAD fts")

        print("Building FTS index (this may take a few minutes)...")
        con.execute("""
            PRAGMA create_fts_index(
                'samples', 'pid',
                'label', 'description', 'place_name',
                stemmer = 'porter',
                stopwords = 'english',
                overwrite = 1
            )
        """)

        # Verify the index works before reporting success.
        test_result = con.execute("""
            SELECT pid, fts_main_samples.match_bm25(pid, 'pottery') AS score
            FROM samples
            WHERE score IS NOT NULL
            ORDER BY score DESC
            LIMIT 5
        """).fetchall()
        print(f"Test query 'pottery': {len(test_result)} results")
        for pid, score in test_result:
            # pid is truncated for display only; assumes pid is a string.
            print(f"  {pid[:60]} score={score:.4f}")

        # Keep samples table — FTS macros reference it internally.
        # The table has only pid + text columns (not the full schema),
        # so it's much smaller than the full parquet.
    finally:
        # Close even if a step above raised, so the db file isn't left locked.
        con.close()

    size_mb = OUTPUT_DB.stat().st_size / (1024 * 1024)
    print(f"\nIndex file: {OUTPUT_DB}")
    print(f"Size: {size_mb:.1f} MB")
    # Plain strings: these messages have no placeholders (were f-strings).
    print("\nNOTE: Index is too large to ship to the browser as-is.")
    print("      See module docstring for the spike findings.")


# Script entry point: build the index when run directly (not on import).
if __name__ == "__main__":
    build_fts_index()
57 changes: 51 additions & 6 deletions tutorials/isamples_explorer.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ Circle size = log(sample count). Color = dominant data source.

<!-- Static layout: globe + side panel. Updated via DOM, not OJS reactivity. -->
<div class="search-bar">
<input type="text" id="sampleSearch" placeholder="Search samples (e.g., basalt, pottery, coral...)" />
<input type="text" id="sampleSearch" placeholder="Search samples - multiple words narrow results (e.g., pottery Cyprus)" />
<button id="searchBtn">Search</button>
</div>
<div id="searchResults" class="search-results"></div>
Expand Down Expand Up @@ -315,6 +315,37 @@ function sourceFilterSQL(col) {
return ` AND ${col} IN (${list})`;
}

// === Text search SQL helpers ===
// Tokenize the raw search-box value into non-empty, whitespace-separated
// terms. Falsy input (null, undefined, '') yields an empty array.
function searchTerms(value) {
  const text = String(value || '').trim();
  if (!text) return [];
  // Splitting a trimmed string on runs of whitespace never yields empties.
  return text.split(/\s+/);
}

// Double every single quote so the value is safe inside a SQL string literal.
function escapeSqlString(value) {
  return String(value).split("'").join("''");
}

// Make a value safe to embed inside an ILIKE '%...%' pattern: quote-escape
// it for the SQL literal, then neutralize pattern metacharacters.
function escapeIlikePattern(value) {
  const quoted = escapeSqlString(value);
  // Escape backslash, percent, and underscore in a single pass so the
  // backslashes we insert are never re-escaped (queries use ESCAPE '\').
  return quoted.replace(/[\\%_]/g, (ch) => '\\' + ch);
}

// Build a WHERE expression requiring every term to match at least one of
// the given columns (case-insensitive substring via ILIKE).
// Returns 'TRUE' for an empty term list so the result is always valid SQL
// after WHERE — previously an empty list produced '' and broke the query.
// (Mirrors textSearchScore, which returns '0' for the empty case.)
function textSearchWhere(terms, columns) {
  if (!terms.length) return 'TRUE';
  return terms.map(raw => {
    const term = escapeIlikePattern(raw);
    const checks = columns.map(col => `${col} ILIKE '%${term}%' ESCAPE '\\'`);
    return `(${checks.join(' OR ')})`;
  }).join(' AND ');
}

// Build a relevance expression: each (term, column) ILIKE hit contributes
// that column's weight; the grand total is used to order results.
// Returns '0' when there are no terms so the expression is always valid SQL.
function textSearchScore(terms, weightedColumns) {
  if (!terms.length) return '0';
  const perTermSums = terms.map(raw => {
    const term = escapeIlikePattern(raw);
    const contributions = weightedColumns.map(({ col, weight }) =>
      `CASE WHEN ${col} ILIKE '%${term}%' ESCAPE '\\' THEN ${weight} ELSE 0 END`
    );
    return `(${contributions.join(' + ')})`;
  });
  return perTermSums.join(' + ');
}

// === Material/Context Filters ===
function getCheckedValues(containerId) {
const checks = document.querySelectorAll(`#${containerId} input[type="checkbox"]`);
Expand Down Expand Up @@ -1620,26 +1651,40 @@ zoomWatcher = {
}
searchResults.textContent = 'Searching...';
try {
const escaped = term.replace(/'/g, "''");
const terms = searchTerms(term);
// Compose with facet filters so search honors the same Material /
// Sampled Feature / Specimen Type selections that the table and
// point-mode globe use. Without this, search would surface (and
// fly to) samples outside the active filters.
const facetActive = hasFacetFilters();
const facetSQL = facetActive ? facetFilterSQL() : '';
const aliasedSearchWhere = textSearchWhere(terms, ['l.label', 'CAST(l.place_name AS VARCHAR)']);
const aliasedScore = textSearchScore(terms, [
{ col: 'l.label', weight: 3 },
{ col: 'CAST(l.place_name AS VARCHAR)', weight: 2 },
]);
const searchWhere = textSearchWhere(terms, ['label', 'CAST(place_name AS VARCHAR)']);
const score = textSearchScore(terms, [
{ col: 'label', weight: 3 },
{ col: 'CAST(place_name AS VARCHAR)', weight: 2 },
]);
const query = facetActive ? `
SELECT l.pid, l.label, l.source, l.latitude, l.longitude, l.place_name
SELECT l.pid, l.label, l.source, l.latitude, l.longitude, l.place_name,
(${aliasedScore}) AS relevance_score
FROM read_parquet('${lite_url}') l
JOIN read_parquet('${facets_url}') f ON l.pid = f.pid
WHERE l.label ILIKE '%${escaped}%'
WHERE ${aliasedSearchWhere}
${sourceFilterSQL('l.source')}
${facetSQL}
ORDER BY relevance_score DESC, l.label
LIMIT 50
` : `
SELECT pid, label, source, latitude, longitude, place_name
SELECT pid, label, source, latitude, longitude, place_name,
(${score}) AS relevance_score
FROM read_parquet('${lite_url}')
WHERE label ILIKE '%${escaped}%'
WHERE ${searchWhere}
${sourceFilterSQL('source')}
ORDER BY relevance_score DESC, label
LIMIT 50
`;
const results = await db.query(query);
Expand Down
Loading