Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,27 @@ def visit_full_text_search(self, full_text_search) -> Optional[ScoredGlobalIndex

searcher = self._searcher
query = self._index.parse_query(query_text, ["text"])
results = searcher.search(query, limit)

scored_results = searcher.search(query, limit)
if not scored_results.hits:
return DictBasedScoredIndexResult({})

addr_to_score: Dict[tuple, float] = {
(addr.segment_ord, addr.doc): score
for score, addr in scored_results.hits
}

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fallback looks like a potentially large performance regression for broad queries. We only need the `row_id` for the top-`limit` hits in `scored_results`, but this second search asks tantivy-py to collect up to `searcher.num_docs` matches ordered by `row_id`. For a common term on a large shard, a `limit=10` lookup can now degenerate into scanning/materializing almost the full match set just to recover 10 ids. Could we keep `row_id` stored until batch fast-field access is available in the shipped tantivy-py version, or add a direct fast-field read path instead?

Copy link
Copy Markdown
Contributor Author

@chenghuichen chenghuichen Apr 21, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the review! Based on feedback from the tantivy-py community, v0.26 with fast-field access should be released in a few weeks (see quickwit-oss/tantivy-py#641). I think it makes sense to hold this PR for now — I'll track the community progress and ping you once the code is updated.

# This can be replaced by https://github.com/quickwit-oss/tantivy-py/pull/641 when it's released
all_by_rowid = searcher.search(query, searcher.num_docs, order_by_field='row_id')
id_to_scores: Dict[int, float] = {}
for score, doc_address in results.hits:
doc = searcher.doc(doc_address)
row_id = doc["row_id"][0]
id_to_scores[row_id] = score
remaining = len(addr_to_score)
for row_id, addr in all_by_rowid.hits:
score = addr_to_score.get((addr.segment_ord, addr.doc))
if score is not None:
id_to_scores[row_id] = score
remaining -= 1
if remaining == 0:
break

return DictBasedScoredIndexResult(id_to_scores)

Expand All @@ -178,7 +192,7 @@ def _ensure_loaded(self):

# Open tantivy index from stream-backed directory
schema_builder = tantivy.SchemaBuilder()
schema_builder.add_unsigned_field("row_id", stored=True, indexed=True, fast=True)
schema_builder.add_unsigned_field("row_id", stored=False, indexed=True, fast=True)
schema_builder.add_text_field("text", stored=False)
schema = schema_builder.build()

Expand Down
14 changes: 8 additions & 6 deletions paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ use jni::JNIEnv;
use std::ptr;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Field, NumericOptions, Schema, TEXT};
use tantivy::schema::{Field, IndexRecordOption, NumericOptions, Schema, TextFieldIndexing, TextOptions};
use tantivy::{Index, IndexReader, IndexWriter, ReloadPolicy};

use crate::jni_directory::JniDirectory;
Expand Down Expand Up @@ -56,12 +56,14 @@ fn build_schema() -> (Schema, Field, Field) {
let mut builder = Schema::builder();
let row_id_field = builder.add_u64_field(
"row_id",
NumericOptions::default()
.set_stored()
.set_indexed()
.set_fast(),
NumericOptions::default().set_indexed().set_fast(),
);
let text_field = builder.add_text_field("text", TEXT);
let text_options = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
);
let text_field = builder.add_text_field("text", text_options);
(builder.build(), row_id_field, text_field)
}

Expand Down
Loading