diff --git a/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py b/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py index c8b1711b59da..d8b422ff707c 100644 --- a/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py +++ b/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py @@ -152,13 +152,27 @@ def visit_full_text_search(self, full_text_search) -> Optional[ScoredGlobalIndex searcher = self._searcher query = self._index.parse_query(query_text, ["text"]) - results = searcher.search(query, limit) + scored_results = searcher.search(query, limit) + if not scored_results.hits: + return DictBasedScoredIndexResult({}) + + addr_to_score: Dict[tuple, float] = { + (addr.segment_ord, addr.doc): score + for score, addr in scored_results.hits + } + + # This can be replaced by https://github.com/quickwit-oss/tantivy-py/pull/641 when it's released + all_by_rowid = searcher.search(query, searcher.num_docs, order_by_field='row_id') id_to_scores: Dict[int, float] = {} - for score, doc_address in results.hits: - doc = searcher.doc(doc_address) - row_id = doc["row_id"][0] - id_to_scores[row_id] = score + remaining = len(addr_to_score) + for row_id, addr in all_by_rowid.hits: + score = addr_to_score.get((addr.segment_ord, addr.doc)) + if score is not None: + id_to_scores[row_id] = score + remaining -= 1 + if remaining == 0: + break return DictBasedScoredIndexResult(id_to_scores) @@ -178,7 +192,7 @@ def _ensure_loaded(self): # Open tantivy index from stream-backed directory schema_builder = tantivy.SchemaBuilder() - schema_builder.add_unsigned_field("row_id", stored=True, indexed=True, fast=True) + schema_builder.add_unsigned_field("row_id", stored=False, indexed=True, fast=True) schema_builder.add_text_field("text", stored=False) schema = schema_builder.build() diff --git a/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs b/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs index f768a0d8a622..aec47eaa8564 100644 --- a/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs +++ b/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs @@ -23,7 +23,7 @@ use jni::JNIEnv; use std::ptr; use tantivy::collector::TopDocs; use tantivy::query::QueryParser; -use tantivy::schema::{Field, NumericOptions, Schema, TEXT}; +use tantivy::schema::{Field, IndexRecordOption, NumericOptions, Schema, TextFieldIndexing, TextOptions}; use tantivy::{Index, IndexReader, IndexWriter, ReloadPolicy}; use crate::jni_directory::JniDirectory; @@ -56,12 +56,14 @@ fn build_schema() -> (Schema, Field, Field) { let mut builder = Schema::builder(); let row_id_field = builder.add_u64_field( "row_id", - NumericOptions::default() - .set_stored() - .set_indexed() - .set_fast(), + NumericOptions::default().set_indexed().set_fast(), ); - let text_field = builder.add_text_field("text", TEXT); + let text_options = TextOptions::default().set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("default") + .set_index_option(IndexRecordOption::WithFreqsAndPositions), + ); + let text_field = builder.add_text_field("text", text_options); (builder.build(), row_id_field, text_field) }