From bdb43c860c056b2267b066c1a36cbc991da17d5a Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Sun, 19 Apr 2026 18:52:06 +0800 Subject: [PATCH 1/3] Fix unnecessary .store file generation in full-text index --- .../tantivy_full_text_global_index_reader.py | 26 ++++++++++++++----- .../TantivyFullTextGlobalIndexWriter.java | 2 +- .../paimon-tantivy-jni/rust/src/lib.rs | 5 +--- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py b/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py index c8b1711b59da..d8b422ff707c 100644 --- a/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py +++ b/paimon-python/pypaimon/globalindex/tantivy/tantivy_full_text_global_index_reader.py @@ -152,13 +152,27 @@ def visit_full_text_search(self, full_text_search) -> Optional[ScoredGlobalIndex searcher = self._searcher query = self._index.parse_query(query_text, ["text"]) - results = searcher.search(query, limit) + scored_results = searcher.search(query, limit) + if not scored_results.hits: + return DictBasedScoredIndexResult({}) + + addr_to_score: Dict[tuple, float] = { + (addr.segment_ord, addr.doc): score + for score, addr in scored_results.hits + } + + # This can be replaced by https://github.com/quickwit-oss/tantivy-py/pull/641 when it's released + all_by_rowid = searcher.search(query, searcher.num_docs, order_by_field='row_id') id_to_scores: Dict[int, float] = {} - for score, doc_address in results.hits: - doc = searcher.doc(doc_address) - row_id = doc["row_id"][0] - id_to_scores[row_id] = score + remaining = len(addr_to_score) + for row_id, addr in all_by_rowid.hits: + score = addr_to_score.get((addr.segment_ord, addr.doc)) + if score is not None: + id_to_scores[row_id] = score + remaining -= 1 + if remaining == 0: + break return DictBasedScoredIndexResult(id_to_scores) @@ -178,7 +192,7 @@ def _ensure_loaded(self): # Open tantivy index from stream-backed directory schema_builder = tantivy.SchemaBuilder() - schema_builder.add_unsigned_field("row_id", stored=True, indexed=True, fast=True) + schema_builder.add_unsigned_field("row_id", stored=False, indexed=True, fast=True) schema_builder.add_text_field("text", stored=False) schema = schema_builder.build() diff --git a/paimon-tantivy/paimon-tantivy-index/src/main/java/org/apache/paimon/tantivy/index/TantivyFullTextGlobalIndexWriter.java b/paimon-tantivy/paimon-tantivy-index/src/main/java/org/apache/paimon/tantivy/index/TantivyFullTextGlobalIndexWriter.java index 6def9cb69e4f..5cfaaf2a09e4 100644 --- a/paimon-tantivy/paimon-tantivy-index/src/main/java/org/apache/paimon/tantivy/index/TantivyFullTextGlobalIndexWriter.java +++ b/paimon-tantivy/paimon-tantivy-index/src/main/java/org/apache/paimon/tantivy/index/TantivyFullTextGlobalIndexWriter.java @@ -134,7 +134,7 @@ private ResultEntry packIndex() throws IOException { // Filter to regular files only before writing count List indexFiles = new ArrayList<>(); for (File file : allFiles) { - if (file.isFile()) { + if (file.isFile() && !file.getName().endsWith(".store")) { indexFiles.add(file); } } diff --git a/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs b/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs index f768a0d8a622..1c0f7b7ad144 100644 --- a/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs +++ b/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs @@ -56,10 +56,7 @@ fn build_schema() -> (Schema, Field, Field) { let mut builder = Schema::builder(); let row_id_field = builder.add_u64_field( "row_id", - NumericOptions::default() - .set_stored() - .set_indexed() - .set_fast(), + NumericOptions::default().set_indexed().set_fast(), ); let text_field = builder.add_text_field("text", TEXT); (builder.build(), row_id_field, text_field) From 86025b5f635cbdd1108b1b51670b8360047cddba Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Sun, 19 Apr 2026 20:27:12 +0800 Subject: [PATCH 2/3] Fix unnecessary .store file generation in full-text index --- paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs b/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs index 1c0f7b7ad144..aec47eaa8564 100644 --- a/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs +++ b/paimon-tantivy/paimon-tantivy-jni/rust/src/lib.rs @@ -23,7 +23,7 @@ use jni::JNIEnv; use std::ptr; use tantivy::collector::TopDocs; use tantivy::query::QueryParser; -use tantivy::schema::{Field, NumericOptions, Schema, TEXT}; +use tantivy::schema::{Field, IndexRecordOption, NumericOptions, Schema, TextFieldIndexing, TextOptions}; use tantivy::{Index, IndexReader, IndexWriter, ReloadPolicy}; use crate::jni_directory::JniDirectory; @@ -58,7 +58,12 @@ fn build_schema() -> (Schema, Field, Field) { "row_id", NumericOptions::default().set_indexed().set_fast(), ); - let text_field = builder.add_text_field("text", TEXT); + let text_options = TextOptions::default().set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("default") + .set_index_option(IndexRecordOption::WithFreqsAndPositions), + ); + let text_field = builder.add_text_field("text", text_options); (builder.build(), row_id_field, text_field) } From da0833cbb14a1ea5b374da1ad7d55b2365371f9b Mon Sep 17 00:00:00 2001 From: ChengHui Chen <27797326+chenghuichen@users.noreply.github.com> Date: Sun, 19 Apr 2026 21:35:59 +0800 Subject: [PATCH 3/3] Fix unnecessary .store file generation in full-text index --- .../paimon/tantivy/index/TantivyFullTextGlobalIndexWriter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paimon-tantivy/paimon-tantivy-index/src/main/java/org/apache/paimon/tantivy/index/TantivyFullTextGlobalIndexWriter.java b/paimon-tantivy/paimon-tantivy-index/src/main/java/org/apache/paimon/tantivy/index/TantivyFullTextGlobalIndexWriter.java index 5cfaaf2a09e4..6def9cb69e4f 100644 --- a/paimon-tantivy/paimon-tantivy-index/src/main/java/org/apache/paimon/tantivy/index/TantivyFullTextGlobalIndexWriter.java +++ b/paimon-tantivy/paimon-tantivy-index/src/main/java/org/apache/paimon/tantivy/index/TantivyFullTextGlobalIndexWriter.java @@ -134,7 +134,7 @@ private ResultEntry packIndex() throws IOException { // Filter to regular files only before writing count List indexFiles = new ArrayList<>(); for (File file : allFiles) { - if (file.isFile() && !file.getName().endsWith(".store")) { + if (file.isFile()) { indexFiles.add(file); } }