From c1c4feb3d72f7de17d40ba5646092d318ac0afe3 Mon Sep 17 00:00:00 2001 From: Estelle Da Date: Fri, 10 Apr 2026 17:41:58 +1000 Subject: [PATCH 1/8] Add controlled keywords to score set search filter options. --- src/mavedb/lib/score_sets.py | 15 +++++++++++++-- src/mavedb/view_models/search.py | 3 ++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index 142623dd2..cedccef3f 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -216,12 +216,12 @@ def build_search_score_sets_query_filter( ) ) - if search.keywords: + if search.controlled_keywords: query = query.filter( ScoreSet.experiment.has( Experiment.keyword_objs.any( ExperimentControlledKeywordAssociation.controlled_keyword.has( - ControlledKeyword.label.in_(search.keywords) + ControlledKeyword.label.in_(search.controlled_keywords) ) ) ) @@ -334,6 +334,9 @@ def fetch_score_set_search_filter_options( publication_author_name_counter: Counter[str] = Counter() publication_db_name_counter: Counter[str] = Counter() publication_journal_counter: Counter[str] = Counter() + # Controlled keywords related counters + controlled_keywords_label_counter: Counter[str] = Counter() + # --- PERFORMANCE NOTE --- # The following counter construction loop is a bottleneck for large score set queries. @@ -388,6 +391,13 @@ def fetch_score_set_search_filter_options( if journal: publication_journal_counter[journal] += 1 + # Controlled keywords related options + for controlled_keyword in getattr(score_set.experiment, "keyword_objs", []): + keyword = getattr(controlled_keyword, "controlled_keyword", []) + label = getattr(keyword, "label", None) + if label: + controlled_keywords_label_counter[label] += 1 + logger.debug(msg="Score set search filter options were fetched.", extra=logging_context()) return { @@ -398,6 +408,7 @@ def fetch_score_set_search_filter_options( "publication_author_names": score_set_search_filter_options_from_counter(publication_author_name_counter), "publication_db_names": score_set_search_filter_options_from_counter(publication_db_name_counter), "publication_journals": score_set_search_filter_options_from_counter(publication_journal_counter), + "controlled_keywords": score_set_search_filter_options_from_counter(controlled_keywords_label_counter), } diff --git a/src/mavedb/view_models/search.py b/src/mavedb/view_models/search.py index 712cb5e32..09e7ff00d 100644 --- a/src/mavedb/view_models/search.py +++ b/src/mavedb/view_models/search.py @@ -25,7 +25,7 @@ class ScoreSetsSearch(BaseModel): databases: Optional[list[str]] = None journals: Optional[list[str]] = None publication_identifiers: Optional[list[str]] = None - keywords: Optional[list[str]] = None + controlled_keywords: Optional[list[str]] = None text: Optional[str] = None include_experiment_score_set_urns_and_count: Optional[bool] = True offset: Optional[int] = None @@ -56,6 +56,7 @@ class ScoreSetsSearchFilterOptionsResponse(BaseModel): publication_author_names: list[ScoreSetsSearchFilterOption] publication_db_names: list[ScoreSetsSearchFilterOption] publication_journals: list[ScoreSetsSearchFilterOption] + controlled_keywords: list[ScoreSetsSearchFilterOption] class Config: from_attributes = True From 527847ea7558b4c134b60bcc0bccaa04072d0431 Mon Sep 17 00:00:00 2001 From: Estelle Da Date: Fri, 10 Apr 2026 17:45:24 +1000 Subject: [PATCH 2/8] Modify keywords to controlled_keywords in constant test. --- tests/helpers/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/helpers/constants.py b/tests/helpers/constants.py index e06d07a12..b39691a32 100644 --- a/tests/helpers/constants.py +++ b/tests/helpers/constants.py @@ -2184,7 +2184,7 @@ "databases": ["uniprot"], "journals": ["biomed"], "publication_identifiers": ["12345678"], - "keywords": ["keyword"], + "controlled_keywords": ["keyword"], "text": "testtesttest", } From a04442e76884d65131b60923a4cc7ce7a25c179a Mon Sep 17 00:00:00 2001 From: Estelle Da Date: Fri, 10 Apr 2026 18:19:15 +1000 Subject: [PATCH 3/8] Add controlled_keywords in some tests. --- tests/lib/test_score_set.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/lib/test_score_set.py b/tests/lib/test_score_set.py index 0b7852dae..f74ae57ba 100644 --- a/tests/lib/test_score_set.py +++ b/tests/lib/test_score_set.py @@ -391,6 +391,7 @@ def test_fetch_score_set_search_filter_options_no_score_sets(setup_lib_db, sessi filter_options = fetch_score_set_search_filter_options(session, None, None, score_set_search) assert filter_options == { + "controlled_keywords": [], "target_gene_categories": [], "target_gene_names": [], "target_organism_names": [], @@ -437,6 +438,7 @@ def test_fetch_score_set_search_filter_options_with_score_set(setup_lib_db, sess filter_options = fetch_score_set_search_filter_options(session, user_data, None, score_set_search) assert filter_options == { + "controlled_keywords": [], "target_gene_categories": [{"value": TargetCategory.protein_coding, "count": 1}], "target_gene_names": [{"value": "TEST2", "count": 1}], "target_organism_names": [], @@ -511,6 +513,7 @@ def test_fetch_score_set_search_filter_options_with_partial_filtered_score_sets( user_data = UserData(user=requesting_user, active_roles=[]) filter_options = fetch_score_set_search_filter_options(session, user_data, None, score_set_search) assert filter_options == { + "controlled_keywords": [], "target_gene_categories": [{"value": TargetCategory.protein_coding, "count": 1}], "target_gene_names": [{"value": "TEST1", "count": 1}], "target_organism_names": [{"count": 1, "value": "Organism name"}], @@ -528,6 +531,7 @@ def test_fetch_score_set_search_filter_options_with_no_matching_score_sets(setup filter_options = fetch_score_set_search_filter_options(session, user_data, None, score_set_search) assert filter_options == { + "controlled_keywords": [], "target_gene_categories": [], "target_gene_names": [], "target_organism_names": [], @@ -543,6 +547,7 @@ def test_fetch_score_set_search_filter_options_with_no_permitted_score_sets(setu filter_options = fetch_score_set_search_filter_options(session, None, None, score_set_search) assert filter_options == { + "controlled_keywords": [], "target_gene_categories": [], "target_gene_names": [], "target_organism_names": [], From 02a2d76cab20547b3a9addff6271b701a328fc68 Mon Sep 17 00:00:00 2001 From: Estelle Da Date: Thu, 16 Apr 2026 12:32:23 +1000 Subject: [PATCH 4/8] Count the controlled keywords by key and labels. --- src/mavedb/lib/score_sets.py | 23 ++++++++++++++++------- src/mavedb/view_models/search.py | 11 ++++++++++- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index cedccef3f..f6fa4cb8e 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -2,7 +2,7 @@ import io import logging import re -from collections import Counter +from collections import Counter, defaultdict from operator import attrgetter from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Literal, Optional, Sequence @@ -54,7 +54,7 @@ from mavedb.models.uniprot_offset import UniprotOffset from mavedb.models.user import User from mavedb.models.variant import Variant -from mavedb.view_models.search import ScoreSetsSearch +from mavedb.view_models.search import ScoreSetsSearch, ControlledKeywordFilterOption if TYPE_CHECKING: from mavedb.lib.permissions import Action @@ -335,8 +335,7 @@ def fetch_score_set_search_filter_options( publication_db_name_counter: Counter[str] = Counter() publication_journal_counter: Counter[str] = Counter() # Controlled keywords related counters - controlled_keywords_label_counter: Counter[str] = Counter() - + controlled_keywords_counter: dict[str, Counter[str]] = defaultdict(Counter) # --- PERFORMANCE NOTE --- # The following counter construction loop is a bottleneck for large score set queries. @@ -394,9 +393,19 @@ def fetch_score_set_search_filter_options( # Controlled keywords related options for controlled_keyword in getattr(score_set.experiment, "keyword_objs", []): keyword = getattr(controlled_keyword, "controlled_keyword", []) + if not keyword: + continue + key = getattr(keyword, "key", None) label = getattr(keyword, "label", None) - if label: - controlled_keywords_label_counter[label] += 1 + if key and label: + controlled_keywords_counter[key][label] += 1 + + controlled_keywords_counter_list = [] + for key, label_counter in controlled_keywords_counter.items(): + for label, count in label_counter.items(): + controlled_keywords_counter_list.append( + ControlledKeywordFilterOption(key=key, value=label, count=count) + ) logger.debug(msg="Score set search filter options were fetched.", extra=logging_context()) @@ -408,7 +417,7 @@ def fetch_score_set_search_filter_options( "publication_author_names": score_set_search_filter_options_from_counter(publication_author_name_counter), "publication_db_names": score_set_search_filter_options_from_counter(publication_db_name_counter), "publication_journals": score_set_search_filter_options_from_counter(publication_journal_counter), - "controlled_keywords": score_set_search_filter_options_from_counter(controlled_keywords_label_counter), + "controlled_keywords": controlled_keywords_counter_list, } diff --git a/src/mavedb/view_models/search.py b/src/mavedb/view_models/search.py index 09e7ff00d..a47b9392e 100644 --- a/src/mavedb/view_models/search.py +++ b/src/mavedb/view_models/search.py @@ -40,6 +40,15 @@ class Config: from_attributes = True +class ControlledKeywordFilterOption(BaseModel): + key: str + value: str + count: int + + class Config: + from_attributes = True + + class ScoreSetsSearchFilterOption(BaseModel): value: str count: int @@ -56,7 +65,7 @@ class ScoreSetsSearchFilterOptionsResponse(BaseModel): publication_author_names: list[ScoreSetsSearchFilterOption] publication_db_names: list[ScoreSetsSearchFilterOption] publication_journals: list[ScoreSetsSearchFilterOption] - controlled_keywords: list[ScoreSetsSearchFilterOption] + controlled_keywords: list[ControlledKeywordFilterOption] class Config: from_attributes = True From 650221b40cfa263a205cdaf8401e9b5281950e5d Mon Sep 17 00:00:00 2001 From: Estelle Da Date: Tue, 21 Apr 2026 12:31:29 +1000 Subject: [PATCH 5/8] Debug the build_search_score_sets_query_filter by controlled keyword part. --- src/mavedb/lib/score_sets.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index f6fa4cb8e..8ca3f63d5 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -217,15 +217,17 @@ def build_search_score_sets_query_filter( ) if search.controlled_keywords: - query = query.filter( - ScoreSet.experiment.has( - Experiment.keyword_objs.any( - ExperimentControlledKeywordAssociation.controlled_keyword.has( - ControlledKeyword.label.in_(search.controlled_keywords) + for label in search.controlled_keywords: + query = query.filter( + ScoreSet.experiment.has( + Experiment.keyword_objs.any( + ExperimentControlledKeywordAssociation.controlled_keyword.has( + ControlledKeyword.label == label + ) ) ) ) - ) + return query From 71df148f0b814ea84a07a00c4c64c1aec80738ba Mon Sep 17 00:00:00 2001 From: Estelle Da Date: Fri, 24 Apr 2026 16:31:08 +1000 Subject: [PATCH 6/8] Debug searching keywords 'Other' --- src/mavedb/lib/score_sets.py | 7 +++++-- src/mavedb/view_models/search.py | 7 ++++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index 8ca3f63d5..8d1fde14e 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -217,12 +217,15 @@ def build_search_score_sets_query_filter( ) if search.controlled_keywords: - for label in search.controlled_keywords: + for item in search.controlled_keywords: query = query.filter( ScoreSet.experiment.has( Experiment.keyword_objs.any( ExperimentControlledKeywordAssociation.controlled_keyword.has( - ControlledKeyword.label == label + and_( + ControlledKeyword.key == item.key, + ControlledKeyword.label == item.label, + ) ) ) ) diff --git a/src/mavedb/view_models/search.py b/src/mavedb/view_models/search.py index a47b9392e..eb9e0c5a1 100644 --- a/src/mavedb/view_models/search.py +++ b/src/mavedb/view_models/search.py @@ -4,6 +4,11 @@ from mavedb.view_models.score_set import ShortScoreSet +class ControlledKeywordSearch(BaseModel): + key: str + label: str + + class ExperimentsSearch(BaseModel): published: Optional[bool] = None authors: Optional[list[str]] = None @@ -25,7 +30,7 @@ class ScoreSetsSearch(BaseModel): databases: Optional[list[str]] = None journals: Optional[list[str]] = None publication_identifiers: Optional[list[str]] = None - controlled_keywords: Optional[list[str]] = None + controlled_keywords: Optional[list[ControlledKeywordSearch]] = None text: Optional[str] = None include_experiment_score_set_urns_and_count: Optional[bool] = True offset: Optional[int] = None From a676fad8820b8cf1c0712038e1b6211752495d20 Mon Sep 17 00:00:00 2001 From: Estelle Da Date: Fri, 24 Apr 2026 16:50:35 +1000 Subject: [PATCH 7/8] Debug test_populated_score_set_search and related part. --- tests/helpers/constants.py | 4 +++- tests/view_models/test_search.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/helpers/constants.py b/tests/helpers/constants.py index b39691a32..57dfbcb53 100644 --- a/tests/helpers/constants.py +++ b/tests/helpers/constants.py @@ -2184,7 +2184,9 @@ "databases": ["uniprot"], "journals": ["biomed"], "publication_identifiers": ["12345678"], - "controlled_keywords": ["keyword"], + "controlled_keywords": [ + {"key": "keyword_key", "label": "keyword_label"} + ], "text": "testtesttest", } diff --git a/tests/view_models/test_search.py b/tests/view_models/test_search.py index 5c86e2991..d0b9366d0 100644 --- a/tests/view_models/test_search.py +++ b/tests/view_models/test_search.py @@ -14,7 +14,11 @@ def test_populated_experiment_search(): def test_populated_score_set_search(): score_set_search = ScoreSetsSearch(**TEST_POPULATED_SCORE_SET_SEARCH) - assert all(score_set_search.__getattribute__(k) == v for k, v in TEST_POPULATED_SCORE_SET_SEARCH.items()) + for k, v in TEST_POPULATED_SCORE_SET_SEARCH.items(): + if k == "controlled_keywords": + assert [item.model_dump() for item in score_set_search.controlled_keywords] == v + else: + assert getattr(score_set_search, k) == v def test_populated_text_search(): From 909c10c64b826fbfb1e605bf2a9424b1382b0f08 Mon Sep 17 00:00:00 2001 From: Estelle Da Date: Fri, 1 May 2026 16:29:36 +1000 Subject: [PATCH 8/8] Add a model validator for old keywords. --- src/mavedb/view_models/search.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/mavedb/view_models/search.py b/src/mavedb/view_models/search.py index eb9e0c5a1..846ebfc2c 100644 --- a/src/mavedb/view_models/search.py +++ b/src/mavedb/view_models/search.py @@ -1,4 +1,5 @@ from typing import Optional +from pydantic import model_validator from mavedb.view_models.base.base import BaseModel from mavedb.view_models.score_set import ShortScoreSet @@ -36,6 +37,16 @@ class ScoreSetsSearch(BaseModel): offset: Optional[int] = None limit: Optional[int] = None + # TODO#XXX - Remove validator after consumers have had a chance to update + @model_validator(mode="before") + @classmethod + def reject_deprecated_keywords(cls, data): + if isinstance(data, dict) and ("keywords" in data or "Keywords" in data): + raise ValueError( + "'keywords' is no longer supported. Use 'controlled_keywords' with " + "a list of {key, label} objects to filter by specific keyword groups." + ) + return data class ScoreSetsSearchResponse(BaseModel): score_sets: list[ShortScoreSet]