diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py
index 142623dd..8d1fde14 100644
--- a/src/mavedb/lib/score_sets.py
+++ b/src/mavedb/lib/score_sets.py
@@ -2,7 +2,7 @@
 import io
 import logging
 import re
-from collections import Counter
+from collections import Counter, defaultdict
 from operator import attrgetter
 from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, List, Literal, Optional, Sequence
 
@@ -54,7 +54,7 @@
 from mavedb.models.uniprot_offset import UniprotOffset
 from mavedb.models.user import User
 from mavedb.models.variant import Variant
-from mavedb.view_models.search import ScoreSetsSearch
+from mavedb.view_models.search import ScoreSetsSearch, ControlledKeywordFilterOption
 
 if TYPE_CHECKING:
     from mavedb.lib.permissions import Action
@@ -216,16 +216,21 @@ def build_search_score_sets_query_filter(
             )
         )
 
-    if search.keywords:
-        query = query.filter(
-            ScoreSet.experiment.has(
-                Experiment.keyword_objs.any(
-                    ExperimentControlledKeywordAssociation.controlled_keyword.has(
-                        ControlledKeyword.label.in_(search.keywords)
+    if search.controlled_keywords:
+        for item in search.controlled_keywords:
+            query = query.filter(
+                ScoreSet.experiment.has(
+                    Experiment.keyword_objs.any(
+                        ExperimentControlledKeywordAssociation.controlled_keyword.has(
+                            and_(
+                                ControlledKeyword.key == item.key,
+                                ControlledKeyword.label == item.label,
+                            )
+                        )
                     )
                 )
             )
-        )
+
     return query
 
 
@@ -334,6 +339,8 @@ def fetch_score_set_search_filter_options(
     publication_author_name_counter: Counter[str] = Counter()
     publication_db_name_counter: Counter[str] = Counter()
     publication_journal_counter: Counter[str] = Counter()
+    # Controlled keywords related counters
+    controlled_keywords_counter: dict[str, Counter[str]] = defaultdict(Counter)
 
     # --- PERFORMANCE NOTE ---
     # The following counter construction loop is a bottleneck for large score set queries.
@@ -388,6 +395,23 @@ def fetch_score_set_search_filter_options(
             if journal:
                 publication_journal_counter[journal] += 1
 
+        # Controlled keywords related options
+        for controlled_keyword in getattr(score_set.experiment, "keyword_objs", []):
+            keyword = getattr(controlled_keyword, "controlled_keyword", [])
+            if not keyword:
+                continue
+            key = getattr(keyword, "key", None)
+            label = getattr(keyword, "label", None)
+            if key and label:
+                controlled_keywords_counter[key][label] += 1
+
+    controlled_keywords_counter_list = []
+    for key, label_counter in controlled_keywords_counter.items():
+        for label, count in label_counter.items():
+            controlled_keywords_counter_list.append(
+                ControlledKeywordFilterOption(key=key, value=label, count=count)
+            )
+
     logger.debug(msg="Score set search filter options were fetched.", extra=logging_context())
 
     return {
@@ -398,6 +422,7 @@ def fetch_score_set_search_filter_options(
         "publication_author_names": score_set_search_filter_options_from_counter(publication_author_name_counter),
         "publication_db_names": score_set_search_filter_options_from_counter(publication_db_name_counter),
         "publication_journals": score_set_search_filter_options_from_counter(publication_journal_counter),
+        "controlled_keywords": controlled_keywords_counter_list,
     }
 
 
diff --git a/src/mavedb/view_models/search.py b/src/mavedb/view_models/search.py
index 712cb5e3..846ebfc2 100644
--- a/src/mavedb/view_models/search.py
+++ b/src/mavedb/view_models/search.py
@@ -1,9 +1,15 @@
 from typing import Optional
+from pydantic import model_validator
 
 from mavedb.view_models.base.base import BaseModel
 from mavedb.view_models.score_set import ShortScoreSet
 
 
+class ControlledKeywordSearch(BaseModel):
+    key: str
+    label: str
+
+
 class ExperimentsSearch(BaseModel):
     published: Optional[bool] = None
     authors: Optional[list[str]] = None
@@ -25,12 +31,22 @@ class ScoreSetsSearch(BaseModel):
     databases: Optional[list[str]] = None
     journals: Optional[list[str]] = None
     publication_identifiers: Optional[list[str]] = None
-    keywords: Optional[list[str]] = None
+    controlled_keywords: Optional[list[ControlledKeywordSearch]] = None
     text: Optional[str] = None
     include_experiment_score_set_urns_and_count: Optional[bool] = True
     offset: Optional[int] = None
     limit: Optional[int] = None
+    # TODO#XXX - Remove validator after consumers have had a chance to update
+    @model_validator(mode="before")
+    @classmethod
+    def reject_deprecated_keywords(cls, data):
+        if isinstance(data, dict) and ("keywords" in data or "Keywords" in data):
+            raise ValueError(
+                "'keywords' is no longer supported. Use 'controlled_keywords' with "
+                "a list of {key, label} objects to filter by specific keyword groups."
+            )
+        return data
 
 
 class ScoreSetsSearchResponse(BaseModel):
     score_sets: list[ShortScoreSet]
@@ -40,6 +56,15 @@ class Config:
         from_attributes = True
 
 
+class ControlledKeywordFilterOption(BaseModel):
+    key: str
+    value: str
+    count: int
+
+    class Config:
+        from_attributes = True
+
+
 class ScoreSetsSearchFilterOption(BaseModel):
     value: str
     count: int
@@ -56,6 +81,7 @@ class ScoreSetsSearchFilterOptionsResponse(BaseModel):
     publication_author_names: list[ScoreSetsSearchFilterOption]
     publication_db_names: list[ScoreSetsSearchFilterOption]
     publication_journals: list[ScoreSetsSearchFilterOption]
+    controlled_keywords: list[ControlledKeywordFilterOption]
 
     class Config:
         from_attributes = True
diff --git a/tests/helpers/constants.py b/tests/helpers/constants.py
index e06d07a1..57dfbcb5 100644
--- a/tests/helpers/constants.py
+++ b/tests/helpers/constants.py
@@ -2184,7 +2184,9 @@
     "databases": ["uniprot"],
     "journals": ["biomed"],
     "publication_identifiers": ["12345678"],
-    "keywords": ["keyword"],
+    "controlled_keywords": [
+        {"key": "keyword_key", "label": "keyword_label"}
+    ],
     "text": "testtesttest",
 }
 
diff --git a/tests/lib/test_score_set.py b/tests/lib/test_score_set.py
index 0b7852da..f74ae57b 100644
--- a/tests/lib/test_score_set.py
+++ b/tests/lib/test_score_set.py
@@ -391,6 +391,7 @@ def test_fetch_score_set_search_filter_options_no_score_sets(setup_lib_db, sessi
     filter_options = fetch_score_set_search_filter_options(session, None, None, score_set_search)
 
     assert filter_options == {
+        "controlled_keywords": [],
         "target_gene_categories": [],
         "target_gene_names": [],
         "target_organism_names": [],
@@ -437,6 +438,7 @@ def test_fetch_score_set_search_filter_options_with_score_set(setup_lib_db, sess
     filter_options = fetch_score_set_search_filter_options(session, user_data, None, score_set_search)
 
     assert filter_options == {
+        "controlled_keywords": [],
         "target_gene_categories": [{"value": TargetCategory.protein_coding, "count": 1}],
         "target_gene_names": [{"value": "TEST2", "count": 1}],
         "target_organism_names": [],
@@ -511,6 +513,7 @@ def test_fetch_score_set_search_filter_options_with_partial_filtered_score_sets(
     user_data = UserData(user=requesting_user, active_roles=[])
     filter_options = fetch_score_set_search_filter_options(session, user_data, None, score_set_search)
     assert filter_options == {
+        "controlled_keywords": [],
         "target_gene_categories": [{"value": TargetCategory.protein_coding, "count": 1}],
         "target_gene_names": [{"value": "TEST1", "count": 1}],
         "target_organism_names": [{"count": 1, "value": "Organism name"}],
@@ -528,6 +531,7 @@ def test_fetch_score_set_search_filter_options_with_no_matching_score_sets(setup
     filter_options = fetch_score_set_search_filter_options(session, user_data, None, score_set_search)
 
     assert filter_options == {
+        "controlled_keywords": [],
         "target_gene_categories": [],
         "target_gene_names": [],
         "target_organism_names": [],
@@ -543,6 +547,7 @@ def test_fetch_score_set_search_filter_options_with_no_permitted_score_sets(setu
     filter_options = fetch_score_set_search_filter_options(session, None, None, score_set_search)
 
     assert filter_options == {
+        "controlled_keywords": [],
         "target_gene_categories": [],
         "target_gene_names": [],
         "target_organism_names": [],
diff --git a/tests/view_models/test_search.py b/tests/view_models/test_search.py
index 5c86e299..d0b9366d 100644
--- a/tests/view_models/test_search.py
+++ b/tests/view_models/test_search.py
@@ -14,7 +14,11 @@ def test_populated_experiment_search():
 
 def test_populated_score_set_search():
     score_set_search = ScoreSetsSearch(**TEST_POPULATED_SCORE_SET_SEARCH)
-    assert all(score_set_search.__getattribute__(k) == v for k, v in TEST_POPULATED_SCORE_SET_SEARCH.items())
+    for k, v in TEST_POPULATED_SCORE_SET_SEARCH.items():
+        if k == "controlled_keywords":
+            assert [item.model_dump() for item in score_set_search.controlled_keywords] == v
+        else:
+            assert getattr(score_set_search, k) == v
 
 
 def test_populated_text_search():