Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 73 additions & 5 deletions pyprophet/io/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -737,6 +737,16 @@ def export_quant_matrix(self, data: pd.DataFrame) -> pd.DataFrame:
"""
cfg = self.config

# Check if data is empty
if data.empty:
raise ValueError(
"No identification results passed the filtering criteria. "
"The filtered dataset is empty. Please check your filter settings: "
f"max_rs_peakgroup_qvalue={cfg.max_rs_peakgroup_qvalue}, "
f"max_global_peptide_qvalue={cfg.max_global_peptide_qvalue}, "
f"max_global_protein_qvalue={cfg.max_global_protein_qvalue}"
)
Comment thread
singjc marked this conversation as resolved.

sep = "," if cfg.out_type == "csv" else "\t"
level = self.level
normalization = self.config.normalization
Expand Down Expand Up @@ -777,6 +787,13 @@ def _summarize_precursor_level(
# Select top ranking peak group only
idx = data.groupby(["run_id", "transition_group_id"])["m_score"].idxmin()
data = data.loc[idx]

if data.empty:
raise ValueError(
"No data available for precursor-level summarization. "
"This typically occurs when no peak groups pass the q-value thresholds."
)

logger.info("Summarizing to precursor level.")
# Create matrix
matrix = data.pivot_table(
Expand All @@ -801,6 +818,13 @@ def _summarize_peptide_level(
# First get top peak group per precursor
idx = data.groupby(["run_id", "transition_group_id"])["m_score"].idxmin()
data = data.loc[idx]

if data.empty:
raise ValueError(
"No data available after filtering for peptide-level summarization. "
"This typically occurs when no peak groups pass the q-value thresholds."
)

logger.info("Summarizing to peptide level.")
# Get top precursors for each peptide
if consistent_top:
Expand Down Expand Up @@ -830,12 +854,30 @@ def _summarize_peptide_level(
.reset_index(drop=True)
)

if data.empty:
raise ValueError(
"No data available after selecting top precursors for peptide-level summarization. "
"Check that top_n is not too large for your dataset."
)

# Summarize by peptide (mean of top precursors)
peptide_matrix = (
data.groupby(["Sequence", "FullPeptideName", "filename"])["Intensity"]
.mean()
.unstack()
).reset_index()
# Group and aggregate
grouped = data.groupby(["Sequence", "FullPeptideName", "filename"])["Intensity"].mean()

# Unstack and reset index carefully to avoid column name conflicts
try:
peptide_matrix = grouped.unstack(fill_value=None)
# Reset index to convert index columns to regular columns
peptide_matrix = peptide_matrix.reset_index()
except ValueError as e:
if "cannot insert" in str(e):
raise ValueError(
"Failed to create quantification matrix because a run filename "
"conflicts with reserved column names ('Sequence' or 'FullPeptideName'). "
"Please rename the conflicting input file or adjust the data before summarization."
) from e
raise

return peptide_matrix

def _summarize_protein_level(
Expand Down Expand Up @@ -864,6 +906,13 @@ def _summarize_protein_level(
right_on="FullPeptideName",
how="left",
)

if protein_map.empty or protein_matrix["ProteinName"].isna().all():
raise ValueError(
"No protein data available after mapping peptides to proteins. "
"Check that protein annotations are present in the data."
)

protein_matrix = protein_matrix.explode("ProteinName")

if consistent_top:
Expand All @@ -888,6 +937,12 @@ def _summarize_protein_level(
protein_matrix = (
protein_matrix.groupby("ProteinName").mean(numeric_only=True).reset_index()
)

if protein_matrix.empty:
raise ValueError(
"No data available after protein-level summarization. "
"This may indicate all proteins were filtered out."
)

return protein_matrix

Expand Down Expand Up @@ -917,6 +972,13 @@ def _summarize_gene_level(
right_on="FullPeptideName",
how="left",
)

if gene_map.empty or gene_matrix["Gene"].isna().all():
raise ValueError(
"No gene data available after mapping peptides to genes. "
"Check that gene annotations are present in the data."
)

Comment thread
singjc marked this conversation as resolved.
gene_matrix = gene_matrix.explode("Gene")

if consistent_top:
Expand All @@ -939,6 +1001,12 @@ def _summarize_gene_level(

# Summarize by gene (mean of top peptides)
gene_matrix = gene_matrix.groupby("Gene").mean(numeric_only=True).reset_index()

if gene_matrix.empty:
raise ValueError(
"No data available after gene-level summarization. "
"This may indicate all genes were filtered out."
)

return gene_matrix

Expand Down
Loading