From e9aab2f5968563aff04cf5731d08ad966ea25111 Mon Sep 17 00:00:00 2001 From: Rich Abdill Date: Wed, 22 Apr 2026 00:06:10 -0400 Subject: [PATCH 1/8] Cleaning up to add projection --- LICENSE.md | 2 +- R/loader.R | 80 ++++++++++++++++++++++++++++++------------------------ 2 files changed, 45 insertions(+), 37 deletions(-) diff --git a/LICENSE.md b/LICENSE.md index d4e83a1..9691111 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,6 @@ # MIT License -Copyright (c) 2023 MicroBioMap authors +Copyright (c) 2026 The University of Chicago Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/R/loader.R b/R/loader.R index 2b42d09..79b6917 100644 --- a/R/loader.R +++ b/R/loader.R @@ -1,4 +1,28 @@ -.getVersions <- function(bfc, verbose=FALSE) { +#' load all compendium data into a TreeSummarizedExperiment +#' +#' @param bfc BiocFileCache object to use +#' +#' @returns a `TreeSummarizedExperiment` +#' +#' @importFrom data.table fread setkey +#' @importClassesFrom Matrix TsparseMatrix +#' @import TreeSummarizedExperiment +#' @import R.utils +#' @import ape +#' @importFrom BiocFileCache BiocFileCache bfcrpath bfcquery bfcnew +#' +#' @export +#' +#' @examples +#' cpd <- getCompendium() +#' +#' dim(cpd) +#' cpd +#' assayNames(cpd) +#' head(colData(cpd)) +#' + +.getVersions <- function(bfc, entry, verbose=FALSE) { # Determines the most recent version of the compendium # and retrieves the manifest that describes all available releases. # Returns a data.table listing all versions and the necessary URLs @@ -7,12 +31,12 @@ # Check if we've already cached the manifest. # If not, make an HTTP call to get the URL we need - rpath <- BiocFileCache::bfcquery(bfc, 'manifest')$rpath + rpath <- BiocFileCache::bfcquery(bfc, entry)$rpath if(length(rpath) == 0) { if(verbose) { print('Retrieving version information.') } - resolve <- curl::curl_fetch_memory(canonical_doi) + resolve <- curl::curl_fetch_memory(canonical_doi[entry]) if(resolve$status_code != 200) { stop(paste0( 'Could not resolve canonical DOI. Status code: ', @@ -31,15 +55,22 @@ bfcrpath(bfc, manifest) }, error = function(msg){ - print('Could not retrieve manifest file. Falling back to manifest as of v1.1.0') + print('Could not retrieve manifest file. Falling back to manifest as of April 2026.') towrite <- data.table::data.table( version = c('1.1.0', '1.0.1'), zenodo_id = c('13733642', '10452633'), - default = c(TRUE, FALSE) - ) + if(entry == 'projection') { + print('Falling back to projection manifest.' + towrite <- data.table::data.table( + version = c('0.2.0', '0.1.0'), + zenodo_id = c('19633215', '19631961'), + default = c(TRUE, FALSE) + ) + } + # we save this to the cache so the app remembers not to keep looking online # for a manifest every time the version information is needed - savepath <- BiocFileCache::bfcnew(bfc, 'manifest', ext='.csv') + savepath <- BiocFileCache::bfcnew(bfc, entry, ext='.csv') data.table::fwrite(towrite, file=savepath) savepath } @@ -59,46 +90,23 @@ results } -.getCompendiumData <- function(version, bfc) { - versions <- .getVersions(bfc) +.getProjectData <- function(version, dataset, bfc) { + # dataset in ('compendium', 'projection') + versions <- .getVersions(bfc, dataset) rpath <- bfcrpath(bfc, versions[version]$data_url) data.table::fread(rpath) } .getCompendiumColdata <- function(version, bfc) { - versions <- .getVersions(bfc) + versions <- .getVersions(bfc, 'compendium') rpath <- bfcrpath(bfc, versions[version]$coldata_url) sampdat <- as.data.frame(data.table::fread(rpath)) rownames(sampdat) <- paste(sampdat[[2]], sampdat[[3]], sep = "_") sampdat } -#' load all compendium data into a TreeSummarizedExperiment -#' -#' @param bfc BiocFileCache object to use -#' -#' @returns a `TreeSummarizedExperiment` -#' -#' @importFrom data.table fread setkey -#' @importClassesFrom Matrix TsparseMatrix -#' @import TreeSummarizedExperiment -#' @import R.utils -#' @import ape -#' @importFrom BiocFileCache BiocFileCache bfcrpath bfcquery bfcnew -#' -#' @export -#' -#' @examples -#' cpd <- getCompendium() -#' -#' dim(cpd) -#' cpd -#' assayNames(cpd) -#' head(colData(cpd)) -#' - getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) { - versions <- .getVersions(bfc) + versions <- .getVersions(bfc, 'compendium') if(is.na(version)) { # If the user has not specified a version, grab whichever @@ -106,7 +114,7 @@ getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) { version <- versions[versions$default,]$version[1] } print(paste('Retrieving compendium version',version)) - dat <-.getCompendiumData(version, bfc) + dat <-.getProjectData(version, 'compendium', bfc) coldat <- .getCompendiumColdata(version, bfc) sampnames <- dat[[2]] From 424c3269744971093da9874739d538e94e523df8 Mon Sep 17 00:00:00 2001 From: Rich Abdill Date: Wed, 22 Apr 2026 12:24:31 -0400 Subject: [PATCH 2/8] Projection methods --- R/projection.R | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) create mode 100644 R/projection.R diff --git a/R/projection.R b/R/projection.R new file mode 100644 index 0000000..2f760ca --- /dev/null +++ b/R/projection.R @@ -0,0 +1,78 @@ +library(dplyr) +library(tidyr) +library(tibble) +library(vegan) + +.gm_mean = function(x){ + exp(mean(log(x[x > 0]))) +} + +.rclr <- function(a) { + answer <- log(a/.gm_mean(a)) + answer[] <- lapply(answer, function(i) if(is.numeric(i)) ifelse(is.infinite(i), 0, i) else i) + return(answer) +} + +.loadUserData <- function(userdata, usertaxa, seed=NA) { + if(!is.na(seed)) set.seed(seed) + + userdata |> + pivot_longer(!sample + , names_to = 'col_id' + , values_to = 'count' + ) |> # one row for each sample/taxon count + left_join(usertaxa, by='col_id') |> # add taxon names + mutate(famlevel = paste(kingdom,phylum,class,order,family, sep='||')) |> + select(!c('kingdom','phylum','class','order','family','genus')) |> + group_by(sample, famlevel) |> # summarize at family level + summarise( + aggcount = sum(count) + ) |> + ungroup() |> + pivot_wider(names_from='famlevel' # make taxon table + , values_from='aggcount' + ) |> + column_to_rownames('sample') |> + (\(x) { + x[,colSums(x) > 0] # grab non-empty taxa + })() |> + rownames_to_column('sample') |> + unique(by=!c('sample')) |> # ensure no duplicate entries + rowwise() |> + mutate( + totalreads = sum(c_across(where(is.numeric))) + ) |> + filter( + totalreads >= 3000 # make sure each has enough reads + ) |> # TODO log the results of this + select(!totalreads) |> + column_to_rownames('sample') |> + rrarefy(3000) |> + as.data.frame() |> + .rclr() |> + rownames_to_column('sample') +} + +.loadProjection <- function(pname, load_all) { + load_all |> + filter(projection==pname) |> + mutate(taxon = paste(kingdom,phylum,class,order,family, sep='||')) # TODO: use numbers for this instead +} + +project_it <- function(indata, loadings) { + indata |> + pivot_longer(cols=!c(sample) + , names_to='taxon' + , values_to='val' + , values_drop_na=TRUE + ) |> + left_join(loadings, by='taxon') |> + mutate( + across(starts_with('PC'), \(d) val * d) + ) |> + ungroup() |> + group_by(sample) |> + summarise( + across(starts_with('PC'), sum) + ) +} From ffa192f279e145d91685c3bf8aca911a1d53e2e1 Mon Sep 17 00:00:00 2001 From: Rich Abdill Date: Thu, 23 Apr 2026 00:31:52 -0400 Subject: [PATCH 3/8] Projection helpers --- R/loader.R | 9 +++++-- R/projection.R | 34 ++++++++++++++++++++------- vignettes/projection.Rmd | 51 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 84 insertions(+), 10 deletions(-) create mode 100644 vignettes/projection.Rmd diff --git a/R/loader.R b/R/loader.R index 79b6917..8f311ef 100644 --- a/R/loader.R +++ b/R/loader.R @@ -58,9 +58,10 @@ print('Could not retrieve manifest file. Falling back to manifest as of April 2026.') towrite <- data.table::data.table( version = c('1.1.0', '1.0.1'), - zenodo_id = c('13733642', '10452633'), + zenodo_id = c('13733642', '10452633') + ) if(entry == 'projection') { - print('Falling back to projection manifest.' + print('Falling back to projection manifest.') towrite <- data.table::data.table( version = c('0.2.0', '0.1.0'), zenodo_id = c('19633215', '19631961'), @@ -86,6 +87,10 @@ colnames(results) <- c('version','zenodo_id','default') results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/taxonomic_table.csv.gz') results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv') + if(entry == 'projection') { + results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/loadings.txt') + results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv') + } data.table::setkey(results, version) results } diff --git a/R/projection.R b/R/projection.R index 2f760ca..6079e28 100644 --- a/R/projection.R +++ b/R/projection.R @@ -13,7 +13,30 @@ library(vegan) return(answer) } -.loadUserData <- function(userdata, usertaxa, seed=NA) { +.loadProjection <- function(pname, load_all) { + load_all |> + filter(projection==pname) |> + mutate(taxon = paste(kingdom,phylum,class,order,family, sep='||')) # TODO: use numbers for this instead +} + +projection_library <- function(bfc = BiocFileCache::BiocFileCache()) { + versions <- .getVersions(bfc, 'projection') + + if(is.na(version)) { + # If the user has not specified a version, grab whichever + # is indicated in the manifest as the default (i.e. most recent) + version <- versions[versions$default,]$version[1] + } + print(paste('Retrieving projections version',version)) + load_all <- .getProjectData(version, 'projection', bfc) + + curried <- function(pname) { + .loadProjection(pname, load_all) + } + curried +} + +loadUserData <- function(userdata, usertaxa, seed=NA) { if(!is.na(seed)) set.seed(seed) userdata |> @@ -53,12 +76,6 @@ library(vegan) rownames_to_column('sample') } -.loadProjection <- function(pname, load_all) { - load_all |> - filter(projection==pname) |> - mutate(taxon = paste(kingdom,phylum,class,order,family, sep='||')) # TODO: use numbers for this instead -} - project_it <- function(indata, loadings) { indata |> pivot_longer(cols=!c(sample) @@ -68,7 +85,8 @@ project_it <- function(indata, loadings) { ) |> left_join(loadings, by='taxon') |> mutate( - across(starts_with('PC'), \(d) val * d) + across(starts_with('PC') + , \(d) val * d) ) |> ungroup() |> group_by(sample) |> diff --git a/vignettes/projection.Rmd b/vignettes/projection.Rmd new file mode 100644 index 0000000..6b57827 --- /dev/null +++ b/vignettes/projection.Rmd @@ -0,0 +1,51 @@ +--- +title: "Compendium Explorer" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Compendium} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Overview + +Intro *tk + +## Getting started + +### Installation + +```{r eval=FALSE} +library(BiocManager) +BiocManager::install("blekhmanlab/MicroBioMap") +``` + +## Basic usage + +```{r message=FALSE} +library(MicroBioMap) + +my_asvs <- read.delim('~/code/pcalab/data/examples/toy/userdata.txt') +my_taxa <- read.delim('~/code/pcalab/data/examples/toy/usertaxa.txt') +my_meta <- read.delim('~/code/pcalab/data/examples/toy/usermeta.txt') + +taxtable <- loadUserData(my_asvs, my_taxa) +``` + +*tk + +```{r} +# this operation requires about 4GB of RAM +plib <- projection_library() +``` + +*tk +```{r} +my_results <- project_it(taxtable, plib('full')) +``` + +## sessionInfo + +```{r} +sessionInfo() +``` From f1ace8507c491f321b6f1ace2a7cc207901af321 Mon Sep 17 00:00:00 2001 From: Rich Abdill Date: Thu, 23 Apr 2026 22:31:00 -0400 Subject: [PATCH 4/8] Adding classifier demo --- vignettes/classifier.Rmd | 155 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 vignettes/classifier.Rmd diff --git a/vignettes/classifier.Rmd b/vignettes/classifier.Rmd new file mode 100644 index 0000000..46a84bf --- /dev/null +++ b/vignettes/classifier.Rmd @@ -0,0 +1,155 @@ +--- +title: "Projection Recipes" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Recipes} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Introduction + +This page describes examples of using microbiome data projected using the Compendium Explorer web app and R helpers. + +More *tk + +### Setup + +See the Projection vignette for the process to generate a data table in which each row is a sample and each column is a principal component, as defined by your selected projection: + +```r +my_results <- project_it(taxtable, plib('full')) +``` + +Loading libraries, setting up helper functions +```r +library(data.table) +library(ranger) +library(tibble) + +library(dplyr) +library(tidyr) + +fetchmeta.curried <- function(meta) { + # generates a function that always appends the same metadata to a + # taxonomic table + + toreturn <- function(ids, append=NA) { + # Helper function for adding sample-level metadata to a table + # of read counts + if(!is.na(append)) ids$sample <- append + + toreturn <- ids |> + inner_join(meta, by='sample') |> + mutate( + Community_Group = as.factor(Community_Group) + , Lifestyle = as.factor(Lifestyle) + ) + + toreturn + } +} + +dropmeta <- function(df, save = c()) { + # Cleave the metadata off a data table, leaving only the read counts + toreturn <- df |> + select(save, !colnames(my_meta)) + + toreturn +} + +scoremodel <- function(calls, modelname) { + # Fll in the confusion matrix with diagonal values; + # expects a data frame with two columns: + # baseline - the true classes for a given test set + # pred - the predicted calls + + half <- calls |> + mutate( + correct = pred==baseline + ) |> + group_by(baseline) |> + summarise( + TP = sum(correct) + , total = n() + ) |> + mutate( + FN = total - TP + ) |> + arrange(baseline) |> + mutate(matchup = rev(baseline)) # this just adds a column to show which values we want to look up + + half |> # Fill in the other half of the confusion matrix + inner_join(half + , by=c('baseline'='matchup') + , suffix = c('', 'new') + ) |> + select(baseline, TP, FN + , TN=TPnew + , FP=FNnew) |> + mutate(model=modelname + , Precision = TP / (TP + FP) # How many of the positive calls are true? + , Recall = TP / (TP + FN) # How many group members were ID'd? + , PPV = Precision + , FDR = 1 - PPV + , F1 = (2 * Precision * Recall) / (Precision + Recall) + , Sensitivity = Recall # How many group members were ID'd? + , Specificity = TN / (TN + FP) # How many non-members were excluded? + , Accuracy = (TP + TN) / (TP + TN + FP + FN) # How many of the classifications were right? + ) +} +``` + +### Classifier + +This trains a random forest classifier on a subset of your samples (`trainer`) and uses it to generate predictions of a single variable (`dependent`) for a different subset of samples (`tester`). + +```r +classify_it <- function(trainer, tester, dependent) { + # trainer - training set + # tester - test set + # dependent - string indicating the name of the dependent variable, i.e. the + # one to predict + # key - list of the correct classificatiosn for the test set + + print(paste0('dimensions=', ncol(trainer))) + + rf = ranger(dependent.variable.name=dependent, data = (trainer) + , num.trees = 800 + , sample.fraction = 0.5 + ) + + + test_prediction <- predict(rf, tester)$predictions |> + as.data.frame() |> + rename(pred = "predict(rf, tester)$predictions") + + test_prediction +} +``` + +Set up all our data frames. First, we select the first 25 (projected) principal components and separate out one community group as the test set. + +```r +dataset <- my_results |> + select(sample, PC1:PC25) + +train <- dataset |> + fetchmeta() |> + filter(!Community_Group == 'Gond') |> + arrange(sample) |> + dropmeta(save=c('Lifestyle')) + +testset <- dataset |> + fetchmeta() |> + filter(Community_Group == 'Gond') |> + arrange(sample) +``` + +All our helpers make the actual classification statement look mundane: + +```r +projected_model <- classify_it(train, testset, 'Lifestyle') + +projected <- scoremodel(projected_model, "projected") +``` From ff103dd06b563d49a8dab87535a7b826eef3ed39 Mon Sep 17 00:00:00 2001 From: Rich Abdill Date: Thu, 23 Apr 2026 22:35:47 -0400 Subject: [PATCH 5/8] Cleaning up --- R/loader.R | 32 +++++++++++++++----------------- vignettes/projection.Rmd | 2 +- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/R/loader.R b/R/loader.R index 8f311ef..62afc6e 100644 --- a/R/loader.R +++ b/R/loader.R @@ -1,26 +1,9 @@ -#' load all compendium data into a TreeSummarizedExperiment -#' -#' @param bfc BiocFileCache object to use -#' -#' @returns a `TreeSummarizedExperiment` -#' #' @importFrom data.table fread setkey #' @importClassesFrom Matrix TsparseMatrix #' @import TreeSummarizedExperiment #' @import R.utils #' @import ape #' @importFrom BiocFileCache BiocFileCache bfcrpath bfcquery bfcnew -#' -#' @export -#' -#' @examples -#' cpd <- getCompendium() -#' -#' dim(cpd) -#' cpd -#' assayNames(cpd) -#' head(colData(cpd)) -#' .getVersions <- function(bfc, entry, verbose=FALSE) { # Determines the most recent version of the compendium @@ -111,6 +94,21 @@ } getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) { +#' load all compendium data into a TreeSummarizedExperiment +#' @param version an optional parameter indicating which compendium version to retrieve +#' @param bfc BiocFileCache object to use +#' +#' @returns a `TreeSummarizedExperiment` +#' @export +#' +#' @examples +#' cpd <- getCompendium() +#' +#' dim(cpd) +#' cpd +#' assayNames(cpd) +#' head(colData(cpd)) +#' versions <- .getVersions(bfc, 'compendium') if(is.na(version)) { diff --git a/vignettes/projection.Rmd b/vignettes/projection.Rmd index 6b57827..6390e41 100644 --- a/vignettes/projection.Rmd +++ b/vignettes/projection.Rmd @@ -2,7 +2,7 @@ title: "Compendium Explorer" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Compendium} + %\VignetteIndexEntry{Projection} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- From ed3857e0f6d09d6b5299358d552c470fad6d7ea3 Mon Sep 17 00:00:00 2001 From: Rich Abdill Date: Mon, 27 Apr 2026 22:18:33 -0400 Subject: [PATCH 6/8] More modeling --- vignettes/classifier.Rmd | 132 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/vignettes/classifier.Rmd b/vignettes/classifier.Rmd index 46a84bf..54dd8dc 100644 --- a/vignettes/classifier.Rmd +++ b/vignettes/classifier.Rmd @@ -153,3 +153,135 @@ projected_model <- classify_it(train, testset, 'Lifestyle') projected <- scoremodel(projected_model, "projected") ``` + +### Contextualizing model + +These scores are difficult to interpret on their own, especially because of the class imbalance in the test set. To provide some additional context, we'll also generate predictions using a zero-information model that always guesses the majority class. + +```r +naive_model <- data.frame( + pred = rep('Urban', 97) + , baseline = fetchmeta(testset)$Lifestyle +) + +allurbs <- scoremodel(naive_model, "naive") +``` + +We can also train a new model on 25 principal components, but derived from conventional PCA: + +```r +my_asvs <- read.delim('~/code/pcalab/data/examples/toy/userdata.txt') +my_taxa <- read.delim('~/code/pcalab/data/examples/toy/usertaxa.txt') +my_meta <- read.delim('~/code/pcalab/data/examples/toy/usermeta.txt') + +taxtable <- loadUserData(my_asvs, my_taxa) + +ord <- taxtable |> + prcomp( + tol=0, + center=FALSE + ) +``` + +As above, we can split the resulting points into train and test sets and see how well the Gond classification task works: + +```r +points <- as.data.frame(ord$x) |> + select(PC1:PC25) |> + rownames_to_column('sample') |> + fetchmeta() + +data_train = points |> + filter(!Community_Group == 'Gond') |> + arrange(sample) |> + dropmeta(save=c('Lifestyle')) + +data_test = points |> + filter(Community_Group == 'Gond') |> + arrange(sample) + +key <- data_test$Lifestyle + +data_test <- data_test |> + dropmeta() + +solo_pca <- classify_it(data_train, data_test, 'Lifestyle') |> + scoremodel('solo_pca') +``` + +```r +scores <- rbind(projected, solo_pca, allurbs) |> + mutate( + model = factor(model, levels=c('projected','solo_pca', 'naive')) + ) |> + filter(baseline == 'Urban') |> + select(!baseline) |> + pivot_longer( + !model, names_to='measure', values_to = 'val' + ) +``` + +# PROJECTED +matdefine <- list("Toeval" = c("correct", "wrong"), + "Naive" =c("correct", "wrong")) + +perf.base <- matrix(c(33, 11, 0, 32), # TP, FN, 0, FP + nrow = 2, dimnames = matdefine) + +perf.projected <- matrix(c(36, 8, 0, 22), + nrow = 2, dimnames = matdefine) +mcnemar.test(perf.base) +mcnemar.test(perf.projected) +``` + +```r +# https://machinelearningmastery.com/mcnemars-test-for-machine-learning/ + +mcnemar <- function(classes) { + yesno <- classes |> + filter(test == baseline) |> + filter(naive != baseline) |> + nrow() + + noyes <- classes |> + filter(test != baseline) |> + filter(naive == baseline) |> + nrow() + + print(paste0('yesno = ', yesno)) + print(paste0('noyes = ', noyes)) + + #mcnstat <- ((yesno - noyes)^2) / (yesno + noyes) + mcnstat <- ((noyes - yesno - 1)^2) / (yesno + noyes) + + print(paste0('stat = ', mcnstat)) + + print(paste0('p = ' + , pchisq(mcnstat + , df=1, lower.tail = FALSE) + )) +} + +# ORIGINAL to naive +compare |> + select(baseline, naive + , test=original + ) |> + mcnemar() + +# PROJECTED to naive +compare |> + select(baseline, naive + , test=projected + ) |> + mcnemar() + +# DIRECT COMPARE +compare |> + select(baseline, naive=original + , test=projected + ) |> + mcnemar() +``` + +All done here *tk* \ No newline at end of file From 9e32a15f86e87850d615e4afe10a0b3b1b71ec1c Mon Sep 17 00:00:00 2001 From: Rich Abdill Date: Wed, 29 Apr 2026 21:51:17 -0400 Subject: [PATCH 7/8] Documenting imports, plus exported functions --- DESCRIPTION | 7 ++- R/loader.R | 33 +++++------ R/projection.R | 151 ++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 141 insertions(+), 50 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d76c01e..f536413 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: MicroBioMap Type: Package Title: Access the microbiome compendium from R -Version: 0.99.13 +Version: 0.99.14 Description: The MicroBioMap offers access to a dataset including more than 168,000 samples of publicly available 16S rRNA amplicon sequencing data, all processed using the same pipeline @@ -16,7 +16,10 @@ Imports: Matrix, data.table, BiocFileCache, - R.utils + R.utils, + dplr, + vegan, + tibble Encoding: UTF-8 Authors@R: c( diff --git a/R/loader.R b/R/loader.R index 62afc6e..69a86e8 100644 --- a/R/loader.R +++ b/R/loader.R @@ -4,17 +4,18 @@ #' @import R.utils #' @import ape #' @importFrom BiocFileCache BiocFileCache bfcrpath bfcquery bfcnew - +#' Compendium metadata retrieval +#' +#' Determines the most recent version of the compendium +#' and retrieves the manifest that describes all available releases. +#' @returns +#' This requires the canonical_doi configuration value stored in +#' constants.R, which always resolves to the most recent version. +#' @param bfc BiocFileCache object to use +#' @param entry A string from ['compendium','projection'] indicating which manifest to return. +#' @returns a data.table listing all versions and the necessary URLs .getVersions <- function(bfc, entry, verbose=FALSE) { - # Determines the most recent version of the compendium - # and retrieves the manifest that describes all available releases. - # Returns a data.table listing all versions and the necessary URLs - # This requires the canonical_doi configuration value stored in - # constants.R, which always resolves to the most recent version. - - # Check if we've already cached the manifest. - # If not, make an HTTP call to get the URL we need - rpath <- BiocFileCache::bfcquery(bfc, entry)$rpath + rpath <- BiocFileCache::bfcquery(bfc, entry)$rpath # check if we've already cached the manifest if(length(rpath) == 0) { if(verbose) { print('Retrieving version information.') @@ -66,7 +67,7 @@ } } results <- data.table::fread(rpath) - + # TODO: This is a silly bandage colnames(results) <- c('version','zenodo_id','default') results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/taxonomic_table.csv.gz') results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv') @@ -93,14 +94,12 @@ sampdat } -getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) { -#' load all compendium data into a TreeSummarizedExperiment +#' Compendium download +#' +#' Load all compendium data into a TreeSummarizedExperiment #' @param version an optional parameter indicating which compendium version to retrieve #' @param bfc BiocFileCache object to use -#' #' @returns a `TreeSummarizedExperiment` -#' @export -#' #' @examples #' cpd <- getCompendium() #' @@ -109,6 +108,8 @@ getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) { #' assayNames(cpd) #' head(colData(cpd)) #' +#' @export +getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) { versions <- .getVersions(bfc, 'compendium') if(is.na(version)) { diff --git a/R/projection.R b/R/projection.R index 6079e28..583c47d 100644 --- a/R/projection.R +++ b/R/projection.R @@ -1,8 +1,8 @@ -library(dplyr) -library(tidyr) -library(tibble) -library(vegan) +#' @import dplyr +#' @importFrom vegan rrarefy +#' @importFrom tibble rownames_to_column column_to_rownames +#library(tidyr) .gm_mean = function(x){ exp(mean(log(x[x > 0]))) } @@ -10,15 +10,35 @@ library(vegan) .rclr <- function(a) { answer <- log(a/.gm_mean(a)) answer[] <- lapply(answer, function(i) if(is.numeric(i)) ifelse(is.infinite(i), 0, i) else i) - return(answer) + answer } +#' Projection retrieval and formatting +#' +#' Filters a data frame containing coordinates for +#' multiple projections and formats it for further use. +#' @param pname string indicating the name of the projection to use +#' @param load_all a matrix of weights (with a "projection" field on which to filter) +#' @returns A data frame of weights for a single projection, +#' plus a "taxon" field containing a string unique to that taxon. .loadProjection <- function(pname, load_all) { load_all |> filter(projection==pname) |> mutate(taxon = paste(kingdom,phylum,class,order,family, sep='||')) # TODO: use numbers for this instead } +#' Projection library builder +#' +#' Utility for working with remotely hosted projection files. +#' @param pname string indicating the name of the projection to use +#' @param bfc BiocFileCache object to use +#' @returns A **function** that allows a user to load a projection using +#' its name. +#' @examples +#' lib <- projection_library() +#' projected <- project_it(my_data, projection_library('full')) +#' +#' @export projection_library <- function(bfc = BiocFileCache::BiocFileCache()) { versions <- .getVersions(bfc, 'projection') @@ -36,46 +56,113 @@ projection_library <- function(bfc = BiocFileCache::BiocFileCache()) { curried } -loadUserData <- function(userdata, usertaxa, seed=NA) { - if(!is.na(seed)) set.seed(seed) - - userdata |> +#' User data formatting +#' +#' @description +#' Formats user input data from two input files into a single data frame for further analysis. +#' Consolidates read counts at the family level. +#' @details +#' Taxonomic information is provided across two files: the matrix of read counts uses +#' id-based column names ("TAX1", "TAX2", etc). The second file associates those column +#' names with inferred taxonomic ranks. +#' This function modifies the count table by annotating the column names with the +#' taxonomic information. +#' @param userdata data frame countaining samples in rows and taxa in columns +#' @param usertaxa data frame associating the column names from userdata with kingdom, phylum, class, +#' order, and family classifications. +#' @returns A data frame with the same number of rows as userdata and one column for each distinct +#' family in the usertaxa file that was observed at least once in the dataset. +#' +#' @export +loadUserData <- function(userdata, usertaxa) { + test <- userdata |> pivot_longer(!sample , names_to = 'col_id' - , values_to = 'count' + , values_to = 'countnum' ) |> # one row for each sample/taxon count + filter(countnum > 0) |> left_join(usertaxa, by='col_id') |> # add taxon names mutate(famlevel = paste(kingdom,phylum,class,order,family, sep='||')) |> - select(!c('kingdom','phylum','class','order','family','genus')) |> + select(!c('kingdom','phylum','class','order','family')) |> group_by(sample, famlevel) |> # summarize at family level summarise( - aggcount = sum(count) + aggcount = sum(countnum) ) |> ungroup() |> pivot_wider(names_from='famlevel' # make taxon table , values_from='aggcount' - ) |> - column_to_rownames('sample') |> - (\(x) { - x[,colSums(x) > 0] # grab non-empty taxa - })() |> - rownames_to_column('sample') |> - unique(by=!c('sample')) |> # ensure no duplicate entries - rowwise() |> - mutate( - totalreads = sum(c_across(where(is.numeric))) - ) |> - filter( - totalreads >= 3000 # make sure each has enough reads - ) |> # TODO log the results of this - select(!totalreads) |> - column_to_rownames('sample') |> - rrarefy(3000) |> - as.data.frame() |> - .rclr() |> - rownames_to_column('sample') + , values_fill=0 + ) +} + +#' Sample rarefaction +#' +#' @description +#' Helper function for rarefying samples in a taxonomic table to a set read count. +#' @details +#' Throwing away data is generally not helpful, but this step is done to ensure +#' samples being projected into an existing latent space are of the same size as the samples +#' that were included in the original ordination. +#' @param userdata data frame countaining samples in rows and taxa in columns. (Generally here the output of `loadUserData()`) +#' @param level The desired number of reads per sample. **Samples with fewer reads than this will be filtered out.** +#' @returns A data frame of the same dimensions as userdata, but with rarefied read counts that result in +#' all rows summing to `level`. +#' +#' @export +rarefy_dataset <- function(userdata, level=3000, seed=NA) { + if(!is.na(seed)) set.seed(seed) + + userdata |> + unique(by=!c('sample')) |> # ensure no duplicate entries + rowwise() |> + mutate( + totalreads = sum(c_across(where(is.numeric))) + ) |> + filter( + totalreads >= 3000 # make sure each has enough reads + ) |> # TODO log the results of this + select(!totalreads) |> + column_to_rownames('sample') |> + rrarefy(3000) |> + as.data.frame() |> + rownames_to_column('sample') +} + +#' Robust centered log-ratio transformation +#' +#' @description +#' Helper function for applying rCLR to a data frame with a "sample" column. +#' @details +#' This step should happen *after* rarefaction, and converts a matrix of non-negative read counts into +#' one containing unbounded floating-point numbers. +#' @param userdata data frame countaining samples in rows and taxa in columns. +#' @returns A data frame of the same dimensions as userdata, but with rCLR-transformed values +#' +#' @export +transform <- function(userdata) { + userdata |> + column_to_rownames('sample') |> + .rclr() |> + rownames_to_column('sample') } +#' Projection operation +#' +#' @description +#' Projects a taxonomic table into an existing ordination using a set of pre-calculated loadings. +#' @details +#' This is the final step in the projection process. The input describes samples with taxa, and the output +#' describes the same taxa with principal components. Note that this step doesn't require rCLR-transformed +#' values *per se* -- input data should be transformed using the same process as the data used in the original +#' ordination. For the compendium ordinations currently available, this means rCLR. +#' @param indata data frame countaining samples in rows and taxa in columns. +#' @param loadings data frame containing taxa in each row (i.e. the indata column names) and +#' a principal component ("PC1", "PC2", etc) in each column. The values in each +#' cell indicate the weight that taxon's read count should be given when calculating +#' a sample's value for that PC. +#' @returns A data frame of the same rows as data and the same columns as loadings +#' +#' @export project_it <- function(indata, loadings) { indata |> pivot_longer(cols=!c(sample) From d5ab31d8993609b6540a54b884bb1d363c5718db Mon Sep 17 00:00:00 2001 From: Rich Abdill Date: Wed, 6 May 2026 10:58:23 -0400 Subject: [PATCH 8/8] Fixing projection downloads --- R/constants.R | 5 +- R/loader.R | 125 ++++++++++++++++++++++--------------------------- R/projection.R | 5 +- 3 files changed, 62 insertions(+), 73 deletions(-) diff --git a/R/constants.R b/R/constants.R index 36e85fb..6f7cae8 100644 --- a/R/constants.R +++ b/R/constants.R @@ -1 +1,4 @@ -canonical_doi <- 'https://doi.org/10.5281/zenodo.8186993' +canonical_doi <- c( + 'compendium' = 'https://doi.org/10.5281/zenodo.8186993' + , 'projection' = 'https://doi.org/10.5281/zenodo.19631960' +) diff --git a/R/loader.R b/R/loader.R index 69a86e8..f20615d 100644 --- a/R/loader.R +++ b/R/loader.R @@ -15,83 +15,68 @@ #' @param entry A string from ['compendium','projection'] indicating which manifest to return. #' @returns a data.table listing all versions and the necessary URLs .getVersions <- function(bfc, entry, verbose=FALSE) { - rpath <- BiocFileCache::bfcquery(bfc, entry)$rpath # check if we've already cached the manifest - if(length(rpath) == 0) { - if(verbose) { - print('Retrieving version information.') - } - resolve <- curl::curl_fetch_memory(canonical_doi[entry]) - if(resolve$status_code != 200) { - stop(paste0( - 'Could not resolve canonical DOI. Status code: ', - resolve$status_code - )) - } + rpath <- BiocFileCache::bfcquery(bfc, entry)$rpath # check if we've already cached the manifest + if(length(rpath) == 0) { + if(verbose) { + print('Retrieving version information.') + } + resolve <- curl::curl_fetch_memory(canonical_doi[entry]) + if(resolve$status_code != 200) { + stop(paste0( + 'Could not resolve canonical DOI. Status code: ', + resolve$status_code + )) + } - if(verbose) { - print('Determined data address:') - print(resolve$url) - } - manifest <- paste0(resolve$url, '/files/manifest.csv') - - rpath <- tryCatch( - { - bfcrpath(bfc, manifest) - }, - error = function(msg){ - print('Could not retrieve manifest file. Falling back to manifest as of April 2026.') + if(verbose) { + print('Determined data address:') + print(resolve$url) + } + manifest <- paste0(resolve$url, '/files/manifest.csv') + + rpath <- tryCatch( + { + bfcrpath(bfc, manifest) + }, + error = function(msg){ + print('Could not retrieve manifest file. Falling back to manifest as of April 2026.') + towrite <- data.table::data.table( + version = c('1.1.0', '1.0.1'), + zenodo_id = c('13733642', '10452633') + ) + if(entry == 'projection') { + print('Falling back to projection manifest.') towrite <- data.table::data.table( - version = c('1.1.0', '1.0.1'), - zenodo_id = c('13733642', '10452633') + version = c('0.3.0','0.2.0', '0.1.0'), + zenodo_id = c('20040560','19633215', '19631961'), + default = c(TRUE, FALSE, FALSE) ) - if(entry == 'projection') { - print('Falling back to projection manifest.') - towrite <- data.table::data.table( - version = c('0.2.0', '0.1.0'), - zenodo_id = c('19633215', '19631961'), - default = c(TRUE, FALSE) - ) - } - - # we save this to the cache so the app remembers not to keep looking online - # for a manifest every time the version information is needed - savepath <- BiocFileCache::bfcnew(bfc, entry, ext='.csv') - data.table::fwrite(towrite, file=savepath) - savepath } - ) - } - else { - if(verbose) { - print('Cached version information found.') + + # we save this to the cache so the app remembers not to keep looking online + # for a manifest every time the version information is needed + savepath <- BiocFileCache::bfcnew(bfc, entry, ext='.csv') + data.table::fwrite(towrite, file=savepath) + savepath } + ) + } + else { + if(verbose) { + print('Cached version information found.') } - results <- data.table::fread(rpath) - # TODO: This is a silly bandage - colnames(results) <- c('version','zenodo_id','default') - results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/taxonomic_table.csv.gz') + } + results <- data.table::fread(rpath) + # TODO: This is a silly bandage + colnames(results) <- c('version','zenodo_id','default') + results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/taxonomic_table.csv.gz') + results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv') + if(entry == 'projection') { + results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/loadings.txt') results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv') - if(entry == 'projection') { - results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/loadings.txt') - results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv') - } - data.table::setkey(results, version) - results -} - -.getProjectData <- function(version, dataset, bfc) { - # dataset in ('compendium', 'projection') - versions <- .getVersions(bfc, dataset) - rpath <- bfcrpath(bfc, versions[version]$data_url) - data.table::fread(rpath) -} - -.getCompendiumColdata <- function(version, bfc) { - versions <- .getVersions(bfc, 'compendium') - rpath <- bfcrpath(bfc, versions[version]$coldata_url) - sampdat <- as.data.frame(data.table::fread(rpath)) - rownames(sampdat) <- paste(sampdat[[2]], sampdat[[3]], sep = "_") - sampdat + } + data.table::setkey(results, version) + results } #' Compendium download diff --git a/R/projection.R b/R/projection.R index 583c47d..7a8e218 100644 --- a/R/projection.R +++ b/R/projection.R @@ -39,8 +39,9 @@ #' projected <- project_it(my_data, projection_library('full')) #' #' @export -projection_library <- function(bfc = BiocFileCache::BiocFileCache()) { - versions <- .getVersions(bfc, 'projection') +projection_library <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) { + versions <- .getVersions(bfc, entry='projection') + print(versions) if(is.na(version)) { # If the user has not specified a version, grab whichever