From e9aab2f5968563aff04cf5731d08ad966ea25111 Mon Sep 17 00:00:00 2001
From: Rich Abdill <rabdill@uchicago.edu>
Date: Wed, 22 Apr 2026 00:06:10 -0400
Subject: [PATCH 1/8] Cleaning up to add projection

---
 LICENSE.md |  2 +-
 R/loader.R | 80 ++++++++++++++++++++++++++++++------------------------
 2 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/LICENSE.md b/LICENSE.md
index d4e83a1..9691111 100644
--- a/LICENSE.md
+++ b/LICENSE.md
@@ -1,6 +1,6 @@
 # MIT License
 
-Copyright (c) 2023 MicroBioMap authors
+Copyright (c) 2026 The University of Chicago
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/R/loader.R b/R/loader.R
index 2b42d09..79b6917 100644
--- a/R/loader.R
+++ b/R/loader.R
@@ -1,4 +1,28 @@
-.getVersions <- function(bfc, verbose=FALSE) {
+#' load all compendium data into a TreeSummarizedExperiment
+#'
+#' @param bfc BiocFileCache object to use
+#'
+#' @returns a `TreeSummarizedExperiment`
+#'
+#' @importFrom data.table fread setkey
+#' @importClassesFrom Matrix TsparseMatrix
+#' @import TreeSummarizedExperiment
+#' @import R.utils
+#' @import ape
+#' @importFrom BiocFileCache BiocFileCache bfcrpath bfcquery bfcnew
+#'
+#' @export
+#'
+#' @examples
+#' cpd <- getCompendium()
+#'
+#' dim(cpd)
+#' cpd
+#' assayNames(cpd)
+#' head(colData(cpd))
+#'
+
+.getVersions <- function(bfc, entry, verbose=FALSE) {
     # Determines the most recent version of the compendium
     # and retrieves the manifest that describes all available releases.
     # Returns a data.table listing all versions and the necessary URLs
@@ -7,12 +31,12 @@
 
     # Check if we've already cached the manifest.
     # If not, make an HTTP call to get the URL we need
-    rpath <- BiocFileCache::bfcquery(bfc, 'manifest')$rpath
+    rpath <- BiocFileCache::bfcquery(bfc, entry)$rpath
     if(length(rpath) == 0) {
       if(verbose) {
         print('Retrieving version information.')
       }
-      resolve <- curl::curl_fetch_memory(canonical_doi)
+      resolve <- curl::curl_fetch_memory(canonical_doi[entry])
       if(resolve$status_code != 200) {
         stop(paste0(
           'Could not resolve canonical DOI. Status code: ',
@@ -31,15 +55,22 @@
           bfcrpath(bfc, manifest)
         },
         error = function(msg){
-          print('Could not retrieve manifest file. Falling back to manifest as of v1.1.0')
+          print('Could not retrieve manifest file. Falling back to manifest as of April 2026.')
           towrite <- data.table::data.table(
             version = c('1.1.0', '1.0.1'),
             zenodo_id = c('13733642', '10452633'),
-            default = c(TRUE, FALSE)
-          )
+          if(entry == 'projection') {
+            print('Falling back to projection manifest.'
+            towrite <- data.table::data.table(
+              version = c('0.2.0', '0.1.0'),
+              zenodo_id = c('19633215', '19631961'),
+              default = c(TRUE, FALSE)
+            )
+          }
+
           # we save this to the cache so the app remembers not to keep looking online
           # for a manifest every time the version information is needed
-          savepath <- BiocFileCache::bfcnew(bfc, 'manifest', ext='.csv')
+          savepath <- BiocFileCache::bfcnew(bfc, entry, ext='.csv')
           data.table::fwrite(towrite, file=savepath)
           savepath
         }
@@ -59,46 +90,23 @@
     results
 }
 
-.getCompendiumData <- function(version, bfc) {
-    versions <- .getVersions(bfc)
+.getProjectData <- function(version, dataset, bfc) {
+    # dataset in ('compendium', 'projection')
+    versions <- .getVersions(bfc, dataset)
     rpath <- bfcrpath(bfc, versions[version]$data_url)
     data.table::fread(rpath)
 }
 
 .getCompendiumColdata <- function(version, bfc) {
-    versions <- .getVersions(bfc)
+    versions <- .getVersions(bfc, 'compendium')
     rpath <- bfcrpath(bfc, versions[version]$coldata_url)
     sampdat <- as.data.frame(data.table::fread(rpath))
     rownames(sampdat) <- paste(sampdat[[2]], sampdat[[3]], sep = "_")
     sampdat
 }
 
-#' load all compendium data into a TreeSummarizedExperiment
-#'
-#' @param bfc BiocFileCache object to use
-#'
-#' @returns a `TreeSummarizedExperiment`
-#'
-#' @importFrom data.table fread setkey
-#' @importClassesFrom Matrix TsparseMatrix
-#' @import TreeSummarizedExperiment
-#' @import R.utils
-#' @import ape
-#' @importFrom BiocFileCache BiocFileCache bfcrpath bfcquery bfcnew
-#'
-#' @export
-#'
-#' @examples
-#' cpd <- getCompendium()
-#'
-#' dim(cpd)
-#' cpd
-#' assayNames(cpd)
-#' head(colData(cpd))
-#'
-
 getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) {
-    versions <- .getVersions(bfc)
+    versions <- .getVersions(bfc, 'compendium')
 
     if(is.na(version)) {
         # If the user has not specified a version, grab whichever
@@ -106,7 +114,7 @@ getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) {
         version <- versions[versions$default,]$version[1]
     }
     print(paste('Retrieving compendium version',version))
-    dat <-.getCompendiumData(version, bfc)
+    dat <-.getProjectData(version, 'compendium', bfc)
     coldat <- .getCompendiumColdata(version, bfc)
 
     sampnames <- dat[[2]]

From 424c3269744971093da9874739d538e94e523df8 Mon Sep 17 00:00:00 2001
From: Rich Abdill <rabdill@uchicago.edu>
Date: Wed, 22 Apr 2026 12:24:31 -0400
Subject: [PATCH 2/8] Projection methods

---
 R/projection.R | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)
 create mode 100644 R/projection.R

diff --git a/R/projection.R b/R/projection.R
new file mode 100644
index 0000000..2f760ca
--- /dev/null
+++ b/R/projection.R
@@ -0,0 +1,78 @@
+library(dplyr)
+library(tidyr)
+library(tibble)
+library(vegan)
+
+.gm_mean = function(x){
+  exp(mean(log(x[x > 0])))
+}
+
+.rclr <- function(a) {
+  answer <- log(a/.gm_mean(a))
+  answer[] <- lapply(answer, function(i) if(is.numeric(i)) ifelse(is.infinite(i), 0, i) else i)
+  return(answer)
+}
+
+.loadUserData <- function(userdata, usertaxa, seed=NA) {
+    if(!is.na(seed)) set.seed(seed)
+
+    userdata |>
+        pivot_longer(!sample
+            , names_to = 'col_id'
+            , values_to = 'count'
+        ) |> # one row for each sample/taxon count
+        left_join(usertaxa, by='col_id') |> # add taxon names
+        mutate(famlevel = paste(kingdom,phylum,class,order,family, sep='||')) |>
+        select(!c('kingdom','phylum','class','order','family','genus')) |>
+        group_by(sample, famlevel) |> # summarize at family level
+        summarise(
+            aggcount = sum(count)
+        ) |>
+        ungroup() |>
+        pivot_wider(names_from='famlevel' # make taxon table
+            , values_from='aggcount'
+        ) |>
+        column_to_rownames('sample') |>
+        (\(x) {
+            x[,colSums(x) > 0] # grab non-empty taxa
+        })() |>
+        rownames_to_column('sample') |>
+        unique(by=!c('sample')) |> # ensure no duplicate entries
+        rowwise() |>
+        mutate(
+          totalreads = sum(c_across(where(is.numeric)))
+        ) |>
+        filter(
+          totalreads >= 3000 # make sure each has enough reads
+        ) |> # TODO log the results of this
+        select(!totalreads) |>
+        column_to_rownames('sample') |>
+        rrarefy(3000) |>
+        as.data.frame() |>
+        .rclr() |>
+        rownames_to_column('sample')
+}
+
+.loadProjection <- function(pname, load_all) {
+  load_all |>
+    filter(projection==pname) |>
+    mutate(taxon = paste(kingdom,phylum,class,order,family, sep='||')) # TODO: use numbers for this instead
+}
+
+project_it <- function(indata, loadings) {
+    indata |>
+        pivot_longer(cols=!c(sample)
+            , names_to='taxon'
+            , values_to='val'
+            , values_drop_na=TRUE
+        ) |>
+        left_join(loadings, by='taxon') |>
+        mutate(
+          across(starts_with('PC'), \(d) val * d)
+        ) |>
+        ungroup() |>
+        group_by(sample) |>
+        summarise(
+          across(starts_with('PC'), sum)
+        )
+}

From ffa192f279e145d91685c3bf8aca911a1d53e2e1 Mon Sep 17 00:00:00 2001
From: Rich Abdill <rabdill@uchicago.edu>
Date: Thu, 23 Apr 2026 00:31:52 -0400
Subject: [PATCH 3/8] Projection helpers

---
 R/loader.R               |  9 +++++--
 R/projection.R           | 34 ++++++++++++++++++++-------
 vignettes/projection.Rmd | 51 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 10 deletions(-)
 create mode 100644 vignettes/projection.Rmd

diff --git a/R/loader.R b/R/loader.R
index 79b6917..8f311ef 100644
--- a/R/loader.R
+++ b/R/loader.R
@@ -58,9 +58,10 @@
           print('Could not retrieve manifest file. Falling back to manifest as of April 2026.')
           towrite <- data.table::data.table(
             version = c('1.1.0', '1.0.1'),
-            zenodo_id = c('13733642', '10452633'),
+            zenodo_id = c('13733642', '10452633')
+          )
           if(entry == 'projection') {
-            print('Falling back to projection manifest.'
+            print('Falling back to projection manifest.')
             towrite <- data.table::data.table(
               version = c('0.2.0', '0.1.0'),
               zenodo_id = c('19633215', '19631961'),
@@ -86,6 +87,10 @@
     colnames(results) <- c('version','zenodo_id','default')
     results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/taxonomic_table.csv.gz')
     results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv')
+    if(entry == 'projection') {
+      results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/loadings.txt')
+      results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv')
+    }
     data.table::setkey(results, version)
     results
 }
diff --git a/R/projection.R b/R/projection.R
index 2f760ca..6079e28 100644
--- a/R/projection.R
+++ b/R/projection.R
@@ -13,7 +13,30 @@ library(vegan)
   return(answer)
 }
 
-.loadUserData <- function(userdata, usertaxa, seed=NA) {
+.loadProjection <- function(pname, load_all) {
+  load_all |>
+    filter(projection==pname) |>
+    mutate(taxon = paste(kingdom,phylum,class,order,family, sep='||')) # TODO: use numbers for this instead
+}
+
+projection_library <- function(bfc = BiocFileCache::BiocFileCache()) {
+  versions <- .getVersions(bfc, 'projection')
+
+  if(is.na(version)) {
+    # If the user has not specified a version, grab whichever
+    # is indicated in the manifest as the default (i.e. most recent)
+    version <- versions[versions$default,]$version[1]
+  }
+  print(paste('Retrieving projections version',version))
+  load_all <- .getProjectData(version, 'projection', bfc)
+
+  curried <- function(pname) {
+    .loadProjection(pname, load_all)
+  }
+  curried
+}
+
+loadUserData <- function(userdata, usertaxa, seed=NA) {
     if(!is.na(seed)) set.seed(seed)
 
     userdata |>
@@ -53,12 +76,6 @@ library(vegan)
         rownames_to_column('sample')
 }
 
-.loadProjection <- function(pname, load_all) {
-  load_all |>
-    filter(projection==pname) |>
-    mutate(taxon = paste(kingdom,phylum,class,order,family, sep='||')) # TODO: use numbers for this instead
-}
-
 project_it <- function(indata, loadings) {
     indata |>
         pivot_longer(cols=!c(sample)
@@ -68,7 +85,8 @@ project_it <- function(indata, loadings) {
         ) |>
         left_join(loadings, by='taxon') |>
         mutate(
-          across(starts_with('PC'), \(d) val * d)
+          across(starts_with('PC')
+          , \(d) val * d)
         ) |>
         ungroup() |>
         group_by(sample) |>
diff --git a/vignettes/projection.Rmd b/vignettes/projection.Rmd
new file mode 100644
index 0000000..6b57827
--- /dev/null
+++ b/vignettes/projection.Rmd
@@ -0,0 +1,51 @@
+---
+title: "Compendium Explorer"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Compendium}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+## Overview
+
+Intro *tk
+
+## Getting started
+
+### Installation
+
+```{r eval=FALSE}
+library(BiocManager)
+BiocManager::install("blekhmanlab/MicroBioMap")
+```
+
+## Basic usage
+
+```{r message=FALSE}
+library(MicroBioMap)
+
+my_asvs <- read.delim('~/code/pcalab/data/examples/toy/userdata.txt')
+my_taxa <- read.delim('~/code/pcalab/data/examples/toy/usertaxa.txt')
+my_meta <- read.delim('~/code/pcalab/data/examples/toy/usermeta.txt')
+
+taxtable <- loadUserData(my_asvs, my_taxa)
+```
+
+*tk
+
+```{r}
+# this operation requires about 4GB of RAM
+plib <- projection_library()
+```
+
+*tk
+```{r}
+my_results <- project_it(taxtable, plib('full'))
+```
+
+## sessionInfo
+
+```{r}
+sessionInfo()
+```

From f1ace8507c491f321b6f1ace2a7cc207901af321 Mon Sep 17 00:00:00 2001
From: Rich Abdill <rabdill@uchicago.edu>
Date: Thu, 23 Apr 2026 22:31:00 -0400
Subject: [PATCH 4/8] Adding classifier demo

---
 vignettes/classifier.Rmd | 155 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)
 create mode 100644 vignettes/classifier.Rmd

diff --git a/vignettes/classifier.Rmd b/vignettes/classifier.Rmd
new file mode 100644
index 0000000..46a84bf
--- /dev/null
+++ b/vignettes/classifier.Rmd
@@ -0,0 +1,155 @@
+---
+title: "Projection Recipes"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{Recipes}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+## Introduction
+
+This page describes examples of using microbiome data projected using the Compendium Explorer web app and R helpers.
+
+More *tk
+
+### Setup
+
+See the Projection vignette for the process to generate a data table in which each row is a sample and each column is a principal component, as defined by your selected projection:
+
+```r
+my_results <- project_it(taxtable, plib('full'))
+```
+
+Loading libraries, setting up helper functions
+```r
+library(data.table)
+library(ranger)
+library(tibble)
+
+library(dplyr)
+library(tidyr)
+
+fetchmeta.curried <- function(meta) {
+  # generates a function that always appends the same metadata to a
+  # taxonomic table
+  
+  toreturn <- function(ids, append=NA) {
+    # Helper function for adding sample-level metadata to a table
+    # of read counts
+    if(!is.na(append)) ids$sample <- append
+  
+    toreturn <- ids |>
+      inner_join(meta, by='sample') |>
+      mutate(
+        Community_Group = as.factor(Community_Group)
+        , Lifestyle = as.factor(Lifestyle)
+      )
+  
+    toreturn
+  }
+}
+
+dropmeta <- function(df, save = c()) {
+  # Cleave the metadata off a data table, leaving only the read counts
+  toreturn <- df |>
+    select(save, !colnames(my_meta))
+
+  toreturn
+}
+
+scoremodel <- function(calls, modelname) {
+  # Fll in the confusion matrix with diagonal values;
+  # expects a data frame with two columns:
+  # baseline - the true classes for a given test set
+  # pred - the predicted calls
+  
+  half <- calls |>
+    mutate(
+      correct = pred==baseline
+    ) |>
+    group_by(baseline) |>
+    summarise(
+      TP = sum(correct)
+      , total = n()
+    ) |>
+    mutate(
+      FN = total - TP
+    ) |>
+    arrange(baseline) |>
+    mutate(matchup = rev(baseline))   # this just adds a column to show which values we want to look up
+  
+  half |> # Fill in the other half of the confusion matrix
+    inner_join(half
+               , by=c('baseline'='matchup')
+               , suffix = c('', 'new')
+    ) |>
+    select(baseline, TP, FN
+     , TN=TPnew
+     , FP=FNnew) |>
+    mutate(model=modelname
+      , Precision = TP / (TP + FP) # How many of the positive calls are true?
+      , Recall = TP / (TP + FN)  # How many group members were ID'd?
+      , PPV = Precision
+      , FDR = 1 - PPV
+      , F1 = (2 * Precision * Recall) / (Precision + Recall)
+      , Sensitivity = Recall         # How many group members were ID'd?
+      , Specificity = TN / (TN + FP) # How many non-members were excluded?
+      , Accuracy = (TP + TN) / (TP + TN + FP + FN) # How many of the classifications were right?
+    )
+}
+```
+
+### Classifier
+
+This trains a random forest classifier on a subset of your samples (`trainer`) and uses it to generate predictions of a single variable (`dependent`) for a different subset of samples (`tester`).
+
+```r
+classify_it <- function(trainer, tester, dependent) {
+  # trainer - training set
+  # tester - test set
+  # dependent - string indicating the name of the dependent variable, i.e. the
+  #             one to predict
+  # key - list of the correct classificatiosn for the test set
+  
+  print(paste0('dimensions=', ncol(trainer)))
+
+  rf = ranger(dependent.variable.name=dependent, data = (trainer)
+              , num.trees = 800
+              , sample.fraction = 0.5
+  )
+
+  
+  test_prediction <- predict(rf, tester)$predictions |>
+    as.data.frame() |>
+    rename(pred = "predict(rf, tester)$predictions")
+  
+  test_prediction
+}
+```
+
+Set up all our data frames. First, we select the first 25 (projected) principal components and separate out one community group as the test set.
+
+```r
+dataset <- my_results |>
+  select(sample, PC1:PC25)
+
+train <- dataset |>
+  fetchmeta() |>
+  filter(!Community_Group == 'Gond') |>
+  arrange(sample) |>
+  dropmeta(save=c('Lifestyle'))
+
+testset <- dataset |>
+  fetchmeta() |>
+  filter(Community_Group == 'Gond') |>
+  arrange(sample)
+```
+
+All our helpers make the actual classification statement look mundane:
+
+```r
+projected_model <- classify_it(train, testset, 'Lifestyle')
+
+projected <- scoremodel(projected_model, "projected")
+```

From ff103dd06b563d49a8dab87535a7b826eef3ed39 Mon Sep 17 00:00:00 2001
From: Rich Abdill <rabdill@uchicago.edu>
Date: Thu, 23 Apr 2026 22:35:47 -0400
Subject: [PATCH 5/8] Cleaning up

---
 R/loader.R               | 32 +++++++++++++++-----------------
 vignettes/projection.Rmd |  2 +-
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/R/loader.R b/R/loader.R
index 8f311ef..62afc6e 100644
--- a/R/loader.R
+++ b/R/loader.R
@@ -1,26 +1,9 @@
-#' load all compendium data into a TreeSummarizedExperiment
-#'
-#' @param bfc BiocFileCache object to use
-#'
-#' @returns a `TreeSummarizedExperiment`
-#'
 #' @importFrom data.table fread setkey
 #' @importClassesFrom Matrix TsparseMatrix
 #' @import TreeSummarizedExperiment
 #' @import R.utils
 #' @import ape
 #' @importFrom BiocFileCache BiocFileCache bfcrpath bfcquery bfcnew
-#'
-#' @export
-#'
-#' @examples
-#' cpd <- getCompendium()
-#'
-#' dim(cpd)
-#' cpd
-#' assayNames(cpd)
-#' head(colData(cpd))
-#'
 
 .getVersions <- function(bfc, entry, verbose=FALSE) {
     # Determines the most recent version of the compendium
@@ -111,6 +94,21 @@
 }
 
 getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) {
+#' load all compendium data into a TreeSummarizedExperiment
+#' @param version an optional parameter indicating which compendium version to retrieve
+#' @param bfc BiocFileCache object to use
+#'
+#' @returns a `TreeSummarizedExperiment`
+#' @export
+#'
+#' @examples
+#' cpd <- getCompendium()
+#'
+#' dim(cpd)
+#' cpd
+#' assayNames(cpd)
+#' head(colData(cpd))
+#'
     versions <- .getVersions(bfc, 'compendium')
 
     if(is.na(version)) {
diff --git a/vignettes/projection.Rmd b/vignettes/projection.Rmd
index 6b57827..6390e41 100644
--- a/vignettes/projection.Rmd
+++ b/vignettes/projection.Rmd
@@ -2,7 +2,7 @@
 title: "Compendium Explorer"
 output: rmarkdown::html_vignette
 vignette: >
-  %\VignetteIndexEntry{Compendium}
+  %\VignetteIndexEntry{Projection}
   %\VignetteEngine{knitr::rmarkdown}
   %\VignetteEncoding{UTF-8}
 ---

From ed3857e0f6d09d6b5299358d552c470fad6d7ea3 Mon Sep 17 00:00:00 2001
From: Rich Abdill <rabdill@uchicago.edu>
Date: Mon, 27 Apr 2026 22:18:33 -0400
Subject: [PATCH 6/8] More modeling

---
 vignettes/classifier.Rmd | 132 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 132 insertions(+)

diff --git a/vignettes/classifier.Rmd b/vignettes/classifier.Rmd
index 46a84bf..54dd8dc 100644
--- a/vignettes/classifier.Rmd
+++ b/vignettes/classifier.Rmd
@@ -153,3 +153,135 @@ projected_model <- classify_it(train, testset, 'Lifestyle')
 
 projected <- scoremodel(projected_model, "projected")
 ```
+
+### Contextualizing model
+
+These scores are difficult to interpret on their own, especially because of the class imbalance in the test set. To provide some additional context, we'll also generate predictions using a zero-information model that always guesses the majority class.
+
+```r
+naive_model <- data.frame(
+  pred = rep('Urban', 97)
+  , baseline = fetchmeta(testset)$Lifestyle
+)
+
+allurbs <- scoremodel(naive_model, "naive")
+```
+
+We can also train a new model on 25 principal components, but derived from conventional PCA:
+
+```r
+my_asvs <- read.delim('~/code/pcalab/data/examples/toy/userdata.txt')
+my_taxa <- read.delim('~/code/pcalab/data/examples/toy/usertaxa.txt')
+my_meta <- read.delim('~/code/pcalab/data/examples/toy/usermeta.txt')
+
+taxtable <- loadUserData(my_asvs, my_taxa)
+
+ord <- taxtable |>
+  prcomp(
+    tol=0,
+    center=FALSE
+  )
+```
+
+As above, we can split the resulting points into train and test sets and see how well the Gond classification task works:
+
+```r
+points <- as.data.frame(ord$x) |>
+  select(PC1:PC25) |>
+  rownames_to_column('sample') |>
+  fetchmeta()
+
+data_train = points |>
+  filter(!Community_Group == 'Gond') |>
+  arrange(sample) |>
+  dropmeta(save=c('Lifestyle'))
+
+data_test = points |>
+  filter(Community_Group == 'Gond') |>
+  arrange(sample)
+
+key <- data_test$Lifestyle
+
+data_test <- data_test |>
+  dropmeta()
+
+solo_pca <- classify_it(data_train, data_test, 'Lifestyle') |>
+  scoremodel('solo_pca')
+```
+
+```r
+scores <- rbind(projected, solo_pca, allurbs) |>
+  mutate(
+    model = factor(model, levels=c('projected','solo_pca', 'naive'))
+  ) |>
+  filter(baseline == 'Urban') |>
+  select(!baseline) |>
+  pivot_longer(
+    !model, names_to='measure', values_to = 'val'
+  )
+```
+
+# PROJECTED
+matdefine <- list("Toeval" = c("correct", "wrong"),
+                  "Naive" =c("correct", "wrong"))
+
+perf.base <-  matrix(c(33, 11, 0, 32),    # TP, FN, 0, FP
+                     nrow = 2, dimnames = matdefine)
+
+perf.projected <-  matrix(c(36, 8, 0, 22),
+                          nrow = 2, dimnames = matdefine)
+mcnemar.test(perf.base)
+mcnemar.test(perf.projected)
+```
+
+```r
+# https://machinelearningmastery.com/mcnemars-test-for-machine-learning/
+
+mcnemar <- function(classes) {
+  yesno <- classes |>
+    filter(test == baseline) |>
+    filter(naive != baseline) |>
+    nrow()
+
+  noyes <- classes |>
+    filter(test != baseline) |>
+    filter(naive == baseline) |>
+    nrow()
+
+  print(paste0('yesno = ', yesno))
+  print(paste0('noyes = ', noyes))
+
+  #mcnstat <- ((yesno - noyes)^2) / (yesno + noyes)
+  mcnstat <- ((noyes - yesno - 1)^2) / (yesno + noyes)
+
+  print(paste0('stat = ', mcnstat))
+
+  print(paste0('p = '
+           , pchisq(mcnstat
+                    , df=1, lower.tail = FALSE)
+  ))
+}
+
+# ORIGINAL to naive
+compare |>
+  select(baseline, naive
+      , test=original
+  ) |>
+  mcnemar()
+
+# PROJECTED to naive
+compare |>
+  select(baseline, naive
+         , test=projected
+  ) |>
+  mcnemar()
+
+# DIRECT COMPARE
+compare |>
+  select(baseline, naive=original
+         , test=projected
+  ) |>
+  mcnemar()
+```
+
+All done here *tk*
\ No newline at end of file

From 9e32a15f86e87850d615e4afe10a0b3b1b71ec1c Mon Sep 17 00:00:00 2001
From: Rich Abdill <rabdill@uchicago.edu>
Date: Wed, 29 Apr 2026 21:51:17 -0400
Subject: [PATCH 7/8] Documenting imports, plus exported functions

---
 DESCRIPTION    |   7 ++-
 R/loader.R     |  33 +++++------
 R/projection.R | 151 ++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 141 insertions(+), 50 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index d76c01e..f536413 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: MicroBioMap
 Type: Package
 Title: Access the microbiome compendium from R
-Version: 0.99.13
+Version: 0.99.14
 Description: The MicroBioMap offers access to a dataset including more than
     168,000 samples of publicly available
     16S rRNA amplicon sequencing data, all processed using the same pipeline
@@ -16,7 +16,10 @@ Imports:
     Matrix,
     data.table,
     BiocFileCache,
-    R.utils
+    R.utils,
+    dplr,
+    vegan,
+    tibble
 Encoding: UTF-8
 Authors@R:
     c(
diff --git a/R/loader.R b/R/loader.R
index 62afc6e..69a86e8 100644
--- a/R/loader.R
+++ b/R/loader.R
@@ -4,17 +4,18 @@
 #' @import R.utils
 #' @import ape
 #' @importFrom BiocFileCache BiocFileCache bfcrpath bfcquery bfcnew
-
+#' Compendium metadata retrieval
+#'
+#' Determines the most recent version of the compendium
+#' and retrieves the manifest that describes all available releases.
+#' @returns
+#' This requires the canonical_doi configuration value stored in
+#' constants.R, which always resolves to the most recent version.
+#' @param bfc BiocFileCache object to use
+#' @param entry A string from ['compendium','projection'] indicating which manifest to return.
+#' @returns a data.table listing all versions and the necessary URLs
 .getVersions <- function(bfc, entry, verbose=FALSE) {
-    # Determines the most recent version of the compendium
-    # and retrieves the manifest that describes all available releases.
-    # Returns a data.table listing all versions and the necessary URLs
-    # This requires the canonical_doi configuration value stored in
-    # constants.R, which always resolves to the most recent version.
-
-    # Check if we've already cached the manifest.
-    # If not, make an HTTP call to get the URL we need
-    rpath <- BiocFileCache::bfcquery(bfc, entry)$rpath
+    rpath <- BiocFileCache::bfcquery(bfc, entry)$rpath # check if we've already cached the manifest
     if(length(rpath) == 0) {
       if(verbose) {
         print('Retrieving version information.')
@@ -66,7 +67,7 @@
       }
     }
     results <- data.table::fread(rpath)
-
+    # TODO: This is a silly bandage
     colnames(results) <- c('version','zenodo_id','default')
     results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/taxonomic_table.csv.gz')
     results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv')
@@ -93,14 +94,12 @@
     sampdat
 }
 
-getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) {
-#' load all compendium data into a TreeSummarizedExperiment
+#' Compendium download
+#'
+#' Load all compendium data into a TreeSummarizedExperiment
 #' @param version an optional parameter indicating which compendium version to retrieve
 #' @param bfc BiocFileCache object to use
-#'
 #' @returns a `TreeSummarizedExperiment`
-#' @export
-#'
 #' @examples
 #' cpd <- getCompendium()
 #'
@@ -109,6 +108,8 @@ getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) {
 #' assayNames(cpd)
 #' head(colData(cpd))
 #'
+#' @export
+getCompendium <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) {
     versions <- .getVersions(bfc, 'compendium')
 
     if(is.na(version)) {
diff --git a/R/projection.R b/R/projection.R
index 6079e28..583c47d 100644
--- a/R/projection.R
+++ b/R/projection.R
@@ -1,8 +1,8 @@
-library(dplyr)
-library(tidyr)
-library(tibble)
-library(vegan)
+#' @import dplyr
+#' @importFrom vegan rrarefy
+#' @importFrom tibble rownames_to_column column_to_rownames
 
+#library(tidyr)
 .gm_mean = function(x){
   exp(mean(log(x[x > 0])))
 }
@@ -10,15 +10,35 @@ library(vegan)
 .rclr <- function(a) {
   answer <- log(a/.gm_mean(a))
   answer[] <- lapply(answer, function(i) if(is.numeric(i)) ifelse(is.infinite(i), 0, i) else i)
-  return(answer)
+  answer
 }
 
+#' Projection retrieval and formatting
+#'
+#' Filters a data frame containing coordinates for
+#' multiple projections and formats it for further use.
+#' @param pname string indicating the name of the projection to use
+#' @param load_all a matrix of weights (with a "projection" field on which to filter)
+#' @returns A data frame of weights for a single projection,
+#'              plus a "taxon" field containing a string unique to that taxon.
 .loadProjection <- function(pname, load_all) {
   load_all |>
     filter(projection==pname) |>
     mutate(taxon = paste(kingdom,phylum,class,order,family, sep='||')) # TODO: use numbers for this instead
 }
 
+#' Projection library builder
+#'
+#' Utility for working with remotely hosted projection files.
+#' @param pname string indicating the name of the projection to use
+#' @param bfc BiocFileCache object to use
+#' @returns A **function** that allows a user to load a projection using
+#'            its name.
+#' @examples
+#' lib <- projection_library()
+#' projected <- project_it(my_data, projection_library('full'))
+#'
+#' @export
 projection_library <- function(bfc = BiocFileCache::BiocFileCache()) {
   versions <- .getVersions(bfc, 'projection')
 
@@ -36,46 +56,113 @@ projection_library <- function(bfc = BiocFileCache::BiocFileCache()) {
   curried
 }
 
-loadUserData <- function(userdata, usertaxa, seed=NA) {
-    if(!is.na(seed)) set.seed(seed)
-
-    userdata |>
+#' User data formatting
+#'
+#' @description
+#' Formats user input data from two input files into a single data frame for further analysis.
+#' Consolidates read counts at the family level.
+#' @details
+#' Taxonomic information is provided across two files: the matrix of read counts uses
+#' id-based column names ("TAX1", "TAX2", etc). The second file associates those column
+#' names with inferred taxonomic ranks.
+#' This function modifies the count table by annotating the column names with the
+#' taxonomic information.
+#' @param userdata data frame countaining samples in rows and taxa in columns
+#' @param usertaxa data frame associating the column names from userdata with kingdom, phylum, class,
+#'                 order, and family classifications.
+#' @returns A data frame with the same number of rows as userdata and one column for each distinct
+#'            family in the usertaxa file that was observed at least once in the dataset.
+#'
+#' @export
+loadUserData <- function(userdata, usertaxa) {
+  test <- userdata |>
         pivot_longer(!sample
             , names_to = 'col_id'
-            , values_to = 'count'
+            , values_to = 'countnum'
         ) |> # one row for each sample/taxon count
+        filter(countnum > 0) |>
         left_join(usertaxa, by='col_id') |> # add taxon names
         mutate(famlevel = paste(kingdom,phylum,class,order,family, sep='||')) |>
-        select(!c('kingdom','phylum','class','order','family','genus')) |>
+        select(!c('kingdom','phylum','class','order','family')) |>
         group_by(sample, famlevel) |> # summarize at family level
         summarise(
-            aggcount = sum(count)
+            aggcount = sum(countnum)
         ) |>
         ungroup() |>
         pivot_wider(names_from='famlevel' # make taxon table
             , values_from='aggcount'
-        ) |>
-        column_to_rownames('sample') |>
-        (\(x) {
-            x[,colSums(x) > 0] # grab non-empty taxa
-        })() |>
-        rownames_to_column('sample') |>
-        unique(by=!c('sample')) |> # ensure no duplicate entries
-        rowwise() |>
-        mutate(
-          totalreads = sum(c_across(where(is.numeric)))
-        ) |>
-        filter(
-          totalreads >= 3000 # make sure each has enough reads
-        ) |> # TODO log the results of this
-        select(!totalreads) |>
-        column_to_rownames('sample') |>
-        rrarefy(3000) |>
-        as.data.frame() |>
-        .rclr() |>
-        rownames_to_column('sample')
+            , values_fill=0
+        )
+}
+
+#' Sample rarefaction
+#'
+#' @description
+#' Helper function for rarefying samples in a taxonomic table to a set read count.
+#' @details
+#' Throwing away data is generally not helpful, but this step is done to ensure
+#' samples being projected into an existing latent space are of the same size as the samples
+#' that were included in the original ordination.
+#' @param userdata data frame countaining samples in rows and taxa in columns. (Generally here the output of `loadUserData()`)
+#' @param level The desired number of reads per sample. **Samples with fewer reads than this will be filtered out.**
+#' @returns A data frame of the same dimensions as userdata, but with rarefied read counts that result in
+#'            all rows summing to `level`.
+#'
+#' @export
+rarefy_dataset <- function(userdata, level=3000, seed=NA) {
+  if(!is.na(seed)) set.seed(seed)
+
+  userdata |>
+    unique(by=!c('sample')) |> # ensure no duplicate entries
+    rowwise() |>
+    mutate(
+      totalreads = sum(c_across(where(is.numeric)))
+    ) |>
+    filter(
+      totalreads >= 3000 # make sure each has enough reads
+    ) |> # TODO log the results of this
+    select(!totalreads) |>
+    column_to_rownames('sample') |>
+    rrarefy(3000) |>
+    as.data.frame() |>
+    rownames_to_column('sample')
+}
+
+#' Robust centered log-ratio transformation
+#'
+#' @description
+#' Helper function for applying rCLR to a data frame with a "sample" column.
+#' @details
+#' This step should happen *after* rarefaction, and converts a matrix of non-negative read counts into
+#' one containing unbounded floating-point numbers.
+#' @param userdata data frame countaining samples in rows and taxa in columns.
+#' @returns A data frame of the same dimensions as userdata, but with rCLR-transformed values
+#'
+#' @export
+transform <- function(userdata) {
+  userdata |>
+    column_to_rownames('sample') |>
+    .rclr() |>
+    rownames_to_column('sample')
 }
 
+#' Projection operation
+#'
+#' @description
+#' Projects a taxonomic table into an existing ordination using a set of pre-calculated loadings.
+#' @details
+#' This is the final step in the projection process. The input describes samples with taxa, and the output
+#' describes the same taxa with principal components. Note that this step doesn't require rCLR-transformed
+#' values *per se* -- input data should be transformed using the same process as the data used in the original
+#' ordination. For the compendium ordinations currently available, this means rCLR.
+#' @param indata data frame countaining samples in rows and taxa in columns.
+#' @param loadings data frame containing taxa in each row (i.e. the indata column names) and
+#'                  a principal component ("PC1", "PC2", etc) in each column. The values in each
+#'                  cell indicate the weight that taxon's read count should be given when calculating
+#'                   a sample's value for that PC.
+#' @returns A data frame of the same rows as data and the same columns as loadings
+#'
+#' @export
 project_it <- function(indata, loadings) {
     indata |>
         pivot_longer(cols=!c(sample)

From d5ab31d8993609b6540a54b884bb1d363c5718db Mon Sep 17 00:00:00 2001
From: Rich Abdill <rabdill@uchicago.edu>
Date: Wed, 6 May 2026 10:58:23 -0400
Subject: [PATCH 8/8] Fixing projection downloads

---
 R/constants.R  |   5 +-
 R/loader.R     | 125 ++++++++++++++++++++++---------------------------
 R/projection.R |   5 +-
 3 files changed, 62 insertions(+), 73 deletions(-)

diff --git a/R/constants.R b/R/constants.R
index 36e85fb..6f7cae8 100644
--- a/R/constants.R
+++ b/R/constants.R
@@ -1 +1,4 @@
-canonical_doi <- 'https://doi.org/10.5281/zenodo.8186993'
+canonical_doi <- c(
+  'compendium' = 'https://doi.org/10.5281/zenodo.8186993'
+  , 'projection' = 'https://doi.org/10.5281/zenodo.19631960'
+)
diff --git a/R/loader.R b/R/loader.R
index 69a86e8..f20615d 100644
--- a/R/loader.R
+++ b/R/loader.R
@@ -15,83 +15,68 @@
 #' @param entry A string from ['compendium','projection'] indicating which manifest to return.
 #' @returns a data.table listing all versions and the necessary URLs
 .getVersions <- function(bfc, entry, verbose=FALSE) {
-    rpath <- BiocFileCache::bfcquery(bfc, entry)$rpath # check if we've already cached the manifest
-    if(length(rpath) == 0) {
-      if(verbose) {
-        print('Retrieving version information.')
-      }
-      resolve <- curl::curl_fetch_memory(canonical_doi[entry])
-      if(resolve$status_code != 200) {
-        stop(paste0(
-          'Could not resolve canonical DOI. Status code: ',
-          resolve$status_code
-        ))
-      }
+  rpath <- BiocFileCache::bfcquery(bfc, entry)$rpath # check if we've already cached the manifest
+  if(length(rpath) == 0) {
+    if(verbose) {
+      print('Retrieving version information.')
+    }
+    resolve <- curl::curl_fetch_memory(canonical_doi[entry])
+    if(resolve$status_code != 200) {
+      stop(paste0(
+        'Could not resolve canonical DOI. Status code: ',
+        resolve$status_code
+      ))
+    }
 
-      if(verbose) {
-        print('Determined data address:')
-        print(resolve$url)
-      }
-      manifest <- paste0(resolve$url, '/files/manifest.csv')
-
-      rpath <- tryCatch(
-        {
-          bfcrpath(bfc, manifest)
-        },
-        error = function(msg){
-          print('Could not retrieve manifest file. Falling back to manifest as of April 2026.')
+    if(verbose) {
+      print('Determined data address:')
+      print(resolve$url)
+    }
+    manifest <- paste0(resolve$url, '/files/manifest.csv')
+
+    rpath <- tryCatch(
+      {
+        bfcrpath(bfc, manifest)
+      },
+      error = function(msg){
+        print('Could not retrieve manifest file. Falling back to manifest as of April 2026.')
+        towrite <- data.table::data.table(
+          version = c('1.1.0', '1.0.1'),
+          zenodo_id = c('13733642', '10452633')
+        )
+        if(entry == 'projection') {
+          print('Falling back to projection manifest.')
           towrite <- data.table::data.table(
-            version = c('1.1.0', '1.0.1'),
-            zenodo_id = c('13733642', '10452633')
+            version = c('0.3.0','0.2.0', '0.1.0'),
+            zenodo_id = c('20040560','19633215', '19631961'),
+            default = c(TRUE, FALSE, FALSE)
           )
-          if(entry == 'projection') {
-            print('Falling back to projection manifest.')
-            towrite <- data.table::data.table(
-              version = c('0.2.0', '0.1.0'),
-              zenodo_id = c('19633215', '19631961'),
-              default = c(TRUE, FALSE)
-            )
-          }
-
-          # we save this to the cache so the app remembers not to keep looking online
-          # for a manifest every time the version information is needed
-          savepath <- BiocFileCache::bfcnew(bfc, entry, ext='.csv')
-          data.table::fwrite(towrite, file=savepath)
-          savepath
         }
-      )
-    }
-    else {
-      if(verbose) {
-        print('Cached version information found.')
+
+        # we save this to the cache so the app remembers not to keep looking online
+        # for a manifest every time the version information is needed
+        savepath <- BiocFileCache::bfcnew(bfc, entry, ext='.csv')
+        data.table::fwrite(towrite, file=savepath)
+        savepath
       }
+    )
+  }
+  else {
+    if(verbose) {
+      print('Cached version information found.')
     }
-    results <- data.table::fread(rpath)
-    # TODO: This is a silly bandage
-    colnames(results) <- c('version','zenodo_id','default')
-    results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/taxonomic_table.csv.gz')
+  }
+  results <- data.table::fread(rpath)
+  # TODO: This is a silly bandage
+  colnames(results) <- c('version','zenodo_id','default')
+  results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/taxonomic_table.csv.gz')
+  results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv')
+  if(entry == 'projection') {
+    results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/loadings.txt')
     results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv')
-    if(entry == 'projection') {
-      results$data_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/loadings.txt')
-      results$coldata_url <- paste0('https://zenodo.org/record/', results$zenodo_id, '/files/sample_metadata.tsv')
-    }
-    data.table::setkey(results, version)
-    results
-}
-
-.getProjectData <- function(version, dataset, bfc) {
-    # dataset in ('compendium', 'projection')
-    versions <- .getVersions(bfc, dataset)
-    rpath <- bfcrpath(bfc, versions[version]$data_url)
-    data.table::fread(rpath)
-}
-
-.getCompendiumColdata <- function(version, bfc) {
-    versions <- .getVersions(bfc, 'compendium')
-    rpath <- bfcrpath(bfc, versions[version]$coldata_url)
-    sampdat <- as.data.frame(data.table::fread(rpath))
-    rownames(sampdat) <- paste(sampdat[[2]], sampdat[[3]], sep = "_")
-    sampdat
+  }
+  data.table::setkey(results, version)
+  results
 }
 
 #' Compendium download
diff --git a/R/projection.R b/R/projection.R
index 583c47d..7a8e218 100644
--- a/R/projection.R
+++ b/R/projection.R
@@ -39,8 +39,9 @@
 #' projected <- project_it(my_data, projection_library('full'))
 #'
 #' @export
-projection_library <- function(bfc = BiocFileCache::BiocFileCache()) {
-  versions <- .getVersions(bfc, 'projection')
+projection_library <- function(version=NA, bfc = BiocFileCache::BiocFileCache()) {
+  versions <- .getVersions(bfc, entry='projection')
+  print(versions)
 
   if(is.na(version)) {
     # If the user has not specified a version, grab whichever