diff --git a/.Rbuildignore b/.Rbuildignore index 1aa03d51..f3832be8 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -15,6 +15,7 @@ cran-comments.md man-roxygen data-raw docs +^.*vtune.*$ revdep \.covrignore ^\.git$ diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 00000000..f80c4d56 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,7 @@ +{ + "permissions": { + "allow": [ + "Bash(Rscript *)" + ] + } +} diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index df89093f..4255992b 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -265,8 +265,6 @@ jobs: uses: r-lib/actions/setup-r-dependencies@v2 with: dependencies: '"hard"' - extra-packages: | - github::ms609/TreeTools needs: benchmark - name: Benchmark PR diff --git a/DESCRIPTION b/DESCRIPTION index 504ffdb4..2472c834 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -87,10 +87,11 @@ VignetteBuilder: knitr Config/Needs/app/optional: uwot Config/Needs/check: rcmdcheck Config/Needs/coverage: covr -Config/Needs/memcheck: devtools +Config/Needs/memcheck: pkgdown, testthat Config/Needs/metadata: codemetar Config/Needs/revdeps: revdepcheck Config/Needs/website: openssl, pkgdown, remotes, shinylive +Config/roxygen2/version: 8.0.0 Config/testthat/parallel: false Config/testthat/edition: 3 SystemRequirements: C++17, pandoc-citeproc @@ -98,5 +99,4 @@ ByteCompile: true Encoding: UTF-8 Language: en-GB X-schema.org-keywords: phylogenetics, tree-distance -RoxygenNote: 7.3.3 Roxygen: list(markdown = TRUE) diff --git a/NEWS.md b/NEWS.md index a051898a..fa770284 100644 --- a/NEWS.md +++ b/NEWS.md @@ -23,6 +23,17 @@ removing a hard dependency on the compile-time `SL_MAX_SPLITS` constant. TreeDist now supports trees of any size permitted by TreeTools. +- **Large-tree support (requires TreeTools ≥ 2.3.0):** all distance + functions now accept trees with up to 32 767 tips (previously limited + to `SL_MAX_TIPS`, 2048 with TreeTools ≤ 2.2.0). The R-level tip-count + guard (`.CheckMaxTips()`) detects the TreeTools version at load time and + unlocks the higher ceiling automatically; no code changes are needed. + All integer counters in the C++ hot paths have been widened from `int16` + to `split_int` (`int32`) to handle split counts above 32 767 without + overflow. Direct `lg2[]` table accesses have been replaced with + `lg2_lookup()` fallback helpers so that trees with more tips than + `SL_MAX_TIPS` are computed correctly via `std::log2` / `std::lgamma`. + ## Performance - `RobinsonFoulds()` now uses a fast C++ batch path for cross-distance diff --git a/R/RcppExports.R b/R/RcppExports.R index 8dd5b1a2..592b4772 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -303,7 +303,7 @@ cpp_mutual_clustering <- function(x, y, nTip) { .Call(`_TreeDist_cpp_mutual_clustering`, x, y, nTip) } -cpp_shared_phylo <- function(x, y, nTip) { - .Call(`_TreeDist_cpp_shared_phylo`, x, y, nTip) +cpp_shared_phylo <- function(x, y, nTip, force_slow = FALSE) { + .Call(`_TreeDist_cpp_shared_phylo`, x, y, nTip, force_slow) } diff --git a/R/tree_distance.R b/R/tree_distance.R index 5ea3a9cb..12caa469 100644 --- a/R/tree_distance.R +++ b/R/tree_distance.R @@ -230,6 +230,23 @@ GeneralizedRF <- function(splits1, splits2, nTip, PairScorer, g[lower.tri(g)] } +# Floor sub-noise distances to zero before normalization. +# Two sources of numerical noise scale with treesIndependentInfo: +# (1) LAP int64 cost-matrix quantization in *Splits scoring; per-cell +# truncation of up to (max_possible / BIG) bits, summed over n_splits. +# (2) Float-accumulation drift between independently-built tables (e.g. +# InfoRobinsonFoulds vs cpp_splitwise_info_batch sum the same per-split +# info contributions, but using different lookup-table constructions). +# Both grow with the magnitude of the answer, so an absolute sqrt(eps) +# tolerance becomes too tight beyond a few thousand tips. Scaling by +# treesIndependentInfo self-adjusts; pmax(1, ·) preserves the original +# tolerance for tiny trees where these errors are negligible anyway. +.FloorNumericalNoise <- function(ret, treesIndependentInfo) { + tol <- pmax(1, treesIndependentInfo) * .Machine[["double.eps"]] ^ 0.5 + ret[ret < tol] <- 0 + ret +} + .AllTipsSame <- function(x, y) { if (is.list(x)) { xPrime <- x[[1]] diff --git a/R/tree_distance_info.R b/R/tree_distance_info.R index dc7441e2..07693302 100644 --- a/R/tree_distance_info.R +++ b/R/tree_distance_info.R @@ -249,16 +249,15 @@ DifferentPhylogeneticInfo <- function(tree1, tree2 = NULL, normalize = FALSE, if (!is.null(fast)) { spi <- fast[["info"]] treesIndependentInfo <- .PairwiseSums(fast[["entropies"]]) - - ret <- treesIndependentInfo - spi - spi + + ret <- .FloorNumericalNoise(treesIndependentInfo - spi - spi, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = SplitwiseInfo, Combine = "+") - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 attributes(ret) <- attributes(spi) return(ret) } - + # Fast path (cross-pairs): same tips, no matching — avoids duplicate as.Splits() fast_many <- .FastManyManyPath(tree1, tree2, reportMatching, cpp_shared_phylo_cross_pairs, @@ -268,25 +267,22 @@ DifferentPhylogeneticInfo <- function(tree1, tree2 = NULL, normalize = FALSE, info1 <- fast_many[["info1"]] info2 <- fast_many[["info2"]] treesIndependentInfo <- outer(info1, info2, "+") - - ret <- treesIndependentInfo - spi - spi + + ret <- .FloorNumericalNoise(treesIndependentInfo - spi - spi, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = SplitwiseInfo, Combine = "+") - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 return(ret) } - + spi <- SharedPhylogeneticInfo(tree1, tree2, normalize = FALSE, diag = FALSE, reportMatching = reportMatching) treesIndependentInfo <- .MaxValue(tree1, tree2, SplitwiseInfo) - - ret <- treesIndependentInfo - spi - spi - ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, + + ret <- .FloorNumericalNoise(treesIndependentInfo - spi - spi, treesIndependentInfo) + ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = SplitwiseInfo, Combine = "+") - - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 # Catch floating point inaccuracy attributes(ret) <- attributes(spi) # Return: @@ -310,16 +306,15 @@ ClusteringInfoDistance <- function(tree1, tree2 = NULL, normalize = FALSE, if (!is.null(fast)) { mci <- fast[["info"]] treesIndependentInfo <- .PairwiseSums(fast[["entropies"]]) - - ret <- treesIndependentInfo - mci - mci + + ret <- .FloorNumericalNoise(treesIndependentInfo - mci - mci, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = ClusteringEntropy, Combine = "+") - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 attributes(ret) <- attributes(mci) return(ret) } - + # Fast path (cross-pairs): same tips, no matching — avoids duplicate as.Splits() fast_many <- .FastManyManyPath(tree1, tree2, reportMatching, cpp_mutual_clustering_cross_pairs, @@ -329,25 +324,22 @@ ClusteringInfoDistance <- function(tree1, tree2 = NULL, normalize = FALSE, info1 <- fast_many[["info1"]] info2 <- fast_many[["info2"]] treesIndependentInfo <- outer(info1, info2, "+") - - ret <- treesIndependentInfo - mci - mci + + ret <- .FloorNumericalNoise(treesIndependentInfo - mci - mci, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = ClusteringEntropy, Combine = "+") - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 return(ret) } - + mci <- MutualClusteringInfo(tree1, tree2, normalize = FALSE, diag = FALSE, reportMatching = reportMatching) treesIndependentInfo <- .MaxValue(tree1, tree2, ClusteringEntropy) - - ret <- treesIndependentInfo - mci - mci + + ret <- .FloorNumericalNoise(treesIndependentInfo - mci - mci, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = ClusteringEntropy, Combine = "+") - - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 # Handle floating point inaccuracy attributes(ret) <- attributes(mci) # Return: diff --git a/R/tree_distance_msi.R b/R/tree_distance_msi.R index aadfc2f6..ff89cb18 100644 --- a/R/tree_distance_msi.R +++ b/R/tree_distance_msi.R @@ -29,15 +29,14 @@ MatchingSplitInfoDistance <- function(tree1, tree2 = NULL, msi <- fast[["info"]] treesIndependentInfo <- .PairwiseSums(fast[["entropies"]]) - ret <- treesIndependentInfo - msi - msi + ret <- .FloorNumericalNoise(treesIndependentInfo - msi - msi, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = SplitwiseInfo, Combine = "+") - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 attributes(ret) <- attributes(msi) return(ret) } - + # Fast path (cross-pairs): same tips, no matching — avoids duplicate as.Splits() fast_many <- .FastManyManyPath(tree1, tree2, reportMatching, cpp_msi_cross_pairs, @@ -47,25 +46,22 @@ MatchingSplitInfoDistance <- function(tree1, tree2 = NULL, info1 <- fast_many[["info1"]] info2 <- fast_many[["info2"]] treesIndependentInfo <- outer(info1, info2, "+") - - ret <- treesIndependentInfo - msi - msi + + ret <- .FloorNumericalNoise(treesIndependentInfo - msi - msi, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = SplitwiseInfo, Combine = "+") - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 return(ret) } - + msi <- MatchingSplitInfo(tree1, tree2, normalize = FALSE, diag = FALSE, reportMatching = reportMatching) - + treesIndependentInfo <- .MaxValue(tree1, tree2, SplitwiseInfo) - ret <- treesIndependentInfo - msi - msi + ret <- .FloorNumericalNoise(treesIndependentInfo - msi - msi, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = SplitwiseInfo, Combine = "+") - - ret[ret < .Machine[["double.eps"]]^0.5] <- 0 # In case of floating point inaccuracy attributes(ret) <- attributes(msi) # Return: ret diff --git a/R/tree_distance_nni.R b/R/tree_distance_nni.R index 45854764..d47d4ac6 100644 --- a/R/tree_distance_nni.R +++ b/R/tree_distance_nni.R @@ -73,9 +73,7 @@ NNIDist <- function(tree1, tree2 = tree1) { #' @importFrom TreeTools Postorder RenumberTips #' @importFrom ape Nnode.phylo .NNIDistSingle <- function(tree1, tree2, nTip, ...) { - if (nTip > 32768L) { - stop("Cannot calculate NNI distance for trees with so many tips.") - } + .CheckMaxTips(nTip) if (nrow(tree1[["edge"]]) != nrow(tree2[["edge"]])) { stop("Both trees must have the same number of edges. ", "Is one rooted and the other unrooted?") diff --git a/R/tree_distance_rf.R b/R/tree_distance_rf.R index 9f9454cc..f5622b81 100644 --- a/R/tree_distance_rf.R +++ b/R/tree_distance_rf.R @@ -75,14 +75,15 @@ InfoRobinsonFoulds <- function(tree1, tree2 = NULL, similarity = FALSE, cpp_splitwise_info_batch) if (!is.null(fast)) { treesIndependentInfo <- .PairwiseSums(fast[["entropies"]]) - unnormalized <- treesIndependentInfo - fast[["info"]] - fast[["info"]] - unnormalized[unnormalized < .Machine[["double.eps"]] ^ 0.5] <- 0 + unnormalized <- .FloorNumericalNoise( + treesIndependentInfo - fast[["info"]] - fast[["info"]], + treesIndependentInfo) ret <- NormalizeInfo(unnormalized, tree1, tree2, how = normalize, InfoInTree = SplitwiseInfo, Combine = "+") attributes(ret) <- attributes(fast[["info"]]) return(ret) } - + # Cross-pairs fast path fast_many <- .FastManyManyPath(tree1, tree2, reportMatching, cpp_rf_info_cross_pairs, @@ -92,25 +93,24 @@ InfoRobinsonFoulds <- function(tree1, tree2 = NULL, similarity = FALSE, info1 <- fast_many[["info1"]] info2 <- fast_many[["info2"]] treesIndependentInfo <- outer(info1, info2, "+") - - unnormalized <- treesIndependentInfo - irf - irf - unnormalized[unnormalized < .Machine[["double.eps"]] ^ 0.5] <- 0 + + unnormalized <- .FloorNumericalNoise(treesIndependentInfo - irf - irf, + treesIndependentInfo) ret <- NormalizeInfo(unnormalized, tree1, tree2, how = normalize, InfoInTree = SplitwiseInfo, Combine = "+") return(ret) } } - - unnormalized <- CalculateTreeDistance(InfoRobinsonFouldsSplits, tree1, tree2, + + unnormalized <- CalculateTreeDistance(InfoRobinsonFouldsSplits, tree1, tree2, reportMatching) * 2 - + if (!similarity) { - unnormalized <- .MaxValue(tree1, tree2, SplitwiseInfo) - unnormalized + treesIndependentInfo <- .MaxValue(tree1, tree2, SplitwiseInfo) + unnormalized <- .FloorNumericalNoise(treesIndependentInfo - unnormalized, + treesIndependentInfo) } - # In case of floating point inaccuracy - unnormalized[unnormalized < .Machine[["double.eps"]] ^ 0.5] <- 0 - # Return: NormalizeInfo(unnormalized, tree1, tree2, how = normalize, InfoInTree = SplitwiseInfo, Combine = "+") diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R index b85fab32..84b19e2d 100644 --- a/R/tree_distance_utilities.R +++ b/R/tree_distance_utilities.R @@ -1,16 +1,23 @@ -# Validate that nTip does not exceed the compiled SL_MAX_TIPS limit. +# Validate that nTip does not exceed the supported tip-count ceiling. +# cpp_sl_max_tips() > 2048L iff TreeTools >= 2.3.0 raised the stack threshold +# and provides heap-backed split storage; accept up to 32767 tips in that case. +# Otherwise cap at the compiled SL_MAX_TIPS. # Called from every distance entry point before any C++ work. .CheckMaxTips <- function(nTip) { - if (!is.na(nTip) && nTip > .SL_MAX_TIPS) { - if (.SL_MAX_TIPS < 32704L) { - stop( - "Trees with ", nTip, " tips exceed the compiled limit of ", - .SL_MAX_TIPS, " tips.", - "\nUpdate TreeTools and reinstall TreeDist to support more tips." - ) + if (is.na(nTip)) return(invisible(NULL)) + sl_max <- cpp_sl_max_tips() + if (sl_max > 2048L) { + if (nTip > 32767L) { + stop("Trees with ", nTip, + " tips are not yet supported (maximum 32767).") } - stop("Trees with ", nTip, " tips are not yet supported (maximum ", - .SL_MAX_TIPS, ")") + } else if (nTip > sl_max) { + # else-if fires only when sl_max <= 2048 (TreeTools < 2.3.0) + stop( + "Trees with ", nTip, " tips exceed the compiled limit of ", + sl_max, " tips.", + "\nUpdate TreeTools and reinstall TreeDist to support more tips." + ) } } diff --git a/R/zzz.R b/R/zzz.R index 614ebbda..826528ea 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,9 +1,3 @@ -.SL_MAX_TIPS <- NULL # populated in .onLoad - -.onLoad <- function(libname, pkgname) { - .SL_MAX_TIPS <<- cpp_sl_max_tips() -} - .onUnload <- function(libpath) { StopParallel(quietly = TRUE) library.dynam.unload("TreeDist", libpath) diff --git a/cran-comments.md b/cran-comments.md index 31e312b3..3bae24af 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,7 +1,8 @@ ## Test environments -* Local machine, Windows 10, R devel (2024-09-02 r87090 ucrt) +* Local machine, Windows 11, R version 4.6.0 (2026-04-24 ucrt) * `devtools::check_win_devel()` +* `devtools::check_mac_devel()` * [GitHub Actions](https://github.com/ms609/TreeDist/actions): - windows-latest, R release diff --git a/inst/_pkgdown.yml b/inst/_pkgdown.yml index 2922547e..664ef8e6 100644 --- a/inst/_pkgdown.yml +++ b/inst/_pkgdown.yml @@ -85,6 +85,7 @@ navbar: href: news/index.html github: icon: fa-github fa-lg + aria-label: View this repo on GitHub href: https://github.com/ms609/TreeDist articles: diff --git a/inst/include/TreeDist/mutual_clustering.h b/inst/include/TreeDist/mutual_clustering.h index 437633f0..23134905 100644 --- a/inst/include/TreeDist/mutual_clustering.h +++ b/inst/include/TreeDist/mutual_clustering.h @@ -22,7 +22,9 @@ namespace TreeDist { // lg2_unrooted[i] = log2((2i-5)!!) for i >= 3 // lg2_rooted = &lg2_unrooted[0] + 1 (so lg2_rooted[i] = lg2_unrooted[i+1]) // - // These are defined in mutual_clustering_impl.h. + // These are fast-path caches sized to TreeTools' stack threshold. + // For larger trees, fall back to on-the-fly computation via the _lookup helpers. + // Definitions are in mutual_clustering_impl.h. extern double lg2[SL_MAX_TIPS + 1]; extern double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; @@ -33,15 +35,46 @@ namespace TreeDist { // computation. max_tips should be >= the largest tree size used. void init_lg2_tables(int max_tips); + // Out-of-line slow paths for inputs that exceed the precomputed tables + // (i.e. n_tips > SL_MAX_TIPS). Defined in mutual_clustering_impl.h. + // Kept out-of-line so the *_lookup() inline thunks below stay tiny — + // table load + predicted branch, no extern function calls — which some + // compilers refuse to inline through, suppressing optimization in the + // per-cell hot loops of PID/MCI/MSI/RF info. + double lg2_slow(split_int x); + double lg2_unrooted_slow(split_int n_tips); + double lg2_rooted_slow(split_int n_tips); + + // log2(x) — table-fast for x <= SL_MAX_TIPS, runtime otherwise. + [[nodiscard]] inline double lg2_lookup(split_int x) noexcept { + return (x <= static_cast(SL_MAX_TIPS)) + ? lg2[x] + : lg2_slow(x); // #nocov + } + + // log2((2n-5)!!) — table-fast for n <= SL_MAX_TIPS+1, lgamma otherwise. + [[nodiscard]] inline double lg2_unrooted_lookup(split_int n_tips) noexcept { + return (n_tips <= static_cast(SL_MAX_TIPS + 1)) + ? lg2_unrooted[n_tips] + : lg2_unrooted_slow(n_tips); // #nocov + } + + // log2((2n-3)!!) — table-fast for n <= SL_MAX_TIPS+1, lgamma otherwise. + [[nodiscard]] inline double lg2_rooted_lookup(split_int n_tips) noexcept { + return (n_tips <= static_cast(SL_MAX_TIPS + 1)) + ? lg2_rooted[n_tips] + : lg2_rooted_slow(n_tips); // #nocov + } + // ---- Inline helpers ---- // Information content of a perfectly-matching split pair. // ic_matching(a, b, n) = (a + b) * lg2[n] - a * lg2[a] - b * lg2[b] - [[nodiscard]] inline double ic_matching(int16 a, int16 b, - int16 n) noexcept { - const double lg2a = lg2[a]; - const double lg2b = lg2[b]; - const double lg2n = lg2[n]; + [[nodiscard]] inline double ic_matching(split_int a, split_int b, + split_int n) noexcept { + const double lg2a = lg2_lookup(a); + const double lg2b = lg2_lookup(b); + const double lg2n = lg2_lookup(n); return (a + b) * lg2n - a * lg2a - b * lg2b; } @@ -77,9 +110,9 @@ namespace TreeDist { // Implementation in mutual_clustering_impl.h. double mutual_clustering_score( - const splitbit* const* a_state, const int16* a_in, int16 a_n_splits, - const splitbit* const* b_state, const int16* b_in, int16 b_n_splits, - int16 n_bins, int32 n_tips, + const splitbit* const* a_state, const split_int* a_in, split_int a_n_splits, + const splitbit* const* b_state, const split_int* b_in, split_int b_n_splits, + split_int n_bins, int32 n_tips, LapScratch& scratch); } // namespace TreeDist diff --git a/inst/include/TreeDist/mutual_clustering_impl.h b/inst/include/TreeDist/mutual_clustering_impl.h index 7f98c8fb..cea725be 100644 --- a/inst/include/TreeDist/mutual_clustering_impl.h +++ b/inst/include/TreeDist/mutual_clustering_impl.h @@ -58,6 +58,30 @@ void init_lg2_tables(int max_tips) { } } +// ---- Out-of-line slow paths (n_tips > SL_MAX_TIPS) ---- +// Defined out-of-line so that the *_lookup() inline thunks in +// mutual_clustering.h stay trivial enough for any compiler to inline. + +// LCOV_EXCL_START +double lg2_slow(split_int x) { + return std::log2(static_cast(x)); +} + +double lg2_unrooted_slow(split_int n_tips) { + if (n_tips < 3) return 0.0; + const double n = static_cast(n_tips); + return (std::lgamma(2.0 * n - 3.0) - std::lgamma(n - 1.0)) / std::log(2.0) + - (n - 2.0); +} + +double lg2_rooted_slow(split_int n_tips) { + if (n_tips < 2) return 0.0; + const double n = static_cast(n_tips); + return (std::lgamma(2.0 * n - 1.0) - std::lgamma(n)) / std::log(2.0) + - (n - 1.0); +} +// LCOV_EXCL_STOP + // ---- Sort+merge exact-match detection (internal) ---- // // Canonicalise each split so bit 0 is always set (flip complement if not), @@ -68,17 +92,17 @@ void init_lg2_tables(int max_tips) { namespace detail { -static int16 find_exact_matches_raw( - const splitbit* const* a_state, const int16* /*a_in*/, int16 a_n, - const splitbit* const* b_state, const int16* /*b_in*/, int16 b_n, - int16 n_bins, int32 n_tips, - int16* a_match, int16* b_match) +static split_int find_exact_matches_raw( + const splitbit* const* a_state, const split_int* /*a_in*/, split_int a_n, + const splitbit* const* b_state, const split_int* /*b_in*/, split_int b_n, + split_int n_bins, int32 n_tips, + split_int* a_match, split_int* b_match) { - std::fill(a_match, a_match + a_n, int16(0)); - std::fill(b_match, b_match + b_n, int16(0)); + std::fill(a_match, a_match + a_n, split_int(0)); + std::fill(b_match, b_match + b_n, split_int(0)); if (a_n == 0 || b_n == 0) return 0; - const int16 last_bin = n_bins - 1; + const split_int last_bin = n_bins - 1; const splitbit last_mask = (n_tips % SL_BIN_SIZE == 0) ? ~splitbit(0) : (splitbit(1) << (n_tips % SL_BIN_SIZE)) - 1; @@ -87,17 +111,17 @@ static int16 find_exact_matches_raw( std::vector a_canon(static_cast(a_n) * n_bins); std::vector b_canon(static_cast(b_n) * n_bins); - for (int16 i = 0; i < a_n; ++i) { + for (split_int i = 0; i < a_n; ++i) { const bool flip = !(a_state[i][0] & 1); - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { splitbit val = flip ? ~a_state[i][bin] : a_state[i][bin]; if (bin == last_bin) val &= last_mask; a_canon[i * n_bins + bin] = val; } } - for (int16 i = 0; i < b_n; ++i) { + for (split_int i = 0; i < b_n; ++i) { const bool flip = !(b_state[i][0] & 1); - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { splitbit val = flip ? ~b_state[i][bin] : b_state[i][bin]; if (bin == last_bin) val &= last_mask; b_canon[i * n_bins + bin] = val; @@ -105,8 +129,8 @@ static int16 find_exact_matches_raw( } // Sort index arrays by canonical form - auto canon_less = [&](const splitbit* canon, int16 n_b, int16 i, int16 j) { - for (int16 bin = 0; bin < n_b; ++bin) { + auto canon_less = [&](const splitbit* canon, split_int n_b, split_int i, split_int j) { + for (split_int bin = 0; bin < n_b; ++bin) { const splitbit vi = canon[i * n_b + bin]; const splitbit vj = canon[j * n_b + bin]; if (vi < vj) return true; @@ -115,28 +139,28 @@ static int16 find_exact_matches_raw( return false; }; - std::vector a_order(a_n), b_order(b_n); - std::iota(a_order.begin(), a_order.end(), int16(0)); - std::iota(b_order.begin(), b_order.end(), int16(0)); + std::vector a_order(a_n), b_order(b_n); + std::iota(a_order.begin(), a_order.end(), split_int(0)); + std::iota(b_order.begin(), b_order.end(), split_int(0)); std::sort(a_order.begin(), a_order.end(), - [&](int16 i, int16 j) { + [&](split_int i, split_int j) { return canon_less(a_canon.data(), n_bins, i, j); }); std::sort(b_order.begin(), b_order.end(), - [&](int16 i, int16 j) { + [&](split_int i, split_int j) { return canon_less(b_canon.data(), n_bins, i, j); }); // Merge-scan - int16 exact_n = 0; - int16 ai_pos = 0, bi_pos = 0; + split_int exact_n = 0; + split_int ai_pos = 0, bi_pos = 0; while (ai_pos < a_n && bi_pos < b_n) { - const int16 ai = a_order[ai_pos]; - const int16 bi = b_order[bi_pos]; + const split_int ai = a_order[ai_pos]; + const split_int bi = b_order[bi_pos]; int cmp = 0; - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { const splitbit va = a_canon[ai * n_bins + bin]; const splitbit vb = b_canon[bi * n_bins + bin]; if (va < vb) { cmp = -1; break; } @@ -165,41 +189,41 @@ static int16 find_exact_matches_raw( // ---- MCI score implementation ---- double mutual_clustering_score( - const splitbit* const* a_state, const int16* a_in, int16 a_n_splits, - const splitbit* const* b_state, const int16* b_in, int16 b_n_splits, - int16 n_bins, int32 n_tips, + const splitbit* const* a_state, const split_int* a_in, split_int a_n_splits, + const splitbit* const* b_state, const split_int* b_in, split_int b_n_splits, + split_int n_bins, int32 n_tips, LapScratch& scratch) { if (a_n_splits == 0 || b_n_splits == 0 || n_tips == 0) return 0.0; - const int16 most_splits = std::max(a_n_splits, b_n_splits); + const split_int most_splits = std::max(a_n_splits, b_n_splits); const double n_tips_rcp = 1.0 / static_cast(n_tips); constexpr cost max_score = BIG; constexpr double over_max = 1.0 / static_cast(BIG); const double max_over_tips = static_cast(BIG) * n_tips_rcp; - const double lg2_n = lg2[n_tips]; + const double lg2_n = lg2_lookup(n_tips); // --- Phase 1: O(n log n) exact-match detection --- - std::vector a_match_buf(a_n_splits); - std::vector b_match_buf(b_n_splits); + std::vector a_match_buf(a_n_splits); + std::vector b_match_buf(b_n_splits); - const int16 exact_n = detail::find_exact_matches_raw( + const split_int exact_n = detail::find_exact_matches_raw( a_state, a_in, a_n_splits, b_state, b_in, b_n_splits, n_bins, n_tips, a_match_buf.data(), b_match_buf.data()); - const int16* a_match = a_match_buf.data(); - const int16* b_match = b_match_buf.data(); + const split_int* a_match = a_match_buf.data(); + const split_int* b_match = b_match_buf.data(); // Accumulate exact-match score double exact_score = 0.0; - for (int16 ai = 0; ai < a_n_splits; ++ai) { + for (split_int ai = 0; ai < a_n_splits; ++ai) { if (a_match[ai]) { - const int16 na = a_in[ai]; - const int16 nA = static_cast(n_tips - na); - exact_score += ic_matching(na, nA, static_cast(n_tips)); + const split_int na = a_in[ai]; + const split_int nA = n_tips - na; + exact_score += ic_matching(na, nA, n_tips); } } @@ -209,58 +233,58 @@ double mutual_clustering_score( } // --- Phase 2: fill cost matrix for unmatched splits only (O(k²)) --- - const int16 lap_n = most_splits - exact_n; + const split_int lap_n = most_splits - exact_n; - std::vector a_unmatch, b_unmatch; + std::vector a_unmatch, b_unmatch; a_unmatch.reserve(lap_n); b_unmatch.reserve(lap_n); - for (int16 ai = 0; ai < a_n_splits; ++ai) { + for (split_int ai = 0; ai < a_n_splits; ++ai) { if (!a_match[ai]) a_unmatch.push_back(ai); } - for (int16 bi = 0; bi < b_n_splits; ++bi) { + for (split_int bi = 0; bi < b_n_splits; ++bi) { if (!b_match[bi]) b_unmatch.push_back(bi); } scratch.score_pool.resize(lap_n); CostMatrix& score = scratch.score_pool; - const int16 a_unmatched_n = static_cast(a_unmatch.size()); - const int16 b_unmatched_n = static_cast(b_unmatch.size()); + const split_int a_unmatched_n = static_cast(a_unmatch.size()); + const split_int b_unmatched_n = static_cast(b_unmatch.size()); - for (int16 a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { - const int16 ai = a_unmatch[a_pos]; - const int16 na = a_in[ai]; - const int16 nA = static_cast(n_tips - na); + for (split_int a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { + const split_int ai = a_unmatch[a_pos]; + const split_int na = a_in[ai]; + const split_int nA = n_tips - na; const splitbit* a_row = a_state[ai]; - const double offset_a = lg2_n - lg2[na]; - const double offset_A = lg2_n - lg2[nA]; + const double offset_a = lg2_n - lg2_lookup(na); + const double offset_A = lg2_n - lg2_lookup(nA); - for (int16 b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { - const int16 bi = b_unmatch[b_pos]; + for (split_int b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { + const split_int bi = b_unmatch[b_pos]; const splitbit* b_row = b_state[bi]; - int16 a_and_b = 0; - for (int16 bin = 0; bin < n_bins; ++bin) { + split_int a_and_b = 0; + for (split_int bin = 0; bin < n_bins; ++bin) { a_and_b += TreeTools::count_bits(a_row[bin] & b_row[bin]); } - const int16 nb = b_in[bi]; - const int16 nB = static_cast(n_tips - nb); - const int16 a_and_B = na - a_and_b; - const int16 A_and_b = nb - a_and_b; - const int16 A_and_B = nA - A_and_b; + const split_int nb = b_in[bi]; + const split_int nB = n_tips - nb; + const split_int a_and_B = na - a_and_b; + const split_int A_and_b = nb - a_and_b; + const split_int A_and_B = nA - A_and_b; if (a_and_b == A_and_b && a_and_b == a_and_B && a_and_b == A_and_B) { score(a_pos, b_pos) = max_score; } else { - const double lg2_nb = lg2[nb]; - const double lg2_nB = lg2[nB]; + const double lg2_nb = lg2_lookup(nb); + const double lg2_nB = lg2_lookup(nB); const double ic_sum = - a_and_b * (lg2[a_and_b] + offset_a - lg2_nb) + - a_and_B * (lg2[a_and_B] + offset_a - lg2_nB) + - A_and_b * (lg2[A_and_b] + offset_A - lg2_nb) + - A_and_B * (lg2[A_and_B] + offset_A - lg2_nB); + a_and_b * (lg2_lookup(a_and_b) + offset_a - lg2_nb) + + a_and_B * (lg2_lookup(a_and_B) + offset_a - lg2_nB) + + A_and_b * (lg2_lookup(A_and_b) + offset_A - lg2_nb) + + A_and_B * (lg2_lookup(A_and_B) + offset_A - lg2_nB); score(a_pos, b_pos) = max_score - static_cast(ic_sum * max_over_tips); } diff --git a/inst/include/TreeDist/types.h b/inst/include/TreeDist/types.h index c953989b..60b49ec0 100644 --- a/inst/include/TreeDist/types.h +++ b/inst/include/TreeDist/types.h @@ -16,6 +16,10 @@ namespace TreeDist { using int32 = int_fast32_t; using cost = int_fast64_t; + // Primary type for split/tip/bin counters. int32 benchmarks faster than int16 + // (avoids sign-extension when mixing with SplitList int32 members). + using split_int = int32; + using lap_dim = int; using lap_row = lap_dim; using lap_col = lap_dim; diff --git a/man/HierarchicalMutualInfo.Rd b/man/HierarchicalMutualInfo.Rd index 193b18ba..0da33247 100644 --- a/man/HierarchicalMutualInfo.Rd +++ b/man/HierarchicalMutualInfo.Rd @@ -124,17 +124,17 @@ AHMI(tree1, tree2, precision = 0.1) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \concept{tree distances} diff --git a/man/Islands.Rd b/man/Islands.Rd index 41c24d1b..9bfcfb4a 100644 --- a/man/Islands.Rd +++ b/man/Islands.Rd @@ -64,13 +64,13 @@ par(oPar) \insertAllCited{} } \seealso{ -Other tree space functions: -\code{\link{MSTSegments}()}, -\code{\link{MapTrees}()}, -\code{\link{MappingQuality}()}, -\code{\link{SpectralEigens}()}, +Other tree space functions: +\code{\link[=MSTSegments]{MSTSegments()}}, +\code{\link[=MapTrees]{MapTrees()}}, +\code{\link[=MappingQuality]{MappingQuality()}}, +\code{\link[=SpectralEigens]{SpectralEigens()}}, \code{\link{cluster-statistics}}, -\code{\link{median.multiPhylo}()} +\code{\link[=median.multiPhylo]{median.multiPhylo()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/JaccardRobinsonFoulds.Rd b/man/JaccardRobinsonFoulds.Rd index 7e515be7..ad70a60f 100644 --- a/man/JaccardRobinsonFoulds.Rd +++ b/man/JaccardRobinsonFoulds.Rd @@ -126,18 +126,18 @@ VisualizeMatching(JRF2, tree1, tree2, matchZeros = FALSE) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/KMeansPP.Rd b/man/KMeansPP.Rd index e279beb3..cab7aebe 100644 --- a/man/KMeansPP.Rd +++ b/man/KMeansPP.Rd @@ -45,7 +45,7 @@ plot(x, col = plusters$cluster, pch = rep(15:19, each = 5)) \seealso{ \code{\link[stats]{kmeans}} -Other cluster functions: +Other cluster functions: \code{\link{cluster-statistics}} } \author{ diff --git a/man/KendallColijn.Rd b/man/KendallColijn.Rd index 57d01772..8bb666b0 100644 --- a/man/KendallColijn.Rd +++ b/man/KendallColijn.Rd @@ -125,18 +125,18 @@ KCDiameter(trees) is a more sophisticated, if more cumbersome, implementation that supports lambda > 0, i.e. use of edge lengths in tree comparison. -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/MASTSize.Rd b/man/MASTSize.Rd index db028b0a..a50bf2d1 100644 --- a/man/MASTSize.Rd +++ b/man/MASTSize.Rd @@ -62,18 +62,18 @@ CompareAll(as.phylo(0:4, 8), MASTInfo) \code{\link[phangorn:mast]{phangorn::mast()}}, a slower implementation that also lists the leaves contained within the subtree. -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/MCITree.Rd b/man/MCITree.Rd index 8a119ba0..18528bab 100644 --- a/man/MCITree.Rd +++ b/man/MCITree.Rd @@ -47,8 +47,8 @@ SplitwiseInfo(mcc) } \seealso{ -Other summary trees: -\code{\link{TransferConsensus}()} +Other summary trees: +\code{\link[=TransferConsensus]{TransferConsensus()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/MSTSegments.Rd b/man/MSTSegments.Rd index 1ac77390..0b3c5186 100644 --- a/man/MSTSegments.Rd +++ b/man/MSTSegments.Rd @@ -70,13 +70,13 @@ PlotTools::SpectrumLegend( \insertAllCited{} } \seealso{ -Other tree space functions: -\code{\link{Islands}()}, -\code{\link{MapTrees}()}, -\code{\link{MappingQuality}()}, -\code{\link{SpectralEigens}()}, +Other tree space functions: +\code{\link[=Islands]{Islands()}}, +\code{\link[=MapTrees]{MapTrees()}}, +\code{\link[=MappingQuality]{MappingQuality()}}, +\code{\link[=SpectralEigens]{SpectralEigens()}}, \code{\link{cluster-statistics}}, -\code{\link{median.multiPhylo}()} +\code{\link[=median.multiPhylo]{median.multiPhylo()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/MapTrees.Rd b/man/MapTrees.Rd index 59890840..69f9ed6d 100644 --- a/man/MapTrees.Rd +++ b/man/MapTrees.Rd @@ -110,13 +110,13 @@ this application. The application itself can be cited using Full detail of tree space analysis in R is provided in the accompanying \href{https://ms609.github.io/TreeDist/articles/treespace.html}{vignette}. -Other tree space functions: -\code{\link{Islands}()}, -\code{\link{MSTSegments}()}, -\code{\link{MappingQuality}()}, -\code{\link{SpectralEigens}()}, +Other tree space functions: +\code{\link[=Islands]{Islands()}}, +\code{\link[=MSTSegments]{MSTSegments()}}, +\code{\link[=MappingQuality]{MappingQuality()}}, +\code{\link[=SpectralEigens]{SpectralEigens()}}, \code{\link{cluster-statistics}}, -\code{\link{median.multiPhylo}()} +\code{\link[=median.multiPhylo]{median.multiPhylo()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/MappingQuality.Rd b/man/MappingQuality.Rd index 05751dfb..e9c77c05 100644 --- a/man/MappingQuality.Rd +++ b/man/MappingQuality.Rd @@ -41,13 +41,13 @@ MappingQuality(distances, dist(mapping), 4) \insertAllCited{} } \seealso{ -Other tree space functions: -\code{\link{Islands}()}, -\code{\link{MSTSegments}()}, -\code{\link{MapTrees}()}, -\code{\link{SpectralEigens}()}, +Other tree space functions: +\code{\link[=Islands]{Islands()}}, +\code{\link[=MSTSegments]{MSTSegments()}}, +\code{\link[=MapTrees]{MapTrees()}}, +\code{\link[=SpectralEigens]{SpectralEigens()}}, \code{\link{cluster-statistics}}, -\code{\link{median.multiPhylo}()} +\code{\link[=median.multiPhylo]{median.multiPhylo()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/MatchingSplitDistance.Rd b/man/MatchingSplitDistance.Rd index b5d64b27..5b1eb124 100644 --- a/man/MatchingSplitDistance.Rd +++ b/man/MatchingSplitDistance.Rd @@ -84,18 +84,18 @@ VisualizeMatching(MatchingSplitDistance, TreeTools::BalancedTree(6), \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/NNIDist.Rd b/man/NNIDist.Rd index 5e24301b..23ed0839 100644 --- a/man/NNIDist.Rd +++ b/man/NNIDist.Rd @@ -101,18 +101,18 @@ CompareAll(as.phylo(30:33, 8), NNIDist) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/NyeSimilarity.Rd b/man/NyeSimilarity.Rd index d984b129..21a40aee 100644 --- a/man/NyeSimilarity.Rd +++ b/man/NyeSimilarity.Rd @@ -122,18 +122,18 @@ NyeSimilarity(as.phylo(0:5, nTip = 8), similarity = FALSE) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/PathDist.Rd b/man/PathDist.Rd index 1192d4ce..17a0a094 100644 --- a/man/PathDist.Rd +++ b/man/PathDist.Rd @@ -68,18 +68,18 @@ PathDist(as.phylo(30:33, 8)) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/Robinson-Foulds.Rd b/man/Robinson-Foulds.Rd index 7510e094..1b4c656b 100644 --- a/man/Robinson-Foulds.Rd +++ b/man/Robinson-Foulds.Rd @@ -144,18 +144,18 @@ VisualizeMatching(InfoRobinsonFoulds, balanced7, pectinate7) \seealso{ Display paired splits: \code{\link[=VisualizeMatching]{VisualizeMatching()}} -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/SPRDist.Rd b/man/SPRDist.Rd index 5075abda..572f54a8 100644 --- a/man/SPRDist.Rd +++ b/man/SPRDist.Rd @@ -74,18 +74,18 @@ functions \code{USPRDist()} and \code{ReplugDist()}. the \insertCite{deOliveira2008;textual}{TreeDist} algorithm but can crash when sent trees of certain formats, and tends to have a longer running time. -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/SpectralEigens.Rd b/man/SpectralEigens.Rd index 16067450..546e2b98 100644 --- a/man/SpectralEigens.Rd +++ b/man/SpectralEigens.Rd @@ -37,13 +37,13 @@ plot(eigens, pch = 15, col = clusts$cluster) plot(cmdscale(distances), pch = 15, col = clusts$cluster) } \seealso{ -Other tree space functions: -\code{\link{Islands}()}, -\code{\link{MSTSegments}()}, -\code{\link{MapTrees}()}, -\code{\link{MappingQuality}()}, +Other tree space functions: +\code{\link[=Islands]{Islands()}}, +\code{\link[=MSTSegments]{MSTSegments()}}, +\code{\link[=MapTrees]{MapTrees()}}, +\code{\link[=MappingQuality]{MappingQuality()}}, \code{\link{cluster-statistics}}, -\code{\link{median.multiPhylo}()} +\code{\link[=median.multiPhylo]{median.multiPhylo()}} } \author{ Adapted by MRS from script by \href{https://rpubs.com/nurakawa/spectral-clustering}{Nura Kawa} diff --git a/man/SplitEntropy.Rd b/man/SplitEntropy.Rd index 4388b274..494dd322 100644 --- a/man/SplitEntropy.Rd +++ b/man/SplitEntropy.Rd @@ -40,8 +40,8 @@ SplitEntropy(c(A, A, A, B, B, B), c(A, A, B, B, B, B)) \insertAllCited{} } \seealso{ -Other information functions: -\code{\link{SplitSharedInformation}()}, +Other information functions: +\code{\link[=SplitSharedInformation]{SplitSharedInformation()}}, \code{\link{TreeInfo}} } \author{ diff --git a/man/SplitSharedInformation.Rd b/man/SplitSharedInformation.Rd index 57560887..25f1fc78 100644 --- a/man/SplitSharedInformation.Rd +++ b/man/SplitSharedInformation.Rd @@ -87,8 +87,8 @@ splits. \insertAllCited{} } \seealso{ -Other information functions: -\code{\link{SplitEntropy}()}, +Other information functions: +\code{\link[=SplitEntropy]{SplitEntropy()}}, \code{\link{TreeInfo}} } \author{ diff --git a/man/TransferConsensus.Rd b/man/TransferConsensus.Rd index 219111ab..1c964d4d 100644 --- a/man/TransferConsensus.Rd +++ b/man/TransferConsensus.Rd @@ -66,7 +66,7 @@ cat("Transfer consensus splits:", NSplits(tc), "\n") \insertAllCited{} } \seealso{ -Other summary trees: -\code{\link{MCITree}()} +Other summary trees: +\code{\link[=MCITree]{MCITree()}} } \concept{summary trees} diff --git a/man/TransferDist.Rd b/man/TransferDist.Rd index 2439130f..d9361014 100644 --- a/man/TransferDist.Rd +++ b/man/TransferDist.Rd @@ -102,17 +102,17 @@ TransferDist(as.phylo(0:5, 8)) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \concept{tree distances} diff --git a/man/TreeDist-package.Rd b/man/TreeDist-package.Rd index 15872b54..7d4da0d5 100644 --- a/man/TreeDist-package.Rd +++ b/man/TreeDist-package.Rd @@ -134,6 +134,11 @@ Geodesic distance \author{ \strong{Maintainer}: Martin R. Smith \email{martin.smith@durham.ac.uk} (\href{https://orcid.org/0000-0001-5660-1727}{ORCID}) [copyright holder, programmer] +Authors: +\itemize{ + \item Martin R. Smith \email{martin.smith@durham.ac.uk} (\href{https://orcid.org/0000-0001-5660-1727}{ORCID}) [copyright holder, programmer] +} + Other contributors: \itemize{ \item Roy Jonker \email{roy_jonker@magiclogic.com} (LAP algorithm) [programmer, copyright holder] diff --git a/man/TreeDistance.Rd b/man/TreeDistance.Rd index f081b6d3..3bda3fed 100644 --- a/man/TreeDistance.Rd +++ b/man/TreeDistance.Rd @@ -318,18 +318,18 @@ MutualClusteringInfoSplits(splits1, splits2) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/TreeInfo.Rd b/man/TreeInfo.Rd index a31234d8..9f80d53f 100644 --- a/man/TreeInfo.Rd +++ b/man/TreeInfo.Rd @@ -205,9 +205,9 @@ An introduction to the phylogenetic information content of a split is given in \href{https://ms609.github.io/TreeTools/reference/SplitInformation.html}{\code{SplitInformation()}} and in a \href{https://ms609.github.io/TreeDist/articles/information.html}{package vignette}. -Other information functions: -\code{\link{SplitEntropy}()}, -\code{\link{SplitSharedInformation}()} +Other information functions: +\code{\link[=SplitEntropy]{SplitEntropy()}}, +\code{\link[=SplitSharedInformation]{SplitSharedInformation()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/cluster-statistics.Rd b/man/cluster-statistics.Rd index eb997329..e892a26d 100644 --- a/man/cluster-statistics.Rd +++ b/man/cluster-statistics.Rd @@ -87,16 +87,16 @@ MeanNN(points, cluster) MeanMSTEdge(points, cluster) } \seealso{ -Other tree space functions: -\code{\link{Islands}()}, -\code{\link{MSTSegments}()}, -\code{\link{MapTrees}()}, -\code{\link{MappingQuality}()}, -\code{\link{SpectralEigens}()}, -\code{\link{median.multiPhylo}()} - -Other cluster functions: -\code{\link{KMeansPP}()} +Other tree space functions: +\code{\link[=Islands]{Islands()}}, +\code{\link[=MSTSegments]{MSTSegments()}}, +\code{\link[=MapTrees]{MapTrees()}}, +\code{\link[=MappingQuality]{MappingQuality()}}, +\code{\link[=SpectralEigens]{SpectralEigens()}}, +\code{\link[=median.multiPhylo]{median.multiPhylo()}} + +Other cluster functions: +\code{\link[=KMeansPP]{KMeansPP()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/median.multiPhylo.Rd b/man/median.multiPhylo.Rd index 43282ceb..ad1fe2f8 100644 --- a/man/median.multiPhylo.Rd +++ b/man/median.multiPhylo.Rd @@ -79,12 +79,12 @@ Consensus methods: \code{\link[ape:consensus]{ape::consensus()}}, \code{\link[TreeTools:ConsensusWithout]{TreeTools::ConsensusWithout()}} -Other tree space functions: -\code{\link{Islands}()}, -\code{\link{MSTSegments}()}, -\code{\link{MapTrees}()}, -\code{\link{MappingQuality}()}, -\code{\link{SpectralEigens}()}, +Other tree space functions: +\code{\link[=Islands]{Islands()}}, +\code{\link[=MSTSegments]{MSTSegments()}}, +\code{\link[=MapTrees]{MapTrees()}}, +\code{\link[=MappingQuality]{MappingQuality()}}, +\code{\link[=SpectralEigens]{SpectralEigens()}}, \code{\link{cluster-statistics}} } \author{ diff --git a/memcheck/tests.R b/memcheck/tests.R index e85bebd2..14c7388a 100644 --- a/memcheck/tests.R +++ b/memcheck/tests.R @@ -1,4 +1,3 @@ # Code to be run with # R -d "valgrind --tool=memcheck --leak-check=full --error-exitcode=1" --vanilla < memcheck/thisfile.R -library("TreeDist") -devtools::test() +testthat::test_local() diff --git a/memcheck/vignettes.R b/memcheck/vignettes.R index 0c02f716..9c2ac44c 100644 --- a/memcheck/vignettes.R +++ b/memcheck/vignettes.R @@ -1,3 +1,3 @@ # Code to be run with # R -d "valgrind --tool=memcheck --leak-check=full --error-exitcode=1" --vanilla < memcheck/thisfile.R -devtools::build_vignettes(install = FALSE) +pkgdown::build_articles(preview = FALSE) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index c78c3efc..116f48d3 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -727,15 +727,16 @@ BEGIN_RCPP END_RCPP } // cpp_shared_phylo -List cpp_shared_phylo(const RawMatrix& x, const RawMatrix& y, const IntegerVector& nTip); -RcppExport SEXP _TreeDist_cpp_shared_phylo(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP) { +List cpp_shared_phylo(const RawMatrix& x, const RawMatrix& y, const IntegerVector& nTip, const bool force_slow); +RcppExport SEXP _TreeDist_cpp_shared_phylo(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP, SEXP force_slowSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const RawMatrix& >::type x(xSEXP); Rcpp::traits::input_parameter< const RawMatrix& >::type y(ySEXP); Rcpp::traits::input_parameter< const IntegerVector& >::type nTip(nTipSEXP); - rcpp_result_gen = Rcpp::wrap(cpp_shared_phylo(x, y, nTip)); + Rcpp::traits::input_parameter< const bool >::type force_slow(force_slowSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_shared_phylo(x, y, nTip, force_slow)); return rcpp_result_gen; END_RCPP } @@ -797,7 +798,7 @@ static const R_CallMethodDef CallEntries[] = { {"_TreeDist_cpp_jaccard_similarity", (DL_FUNC) &_TreeDist_cpp_jaccard_similarity, 5}, {"_TreeDist_cpp_msi_distance", (DL_FUNC) &_TreeDist_cpp_msi_distance, 3}, {"_TreeDist_cpp_mutual_clustering", (DL_FUNC) &_TreeDist_cpp_mutual_clustering, 3}, - {"_TreeDist_cpp_shared_phylo", (DL_FUNC) &_TreeDist_cpp_shared_phylo, 3}, + {"_TreeDist_cpp_shared_phylo", (DL_FUNC) &_TreeDist_cpp_shared_phylo, 4}, {NULL, NULL, 0} }; diff --git a/src/day_1985.cpp b/src/day_1985.cpp index 70ab4dbd..157bf073 100644 --- a/src/day_1985.cpp +++ b/src/day_1985.cpp @@ -98,7 +98,7 @@ double calc_consensus_info(const List &trees, const LogicalVector &phylo, std::vector tables; if (std::size_t(n_trees) > tables.max_size()) { - ASSERT(false && "Not enough memory for consensus of so many trees"); // LCOV_EXCL_LINE + ASSERT(false && "Not enough memory for consensus of so many trees"); // #nocov } tables.reserve(n_trees); diff --git a/src/ints.h b/src/ints.h index 50fd3cc7..3d424c4c 100644 --- a/src/ints.h +++ b/src/ints.h @@ -6,6 +6,7 @@ // Re-export shared types to global scope for backward compatibility. using TreeDist::int16; using TreeDist::int32; +using TreeDist::split_int; // Types used only within TreeDist's own source. using uint32 = uint_fast32_t; diff --git a/src/pairwise_distances.cpp b/src/pairwise_distances.cpp index 67db48ab..760e7ff8 100644 --- a/src/pairwise_distances.cpp +++ b/src/pairwise_distances.cpp @@ -34,12 +34,12 @@ using TreeTools::count_bits; // per-pair heap allocation. Vectors grow lazily and are never shrunk. // --------------------------------------------------------------------------- struct MatchScratch { - std::vector a_canon; - std::vector b_canon; - std::vector a_order; - std::vector b_order; - std::vector a_match; - std::vector b_match; + std::vector a_canon; + std::vector b_canon; + std::vector a_order; + std::vector b_order; + std::vector a_match; + std::vector b_match; }; // --------------------------------------------------------------------------- @@ -59,19 +59,19 @@ struct MatchScratch { // a_match[ai] = bi+1 if split ai matched split bi, else 0. // b_match[bi] = ai+1 if split bi matched split ai, else 0. // --------------------------------------------------------------------------- -static int16 find_exact_matches( +static split_int find_exact_matches( const SplitList& a, const SplitList& b, const int32 n_tips, MatchScratch& scratch ) { - const int16 n_bins = a.n_bins; - const int16 last_bin = n_bins - 1; + const split_int n_bins = a.n_bins; + const split_int last_bin = n_bins - 1; const splitbit last_mask = (n_tips % SL_BIN_SIZE == 0) ? ~splitbit(0) : (splitbit(1) << (n_tips % SL_BIN_SIZE)) - 1; - const int16 a_n = a.n_splits; - const int16 b_n = b.n_splits; + const split_int a_n = a.n_splits; + const split_int b_n = b.n_splits; // Ensure buffers are large enough (grow lazily, never shrink) const size_t a_canon_sz = static_cast(a_n) * n_bins; @@ -83,10 +83,10 @@ static int16 find_exact_matches( if (scratch.a_match.size() < static_cast(a_n)) scratch.a_match.resize(a_n); if (scratch.b_match.size() < static_cast(b_n)) scratch.b_match.resize(b_n); - int16* a_match = scratch.a_match.data(); - int16* b_match = scratch.b_match.data(); - std::fill(a_match, a_match + a_n, int16(0)); - std::fill(b_match, b_match + b_n, int16(0)); + split_int* a_match = scratch.a_match.data(); + split_int* b_match = scratch.b_match.data(); + std::fill(a_match, a_match + a_n, split_int(0)); + std::fill(b_match, b_match + b_n, split_int(0)); if (a_n == 0 || b_n == 0) return 0; @@ -94,17 +94,17 @@ static int16 find_exact_matches( splitbit* b_canon = scratch.b_canon.data(); // --- 1. Compute canonical forms into flat buffers --- - for (int16 i = 0; i < a_n; ++i) { + for (split_int i = 0; i < a_n; ++i) { const bool flip = !(a.state[i][0] & 1); - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { splitbit val = flip ? ~a.state[i][bin] : a.state[i][bin]; if (bin == last_bin) val &= last_mask; a_canon[i * n_bins + bin] = val; } } - for (int16 i = 0; i < b_n; ++i) { + for (split_int i = 0; i < b_n; ++i) { const bool flip = !(b.state[i][0] & 1); - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { splitbit val = flip ? ~b.state[i][bin] : b.state[i][bin]; if (bin == last_bin) val &= last_mask; b_canon[i * n_bins + bin] = val; @@ -112,8 +112,8 @@ static int16 find_exact_matches( } // --- 2. Sort index arrays by canonical form --- - auto canon_less = [&](const splitbit* canon, int16 i, int16 j) { - for (int16 bin = 0; bin < n_bins; ++bin) { + auto canon_less = [&](const splitbit* canon, split_int i, split_int j) { + for (split_int bin = 0; bin < n_bins; ++bin) { const splitbit vi = canon[i * n_bins + bin]; const splitbit vj = canon[j * n_bins + bin]; if (vi < vj) return true; @@ -122,29 +122,29 @@ static int16 find_exact_matches( return false; // #nocov }; - int16* a_order = scratch.a_order.data(); - int16* b_order = scratch.b_order.data(); - std::iota(a_order, a_order + a_n, int16(0)); - std::iota(b_order, b_order + b_n, int16(0)); + split_int* a_order = scratch.a_order.data(); + split_int* b_order = scratch.b_order.data(); + std::iota(a_order, a_order + a_n, split_int(0)); + std::iota(b_order, b_order + b_n, split_int(0)); std::sort(a_order, a_order + a_n, - [&](int16 i, int16 j) { + [&](split_int i, split_int j) { return canon_less(a_canon, i, j); }); std::sort(b_order, b_order + b_n, - [&](int16 i, int16 j) { + [&](split_int i, split_int j) { return canon_less(b_canon, i, j); }); // --- 3. Merge-scan to find matches --- - int16 exact_n = 0; - int16 ai_pos = 0, bi_pos = 0; + split_int exact_n = 0; + split_int ai_pos = 0, bi_pos = 0; while (ai_pos < a_n && bi_pos < b_n) { - const int16 ai = a_order[ai_pos]; - const int16 bi = b_order[bi_pos]; + const split_int ai = a_order[ai_pos]; + const split_int bi = b_order[bi_pos]; int cmp = 0; - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { const splitbit va = a_canon[ai * n_bins + bin]; const splitbit vb = b_canon[bi * n_bins + bin]; if (va < vb) { cmp = -1; break; } @@ -178,25 +178,25 @@ static double mutual_clustering_score( ) { if (a.n_splits == 0 || b.n_splits == 0 || n_tips == 0) return 0.0; - const int16 most_splits = std::max(a.n_splits, b.n_splits); + const split_int most_splits = std::max(a.n_splits, b.n_splits); const double n_tips_rcp = 1.0 / static_cast(n_tips); constexpr cost max_score = BIG; constexpr double over_max = 1.0 / static_cast(BIG); const double max_over_tips = static_cast(BIG) * n_tips_rcp; - const double lg2_n = lg2[n_tips]; + const double lg2_n = lg2_lookup(n_tips); // --- Phase 1: O(n log n) exact-match detection --- - const int16 exact_n = find_exact_matches(a, b, n_tips, mscratch); - const int16* a_match = mscratch.a_match.data(); - const int16* b_match = mscratch.b_match.data(); + const split_int exact_n = find_exact_matches(a, b, n_tips, mscratch); + const split_int* a_match = mscratch.a_match.data(); + const split_int* b_match = mscratch.b_match.data(); // Accumulate exact-match score double exact_score = 0.0; - for (int16 ai = 0; ai < a.n_splits; ++ai) { + for (split_int ai = 0; ai < a.n_splits; ++ai) { if (a_match[ai]) { - const int16 na = a.in_split[ai]; - const int16 nA = n_tips - na; + const split_int na = a.in_split[ai]; + const split_int nA = n_tips - na; exact_score += TreeDist::ic_matching(na, nA, n_tips); } } @@ -207,58 +207,58 @@ static double mutual_clustering_score( } // --- Phase 2: fill cost matrix for unmatched splits only (O(k²)) --- - const int16 lap_n = most_splits - exact_n; + const split_int lap_n = most_splits - exact_n; // Build index maps for unmatched splits - std::vector a_unmatch, b_unmatch; + std::vector a_unmatch, b_unmatch; a_unmatch.reserve(lap_n); b_unmatch.reserve(lap_n); - for (int16 ai = 0; ai < a.n_splits; ++ai) { + for (split_int ai = 0; ai < a.n_splits; ++ai) { if (!a_match[ai]) a_unmatch.push_back(ai); } - for (int16 bi = 0; bi < b.n_splits; ++bi) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { if (!b_match[bi]) b_unmatch.push_back(bi); } scratch.score_pool.resize(lap_n); cost_matrix& score = scratch.score_pool; - const int16 a_unmatched_n = static_cast(a_unmatch.size()); - const int16 b_unmatched_n = static_cast(b_unmatch.size()); + const split_int a_unmatched_n = static_cast(a_unmatch.size()); + const split_int b_unmatched_n = static_cast(b_unmatch.size()); - for (int16 a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { - const int16 ai = a_unmatch[a_pos]; - const int16 na = a.in_split[ai]; - const int16 nA = n_tips - na; + for (split_int a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { + const split_int ai = a_unmatch[a_pos]; + const split_int na = a.in_split[ai]; + const split_int nA = n_tips - na; const auto* a_row = a.state[ai]; - const double offset_a = lg2_n - lg2[na]; - const double offset_A = lg2_n - lg2[nA]; + const double offset_a = lg2_n - lg2_lookup(na); + const double offset_A = lg2_n - lg2_lookup(nA); - for (int16 b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { - const int16 bi = b_unmatch[b_pos]; + for (split_int b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { + const split_int bi = b_unmatch[b_pos]; const auto* b_row = b.state[bi]; - int16 a_and_b = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + split_int a_and_b = 0; + for (split_int bin = 0; bin < a.n_bins; ++bin) { a_and_b += count_bits(a_row[bin] & b_row[bin]); } - const int16 nb = b.in_split[bi]; - const int16 nB = n_tips - nb; - const int16 a_and_B = na - a_and_b; - const int16 A_and_b = nb - a_and_b; - const int16 A_and_B = nA - A_and_b; + const split_int nb = b.in_split[bi]; + const split_int nB = n_tips - nb; + const split_int a_and_B = na - a_and_b; + const split_int A_and_b = nb - a_and_b; + const split_int A_and_B = nA - A_and_b; if (a_and_b == A_and_b && a_and_b == a_and_B && a_and_b == A_and_B) { score(a_pos, b_pos) = max_score; } else { - const double lg2_nb = lg2[nb]; - const double lg2_nB = lg2[nB]; + const double lg2_nb = lg2_lookup(nb); + const double lg2_nB = lg2_lookup(nB); const double ic_sum = - a_and_b * (lg2[a_and_b] + offset_a - lg2_nb) + - a_and_B * (lg2[a_and_B] + offset_a - lg2_nB) + - A_and_b * (lg2[A_and_b] + offset_A - lg2_nb) + - A_and_B * (lg2[A_and_B] + offset_A - lg2_nB); + a_and_b * (lg2_lookup(a_and_b) + offset_a - lg2_nb) + + a_and_B * (lg2_lookup(a_and_B) + offset_a - lg2_nB) + + A_and_b * (lg2_lookup(A_and_b) + offset_A - lg2_nb) + + A_and_B * (lg2_lookup(A_and_B) + offset_A - lg2_nB); score(a_pos, b_pos) = max_score - static_cast(ic_sum * max_over_tips); } } @@ -366,27 +366,27 @@ static double rf_info_score( const SplitList& a, const SplitList& b, const int32 n_tips, MatchScratch& mscratch ) { - const int16 a_n = a.n_splits; - const int16 b_n = b.n_splits; + const split_int a_n = a.n_splits; + const split_int b_n = b.n_splits; if (a_n == 0 || b_n == 0) return 0; // Use sort+merge to find exact matches in O(n log n) - const int16 exact_n = find_exact_matches(a, b, n_tips, mscratch); + const split_int exact_n = find_exact_matches(a, b, n_tips, mscratch); if (exact_n == 0) return 0; // Sum info contribution for each matched split in a - const int16* a_match = mscratch.a_match.data(); - const double lg2_unrooted_n = lg2_unrooted[n_tips]; + const split_int* a_match = mscratch.a_match.data(); + const double lg2_unrooted_n = lg2_unrooted_lookup(n_tips); double score = 0; - for (int16 ai = 0; ai < a_n; ++ai) { + for (split_int ai = 0; ai < a_n; ++ai) { if (a_match[ai] == 0) continue; - int16 leaves_in_split = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + split_int leaves_in_split = 0; + for (split_int bin = 0; bin < a.n_bins; ++bin) { leaves_in_split += count_bits(a.state[ai][bin]); } score += lg2_unrooted_n - - lg2_rooted[leaves_in_split] - - lg2_rooted[n_tips - leaves_in_split]; + - lg2_rooted_lookup(leaves_in_split) + - lg2_rooted_lookup(n_tips - leaves_in_split); } return score; } @@ -446,48 +446,48 @@ static double msd_score( const SplitList& a, const SplitList& b, const int32 n_tips, LapScratch& scratch, MatchScratch& mscratch ) { - const int16 most_splits = std::max(a.n_splits, b.n_splits); + const split_int most_splits = std::max(a.n_splits, b.n_splits); if (most_splits == 0) return 0.0; - const bool a_has_more = (a.n_splits > b.n_splits); - const int16 a_extra = a_has_more ? most_splits - b.n_splits : 0; - const int16 b_extra = a_has_more ? 0 : most_splits - a.n_splits; - const int16 half_tips = n_tips / 2; - const cost max_score = BIG / most_splits; + const bool a_has_more = (a.n_splits > b.n_splits); + const split_int a_extra = a_has_more ? most_splits - b.n_splits : 0; + const split_int b_extra = a_has_more ? 0 : most_splits - a.n_splits; + const split_int half_tips = n_tips / 2; + const cost max_score = BIG / most_splits; // --- Phase 1: O(n log n) exact-match detection --- - const int16 exact_n = find_exact_matches(a, b, n_tips, mscratch); - const int16* a_match = mscratch.a_match.data(); - const int16* b_match = mscratch.b_match.data(); + const split_int exact_n = find_exact_matches(a, b, n_tips, mscratch); + const split_int* a_match = mscratch.a_match.data(); + const split_int* b_match = mscratch.b_match.data(); if (exact_n == b.n_splits || exact_n == a.n_splits) { return 0.0; } // --- Phase 2: fill cost matrix for unmatched splits only --- - const int16 lap_n = most_splits - exact_n; + const split_int lap_n = most_splits - exact_n; - std::vector a_unmatch, b_unmatch; + std::vector a_unmatch, b_unmatch; a_unmatch.reserve(lap_n); b_unmatch.reserve(lap_n); - for (int16 ai = 0; ai < a.n_splits; ++ai) { + for (split_int ai = 0; ai < a.n_splits; ++ai) { if (!a_match[ai]) a_unmatch.push_back(ai); } - for (int16 bi = 0; bi < b.n_splits; ++bi) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { if (!b_match[bi]) b_unmatch.push_back(bi); } scratch.score_pool.resize(lap_n); cost_matrix& score = scratch.score_pool; - const int16 a_unmatched_n = static_cast(a_unmatch.size()); - const int16 b_unmatched_n = static_cast(b_unmatch.size()); + const split_int a_unmatched_n = static_cast(a_unmatch.size()); + const split_int b_unmatched_n = static_cast(b_unmatch.size()); - for (int16 a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { - const int16 ai = a_unmatch[a_pos]; - for (int16 b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { - const int16 bi = b_unmatch[b_pos]; + for (split_int a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { + const split_int ai = a_unmatch[a_pos]; + for (split_int b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { + const split_int bi = b_unmatch[b_pos]; splitbit total = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + for (split_int bin = 0; bin < a.n_bins; ++bin) { total += count_bits(a.state[ai][bin] ^ b.state[bi][bin]); } score(a_pos, b_pos) = static_cast( @@ -574,30 +574,30 @@ static double msi_score( const SplitList& a, const SplitList& b, const int32 n_tips, LapScratch& scratch ) { - const int16 most_splits = std::max(a.n_splits, b.n_splits); + const split_int most_splits = std::max(a.n_splits, b.n_splits); if (most_splits == 0) return 0.0; constexpr cost max_score = BIG; - const double max_possible = lg2_unrooted[n_tips] - - lg2_rooted[int16((n_tips + 1) / 2)] - - lg2_rooted[int16(n_tips / 2)]; + const double max_possible = lg2_unrooted_lookup(n_tips) + - lg2_rooted_lookup((n_tips + 1) / 2) + - lg2_rooted_lookup(n_tips / 2); const double score_over_possible = static_cast(max_score) / max_possible; const double possible_over_score = max_possible / static_cast(max_score); scratch.score_pool.resize(most_splits); cost_matrix& score = scratch.score_pool; - for (int16 ai = 0; ai < a.n_splits; ++ai) { - for (int16 bi = 0; bi < b.n_splits; ++bi) { - int16 n_a_only = 0, n_a_and_b = 0, n_different = 0; + for (split_int ai = 0; ai < a.n_splits; ++ai) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { + split_int n_a_only = 0, n_a_and_b = 0, n_different = 0; splitbit different; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + for (split_int bin = 0; bin < a.n_bins; ++bin) { different = a.state[ai][bin] ^ b.state[bi][bin]; n_different += count_bits(different); n_a_only += count_bits(a.state[ai][bin] & different); n_a_and_b += count_bits(a.state[ai][bin] & ~different); } - const int16 n_same = n_tips - n_different; + const split_int n_same = n_tips - n_different; score(ai, bi) = cost(max_score - score_over_possible * TreeDist::mmsi_score(n_same, n_a_and_b, n_different, n_a_only)); } @@ -669,32 +669,68 @@ NumericVector cpp_msi_all_pairs( // metric, spi_overlap(A, B) where B contains A can EXCEED spi_overlap(A, A) // for identical splits, so the LAP can find a better global assignment by NOT // matching identical splits. The full LAP is always solved. +// LAP-fill body for shared_phylo_score, templated on Fast so the per-cell +// spi_overlap call resolves at compile time to direct lg2_rooted[] access +// (Fast=true) or the bounds-checked lookup (Fast=false). +template +static inline void shared_phylo_fill_lap( + const SplitList& a, const SplitList& b, const int32 n_tips, + const double best_overlap, const double score_over_possible, + const cost max_score, cost_matrix& score +) { + for (split_int ai = 0; ai < a.n_splits; ++ai) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { + const double spi = TreeDist::spi_overlap( + a.state[ai], b.state[bi], n_tips, + a.in_split[ai], b.in_split[bi], a.n_bins); + score(ai, bi) = (spi == 0.0) ? max_score + : cost((spi - best_overlap) * score_over_possible); + } + score.padRowAfterCol(ai, b.n_splits, max_score); + } +} + static double shared_phylo_score( const SplitList& a, const SplitList& b, const int32 n_tips, LapScratch& scratch ) { - const int16 most_splits = std::max(a.n_splits, b.n_splits); + const split_int most_splits = std::max(a.n_splits, b.n_splits); if (most_splits == 0) return 0.0; - const int16 overlap_a = int16(n_tips + 1) / 2; + const split_int overlap_a = (n_tips + 1) / 2; constexpr cost max_score = BIG; - const double best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); - const double max_possible = lg2_unrooted[n_tips] - best_overlap; + // Hot path: n_tips <= SL_MAX_TIPS, so all lg2_rooted/lg2_unrooted indices + // fall in the precomputed table. Dispatch once here so the per-cell loop + // emits direct table loads with no bounds-check branch. + // The else branch is reachable only on TT 2.3 builds with n_tips > 32705, + // which is too large to exercise in routine tests; the equivalent slow + // template instantiations (one_overlap etc.) are covered by the + // legacy single-pair entry (cpp_shared_phylo) via force_slow=true. + const bool fast = n_tips <= static_cast(SL_MAX_TIPS + 1); + double best_overlap, max_possible; + if (fast) { + best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); + max_possible = lg2_unrooted[n_tips] - best_overlap; + } else { + // LCOV_EXCL_START + best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); + max_possible = lg2_unrooted_lookup(n_tips) - best_overlap; + // LCOV_EXCL_STOP + } const double score_over_possible = static_cast(max_score) / max_possible; const double possible_over_score = max_possible / static_cast(max_score); scratch.score_pool.resize(most_splits); cost_matrix& score = scratch.score_pool; - for (int16 ai = 0; ai < a.n_splits; ++ai) { - for (int16 bi = 0; bi < b.n_splits; ++bi) { - const double spi = TreeDist::spi_overlap( - a.state[ai], b.state[bi], n_tips, - a.in_split[ai], b.in_split[bi], a.n_bins); - score(ai, bi) = (spi == 0.0) ? max_score - : cost((spi - best_overlap) * score_over_possible); - } - score.padRowAfterCol(ai, b.n_splits, max_score); + if (fast) { + shared_phylo_fill_lap(a, b, n_tips, best_overlap, + score_over_possible, max_score, score); + } else { + // LCOV_EXCL_START + shared_phylo_fill_lap(a, b, n_tips, best_overlap, + score_over_possible, max_score, score); + // LCOV_EXCL_STOP } score.padAfterRow(a.n_splits, max_score); @@ -763,7 +799,7 @@ static double jaccard_score( const double exponent, const bool allow_conflict, LapScratch& scratch, MatchScratch& mscratch ) { - const int16 most_splits = std::max(a.n_splits, b.n_splits); + const split_int most_splits = std::max(a.n_splits, b.n_splits); if (most_splits == 0) return 0.0; constexpr cost max_score = BIG; @@ -772,7 +808,7 @@ static double jaccard_score( // --- Phase 1: O(n log n) exact-match detection --- // Only used when allow_conflict=true; otherwise the full LAP may reassign // non-matching splits to compatible (non-exact) partners. - int16 exact_n = 0; + split_int exact_n = 0; if (allow_conflict) { exact_n = find_exact_matches(a, b, n_tips, mscratch); } else { @@ -781,61 +817,61 @@ static double jaccard_score( mscratch.a_match.resize(a.n_splits); if (mscratch.b_match.size() < static_cast(b.n_splits)) mscratch.b_match.resize(b.n_splits); - std::fill(mscratch.a_match.data(), mscratch.a_match.data() + a.n_splits, int16(0)); - std::fill(mscratch.b_match.data(), mscratch.b_match.data() + b.n_splits, int16(0)); + std::fill(mscratch.a_match.data(), mscratch.a_match.data() + a.n_splits, split_int(0)); + std::fill(mscratch.b_match.data(), mscratch.b_match.data() + b.n_splits, split_int(0)); } - const int16* a_match = mscratch.a_match.data(); - const int16* b_match = mscratch.b_match.data(); + const split_int* a_match = mscratch.a_match.data(); + const split_int* b_match = mscratch.b_match.data(); if (exact_n == b.n_splits || exact_n == a.n_splits) { return static_cast(exact_n); } // --- Phase 2: fill cost matrix for unmatched splits only --- - const int16 lap_n = most_splits - exact_n; + const split_int lap_n = most_splits - exact_n; - std::vector a_unmatch, b_unmatch; + std::vector a_unmatch, b_unmatch; a_unmatch.reserve(lap_n); b_unmatch.reserve(lap_n); - for (int16 ai = 0; ai < a.n_splits; ++ai) { + for (split_int ai = 0; ai < a.n_splits; ++ai) { if (!a_match[ai]) a_unmatch.push_back(ai); } - for (int16 bi = 0; bi < b.n_splits; ++bi) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { if (!b_match[bi]) b_unmatch.push_back(bi); } scratch.score_pool.resize(lap_n); cost_matrix& score = scratch.score_pool; - const int16 a_unmatched_n = static_cast(a_unmatch.size()); - const int16 b_unmatched_n = static_cast(b_unmatch.size()); + const split_int a_unmatched_n = static_cast(a_unmatch.size()); + const split_int b_unmatched_n = static_cast(b_unmatch.size()); - for (int16 a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { - const int16 ai = a_unmatch[a_pos]; - const int16 na = a.in_split[ai]; - const int16 nA = n_tips - na; + for (split_int a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { + const split_int ai = a_unmatch[a_pos]; + const split_int na = a.in_split[ai]; + const split_int nA = n_tips - na; - for (int16 b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { - const int16 bi = b_unmatch[b_pos]; - int16 a_and_b = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + for (split_int b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { + const split_int bi = b_unmatch[b_pos]; + split_int a_and_b = 0; + for (split_int bin = 0; bin < a.n_bins; ++bin) { a_and_b += count_bits(a.state[ai][bin] & b.state[bi][bin]); } - const int16 nb = b.in_split[bi]; - const int16 nB = n_tips - nb; - const int16 a_and_B = na - a_and_b; - const int16 A_and_b = nb - a_and_b; - const int16 A_and_B = nB - a_and_B; + const split_int nb = b.in_split[bi]; + const split_int nB = n_tips - nb; + const split_int a_and_B = na - a_and_b; + const split_int A_and_b = nb - a_and_b; + const split_int A_and_B = nB - a_and_B; if (!allow_conflict && !( a_and_b == na || a_and_B == na || A_and_b == nA || A_and_B == nA)) { score(a_pos, b_pos) = max_score; } else { - const int16 A_or_b = n_tips - a_and_B; - const int16 a_or_B = n_tips - A_and_b; - const int16 a_or_b = n_tips - A_and_B; - const int16 A_or_B = n_tips - a_and_b; + const split_int A_or_b = n_tips - a_and_B; + const split_int a_or_B = n_tips - A_and_b; + const split_int a_or_b = n_tips - A_and_B; + const split_int A_or_B = n_tips - a_and_b; const double ars_ab = static_cast(a_and_b) / static_cast(a_or_b); const double ars_Ab = static_cast(A_and_b) / static_cast(A_or_b); const double ars_aB = static_cast(a_and_B) / static_cast(a_or_B); @@ -1229,7 +1265,7 @@ NumericVector cpp_clustering_entropy_batch( for (int i = 0; i < N; ++i) { SplitList sl(Rcpp::as(splits_list[i])); double total = 0.0; - for (int16 s = 0; s < sl.n_splits; ++s) { + for (split_int s = 0; s < sl.n_splits; ++s) { const int k = sl.in_split[s]; if (k <= 0 || k >= n_tip) continue; const double p = k * invN; @@ -1271,7 +1307,7 @@ NumericVector cpp_splitwise_info_batch( for (int i = 0; i < N; ++i) { SplitList sl(Rcpp::as(splits_list[i])); double total = 0.0; - for (int16 s = 0; s < sl.n_splits; ++s) { + for (split_int s = 0; s < sl.n_splits; ++s) { const int k = sl.in_split[s]; if (k < 2 || (n_tip - k) < 2) continue; total += l2u_n - l2r[k] - l2r[n_tip - k]; diff --git a/src/tree_distance_functions.cpp b/src/tree_distance_functions.cpp index 99347300..5ab745cb 100644 --- a/src/tree_distance_functions.cpp +++ b/src/tree_distance_functions.cpp @@ -28,15 +28,15 @@ double cpp_mci_impl_score(const Rcpp::RawMatrix& x, // Build arrays matching the header's raw-pointer API types. std::vector a_ptrs(a.n_splits); std::vector b_ptrs(b.n_splits); - std::vector a_in(a.n_splits); - std::vector b_in(b.n_splits); - for (TreeDist::int16 i = 0; i < a.n_splits; ++i) { + std::vector a_in(a.n_splits); + std::vector b_in(b.n_splits); + for (TreeDist::split_int i = 0; i < a.n_splits; ++i) { a_ptrs[i] = a.state[i]; - a_in[i] = static_cast(a.in_split[i]); + a_in[i] = static_cast(a.in_split[i]); } - for (TreeDist::int16 i = 0; i < b.n_splits; ++i) { + for (TreeDist::split_int i = 0; i < b.n_splits; ++i) { b_ptrs[i] = b.state[i]; - b_in[i] = static_cast(b.in_split[i]); + b_in[i] = static_cast(b.in_split[i]); } return TreeDist::mutual_clustering_score( diff --git a/src/tree_distances.cpp b/src/tree_distances.cpp index 0da45e65..3651caab 100644 --- a/src/tree_distances.cpp +++ b/src/tree_distances.cpp @@ -29,18 +29,15 @@ namespace TreeDist { } } + // Hard C++ ceiling: n_tips^2 must fit int32 and all counters must fit split_int. + // 32767 = INT16_MAX is the chosen practical cap (n^2 <= 1.07e9 < INT32_MAX). + // R-level .CheckMaxTips() fires before this; this is a safety backstop. + static constexpr int32 TREEDIST_MAX_TIPS = 32767; + void check_ntip(const int32 n) { - if (n > SL_MAX_TIPS) { - if (SL_MAX_TIPS <= 2048) { - Rcpp::stop("Trees with %d tips exceed the compiled limit of %d. " - "Update TreeTools to support more tips, then reinstall " - "TreeDist.", static_cast(n), - static_cast(SL_MAX_TIPS)); - } else { - Rcpp::stop("Trees with %d tips are not yet supported (maximum %d). ", - static_cast(n), - static_cast(SL_MAX_TIPS)); - } + if (n > TREEDIST_MAX_TIPS) { + Rcpp::stop("Trees with %d tips are not yet supported (maximum %d).", + static_cast(n), static_cast(TREEDIST_MAX_TIPS)); } } @@ -114,41 +111,41 @@ inline List robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, const int32 n_tips) { const SplitList a(x), b(y); - const int16 last_bin = a.n_bins - 1; - const int16 unset_tips = (n_tips % SL_BIN_SIZE) ? + const split_int last_bin = a.n_bins - 1; + const split_int unset_tips = (n_tips % SL_BIN_SIZE) ? SL_BIN_SIZE - n_tips % SL_BIN_SIZE : 0; - + const splitbit unset_mask = ALL_ONES >> unset_tips; - const double lg2_unrooted_n = lg2_unrooted[n_tips]; + const double lg2_unrooted_n = lg2_unrooted_lookup(n_tips); double score = 0; - + grf_match matching(a.n_splits, NA_INTEGER); - - const int16 n_bins = a.n_bins; + + const split_int n_bins = a.n_bins; std::vector b_complement(b.n_splits * n_bins); - for (int16 i = 0; i < b.n_splits; i++) { + for (split_int i = 0; i < b.n_splits; i++) { splitbit* bc_i = &b_complement[i * n_bins]; - for (int16 bin = 0; bin < last_bin; ++bin) { + for (split_int bin = 0; bin < last_bin; ++bin) { bc_i[bin] = ~b.state[i][bin]; } bc_i[last_bin] = b.state[i][last_bin] ^ unset_mask; } - - for (int16 ai = 0; ai < a.n_splits; ++ai) { + + for (split_int ai = 0; ai < a.n_splits; ++ai) { if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); - for (int16 bi = 0; bi < b.n_splits; ++bi) { - + for (split_int bi = 0; bi < b.n_splits; ++bi) { + bool all_match = true, all_complement = true; const splitbit* bc_bi = &b_complement[bi * n_bins]; - - for (int16 bin = 0; bin < n_bins; ++bin) { + + for (split_int bin = 0; bin < n_bins; ++bin) { if ((a.state[ai][bin] != b.state[bi][bin])) { all_match = false; break; } } if (!all_match) { - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { if ((a.state[ai][bin] != bc_bi[bin])) { all_complement = false; break; @@ -156,13 +153,13 @@ inline List robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, } } if (all_match || all_complement) { - int16 leaves_in_split = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + split_int leaves_in_split = 0; + for (split_int bin = 0; bin < a.n_bins; ++bin) { leaves_in_split += count_bits(a.state[ai][bin]); } - - score += lg2_unrooted_n - lg2_rooted[leaves_in_split] - - lg2_rooted[n_tips - leaves_in_split]; + + score += lg2_unrooted_n - lg2_rooted_lookup(leaves_in_split) - + lg2_rooted_lookup(n_tips - leaves_in_split); matching[ai] = bi + 1; break; /* Only one match possible per split */ @@ -180,50 +177,50 @@ inline List robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, inline List matching_split_distance(const RawMatrix &x, const RawMatrix &y, const int32 n_tips) { const SplitList a(x), b(y); - const int16 most_splits = std::max(a.n_splits, b.n_splits); - const int16 split_diff = most_splits - std::min(a.n_splits, b.n_splits); - const int16 half_tips = n_tips / 2; + const split_int most_splits = std::max(a.n_splits, b.n_splits); + const split_int split_diff = most_splits - std::min(a.n_splits, b.n_splits); + const split_int half_tips = n_tips / 2; if (most_splits == 0) { return List::create(Named("score") = 0); } const cost max_score = BIG / most_splits; cost_matrix score(most_splits); - - for (int16 ai = 0; ai < a.n_splits; ++ai) { + + for (split_int ai = 0; ai < a.n_splits; ++ai) { if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); - for (int16 bi = 0; bi < b.n_splits; ++bi) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { splitbit total = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { - total += count_bits(a.state[ai][bin] ^ b.state[bi][bin]); + for (split_int bin = 0; bin < a.n_bins; ++bin) { + total += count_bits(a.state[ai][bin] ^ b.state[bi][bin]); } score(ai, bi) = total; } } - - for (int16 ai = 0; ai < a.n_splits; ++ai) { - for (int16 bi = 0; bi < b.n_splits; ++bi) { + + for (split_int ai = 0; ai < a.n_splits; ++ai) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { if (score(ai, bi) > half_tips) { score(ai, bi) = n_tips - score(ai, bi); } } score.padRowAfterCol(ai, b.n_splits, max_score); } - + score.padAfterRow(a.n_splits, max_score); - + std::vector rowsol; std::vector colsol; - + resize_uninitialized(rowsol, most_splits); resize_uninitialized(colsol, most_splits); - + const double final_score = lap(most_splits, score, rowsol, colsol) - (max_score * split_diff); - + std::vector final_matching; final_matching.reserve(a.n_splits); - - for (int16 i = 0; i < a.n_splits; ++i) { + + for (split_int i = 0; i < a.n_splits; ++i) { const int match = (rowsol[i] < b.n_splits) ? static_cast(rowsol[i]) + 1 : NA_INTEGER; @@ -239,49 +236,49 @@ inline List jaccard_similarity(const RawMatrix &x, const RawMatrix &y, const int32 n_tips, const NumericVector &k, const LogicalVector &allowConflict) { const SplitList a(x), b(y); - const int16 most_splits = std::max(a.n_splits, b.n_splits); - + const split_int most_splits = std::max(a.n_splits, b.n_splits); + constexpr cost max_score = BIG; constexpr double max_scoreL = max_score; const double exponent = k[0]; - + bool allow_conflict = allowConflict[0]; - + cost_matrix score(most_splits); - - for (int16 ai = 0; ai < a.n_splits; ++ai) { + + for (split_int ai = 0; ai < a.n_splits; ++ai) { if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); - - const int16 na = a.in_split[ai]; - const int16 nA = n_tips - na; - - for (int16 bi = 0; bi < b.n_splits; ++bi) { - + + const split_int na = a.in_split[ai]; + const split_int nA = n_tips - na; + + for (split_int bi = 0; bi < b.n_splits; ++bi) { + // x divides tips into a|A; y divides tips into b|B - int16 a_and_b = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + split_int a_and_b = 0; + for (split_int bin = 0; bin < a.n_bins; ++bin) { a_and_b += count_bits(a.state[ai][bin] & b.state[bi][bin]); } - - const int16 + + const split_int nb = b.in_split[bi], nB = n_tips - nb, a_and_B = na - a_and_b, A_and_b = nb - a_and_b, A_and_B = nB - a_and_B ; - + if (!allow_conflict && !( a_and_b == na || a_and_B == na || A_and_b == nA || A_and_B == nA) ) { - + score(ai, bi) = max_score; /* Prohibited */ - + } else { - const int16 + const split_int A_or_b = n_tips - a_and_B, a_or_B = n_tips - A_and_b, a_or_b = n_tips - A_and_B, @@ -330,76 +327,77 @@ inline List jaccard_similarity(const RawMatrix &x, const RawMatrix &y, std::vector final_matching; final_matching.reserve(a.n_splits); - for (int16 i = 0; i < a.n_splits; ++i) { + for (split_int i = 0; i < a.n_splits; ++i) { const int match = (rowsol[i] < b.n_splits) ? static_cast(rowsol[i]) + 1 : NA_INTEGER; final_matching.push_back(match); } - - + + return List::create(Named("score") = final_score, _["matching"] = final_matching); - + } List msi_distance(const RawMatrix &x, const RawMatrix &y, const int32 n_tips) { const SplitList a(x), b(y); - const int16 most_splits = std::max(a.n_splits, b.n_splits); + const split_int most_splits = std::max(a.n_splits, b.n_splits); constexpr cost max_score = BIG; - const double max_possible = lg2_unrooted[n_tips] - - lg2_rooted[int16((n_tips + 1) / 2)] - lg2_rooted[int16(n_tips / 2)]; + const double max_possible = lg2_unrooted_lookup(n_tips) - + lg2_rooted_lookup((n_tips + 1) / 2) - lg2_rooted_lookup(n_tips / 2); const double score_over_possible = static_cast(max_score) / max_possible; const double possible_over_score = max_possible / max_score; - + cost_matrix score(most_splits); - - splitbit different[SL_MAX_BINS]; - - for (int16 ai = 0; ai < a.n_splits; ++ai) { + + // Heap-allocated: n_bins can exceed SL_MAX_BINS for large trees. + std::vector different(a.n_bins); + + for (split_int ai = 0; ai < a.n_splits; ++ai) { if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); - for (int16 bi = 0; bi < b.n_splits; ++bi) { - int16 + for (split_int bi = 0; bi < b.n_splits; ++bi) { + split_int n_different = 0, n_a_only = 0, n_a_and_b = 0 ; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + for (split_int bin = 0; bin < a.n_bins; ++bin) { different[bin] = a.state[ai][bin] ^ b.state[bi][bin]; n_different += count_bits(different[bin]); n_a_only += count_bits(a.state[ai][bin] & different[bin]); n_a_and_b += count_bits(a.state[ai][bin] & ~different[bin]); } - const int16 n_same = n_tips - n_different; - - score(ai, bi) = cost(max_score - + const split_int n_same = n_tips - n_different; + + score(ai, bi) = cost(max_score - (score_over_possible * TreeDist::mmsi_score(n_same, n_a_and_b, n_different, n_a_only))); } score.padRowAfterCol(ai, b.n_splits, max_score); } score.padAfterRow(a.n_splits, max_score); - + std::vector rowsol; std::vector colsol; - + resize_uninitialized(rowsol, most_splits); resize_uninitialized(colsol, most_splits); - + const double final_score = static_cast( (max_score * most_splits) - lap(most_splits, score, rowsol, colsol)) * possible_over_score; - + std::vector final_matching; final_matching.reserve(a.n_splits); - - for (int16 i = 0; i < a.n_splits; ++i) { + + for (split_int i = 0; i < a.n_splits; ++i) { const int match = (rowsol[i] < b.n_splits) ? static_cast(rowsol[i]) + 1 : NA_INTEGER; final_matching.push_back(match); } - + return List::create(Named("score") = final_score, _["matching"] = final_matching); @@ -410,55 +408,55 @@ List mutual_clustering(const RawMatrix &x, const RawMatrix &y, const SplitList a(x); const SplitList b(y); const bool a_has_more_splits = (a.n_splits > b.n_splits); - const int16 most_splits = a_has_more_splits ? a.n_splits : b.n_splits; - const int16 a_extra_splits = a_has_more_splits ? most_splits - b.n_splits : 0; - const int16 b_extra_splits = a_has_more_splits ? 0 : most_splits - a.n_splits; + const split_int most_splits = a_has_more_splits ? a.n_splits : b.n_splits; + const split_int a_extra_splits = a_has_more_splits ? most_splits - b.n_splits : 0; + const split_int b_extra_splits = a_has_more_splits ? 0 : most_splits - a.n_splits; const double n_tips_reciprocal = 1.0 / n_tips; - + if (most_splits == 0 || n_tips == 0) { return List::create(Named("score") = 0, _["matching"] = IntegerVector(0)); } - + constexpr cost max_score = BIG; constexpr double over_max_score = 1.0 / static_cast(max_score); const double max_over_tips = static_cast(max_score) * n_tips_reciprocal; - const double lg2_n = lg2[n_tips]; - + const double lg2_n = lg2_lookup(n_tips); + cost_matrix score(most_splits); - + double exact_match_score = 0; - int16 exact_matches = 0; + split_int exact_matches = 0; // vector zero-initializes [so does make_unique] // match will have one added to it so numbering follows R; hence 0 = UNMATCHED std::vector a_match(a.n_splits); - std::unique_ptr b_match = std::make_unique(b.n_splits); - - for (int16 ai = 0; ai < a.n_splits; ++ai) { + std::unique_ptr b_match = std::make_unique(b.n_splits); + + for (split_int ai = 0; ai < a.n_splits; ++ai) { if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); if (a_match[ai]) continue; - - const int16 na = a.in_split[ai]; - const int16 nA = n_tips - na; + + const split_int na = a.in_split[ai]; + const split_int nA = n_tips - na; const auto *a_row = a.state[ai]; - const double offset_a = lg2_n - lg2[na]; - const double offset_A = lg2_n - lg2[nA]; - - for (int16 bi = 0; bi < b.n_splits; ++bi) { - + const double offset_a = lg2_n - lg2_lookup(na); + const double offset_A = lg2_n - lg2_lookup(nA); + + for (split_int bi = 0; bi < b.n_splits; ++bi) { + // x divides tips into a|A; y divides tips into b|B - int16 a_and_b = 0; + split_int a_and_b = 0; const auto *b_row = b.state[bi]; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + for (split_int bin = 0; bin < a.n_bins; ++bin) { a_and_b += count_bits(a_row[bin] & b_row[bin]); } - - const int16 nb = b.in_split[bi]; - const int16 nB = n_tips - nb; - const int16 a_and_B = na - a_and_b; - const int16 A_and_b = nb - a_and_b; - const int16 A_and_B = nA - A_and_b; - + + const split_int nb = b.in_split[bi]; + const split_int nB = n_tips - nb; + const split_int a_and_B = na - a_and_b; + const split_int A_and_b = nb - a_and_b; + const split_int A_and_B = nA - A_and_b; + if ((!a_and_B && !A_and_b) || (!a_and_b && !A_and_B)) { exact_match_score += TreeDist::ic_matching(na, nA, n_tips); @@ -471,44 +469,44 @@ List mutual_clustering(const RawMatrix &x, const RawMatrix &y, && a_and_b == A_and_B) { score(ai, bi) = max_score; // Avoid rounding errors } else { - const double lg2_nb = lg2[nb]; - const double lg2_nB = lg2[nB]; + const double lg2_nb = lg2_lookup(nb); + const double lg2_nB = lg2_lookup(nB); const double ic_sum = - a_and_b * (lg2[a_and_b] + offset_a - lg2_nb) + - a_and_B * (lg2[a_and_B] + offset_a - lg2_nB) + - A_and_b * (lg2[A_and_b] + offset_A - lg2_nb) + - A_and_B * (lg2[A_and_B] + offset_A - lg2_nB); - + a_and_b * (lg2_lookup(a_and_b) + offset_a - lg2_nb) + + a_and_B * (lg2_lookup(a_and_B) + offset_a - lg2_nB) + + A_and_b * (lg2_lookup(A_and_b) + offset_A - lg2_nb) + + A_and_B * (lg2_lookup(A_and_B) + offset_A - lg2_nB); + // Division by n_tips converts n(A&B) to P(A&B) for each ic_element score(ai, bi) = max_score - static_cast(ic_sum * max_over_tips); } } - + if (b.n_splits < most_splits) { score.padRowAfterCol(ai, b.n_splits, max_score); } } - + if (exact_matches == b.n_splits || exact_matches == a.n_splits) { return List::create( Named("score") = exact_match_score * n_tips_reciprocal, _["matching"] = a_match); } - - const int16 lap_dim = most_splits - exact_matches; + + const split_int lap_dim = most_splits - exact_matches; ASSERT(lap_dim > 0); std::vector rowsol; std::vector colsol; resize_uninitialized(rowsol, lap_dim); resize_uninitialized(colsol, lap_dim); cost_matrix small_score(lap_dim); - + if (exact_matches) { - int16 a_pos = 0; - for (int16 ai = 0; ai < a.n_splits; ++ai) { + split_int a_pos = 0; + for (split_int ai = 0; ai < a.n_splits; ++ai) { if (a_match[ai]) continue; - int16 b_pos = 0; - for (int16 bi = 0; bi < b.n_splits; ++bi) { + split_int b_pos = 0; + for (split_int bi = 0; bi < b.n_splits; ++bi) { if (b_match[bi]) continue; small_score(a_pos, b_pos) = score(ai, bi); b_pos++; @@ -517,115 +515,143 @@ List mutual_clustering(const RawMatrix &x, const RawMatrix &y, a_pos++; } small_score.padAfterRow(lap_dim - b_extra_splits, max_score); - - const double lap_score = static_cast((max_score * lap_dim) - + + const double lap_score = static_cast((max_score * lap_dim) - lap(lap_dim, small_score, rowsol, colsol)) * over_max_score; const double final_score = lap_score + (exact_match_score / n_tips); - - std::unique_ptr lap_decode = std::make_unique(lap_dim); - int16 fuzzy_match = 0; - for (int16 bi = 0; bi < b.n_splits; ++bi) { + + std::unique_ptr lap_decode = std::make_unique(lap_dim); + split_int fuzzy_match = 0; + for (split_int bi = 0; bi < b.n_splits; ++bi) { if (!b_match[bi]) { assert(fuzzy_match < lap_dim); lap_decode[fuzzy_match++] = bi + 1; } } - + fuzzy_match = 0; std::vector final_matching; TreeDist::resize_uninitialized(final_matching, a.n_splits); - for (int16 i = 0; i < a.n_splits; ++i) { + for (split_int i = 0; i < a.n_splits; ++i) { if (a_match[i]) { final_matching[i] = a_match[i]; } else { assert(fuzzy_match < lap_dim); - const int16 row_idx = fuzzy_match++; - const int16 col_idx = rowsol[row_idx]; + const split_int row_idx = fuzzy_match++; + const split_int col_idx = rowsol[row_idx]; final_matching[i] = (col_idx >= lap_dim - a_extra_splits) ? NA_INTEGER : lap_decode[col_idx]; } } - + return List::create(Named("score") = final_score, _["matching"] = final_matching); } else { - - for (int16 ai = a.n_splits; ai < most_splits; ++ai) { - for (int16 bi = 0; bi < most_splits; ++bi) { + + for (split_int ai = a.n_splits; ai < most_splits; ++ai) { + for (split_int bi = 0; bi < most_splits; ++bi) { score(ai, bi) = max_score; } } - + const double final_score = static_cast( (max_score * lap_dim) - lap(lap_dim, score, rowsol, colsol) ) / max_score; - + std::vector final_matching; final_matching.reserve(a.n_splits); - for (int16 i = 0; i < a.n_splits; ++i) { + for (split_int i = 0; i < a.n_splits; ++i) { const int match = (rowsol[i] < b.n_splits) ? static_cast(rowsol[i]) + 1 : NA_INTEGER; final_matching.push_back(match); } - + return List::create(Named("score") = final_score, _["matching"] = final_matching); } } +// Templated LAP-fill so the per-cell spi_overlap resolves to direct +// lg2_rooted[] access on the hot path (n_tips <= SL_MAX_TIPS). +template +static inline void shared_phylo_fill(const SplitList& a, const SplitList& b, + const int32 n_tips, + const double best_overlap, + const double score_over_possible, + const cost max_score, + cost_matrix& score) { + for (split_int ai = a.n_splits; ai--; ) { + if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); + for (split_int bi = b.n_splits; bi--; ) { + const double spi_over = TreeDist::spi_overlap( + a.state[ai], b.state[bi], n_tips, a.in_split[ai], b.in_split[bi], + a.n_bins); + + score(ai, bi) = spi_over == 0 ? max_score : + cost((spi_over - best_overlap) * score_over_possible); + } + score.padRowAfterCol(ai, b.n_splits, max_score); + } +} + inline List shared_phylo (const RawMatrix &x, const RawMatrix &y, - const int32 n_tips) { + const int32 n_tips, + const bool force_slow = false) { const SplitList a(x), b(y); - const int16 most_splits = std::max(a.n_splits, b.n_splits); - const int16 overlap_a = int16(n_tips + 1) / 2; // avoids promotion to int - + const split_int most_splits = std::max(a.n_splits, b.n_splits); + const split_int overlap_a = (n_tips + 1) / 2; + constexpr cost max_score = BIG; - const double lg2_unrooted_n = lg2_unrooted[n_tips]; - const double best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); + // force_slow: test-only hook so the bounds-checked lookup path can be + // exercised on small trees for code coverage of the template + // instantiations and the lg2_*_slow fallbacks. + const bool fast = !force_slow && n_tips <= static_cast(SL_MAX_TIPS + 1); + double lg2_unrooted_n, best_overlap; + if (fast) { + lg2_unrooted_n = lg2_unrooted[n_tips]; + best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); + } else { + lg2_unrooted_n = lg2_unrooted_lookup(n_tips); + best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); + } const double max_possible = lg2_unrooted_n - best_overlap; const double score_over_possible = max_score / max_possible; const double possible_over_score = max_possible / max_score; - + // a and b are "clades" separating an "ingroup" [1] from an "outgroup" [0]. // In/out direction [i.e. 1/0 bit] is arbitrary. cost_matrix score(most_splits); - - for (int16 ai = a.n_splits; ai--; ) { - if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); - for (int16 bi = b.n_splits; bi--; ) { - const double spi_over = TreeDist::spi_overlap( - a.state[ai], b.state[bi], n_tips, a.in_split[ai], b.in_split[bi], - a.n_bins); - - score(ai, bi) = spi_over == 0 ? max_score : - cost((spi_over - best_overlap) * score_over_possible); - } - score.padRowAfterCol(ai, b.n_splits, max_score); + + if (fast) { + shared_phylo_fill(a, b, n_tips, best_overlap, + score_over_possible, max_score, score); + } else { + shared_phylo_fill(a, b, n_tips, best_overlap, + score_over_possible, max_score, score); } score.padAfterRow(a.n_splits, max_score); - + std::vector rowsol; std::vector colsol; - + resize_uninitialized(rowsol, most_splits); resize_uninitialized(colsol, most_splits); - - + const double final_score = static_cast( (max_score * most_splits) - lap(most_splits, score, rowsol, colsol)) * possible_over_score; - + std::vector final_matching; final_matching.reserve(a.n_splits); - - for (int16 i = 0; i < a.n_splits; ++i) { + + for (split_int i = 0; i < a.n_splits; ++i) { const int match = (rowsol[i] < b.n_splits) ? static_cast(rowsol[i]) + 1 : NA_INTEGER; final_matching.push_back(match); } - + return List::create(Named("score") = final_score, _["matching"] = final_matching); } @@ -687,9 +713,10 @@ List cpp_mutual_clustering(const RawMatrix &x, const RawMatrix &y, // [[Rcpp::export]] List cpp_shared_phylo(const RawMatrix &x, const RawMatrix &y, - const IntegerVector &nTip) { + const IntegerVector &nTip, + const bool force_slow = false) { ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); - return shared_phylo(x, y, n_tip); + return shared_phylo(x, y, n_tip, force_slow); } diff --git a/src/tree_distances.h b/src/tree_distances.h index 1b24c2b3..89ee6b2d 100644 --- a/src/tree_distances.h +++ b/src/tree_distances.h @@ -5,12 +5,15 @@ #include #include -/***** Re-export shared lookup tables to global scope *****/ +/***** Re-export shared lookup tables and fallback functions to global scope *****/ using TreeDist::lg2; using TreeDist::lg2_double_factorial; using TreeDist::lg2_unrooted; using TreeDist::lg2_rooted; +using TreeDist::lg2_lookup; +using TreeDist::lg2_unrooted_lookup; +using TreeDist::lg2_rooted_lookup; /***** Constants *****/ @@ -23,85 +26,115 @@ namespace TreeDist { void check_ntip(int32 n); // Re-exported from mutual_clustering.h: - // ic_matching(int16 a, int16 b, int16 n) + // ic_matching(split_int a, split_int b, split_int n) // See equation 16 in Meila 2007 (k' denoted K). // nkK is converted to pkK in the calling function when divided by n. - inline void add_ic_element(double& ic_sum, const int16 nkK, const int16 nk, - const int16 nK, const int16 n_tips, + // Parameters are split_int (int32) to handle n_tips up to 32767 safely; + // products nkK*n_tips and nk*nK fit int32 since both factors <= 32767. + inline void add_ic_element(double& ic_sum, const split_int nkK, const split_int nk, + const split_int nK, const split_int n_tips, const double lg2_n) noexcept { if (nkK && nk && nK) { assert(!(nkK == nk && nkK == nK && nkK << 1 == n_tips)); const int32 numerator = nkK * n_tips; const int32 denominator = nk * nK; if (numerator != denominator) { - ic_sum += nkK * (lg2[nkK] + lg2_n - lg2[nk] - lg2[nK]); + ic_sum += nkK * (lg2_lookup(nkK) + lg2_n - lg2_lookup(nk) - lg2_lookup(nK)); } } } // Returns lg2_unrooted[x] - lg2_trees_matching_split(y, x - y) - [[nodiscard]] inline double mmsi_pair_score(const int16 x, const int16 y) noexcept { + [[nodiscard]] inline double mmsi_pair_score(const split_int x, const split_int y) noexcept { static_assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max(), "int16 too narrow for SL_MAX_TIPS"); - - return lg2_unrooted[x] - (lg2_rooted[y] + lg2_rooted[x - y]); + + return lg2_unrooted_lookup(x) - (lg2_rooted_lookup(y) + lg2_rooted_lookup(x - y)); } - [[nodiscard]] inline double mmsi_score(const int16 n_same, const int16 n_a_and_b, - const int16 n_different, const int16 n_a_only) noexcept { + [[nodiscard]] inline double mmsi_score(const split_int n_same, const split_int n_a_and_b, + const split_int n_different, const split_int n_a_only) noexcept { if (n_same == 0 || n_same == n_a_and_b) return mmsi_pair_score(n_different, n_a_only); if (n_different == 0 || n_different == n_a_only) return mmsi_pair_score(n_same, n_a_and_b); - + const double score1 = mmsi_pair_score(n_same, n_a_and_b), score2 = mmsi_pair_score(n_different, n_a_only); - + return (score1 > score2) ? score1 : score2; } -[[nodiscard]] inline double one_overlap(const int16 a, const int16 b, const int16 n) noexcept { + // Compile-time-dispatched lg2 helpers: when Fast=true, callers guarantee + // x <= SL_MAX_TIPS, so the bounds-check + branch in lg2_rooted_lookup is + // skipped entirely and the inner kernel collapses to a bare table load. + // Used by spi_overlap / one_overlap below; large-tree callers pass + // Fast=false (default) and pay the predictable branch. + template + [[nodiscard]] inline double lg2_rooted_fast(split_int x) noexcept { + if constexpr (Fast) return lg2_rooted[x]; + else return lg2_rooted_lookup(x); + } + + template + [[nodiscard]] inline double lg2_unrooted_fast(split_int x) noexcept { + if constexpr (Fast) return lg2_unrooted[x]; + else return lg2_unrooted_lookup(x); + } + + template + [[nodiscard]] inline double one_overlap(const split_int a, const split_int b, const split_int n) noexcept { static_assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max(), "int16 too narrow for SL_MAX_TIPS"); if (a == b) { - return lg2_rooted[a] + lg2_rooted[n - a]; + return lg2_rooted_fast(a) + lg2_rooted_fast(n - a); } // Unify ab via lo/hi: removes an unpredictable branch. - const int16 lo = (a < b) ? a : b; - const int16 hi = (a < b) ? b : a; - return lg2_rooted[hi] + lg2_rooted[n - lo] - lg2_rooted[hi - lo + 1]; + const split_int lo = (a < b) ? a : b; + const split_int hi = (a < b) ? b : a; + return lg2_rooted_fast(hi) + lg2_rooted_fast(n - lo) + - lg2_rooted_fast(hi - lo + 1); } - - [[nodiscard]] inline double one_overlap_notb(const int16 a, const int16 n_minus_b, const int16 n) noexcept { + + template + [[nodiscard]] inline double one_overlap_notb(const split_int a, const split_int n_minus_b, const split_int n) noexcept { static_assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max(), "int16 too narrow for SL_MAX_TIPS"); - const int16 b = n - n_minus_b; + const split_int b = n - n_minus_b; if (a == b) { - return lg2_rooted[b] + lg2_rooted[n_minus_b]; + return lg2_rooted_fast(b) + lg2_rooted_fast(n_minus_b); } else if (a < b) { - return lg2_rooted[b] + lg2_rooted[n - a] - lg2_rooted[b - a + 1]; + return lg2_rooted_fast(b) + lg2_rooted_fast(n - a) + - lg2_rooted_fast(b - a + 1); } else { - return lg2_rooted[a] + lg2_rooted[n_minus_b] - lg2_rooted[a - b + 1]; + return lg2_rooted_fast(a) + lg2_rooted_fast(n_minus_b) + - lg2_rooted_fast(a - b + 1); } } -// Popcount-based: single pass over bins replaces 4 sequential boolean scans. + // Popcount-based: single pass over bins replaces 4 sequential boolean scans. // Counts n_ab = |A ∩ B| via hardware POPCNT, then derives all 4 Venn-diagram // region populations from arithmetic on n_ab, in_a, in_b, n_tips. + // split_int used throughout so in_a + in_b does not overflow (max ~65534 for 32767-tip trees). + // + // Fast=true: caller guarantees n_tips <= SL_MAX_TIPS, so inner one_overlap* + // calls use direct lg2_rooted[] access. Caller (shared_phylo_score) dispatches + // once on n_tips; per-cell branches are eliminated. + template [[nodiscard]] inline double spi_overlap(const splitbit* a_state, const splitbit* b_state, - const int16 n_tips, const int16 in_a, - const int16 in_b, const int16 n_bins) noexcept { + const split_int n_tips, const split_int in_a, + const split_int in_b, const split_int n_bins) noexcept { static_assert(SL_MAX_BINS <= INT16_MAX, "int16 too narrow for SL_MAX_BINS"); - int16 n_ab = 0; - for (int16 bin = 0; bin < n_bins; ++bin) { + split_int n_ab = 0; + for (split_int bin = 0; bin < n_bins; ++bin) { n_ab += TreeTools::count_bits(a_state[bin] & b_state[bin]); } @@ -113,15 +146,15 @@ namespace TreeDist { // unrelated splits). Otherwise return the appropriate one_overlap score. if (n_ab == 0) { - return one_overlap_notb(in_a, in_b, n_tips); + return one_overlap_notb(in_a, in_b, n_tips); } if (n_ab == in_b || n_ab == in_a) { // B ⊆ A (n_b_only == 0) or A ⊆ B (n_a_only == 0) - return one_overlap(in_a, in_b, n_tips); + return one_overlap(in_a, in_b, n_tips); } if (in_a + in_b - n_ab == n_tips) { // A ∪ B covers all tips (n_neither == 0) - return one_overlap_notb(in_a, in_b, n_tips); + return one_overlap_notb(in_a, in_b, n_tips); } return 0.0; diff --git a/tests/testthat/test-large-trees.R b/tests/testthat/test-large-trees.R new file mode 100644 index 00000000..3600fbe5 --- /dev/null +++ b/tests/testthat/test-large-trees.R @@ -0,0 +1,127 @@ +library("TreeTools", quietly = TRUE) + +# Tests that distance functions handle trees larger than the old SL_MAX_TIPS +# threshold (2048 in TreeTools <= 2.2.0). These require TreeTools >= 2.3.0 +# for heap-backed split storage; they skip gracefully on older builds. + +test_that("Distance functions handle trees exceeding TT 2.2.0 limit", { + skip_on_cran() + n <- 2049L + skip_if(TreeDist:::cpp_sl_max_tips() < n, + "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS > 2048)") + + t1 <- as.phylo(1, n) + t2 <- as.phylo(2, n) + + expect_gt(RobinsonFoulds(t1, t2)[[1]], 0) + expect_equal(RobinsonFoulds(t1, t1)[[1]], 0) + + expect_gt(InfoRobinsonFoulds(t1, t2)[[1]], 0) + expect_equal(InfoRobinsonFoulds(t1, t1)[[1]], 0) + + expect_gt(ClusteringInfoDistance(t1, t2)[[1]], 0) + expect_equal(ClusteringInfoDistance(t1, t1)[[1]], 0) + + expect_gt(DifferentPhylogeneticInfo(t1, t2)[[1]], 0) + expect_equal(DifferentPhylogeneticInfo(t1, t1)[[1]], 0) + + expect_gt(MatchingSplitInfoDistance(t1, t2)[[1]], 0) + expect_equal(MatchingSplitInfoDistance(t1, t1)[[1]], 0) +}) + +test_that("Distance scores agree across stack/heap storage threshold", { + skip_on_cran() + skip_if_not_installed("TreeTools", "2.3.0") + # SL_STACK_SPLITS in TreeTools 2.3.0+ admits trees of n_tips <= 2048 to + # stack-backed split storage; n_tips == 2049 forces the heap path. This + # test compares each metric on a stack-storage tree pair against the same + # pair after inserting a cherry next to tip 1, which both pushes them + # across the threshold and leaves their split structure invariant: each + # existing split simply gains the new tip on whichever side already + # contained tip 1, and one perfectly-matched cherry split is added + # (contributing 0 to any distance). + stackMax <- 2048L + skip_if(TreeDist:::cpp_sl_max_tips() <= stackMax, + "Requires TreeTools >= 2.3.0 (heap-backed storage)") + + t1s <- as.phylo(1234, stackMax) + t2s <- PectinateTree(stackMax) + t1h <- AddTip(t1s, where = 1, label = "extra") + t2h <- AddTip(t2s, where = 1, label = "extra") + + # RF (count of unmatched splits) is invariant under the inserted cherry. + expect_equal(RobinsonFoulds(t1s, t2s)[[1]], + RobinsonFoulds(t1h, t2h)[[1]]) + + # Per-split info depends on n_tips and on side sizes, so absolute + # information-theoretic distances drift by a few % when n grows by 1; + # normalized distances are far less sensitive. + tol <- 0.002 + + expect_equal(InfoRobinsonFoulds(t1s, t2s, normalize = TRUE)[[1]], + InfoRobinsonFoulds(t1h, t2h, normalize = TRUE)[[1]], + tolerance = tol) + expect_equal(ClusteringInfoDistance(t1s, t2s, normalize = TRUE)[[1]], + ClusteringInfoDistance(t1h, t2h, normalize = TRUE)[[1]], + tolerance = tol) + expect_equal(DifferentPhylogeneticInfo(t1s, t2s, normalize = TRUE)[[1]], + DifferentPhylogeneticInfo(t1h, t2h, normalize = TRUE)[[1]], + tolerance = tol) + expect_equal(MatchingSplitInfoDistance(t1s, t2s, normalize = TRUE)[[1]], + MatchingSplitInfoDistance(t1h, t2h, normalize = TRUE)[[1]], + tolerance = tol) +}) + +test_that("RF and IRF work for 8000-tip trees", { + skip_on_cran() + skip_if(TreeDist:::cpp_sl_max_tips() < 2049L, + "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS > 2048)") + skip_if(!getOption("slowMode", FALSE), "Only runs in slow mode") + + n <- 8000 + t1 <- PectinateTree(n) + t2 <- BalancedTree(n) + + expect_gt(RobinsonFoulds(t1, t2)[[1]], 0) + expect_equal(RobinsonFoulds(t1, t1)[[1]], 0) + + expect_gt(InfoRobinsonFoulds(t1, t2)[[1]], 0) + expect_equal(InfoRobinsonFoulds(t1, t1)[[1]], 0) +}) + +test_that("Slow lookup path produces identical SPI to fast path", { + skip_on_cran() + # Coverage for the n_tips > SL_MAX_TIPS branches in shared_phylo: + # - lg2_*_slow fallback functions + # - one_overlap, one_overlap_notb, spi_overlap + # These are only triggered with > 32705 tips in production, but the + # cpp_shared_phylo entry accepts a force_slow flag so the same code paths + # can be exercised on small trees. Both paths must produce identical + # values: lg2_rooted_lookup() == lg2_rooted[] within the table range. + t1 <- as.phylo(0, 30) + t2 <- as.phylo(11, 30) + splits1 <- as.Splits(t1) + splits2 <- as.Splits(t2) + + fast <- TreeDist:::cpp_shared_phylo(splits1, splits2, 30L, force_slow = FALSE) + slow <- TreeDist:::cpp_shared_phylo(splits1, splits2, 30L, force_slow = TRUE) + + expect_equal(fast$score, slow$score) + expect_equal(fast$matching, slow$matching) + + # Identical-tree case: both paths must report the self-information score. + fast_self <- TreeDist:::cpp_shared_phylo(splits1, splits1, 30L, + force_slow = FALSE) + slow_self <- TreeDist:::cpp_shared_phylo(splits1, splits1, 30L, + force_slow = TRUE) + expect_equal(fast_self$score, slow_self$score) +}) + +test_that("Tip-count ceiling is enforced correctly", { + skip_on_cran() + skip_if(TreeDist:::cpp_sl_max_tips() < 2049L, + "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS > 2048)") + + expect_no_error(.CheckMaxTips(32767L)) + expect_error(.CheckMaxTips(32768L), "not yet supported") +}) diff --git a/tests/testthat/test-real-data.R b/tests/testthat/test-real-data.R deleted file mode 100644 index d34bbc89..00000000 --- a/tests/testthat/test-real-data.R +++ /dev/null @@ -1,32 +0,0 @@ -#if (requireNamespace("rwty", quietly = TRUE)) { -# test_that("Fungi MSID/PID differ", { -# data("fungus", package = "rwty") -# trees <- fungus[[1]]$trees -# -# expect_equal(PhylogeneticInfoDistance(trees[[1]], trees[[2]]), -# PhylogeneticInfoDistance(trees[1:2])[1]) -# expect_equal(MatchingSplitInfoDistance(trees[[1]], trees[[2]]), -# MatchingSplitInfoDistance(trees[1:2])[1]) -# expect_false(identical(PhylogeneticInfoDistance(trees[1:4]), -# MatchingSplitInfoDistance(trees[1:4]))) -# }) -# -# if (requireNamespace("whalehead", quietly = TRUE)) { -# test_that("Whalehead MSID/PID differ", { -# whaleRoot <- system.file("Data", package = "whalehead", mustWork = TRUE) -# whaleRoot <- "../ProjectWhalehead/Data" -# treeFiles <- list.files(path = paste0(whaleRoot, "/CombinedTrees"), -# pattern = "*.nex", -# full.names = TRUE, recursive = FALSE) -# trees <- unname(rwty::load.trees(treeFiles[2], trim = 100, skip = 0)$trees) -# -# expect_equal(PhylogeneticInfoDistance(trees[[1]], trees[[2]]), -# PhylogeneticInfoDistance(trees[1:2])[1]) -# expect_equal(MatchingSplitInfoDistance(trees[[1]], trees[[2]]), -# MatchingSplitInfoDistance(trees[1:2])[1]) -# expect_false(identical(PhylogeneticInfoDistance(trees[1:4]), -# MatchingSplitInfoDistance(trees[1:4]))) -# }) -# } -#} -# diff --git a/tests/testthat/test-tree_distance_nni.R b/tests/testthat/test-tree_distance_nni.R index 20bcda23..2259f4e8 100644 --- a/tests/testthat/test-tree_distance_nni.R +++ b/tests/testthat/test-tree_distance_nni.R @@ -5,21 +5,32 @@ test_that("NNIDist() handles exceptions", { "trees must contain the same number of leaves") expect_error(NNIDist(list(PectinateTree(1:8), PectinateTree(8))), "trees must bear identical labels") - expect_error(NNIDist(list(PectinateTree(1:8), + expect_error(NNIDist(list(PectinateTree(1:8), PectinateTree(as.character(1:8)))), "trees must bear identical labels") - # R-level guard catches too-many-tips - expect_error(NNIDist(PectinateTree(40000), BalancedTree(40000)), "so many tips") - expect_error(NNIDist(BalancedTree(5), RootOnNode(BalancedTree(5), 1))) - +}) + +test_that("NNIDist() rejects too many tips (TreeTools >= 2.3.0)", { + skip_if(TreeDist:::cpp_sl_max_tips() <= 2048L, + "Compiled against TreeTools < 2.3.0 (SL_MAX_TIPS = 2048)") + expect_error(NNIDist(PectinateTree(40000), BalancedTree(40000)), + "not yet supported") + expect_error(.NNIDistSingle(PectinateTree(32769), BalancedTree(32769), 32769L), + "not yet supported") +}) + +test_that("NNIDist() rejects too many tips (TreeTools < 2.3.0)", { + skip_if(TreeDist:::cpp_sl_max_tips() > 2048L, + "Compiled against TreeTools >= 2.3.0 (SL_MAX_TIPS > 2048)") + expect_error(NNIDist(PectinateTree(40000), BalancedTree(40000)), + "exceed the compiled limit") + expect_error(.NNIDistSingle(PectinateTree(32769), BalancedTree(32769), 32769L), + "exceed the compiled limit") }) test_that("NNIDist() at NNI_MAX_TIPS", { maxTips <- 32768 - more <- maxTips + 1 - expect_error(.NNIDistSingle(PectinateTree(more), BalancedTree(more), more), - "so many tips") goingQuickly <- TRUE skip_if(goingQuickly) diff --git a/tests/testthat/test-tree_distance_spr.R b/tests/testthat/test-tree_distance_spr.R index d97c0263..ff0a3ee8 100644 --- a/tests/testthat/test-tree_distance_spr.R +++ b/tests/testthat/test-tree_distance_spr.R @@ -115,6 +115,7 @@ test_that("SPR shortcuts okay - exhaustive", { test_that("SPR shortcuts okay - known answer", { skip_if_not(getOption("slowMode", FALSE)) + skip_if_not_installed("TBRDist") library("TreeTools", quietly = TRUE) set.seed(0) trees <- lapply(1:8, function(XX) RandomTree(9, root = TRUE)) diff --git a/tests/testthat/test-tree_distance_utilities.R b/tests/testthat/test-tree_distance_utilities.R index 4a7eff99..3579e072 100644 --- a/tests/testthat/test-tree_distance_utilities.R +++ b/tests/testthat/test-tree_distance_utilities.R @@ -31,35 +31,64 @@ test_that("CalculateTreeDistance() errs appropriately", { expect_error(CalculateTreeDistance(RobinsonFouldsSplits, BalancedTree(8), "Not a tree")) }) -test_that(".SL_MAX_TIPS is populated", { - expect_true(is.integer(TreeDist:::.SL_MAX_TIPS)) - expect_true(TreeDist:::.SL_MAX_TIPS >= 2048L) +test_that("cpp_sl_max_tips() returns a valid value", { + expect_true(is.integer(TreeDist:::cpp_sl_max_tips())) + expect_true(TreeDist:::cpp_sl_max_tips() >= 2048L) }) test_that("Tip-count guard is applied consistently", { expect_no_error(.CheckMaxTips(1000L)) - - limit32704 <- TreeDist:::.SL_MAX_TIPS == 32704L - if (limit32704) expect_no_error(.CheckMaxTips(32704L)) - - errMsg <- if (limit32704) { - "Trees with 327.. tips are not yet supported \\(maximum 32704\\)" - } else { - "Trees with 327.. tips exceed the compiled limit of 2048" - } - expect_error(.CheckMaxTips(32705L), errMsg) - + + # Direct C++ calls bypass the R guard; check_ntip() always says "not yet supported". + cppErrMsg <- "Trees with 327.. tips are not yet supported \\(maximum 32767\\)" splits8 <- unclass(as.Splits(BalancedTree(8))) - expect_error(cpp_robinson_foulds_distance(splits8, splits8, 32768L), - errMsg) - expect_error(cpp_robinson_foulds_info(splits8, splits8, 32768L), - errMsg) + expect_error(cpp_robinson_foulds_distance(splits8, splits8, 32768L), cppErrMsg) + expect_error(cpp_robinson_foulds_info(splits8, splits8, 32768L), cppErrMsg) +}) + +test_that("Tip-count guard accepts up to 32767 tips (TreeTools >= 2.3.0)", { + skip_if(TreeDist:::cpp_sl_max_tips() <= 2048L, + "Compiled against TreeTools < 2.3.0 (SL_MAX_TIPS = 2048)") + expect_no_error(.CheckMaxTips(32705L)) + expect_no_error(.CheckMaxTips(32767L)) + rErrMsg <- "Trees with 327.. tips are not yet supported \\(maximum 32767\\)" + expect_error(.CheckMaxTips(32768L), rErrMsg) + + trees <- list(BalancedTree(8), PectinateTree(8)) + class(trees) <- "multiPhylo" + expect_error( + .SplitDistanceAllPairs(RobinsonFouldsSplits, trees, letters[1:8], 32768L), + rErrMsg + ) +}) + +test_that("Tip-count guard caps at SL_MAX_TIPS (TreeTools < 2.3.0)", { + skip_if(TreeDist:::cpp_sl_max_tips() > 2048L, + "Compiled against TreeTools >= 2.3.0 (SL_MAX_TIPS > 2048)") + rErrMsg <- "Trees with 327.. tips exceed the compiled limit of 2048" + expect_error(.CheckMaxTips(32705L), rErrMsg) trees <- list(BalancedTree(8), PectinateTree(8)) class(trees) <- "multiPhylo" expect_error( .SplitDistanceAllPairs(RobinsonFouldsSplits, trees, letters[1:8], 32768L), - errMsg + rErrMsg + ) +}) + +test_that("Tip-count guard: TT < 2.3.0 code path (mocked)", { + local_mocked_bindings( + cpp_sl_max_tips = function() 2048L, + .package = "TreeDist" + ) + expect_no_error(.CheckMaxTips(100L)) + expect_error(.CheckMaxTips(3000L), + "Trees with 3000 tips exceed the compiled limit of 2048") + trees <- list(BalancedTree(8), PectinateTree(8)) + class(trees) <- "multiPhylo" + expect_error( + .SplitDistanceAllPairs(RobinsonFouldsSplits, trees, letters[1:8], 3000L), + "Trees with 3000 tips exceed the compiled limit of 2048" ) })