Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ cran-comments.md
man-roxygen
data-raw
docs
^.*vtune.*$
revdep
\.covrignore
^\.git$
Expand Down
7 changes: 7 additions & 0 deletions .claude/settings.local.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"permissions": {
"allow": [
"Bash(Rscript *)"
]
}
}
2 changes: 0 additions & 2 deletions .github/workflows/R-CMD-check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,6 @@ jobs:
uses: r-lib/actions/setup-r-dependencies@v2
with:
dependencies: '"hard"'
extra-packages: |
github::ms609/TreeTools
needs: benchmark

- name: Benchmark PR
Expand Down
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -87,16 +87,16 @@ VignetteBuilder: knitr
Config/Needs/app/optional: uwot
Config/Needs/check: rcmdcheck
Config/Needs/coverage: covr
Config/Needs/memcheck: devtools
Config/Needs/memcheck: pkgdown, testthat
Config/Needs/metadata: codemetar
Config/Needs/revdeps: revdepcheck
Config/Needs/website: openssl, pkgdown, remotes, shinylive
Config/roxygen2/version: 8.0.0
Config/testthat/parallel: false
Config/testthat/edition: 3
SystemRequirements: C++17, pandoc-citeproc
ByteCompile: true
Encoding: UTF-8
Language: en-GB
X-schema.org-keywords: phylogenetics, tree-distance
RoxygenNote: 7.3.3
Roxygen: list(markdown = TRUE)
11 changes: 11 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,17 @@
removing a hard dependency on the compile-time `SL_MAX_SPLITS` constant.
TreeDist now supports trees of any size permitted by TreeTools.

- **Large-tree support (requires TreeTools ≥ 2.3.0):** all distance
functions now accept trees with up to 32 767 tips (previously limited
to `SL_MAX_TIPS`, 2048 with TreeTools ≤ 2.2.0). The R-level tip-count
guard (`.CheckMaxTips()`) queries the compiled limit via `cpp_sl_max_tips()`
each time it is called and unlocks the higher ceiling automatically; no
changes to user code are needed.
All integer counters in the C++ hot paths have been widened from `int16`
to `split_int` (`int32`) to handle split counts above 32 767 without
overflow. Direct `lg2[]` table accesses have been replaced with
`lg2_lookup()` fallback helpers so that trees with more tips than
`SL_MAX_TIPS` are computed correctly via `std::log2` / `std::lgamma`.

## Performance

- `RobinsonFoulds()` now uses a fast C++ batch path for cross-distance
Expand Down
4 changes: 2 additions & 2 deletions R/RcppExports.R
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ cpp_mutual_clustering <- function(x, y, nTip) {
.Call(`_TreeDist_cpp_mutual_clustering`, x, y, nTip)
}

cpp_shared_phylo <- function(x, y, nTip) {
.Call(`_TreeDist_cpp_shared_phylo`, x, y, nTip)
cpp_shared_phylo <- function(x, y, nTip, force_slow = FALSE) {
.Call(`_TreeDist_cpp_shared_phylo`, x, y, nTip, force_slow)
}

17 changes: 17 additions & 0 deletions R/tree_distance.R
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,23 @@ GeneralizedRF <- function(splits1, splits2, nTip, PairScorer,
g[lower.tri(g)]
}

# Floor sub-noise distances to zero before normalization.
# Two sources of numerical noise scale with treesIndependentInfo:
# (1) LAP int64 cost-matrix quantization in *Splits scoring; per-cell
# truncation of up to (max_possible / BIG) bits, summed over n_splits.
# (2) Float-accumulation drift between independently-built tables (e.g.
# InfoRobinsonFoulds vs cpp_splitwise_info_batch sum the same per-split
# info contributions, but using different lookup-table constructions).
# Both grow with the magnitude of the answer, so an absolute sqrt(eps)
# tolerance becomes too tight beyond a few thousand tips. Scaling by
# treesIndependentInfo self-adjusts; pmax(1, ·) preserves the original
# tolerance for tiny trees where these errors are negligible anyway.
.FloorNumericalNoise <- function(ret, treesIndependentInfo) {
  # Baseline tolerance: the square root of machine epsilon.
  rootEps <- .Machine[["double.eps"]] ^ 0.5
  # Per-entry multiplier: the independent information, but never less than 1,
  # so the tolerance is never tighter than the baseline.
  scaling <- pmax(1, treesIndependentInfo)
  isNoise <- ret < scaling * rootEps
  # Entries below their tolerance (including negative values) become zero;
  # all other entries are returned unchanged.
  ret[isNoise] <- 0
  ret
}

.AllTipsSame <- function(x, y) {
if (is.list(x)) {
xPrime <- x[[1]]
Expand Down
42 changes: 17 additions & 25 deletions R/tree_distance_info.R
Original file line number Diff line number Diff line change
Expand Up @@ -249,16 +249,15 @@ DifferentPhylogeneticInfo <- function(tree1, tree2 = NULL, normalize = FALSE,
if (!is.null(fast)) {
spi <- fast[["info"]]
treesIndependentInfo <- .PairwiseSums(fast[["entropies"]])
ret <- treesIndependentInfo - spi - spi

ret <- .FloorNumericalNoise(treesIndependentInfo - spi - spi, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = SplitwiseInfo, Combine = "+")
ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0
attributes(ret) <- attributes(spi)
return(ret)
}

# Fast path (cross-pairs): same tips, no matching — avoids duplicate as.Splits()
fast_many <- .FastManyManyPath(tree1, tree2, reportMatching,
cpp_shared_phylo_cross_pairs,
Expand All @@ -268,25 +267,22 @@ DifferentPhylogeneticInfo <- function(tree1, tree2 = NULL, normalize = FALSE,
info1 <- fast_many[["info1"]]
info2 <- fast_many[["info2"]]
treesIndependentInfo <- outer(info1, info2, "+")
ret <- treesIndependentInfo - spi - spi

ret <- .FloorNumericalNoise(treesIndependentInfo - spi - spi, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = SplitwiseInfo, Combine = "+")
ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0
return(ret)
}

spi <- SharedPhylogeneticInfo(tree1, tree2, normalize = FALSE, diag = FALSE,
reportMatching = reportMatching)
treesIndependentInfo <- .MaxValue(tree1, tree2, SplitwiseInfo)
ret <- treesIndependentInfo - spi - spi
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,

ret <- .FloorNumericalNoise(treesIndependentInfo - spi - spi, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = SplitwiseInfo, Combine = "+")

ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 # Catch floating point inaccuracy
attributes(ret) <- attributes(spi)

# Return:
Expand All @@ -310,16 +306,15 @@ ClusteringInfoDistance <- function(tree1, tree2 = NULL, normalize = FALSE,
if (!is.null(fast)) {
mci <- fast[["info"]]
treesIndependentInfo <- .PairwiseSums(fast[["entropies"]])
ret <- treesIndependentInfo - mci - mci

ret <- .FloorNumericalNoise(treesIndependentInfo - mci - mci, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = ClusteringEntropy, Combine = "+")
ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0
attributes(ret) <- attributes(mci)
return(ret)
}

# Fast path (cross-pairs): same tips, no matching — avoids duplicate as.Splits()
fast_many <- .FastManyManyPath(tree1, tree2, reportMatching,
cpp_mutual_clustering_cross_pairs,
Expand All @@ -329,25 +324,22 @@ ClusteringInfoDistance <- function(tree1, tree2 = NULL, normalize = FALSE,
info1 <- fast_many[["info1"]]
info2 <- fast_many[["info2"]]
treesIndependentInfo <- outer(info1, info2, "+")
ret <- treesIndependentInfo - mci - mci

ret <- .FloorNumericalNoise(treesIndependentInfo - mci - mci, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = ClusteringEntropy, Combine = "+")
ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0
return(ret)
}

mci <- MutualClusteringInfo(tree1, tree2, normalize = FALSE, diag = FALSE,
reportMatching = reportMatching)
treesIndependentInfo <- .MaxValue(tree1, tree2, ClusteringEntropy)
ret <- treesIndependentInfo - mci - mci

ret <- .FloorNumericalNoise(treesIndependentInfo - mci - mci, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = ClusteringEntropy, Combine = "+")

ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 # Handle floating point inaccuracy
attributes(ret) <- attributes(mci)

# Return:
Expand Down
18 changes: 7 additions & 11 deletions R/tree_distance_msi.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,14 @@ MatchingSplitInfoDistance <- function(tree1, tree2 = NULL,
msi <- fast[["info"]]
treesIndependentInfo <- .PairwiseSums(fast[["entropies"]])

ret <- treesIndependentInfo - msi - msi
ret <- .FloorNumericalNoise(treesIndependentInfo - msi - msi, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = SplitwiseInfo, Combine = "+")
ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0
attributes(ret) <- attributes(msi)
return(ret)
}

# Fast path (cross-pairs): same tips, no matching — avoids duplicate as.Splits()
fast_many <- .FastManyManyPath(tree1, tree2, reportMatching,
cpp_msi_cross_pairs,
Expand All @@ -47,25 +46,22 @@ MatchingSplitInfoDistance <- function(tree1, tree2 = NULL,
info1 <- fast_many[["info1"]]
info2 <- fast_many[["info2"]]
treesIndependentInfo <- outer(info1, info2, "+")
ret <- treesIndependentInfo - msi - msi

ret <- .FloorNumericalNoise(treesIndependentInfo - msi - msi, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = SplitwiseInfo, Combine = "+")
ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0
return(ret)
}

msi <- MatchingSplitInfo(tree1, tree2, normalize = FALSE, diag = FALSE,
reportMatching = reportMatching)

treesIndependentInfo <- .MaxValue(tree1, tree2, SplitwiseInfo)
ret <- treesIndependentInfo - msi - msi
ret <- .FloorNumericalNoise(treesIndependentInfo - msi - msi, treesIndependentInfo)
ret <- NormalizeInfo(ret, tree1, tree2, how = normalize,
infoInBoth = treesIndependentInfo,
InfoInTree = SplitwiseInfo, Combine = "+")

ret[ret < .Machine[["double.eps"]]^0.5] <- 0 # In case of floating point inaccuracy
attributes(ret) <- attributes(msi)
# Return:
ret
Expand Down
4 changes: 1 addition & 3 deletions R/tree_distance_nni.R
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,7 @@ NNIDist <- function(tree1, tree2 = tree1) {
#' @importFrom TreeTools Postorder RenumberTips
#' @importFrom ape Nnode.phylo
.NNIDistSingle <- function(tree1, tree2, nTip, ...) {
if (nTip > 32768L) {
stop("Cannot calculate NNI distance for trees with so many tips.")
}
.CheckMaxTips(nTip)
if (nrow(tree1[["edge"]]) != nrow(tree2[["edge"]])) {
stop("Both trees must have the same number of edges. ",
"Is one rooted and the other unrooted?")
Expand Down
26 changes: 13 additions & 13 deletions R/tree_distance_rf.R
Original file line number Diff line number Diff line change
Expand Up @@ -75,14 +75,15 @@ InfoRobinsonFoulds <- function(tree1, tree2 = NULL, similarity = FALSE,
cpp_splitwise_info_batch)
if (!is.null(fast)) {
treesIndependentInfo <- .PairwiseSums(fast[["entropies"]])
unnormalized <- treesIndependentInfo - fast[["info"]] - fast[["info"]]
unnormalized[unnormalized < .Machine[["double.eps"]] ^ 0.5] <- 0
unnormalized <- .FloorNumericalNoise(
treesIndependentInfo - fast[["info"]] - fast[["info"]],
treesIndependentInfo)
ret <- NormalizeInfo(unnormalized, tree1, tree2, how = normalize,
InfoInTree = SplitwiseInfo, Combine = "+")
attributes(ret) <- attributes(fast[["info"]])
return(ret)
}

# Cross-pairs fast path
fast_many <- .FastManyManyPath(tree1, tree2, reportMatching,
cpp_rf_info_cross_pairs,
Expand All @@ -92,25 +93,24 @@ InfoRobinsonFoulds <- function(tree1, tree2 = NULL, similarity = FALSE,
info1 <- fast_many[["info1"]]
info2 <- fast_many[["info2"]]
treesIndependentInfo <- outer(info1, info2, "+")
unnormalized <- treesIndependentInfo - irf - irf
unnormalized[unnormalized < .Machine[["double.eps"]] ^ 0.5] <- 0

unnormalized <- .FloorNumericalNoise(treesIndependentInfo - irf - irf,
treesIndependentInfo)
ret <- NormalizeInfo(unnormalized, tree1, tree2, how = normalize,
InfoInTree = SplitwiseInfo, Combine = "+")
return(ret)
}
}
unnormalized <- CalculateTreeDistance(InfoRobinsonFouldsSplits, tree1, tree2,

unnormalized <- CalculateTreeDistance(InfoRobinsonFouldsSplits, tree1, tree2,
reportMatching) * 2

if (!similarity) {
unnormalized <- .MaxValue(tree1, tree2, SplitwiseInfo) - unnormalized
treesIndependentInfo <- .MaxValue(tree1, tree2, SplitwiseInfo)
unnormalized <- .FloorNumericalNoise(treesIndependentInfo - unnormalized,
treesIndependentInfo)
}

# In case of floating point inaccuracy
unnormalized[unnormalized < .Machine[["double.eps"]] ^ 0.5] <- 0

# Return:
NormalizeInfo(unnormalized, tree1, tree2, how = normalize,
InfoInTree = SplitwiseInfo, Combine = "+")
Expand Down
27 changes: 17 additions & 10 deletions R/tree_distance_utilities.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,23 @@
# Validate that nTip does not exceed the compiled SL_MAX_TIPS limit.
# Validate that nTip does not exceed the supported tip-count ceiling.
# cpp_sl_max_tips() > 2048L iff TreeTools >= 2.3.0 raised the stack threshold
# and provides heap-backed split storage; accept up to 32767 tips in that case.
# Otherwise cap at the compiled SL_MAX_TIPS.
# Called from every distance entry point before any C++ work.
.CheckMaxTips <- function(nTip) {
if (!is.na(nTip) && nTip > .SL_MAX_TIPS) {
if (.SL_MAX_TIPS < 32704L) {
stop(
"Trees with ", nTip, " tips exceed the compiled limit of ",
.SL_MAX_TIPS, " tips.",
"\nUpdate TreeTools and reinstall TreeDist to support more tips."
)
if (is.na(nTip)) return(invisible(NULL))
sl_max <- cpp_sl_max_tips()
if (sl_max > 2048L) {
if (nTip > 32767L) {
stop("Trees with ", nTip,
" tips are not yet supported (maximum 32767).")
}
stop("Trees with ", nTip, " tips are not yet supported (maximum ",
.SL_MAX_TIPS, ")")
} else if (nTip > sl_max) {
# else-if fires only when sl_max <= 2048 (TreeTools < 2.3.0)
stop(
"Trees with ", nTip, " tips exceed the compiled limit of ",
sl_max, " tips.",
"\nUpdate TreeTools and reinstall TreeDist to support more tips."
)
}
}

Expand Down
6 changes: 0 additions & 6 deletions R/zzz.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,3 @@
.SL_MAX_TIPS <- NULL # populated in .onLoad

.onLoad <- function(libname, pkgname) {
.SL_MAX_TIPS <<- cpp_sl_max_tips()
}

.onUnload <- function(libpath) {
StopParallel(quietly = TRUE)
library.dynam.unload("TreeDist", libpath)
Expand Down
3 changes: 2 additions & 1 deletion cran-comments.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
## Test environments
* Local machine, Windows 10, R devel (2024-09-02 r87090 ucrt)
* Local machine, Windows 11, R version 4.6.0 (2026-04-24 ucrt)

* `devtools::check_win_devel()`
* `devtools::check_mac_devel()`

* [GitHub Actions](https://github.com/ms609/TreeDist/actions):
- windows-latest, R release
Expand Down
1 change: 1 addition & 0 deletions inst/_pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ navbar:
href: news/index.html
github:
icon: fa-github fa-lg
aria-label: View this repo on GitHub
href: https://github.com/ms609/TreeDist

articles:
Expand Down
Loading
Loading