deadwood/0000755000176200001440000000000015146437772012060 5ustar liggesusersdeadwood/MD50000644000176200001440000000172715146437772012377 0ustar liggesusers71bd6d07aa1ff19f380be0eaef7a5fad *DESCRIPTION 5e79a1d8bd3e77cfc8b7c3e0342847e9 *NAMESPACE cd5197bbf7b0a05e84008478e9da41ef *NEWS 32a1532ce8c241317ab947965bfbe99d *R/RcppExports.R 8b18b01bb364157da2786510007c57bb *R/deadwood-package.R 9cb18140d2d68aafcbb0acc4df11b1aa *R/deadwood.R a34bca78deac96f3b1e23d1998f8f446 *R/mst.R 0606d32e60b6cf35c14fac3be71f9f69 *man/deadwood-package.Rd 446a61eb15cfc3fca6fc0b94a0f954c4 *man/deadwood.Rd e9bd1b942038d0418e730280cfeb0275 *man/kneedle.Rd 7ff78044fc080f6838aff81926f4247a *man/mst.Rd 3715f6c1c549edc5c6bb6a632d133d69 *src/Makevars 2d8eb1009aff927946614c2ea9d6ac87 *src/RcppDeadwood.cpp 5ce61edc480e8daf049a8b9594adf604 *src/RcppExports.cpp 742ea456ac0ebfaa18eddaff5725b539 *src/RcppOldmst.cpp 8469090d7c5af93c971354bf8c01aaab *src/c_argfuns.h cbe51411409823d98688e287530401ac *src/c_common.h d9aed924ab50f448edfbe6b55f91c14e *src/c_deadwood.h 1777356b3042ad17034cd373e71dcefa *src/c_kneedle.h 7d5026ce67ff442a97e707c34fff5a48 *src/c_oldmst.h deadwood/R/0000755000176200001440000000000015146132775012254 5ustar liggesusersdeadwood/R/RcppExports.R0000644000176200001440000000315215146132775014671 0ustar liggesusers# Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #' @title Knee/Elbow Point Detection #' #' @description #' Finds the most significant knee/elbow using the Kneedle algorithm #' with exponential smoothing. #' #' @param x data vector (increasing) #' #' @param convex whether the data in \code{x} are convex-ish (elbow detection) #' or not (knee lookup) #' #' @param dt controls the smoothing parameter \eqn{\alpha = 1-\exp(-dt)} #' of the exponential moving average, #' \eqn{y_i = \alpha x_i + (1-\alpha) x_{i-1}}, \eqn{y_1 = x_1} #' #' #' @return #' Returns the index of the knee/elbow point; 1 if not found. #' #' @references #' V. Satopaa, J. Albrecht, D. Irwin, B. Raghavan, #' Finding a "Kneedle" in a haystack: Detecting knee points in system behavior, #' In: 31st Intl. Conf. Distributed Computing Systems Workshops, #' 2011, 166-171, \doi{10.1109/ICDCSW.2011.20} #' #' @name kneedle #' @rdname kneedle #' @export kneedle_increasing <- function(x, convex = TRUE, dt = 0.01) { .Call(`_deadwood_kneedle_increasing`, x, convex, dt) } .deadwood <- function(mst, cut_edges, max_contamination, ema_dt, max_debris_size, verbose) { .Call(`_deadwood_dot_deadwood`, mst, cut_edges, max_contamination, ema_dt, max_debris_size, verbose) } .oldmst.matrix <- function(X, distance = "euclidean", M = 0L, cast_float32 = FALSE, verbose = FALSE) { .Call(`_deadwood_dot_oldmst_matrix`, X, distance, M, cast_float32, verbose) } .oldmst.dist <- function(d, M = 0L, verbose = FALSE) { .Call(`_deadwood_dot_oldmst_dist`, d, M, verbose) } deadwood/R/deadwood-package.R0000644000176200001440000000316715145137373015563 0ustar liggesusers# This file is part of the deadwood package for R. # ############################################################################ # # # # Copyleft (C) 2025-2026, Marek Gagolewski # # # # # # This program is free software: you can redistribute it and/or modify # # it under the terms of the GNU Affero General Public License # # Version 3, 19 November 2007, published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # # but WITHOUT ANY WARRANTY; without even the implied warranty of # # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # # GNU Affero General Public License Version 3 for more details. # # You should have received a copy of the License along with this program. # # If this is not the case, refer to . # # # # ############################################################################ # #' @title The Deadwood Outlier Detection Algorithm #' #' @description #' See \code{\link{deadwood}()} for more details. #' #' @useDynLib deadwood, .registration=TRUE #' @importFrom Rcpp evalCpp #' @importFrom quitefastmst knn_euclid #' @importFrom quitefastmst mst_euclid #' @keywords internal "_PACKAGE" deadwood/R/deadwood.R0000644000176200001440000002320015145137373014160 0ustar liggesusers# This file is part of the deadwood package for R. # ############################################################################ # # # # Copyleft (C) 2020-2026, Marek Gagolewski # # # # # # This program is free software: you can redistribute it and/or modify # # it under the terms of the GNU Affero General Public License # # Version 3, 19 November 2007, published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # # but WITHOUT ANY WARRANTY; without even the implied warranty of # # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # # GNU Affero General Public License Version 3 for more details. # # You should have received a copy of the License along with this program. # # If this is not the case, refer to . # # # # ############################################################################ # #' @title Deadwood: Outlier Detection via Trimming of Mutual Reachability Minimum Spanning Trees #' #' @description #' Deadwood is an anomaly detection algorithm based on Mutual Reachability #' Minimum Spanning Trees. It trims protruding tree segments and marks small #' debris as outliers. #' #' More precisely, the use of a mutual reachability distance #' pulls peripheral points farther away from each other. #' Tree edges with weights beyond the detected elbow point #' are removed. All the resulting connected components whose #' sizes are smaller than a given threshold are deemed anomalous. #' #' #' @details #' As with all distance-based methods (this includes k-means and DBSCAN as well), #' applying data preprocessing and feature engineering techniques #' (e.g., feature scaling, feature selection, dimensionality reduction) #' might lead to more meaningful results. #' #' If \code{d} is a numeric matrix or an object of class \code{dist}, #' \code{\link{mst}} will be called to compute an MST, which #' generally takes at most \eqn{O(n^2)} time. However, by default, #' for low-dimensional Euclidean spaces, a faster algorithm based on K-d trees #' is selected automatically; see \code{\link[quitefastmst]{mst_euclid}} from #' the \pkg{quitefastmst} package. #' #' Once the spanning tree is determined (\eqn{\Omega(n \log n)}-\eqn{O(n^2)}), #' the Deadwood algorithm runs in \eqn{O(n)} time. #' Memory use is also \eqn{O(n)}. #' #' #' @references #' M. Gagolewski, deadwood, in preparation, 2026, TODO #' #' V. Satopaa, J. Albrecht, D. Irwin, B. Raghavan, Finding a "Kneedle" #' in a haystack: Detecting knee points in system behavior, #' In: 31st Intl. Conf. Distributed Computing Systems Workshops, #' 2011, 166-171, \doi{10.1109/ICDCSW.2011.20} #' #' R.J.G.B. Campello, D. Moulavi, J. Sander, #' Density-based clustering based on hierarchical density estimates, #' Lecture Notes in Computer Science 7819, 2013, 160-172, #' \doi{10.1007/978-3-642-37456-2_14} #' #' #' @param d a numeric matrix with \eqn{n} rows and \eqn{p} columns #' (or an object coercible to one, e.g., a data frame with numeric-like #' columns), an object of class \code{dist} (see \code{\link[stats]{dist}}), #' an object of class \code{mstclust} (see \pkg{genieclust} #' and \pkg{lumbermark}), #' or an object of class \code{mst} (see \code{\link{mst}}) #' #' @param distance metric used in the case where \code{d} is a matrix; one of: #' \code{"euclidean"} (synonym: \code{"l2"}), #' \code{"manhattan"} (a.k.a. \code{"l1"} and \code{"cityblock"}), #' \code{"cosine"} #' #' @param M smoothing factor; \eqn{M \leq 1} gives the selected \code{distance}; #' otherwise, the mutual reachability distance based on the \eqn{M}-th #' nearest neighbours is used #' #' @param contamination single numeric value or \code{NA}; #' the estimated (approximate) proportion of outliers in the dataset; #' if \code{NA}, the contamination amount will be determined #' by identifying the most significant elbow point of the curve #' comprised of increasingly ordered tree edge weights #' smoothened with an exponential moving average #' #' @param max_contamination single numeric value; #' maximal contamination level assumed when \code{contamination} is \code{NA} #' #' @param ema_dt single numeric value; #' controls the smoothing parameter \eqn{\alpha = 1-\exp(-dt)} #' of the exponential moving average (in edge length elbow point detection), #' \eqn{y_i = \alpha w_i + (1-\alpha) w_{i-1}}, \eqn{y_1 = d_1} #' #' @param max_debris_size single integer value or \code{NA}; #' the maximal size of the leftover connected components that #' will be considered outliers; if \code{NA}, \eqn{\sqrt{n}} is assumed #' #' @param cut_edges numeric vector or \code{NULL}; #' \eqn{k-1} indexes of the tree edges whose omission lead to #' \eqn{k} connected components (clusters), where the outliers are to #' be sought independently; most frequently this is generated #' via \pkg{genieclust} or \pkg{lumbermark}. #' #' @param verbose logical; whether to print diagnostic messages #' and progress information #' #' @param ... further arguments passed to \code{\link{mst}} #' #' #' @return #' A logical vector \code{y} of length \eqn{n}, where \code{y[i] == TRUE} #' means that the \code{i}-th observation is deemed to be an outlier. #' #' The \code{mst} attribute gives the computed minimum #' spanning tree which can be reused in further calls to the functions #' from \pkg{genieclust}, \pkg{lumbermark}, and \pkg{deadwood}. #' \code{cut_edges} gives the \code{cut_edges} passed as argument. #' \code{contamination} gives the detected contamination levels #' in each cluster (which can be different from the observed proportion #' of outliers detected). #' #' #' @examples #' library("datasets") #' data("iris") #' X <- jitter(as.matrix(iris[1:2])) # some data #' is_outlier <- deadwood(X, M=5) #' plot(X, col=c("#ff000066", "#55555555")[is_outlier+1], #' pch=c(16, 1)[is_outlier+1], asp=1, las=1) #' #' @rdname deadwood #' @export deadwood <- function(d, ...) { UseMethod("deadwood") } #' @export #' @rdname deadwood #' @method deadwood default deadwood.default <- function( d, M=5L, # TODO: set default contamination=NA_real_, max_debris_size=NA_real_, max_contamination=0.5, ema_dt=0.01, distance=c("euclidean", "l2", "manhattan", "cityblock", "l1", "cosine"), verbose=FALSE, ... ) { distance <- match.arg(distance) tree <- mst(d, M=M, distance=distance, verbose=verbose, ...) deadwood.mst( tree, contamination=contamination, max_debris_size=max_debris_size, max_contamination=max_contamination, ema_dt=ema_dt, verbose=verbose ) } #' @export #' @rdname deadwood #' @method deadwood dist deadwood.dist <- function( d, M=5L, # TODO: set default contamination=NA_real_, max_debris_size=NA_real_, max_contamination=0.5, ema_dt=0.01, verbose=FALSE, ... ) { deadwood.mst( mst(d, M=M, verbose=verbose, ...), contamination=contamination, max_debris_size=max_debris_size, max_contamination=max_contamination, ema_dt=ema_dt, verbose=verbose ) } #' @export #' @rdname deadwood #' @method deadwood mstclust deadwood.mstclust <- function( d, contamination=NA_real_, max_debris_size=NA_real_, max_contamination=0.5, ema_dt=0.01, verbose=FALSE, ... ) { tree <- attr(d, "mst") stopifnot(!is.null(tree)) cut_edges <- attr(d, "cut_edges") stopifnot(!is.null(cut_edges)) deadwood.mst( tree, contamination=contamination, max_debris_size=max_debris_size, max_contamination=max_contamination, ema_dt=ema_dt, cut_edges=cut_edges, verbose=verbose ) } #' @export #' @rdname deadwood #' @method deadwood mst deadwood.mst <- function( d, contamination=NA_real_, max_debris_size=NA_real_, max_contamination=0.5, ema_dt=0.01, cut_edges=NULL, verbose=FALSE, ... ) { verbose <- !identical(verbose, FALSE) contamination <- as.double(contamination)[1] max_debris_size <- as.integer(max_debris_size)[1] max_contamination <- as.double(max_contamination)[1] ema_dt <- as.double(ema_dt)[1] cut_edges <- as.numeric(cut_edges) # numeric(0) if NULL stopifnot(ema_dt > 0.0) n <- NROW(d)+1 if (is.na(max_debris_size)) max_debris_size <- as.integer(sqrt(n)) max_debris_size <- max(1L, max_debris_size) stopifnot(max_contamination >= 0, max_contamination <= 1) if (!is.na(contamination)) { stopifnot(contamination >= 0, contamination <= 1) max_contamination <- -contamination } is_outlier <- .deadwood( d, cut_edges, max_contamination, ema_dt, max_debris_size, verbose ) stopifnot(attr(is_outlier, "contamination") >= 0) stopifnot(attr(is_outlier, "contamination") <= 1) structure( is_outlier, names=attr(d, "Labels"), mst=d, cut_edges=cut_edges, class="mstoutlier" ) } registerS3method("deadwood", "default", "deadwood.default") registerS3method("deadwood", "dist", "deadwood.dist") # registerS3method("deadwood", "msthclust", "deadwood.msthclust") TODO registerS3method("deadwood", "mstclust", "deadwood.mstclust") registerS3method("deadwood", "mst", "deadwood.mst") deadwood/R/mst.R0000644000176200001440000001752615145137373013213 0ustar liggesusers# This file is part of the deadwood package for R. # ############################################################################ # # # # Copyleft (C) 2020-2026, Marek Gagolewski # # # # # # This program is free software: you can redistribute it and/or modify # # it under the terms of the GNU Affero General Public License # # Version 3, 19 November 2007, published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # # but WITHOUT ANY WARRANTY; without even the implied warranty of # # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # # GNU Affero General Public License Version 3 for more details. # # You should have received a copy of the License along with this program. # # If this is not the case, refer to . # # # # ############################################################################ # #' @title Euclidean and Mutual Reachability Minimum Spanning Trees #' #' @description #' A Euclidean minimum spanning tree (MST) provides a computationally #' convenient representation of a dataset: the \eqn{n} points are connected #' via \eqn{n-1} shortest segments. Provided that the dataset #' has been appropriately preprocessed so that the distances between the #' points are informative, an MST can be applied in outlier detection, #' clustering, density estimation, dimensionality reduction, and many other #' topological data analysis tasks. #' #' #' @details #' If \code{d} is a matrix and the Euclidean distance is requested #' (the default), then the MST is computed via a call to #' \code{\link[quitefastmst]{mst_euclid}} from \pkg{quitefastmst}. #' It is efficient in low-dimensional spaces. Otherwise, a general-purpose #' implementation of the Jarník (Prim/Dijkstra)-like \eqn{O(n^2)}-time algorithm #' is called. #' #' If \eqn{M>0}, then the minimum spanning tree is computed with respect to #' a mutual reachability distance (Campello et al., 2013): #' \eqn{d_M(i,j)=\max(d(i,j), c_M(i), c_M(j))}, where \eqn{d(i,j)} is #' an ordinary distance and \eqn{c_M(i)} is the core distance given by #' \eqn{d(i,k)} with \eqn{k} being \eqn{i}'s \eqn{M}-th nearest neighbour #' (not including self, unlike in Campello et al., 2013). #' This pulls outliers away from their neighbours. #' #' If the distances are not unique, there might be multiple trees #' spanning a given graph that meet the minimality property. #' #' #' @seealso #' \code{\link[quitefastmst]{mst_euclid}} #' #' @references #' V. Jarník, O jistem problemu minimalnim (z dopisu panu O. Borůvkovi), #' \emph{Prace Moravske Prirodovedecke Spolecnosti} 6, 1930, 57-63 #' #' C.F. Olson, Parallel algorithms for hierarchical clustering, #' \emph{Parallel Computing} 21, 1995, 1313-1325 #' #' R. Prim, Shortest connection networks and some generalisations, #' \emph{The Bell System Technical Journal} 36(6), 1957, 1389-1401 #' #' O. Borůvka, O jistém problému minimálním, \emph{Práce Moravské #' Přírodovědecké Společnosti} 3, 1926, 37–58 #' #' J.L. Bentley, Multidimensional binary search trees used for associative #' searching, \emph{Communications of the ACM} 18(9), 509–517, 1975, #' \doi{10.1145/361002.361007} # #' W.B. March, R. Parikshit, A. Gray, Fast Euclidean minimum spanning #' tree: Algorithm, analysis, and applications, \emph{Proc. 16th ACM SIGKDD #' Intl. Conf. Knowledge Discovery and Data Mining (KDD '10)}, 2010, 603–612 #' #' R.J.G.B. Campello, D. Moulavi, J. Sander, #' Density-based clustering based on hierarchical density estimates, #' \emph{Lecture Notes in Computer Science} 7819, 2013, 160-172, #' \doi{10.1007/978-3-642-37456-2_14} #' #' M. Gagolewski, quitefastmst, in preparation, 2026, TODO #' #' #' @param d either a numeric matrix (or an object coercible to one, #' e.g., a data frame with numeric-like columns) or an #' object of class \code{dist}; see \code{\link[stats]{dist}} #' #' @param distance metric used in the case where \code{d} is a matrix; one of: #' \code{"euclidean"} (synonym: \code{"l2"}), #' \code{"manhattan"} (a.k.a. \code{"l1"} and \code{"cityblock"}), #' \code{"cosine"} #' #' @param M smoothing factor; \eqn{M=0} selects the requested #' \code{distance}; otherwise, the corresponding degree-\code{M} mutual #' reachability distance is used #' #' @param verbose logical; whether to print diagnostic messages #' and progress information #' #' @param ... further arguments passed to \code{\link[quitefastmst]{mst_euclid}} #' from \pkg{quitefastmst} #' #' #' @return #' Returns a numeric matrix of class \code{mst} with \eqn{n-1} rows and #' three columns: \code{from}, \code{to}, and \code{dist}. #' Its \eqn{i}-th row specifies the \eqn{i}-th edge of the MST #' which is incident to the vertices \code{from[i]} and \code{to[i]} with #' \code{from[i] < to[i]} (in \eqn{1,...,n}) and \code{dist[i]} gives #' the corresponding weight, i.e., the distance between the point pair. #' Edges are ordered increasingly with respect to their weights. #' #' The \code{Size} attribute specifies the number of points, \eqn{n}. #' The \code{Labels} attribute gives the labels of the input points, #' if available. #' The \code{method} attribute provides the name of the distance function used. #' #' If \eqn{M>0}, the \code{nn.index} attribute gives the indexes #' of the \code{M} nearest neighbours of each point #' and \code{nn.dist} provides the corresponding distances, #' both in the form of an \eqn{n} by \eqn{M} matrix. #' #' #' @examples #' library("datasets") #' data("iris") #' X <- jitter(as.matrix(iris[1:2])) # some data #' T <- mst(X) #' plot(X, asp=1, las=1) #' segments(X[T[, 1], 1], X[T[, 1], 2], #' X[T[, 2], 1], X[T[, 2], 2]) #' #' @rdname mst #' @export mst <- function(d, ...) { UseMethod("mst") } #' @export #' @rdname mst #' @method mst default mst.default <- function( d, M=0L, distance=c("euclidean", "l2", "manhattan", "cityblock", "l1", "cosine"), verbose=FALSE, ... ) { d <- as.matrix(d) M <- as.integer(M)[1] distance <- match.arg(distance) verbose <- !identical(verbose, FALSE) if (distance %in% c("euclidean", "l2")) { .res <- mst_euclid( d, M, ..., verbose=verbose ) result <- cbind(.res[["mst.index"]], .res[["mst.dist"]]) attr(result, "nn.index") <- .res[["nn.index"]] attr(result, "nn.dist") <- .res[["nn.dist"]] } else { result <- .oldmst.matrix(d, distance, M, verbose=verbose) } stopifnot(result[, 1] < result[, 2]) stopifnot(!is.unsorted(result[, 3])) structure( result, class="mst", Size=nrow(d), Labels=dimnames(d)[[1]], # dist() returns `Labels`, not `labels` method=if (M == 0L) distance else sprintf("mutual reachability distance (%s, M=%d)", distance, M) ) } #' @export #' @rdname mst #' @method mst dist mst.dist <- function( d, M=0L, verbose=FALSE, ... ) { #cast_float32 <- !identical(cast_float32, FALSE) verbose <- !identical(verbose, FALSE) M <- as.integer(M)[1] result <- .oldmst.dist(d, M, verbose) structure( result, class="mst", Size=nrow(result)+1, Labels=attr(d, "Labels"), # dist() returns `Labels`, not `labels` method=if (M == 0L) attr(d, "method") else sprintf("mutual reachability distance (%s, M=%d)", attr(d, "method"), M) ) } registerS3method("mst", "default", "mst.default") registerS3method("mst", "dist", "mst.dist") deadwood/NEWS0000644000176200001440000000014615145137373012551 0ustar liggesusers# Changelog ## 0.9.0 (2026-02-13) * [Python] Initial PyPI release. * [R] Initial CRAN release. deadwood/src/0000755000176200001440000000000015146132776012643 5ustar liggesusersdeadwood/src/RcppExports.cpp0000644000176200001440000000740315140721771015635 0ustar liggesusers// Generated by using Rcpp::compileAttributes() -> do not edit by hand // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 #include using namespace Rcpp; #ifdef RCPP_USE_GLOBAL_ROSTREAM Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); #endif // kneedle_increasing double kneedle_increasing(NumericVector x, bool convex, double dt); RcppExport SEXP _deadwood_kneedle_increasing(SEXP xSEXP, SEXP convexSEXP, SEXP dtSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type x(xSEXP); Rcpp::traits::input_parameter< bool >::type convex(convexSEXP); Rcpp::traits::input_parameter< double >::type dt(dtSEXP); rcpp_result_gen = Rcpp::wrap(kneedle_increasing(x, convex, dt)); return rcpp_result_gen; END_RCPP } // dot_deadwood LogicalVector dot_deadwood(NumericMatrix mst, NumericVector cut_edges, double max_contamination, double ema_dt, int max_debris_size, bool verbose); RcppExport SEXP _deadwood_dot_deadwood(SEXP mstSEXP, SEXP cut_edgesSEXP, SEXP max_contaminationSEXP, SEXP ema_dtSEXP, SEXP max_debris_sizeSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type mst(mstSEXP); Rcpp::traits::input_parameter< NumericVector >::type cut_edges(cut_edgesSEXP); Rcpp::traits::input_parameter< double >::type max_contamination(max_contaminationSEXP); Rcpp::traits::input_parameter< double >::type ema_dt(ema_dtSEXP); Rcpp::traits::input_parameter< int >::type max_debris_size(max_debris_sizeSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); rcpp_result_gen = Rcpp::wrap(dot_deadwood(mst, cut_edges, max_contamination, ema_dt, max_debris_size, verbose)); return rcpp_result_gen; END_RCPP } // dot_oldmst_matrix NumericMatrix dot_oldmst_matrix(NumericMatrix X, String distance, int M, bool cast_float32, bool verbose); RcppExport SEXP _deadwood_dot_oldmst_matrix(SEXP XSEXP, SEXP distanceSEXP, SEXP MSEXP, SEXP cast_float32SEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericMatrix >::type X(XSEXP); Rcpp::traits::input_parameter< String >::type distance(distanceSEXP); Rcpp::traits::input_parameter< int >::type M(MSEXP); Rcpp::traits::input_parameter< bool >::type cast_float32(cast_float32SEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); rcpp_result_gen = Rcpp::wrap(dot_oldmst_matrix(X, distance, M, cast_float32, verbose)); return rcpp_result_gen; END_RCPP } // dot_oldmst_dist NumericMatrix dot_oldmst_dist(NumericVector d, int M, bool verbose); RcppExport SEXP _deadwood_dot_oldmst_dist(SEXP dSEXP, SEXP MSEXP, SEXP verboseSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< NumericVector >::type d(dSEXP); Rcpp::traits::input_parameter< int >::type M(MSEXP); Rcpp::traits::input_parameter< bool >::type verbose(verboseSEXP); rcpp_result_gen = Rcpp::wrap(dot_oldmst_dist(d, M, verbose)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { {"_deadwood_kneedle_increasing", (DL_FUNC) &_deadwood_kneedle_increasing, 3}, {"_deadwood_dot_deadwood", (DL_FUNC) &_deadwood_dot_deadwood, 6}, {"_deadwood_dot_oldmst_matrix", (DL_FUNC) &_deadwood_dot_oldmst_matrix, 5}, {"_deadwood_dot_oldmst_dist", (DL_FUNC) &_deadwood_dot_oldmst_dist, 3}, {NULL, NULL, 0} }; RcppExport void R_init_deadwood(DllInfo *dll) { R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); } deadwood/src/c_common.h0000644000176200001440000000603215130422365014574 0ustar liggesusers/* Common functions, macros, includes * * Copyleft (C) 2018-2026, Marek Gagolewski * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License * Version 3, 19 November 2007, published by the Free Software Foundation. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License Version 3 for more details. * You should have received a copy of the License along with this program. * If this is not the case, refer to . */ #ifndef __c_common_h #define __c_common_h #ifdef DEADWOOD_PYTHON #undef DEADWOOD_PYTHON #define DEADWOOD_PYTHON 1 #endif #ifdef DEADWOOD_R #undef DEADWOOD_R #define DEADWOOD_R 1 #endif #include #include #include #include #ifndef DEADWOOD_ASSERT #define __DEADWOOD_STR(x) #x #define DEADWOOD_STR(x) __DEADWOOD_STR(x) #define DEADWOOD_ASSERT(EXPR) { if (!(EXPR)) \ throw std::runtime_error( "[deadwood] Assertion " #EXPR " failed in "\ __FILE__ ":" DEADWOOD_STR(__LINE__) ); } #endif #if DEADWOOD_R #include #else #include "Python.h" #include #endif #if DEADWOOD_R #define DEADWOOD_PRINT(...) REprintf(__VA_ARGS__); #else #define DEADWOOD_PRINT(...) fprintf(stderr, __VA_ARGS__); #endif #ifdef DEADWOOD_PROFILER #include #define DEADWOOD_PROFILER_START \ _deadwood_profiler_t0 = std::chrono::high_resolution_clock::now(); #define DEADWOOD_PROFILER_GETDIFF \ _deadwood_profiler_td = std::chrono::duration(std::chrono::high_resolution_clock::now()-_deadwood_profiler_t0); #define DEADWOOD_PROFILER_USE \ auto DEADWOOD_PROFILER_START \ auto DEADWOOD_PROFILER_GETDIFF \ char _deadwood_profiler_strbuf[256]; #define DEADWOOD_PROFILER_STOP(...) \ DEADWOOD_PROFILER_GETDIFF; \ snprintf(_deadwood_profiler_strbuf, sizeof(_deadwood_profiler_strbuf), __VA_ARGS__); \ DEADWOOD_PRINT("%-64s: time=%12.3lf s\n", _deadwood_profiler_strbuf, _deadwood_profiler_td.count()/1000.0); /* use like: DEADWOOD_PROFILER_USE DEADWOOD_PROFILER_START DEADWOOD_PROFILER_STOP("message %d", 7) */ #else #define DEADWOOD_PROFILER_START ; /* no-op */ #define DEADWOOD_PROFILER_STOP(...) ; /* no-op */ #define DEADWOOD_PROFILER_GETDIFF ; /* no-op */ #define DEADWOOD_PROFILER_USE ; /* no-op */ #endif #if DEADWOOD_R typedef ssize_t Py_ssize_t; #endif typedef double FLOAT_T; ///< float type we are working internally with #define IS_PLUS_INFINITY(x) ((x) > 0.0 && !std::isfinite(x)) #define IS_MINUS_INFINITY(x) ((x) < 0.0 && !std::isfinite(x)) #ifdef OPENMP_DISABLED #define OPENMP_IS_ENABLED 0 #ifdef _OPENMP #undef _OPENMP #endif #else #ifdef _OPENMP #include #define OPENMP_IS_ENABLED 1 #else #define OPENMP_IS_ENABLED 0 #endif #endif #endif deadwood/src/c_deadwood.h0000644000176200001440000006716015146132652015107 0ustar liggesusers/* Deadwood: Outlier Detection via Minimum Spanning Trees * * Copyleft (C) 2025-2026, Marek Gagolewski * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License * Version 3, 19 November 2007, published by the Free Software Foundation. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License Version 3 for more details. * You should have received a copy of the License along with this program. * If this is not the case, refer to . */ #ifndef __c_deadwood_h #define __c_deadwood_h #include "c_common.h" #include "c_kneedle.h" #include /** because std::vector has no data().... */ template class cvector { private: T* x; size_t n; public: cvector(size_t n) : x(nullptr), n(n) { DEADWOOD_ASSERT(n>0); x = new T[n]; DEADWOOD_ASSERT(x); } cvector(size_t n, T v) : x(nullptr), n(n) { DEADWOOD_ASSERT(n>0); x = new T[n]; DEADWOOD_ASSERT(x); for (size_t i=0; i= k is disallowed. * * @param x [in] array of size n * @param n * @param c [in] array of size n with elements in {...,0,1,..,k-1} * @param k * @param y [out] array of size n * @param ind [out] array of size k+1 */ template void Csort_groups( const FLOAT* x, Py_ssize_t n, const Py_ssize_t* c, Py_ssize_t k, FLOAT* y, Py_ssize_t* ind ) { for (Py_ssize_t j=0; j<=k; ++j) ind[j] = 0; for (Py_ssize_t i=0; i= 0 && u < n); DEADWOOD_ASSERT(v >= 0 && v < n); if (c[u] != c[v]) { s++; skip[i] = true; } else skip[i] = false; } return s; } /*! Decode indexes based on a skip array. * * If `skip=[False, True, False, False, True, False, False]`, * then the indexes in `ind` are mapped in such a way that: * 0 → 0, * 1 → 2, * 2 → 3, * 3 → 5, * 4 → 6. * * This function might be useful if we apply a method on `X[~skip,:]` * (a subset of rows in `X`), obtain a vector of indexes `ind` relative to * the indexes of rows in `X[~skip,:]` as a result, and wish to translate `ind` * back to the original row space of `X[:,:]`. * * For instance, `unskip_indexes([0, 2, 1], [True, False, True, False, False])` * yields `[1, 4, 3]`. * * @param ind [in/out] array of m indexes in 0..k-1 to translate * @param m size of ind * @param skip Boolean array of size n with k elements equal to False * @param n size of skip */ void Cunskip_indexes( Py_ssize_t* ind, Py_ssize_t m, const bool* skip, Py_ssize_t n ) { if (m <= 0) return; DEADWOOD_ASSERT(n > 0); cvector o(n); // actually, k needed Py_ssize_t k = 0; for (Py_ssize_t i=0; i= 0 && ind[i] < k) ind[i] = o[ind[i]]; } // std::vector o(m); // Cargsort(o.data(), ind, m, false); // // Py_ssize_t j = 0; // Py_ssize_t k = 0; // for (Py_ssize_t i=0; i 0); cvector o(n); Py_ssize_t k = 0; for (Py_ssize_t i=0; i= 0 && ind[i] < n) ind[i] = o[ind[i]]; } } /** Count the number of non-zero elements in a Boolean array x of length n */ Py_ssize_t Csum_bool(const bool* x, Py_ssize_t n) { Py_ssize_t s = 0; for (Py_ssize_t i=0; i= 0"); else if (u >= n || v >= n) throw std::domain_error("All elements must be < n"); else if (u == v) throw std::domain_error("Self-loops are not allowed"); deg[u]++; deg[v]++; } } /*! Compute the incidence list of each vertex in an undirected graph * over a vertex set {0,...,n-1}. * * @param ind c_contiguous matrix of size m*2, * where {ind[i,0], ind[i,1]} is the i-th edge with ind[i,j] < n * @param m number of edges (rows in ind) * @param n number of vertices * @param cumdeg [out] array of size n+1, where cumdeg[i+1] the sum of the first i vertex degrees * @param inc [out] array of size 2*m; inc[cumdeg[i]]..inc[cumdeg[i+1]-1] gives the edges incident on the i-th vertex */ void Cgraph_vertex_incidences( const Py_ssize_t* ind, const Py_ssize_t m, const Py_ssize_t n, Py_ssize_t* cumdeg, Py_ssize_t* inc ) { cumdeg[0] = 0; Cgraph_vertex_degrees(ind, m, n, cumdeg+1); Py_ssize_t cd = 0; for (Py_ssize_t i=1; i _cumdeg; // data buffer for cumdeg (optional) std::vector _inc; // data buffer for inc (optional) public: CMSTProcessorBase( const Py_ssize_t* mst_i, const Py_ssize_t m, const Py_ssize_t n, Py_ssize_t* c=nullptr, const Py_ssize_t* cumdeg=nullptr, const Py_ssize_t* inc=nullptr, const bool* skip_edges=nullptr ) : mst_i(mst_i), m(m), n(n), c(c), cumdeg(cumdeg), inc(inc), skip_edges(skip_edges) { if (!cumdeg) { DEADWOOD_ASSERT(!inc); _cumdeg.resize(n+1); _inc.resize(2*m); Cgraph_vertex_incidences(mst_i, m, n, _cumdeg.data(), _inc.data()); this->cumdeg = _cumdeg.data(); this->inc = _inc.data(); } else { DEADWOOD_ASSERT(inc); } } }; /* ************************************************************************** */ /** See Cmst_get_cluster_sizes below. */ class CMSTClusterSizeGetter : public CMSTProcessorBase { private: Py_ssize_t max_k; Py_ssize_t* s; // NULL or of size max_k >= k, where k is the number of clusters Py_ssize_t k; // the number of connected components identified Py_ssize_t visit(Py_ssize_t v, Py_ssize_t e) { Py_ssize_t w; if (e < 0) { w = v; } else if (skip_edges && skip_edges[e]) return 0; else { Py_ssize_t iv = (Py_ssize_t)(mst_i[2*e+1]==v); w = mst_i[2*e+(1-iv)]; } DEADWOOD_ASSERT(c[w] < 0); c[w] = k; Py_ssize_t curs = 1; for (const Py_ssize_t* pe = inc+cumdeg[w]; pe != inc+cumdeg[w+1]; pe++) { if (*pe != e) curs += visit(w, *pe); } return curs; } public: CMSTClusterSizeGetter( const Py_ssize_t* mst_i, Py_ssize_t m, Py_ssize_t n, Py_ssize_t* c, Py_ssize_t max_k, Py_ssize_t* s=nullptr, const Py_ssize_t* cumdeg=nullptr, const Py_ssize_t* inc=nullptr, const bool* skip_edges=nullptr ) : CMSTProcessorBase(mst_i, m, n, c, cumdeg, inc, skip_edges), max_k(max_k), s(s), k(-1) { DEADWOOD_ASSERT(this->c); DEADWOOD_ASSERT(this->cumdeg); DEADWOOD_ASSERT(this->inc); } Py_ssize_t process() { for (Py_ssize_t v=0; v= 0) continue; // already visited -> skip if (s) { DEADWOOD_ASSERT(k= k, where k is the number of connected * components in the forest; s[i] gives the size of the i-th cluster; * pass NULL to get only the cluster labels; * obviously, k<=n; e.g., if m==n-1, then k=sum(skip_edges)+1 * @param mst_cumdeg an array of length n+1 or NULL; see Cgraph_vertex_incidences * @param mst_inc an array of length 2*m or NULL; see Cgraph_vertex_incidences * @param mst_skip Boolean array of length m or NULL; indicates the edges to skip */ Py_ssize_t Cmst_cluster_sizes( const Py_ssize_t* mst_i, Py_ssize_t m, Py_ssize_t n, Py_ssize_t* c, Py_ssize_t max_k=0, Py_ssize_t* s=nullptr, const Py_ssize_t* mst_cumdeg=nullptr, const Py_ssize_t* mst_inc=nullptr, const bool* mst_skip=nullptr ) { CMSTClusterSizeGetter get(mst_i, m, n, c, max_k, s, mst_cumdeg, mst_inc, mst_skip); return get.process(); // modifies c in place } /* ************************************************************************** */ /** See Cmst_impute_missing_labels below. */ class CMSTMissingLabelsImputer : public CMSTProcessorBase { private: void visit(Py_ssize_t v, Py_ssize_t e) { if (skip_edges && skip_edges[e]) return; Py_ssize_t iv = (Py_ssize_t)(mst_i[2*e+1]==v); Py_ssize_t w = mst_i[2*e+(1-iv)]; DEADWOOD_ASSERT(c[v] >= 0); DEADWOOD_ASSERT(c[w] < 0); c[w] = c[v]; for (const Py_ssize_t* pe = inc+cumdeg[w]; pe != inc+cumdeg[w+1]; pe++) { if (*pe != e) visit(w, *pe); } } public: CMSTMissingLabelsImputer( const Py_ssize_t* mst_i, Py_ssize_t m, Py_ssize_t n, Py_ssize_t* c, const Py_ssize_t* cumdeg=nullptr, const Py_ssize_t* inc=nullptr, const bool* skip_edges=nullptr ) : CMSTProcessorBase(mst_i, m, n, c, cumdeg, inc, skip_edges) { DEADWOOD_ASSERT(this->c); DEADWOOD_ASSERT(this->cumdeg); DEADWOOD_ASSERT(this->inc); } void process() { for (Py_ssize_t v=0; v void Cget_contamination( const FLOAT* mst_d, Py_ssize_t m, FLOAT max_contamination, FLOAT ema_dt, FLOAT& contamination, Py_ssize_t& threshold_index ) { if (max_contamination <= 0.0) { contamination = -max_contamination; threshold_index = int(m*(1.0-contamination)); } else { Py_ssize_t shift = (int)(m*(1.0-max_contamination)); Py_ssize_t elbow_index = Ckneedle_increasing(mst_d+shift, m-shift, true, ema_dt); if (elbow_index == 0) { threshold_index = m; contamination = 0.0; } else { threshold_index = shift+elbow_index+1; contamination = (m-threshold_index)/(FLOAT)(m+1); } } } /*! The Deadwood outlier detection algorithm * * @param mst_d size m - edge weights * @param mst_i c_contiguous matrix of size m*2, * where {mst_i[k,0], mst_i[k,1]} specifies the k-th (undirected) edge * in the spanning tree * @param mst_cut array of size k-1; indexes of cut edges defining a spanning * forest with k connected components * @param m number of rows in mst_i (edges) * @param n length of c and the number of vertices in the spanning tree * @param k number of initial clusters * @param c [out] array of length n, c[i]==1 marks an outlier * and c[i]==0 denotes an inlier * @param max_debris_size connected components of size <= max_debris_size will * be treated as outliers * @param max_contamination maximal contamination level; * negative values will be used as actual contamination levels * @param ema_dt controls the exponential moving average smoothing parameter * alpha = 1-exp(-dt) (in elbow detection) * @param contamination [out] array of length k; * detected contamination levels in each cluster * @param mst_cumdeg an array of length n+1 or NULL; see Cgraph_vertex_incidences * @param mst_inc an array of length 2*m or NULL; see Cgraph_vertex_incidences */ template void Cdeadwood( const FLOAT* mst_d, // size m [in] const Py_ssize_t* mst_i, // size m [in] const Py_ssize_t* mst_cut, // size k-1 [in] Py_ssize_t m, Py_ssize_t n, Py_ssize_t k, FLOAT max_contamination, FLOAT ema_dt, Py_ssize_t max_debris_size, FLOAT* contamination, // size k [out] Py_ssize_t* c, // size n [out] const Py_ssize_t* mst_cumdeg=nullptr, const Py_ssize_t* mst_inc=nullptr ) { DEADWOOD_ASSERT(k >= 1 && k <= n); DEADWOOD_ASSERT(m == n-1); DEADWOOD_ASSERT(n > 1); cvector sizes(n); // upper bound for the number of clusters cvector mst_skip(m, false); // std::vector has no data() CMSTClusterSizeGetter size_getter(mst_i, m, n, c, n, sizes.data(), mst_cumdeg, mst_inc, mst_skip.data()); DEADWOOD_ASSERT(max_contamination >= -1.0 && max_contamination <= 1.0); if (k == 1) { Py_ssize_t threshold_index = -1; Cget_contamination( mst_d, m, max_contamination, ema_dt, /*out*/contamination[0], /*out*/threshold_index ); // DEADWOOD_PRINT("%d-%d\n", threshold_index, m); DEADWOOD_ASSERT(threshold_index >= 0); for (Py_ssize_t i=threshold_index; i= 0 && mst_cut[i] < m); DEADWOOD_ASSERT(!mst_skip[mst_cut[i]]); mst_skip[mst_cut[i]] = true; } Py_ssize_t _k = size_getter.process(); // sets c and sizes based on the current mst_skip DEADWOOD_ASSERT(_k == k); cvector edge_labels(m); for (Py_ssize_t i=0; i=0 && c[mst_i[2*i+0]] == c[mst_i[2*i+1]]) edge_labels[i] = c[mst_i[2*i+0]]; else edge_labels[i] = -1; } cvector mst_d_grp(n); cvector ind_grp(k+1); Csort_groups(mst_d, m, edge_labels.data(), k, mst_d_grp.data(), ind_grp.data()); cvector weight_thresholds(k); for (Py_ssize_t i=0; i=0); if (threshold_index < mi) weight_thresholds[i] = mst_d_grp[ind_grp[i]+threshold_index]; else weight_thresholds[i] = INFINITY; } for (Py_ssize_t i=0; i= 0 && mst_d[i] >= weight_thresholds[edge_labels[i]]) mst_skip[i] = true; } } size_getter.process(); // sets c and sizes based on the current mst_skip // DEADWOOD_PRINT("%d\n", _k); for (Py_ssize_t i=0; i= 0 && c[i] < n); DEADWOOD_ASSERT(sizes[c[i]] > 0); if (sizes[c[i]] <= max_debris_size) c[i] = 1; else c[i] = 0; } } #if 0 /* /remove deprecated Cmst_trim_branches */ /** See Cmst_trim_branches below. [DEPRECATED] */ template class CMSTBranchTrimmer : public CMSTProcessorBase { private: const FLOAT* mst_d; const FLOAT min_d; const Py_ssize_t max_size; std::vector size; Py_ssize_t clk; // the number of connected components std::vector clsize; Py_ssize_t visit_get_sizes(Py_ssize_t v, Py_ssize_t e) { if (skip_edges && skip_edges[e]) return 0; Py_ssize_t iv = (Py_ssize_t)(mst_i[2*e+1]==v); Py_ssize_t w = mst_i[2*e+(1-iv)]; DEADWOOD_ASSERT(e >= 0 && e < m); DEADWOOD_ASSERT(v >= 0 && v < n); DEADWOOD_ASSERT(w >= 0 && w < n); DEADWOOD_ASSERT(c[w] < 0); Py_ssize_t this_size = 1; c[w] = c[v]; for (const Py_ssize_t* pe = inc+cumdeg[w]; pe != inc+cumdeg[w+1]; pe++) { if (*pe != e) this_size += visit_get_sizes(w, *pe); } size[2*e + (1-iv)] = this_size; size[2*e + iv] = -1; return this_size; } void visit_mark(Py_ssize_t v, Py_ssize_t e) { if (skip_edges && skip_edges[e]) return; Py_ssize_t iv = (Py_ssize_t)(mst_i[2*e+1]==v); Py_ssize_t w = mst_i[2*e+(1-iv)]; if (c[w] < 0) return; // already visited c[w] = -1; for (const Py_ssize_t* pe = inc+cumdeg[w]; pe != inc+cumdeg[w+1]; pe++) { if (*pe != e) visit_mark(w, *pe); } } public: CMSTBranchTrimmer( const FLOAT* mst_d, FLOAT min_d, Py_ssize_t max_size, const Py_ssize_t* mst_i, Py_ssize_t m, Py_ssize_t n, Py_ssize_t* c, const Py_ssize_t* cumdeg=nullptr, const Py_ssize_t* inc=nullptr, const bool* skip_edges=nullptr ) : CMSTProcessorBase(mst_i, m, n, c, cumdeg, inc, skip_edges), mst_d(mst_d), min_d(min_d), max_size(max_size), size(2*m, -1) { DEADWOOD_ASSERT(this->c); DEADWOOD_ASSERT(this->cumdeg); DEADWOOD_ASSERT(this->inc); DEADWOOD_ASSERT(m == n-1); // the number of connected components: clk = 1; if (skip_edges) { for (Py_ssize_t e = 0; e < m; ++e) if (skip_edges[e]) clk++; } clsize.resize(clk); } void process() { for (Py_ssize_t v=0; v= 0) continue; c[v] = lastc; Py_ssize_t this_size = 1; for (const Py_ssize_t* pe = inc+cumdeg[v]; pe != inc+cumdeg[v+1]; pe++) { if (skip_edges && skip_edges[*pe]) continue; this_size += visit_get_sizes(v, *pe); } clsize[lastc] = this_size; lastc++; if (lastc == clk) break; } DEADWOOD_ASSERT(lastc == clk); DEADWOOD_ASSERT(clk > 1 || clsize[0] == n); for (Py_ssize_t e=0; e 0 || size[2*e+1] > 0); DEADWOOD_ASSERT(clsize[c[mst_i[2*e+0]]] == clsize[c[mst_i[2*e+1]]]); if (size[2*e+0] > 0) size[2*e+1] = clsize[c[mst_i[2*e+0]]] - size[2*e+0]; else size[2*e+0] = clsize[c[mst_i[2*e+1]]] - size[2*e+1]; } for (Py_ssize_t e=0; e=size[2*e+1])?0:1; Py_ssize_t v = mst_i[2*e+iv]; if (c[v] < 0) continue; if (size[2*e+(1-iv)] > max_size) continue; visit_mark(v, e); } } }; /*! Trim tree branches of size <= max_size connected by an edge > min_d [DEPRECATED] * * * @param mst_d m edge weights * @param mst_i c_contiguous matrix of size m*2, * where {mst_i[k,0], mst_i[k,1]} specifies the k-th (undirected) edge * in the spanning tree * @param m number of rows in mst_i (edges) * @param c [out] vector of length n; c[i] == -1 marks a trimmed-out point, * whereas c[i] >= 0 denotes a retained one * @param n length of c and the number of vertices in the spanning tree, n == m+1 * @param min_d minimal edge weight to be considered trimmable * @param max_size maximal allowable size of a branch to cut * @param mst_cumdeg an array of length n+1 or NULL; see Cgraph_vertex_incidences * @param mst_inc an array of length 2*m or NULL; see Cgraph_vertex_incidences * @param mst_skip Boolean array of length m or NULL; indicates the edges to skip */ template void Cmst_trim_branches( const FLOAT* mst_d, FLOAT min_d, Py_ssize_t max_size, const Py_ssize_t* mst_i, Py_ssize_t m, Py_ssize_t n, Py_ssize_t* c, const Py_ssize_t* mst_cumdeg=nullptr, const Py_ssize_t* mst_inc=nullptr, const bool* mst_skip=nullptr ) { CMSTBranchTrimmer tr(mst_d, min_d, max_size, mst_i, m, n, c, cumdeg, inc, skip_edges); tr.process(); // modifies c in place } #endif /* /remove deprecated Cmst_trim_branches */ /* ************************************************************************** */ #endif deadwood/src/c_argfuns.h0000644000176200001440000000661715140416307014762 0ustar liggesusers/* Helper functions * * Copyleft (C) 2025-2026, Marek Gagolewski * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License * Version 3, 19 November 2007, published by the Free Software Foundation. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License Version 3 for more details. * You should have received a copy of the License along with this program. * If this is not the case, refer to . */ #ifndef __c_argfuns_h #define __c_argfuns_h #include "c_common.h" #include #include /*! Comparer for argsort(). * * Ensures the resulting permutation is stable. */ template struct __argsort_comparer { const T* x; __argsort_comparer(const T* x) { this->x = x; } inline bool operator()(const Py_ssize_t i, const Py_ssize_t j) const { return this->x[i] < this->x[j] || (this->x[i] == this->x[j] && i < j); } }; /*! Finds an(*) ordering permutation w.r.t. `<`. * * Both ret and x should be of the same length n; * ret will be overwritten. * * (*) or THE stable one, if stable=true, which is the default. * * We call permutation o stable, whenever i < j and x[i]==x[j] * implies that o[i] < o[j]. * * @param ret return array * @param x array to order * @param n size of ret and x * @param stable use a stable sorting algorithm? (slower) */ template void Cargsort(Py_ssize_t* ret, const T* x, Py_ssize_t n, bool stable=true) { if (n <= 0) return; for (Py_ssize_t i=0; i(x)); else std::sort(ret, ret+n, __argsort_comparer(x)); } /*! Returns the index of the (k-1)-th smallest value in an array x. * * argkmin(x, 0) == argmin(x), or, more generally, * argkmin(x, k) == np.argsort(x)[k]. * * Run time: O(nk), where n == len(x). Working mem: O(k). * Does not modify x. * * Very fast for small k and randomly ordered * or almost sorted (increasingly) data. * * * If buf is not NULL, it must be of length at least k+1. * * @param x data * @param n length of x * @param k value in {0,...,n-1}, preferably small * @param buf optional working buffer of size >= k+1, will be overwritten */ template Py_ssize_t Cargkmin(const T* x, Py_ssize_t n, Py_ssize_t k, Py_ssize_t* buf=NULL) { Py_ssize_t* idx; if (n <= 0) throw std::domain_error("n <= 0"); if (k >= n) throw std::domain_error("k >= n"); k += 1; if (!buf) idx = new Py_ssize_t[k]; else idx = buf; for (Py_ssize_t i=0; i 0 && x[i] < x[idx[j-1]]) { idx[j] = idx[j-1]; j -= 1; } idx[j] = i; } for (Py_ssize_t i=k; i 0 && x[i] < x[idx[j-1]]) { idx[j] = idx[j-1]; j -= 1; } idx[j] = i; } Py_ssize_t ret = idx[k-1]; if (!buf) delete [] idx; return ret; } #endif deadwood/src/Makevars0000644000176200001440000000032515146112764014332 0ustar liggesusersPKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -DDEADWOOD_R -Isrc/ -I../src/ PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) SOURCES = \ RcppDeadwood.cpp \ RcppOldmst.cpp \ RcppExports.cpp OBJECTS = $(SOURCES:.cpp=.o) deadwood/src/c_kneedle.h0000644000176200001440000000712615145137373014730 0ustar liggesusers/* An implementation of the *Kneedle* algorithm to detect knee/elbow points, * with exponential moving average smoothing * * Based on V. Satopaa, J. Albrecht, D. Irwin, B. Raghavan * Finding a “Kneedle” in a haystack: Detecting knee points in system behavior, * 31st Intl. Conf. Distributed Computing Systems Workshops, 2011, pp. 166-171, * DOI: 10.1109/ICDCSW.2011.20 * * * Copyleft (C) 2018-2026, Marek Gagolewski * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License * Version 3, 19 November 2007, published by the Free Software Foundation. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License Version 3 for more details. * You should have received a copy of the License along with this program. * If this is not the case, refer to . */ #ifndef __c_kneedle_h #define __c_kneedle_h #include "c_common.h" #include #include /** * Exponential moving average with * smoothing parameter alpha = 1-exp(-dt) * * y[0] = x[0], * y[i] = alpha*x[i]+(1-alpha)*x[i-1] * * @param x [in] input array of length n * @param y [out] output array of length n * @param n length of x and y * @param dt controls the smoothing parameter alpha = 1-exp(-dt) */ template void Cema(const FLOAT* x, Py_ssize_t n, FLOAT dt, FLOAT* y) { FLOAT alpha = -std::expm1(-dt); // 1-np.exp(-dt) FLOAT alpham1 = 1.0-alpha; y[0] = x[0]; for (Py_ssize_t i=1; i Py_ssize_t Ckneedle_increasing(const FLOAT* x, Py_ssize_t n, bool convex, FLOAT dt) { if (n < 1) return 0; std::vector _y(n); FLOAT* y = _y.data(); Cema(x, n, dt, y); // sets y // normalise to [0,1], subtract i/(n-1) FLOAT miny = y[0], maxy = y[0]; for (Py_ssize_t i=1; i y[i]) miny = y[i]; else if (maxy < y[i]) maxy = y[i]; } FLOAT rngy = maxy-miny; for (Py_ssize_t i=0; i y[i] && y[i] < y[i+1]) { if (y[i] >= peak_y) { peak_y = y[i]; peak_i = i; } } } } else { for (Py_ssize_t i=1; i y[i+1]) { if (y[i] >= peak_y) { peak_y = y[i]; peak_i = i; } } } } return peak_i; } #endif deadwood/src/RcppDeadwood.cpp0000644000176200001440000000721515145137373015724 0ustar liggesusers/* deadwood R interface * * Copyleft (C) 2025-2026, Marek Gagolewski * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License * Version 3, 19 November 2007, published by the Free Software Foundation. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License Version 3 for more details. * You should have received a copy of the License along with this program. * If this is not the case, refer to . */ #include "c_common.h" #include "c_kneedle.h" #include "c_deadwood.h" #include using namespace Rcpp; // TODO: sort_groups // TODO: mst_cluster_sizes // TODO: get_skip_edges // TODO: unskip_indexes // TODO: skip_indexes // TODO: graph_vertex_degrees // TODO: graph_vertex_incidences // TODO: mst_label_imputer //' @title Knee/Elbow Point Detection //' //' @description //' Finds the most significant knee/elbow using the Kneedle algorithm //' with exponential smoothing. //' //' @param x data vector (increasing) //' //' @param convex whether the data in \code{x} are convex-ish (elbow detection) //' or not (knee lookup) //' //' @param dt controls the smoothing parameter \eqn{\alpha = 1-\exp(-dt)} //' of the exponential moving average, //' \eqn{y_i = \alpha x_i + (1-\alpha) x_{i-1}}, \eqn{y_1 = x_1} //' //' //' @return //' Returns the index of the knee/elbow point; 1 if not found. //' //' @references //' V. Satopaa, J. Albrecht, D. Irwin, B. Raghavan, //' Finding a "Kneedle" in a haystack: Detecting knee points in system behavior, //' In: 31st Intl. Conf. Distributed Computing Systems Workshops, //' 2011, 166-171, \doi{10.1109/ICDCSW.2011.20} //' //' @name kneedle //' @rdname kneedle //' @export // [[Rcpp::export]] double kneedle_increasing(NumericVector x, bool convex=true, double dt=0.01) { Py_ssize_t n = (Py_ssize_t)x.size(); return Ckneedle_increasing(REAL(x), n, convex, dt)+1.0; } // [[Rcpp::export(".deadwood")]] LogicalVector dot_deadwood( NumericMatrix mst, NumericVector cut_edges, double max_contamination, double ema_dt, int max_debris_size, bool verbose ) { if (verbose) DEADWOOD_PRINT("[deadwood] Determining clusters.\n"); Py_ssize_t n = mst.nrow()+1; std::vector mst_i((n-1)*2); std::vector mst_d(n-1); for (Py_ssize_t i=0; i mst_cut(k-1); for (Py_ssize_t i=0; i= 0 && mst_cut[i] < n-1); } std::vector contamination(k); std::vector is_outlier(n); Cdeadwood( mst_d.data(), mst_i.data(), mst_cut.data(), n-1, n, k, max_contamination, ema_dt, max_debris_size, contamination.data(), is_outlier.data(), NULL, NULL ); LogicalVector res(n); for (Py_ssize_t i=0; i * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License * Version 3, 19 November 2007, published by the Free Software Foundation. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License Version 3 for more details. * You should have received a copy of the License along with this program. * If this is not the case, refer to . */ #include "c_common.h" #include "c_oldmst.h" #include using namespace Rcpp; /** * Represents a matrix as a C-contiguous array, * i.e., in a row-major order. */ template class CMatrix { private: size_t n, d; std::vector elems; public: /** Initialises a new matrix of size _nrow*_ncol, filled with 0s * * @param _nrow * @param _ncol */ CMatrix(size_t _nrow, size_t _ncol) : n(_nrow), d(_ncol), elems(_nrow*_ncol) { ; } /** Initialises a new matrix of size _nrow*_ncol, filled with _ts * * @param _nrow * @param _ncol * @param _t */ CMatrix(size_t _nrow, size_t _ncol, T _t) : n(_nrow), d(_ncol), elems(_nrow*_ncol, _t) { ; } /** Initialises a new matrix of size _nrow*_ncol based on a contiguous * C- or Fortran-style array * * @param _data * @param _nrow * @param _ncol * @param _c_order whether the first _ncol elements in _data constitute the first row * or the first _nrow elements define the first column */ template CMatrix(const S* _data, size_t _nrow, size_t _ncol, bool _c_order) : n(_nrow), d(_ncol), elems(_nrow*_ncol) { if (_c_order) { for (size_t i=0; i<_nrow*_ncol; ++i) elems[i] = (T)(_data[i]); } else { size_t k = 0; for (size_t i=0; i<_nrow; i++) { for (size_t j=0; j<_ncol; j++) { elems[k++] = (T)_data[i+_nrow*j]; } } } } /** Read/write access to an element in the i-th row and the j-th column * * @param i * @param j * @return a reference to the indicated matrix element */ inline T& operator()(const size_t i, const size_t j) { return elems[d*i + j]; } inline const T& operator()(const size_t i, const size_t j) const { return elems[d*i + j]; } /** Returns a direct pointer to the underlying C-contiguous data array: * the first ncol elements give the 1st row, * the next ncol element give the 2nd row, * and so forth. * * @return pointer */ T* data() { return elems.data(); } const T* data() const { return elems.data(); } /** Returns a direct pointer to the start of the i-th row * * @param i * @return pointer */ T* row(const size_t i) { return elems.data()+i*d; } const T* row(const size_t i) const { return elems.data()+i*d; } /** Returns the number of rows * * @return */ size_t nrow() const { return n; } /** Returns the number of columns * * @return */ size_t ncol() const { return d; } }; template NumericMatrix internal_oldmst_compute( CDistance* D, Py_ssize_t n, Py_ssize_t M, bool verbose ) { NumericMatrix ret(n-1, 3); CDistance* D2 = NULL; if (M >= 1) { // TODO we need it for M==1 as well, but this data can be read from the // MST data below! if (verbose) DEADWOOD_PRINT("[deadwood] Determining the core distance.\n"); CMatrix nn_i(n, M); CMatrix nn_d(n, M); Cknn_from_complete(D, n, M, nn_d.data(), nn_i.data()); IntegerMatrix out_nn_ind(n, M); NumericMatrix out_nn_dist(n, M); std::vector d_core(n); for (Py_ssize_t i=0; i(d_core.data(), n, D); } CMatrix mst_i(n-1, 2); std::vector mst_d(n-1); if (verbose) DEADWOOD_PRINT("[deadwood] Computing the MST.\n"); Cmst_from_complete(D2?D2:D, n, mst_d.data(), mst_i.data(), verbose); if (verbose) DEADWOOD_PRINT("[deadwood] Done.\n"); if (D2) delete D2; for (Py_ssize_t i=0; i NumericMatrix internal_oldmst_matrix( NumericMatrix X, String distance, Py_ssize_t M, /*bool use_mlpack, */ bool verbose ) { Py_ssize_t n = X.nrow(); Py_ssize_t d = X.ncol(); NumericMatrix ret; if (M < 0 || M >= n) stop("`M` must be an integer in [0, n-1]"); CMatrix X2(REAL(SEXP(X)), n, d, false); // Fortran- to C-contiguous T* _X2 = X2.data(); for (Py_ssize_t i=0; i* D = NULL; if (distance == "euclidean" || distance == "l2") D = (CDistance*)(new CDistanceEuclideanSquared(X2.data(), n, d)); else if (distance == "manhattan" || distance == "cityblock" || distance == "l1") D = (CDistance*)(new CDistanceManhattan(X2.data(), n, d)); else if (distance == "cosine") D = (CDistance*)(new CDistanceCosine(X2.data(), n, d)); else stop("given `distance` is not supported (yet)"); ret = internal_oldmst_compute(D, n, M, verbose); delete D; if (distance == "euclidean" || distance == "l2") { for (Py_ssize_t i=0; i 0) { Rcpp::NumericMatrix out_nn_dist = ret.attr("nn.dist"); for (Py_ssize_t i=0; i(X, distance, M, verbose); else return internal_oldmst_matrix(X, distance, M, verbose); } // [[Rcpp::export(".oldmst.dist")]] NumericMatrix dot_oldmst_dist( NumericVector d, int M=0, bool verbose=false ) { Py_ssize_t n = (Py_ssize_t)round((sqrt(1.0+8.0*d.size())+1.0)/2.0); DEADWOOD_ASSERT(n*(n-1)/2 == d.size()); CDistancePrecomputedVector D(REAL(SEXP(d)), n); return internal_oldmst_compute(&D, n, M, verbose); } deadwood/src/c_oldmst.h0000644000176200001440000006154215146111176014617 0ustar liggesusers/* Jarník (Prim)'s MST algorithm for complete undirected graphs * (the "old"/generic interface; see quitefastmst for a faster * implementation). * * Copyleft (C) 2018-2026, Marek Gagolewski * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License * Version 3, 19 November 2007, published by the Free Software Foundation. * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License Version 3 for more details. * You should have received a copy of the License along with this program. * If this is not the case, refer to . */ #ifndef __c_oldmst_h #define __c_oldmst_h #include "c_common.h" #include #include #include /*! Abstract base class for all distances */ template struct CDistance { virtual ~CDistance() {} /*! * @param i point index, 0 <= i < n * @param M indexes * @param k length of M * @return distances from the i-th point to M[0], .., M[k-1], * with ret[M[j]]=d(i, M[j]); * the user does not own ret; * the function is not thread-safe */ virtual const T* operator()(Py_ssize_t i, const Py_ssize_t* M, Py_ssize_t k) = 0; }; /*! A class to "compute" the distances from the i-th point * to all n points based on a pre-computed n*n symmetric, * complete pairwise distance c_contiguous matrix. */ template struct CDistancePrecomputedMatrix : public CDistance { const T* dist; Py_ssize_t n; /*! * @param dist n*n c_contiguous array, dist[i,j] is the distance between * the i-th and the j-th point, the matrix is symmetric * @param n number of points */ CDistancePrecomputedMatrix(const T* dist, Py_ssize_t n) { this->n = n; this->dist = dist; } CDistancePrecomputedMatrix() : CDistancePrecomputedMatrix(NULL, 0) { } virtual const T* operator()(Py_ssize_t i, const Py_ssize_t* /*M*/, Py_ssize_t /*k*/) { return &this->dist[i*n]; // the i-th row of dist } }; /*! A class to "compute" the distances from the i-th point * to all n points based on a pre-computed * c_contiguous distance vector. */ template struct CDistancePrecomputedVector : public CDistance { const T* dist; Py_ssize_t n; std::vector buf; /*! * @param dist n*(n-1)/2 c_contiguous vector, dist[ i*n - i*(i+1)/2 + w-i-1 ] * where w is the distance between the i-th and the w-th point * @param n number of points */ CDistancePrecomputedVector(const T* dist, Py_ssize_t n) : buf(n) { this->n = n; this->dist = dist; } CDistancePrecomputedVector() : CDistancePrecomputedVector(NULL, 0) { } virtual const T* operator()(Py_ssize_t i, const Py_ssize_t* M, Py_ssize_t k) { T* __buf = buf.data(); for (Py_ssize_t j=0; j struct CDistanceEuclidean : public CDistance { const T* X; Py_ssize_t n; Py_ssize_t d; std::vector buf; /*! * @param X n*d c_contiguous array * @param n number of points * @param d dimensionality */ CDistanceEuclidean(const T* X, Py_ssize_t n, Py_ssize_t d) : buf(n) { this->n = n; this->d = d; this->X = X; } CDistanceEuclidean() : CDistanceEuclidean(NULL, 0, 0) { } virtual const T* operator()(Py_ssize_t i, const Py_ssize_t* M, Py_ssize_t k) { T* __buf = buf.data(); const T* x = X+d*i; #if OPENMP_IS_ENABLED #pragma omp parallel for schedule(static) #endif for (Py_ssize_t j=0; j struct CDistanceEuclideanSquared : public CDistance { const T* X; Py_ssize_t n; Py_ssize_t d; std::vector buf; // std::vector x2; /*! * @param X n*d c_contiguous array * @param n number of points * @param d dimensionality */ CDistanceEuclideanSquared(const T* X, Py_ssize_t n, Py_ssize_t d) : buf(n) /*, x2(n, 0.0)*/ { this->n = n; this->d = d; this->X = X; // T* _x2 = x2.data(); // #if OPENMP_IS_ENABLED // #pragma omp parallel for schedule(static) // #endif // for (Py_ssize_t i=0; i struct CDistanceManhattan : public CDistance { const T* X; Py_ssize_t n; Py_ssize_t d; std::vector buf; /*! * @param X n*d c_contiguous array * @param n number of points * @param d dimensionality */ CDistanceManhattan(const T* X, Py_ssize_t n, Py_ssize_t d) : buf(n) { this->n = n; this->d = d; this->X = X; } CDistanceManhattan() : CDistanceManhattan(NULL, 0, 0) { } virtual const T* operator()(Py_ssize_t i, const Py_ssize_t* M, Py_ssize_t k) { T* __buf = buf.data(); #if OPENMP_IS_ENABLED #pragma omp parallel for schedule(static) #endif for (Py_ssize_t j=0; j=0 && w struct CDistanceCosine : public CDistance { const T* X; Py_ssize_t n; Py_ssize_t d; std::vector buf; std::vector norm; /*! * @param X n*d c_contiguous array * @param n number of points * @param d dimensionality */ CDistanceCosine(const T* X, Py_ssize_t n, Py_ssize_t d) : buf(n), norm(n) { this->n = n; this->d = d; this->X = X; T* __norm = norm.data(); #if OPENMP_IS_ENABLED #pragma omp parallel for schedule(static) #endif for (Py_ssize_t i=0; i=0&&w struct CDistanceMutualReachability : public CDistance { Py_ssize_t n; CDistance* d_pairwise; std::vector buf; std::vector d_core; CDistanceMutualReachability(const T* _d_core, Py_ssize_t n, CDistance* d_pairwise) : buf(n), d_core(_d_core, _d_core+n) { this->n = n; this->d_pairwise = d_pairwise; } CDistanceMutualReachability() : CDistanceMutualReachability(NULL, 0, NULL) { } virtual const T* operator()(Py_ssize_t i, const Py_ssize_t* M, Py_ssize_t k) { // pragma omp parallel for inside:: const T* d = (*d_pairwise)(i, M, k); T* __buf = buf.data(); #if OPENMP_IS_ENABLED #pragma omp parallel for schedule(static) #endif for (Py_ssize_t j=0; j __buf[w]) __buf[w] = d_core[i]; // if (d_core[w] > __buf[w]) __buf[w] = d_core[w]; T d_core_max; // T d_core_min; if (d_core[i] >= d_core[w]) { d_core_max = d_core[i]; // d_core_min = d_core[w]; } else { d_core_max = d_core[w]; // d_core_min = d_core[i]; } if (d[w] > d_core_max) { __buf[w] = d[w]; } else { __buf[w] = d_core_max; //+d_core_min*0.00000011920928955078125; // make it unambiguous: // pulled-away from each other, but ordered w.r.t. the core distances (increasingly) } } } return __buf; } }; /*! Represents an edge in a weighted graph. * Features a comparer used to sort the edges w.r.t. increasing weights; * more precisely, lexicographically w.r.t. (d, i1, d2). */ template struct CMstTriple { Py_ssize_t i1; //!< first vertex defining an edge Py_ssize_t i2; //!< second vertex defining an edge T d; //!< edge weight CMstTriple() {} CMstTriple(Py_ssize_t i1, Py_ssize_t i2, T d, bool order=true) { DEADWOOD_ASSERT(i1 != i2); DEADWOOD_ASSERT(i1 >= 0); DEADWOOD_ASSERT(i2 >= 0); this->d = d; if (!order || (i1 < i2)) { this->i1 = i1; this->i2 = i2; } else { this->i1 = i2; this->i2 = i1; } } bool operator<(const CMstTriple& other) const { if (d == other.d) { if (i1 == other.i1) return i2 < other.i2; else return i1 < other.i1; } else return d < other.d; } }; /*! A Jarník (Prim/Dijkstra)-like algorithm for determining * a(*) minimum spanning tree (MST) of a complete undirected graph * with weights given by, e.g., a symmetric n*n matrix. * * However, the distances can be computed on the fly, so that O(n) memory is used. * * (*) Note that there might be multiple minimum trees spanning a given graph. * * * References: * ---------- * * V. Jarník, O jistem problemu minimalnim (z dopisu panu O. Borůvkovi), * Prace Moravske Prirodovedecke Spolecnosti 6, 1930, 57-63 * * C.F. Olson, Parallel algorithms for hierarchical clustering, * Parallel Computing 21(8), 1995, 1313-1325 * * R. Prim, Shortest connection networks and some generalisations, * The Bell System Technical Journal 36(6), 1957, 1389-1401 * * * @param D a CDistance object such that a call to * D(j, M, Py_ssize_t k) returns a length-n array * with the distances from the j-th point to k points whose indexes * are given in array M * @param n number of points * @param mst_d [out] vector of length n-1, gives weights of the * resulting MST edges in nondecreasing order * @param mst_i [out] vector of length 2*(n-1), representing * a c_contiguous array of shape (n-1,2), defining the edges * corresponding to mst_d, with mst_i[j,0] < mst_i[j,1] for all j * @param verbose output diagnostic/progress messages? */ template void Cmst_from_complete(CDistance* D, Py_ssize_t n, T* mst_dist, Py_ssize_t* mst_ind, bool verbose=false) { if (verbose) DEADWOOD_PRINT("[deadwood] Computing the MST... %3d%%", 0); // NOTE: all changes should also be mirrored in Cmst_euclid_brute() // ind_nn[j] is the vertex from the current tree closest to vertex j std::vector ind_nn(n); std::vector dist_nn(n, INFINITY); // dist_nn[j] = d(j, ind_nn[j]) std::vector ind_left(n); for (Py_ssize_t i=0; i > mst(n-1); Py_ssize_t ind_cur = 0; // start with the first vertex (because we can start with any) for (Py_ssize_t i=1; i(best_ind_left, ind_nn[best_ind_left], dist_nn[best_ind_left], /*order=*/true); // don't visit best_ind_left again #if 0 std::swap(ind_left[best_ind_left_pos], ind_left[i]); #else // keep ind_left sorted (a bit better locality of reference) (#62) for (Py_ssize_t j=best_ind_left_pos; j>i; --j) ind_left[j] = ind_left[j-1]; ind_left[i] = best_ind_left; // for readability only #endif ind_cur = best_ind_left; // start from best_ind_left next time if (verbose) DEADWOOD_PRINT("\b\b\b\b%3d%%", (int)((n-1+n-i-1)*(i+1)*100/n/(n-1))); #if DEADWOOD_R Rcpp::checkUserInterrupt(); #elif DEADWOOD_PYTHON if (PyErr_CheckSignals() != 0) throw std::runtime_error("signal caught"); #endif } // sort the resulting MST edges in increasing order w.r.t. d std::sort(mst.begin(), mst.end()); for (Py_ssize_t i=0; iD(j, M, Py_ssize_t l) returns an n-ary array * with the distances from the j-th point to l points whose indexes * are given in array M * @param n number of points * @param k number of nearest neighbours, * @param dist [out] a c_contiguous array, shape (n,k), * dist[i,j] gives the weight of the (undirected) edge {i, ind[i,j]} * @param ind [out] a c_contiguous array, shape (n,k), * (undirected) edge definition, interpreted as {i, ind[i,j]} * @param verbose output diagnostic/progress messages? */ template void Cknn_from_complete(CDistance* D, Py_ssize_t n, Py_ssize_t k, T* dist, Py_ssize_t* ind, bool verbose=false) { if (n <= 0) throw std::domain_error("n <= 0"); if (k <= 0) throw std::domain_error("k <= 0"); if (k >= n) throw std::domain_error("k >= n"); if (verbose) DEADWOOD_PRINT("[deadwood] Computing the K-nn graph... %3d%%", 0); for (Py_ssize_t i=0; i M(n); for (Py_ssize_t i=0; i 0 && dij[j] < dist[i*k+l-1]) { dist[i*k+l] = dist[i*k+l-1]; ind[i*k+l] = ind[i*k+l-1]; l -= 1; } dist[i*k+l] = dij[j]; ind[i*k+l] = j; } } #if OPENMP_IS_ENABLED #pragma omp parallel for schedule(static) #endif for (Py_ssize_t j=i+1; j 0 && dij[j] < dist[j*k+l-1]) { dist[j*k+l] = dist[j*k+l-1]; ind[j*k+l] = ind[j*k+l-1]; l -= 1; } dist[j*k+l] = dij[j]; ind[j*k+l] = i; } } if (verbose) DEADWOOD_PRINT("\b\b\b\b%3d%%", (int)((n-1+n-i-1)*(i+1)*100/n/(n-1))); #if DEADWOOD_R Rcpp::checkUserInterrupt(); #elif DEADWOOD_PYTHON if (PyErr_CheckSignals() != 0) throw std::runtime_error("signal caught"); #endif } if (verbose) DEADWOOD_PRINT("\b\b\b\bdone.\n"); } #if 0 /* Cmst_from_nn DEPRECATED */ #include "c_disjoint_sets.h" /*! Computes a minimum spanning forest of a (<=k)-nearest neighbour graph * (i.e., one that connects no more than the first k nearest neighbours * (of each point) using Kruskal's algorithm, and orders * its edges w.r.t. increasing weights. [DEPRECATED] * * Note that, in general, an MST of the (<=k)-nearest neighbour graph * might not be equal to the MST of the complete Pairwise Distances Graph. * * It is assumed that each query point is not its own neighbour. * * @param dist a c_contiguous array, shape (n,k), * dist[i,j] gives the weight of the (undirected) edge {i, ind[i,j]} * @param ind a c_contiguous array, shape (n,k), * (undirected) edge definition, interpreted as {i, ind[i,j]}; * negative indexes as well as those such that ind[i,j]==i are ignored * @param d_core "core" distance (or NULL); * if not NULL then the distance between two points will be * d'(i, ind[i,j]) = max(d(i, ind[i,j]), d_core[i], d_core[ind[i,j]]) * @param n number of nodes * @param k minimal degree of all the nodes * @param mst_dist [out] c_contiguous vector of length n-1, gives weights of the * resulting MST edges in nondecreasing order; * refer to the function's return value for the actual number * of edges generated (if this is < n-1, the object is padded with INFINITY) * @param mst_ind [out] c_contiguous matrix of size (n-1)*2, defining the edges * corresponding to mst_d, with mst_i[j,0] <= mst_i[j,1] for all j; * refer to the function's return value for the actual number * of edges generated (if this is < n-1, the object is padded with -1) * @param maybe_inexact [out] true indicates that k should be increased to * guarantee that the resulting tree would be the same if a complete * pairwise distance graph was given. * @param verbose output diagnostic/progress messages? * * @return number of edges in the minimal spanning forest */ template Py_ssize_t Cmst_from_nn( const T* dist, const Py_ssize_t* ind, const T* d_core, Py_ssize_t n, Py_ssize_t k, T* mst_dist, Py_ssize_t* mst_ind, int* maybe_inexact, bool verbose=false) { if (n <= 0) throw std::domain_error("n <= 0"); if (k <= 0) throw std::domain_error("k <= 0"); if (k >= n) throw std::domain_error("k >= n"); Py_ssize_t nk = n*k; if (verbose) DEADWOOD_PRINT("[deadwood] Computing the MST... %3d%%", 0); std::vector< CMstTriple > nns(nk); Py_ssize_t c = 0; for (Py_ssize_t i = 0; i < n; ++i) { for (Py_ssize_t j = 0; j < k; ++j) { Py_ssize_t i2 = ind[k*i+j]; if (i2 >= 0 && i2 != i) { double d = dist[k*i+j]; if (d_core) { // d(i, i2) = max(d(i,i2), d_core[i], d_core[i2]) if (d < d_core[i]) d = d_core[i]; if (d < d_core[i2]) d = d_core[i2]; } nns[c++] = CMstTriple(i, i2, d, true); } } } std::sort(nns.data(), nns.data()+c); Py_ssize_t triple_cur = 0; Py_ssize_t mst_edge_cur = 0; CDisjointSets ds(n); while (mst_edge_cur < n-1) { if (triple_cur == c) { // The input graph is not connected (we have a forest) Py_ssize_t ret = mst_edge_cur; while (mst_edge_cur < n-1) { mst_ind[2*mst_edge_cur+0] = -1; mst_ind[2*mst_edge_cur+1] = -1; mst_dist[mst_edge_cur] = INFINITY; mst_edge_cur++; } if (verbose) DEADWOOD_PRINT("\b\b\b\b%3d%%", (int)(mst_edge_cur*100/(n-1))); return ret; } Py_ssize_t u = nns[triple_cur].i1; Py_ssize_t v = nns[triple_cur].i2; T d = nns[triple_cur].d; triple_cur++; if (ds.find(u) == ds.find(v)) continue; mst_ind[2*mst_edge_cur+0] = u; mst_ind[2*mst_edge_cur+1] = v; mst_dist[mst_edge_cur] = d; DEADWOOD_ASSERT(mst_edge_cur == 0 || mst_dist[mst_edge_cur] >= mst_dist[mst_edge_cur-1]); ds.merge(u, v); mst_edge_cur++; if (verbose) DEADWOOD_PRINT("\b\b\b\b%3d%%", (int)(mst_edge_cur*100/(n-1))); #if DEADWOOD_R Rcpp::checkUserInterrupt(); #elif DEADWOOD_PYTHON if (PyErr_CheckSignals() != 0) throw std::runtime_error("signal caught"); #endif } *maybe_inexact = 0; // TODO !!!! if (verbose) DEADWOOD_PRINT("\b\b\b\bdone.\n"); return mst_edge_cur; } #endif /* /Cmst_from_nn DEPRECATED */ #endif deadwood/NAMESPACE0000644000176200001440000000057615140722003013261 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method(deadwood,default) S3method(deadwood,dist) S3method(deadwood,mst) S3method(deadwood,mstclust) S3method(mst,default) S3method(mst,dist) export(deadwood) export(kneedle_increasing) export(mst) importFrom(Rcpp,evalCpp) importFrom(quitefastmst,knn_euclid) importFrom(quitefastmst,mst_euclid) useDynLib(deadwood, .registration=TRUE) deadwood/man/0000755000176200001440000000000015145137373012624 5ustar liggesusersdeadwood/man/mst.Rd0000644000176200001440000001163415145137373013723 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mst.R \name{mst} \alias{mst} \alias{mst.default} \alias{mst.dist} \title{Euclidean and Mutual Reachability Minimum Spanning Trees} \usage{ mst(d, ...) \method{mst}{default}( d, M = 0L, distance = c("euclidean", "l2", "manhattan", "cityblock", "l1", "cosine"), verbose = FALSE, ... ) \method{mst}{dist}(d, M = 0L, verbose = FALSE, ...) } \arguments{ \item{d}{either a numeric matrix (or an object coercible to one, e.g., a data frame with numeric-like columns) or an object of class \code{dist}; see \code{\link[stats]{dist}}} \item{...}{further arguments passed to \code{\link[quitefastmst]{mst_euclid}} from \pkg{quitefastmst}} \item{M}{smoothing factor; \eqn{M=0} selects the requested \code{distance}; otherwise, the corresponding degree-\code{M} mutual reachability distance is used} \item{distance}{metric used in the case where \code{d} is a matrix; one of: \code{"euclidean"} (synonym: \code{"l2"}), \code{"manhattan"} (a.k.a. \code{"l1"} and \code{"cityblock"}), \code{"cosine"}} \item{verbose}{logical; whether to print diagnostic messages and progress information} } \value{ Returns a numeric matrix of class \code{mst} with \eqn{n-1} rows and three columns: \code{from}, \code{to}, and \code{dist}. Its \eqn{i}-th row specifies the \eqn{i}-th edge of the MST which is incident to the vertices \code{from[i]} and \code{to[i]} with \code{from[i] < to[i]} (in \eqn{1,...,n}) and \code{dist[i]} gives the corresponding weight, i.e., the distance between the point pair. Edges are ordered increasingly with respect to their weights. The \code{Size} attribute specifies the number of points, \eqn{n}. The \code{Labels} attribute gives the labels of the input points, if available. The \code{method} attribute provides the name of the distance function used. If \eqn{M>0}, the \code{nn.index} attribute gives the indexes of the \code{M} nearest neighbours of each point and \code{nn.dist} provides the corresponding distances, both in the form of an \eqn{n} by \eqn{M} matrix. } \description{ A Euclidean minimum spanning tree (MST) provides a computationally convenient representation of a dataset: the \eqn{n} points are connected via \eqn{n-1} shortest segments. Provided that the dataset has been appropriately preprocessed so that the distances between the points are informative, an MST can be applied in outlier detection, clustering, density estimation, dimensionality reduction, and many other topological data analysis tasks. } \details{ If \code{d} is a matrix and the Euclidean distance is requested (the default), then the MST is computed via a call to \code{\link[quitefastmst]{mst_euclid}} from \pkg{quitefastmst}. It is efficient in low-dimensional spaces. Otherwise, a general-purpose implementation of the Jarník (Prim/Dijkstra)-like \eqn{O(n^2)}-time algorithm is called. If \eqn{M>0}, then the minimum spanning tree is computed with respect to a mutual reachability distance (Campello et al., 2013): \eqn{d_M(i,j)=\max(d(i,j), c_M(i), c_M(j))}, where \eqn{d(i,j)} is an ordinary distance and \eqn{c_M(i)} is the core distance given by \eqn{d(i,k)} with \eqn{k} being \eqn{i}'s \eqn{M}-th nearest neighbour (not including self, unlike in Campello et al., 2013). This pulls outliers away from their neighbours. If the distances are not unique, there might be multiple trees spanning a given graph that meet the minimality property. } \examples{ library("datasets") data("iris") X <- jitter(as.matrix(iris[1:2])) # some data T <- mst(X) plot(X, asp=1, las=1) segments(X[T[, 1], 1], X[T[, 1], 2], X[T[, 2], 1], X[T[, 2], 2]) } \references{ V. Jarník, O jistem problemu minimalnim (z dopisu panu O. Borůvkovi), \emph{Prace Moravske Prirodovedecke Spolecnosti} 6, 1930, 57-63 C.F. Olson, Parallel algorithms for hierarchical clustering, \emph{Parallel Computing} 21, 1995, 1313-1325 R. Prim, Shortest connection networks and some generalisations, \emph{The Bell System Technical Journal} 36(6), 1957, 1389-1401 O. Borůvka, O jistém problému minimálním, \emph{Práce Moravské Přírodovědecké Společnosti} 3, 1926, 37–58 J.L. Bentley, Multidimensional binary search trees used for associative searching, \emph{Communications of the ACM} 18(9), 509–517, 1975, \doi{10.1145/361002.361007} W.B. March, R. Parikshit, A. Gray, Fast Euclidean minimum spanning tree: Algorithm, analysis, and applications, \emph{Proc. 16th ACM SIGKDD Intl. Conf. Knowledge Discovery and Data Mining (KDD '10)}, 2010, 603–612 R.J.G.B. Campello, D. Moulavi, J. Sander, Density-based clustering based on hierarchical density estimates, \emph{Lecture Notes in Computer Science} 7819, 2013, 160-172, \doi{10.1007/978-3-642-37456-2_14} M. Gagolewski, quitefastmst, in preparation, 2026, TODO } \seealso{ The official online manual of \pkg{deadwood} at \url{https://deadwood.gagolewski.com/} \code{\link[quitefastmst]{mst_euclid}} } \author{ \href{https://www.gagolewski.com/}{Marek Gagolewski} } deadwood/man/deadwood.Rd0000644000176200001440000001332615145137373014706 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/deadwood.R \name{deadwood} \alias{deadwood} \alias{deadwood.default} \alias{deadwood.dist} \alias{deadwood.mstclust} \alias{deadwood.mst} \title{Deadwood: Outlier Detection via Trimming of Mutual Reachability Minimum Spanning Trees} \usage{ deadwood(d, ...) \method{deadwood}{default}( d, M = 5L, contamination = NA_real_, max_debris_size = NA_real_, max_contamination = 0.5, ema_dt = 0.01, distance = c("euclidean", "l2", "manhattan", "cityblock", "l1", "cosine"), verbose = FALSE, ... ) \method{deadwood}{dist}( d, M = 5L, contamination = NA_real_, max_debris_size = NA_real_, max_contamination = 0.5, ema_dt = 0.01, verbose = FALSE, ... ) \method{deadwood}{mstclust}( d, contamination = NA_real_, max_debris_size = NA_real_, max_contamination = 0.5, ema_dt = 0.01, verbose = FALSE, ... ) \method{deadwood}{mst}( d, contamination = NA_real_, max_debris_size = NA_real_, max_contamination = 0.5, ema_dt = 0.01, cut_edges = NULL, verbose = FALSE, ... ) } \arguments{ \item{d}{a numeric matrix with \eqn{n} rows and \eqn{p} columns (or an object coercible to one, e.g., a data frame with numeric-like columns), an object of class \code{dist} (see \code{\link[stats]{dist}}), an object of class \code{mstclust} (see \pkg{genieclust} and \pkg{lumbermark}), or an object of class \code{mst} (see \code{\link{mst}})} \item{...}{further arguments passed to \code{\link{mst}}} \item{M}{smoothing factor; \eqn{M \leq 1} gives the selected \code{distance}; otherwise, the mutual reachability distance based on the \eqn{M}-th nearest neighbours is used} \item{contamination}{single numeric value or \code{NA}; the estimated (approximate) proportion of outliers in the dataset; if \code{NA}, the contamination amount will be determined by identifying the most significant elbow point of the curve comprised of increasingly ordered tree edge weights smoothened with an exponential moving average} \item{max_debris_size}{single integer value or \code{NA}; the maximal size of the leftover connected components that will be considered outliers; if \code{NA}, \eqn{\sqrt{n}} is assumed} \item{max_contamination}{single numeric value; maximal contamination level assumed when \code{contamination} is \code{NA}} \item{ema_dt}{single numeric value; controls the smoothing parameter \eqn{\alpha = 1-\exp(-dt)} of the exponential moving average (in edge length elbow point detection), \eqn{y_i = \alpha w_i + (1-\alpha) w_{i-1}}, \eqn{y_1 = d_1}} \item{distance}{metric used in the case where \code{d} is a matrix; one of: \code{"euclidean"} (synonym: \code{"l2"}), \code{"manhattan"} (a.k.a. \code{"l1"} and \code{"cityblock"}), \code{"cosine"}} \item{verbose}{logical; whether to print diagnostic messages and progress information} \item{cut_edges}{numeric vector or \code{NULL}; \eqn{k-1} indexes of the tree edges whose omission lead to \eqn{k} connected components (clusters), where the outliers are to be sought independently; most frequently this is generated via \pkg{genieclust} or \pkg{lumbermark}.} } \value{ A logical vector \code{y} of length \eqn{n}, where \code{y[i] == TRUE} means that the \code{i}-th observation is deemed to be an outlier. The \code{mst} attribute gives the computed minimum spanning tree which can be reused in further calls to the functions from \pkg{genieclust}, \pkg{lumbermark}, and \pkg{deadwood}. \code{cut_edges} gives the \code{cut_edges} passed as argument. \code{contamination} gives the detected contamination levels in each cluster (which can be different from the observed proportion of outliers detected). } \description{ Deadwood is an anomaly detection algorithm based on Mutual Reachability Minimum Spanning Trees. It trims protruding tree segments and marks small debris as outliers. More precisely, the use of a mutual reachability distance pulls peripheral points farther away from each other. Tree edges with weights beyond the detected elbow point are removed. All the resulting connected components whose sizes are smaller than a given threshold are deemed anomalous. } \details{ As with all distance-based methods (this includes k-means and DBSCAN as well), applying data preprocessing and feature engineering techniques (e.g., feature scaling, feature selection, dimensionality reduction) might lead to more meaningful results. If \code{d} is a numeric matrix or an object of class \code{dist}, \code{\link{mst}} will be called to compute an MST, which generally takes at most \eqn{O(n^2)} time. However, by default, for low-dimensional Euclidean spaces, a faster algorithm based on K-d trees is selected automatically; see \code{\link[quitefastmst]{mst_euclid}} from the \pkg{quitefastmst} package. Once the spanning tree is determined (\eqn{\Omega(n \log n)}-\eqn{O(n^2)}), the Deadwood algorithm runs in \eqn{O(n)} time. Memory use is also \eqn{O(n)}. } \examples{ library("datasets") data("iris") X <- jitter(as.matrix(iris[1:2])) # some data is_outlier <- deadwood(X, M=5) plot(X, col=c("#ff000066", "#55555555")[is_outlier+1], pch=c(16, 1)[is_outlier+1], asp=1, las=1) } \references{ M. Gagolewski, deadwood, in preparation, 2026, TODO V. Satopaa, J. Albrecht, D. Irwin, B. Raghavan, Finding a "Kneedle" in a haystack: Detecting knee points in system behavior, In: 31st Intl. Conf. Distributed Computing Systems Workshops, 2011, 166-171, \doi{10.1109/ICDCSW.2011.20} R.J.G.B. Campello, D. Moulavi, J. Sander, Density-based clustering based on hierarchical density estimates, Lecture Notes in Computer Science 7819, 2013, 160-172, \doi{10.1007/978-3-642-37456-2_14} } \author{ \href{https://www.gagolewski.com/}{Marek Gagolewski} } \seealso{ The official online manual of \pkg{deadwood} at \url{https://deadwood.gagolewski.com/} } deadwood/man/kneedle.Rd0000644000176200001440000000217415145137373014526 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/RcppExports.R \name{kneedle} \alias{kneedle} \alias{kneedle_increasing} \title{Knee/Elbow Point Detection} \usage{ kneedle_increasing(x, convex = TRUE, dt = 0.01) } \arguments{ \item{x}{data vector (increasing)} \item{convex}{whether the data in \code{x} are convex-ish (elbow detection) or not (knee lookup)} \item{dt}{controls the smoothing parameter \eqn{\alpha = 1-\exp(-dt)} of the exponential moving average, \eqn{y_i = \alpha x_i + (1-\alpha) x_{i-1}}, \eqn{y_1 = x_1}} } \value{ Returns the index of the knee/elbow point; 1 if not found. } \description{ Finds the most significant knee/elbow using the Kneedle algorithm with exponential smoothing. } \references{ V. Satopaa, J. Albrecht, D. Irwin, B. Raghavan, Finding a "Kneedle" in a haystack: Detecting knee points in system behavior, In: 31st Intl. Conf. Distributed Computing Systems Workshops, 2011, 166-171, \doi{10.1109/ICDCSW.2011.20} } \author{ \href{https://www.gagolewski.com/}{Marek Gagolewski} } \seealso{ The official online manual of \pkg{deadwood} at \url{https://deadwood.gagolewski.com/} } deadwood/man/deadwood-package.Rd0000644000176200001440000000135115145137373016272 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/deadwood-package.R \docType{package} \name{deadwood-package} \alias{deadwood-package} \title{The Deadwood Outlier Detection Algorithm} \description{ See \code{\link{deadwood}()} for more details. } \seealso{ The official online manual of \pkg{deadwood} at \url{https://deadwood.gagolewski.com/} Useful links: \itemize{ \item \url{https://deadwood.gagolewski.com/} \item \url{https://github.com/gagolews/deadwood} \item Report bugs at \url{https://github.com/gagolews/deadwood/issues} } } \author{ \strong{Maintainer}: Marek Gagolewski \email{marek@gagolewski.com} (\href{https://orcid.org/0000-0003-0637-6028}{ORCID}) [copyright holder] } \keyword{internal} deadwood/DESCRIPTION0000644000176200001440000000277015146437772013574 0ustar liggesusersPackage: deadwood Type: Package Title: Outlier Detection via Trimming of Mutual Reachability Minimum Spanning Trees Version: 0.9.0-3 Date: 2026-02-20 Authors@R: person("Marek", "Gagolewski", role = c("aut", "cre", "cph"), email = "marek@gagolewski.com", comment = c(ORCID = "0000-0003-0637-6028") ) Description: Implements an anomaly detection algorithm based on mutual reachability minimum spanning trees: 'deadwood' trims protruding tree segments and marks small debris as outliers; see Gagolewski (2026) . More precisely, the use of a mutual reachability distance pulls peripheral points farther away from each other. Tree edges with weights beyond the detected elbow point are removed. All the resulting connected components whose sizes are smaller than a given threshold are deemed anomalous. The 'Python' version of 'deadwood' is available via 'PyPI'. BugReports: https://github.com/gagolews/deadwood/issues URL: https://deadwood.gagolewski.com/, https://github.com/gagolews/deadwood License: AGPL-3 Imports: Rcpp, quitefastmst Suggests: datasets, LinkingTo: Rcpp Encoding: UTF-8 SystemRequirements: OpenMP RoxygenNote: 7.3.3 NeedsCompilation: yes Packaged: 2026-02-20 19:29:02 UTC; gagolews Author: Marek Gagolewski [aut, cre, cph] (ORCID: ) Maintainer: Marek Gagolewski Repository: CRAN Date/Publication: 2026-02-21 23:30:02 UTC