genie/ 0000755 0001762 0000144 00000000000 15151272272 011345 5 ustar ligges users genie/MD5 0000644 0001762 0000144 00000002356 15151272272 011663 0 ustar ligges users 16efafe4ab6ce5fac81451ac0ee45f63 *DESCRIPTION
0cce106eda8a0102d80da3495233fff5 *NAMESPACE
b482a15050787ae25867ce31ded18d2f *NEWS
1d29001a8b2960673856de5c93b19fc8 *R/RcppExports.R
ebd61f78f6aa3f40520b2245cd77f2e4 *R/genie-package.R
885bd658b3dbe4868767b57319e793fc *R/hclust2.R
13e66c864ae44f36588e4ba53abdf5fa *inst/CITATION
b515038389d74fe9d8cfed472ea32b35 *man/genie-package.Rd
9e7e26bb241a59479ed35cf804a6b5d9 *man/hclust2.Rd
e2d347f9ad519d45dd6bcf3703472671 *src/Makevars
e2d347f9ad519d45dd6bcf3703472671 *src/Makevars.win
6ab5e681b92cf54e2577bb5855d717e1 *src/RcppExports.cpp
0938147578dc2e53d0ef61f3e08624c0 *src/defs.h
19da696feb39964eaddf74753fbcc9e1 *src/disjoint_sets.cpp
a0f2862b8ccca8df68b6b5ab19cbf801 *src/disjoint_sets.h
1caca7d397be8c264b683b43083f9fae *src/hclust2_common.cpp
0b8d2bb29df6ccd439555df58e405ab5 *src/hclust2_common.h
26767a736d26bca88ff22d99e8d96023 *src/hclust2_distance.cpp
99e1c70992b7ac410d411d4430710e97 *src/hclust2_distance.h
5ef4c265f4bd863f4cde68c918818140 *src/hclust2_mstbased_gini.cpp
c35e21e035c416d3034b7cdf9588786a *src/hclust2_mstbased_gini.h
3ee7a387971d7569336ce9ea0a8b8091 *src/hclust2_rcpp_gini.cpp
81e2f743988de6587d1f4604aab57d54 *src/hclust2_result.cpp
34a422f0c54d17bc456006ef326c1438 *src/hclust2_result.h
genie/R/ 0000755 0001762 0000144 00000000000 15151264414 011545 5 ustar ligges users genie/R/RcppExports.R 0000644 0001762 0000144 00000000376 15151264414 014167 0 ustar ligges users # Generated by using Rcpp::compileAttributes() -> do not edit by hand
# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
.hclust2_gini <- function(distance, objects, control = NULL) {
.Call(`_genie_hclust2_gini`, distance, objects, control)
}
genie/R/genie-package.R 0000644 0001762 0000144 00000000351 15113324536 014347 0 ustar ligges users #' @title The Genie Package
#'
#' @description
#' See \code{\link{hclust2}()} for details.
#'
#' @author Marek Gagolewski, Maciej Bartoszuk, Anna Cena
#'
#' @useDynLib genie, .registration=TRUE
#' @importFrom Rcpp evalCpp
"_PACKAGE"
genie/R/hclust2.R 0000644 0001762 0000144 00000013306 15015566476 013273 0 ustar ligges users #' @title
#' Fast Hierarchical Clustering in Spaces Equipped With
#' a Dissimilarity Measure
#'
#' @description
#' The reference implementation of the fast, robust and outlier resistant
#' Genie algorithm described in (Gagolewski, Bartoszuk, Cena, 2016).
#' Note that the \code{genie} package has been superseded by \code{genieclust},
#' see \code{\link[genieclust]{gclust}} and \code{\link[genieclust]{genie}}
#' for more details.
#'
#' @param d an object of class \code{\link[stats]{dist}},
#' \code{NULL}, or a single string, see below
#' @param objects \code{NULL}, numeric matrix, a list, or a character vector
#' @param thresholdGini single numeric value in [0,1],
#' threshold for the Gini index, 1 gives the standard single linkage algorithm
#' @param ... internal parameters used to tune up the algorithm
#'
#' @details
#' The time needed to apply a hierarchical clustering algorithm
#' is most often dominated by the number of computations of a pairwise
#' dissimilarity measure. Such a constraint, for larger data sets,
#' puts at a disadvantage the use of all the classical linkage
#' criteria but the single linkage one. However, it is known that the single
#' linkage clustering algorithm is very sensitive to outliers, produces highly
#' skewed dendrograms, and therefore usually does not reflect the true
#' underlying data structure -- unless the clusters are well-separated.
#'
#' To overcome its limitations, in (Gagolewski, Bartoszuk, Cena, 2016)
#' we proposed a new hierarchical clustering linkage
#' criterion. Namely, our algorithm links two clusters in such a way that a chosen
#' economic inequity measure (here, the Gini index) of the cluster
#' sizes does not increase drastically above a given threshold. The
#' benchmarks indicate a high practical usefulness of the introduced method:
#' it most often outperforms the Ward or average linkage in terms of the
#' clustering quality while retaining the single linkage speed.
#' The algorithm can be run in parallel (via OpenMP) on multiple threads
#' to speed up its execution further on.
#' Its memory overhead is small: there is no need to precompute the complete
#' distance matrix to perform the computations in order to obtain a desired
#' clustering.
#'
#' For compatibility with \code{\link[stats]{hclust}}, \code{d} may be an object
#' of class \code{\link[stats]{dist}}. In such a case, the \code{objects}
#' argument is ignored. Note that such an object requires ca. \emph{8n(n-1)/2}
#' bytes of computer's memory, where \emph{n} is the number of objects to cluster,
#' and therefore this setting can be used to analyse data sets of sizes
#' up to about 10,000-50,000.
#'
#' If \code{objects} is a character vector or a list, then \code{d}
#' should be a single string, one of: \code{levenshtein} (or \code{NULL}),
#' \code{hamming}, \code{dinu} (Dinu, Sgarro, 2006),
#' or \code{euclinf} (Cena et al., 2015).
#' Note that the list must consist
#' either of integer or of numeric vectors only (depending on the dissimilarity
#' measure of choice). On the other hand, each string must be in ASCII,
#' but you can always convert it to UTF-32 with
#' \code{\link[stringi]{stri_enc_toutf32}}.
#'
#' Otherwise, if \code{objects} is a numeric matrix (here, each row
#' denotes a distinct observation), then \code{d} should be
#' a single string, one of: \code{euclidean_squared} (or \code{NULL}),
#' \code{euclidean} (which yields the same results as \code{euclidean_squared})
#' \code{manhattan}, \code{maximum}, or \code{hamming}.
#'
#' @return
#' A named list of class \code{hclust}, see \code{\link[stats]{hclust}},
#' with additional components:
#' \itemize{
#' \item \code{stats} - performance statistics
#' \item \code{control} - internal parameters used
#' }
#'
#' @examples
#' library("datasets")
#' data("iris")
#' h <- hclust2(objects=as.matrix(iris[,2:3]), thresholdGini=0.2)
#' plot(iris[,2], iris[,3], col=cutree(h, 3), pch=as.integer(iris[,5]), asp=1, las=1)
#'
#' @references
#' Cena A., Gagolewski M., Mesiar R., Problems and challenges of information
#' resources producers' clustering, \emph{Journal of Informetrics} 9(2), 2015,
#' pp. 273-284.
#'
#' Dinu L.P., Sgarro A., A Low-complexity Distance for DNA Strings,
#' \emph{Fundamenta Informaticae} 73(3), 2006, pp. 361-372.
#'
#' Gagolewski M., Bartoszuk M., Cena A.,
#' Genie: A new, fast, and outlier-resistant hierarchical clustering algorithm,
#' \emph{Information Sciences} 363, 2016, pp. 8-23.
#'
#' Gagolewski M., Cena A., Bartoszuk M.
#' \emph{Hierarchical clustering via penalty-based aggregation and the Genie
#' approach}, In: Torra V. et al. (Eds.), \emph{Modeling Decisions for
#' Artificial Intelligence} (\emph{Lecture Notes in Artificial Intelligence}
#' 9880), Springer, 2016.
#'
#' @importFrom stats approx
#' @importFrom genieclust gclust
#' @importFrom genieclust genie
#' @export
hclust2 <- function(d=NULL, objects=NULL, thresholdGini=0.3, ...)
{
opts <- list(thresholdGini=thresholdGini, useVpTree=FALSE, ...)
result <- .hclust2_gini(d, objects, opts)
result[["call"]] <- match.call()
result[["method"]] <- "gini"
if (any(result[["height"]]<0)) {
# corrections for departures from ultrametricity
# negative heights denote force Genie merges
# we could just use have used cummax, but then we'd get multiple
# merges at the same level; instead we'll linearly interpolate
# between the points
nonNegative <- which(result[["height"]]>=0)
lastNonNegative <- nonNegative[length(nonNegative)]
result[["height"]][1:lastNonNegative] <-
approx(nonNegative, # linear interpolation
result[["height"]][nonNegative],
1:lastNonNegative)$y
result[["height"]][result[["height"]] < 0] <- cummax(-result[["height"]][result[["height"]] < 0])
}
result
}
genie/NEWS 0000644 0001762 0000144 00000002340 15151264376 012051 0 ustar ligges users genie package NEWS and CHANGELOG
===============================================================================
## 1.0.7 (2026-03-02)
* Fixed build warnings.
## 1.0.6 (2025-12-01)
* 'useVpTree' has been removed.
## 1.0.5 (2020-08-02)
* Updated documentation and package metadata.
* This package has been superseded by `genieclust`, which is faster and
more feature-rich (and also available for Python).
## 1.0.4 (2017-04-27)
* Invalid DOI corrected.
## 1.0.3 (2017-04-27)
* [BUILD TIME] Registering native routines and disabling symbol search.
## 1.0.1 (2016-05-25)
* Updated documentation and package metadata.
The algorithm's description can now be found in:
Gagolewski M., Bartoszuk M., Cena A., Genie: A new, fast, and outlier-resistant
hierarchical clustering algorithm, Information Sciences 363, 2016, pp. 8-23,
doi:10.1016/j.ins.2016.05.003
See also:
Gagolewski M., Cena A., Bartoszuk M., Hierarchical clustering via penalty-based
aggregation and the Genie approach, In: Torra V. et al. (Eds.),
Modeling Decisions for Artificial Intelligence (Lecture Notes in Artificial
Intelligence 9880), Springer, 2016, pp. 191-202,
doi:10.1007/978-3-319-45656-0_16.
## 1.0.0 (2016-03-07)
* Initial release.
genie/src/ 0000755 0001762 0000144 00000000000 15151264416 012135 5 ustar ligges users genie/src/disjoint_sets.h 0000644 0001762 0000144 00000010656 15113316206 015170 0 ustar ligges users /* ************************************************************************* *
* This file is part of the `genie` package for R. *
* *
* Copyright 2015-2025 Marek Gagolewski, Maciej Bartoszuk, Anna Cena *
* *
* 'genie' is free software: you can redistribute it and/or *
* modify it under the terms of the GNU General Public License *
* as published by the Free Software Foundation, either version 3 *
* of the License, or (at your option) any later version. *
* *
* 'genie' is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with 'genie'. If not, see . *
* ************************************************************************* */
#ifndef __DISJOINT_SETS_H
#define __DISJOINT_SETS_H
#include "defs.h"
#include
#include
#include
#include
/* see defs.h */
#ifndef DISJOINT_SETS_DEBUG
#define DISJOINT_SETS_DEBUG_CONST const
#else
#define DISJOINT_SETS_DEBUG_CONST /* const */
#endif
namespace grup {
class DisjointSets {
private:
std::vector< std::size_t > clusterParent;
protected:
std::size_t n;
public:
DisjointSets(std::size_t n);
virtual ~DisjointSets();
virtual std::size_t link(std::size_t x, std::size_t y, std::size_t z);
virtual std::size_t link(std::size_t x, std::size_t y);
std::size_t union_set(std::size_t x, std::size_t y);
inline std::size_t find_set(std::size_t x) {
if (clusterParent[x] != x)
return clusterParent[x] = find_set(clusterParent[x]);
else
return clusterParent[x];
}
};
class PhatDisjointSets : public DisjointSets {
private:
std::vector< std::size_t > clusterSize;
std::vector< std::size_t* > clusterMembers;
std::vector< std::size_t > clusterNext;
std::vector< std::size_t > clusterPrev;
std::size_t clusterCount;
std::size_t minClusterSize;
std::size_t minClusterCount;
void recomputeMinClusterSize();
public:
PhatDisjointSets(std::size_t n);
virtual ~PhatDisjointSets();
virtual std::size_t link(std::size_t x, std::size_t y);
virtual std::size_t link(std::size_t x, std::size_t y, std::size_t z);
inline std::size_t getClusterCount() const { return clusterCount; }
inline std::size_t getMinClusterSize() const { return minClusterSize; }
inline const std::size_t* getClusterMembers(std::size_t x) DISJOINT_SETS_DEBUG_CONST {
#ifdef DISJOINT_SETS_DEBUG
STOPIFNOT(find_set(x) == x);
STOPIFNOT(clusterMembers[x]);
#endif
return clusterMembers[x];
}
inline std::size_t getClusterSize(std::size_t x) DISJOINT_SETS_DEBUG_CONST {
#ifdef DISJOINT_SETS_DEBUG
STOPIFNOT(find_set(x) == x);
STOPIFNOT(clusterSize[x] == 0 || clusterMembers[x] != NULL);
#endif
return clusterSize[x];
}
inline std::size_t getClusterPrev(std::size_t x) DISJOINT_SETS_DEBUG_CONST {
#ifdef DISJOINT_SETS_DEBUG
STOPIFNOT(find_set(x) == x);
STOPIFNOT(find_set(clusterPrev[x]) == clusterPrev[x]);
STOPIFNOT(find_set(clusterNext[x]) == clusterNext[x]);
#endif
return clusterPrev[x];
}
inline std::size_t getClusterNext(std::size_t x) DISJOINT_SETS_DEBUG_CONST {
/*
to iterate over all clusters starting from x, use something like:
for (size_t nx = ds.getClusterNext(x); nx != x; nx = ds.getClusterNext(nx)) {
// e.g.:
for (auto it = ds.getClusterMembers(nx).cbegin(); it != ds.getClusterMembers(nx).cend(); ++it)
// play with *it
}
*/
#ifdef DISJOINT_SETS_DEBUG
STOPIFNOT(find_set(x) == x);
STOPIFNOT(find_set(clusterPrev[x]) == clusterPrev[x]);
STOPIFNOT(find_set(clusterNext[x]) == clusterNext[x]);
#endif
return clusterNext[x];
}
};
} /* namespace grup */
#endif
genie/src/RcppExports.cpp 0000644 0001762 0000144 00000002276 15113316463 015137 0 ustar ligges users // Generated by using Rcpp::compileAttributes() -> do not edit by hand
// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
#include
using namespace Rcpp;
#ifdef RCPP_USE_GLOBAL_ROSTREAM
Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
#endif
// hclust2_gini
RObject hclust2_gini(RObject distance, RObject objects, RObject control);
RcppExport SEXP _genie_hclust2_gini(SEXP distanceSEXP, SEXP objectsSEXP, SEXP controlSEXP) {
BEGIN_RCPP
Rcpp::RObject rcpp_result_gen;
Rcpp::RNGScope rcpp_rngScope_gen;
Rcpp::traits::input_parameter< RObject >::type distance(distanceSEXP);
Rcpp::traits::input_parameter< RObject >::type objects(objectsSEXP);
Rcpp::traits::input_parameter< RObject >::type control(controlSEXP);
rcpp_result_gen = Rcpp::wrap(hclust2_gini(distance, objects, control));
return rcpp_result_gen;
END_RCPP
}
static const R_CallMethodDef CallEntries[] = {
{"_genie_hclust2_gini", (DL_FUNC) &_genie_hclust2_gini, 3},
{NULL, NULL, 0}
};
RcppExport void R_init_genie(DllInfo *dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
}
genie/src/hclust2_common.h 0000644 0001762 0000144 00000043514 15151263532 015247 0 ustar ligges users /* ************************************************************************* *
* This file is part of the `genie` package for R. *
* *
* Copyright 2015-2025 Marek Gagolewski, Maciej Bartoszuk, Anna Cena *
* *
* 'genie' is free software: you can redistribute it and/or *
* modify it under the terms of the GNU General Public License *
* as published by the Free Software Foundation, either version 3 *
* of the License, or (at your option) any later version. *
* *
* 'genie' is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with 'genie'. If not, see . *
* ************************************************************************* */
#ifndef __HCLUST2_COMMON_H
#define __HCLUST2_COMMON_H
#include "defs.h"
#include "hclust2_distance.h"
#include "disjoint_sets.h"
#include
#include
#include
#include
#include
namespace grup
{
template struct Matrix
{
size_t nrow;
size_t ncol;
T* data;
Matrix() : nrow(0), ncol(0), data(NULL) {}
Matrix(size_t nrow, size_t ncol) :
nrow(nrow), ncol(ncol) {
data = new T[nrow*ncol];
}
~Matrix() {
if (data) delete [] data;
}
Matrix(const Matrix& m) {
nrow = m.nrow;
ncol = m.ncol;
data = new T[nrow*ncol];
for (size_t i=0; i= o2.dist;
}
};
typedef std::priority_queue, HeapNeighborItemFromSmallestComparator>
priority_queue_HeapNeighborItem_FromSmallest;
struct HeapHierarchicalItem
{
size_t index1;
size_t index2;
double dist;
HeapHierarchicalItem() :
index1(SIZE_MAX), index2(SIZE_MAX), dist(INFINITY) {}
HeapHierarchicalItem(size_t index1, size_t index2, double dist) :
index1(index1), index2(index2), dist(dist) {}
inline bool operator<( const HeapHierarchicalItem& o ) const {
return dist >= o.dist || (dist == o.dist && index2 > o.index2); // SIZE_MAX index2 at the end of a series
}
};
// class HclustPriorityQueue {
// private:
// struct BSTNode {
// BSTNode* left;
// BSTNode* right;
// HeapHierarchicalItem elem;
// };
//
// BSTNode* root;
//
// void rotateLeft(BSTNode** root) {
// STOPIFNOT(*root)
// if (!(*root)->left) return;
// BSTNode* oldroot = *root;
// *root = oldroot->left;
// oldroot->left = (*root)->right;
// (*root)->right = oldroot;
// }
//
// void rotateRight(BSTNode** root) {
// STOPIFNOT(*root)
// if (!(*root)->right) return;
// BSTNode* oldroot = *root;
// *root = oldroot->right;
// oldroot->right = (*root)->left;
// (*root)->left = oldroot;
// }
//
// void deleteSubTree(BSTNode** root) {
// if (!*root) return;
// deleteSubTree(&(*root)->left);
// deleteSubTree(&(*root)->right);
// delete *root;
// *root = NULL;
// }
//
// void deleteLeftmost(BSTNode** root) {
// STOPIFNOT(*root)
// if ((*root)->left) {
// deleteLeftmost(&(*root)->left);
// }
// else {
// BSTNode* delme = *root;
// *root = delme->right;
// delete delme;
// }
// }
//
// void insert(BSTNode** root, const HeapHierarchicalItem& data) {
// if (*root) {
// if (data.dist < (*root)->elem.dist)
// insert(&(*root)->left, data);
// else
// insert(&(*root)->right, data);
//
// double u = unif_rand();
// if (u < 0.33) rotateLeft(root);
// else if (u < 0.67) rotateRight(root);
// }
// else {
// *root = new BSTNode;
// (*root)->left = NULL;
// (*root)->right = NULL;
// (*root)->elem = data;
// }
// }
//
//
// public:
// HclustPriorityQueue(std::size_t) { root = NULL; }
// const bool empty() const { return root == NULL; }
// ~HclustPriorityQueue() { deleteSubTree(&root); }
//
// const HeapHierarchicalItem& top() {
// STOPIFNOT(root)
// while (root->left)
// rotateLeft(&root);
// return root->elem;
// }
//
// void pop() {
// STOPIFNOT(root)
// deleteLeftmost(&root);
// }
//
// void push(const HeapHierarchicalItem& data) {
// insert(&root, data);
// }
//
// };
// class HclustPriorityQueue {
// private:
//
// std::vector left;
// std::vector right;
// std::vector parent;
// std::vector elem;
// std::vector free;
// std::size_t occupied;
// std::size_t root;
// std::size_t best;
//
// double check_sorted;
//
// void print(std::size_t cur, std::size_t h) {
// if (cur == SIZE_MAX) return;
// print(left[cur], h+1);
// std::cerr << elem[cur].dist << "(" << h << "), ";
// print(right[cur], h+1);
// }
//
// void checkSorted(std::size_t cur) {
// if (cur == SIZE_MAX) return;
// checkSorted(right[cur]);
// STOPIFNOT(check_sorted >= elem[cur].dist)
// check_sorted = elem[cur].dist;
// checkSorted(left[cur]);
// }
//
// public:
//
// HclustPriorityQueue(std::size_t n) :
// left(n), right(n), parent(n), elem(n), free(n)
// {
// root = SIZE_MAX;
// occupied = 0;
// best = SIZE_MAX;
// for (std::size_t i=0; i= 0)
// if (parent[best] == SIZE_MAX) {
// // it's a root
// best = root = right[best];
// parent[root] = SIZE_MAX;
// if (best != SIZE_MAX) {
// while (left[best] != SIZE_MAX)
// best = left[best];
// }
// }
// else { // parent[best] != SIZE_MAX
// STOPIFNOT(left[parent[best]] == best)
// STOPIFNOT(elem[parent[best]].dist >= elem[best].dist)
// STOPIFNOT(elem[root].dist >= elem[best].dist)
// if (right[best] == SIZE_MAX) {
// left[parent[best]] = SIZE_MAX;
// best = parent[best];
// }
// else { // right[best] != SIZE_MAX
// left[parent[best]] = right[best];
// parent[right[best]] = parent[best];
// best = right[best];
// while (left[best] != SIZE_MAX)
// best = left[best];
// }
// }
// }
//
// inline const HeapHierarchicalItem& top() {
// STOPIFNOT(best != SIZE_MAX)
// return elem[best];
// }
//
// void push(const HeapHierarchicalItem& data) {
// STOPIFNOT(occupied+1 < free.size())
// if (root == SIZE_MAX) {
// STOPIFNOT(occupied == 0)
// root = best = free[occupied++];
// right[root] = left[root] = parent[root] = SIZE_MAX;
// elem[root] = data;
// return;
// }
//
// STOPIFNOT(best != SIZE_MAX)
// if (data.dist < elem[best].dist) {
// STOPIFNOT(left[best] == SIZE_MAX)
// left[best] = free[occupied++];
// parent[left[best]] = best;
// best = left[best];
// right[best] = left[best] = SIZE_MAX;
// elem[best] = data;
// return;
// }
//
//
// std::size_t start = root;
// while (true) {
// if (data.dist < elem[start].dist) {
// if (left[start] == SIZE_MAX) {
// left[start] = free[occupied++];
// parent[left[start]] = start;
// left[left[start]] = right[left[start]] = SIZE_MAX;
// elem[left[start]] = data;
// return;
// }
// else {
// start = left[start];
// }
// }
// else {
// if (right[start] == SIZE_MAX) {
// right[start] = free[occupied++];
// parent[right[start]] = start;
// left[right[start]] = right[right[start]] = SIZE_MAX;
// elem[right[start]] = data;
// return;
// }
// else {
// start = right[start];
// }
// }
// }
// }
//
// inline bool empty() const { return root == SIZE_MAX; }
// };
class HclustPriorityQueue
{
size_t n;
size_t ncur;
std::vector items;
bool heapMade;
public:
HclustPriorityQueue(size_t n=0) :
n(n), ncur(0), items(n), heapMade(false) { }
const HeapHierarchicalItem& top() {
if (!heapMade) {
std::make_heap(items.begin(), items.begin()+ncur);
heapMade = true;
}
return items[0];
}
void pop() {
if (!heapMade) {
std::make_heap(items.begin(), items.begin()+ncur);
heapMade = true;
}
std::pop_heap(items.begin(), items.begin()+ncur);
--ncur;
STOPIFNOT(ncur >= 0);
}
void push(const HeapHierarchicalItem& item) {
STOPIFNOT(ncur < n);
items[ncur++] = item;
if (heapMade) {
std::push_heap(items.begin(), items.begin()+ncur);
}
}
bool empty() const {
return (ncur == 0);
}
void reset() { heapMade = false; }
};
struct HClustOptions
{
// size_t degree; // for GNAT
// size_t candidatesTimes; // for GNAT
// size_t minDegree; // for GNAT
// size_t maxDegree; // for GNAT
// size_t maxTimesDegree; // for GNAT
size_t maxLeavesElems; //
size_t maxNNPrefetch; //
size_t maxNNMerge; //
size_t minNNPrefetch; //
size_t minNNMerge; //
// std::string exemplar; //
// bool useVpTree;
// bool useMST;
size_t vpSelectScheme; // vp-tree and GNAT
size_t vpSelectCand; // for vpSelectScheme == 1
size_t vpSelectTest; // for vpSelectScheme == 1
size_t nodesVisitedLimit;// for single approx
double thresholdGini; // for single approx
// size_t exemplarUpdateMethod; // exemplar - naive(0) or not naive(1)?
// size_t maxExemplarLeavesElems; //for exemplars biggers numbers are needed I think
// bool isCurseOfDimensionality;
HClustOptions(Rcpp::RObject control);
Rcpp::NumericVector toR() const;
};
struct HClustStats
{
size_t nodeCount; // how many nodes are there in the tree
size_t leafCount; // how many leaves
size_t nodeVisit; // how many nodes were visited during NN search
size_t nnCals; // how many times NN search job was launched
size_t nnCount; // how many NNs were obtained in overall
size_t medoidOldNew; //..how many times it was successful
size_t medoidUpdateCount; // how many times we calculate d_old and d_new..
HClustStats();
~HClustStats();
Rcpp::NumericVector toR() const;
};
struct NNHeap {
std::priority_queue< HeapNeighborItem > heap;
static HClustOptions* opts;
size_t exemplarsCount;
// #ifdef _OPENMP
// omp_lock_t lock;
// #endif
NNHeap() :
heap(),
exemplarsCount(0) {
// #ifdef _OPENMP
// omp_init_lock(&lock);
// #endif
}
static void setOptions(HClustOptions* newopts) {
opts = newopts;
}
~NNHeap() {
// #ifdef _OPENMP
// omp_destroy_lock(&lock);
// #endif
}
inline bool empty()
{
return heap.empty();
}
inline const HeapNeighborItem& top()
{
return heap.top();
}
inline const size_t size()
{
return heap.size();
}
inline void pop()
{
heap.pop();
}
inline void push(const HeapNeighborItem& elem)
{
heap.push(elem);
}
inline void insert(double index, double dist, double& maxR) {
STOPIFNOT(NNHeap::opts != NULL)
// #ifdef _OPENMP
// omp_set_lock(&lock);
// #endif
if (heap.size() >= opts->maxNNPrefetch && dist < maxR) {
while (!heap.empty() && heap.top().dist == maxR) {
heap.pop();
}
}
heap.push( HeapNeighborItem(index, dist) );
if (heap.size() >= opts->maxNNPrefetch) maxR = heap.top().dist;
// #ifdef _OPENMP
// omp_unset_lock(&lock);
// #endif
}
inline void insertExemplars(double index, double dist, double& maxR, DisjointSets& ds, bool isExemplar) {
// #ifdef _OPENMP
// omp_set_lock(&lock);
// #endif
heap.push( HeapNeighborItem(index, dist) );
if(isExemplar)
{
exemplarsCount++;
}
std::list toRemove;
size_t toRemoveExemplarsCount=0;
if (heap.size() >= opts->maxNNPrefetch+1 && dist < maxR) {
while (!heap.empty() && heap.top().dist == maxR) {
toRemove.push_back(heap.top());
if(heap.top().index == ds.find_set(heap.top().index))
{
toRemoveExemplarsCount++;
}
heap.pop();
}
}
if(toRemoveExemplarsCount == exemplarsCount && exemplarsCount > 0)
{
for(auto it = toRemove.begin(); it != toRemove.end(); ++it)
heap.push(*it);
}
else
{
exemplarsCount -= toRemoveExemplarsCount;
}
if (heap.size() >= opts->maxNNPrefetch && exemplarsCount > 0) maxR = heap.top().dist;
// #ifdef _OPENMP
// omp_unset_lock(&lock);
// #endif
}
inline void fill(std::deque& nearestNeighbors) {
while (!heap.empty()) {
nearestNeighbors.push_front(heap.top());
heap.pop();
}
}
inline void fill(std::list& nearestNeighbors) {
while (!heap.empty()) {
nearestNeighbors.push_front(heap.top());
heap.pop();
}
}
inline void fill(std::priority_queue, HeapNeighborItemFromSmallestComparator>& nearestNeighbors)
{
while (!heap.empty()) {
nearestNeighbors.push(heap.top());
heap.pop();
}
}
};
struct DistanceComparator
{
size_t index;
Distance* distance;
DistanceComparator(size_t index, Distance* distance)
: index(index), distance(distance) {}
inline bool operator()(size_t a, size_t b) {
return (*distance)( index, a ) < (*distance)( index, b );
}
};
struct DistanceComparatorCached
{
std::vector* distances;
DistanceComparatorCached(std::vector* distances)
: distances(distances) {}
inline bool operator()(size_t a, size_t b) {
return (*distances)[a] < (*distances)[b];
}
};
struct IndexComparator
{
size_t index;
IndexComparator(size_t index)
: index(index) {}
inline bool operator()(size_t a) {
return (a <= index);
}
};
inline bool comparer_gt(size_t i, size_t j) { return (i>j); }
inline bool comparer_gt(double i, double j) { return (i>j); }
struct SortedPoint
{
size_t i;
size_t j;
SortedPoint()
:i(0),j(0) {}
SortedPoint(size_t _i, size_t _j) {
if(_j < _i) {
i = _j;
j = _i;
}
else {
i = _i;
j = _j;
}
}
inline bool operator==(const SortedPoint &other) const {
return (i == other.i && j == other.j);
}
};
struct Point
{
size_t i;
size_t j;
Point()
: i(0),j(0) {}
Point(size_t _i, size_t _j) {
i = _i;
j = _j;
}
inline bool operator==(const Point &other) const {
return (i == other.i && j == other.j);
}
};
} // namespace grup
// #include
namespace std
{
template <> struct hash
{
/*
* template void hash_combine(size_t & seed, T const& v);
* seed ^= hash_value(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
*/
std::size_t operator()(const grup::Point& k) const {
std::size_t seed = 0;
seed ^= (size_t)(k.i) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
seed ^= (size_t)(k.j) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
// boost::hash_combine(seed, k.i);
// boost::hash_combine(seed, k.j);
return seed;
}
};
template <> struct hash
{
std::size_t operator()(const grup::SortedPoint& k) const {
std::size_t seed = 0;
seed ^= (size_t)(k.i) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
seed ^= (size_t)(k.j) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
// boost::hash_combine(seed, k.i);
// boost::hash_combine(seed, k.j);
return seed;
}
};
} // namespace std
#endif
genie/src/Makevars.win 0000644 0001762 0000144 00000000114 15113316437 014420 0 ustar ligges users PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS)
genie/src/hclust2_mstbased_gini.h 0000644 0001762 0000144 00000004365 15113316206 016563 0 ustar ligges users /* ************************************************************************* *
* This file is part of the `genie` package for R. *
* *
* Copyright 2015-2025 Marek Gagolewski, Maciej Bartoszuk, Anna Cena *
* *
* 'genie' is free software: you can redistribute it and/or *
* modify it under the terms of the GNU General Public License *
* as published by the Free Software Foundation, either version 3 *
* of the License, or (at your option) any later version. *
* *
* 'genie' is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with 'genie'. If not, see . *
* ************************************************************************* */
#ifndef __HCLUST2_MSTBASED_GINI_H
#define __HCLUST2_MSTBASED_GINI_H
// ************************************************************************
#include
#include
#include
#include
#include "hclust2_common.h"
#include "disjoint_sets.h"
#include "hclust2_result.h"
namespace grup
{
class HClustMSTbasedGini
{
protected:
HClustOptions* opts;
size_t n;
HClustStats stats;
Distance* distance;
HclustPriorityQueue getMST();
void linkAndRecomputeGini(PhatDisjointSets& ds, double& lastGini, size_t s1, size_t s2);
public:
HClustMSTbasedGini(Distance* dist, HClustOptions* opts);
virtual ~HClustMSTbasedGini();
HClustResult compute();
inline const HClustStats& getStats() { return stats; }
inline const HClustOptions& getOptions() { return *opts; }
}; // class
} // namespace grup
#endif
genie/src/hclust2_distance.h 0000644 0001762 0000144 00000027314 15113316206 015544 0 ustar ligges users /* ************************************************************************* *
* This file is part of the `genie` package for R. *
* *
* Copyright 2015-2025 Marek Gagolewski, Maciej Bartoszuk, Anna Cena *
* *
* 'genie' is free software: you can redistribute it and/or *
* modify it under the terms of the GNU General Public License *
* as published by the Free Software Foundation, either version 3 *
* of the License, or (at your option) any later version. *
* *
* 'genie' is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with 'genie'. If not, see . *
* ************************************************************************* */
#ifndef __HCLUST2_DISTANCE_H
#define __HCLUST2_DISTANCE_H
#include "defs.h"
/*
add string dists = lcs, dam-lev
numeric -> metric: binary (see dist) minkowski (p), canberra
allow external ptr distance:
double dist(const char* s1, int nx, const char* s2, int ny)
double dist(const double* s1, int nx, const double* s2, int ny)
double dist(const int* s1, int nx, const int* s2, int ny)
double dist(SEXP s1, SEXP s2)
use cases: objects 1:n, distance(i,j) -> ith, jth row of a data frame
(check namespaces... - call within an R function)
test for proper NA handling in Matrix and String distance
GenericRDistance, DistObjectDistance -- how to handle NAs??
*/
#include
#include
#include
#include
#include
namespace grup
{
struct DistanceStats
{
// size_t hashmapHit;
// size_t hashmapMiss;
size_t distCallCount;
size_t distCallTheoretical;
DistanceStats(size_t n) :
// hashmapHit(0), hashmapMiss(0),
distCallCount(0),
distCallTheoretical(n*(n-1)/2) {}
void print() const;
Rcpp::NumericVector toR() const {
return Rcpp::NumericVector::create(
// Rcpp::_["hashmapHit"]
// = (hashmapHit>0)?(double)hashmapHit:NA_REAL,
// Rcpp::_["hashmapMiss"]
// = (hashmapMiss>0)?(double)hashmapMiss:NA_REAL,
Rcpp::_["distCallCount"]
= (distCallCount>0)?(double)distCallCount:NA_REAL,
Rcpp::_["distCallTheoretical"]
= (distCallTheoretical>0)?(double)distCallTheoretical:NA_REAL
);
}
};
class Distance
{
private:
#ifdef HASHMAP_ENABLED
std::vector< std::unordered_map > hashmap;
#endif
DistanceStats stats;
protected:
size_t n;
virtual double compute(size_t v1, size_t v2) = 0;
public:
Distance(size_t n);
virtual ~Distance();
inline size_t getObjectCount() { return n; }
static Distance* createDistance(Rcpp::RObject distance, Rcpp::RObject objects, Rcpp::RObject control=R_NilValue);
virtual Rcpp::RObject getLabels() { /* stub */ return R_NilValue; }
virtual Rcpp::RObject getDistMethod() { /* stub */ return R_NilValue; }
inline const DistanceStats& getStats() { return stats; }
#ifdef HASHMAP_ENABLED
double operator()(size_t v1, size_t v2);
#else
inline double operator()(size_t v1, size_t v2) {
#ifdef GENERATE_STATS
#ifdef _OPENMP
#pragma omp atomic
#endif
++stats.distCallCount;
#endif
return compute(v1, v2);
}
#endif
};
class GenericMatrixDistance : public Distance
{
protected:
double* items;
size_t m;
public:
// TO DO: virtual Rcpp::RObject getLabels() { /* stub */ return R_NilValue; } --- get row names
GenericMatrixDistance(const Rcpp::NumericMatrix& points);
virtual ~GenericMatrixDistance() {
// #if VERBOSE > 5
// Rprintf("[%010.3f] destroying distance object\n", clock()/(float)CLOCKS_PER_SEC);
// #endif
delete [] items;
}
};
class SquaredEuclideanDistance : public GenericMatrixDistance
{
protected:
virtual double compute(size_t v1, size_t v2);
public:
virtual Rcpp::RObject getDistMethod() { return Rf_mkString("euclidean_squared"); }
SquaredEuclideanDistance(const Rcpp::NumericMatrix& points) :
GenericMatrixDistance(points) { }
};
class EuclideanDistance : public GenericMatrixDistance
{
// private:
// std::vector sqobs;
protected:
virtual double compute(size_t v1, size_t v2);
public:
virtual Rcpp::RObject getDistMethod() { return Rf_mkString("euclidean"); }
EuclideanDistance(const Rcpp::NumericMatrix& points) :
GenericMatrixDistance(points) {
// const double* items_ptr = items;
// for (size_t i=0; i > ranks;
public:
virtual Rcpp::RObject getDistMethod() { return Rf_mkString("dinu"); }
DinuDistanceInt(const Rcpp::List& strings) :
StringDistanceInt(strings), ranks(n) {
// TODO: openmp
for (size_t i=0; i > ranks;
public:
virtual Rcpp::RObject getDistMethod() { return Rf_mkString("dinu"); }
DinuDistanceChar(const Rcpp::CharacterVector& strings) :
StringDistanceChar(strings), ranks(n) {
// TODO: openmp
for (size_t i=0; i