flashClust/0000755000176200001440000000000015151520310012353 5ustar liggesusersflashClust/MD50000644000176200001440000000065315151520310012667 0ustar liggesusers0a062956d2f112754f763cf2d5b8aa91 *Changelog 7a428c0ca06160251d3ab8328c396070 *DESCRIPTION 3797cd8641694375ac16a3ab5f9bfdf3 *NAMESPACE 4cbccf404d1e7f3d6c98c9b3f97e0778 *R/murtagh.R af50680480870d65802dc732a2873fa6 *inst/CITATION 50e090ce1601b9383f020990956eec59 *man/flashClust.Rd c68066024f4024a92088d82db568258a *src/hc.f 54a2d4ec7d6a4124f69dec1d567eb9b6 *src/hcass2.f 581dc8820a8c808af60f0b864843560b *src/registerRoutines.c flashClust/Changelog0000644000176200001440000000211015135034074014170 0ustar liggesusers 2026-01-24: 1.01-3 . Correction in help file. Thanks to Kurt Hornik for pointing out the typo. 2012-08-21: 1.01-2 . Added the stats routine hcass2 to the package to avoid calling undocumented internal function in the base distribution of R. 2012-03-02: 1.01-1 . Added citation 2011-03-20: 1.01 . Fixed incorrect clustering with methods "centroid" and "median". Thanks to Chi Ming Yau and Daniel Müllner for pointing out the error and suggesting a fix. . Speed now for the most part back up to the level experienced with 1.00-2 but still capable of working with larger data sets. 2011-02-06: 1.00-3 . Fixed overflow errors when the number of clustered objects is over ~45000. The code should now be able to cluster as many as 65000 objects, given enough memory. 2010-07-19: 1.00-2 . added a timing example in the help file 2010-02-19: 1.00 . added a wrapper called hclust so a fast version of hclust is used automatically after loading the package 2009-11-13: 0.10-1 . flashClust now checks that the input distance actually contains some data flashClust/R/0000755000176200001440000000000015151501073012561 5ustar liggesusersflashClust/R/murtagh.R0000644000176200001440000000512315151477716014374 0ustar liggesusers# Code by F. Murthagh, https://astro.u-strasbg.fr/~fmurtagh/mda-sw/splus # modified by Peter Langfelder to make it compatible with R's standard hclust flashClust <- function(d, method="complete", members = NULL) { hclust(d, method, members) } hclust <- function(d, method="complete", members = NULL) { # Hierarchical clustering, on raw input data; we will use Euclidean distance. # A range of criteria are supported; also there is a storage-economic option. # Author: F. Murtagh, May 1992 METHODS <- c("ward", "single", "complete", "average", "mcquitty", "median", "centroid") method <- pmatch(method, METHODS) if (is.na(method)) stop("Invalid clustering method") if (method == -1) stop("Ambiguous clustering method") n = attr(d, "Size") len = length(d); if (len!=(n*(n-1)/2)) stop("Distance structure appears invalid."); if (n==1 || len==0) stop("The distance structure is empty."); if (is.null(members)) { members <- rep(1, n) } else if (length(members) != n) stop("invalid length of members") # We choose the general routine, `hc', which # caters for 7 criteria, using a half dissimilarity matrix; (BTW, this uses the # very efficient nearest neighbor chain algorithm, which makes this algorithm # of O(n^2) computational time, and differentiates it from the less efficient # -- i.e. O(n^3) -- implementations in all commercial statistical packages # -- as far as I am aware -- except Clustan.) hcl <- .Fortran(.F77_hc, n = as.integer(n), len = as.integer(len), method = as.integer(method), ia = integer(n), ib = integer(n), crit = double(n), membr = as.double(members), nn = integer(n), disnn = double(n), diss = as.double(d), PACKAGE = "flashClust") # 2nd step: interpret the information that we now have, -- seq. of aggloms., -- # as merge, height, and order lists. #PL: not clear what this iclass is supposed to be for. #iclass <- matrix(0.0, n, n) #storage.mode(iclass) <- "integer" hcass <- .Fortran(.F77_hcass2, n = as.integer(n), ia = as.integer(hcl$ia), ib = as.integer(hcl$ib), order = integer(n), iia = integer(n), iib = integer(n), PACKAGE = "flashClust") merge <- cbind(hcass$iia[1:n-1],hcass$iib[1:n-1]) hhh <- list(merge = merge, height = hcl$crit[1:n-1], order = hcass$order, labels = attr(d, "Labels"), method = METHODS[method], call = match.call(), dist.method = attr(d, "method")) class(hhh) = "hclust" hhh } flashClust/src/0000755000176200001440000000000015151501067013152 5ustar liggesusersflashClust/src/registerRoutines.c0000644000176200001440000000250115151477513016701 0ustar liggesusers#include #include #include #include #include #include void F77_NAME(hc)(int *n, int *len, int *iopt, int *ia, int *ib, double *crit, double *membr, int *nn, double *disnn, double *diss); void F77_NAME(hcass2)(int *n, int *ia, int *ib, int *iorder, int *iia, int *iib); /* ============================================================================================= * * Register native routines here. * * =============================================================================================*/ void attribute_visible R_init_flashClust(DllInfo * info) { static R_NativePrimitiveArgType hc_t[] = { INTSXP,INTSXP,INTSXP,INTSXP,INTSXP, REALSXP, REALSXP, // CRIT and MEMBR INTSXP, REALSXP, // NN, DISNN REALSXP }; // DISS static R_NativePrimitiveArgType hcass2_t[] = { INTSXP, INTSXP, INTSXP, INTSXP, INTSXP, INTSXP }; static const R_FortranMethodDef FMethods[] = { {"hc", (DL_FUNC) &F77_NAME(hc), 10, hc_t}, {"hcass2", (DL_FUNC) &F77_NAME(hcass2), 6, hcass2_t}, {NULL, NULL, 0} }; R_registerRoutines(info, NULL, NULL, FMethods, NULL); R_useDynamicSymbols(info, FALSE); R_forceSymbols(info, TRUE); } flashClust/src/hc.f0000644000176200001440000001732215151500777013727 0ustar liggesusersC++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++C C C C HIERARCHICAL CLUSTERING using (user-specified) criterion. C C C C Parameters: C C C C DATA(N,M) input data matrix, C C DISS(LEN) dissimilarities in lower half diagonal C C storage; LEN = N.N-1/2, C C IOPT clustering criterion to be used, C C IA, IB, CRIT history of agglomerations; dimensions C C N, first N-1 locations only used, C C MEMBR, NN, DISNN vectors of length N, used to store C C cluster cardinalities, current nearest C C neighbour, and the dissimilarity assoc. C C with the latter. C C FLAG boolean indicator of agglomerable obj./ C C clusters. C C C C F. Murtagh, ESA/ESO/STECF, Garching, February 1986. C C Modified by Peter Langfelder, implemented bug fix C by Chi Ming Yau C March 2026: argument FLAG is not passed anymore to prevent C potential fortran/C inconsistencies C C C------------------------------------------------------------C SUBROUTINE HC(N,LEN,IOPT,IA,IB,CRIT,MEMBR,NN,DISNN, X DISS) IMPLICIT DOUBLE PRECISION (A-H, O-Z) DOUBLE PRECISION MEMBR(N),DISS(LEN) INTEGER IA(N),IB(N) DOUBLE PRECISION CRIT(N) DIMENSION NN(N),DISNN(N) LOGICAL FLAG(N) DOUBLE PRECISION INF c was 1D+20 DATA INF/1.D+300/ c c unnecessary initialization of im jj jm to keep g77 -Wall happy c IM = 0 JJ = 0 JM = 0 C C Initializations C DO I=1,N c MEMBR(I)=1. FLAG(I)=.TRUE. ENDDO NCL=N C C Construct dissimilarity matrix C C DO I=1,N-1 C DO J=I+1,N C IND=IOFFSET(N,I,J) C DISS(IND)=0. C DO K=1,M C DISS(IND)=DISS(IND)+(DATA(I,K)-DATA(J,K))**2 C ENDDO C IF (IOPT.EQ.1) DISS(IND)=DISS(IND)/2. C (Above is done for the case of the min. var. method C where merging criteria are defined in terms of variances C rather than distances.) C ENDDO C ENDDO C C Carry out an agglomeration - first create list of NNs C DO I=1,N-1 DMIN=INF DO J=I+1,N IND=IOFFSET(N,I,J) IF (DISS(IND).GE.DMIN) GOTO 500 DMIN=DISS(IND) JM=J 500 CONTINUE ENDDO NN(I)=JM DISNN(I)=DMIN ENDDO C 400 CONTINUE C Next, determine least diss. using list of NNs DMIN=INF DO I=1,N-1 IF (.NOT.FLAG(I)) GOTO 600 IF (DISNN(I).GE.DMIN) GOTO 600 DMIN=DISNN(I) IM=I JM=NN(I) 600 CONTINUE ENDDO NCL=NCL-1 C C This allows an agglomeration to be carried out. C I2=MIN0(IM,JM) J2=MAX0(IM,JM) IA(N-NCL)=I2 IB(N-NCL)=J2 CRIT(N-NCL)=DMIN C C Update dissimilarities from new cluster. C FLAG(J2)=.FALSE. DMIN=INF DO K=1,N IF (.NOT.FLAG(K)) GOTO 800 IF (K.EQ.I2) GOTO 800 X=MEMBR(I2)+MEMBR(J2)+MEMBR(K) IF (I2.LT.K) THEN IND1=IOFFSET(N,I2,K) ELSE IND1=IOFFSET(N,K,I2) ENDIF IF (J2.LT.K) THEN IND2=IOFFSET(N,J2,K) ELSE IND2=IOFFSET(N,K,J2) ENDIF IND3=IOFFSET(N,I2,J2) XX=DISS(IND3) C C WARD'S MINIMUM VARIANCE METHOD - IOPT=1. C IF (IOPT.EQ.1) THEN DISS(IND1)=(MEMBR(I2)+MEMBR(K))*DISS(IND1)+ X (MEMBR(J2)+MEMBR(K))*DISS(IND2)- X MEMBR(K)*XX DISS(IND1)=DISS(IND1)/X ENDIF C C SINGLE LINK METHOD - IOPT=2. C IF (IOPT.EQ.2) THEN DISS(IND1)=MIN(DISS(IND1),DISS(IND2)) ENDIF C C COMPLETE LINK METHOD - IOPT=3. C IF (IOPT.EQ.3) THEN DISS(IND1)=MAX(DISS(IND1),DISS(IND2)) ENDIF C C AVERAGE LINK (OR GROUP AVERAGE) METHOD - IOPT=4. C IF (IOPT.EQ.4) THEN DISS(IND1)=(MEMBR(I2)*DISS(IND1)+MEMBR(J2)*DISS(IND2))/ X (MEMBR(I2)+MEMBR(J2)) ENDIF C C MCQUITTY'S METHOD - IOPT=5. C IF (IOPT.EQ.5) THEN DISS(IND1)=0.5*DISS(IND1)+0.5*DISS(IND2) ENDIF C C MEDIAN (GOWER'S) METHOD - IOPT=6. C IF (IOPT.EQ.6) THEN DISS(IND1)=0.5*DISS(IND1)+0.5*DISS(IND2)-0.25*XX ENDIF C C CENTROID METHOD - IOPT=7. C IF (IOPT.EQ.7) THEN DISS(IND1)=(MEMBR(I2)*DISS(IND1)+MEMBR(J2)*DISS(IND2)- X MEMBR(I2)*MEMBR(J2)*XX/(MEMBR(I2)+MEMBR(J2)))/ X (MEMBR(I2)+MEMBR(J2)) ENDIF C IF (I2.GT.K) GOTO 800 IF (DISS(IND1).GE.DMIN) GOTO 800 DMIN=DISS(IND1) JJ=K 800 CONTINUE ENDDO MEMBR(I2)=MEMBR(I2)+MEMBR(J2) DISNN(I2)=DMIN NN(I2)=JJ C C Update list of NNs insofar as this is required. C This part modified by Chi Ming Yau and PL. For methods IOPT=6 and 7 C use modified updating of nearest neighbors that is a bit slower but C necessary. IF (IOPT.GT.5) THEN DO I=1,N-1 IF (.NOT.FLAG(I)) GOTO 900 IF (I.EQ.I2) GOTO 850 IF (NN(I).EQ.I2) GOTO 850 IF (NN(I).EQ.J2) GOTO 850 C Compare DISNN(I) with updated DISS between I and I2 IF (I2.LT.I) THEN IND=IOFFSET(N,I2,I) ELSE IND=IOFFSET(N,I,I2) ENDIF DMIN=DISS(IND) IF (DMIN.GE.DISNN(I)) GOTO 900 DISNN(I)=DMIN NN(I)=I2 GOTO 900 850 CONTINUE C (Redetermine NN of I:) DMIN=INF DO J=I+1,N IND=IOFFSET(N,I,J) IF (.NOT.FLAG(J)) GOTO 870 IF (I.EQ.J) GOTO 870 IF (DISS(IND).GE.DMIN) GOTO 870 DMIN=DISS(IND) JJ=J 870 CONTINUE ENDDO NN(I)=JJ DISNN(I)=DMIN 900 CONTINUE ENDDO ELSE C For methods IOPT<6 use the original fast update. DO I=1,N-1 IF (.NOT.FLAG(I)) GOTO 901 IF (NN(I).EQ.I2) GOTO 851 IF (NN(I).EQ.J2) GOTO 851 GOTO 901 851 CONTINUE C (Redetermine NN of I:) DMIN=INF DO J=I+1,N IND=IOFFSET(N,I,J) IF (.NOT.FLAG(J)) GOTO 871 IF (I.EQ.J) GOTO 871 IF (DISS(IND).GE.DMIN) GOTO 871 DMIN=DISS(IND) JJ=J 871 CONTINUE ENDDO NN(I)=JJ DISNN(I)=DMIN 901 CONTINUE ENDDO ENDIF C C Repeat previous steps until N-1 agglomerations carried out. C IF (NCL.GT.1) GOTO 400 C C RETURN END C C FUNCTION IOFFSET(N,I,J) C Map row I and column J of upper half diagonal symmetric matrix C onto vector. IMPLICIT DOUBLE PRECISION (A-H, O-Z) C Convert integer I to a double C This hopefully prevents overflow errors when I^2 is greater than C 2^31. IF (N.GT.32768) THEN XI = DBLE(I) IOFFSET=J+NINT( (XI-1)*N - (XI*(XI+1))/2) ELSE IOFFSET=J+(I-1)*N-(I*(I+1))/2 ENDIF RETURN END flashClust/src/hcass2.f0000644000176200001440000000703112014736753014514 0ustar liggesusersC+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++C C C C Given a HIERARCHIC CLUSTERING, described as a sequence of C C agglomerations, prepare the seq. of aggloms. and "horiz." C C order of objects for plotting the dendrogram using S routine C C 'plclust'. C C C C Parameters: C C C C IA, IB: vectors of dimension N defining the agglomer- C C ations. C C IIA, IIB: used to store IA and IB values differently C C (in form needed for S command 'plclust' C C IORDER: "horiz." order of objects for dendrogram C C C C F. Murtagh, ESA/ESO/STECF, Garching, June 1991 C C C C HISTORY C C C C Adapted from routine HCASS, which additionally determines C C cluster assignments at all levels, at extra comput. expense C C C C This routine copied by Peter Langfelder from the source C C of R package stats. C C C C---------------------------------------------------------------C SUBROUTINE HCASS2(N,IA,IB,IORDER,IIA,IIB) c Args INTEGER N,IA(N),IB(N),IORDER(N),IIA(N),IIB(N) c Var INTEGER I, J, K, K1, K2, LOC C C Following bit is to get seq. of merges into format acceptable to plclust C I coded clusters as lowest seq. no. of constituents; S's 'hclust' codes C singletons as -ve numbers, and non-singletons with their seq. nos. C do I=1,N IIA(I)=IA(I) IIB(I)=IB(I) end do do I=1,N-2 C In the following, smallest (+ve or -ve) seq. no. wanted K=MIN(IA(I),IB(I)) do J=I+1, N-1 IF(IA(J).EQ.K) IIA(J)=-I IF(IB(J).EQ.K) IIB(J)=-I end do end do do I=1,N-1 IIA(I)=-IIA(I) IIB(I)=-IIB(I) end do do I=1,N-1 IF (IIA(I).GT.0 .AND. IIB(I).LT.0) THEN K = IIA(I) IIA(I) = IIB(I) IIB(I) = K ENDIF IF (IIA(I).GT.0 .AND. IIB(I).GT.0) THEN K1 = MIN(IIA(I),IIB(I)) K2 = MAX(IIA(I),IIB(I)) IIA(I) = K1 IIB(I) = K2 ENDIF end do C C C NEW PART FOR 'ORDER' C IORDER(1) = IIA(N-1) IORDER(2) = IIB(N-1) LOC=2 DO I=N-2,1,-1 DO J=1,LOC IF(IORDER(J).EQ.I) THEN C REPLACE IORDER(J) WITH IIA(I) AND IIB(I) IORDER(J)=IIA(I) IF (J.EQ.LOC) THEN LOC=LOC+1 IORDER(LOC)=IIB(I) else LOC=LOC+1 do K=LOC,J+2,-1 IORDER(K)=IORDER(K-1) end do IORDER(J+1)=IIB(I) end if GOTO 171 ENDIF end do C SHOULD NEVER REACH HERE 171 CONTINUE end do C C do I=1,N IORDER(I) = -IORDER(I) end do C C RETURN END flashClust/NAMESPACE0000644000176200001440000000013015151477675013615 0ustar liggesusersuseDynLib(flashClust, .registration = TRUE, .fixes = ".F77_") exportPattern("^[^\\.]") flashClust/inst/0000755000176200001440000000000015151313421013333 5ustar liggesusersflashClust/inst/CITATION0000644000176200001440000000136715151313421014477 0ustar liggesuserscitHeader("To cite flashClust in publications use:") bibentry(bibtype = "Article", title = "Fast {R} Functions for Robust Correlations and Hierarchical Clustering", author = c(as.person("Peter Langfelder"), as.person("Steve Horvath")), journal = "Journal of Statistical Software", year = "2012", volume = "46", number = "11", pages = "1--17", url = "https://www.jstatsoft.org/v46/i11/", textVersion = paste("Peter Langfelder, Steve Horvath (2012).", "Fast R Functions for Robust Correlations and Hierarchical Clustering.", "Journal of Statistical Software, 46(11), 1-17.", "URL https://www.jstatsoft.org/v46/i11/.") ) flashClust/man/0000755000176200001440000000000015151501337013136 5ustar liggesusersflashClust/man/flashClust.Rd0000644000176200001440000001221315151501337015534 0ustar liggesusers\name{flashClust} \alias{flashClust} \alias{hclust} \title{ Faster alternative to hclust } \description{ This function implements optimal hierarchical clustering with the same interface as \code{\link{hclust}}. } \usage{ hclust(d, method = "complete", members=NULL) flashClust(d, method = "complete", members=NULL) } \arguments{ \item{d}{ a dissimilarity structure as produced by 'dist'.} \item{method}{ the agglomeration method to be used. This should be (an unambiguous abbreviation of) one of \code{"ward"}, \code{"single"}, \code{"complete"}, \code{"average"}, \code{"mcquitty"}, \code{"median"} or \code{"centroid"}. } \item{members}{\code{NULL} or a vector with length size of \code{d}. See the \sQuote{Details} section.} } \details{ See the description of \code{\link{hclust}} for details on available clustering methods. If \code{members!=NULL}, then \code{d} is taken to be a dissimilarity matrix between clusters instead of dissimilarities between singletons and \code{members} gives the number of observations per cluster. This way the hierarchical cluster algorithm can be \sQuote{started in the middle of the dendrogram}, e.g., in order to reconstruct the part of the tree above a cut (see examples). Dissimilarities between clusters can be efficiently computed (i.e., without \code{hclust} itself) only for a limited number of distance/linkage combinations, the simplest one being squared Euclidean distance and centroid linkage. In this case the dissimilarities between the clusters are the squared Euclidean distances between cluster means. \code{flashClust} is a wrapper for compatibility with older code. } \value{ Returned value is the same as that of \code{\link{hclust}}: An object of class \bold{hclust} which describes the tree produced by the clustering process. The object is a list with components: \item{merge}{an \eqn{n-1} by 2 matrix. Row \eqn{i} of \code{merge} describes the merging of clusters at step \eqn{i} of the clustering. If an element \eqn{j} in the row is negative, then observation \eqn{-j} was merged at this stage. If \eqn{j} is positive then the merge was with the cluster formed at the (earlier) stage \eqn{j} of the algorithm. Thus negative entries in \code{merge} indicate agglomerations of singletons, and positive entries indicate agglomerations of non-singletons.} \item{height}{a set of \eqn{n-1} non-decreasing real values. The clustering \emph{height}: that is, the value of the criterion associated with the clustering \code{method} for the particular agglomeration.} \item{order}{a vector giving the permutation of the original observations suitable for plotting, in the sense that a cluster plot using this ordering and matrix \code{merge} will not have crossings of the branches.} \item{labels}{labels for each of the objects being clustered.} \item{call}{the call which produced the result.} \item{method}{the cluster method that has been used.} \item{dist.method}{the distance that has been used to create \code{d} (only returned if the distance object has a \code{"method"} attribute).} } \references{ This implementation is mentioned in Peter Langfelder, Steve Horvath (2012) Fast R Functions for Robust Correlations and Hierarchical Clustering. Journal of Statistical Software, 46(11), 1-17. \url{https://www.jstatsoft.org/v46/i11/} F. Murtagh's software web site: https://www.classification-society.org/csna/mda-sw/ , section 6 Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) \emph{The New S Language}. Wadsworth & Brooks/Cole. (S version.) Everitt, B. (1974). \emph{Cluster Analysis}. London: Heinemann Educ. Books. Hartigan, J. A. (1975). \emph{Clustering Algorithms}. New York: Wiley. Sneath, P. H. A. and R. R. Sokal (1973). \emph{Numerical Taxonomy}. San Francisco: Freeman. Anderberg, M. R. (1973). \emph{Cluster Analysis for Applications}. Academic Press: New York. Gordon, A. D. (1999). \emph{Classification}. Second Edition. London: Chapman and Hall / CRC Murtagh, F. (1985). \dQuote{Multidimensional Clustering Algorithms}, in \emph{COMPSTAT Lectures 4}. Wuerzburg: Physica-Verlag (for algorithmic details of algorithms used). McQuitty, L.L. (1966). Similarity Analysis by Reciprocal Pairs for Discrete and Continuous Data. \emph{Educational and Psychological Measurement}, \bold{26}, 825--831. } \author{ Fionn Murtagh, adapted and packaged by Peter Langfelder} \seealso{ \code{\link{hclust}} } \examples{ # generate some data to cluster set.seed(1); nNodes = 2000; # Random "distance" matrix dst = matrix(runif(n = nNodes^2, min = 0, max = 1), nNodes, nNodes); # Time the flashClust clustering system.time( { h1 = hclust(as.dist(dst), method= "average"); } ); # Time the standard R clustering system.time( { h2 = stats::hclust(as.dist(dst), method = "average"); } ); all.equal(h1, h2) # What is different: h1[[6]] h2[[6]] # Everything but the 'call' component is the same; in particular, the trees are exactly equal. } \keyword{multivariate} \keyword{cluster} flashClust/DESCRIPTION0000644000176200001440000000173015151520310014062 0ustar liggesusersPackage: flashClust Version: 1.1-4 Date: 2026-03-02 Title: Implementation of Fast Hierarchical Clustering Authors@R: c(person(given = c("Fionn"), family = "Murtagh", role = "aut"), person(given = c(""), family = "R development team", role = "aut"), person(given = "Peter", family = "Langfelder", role = c("com", "cre"), email = "Peter.Langfelder@gmail.com")) Maintainer: Peter Langfelder Depends: R (>= 2.3.0) ZipData: no License: GPL (>= 2) Description: A fast implementation of hierarchical clustering that incorporates original code from Fionn Murtagh. NeedsCompilation: yes Packaged: 2026-03-03 06:42:48 UTC; plangfelder Author: Fionn Murtagh [aut], R development team [aut], Peter Langfelder [com, cre] Repository: CRAN Date/Publication: 2026-03-03 08:50:16 UTC