Source code for endgame.clustering.hierarchical

"""Hierarchical clustering: Agglomerative and Genie.

AgglomerativeClusterer wraps sklearn with all linkage options.
GenieClusterer wraps genieclust (optional dependency).
"""

from __future__ import annotations

from typing import Any

import numpy as np
from numpy.typing import ArrayLike
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.utils.validation import check_array

try:
    import genieclust
    HAS_GENIECLUST = True
except ImportError:
    HAS_GENIECLUST = False


[docs] class AgglomerativeClusterer(BaseEstimator, ClusterMixin): """Agglomerative hierarchical clustering with multiple linkage options. Ward's linkage (default) is the strongest general-purpose option. Average linkage is robust. Single linkage is fast but chaining-sensitive. Complete linkage produces compact clusters. Parameters ---------- n_clusters : int or None, default=2 Number of clusters. If None, must provide ``distance_threshold``. linkage : str, default='ward' Linkage criterion: 'ward', 'average', 'complete', 'single'. metric : str, default='euclidean' Distance metric (only used with non-ward linkage). distance_threshold : float or None, default=None Distance threshold for stopping. If set, ``n_clusters`` must be None. connectivity : array-like or callable or None, default=None Connectivity constraints. compute_full_tree : bool or 'auto', default='auto' Whether to compute the full dendrogram. compute_distances : bool, default=False Whether to compute distances between clusters. Attributes ---------- labels_ : ndarray of shape (n_samples,) Cluster labels. n_clusters_ : int Number of clusters. n_leaves_ : int Number of leaves in the dendrogram. children_ : ndarray of shape (n_nodes-1, 2) Merge history. distances_ : ndarray or None Distances between merged clusters (if ``compute_distances=True``). """ def __init__( self, n_clusters: int | None = 2, linkage: str = "ward", metric: str = "euclidean", distance_threshold: float | None = None, connectivity: Any = None, compute_full_tree: bool | str = "auto", compute_distances: bool = False, ): self.n_clusters = n_clusters self.linkage = linkage self.metric = metric self.distance_threshold = distance_threshold self.connectivity = connectivity self.compute_full_tree = compute_full_tree self.compute_distances = compute_distances
[docs] def fit(self, X: ArrayLike, y=None) -> AgglomerativeClusterer: """Fit agglomerative clustering.""" from sklearn.cluster import AgglomerativeClustering X = check_array(X) self.model_ = AgglomerativeClustering( n_clusters=self.n_clusters, linkage=self.linkage, metric=self.metric, distance_threshold=self.distance_threshold, connectivity=self.connectivity, compute_full_tree=self.compute_full_tree, compute_distances=self.compute_distances, ) self.model_.fit(X) self.labels_ = self.model_.labels_ self.n_clusters_ = self.model_.n_clusters_ self.n_leaves_ = self.model_.n_leaves_ self.children_ = self.model_.children_ self.distances_ = getattr(self.model_, "distances_", None) self.n_features_in_ = X.shape[1] return self
[docs] def fit_predict(self, X: ArrayLike, y=None) -> np.ndarray: """Fit and return cluster labels.""" self.fit(X) return self.labels_
[docs] class GenieClusterer(BaseEstimator, ClusterMixin): """Genie clustering: MST-based with Gini index threshold. Builds a minimum spanning tree and merges clusters using single linkage, but applies a Gini index threshold on cluster sizes to prevent the pathological chaining behavior. Consistently outperforms Ward and average linkage on standard benchmarks. Requires the ``genieclust`` package. Parameters ---------- n_clusters : int, default=2 Number of clusters. gini_threshold : float, default=0.3 Gini index threshold for cluster size inequality. Lower values enforce more balanced clusters. 0 = single linkage, 1 = balanced. affinity : str, default='euclidean' Distance metric. exact : bool, default=True Use exact (True) or approximate (False) algorithm. compute_full_tree : bool, default=True Whether to compute the full hierarchy. M : int, default=1 Smoothing factor for the mutual reachability distance. M=1 is standard MST; larger M approaches HDBSCAN*-like behavior. Attributes ---------- labels_ : ndarray of shape (n_samples,) Cluster labels. n_clusters_ : int Number of clusters. References ---------- Gagolewski, M. (2016). "Genie: A new, fast, and outlier-resistant hierarchical clustering algorithm." Information Sciences. Gagolewski, M. (2025). Journal of Classification. """ def __init__( self, n_clusters: int = 2, gini_threshold: float = 0.3, affinity: str = "euclidean", exact: bool = True, compute_full_tree: bool = True, M: int = 1, ): self.n_clusters = n_clusters self.gini_threshold = gini_threshold self.affinity = affinity self.exact = exact self.compute_full_tree = compute_full_tree self.M = M
[docs] def fit(self, X: ArrayLike, y=None) -> GenieClusterer: """Fit Genie clustering.""" if not HAS_GENIECLUST: raise ImportError( "genieclust is required for GenieClusterer. " "Install with: pip install genieclust" ) X = check_array(X, dtype=np.float64) self.model_ = genieclust.Genie( n_clusters=self.n_clusters, gini_threshold=self.gini_threshold, affinity=self.affinity, exact=self.exact, compute_full_tree=self.compute_full_tree, M=self.M, ) self.labels_ = self.model_.fit_predict(X) self.n_clusters_ = len(set(self.labels_)) - (1 if -1 in self.labels_ else 0) self.n_features_in_ = X.shape[1] return self
[docs] def fit_predict(self, X: ArrayLike, y=None) -> np.ndarray: """Fit and return cluster labels.""" self.fit(X) return self.labels_