# Source code for endgame.preprocessing.imbalance

"""Class imbalance handling: SMOTE variants, under-sampling, and combined methods.

This module provides sklearn-compatible wrappers around imbalanced-learn with
competition-tuned defaults and additional utilities for handling class imbalance.

Example
-------
>>> from endgame.preprocessing import SMOTEResampler, AutoBalancer
>>>
>>> # Simple SMOTE resampling
>>> smote = SMOTEResampler(sampling_strategy='auto')
>>> X_resampled, y_resampled = smote.fit_resample(X, y)
>>>
>>> # Auto-select best strategy based on imbalance ratio
>>> balancer = AutoBalancer(strategy='auto')
>>> X_balanced, y_balanced = balancer.fit_resample(X, y)
>>>
>>> # Use in sklearn pipeline with imblearn's Pipeline
>>> from imblearn.pipeline import Pipeline
>>> pipe = Pipeline([
...     ('balance', SMOTEResampler()),
...     ('clf', RandomForestClassifier())
... ])
"""

from __future__ import annotations

from typing import Any, Literal

import numpy as np
from numpy.typing import ArrayLike
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_is_fitted, check_X_y

# Lazy imports for imblearn
_IMBLEARN_AVAILABLE = None


def _check_imblearn():
    """Ensure imbalanced-learn can be imported.

    The outcome of the first import attempt is cached in the module-level
    ``_IMBLEARN_AVAILABLE`` flag, so later calls do not retry the import.

    Raises
    ------
    ImportError
        If ``imblearn`` could not be imported.
    """
    global _IMBLEARN_AVAILABLE
    if _IMBLEARN_AVAILABLE is None:
        # First call: probe for the optional dependency and remember the result.
        try:
            import imblearn
        except ImportError:
            _IMBLEARN_AVAILABLE = False
        else:
            _IMBLEARN_AVAILABLE = True
    if _IMBLEARN_AVAILABLE:
        return
    raise ImportError(
        "imbalanced-learn is required for class balancing. "
        "Install with: pip install imbalanced-learn"
    )

# =============================================================================
# Over-sampling Methods
# =============================================================================

class SMOTEResampler(BaseEstimator):
    """SMOTE (Synthetic Minority Over-sampling Technique) wrapper.

    Creates synthetic samples by interpolating between minority class
    instances and their k-nearest neighbors.

    Parameters
    ----------
    sampling_strategy : float, str, dict, or callable, default='auto'
        Sampling information:
        - 'auto': Resample all classes but the majority
        - 'minority': Resample only the minority class
        - 'not majority': Resample all classes but the majority
        - 'all': Resample all classes
        - float: Ratio of minority to majority (0 < ratio <= 1)
        - dict: {class_label: n_samples} for each class
    k_neighbors : int, default=5
        Number of nearest neighbors used to construct synthetic samples.
    random_state : int or None, default=None
        Random seed for reproducibility.

    Attributes
    ----------
    sampler_ : imblearn.over_sampling.SMOTE
        The fitted SMOTE sampler.
    sampling_strategy_ : dict
        The computed sampling strategy.

    Notes
    -----
    This wrapper does not take an ``n_jobs`` argument; only the three
    parameters listed above are forwarded to the underlying sampler.

    Examples
    --------
    >>> from endgame.preprocessing import SMOTEResampler
    >>> smote = SMOTEResampler(k_neighbors=5, random_state=42)
    >>> X_res, y_res = smote.fit_resample(X, y)
    """

    def __init__(
        self,
        sampling_strategy: str | float | dict = 'auto',
        k_neighbors: int = 5,
        random_state: int | None = None,
    ):
        self.sampling_strategy = sampling_strategy
        self.k_neighbors = k_neighbors
        self.random_state = random_state

    def fit(self, X: ArrayLike, y: ArrayLike) -> SMOTEResampler:
        """Fit the SMOTE sampler.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : SMOTEResampler
            Fitted sampler.

        Raises
        ------
        ImportError
            If imbalanced-learn is not installed.
        """
        _check_imblearn()
        # Imported lazily so the module itself loads without imbalanced-learn.
        from imblearn.over_sampling import SMOTE

        X, y = check_X_y(X, y)
        self.sampler_ = SMOTE(
            sampling_strategy=self.sampling_strategy,
            k_neighbors=self.k_neighbors,
            random_state=self.random_state,
        )
        self.sampler_.fit(X, y)
        self.sampling_strategy_ = self.sampler_.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Fit and resample the dataset.

        The sampler is rebuilt and refitted on every call.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        X_resampled : ndarray of shape (n_samples_new, n_features)
            Resampled training data.
        y_resampled : ndarray of shape (n_samples_new,)
            Resampled target values.
        """
        self.fit(X, y)
        # The caller's X/y (not the validated copies made in ``fit``) are
        # handed to the underlying sampler.
        return self.sampler_.fit_resample(X, y)
class BorderlineSMOTEResampler(BaseEstimator):
    """Borderline-SMOTE wrapper focusing on difficult borderline samples.

    Only generates synthetic samples from minority instances that are near
    the decision boundary (borderline instances).

    Parameters
    ----------
    sampling_strategy : float, str, dict, or callable, default='auto'
        See SMOTEResampler for details.
    k_neighbors : int, default=5
        Number of nearest neighbors for SMOTE interpolation.
    m_neighbors : int, default=10
        Number of nearest neighbors to determine if instance is borderline.
    kind : {'borderline-1', 'borderline-2'}, default='borderline-1'
        - 'borderline-1': Only use borderline minority instances
        - 'borderline-2': Use borderline minority + their majority neighbors
    random_state : int or None, default=None
        Random seed for reproducibility.

    Attributes
    ----------
    sampler_ : imblearn.over_sampling.BorderlineSMOTE
        The underlying sampler created by ``fit``.
    sampling_strategy_ : dict
        Sampling strategy copied from the underlying sampler after fitting.

    Notes
    -----
    This wrapper does not take an ``n_jobs`` argument; only the parameters
    listed above are forwarded to the underlying sampler.
    """

    def __init__(
        self,
        sampling_strategy: str | float | dict = 'auto',
        k_neighbors: int = 5,
        m_neighbors: int = 10,
        kind: Literal['borderline-1', 'borderline-2'] = 'borderline-1',
        random_state: int | None = None,
    ):
        self.sampling_strategy = sampling_strategy
        self.k_neighbors = k_neighbors
        self.m_neighbors = m_neighbors
        self.kind = kind
        self.random_state = random_state

    def fit(self, X: ArrayLike, y: ArrayLike) -> BorderlineSMOTEResampler:
        """Fit the BorderlineSMOTE sampler.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : BorderlineSMOTEResampler
            Fitted sampler.

        Raises
        ------
        ImportError
            If imbalanced-learn is not installed.
        """
        _check_imblearn()
        # Imported lazily so the module itself loads without imbalanced-learn.
        from imblearn.over_sampling import BorderlineSMOTE

        X, y = check_X_y(X, y)
        self.sampler_ = BorderlineSMOTE(
            sampling_strategy=self.sampling_strategy,
            k_neighbors=self.k_neighbors,
            m_neighbors=self.m_neighbors,
            kind=self.kind,
            random_state=self.random_state,
        )
        self.sampler_.fit(X, y)
        self.sampling_strategy_ = self.sampler_.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Fit and resample the dataset.

        The sampler is rebuilt and refitted on every call.

        Returns
        -------
        X_resampled, y_resampled
            Output of the underlying sampler's ``fit_resample``.
        """
        self.fit(X, y)
        return self.sampler_.fit_resample(X, y)
class ADASYNResampler(BaseEstimator):
    """ADASYN (Adaptive Synthetic Sampling) wrapper.

    Generates synthetic samples adaptively based on local density - more
    samples are generated in regions where minority class is sparse.

    Parameters
    ----------
    sampling_strategy : float, str, dict, or callable, default='auto'
        See SMOTEResampler for details.
    n_neighbors : int, default=5
        Number of nearest neighbors.
    random_state : int or None, default=None
        Random seed for reproducibility.

    Attributes
    ----------
    sampler_ : imblearn.over_sampling.ADASYN
        The underlying sampler created by ``fit``.
    sampling_strategy_ : dict
        Sampling strategy copied from the underlying sampler after fitting.

    Notes
    -----
    This wrapper does not take an ``n_jobs`` argument; only the parameters
    listed above are forwarded to the underlying sampler.
    """

    def __init__(
        self,
        sampling_strategy: str | float | dict = 'auto',
        n_neighbors: int = 5,
        random_state: int | None = None,
    ):
        self.sampling_strategy = sampling_strategy
        self.n_neighbors = n_neighbors
        self.random_state = random_state

    def fit(self, X: ArrayLike, y: ArrayLike) -> ADASYNResampler:
        """Fit the ADASYN sampler.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : ADASYNResampler
            Fitted sampler.

        Raises
        ------
        ImportError
            If imbalanced-learn is not installed.
        """
        _check_imblearn()
        # Imported lazily so the module itself loads without imbalanced-learn.
        from imblearn.over_sampling import ADASYN

        X, y = check_X_y(X, y)
        self.sampler_ = ADASYN(
            sampling_strategy=self.sampling_strategy,
            n_neighbors=self.n_neighbors,
            random_state=self.random_state,
        )
        self.sampler_.fit(X, y)
        self.sampling_strategy_ = self.sampler_.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Fit and resample the dataset.

        The sampler is rebuilt and refitted on every call.

        Returns
        -------
        X_resampled, y_resampled
            Output of the underlying sampler's ``fit_resample``.
        """
        self.fit(X, y)
        return self.sampler_.fit_resample(X, y)
class SVMSMOTEResampler(BaseEstimator):
    """SVM-SMOTE wrapper using SVM to identify borderline samples.

    Uses SVM to identify support vectors (borderline samples) and generates
    synthetic samples only from those.

    Parameters
    ----------
    sampling_strategy : float, str, dict, or callable, default='auto'
        See SMOTEResampler for details.
    k_neighbors : int, default=5
        Number of nearest neighbors for SMOTE.
    m_neighbors : int, default=10
        Number of nearest neighbors for borderline detection.
    svm_estimator : estimator or None, default=None
        SVM classifier. If None, uses SVC with default parameters.
    out_step : float, default=0.5
        Step size for generating samples outside the decision boundary.
    random_state : int or None, default=None
        Random seed for reproducibility.

    Attributes
    ----------
    sampler_ : imblearn.over_sampling.SVMSMOTE
        The underlying sampler created by ``fit``.
    sampling_strategy_ : dict
        Sampling strategy copied from the underlying sampler after fitting.

    Notes
    -----
    This wrapper does not take an ``n_jobs`` argument; only the parameters
    listed above are forwarded to the underlying sampler.
    """

    def __init__(
        self,
        sampling_strategy: str | float | dict = 'auto',
        k_neighbors: int = 5,
        m_neighbors: int = 10,
        svm_estimator: Any = None,
        out_step: float = 0.5,
        random_state: int | None = None,
    ):
        self.sampling_strategy = sampling_strategy
        self.k_neighbors = k_neighbors
        self.m_neighbors = m_neighbors
        self.svm_estimator = svm_estimator
        self.out_step = out_step
        self.random_state = random_state

    def fit(self, X: ArrayLike, y: ArrayLike) -> SVMSMOTEResampler:
        """Fit the SVM-SMOTE sampler.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : SVMSMOTEResampler
            Fitted sampler.

        Raises
        ------
        ImportError
            If imbalanced-learn is not installed.
        """
        _check_imblearn()
        # Imported lazily so the module itself loads without imbalanced-learn.
        from imblearn.over_sampling import SVMSMOTE

        X, y = check_X_y(X, y)
        self.sampler_ = SVMSMOTE(
            sampling_strategy=self.sampling_strategy,
            k_neighbors=self.k_neighbors,
            m_neighbors=self.m_neighbors,
            svm_estimator=self.svm_estimator,
            out_step=self.out_step,
            random_state=self.random_state,
        )
        self.sampler_.fit(X, y)
        self.sampling_strategy_ = self.sampler_.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Fit and resample the dataset.

        The sampler is rebuilt and refitted on every call.

        Returns
        -------
        X_resampled, y_resampled
            Output of the underlying sampler's ``fit_resample``.
        """
        self.fit(X, y)
        return self.sampler_.fit_resample(X, y)
class KMeansSMOTEResampler(BaseEstimator):
    """Cluster-based oversampling via K-Means SMOTE.

    K-means clustering is applied before SMOTE so that synthetic samples are
    generated in under-represented clusters.

    Parameters
    ----------
    sampling_strategy : float, str, dict, or callable, default='auto'
        See SMOTEResampler for details.
    k_neighbors : int, default=2
        Number of nearest neighbors for SMOTE.
    kmeans_estimator : estimator or int, default=None
        KMeans instance or number of clusters.
    cluster_balance_threshold : float, default=0.1
        Threshold for considering clusters as imbalanced.
    density_exponent : float or 'auto', default='auto'
        Exponent for density-based sample allocation.
    random_state : int or None, default=None
        Random seed for reproducibility.
    n_jobs : int, default=-1
        Number of parallel jobs.
    """

    def __init__(
        self,
        sampling_strategy: str | float | dict = 'auto',
        k_neighbors: int = 2,
        kmeans_estimator: Any = None,
        cluster_balance_threshold: float = 0.1,
        density_exponent: float | str = 'auto',
        random_state: int | None = None,
        n_jobs: int = -1,
    ):
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.density_exponent = density_exponent
        self.cluster_balance_threshold = cluster_balance_threshold
        self.kmeans_estimator = kmeans_estimator
        self.k_neighbors = k_neighbors
        self.sampling_strategy = sampling_strategy

    def fit(self, X: ArrayLike, y: ArrayLike) -> KMeansSMOTEResampler:
        """Validate ``X``/``y`` and fit a fresh ``KMeansSMOTE`` on them."""
        _check_imblearn()
        from imblearn.over_sampling import KMeansSMOTE

        X, y = check_X_y(X, y)
        options = {
            'sampling_strategy': self.sampling_strategy,
            'k_neighbors': self.k_neighbors,
            'kmeans_estimator': self.kmeans_estimator,
            'cluster_balance_threshold': self.cluster_balance_threshold,
            'density_exponent': self.density_exponent,
            'random_state': self.random_state,
            'n_jobs': self.n_jobs,
        }
        self.sampler_ = KMeansSMOTE(**options)
        backend = self.sampler_
        backend.fit(X, y)
        self.sampling_strategy_ = backend.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Refit on ``X``/``y`` and return the resampled data."""
        self.fit(X, y)
        backend = self.sampler_
        return backend.fit_resample(X, y)
class RandomOverSampler(BaseEstimator):
    """Random over-sampling wrapper that duplicates minority samples.

    Random minority class samples are duplicated. Fast, but may lead to
    overfitting.

    Parameters
    ----------
    sampling_strategy : float, str, dict, or callable, default='auto'
        See SMOTEResampler for details.
    random_state : int or None, default=None
        Random seed for reproducibility.
    shrinkage : float or dict, default=None
        If not None, apply smoothed bootstrap with this shrinkage factor.
    """

    def __init__(
        self,
        sampling_strategy: str | float | dict = 'auto',
        random_state: int | None = None,
        shrinkage: float | dict | None = None,
    ):
        self.shrinkage = shrinkage
        self.random_state = random_state
        self.sampling_strategy = sampling_strategy

    def fit(self, X: ArrayLike, y: ArrayLike) -> RandomOverSampler:
        """Validate ``X``/``y`` and fit a fresh imblearn random over-sampler."""
        _check_imblearn()
        from imblearn.over_sampling import RandomOverSampler as _ROS

        X, y = check_X_y(X, y)
        options = {
            'sampling_strategy': self.sampling_strategy,
            'random_state': self.random_state,
            'shrinkage': self.shrinkage,
        }
        self.sampler_ = _ROS(**options)
        backend = self.sampler_
        backend.fit(X, y)
        self.sampling_strategy_ = backend.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Refit on ``X``/``y`` and return the resampled data."""
        self.fit(X, y)
        backend = self.sampler_
        return backend.fit_resample(X, y)

# =============================================================================
# Under-sampling Methods
# =============================================================================

class EditedNearestNeighbours(BaseEstimator):
    """Edited Nearest Neighbours (ENN) under-sampling wrapper.

    Noise removal: drops samples whose class label differs from the majority
    of their k-nearest neighbors.

    Parameters
    ----------
    sampling_strategy : str, list, or callable, default='auto'
        Classes to be under-sampled.
    n_neighbors : int, default=3
        Number of nearest neighbors for majority voting.
    kind_sel : {'all', 'mode'}, default='all'
        - 'all': Sample removed if any neighbor is from different class
        - 'mode': Sample removed if majority of neighbors are different
    n_jobs : int, default=-1
        Number of parallel jobs.
    """

    def __init__(
        self,
        sampling_strategy: str | list = 'auto',
        n_neighbors: int = 3,
        kind_sel: Literal['all', 'mode'] = 'all',
        n_jobs: int = -1,
    ):
        self.n_jobs = n_jobs
        self.kind_sel = kind_sel
        self.n_neighbors = n_neighbors
        self.sampling_strategy = sampling_strategy

    def fit(self, X: ArrayLike, y: ArrayLike) -> EditedNearestNeighbours:
        """Validate ``X``/``y`` and fit a fresh imblearn ENN sampler."""
        _check_imblearn()
        from imblearn.under_sampling import EditedNearestNeighbours as _ENN

        X, y = check_X_y(X, y)
        options = {
            'sampling_strategy': self.sampling_strategy,
            'n_neighbors': self.n_neighbors,
            'kind_sel': self.kind_sel,
            'n_jobs': self.n_jobs,
        }
        self.sampler_ = _ENN(**options)
        backend = self.sampler_
        backend.fit(X, y)
        self.sampling_strategy_ = backend.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Refit on ``X``/``y`` and return the resampled data."""
        self.fit(X, y)
        backend = self.sampler_
        return backend.fit_resample(X, y)
class AllKNNUnderSampler(BaseEstimator):
    """AllKNN under-sampling wrapper (multiple passes of ENN).

    ENN is applied repeatedly with increasing k values until no more samples
    are removed.

    Parameters
    ----------
    sampling_strategy : str, list, or callable, default='auto'
        Classes to be under-sampled.
    n_neighbors : int, default=3
        Starting number of nearest neighbors.
    kind_sel : {'all', 'mode'}, default='all'
        Selection strategy (see EditedNearestNeighbours).
    allow_minority : bool, default=False
        If True, allow removal of minority samples.
    n_jobs : int, default=-1
        Number of parallel jobs.
    """

    def __init__(
        self,
        sampling_strategy: str | list = 'auto',
        n_neighbors: int = 3,
        kind_sel: Literal['all', 'mode'] = 'all',
        allow_minority: bool = False,
        n_jobs: int = -1,
    ):
        self.n_jobs = n_jobs
        self.allow_minority = allow_minority
        self.kind_sel = kind_sel
        self.n_neighbors = n_neighbors
        self.sampling_strategy = sampling_strategy

    def fit(self, X: ArrayLike, y: ArrayLike) -> AllKNNUnderSampler:
        """Validate ``X``/``y`` and fit a fresh ``AllKNN`` sampler."""
        _check_imblearn()
        from imblearn.under_sampling import AllKNN

        X, y = check_X_y(X, y)
        options = {
            'sampling_strategy': self.sampling_strategy,
            'n_neighbors': self.n_neighbors,
            'kind_sel': self.kind_sel,
            'allow_minority': self.allow_minority,
            'n_jobs': self.n_jobs,
        }
        self.sampler_ = AllKNN(**options)
        backend = self.sampler_
        backend.fit(X, y)
        self.sampling_strategy_ = backend.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Refit on ``X``/``y`` and return the resampled data."""
        self.fit(X, y)
        backend = self.sampler_
        return backend.fit_resample(X, y)
class TomekLinksUnderSampler(BaseEstimator):
    """Tomek Links under-sampling wrapper.

    Removes Tomek links - pairs of instances from different classes that are
    each other's nearest neighbor - to clean the decision boundary.

    Parameters
    ----------
    sampling_strategy : str, list, or callable, default='auto'
        Classes to be under-sampled.
    n_jobs : int, default=-1
        Number of parallel jobs.
    """

    def __init__(
        self,
        sampling_strategy: str | list = 'auto',
        n_jobs: int = -1,
    ):
        self.n_jobs = n_jobs
        self.sampling_strategy = sampling_strategy

    def fit(self, X: ArrayLike, y: ArrayLike) -> TomekLinksUnderSampler:
        """Validate ``X``/``y`` and fit a fresh ``TomekLinks`` sampler."""
        _check_imblearn()
        from imblearn.under_sampling import TomekLinks

        X, y = check_X_y(X, y)
        options = {
            'sampling_strategy': self.sampling_strategy,
            'n_jobs': self.n_jobs,
        }
        self.sampler_ = TomekLinks(**options)
        backend = self.sampler_
        backend.fit(X, y)
        self.sampling_strategy_ = backend.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Refit on ``X``/``y`` and return the resampled data."""
        self.fit(X, y)
        backend = self.sampler_
        return backend.fit_resample(X, y)
class RandomUnderSampler(BaseEstimator):
    """Random under-sampling wrapper that removes random majority samples.

    Majority class samples are removed at random. Fast, but may lose
    important information.

    Parameters
    ----------
    sampling_strategy : float, str, dict, or callable, default='auto'
        Sampling information.
    random_state : int or None, default=None
        Random seed for reproducibility.
    replacement : bool, default=False
        Whether to sample with replacement.
    """

    def __init__(
        self,
        sampling_strategy: str | float | dict = 'auto',
        random_state: int | None = None,
        replacement: bool = False,
    ):
        self.replacement = replacement
        self.random_state = random_state
        self.sampling_strategy = sampling_strategy

    def fit(self, X: ArrayLike, y: ArrayLike) -> RandomUnderSampler:
        """Validate ``X``/``y`` and fit a fresh imblearn random under-sampler."""
        _check_imblearn()
        from imblearn.under_sampling import RandomUnderSampler as _RUS

        X, y = check_X_y(X, y)
        options = {
            'sampling_strategy': self.sampling_strategy,
            'random_state': self.random_state,
            'replacement': self.replacement,
        }
        self.sampler_ = _RUS(**options)
        backend = self.sampler_
        backend.fit(X, y)
        self.sampling_strategy_ = backend.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Refit on ``X``/``y`` and return the resampled data."""
        self.fit(X, y)
        backend = self.sampler_
        return backend.fit_resample(X, y)
class NearMissUnderSampler(BaseEstimator):
    """NearMiss under-sampling wrapper using nearest neighbor heuristics.

    Majority samples are selected based on their distance to minority
    samples.

    Parameters
    ----------
    sampling_strategy : float, str, dict, or callable, default='auto'
        Sampling information.
    version : {1, 2, 3}, default=1
        Version of NearMiss algorithm:
        - 1: Select majority samples with smallest average distance to k nearest minority
        - 2: Select majority samples with smallest average distance to k farthest minority
        - 3: Select majority samples with smallest distance to each minority sample
    n_neighbors : int, default=3
        Number of nearest neighbors.
    n_neighbors_ver3 : int, default=3
        Number of neighbors for version 3.
    n_jobs : int, default=-1
        Number of parallel jobs.
    """

    def __init__(
        self,
        sampling_strategy: str | float | dict = 'auto',
        version: Literal[1, 2, 3] = 1,
        n_neighbors: int = 3,
        n_neighbors_ver3: int = 3,
        n_jobs: int = -1,
    ):
        self.n_jobs = n_jobs
        self.n_neighbors_ver3 = n_neighbors_ver3
        self.n_neighbors = n_neighbors
        self.version = version
        self.sampling_strategy = sampling_strategy

    def fit(self, X: ArrayLike, y: ArrayLike) -> NearMissUnderSampler:
        """Validate ``X``/``y`` and fit a fresh ``NearMiss`` sampler."""
        _check_imblearn()
        from imblearn.under_sampling import NearMiss

        X, y = check_X_y(X, y)
        options = {
            'sampling_strategy': self.sampling_strategy,
            'version': self.version,
            'n_neighbors': self.n_neighbors,
            'n_neighbors_ver3': self.n_neighbors_ver3,
            'n_jobs': self.n_jobs,
        }
        self.sampler_ = NearMiss(**options)
        backend = self.sampler_
        backend.fit(X, y)
        self.sampling_strategy_ = backend.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Refit on ``X``/``y`` and return the resampled data."""
        self.fit(X, y)
        backend = self.sampler_
        return backend.fit_resample(X, y)
class CondensedNearestNeighbour(BaseEstimator):
    """Condensed Nearest Neighbour (CNN) under-sampling wrapper.

    Iteratively selects samples that are misclassified by 1-NN on the current
    condensed set, looking for a minimal consistent subset.

    Parameters
    ----------
    sampling_strategy : str, list, or callable, default='auto'
        Classes to be under-sampled.
    random_state : int or None, default=None
        Random seed for reproducibility.
    n_neighbors : int, default=1
        Number of nearest neighbors.
    n_seeds_S : int, default=1
        Number of samples to start the condensing.
    n_jobs : int, default=-1
        Number of parallel jobs.
    """

    def __init__(
        self,
        sampling_strategy: str | list = 'auto',
        random_state: int | None = None,
        n_neighbors: int = 1,
        n_seeds_S: int = 1,
        n_jobs: int = -1,
    ):
        self.n_jobs = n_jobs
        self.n_seeds_S = n_seeds_S
        self.n_neighbors = n_neighbors
        self.random_state = random_state
        self.sampling_strategy = sampling_strategy

    def fit(self, X: ArrayLike, y: ArrayLike) -> CondensedNearestNeighbour:
        """Validate ``X``/``y`` and fit a fresh imblearn CNN sampler."""
        _check_imblearn()
        from imblearn.under_sampling import CondensedNearestNeighbour as _CNN

        X, y = check_X_y(X, y)
        options = {
            'sampling_strategy': self.sampling_strategy,
            'random_state': self.random_state,
            'n_neighbors': self.n_neighbors,
            'n_seeds_S': self.n_seeds_S,
            'n_jobs': self.n_jobs,
        }
        self.sampler_ = _CNN(**options)
        backend = self.sampler_
        backend.fit(X, y)
        self.sampling_strategy_ = backend.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Refit on ``X``/``y`` and return the resampled data."""
        self.fit(X, y)
        backend = self.sampler_
        return backend.fit_resample(X, y)
class OneSidedSelectionUnderSampler(BaseEstimator):
    """One-Sided Selection (OSS) under-sampling wrapper.

    Combines Tomek links removal with CNN to remove noisy and redundant
    majority samples.

    Parameters
    ----------
    sampling_strategy : str, list, or callable, default='auto'
        Classes to be under-sampled.
    random_state : int or None, default=None
        Random seed for reproducibility.
    n_neighbors : int, default=1
        Number of nearest neighbors for CNN step.
    n_seeds_S : int, default=1
        Number of samples to start CNN condensing.
    n_jobs : int, default=-1
        Number of parallel jobs.
    """

    def __init__(
        self,
        sampling_strategy: str | list = 'auto',
        random_state: int | None = None,
        n_neighbors: int = 1,
        n_seeds_S: int = 1,
        n_jobs: int = -1,
    ):
        self.n_jobs = n_jobs
        self.n_seeds_S = n_seeds_S
        self.n_neighbors = n_neighbors
        self.random_state = random_state
        self.sampling_strategy = sampling_strategy

    def fit(self, X: ArrayLike, y: ArrayLike) -> OneSidedSelectionUnderSampler:
        """Validate ``X``/``y`` and fit a fresh ``OneSidedSelection`` sampler."""
        _check_imblearn()
        from imblearn.under_sampling import OneSidedSelection

        X, y = check_X_y(X, y)
        options = {
            'sampling_strategy': self.sampling_strategy,
            'random_state': self.random_state,
            'n_neighbors': self.n_neighbors,
            'n_seeds_S': self.n_seeds_S,
            'n_jobs': self.n_jobs,
        }
        self.sampler_ = OneSidedSelection(**options)
        backend = self.sampler_
        backend.fit(X, y)
        self.sampling_strategy_ = backend.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Refit on ``X``/``y`` and return the resampled data."""
        self.fit(X, y)
        backend = self.sampler_
        return backend.fit_resample(X, y)
class NeighbourhoodCleaningRule(BaseEstimator):
    """Neighbourhood Cleaning Rule (NCR) under-sampling wrapper.

    Uses ENN to clean the data and then removes majority samples whose
    nearest neighbors are mostly minority.

    Parameters
    ----------
    sampling_strategy : str, list, or callable, default='auto'
        Classes to be under-sampled.
    n_neighbors : int, default=3
        Number of nearest neighbors.
    threshold_cleaning : float, default=0.5
        Threshold for cleaning majority samples.
    n_jobs : int, default=None
        Number of parallel jobs.
    """

    def __init__(
        self,
        sampling_strategy: str | list = 'auto',
        n_neighbors: int = 3,
        threshold_cleaning: float = 0.5,
        n_jobs: int | None = None,
    ):
        self.n_jobs = n_jobs
        self.threshold_cleaning = threshold_cleaning
        self.n_neighbors = n_neighbors
        self.sampling_strategy = sampling_strategy

    def fit(self, X: ArrayLike, y: ArrayLike) -> NeighbourhoodCleaningRule:
        """Validate ``X``/``y`` and fit a fresh imblearn NCR sampler."""
        _check_imblearn()
        from imblearn.under_sampling import NeighbourhoodCleaningRule as _NCR

        X, y = check_X_y(X, y)
        options = {
            'sampling_strategy': self.sampling_strategy,
            'n_neighbors': self.n_neighbors,
            'threshold_cleaning': self.threshold_cleaning,
            'n_jobs': self.n_jobs,
        }
        self.sampler_ = _NCR(**options)
        backend = self.sampler_
        backend.fit(X, y)
        self.sampling_strategy_ = backend.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Refit on ``X``/``y`` and return the resampled data."""
        self.fit(X, y)
        backend = self.sampler_
        return backend.fit_resample(X, y)
class InstanceHardnessThresholdSampler(BaseEstimator):
    """Instance Hardness Threshold (IHT) under-sampling wrapper.

    Removes samples that are hard to classify based on a classifier's
    predicted probabilities.

    Parameters
    ----------
    sampling_strategy : str, list, or callable, default='auto'
        Classes to be under-sampled.
    estimator : estimator or None, default=None
        Classifier for computing instance hardness. If None, uses
        RandomForestClassifier.
    cv : int, default=5
        Number of cross-validation folds.
    random_state : int or None, default=None
        Random seed for reproducibility.
    n_jobs : int, default=-1
        Number of parallel jobs.
    """

    def __init__(
        self,
        sampling_strategy: str | list = 'auto',
        estimator: Any = None,
        cv: int = 5,
        random_state: int | None = None,
        n_jobs: int = -1,
    ):
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.cv = cv
        self.estimator = estimator
        self.sampling_strategy = sampling_strategy

    def fit(self, X: ArrayLike, y: ArrayLike) -> InstanceHardnessThresholdSampler:
        """Validate ``X``/``y`` and fit a fresh ``InstanceHardnessThreshold``."""
        _check_imblearn()
        from imblearn.under_sampling import InstanceHardnessThreshold

        X, y = check_X_y(X, y)
        options = {
            'sampling_strategy': self.sampling_strategy,
            'estimator': self.estimator,
            'cv': self.cv,
            'random_state': self.random_state,
            'n_jobs': self.n_jobs,
        }
        self.sampler_ = InstanceHardnessThreshold(**options)
        backend = self.sampler_
        backend.fit(X, y)
        self.sampling_strategy_ = backend.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Refit on ``X``/``y`` and return the resampled data."""
        self.fit(X, y)
        backend = self.sampler_
        return backend.fit_resample(X, y)
class ClusterCentroidsUnderSampler(BaseEstimator):
    """Cluster Centroids under-sampling.

    Replaces majority samples with cluster centroids from k-means.

    Parameters
    ----------
    sampling_strategy : float, str, dict, or callable, default='auto'
        Sampling information.
    random_state : int or None, default=None
        Random seed for reproducibility.
    estimator : estimator or None, default=None
        Clustering estimator. If None, uses KMeans.
    voting : {'hard', 'soft', 'auto'}, default='auto'
        Voting strategy for cluster assignment. The value is forwarded
        unchanged to the underlying sampler.

    Attributes
    ----------
    sampler_ : imblearn.under_sampling.ClusterCentroids
        The underlying sampler created by ``fit``.
    sampling_strategy_ : dict
        Sampling strategy copied from the underlying sampler after fitting.
    """

    def __init__(
        self,
        sampling_strategy: str | float | dict = 'auto',
        random_state: int | None = None,
        estimator: Any = None,
        voting: Literal['hard', 'soft', 'auto'] = 'auto',
    ):
        self.sampling_strategy = sampling_strategy
        self.random_state = random_state
        self.estimator = estimator
        self.voting = voting

    def fit(self, X: ArrayLike, y: ArrayLike) -> ClusterCentroidsUnderSampler:
        """Fit the Cluster Centroids sampler.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : ClusterCentroidsUnderSampler
            Fitted sampler.

        Raises
        ------
        ImportError
            If imbalanced-learn is not installed.
        """
        _check_imblearn()
        # Imported lazily so the module itself loads without imbalanced-learn.
        from imblearn.under_sampling import ClusterCentroids

        X, y = check_X_y(X, y)
        self.sampler_ = ClusterCentroids(
            sampling_strategy=self.sampling_strategy,
            random_state=self.random_state,
            estimator=self.estimator,
            voting=self.voting,
        )
        self.sampler_.fit(X, y)
        self.sampling_strategy_ = self.sampler_.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Fit and resample the dataset.

        The sampler is rebuilt and refitted on every call.

        Returns
        -------
        X_resampled, y_resampled
            Output of the underlying sampler's ``fit_resample``.
        """
        self.fit(X, y)
        return self.sampler_.fit_resample(X, y)

# =============================================================================
# Combined Methods (Over + Under)
# =============================================================================

class SMOTEENNResampler(BaseEstimator):
    """SMOTE + Edited Nearest Neighbours combined resampling.

    Applies SMOTE over-sampling followed by ENN cleaning to remove noisy
    synthetic samples.

    Parameters
    ----------
    sampling_strategy : float, str, dict, or callable, default='auto'
        Sampling strategy for SMOTE.
    smote : SMOTEResampler, imblearn SMOTE, dict, or None, default=None
        SMOTE configuration. A dict is used as keyword arguments for
        ``imblearn.over_sampling.SMOTE``; a ``SMOTEResampler`` contributes
        its constructor parameters; any other value is forwarded unchanged.
    enn : EditedNearestNeighbours, imblearn ENN, dict, or None, default=None
        ENN configuration, interpreted the same way as ``smote`` but for
        ``imblearn.under_sampling.EditedNearestNeighbours``.
    random_state : int or None, default=None
        Random seed for reproducibility.
    n_jobs : int, default=-1
        Number of parallel jobs.

    Attributes
    ----------
    sampler_ : imblearn.combine.SMOTEENN
        The underlying sampler created by ``fit``.
    sampling_strategy_ : dict
        Sampling strategy copied from the underlying sampler after fitting.
    """

    def __init__(
        self,
        sampling_strategy: str | float | dict = 'auto',
        smote: Any = None,
        enn: Any = None,
        random_state: int | None = None,
        n_jobs: int = -1,
    ):
        self.sampling_strategy = sampling_strategy
        self.smote = smote
        self.enn = enn
        self.random_state = random_state
        self.n_jobs = n_jobs

    @staticmethod
    def _to_imblearn(component: Any, wrapper_cls: type, imblearn_cls: type) -> Any:
        """Turn a dict or a wrapper from this module into an imblearn sampler."""
        if isinstance(component, dict):
            return imblearn_cls(**component)
        if isinstance(component, wrapper_cls):
            # Wrapper constructor parameters mirror the imblearn keyword names.
            return imblearn_cls(**component.get_params(deep=False))
        # None, an imblearn sampler, or anything else: leave validation to
        # the underlying combined sampler.
        return component

    def fit(self, X: ArrayLike, y: ArrayLike) -> SMOTEENNResampler:
        """Fit the SMOTE-ENN sampler.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : SMOTEENNResampler
            Fitted sampler.

        Raises
        ------
        ImportError
            If imbalanced-learn is not installed.
        """
        _check_imblearn()
        # Imported lazily so the module itself loads without imbalanced-learn.
        from imblearn.combine import SMOTEENN
        from imblearn.over_sampling import SMOTE
        from imblearn.under_sampling import EditedNearestNeighbours as _ENN

        X, y = check_X_y(X, y)
        # ``self.smote`` / ``self.enn`` are never mutated, so get_params and
        # cloning keep seeing the values passed to the constructor.
        self.sampler_ = SMOTEENN(
            sampling_strategy=self.sampling_strategy,
            smote=self._to_imblearn(self.smote, SMOTEResampler, SMOTE),
            enn=self._to_imblearn(self.enn, EditedNearestNeighbours, _ENN),
            random_state=self.random_state,
            n_jobs=self.n_jobs,
        )
        self.sampler_.fit(X, y)
        self.sampling_strategy_ = self.sampler_.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Fit and resample the dataset.

        The sampler is rebuilt and refitted on every call.

        Returns
        -------
        X_resampled, y_resampled
            Output of the underlying sampler's ``fit_resample``.
        """
        self.fit(X, y)
        return self.sampler_.fit_resample(X, y)
class SMOTETomekResampler(BaseEstimator):
    """SMOTE + Tomek Links combined resampling.

    Applies SMOTE over-sampling followed by Tomek links removal to clean the
    decision boundary.

    Parameters
    ----------
    sampling_strategy : float, str, dict, or callable, default='auto'
        Sampling strategy for SMOTE.
    smote : SMOTEResampler, imblearn SMOTE, dict, or None, default=None
        SMOTE configuration. A dict is used as keyword arguments for
        ``imblearn.over_sampling.SMOTE``; a ``SMOTEResampler`` contributes
        its constructor parameters; any other value is forwarded unchanged.
    tomek : TomekLinksUnderSampler, imblearn TomekLinks, dict, or None, default=None
        Tomek Links configuration, interpreted the same way as ``smote`` but
        for ``imblearn.under_sampling.TomekLinks``.
    random_state : int or None, default=None
        Random seed for reproducibility.
    n_jobs : int, default=-1
        Number of parallel jobs.

    Attributes
    ----------
    sampler_ : imblearn.combine.SMOTETomek
        The underlying sampler created by ``fit``.
    sampling_strategy_ : dict
        Sampling strategy copied from the underlying sampler after fitting.
    """

    def __init__(
        self,
        sampling_strategy: str | float | dict = 'auto',
        smote: Any = None,
        tomek: Any = None,
        random_state: int | None = None,
        n_jobs: int = -1,
    ):
        self.sampling_strategy = sampling_strategy
        self.smote = smote
        self.tomek = tomek
        self.random_state = random_state
        self.n_jobs = n_jobs

    @staticmethod
    def _to_imblearn(component: Any, wrapper_cls: type, imblearn_cls: type) -> Any:
        """Turn a dict or a wrapper from this module into an imblearn sampler."""
        if isinstance(component, dict):
            return imblearn_cls(**component)
        if isinstance(component, wrapper_cls):
            # Wrapper constructor parameters mirror the imblearn keyword names.
            return imblearn_cls(**component.get_params(deep=False))
        # None, an imblearn sampler, or anything else: leave validation to
        # the underlying combined sampler.
        return component

    def fit(self, X: ArrayLike, y: ArrayLike) -> SMOTETomekResampler:
        """Fit the SMOTE-Tomek sampler.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : SMOTETomekResampler
            Fitted sampler.

        Raises
        ------
        ImportError
            If imbalanced-learn is not installed.
        """
        _check_imblearn()
        # Imported lazily so the module itself loads without imbalanced-learn.
        from imblearn.combine import SMOTETomek
        from imblearn.over_sampling import SMOTE
        from imblearn.under_sampling import TomekLinks

        X, y = check_X_y(X, y)
        # ``self.smote`` / ``self.tomek`` are never mutated, so get_params and
        # cloning keep seeing the values passed to the constructor.
        self.sampler_ = SMOTETomek(
            sampling_strategy=self.sampling_strategy,
            smote=self._to_imblearn(self.smote, SMOTEResampler, SMOTE),
            tomek=self._to_imblearn(self.tomek, TomekLinksUnderSampler, TomekLinks),
            random_state=self.random_state,
            n_jobs=self.n_jobs,
        )
        self.sampler_.fit(X, y)
        self.sampling_strategy_ = self.sampler_.sampling_strategy_
        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Fit and resample the dataset.

        The sampler is rebuilt and refitted on every call.

        Returns
        -------
        X_resampled, y_resampled
            Output of the underlying sampler's ``fit_resample``.
        """
        self.fit(X, y)
        return self.sampler_.fit_resample(X, y)
# ============================================================================= # Auto-Balancer with Strategy Selection # ============================================================================= # Available algorithms for easy access OVER_SAMPLERS = { 'smote': SMOTEResampler, 'borderline_smote': BorderlineSMOTEResampler, 'adasyn': ADASYNResampler, 'svm_smote': SVMSMOTEResampler, 'kmeans_smote': KMeansSMOTEResampler, 'random_over': RandomOverSampler, } UNDER_SAMPLERS = { 'enn': EditedNearestNeighbours, 'allknn': AllKNNUnderSampler, 'tomek': TomekLinksUnderSampler, 'random_under': RandomUnderSampler, 'nearmiss': NearMissUnderSampler, 'cnn': CondensedNearestNeighbour, 'oss': OneSidedSelectionUnderSampler, 'ncr': NeighbourhoodCleaningRule, 'iht': InstanceHardnessThresholdSampler, 'cluster_centroids': ClusterCentroidsUnderSampler, } COMBINED_SAMPLERS = { 'smoteenn': SMOTEENNResampler, 'smotetomek': SMOTETomekResampler, } # Modern algorithm categories (populated by submodules) GEOMETRIC_SAMPLERS: dict[str, type] = {} GENERATIVE_SAMPLERS: dict[str, type] = {} LLM_SAMPLERS: dict[str, type] = {} ALL_SAMPLERS = {**OVER_SAMPLERS, **UNDER_SAMPLERS, **COMBINED_SAMPLERS}
class AutoBalancer(BaseEstimator):
    """Automatic class balancing with strategy selection.

    Selects a resampling strategy from the imbalance ratio and the number of
    samples (or uses the one requested explicitly), instantiates the matching
    sampler and delegates to it.

    Parameters
    ----------
    strategy : str, default='auto'
        Balancing strategy:
        - 'auto': Automatically select based on imbalance ratio
        - 'oversample': Alias for 'smote'
        - 'undersample': Alias for 'enn'
        - 'combine': Alias for 'smoteenn'
        - 'geometric': Alias for 'multivariate_gaussian_smote'
        - 'generative': Alias for 'forest_flow'
        - 'none': Skip resampling entirely
        - Any key from ALL_SAMPLERS, GEOMETRIC_SAMPLERS, GENERATIVE_SAMPLERS
          or LLM_SAMPLERS (e.g., 'smote', 'borderline_smote', etc.)
    sampling_strategy : float, str, dict, or callable, default='auto'
        Target class distribution, forwarded to the selected sampler when its
        ``__init__`` accepts a ``sampling_strategy`` argument.
    imbalance_threshold : float, default=0.5
        Ratio at or above which data is considered balanced (auto mode).
    severe_imbalance_threshold : float, default=0.1
        Ratio below which imbalance is considered severe (auto mode).
    include_generative : bool, default=False
        Stored as a parameter. The auto-selection rules implemented in
        ``_select_strategy`` do not currently consult it.
    random_state : int or None, default=None
        Random seed, forwarded when the selected sampler accepts it.
    n_jobs : int, default=-1
        Number of parallel jobs, forwarded when the selected sampler accepts it.
    **kwargs : dict
        Additional parameters for the selected sampler. They override the
        values above on a name clash; names the sampler's ``__init__`` does
        not declare are silently dropped.

    Attributes
    ----------
    sampler_ : BaseEstimator or None
        The fitted sampler, or None when no resampling is applied.
    imbalance_ratio_ : float
        Computed imbalance ratio (minority / majority). Set on every fit.
    selected_strategy_ : str
        The strategy that was selected.

    Examples
    --------
    >>> from endgame.preprocessing import AutoBalancer
    >>> balancer = AutoBalancer(strategy='auto', random_state=42)
    >>> X_balanced, y_balanced = balancer.fit_resample(X, y)
    >>> print(f"Selected: {balancer.selected_strategy_}")
    """

    # Convenience names accepted by ``strategy`` and the registry key each
    # one resolves to.
    _STRATEGY_ALIASES = {
        'oversample': 'smote',
        'undersample': 'enn',
        'combine': 'smoteenn',
        'geometric': 'multivariate_gaussian_smote',
        'generative': 'forest_flow',
    }

    def __init__(
        self,
        strategy: str = 'auto',
        sampling_strategy: str | float | dict = 'auto',
        imbalance_threshold: float = 0.5,
        severe_imbalance_threshold: float = 0.1,
        include_generative: bool = False,
        random_state: int | None = None,
        n_jobs: int = -1,
        **kwargs,
    ):
        self.strategy = strategy
        self.sampling_strategy = sampling_strategy
        self.imbalance_threshold = imbalance_threshold
        self.severe_imbalance_threshold = severe_imbalance_threshold
        self.include_generative = include_generative
        self.random_state = random_state
        self.n_jobs = n_jobs
        self.kwargs = kwargs

    def _compute_imbalance_ratio(self, y: np.ndarray) -> float:
        """Compute imbalance ratio (minority / majority).

        Returns 1.0 when fewer than two distinct classes are present.
        """
        unique, counts = np.unique(y, return_counts=True)
        if len(unique) < 2:
            return 1.0
        # Convert to a built-in float so the fitted attribute is a plain
        # Python number rather than a NumPy scalar.
        return float(counts.min() / counts.max())

    def _select_strategy(self, X: np.ndarray, y: np.ndarray) -> str:
        """Auto-select a strategy name from the data characteristics.

        Also stores the computed ratio in ``imbalance_ratio_``. Rules, in
        order: balanced data -> 'none'; severe imbalance -> 'smoteenn';
        fewer than 1000 samples -> 'borderline_smote'; more than 10000
        samples -> 'random_under'; otherwise 'smote'.
        """
        self.imbalance_ratio_ = self._compute_imbalance_ratio(y)
        n_samples = X.shape[0]

        # If data is balanced, no resampling needed
        if self.imbalance_ratio_ >= self.imbalance_threshold:
            return 'none'

        # Severe imbalance: use combined approach
        if self.imbalance_ratio_ < self.severe_imbalance_threshold:
            return 'smoteenn'

        # Moderate imbalance with small dataset: oversample
        if n_samples < 1000:
            return 'borderline_smote'

        # Large dataset: undersample to save computation
        if n_samples > 10000:
            return 'random_under'

        # Default: SMOTE
        return 'smote'

    @staticmethod
    def _available_samplers() -> dict[str, type]:
        """Return every registered sampler, read at call time.

        ``ALL_SAMPLERS`` is assembled once when the module is imported, so
        entries that submodules add later to the geometric / generative / LLM
        registries would otherwise be invisible. Entries already present in
        ``ALL_SAMPLERS`` take precedence on a name clash.
        """
        registry = dict(ALL_SAMPLERS)
        for extra in (GEOMETRIC_SAMPLERS, GENERATIVE_SAMPLERS, LLM_SAMPLERS):
            for key, sampler_cls in extra.items():
                registry.setdefault(key, sampler_cls)
        return registry

    def fit(self, X: ArrayLike, y: ArrayLike) -> AutoBalancer:
        """Fit the auto-balancer.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : AutoBalancer
            Fitted balancer.

        Raises
        ------
        ImportError
            If imbalanced-learn is not installed.
        ValueError
            If the resolved strategy is not a registered sampler name.
        """
        _check_imblearn()

        X, y = check_X_y(X, y)

        # Documented fitted attribute: set it for every strategy, not only
        # when the auto-selection path happens to compute it.
        self.imbalance_ratio_ = self._compute_imbalance_ratio(y)

        # Determine strategy
        if self.strategy == 'auto':
            self.selected_strategy_ = self._select_strategy(X, y)
        else:
            self.selected_strategy_ = self._STRATEGY_ALIASES.get(
                self.strategy, self.strategy
            )

        # Handle 'none' strategy
        if self.selected_strategy_ == 'none':
            self.sampler_ = None
            return self

        registry = self._available_samplers()
        if self.selected_strategy_ not in registry:
            raise ValueError(
                f"Unknown strategy '{self.selected_strategy_}'. "
                f"Available: {list(registry)}"
            )
        sampler_cls = registry[self.selected_strategy_]

        import inspect

        # Only pass arguments the sampler's __init__ actually declares, since
        # not every sampler takes random_state / n_jobs / sampling_strategy.
        accepted = set(inspect.signature(sampler_cls.__init__).parameters) - {'self'}

        candidate_params = {
            'sampling_strategy': self.sampling_strategy,
            'random_state': self.random_state,
            'n_jobs': self.n_jobs,
        }
        # User-supplied extras win over the defaults above.
        candidate_params.update(self.kwargs)

        sampler_params = {
            name: value
            for name, value in candidate_params.items()
            if name in accepted
        }

        self.sampler_ = sampler_cls(**sampler_params)
        self.sampler_.fit(X, y)

        return self

    def fit_resample(self, X: ArrayLike, y: ArrayLike) -> tuple[np.ndarray, np.ndarray]:
        """Fit and resample the dataset.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        X_resampled : ndarray of shape (n_samples_new, n_features)
            Resampled training data.
        y_resampled : ndarray of shape (n_samples_new,)
            Resampled target values.
        """
        self.fit(X, y)

        if self.sampler_ is None:
            # No resampling needed
            return np.asarray(X), np.asarray(y)

        return self.sampler_.fit_resample(X, y)

    def get_sampler(self) -> BaseEstimator | None:
        """Get the underlying sampler.

        Returns
        -------
        sampler : BaseEstimator or None
            The fitted sampler, or None if no resampling was needed.
        """
        check_is_fitted(self, 'sampler_')
        return self.sampler_
def get_imbalance_ratio(y: ArrayLike) -> float:
    """Compute the imbalance ratio of a target array.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    ratio : float
        Imbalance ratio (minority_count / majority_count) as a built-in
        ``float``. Returns 1.0 when fewer than two distinct classes are
        present (including empty input).

    Examples
    --------
    >>> y = [0, 0, 0, 0, 0, 1, 1]
    >>> get_imbalance_ratio(y)
    0.4
    """
    _, counts = np.unique(np.asarray(y), return_counts=True)
    if len(counts) < 2:
        return 1.0
    # Convert the NumPy scalar to a plain float so the result matches the
    # annotation and prints as shown in the example.
    return float(counts.min() / counts.max())
def get_class_distribution(y: ArrayLike) -> dict[Any, int]:
    """Get the class distribution of a target array.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Target values.

    Returns
    -------
    distribution : dict
        Dictionary mapping class labels to counts. Labels and counts are
        converted to built-in Python scalars, so the result compares, prints
        and serialises like an ordinary dict.

    Examples
    --------
    >>> y = [0, 0, 0, 1, 1, 2]
    >>> get_class_distribution(y)
    {0: 3, 1: 2, 2: 1}
    """
    labels, counts = np.unique(np.asarray(y), return_counts=True)
    # ``tolist()`` turns NumPy scalars (np.int64, np.str_, ...) into their
    # Python equivalents; zipping the arrays directly would leak NumPy types.
    return dict(zip(labels.tolist(), counts.tolist()))