Source code for endgame.feature_selection.importance.tree

from __future__ import annotations

"""Tree-based feature importance selection."""

from typing import Literal

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y


class TreeImportanceSelector(TransformerMixin, BaseEstimator):
    """Feature selection based on tree-based importance.

    Uses Gini/entropy importance from tree-based models. Fast but can be
    biased toward high-cardinality features.

    Parameters
    ----------
    estimator : BaseEstimator, optional
        Tree-based model exposing ``feature_importances_``. When omitted, a
        ``LGBMClassifier`` is used if LightGBM can be imported, otherwise a
        ``RandomForestClassifier``. Both defaults are classifiers, so pass
        an explicit estimator for other kinds of targets.
    n_features : int, float, or str, default='mean'
        Number of features to select (ignored when ``threshold`` is given):

        - If int, select exactly that many features (clamped to
          ``[1, n_features_in_]``).
        - If float, select that fraction of the features (at least one,
          at most all of them).
        - If 'mean', select features with importance >= mean.
        - If 'median', select features with importance >= median.
    importance_type : str, default='native'
        Type of importance to use:

        - 'native': Use the model's ``feature_importances_``.
        - 'gain': Gain-based (read from ``estimator.booster_`` if present).
        - 'split': Split count-based (read from ``estimator.booster_`` if
          present).

        Estimators without a ``booster_.feature_importance`` method always
        fall back to ``feature_importances_``.
    threshold : float, optional
        Explicit importance threshold. It is compared against the
        normalized importances (which sum to 1 whenever the raw importances
        have a positive sum) and takes precedence over ``n_features``.
    prefit : bool, default=False
        Whether the estimator is already fitted. Requires ``estimator``.
    random_state : int, optional
        Random seed passed to the default estimator.

    Attributes
    ----------
    estimator_ : BaseEstimator
        The estimator the importances were read from.
    feature_importances_ : ndarray
        Feature importance scores (normalized to sum to 1 when possible).
    selected_features_ : ndarray
        Indices of selected features, in ascending order.
    n_features_ : int
        Number of selected features.
    n_features_in_ : int
        Number of features seen during ``fit``.
    threshold_ : float
        Threshold used for selection. For int/float ``n_features`` this is
        the importance of the least important selected feature; features
        tied at that value may be left out so that the requested count is
        respected.

    Example
    -------
    >>> from endgame.feature_selection import TreeImportanceSelector
    >>> selector = TreeImportanceSelector(n_features=20)
    >>> X_selected = selector.fit_transform(X, y)
    """

    def __init__(
        self,
        estimator: BaseEstimator | None = None,
        n_features: int | float | str = "mean",
        importance_type: Literal["native", "gain", "split"] = "native",
        threshold: float | None = None,
        prefit: bool = False,
        random_state: int | None = None,
    ):
        self.estimator = estimator
        self.n_features = n_features
        self.importance_type = importance_type
        self.threshold = threshold
        self.prefit = prefit
        self.random_state = random_state

    def _get_estimator(self):
        """Return the estimator to read importances from.

        A user-supplied estimator is returned as-is when ``prefit`` is set
        and cloned otherwise. Without one, a default classifier is built.

        Raises
        ------
        ValueError
            If ``prefit=True`` but no estimator was supplied.
        """
        if self.estimator is not None:
            return self.estimator if self.prefit else clone(self.estimator)

        if self.prefit:
            # A freshly built default model can never be "already fitted".
            raise ValueError(
                "prefit=True requires an already fitted `estimator`."
            )

        try:
            from lightgbm import LGBMClassifier
        except ImportError:
            from sklearn.ensemble import RandomForestClassifier

            return RandomForestClassifier(
                n_estimators=100,
                max_depth=5,
                n_jobs=-1,
                random_state=self.random_state,
            )

        return LGBMClassifier(
            n_estimators=100,
            importance_type="gain" if self.importance_type == "gain" else "split",
            verbosity=-1,
            n_jobs=-1,
            random_state=self.random_state,
        )

    def _resolve_n_select(self, n_total: int) -> int | None:
        """Translate ``n_features`` into a feature count.

        Returns ``None`` for the 'mean'/'median' rules, otherwise a count
        clamped to ``[1, n_total]``.
        """
        spec = self.n_features
        if isinstance(spec, str):
            if spec not in ("mean", "median"):
                raise ValueError(
                    "n_features must be an int, a float, 'mean' or 'median'; "
                    f"got {spec!r}."
                )
            return None
        if isinstance(spec, (float, np.floating)):
            count = int(n_total * spec)
        elif isinstance(spec, (int, np.integer)):
            count = int(spec)
        else:
            raise TypeError(
                "n_features must be an int, a float, 'mean' or 'median'; "
                f"got {type(spec).__name__}."
            )
        return min(max(1, count), n_total)

    def _extract_importances(self, estimator) -> np.ndarray:
        """Read importances from ``estimator`` and normalize them to sum to 1."""
        raw = None
        if self.importance_type != "native":
            # LightGBM-style boosters expose typed importances on `booster_`.
            booster = getattr(estimator, "booster_", None)
            if hasattr(booster, "feature_importance"):
                raw = booster.feature_importance(
                    importance_type=self.importance_type
                )
        if raw is None:
            raw = estimator.feature_importances_

        importances = np.asarray(raw, dtype=float)
        total = importances.sum()
        if total > 0:
            importances = importances / total
        return importances

    def fit(self, X, y):
        """Fit the tree importance selector.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : TreeImportanceSelector

        Raises
        ------
        ValueError
            If ``n_features`` is an unknown string, if ``prefit=True``
            without an estimator, or if the estimator reports a number of
            importances different from the number of columns in ``X``.
        TypeError
            If ``n_features`` is neither a number nor a string.
        """
        X, y = check_X_y(X, y)
        n_total = X.shape[1]

        # Validate the selection rule before the (potentially slow) fit.
        n_select = None
        if self.threshold is None:
            n_select = self._resolve_n_select(n_total)

        estimator = self._get_estimator()
        if not self.prefit:
            estimator.fit(X, y)

        importances = self._extract_importances(estimator)
        if importances.shape != (n_total,):
            raise ValueError(
                f"The estimator reports {importances.size} feature importances "
                f"but X has {n_total} features."
            )

        if self.threshold is not None:
            threshold = self.threshold
            mask = importances >= threshold
        elif n_select is None:
            statistic = np.mean if self.n_features == "mean" else np.median
            threshold = statistic(importances)
            mask = importances >= threshold
        else:
            # Take exactly the top-k features. A plain `>= k-th value` test
            # would also pick every feature tied with the k-th one, which for
            # tree models frequently means all zero-importance features.
            top = np.argsort(-importances, kind="stable")[:n_select]
            mask = np.zeros(n_total, dtype=bool)
            mask[top] = True
            threshold = importances[top[-1]]

        # Ensure at least one feature survives (e.g. a too-high threshold).
        if not mask.any():
            mask[np.argmax(importances)] = True

        self.n_features_in_ = n_total
        self.estimator_ = estimator
        self.feature_importances_ = importances
        self.threshold_ = threshold
        self._support_mask = mask
        self.selected_features_ = np.flatnonzero(mask)
        self.n_features_ = len(self.selected_features_)
        return self

    def transform(self, X) -> np.ndarray:
        """Select features.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to transform.

        Returns
        -------
        X_selected : ndarray
            Data with selected features.

        Raises
        ------
        ValueError
            If ``X`` does not have the number of features seen in ``fit``.
        """
        check_is_fitted(self, "selected_features_")
        X = check_array(X)
        if X.shape[1] != self.n_features_in_:
            # Indexing a differently shaped X would silently pick the wrong
            # columns (or fail with an opaque IndexError).
            raise ValueError(
                f"X has {X.shape[1]} features, but TreeImportanceSelector "
                f"was fitted with {self.n_features_in_} features."
            )
        return X[:, self.selected_features_]

    def fit_transform(self, X, y) -> np.ndarray:
        """Fit the selector on ``(X, y)`` and return ``X`` reduced to the selection."""
        return self.fit(X, y).transform(X)

    def get_support(self, indices: bool = False) -> np.ndarray:
        """Get mask or indices of selected features.

        Parameters
        ----------
        indices : bool, default=False
            If True, return the selected column indices instead of a
            boolean mask over all input features.
        """
        check_is_fitted(self, "_support_mask")
        return self.selected_features_ if indices else self._support_mask

    def get_feature_ranking(self) -> np.ndarray:
        """Get feature indices ordered from most to least important."""
        check_is_fitted(self, "feature_importances_")
        return np.argsort(self.feature_importances_)[::-1]