Source code for endgame.preprocessing.target_transform

from __future__ import annotations

"""Target transformation wrappers for regression.

Applies invertible transformations to the target variable during training,
then inverse-transforms predictions at inference time. This can improve
regression performance when the target distribution is skewed, heavy-tailed,
or otherwise non-normal.

Similar to sklearn's TransformedTargetRegressor but with more flexible
transform selection including automatic normality-based selection.

Supported transforms:
- log, log1p, sqrt: Simple monotonic transforms
- box_cox, yeo_johnson: Power transforms (scipy)
- quantile: QuantileTransformer-based normalization
- rank: Rank-based (ordinal) normalization
- auto: Automatically selects the best transform via Shapiro-Wilk normality test
- none: No transformation (passthrough)
"""

from typing import Any

import numpy as np
from sklearn.base import RegressorMixin, clone
from sklearn.preprocessing import QuantileTransformer as _SklearnQuantileTransformer

from endgame.core.base import EndgameEstimator

# Methods that require strictly positive targets
_POSITIVE_METHODS = {"log", "box_cox"}
# Methods that require non-negative targets
_NONNEG_METHODS = {"sqrt", "log1p"}
# All valid method strings
_VALID_METHODS = {
    "auto", "log", "log1p", "sqrt", "box_cox",
    "yeo_johnson", "quantile", "rank", "none",
}


def _check_positive(y: np.ndarray, method: str) -> None:
    """Raise ValueError if y contains non-positive values for a method that requires them."""
    if np.any(y <= 0):
        raise ValueError(
            f"Target contains non-positive values. Method '{method}' requires "
            f"all positive targets. Consider 'log1p', 'yeo_johnson', 'quantile', "
            f"or 'auto' instead."
        )


def _check_nonneg(y: np.ndarray, method: str) -> None:
    """Raise ValueError if y contains negative values for a method that requires non-negative."""
    if np.any(y < 0):
        raise ValueError(
            f"Target contains negative values. Method '{method}' requires "
            f"all non-negative targets. Consider 'yeo_johnson', 'quantile', "
            f"or 'auto' instead."
        )


[docs] class TargetTransformer(EndgameEstimator, RegressorMixin): """Wrapper that applies target transformations for regression. Transforms the target variable y during ``fit``, trains the wrapped regressor on the transformed targets, and inverse-transforms predictions at inference time. Parameters ---------- regressor : estimator Any sklearn-compatible regressor. This is required. method : str, default='auto' Transformation method. One of: - ``'auto'``: Test normality via Shapiro-Wilk; try Box-Cox and Yeo-Johnson and pick whichever produces the most normal transformed y. Falls back to ``'yeo_johnson'`` when Box-Cox is not applicable (non-positive targets). - ``'log'``: Natural log. Requires strictly positive targets. - ``'log1p'``: ``log(1 + y)``. Requires non-negative targets. - ``'sqrt'``: Square root. Requires non-negative targets. - ``'box_cox'``: Box-Cox power transform (scipy). Requires strictly positive targets. - ``'yeo_johnson'``: Yeo-Johnson power transform (scipy). Works with any real-valued targets. - ``'quantile'``: Sklearn QuantileTransformer mapping to normal. - ``'rank'``: Rank-based (ordinal) normalization. - ``'none'``: No transformation (passthrough). random_state : int, optional Random seed for reproducibility (passed to quantile transform and the wrapped regressor if it supports it). verbose : bool, default=False Enable verbose output. Attributes ---------- regressor_ : estimator The fitted regressor (clone of ``regressor``). method_ : str The method actually used (relevant when ``method='auto'``). lambda_ : float or None The fitted lambda parameter for Box-Cox / Yeo-Johnson transforms. qt_ : QuantileTransformer or None Fitted QuantileTransformer instance (for ``method='quantile'``). y_train_sorted_ : ndarray or None Sorted training targets for rank inverse transform. feature_importances_ : ndarray Delegated from the wrapped regressor, if available. Examples -------- >>> from sklearn.ensemble import RandomForestRegressor >>> from endgame.preprocessing import TargetTransformer >>> model = TargetTransformer( ... regressor=RandomForestRegressor(n_estimators=100, random_state=42), ... method='auto', ... ) >>> model.fit(X_train, y_train) >>> preds = model.predict(X_test) """ _estimator_type = "regressor" def __init__( self, regressor: Any = None, method: str = "auto", random_state: int | None = None, verbose: bool = False, ): super().__init__(random_state=random_state, verbose=verbose) if regressor is None: raise TypeError( "TargetTransformer requires a regressor. Pass a sklearn-compatible " "regressor via the 'regressor' parameter." ) self.regressor = regressor self.method = method # ------------------------------------------------------------------ # Fit # ------------------------------------------------------------------
[docs] def fit(self, X, y, **fit_params) -> TargetTransformer: """Fit the wrapped regressor on transformed targets. Parameters ---------- X : array-like of shape (n_samples, n_features) Training features. y : array-like of shape (n_samples,) Training targets. **fit_params : dict Additional parameters forwarded to the wrapped regressor's ``fit`` method (e.g. ``sample_weight``). Returns ------- self Fitted TargetTransformer. """ X, y = self._validate_data(X, y, reset=True) y = y.astype(np.float64) if self.method not in _VALID_METHODS: raise ValueError( f"Unknown method '{self.method}'. Must be one of {sorted(_VALID_METHODS)}." ) # Handle constant target edge case if np.all(y == y[0]): self._log("Target is constant; using 'none' transform.", level="warn") self.method_ = "none" self.lambda_ = None self.qt_ = None self.y_train_sorted_ = None self.regressor_ = clone(self.regressor) self.regressor_.fit(X, y, **fit_params) self._is_fitted = True return self # Resolve 'auto' if self.method == "auto": self.method_ = self._select_auto(y) self._log(f"Auto-selected method: '{self.method_}'") else: self.method_ = self.method # Initialize transform state self.lambda_ = None self.qt_ = None self.y_train_sorted_ = None # Apply forward transform y_transformed = self._forward(y, fit=True) # Fit the regressor on transformed targets self.regressor_ = clone(self.regressor) self.regressor_.fit(X, y_transformed, **fit_params) self._is_fitted = True return self
# ------------------------------------------------------------------ # Predict # ------------------------------------------------------------------
[docs] def predict(self, X) -> np.ndarray: """Predict target values, inverse-transforming the regressor's output. Parameters ---------- X : array-like of shape (n_samples, n_features) Test features. Returns ------- ndarray of shape (n_samples,) Predicted target values in the original scale. """ self._check_is_fitted() X = self._to_numpy(X) y_pred_transformed = self.regressor_.predict(X) return self._inverse(y_pred_transformed)
[docs] def predict_proba(self, X) -> np.ndarray: """Pass through to the wrapped regressor's predict_proba, if available. Some regressors (e.g. NGBoost) support probabilistic predictions. This method delegates directly without inverse-transforming, as the semantics are regressor-specific. Parameters ---------- X : array-like of shape (n_samples, n_features) Test features. Returns ------- ndarray Whatever the wrapped regressor returns from predict_proba. Raises ------ AttributeError If the wrapped regressor does not support predict_proba. """ self._check_is_fitted() X = self._to_numpy(X) if not hasattr(self.regressor_, "predict_proba"): raise AttributeError( f"The wrapped regressor {type(self.regressor_).__name__} " f"does not support predict_proba." ) return self.regressor_.predict_proba(X)
# ------------------------------------------------------------------ # Properties # ------------------------------------------------------------------ @property def feature_importances_(self) -> np.ndarray: """Feature importances from the wrapped regressor. Returns ------- ndarray of shape (n_features,) Feature importances. Raises ------ AttributeError If the wrapped regressor does not expose feature_importances_. """ self._check_is_fitted() if hasattr(self.regressor_, "feature_importances_"): return self.regressor_.feature_importances_ raise AttributeError( f"The wrapped regressor {type(self.regressor_).__name__} " f"does not expose feature_importances_." ) # ------------------------------------------------------------------ # Auto selection # ------------------------------------------------------------------ def _select_auto(self, y: np.ndarray) -> str: """Select the best transform automatically based on normality. Strategy: 1. Compute Shapiro-Wilk statistic on raw y. 2. If y is already normal (p > 0.05), use 'none'. 3. Otherwise, try Box-Cox (if applicable) and Yeo-Johnson. 4. Return whichever produces the highest Shapiro-Wilk p-value. Parameters ---------- y : ndarray Target array. Returns ------- str Selected method name. """ from scipy import stats # Shapiro-Wilk has a sample size limit; subsample if needed y_test = y if len(y) > 5000: rng = np.random.RandomState(self.random_state) idx = rng.choice(len(y), 5000, replace=False) y_test = y[idx] _, p_raw = stats.shapiro(y_test) self._log(f"Raw target Shapiro-Wilk p={p_raw:.4f}") if p_raw > 0.05: return "none" candidates: dict[str, float] = {} # Try Yeo-Johnson (always applicable) try: y_yj, lam_yj = stats.yeojohnson(y) y_yj_test = y_yj if len(y) <= 5000 else y_yj[idx] _, p_yj = stats.shapiro(y_yj_test) candidates["yeo_johnson"] = p_yj self._log(f"Yeo-Johnson Shapiro-Wilk p={p_yj:.4f} (lambda={lam_yj:.4f})") except Exception: pass # Try Box-Cox (requires strictly positive) if np.all(y > 0): try: y_bc, lam_bc = stats.boxcox(y) y_bc_test = y_bc if len(y) <= 5000 else y_bc[idx] _, p_bc = stats.shapiro(y_bc_test) candidates["box_cox"] = p_bc self._log(f"Box-Cox Shapiro-Wilk p={p_bc:.4f} (lambda={lam_bc:.4f})") except Exception: pass if not candidates: return "yeo_johnson" best = max(candidates, key=candidates.get) return best # ------------------------------------------------------------------ # Forward / inverse transforms # ------------------------------------------------------------------ def _forward(self, y: np.ndarray, fit: bool = False) -> np.ndarray: """Apply the forward transform to target values. Parameters ---------- y : ndarray Target values. fit : bool Whether this is the fitting step (fit transform parameters). Returns ------- ndarray Transformed target values. """ method = self.method_ if method == "none": return y.copy() elif method == "log": _check_positive(y, method) return np.log(y) elif method == "log1p": _check_nonneg(y, method) return np.log1p(y) elif method == "sqrt": _check_nonneg(y, method) return np.sqrt(y) elif method == "box_cox": from scipy import stats _check_positive(y, method) if fit: y_t, self.lambda_ = stats.boxcox(y) return y_t else: return stats.boxcox(y, lmbda=self.lambda_) elif method == "yeo_johnson": from scipy import stats if fit: y_t, self.lambda_ = stats.yeojohnson(y) return y_t else: return stats.yeojohnson(y, lmbda=self.lambda_) elif method == "quantile": if fit: self.qt_ = _SklearnQuantileTransformer( output_distribution="normal", random_state=self.random_state, ) return self.qt_.fit_transform(y.reshape(-1, 1)).ravel() else: return self.qt_.transform(y.reshape(-1, 1)).ravel() elif method == "rank": if fit: self.y_train_sorted_ = np.sort(y) n = len(y) # Rank transform: map to [0, 1] via ranks then to normal quantiles ranks = np.searchsorted(self.y_train_sorted_, y, side="right") # Clip to valid quantile range quantiles = np.clip(ranks / len(self.y_train_sorted_), 1e-6, 1 - 1e-6) from scipy import stats return stats.norm.ppf(quantiles) else: raise ValueError(f"Unknown method '{method}'.") def _inverse(self, y: np.ndarray) -> np.ndarray: """Apply the inverse transform to predicted values. Parameters ---------- y : ndarray Transformed predicted values. Returns ------- ndarray Predictions in the original target scale. """ method = self.method_ if method == "none": return y.copy() elif method == "log": return np.exp(y) elif method == "log1p": return np.expm1(y) elif method == "sqrt": # Clip to avoid negative values from numeric noise return np.square(np.clip(y, 0.0, None)) elif method == "box_cox": from scipy.special import inv_boxcox return inv_boxcox(y, self.lambda_) elif method == "yeo_johnson": from scipy.special import inv_boxcox # Yeo-Johnson inverse is not provided by scipy directly. # Implement manually following the Yeo-Johnson definition. return _inv_yeojohnson(y, self.lambda_) elif method == "quantile": return self.qt_.inverse_transform(y.reshape(-1, 1)).ravel() elif method == "rank": from scipy import stats # Map normal quantiles back to [0, 1], then interpolate quantiles = stats.norm.cdf(y) # Interpolate back to original scale using stored sorted targets n = len(self.y_train_sorted_) indices = quantiles * (n - 1) idx_low = np.clip(np.floor(indices).astype(int), 0, n - 1) idx_high = np.clip(np.ceil(indices).astype(int), 0, n - 1) frac = indices - idx_low return ( self.y_train_sorted_[idx_low] * (1 - frac) + self.y_train_sorted_[idx_high] * frac ) else: raise ValueError(f"Unknown method '{method}'.")
def _inv_yeojohnson(y: np.ndarray, lam: float) -> np.ndarray: """Inverse Yeo-Johnson transform. The Yeo-Johnson transform is defined piecewise: For x >= 0: if lam != 0: y = ((x + 1)^lam - 1) / lam if lam == 0: y = log(x + 1) For x < 0: if lam != 2: y = -((-x + 1)^(2 - lam) - 1) / (2 - lam) if lam == 2: y = -log(-x + 1) This function inverts those definitions. Parameters ---------- y : ndarray Transformed values. lam : float Lambda parameter. Returns ------- ndarray Original-scale values. """ x = np.zeros_like(y, dtype=np.float64) pos = y >= 0 neg = ~pos # Invert positive branch if np.any(pos): if np.abs(lam) < 1e-12: # lam ~ 0: y = log(x + 1) => x = exp(y) - 1 x[pos] = np.exp(y[pos]) - 1 else: # y = ((x+1)^lam - 1) / lam => x = (y*lam + 1)^(1/lam) - 1 x[pos] = np.power(y[pos] * lam + 1, 1.0 / lam) - 1 # Invert negative branch if np.any(neg): if np.abs(lam - 2) < 1e-12: # lam ~ 2: y = -log(-x + 1) => x = 1 - exp(-y) x[neg] = 1 - np.exp(-y[neg]) else: # y = -((-x+1)^(2-lam) - 1) / (2-lam) # => -x + 1 = (-y*(2-lam) + 1)^(1/(2-lam)) # => x = 1 - (-y*(2-lam) + 1)^(1/(2-lam)) x[neg] = 1 - np.power(-y[neg] * (2 - lam) + 1, 1.0 / (2 - lam)) return x
[docs] class TargetQuantileTransformer(EndgameEstimator, RegressorMixin): """Convenience wrapper applying QuantileTransformer to the target. This is a specialized shortcut for ``TargetTransformer(method='quantile')``. It wraps a regressor and normalizes the target via sklearn's QuantileTransformer before fitting. Parameters ---------- regressor : estimator Any sklearn-compatible regressor. n_quantiles : int, default=1000 Number of quantiles for the QuantileTransformer. output_distribution : str, default='normal' Output distribution: 'normal' or 'uniform'. subsample : int, default=100000 Subsample size for quantile estimation. random_state : int, optional Random seed for reproducibility. verbose : bool, default=False Enable verbose output. Attributes ---------- regressor_ : estimator The fitted regressor. qt_ : QuantileTransformer The fitted target QuantileTransformer. feature_importances_ : ndarray Delegated from the wrapped regressor, if available. Examples -------- >>> from sklearn.linear_model import Ridge >>> from endgame.preprocessing.target_transform import TargetQuantileTransformer >>> model = TargetQuantileTransformer( ... regressor=Ridge(), ... n_quantiles=500, ... output_distribution='normal', ... ) >>> model.fit(X_train, y_train) >>> preds = model.predict(X_test) """ _estimator_type = "regressor" def __init__( self, regressor: Any = None, n_quantiles: int = 1000, output_distribution: str = "normal", subsample: int = 100_000, random_state: int | None = None, verbose: bool = False, ): super().__init__(random_state=random_state, verbose=verbose) if regressor is None: raise TypeError( "TargetQuantileTransformer requires a regressor. Pass a " "sklearn-compatible regressor via the 'regressor' parameter." ) self.regressor = regressor self.n_quantiles = n_quantiles self.output_distribution = output_distribution self.subsample = subsample
[docs] def fit(self, X, y, **fit_params) -> TargetQuantileTransformer: """Fit the wrapped regressor on quantile-transformed targets. Parameters ---------- X : array-like of shape (n_samples, n_features) Training features. y : array-like of shape (n_samples,) Training targets. **fit_params : dict Additional parameters forwarded to the regressor. Returns ------- self """ X, y = self._validate_data(X, y, reset=True) y = y.astype(np.float64) self.qt_ = _SklearnQuantileTransformer( n_quantiles=min(self.n_quantiles, len(y)), output_distribution=self.output_distribution, subsample=self.subsample, random_state=self.random_state, ) y_transformed = self.qt_.fit_transform(y.reshape(-1, 1)).ravel() self.regressor_ = clone(self.regressor) self.regressor_.fit(X, y_transformed, **fit_params) self._is_fitted = True return self
[docs] def predict(self, X) -> np.ndarray: """Predict target values, inverse-transforming the output. Parameters ---------- X : array-like of shape (n_samples, n_features) Test features. Returns ------- ndarray of shape (n_samples,) Predicted target values in the original scale. """ self._check_is_fitted() X = self._to_numpy(X) y_pred_transformed = self.regressor_.predict(X) return self.qt_.inverse_transform( y_pred_transformed.reshape(-1, 1) ).ravel()
@property def feature_importances_(self) -> np.ndarray: """Feature importances from the wrapped regressor.""" self._check_is_fitted() if hasattr(self.regressor_, "feature_importances_"): return self.regressor_.feature_importances_ raise AttributeError( f"The wrapped regressor {type(self.regressor_).__name__} " f"does not expose feature_importances_." )