Source code for endgame.preprocessing.target_transform
from __future__ import annotations
"""Target transformation wrappers for regression.
Applies invertible transformations to the target variable during training,
then inverse-transforms predictions at inference time. This can improve
regression performance when the target distribution is skewed, heavy-tailed,
or otherwise non-normal.
Similar to sklearn's TransformedTargetRegressor but with more flexible
transform selection including automatic normality-based selection.
Supported transforms:
- log, log1p, sqrt: Simple monotonic transforms
- box_cox, yeo_johnson: Power transforms (scipy)
- quantile: QuantileTransformer-based normalization
- rank: Rank-based (ordinal) normalization
- auto: Automatically selects the best transform via Shapiro-Wilk normality test
- none: No transformation (passthrough)
"""
from typing import Any
import numpy as np
from sklearn.base import RegressorMixin, clone
from sklearn.preprocessing import QuantileTransformer as _SklearnQuantileTransformer
from endgame.core.base import EndgameEstimator
# Transforms that are only defined for strictly positive targets (y > 0).
_POSITIVE_METHODS = {"box_cox", "log"}

# Transforms that are only defined for non-negative targets (y >= 0).
_NONNEG_METHODS = {"log1p", "sqrt"}

# Every string accepted by the ``method`` parameter of TargetTransformer.
_VALID_METHODS = {
    "none",
    "auto",
    "log",
    "log1p",
    "sqrt",
    "box_cox",
    "yeo_johnson",
    "quantile",
    "rank",
}
def _check_positive(y: np.ndarray, method: str) -> None:
    """Ensure every target value is strictly positive.

    Parameters
    ----------
    y : ndarray
        Target values to inspect.
    method : str
        Name of the transform being applied; only used in the error message.

    Raises
    ------
    ValueError
        If at least one element of ``y`` is zero or negative.
    """
    # Guard clause: nothing to report when no element is <= 0.
    if not np.any(y <= 0):
        return

    message = (
        f"Target contains non-positive values. Method '{method}' requires "
        "all positive targets. Consider 'log1p', 'yeo_johnson', 'quantile', "
        "or 'auto' instead."
    )
    raise ValueError(message)
def _check_nonneg(y: np.ndarray, method: str) -> None:
    """Ensure no target value is negative (zeros are allowed).

    Parameters
    ----------
    y : ndarray
        Target values to inspect.
    method : str
        Name of the transform being applied; only used in the error message.

    Raises
    ------
    ValueError
        If at least one element of ``y`` is below zero.
    """
    # Guard clause: nothing to report when no element is < 0.
    if not np.any(y < 0):
        return

    message = (
        f"Target contains negative values. Method '{method}' requires "
        "all non-negative targets. Consider 'yeo_johnson', 'quantile', "
        "or 'auto' instead."
    )
    raise ValueError(message)
[docs]
class TargetTransformer(EndgameEstimator, RegressorMixin):
    """Wrapper that applies target transformations for regression.

    Transforms the target variable y during ``fit``, trains the wrapped
    regressor on the transformed targets, and inverse-transforms predictions
    at inference time.

    Parameters
    ----------
    regressor : estimator
        Any sklearn-compatible regressor. This is required.
    method : str, default='auto'
        Transformation method. One of:

        - ``'auto'``: Test normality via Shapiro-Wilk; if the raw target is
          not rejected as normal (p > 0.05) use ``'none'``, otherwise try
          Box-Cox (strictly positive targets only) and Yeo-Johnson and pick
          whichever produces the most normal transformed y. Falls back to
          ``'yeo_johnson'`` when no candidate could be evaluated.
        - ``'log'``: Natural log. Requires strictly positive targets.
        - ``'log1p'``: ``log(1 + y)``. Requires non-negative targets.
        - ``'sqrt'``: Square root. Requires non-negative targets.
        - ``'box_cox'``: Box-Cox power transform (scipy). Requires
          strictly positive targets.
        - ``'yeo_johnson'``: Yeo-Johnson power transform (scipy). Works
          with any real-valued targets.
        - ``'quantile'``: Sklearn QuantileTransformer mapping to normal.
        - ``'rank'``: Rank-based (ordinal) normalization.
        - ``'none'``: No transformation (passthrough).
    random_state : int, optional
        Random seed for reproducibility (used by the quantile transform and
        by the subsampling done for the normality test).
    verbose : bool, default=False
        Enable verbose output.

    Attributes
    ----------
    regressor_ : estimator
        The fitted regressor (clone of ``regressor``).
    method_ : str
        The method actually used (relevant when ``method='auto'`` or when
        the target is constant, in which case ``'none'`` is used).
    lambda_ : float or None
        The fitted lambda parameter for Box-Cox / Yeo-Johnson transforms.
    qt_ : QuantileTransformer or None
        Fitted QuantileTransformer instance (for ``method='quantile'``).
    y_train_sorted_ : ndarray or None
        Sorted training targets for the rank transform and its inverse.
    feature_importances_ : ndarray
        Delegated from the wrapped regressor, if available.

    Examples
    --------
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> from endgame.preprocessing import TargetTransformer
    >>> model = TargetTransformer(
    ...     regressor=RandomForestRegressor(n_estimators=100, random_state=42),
    ...     method='auto',
    ... )
    >>> model.fit(X_train, y_train)
    >>> preds = model.predict(X_test)
    """

    _estimator_type = "regressor"

    # The normality test is run on at most this many points; larger targets
    # are subsampled before calling scipy's Shapiro-Wilk test.
    _SHAPIRO_MAX_SAMPLES = 5000
    # Shapiro-Wilk is undefined for fewer than three observations.
    _SHAPIRO_MIN_SAMPLES = 3
    # Upper bound on the number of quantiles used by the 'quantile' method
    # (same value as sklearn's QuantileTransformer default).
    _MAX_N_QUANTILES = 1000

    def __init__(
        self,
        regressor: Any = None,
        method: str = "auto",
        random_state: int | None = None,
        verbose: bool = False,
    ):
        super().__init__(random_state=random_state, verbose=verbose)
        if regressor is None:
            raise TypeError(
                "TargetTransformer requires a regressor. Pass a sklearn-compatible "
                "regressor via the 'regressor' parameter."
            )
        self.regressor = regressor
        self.method = method

    # ------------------------------------------------------------------
    # Fit
    # ------------------------------------------------------------------
    def fit(self, X, y, **fit_params) -> TargetTransformer:
        """Fit the wrapped regressor on transformed targets.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Training targets.
        **fit_params : dict
            Additional parameters forwarded to the wrapped regressor's
            ``fit`` method (e.g. ``sample_weight``).

        Returns
        -------
        self
            Fitted TargetTransformer.

        Raises
        ------
        ValueError
            If ``method`` is not a recognised name, or if the chosen method
            cannot be applied to the sign of the targets.
        """
        X, y = self._validate_data(X, y, reset=True)
        y = y.astype(np.float64)

        if self.method not in _VALID_METHODS:
            raise ValueError(
                f"Unknown method '{self.method}'. Must be one of {sorted(_VALID_METHODS)}."
            )

        # Reset all transform state up front so a refit never reuses
        # parameters learned by a previous call.
        self.lambda_ = None
        self.qt_ = None
        self.y_train_sorted_ = None

        if y.size > 0 and np.all(y == y[0]):
            # A constant target cannot be normalised by any transform.
            self._log("Target is constant; using 'none' transform.", level="warn")
            self.method_ = "none"
        elif self.method == "auto":
            self.method_ = self._select_auto(y)
            self._log(f"Auto-selected method: '{self.method_}'")
        else:
            self.method_ = self.method

        y_transformed = self._forward(y, fit=True)

        self.regressor_ = clone(self.regressor)
        self.regressor_.fit(X, y_transformed, **fit_params)
        self._is_fitted = True
        return self

    # ------------------------------------------------------------------
    # Predict
    # ------------------------------------------------------------------
    def predict(self, X) -> np.ndarray:
        """Predict target values, inverse-transforming the regressor's output.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test features.

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted target values in the original scale.
        """
        self._check_is_fitted()
        X = self._to_numpy(X)
        # Coerce to ndarray so the inverse transforms can rely on array
        # methods even if the regressor returns a list-like.
        y_pred_transformed = np.asarray(self.regressor_.predict(X))
        return self._inverse(y_pred_transformed)

    def predict_proba(self, X) -> np.ndarray:
        """Pass through to the wrapped regressor's predict_proba, if available.

        Some regressors (e.g. NGBoost) support probabilistic predictions.
        This method delegates directly without inverse-transforming, as the
        semantics are regressor-specific.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test features.

        Returns
        -------
        ndarray
            Whatever the wrapped regressor returns from predict_proba.

        Raises
        ------
        AttributeError
            If the wrapped regressor does not support predict_proba.
        """
        self._check_is_fitted()
        X = self._to_numpy(X)
        if not hasattr(self.regressor_, "predict_proba"):
            raise AttributeError(
                f"The wrapped regressor {type(self.regressor_).__name__} "
                f"does not support predict_proba."
            )
        return self.regressor_.predict_proba(X)

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------
    @property
    def feature_importances_(self) -> np.ndarray:
        """Feature importances from the wrapped regressor.

        Returns
        -------
        ndarray of shape (n_features,)
            Feature importances.

        Raises
        ------
        AttributeError
            If the wrapped regressor does not expose feature_importances_.
        """
        self._check_is_fitted()
        if hasattr(self.regressor_, "feature_importances_"):
            return self.regressor_.feature_importances_
        raise AttributeError(
            f"The wrapped regressor {type(self.regressor_).__name__} "
            f"does not expose feature_importances_."
        )

    # ------------------------------------------------------------------
    # Auto selection
    # ------------------------------------------------------------------
    def _select_auto(self, y: np.ndarray) -> str:
        """Select the best transform automatically based on normality.

        Strategy:

        1. Compute the Shapiro-Wilk p-value on raw y (subsampled when y has
           more than ``_SHAPIRO_MAX_SAMPLES`` elements).
        2. If normality is not rejected (p > 0.05), use ``'none'``.
        3. Otherwise, try Yeo-Johnson and, for strictly positive y, Box-Cox.
        4. Return whichever produces the highest Shapiro-Wilk p-value
           (Yeo-Johnson wins ties).

        Parameters
        ----------
        y : ndarray
            Target array.

        Returns
        -------
        str
            Selected method name: ``'none'``, ``'yeo_johnson'`` or
            ``'box_cox'``.
        """
        from scipy import stats

        n_samples = len(y)
        if n_samples < self._SHAPIRO_MIN_SAMPLES:
            # The normality test cannot be evaluated, so there is no
            # evidence in favour of any transform.
            self._log(
                "Too few samples for Shapiro-Wilk; using 'none' transform.",
                level="warn",
            )
            return "none"

        # Draw one subsample index set and reuse it for every candidate so
        # that all p-values are computed on the same observations.
        idx = None
        if n_samples > self._SHAPIRO_MAX_SAMPLES:
            rng = np.random.RandomState(self.random_state)
            idx = rng.choice(n_samples, self._SHAPIRO_MAX_SAMPLES, replace=False)

        def _subsample(values: np.ndarray) -> np.ndarray:
            return values if idx is None else values[idx]

        _, p_raw = stats.shapiro(_subsample(y))
        self._log(f"Raw target Shapiro-Wilk p={p_raw:.4f}")
        if p_raw > 0.05:
            return "none"

        # (method name, log label, scipy fitting function). Yeo-Johnson is
        # listed first so it is preferred when p-values tie.
        trials = [("yeo_johnson", "Yeo-Johnson", stats.yeojohnson)]
        if np.all(y > 0):
            trials.append(("box_cox", "Box-Cox", stats.boxcox))

        candidates: dict[str, float] = {}
        for name, label, fit_transform in trials:
            try:
                y_t, lam = fit_transform(y)
                _, p_value = stats.shapiro(_subsample(y_t))
            except Exception as exc:
                # Best effort: a failing candidate is skipped, but the
                # reason is reported instead of being silently dropped.
                self._log(f"{label} candidate skipped: {exc}", level="warn")
                continue
            candidates[name] = p_value
            self._log(f"{label} Shapiro-Wilk p={p_value:.4f} (lambda={lam:.4f})")

        if not candidates:
            return "yeo_johnson"
        return max(candidates, key=candidates.get)

    # ------------------------------------------------------------------
    # Forward / inverse transforms
    # ------------------------------------------------------------------
    def _forward(self, y: np.ndarray, fit: bool = False) -> np.ndarray:
        """Apply the forward transform to target values.

        Parameters
        ----------
        y : ndarray
            Target values.
        fit : bool
            Whether this is the fitting step (fit transform parameters).

        Returns
        -------
        ndarray
            Transformed target values.

        Raises
        ------
        ValueError
            If ``method_`` is unknown or the targets violate the sign
            requirement of the selected method.
        """
        method = self.method_

        if method == "none":
            return y.copy()

        if method == "log":
            _check_positive(y, method)
            return np.log(y)

        if method == "log1p":
            _check_nonneg(y, method)
            return np.log1p(y)

        if method == "sqrt":
            _check_nonneg(y, method)
            return np.sqrt(y)

        if method == "box_cox":
            from scipy import stats

            _check_positive(y, method)
            if fit:
                y_t, self.lambda_ = stats.boxcox(y)
                return y_t
            return stats.boxcox(y, lmbda=self.lambda_)

        if method == "yeo_johnson":
            from scipy import stats

            if fit:
                y_t, self.lambda_ = stats.yeojohnson(y)
                return y_t
            return stats.yeojohnson(y, lmbda=self.lambda_)

        if method == "quantile":
            column = y.reshape(-1, 1)
            if fit:
                self.qt_ = _SklearnQuantileTransformer(
                    # Never request more quantiles than there are samples;
                    # sklearn would otherwise warn and clamp the value.
                    n_quantiles=max(1, min(self._MAX_N_QUANTILES, len(y))),
                    output_distribution="normal",
                    random_state=self.random_state,
                )
                return self.qt_.fit_transform(column).ravel()
            return self.qt_.transform(column).ravel()

        if method == "rank":
            from scipy import stats

            if fit:
                self.y_train_sorted_ = np.sort(y)
            n_train = len(self.y_train_sorted_)
            # Number of training targets <= each value (ties share the
            # highest rank of their group).
            ranks = np.searchsorted(self.y_train_sorted_, y, side="right")
            # Mid-rank plotting positions (rank - 0.5) / n keep every
            # training value strictly inside (0, 1) and are exactly what
            # ``_inverse`` assumes, so the round trip reproduces y.
            half_step = 0.5 / n_train
            quantiles = np.clip((ranks - 0.5) / n_train, half_step, 1.0 - half_step)
            return stats.norm.ppf(quantiles)

        raise ValueError(f"Unknown method '{method}'.")

    def _inverse(self, y: np.ndarray) -> np.ndarray:
        """Apply the inverse transform to predicted values.

        Parameters
        ----------
        y : ndarray
            Transformed predicted values.

        Returns
        -------
        ndarray
            Predictions in the original target scale.

        Raises
        ------
        ValueError
            If ``method_`` is unknown.
        """
        method = self.method_

        if method == "none":
            return y.copy()

        if method == "log":
            return np.exp(y)

        if method == "log1p":
            return np.expm1(y)

        if method == "sqrt":
            # Clip to avoid negative values from numeric noise
            return np.square(np.clip(y, 0.0, None))

        if method == "box_cox":
            from scipy.special import inv_boxcox

            return inv_boxcox(y, self.lambda_)

        if method == "yeo_johnson":
            # The inverse is implemented by the module-level helper.
            return _inv_yeojohnson(y, self.lambda_)

        if method == "quantile":
            return self.qt_.inverse_transform(y.reshape(-1, 1)).ravel()

        if method == "rank":
            from scipy import stats

            # Map normal scores back to (0, 1), then interpolate between the
            # sorted training targets, which sit at the mid-rank positions
            # (i + 0.5) / n used by ``_forward``. np.interp clamps values
            # outside that range to the smallest / largest training target.
            quantiles = stats.norm.cdf(y)
            n_train = len(self.y_train_sorted_)
            positions = (np.arange(n_train) + 0.5) / n_train
            return np.interp(quantiles, positions, self.y_train_sorted_)

        raise ValueError(f"Unknown method '{method}'.")
def _inv_yeojohnson(y: np.ndarray, lam: float) -> np.ndarray:
    """Inverse Yeo-Johnson transform.

    The Yeo-Johnson transform is defined piecewise:

    For x >= 0:
        if lam != 0: y = ((x + 1)^lam - 1) / lam
        if lam == 0: y = log(x + 1)
    For x < 0:
        if lam != 2: y = -((-x + 1)^(2 - lam) - 1) / (2 - lam)
        if lam == 2: y = -log(-x + 1)

    The transform preserves sign (y >= 0 exactly when x >= 0), so the sign
    of ``y`` selects which branch to invert. Each branch is evaluated with
    ``expm1`` / ``log1p`` so that values close to zero keep full precision
    instead of being lost in ``exp(y) - 1`` style cancellation.

    Parameters
    ----------
    y : array-like
        Transformed values.
    lam : float
        Lambda parameter.

    Returns
    -------
    ndarray
        Original-scale values, float64, same shape as ``y``. Entries whose
        transformed value lies outside the range reachable for ``lam``
        come out as NaN.
    """
    y = np.asarray(y, dtype=np.float64)
    x = np.zeros_like(y)
    pos = y >= 0
    neg = ~pos

    # Invert positive branch
    if np.any(pos):
        y_pos = y[pos]
        if np.abs(lam) < 1e-12:
            # lam ~ 0: y = log(x + 1)  =>  x = exp(y) - 1
            x[pos] = np.expm1(y_pos)
        else:
            # y = ((x+1)^lam - 1) / lam
            #   =>  x = (1 + lam*y)^(1/lam) - 1 = expm1(log1p(lam*y) / lam)
            x[pos] = np.expm1(np.log1p(y_pos * lam) / lam)

    # Invert negative branch
    if np.any(neg):
        y_neg = y[neg]
        mu = 2 - lam
        if np.abs(mu) < 1e-12:
            # lam ~ 2: y = -log(-x + 1)  =>  x = 1 - exp(-y) = -expm1(-y)
            x[neg] = -np.expm1(-y_neg)
        else:
            # y = -((-x+1)^mu - 1) / mu
            #   =>  -x + 1 = (1 - mu*y)^(1/mu)
            #   =>  x = -expm1(log1p(-mu*y) / mu)
            x[neg] = -np.expm1(np.log1p(-y_neg * mu) / mu)

    return x
[docs]
class TargetQuantileTransformer(EndgameEstimator, RegressorMixin):
    """Regressor wrapper that quantile-normalizes the target.

    A specialized shortcut for ``TargetTransformer(method='quantile')`` that
    also exposes the QuantileTransformer settings. The target is mapped
    through sklearn's QuantileTransformer before the wrapped regressor is
    fitted, and predictions are mapped back to the original scale.

    Parameters
    ----------
    regressor : estimator
        Any sklearn-compatible regressor.
    n_quantiles : int, default=1000
        Number of quantiles for the QuantileTransformer. Capped at the
        number of training samples during ``fit``.
    output_distribution : str, default='normal'
        Output distribution: 'normal' or 'uniform'.
    subsample : int, default=100000
        Subsample size for quantile estimation.
    random_state : int, optional
        Random seed for reproducibility.
    verbose : bool, default=False
        Enable verbose output.

    Attributes
    ----------
    regressor_ : estimator
        The fitted regressor.
    qt_ : QuantileTransformer
        The fitted target QuantileTransformer.
    feature_importances_ : ndarray
        Delegated from the wrapped regressor, if available.

    Examples
    --------
    >>> from sklearn.linear_model import Ridge
    >>> from endgame.preprocessing.target_transform import TargetQuantileTransformer
    >>> model = TargetQuantileTransformer(
    ...     regressor=Ridge(),
    ...     n_quantiles=500,
    ...     output_distribution='normal',
    ... )
    >>> model.fit(X_train, y_train)
    >>> preds = model.predict(X_test)
    """

    _estimator_type = "regressor"

    def __init__(
        self,
        regressor: Any = None,
        n_quantiles: int = 1000,
        output_distribution: str = "normal",
        subsample: int = 100_000,
        random_state: int | None = None,
        verbose: bool = False,
    ):
        super().__init__(random_state=random_state, verbose=verbose)
        if regressor is None:
            raise TypeError(
                "TargetQuantileTransformer requires a regressor. Pass a "
                "sklearn-compatible regressor via the 'regressor' parameter."
            )
        self.regressor = regressor
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution
        self.subsample = subsample

    def fit(self, X, y, **fit_params) -> TargetQuantileTransformer:
        """Fit the wrapped regressor on quantile-transformed targets.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Training targets.
        **fit_params : dict
            Additional parameters forwarded to the regressor.

        Returns
        -------
        self
        """
        X, y = self._validate_data(X, y, reset=True)
        targets = y.astype(np.float64)

        # The transformer cannot use more quantiles than there are samples.
        effective_quantiles = min(self.n_quantiles, len(targets))
        self.qt_ = _SklearnQuantileTransformer(
            n_quantiles=effective_quantiles,
            output_distribution=self.output_distribution,
            subsample=self.subsample,
            random_state=self.random_state,
        )

        # sklearn transformers expect a 2-D column; flatten again afterwards.
        target_column = targets.reshape(-1, 1)
        transformed_targets = self.qt_.fit_transform(target_column).ravel()

        self.regressor_ = clone(self.regressor)
        self.regressor_.fit(X, transformed_targets, **fit_params)
        self._is_fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        """Predict target values, inverse-transforming the output.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test features.

        Returns
        -------
        ndarray of shape (n_samples,)
            Predicted target values in the original scale.
        """
        self._check_is_fitted()
        features = self._to_numpy(X)
        raw_predictions = self.regressor_.predict(features)
        prediction_column = raw_predictions.reshape(-1, 1)
        restored = self.qt_.inverse_transform(prediction_column)
        return restored.ravel()

    @property
    def feature_importances_(self) -> np.ndarray:
        """Feature importances from the wrapped regressor.

        Raises
        ------
        AttributeError
            If the wrapped regressor does not expose feature_importances_.
        """
        self._check_is_fitted()
        if not hasattr(self.regressor_, "feature_importances_"):
            raise AttributeError(
                f"The wrapped regressor {type(self.regressor_).__name__} "
                f"does not expose feature_importances_."
            )
        return self.regressor_.feature_importances_