# Source code for endgame.models.ngboost

from __future__ import annotations

"""NGBoost wrapper for probabilistic prediction.

Wraps the Stanford NGBoost library (https://github.com/stanfordmlgroup/ngboost)
with competition-tuned defaults and additional utilities.

NGBoost uses Natural Gradient Boosting to produce full probability distributions
for predictions, enabling uncertainty quantification and probabilistic scoring.
"""

from typing import Any

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.tree import DecisionTreeRegressor

from endgame.core.base import EndgameEstimator

# Check for ngboost availability
try:
    from ngboost import NGBClassifier, NGBRegressor
    from ngboost.distns import (
        Bernoulli,
        Cauchy,
        ClassificationDistn,
        Exponential,
        Laplace,
        LogNormal,
        MultivariateNormal,
        Normal,
        Poisson,
        T,
        TFixedDf,
        k_categorical,
    )
    from ngboost.scores import MLE, CRPScore, LogScore
    HAS_NGBOOST = True
except ImportError:
    HAS_NGBOOST = False


def _check_ngboost():
    """Raise ``ImportError`` unless the optional ngboost import succeeded.

    Reads the module-level ``HAS_NGBOOST`` flag, which is set by the guarded
    import at the top of this module.

    Raises
    ------
    ImportError
        If ``HAS_NGBOOST`` is false.
    """
    if HAS_NGBOOST:
        return

    message = (
        "ngboost is required for NGBoost models. "
        "Install with: pip install ngboost"
    )
    raise ImportError(message)


# Distribution name mappings: user-facing (lower-case) name -> name of the
# ngboost distribution object.
# For regression, every key is simply the lower-cased ngboost class name, so
# the table is derived from the ordered tuple of class names.
REGRESSION_DISTRIBUTIONS = {
    class_name.lower(): class_name
    for class_name in (
        "Normal",
        "LogNormal",
        "Exponential",
        "Laplace",
        "T",
        "Cauchy",
        "Poisson",
    )
}

# Classification names do not follow the lower-casing rule ("categorical"
# maps to the ``k_categorical`` factory), so they are spelled out.
CLASSIFICATION_DISTRIBUTIONS = {"bernoulli": "Bernoulli", "categorical": "k_categorical"}

# Scoring rule mappings: user-facing name -> name of the ngboost score object.
SCORES = {"crps": "CRPScore", "mle": "MLE", "log": "LogScore"}
# "nll" is an alias for "log"; it is added last so the key order is
# crps, mle, log, nll.
SCORES["nll"] = SCORES["log"]

# Presets for different use cases.
# Each row of the table below is
#   (preset name, n_estimators, learning_rate, minibatch_frac, col_sample, tol)
# and is expanded into its own keyword dictionary; every preset enables the
# natural gradient.
NGBOOST_PRESETS = {
    preset_name: {
        "n_estimators": n_estimators,
        "learning_rate": learning_rate,
        "minibatch_frac": minibatch_frac,
        "col_sample": col_sample,
        "tol": tol,
        "natural_gradient": True,
    }
    for preset_name, n_estimators, learning_rate, minibatch_frac, col_sample, tol in (
        ("endgame", 500, 0.01, 1.0, 1.0, 1e-4),
        ("fast", 100, 0.1, 0.5, 0.8, 1e-3),
        ("accurate", 1000, 0.005, 1.0, 1.0, 1e-5),
        ("competition", 2000, 0.01, 1.0, 1.0, 1e-5),
    )
}


class NGBoostRegressor(EndgameEstimator, RegressorMixin):
    """NGBoost Regressor for probabilistic regression.

    Produces full probability distributions for predictions, enabling
    uncertainty quantification and scoring with proper scoring rules.

    Parameters
    ----------
    preset : str, default='endgame'
        Hyperparameter preset: 'endgame', 'fast', 'accurate', 'competition'.
        An unknown preset name falls back to 'endgame'.
    distribution : str, default='normal'
        Output distribution: 'normal', 'lognormal', 'exponential',
        'laplace', 't', 'cauchy', 'poisson' (case-insensitive).
    score : str, default='crps'
        Scoring rule: 'crps' (Continuous Ranked Probability Score),
        'mle'/'nll'/'log' (Maximum Likelihood / Negative Log Likelihood).
    n_estimators : int, optional
        Number of boosting iterations. Overrides preset.
    learning_rate : float, optional
        Learning rate. Overrides preset.
    minibatch_frac : float, optional
        Fraction of data to use in each iteration. Overrides preset.
    col_sample : float, optional
        Fraction of features to use in each iteration. Overrides preset.
    base_learner : estimator, optional
        Base learner for boosting. Default is
        DecisionTreeRegressor(max_depth=3, min_samples_split=10,
        min_samples_leaf=5).
    natural_gradient : bool, default=True
        Use natural gradient (recommended).
    early_stopping_rounds : int, optional
        Early stopping patience. Only forwarded to ngboost when a
        validation set is passed to ``fit``. If None, no early stopping.
    random_state : int, optional
        Random seed.
    verbose : bool, default=False
        Enable verbose output.
    **kwargs
        Additional parameters passed to NGBRegressor. These are applied
        last and therefore override everything else.

    Attributes
    ----------
    model_ : NGBRegressor
        Fitted NGBoost model.
    feature_importances_ : ndarray
        Feature importances from the base learners.

    Examples
    --------
    >>> from endgame.models import NGBoostRegressor
    >>> model = NGBoostRegressor(distribution='normal', score='crps')
    >>> model.fit(X_train, y_train)
    >>>
    >>> # Point predictions
    >>> y_pred = model.predict(X_test)
    >>>
    >>> # Full distribution predictions
    >>> y_dist = model.pred_dist(X_test)
    >>> mean = y_dist.mean()
    >>> std = y_dist.std()
    >>>
    >>> # Prediction intervals
    >>> lower, upper = model.predict_interval(X_test, alpha=0.1)  # 90% CI
    >>>
    >>> # Negative log-likelihood
    >>> nll = -y_dist.logpdf(y_test).mean()

    References
    ----------
    Duan et al., 2020. "NGBoost: Natural Gradient Boosting for Probabilistic
    Prediction." https://arxiv.org/abs/1910.03225
    """

    def __init__(
        self,
        preset: str = "endgame",
        distribution: str = "normal",
        score: str = "crps",
        n_estimators: int | None = None,
        learning_rate: float | None = None,
        minibatch_frac: float | None = None,
        col_sample: float | None = None,
        base_learner: BaseEstimator | None = None,
        natural_gradient: bool = True,
        early_stopping_rounds: int | None = None,
        random_state: int | None = None,
        verbose: bool = False,
        **kwargs,
    ):
        _check_ngboost()
        super().__init__(random_state=random_state, verbose=verbose)

        self.preset = preset
        self.distribution = distribution
        # NOTE(review): this instance attribute has the same name as the
        # ``score`` method below, so on an instance ``model.score`` resolves
        # to this string rather than to the method. Fixing that requires
        # renaming the public parameter, so it is only flagged here.
        self.score = score
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.minibatch_frac = minibatch_frac
        self.col_sample = col_sample
        self.base_learner = base_learner
        self.natural_gradient = natural_gradient
        self.early_stopping_rounds = early_stopping_rounds
        self.kwargs = kwargs

        self.model_: NGBRegressor | None = None
        self._feature_names: list[str] | None = None

    def _get_distribution(self):
        """Return the ngboost distribution class named by ``self.distribution``.

        Raises
        ------
        ValueError
            If the (lower-cased) name is not a supported distribution.
        """
        distributions = {
            "normal": Normal,
            "lognormal": LogNormal,
            "exponential": Exponential,
            "laplace": Laplace,
            "t": T,
            "cauchy": Cauchy,
            "poisson": Poisson,
        }
        try:
            return distributions[self.distribution.lower()]
        except KeyError:
            raise ValueError(
                f"Unknown distribution: {self.distribution}. "
                f"Choose from: {list(REGRESSION_DISTRIBUTIONS.keys())}"
            ) from None

    def _get_score(self):
        """Return the ngboost scoring-rule class named by ``self.score``.

        Raises
        ------
        ValueError
            If the (lower-cased) name is not a supported scoring rule.
        """
        score_name = self.score.lower()
        if score_name in ("crps", "crps_score"):
            return CRPScore
        if score_name in ("mle", "nll", "log", "logscore"):
            return LogScore
        raise ValueError(
            f"Unknown score: {self.score}. "
            f"Choose from: {list(SCORES.keys())}"
        )

    def _get_params(self) -> dict[str, Any]:
        """Build the keyword arguments for ``NGBRegressor``.

        Precedence, lowest to highest: preset values, explicit constructor
        arguments that are not None, then ``self.kwargs``.
        """
        # Start with a copy of the preset so the shared table is never mutated.
        params = NGBOOST_PRESETS.get(self.preset, NGBOOST_PRESETS["endgame"]).copy()

        # Apply explicit overrides; None means "keep the preset value".
        for name in ("n_estimators", "learning_rate", "minibatch_frac", "col_sample"):
            override = getattr(self, name)
            if override is not None:
                params[name] = override
        params["natural_gradient"] = self.natural_gradient

        # Distribution and scoring rule.
        params["Dist"] = self._get_distribution()
        params["Score"] = self._get_score()

        # Base learner: user supplied, or a shallow regularised tree.
        if self.base_learner is not None:
            params["Base"] = self.base_learner
        else:
            params["Base"] = DecisionTreeRegressor(
                max_depth=3,
                min_samples_split=10,
                min_samples_leaf=5,
            )

        if self.random_state is not None:
            params["random_state"] = self.random_state
        params["verbose"] = self.verbose

        # Free-form kwargs are applied last and win over everything above.
        params.update(self.kwargs)
        return params

    def fit(
        self,
        X,
        y,
        X_val: np.ndarray | None = None,
        y_val: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
        val_sample_weight: np.ndarray | None = None,
    ) -> NGBoostRegressor:
        """Fit the NGBoost regressor.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target values.
        X_val : array-like, optional
            Validation features for early stopping.
        y_val : array-like, optional
            Validation targets for early stopping.
        sample_weight : array-like, optional
            Training sample weights.
        val_sample_weight : array-like, optional
            Validation sample weights.

        Returns
        -------
        self

        Notes
        -----
        The validation set, ``val_sample_weight`` and
        ``early_stopping_rounds`` are only forwarded when both ``X_val`` and
        ``y_val`` are given.
        """
        X_arr = self._to_numpy(X)
        y_arr = np.asarray(y).ravel()

        # Store feature names
        self._feature_names = self._get_feature_names(X, X_arr.shape[1])

        # Create model from the merged parameters
        self.model_ = NGBRegressor(**self._get_params())

        # Prepare fit arguments
        fit_kwargs = {}
        if sample_weight is not None:
            fit_kwargs["sample_weight"] = sample_weight

        # Early stopping
        if X_val is not None and y_val is not None:
            fit_kwargs["X_val"] = self._to_numpy(X_val)
            fit_kwargs["Y_val"] = np.asarray(y_val).ravel()
            if val_sample_weight is not None:
                fit_kwargs["val_sample_weight"] = val_sample_weight
            if self.early_stopping_rounds is not None:
                fit_kwargs["early_stopping_rounds"] = self.early_stopping_rounds

        self._log(f"Training NGBoost regressor with {len(X_arr)} samples...")
        self.model_.fit(X_arr, y_arr, **fit_kwargs)

        self._is_fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        """Predict the mean of the distribution.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted means.
        """
        self._check_is_fitted()
        X_arr = self._to_numpy(X)
        return self.model_.predict(X_arr)

    def pred_dist(self, X):
        """Predict the full distribution.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        dist : ngboost distribution
            Predicted distributions with methods:
            - mean(): Expected value
            - std(): Standard deviation
            - var(): Variance
            - logpdf(y): Log probability density
            - pdf(y): Probability density
            - cdf(y): Cumulative distribution function
            - ppf(q): Percent point function (inverse CDF)
            - sample(n): Draw n samples
        """
        self._check_is_fitted()
        X_arr = self._to_numpy(X)
        return self.model_.pred_dist(X_arr)

    def predict_interval(
        self,
        X,
        alpha: float = 0.1,
    ) -> tuple[np.ndarray, np.ndarray]:
        """Predict prediction intervals.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.
        alpha : float, default=0.1
            Significance level. Returns (1-alpha) prediction interval.
            E.g., alpha=0.1 returns 90% prediction interval.

        Returns
        -------
        lower : ndarray of shape (n_samples,)
            Lower bound of prediction interval.
        upper : ndarray of shape (n_samples,)
            Upper bound of prediction interval.

        Raises
        ------
        ValueError
            If ``alpha`` is not within [0, 1] (this includes NaN).
        """
        # Out-of-range quantile levels would otherwise yield NaN bounds.
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f"alpha must be in [0, 1], got {alpha!r}.")

        dist = self.pred_dist(X)
        lower = dist.ppf(alpha / 2)
        upper = dist.ppf(1 - alpha / 2)
        return lower, upper

    def predict_std(self, X) -> np.ndarray:
        """Predict the standard deviation (uncertainty).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        std : ndarray of shape (n_samples,)
            Predicted standard deviations.
        """
        return self.pred_dist(X).std()

    def score(self, X, y, sample_weight=None) -> float:
        """Return the mean log-likelihood of ``y`` under the predictions.

        This is the negated mean NLL, so higher is better (sklearn
        convention).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.
        y : array-like of shape (n_samples,)
            True target values.
        sample_weight : array-like of shape (n_samples,), optional
            Sample weights. When given, the weighted average of the
            per-sample log-likelihoods is returned.

        Returns
        -------
        score : float
            (Weighted) mean log-likelihood (higher is better).
        """
        self._check_is_fitted()
        dist = self.pred_dist(X)
        y_arr = np.asarray(y).ravel()

        log_likelihood = dist.logpdf(y_arr)
        if sample_weight is None:
            return log_likelihood.mean()
        return np.average(log_likelihood, weights=np.asarray(sample_weight).ravel())

    @property
    def feature_importances_(self) -> np.ndarray:
        """Feature importances reported by the fitted ngboost model."""
        self._check_is_fitted()
        return self.model_.feature_importances_

    def _get_feature_names(self, X, n_features: int) -> list[str]:
        """Extract feature names from input.

        Column names are taken from pandas or polars frames when those
        libraries are installed; any other input gets ``f0 .. f{n-1}``.
        """
        try:
            import pandas as pd
            if isinstance(X, pd.DataFrame):
                return list(X.columns)
        except ImportError:
            # pandas is optional; fall through to the next option.
            pass

        try:
            import polars as pl
            if isinstance(X, (pl.DataFrame, pl.LazyFrame)):
                if isinstance(X, pl.LazyFrame):
                    X = X.collect()
                return list(X.columns)
        except ImportError:
            # polars is optional; fall through to generated names.
            pass

        return [f"f{i}" for i in range(n_features)]
class NGBoostClassifier(ClassifierMixin, EndgameEstimator):
    """NGBoost Classifier for probabilistic classification.

    Produces calibrated probability distributions over classes, with proper
    uncertainty quantification.

    Parameters
    ----------
    preset : str, default='endgame'
        Hyperparameter preset: 'endgame', 'fast', 'accurate', 'competition'.
        An unknown preset name falls back to 'endgame'.
    n_estimators : int, optional
        Number of boosting iterations. Overrides preset.
    learning_rate : float, optional
        Learning rate. Overrides preset.
    minibatch_frac : float, optional
        Fraction of data to use in each iteration. Overrides preset.
    col_sample : float, optional
        Fraction of features to use in each iteration. Overrides preset.
    base_learner : estimator, optional
        Base learner for boosting. Default is
        DecisionTreeRegressor(max_depth=3, min_samples_split=10,
        min_samples_leaf=5).
    natural_gradient : bool, default=True
        Use natural gradient (recommended).
    early_stopping_rounds : int, optional
        Early stopping patience. Only forwarded to ngboost when a
        validation set is passed to ``fit``. If None, no early stopping.
    random_state : int, optional
        Random seed.
    verbose : bool, default=False
        Enable verbose output.
    **kwargs
        Additional parameters passed to NGBClassifier. These are applied
        last and therefore override everything else.

    Attributes
    ----------
    model_ : NGBClassifier
        Fitted NGBoost model.
    classes_ : ndarray
        Unique class labels.
    n_classes_ : int
        Number of classes.
    feature_importances_ : ndarray
        Feature importances from the base learners.

    Examples
    --------
    >>> from endgame.models import NGBoostClassifier
    >>> model = NGBoostClassifier(preset='endgame')
    >>> model.fit(X_train, y_train)
    >>>
    >>> # Class predictions
    >>> y_pred = model.predict(X_test)
    >>>
    >>> # Probability predictions
    >>> y_proba = model.predict_proba(X_test)
    >>>
    >>> # Distribution predictions
    >>> y_dist = model.pred_dist(X_test)
    >>>
    >>> # Log-loss
    >>> from sklearn.metrics import log_loss
    >>> loss = log_loss(y_test, y_proba)

    References
    ----------
    Duan et al., 2020. "NGBoost: Natural Gradient Boosting for Probabilistic
    Prediction." https://arxiv.org/abs/1910.03225
    """

    def __init__(
        self,
        preset: str = "endgame",
        n_estimators: int | None = None,
        learning_rate: float | None = None,
        minibatch_frac: float | None = None,
        col_sample: float | None = None,
        base_learner: BaseEstimator | None = None,
        natural_gradient: bool = True,
        early_stopping_rounds: int | None = None,
        random_state: int | None = None,
        verbose: bool = False,
        **kwargs,
    ):
        _check_ngboost()
        super().__init__(random_state=random_state, verbose=verbose)

        self.preset = preset
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.minibatch_frac = minibatch_frac
        self.col_sample = col_sample
        self.base_learner = base_learner
        self.natural_gradient = natural_gradient
        self.early_stopping_rounds = early_stopping_rounds
        self.kwargs = kwargs

        self.model_: NGBClassifier | None = None
        self.classes_: np.ndarray | None = None
        self.n_classes_: int | None = None
        self._feature_names: list[str] | None = None

    def _get_params(self, n_classes: int) -> dict[str, Any]:
        """Build the keyword arguments for ``NGBClassifier``.

        Precedence, lowest to highest: preset values, explicit constructor
        arguments that are not None, then ``self.kwargs``.

        Parameters
        ----------
        n_classes : int
            Number of distinct training labels; selects ``Bernoulli`` for
            two classes and ``k_categorical(n_classes)`` otherwise.
        """
        # Start with a copy of the preset so the shared table is never mutated.
        params = NGBOOST_PRESETS.get(self.preset, NGBOOST_PRESETS["endgame"]).copy()

        # Apply explicit overrides; None means "keep the preset value".
        for name in ("n_estimators", "learning_rate", "minibatch_frac", "col_sample"):
            override = getattr(self, name)
            if override is not None:
                params[name] = override
        params["natural_gradient"] = self.natural_gradient

        # Set distribution based on number of classes
        params["Dist"] = Bernoulli if n_classes == 2 else k_categorical(n_classes)

        # Score is always LogScore for classification
        params["Score"] = LogScore

        # Base learner: user supplied, or a shallow regularised tree.
        if self.base_learner is not None:
            params["Base"] = self.base_learner
        else:
            params["Base"] = DecisionTreeRegressor(
                max_depth=3,
                min_samples_split=10,
                min_samples_leaf=5,
            )

        if self.random_state is not None:
            params["random_state"] = self.random_state
        params["verbose"] = self.verbose

        # Free-form kwargs are applied last and win over everything above.
        params.update(self.kwargs)
        return params

    def fit(
        self,
        X,
        y,
        X_val: np.ndarray | None = None,
        y_val: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
        val_sample_weight: np.ndarray | None = None,
    ) -> NGBoostClassifier:
        """Fit the NGBoost classifier.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target labels.
        X_val : array-like, optional
            Validation features for early stopping.
        y_val : array-like, optional
            Validation labels for early stopping.
        sample_weight : array-like, optional
            Training sample weights.
        val_sample_weight : array-like, optional
            Validation sample weights.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the training labels had to be re-encoded and ``y_val``
            contains a label that does not occur in ``y``.

        Notes
        -----
        The validation set, ``val_sample_weight`` and
        ``early_stopping_rounds`` are only forwarded when both ``X_val`` and
        ``y_val`` are given.
        """
        X_arr = self._to_numpy(X)
        y_arr = np.asarray(y).ravel()

        # Store classes; ``y_codes`` holds each label's index into classes_.
        self.classes_, y_codes = np.unique(y_arr, return_inverse=True)
        self.n_classes_ = len(self.classes_)

        # Remap labels to contiguous 0..n-1 (required by ngboost k_categorical)
        self._label_remap = None
        if not np.array_equal(self.classes_, np.arange(self.n_classes_)):
            self._label_remap = {c: i for i, c in enumerate(self.classes_)}
            y_arr = y_codes

        # Store feature names
        self._feature_names = self._get_feature_names(X, X_arr.shape[1])

        # Create model from the merged parameters
        self.model_ = NGBClassifier(**self._get_params(self.n_classes_))

        # Prepare fit arguments
        fit_kwargs = {}
        if sample_weight is not None:
            fit_kwargs["sample_weight"] = sample_weight

        # Early stopping
        if X_val is not None and y_val is not None:
            X_val_arr = self._to_numpy(X_val)
            y_val_arr = np.asarray(y_val).ravel()
            if self._label_remap is not None:
                # An unseen label has no class index; passing it through
                # unmapped would mix raw labels with encoded indices.
                try:
                    y_val_arr = np.array([self._label_remap[v] for v in y_val_arr])
                except KeyError as err:
                    raise ValueError(
                        f"y_val contains a label not present in y: {err.args[0]!r}"
                    ) from err
            fit_kwargs["X_val"] = X_val_arr
            fit_kwargs["Y_val"] = y_val_arr
            if val_sample_weight is not None:
                fit_kwargs["val_sample_weight"] = val_sample_weight
            if self.early_stopping_rounds is not None:
                fit_kwargs["early_stopping_rounds"] = self.early_stopping_rounds

        self._log(f"Training NGBoost classifier with {len(X_arr)} samples, {self.n_classes_} classes...")
        self.model_.fit(X_arr, y_arr, **fit_kwargs)

        self._is_fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        """Predict class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels, translated back to the original labels
            when they were re-encoded during ``fit``.
        """
        self._check_is_fitted()
        X_arr = self._to_numpy(X)
        preds = self.model_.predict(X_arr)
        if self._label_remap is not None:
            # Model output is a class index; map it back to the user's label.
            preds = self.classes_[preds.astype(int)]
        return preds

    def predict_proba(self, X) -> np.ndarray:
        """Predict class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            Class probabilities.
        """
        self._check_is_fitted()
        X_arr = self._to_numpy(X)
        return self.model_.predict_proba(X_arr)

    def pred_dist(self, X):
        """Predict the full distribution over classes.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        dist : ngboost distribution
            Predicted distributions.
        """
        self._check_is_fitted()
        X_arr = self._to_numpy(X)
        return self.model_.pred_dist(X_arr)

    def score(self, X, y, sample_weight=None) -> float:
        """Return accuracy on the given data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.
        y : array-like of shape (n_samples,)
            True labels.
        sample_weight : array-like, optional
            Sample weights.

        Returns
        -------
        score : float
            Accuracy score.
        """
        from sklearn.metrics import accuracy_score

        y_pred = self.predict(X)
        return accuracy_score(y, y_pred, sample_weight=sample_weight)

    @property
    def feature_importances_(self) -> np.ndarray:
        """Feature importances reported by the fitted ngboost model."""
        self._check_is_fitted()
        return self.model_.feature_importances_

    def _get_feature_names(self, X, n_features: int) -> list[str]:
        """Extract feature names from input.

        Column names are taken from pandas or polars frames when those
        libraries are installed; any other input gets ``f0 .. f{n-1}``.
        """
        try:
            import pandas as pd
            if isinstance(X, pd.DataFrame):
                return list(X.columns)
        except ImportError:
            # pandas is optional; fall through to the next option.
            pass

        try:
            import polars as pl
            if isinstance(X, (pl.DataFrame, pl.LazyFrame)):
                if isinstance(X, pl.LazyFrame):
                    X = X.collect()
                return list(X.columns)
        except ImportError:
            # polars is optional; fall through to generated names.
            pass

        return [f"f{i}" for i in range(n_features)]
# Convenience function for survival analysis (if needed in future)
def create_ngboost_survival(
    distribution: str = "exponential",
    **kwargs
) -> NGBoostRegressor:
    """Create an NGBoost model for survival analysis.

    Parameters
    ----------
    distribution : str, default='exponential'
        Distribution for survival times: 'exponential', 'lognormal'.
    **kwargs
        Additional parameters for NGBoostRegressor. ``score`` defaults to
        'mle' but may be overridden here.

    Returns
    -------
    model : NGBoostRegressor
        Configured NGBoost model for survival analysis.

    Notes
    -----
    For survival analysis with censoring, use the ngboost library directly
    with NGBSurvival class.
    """
    # Use setdefault rather than a hard-coded keyword so that a caller who
    # passes ``score=...`` gets that score, not a duplicate-keyword TypeError.
    kwargs.setdefault("score", "mle")
    return NGBoostRegressor(distribution=distribution, **kwargs)