Source code for endgame.models.ngboost

from __future__ import annotations

"""NGBoost wrapper for probabilistic prediction.

Wraps the Stanford NGBoost library (https://github.com/stanfordmlgroup/ngboost)
with competition-tuned defaults and additional utilities.

NGBoost uses Natural Gradient Boosting to produce full probability distributions
for predictions, enabling uncertainty quantification and probabilistic scoring.
"""

from typing import Any

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.tree import DecisionTreeRegressor

from endgame.core.base import EndgameEstimator

# Check for ngboost availability
try:
    from ngboost import NGBClassifier, NGBRegressor
    from ngboost.distns import (
        Bernoulli,
        Cauchy,
        ClassificationDistn,
        Exponential,
        Laplace,
        LogNormal,
        MultivariateNormal,
        Normal,
        Poisson,
        T,
        TFixedDf,
        k_categorical,
    )
    from ngboost.scores import MLE, CRPScore, LogScore
    HAS_NGBOOST = True
except ImportError:
    HAS_NGBOOST = False


def _check_ngboost():
    """Raise ``ImportError`` when the optional ngboost package is missing.

    Relies on the module-level ``HAS_NGBOOST`` flag, which is set by the
    guarded import at the top of this module.

    Raises
    ------
    ImportError
        If ngboost could not be imported.
    """
    if HAS_NGBOOST:
        return

    message = (
        "ngboost is required for NGBoost models. "
        "Install with: pip install ngboost"
    )
    raise ImportError(message)


# Lower-case distribution aliases -> ngboost class names (regression).
# Within this module these tables are only used to list the valid choices
# in error messages.
REGRESSION_DISTRIBUTIONS = dict(
    normal="Normal",
    lognormal="LogNormal",
    exponential="Exponential",
    laplace="Laplace",
    t="T",
    cauchy="Cauchy",
    poisson="Poisson",
)

# Lower-case distribution aliases -> ngboost names (classification).
CLASSIFICATION_DISTRIBUTIONS = dict(
    bernoulli="Bernoulli",
    categorical="k_categorical",
)

# Scoring-rule aliases -> ngboost class names ("nll" is an alias of "log").
SCORES = dict(
    crps="CRPScore",
    mle="MLE",
    log="LogScore",
    nll="LogScore",
)

# Named hyperparameter bundles; explicit constructor arguments on the
# estimators override individual entries.
NGBOOST_PRESETS = {
    "endgame": dict(
        n_estimators=500,
        learning_rate=0.01,
        minibatch_frac=1.0,
        col_sample=1.0,
        tol=1e-4,
        natural_gradient=True,
    ),
    "fast": dict(
        n_estimators=100,
        learning_rate=0.1,
        minibatch_frac=0.5,
        col_sample=0.8,
        tol=1e-3,
        natural_gradient=True,
    ),
    "accurate": dict(
        n_estimators=1000,
        learning_rate=0.005,
        minibatch_frac=1.0,
        col_sample=1.0,
        tol=1e-5,
        natural_gradient=True,
    ),
    "competition": dict(
        n_estimators=2000,
        learning_rate=0.01,
        minibatch_frac=1.0,
        col_sample=1.0,
        tol=1e-5,
        natural_gradient=True,
    ),
}



[docs]
class NGBoostRegressor(EndgameEstimator, RegressorMixin):
    """NGBoost Regressor for probabilistic regression.

    Produces full probability distributions for predictions, enabling
    uncertainty quantification and scoring with proper scoring rules.

    Parameters
    ----------
    preset : str, default='endgame'
        Hyperparameter preset: 'endgame', 'fast', 'accurate', 'competition'.
        An unrecognised name falls back to the 'endgame' preset.
    distribution : str, default='normal'
        Output distribution: 'normal', 'lognormal', 'exponential',
        'laplace', 't', 'cauchy', 'poisson'. Matched case-insensitively.
    score : str, default='crps'
        Scoring rule: 'crps' (Continuous Ranked Probability Score),
        'mle'/'nll' (Maximum Likelihood / Negative Log Likelihood).
        'crps_score' is accepted as an alias of 'crps', and 'log' /
        'logscore' as aliases of 'mle'. Matched case-insensitively.
    n_estimators : int, optional
        Number of boosting iterations. Overrides preset.
    learning_rate : float, optional
        Learning rate. Overrides preset.
    minibatch_frac : float, optional
        Fraction of data to use in each iteration. Overrides preset.
    col_sample : float, optional
        Fraction of features to use in each iteration. Overrides preset.
    base_learner : estimator, optional
        Base learner for boosting. Default is
        DecisionTreeRegressor(max_depth=3, min_samples_split=10,
        min_samples_leaf=5).
    natural_gradient : bool, default=True
        Use natural gradient (recommended). Always overrides the preset.
    early_stopping_rounds : int, optional
        Early stopping patience. If None, no early stopping. Only forwarded
        to ngboost when validation data is passed to ``fit``.
    random_state : int, optional
        Random seed.
    verbose : bool, default=False
        Enable verbose output.
    **kwargs
        Additional parameters passed to NGBRegressor. Applied last, so they
        take precedence over every other setting.

    Attributes
    ----------
    model_ : NGBRegressor
        Fitted NGBoost model.
    feature_importances_ : ndarray
        Feature importances from the base learners.

    Notes
    -----
    NOTE(review): ``__init__`` stores the ``score`` parameter as the instance
    attribute ``self.score``. Unless the base class intercepts attribute
    access, that string shadows the ``score`` method on instances, so
    ``model.score(X, y)`` would fail with "'str' object is not callable".
    Confirm and resolve before relying on ``score()``.

    Examples
    --------
    >>> from endgame.models import NGBoostRegressor
    >>> model = NGBoostRegressor(distribution='normal', score='crps')
    >>> model.fit(X_train, y_train)
    >>>
    >>> # Point predictions
    >>> y_pred = model.predict(X_test)
    >>>
    >>> # Full distribution predictions
    >>> y_dist = model.pred_dist(X_test)
    >>> mean = y_dist.mean()
    >>> std = y_dist.std()
    >>>
    >>> # Prediction intervals
    >>> lower, upper = model.predict_interval(X_test, alpha=0.1)  # 90% CI
    >>>
    >>> # Negative log-likelihood
    >>> nll = -y_dist.logpdf(y_test).mean()

    References
    ----------
    Duan et al., 2020. "NGBoost: Natural Gradient Boosting for
    Probabilistic Prediction." https://arxiv.org/abs/1910.03225
    """

    def __init__(
        self,
        preset: str = "endgame",
        distribution: str = "normal",
        score: str = "crps",
        n_estimators: int | None = None,
        learning_rate: float | None = None,
        minibatch_frac: float | None = None,
        col_sample: float | None = None,
        base_learner: BaseEstimator | None = None,
        natural_gradient: bool = True,
        early_stopping_rounds: int | None = None,
        random_state: int | None = None,
        verbose: bool = False,
        **kwargs,
    ):
        """Store the configuration; no model is built until ``fit``.

        Raises
        ------
        ImportError
            If ngboost is not installed.
        """
        # Fail fast when the optional dependency is missing.
        _check_ngboost()
        super().__init__(random_state=random_state, verbose=verbose)

        # Fitted-state placeholders, populated by ``fit``.
        self.model_: NGBRegressor | None = None
        self._feature_names: list[str] | None = None

        # Extra keyword arguments forwarded verbatim to NGBRegressor.
        self.kwargs = kwargs

        # Boosting hyperparameters; ``None`` means "take it from the preset".
        self.early_stopping_rounds = early_stopping_rounds
        self.natural_gradient = natural_gradient
        self.base_learner = base_learner
        self.col_sample = col_sample
        self.minibatch_frac = minibatch_frac
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators

        # Preset, output distribution and scoring rule (all given by name).
        # NOTE(review): the ``score`` instance attribute shadows the
        # ``score`` method defined on this class for attribute lookup on
        # instances (unless the base class customises attribute access).
        self.score = score
        self.distribution = distribution
        self.preset = preset

    def _get_distribution(self):
        """Resolve ``self.distribution`` to the matching ngboost class.

        The name is matched case-insensitively.

        Raises
        ------
        ValueError
            If the name is not one of the supported regression distributions.
        """
        key = self.distribution.lower()

        supported = (
            "normal",
            "lognormal",
            "exponential",
            "laplace",
            "t",
            "cauchy",
            "poisson",
        )
        # Reject unknown names before touching any ngboost class.
        if key not in supported:
            raise ValueError(
                f"Unknown distribution: {self.distribution}. "
                f"Choose from: {list(REGRESSION_DISTRIBUTIONS.keys())}"
            )

        lookup = {
            "normal": Normal,
            "lognormal": LogNormal,
            "exponential": Exponential,
            "laplace": Laplace,
            "t": T,
            "cauchy": Cauchy,
            "poisson": Poisson,
        }
        return lookup[key]

    def _get_score(self):
        """Resolve ``self.score`` to an ngboost scoring-rule class.

        Matching is case-insensitive. 'crps' / 'crps_score' select
        ``CRPScore``; 'mle', 'nll', 'log' and 'logscore' all select
        ``LogScore``.

        Raises
        ------
        ValueError
            If the name matches none of the accepted aliases.
        """
        key = self.score.lower()

        if key in {"crps", "crps_score"}:
            return CRPScore
        if key in {"mle", "nll", "log", "logscore"}:
            return LogScore

        raise ValueError(
            f"Unknown score: {self.score}. "
            f"Choose from: {list(SCORES.keys())}"
        )

    def _get_params(self) -> dict[str, Any]:
        """Build the keyword arguments for ``NGBRegressor``.

        Precedence, lowest to highest: preset values, explicit constructor
        arguments, then ``self.kwargs``. An unknown preset name silently
        falls back to the 'endgame' preset.

        Returns
        -------
        dict
            Keyword arguments ready to be splatted into ``NGBRegressor``.
        """
        fallback = NGBOOST_PRESETS["endgame"]
        merged = NGBOOST_PRESETS.get(self.preset, fallback).copy()

        # Explicit constructor values win over the preset; None keeps it.
        for name in ("n_estimators", "learning_rate", "minibatch_frac", "col_sample"):
            override = getattr(self, name)
            if override is not None:
                merged[name] = override

        # natural_gradient has no "unset" value, so it always replaces the
        # preset entry.
        merged["natural_gradient"] = self.natural_gradient

        merged["Dist"] = self._get_distribution()
        merged["Score"] = self._get_score()

        # Fall back to a shallow regression tree when no learner is given.
        learner = self.base_learner
        if learner is None:
            learner = DecisionTreeRegressor(
                max_depth=3,
                min_samples_split=10,
                min_samples_leaf=5,
            )
        merged["Base"] = learner

        if self.random_state is not None:
            merged["random_state"] = self.random_state
        merged["verbose"] = self.verbose

        # User-supplied extras are applied last and may override anything.
        merged.update(self.kwargs)
        return merged


[docs]
    def fit(
        self,
        X,
        y,
        X_val: np.ndarray | None = None,
        y_val: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
        val_sample_weight: np.ndarray | None = None,
    ) -> NGBoostRegressor:
        """Train the underlying ``NGBRegressor``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Training targets; flattened to one dimension.
        X_val : array-like, optional
            Validation features. Used only when ``y_val`` is also given.
        y_val : array-like, optional
            Validation targets. Used only when ``X_val`` is also given.
        sample_weight : array-like, optional
            Per-sample training weights.
        val_sample_weight : array-like, optional
            Per-sample validation weights (ignored without validation data).

        Returns
        -------
        self
        """
        features = self._to_numpy(X)
        targets = np.asarray(y).ravel()

        # Remember column names taken from the original (unconverted) input.
        self._feature_names = self._get_feature_names(X, features.shape[1])

        self.model_ = NGBRegressor(**self._get_params())

        extra = {} if sample_weight is None else {"sample_weight": sample_weight}

        # Validation data and early stopping are forwarded only when both
        # halves of the validation set are present.
        has_validation = not (X_val is None or y_val is None)
        if has_validation:
            val_features = self._to_numpy(X_val)
            val_targets = np.asarray(y_val).ravel()
            extra["X_val"] = val_features
            extra["Y_val"] = val_targets
            if val_sample_weight is not None:
                extra["val_sample_weight"] = val_sample_weight
            if self.early_stopping_rounds is not None:
                extra["early_stopping_rounds"] = self.early_stopping_rounds

        self._log(f"Training NGBoost regressor with {len(features)} samples...")
        self.model_.fit(features, targets, **extra)

        self._is_fitted = True
        return self



[docs]
    def predict(self, X) -> np.ndarray:
        """Return point predictions from the fitted model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Output of the wrapped model's ``predict`` (the predicted means).
        """
        self._check_is_fitted()
        features = self._to_numpy(X)
        point_estimates = self.model_.predict(features)
        return point_estimates



[docs]
    def pred_dist(self, X):
        """Return the predicted distribution object for each sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        dist : ngboost distribution
            Predicted distributions with methods:
            - mean(): Expected value
            - std(): Standard deviation
            - var(): Variance
            - logpdf(y): Log probability density
            - pdf(y): Probability density
            - cdf(y): Cumulative distribution function
            - ppf(q): Percent point function (inverse CDF)
            - sample(n): Draw n samples
        """
        self._check_is_fitted()
        features = self._to_numpy(X)
        distribution = self.model_.pred_dist(features)
        return distribution



[docs]
    def predict_interval(
        self,
        X,
        alpha: float = 0.1,
    ) -> tuple[np.ndarray, np.ndarray]:
        """Predict central prediction intervals.

        The bounds are the ``alpha / 2`` and ``1 - alpha / 2`` quantiles of
        the predicted distribution.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.
        alpha : float, default=0.1
            Significance level. Returns (1-alpha) prediction interval.
            E.g., alpha=0.1 returns 90% prediction interval.
            Must lie strictly between 0 and 1.

        Returns
        -------
        lower : ndarray of shape (n_samples,)
            Lower bound of prediction interval.
        upper : ndarray of shape (n_samples,)
            Upper bound of prediction interval.

        Raises
        ------
        ValueError
            If ``alpha`` is not strictly between 0 and 1 (including NaN).
        """
        # Validate up front: out-of-range quantiles would otherwise surface
        # as silent NaN / infinite bounds instead of a clear error.
        alpha_arr = np.asarray(alpha, dtype=float)
        if not np.all((alpha_arr > 0.0) & (alpha_arr < 1.0)):
            raise ValueError(
                f"alpha must be in the open interval (0, 1), got {alpha!r}"
            )

        dist = self.pred_dist(X)
        lower = dist.ppf(alpha / 2)
        upper = dist.ppf(1 - alpha / 2)
        return lower, upper



[docs]
    def predict_std(self, X) -> np.ndarray:
        """Return the standard deviation of each predicted distribution.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        std : ndarray of shape (n_samples,)
            Result of calling ``std()`` on the predicted distribution.
        """
        return self.pred_dist(X).std()



[docs]
    def score(self, X, y, sample_weight=None) -> float:
        """Return the mean log-likelihood of ``y`` under the predictions.

        This is the negated mean negative log-likelihood, so higher is
        better (the scikit-learn convention for ``score``).

        NOTE(review): ``__init__`` also stores a ``score`` string as an
        instance attribute, which appears to shadow this method on
        instances; confirm how callers are expected to reach it.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.
        y : array-like of shape (n_samples,)
            True target values.
        sample_weight : array-like of shape (n_samples,), optional
            Sample weights. When given, the weighted average of the
            per-sample log-likelihoods is returned.

        Returns
        -------
        score : float
            (Weighted) mean log-likelihood; higher is better.
        """
        self._check_is_fitted()
        dist = self.pred_dist(X)
        y_arr = np.asarray(y).ravel()

        # Continuous distributions expose ``logpdf``; discrete ones (e.g.
        # Poisson backed by scipy) expose ``logpmf`` instead.
        log_density = getattr(dist, "logpdf", None)
        if log_density is None:
            log_density = dist.logpmf
        log_lik = np.asarray(log_density(y_arr), dtype=float)

        if sample_weight is None:
            return float(log_lik.mean())

        weights = np.asarray(sample_weight, dtype=float).ravel()
        return float(np.average(log_lik, weights=weights))


    @property
    def feature_importances_(self) -> np.ndarray:
        """Feature importances reported by the fitted ngboost model."""
        self._check_is_fitted()
        fitted_model = self.model_
        return fitted_model.feature_importances_

    def _get_feature_names(self, X, n_features: int) -> list[str]:
        """Extract feature names from input."""
        try:
            import pandas as pd
            if isinstance(X, pd.DataFrame):
                return list(X.columns)
        except ImportError:
            pass

        try:
            import polars as pl
            if isinstance(X, (pl.DataFrame, pl.LazyFrame)):
                if isinstance(X, pl.LazyFrame):
                    X = X.collect()
                return list(X.columns)
        except ImportError:
            pass

        return [f"f{i}" for i in range(n_features)]




[docs]
class NGBoostClassifier(ClassifierMixin, EndgameEstimator):
    """NGBoost Classifier for probabilistic classification.

    Produces calibrated probability distributions over classes,
    with proper uncertainty quantification.

    The output distribution is chosen automatically at fit time: Bernoulli
    when the training labels contain exactly two classes, a k-categorical
    distribution otherwise. The scoring rule is always LogScore.

    Parameters
    ----------
    preset : str, default='endgame'
        Hyperparameter preset: 'endgame', 'fast', 'accurate', 'competition'.
        An unrecognised name falls back to the 'endgame' preset.
    n_estimators : int, optional
        Number of boosting iterations. Overrides preset.
    learning_rate : float, optional
        Learning rate. Overrides preset.
    minibatch_frac : float, optional
        Fraction of data to use in each iteration. Overrides preset.
    col_sample : float, optional
        Fraction of features to use in each iteration. Overrides preset.
    base_learner : estimator, optional
        Base learner for boosting. Default is
        DecisionTreeRegressor(max_depth=3, min_samples_split=10,
        min_samples_leaf=5).
    natural_gradient : bool, default=True
        Use natural gradient (recommended). Always overrides the preset.
    early_stopping_rounds : int, optional
        Early stopping patience. If None, no early stopping. Only forwarded
        to ngboost when validation data is passed to ``fit``.
    random_state : int, optional
        Random seed.
    verbose : bool, default=False
        Enable verbose output.
    **kwargs
        Additional parameters passed to NGBClassifier. Applied last, so they
        take precedence over every other setting.

    Attributes
    ----------
    model_ : NGBClassifier
        Fitted NGBoost model.
    classes_ : ndarray
        Unique class labels (sorted, as returned by ``np.unique``).
    n_classes_ : int
        Number of classes.
    feature_importances_ : ndarray
        Feature importances from the base learners.

    Notes
    -----
    Labels that are not already the contiguous integers ``0..n_classes-1``
    are remapped to that range before training, and ``predict`` maps the
    model's outputs back to the original labels.

    Examples
    --------
    >>> from endgame.models import NGBoostClassifier
    >>> model = NGBoostClassifier(preset='endgame')
    >>> model.fit(X_train, y_train)
    >>>
    >>> # Class predictions
    >>> y_pred = model.predict(X_test)
    >>>
    >>> # Probability predictions
    >>> y_proba = model.predict_proba(X_test)
    >>>
    >>> # Distribution predictions
    >>> y_dist = model.pred_dist(X_test)
    >>>
    >>> # Log-loss
    >>> from sklearn.metrics import log_loss
    >>> loss = log_loss(y_test, y_proba)

    References
    ----------
    Duan et al., 2020. "NGBoost: Natural Gradient Boosting for
    Probabilistic Prediction." https://arxiv.org/abs/1910.03225
    """

    def __init__(
        self,
        preset: str = "endgame",
        n_estimators: int | None = None,
        learning_rate: float | None = None,
        minibatch_frac: float | None = None,
        col_sample: float | None = None,
        base_learner: BaseEstimator | None = None,
        natural_gradient: bool = True,
        early_stopping_rounds: int | None = None,
        random_state: int | None = None,
        verbose: bool = False,
        **kwargs,
    ):
        """Store the configuration; no model is built until ``fit``.

        Raises
        ------
        ImportError
            If ngboost is not installed.
        """
        # Fail fast when the optional dependency is missing.
        _check_ngboost()
        super().__init__(random_state=random_state, verbose=verbose)

        # Fitted-state placeholders, populated by ``fit``.
        self._feature_names: list[str] | None = None
        self.n_classes_: int | None = None
        self.classes_: np.ndarray | None = None
        self.model_: NGBClassifier | None = None

        # Extra keyword arguments forwarded verbatim to NGBClassifier.
        self.kwargs = kwargs

        # Boosting hyperparameters; ``None`` means "take it from the preset".
        self.early_stopping_rounds = early_stopping_rounds
        self.natural_gradient = natural_gradient
        self.base_learner = base_learner
        self.col_sample = col_sample
        self.minibatch_frac = minibatch_frac
        self.learning_rate = learning_rate
        self.n_estimators = n_estimators
        self.preset = preset

    def _get_params(self, n_classes: int) -> dict[str, Any]:
        """Build the keyword arguments for ``NGBClassifier``.

        Precedence, lowest to highest: preset values, explicit constructor
        arguments, then ``self.kwargs``. An unknown preset name silently
        falls back to the 'endgame' preset.

        Parameters
        ----------
        n_classes : int
            Number of distinct training labels; selects the distribution.

        Returns
        -------
        dict
            Keyword arguments ready to be splatted into ``NGBClassifier``.
        """
        fallback = NGBOOST_PRESETS["endgame"]
        merged = NGBOOST_PRESETS.get(self.preset, fallback).copy()

        # Explicit constructor values win over the preset; None keeps it.
        for name in ("n_estimators", "learning_rate", "minibatch_frac", "col_sample"):
            override = getattr(self, name)
            if override is not None:
                merged[name] = override

        merged["natural_gradient"] = self.natural_gradient

        # Exactly two classes use Bernoulli; any other count uses a
        # categorical distribution sized to the number of classes.
        merged["Dist"] = Bernoulli if n_classes == 2 else k_categorical(n_classes)

        # Classification is always trained with the log score.
        merged["Score"] = LogScore

        # Fall back to a shallow regression tree when no learner is given.
        learner = self.base_learner
        if learner is None:
            learner = DecisionTreeRegressor(
                max_depth=3,
                min_samples_split=10,
                min_samples_leaf=5,
            )
        merged["Base"] = learner

        if self.random_state is not None:
            merged["random_state"] = self.random_state
        merged["verbose"] = self.verbose

        # User-supplied extras are applied last and may override anything.
        merged.update(self.kwargs)
        return merged


[docs]
    def fit(
        self,
        X,
        y,
        X_val: np.ndarray | None = None,
        y_val: np.ndarray | None = None,
        sample_weight: np.ndarray | None = None,
        val_sample_weight: np.ndarray | None = None,
    ) -> NGBoostClassifier:
        """Fit the NGBoost classifier.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target labels.
        X_val : array-like, optional
            Validation features for early stopping. Used only when ``y_val``
            is also given.
        y_val : array-like, optional
            Validation labels for early stopping. Used only when ``X_val``
            is also given. Every label must also occur in ``y``.
        sample_weight : array-like, optional
            Training sample weights.
        val_sample_weight : array-like, optional
            Validation sample weights.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y_val`` contains a label that does not occur in ``y``.
        """
        X_arr = self._to_numpy(X)
        y_arr = np.asarray(y).ravel()

        # Sorted unique labels plus, for each sample, its index into them.
        self.classes_, y_encoded = np.unique(y_arr, return_inverse=True)
        self.n_classes_ = len(self.classes_)

        # Remap labels to contiguous 0..n-1 (required by ngboost k_categorical).
        # The mapping is kept because ``predict`` checks it to decide whether
        # model outputs must be translated back to the original labels.
        self._label_remap = None
        if not np.array_equal(self.classes_, np.arange(self.n_classes_)):
            self._label_remap = {c: i for i, c in enumerate(self.classes_)}
            y_arr = y_encoded.ravel()

        # Store feature names
        self._feature_names = self._get_feature_names(X, X_arr.shape[1])

        # Create model
        self.model_ = NGBClassifier(**self._get_params(self.n_classes_))

        # Prepare fit arguments
        fit_kwargs = {}
        if sample_weight is not None:
            fit_kwargs["sample_weight"] = sample_weight

        # Early stopping
        if X_val is not None and y_val is not None:
            X_val_arr = self._to_numpy(X_val)
            y_val_arr = np.asarray(y_val).ravel()

            # An unseen validation label has no class index. Passing it
            # through unchanged could silently alias a different class
            # (e.g. label 0 when the training classes are {1, 2}).
            unseen = ~np.isin(y_val_arr, self.classes_)
            if unseen.any():
                raise ValueError(
                    "y_val contains labels not present in the training data: "
                    f"{np.unique(y_val_arr[unseen]).tolist()}"
                )
            if self._label_remap is not None:
                # classes_ is sorted and every label is known to be present,
                # so searchsorted yields exactly the class index.
                y_val_arr = np.searchsorted(self.classes_, y_val_arr)

            fit_kwargs["X_val"] = X_val_arr
            fit_kwargs["Y_val"] = y_val_arr
            if val_sample_weight is not None:
                fit_kwargs["val_sample_weight"] = val_sample_weight
            if self.early_stopping_rounds is not None:
                fit_kwargs["early_stopping_rounds"] = self.early_stopping_rounds

        self._log(f"Training NGBoost classifier with {len(X_arr)} samples, {self.n_classes_} classes...")
        self.model_.fit(X_arr, y_arr, **fit_kwargs)

        self._is_fitted = True
        return self



[docs]
    def predict(self, X) -> np.ndarray:
        """Predict class labels in the original label space.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels.
        """
        self._check_is_fitted()
        raw = self.model_.predict(self._to_numpy(X))

        # Without a remap the model already speaks the caller's labels.
        if self._label_remap is None:
            return raw

        # Model outputs are class indices; translate them back.
        return self.classes_[raw.astype(int)]



[docs]
    def predict_proba(self, X) -> np.ndarray:
        """Return the class probabilities produced by the fitted model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            Class probabilities, as returned by the wrapped model.
        """
        self._check_is_fitted()
        features = self._to_numpy(X)
        probabilities = self.model_.predict_proba(features)
        return probabilities



[docs]
    def pred_dist(self, X):
        """Return the predicted distribution over classes for each sample.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        dist : ngboost distribution
            Predicted distributions.
        """
        self._check_is_fitted()
        features = self._to_numpy(X)
        distribution = self.model_.pred_dist(features)
        return distribution



[docs]
    def score(self, X, y, sample_weight=None) -> float:
        """Return the accuracy of ``predict(X)`` against ``y``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.
        y : array-like of shape (n_samples,)
            True labels.
        sample_weight : array-like, optional
            Sample weights, forwarded to ``accuracy_score``.

        Returns
        -------
        score : float
            Accuracy score.
        """
        # Imported lazily, matching the rest of this method's dependencies.
        from sklearn.metrics import accuracy_score

        predictions = self.predict(X)
        accuracy = accuracy_score(y, predictions, sample_weight=sample_weight)
        return accuracy


    @property
    def feature_importances_(self) -> np.ndarray:
        """Feature importances reported by the fitted ngboost model."""
        self._check_is_fitted()
        fitted_model = self.model_
        return fitted_model.feature_importances_

    def _get_feature_names(self, X, n_features: int) -> list[str]:
        """Extract feature names from input."""
        try:
            import pandas as pd
            if isinstance(X, pd.DataFrame):
                return list(X.columns)
        except ImportError:
            pass

        try:
            import polars as pl
            if isinstance(X, (pl.DataFrame, pl.LazyFrame)):
                if isinstance(X, pl.LazyFrame):
                    X = X.collect()
                return list(X.columns)
        except ImportError:
            pass

        return [f"f{i}" for i in range(n_features)]



# Convenience function for survival analysis (if needed in future)
def create_ngboost_survival(
    distribution: str = "exponential",
    **kwargs
) -> NGBoostRegressor:
    """Build an ``NGBoostRegressor`` configured for survival-time targets.

    The regressor is always created with the ``"mle"`` scoring rule; passing
    ``score`` through ``kwargs`` therefore raises ``TypeError`` (duplicate
    keyword argument).

    Parameters
    ----------
    distribution : str, default='exponential'
        Distribution for survival times: 'exponential', 'lognormal'.
    **kwargs
        Additional parameters for NGBoostRegressor.

    Returns
    -------
    model : NGBoostRegressor
        Configured NGBoost model for survival analysis.

    Notes
    -----
    For survival analysis with censoring, use the ngboost library
    directly with NGBSurvival class.
    """
    survival_model = NGBoostRegressor(distribution=distribution, score="mle", **kwargs)
    return survival_model