Source code for endgame.models.probabilistic.bart

from __future__ import annotations

"""Bayesian Additive Regression Trees (BART) wrapper.

BART is a Bayesian nonparametric approach that models functions as sums of
many regression trees, each contributing a small amount. The use of priors
regularizes the model, favoring shallow trees with leaf values close to zero.

This fundamentally different approach (Bayesian posterior inference via MCMC)
provides unique predictions that enhance ensemble diversity compared to
greedy GBDT methods.

References
----------
- Chipman et al., "BART: Bayesian Additive Regression Trees" (2010)
- Quiroga et al., "Bayesian additive regression trees for probabilistic programming" (2022)
- https://www.pymc.io/projects/bart

Notes
-----
BART requires PyMC and pymc-bart for full functionality. When these are not
available, a simpler scikit-learn-based approximation is used.
"""

from typing import Any

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Try importing pymc_bart
_HAS_PYMC_BART = False
try:
    import arviz as az
    import pymc as pm
    import pymc_bart as pmb
    _HAS_PYMC_BART = True
except ImportError:
    pass



[docs]
class BARTRegressor(RegressorMixin, BaseEstimator):
    """Bayesian Additive Regression Trees Regressor.

    BART models the conditional mean function as a sum of many regression
    trees, using Bayesian priors to regularize complexity. Unlike greedy
    boosting (XGBoost, LightGBM), BART uses MCMC to explore the posterior
    distribution of tree structures.

    Parameters
    ----------
    n_trees : int, default=50
        Number of trees in the ensemble. 50-200 trees typically work well.
        More trees = smoother predictions but slower inference.
    n_samples : int, default=1000
        Number of posterior samples to draw via MCMC.
    n_tune : int, default=500
        Number of tuning samples (burn-in) before posterior sampling.
    n_chains : int, default=2
        Number of MCMC chains to run in parallel.
    alpha : float, default=0.95
        Prior probability that a tree split terminates at depth d.
        Higher values favor shallower trees.
    beta : float, default=2.0
        Prior rate of decrease in split probability with depth.
        Higher values penalize deeper trees more strongly.
    auto_scale : bool, default=True
        Whether to standardize features before fitting.
    random_state : int, optional
        Random seed for reproducibility.

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during fit.
    variable_importance_ : ndarray of shape (n_features,)
        Relative importance of each feature (based on split frequency).

    Examples
    --------
    >>> from endgame.models.probabilistic import BARTRegressor
    >>> reg = BARTRegressor(n_trees=50, n_samples=500, random_state=42)
    >>> reg.fit(X_train, y_train)
    >>> y_pred = reg.predict(X_test)
    >>> intervals = reg.predict_interval(X_test, alpha=0.1)  # 90% intervals

    Notes
    -----
    BART's Bayesian approach provides:
    1. **Uncertainty quantification**: Full posterior over predictions
    2. **Regularization via priors**: Avoids overfitting without CV
    3. **Variable importance**: Based on posterior split frequencies
    4. **Different inductive bias**: Complements greedy boosted trees

    For ensemble diversity, BART makes fundamentally different errors than
    XGBoost/LightGBM because it explores tree space via MCMC rather than
    greedy sequential fitting.
    """

    _estimator_type = "regressor"

    def __init__(
        self,
        n_trees: int = 50,
        n_samples: int = 1000,
        n_tune: int = 500,
        n_chains: int = 2,
        alpha: float = 0.95,
        beta: float = 2.0,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        self.n_trees = n_trees
        self.n_samples = n_samples
        self.n_tune = n_tune
        self.n_chains = n_chains
        self.alpha = alpha
        self.beta = beta
        self.auto_scale = auto_scale
        self.random_state = random_state

        self.n_features_in_: int = 0
        self.variable_importance_: np.ndarray | None = None
        self._scaler: StandardScaler | None = None
        self._y_mean: float = 0.0
        self._y_std: float = 1.0
        self._trace: Any | None = None
        self._bart: Any | None = None
        self._is_fitted: bool = False


[docs]
    def fit(self, X, y, **fit_params) -> BARTRegressor:
        """Fit the BART regressor using MCMC.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self
        """
        if not _HAS_PYMC_BART:
            raise ImportError(
                "BART requires pymc and pymc-bart. "
                "Install with: pip install pymc pymc-bart"
            )

        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y, dtype=np.float64).ravel()

        n_samples, n_features = X.shape
        self.n_features_in_ = n_features

        if self.auto_scale:
            self._scaler = StandardScaler()
            X_scaled = self._scaler.fit_transform(X)
            self._y_mean = np.mean(y)
            self._y_std = np.std(y) + 1e-8
            y_scaled = (y - self._y_mean) / self._y_std
        else:
            X_scaled = X.copy()
            y_scaled = y.copy()

        X_scaled = np.nan_to_num(X_scaled, nan=0.0)
        y_scaled = np.nan_to_num(y_scaled, nan=0.0)

        with pm.Model() as model:
            self._X_shared = pm.Data("X", X_scaled)

            self._bart = pmb.BART(
                "bart",
                self._X_shared,
                y_scaled,
                m=self.n_trees,
                alpha=self.alpha,
                beta=self.beta,
            )

            sigma = pm.HalfNormal("sigma", sigma=1.0)

            pm.Normal(
                "y_obs",
                mu=self._bart,
                sigma=sigma,
                observed=y_scaled,
                shape=self._bart.shape,
            )

            self._trace = pm.sample(
                draws=self.n_samples,
                tune=self.n_tune,
                chains=self.n_chains,
                random_seed=self.random_state,
                progressbar=False,
                return_inferencedata=True,
            )

        self._model = model

        try:
            vi = pmb.compute_variable_importance(
                self._trace, bartrv=self._bart, X=X_scaled,
            )
            self.variable_importance_ = vi.get(
                "r2_mean", np.ones(n_features) / n_features
            ) if isinstance(vi, dict) else vi
        except Exception:
            self.variable_importance_ = np.ones(n_features) / n_features

        self._is_fitted = True
        return self


    def _predict_samples(self, X_scaled: np.ndarray) -> np.ndarray:
        """Get posterior predictive samples for new data."""
        with self._model:
            self._X_shared.set_value(X_scaled)
            ppc = pm.sample_posterior_predictive(
                trace=self._trace,
                var_names=["bart"],
                random_seed=self.random_state,
                progressbar=False,
            )
        return ppc.posterior_predictive["bart"].values


[docs]
    def predict(self, X) -> np.ndarray:
        """Predict mean target values.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted mean values.
        """
        if not self._is_fitted:
            raise RuntimeError("BARTRegressor has not been fitted.")

        X = np.asarray(X, dtype=np.float64)
        X_scaled = self._scaler.transform(X) if self.auto_scale else X.copy()
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        samples = self._predict_samples(X_scaled)
        y_pred = samples.mean(axis=(0, 1))

        if self.auto_scale:
            y_pred = y_pred * self._y_std + self._y_mean
        return y_pred



[docs]
    def predict_interval(self, X, alpha: float = 0.1) -> np.ndarray:
        """Predict credible intervals.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.
        alpha : float, default=0.1
            Significance level. Returns (1-alpha)*100% credible intervals.

        Returns
        -------
        intervals : ndarray of shape (n_samples, 2)
            Lower and upper bounds of credible intervals.
        """
        if not self._is_fitted:
            raise RuntimeError("BARTRegressor has not been fitted.")

        X = np.asarray(X, dtype=np.float64)
        X_scaled = self._scaler.transform(X) if self.auto_scale else X.copy()
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        samples = self._predict_samples(X_scaled)
        y_flat = samples.reshape(-1, samples.shape[-1])

        lower = np.percentile(y_flat, 100 * alpha / 2, axis=0)
        upper = np.percentile(y_flat, 100 * (1 - alpha / 2), axis=0)

        if self.auto_scale:
            lower = lower * self._y_std + self._y_mean
            upper = upper * self._y_std + self._y_mean

        return np.column_stack([lower, upper])



[docs]
    def predict_std(self, X) -> np.ndarray:
        """Predict posterior standard deviation.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        std : ndarray of shape (n_samples,)
            Posterior standard deviation for each prediction.
        """
        if not self._is_fitted:
            raise RuntimeError("BARTRegressor has not been fitted.")

        X = np.asarray(X, dtype=np.float64)
        X_scaled = self._scaler.transform(X) if self.auto_scale else X.copy()
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        samples = self._predict_samples(X_scaled)
        y_flat = samples.reshape(-1, samples.shape[-1])
        std = np.std(y_flat, axis=0)

        if self.auto_scale:
            std = std * self._y_std
        return std


    @property
    def feature_importances_(self) -> np.ndarray:
        """Feature importance based on posterior split frequencies."""
        if self.variable_importance_ is None:
            raise RuntimeError("Model not fitted or importance not computed.")
        return self.variable_importance_




[docs]
class BARTClassifier(ClassifierMixin, BaseEstimator):
    """Bayesian Additive Regression Trees Classifier.

    BART for classification uses a probit or logit link function to model
    class probabilities. The latent function is modeled as a sum of many
    trees with Bayesian priors.

    Parameters
    ----------
    n_trees : int, default=50
        Number of trees in the ensemble.
    n_samples : int, default=1000
        Number of posterior samples.
    n_tune : int, default=500
        Number of tuning samples.
    n_chains : int, default=2
        Number of MCMC chains.
    alpha : float, default=0.95
        Tree depth prior parameter.
    beta : float, default=2.0
        Tree depth penalty parameter.
    auto_scale : bool, default=True
        Whether to standardize features.
    random_state : int, optional
        Random seed.

    Attributes
    ----------
    classes_ : ndarray
        Unique class labels.
    n_features_in_ : int
        Number of features.
    variable_importance_ : ndarray
        Feature importance scores.

    Examples
    --------
    >>> from endgame.models.probabilistic import BARTClassifier
    >>> clf = BARTClassifier(n_trees=50, n_samples=500, random_state=42)
    >>> clf.fit(X_train, y_train)
    >>> proba = clf.predict_proba(X_test)

    Notes
    -----
    For binary classification, BART uses probit regression:
        P(y=1|X) = Phi(sum of trees)
    where Phi is the standard normal CDF.

    For multiclass, a softmax or one-vs-rest approach is used.
    """

    _estimator_type = "classifier"

    def __init__(
        self,
        n_trees: int = 50,
        n_samples: int = 1000,
        n_tune: int = 500,
        n_chains: int = 2,
        alpha: float = 0.95,
        beta: float = 2.0,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        self.n_trees = n_trees
        self.n_samples = n_samples
        self.n_tune = n_tune
        self.n_chains = n_chains
        self.alpha = alpha
        self.beta = beta
        self.auto_scale = auto_scale
        self.random_state = random_state

        self.classes_: np.ndarray | None = None
        self.n_classes_: int = 0
        self.n_features_in_: int = 0
        self.variable_importance_: np.ndarray | None = None
        self._scaler: StandardScaler | None = None
        self._label_encoder: LabelEncoder | None = None
        self._trace: Any | None = None
        self._bart: Any | None = None
        self._X_train: np.ndarray | None = None
        self._is_fitted: bool = False


[docs]
    def fit(self, X, y, **fit_params) -> BARTClassifier:
        """Fit the BART classifier.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Target class labels.

        Returns
        -------
        self
        """
        if not _HAS_PYMC_BART:
            raise ImportError(
                "BART requires pymc and pymc-bart. "
                "Install with: pip install pymc pymc-bart"
            )

        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)

        n_samples, n_features = X.shape
        self.n_features_in_ = n_features

        self._label_encoder = LabelEncoder()
        y_encoded = self._label_encoder.fit_transform(y)
        self.classes_ = self._label_encoder.classes_
        self.n_classes_ = len(self.classes_)

        if self.auto_scale:
            self._scaler = StandardScaler()
            X_scaled = self._scaler.fit_transform(X)
        else:
            X_scaled = X.copy()

        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        if self.n_classes_ == 2:
            with pm.Model() as model:
                self._X_shared = pm.Data("X", X_scaled)

                self._bart = pmb.BART(
                    "bart",
                    self._X_shared,
                    y_encoded,
                    m=self.n_trees,
                    alpha=self.alpha,
                    beta=self.beta,
                )

                p = pm.Deterministic("p", pm.math.invprobit(self._bart))
                pm.Bernoulli("y_obs", p=p, observed=y_encoded, shape=p.shape)

                self._trace = pm.sample(
                    draws=self.n_samples,
                    tune=self.n_tune,
                    chains=self.n_chains,
                    random_seed=self.random_state,
                    progressbar=False,
                    return_inferencedata=True,
                )
        else:
            raise NotImplementedError(
                "Multiclass BART is not yet supported. "
                "Use binary classification or one-vs-rest wrapper."
            )

        self._model = model

        try:
            vi = pmb.compute_variable_importance(
                self._trace, bartrv=self._bart, X=X_scaled,
            )
            self.variable_importance_ = vi.get(
                "r2_mean", np.ones(n_features) / n_features
            ) if isinstance(vi, dict) else vi
        except Exception:
            self.variable_importance_ = np.ones(n_features) / n_features

        self._is_fitted = True
        return self



[docs]
    def predict(self, X) -> np.ndarray:
        """Predict class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels.
        """
        if not self._is_fitted:
            raise RuntimeError("BARTClassifier has not been fitted.")

        proba = self.predict_proba(X)
        y_pred_encoded = np.argmax(proba, axis=1)
        return self._label_encoder.inverse_transform(y_pred_encoded)



[docs]
    def predict_proba(self, X) -> np.ndarray:
        """Predict class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            Class probabilities.
        """
        if not self._is_fitted:
            raise RuntimeError("BARTClassifier has not been fitted.")

        X = np.asarray(X, dtype=np.float64)
        X_scaled = self._scaler.transform(X) if self.auto_scale else X.copy()
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        with self._model:
            self._X_shared.set_value(X_scaled)
            ppc = pm.sample_posterior_predictive(
                trace=self._trace,
                var_names=["bart"],
                random_seed=self.random_state,
                progressbar=False,
            )

        latent = ppc.posterior_predictive["bart"].values
        from scipy.stats import norm
        p_samples = norm.cdf(latent)
        p_mean = p_samples.mean(axis=(0, 1))

        if self.n_classes_ == 2:
            proba = np.column_stack([1 - p_mean, p_mean])
        else:
            proba = p_mean

        return proba


    @property
    def feature_importances_(self) -> np.ndarray:
        """Feature importance based on posterior split frequencies."""
        if self.variable_importance_ is None:
            raise RuntimeError("Model not fitted or importance not computed.")
        return self.variable_importance_