Source code for endgame.models.kernel.gaussian_process

"""Gaussian Process models with competition-tuned defaults.

Gaussian Processes provide Bayesian inference with kernel methods,
offering principled uncertainty quantification and different error
patterns from tree-based and neural network models.

References
----------
- Rasmussen & Williams, "Gaussian Processes for Machine Learning" (2006)
- sklearn.gaussian_process documentation
"""

# The docstring must be the first statement of the module to populate
# ``__doc__``; a ``from __future__`` import is allowed to follow it.
from __future__ import annotations

from typing import Any

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.gaussian_process import GaussianProcessClassifier as _GPClassifier
from sklearn.gaussian_process import GaussianProcessRegressor as _GPRegressor
from sklearn.gaussian_process.kernels import (
    RBF,
    ConstantKernel,
    DotProduct,
    Matern,
    RationalQuadratic,
    WhiteKernel,
)
from sklearn.preprocessing import LabelEncoder, StandardScaler

def _with_amplitude(base_factory):
    """Wrap ``base_factory`` so its kernel is multiplied by ``ConstantKernel(1.0)``."""

    def build(length_scale):
        return ConstantKernel(1.0) * base_factory(length_scale)

    return build


# Kernel presets, keyed by the string accepted for the ``kernel`` parameter.
# Every value is a one-argument factory: call it with a length scale to get a
# freshly constructed kernel. "matern" and "matern52" build the same kernel
# (nu=2.5); "linear" accepts the length scale but does not use it.
KERNEL_PRESETS = {
    "rbf": _with_amplitude(lambda length_scale: RBF(length_scale=length_scale)),
    "matern": _with_amplitude(
        lambda length_scale: Matern(length_scale=length_scale, nu=2.5)
    ),
    "matern12": _with_amplitude(
        lambda length_scale: Matern(length_scale=length_scale, nu=0.5)
    ),
    "matern32": _with_amplitude(
        lambda length_scale: Matern(length_scale=length_scale, nu=1.5)
    ),
    "matern52": _with_amplitude(
        lambda length_scale: Matern(length_scale=length_scale, nu=2.5)
    ),
    "rq": _with_amplitude(
        lambda length_scale: RationalQuadratic(length_scale=length_scale)
    ),
    "linear": _with_amplitude(lambda length_scale: DotProduct(sigma_0=1.0)),
}


class GPClassifier(ClassifierMixin, BaseEstimator):
    """Gaussian Process Classifier with competition-tuned defaults.

    A Bayesian kernel method that provides probabilistic predictions
    with principled uncertainty estimates. Different inductive bias from
    trees and neural networks, making it valuable for ensemble diversity.

    Parameters
    ----------
    kernel : str or sklearn kernel, default='rbf'
        Kernel type. Options: 'rbf', 'matern', 'matern12', 'matern32',
        'matern52', 'rq', 'linear', or a sklearn kernel object. A kernel
        object is passed through unchanged.
    length_scale : float, default=1.0
        Length scale handed to the preset kernel factory. Only used when
        ``kernel`` is a string.
    n_restarts_optimizer : int, default=3
        Number of restarts for the optimizer.
    max_iter_predict : int, default=100
        Maximum iterations for prediction.
    warm_start : bool, default=False
        Use previous fit as initialization.
    multi_class : str, default='one_vs_rest'
        Multi-class strategy: 'one_vs_rest' or 'one_vs_one'. scikit-learn
        documents that 'one_vs_one' does not support probability
        estimates, so ``predict_proba`` needs 'one_vs_rest'.
    auto_scale : bool, default=True
        Standardize features with ``StandardScaler`` before fitting. The
        scaler fitted in ``fit`` is reused at prediction time.
    random_state : int, optional
        Random seed for reproducibility.

    Attributes
    ----------
    classes_ : ndarray
        Unique class labels.
    n_classes_ : int
        Number of distinct class labels seen in ``fit``.
    n_features_in_ : int
        Number of features.
    model_ : GaussianProcessClassifier
        Fitted sklearn GP classifier.

    Examples
    --------
    >>> from endgame.models.kernel import GPClassifier
    >>> clf = GPClassifier(kernel='rbf', random_state=42)
    >>> clf.fit(X_train, y_train)
    >>> proba = clf.predict_proba(X_test)
    >>> # Get uncertainty
    >>> proba, std = clf.predict_proba(X_test, return_std=True)

    Notes
    -----
    Gaussian Processes excel on small-medium datasets where uncertainty
    matters. They scale O(n^3) with training size, so not suitable for
    large datasets (>10k samples) without approximations.

    NaN feature values are replaced by 0 (after scaling, when scaling is
    enabled) both in ``fit`` and at prediction time.
    """

    _estimator_type = "classifier"

    def __init__(
        self,
        kernel: str | Any = "rbf",
        length_scale: float = 1.0,
        n_restarts_optimizer: int = 3,
        max_iter_predict: int = 100,
        warm_start: bool = False,
        multi_class: str = "one_vs_rest",
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        self.kernel = kernel
        self.length_scale = length_scale
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.multi_class = multi_class
        self.auto_scale = auto_scale
        self.random_state = random_state

        # Fitted state; populated by ``fit``.
        self.classes_: np.ndarray | None = None
        self.n_classes_: int = 0
        self.n_features_in_: int = 0
        self.model_: _GPClassifier | None = None
        self._scaler: StandardScaler | None = None
        self._label_encoder: LabelEncoder | None = None
        self._is_fitted: bool = False

    def __sklearn_is_fitted__(self) -> bool:
        """Report fitted state to ``sklearn.utils.validation.check_is_fitted``.

        The trailing-underscore attributes exist from construction, so the
        default attribute-based heuristic would treat an unfitted instance
        as fitted. The explicit flag is the source of truth.
        """
        return self._is_fitted

    def _get_kernel(self):
        """Get kernel object from string or return as-is.

        Raises
        ------
        ValueError
            If ``kernel`` is a string that is not a key of ``KERNEL_PRESETS``.
        """
        if isinstance(self.kernel, str):
            if self.kernel not in KERNEL_PRESETS:
                raise ValueError(f"Unknown kernel: {self.kernel}. "
                                 f"Options: {list(KERNEL_PRESETS.keys())}")
            return KERNEL_PRESETS[self.kernel](self.length_scale)
        return self.kernel

    def _check_fitted(self) -> None:
        """Raise ``RuntimeError`` unless ``fit`` has completed."""
        if not self._is_fitted:
            raise RuntimeError("GPClassifier has not been fitted.")

    def _prepare_features(self, X) -> np.ndarray:
        """Apply the preprocessing that was used in ``fit`` to ``X``.

        Whether to scale is decided by the presence of the fitted scaler,
        not by the current ``auto_scale`` value, so changing the
        hyper-parameter after fitting cannot desynchronize prediction
        inputs from the data the model was trained on.
        """
        X = np.asarray(X, dtype=np.float64)
        if self._scaler is not None:
            X = self._scaler.transform(X)
        return np.nan_to_num(X, nan=0.0)

    def fit(self, X, y, **fit_params) -> GPClassifier:
        """Fit the Gaussian Process classifier.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Target labels.
        **fit_params
            Accepted for API compatibility; not used.

        Returns
        -------
        self
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)

        self.n_features_in_ = X.shape[1]

        # Encode labels
        self._label_encoder = LabelEncoder()
        y_encoded = self._label_encoder.fit_transform(y)
        self.classes_ = self._label_encoder.classes_
        self.n_classes_ = len(self.classes_)

        # Scale features. Drop any scaler left over from an earlier fit when
        # scaling is disabled so prediction mirrors this fit exactly.
        if self.auto_scale:
            self._scaler = StandardScaler()
            X_scaled = self._scaler.fit_transform(X)
        else:
            self._scaler = None
            X_scaled = X

        # Handle NaN
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        # Create and fit model
        self.model_ = _GPClassifier(
            kernel=self._get_kernel(),
            n_restarts_optimizer=self.n_restarts_optimizer,
            max_iter_predict=self.max_iter_predict,
            warm_start=self.warm_start,
            multi_class=self.multi_class,
            random_state=self.random_state,
        )
        self.model_.fit(X_scaled, y_encoded)

        self._is_fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        """Predict class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels, in the original label space.

        Raises
        ------
        RuntimeError
            If the classifier has not been fitted.
        """
        self._check_fitted()
        y_pred = self.model_.predict(self._prepare_features(X))
        # The inner model works on encoded labels; map back to the originals.
        return self._label_encoder.inverse_transform(y_pred)

    def predict_proba(self, X, return_std: bool = False) -> np.ndarray | tuple:
        """Predict class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.
        return_std : bool, default=False
            If True, also return uncertainty estimates.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            Class probabilities.
        std : ndarray of shape (n_samples,), optional
            Uncertainty estimates (if return_std=True). This is the
            entropy of each predicted distribution divided by
            ``log(n_classes)``, not a standard deviation.

        Raises
        ------
        RuntimeError
            If the classifier has not been fitted.
        """
        self._check_fitted()
        proba = self.model_.predict_proba(self._prepare_features(X))

        if return_std:
            # Estimate uncertainty from entropy of predictions; the small
            # constant keeps log() finite for zero probabilities.
            entropy = -np.sum(proba * np.log(proba + 1e-10), axis=1)
            max_entropy = np.log(self.n_classes_)
            std = entropy / max_entropy  # Normalized uncertainty
            return proba, std

        return proba
class GPRegressor(RegressorMixin, BaseEstimator):
    """Gaussian Process Regressor with competition-tuned defaults.

    A Bayesian kernel method that provides predictions with principled
    uncertainty estimates through the posterior predictive distribution.

    Parameters
    ----------
    kernel : str or sklearn kernel, default='rbf'
        Kernel type. Options: 'rbf', 'matern', 'matern12', 'matern32',
        'matern52', 'rq', 'linear', or a sklearn kernel object. Preset
        (string) kernels get ``WhiteKernel(noise_level=0.1)`` added; a
        kernel object is passed through unchanged.
    length_scale : float, default=1.0
        Length scale handed to the preset kernel factory. Only used when
        ``kernel`` is a string.
    alpha : float, default=1e-10
        Value added to diagonal for numerical stability.
    n_restarts_optimizer : int, default=3
        Number of restarts for the optimizer.
    normalize_y : bool, default=True
        Normalize target values.
    auto_scale : bool, default=True
        Standardize features with ``StandardScaler`` before fitting. The
        scaler fitted in ``fit`` is reused at prediction time.
    random_state : int, optional
        Random seed for reproducibility.

    Attributes
    ----------
    n_features_in_ : int
        Number of features.
    model_ : GaussianProcessRegressor
        Fitted sklearn GP regressor.

    Examples
    --------
    >>> from endgame.models.kernel import GPRegressor
    >>> reg = GPRegressor(kernel='matern', random_state=42)
    >>> reg.fit(X_train, y_train)
    >>> y_pred, y_std = reg.predict(X_test, return_std=True)
    >>> # Prediction intervals
    >>> lower = y_pred - 1.96 * y_std
    >>> upper = y_pred + 1.96 * y_std

    Notes
    -----
    NaN feature values are replaced by 0 (after scaling, when scaling is
    enabled) in ``fit`` and at prediction time. NaN targets are replaced
    by 0 in ``fit``.
    """

    _estimator_type = "regressor"

    def __init__(
        self,
        kernel: str | Any = "rbf",
        length_scale: float = 1.0,
        alpha: float = 1e-10,
        n_restarts_optimizer: int = 3,
        normalize_y: bool = True,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        self.kernel = kernel
        self.length_scale = length_scale
        self.alpha = alpha
        self.n_restarts_optimizer = n_restarts_optimizer
        self.normalize_y = normalize_y
        self.auto_scale = auto_scale
        self.random_state = random_state

        # Fitted state; populated by ``fit``.
        self.n_features_in_: int = 0
        self.model_: _GPRegressor | None = None
        self._scaler: StandardScaler | None = None
        self._is_fitted: bool = False

    def __sklearn_is_fitted__(self) -> bool:
        """Report fitted state to ``sklearn.utils.validation.check_is_fitted``.

        The trailing-underscore attributes exist from construction, so the
        default attribute-based heuristic would treat an unfitted instance
        as fitted. The explicit flag is the source of truth.
        """
        return self._is_fitted

    def _get_kernel(self):
        """Get kernel object from string or return as-is.

        Raises
        ------
        ValueError
            If ``kernel`` is a string that is not a key of ``KERNEL_PRESETS``.
        """
        if isinstance(self.kernel, str):
            if self.kernel not in KERNEL_PRESETS:
                raise ValueError(f"Unknown kernel: {self.kernel}. "
                                 f"Options: {list(KERNEL_PRESETS.keys())}")
            # Add white noise kernel for regression
            base_kernel = KERNEL_PRESETS[self.kernel](self.length_scale)
            return base_kernel + WhiteKernel(noise_level=0.1)
        return self.kernel

    def _check_fitted(self) -> None:
        """Raise ``RuntimeError`` unless ``fit`` has completed."""
        if not self._is_fitted:
            raise RuntimeError("GPRegressor has not been fitted.")

    def _prepare_features(self, X) -> np.ndarray:
        """Apply the preprocessing that was used in ``fit`` to ``X``.

        Whether to scale is decided by the presence of the fitted scaler,
        not by the current ``auto_scale`` value, so changing the
        hyper-parameter after fitting cannot desynchronize prediction
        inputs from the data the model was trained on.
        """
        X = np.asarray(X, dtype=np.float64)
        if self._scaler is not None:
            X = self._scaler.transform(X)
        return np.nan_to_num(X, nan=0.0)

    def fit(self, X, y, **fit_params) -> GPRegressor:
        """Fit the Gaussian Process regressor.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Target values.
        **fit_params
            Accepted for API compatibility; not used.

        Returns
        -------
        self
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y, dtype=np.float64)

        self.n_features_in_ = X.shape[1]

        # Scale features. Drop any scaler left over from an earlier fit when
        # scaling is disabled so prediction mirrors this fit exactly.
        if self.auto_scale:
            self._scaler = StandardScaler()
            X_scaled = self._scaler.fit_transform(X)
        else:
            self._scaler = None
            X_scaled = X

        # Handle NaN
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)
        y = np.nan_to_num(y, nan=0.0)

        # Create and fit model
        self.model_ = _GPRegressor(
            kernel=self._get_kernel(),
            alpha=self.alpha,
            n_restarts_optimizer=self.n_restarts_optimizer,
            normalize_y=self.normalize_y,
            random_state=self.random_state,
        )
        self.model_.fit(X_scaled, y)

        self._is_fitted = True
        return self

    def predict(self, X, return_std: bool = False, return_cov: bool = False):
        """Predict target values.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.
        return_std : bool, default=False
            If True, return standard deviation of predictions.
        return_cov : bool, default=False
            If True, return covariance of predictions.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted values.
        y_std : ndarray of shape (n_samples,), optional
            Standard deviation (if return_std=True).
        y_cov : ndarray of shape (n_samples, n_samples), optional
            Covariance matrix (if return_cov=True).

        Raises
        ------
        RuntimeError
            If the regressor has not been fitted.
        """
        self._check_fitted()
        return self.model_.predict(
            self._prepare_features(X),
            return_std=return_std,
            return_cov=return_cov,
        )

    def predict_interval(self, X, alpha: float = 0.05) -> tuple:
        """Predict with prediction intervals.

        The bounds are ``y_pred -/+ z * y_std`` where ``z`` is the standard
        normal quantile at ``1 - alpha / 2``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.
        alpha : float, default=0.05
            Significance level (0.05 = 95% interval). Must lie strictly
            between 0 and 1.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Point predictions.
        lower : ndarray of shape (n_samples,)
            Lower bound of prediction interval.
        upper : ndarray of shape (n_samples,)
            Upper bound of prediction interval.

        Raises
        ------
        ValueError
            If ``alpha`` is not strictly between 0 and 1.
        RuntimeError
            If the regressor has not been fitted.
        """
        from scipy import stats

        # Outside (0, 1) the normal quantile is NaN or infinite, which would
        # silently produce meaningless bounds; reject it up front.
        if not 0.0 < alpha < 1.0:
            raise ValueError(
                f"alpha must be strictly between 0 and 1, got {alpha!r}."
            )

        y_pred, y_std = self.predict(X, return_std=True)
        z = stats.norm.ppf(1 - alpha / 2)

        lower = y_pred - z * y_std
        upper = y_pred + z * y_std

        return y_pred, lower, upper

    def sample_y(self, X, n_samples: int = 1, random_state: int | None = None) -> np.ndarray:
        """Sample from the posterior predictive distribution.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Query points.
        n_samples : int, default=1
            Number of samples to draw.
        random_state : int, optional
            Random seed, forwarded unchanged to the underlying model.

        Returns
        -------
        samples : ndarray of shape (n_query, n_samples)
            Samples from posterior predictive.

        Raises
        ------
        RuntimeError
            If the regressor has not been fitted.
        """
        self._check_fitted()
        return self.model_.sample_y(
            self._prepare_features(X),
            n_samples=n_samples,
            random_state=random_state,
        )