# Source code for endgame.models.baselines.elm

from __future__ import annotations

"""Extreme Learning Machine implementation.

ELM is a single-layer feedforward neural network where input weights are
randomly assigned and never updated. Only the output weights are learned
via a closed-form solution (pseudoinverse), making training extremely fast.

This fundamentally different optimization (no backpropagation) provides
unique predictions that enhance ensemble diversity.

References
----------
- Huang et al., "Extreme Learning Machine: Theory and Applications" (2006)
- Huang et al., "Extreme Learning Machine for Regression and Multiclass Classification" (2012)
"""

from collections.abc import Callable

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.preprocessing import LabelEncoder, StandardScaler


def _sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated element-wise.

    The argument is clamped to ``[-500, 500]`` before exponentiation so that
    ``np.exp`` cannot overflow for extreme pre-activations.
    """
    bounded = np.clip(x, -500, 500)
    denominator = 1.0 + np.exp(-bounded)
    return 1.0 / denominator


def _tanh(x):
    """Hyperbolic tangent, evaluated element-wise via ``np.tanh``."""
    squashed = np.tanh(x)
    return squashed


def _relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``."""
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified


def _leaky_relu(x, alpha=0.01):
    """Leaky ReLU activation function.

    Returns ``x`` where ``x > 0`` and ``alpha * x`` elsewhere.

    Parameters
    ----------
    x : array-like
        Pre-activations. Scalars, lists and arrays are all accepted.
    alpha : float, default=0.01
        Slope applied to non-positive inputs.

    Returns
    -------
    ndarray
        Activations with the same shape as ``x``.
    """
    # Coerce first: a plain Python list supports neither ``x > 0`` nor
    # ``alpha * x``, whereas the sibling activations accept any array-like.
    x = np.asarray(x)
    return np.where(x > 0, x, alpha * x)


def _sin(x):
    """Sine activation, evaluated element-wise via ``np.sin``."""
    wave = np.sin(x)
    return wave


def _hardlim(x):
    """Hard limit (unit step) activation function.

    Returns ``1.0`` where ``x >= 0`` and ``0.0`` elsewhere.

    Parameters
    ----------
    x : array-like
        Pre-activations. Scalars, lists and arrays are all accepted.

    Returns
    -------
    ndarray or numpy scalar
        Float step values with the same shape as ``x``.
    """
    # ``np.greater_equal`` always yields a NumPy boolean (array or scalar), so
    # ``.astype`` is available even when ``x`` is a Python scalar or a list;
    # the bare ``x >= 0`` comparison returns a plain ``bool`` for scalars.
    return np.greater_equal(x, 0).astype(float)


# Registry mapping the string names accepted by the ``activation`` parameter
# of the ELM estimators to the element-wise functions defined above.
ACTIVATION_FUNCTIONS = dict(
    sigmoid=_sigmoid,
    tanh=_tanh,
    relu=_relu,
    leaky_relu=_leaky_relu,
    sin=_sin,
    hardlim=_hardlim,
)


class ELMClassifier(ClassifierMixin, BaseEstimator):
    """Extreme Learning Machine Classifier.

    A single-layer neural network with random input weights and analytically
    computed output weights. Training is fast because there is no iterative
    optimization: the hidden layer is a fixed random projection and only the
    linear read-out is solved for.

    Parameters
    ----------
    n_hidden : int, default=500
        Number of hidden neurons.
    activation : str or callable, default='sigmoid'
        Activation function: 'sigmoid', 'tanh', 'relu', 'leaky_relu',
        'sin', 'hardlim', or a callable applied to the pre-activation array.
    alpha : float, default=1e-6
        Regularization parameter for ridge regression.
    auto_scale : bool, default=True
        Automatically scale features before fitting.
    random_state : int, optional
        Random seed for reproducibility.

    Attributes
    ----------
    classes_ : ndarray
        Unique class labels (``None`` before ``fit``).
    n_classes_ : int
        Number of classes seen during ``fit``.
    n_features_in_ : int
        Number of features.
    input_weights_ : ndarray
        Random input-to-hidden weights, shape (n_features, n_hidden).
    biases_ : ndarray
        Random hidden layer biases, shape (1, n_hidden).
    output_weights_ : ndarray
        Learned hidden-to-output weights.

    Examples
    --------
    >>> from endgame.models.baselines import ELMClassifier
    >>> clf = ELMClassifier(n_hidden=500, random_state=42)
    >>> clf.fit(X_train, y_train)  # Milliseconds!
    >>> proba = clf.predict_proba(X_test)

    Notes
    -----
    ELM is valuable for ensemble diversity because:

    1. No backpropagation - fundamentally different optimization
    2. Random projections explore different feature spaces
    3. Extremely fast - can train many models for ensemble selection
    4. Often surprisingly competitive with slower methods

    The output weights are the ridge-regularized least-squares solution

        beta = (H^T H + alpha * I)^(-1) H^T T

    where H is the hidden layer output and T is the target matrix. For two
    classes T is a single column coded as -1 / +1, so that the sign of the
    network output (and hence ``sigmoid(output) > 0.5``) is the decision
    rule. For more than two classes T is one-hot.
    """

    _estimator_type = "classifier"

    def __init__(
        self,
        n_hidden: int = 500,
        activation: str | Callable = "sigmoid",
        alpha: float = 1e-6,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        self.n_hidden = n_hidden
        self.activation = activation
        self.alpha = alpha
        self.auto_scale = auto_scale
        self.random_state = random_state

        # Fitted state; populated by ``fit``.
        self.classes_: np.ndarray | None = None
        self.n_classes_: int = 0
        self.n_features_in_: int = 0
        self.input_weights_: np.ndarray | None = None
        self.biases_: np.ndarray | None = None
        self.output_weights_: np.ndarray | None = None
        self._scaler: StandardScaler | None = None
        self._label_encoder: LabelEncoder | None = None
        self._is_fitted: bool = False

    def __sklearn_is_fitted__(self) -> bool:
        """Report fitted state to ``sklearn.utils.validation.check_is_fitted``.

        The trailing-underscore attributes already exist (as ``None``) right
        after construction, so scikit-learn's default "has any ``*_``
        attribute" heuristic would treat a fresh instance as fitted.
        """
        return bool(getattr(self, "_is_fitted", False))

    def _get_activation(self) -> Callable:
        """Resolve ``self.activation`` to a callable.

        Raises
        ------
        ValueError
            If ``activation`` is a string that is not a registered name.
        """
        if callable(self.activation):
            return self.activation
        if self.activation not in ACTIVATION_FUNCTIONS:
            raise ValueError(
                f"Unknown activation: {self.activation}. "
                f"Options: {list(ACTIVATION_FUNCTIONS.keys())}"
            )
        return ACTIVATION_FUNCTIONS[self.activation]

    def _compute_hidden_output(self, X: np.ndarray) -> np.ndarray:
        """Compute hidden layer output ``H = activation(X @ W + b)``."""
        activation = self._get_activation()
        return activation(X @ self.input_weights_ + self.biases_)

    def fit(self, X, y, **fit_params) -> ELMClassifier:
        """Fit the ELM classifier.

        With n samples, m features and h hidden units, the cost is dominated
        by the random projection (O(n * m * h)), the Gram matrix ``H^T H``
        (O(n * h^2)) and the h-by-h linear solve (O(h^3)).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features. NaNs remaining after scaling are replaced by 0.
        y : array-like of shape (n_samples,)
            Target labels.
        **fit_params
            Accepted for API compatibility; ignored.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or the activation name is unknown.
        """
        rng = np.random.RandomState(self.random_state)

        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(
                f"Expected 2D array for X, got array with shape {X.shape}."
            )

        n_features = X.shape[1]
        self.n_features_in_ = n_features

        # Encode labels as integers 0..n_classes-1.
        self._label_encoder = LabelEncoder()
        y_encoded = self._label_encoder.fit_transform(y)
        self.classes_ = self._label_encoder.classes_
        self.n_classes_ = len(self.classes_)

        if self.n_classes_ > 2:
            # One-hot targets; predict_proba applies a softmax to the outputs.
            T = np.eye(self.n_classes_)[y_encoded]
        elif self.n_classes_ == 2:
            # Code the two classes as -1 / +1. predict_proba squashes the raw
            # output with a sigmoid, whose 0.5 crossing is at output == 0; with
            # 0 / 1 targets the boundary would sit at the negative class's own
            # target value and negatives would routinely be labelled positive.
            T = (2.0 * y_encoded - 1.0).reshape(-1, 1)
        else:
            # Degenerate single-class problem: a constant zero target.
            T = y_encoded.astype(np.float64).reshape(-1, 1)

        # Scale features.
        if self.auto_scale:
            self._scaler = StandardScaler()
            X_scaled = self._scaler.fit_transform(X)
        else:
            # Drop any scaler left over from an earlier fit.
            self._scaler = None
            X_scaled = X

        # Handle NaN.
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        # Random input weights and biases, uniform in [-1, 1]. Weights are
        # drawn before biases so a given seed reproduces the same network.
        self.input_weights_ = rng.uniform(-1, 1, (n_features, self.n_hidden))
        self.biases_ = rng.uniform(-1, 1, (1, self.n_hidden))

        # Hidden layer output.
        H = self._compute_hidden_output(X_scaled)

        # Ridge solution: beta = (H^T H + alpha*I)^(-1) H^T T
        HtH = H.T @ H
        regularized = HtH + self.alpha * np.eye(self.n_hidden)
        HtT = H.T @ T

        try:
            self.output_weights_ = np.linalg.solve(regularized, HtT)
        except np.linalg.LinAlgError:
            # Fall back to the pseudoinverse if the system is singular.
            self.output_weights_ = np.linalg.pinv(regularized) @ HtT

        self._is_fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        """Predict class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels, in the original label space.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("ELMClassifier has not been fitted.")

        proba = self.predict_proba(X)
        y_pred_encoded = np.argmax(proba, axis=1)
        return self._label_encoder.inverse_transform(y_pred_encoded)

    def predict_proba(self, X) -> np.ndarray:
        """Predict class probabilities.

        The values are squashed network outputs (sigmoid for two classes,
        softmax otherwise); each row sums to 1.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            Class scores, columns ordered as ``classes_``.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("ELMClassifier has not been fitted.")

        X = np.asarray(X, dtype=np.float64)

        if self.auto_scale:
            X_scaled = self._scaler.transform(X)
        else:
            X_scaled = X

        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        # Hidden layer and linear read-out.
        H = self._compute_hidden_output(X_scaled)
        output = H @ self.output_weights_

        if self.n_classes_ == 2:
            # Single output column trained on -1 / +1 targets.
            proba_pos = _sigmoid(output).ravel()
            proba = np.column_stack([1 - proba_pos, proba_pos])
        else:
            # Softmax, shifted by the row maximum for numerical stability.
            exp_output = np.exp(output - np.max(output, axis=1, keepdims=True))
            proba = exp_output / np.sum(exp_output, axis=1, keepdims=True)

        return proba
class ELMRegressor(RegressorMixin, BaseEstimator):
    """Extreme Learning Machine Regressor.

    A single-layer neural network with random input weights and analytically
    computed output weights for regression.

    Parameters
    ----------
    n_hidden : int, default=500
        Number of hidden neurons.
    activation : str or callable, default='tanh'
        Activation function. 'tanh' is the default for regression because it
        is zero-centred and symmetric (values in (-1, 1)); 'sigmoid'
        compresses to (0, 1).
    alpha : float, default=0.01
        Regularization parameter for ridge regression on output weights.
    auto_scale : bool, default=True
        Automatically standardize the features and the target before
        fitting; predictions are mapped back to the original target scale.
    random_state : int, optional
        Random seed for reproducibility.

    Attributes
    ----------
    n_features_in_ : int
        Number of features.
    input_weights_ : ndarray
        Random input-to-hidden weights, shape (n_features, n_hidden).
    biases_ : ndarray
        Random hidden layer biases, shape (1, n_hidden).
    output_weights_ : ndarray
        Learned hidden-to-output weights, shape (n_hidden, 1).

    Examples
    --------
    >>> from endgame.models.baselines import ELMRegressor
    >>> reg = ELMRegressor(n_hidden=500, random_state=42)
    >>> reg.fit(X_train, y_train)
    >>> y_pred = reg.predict(X_test)
    """

    _estimator_type = "regressor"

    def __init__(
        self,
        n_hidden: int = 500,
        activation: str | Callable = "tanh",
        alpha: float = 0.01,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        self.n_hidden = n_hidden
        self.activation = activation
        self.alpha = alpha
        self.auto_scale = auto_scale
        self.random_state = random_state

        # Fitted state; populated by ``fit``.
        self.n_features_in_: int = 0
        self.input_weights_: np.ndarray | None = None
        self.biases_: np.ndarray | None = None
        self.output_weights_: np.ndarray | None = None
        self._scaler: StandardScaler | None = None
        self._y_mean: float = 0.0
        self._y_std: float = 1.0
        self._is_fitted: bool = False

    def __sklearn_is_fitted__(self) -> bool:
        """Report fitted state to ``sklearn.utils.validation.check_is_fitted``.

        The trailing-underscore attributes already exist right after
        construction, so scikit-learn's default "has any ``*_`` attribute"
        heuristic would treat a fresh instance as fitted.
        """
        return bool(getattr(self, "_is_fitted", False))

    def _get_activation(self) -> Callable:
        """Resolve ``self.activation`` to a callable.

        Raises
        ------
        ValueError
            If ``activation`` is a string that is not a registered name.
        """
        if callable(self.activation):
            return self.activation
        if self.activation not in ACTIVATION_FUNCTIONS:
            raise ValueError(
                f"Unknown activation: {self.activation}. "
                f"Options: {list(ACTIVATION_FUNCTIONS.keys())}"
            )
        return ACTIVATION_FUNCTIONS[self.activation]

    def _compute_hidden_output(self, X: np.ndarray) -> np.ndarray:
        """Compute hidden layer output ``H = activation(X @ W + b)``."""
        activation = self._get_activation()
        return activation(X @ self.input_weights_ + self.biases_)

    def fit(self, X, y, **fit_params) -> ELMRegressor:
        """Fit the ELM regressor.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features. NaNs remaining after scaling are replaced by 0.
        y : array-like of shape (n_samples,)
            Target values. NaN targets are replaced by 0 after scaling, i.e.
            by the mean of the observed targets when ``auto_scale`` is on.
        **fit_params
            Accepted for API compatibility; ignored.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional or the activation name is unknown.
        """
        rng = np.random.RandomState(self.random_state)

        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y, dtype=np.float64).ravel()
        if X.ndim != 2:
            raise ValueError(
                f"Expected 2D array for X, got array with shape {X.shape}."
            )

        n_features = X.shape[1]
        self.n_features_in_ = n_features

        if self.auto_scale:
            self._scaler = StandardScaler()
            X_scaled = self._scaler.fit_transform(X)

            # Standardize the target using only the observed (non-NaN)
            # values. A plain mean/std would turn NaN as soon as one target
            # is missing, which zeroes every scaled target and makes every
            # later prediction NaN through the inverse transform.
            observed = y[~np.isnan(y)]
            if observed.size:
                self._y_mean = np.mean(observed)
                self._y_std = np.std(observed) + 1e-8
            else:
                self._y_mean = 0.0
                self._y_std = 1.0
            y_scaled = (y - self._y_mean) / self._y_std
        else:
            # Drop any scaler left over from an earlier fit.
            self._scaler = None
            X_scaled = X
            y_scaled = y

        # Handle NaN.
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)
        y_scaled = np.nan_to_num(y_scaled, nan=0.0)

        T = y_scaled.reshape(-1, 1)

        # Random input weights and biases, uniform in [-1, 1]. Weights are
        # drawn before biases so a given seed reproduces the same network.
        self.input_weights_ = rng.uniform(-1, 1, (n_features, self.n_hidden))
        self.biases_ = rng.uniform(-1, 1, (1, self.n_hidden))

        # Hidden layer output.
        H = self._compute_hidden_output(X_scaled)

        # Ridge solution: beta = (H^T H + alpha*I)^(-1) H^T T
        HtH = H.T @ H
        regularized = HtH + self.alpha * np.eye(self.n_hidden)
        HtT = H.T @ T

        try:
            self.output_weights_ = np.linalg.solve(regularized, HtT)
        except np.linalg.LinAlgError:
            # Fall back to the pseudoinverse if the system is singular.
            self.output_weights_ = np.linalg.pinv(regularized) @ HtT

        self._is_fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        """Predict target values.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted values, on the original target scale.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("ELMRegressor has not been fitted.")

        X = np.asarray(X, dtype=np.float64)

        if self.auto_scale:
            X_scaled = self._scaler.transform(X)
        else:
            X_scaled = X

        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        # Hidden layer and linear read-out.
        H = self._compute_hidden_output(X_scaled)
        y_pred = (H @ self.output_weights_).ravel()

        # Undo the target standardization applied in fit.
        if self.auto_scale:
            y_pred = y_pred * self._y_std + self._y_mean

        return y_pred