# Source code for endgame.models.baselines.elm

from __future__ import annotations

"""Extreme Learning Machine implementation.

ELM is a single-layer feedforward neural network where input weights are
randomly assigned and never updated. Only the output weights are learned
via a closed-form solution (pseudoinverse), making training extremely fast.

This fundamentally different optimization (no backpropagation) provides
unique predictions that enhance ensemble diversity.

References
----------
- Huang et al., "Extreme Learning Machine: Theory and Applications" (2006)
- Huang et al., "Extreme Learning Machine for Regression and Multiclass Classification" (2012)
"""

from collections.abc import Callable

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.preprocessing import LabelEncoder, StandardScaler


def _sigmoid(x):
    """Logistic sigmoid, ``1 / (1 + exp(-x))``.

    The input is clipped to ``[-500, 500]`` before exponentiation so that
    ``np.exp`` cannot overflow in float64.
    """
    bounded = np.clip(x, -500, 500)
    denominator = 1.0 + np.exp(-bounded)
    return 1.0 / denominator


def _tanh(x):
    """Hyperbolic tangent activation (element-wise, output in (-1, 1))."""
    activated = np.tanh(x)
    return activated


def _relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``."""
    floor = 0
    rectified = np.maximum(floor, x)
    return rectified


def _leaky_relu(x, alpha=0.01):
    """Leaky ReLU: ``x`` where ``x > 0``, otherwise ``alpha * x``.

    Parameters
    ----------
    x : ndarray or scalar
        Pre-activation values.
    alpha : float, default=0.01
        Slope applied to non-positive inputs.
    """
    is_positive = x > 0
    leaked = alpha * x
    return np.where(is_positive, x, leaked)


def _sin(x):
    """Sine activation (element-wise ``sin(x)``)."""
    activated = np.sin(x)
    return activated


def _hardlim(x):
    """Hard limit (step) activation: 1.0 where ``x >= 0``, else 0.0.

    Parameters
    ----------
    x : array-like or scalar
        Pre-activation values.

    Returns
    -------
    ndarray or numpy scalar
        Float values in {0.0, 1.0} with the same shape as ``x``.
    """
    # np.greater_equal (rather than the bare ``>=`` operator) always returns
    # a NumPy object, so ``.astype`` also works for Python scalars and lists,
    # matching the other activations which accept scalar input.
    return np.greater_equal(x, 0).astype(float)


# Registry mapping each supported activation name to its implementation.
# Estimators look up string-valued ``activation`` parameters here.
ACTIVATION_FUNCTIONS = dict(
    sigmoid=_sigmoid,
    tanh=_tanh,
    relu=_relu,
    leaky_relu=_leaky_relu,
    sin=_sin,
    hardlim=_hardlim,
)



# [docs]
class ELMClassifier(ClassifierMixin, BaseEstimator):
    """Extreme Learning Machine Classifier.

    A single-layer neural network with random input weights and
    analytically computed output weights. Training is fast because
    there is no iterative optimization.

    Parameters
    ----------
    n_hidden : int, default=500
        Number of hidden neurons.
    activation : str or callable, default='sigmoid'
        Activation function: 'sigmoid', 'tanh', 'relu', 'leaky_relu',
        'sin', 'hardlim', or a callable.
    alpha : float, default=1e-6
        Regularization parameter for ridge regression.
    auto_scale : bool, default=True
        Automatically scale features before fitting.
    random_state : int, optional
        Random seed for reproducibility.

    Attributes
    ----------
    classes_ : ndarray
        Unique class labels.
    n_classes_ : int
        Number of classes seen during ``fit``.
    n_features_in_ : int
        Number of features.
    input_weights_ : ndarray
        Random input-to-hidden weights.
    biases_ : ndarray
        Random hidden layer biases.
    output_weights_ : ndarray
        Learned hidden-to-output weights.

    Examples
    --------
    >>> from endgame.models.baselines import ELMClassifier
    >>> clf = ELMClassifier(n_hidden=500, random_state=42)
    >>> clf.fit(X_train, y_train)
    >>> proba = clf.predict_proba(X_test)

    Notes
    -----
    ELM is valuable for ensemble diversity because:
    1. No backpropagation - fundamentally different optimization
    2. Random projections explore different feature spaces
    3. Extremely fast - can train many models for ensemble selection
    4. Often surprisingly competitive with slower methods

    The analytical (ridge) solution is:
    beta = (H^T H + alpha * I)^(-1) H^T T
    where H is the hidden layer output and T is the target matrix.

    Binary problems are trained against targets in {-1, +1} so that the
    sigmoid applied in ``predict_proba`` places the 0.5 probability
    threshold exactly at the regression decision boundary (output 0).
    Multi-class problems use one-hot targets followed by a softmax.
    The resulting probabilities are not calibrated.
    """

    _estimator_type = "classifier"

    def __init__(
        self,
        n_hidden: int = 500,
        activation: str | Callable = "sigmoid",
        alpha: float = 1e-6,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        self.n_hidden = n_hidden
        self.activation = activation
        self.alpha = alpha
        self.auto_scale = auto_scale
        self.random_state = random_state

        # Fitted state. These placeholders are kept for backward
        # compatibility; the fitted/unfitted status is tracked by
        # ``_is_fitted`` (see ``__sklearn_is_fitted__``).
        self.classes_: np.ndarray | None = None
        self.n_classes_: int = 0
        self.n_features_in_: int = 0
        self.input_weights_: np.ndarray | None = None
        self.biases_: np.ndarray | None = None
        self.output_weights_: np.ndarray | None = None
        self._scaler: StandardScaler | None = None
        self._label_encoder: LabelEncoder | None = None
        self._is_fitted: bool = False

    def __sklearn_is_fitted__(self) -> bool:
        """Report fitted status to ``sklearn.utils.validation.check_is_fitted``.

        Needed because ``__init__`` creates the trailing-underscore
        attributes up front, which would otherwise make scikit-learn
        treat an unfitted estimator as fitted.
        """
        return bool(self._is_fitted)

    def _get_activation(self) -> Callable:
        """Resolve ``self.activation`` to a callable.

        Raises
        ------
        ValueError
            If ``activation`` is a string that is not a known activation name.
        """
        if callable(self.activation):
            return self.activation
        if self.activation not in ACTIVATION_FUNCTIONS:
            raise ValueError(f"Unknown activation: {self.activation}. "
                           f"Options: {list(ACTIVATION_FUNCTIONS.keys())}")
        return ACTIVATION_FUNCTIONS[self.activation]

    def _compute_hidden_output(self, X: np.ndarray) -> np.ndarray:
        """Compute hidden layer output ``H = activation(X @ W + b)``."""
        activation = self._get_activation()
        return activation(X @ self.input_weights_ + self.biases_)

    def _transform_features(self, X) -> np.ndarray:
        """Validate prediction input and apply the training-time preprocessing.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or its feature count differs from training.
        """
        X = np.asarray(X, dtype=np.float64)
        if X.ndim != 2:
            raise ValueError(
                f"Expected 2D array for X, got {X.ndim}D array instead."
            )
        if X.shape[1] != self.n_features_in_:
            raise ValueError(
                f"X has {X.shape[1]} features, but ELMClassifier was fitted "
                f"with {self.n_features_in_} features."
            )
        # Use the scaler that was actually fitted, so prediction always
        # mirrors training even if ``auto_scale`` was changed after ``fit``.
        if self._scaler is not None:
            X = self._scaler.transform(X)
        # Remaining NaNs become 0, i.e. the feature mean when scaling is on.
        return np.nan_to_num(X, nan=0.0)

    def fit(self, X, y, **fit_params) -> ELMClassifier:
        """Fit the ELM classifier.

        The cost is O(n*m*h) for the hidden projection plus O(n*h^2 + h^3)
        for forming and solving the ridge system, where n=samples,
        m=features, h=hidden units.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Target labels.
        **fit_params
            Accepted for API compatibility; ignored.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``X`` and ``y`` disagree on the number of
            samples, or the activation name is unknown.
        """
        rng = np.random.RandomState(self.random_state)

        X = np.asarray(X, dtype=np.float64)
        if X.ndim != 2:
            raise ValueError(
                f"Expected 2D array for X, got {X.ndim}D array instead."
            )
        y = np.asarray(y)

        n_samples, n_features = X.shape

        # Encode labels to integer indices 0..n_classes-1
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)
        if y_encoded.shape[0] != n_samples:
            raise ValueError(
                f"X and y have inconsistent numbers of samples: "
                f"{n_samples} != {y_encoded.shape[0]}"
            )

        # A failed refit must not leave the estimator flagged as fitted.
        self._is_fitted = False
        self.n_features_in_ = n_features
        self._label_encoder = label_encoder
        self.classes_ = label_encoder.classes_
        self.n_classes_ = len(self.classes_)

        if self.n_classes_ == 2:
            # Symmetric {-1, +1} targets: sigmoid(output) > 0.5 exactly when
            # the regression output is closer to the positive class.
            T = (2.0 * y_encoded - 1.0).reshape(-1, 1)
        elif self.n_classes_ > 2:
            # One-hot targets for multi-class
            T = np.eye(self.n_classes_)[y_encoded]
        else:
            # Degenerate single-class problem: constant zero target
            T = y_encoded.reshape(-1, 1).astype(np.float64)

        # Scale features
        if self.auto_scale:
            self._scaler = StandardScaler()
            X_scaled = self._scaler.fit_transform(X)
        else:
            self._scaler = None
            X_scaled = X

        # Handle NaN
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        # Random input weights and biases, uniform in [-1, 1); never updated
        self.input_weights_ = rng.uniform(-1, 1, (n_features, self.n_hidden))
        self.biases_ = rng.uniform(-1, 1, (1, self.n_hidden))

        # Compute hidden layer output
        H = self._compute_hidden_output(X_scaled)

        # Ridge solution: beta = (H^T H + alpha*I)^(-1) H^T T
        regularized = H.T @ H + self.alpha * np.eye(self.n_hidden)
        HtT = H.T @ T

        try:
            self.output_weights_ = np.linalg.solve(regularized, HtT)
        except np.linalg.LinAlgError:
            # Fall back to pseudoinverse if singular
            self.output_weights_ = np.linalg.pinv(regularized) @ HtT

        self._is_fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        """Predict class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("ELMClassifier has not been fitted.")

        proba = self.predict_proba(X)
        y_pred_encoded = np.argmax(proba, axis=1)
        return self._label_encoder.inverse_transform(y_pred_encoded)

    def predict_proba(self, X) -> np.ndarray:
        """Predict class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            Class scores normalized to sum to 1 per row (sigmoid for
            binary problems, softmax otherwise). Not calibrated.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        ValueError
            If ``X`` is not 2-D or has the wrong number of features.
        """
        if not self._is_fitted:
            raise RuntimeError("ELMClassifier has not been fitted.")

        X_scaled = self._transform_features(X)

        # Compute hidden layer and output
        H = self._compute_hidden_output(X_scaled)
        output = H @ self.output_weights_

        if self.n_classes_ == 2:
            # Binary: single output column trained on {-1, +1} targets
            proba_pos = _sigmoid(output).ravel()
            proba = np.column_stack([1 - proba_pos, proba_pos])
        else:
            # Softmax for multi-class (max-shifted for numerical stability)
            exp_output = np.exp(output - np.max(output, axis=1, keepdims=True))
            proba = exp_output / np.sum(exp_output, axis=1, keepdims=True)

        return proba





# [docs]
class ELMRegressor(RegressorMixin, BaseEstimator):
    """Extreme Learning Machine Regressor.

    A single-layer neural network with random input weights and
    analytically computed output weights for regression.

    Parameters
    ----------
    n_hidden : int, default=500
        Number of hidden neurons.
    activation : str or callable, default='tanh'
        Activation function. 'tanh' is preferred for regression
        (zero-centered, symmetric, bounded in (-1, 1)). 'sigmoid'
        compresses to (0, 1).
    alpha : float, default=0.01
        Regularization parameter for ridge regression on output weights.
    auto_scale : bool, default=True
        Automatically standardize features and the target before fitting.
        Predictions are mapped back to the original target scale.
    random_state : int, optional
        Random seed for reproducibility.

    Attributes
    ----------
    n_features_in_ : int
        Number of features.
    input_weights_ : ndarray
        Random input-to-hidden weights.
    biases_ : ndarray
        Random hidden layer biases.
    output_weights_ : ndarray
        Learned hidden-to-output weights.

    Examples
    --------
    >>> from endgame.models.baselines import ELMRegressor
    >>> reg = ELMRegressor(n_hidden=500, random_state=42)
    >>> reg.fit(X_train, y_train)
    >>> y_pred = reg.predict(X_test)
    """

    _estimator_type = "regressor"

    def __init__(
        self,
        n_hidden: int = 500,
        activation: str | Callable = "tanh",
        alpha: float = 0.01,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        self.n_hidden = n_hidden
        self.activation = activation
        self.alpha = alpha
        self.auto_scale = auto_scale
        self.random_state = random_state

        # Fitted state. These placeholders are kept for backward
        # compatibility; the fitted/unfitted status is tracked by
        # ``_is_fitted`` (see ``__sklearn_is_fitted__``).
        self.n_features_in_: int = 0
        self.input_weights_: np.ndarray | None = None
        self.biases_: np.ndarray | None = None
        self.output_weights_: np.ndarray | None = None
        self._scaler: StandardScaler | None = None
        self._y_mean: float = 0.0
        self._y_std: float = 1.0
        self._is_fitted: bool = False

    def __sklearn_is_fitted__(self) -> bool:
        """Report fitted status to ``sklearn.utils.validation.check_is_fitted``.

        Needed because ``__init__`` creates the trailing-underscore
        attributes up front, which would otherwise make scikit-learn
        treat an unfitted estimator as fitted.
        """
        return bool(self._is_fitted)

    def _get_activation(self) -> Callable:
        """Resolve ``self.activation`` to a callable.

        Raises
        ------
        ValueError
            If ``activation`` is a string that is not a known activation name.
        """
        if callable(self.activation):
            return self.activation
        if self.activation not in ACTIVATION_FUNCTIONS:
            raise ValueError(f"Unknown activation: {self.activation}. "
                           f"Options: {list(ACTIVATION_FUNCTIONS.keys())}")
        return ACTIVATION_FUNCTIONS[self.activation]

    def _compute_hidden_output(self, X: np.ndarray) -> np.ndarray:
        """Compute hidden layer output ``H = activation(X @ W + b)``."""
        activation = self._get_activation()
        return activation(X @ self.input_weights_ + self.biases_)

    def fit(self, X, y, **fit_params) -> ELMRegressor:
        """Fit the ELM regressor.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Target values. NaN targets are tolerated: they are ignored when
            computing the target mean/std and then replaced by 0 (the mean,
            when ``auto_scale`` is on).
        **fit_params
            Accepted for API compatibility; ignored.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, ``X`` and ``y`` disagree on the number of
            samples, or the activation name is unknown.
        """
        rng = np.random.RandomState(self.random_state)

        X = np.asarray(X, dtype=np.float64)
        if X.ndim != 2:
            raise ValueError(
                f"Expected 2D array for X, got {X.ndim}D array instead."
            )
        y = np.asarray(y, dtype=np.float64).ravel()

        n_samples, n_features = X.shape
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y have inconsistent numbers of samples: "
                f"{n_samples} != {y.shape[0]}"
            )

        # A failed refit must not leave the estimator flagged as fitted.
        self._is_fitted = False
        self.n_features_in_ = n_features

        if self.auto_scale:
            self._scaler = StandardScaler()
            X_scaled = self._scaler.fit_transform(X)
            # Standardize the target as well. Statistics are taken over the
            # non-NaN targets only: a single NaN would otherwise turn the
            # mean/std into NaN, zero out every target and make all
            # predictions NaN.
            observed = y[~np.isnan(y)]
            if observed.size:
                self._y_mean = float(np.mean(observed))
                # Small epsilon avoids division by zero for constant targets
                self._y_std = float(np.std(observed)) + 1e-8
            else:
                self._y_mean = 0.0
                self._y_std = 1.0
            y_scaled = (y - self._y_mean) / self._y_std
        else:
            self._scaler = None
            self._y_mean = 0.0
            self._y_std = 1.0
            X_scaled = X
            y_scaled = y

        # Handle NaN
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)
        y_scaled = np.nan_to_num(y_scaled, nan=0.0)

        T = y_scaled.reshape(-1, 1)

        # Random input weights and biases, uniform in [-1, 1); never updated
        self.input_weights_ = rng.uniform(-1, 1, (n_features, self.n_hidden))
        self.biases_ = rng.uniform(-1, 1, (1, self.n_hidden))

        # Compute hidden layer output
        H = self._compute_hidden_output(X_scaled)

        # Ridge solution: beta = (H^T H + alpha*I)^(-1) H^T T
        regularized = H.T @ H + self.alpha * np.eye(self.n_hidden)
        HtT = H.T @ T

        try:
            self.output_weights_ = np.linalg.solve(regularized, HtT)
        except np.linalg.LinAlgError:
            # Fall back to pseudoinverse if singular
            self.output_weights_ = np.linalg.pinv(regularized) @ HtT

        self._is_fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        """Predict target values.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted values, on the original target scale.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        ValueError
            If ``X`` is not 2-D or has the wrong number of features.
        """
        if not self._is_fitted:
            raise RuntimeError("ELMRegressor has not been fitted.")

        X = np.asarray(X, dtype=np.float64)
        if X.ndim != 2:
            raise ValueError(
                f"Expected 2D array for X, got {X.ndim}D array instead."
            )
        if X.shape[1] != self.n_features_in_:
            raise ValueError(
                f"X has {X.shape[1]} features, but ELMRegressor was fitted "
                f"with {self.n_features_in_} features."
            )

        # Mirror the preprocessing that was actually used in ``fit``, even
        # if ``auto_scale`` was changed afterwards.
        scaled_during_fit = self._scaler is not None
        X_scaled = self._scaler.transform(X) if scaled_during_fit else X
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        # Compute prediction
        H = self._compute_hidden_output(X_scaled)
        y_pred = (H @ self.output_weights_).ravel()

        # Map back from the standardized target scale
        if scaled_during_fit:
            y_pred = y_pred * self._y_std + self._y_mean

        return y_pred