Source code for endgame.models.ordinal.ordinal

from __future__ import annotations

"""Ordinal Regression models for ordered categorical targets.

Ordinal regression is appropriate when the target variable has a natural
ordering (e.g., 'bad' < 'average' < 'good') but the distances between
categories are unknown or not meaningful.

Key models:
- All-Threshold (AT): Each class boundary has its own threshold
- Immediate-Threshold (IT): Adjacent classes share boundaries
- SE: Same as AT but using squared errors
- LAD: Least Absolute Deviation regression

References
----------
- Rennie & Srebro, "Loss Functions for Preference Levels" (2005)
- Pedregosa et al., "mord: A Python Package for Ordinal Regression" (2015)
- https://pythonhosted.org/mord/
"""

from typing import Any

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Try importing mord
_HAS_MORD = False
try:
    import mord
    _HAS_MORD = True
except ImportError:
    pass



[docs]
class OrdinalClassifier(ClassifierMixin, BaseEstimator):
    """Unified Ordinal Regression Classifier with auto-variant selection.

    Wraps mord library ordinal regression methods with automatic model
    selection based on data characteristics.

    Ordinal regression is critical for ordered categorical targets where
    standard classification ignores the ordering (e.g., rating prediction,
    grade classification, severity levels).

    Parameters
    ----------
    variant : str, default='auto'
        Ordinal regression variant:
        - 'auto': Automatically select based on data
        - 'at': All-Threshold (LogisticAT) - most common
        - 'it': Immediate-Threshold (LogisticIT)
        - 'se': All-Threshold with squared errors (LogisticSE)
        - 'lad': Least Absolute Deviation
        - 'ridge': Ordinal Ridge regression
    alpha : float, default=1.0
        Regularization strength (inverse of C for the 'lad' variant,
        passed as ``alpha`` to the other variants).
    max_iter : int, default=1000
        Maximum iterations for optimization.
    auto_scale : bool, default=True
        Whether to standardize features before fitting.
    random_state : int, optional
        Random seed. Stored as a parameter only; this class does not read
        it anywhere.

    Attributes
    ----------
    classes_ : ndarray
        Ordered class labels (``None`` before fitting).
    n_classes_ : int
        Number of classes (``0`` before fitting).
    n_features_in_ : int
        Number of features (``0`` before fitting).
    variant_ : str
        The actual variant used.
    model_ : mord estimator
        Fitted ordinal regression model.
    coef_ : ndarray
        Feature coefficients.
    theta_ : ndarray
        Class thresholds (boundaries).

    Examples
    --------
    >>> from endgame.models.ordinal import OrdinalClassifier
    >>> clf = OrdinalClassifier(variant='at', alpha=1.0)
    >>> clf.fit(X_train, y_train)  # y_train has ordered labels
    >>> y_pred = clf.predict(X_test)
    >>> proba = clf.predict_proba(X_test)

    Notes
    -----
    Ordinal regression assumes:
    1. Target classes have a meaningful order
    2. A latent continuous variable underlies the ordered categories
    3. Thresholds partition this latent space into ordered categories

    The cumulative model is:
        P(Y <= j) = g(theta_j - X @ beta)
    where g is a link function (logistic, probit, etc.).
    """

    _estimator_type = "classifier"

    def __init__(
        self,
        variant: str = "auto",
        alpha: float = 1.0,
        max_iter: int = 1000,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        # Hyper-parameters are stored verbatim under their own names.
        self.variant = variant
        self.alpha = alpha
        self.max_iter = max_iter
        self.auto_scale = auto_scale
        self.random_state = random_state

        # Placeholders for fitted state. Because these trailing-underscore
        # attributes exist before fit(), attribute-based fitted checks cannot
        # be trusted; __sklearn_is_fitted__ below reports the real status.
        self.classes_: np.ndarray | None = None
        self.n_classes_: int = 0
        self.n_features_in_: int = 0
        self.variant_: str | None = None
        self.model_: Any | None = None
        self._scaler: StandardScaler | None = None
        self._label_encoder: LabelEncoder | None = None
        self._is_fitted: bool = False

    def __sklearn_is_fitted__(self) -> bool:
        """Report fitted status to ``sklearn.utils.validation.check_is_fitted``.

        Returns
        -------
        bool
            The value of the internal ``_is_fitted`` flag, or ``False`` when
            the flag is missing (e.g. ``__init__`` was bypassed).
        """
        return bool(getattr(self, "_is_fitted", False))

    def _detect_variant(self, X: np.ndarray, y: np.ndarray) -> str:
        """Auto-detect the best ordinal regression variant.

        Decision logic:
        - Small dataset (n < 1000) -> ridge (faster, more stable)
        - Large dataset -> at (more flexible)
        - Many classes (> 10) -> ridge (fewer parameters)
        """
        n_samples = X.shape[0]
        n_classes = len(np.unique(y))

        if n_samples < 1000 or n_classes > 10:
            return "ridge"
        else:
            return "at"

    def _create_model(self):
        """Instantiate the unfitted mord estimator named by ``self.variant_``.

        Returns
        -------
        estimator
            A new mord model configured from ``alpha`` and ``max_iter``.

        Raises
        ------
        ImportError
            If mord could not be imported.
        ValueError
            If ``self.variant_`` is not a known variant, or the 'lad'
            variant is requested with a non-positive ``alpha``.
        """
        if not _HAS_MORD:
            raise ImportError(
                "Ordinal regression requires mord. "
                "Install with: pip install mord"
            )

        variant = self.variant_

        if variant == "lad":
            # LAD is parameterised by C = 1 / alpha, so alpha must be
            # strictly positive; fail with a clear message instead of a
            # bare ZeroDivisionError.
            if not self.alpha > 0:
                raise ValueError(
                    f"alpha must be > 0 for variant 'lad'; got {self.alpha!r}."
                )
            return mord.LAD(C=1.0 / self.alpha, max_iter=self.max_iter)

        # Variants constructed as Model(alpha=..., max_iter=...). The class is
        # looked up by name so only the requested one is touched on mord.
        alpha_based = {
            "at": "LogisticAT",
            "it": "LogisticIT",
            "se": "LogisticSE",
            "ridge": "OrdinalRidge",
        }
        class_name = alpha_based.get(variant)
        if class_name is None:
            raise ValueError(
                f"Unknown variant: {self.variant_}. "
                "Options: 'auto', 'at', 'it', 'se', 'lad', 'ridge'"
            )
        model_cls = getattr(mord, class_name)
        return model_cls(alpha=self.alpha, max_iter=self.max_iter)


[docs]
    def fit(self, X, y, sample_weight=None, **fit_params) -> OrdinalClassifier:
        """Fit the ordinal regression model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features. After optional standardisation, NaN entries
            are replaced by 0.0 (``np.nan_to_num`` also maps +/-inf to the
            largest finite floats).
        y : array-like of shape (n_samples,)
            Ordered target labels. Labels are encoded with ``LabelEncoder``,
            which assigns codes by sorted label value. Integer labels keep
            their order; string labels are ordered lexicographically, which
            may differ from the intended ordinal order -- map such labels to
            integers before calling ``fit``.
        sample_weight : array-like, optional
            Not forwarded to the underlying model. A ``UserWarning`` is
            emitted when a value is supplied.
        **fit_params
            Ignored. A ``UserWarning`` is emitted when any are supplied.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2D, has no samples, or ``y`` does not provide
            exactly one label per row of ``X``.
        """
        import warnings

        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)

        # Validate shapes up front so failures are explicit rather than a
        # tuple-unpacking error or an obscure error from a downstream library.
        if X.ndim != 2:
            raise ValueError(
                "X must be a 2D array of shape (n_samples, n_features); "
                f"got an array with {X.ndim} dimension(s)."
            )
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("Cannot fit OrdinalClassifier with 0 samples.")
        y_len = 0 if y.ndim == 0 else y.shape[0]
        if y_len != n_samples:
            raise ValueError(
                "X and y have inconsistent numbers of samples: "
                f"{n_samples} != {y_len}."
            )

        if sample_weight is not None or fit_params:
            # These arguments are accepted for API compatibility only.
            warnings.warn(
                "OrdinalClassifier.fit ignores sample_weight and extra "
                "fit parameters.",
                UserWarning,
                stacklevel=2,
            )

        self.n_features_in_ = n_features

        # Encode labels to consecutive integers
        self._label_encoder = LabelEncoder()
        y_encoded = self._label_encoder.fit_transform(y)
        self.classes_ = self._label_encoder.classes_
        self.n_classes_ = len(self.classes_)

        # Scale features; drop any scaler left over from a previous fit when
        # scaling is disabled.
        if self.auto_scale:
            self._scaler = StandardScaler()
            X_scaled = self._scaler.fit_transform(X)
        else:
            self._scaler = None
            X_scaled = X

        # Handle NaN. nan_to_num returns a new array by default, so the
        # caller's data is never modified and no explicit copy is needed.
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        # Determine variant
        if self.variant == "auto":
            self.variant_ = self._detect_variant(X_scaled, y_encoded)
        else:
            self.variant_ = self.variant

        # Create and fit model
        self.model_ = self._create_model()
        self.model_.fit(X_scaled, y_encoded)

        self._is_fitted = True
        return self



[docs]
    def predict(self, X) -> np.ndarray:
        """Predict ordinal class labels.

        Applies the same preprocessing as training (optional scaling, then
        NaN -> 0.0), asks the fitted model for integer class codes, and maps
        them back to the original labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("OrdinalClassifier has not been fitted.")

        features = np.asarray(X, dtype=np.float64)
        features = (
            self._scaler.transform(features)
            if self.auto_scale
            else features.copy()
        )
        features = np.nan_to_num(features, nan=0.0)

        # The underlying model may return float codes; cast before decoding.
        encoded = self.model_.predict(features).astype(int)
        return self._label_encoder.inverse_transform(encoded)



[docs]
    def predict_proba(self, X) -> np.ndarray:
        """Predict class probabilities.

        Uses the fitted model's own ``predict_proba`` when it has one;
        otherwise derives probabilities from the cumulative model via
        ``_compute_proba``:
            P(Y = j) = P(Y <= j) - P(Y <= j-1)

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            Class probabilities.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("OrdinalClassifier has not been fitted.")

        features = np.asarray(X, dtype=np.float64)
        features = (
            self._scaler.transform(features)
            if self.auto_scale
            else features.copy()
        )
        features = np.nan_to_num(features, nan=0.0)

        # Fall back to coefficients and thresholds when the wrapped model
        # offers no native probability estimate.
        if not hasattr(self.model_, 'predict_proba'):
            return self._compute_proba(features)
        return self.model_.predict_proba(features)


    def _compute_proba(self, X: np.ndarray) -> np.ndarray:
        """Compute class probabilities from cumulative model.

        P(Y = j) = sigmoid(theta_j - X@beta) - sigmoid(theta_{j-1} - X@beta)
        """
        from scipy.special import expit

        # Linear predictions
        linear = X @ self.coef_.ravel()

        # Thresholds
        theta = self.theta_

        # Cumulative probabilities
        n_samples = X.shape[0]
        n_classes = len(theta) + 1
        proba = np.zeros((n_samples, n_classes))

        # P(Y <= j) for each threshold
        cumprob = np.zeros((n_samples, n_classes))
        cumprob[:, -1] = 1.0  # P(Y <= K-1) = 1

        for j in range(n_classes - 1):
            cumprob[:, j] = expit(theta[j] - linear)

        # P(Y = j) = P(Y <= j) - P(Y <= j-1)
        proba[:, 0] = cumprob[:, 0]
        for j in range(1, n_classes):
            proba[:, j] = cumprob[:, j] - cumprob[:, j - 1]

        # Clip for numerical stability
        proba = np.clip(proba, 1e-10, 1.0)
        proba = proba / proba.sum(axis=1, keepdims=True)

        return proba

    @property
    def coef_(self) -> np.ndarray:
        """Coefficients of the fitted underlying model.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        if self._is_fitted:
            return self.model_.coef_
        raise RuntimeError("Model not fitted.")

    @property
    def theta_(self) -> np.ndarray:
        """Class thresholds (boundaries) on the latent scale.

        Returns the fitted model's own ``theta_`` when it has one. Otherwise
        returns the half-integer midpoints ``0.5, 1.5, ..., K - 1.5`` between
        consecutive encoded classes ``0 .. K-1``.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("Model not fitted.")

        learned = getattr(self.model_, "theta_", None)
        if learned is not None:
            return learned

        # NOTE(review): models without thresholds are assumed to predict by
        # rounding a continuous score to the nearest class code (as mord's
        # regression-based estimators are understood to do) -- confirm. Under
        # that assumption the boundary between classes k and k+1 is k + 0.5,
        # not k.
        return np.arange(self.n_classes_ - 1, dtype=float) + 0.5



# Convenience wrappers for specific variants


[docs]
class OrdinalRidge(OrdinalClassifier):
    """Ordinal classifier pinned to the ``'ridge'`` variant.

    Convenience subclass of ``OrdinalClassifier`` whose constructor always
    selects ``variant="ridge"`` and forwards every other argument unchanged.

    Parameters
    ----------
    alpha : float, default=1.0
        Regularization strength.
    max_iter : int, default=1000
        Maximum iterations.
    auto_scale : bool, default=True
        Whether to standardize features.
    random_state : int, optional
        Forwarded to the base-class constructor.

    Examples
    --------
    >>> from endgame.models.ordinal import OrdinalRidge
    >>> clf = OrdinalRidge(alpha=1.0)
    >>> clf.fit(X_train, y_train)
    >>> y_pred = clf.predict(X_test)
    """

    def __init__(
        self,
        alpha: float = 1.0,
        max_iter: int = 1000,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        # Only the variant is fixed here; the rest is passed straight through.
        shared = {
            "alpha": alpha,
            "max_iter": max_iter,
            "auto_scale": auto_scale,
            "random_state": random_state,
        }
        super().__init__(variant="ridge", **shared)




[docs]
class LogisticAT(OrdinalClassifier):
    """Ordinal classifier pinned to the ``'at'`` (All-Threshold) variant.

    Convenience subclass of ``OrdinalClassifier`` whose constructor always
    selects ``variant="at"`` and forwards every other argument unchanged.

    Also known as: Proportional Odds Model, Cumulative Logit Model.

    Parameters
    ----------
    alpha : float, default=1.0
        Regularization strength (inverse of C).
    max_iter : int, default=1000
        Maximum iterations.
    auto_scale : bool, default=True
        Whether to standardize features.
    random_state : int, optional
        Forwarded to the base-class constructor.

    Examples
    --------
    >>> from endgame.models.ordinal import LogisticAT
    >>> clf = LogisticAT(alpha=1.0)
    >>> clf.fit(X_train, y_train)
    >>> proba = clf.predict_proba(X_test)
    """

    def __init__(
        self,
        alpha: float = 1.0,
        max_iter: int = 1000,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        # Only the variant is fixed here; the rest is passed straight through.
        shared = {
            "alpha": alpha,
            "max_iter": max_iter,
            "auto_scale": auto_scale,
            "random_state": random_state,
        }
        super().__init__(variant="at", **shared)




[docs]
class LogisticIT(OrdinalClassifier):
    """Ordinal classifier pinned to the ``'it'`` (Immediate-Threshold) variant.

    Convenience subclass of ``OrdinalClassifier`` whose constructor always
    selects ``variant="it"`` and forwards every other argument unchanged.

    Parameters
    ----------
    alpha : float, default=1.0
        Regularization strength.
    max_iter : int, default=1000
        Maximum iterations.
    auto_scale : bool, default=True
        Whether to standardize features.
    random_state : int, optional
        Forwarded to the base-class constructor.

    Examples
    --------
    >>> from endgame.models.ordinal import LogisticIT
    >>> clf = LogisticIT(alpha=1.0)
    >>> clf.fit(X_train, y_train)
    >>> y_pred = clf.predict(X_test)
    """

    def __init__(
        self,
        alpha: float = 1.0,
        max_iter: int = 1000,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        # Only the variant is fixed here; the rest is passed straight through.
        shared = {
            "alpha": alpha,
            "max_iter": max_iter,
            "auto_scale": auto_scale,
            "random_state": random_state,
        }
        super().__init__(variant="it", **shared)




[docs]
class LogisticSE(OrdinalClassifier):
    """Ordinal classifier pinned to the ``'se'`` (Squared-Error) variant.

    Convenience subclass of ``OrdinalClassifier`` whose constructor always
    selects ``variant="se"`` and forwards every other argument unchanged.

    Parameters
    ----------
    alpha : float, default=1.0
        Regularization strength.
    max_iter : int, default=1000
        Maximum iterations.
    auto_scale : bool, default=True
        Whether to standardize features.
    random_state : int, optional
        Forwarded to the base-class constructor.

    Examples
    --------
    >>> from endgame.models.ordinal import LogisticSE
    >>> clf = LogisticSE(alpha=1.0)
    >>> clf.fit(X_train, y_train)
    >>> y_pred = clf.predict(X_test)
    """

    def __init__(
        self,
        alpha: float = 1.0,
        max_iter: int = 1000,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        # Only the variant is fixed here; the rest is passed straight through.
        shared = {
            "alpha": alpha,
            "max_iter": max_iter,
            "auto_scale": auto_scale,
            "random_state": random_state,
        }
        super().__init__(variant="se", **shared)




[docs]
class LAD(OrdinalClassifier):
    """Ordinal classifier pinned to the ``'lad'`` variant.

    Convenience subclass of ``OrdinalClassifier`` (Least Absolute Deviation)
    whose constructor always selects ``variant="lad"`` and forwards every
    other argument unchanged.

    Parameters
    ----------
    alpha : float, default=1.0
        Regularization strength (inverse of C parameter).
    max_iter : int, default=1000
        Maximum iterations.
    auto_scale : bool, default=True
        Whether to standardize features.
    random_state : int, optional
        Forwarded to the base-class constructor.

    Examples
    --------
    >>> from endgame.models.ordinal import LAD
    >>> clf = LAD(alpha=1.0)
    >>> clf.fit(X_train, y_train)
    >>> y_pred = clf.predict(X_test)
    """

    def __init__(
        self,
        alpha: float = 1.0,
        max_iter: int = 1000,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        # Only the variant is fixed here; the rest is passed straight through.
        shared = {
            "alpha": alpha,
            "max_iter": max_iter,
            "auto_scale": auto_scale,
            "random_state": random_state,
        }
        super().__init__(variant="lad", **shared)