from __future__ import annotations
"""Ordinal Regression models for ordered categorical targets.
Ordinal regression is appropriate when the target variable has a natural
ordering (e.g., 'bad' < 'average' < 'good') but the distances between
categories are unknown or not meaningful.
Key models:
- All-Threshold (AT): The loss penalizes violations of every threshold
- Immediate-Threshold (IT): The loss penalizes only the two thresholds adjacent to the true class
- SE: Same as AT but using squared errors
- LAD: Least Absolute Deviation regression
References
----------
- Rennie & Srebro, "Loss Functions for Preference Levels" (2005)
- Pedregosa et al., "mord: A Python Package for Ordinal Regression" (2015)
- https://pythonhosted.org/mord/
"""
from typing import Any
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelEncoder, StandardScaler
# Try importing mord
_HAS_MORD = False
try:
import mord
_HAS_MORD = True
except ImportError:
pass
[docs]
class OrdinalClassifier(ClassifierMixin, BaseEstimator):
    """Unified ordinal regression classifier with automatic variant selection.

    Wraps the ordinal regression estimators of the ``mord`` library behind a
    single scikit-learn style interface. Labels are encoded to consecutive
    integers before fitting and decoded again on prediction, and features
    can optionally be standardized.

    Ordinal regression is meant for ordered categorical targets where
    standard classification ignores the ordering (e.g., rating prediction,
    grade classification, severity levels).

    Parameters
    ----------
    variant : str, default='auto'
        Ordinal regression variant:

        - 'auto': choose 'ridge' or 'at' from the data (see
          ``_detect_variant``)
        - 'at': All-Threshold logistic model (``mord.LogisticAT``)
        - 'it': Immediate-Threshold logistic model (``mord.LogisticIT``)
        - 'se': Squared-Error logistic model (``mord.LogisticSE``)
        - 'lad': Least Absolute Deviation (``mord.LAD``)
        - 'ridge': Ordinal Ridge regression (``mord.OrdinalRidge``)
    alpha : float, default=1.0
        Regularization strength. Passed as ``alpha`` to the logistic and
        ridge models and as ``C = 1 / alpha`` to LAD.
    max_iter : int, default=1000
        Maximum iterations for optimization.
    auto_scale : bool, default=True
        Whether to standardize features before fitting.
    random_state : int, optional
        Stored for API compatibility; it is currently not forwarded to any
        of the underlying mord models.

    Attributes
    ----------
    classes_ : ndarray
        Ordered class labels (``None`` before fitting).
    n_classes_ : int
        Number of classes.
    n_features_in_ : int
        Number of features seen during ``fit``.
    variant_ : str
        The variant actually used.
    model_ : mord estimator
        Fitted ordinal regression model.
    coef_ : ndarray
        Feature coefficients of the fitted model.
    theta_ : ndarray
        Class thresholds (boundaries) on the latent scale.

    Examples
    --------
    >>> from endgame.models.ordinal import OrdinalClassifier
    >>> clf = OrdinalClassifier(variant='at', alpha=1.0)
    >>> clf.fit(X_train, y_train)  # y_train has ordered labels
    >>> y_pred = clf.predict(X_test)
    >>> proba = clf.predict_proba(X_test)

    Notes
    -----
    Ordinal regression assumes:

    1. Target classes have a meaningful order
    2. A latent continuous variable underlies the ordered categories
    3. Thresholds partition this latent space into ordered categories

    The cumulative model is:
        P(Y <= j) = g(theta_j - X @ beta)
    where g is a link function (logistic, probit, etc.).
    """

    _estimator_type = "classifier"

    def __init__(
        self,
        variant: str = "auto",
        alpha: float = 1.0,
        max_iter: int = 1000,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        self.variant = variant
        self.alpha = alpha
        self.max_iter = max_iter
        self.auto_scale = auto_scale
        self.random_state = random_state
        # Placeholders for fitted state. They are kept so that existing code
        # reading e.g. ``clf.classes_`` before ``fit`` keeps working; the
        # fitted status itself is reported by ``__sklearn_is_fitted__``.
        self.classes_: np.ndarray | None = None
        self.n_classes_: int = 0
        self.n_features_in_: int = 0
        self.variant_: str | None = None
        self.model_: Any | None = None
        self._scaler: StandardScaler | None = None
        self._label_encoder: LabelEncoder | None = None
        self._is_fitted: bool = False

    def __sklearn_is_fitted__(self) -> bool:
        """Report the fitted status to ``sklearn.utils.validation.check_is_fitted``.

        Without this hook scikit-learn looks for instance attributes ending
        in an underscore, and the placeholders created in ``__init__`` would
        make an unfitted estimator look fitted.
        """
        return bool(getattr(self, "_is_fitted", False))

    def _detect_variant(self, X: np.ndarray, y: np.ndarray) -> str:
        """Auto-detect the ordinal regression variant.

        Decision logic:

        - Fewer than 1000 samples or more than 10 classes -> 'ridge'
        - Otherwise -> 'at'
        """
        n_samples = X.shape[0]
        n_classes = len(np.unique(y))
        if n_samples < 1000 or n_classes > 10:
            return "ridge"
        return "at"

    def _create_model(self):
        """Create the unfitted mord estimator for ``self.variant_``.

        Raises
        ------
        ImportError
            If mord is not installed.
        ValueError
            If ``self.variant_`` is not a known variant.
        """
        if not _HAS_MORD:
            raise ImportError(
                "Ordinal regression requires mord. "
                "Install with: pip install mord"
            )
        if self.variant_ == "at":
            return mord.LogisticAT(alpha=self.alpha, max_iter=self.max_iter)
        if self.variant_ == "it":
            return mord.LogisticIT(alpha=self.alpha, max_iter=self.max_iter)
        if self.variant_ == "se":
            return mord.LogisticSE(alpha=self.alpha, max_iter=self.max_iter)
        if self.variant_ == "lad":
            # LAD is parameterized by C, the inverse of the regularization
            # strength used everywhere else in this class.
            return mord.LAD(C=1.0 / self.alpha, max_iter=self.max_iter)
        if self.variant_ == "ridge":
            return mord.OrdinalRidge(alpha=self.alpha, max_iter=self.max_iter)
        raise ValueError(
            f"Unknown variant: {self.variant_}. "
            "Options: 'auto', 'at', 'it', 'se', 'lad', 'ridge'"
        )

    def fit(self, X, y, sample_weight=None, **fit_params) -> OrdinalClassifier:
        """Fit the ordinal regression model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Ordered target labels. They are encoded to consecutive integers
            in sorted order before being handed to the underlying model.
        sample_weight : array-like, optional
            Accepted for API compatibility; currently ignored (it is not
            forwarded to the underlying model).
        **fit_params
            Accepted for API compatibility; currently ignored.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-dimensional, if ``X`` and ``y`` disagree on
            the number of samples, or if the variant is unknown.
        ImportError
            If mord is not installed.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-dimensional (n_samples, n_features); "
                f"got an array with shape {X.shape}."
            )
        if y.ndim == 0 or y.shape[0] != X.shape[0]:
            raise ValueError(
                "X and y have inconsistent numbers of samples: "
                f"X has shape {X.shape}, y has shape {y.shape}."
            )

        # A refit that fails part-way must not leave the previous model
        # advertised as fitted alongside partially updated state.
        self._is_fitted = False
        self.n_features_in_ = X.shape[1]

        # Encode labels to consecutive integers 0..K-1 (sorted order).
        self._label_encoder = LabelEncoder()
        y_encoded = self._label_encoder.fit_transform(y)
        self.classes_ = self._label_encoder.classes_
        self.n_classes_ = len(self.classes_)

        # Scale features. ``_scaler`` is reset when scaling is off so that
        # prediction always mirrors the preprocessing used for this fit.
        if self.auto_scale:
            self._scaler = StandardScaler()
            X_scaled = self._scaler.fit_transform(X)
        else:
            self._scaler = None
            X_scaled = X
        # Replace NaN with 0 (``nan_to_num`` returns a new array).
        X_scaled = np.nan_to_num(X_scaled, nan=0.0)

        # Determine variant.
        if self.variant == "auto":
            self.variant_ = self._detect_variant(X_scaled, y_encoded)
        else:
            self.variant_ = self.variant

        # Create and fit model.
        self.model_ = self._create_model()
        self.model_.fit(X_scaled, y_encoded)
        self._is_fitted = True
        return self

    def _prepare_features(self, X) -> np.ndarray:
        """Validate fitted state and apply the fit-time preprocessing to ``X``.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("OrdinalClassifier has not been fitted.")
        X = np.asarray(X, dtype=np.float64)
        # Use the scaler learned in ``fit`` (if any) rather than the current
        # value of ``auto_scale``, which may have been changed since.
        if self._scaler is not None:
            X = self._scaler.transform(X)
        return np.nan_to_num(X, nan=0.0)

    def predict(self, X) -> np.ndarray:
        """Predict ordinal class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels, decoded back to the original labels.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        X_scaled = self._prepare_features(X)
        encoded = np.asarray(self.model_.predict(X_scaled))
        return self._label_encoder.inverse_transform(encoded.astype(int))

    def predict_proba(self, X) -> np.ndarray:
        """Predict class probabilities.

        The underlying model's own ``predict_proba`` is used when it has
        one; otherwise probabilities are derived from the cumulative model:
            P(Y = j) = P(Y <= j) - P(Y <= j-1)

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            Class probabilities.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        X_scaled = self._prepare_features(X)
        native = getattr(self.model_, "predict_proba", None)
        if native is not None:
            return native(X_scaled)
        # No native implementation: build probabilities from the latent
        # score and the thresholds.
        return self._compute_proba(X_scaled)

    def _compute_proba(self, X: np.ndarray) -> np.ndarray:
        """Compute class probabilities from the cumulative logistic model.

        P(Y = j) = sigmoid(theta_j - s) - sigmoid(theta_{j-1} - s)

        where ``s = X @ coef_ + intercept_`` is the latent score of the
        fitted model. ``X`` must already be preprocessed.
        """
        from scipy.special import expit

        # Latent score. The intercept has to be included: the
        # regression-based models fit one, and leaving it out shifts every
        # score relative to the thresholds.
        intercept = np.ravel(getattr(self.model_, "intercept_", 0.0))
        offset = float(intercept[0]) if intercept.size else 0.0
        latent = X @ np.ravel(self.coef_) + offset

        theta = np.asarray(self.theta_, dtype=np.float64)

        # P(Y <= j) for every threshold; the last column is P(Y <= K-1) = 1.
        cumulative = np.ones((X.shape[0], theta.size + 1))
        cumulative[:, :-1] = expit(theta[np.newaxis, :] - latent[:, np.newaxis])

        # P(Y = j) = P(Y <= j) - P(Y <= j-1), with P(Y <= -1) = 0.
        proba = cumulative.copy()
        proba[:, 1:] -= cumulative[:, :-1]

        # Clip for numerical stability, then renormalize rows to sum to 1.
        proba = np.clip(proba, 1e-10, 1.0)
        return proba / proba.sum(axis=1, keepdims=True)

    @property
    def coef_(self) -> np.ndarray:
        """Feature coefficients of the fitted model.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("Model not fitted.")
        return self.model_.coef_

    @property
    def theta_(self) -> np.ndarray:
        """Class thresholds (boundaries) on the latent scale.

        Returns the fitted model's ``theta_`` when it has one. The
        regression-based mord models (ridge, LAD) have no thresholds of
        their own; they predict by rounding a continuous score, so the
        boundary between encoded classes ``k`` and ``k + 1`` is ``k + 0.5``.

        Raises
        ------
        RuntimeError
            If the estimator has not been fitted.
        """
        if not self._is_fitted:
            raise RuntimeError("Model not fitted.")
        fitted_theta = getattr(self.model_, "theta_", None)
        if fitted_theta is not None:
            return fitted_theta
        return np.arange(self.n_classes_ - 1, dtype=np.float64) + 0.5
# Convenience wrappers for specific variants
[docs]
class OrdinalRidge(OrdinalClassifier):
    """Ordinal Ridge regression.

    Thin convenience wrapper: an ``OrdinalClassifier`` whose variant is
    fixed to ``'ridge'`` (L2-regularized regression on ordinal targets).

    Parameters
    ----------
    alpha : float, default=1.0
        Regularization strength.
    max_iter : int, default=1000
        Maximum iterations.
    auto_scale : bool, default=True
        Whether to standardize features.
    random_state : int, optional
        Forwarded unchanged to ``OrdinalClassifier``.

    Examples
    --------
    >>> from endgame.models.ordinal import OrdinalRidge
    >>> clf = OrdinalRidge(alpha=1.0)
    >>> clf.fit(X_train, y_train)
    >>> y_pred = clf.predict(X_test)
    """

    def __init__(
        self,
        alpha: float = 1.0,
        max_iter: int = 1000,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        # Only the variant is pinned here; every other setting is passed
        # through to the base class untouched.
        shared = {
            "alpha": alpha,
            "max_iter": max_iter,
            "auto_scale": auto_scale,
            "random_state": random_state,
        }
        super().__init__(variant="ridge", **shared)
[docs]
class LogisticAT(OrdinalClassifier):
    """All-Threshold ordinal logistic regression.

    Thin convenience wrapper: an ``OrdinalClassifier`` whose variant is
    fixed to ``'at'``. Also known as the Proportional Odds Model or
    Cumulative Logit Model.

    Parameters
    ----------
    alpha : float, default=1.0
        Regularization strength (inverse of C).
    max_iter : int, default=1000
        Maximum iterations.
    auto_scale : bool, default=True
        Whether to standardize features.
    random_state : int, optional
        Forwarded unchanged to ``OrdinalClassifier``.

    Examples
    --------
    >>> from endgame.models.ordinal import LogisticAT
    >>> clf = LogisticAT(alpha=1.0)
    >>> clf.fit(X_train, y_train)
    >>> proba = clf.predict_proba(X_test)
    """

    def __init__(
        self,
        alpha: float = 1.0,
        max_iter: int = 1000,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        # Only the variant is pinned here; every other setting is passed
        # through to the base class untouched.
        shared = {
            "alpha": alpha,
            "max_iter": max_iter,
            "auto_scale": auto_scale,
            "random_state": random_state,
        }
        super().__init__(variant="at", **shared)
[docs]
class LogisticIT(OrdinalClassifier):
    """Immediate-Threshold ordinal logistic regression.

    Thin convenience wrapper: an ``OrdinalClassifier`` whose variant is
    fixed to ``'it'``.

    Parameters
    ----------
    alpha : float, default=1.0
        Regularization strength.
    max_iter : int, default=1000
        Maximum iterations.
    auto_scale : bool, default=True
        Whether to standardize features.
    random_state : int, optional
        Forwarded unchanged to ``OrdinalClassifier``.

    Examples
    --------
    >>> from endgame.models.ordinal import LogisticIT
    >>> clf = LogisticIT(alpha=1.0)
    >>> clf.fit(X_train, y_train)
    >>> y_pred = clf.predict(X_test)
    """

    def __init__(
        self,
        alpha: float = 1.0,
        max_iter: int = 1000,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        # Only the variant is pinned here; every other setting is passed
        # through to the base class untouched.
        shared = {
            "alpha": alpha,
            "max_iter": max_iter,
            "auto_scale": auto_scale,
            "random_state": random_state,
        }
        super().__init__(variant="it", **shared)
[docs]
class LogisticSE(OrdinalClassifier):
    """Squared-Error ordinal logistic regression.

    Thin convenience wrapper: an ``OrdinalClassifier`` whose variant is
    fixed to ``'se'`` (the threshold-based logistic model trained with a
    squared-error weighting).

    Parameters
    ----------
    alpha : float, default=1.0
        Regularization strength.
    max_iter : int, default=1000
        Maximum iterations.
    auto_scale : bool, default=True
        Whether to standardize features.
    random_state : int, optional
        Forwarded unchanged to ``OrdinalClassifier``.

    Examples
    --------
    >>> from endgame.models.ordinal import LogisticSE
    >>> clf = LogisticSE(alpha=1.0)
    >>> clf.fit(X_train, y_train)
    >>> y_pred = clf.predict(X_test)
    """

    def __init__(
        self,
        alpha: float = 1.0,
        max_iter: int = 1000,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        # Only the variant is pinned here; every other setting is passed
        # through to the base class untouched.
        shared = {
            "alpha": alpha,
            "max_iter": max_iter,
            "auto_scale": auto_scale,
            "random_state": random_state,
        }
        super().__init__(variant="se", **shared)
[docs]
class LAD(OrdinalClassifier):
    """Least Absolute Deviation ordinal regression.

    Thin convenience wrapper: an ``OrdinalClassifier`` whose variant is
    fixed to ``'lad'`` (L1 / absolute-error loss rather than L2).

    Parameters
    ----------
    alpha : float, default=1.0
        Regularization strength (inverse of the C parameter).
    max_iter : int, default=1000
        Maximum iterations.
    auto_scale : bool, default=True
        Whether to standardize features.
    random_state : int, optional
        Forwarded unchanged to ``OrdinalClassifier``.

    Examples
    --------
    >>> from endgame.models.ordinal import LAD
    >>> clf = LAD(alpha=1.0)
    >>> clf.fit(X_train, y_train)
    >>> y_pred = clf.predict(X_test)
    """

    def __init__(
        self,
        alpha: float = 1.0,
        max_iter: int = 1000,
        auto_scale: bool = True,
        random_state: int | None = None,
    ):
        # Only the variant is pinned here; every other setting is passed
        # through to the base class untouched.
        shared = {
            "alpha": alpha,
            "max_iter": max_iter,
            "auto_scale": auto_scale,
            "random_state": random_state,
        }
        super().__init__(variant="lad", **shared)