# Source code for endgame.models.symbolic.symbolic_classifier

"""Symbolic Classification via logistic transformation of symbolic regression.

Binary: fits symbolic regression on log-odds, applies sigmoid.
Multiclass: one-vs-rest with softmax over symbolic regressors.
"""

from __future__ import annotations

import numpy as np
from numpy.typing import NDArray
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.validation import check_array, check_is_fitted, check_X_y

from endgame.models.symbolic.symbolic_regressor import SymbolicRegressor


class SymbolicClassifier(BaseEstimator, ClassifierMixin):
    """Symbolic classification via a logistic transform of symbolic regression.

    Binary problems fit a single ``SymbolicRegressor`` on targets of -3 (first
    class) and +3 (second class); its prediction is treated as a logit and
    passed through a sigmoid. Problems with any other number of classes fit
    one regressor per class on the same -3/+3 encoding (one-vs-rest) and
    normalise the per-class scores with a softmax.

    Parameters
    ----------
    preset, operators, binary_operators, unary_operators, niterations, \
maxsize, maxdepth, populations, population_size, parsimony, model_selection, \
constraints, nested_constraints, denoise, select_k_features, turbo, \
parallelism, procs, random_state, verbosity, temp_equation_file, \
output_directory
        Forwarded unchanged to every underlying ``SymbolicRegressor``; see
        that class for their meaning. The regressor's ``loss`` is always set
        to ``"L2DistLoss()"`` and is not configurable here. ``verbosity > 0``
        additionally prints a progress line per class in multiclass fits.
    threshold : float, default=0.5
        Binary classification only: the second class (``classes_[1]``) is
        predicted when its probability is strictly greater than this value.

    Attributes
    ----------
    model_ : SymbolicRegressor or list of SymbolicRegressor
        The single fitted regressor (binary) or one fitted regressor per
        class, ordered like ``classes_`` (otherwise).
    classes_ : ndarray
        Unique class labels found in ``y``.
    n_classes_ : int
        Number of classes.
    n_features_in_ : int
        Number of columns of ``X`` seen in ``fit``.
    feature_names_in_ : ndarray
        Column labels of ``X`` when it exposed a ``columns`` attribute,
        otherwise the generated names ``x0, x1, ...``.
    """

    _estimator_type = "classifier"

    def __init__(
        self,
        preset: str = "default",
        operators: str | dict[str, list[str]] = "scientific",
        binary_operators: list[str] | None = None,
        unary_operators: list[str] | None = None,
        niterations: int | None = None,
        maxsize: int | None = None,
        maxdepth: int | None = None,
        populations: int | None = None,
        population_size: int | None = None,
        parsimony: float | None = None,
        model_selection: str = "best",
        constraints: dict | None = None,
        nested_constraints: dict | None = None,
        denoise: bool = False,
        select_k_features: int | None = None,
        turbo: bool = False,
        parallelism: str = "multithreading",
        procs: int | None = None,
        random_state: int | None = None,
        verbosity: int = 0,
        temp_equation_file: bool = True,
        output_directory: str | None = None,
        threshold: float = 0.5,
    ):
        # Parameters are stored verbatim, without validation, so that the
        # estimator can be cloned from its constructor arguments.
        self.preset = preset
        self.operators = operators
        self.binary_operators = binary_operators
        self.unary_operators = unary_operators
        self.niterations = niterations
        self.maxsize = maxsize
        self.maxdepth = maxdepth
        self.populations = populations
        self.population_size = population_size
        self.parsimony = parsimony
        self.model_selection = model_selection
        self.constraints = constraints
        self.nested_constraints = nested_constraints
        self.denoise = denoise
        self.select_k_features = select_k_features
        self.turbo = turbo
        self.parallelism = parallelism
        self.procs = procs
        self.random_state = random_state
        self.verbosity = verbosity
        self.temp_equation_file = temp_equation_file
        self.output_directory = output_directory
        self.threshold = threshold

    def _create_regressor(self) -> SymbolicRegressor:
        """Build an unfitted regressor configured from this estimator."""
        return SymbolicRegressor(
            preset=self.preset,
            operators=self.operators,
            binary_operators=self.binary_operators,
            unary_operators=self.unary_operators,
            niterations=self.niterations,
            maxsize=self.maxsize,
            maxdepth=self.maxdepth,
            populations=self.populations,
            population_size=self.population_size,
            parsimony=self.parsimony,
            model_selection=self.model_selection,
            # Squared-error loss on the -3/+3 targets; deliberately fixed.
            loss="L2DistLoss()",
            constraints=self.constraints,
            nested_constraints=self.nested_constraints,
            denoise=self.denoise,
            select_k_features=self.select_k_features,
            turbo=self.turbo,
            parallelism=self.parallelism,
            procs=self.procs,
            random_state=self.random_state,
            verbosity=self.verbosity,
            temp_equation_file=self.temp_equation_file,
            output_directory=self.output_directory,
        )

    @staticmethod
    def _logit_targets(is_positive) -> NDArray:
        """Map a 0/1 (or boolean) membership vector to -3.0 / +3.0 targets.

        Read as logits, -3 and +3 correspond to probabilities of roughly
        0.047 and 0.953 under the sigmoid.
        """
        return np.asarray(is_positive, dtype=float) * 6 - 3

    def _regressor_for(self, class_idx: int) -> SymbolicRegressor:
        """Return the fitted regressor backing ``class_idx``.

        In the binary case there is only one regressor and ``class_idx`` is
        ignored. Otherwise ``class_idx`` indexes ``model_``; negative values
        follow ordinary list indexing.

        Raises
        ------
        ValueError
            If ``class_idx`` is not smaller than the number of regressors.
        """
        check_is_fitted(self, "model_")
        if self.n_classes_ == 2:
            return self.model_
        if class_idx >= len(self.model_):
            raise ValueError(f"class_idx must be < {len(self.model_)}")
        return self.model_[class_idx]

    def fit(self, X, y, **fit_params) -> SymbolicClassifier:
        """Fit symbolic classifier.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data; validated and converted to ``float64``.
        y : array-like of shape (n_samples,)
            Class labels.
        **fit_params
            Passed through to each underlying regressor's ``fit``.

        Returns
        -------
        self
        """
        # Column labels must be read before validation: check_X_y hands back
        # a plain array, after which a ``columns`` attribute no longer exists.
        column_names = getattr(X, "columns", None)

        X, y = check_X_y(X, y, accept_sparse=False, dtype=np.float64)

        self._label_encoder = LabelEncoder()
        y_encoded = self._label_encoder.fit_transform(y)
        self.classes_ = self._label_encoder.classes_
        self.n_classes_ = len(self.classes_)
        self.n_features_in_ = X.shape[1]

        if column_names is not None:
            self.feature_names_in_ = np.array(column_names)
        else:
            self.feature_names_in_ = np.array(
                [f"x{i}" for i in range(X.shape[1])]
            )

        if self.n_classes_ == 2:
            # Encoded labels are already 0/1, so they double as the mask.
            self.model_ = self._create_regressor()
            self.model_.fit(X, self._logit_targets(y_encoded), **fit_params)
            return self

        # One-vs-rest: an independent regressor per class.
        self.model_ = []
        for class_pos, label in enumerate(self.classes_):
            if self.verbosity > 0:
                print(f"Fitting model for class {label}...")
            regressor = self._create_regressor()
            regressor.fit(
                X, self._logit_targets(y_encoded == class_pos), **fit_params
            )
            self.model_.append(regressor)
        return self

    def predict_proba(self, X) -> NDArray:
        """Estimate class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples, n_classes)
            Columns are ordered like ``classes_``. Binary: sigmoid of the
            regressor output for the second class and its complement for the
            first. Otherwise: softmax over the per-class regressor outputs.
        """
        check_is_fitted(self, "model_")
        X = check_array(X, accept_sparse=False, dtype=np.float64)

        if self.n_classes_ == 2:
            logits = self.model_.predict(X)
            # Clip before exponentiating so extreme outputs cannot overflow.
            p_positive = 1 / (1 + np.exp(-np.clip(logits, -500, 500)))
            return np.column_stack([1 - p_positive, p_positive])

        scores = np.column_stack([m.predict(X) for m in self.model_])
        # Subtract the row maximum for a numerically stable softmax.
        exp_scores = np.exp(scores - scores.max(axis=1, keepdims=True))
        return exp_scores / exp_scores.sum(axis=1, keepdims=True)

    def predict(self, X) -> NDArray:
        """Predict class labels.

        Binary problems predict ``classes_[1]`` when its probability is
        strictly greater than ``threshold`` (which reproduces an argmax, ties
        included, at the default of 0.5) and ``classes_[0]`` otherwise. All
        other problems predict the class with the highest probability.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Labels in the original label space of ``y``.
        """
        proba = self.predict_proba(X)
        if self.n_classes_ == 2:
            indices = (proba[:, 1] > self.threshold).astype(int)
        else:
            indices = np.argmax(proba, axis=1)
        return self._label_encoder.inverse_transform(indices)

    def get_best_equation(self, class_idx: int = 1) -> str:
        """Return the best equation of the regressor for ``class_idx``.

        ``class_idx`` is ignored for binary problems, where a single
        regressor exists.

        Raises
        ------
        ValueError
            If ``class_idx`` is not smaller than the number of regressors.
        """
        return self._regressor_for(class_idx).get_best_equation()

    def sympy(self, class_idx: int = 1):
        """Return the result of ``sympy()`` on the regressor for ``class_idx``.

        ``class_idx`` is ignored for binary problems, where a single
        regressor exists.

        Raises
        ------
        ValueError
            If ``class_idx`` is not smaller than the number of regressors.
        """
        return self._regressor_for(class_idx).sympy()

    @property
    def feature_importances_(self) -> NDArray:
        """Feature importances of the underlying regressor(s).

        Binary: the single regressor's importances. Otherwise: the
        element-wise mean of the per-class regressors' importances.
        """
        check_is_fitted(self, "model_")
        if self.n_classes_ == 2:
            return self.model_.feature_importances_
        return np.mean([m.feature_importances_ for m in self.model_], axis=0)

    def summary(self) -> str:
        """Return a multi-line text report built from the regressors' summaries."""
        check_is_fitted(self, "model_")
        lines = ["Symbolic Classifier Results", "=" * 50]
        if self.n_classes_ == 2:
            lines.append(
                f"Binary classification (positive class: {self.classes_[1]})"
            )
            lines.append(self.model_.summary())
        else:
            lines.append(
                f"Multiclass classification ({self.n_classes_} classes)"
            )
            for cls, model in zip(self.classes_, self.model_):
                lines.append(f"\n--- Class {cls} ---")
                lines.append(model.summary())
        return "\n".join(lines)

    def __repr__(self) -> str:
        # Unfitted estimators show only the preset; fitted ones show the
        # learned equation (binary) or the class count.
        if not hasattr(self, "model_"):
            return f"SymbolicClassifier(preset={self.preset!r})"
        if self.n_classes_ == 2:
            return f"SymbolicClassifier(equation={self.model_.best_equation_})"
        return f"SymbolicClassifier(n_classes={self.n_classes_})"