from __future__ import annotations
"""Naive Bayes classifier with automatic variant selection.
Naive Bayes assumes feature independence - a strong assumption that's
usually wrong, but often works surprisingly well. This different
inductive bias makes it valuable for ensemble diversity.
References
----------
- McCallum & Nigam, "A Comparison of Event Models for Naive Bayes Text Classification" (1998)
- sklearn.naive_bayes documentation
"""
from typing import Any
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.naive_bayes import BernoulliNB, ComplementNB, GaussianNB, MultinomialNB
from sklearn.preprocessing import LabelEncoder
[docs]
class NaiveBayesClassifier(ClassifierMixin, BaseEstimator):
    """Naive Bayes Classifier with automatic variant selection.

    Automatically selects the appropriate Naive Bayes variant based on
    feature characteristics, or uses a specified variant.

    The feature independence assumption is fundamentally different from
    tree-based models (which capture interactions) and neural networks
    (which learn complex dependencies), making this valuable for
    ensemble diversity.

    Parameters
    ----------
    variant : str, default='auto'
        Naive Bayes variant:

        - 'auto': Automatically select based on features
        - 'gaussian': For continuous features
        - 'bernoulli': For binary features
        - 'multinomial': For count/frequency features
        - 'complement': For imbalanced text classification
    var_smoothing : float, default=1e-9
        Portion of the largest variance of all features added to
        variances for stability (Gaussian only).
    alpha : float, default=1.0
        Additive smoothing parameter (Bernoulli, Multinomial, Complement).
    binarize : float or None, default=0.0
        Threshold for binarizing features (Bernoulli only).
        None means features are already binary.
    fit_prior : bool, default=True
        Whether to learn class prior probabilities. Forwarded to the
        Bernoulli, Multinomial and Complement variants only; it is not
        passed to the Gaussian variant.
    class_prior : array-like, optional
        Prior probabilities of the classes (passed as ``priors`` to the
        Gaussian variant).

    Attributes
    ----------
    classes_ : ndarray
        Unique class labels.
    n_classes_ : int
        Number of classes seen during ``fit``.
    n_features_in_ : int
        Number of features.
    variant_ : str
        The actual variant used (resolved from 'auto').
    model_ : sklearn NB estimator
        Fitted Naive Bayes model.

    Examples
    --------
    >>> from endgame.models.baselines import NaiveBayesClassifier
    >>> clf = NaiveBayesClassifier(variant='auto')
    >>> clf.fit(X_train, y_train)
    >>> proba = clf.predict_proba(X_test)

    Notes
    -----
    Despite the naive independence assumption, Naive Bayes often works
    surprisingly well because:

    1. Classification only requires correct ordering, not accurate probabilities
    2. Dependencies often "cancel out" when aggregated
    3. Regularization effect from the strong prior

    For ensembles, NB provides diversity because it makes fundamentally
    different errors from models that capture feature interactions.

    Preprocessing learned during ``fit`` is re-applied unchanged at
    prediction time, so the output for a sample never depends on which
    other samples share its batch:

    - Missing values (NaN) are replaced by the per-feature mean of the
      training data (0.0 for a feature with no observed training value).
    - For the multinomial variant, if the training data contains negative
      values, every feature is shifted by its training minimum; at
      prediction time the same shift is applied and anything still
      negative is clipped to zero.
    """

    _estimator_type = "classifier"

    # Added after shifting negative multinomial inputs so the smallest
    # training value of each feature maps to a strictly positive number.
    _SHIFT_EPS = 1e-10

    def __init__(
        self,
        variant: str = "auto",
        var_smoothing: float = 1e-9,
        alpha: float = 1.0,
        binarize: float | None = 0.0,
        fit_prior: bool = True,
        class_prior: np.ndarray | None = None,
    ):
        self.variant = variant
        self.var_smoothing = var_smoothing
        self.alpha = alpha
        self.binarize = binarize
        self.fit_prior = fit_prior
        self.class_prior = class_prior

        # Fitted state. The placeholders are kept so that reading these
        # attributes on an unfitted instance keeps returning None/0;
        # __sklearn_is_fitted__ below stops sklearn from mistaking them
        # for evidence of a completed fit.
        self.classes_: np.ndarray | None = None
        self.n_classes_: int = 0
        self.n_features_in_: int = 0
        self.variant_: str | None = None
        self.model_: Any | None = None
        self._label_encoder: LabelEncoder | None = None
        self._impute_values: np.ndarray | None = None
        self._multinomial_shift: np.ndarray | None = None
        self._is_fitted: bool = False

    def __sklearn_is_fitted__(self) -> bool:
        """Report fitted state to ``sklearn.utils.validation.check_is_fitted``."""
        return self._is_fitted

    @staticmethod
    def _column_means(X: np.ndarray) -> np.ndarray:
        """Return per-column means ignoring NaN; 0.0 for all-NaN columns."""
        observed = ~np.isnan(X)
        counts = observed.sum(axis=0)
        totals = np.where(observed, X, 0.0).sum(axis=0)
        means = np.zeros(X.shape[1], dtype=np.float64)
        # Divide only where at least one value was observed; the remaining
        # entries keep the 0.0 they were initialised with.
        np.divide(totals, counts, out=means, where=counts > 0)
        return means

    def _detect_variant(self, X: np.ndarray) -> str:
        """Automatically detect the best NB variant based on features.

        Only finite entries are inspected (NaN and +/-inf are ignored).

        Decision logic:

        - If all values are 0/1 -> Bernoulli
        - If all values are non-negative integers -> Multinomial
        - Otherwise -> Gaussian
        """
        X = np.asarray(X, dtype=np.float64)
        observed = X[np.isfinite(X)]

        # Binary (0/1 only)
        if np.isin(observed, (0.0, 1.0)).all():
            return "bernoulli"

        # Non-negative integers (count data). Rounding to the nearest
        # integer keeps the tolerance symmetric and avoids casting very
        # large floats to int.
        if np.all(observed >= 0) and np.allclose(observed, np.rint(observed)):
            return "multinomial"

        # Default to Gaussian for continuous features
        return "gaussian"

    def _create_model(self):
        """Create the appropriate Naive Bayes model.

        Raises
        ------
        ValueError
            If ``self.variant_`` is not a supported variant name.
        """
        if self.variant_ == "gaussian":
            return GaussianNB(
                var_smoothing=self.var_smoothing,
                priors=self.class_prior,
            )
        elif self.variant_ == "bernoulli":
            return BernoulliNB(
                alpha=self.alpha,
                binarize=self.binarize,
                fit_prior=self.fit_prior,
                class_prior=self.class_prior,
            )
        elif self.variant_ == "multinomial":
            return MultinomialNB(
                alpha=self.alpha,
                fit_prior=self.fit_prior,
                class_prior=self.class_prior,
            )
        elif self.variant_ == "complement":
            return ComplementNB(
                alpha=self.alpha,
                fit_prior=self.fit_prior,
                class_prior=self.class_prior,
                norm=True,
            )
        else:
            raise ValueError(f"Unknown variant: {self.variant_}. "
                             "Options: 'auto', 'gaussian', 'bernoulli', "
                             "'multinomial', 'complement'")

    def _prepare_X(self, X) -> np.ndarray:
        """Validate inference input and apply the preprocessing from ``fit``.

        Shared by ``predict``, ``predict_proba`` and ``predict_log_proba``.

        Raises
        ------
        RuntimeError
            If the classifier has not been fitted.
        ValueError
            If ``X`` is not 2-D with ``n_features_in_`` columns.
        """
        if not self._is_fitted:
            raise RuntimeError("NaiveBayesClassifier has not been fitted.")

        X = np.asarray(X, dtype=np.float64)
        if X.ndim != 2 or X.shape[1] != self.n_features_in_:
            raise ValueError(
                f"X must be a 2D array with {self.n_features_in_} features; "
                f"got an array of shape {X.shape}."
            )

        missing = np.isnan(X)
        if missing.any():
            # getattr: an instance unpickled from before these attributes
            # existed has no stored means; fall back to the former 0.0 fill.
            fill = getattr(self, "_impute_values", None)
            X = np.where(missing, 0.0 if fill is None else fill, X)
        # No NaN remains here; this only maps +/-inf to the largest finite
        # floats, which the NaN fill used previously also did.
        X = np.nan_to_num(X)

        if self.variant_ == "multinomial":
            shift = getattr(self, "_multinomial_shift", None)
            if shift is not None:
                X = X - shift + self._SHIFT_EPS
            # Values below the training minimum (or any negative value when
            # no shift was learned) cannot be represented as counts.
            X = np.maximum(X, 0.0)
        return X

    def fit(self, X, y, sample_weight=None, **fit_params) -> NaiveBayesClassifier:
        """Fit the Naive Bayes classifier.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Target labels.
        sample_weight : array-like of shape (n_samples,), optional
            Sample weights.
        **fit_params
            Accepted for interface compatibility and ignored.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-dimensional or ``variant`` is unknown.
        """
        # A failed re-fit must not leave a half-updated estimator that
        # still claims to be fitted.
        self._is_fitted = False

        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(
                f"X must be a 2D array of shape (n_samples, n_features); "
                f"got an array of shape {X.shape}."
            )
        self.n_features_in_ = X.shape[1]

        # Encode labels
        self._label_encoder = LabelEncoder()
        y_encoded = self._label_encoder.fit_transform(y)
        self.classes_ = self._label_encoder.classes_
        self.n_classes_ = len(self.classes_)

        # Replace NaN with column means and remember the means so that
        # inference fills missing values the same way.
        self._impute_values = self._column_means(X)
        X_clean = np.where(np.isnan(X), self._impute_values, X)

        # Determine variant (detection sees the imputed matrix)
        if self.variant == "auto":
            self.variant_ = self._detect_variant(X_clean)
        else:
            self.variant_ = self.variant

        # Multinomial NB needs non-negative input: shift every feature by
        # its training minimum and keep the offset for inference.
        self._multinomial_shift = None
        if self.variant_ == "multinomial" and np.any(X_clean < 0):
            self._multinomial_shift = X_clean.min(axis=0)
            X_clean = X_clean - self._multinomial_shift + self._SHIFT_EPS

        # Create and fit model
        self.model_ = self._create_model()
        self.model_.fit(X_clean, y_encoded, sample_weight=sample_weight)
        self._is_fitted = True
        return self

    def predict(self, X) -> np.ndarray:
        """Predict class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,)
            Predicted class labels, in the original label space.
        """
        y_pred = self.model_predict(X) if False else self._predict_encoded(X)
        return self._label_encoder.inverse_transform(y_pred)

    def _predict_encoded(self, X) -> np.ndarray:
        """Return predictions as the integer codes of the label encoder."""
        return self.model_.predict(self._prepare_X(X))

    def predict_proba(self, X) -> np.ndarray:
        """Predict class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_classes)
            Class probabilities.
        """
        return self.model_.predict_proba(self._prepare_X(X))

    def predict_log_proba(self, X) -> np.ndarray:
        """Predict log class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict.

        Returns
        -------
        log_proba : ndarray of shape (n_samples, n_classes)
            Log class probabilities.
        """
        return self.model_.predict_log_proba(self._prepare_X(X))

    @property
    def feature_log_prob_(self):
        """Log probability of features given a class (for discrete NB).

        Returns None when the fitted model does not expose it.
        """
        if not self._is_fitted:
            raise RuntimeError("NaiveBayesClassifier has not been fitted.")
        return getattr(self.model_, "feature_log_prob_", None)

    @property
    def class_log_prior_(self):
        """Log probability of each class.

        Taken from the model's ``class_log_prior_`` when present, otherwise
        computed as the log of its ``class_prior_``; None if neither exists.
        """
        if not self._is_fitted:
            raise RuntimeError("NaiveBayesClassifier has not been fitted.")
        if hasattr(self.model_, "class_log_prior_"):
            return self.model_.class_log_prior_
        elif hasattr(self.model_, "class_prior_"):
            return np.log(self.model_.class_prior_)
        return None