# Source code for endgame.models.baselines.knn

from __future__ import annotations

"""K-Nearest Neighbors classifiers and regressors.

KNN models provide a fundamentally different inductive bias from
tree-based and neural network models:
- Instance-based (lazy) learning - no explicit model training
- Local decision boundaries based on neighbor voting/averaging
- Sensitive to feature scaling and distance metric choice

These characteristics make KNN valuable for ensemble diversity.

References
----------
- Cover & Hart, "Nearest Neighbor Pattern Classification" (1967)
- sklearn.neighbors documentation
"""

from typing import Literal

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler


class KNNClassifier(ClassifierMixin, BaseEstimator):
    """K-Nearest Neighbors Classifier with competition-tuned defaults.

    A wrapper around sklearn's KNeighborsClassifier with automatic
    feature scaling and sensible defaults for competitive ML.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors to use. Capped at ``n_samples - 1`` (and
        floored at 1) when the model is fitted.
    weights : str, default='distance'
        Weight function: 'uniform' or 'distance'.
        'distance' often works better in practice.
    metric : str, default='minkowski'
        Distance metric: 'minkowski', 'euclidean', 'manhattan', 'cosine', etc.
    p : int, default=2
        Power parameter for Minkowski metric.
        p=2 is Euclidean, p=1 is Manhattan.
    leaf_size : int, default=30
        Leaf size for BallTree or KDTree.
    algorithm : str, default='auto'
        Algorithm: 'auto', 'ball_tree', 'kd_tree', 'brute'.
    scale_features : bool, default=True
        Whether to standardize features before fitting.
        Highly recommended for distance-based methods.
    n_jobs : int, default=-1
        Number of parallel jobs.

    Attributes
    ----------
    classes_ : ndarray
        Unique class labels (``None`` until ``fit`` is called).
    n_classes_ : int
        Number of unique class labels (0 until ``fit`` is called).
    n_features_in_ : int
        Number of features (0 until ``fit`` is called).
    model_ : KNeighborsClassifier
        The fitted underlying sklearn model (``None`` until ``fit``).

    Examples
    --------
    >>> from endgame.models.baselines import KNNClassifier
    >>> clf = KNNClassifier(n_neighbors=5, weights='distance')
    >>> clf.fit(X_train, y_train)
    >>> proba = clf.predict_proba(X_test)

    Notes
    -----
    KNN is different from other models because:
    1. Instance-based - stores training data, no explicit model
    2. Non-parametric - makes no assumptions about data distribution
    3. Local decision boundaries - can capture complex patterns
    4. Sensitive to curse of dimensionality in high dimensions

    The scale_features=True default is important because KNN relies
    on distance calculations that can be dominated by features with
    larger scales.

    NaN feature values are replaced with 0.0 (before scaling) in both
    ``fit`` and the prediction methods.
    """

    _estimator_type = "classifier"

    def __init__(
        self,
        n_neighbors: int = 5,
        weights: Literal["uniform", "distance"] = "distance",
        metric: str = "minkowski",
        p: int = 2,
        leaf_size: int = 30,
        algorithm: Literal["auto", "ball_tree", "kd_tree", "brute"] = "auto",
        scale_features: bool = True,
        n_jobs: int = -1,
    ):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.p = p
        self.leaf_size = leaf_size
        self.algorithm = algorithm
        self.scale_features = scale_features
        self.n_jobs = n_jobs

        # Fitted state. These placeholders are kept so that reading them
        # before ``fit`` keeps returning None/0 as it always has.
        self.classes_: np.ndarray | None = None
        self.n_classes_: int = 0
        self.n_features_in_: int = 0
        self.model_: KNeighborsClassifier | None = None
        self._scaler: StandardScaler | None = None
        self._label_encoder: LabelEncoder | None = None
        self._is_fitted: bool = False

    def __sklearn_is_fitted__(self) -> bool:
        """Report the fitted state to sklearn's ``check_is_fitted``.

        ``__init__`` pre-populates trailing-underscore attributes, which
        sklearn's default heuristic would mistake for a fitted estimator;
        this hook makes the explicit ``_is_fitted`` flag authoritative.
        """
        return self._is_fitted

    def _check_fitted(self) -> None:
        """Raise ``RuntimeError`` if ``fit`` has not completed."""
        if not self._is_fitted:
            raise RuntimeError("KNNClassifier has not been fitted.")

    def fit(self, X, y, **fit_params) -> KNNClassifier:
        """Fit the KNN classifier.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Target labels.
        **fit_params
            Accepted for API compatibility; currently ignored.

        Returns
        -------
        self
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)

        self.n_features_in_ = X.shape[1]

        # Encode labels to 0..n_classes-1 for the underlying model.
        self._label_encoder = LabelEncoder()
        y_encoded = self._label_encoder.fit_transform(y)
        self.classes_ = self._label_encoder.classes_
        self.n_classes_ = len(self.classes_)

        # Handle NaN
        X_clean = np.nan_to_num(X, nan=0.0)

        # Scale features. The scaler is reset when scaling is disabled so
        # that a refit never leaves a stale scaler from an earlier fit.
        if self.scale_features:
            self._scaler = StandardScaler()
            X_scaled = self._scaler.fit_transform(X_clean)
        else:
            self._scaler = None
            X_scaled = X_clean

        # Cap n_neighbors at n_samples - 1, but never go below 1.
        n_neighbors = max(1, min(self.n_neighbors, len(y) - 1))

        # Create and fit model
        self.model_ = KNeighborsClassifier(
            n_neighbors=n_neighbors,
            weights=self.weights,
            metric=self.metric,
            p=self.p,
            leaf_size=self.leaf_size,
            algorithm=self.algorithm,
            n_jobs=self.n_jobs,
        )
        self.model_.fit(X_scaled, y_encoded)

        self._is_fitted = True
        return self

    def _preprocess(self, X) -> np.ndarray:
        """Preprocess features for prediction.

        Applies the same NaN replacement as ``fit`` and, if a scaler was
        fitted, the same scaling. The decision is based on the fitted
        scaler rather than the current ``scale_features`` parameter so
        that changing the parameter after ``fit`` (without refitting)
        cannot desynchronize inputs from the fitted model.
        """
        X = np.asarray(X, dtype=np.float64)
        X_clean = np.nan_to_num(X, nan=0.0)

        if self._scaler is not None:
            return self._scaler.transform(X_clean)
        return X_clean

    def predict(self, X) -> np.ndarray:
        """Predict class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Query features.

        Returns
        -------
        ndarray
            Predicted labels, decoded back to the original label values.

        Raises
        ------
        RuntimeError
            If the classifier has not been fitted.
        """
        self._check_fitted()

        X_proc = self._preprocess(X)
        y_pred = self.model_.predict(X_proc)
        return self._label_encoder.inverse_transform(y_pred)

    def predict_proba(self, X) -> np.ndarray:
        """Predict class probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Query features.

        Returns
        -------
        ndarray
            Class probabilities as returned by the underlying model.

        Raises
        ------
        RuntimeError
            If the classifier has not been fitted.
        """
        self._check_fitted()

        X_proc = self._preprocess(X)
        return self.model_.predict_proba(X_proc)

    def kneighbors(
        self,
        X=None,
        n_neighbors: int | None = None,
        return_distance: bool = True,
    ):
        """Find the K-neighbors of a point.

        Parameters
        ----------
        X : array-like, optional
            Query points. If None, returns neighbors of training data.
        n_neighbors : int, optional
            Number of neighbors. If None, uses the value the underlying
            model was fitted with, i.e. the constructor's ``n_neighbors``
            after the cap applied in ``fit``.
        return_distance : bool, default=True
            Whether to return distances.

        Returns
        -------
        neigh_dist : ndarray (if return_distance=True)
            Distances to neighbors, measured in the preprocessed
            (NaN-filled, optionally scaled) feature space.
        neigh_ind : ndarray
            Indices of neighbors.

        Raises
        ------
        RuntimeError
            If the classifier has not been fitted.
        """
        self._check_fitted()

        if X is not None:
            X = self._preprocess(X)

        return self.model_.kneighbors(
            X, n_neighbors=n_neighbors, return_distance=return_distance
        )
class KNNRegressor(RegressorMixin, BaseEstimator):
    """K-Nearest Neighbors Regressor with competition-tuned defaults.

    A wrapper around sklearn's KNeighborsRegressor with automatic
    feature scaling and sensible defaults for competitive ML.

    Parameters
    ----------
    n_neighbors : int, default=5
        Number of neighbors to use. Capped at ``n_samples - 1`` (and
        floored at 1) when the model is fitted.
    weights : str, default='distance'
        Weight function: 'uniform' or 'distance'.
    metric : str, default='minkowski'
        Distance metric.
    p : int, default=2
        Power parameter for Minkowski metric.
    leaf_size : int, default=30
        Leaf size for BallTree or KDTree.
    algorithm : str, default='auto'
        Algorithm: 'auto', 'ball_tree', 'kd_tree', 'brute'.
    scale_features : bool, default=True
        Whether to standardize features before fitting.
    n_jobs : int, default=-1
        Number of parallel jobs.

    Attributes
    ----------
    n_features_in_ : int
        Number of features (0 until ``fit`` is called).
    model_ : KNeighborsRegressor
        The fitted underlying sklearn model (``None`` until ``fit``).

    Examples
    --------
    >>> from endgame.models.baselines import KNNRegressor
    >>> reg = KNNRegressor(n_neighbors=10, weights='distance')
    >>> reg.fit(X_train, y_train)
    >>> predictions = reg.predict(X_test)

    Notes
    -----
    KNN regression averages (or weighted-averages) the target values
    of the k nearest neighbors. This provides a local, non-parametric
    estimate that can capture complex patterns but may suffer from
    the curse of dimensionality.

    NaN values in the features (at fit and predict time) and in the
    training targets are replaced with 0.0.
    """

    _estimator_type = "regressor"

    def __init__(
        self,
        n_neighbors: int = 5,
        weights: Literal["uniform", "distance"] = "distance",
        metric: str = "minkowski",
        p: int = 2,
        leaf_size: int = 30,
        algorithm: Literal["auto", "ball_tree", "kd_tree", "brute"] = "auto",
        scale_features: bool = True,
        n_jobs: int = -1,
    ):
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.metric = metric
        self.p = p
        self.leaf_size = leaf_size
        self.algorithm = algorithm
        self.scale_features = scale_features
        self.n_jobs = n_jobs

        # Fitted state. These placeholders are kept so that reading them
        # before ``fit`` keeps returning None/0 as it always has.
        self.n_features_in_: int = 0
        self.model_: KNeighborsRegressor | None = None
        self._scaler: StandardScaler | None = None
        self._is_fitted: bool = False

    def __sklearn_is_fitted__(self) -> bool:
        """Report the fitted state to sklearn's ``check_is_fitted``.

        ``__init__`` pre-populates trailing-underscore attributes, which
        sklearn's default heuristic would mistake for a fitted estimator;
        this hook makes the explicit ``_is_fitted`` flag authoritative.
        """
        return self._is_fitted

    def _check_fitted(self) -> None:
        """Raise ``RuntimeError`` if ``fit`` has not completed."""
        if not self._is_fitted:
            raise RuntimeError("KNNRegressor has not been fitted.")

    def fit(self, X, y, **fit_params) -> KNNRegressor:
        """Fit the KNN regressor.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Target values.
        **fit_params
            Accepted for API compatibility; currently ignored.

        Returns
        -------
        self
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y, dtype=np.float64)

        self.n_features_in_ = X.shape[1]

        # Handle NaN (features and targets alike are filled with 0.0).
        X_clean = np.nan_to_num(X, nan=0.0)
        y_clean = np.nan_to_num(y, nan=0.0)

        # Scale features. The scaler is reset when scaling is disabled so
        # that a refit never leaves a stale scaler from an earlier fit.
        if self.scale_features:
            self._scaler = StandardScaler()
            X_scaled = self._scaler.fit_transform(X_clean)
        else:
            self._scaler = None
            X_scaled = X_clean

        # Cap n_neighbors at n_samples - 1, but never go below 1.
        n_neighbors = max(1, min(self.n_neighbors, len(y) - 1))

        # Create and fit model
        self.model_ = KNeighborsRegressor(
            n_neighbors=n_neighbors,
            weights=self.weights,
            metric=self.metric,
            p=self.p,
            leaf_size=self.leaf_size,
            algorithm=self.algorithm,
            n_jobs=self.n_jobs,
        )
        self.model_.fit(X_scaled, y_clean)

        self._is_fitted = True
        return self

    def _preprocess(self, X) -> np.ndarray:
        """Preprocess features for prediction.

        Applies the same NaN replacement as ``fit`` and, if a scaler was
        fitted, the same scaling. The decision is based on the fitted
        scaler rather than the current ``scale_features`` parameter so
        that changing the parameter after ``fit`` (without refitting)
        cannot desynchronize inputs from the fitted model.
        """
        X = np.asarray(X, dtype=np.float64)
        X_clean = np.nan_to_num(X, nan=0.0)

        if self._scaler is not None:
            return self._scaler.transform(X_clean)
        return X_clean

    def predict(self, X) -> np.ndarray:
        """Predict target values.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Query features.

        Returns
        -------
        ndarray
            Predictions from the underlying model.

        Raises
        ------
        RuntimeError
            If the regressor has not been fitted.
        """
        self._check_fitted()

        X_proc = self._preprocess(X)
        return self.model_.predict(X_proc)

    def kneighbors(
        self,
        X=None,
        n_neighbors: int | None = None,
        return_distance: bool = True,
    ):
        """Find the K-neighbors of a point.

        Parameters
        ----------
        X : array-like, optional
            Query points. If None, returns neighbors of training data.
        n_neighbors : int, optional
            Number of neighbors. If None, uses the value the underlying
            model was fitted with, i.e. the constructor's ``n_neighbors``
            after the cap applied in ``fit``.
        return_distance : bool, default=True
            Whether to return distances.

        Returns
        -------
        neigh_dist : ndarray (if return_distance=True)
            Distances to neighbors, measured in the preprocessed
            (NaN-filled, optionally scaled) feature space.
        neigh_ind : ndarray
            Indices of neighbors.

        Raises
        ------
        RuntimeError
            If the regressor has not been fitted.
        """
        self._check_fitted()

        if X is not None:
            X = self._preprocess(X)

        return self.model_.kneighbors(
            X, n_neighbors=n_neighbors, return_distance=return_distance
        )