Source code for endgame.validation.nested_cv

"""Nested Cross-Validation for unbiased model evaluation.

Provides proper nested CV where the inner loop handles model selection
or hyperparameter tuning, and the outer loop provides unbiased performance
estimates.

Example
-------
>>> from endgame.validation import NestedCV
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.model_selection import GridSearchCV
>>>
>>> ncv = NestedCV(
...     estimator=RandomForestClassifier(),
...     search=GridSearchCV(
...         RandomForestClassifier(),
...         param_grid={'n_estimators': [50, 100], 'max_depth': [3, 5]},
...         cv=3, scoring='accuracy'
...     ),
...     outer_cv=5,
...     scoring='accuracy'
... )
>>> results = ncv.evaluate(X, y)
>>> print(f"Score: {results.mean_score:.4f} +/- {results.std_score:.4f}")
"""

from __future__ import annotations

from collections.abc import Callable
from dataclasses import dataclass, field
from typing import Any

import numpy as np
from sklearn.base import clone, is_classifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    get_scorer,
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
)



[docs]
@dataclass
class NestedCVResult:
    """Results from nested cross-validation.

    Attributes
    ----------
    outer_scores : list of float
        Score for each outer fold.
    mean_score : float
        Mean of outer fold scores.
    std_score : float
        Standard deviation of outer fold scores.
    best_params : list of dict
        Best parameters found in each outer fold's inner search.
    oof_predictions : ndarray or None
        Out-of-fold predictions (if return_oof=True).
    inner_scores : list of float
        Best inner CV score for each outer fold.
    scoring : str
        Metric name used.
    """
    outer_scores: list[float] = field(default_factory=list)
    mean_score: float = 0.0
    std_score: float = 0.0
    best_params: list[dict[str, Any]] = field(default_factory=list)
    oof_predictions: np.ndarray | None = None
    inner_scores: list[float] = field(default_factory=list)
    scoring: str = "accuracy"

    def __repr__(self) -> str:
        return (
            f"NestedCVResult(score={self.mean_score:.4f} +/- {self.std_score:.4f}, "
            f"n_folds={len(self.outer_scores)}, metric='{self.scoring}')"
        )



_METRIC_MAP = {
    "accuracy": accuracy_score,
    "balanced_accuracy": balanced_accuracy_score,
    "f1": lambda y, p: f1_score(y, p, average="weighted"),
    "f1_macro": lambda y, p: f1_score(y, p, average="macro"),
    "r2": r2_score,
    "mse": mean_squared_error,
    "neg_mean_squared_error": lambda y, p: -mean_squared_error(y, p),
    "mae": mean_absolute_error,
    "neg_mean_absolute_error": lambda y, p: -mean_absolute_error(y, p),
}



[docs]
class NestedCV:
    """Nested cross-validation for unbiased model evaluation.

    The inner loop performs model selection (hyperparameter tuning or
    algorithm comparison) and the outer loop estimates generalization
    performance using the best model from each inner fold.

    Parameters
    ----------
    estimator : estimator or None
        Base estimator to evaluate. If `search` is provided, this is
        ignored (the search object contains the estimator).
    search : estimator with fit/predict or None
        A search object (e.g., GridSearchCV, RandomizedSearchCV,
        OptunaOptimizer) that performs inner-loop model selection.
        Must have `best_estimator_` and `best_params_` after fitting.
        If None, `estimator` is used directly without inner tuning.
    outer_cv : int or CV splitter, default=5
        Number of outer folds or a CV splitter object.
    scoring : str or callable, default='auto'
        Scoring metric. 'auto' uses accuracy for classifiers, r2 for
        regressors. Can be a string key or a callable(y_true, y_pred).
    return_oof : bool, default=True
        Whether to return out-of-fold predictions.
    random_state : int or None, default=None
        Random state for reproducibility.
    verbose : int, default=0
        Verbosity level. 0=silent, 1=progress, 2=detailed.

    Example
    -------
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sklearn.model_selection import GridSearchCV
    >>>
    >>> # With hyperparameter search
    >>> search = GridSearchCV(
    ...     RandomForestClassifier(random_state=42),
    ...     param_grid={'n_estimators': [50, 100, 200]},
    ...     cv=3, scoring='accuracy', refit=True
    ... )
    >>> ncv = NestedCV(search=search, outer_cv=5)
    >>> result = ncv.evaluate(X, y)
    >>>
    >>> # Without search (just evaluate a fixed model)
    >>> ncv = NestedCV(estimator=RandomForestClassifier(n_estimators=100))
    >>> result = ncv.evaluate(X, y)
    """

    def __init__(
        self,
        estimator=None,
        search=None,
        outer_cv: int | Any = 5,
        scoring: str | Callable = "auto",
        return_oof: bool = True,
        random_state: int | None = None,
        verbose: int = 0,
    ):
        if estimator is None and search is None:
            raise ValueError("Either 'estimator' or 'search' must be provided.")

        self.estimator = estimator
        self.search = search
        self.outer_cv = outer_cv
        self.scoring = scoring
        self.return_oof = return_oof
        self.random_state = random_state
        self.verbose = verbose


[docs]
    def evaluate(self, X, y, groups=None) -> NestedCVResult:
        """Run nested cross-validation.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features.
        y : array-like of shape (n_samples,)
            Target values.
        groups : array-like of shape (n_samples,), optional
            Group labels for GroupKFold-style splitting.

        Returns
        -------
        NestedCVResult
            Results containing scores, best params, and OOF predictions.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Determine if classifier
        if self.search is not None:
            base = self.search.estimator if hasattr(self.search, 'estimator') else self.search
        else:
            base = self.estimator
        is_clf = is_classifier(base)

        # Set up scoring
        scoring_name, score_fn = self._resolve_scoring(is_clf)

        # Set up outer CV
        outer_cv = self._resolve_cv(is_clf, y)

        # Initialize results
        outer_scores = []
        best_params = []
        inner_scores = []
        oof_preds = np.zeros(len(y)) if self.return_oof and not is_clf else None
        oof_proba = None

        if self.return_oof and is_clf:
            oof_preds = np.zeros(len(y), dtype=y.dtype)

        for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X, y, groups)):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            if self.verbose >= 1:
                print(f"Outer fold {fold_idx + 1}/{outer_cv.get_n_splits()}: "
                      f"train={len(train_idx)}, test={len(test_idx)}")

            # Inner loop: model selection
            if self.search is not None:
                search = clone(self.search)
                search.fit(X_train, y_train)
                model = search.best_estimator_
                fold_params = search.best_params_
                fold_inner_score = search.best_score_
            else:
                model = clone(self.estimator)
                model.fit(X_train, y_train)
                fold_params = model.get_params()
                fold_inner_score = float('nan')

            # Outer evaluation
            y_pred = model.predict(X_test)
            fold_score = score_fn(y_test, y_pred)

            outer_scores.append(fold_score)
            best_params.append(fold_params)
            inner_scores.append(fold_inner_score)

            if self.return_oof and oof_preds is not None:
                oof_preds[test_idx] = y_pred

            if self.verbose >= 2:
                print(f"  Score: {fold_score:.4f}, "
                      f"Inner best: {fold_inner_score:.4f}, "
                      f"Params: {fold_params}")

        result = NestedCVResult(
            outer_scores=outer_scores,
            mean_score=float(np.mean(outer_scores)),
            std_score=float(np.std(outer_scores)),
            best_params=best_params,
            oof_predictions=oof_preds,
            inner_scores=inner_scores,
            scoring=scoring_name,
        )

        if self.verbose >= 1:
            print(f"\nNested CV Result: {result.mean_score:.4f} +/- {result.std_score:.4f}")

        return result


    def _resolve_scoring(self, is_clf: bool):
        """Resolve scoring metric."""
        if callable(self.scoring):
            return "custom", self.scoring

        if self.scoring == "auto":
            name = "accuracy" if is_clf else "r2"
        else:
            name = self.scoring

        if name in _METRIC_MAP:
            return name, _METRIC_MAP[name]

        # Try sklearn scorer
        try:
            scorer = get_scorer(name)
            def score_fn(y_true, y_pred):
                # Simple wrapper — doesn't use estimator
                return scorer._score_func(y_true, y_pred, **scorer._kwargs)
            return name, score_fn
        except (ValueError, KeyError):
            raise ValueError(f"Unknown scoring metric: {name}")

    def _resolve_cv(self, is_clf: bool, y: np.ndarray):
        """Resolve outer CV splitter."""
        if isinstance(self.outer_cv, int):
            if is_clf:
                return StratifiedKFold(
                    n_splits=self.outer_cv,
                    shuffle=True,
                    random_state=self.random_state,
                )
            else:
                return KFold(
                    n_splits=self.outer_cv,
                    shuffle=True,
                    random_state=self.random_state,
                )
        return self.outer_cv