# Source code for endgame.benchmark.loader

from __future__ import annotations

"""Dataset loading utilities for benchmark suites.

Provides unified access to benchmark datasets from OpenML, sklearn, and custom sources.
"""

import warnings
from collections.abc import Generator
from dataclasses import dataclass, field
from enum import Enum
from typing import Any

import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Lazy imports for optional dependencies
HAS_OPENML = False
HAS_PANDAS = False

try:
    import openml
    HAS_OPENML = True
except ImportError:
    pass

try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    pass


class TaskType(str, Enum):
    """Type of machine learning task.

    The ``str`` mixin makes every member compare equal to its lowercase
    string value, and ``TaskType("regression")`` looks a member up by value.
    """
    # Classification; the loaders in this file assign it when the target has
    # at most two classes.
    CLASSIFICATION = "classification"
    # Regression on a numeric target.
    REGRESSION = "regression"
    # Classification; the loaders in this file assign it when the target has
    # more than two classes.
    MULTICLASS = "multiclass"
    # Multilabel task; declared here but not assigned by the loaders visible
    # in this file.
    MULTILABEL = "multilabel"



@dataclass
class DatasetInfo:
    """Container for one benchmark dataset together with summary statistics.

    ``n_samples`` and ``n_features`` are always recomputed from ``X`` in
    ``__post_init__``, so any values passed for them to the constructor are
    overwritten.  ``n_classes`` and ``class_distribution`` are recomputed from
    ``y`` only for ``CLASSIFICATION`` and ``MULTICLASS`` tasks; for other task
    types they keep whatever was passed in (0 / empty by default).

    Attributes
    ----------
    name : str
        Name of the dataset.
    task_type : TaskType
        Type of ML task.
    X : np.ndarray
        Feature matrix.
    y : np.ndarray
        Target variable.
    feature_names : list[str]
        Names of features; defaults to ``f0, f1, ...`` when empty.
    categorical_indicator : list[bool]
        Boolean mask indicating categorical features; defaults to all
        ``False`` when empty.
    n_samples : int
        Number of samples (``X.shape[0]``).
    n_features : int
        Number of features (``X.shape[1]``, or 1 for a 1-D ``X``).
    n_classes : int
        Number of distinct target values (classification tasks only).
    class_distribution : dict[Any, int]
        Mapping from each distinct target value to its count
        (classification tasks only).
    source : str
        Source of the dataset (e.g., 'openml', 'sklearn').
    openml_id : int or None
        OpenML dataset ID if applicable.
    cv_splits : list[tuple[np.ndarray, np.ndarray]] or None
        Predefined cross-validation splits as ``(train_idx, test_idx)`` pairs.
    metadata : dict[str, Any]
        Additional metadata.
    """
    name: str
    task_type: TaskType
    X: np.ndarray
    y: np.ndarray
    feature_names: list[str] = field(default_factory=list)
    categorical_indicator: list[bool] = field(default_factory=list)
    n_samples: int = 0
    n_features: int = 0
    n_classes: int = 0
    class_distribution: dict[Any, int] = field(default_factory=dict)
    source: str = "unknown"
    openml_id: int | None = None
    cv_splits: list[tuple[np.ndarray, np.ndarray]] | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Compute derived attributes from ``X`` and ``y``."""
        self.n_samples = self.X.shape[0]
        # A 1-D feature array is treated as a single feature column.
        self.n_features = self.X.shape[1] if self.X.ndim > 1 else 1

        if not self.feature_names:
            self.feature_names = [f"f{i}" for i in range(self.n_features)]

        if not self.categorical_indicator:
            self.categorical_indicator = [False] * self.n_features

        if self.task_type in (TaskType.CLASSIFICATION, TaskType.MULTICLASS):
            unique, counts = np.unique(self.y, return_counts=True)
            self.n_classes = len(unique)
            # .tolist() converts NumPy scalars to plain Python keys/values.
            self.class_distribution = dict(zip(unique.tolist(), counts.tolist()))

    @property
    def n_categorical(self) -> int:
        """Number of categorical features."""
        return sum(self.categorical_indicator)

    @property
    def n_numerical(self) -> int:
        """Number of numerical features."""
        return self.n_features - self.n_categorical

    @property
    def imbalance_ratio(self) -> float:
        """Class imbalance ratio (max_count / min_count).

        Returns 1.0 for regression tasks and whenever no class distribution
        is available; returns ``inf`` if the smallest count is zero.
        """
        if self.task_type == TaskType.REGRESSION:
            return 1.0
        if not self.class_distribution:
            return 1.0
        counts = list(self.class_distribution.values())
        smallest = min(counts)
        return max(counts) / smallest if smallest > 0 else float('inf')

    def get_cv_splits(
        self,
        n_splits: int = 10,
        shuffle: bool = True,
        random_state: int = 42,
    ) -> list[tuple[np.ndarray, np.ndarray]]:
        """Get cross-validation splits.

        Returns the predefined ``cv_splits`` unchanged if they are set (all
        arguments are ignored in that case); otherwise generates new ones,
        stratified for CLASSIFICATION/MULTICLASS tasks and plain K-fold for
        everything else.

        Parameters
        ----------
        n_splits : int, default=10
            Number of folds to generate.
        shuffle : bool, default=True
            Whether to shuffle samples before splitting.
        random_state : int, default=42
            Seed used when ``shuffle`` is true; ignored otherwise.

        Returns
        -------
        list[tuple[np.ndarray, np.ndarray]]
            ``(train_idx, test_idx)`` pairs, one per fold.
        """
        if self.cv_splits is not None:
            return self.cv_splits

        # scikit-learn raises ValueError when random_state is given together
        # with shuffle=False, so only forward the seed when it has an effect.
        seed = random_state if shuffle else None

        if self.task_type in (TaskType.CLASSIFICATION, TaskType.MULTICLASS):
            cv = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=seed)
        else:
            cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=seed)

        return list(cv.split(self.X, self.y))




# Predefined benchmark suites
# Registry of named suites.  Every entry has a "type" ("openml" or "sklearn")
# and a "description"; "openml" entries carry exactly one of "suite_id",
# "suite_ids" or "task_ids", and "sklearn" entries carry "datasets".
BUILTIN_SUITES: dict[str, dict[str, Any]] = {
    # OpenML suites
    "OpenML-CC18": {
        "type": "openml",
        "suite_id": 99,
        "description": "OpenML-CC18 classification benchmark (72 datasets)",
    },
    "OpenML-CTR23": {
        "type": "openml",
        # CTR23 is published as OpenML suite 353.  The previous value (336)
        # is the Grinsztajn "Regression on numerical features" suite listed
        # further down, so this entry loaded the wrong benchmark.
        "suite_id": 353,
        "description": "OpenML-CTR23 regression benchmark",
    },
    "AutoML-Benchmark": {
        "type": "openml",
        "suite_id": 271,
        "description": "AutoML benchmark suite",
    },
    # Sklearn built-in datasets
    "sklearn-classic": {
        "type": "sklearn",
        "datasets": ["iris", "wine", "breast_cancer", "digits", "diabetes"],
        "description": "Classic sklearn toy datasets",
    },
    "sklearn-classification": {
        "type": "sklearn",
        "datasets": ["iris", "wine", "breast_cancer", "digits"],
        "description": "Sklearn classification datasets",
    },
    "sklearn-regression": {
        "type": "sklearn",
        "datasets": ["diabetes", "california_housing"],
        "description": "Sklearn regression datasets",
    },
    # Popular UCI datasets via OpenML
    # NOTE(review): these values are handed to the loader as OpenML *task*
    # IDs, but several of them (e.g. 44, 54, 182, 188, 1461-1510) look like
    # the OpenML *dataset* IDs of the named datasets -- confirm on openml.org
    # that each ID resolves to the intended task.
    "uci-popular": {
        "type": "openml",
        "task_ids": [
            3,      # kr-vs-kp
            6,      # letter
            11,     # balance-scale
            12,     # mfeat-factors
            14,     # mfeat-fourier
            16,     # mfeat-karhunen
            18,     # mfeat-morphological
            22,     # mfeat-zernike
            28,     # optdigits
            32,     # pendigits
            37,     # diabetes
            44,     # spambase
            54,     # vehicle
            182,    # satimage
            188,    # eucalyptus
            1461,   # bank-marketing
            1464,   # blood-transfusion
            1480,   # ilpd
            1494,   # qsar-biodeg
            1510,   # wdbc
        ],
        "description": "Popular UCI ML Repository datasets",
    },
    # Small datasets for quick testing
    "quick-test": {
        "type": "sklearn",
        "datasets": ["iris", "wine", "breast_cancer"],
        "description": "Quick test suite with small datasets",
    },
    # Grinsztajn benchmark (NeurIPS 2022) — full 45 datasets across 4 suites
    # Reference: "Why do tree-based models still outperform deep learning
    # on typical tabular data?" (Grinsztajn et al., NeurIPS 2022)
    "grinsztajn": {
        "type": "openml",
        "suite_ids": [
            337,  # Classification on numerical features
            334,  # Classification on numerical + categorical features
            336,  # Regression on numerical features
            335,  # Regression on numerical + categorical features
        ],
        "description": "Grinsztajn et al. NeurIPS 2022 benchmark (~45 datasets, classification + regression)",
    },
    # Grinsztajn classification-only subset
    "grinsztajn-classif": {
        "type": "openml",
        "suite_ids": [
            337,  # Classification on numerical features
            334,  # Classification on numerical + categorical features
        ],
        "description": "Grinsztajn NeurIPS 2022 classification datasets only",
    },
    # Grinsztajn regression-only subset
    "grinsztajn-regression": {
        "type": "openml",
        "suite_ids": [
            336,  # Regression on numerical features
            335,  # Regression on numerical + categorical features
        ],
        "description": "Grinsztajn NeurIPS 2022 regression datasets only",
    },
}



class SuiteLoader:
    """Load benchmark datasets from various sources.

    Supports OpenML benchmark suites, sklearn built-in datasets, and custom
    lists of OpenML task IDs, and yields every dataset as a ``DatasetInfo``.

    Parameters
    ----------
    suite : str or list[int]
        Suite name (e.g., "OpenML-CC18") or list of OpenML task IDs.
    max_datasets : int, optional
        Maximum number of datasets to load.  The cap applies to the total
        number of datasets yielded by ``load()``, also for suites that are
        assembled from several OpenML suites.
    max_samples : int, optional
        Maximum samples per dataset (larger datasets are randomly sampled
        without replacement).
    max_features : int, optional
        Maximum features per dataset (the highest-variance columns are kept).
    cache_dir : str, optional
        Directory for caching downloaded datasets.  Stored on the instance;
        not used by the code in this class.
    random_state : int, default=42
        Random seed for sampling.
    verbose : bool, default=True
        Enable verbose output.

    Examples
    --------
    >>> loader = SuiteLoader(suite="sklearn-classic")
    >>> for dataset in loader.load():
    ...     print(f"{dataset.name}: {dataset.n_samples} samples, {dataset.n_features} features")

    >>> loader = SuiteLoader(suite="OpenML-CC18", max_datasets=10)
    >>> datasets = list(loader.load())
    """

    def __init__(
        self,
        suite: str | list[int] = "sklearn-classic",
        max_datasets: int | None = None,
        max_samples: int | None = None,
        max_features: int | None = None,
        cache_dir: str | None = None,
        random_state: int = 42,
        verbose: bool = True,
    ):
        self.suite = suite
        self.max_datasets = max_datasets
        self.max_samples = max_samples
        self.max_features = max_features
        self.cache_dir = cache_dir
        self.random_state = random_state
        self.verbose = verbose

        # One generator per loader, so successive row subsamples draw from a
        # single seeded stream.
        self._rng = np.random.RandomState(random_state)

    def _log(self, message: str) -> None:
        """Print message if verbose."""
        if self.verbose:
            print(f"[SuiteLoader] {message}")

    def load(self) -> Generator[DatasetInfo, None, None]:
        """Load datasets from the suite.

        Yields
        ------
        DatasetInfo
            Dataset information and data.

        Raises
        ------
        ValueError
            If ``suite`` is neither a list nor a key of ``BUILTIN_SUITES``
            (raised when iteration starts, because this is a generator).
        ImportError
            If an OpenML suite is requested but ``openml`` is not installed.
        """
        from itertools import islice

        datasets = self._iter_suite()
        if self.max_datasets:
            # The per-source loaders each truncate their own ID list, which
            # lets multi-suite configurations exceed the limit; cap the
            # combined stream so max_datasets bounds the total.
            datasets = islice(datasets, self.max_datasets)
        yield from datasets

    def _iter_suite(self) -> Generator[DatasetInfo, None, None]:
        """Dispatch on the suite specification and yield its datasets."""
        if isinstance(self.suite, list):
            # List of OpenML task IDs
            yield from self._load_openml_tasks(self.suite)
        elif self.suite in BUILTIN_SUITES:
            suite_config = BUILTIN_SUITES[self.suite]
            if suite_config["type"] == "openml":
                if "suite_ids" in suite_config:
                    for sid in suite_config["suite_ids"]:
                        yield from self._load_openml_suite(sid)
                elif "suite_id" in suite_config:
                    yield from self._load_openml_suite(suite_config["suite_id"])
                elif "task_ids" in suite_config:
                    yield from self._load_openml_tasks(suite_config["task_ids"])
            elif suite_config["type"] == "sklearn":
                yield from self._load_sklearn_datasets(suite_config["datasets"])
            elif suite_config["type"] == "mixed":
                # Mixed suite with both OpenML and sklearn datasets
                if "openml_task_ids" in suite_config:
                    yield from self._load_openml_tasks(suite_config["openml_task_ids"])
                if "sklearn_datasets" in suite_config:
                    yield from self._load_sklearn_datasets(suite_config["sklearn_datasets"])
        else:
            raise ValueError(
                f"Unknown suite: {self.suite}. "
                f"Available: {list(BUILTIN_SUITES.keys())} or list of OpenML task IDs"
            )

    def _load_openml_suite(self, suite_id: int) -> Generator[DatasetInfo, None, None]:
        """Load datasets from an OpenML benchmark suite.

        Fetching the suite definition is attempted up to five times with
        exponential back-off (10 s, 20 s, 40 s, 80 s), but only for errors
        whose message mentions "connection"; anything else is re-raised
        immediately.
        """
        if not HAS_OPENML:
            raise ImportError(
                "openml is required for OpenML suites. "
                "Install with: pip install openml"
            )

        self._log(f"Loading OpenML suite {suite_id}...")

        import time as _time
        for _attempt in range(5):
            try:
                suite = openml.study.get_suite(suite_id)
                break
            except Exception as e:
                if _attempt < 4 and "connection" in str(e).lower():
                    wait = 10 * (2 ** _attempt)
                    self._log(f"OpenML server error, retrying in {wait}s... ({e})")
                    _time.sleep(wait)
                else:
                    raise
        task_ids = suite.tasks

        if self.max_datasets:
            task_ids = task_ids[:self.max_datasets]

        self._log(f"Found {len(task_ids)} tasks")

        yield from self._load_openml_tasks(task_ids)

    def _load_openml_tasks(self, task_ids: list[int]) -> Generator[DatasetInfo, None, None]:
        """Load datasets from OpenML task IDs.

        Each task is attempted up to three times (waiting 10 s, then 20 s,
        between attempts).  A task that still fails is reported through a
        warning and the verbose log, then skipped, so one bad task never
        aborts the whole suite.
        """
        if not HAS_OPENML:
            raise ImportError(
                "openml is required for OpenML datasets. "
                "Install with: pip install openml"
            )

        if self.max_datasets:
            task_ids = task_ids[:self.max_datasets]

        import time as _time
        for i, task_id in enumerate(task_ids):
            self._log(f"Loading task {task_id} ({i+1}/{len(task_ids)})...")
            dataset_info = None
            for _attempt in range(3):
                try:
                    # Call the raising variant: the public wrapper swallows
                    # every exception, which would make this retry loop
                    # give up after the first attempt.
                    dataset_info = self._fetch_openml_task(task_id)
                    break
                except Exception as e:
                    if _attempt < 2:
                        wait = 10 * (2 ** _attempt)
                        self._log(f"Task {task_id} failed ({e}), retrying in {wait}s...")
                        _time.sleep(wait)
                    else:
                        self._log(f"Failed to load task {task_id} after 3 attempts: {e}")
                        warnings.warn(f"Failed to load OpenML task {task_id}: {e}")
            if dataset_info is not None:
                yield dataset_info

    def _load_openml_task(self, task_id: int) -> DatasetInfo | None:
        """Load a single OpenML task, returning ``None`` on any failure.

        A failure is reported with ``warnings.warn`` instead of being raised.
        """
        try:
            return self._fetch_openml_task(task_id)
        except Exception as e:
            warnings.warn(f"Failed to load OpenML task {task_id}: {e}")
            return None

    def _fetch_openml_task(self, task_id: int) -> DatasetInfo:
        """Download one OpenML task and convert it; errors propagate."""
        task = openml.tasks.get_task(task_id)
        dataset = task.get_dataset()

        # Get data
        X, y, categorical_indicator, feature_names = dataset.get_data(
            target=dataset.default_target_attribute,
            dataset_format="array",
        )

        # Densify sparse input, then replace NaN and +/-inf with 0.
        if hasattr(X, 'toarray'):
            X = X.toarray()
        X = np.asarray(X, dtype=np.float64)
        X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

        # Encode string/object targets as integer labels.
        y = np.asarray(y)
        if y.dtype.kind in ('U', 'S', 'O'):
            y = LabelEncoder().fit_transform(y.astype(str))
        y = np.nan_to_num(y, nan=0.0)

        # Determine task type (compare .value — OpenML enum != int)
        type_id = task.task_type_id
        if hasattr(type_id, "value"):
            type_id = type_id.value
        if type_id == 1:  # Classification
            n_classes = len(np.unique(y))
            task_type = TaskType.MULTICLASS if n_classes > 2 else TaskType.CLASSIFICATION
        elif type_id == 2:  # Regression
            task_type = TaskType.REGRESSION
        else:
            # Any other OpenML task type is treated as classification.
            task_type = TaskType.CLASSIFICATION

        # Apply size limits, remembering which columns survived so the
        # per-feature metadata below stays aligned with X.
        X, y, kept_columns = self._limit_size(X, y)
        n_features = X.shape[1]

        # Handle categorical indicator
        if categorical_indicator is None:
            categorical_indicator = [False] * n_features
        else:
            categorical_indicator = self._take_columns(
                categorical_indicator, kept_columns, n_features
            )

        # Handle feature names
        if feature_names is None:
            feature_names = [f"f{i}" for i in range(n_features)]
        else:
            feature_names = self._take_columns(feature_names, kept_columns, n_features)

        # Try to get CV splits (best effort: any failure falls back to the
        # splits DatasetInfo generates itself).
        cv_splits = None
        try:
            splits = task.get_split()
            cv_splits = []
            for fold in range(splits.get_maximum_folds()):
                train_idx, test_idx = splits.get(fold=fold)
                # Filter indices to valid range
                train_idx = train_idx[train_idx < len(y)]
                test_idx = test_idx[test_idx < len(y)]
                if len(train_idx) > 0 and len(test_idx) > 0:
                    cv_splits.append((train_idx, test_idx))
        except Exception as e:
            self._log(f"No predefined CV splits for task {task_id} ({e}); using generated splits")

        return DatasetInfo(
            name=dataset.name,
            task_type=task_type,
            X=X,
            y=y,
            feature_names=feature_names,
            categorical_indicator=categorical_indicator,
            source="openml",
            openml_id=dataset.dataset_id,
            cv_splits=cv_splits if cv_splits else None,
            metadata={
                "task_id": task_id,
                "openml_url": f"https://www.openml.org/d/{dataset.dataset_id}",
            },
        )

    def _load_sklearn_datasets(
        self,
        dataset_names: list[str],
    ) -> Generator[DatasetInfo, None, None]:
        """Load sklearn built-in datasets.

        Unknown names and datasets whose loader raises are logged and
        skipped.
        """
        from sklearn import datasets

        sklearn_loaders = {
            "iris": (datasets.load_iris, TaskType.MULTICLASS),
            "wine": (datasets.load_wine, TaskType.MULTICLASS),
            "breast_cancer": (datasets.load_breast_cancer, TaskType.CLASSIFICATION),
            "digits": (datasets.load_digits, TaskType.MULTICLASS),
            "diabetes": (datasets.load_diabetes, TaskType.REGRESSION),
            "california_housing": (datasets.fetch_california_housing, TaskType.REGRESSION),
            "covtype": (datasets.fetch_covtype, TaskType.MULTICLASS),
        }

        if self.max_datasets:
            dataset_names = dataset_names[:self.max_datasets]

        for name in dataset_names:
            if name not in sklearn_loaders:
                self._log(f"Unknown sklearn dataset: {name}")
                continue

            try:
                self._log(f"Loading sklearn dataset: {name}")
                loader, task_type = sklearn_loaders[name]
                data = loader()

                X = np.asarray(data.data, dtype=np.float64)
                y = np.asarray(data.target)

                # Apply size limits and keep the feature names aligned with
                # the columns that were actually retained.
                X, y, kept_columns = self._limit_size(X, y)

                if hasattr(data, 'feature_names'):
                    feature_names = self._take_columns(
                        data.feature_names, kept_columns, X.shape[1]
                    )
                else:
                    feature_names = None

                yield DatasetInfo(
                    name=name,
                    task_type=task_type,
                    X=X,
                    y=y,
                    feature_names=feature_names or [f"f{i}" for i in range(X.shape[1])],
                    categorical_indicator=[False] * X.shape[1],
                    source="sklearn",
                    metadata={"sklearn_name": name},
                )

            except Exception as e:
                self._log(f"Failed to load {name}: {e}")
                continue

    def _apply_size_limits(
        self,
        X: np.ndarray,
        y: np.ndarray,
    ) -> tuple[np.ndarray, np.ndarray]:
        """Apply sample and feature limits and return the reduced ``(X, y)``."""
        X, y, _ = self._limit_size(X, y)
        return X, y

    def _limit_size(
        self,
        X: np.ndarray,
        y: np.ndarray,
    ) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]:
        """Apply sample and feature limits, reporting the retained columns.

        Returns
        -------
        tuple
            ``(X, y, kept_columns)``.  ``kept_columns`` holds the original
            column index of every column of the returned ``X`` (in the
            returned order), or ``None`` when no columns were dropped.
        """
        kept_columns = None

        # Sample limit: random rows without replacement.
        if self.max_samples and X.shape[0] > self.max_samples:
            idx = self._rng.choice(X.shape[0], self.max_samples, replace=False)
            X = X[idx]
            y = y[idx]

        # Feature limit: keep the max_features highest-variance columns.
        # argsort is ascending, so the tail holds the largest variances.
        if self.max_features and X.shape[1] > self.max_features:
            variances = np.var(X, axis=0)
            kept_columns = np.argsort(variances)[-self.max_features:]
            X = X[:, kept_columns]

        return X, y, kept_columns

    @staticmethod
    def _take_columns(values, kept_columns, n_features: int) -> list:
        """Align per-feature metadata with the columns kept by ``_limit_size``.

        ``values`` is reordered/filtered by ``kept_columns`` when that is not
        ``None`` and every index is valid for ``values``; the result is then
        truncated to ``n_features`` entries.
        """
        values = list(values)
        if kept_columns is not None and len(values) > int(np.max(kept_columns)):
            values = [values[i] for i in kept_columns]
        return values[:n_features]

    @staticmethod
    def list_suites() -> dict[str, str]:
        """List available benchmark suites as ``{name: description}``."""
        return {name: config["description"] for name, config in BUILTIN_SUITES.items()}

    @staticmethod
    def get_suite_info(suite_name: str) -> dict[str, Any]:
        """Get detailed information about a suite.

        Returns an independent deep copy, so mutating the result (including
        its nested ID / dataset lists) cannot alter ``BUILTIN_SUITES``.

        Raises
        ------
        ValueError
            If ``suite_name`` is not a built-in suite.
        """
        import copy

        if suite_name not in BUILTIN_SUITES:
            raise ValueError(f"Unknown suite: {suite_name}")
        return copy.deepcopy(BUILTIN_SUITES[suite_name])