Source code for endgame.benchmark.loader

from __future__ import annotations

"""Dataset loading utilities for benchmark suites.

Provides unified access to benchmark datasets from OpenML, sklearn, and custom sources.
"""

import warnings
from collections.abc import Generator
from dataclasses import dataclass, field
from enum import Enum
from typing import Any

import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Lazy imports for optional dependencies
HAS_OPENML = False
HAS_PANDAS = False

try:
    import openml
    HAS_OPENML = True
except ImportError:
    pass

try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    pass


class TaskType(str, Enum):
    """Type of machine learning task."""
    CLASSIFICATION = "classification"
    REGRESSION = "regression"
    MULTICLASS = "multiclass"
    MULTILABEL = "multilabel"


[docs] @dataclass class DatasetInfo: """Container for dataset information and data. Attributes ---------- name : str Name of the dataset. task_type : TaskType Type of ML task. X : np.ndarray Feature matrix. y : np.ndarray Target variable. feature_names : List[str] Names of features. categorical_indicator : List[bool] Boolean mask indicating categorical features. n_samples : int Number of samples. n_features : int Number of features. n_classes : int Number of classes (for classification). class_distribution : Dict[Any, int] Distribution of classes. source : str Source of the dataset (e.g., 'openml', 'sklearn'). openml_id : Optional[int] OpenML dataset ID if applicable. cv_splits : Optional[List[Tuple[np.ndarray, np.ndarray]]] Predefined cross-validation splits. metadata : Dict[str, Any] Additional metadata. """ name: str task_type: TaskType X: np.ndarray y: np.ndarray feature_names: list[str] = field(default_factory=list) categorical_indicator: list[bool] = field(default_factory=list) n_samples: int = 0 n_features: int = 0 n_classes: int = 0 class_distribution: dict[Any, int] = field(default_factory=dict) source: str = "unknown" openml_id: int | None = None cv_splits: list[tuple[np.ndarray, np.ndarray]] | None = None metadata: dict[str, Any] = field(default_factory=dict) def __post_init__(self): """Compute derived attributes.""" self.n_samples = self.X.shape[0] self.n_features = self.X.shape[1] if self.X.ndim > 1 else 1 if not self.feature_names: self.feature_names = [f"f{i}" for i in range(self.n_features)] if not self.categorical_indicator: self.categorical_indicator = [False] * self.n_features if self.task_type in (TaskType.CLASSIFICATION, TaskType.MULTICLASS): unique, counts = np.unique(self.y, return_counts=True) self.n_classes = len(unique) self.class_distribution = dict(zip(unique.tolist(), counts.tolist())) @property def n_categorical(self) -> int: """Number of categorical features.""" return sum(self.categorical_indicator) @property def n_numerical(self) -> int: """Number of numerical features.""" return self.n_features - self.n_categorical @property def imbalance_ratio(self) -> float: """Class imbalance ratio (max_count / min_count).""" if self.task_type == TaskType.REGRESSION: return 1.0 if not self.class_distribution: return 1.0 counts = list(self.class_distribution.values()) return max(counts) / min(counts) if min(counts) > 0 else float('inf')
[docs] def get_cv_splits( self, n_splits: int = 10, shuffle: bool = True, random_state: int = 42, ) -> list[tuple[np.ndarray, np.ndarray]]: """Get cross-validation splits. Returns predefined splits if available, otherwise generates new ones. """ if self.cv_splits is not None: return self.cv_splits if self.task_type in (TaskType.CLASSIFICATION, TaskType.MULTICLASS): cv = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state) else: cv = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state) return list(cv.split(self.X, self.y))
# Predefined benchmark suites BUILTIN_SUITES: dict[str, dict[str, Any]] = { # OpenML suites "OpenML-CC18": { "type": "openml", "suite_id": 99, "description": "OpenML-CC18 classification benchmark (72 datasets)", }, "OpenML-CTR23": { "type": "openml", "suite_id": 336, "description": "OpenML-CTR23 regression benchmark", }, "AutoML-Benchmark": { "type": "openml", "suite_id": 271, "description": "AutoML benchmark suite", }, # Sklearn built-in datasets "sklearn-classic": { "type": "sklearn", "datasets": ["iris", "wine", "breast_cancer", "digits", "diabetes"], "description": "Classic sklearn toy datasets", }, "sklearn-classification": { "type": "sklearn", "datasets": ["iris", "wine", "breast_cancer", "digits"], "description": "Sklearn classification datasets", }, "sklearn-regression": { "type": "sklearn", "datasets": ["diabetes", "california_housing"], "description": "Sklearn regression datasets", }, # Popular UCI datasets via OpenML "uci-popular": { "type": "openml", "task_ids": [ 3, # kr-vs-kp 6, # letter 11, # balance-scale 12, # mfeat-factors 14, # mfeat-fourier 16, # mfeat-karhunen 18, # mfeat-morphological 22, # mfeat-zernike 28, # optdigits 32, # pendigits 37, # diabetes 44, # spambase 54, # vehicle 182, # satimage 188, # eucalyptus 1461, # bank-marketing 1464, # blood-transfusion 1480, # ilpd 1494, # qsar-biodeg 1510, # wdbc ], "description": "Popular UCI ML Repository datasets", }, # Small datasets for quick testing "quick-test": { "type": "sklearn", "datasets": ["iris", "wine", "breast_cancer"], "description": "Quick test suite with small datasets", }, # Grinsztajn benchmark (NeurIPS 2022) — full 45 datasets across 4 suites # Reference: "Why do tree-based models still outperform deep learning # on typical tabular data?" (Grinsztajn et al., NeurIPS 2022) "grinsztajn": { "type": "openml", "suite_ids": [ 337, # Classification on numerical features 334, # Classification on numerical + categorical features 336, # Regression on numerical features 335, # Regression on numerical + categorical features ], "description": "Grinsztajn et al. NeurIPS 2022 benchmark (~45 datasets, classification + regression)", }, # Grinsztajn classification-only subset "grinsztajn-classif": { "type": "openml", "suite_ids": [ 337, # Classification on numerical features 334, # Classification on numerical + categorical features ], "description": "Grinsztajn NeurIPS 2022 classification datasets only", }, # Grinsztajn regression-only subset "grinsztajn-regression": { "type": "openml", "suite_ids": [ 336, # Regression on numerical features 335, # Regression on numerical + categorical features ], "description": "Grinsztajn NeurIPS 2022 regression datasets only", }, }
[docs] class SuiteLoader: """Load benchmark datasets from various sources. Supports OpenML benchmark suites, sklearn built-in datasets, and custom datasets. Provides standardized interface for benchmark experiments. Parameters ---------- suite : str or List[int] Suite name (e.g., "OpenML-CC18") or list of OpenML task IDs. max_datasets : int, optional Maximum number of datasets to load. max_samples : int, optional Maximum samples per dataset (larger datasets are sampled). max_features : int, optional Maximum features per dataset. cache_dir : str, optional Directory for caching downloaded datasets. random_state : int, default=42 Random seed for sampling. verbose : bool, default=True Enable verbose output. Examples -------- >>> loader = SuiteLoader(suite="sklearn-classic") >>> for dataset in loader.load(): ... print(f"{dataset.name}: {dataset.n_samples} samples, {dataset.n_features} features") >>> loader = SuiteLoader(suite="OpenML-CC18", max_datasets=10) >>> datasets = list(loader.load()) """ def __init__( self, suite: str | list[int] = "sklearn-classic", max_datasets: int | None = None, max_samples: int | None = None, max_features: int | None = None, cache_dir: str | None = None, random_state: int = 42, verbose: bool = True, ): self.suite = suite self.max_datasets = max_datasets self.max_samples = max_samples self.max_features = max_features self.cache_dir = cache_dir self.random_state = random_state self.verbose = verbose self._rng = np.random.RandomState(random_state) def _log(self, message: str) -> None: """Print message if verbose.""" if self.verbose: print(f"[SuiteLoader] {message}")
[docs] def load(self) -> Generator[DatasetInfo, None, None]: """Load datasets from the suite. Yields ------ DatasetInfo Dataset information and data. """ if isinstance(self.suite, list): # List of OpenML task IDs yield from self._load_openml_tasks(self.suite) elif self.suite in BUILTIN_SUITES: suite_config = BUILTIN_SUITES[self.suite] if suite_config["type"] == "openml": if "suite_ids" in suite_config: for sid in suite_config["suite_ids"]: yield from self._load_openml_suite(sid) elif "suite_id" in suite_config: yield from self._load_openml_suite(suite_config["suite_id"]) elif "task_ids" in suite_config: yield from self._load_openml_tasks(suite_config["task_ids"]) elif suite_config["type"] == "sklearn": yield from self._load_sklearn_datasets(suite_config["datasets"]) elif suite_config["type"] == "mixed": # Mixed suite with both OpenML and sklearn datasets if "openml_task_ids" in suite_config: yield from self._load_openml_tasks(suite_config["openml_task_ids"]) if "sklearn_datasets" in suite_config: yield from self._load_sklearn_datasets(suite_config["sklearn_datasets"]) else: raise ValueError( f"Unknown suite: {self.suite}. " f"Available: {list(BUILTIN_SUITES.keys())} or list of OpenML task IDs" )
def _load_openml_suite(self, suite_id: int) -> Generator[DatasetInfo, None, None]: """Load datasets from an OpenML benchmark suite.""" if not HAS_OPENML: raise ImportError( "openml is required for OpenML suites. " "Install with: pip install openml" ) self._log(f"Loading OpenML suite {suite_id}...") import time as _time for _attempt in range(5): try: suite = openml.study.get_suite(suite_id) break except Exception as e: if _attempt < 4 and "connection" in str(e).lower(): wait = 10 * (2 ** _attempt) self._log(f"OpenML server error, retrying in {wait}s... ({e})") _time.sleep(wait) else: raise task_ids = suite.tasks if self.max_datasets: task_ids = task_ids[:self.max_datasets] self._log(f"Found {len(task_ids)} tasks") yield from self._load_openml_tasks(task_ids) def _load_openml_tasks(self, task_ids: list[int]) -> Generator[DatasetInfo, None, None]: """Load datasets from OpenML task IDs.""" if not HAS_OPENML: raise ImportError( "openml is required for OpenML datasets. " "Install with: pip install openml" ) if self.max_datasets: task_ids = task_ids[:self.max_datasets] import time as _time for i, task_id in enumerate(task_ids): self._log(f"Loading task {task_id} ({i+1}/{len(task_ids)})...") dataset_info = None for _attempt in range(3): try: dataset_info = self._load_openml_task(task_id) break except Exception as e: if _attempt < 2: wait = 10 * (2 ** _attempt) self._log(f"Task {task_id} failed ({e}), retrying in {wait}s...") _time.sleep(wait) else: self._log(f"Failed to load task {task_id} after 3 attempts: {e}") if dataset_info is not None: yield dataset_info def _load_openml_task(self, task_id: int) -> DatasetInfo | None: """Load a single OpenML task.""" try: task = openml.tasks.get_task(task_id) dataset = task.get_dataset() # Get data X, y, categorical_indicator, feature_names = dataset.get_data( target=dataset.default_target_attribute, dataset_format="array", ) # Handle missing values if hasattr(X, 'toarray'): X = X.toarray() X = np.asarray(X, dtype=np.float64) X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0) # Encode target if needed y = np.asarray(y) if y.dtype == object or (hasattr(y, 'dtype') and y.dtype.kind in ['U', 'S', 'O']): le = LabelEncoder() y = le.fit_transform(y.astype(str)) y = np.nan_to_num(y, nan=0.0) # Determine task type (compare .value — OpenML enum != int) type_id = task.task_type_id if hasattr(type_id, "value"): type_id = type_id.value if type_id == 1: # Classification n_classes = len(np.unique(y)) task_type = TaskType.MULTICLASS if n_classes > 2 else TaskType.CLASSIFICATION elif type_id == 2: # Regression task_type = TaskType.REGRESSION else: task_type = TaskType.CLASSIFICATION # Apply size limits X, y = self._apply_size_limits(X, y) # Handle categorical indicator if categorical_indicator is None: categorical_indicator = [False] * X.shape[1] else: categorical_indicator = list(categorical_indicator)[:X.shape[1]] # Handle feature names if feature_names is None: feature_names = [f"f{i}" for i in range(X.shape[1])] else: feature_names = list(feature_names)[:X.shape[1]] # Try to get CV splits cv_splits = None try: splits = task.get_split() cv_splits = [] for fold in range(splits.get_maximum_folds()): train_idx, test_idx = splits.get(fold=fold) # Filter indices to valid range train_idx = train_idx[train_idx < len(y)] test_idx = test_idx[test_idx < len(y)] if len(train_idx) > 0 and len(test_idx) > 0: cv_splits.append((train_idx, test_idx)) except Exception: pass # Use default CV splits return DatasetInfo( name=dataset.name, task_type=task_type, X=X, y=y, feature_names=feature_names, categorical_indicator=categorical_indicator, source="openml", openml_id=dataset.dataset_id, cv_splits=cv_splits if cv_splits else None, metadata={ "task_id": task_id, "openml_url": f"https://www.openml.org/d/{dataset.dataset_id}", }, ) except Exception as e: warnings.warn(f"Failed to load OpenML task {task_id}: {e}") return None def _load_sklearn_datasets( self, dataset_names: list[str], ) -> Generator[DatasetInfo, None, None]: """Load sklearn built-in datasets.""" from sklearn import datasets sklearn_loaders = { "iris": (datasets.load_iris, TaskType.MULTICLASS), "wine": (datasets.load_wine, TaskType.MULTICLASS), "breast_cancer": (datasets.load_breast_cancer, TaskType.CLASSIFICATION), "digits": (datasets.load_digits, TaskType.MULTICLASS), "diabetes": (datasets.load_diabetes, TaskType.REGRESSION), "california_housing": (datasets.fetch_california_housing, TaskType.REGRESSION), "covtype": (datasets.fetch_covtype, TaskType.MULTICLASS), } if self.max_datasets: dataset_names = dataset_names[:self.max_datasets] for name in dataset_names: if name not in sklearn_loaders: self._log(f"Unknown sklearn dataset: {name}") continue try: self._log(f"Loading sklearn dataset: {name}") loader, task_type = sklearn_loaders[name] data = loader() X = np.asarray(data.data, dtype=np.float64) y = np.asarray(data.target) # Apply size limits X, y = self._apply_size_limits(X, y) feature_names = list(data.feature_names) if hasattr(data, 'feature_names') else None yield DatasetInfo( name=name, task_type=task_type, X=X, y=y, feature_names=feature_names or [f"f{i}" for i in range(X.shape[1])], categorical_indicator=[False] * X.shape[1], source="sklearn", metadata={"sklearn_name": name}, ) except Exception as e: self._log(f"Failed to load {name}: {e}") continue def _apply_size_limits( self, X: np.ndarray, y: np.ndarray, ) -> tuple[np.ndarray, np.ndarray]: """Apply sample and feature limits.""" # Sample limit if self.max_samples and X.shape[0] > self.max_samples: idx = self._rng.choice(X.shape[0], self.max_samples, replace=False) X = X[idx] y = y[idx] # Feature limit if self.max_features and X.shape[1] > self.max_features: # Use variance-based selection variances = np.var(X, axis=0) top_features = np.argsort(variances)[-self.max_features:] X = X[:, top_features] return X, y
[docs] @staticmethod def list_suites() -> dict[str, str]: """List available benchmark suites.""" return {name: config["description"] for name, config in BUILTIN_SUITES.items()}
[docs] @staticmethod def get_suite_info(suite_name: str) -> dict[str, Any]: """Get detailed information about a suite.""" if suite_name not in BUILTIN_SUITES: raise ValueError(f"Unknown suite: {suite_name}") return BUILTIN_SUITES[suite_name].copy()