# Source code for endgame.tune.spaces

"""Predefined hyperparameter search spaces for common models."""

from __future__ import annotations

import copy
from typing import Any

# =============================================================================
# LightGBM Search Spaces
# =============================================================================

# Why n_estimators is searched over 100-1000:
# - With early_stopping_rounds=50, models converge well before 1000 trees.
# - Gorishniy et al. (2021), "Revisiting Deep Learning for Tabular Data", uses
#   2000 trees, but with early stopping the effective ensemble sizes are similar.
# - Halving the ceiling halves worst-case training time, while early stopping
#   still handles convergence.
# - An upper bound of 1000 still allows complex patterns; early stopping
#   prevents wasted trees.
#
# Why subsample_freq is searched together with subsample:
# - LightGBM only performs row bagging when the bagging frequency is non-zero,
#   and its default is 0. Without a positive subsample_freq the sampled
#   subsample value has no effect on training.
LGBM_STANDARD_SPACE: dict[str, dict[str, Any]] = {
    "learning_rate": {"type": "loguniform", "low": 0.005, "high": 0.2},
    "n_estimators": {"type": "int", "low": 100, "high": 1000},
    "num_leaves": {"type": "int", "low": 16, "high": 128},
    "max_depth": {"type": "int", "low": 3, "high": 10},
    "min_child_samples": {"type": "int", "low": 5, "high": 100},
    "subsample": {"type": "float", "low": 0.5, "high": 1.0},
    # Lower bound is 1 (never 0) so that bagging is always switched on.
    "subsample_freq": {"type": "int", "low": 1, "high": 7},
    "colsample_bytree": {"type": "float", "low": 0.5, "high": 1.0},
    "reg_alpha": {"type": "loguniform", "low": 1e-8, "high": 10.0},
    "reg_lambda": {"type": "loguniform", "low": 1e-8, "high": 10.0},
}

# Wide LightGBM space: broader intervals for every shared parameter plus two
# extra regularisers (min_split_gain, path_smooth).
#
# subsample_freq is searched together with subsample because LightGBM only
# performs row bagging when the bagging frequency is non-zero (default 0);
# without it the sampled subsample value has no effect on training.
LGBM_LARGE_SPACE: dict[str, dict[str, Any]] = {
    "learning_rate": {"type": "loguniform", "low": 0.001, "high": 0.3},
    "n_estimators": {"type": "int", "low": 50, "high": 5000},
    "num_leaves": {"type": "int", "low": 8, "high": 256},
    "max_depth": {"type": "int", "low": 2, "high": 16},
    "min_child_samples": {"type": "int", "low": 1, "high": 200},
    "subsample": {"type": "float", "low": 0.3, "high": 1.0},
    # Lower bound is 1 (never 0) so that bagging is always switched on.
    "subsample_freq": {"type": "int", "low": 1, "high": 7},
    "colsample_bytree": {"type": "float", "low": 0.3, "high": 1.0},
    "reg_alpha": {"type": "loguniform", "low": 1e-10, "high": 100.0},
    "reg_lambda": {"type": "loguniform", "low": 1e-10, "high": 100.0},
    "min_split_gain": {"type": "loguniform", "low": 1e-8, "high": 1.0},
    "path_smooth": {"type": "float", "low": 0.0, "high": 10.0},
}

# Reduced LightGBM space: only the learning rate, tree count, leaf count and
# depth are searched. Each row is (parameter, distribution type, low, high).
LGBM_FAST_SPACE: dict[str, dict[str, Any]] = {
    param: {"type": kind, "low": low, "high": high}
    for param, kind, low, high in (
        ("learning_rate", "loguniform", 0.01, 0.3),
        ("n_estimators", "int", 50, 500),
        ("num_leaves", "int", 16, 64),
        ("max_depth", "int", 4, 8),
    )
}

# =============================================================================
# XGBoost Search Spaces
# =============================================================================

# As with the LightGBM spaces, early stopping makes a high n_estimators
# ceiling wasteful. max_depth stops at 10, following the Grinsztajn et al.
# finding that deeper trees rarely help.
# Each row is (parameter, distribution type, low, high).
XGB_STANDARD_SPACE: dict[str, dict[str, Any]] = {
    param: {"type": kind, "low": low, "high": high}
    for param, kind, low, high in (
        ("learning_rate", "loguniform", 0.005, 0.2),
        ("n_estimators", "int", 100, 1000),
        ("max_depth", "int", 3, 10),
        ("min_child_weight", "int", 1, 20),
        ("subsample", "float", 0.5, 1.0),
        ("colsample_bytree", "float", 0.5, 1.0),
        ("colsample_bylevel", "float", 0.5, 1.0),
        ("reg_alpha", "loguniform", 1e-8, 10.0),
        ("reg_lambda", "loguniform", 1e-8, 10.0),
        ("gamma", "loguniform", 1e-8, 5.0),
    )
}

# Wide XGBoost space: broader intervals plus per-node column sampling and
# max_delta_step. Each row is (parameter, distribution type, low, high).
XGB_LARGE_SPACE: dict[str, dict[str, Any]] = {
    param: {"type": kind, "low": low, "high": high}
    for param, kind, low, high in (
        ("learning_rate", "loguniform", 0.001, 0.3),
        ("n_estimators", "int", 50, 5000),
        ("max_depth", "int", 2, 16),
        ("min_child_weight", "int", 1, 100),
        ("subsample", "float", 0.3, 1.0),
        ("colsample_bytree", "float", 0.3, 1.0),
        ("colsample_bylevel", "float", 0.3, 1.0),
        ("colsample_bynode", "float", 0.3, 1.0),
        ("reg_alpha", "loguniform", 1e-10, 100.0),
        ("reg_lambda", "loguniform", 1e-10, 100.0),
        ("gamma", "loguniform", 1e-10, 10.0),
        ("max_delta_step", "int", 0, 10),
    )
}

# =============================================================================
# CatBoost Search Spaces
# =============================================================================

# CatBoost: depth has an outsized impact on training time, so it stops at 8.
# Ordered boosting is inherently slower, which is why tighter bounds help.
# NOTE(review): CatBoost documents bagging_temperature as a Bayesian-bootstrap
# setting - confirm the estimator's bootstrap_type is compatible.
# Each row is (parameter, distribution type, low, high).
CATBOOST_STANDARD_SPACE: dict[str, dict[str, Any]] = {
    param: {"type": kind, "low": low, "high": high}
    for param, kind, low, high in (
        ("learning_rate", "loguniform", 0.005, 0.2),
        ("iterations", "int", 100, 1000),
        ("depth", "int", 4, 8),
        ("l2_leaf_reg", "loguniform", 1e-8, 10.0),
        ("bagging_temperature", "float", 0.0, 1.0),
        ("random_strength", "float", 0.0, 10.0),
        ("border_count", "int", 32, 255),
    )
}

# Wide CatBoost space: broader intervals plus min_data_in_leaf.
# NOTE(review): CatBoost documents bagging_temperature as a Bayesian-bootstrap
# setting and min_data_in_leaf as applying to the Depthwise/Lossguide grow
# policies - confirm the estimator configuration is compatible.
# Each row is (parameter, distribution type, low, high).
CATBOOST_LARGE_SPACE: dict[str, dict[str, Any]] = {
    param: {"type": kind, "low": low, "high": high}
    for param, kind, low, high in (
        ("learning_rate", "loguniform", 0.001, 0.3),
        ("iterations", "int", 50, 5000),
        ("depth", "int", 2, 12),
        ("l2_leaf_reg", "loguniform", 1e-10, 100.0),
        ("bagging_temperature", "float", 0.0, 5.0),
        ("random_strength", "float", 0.0, 20.0),
        ("border_count", "int", 16, 255),
        ("min_data_in_leaf", "int", 1, 100),
    )
}

# =============================================================================
# Neural Network Search Spaces
# =============================================================================

# MLP space: learning rate, weight decay and dropout are continuous ranges;
# layer count is an integer range; hidden width and batch size are picked
# from fixed menus.
MLP_STANDARD_SPACE: dict[str, dict[str, Any]] = {
    "learning_rate": {
        "type": "loguniform",
        "low": 1e-5,
        "high": 1e-2,
    },
    "weight_decay": {
        "type": "loguniform",
        "low": 1e-8,
        "high": 1e-2,
    },
    "dropout": {
        "type": "float",
        "low": 0.0,
        "high": 0.5,
    },
    "hidden_dim": {
        "type": "categorical",
        "choices": [64, 128, 256, 512],
    },
    "n_layers": {
        "type": "int",
        "low": 1,
        "high": 4,
    },
    "batch_size": {
        "type": "categorical",
        "choices": [32, 64, 128, 256],
    },
}

# Transformer space: learning rate, weight decay and warmup ratio are
# continuous ranges; epoch count is an integer range; batch size and maximum
# length are picked from fixed menus.
TRANSFORMER_STANDARD_SPACE: dict[str, dict[str, Any]] = {
    "learning_rate": {
        "type": "loguniform",
        "low": 1e-6,
        "high": 5e-4,
    },
    "weight_decay": {
        "type": "loguniform",
        "low": 1e-8,
        "high": 1e-1,
    },
    "warmup_ratio": {
        "type": "float",
        "low": 0.0,
        "high": 0.2,
    },
    "num_epochs": {
        "type": "int",
        "low": 2,
        "high": 10,
    },
    "batch_size": {
        "type": "categorical",
        "choices": [8, 16, 32],
    },
    "max_length": {
        "type": "categorical",
        "choices": [256, 384, 512],
    },
}

# =============================================================================
# Forest Ensemble Search Spaces
# =============================================================================

# Rotation Forest search space.
# RotationForest takes its tree parameters from base_estimator, so only the
# ensemble-level parameters are tuned here.
# n_estimators stops at 500 because a Rotation Forest is much slower per tree
# than a GBDT.
# Each row is (parameter, distribution type, low, high).
ROTATION_FOREST_SPACE: dict[str, dict[str, Any]] = {
    param: {"type": kind, "low": low, "high": high}
    for param, kind, low, high in (
        ("n_estimators", "int", 50, 500),
        ("n_subsets", "int", 2, 4),
        ("max_features", "float", 0.5, 1.0),
    )
}

# Random Forest search space, matched to the GBDT ranges for a fair comparison.
# A Random Forest benefits less from very high n_estimators because it does
# no boosting.
RANDOM_FOREST_SPACE: dict[str, dict[str, Any]] = {
    "n_estimators": {
        "type": "int",
        "low": 100,
        "high": 1000,
    },
    "max_depth": {
        "type": "int",
        "low": 3,
        "high": 10,
    },
    "min_samples_split": {
        "type": "int",
        "low": 2,
        "high": 20,
    },
    "min_samples_leaf": {
        "type": "int",
        "low": 1,
        "high": 10,
    },
    # Mixed menu: named strategies alongside explicit fractions.
    "max_features": {
        "type": "categorical",
        "choices": ["sqrt", "log2", 0.5, 0.8, 1.0],
    },
}

# =============================================================================
# Registry
# =============================================================================

# Lookup table from public space name to its search-space dict. Boosting and
# neural spaces are keyed "<model>_<size>"; the forest spaces use the bare
# model name. get_space() resolves names against this mapping.
SPACE_REGISTRY: dict[str, dict[str, dict[str, Any]]] = {
    # LightGBM
    "lgbm_standard": LGBM_STANDARD_SPACE,
    "lgbm_large": LGBM_LARGE_SPACE,
    "lgbm_fast": LGBM_FAST_SPACE,
    # XGBoost
    "xgb_standard": XGB_STANDARD_SPACE,
    "xgb_large": XGB_LARGE_SPACE,
    # CatBoost
    "catboost_standard": CATBOOST_STANDARD_SPACE,
    "catboost_large": CATBOOST_LARGE_SPACE,
    # Neural networks
    "mlp_standard": MLP_STANDARD_SPACE,
    "transformer_standard": TRANSFORMER_STANDARD_SPACE,
    # Forest ensembles
    "rotation_forest": ROTATION_FOREST_SPACE,
    "random_forest": RANDOM_FOREST_SPACE,
}


def get_space(name: str) -> dict[str, dict[str, Any]]:
    """Get a predefined search space by name.

    The result is a deep copy of the registered space, so callers may adjust
    bounds or ``choices`` lists without altering the shared registry entry.

    Parameters
    ----------
    name : str
        Space name: 'lgbm_standard', 'xgb_standard', etc.

    Returns
    -------
    Dict
        Search space configuration.

    Raises
    ------
    ValueError
        If space name is not found.
    """
    if name not in SPACE_REGISTRY:
        raise ValueError(
            f"Unknown space '{name}'. "
            f"Available: {list(SPACE_REGISTRY.keys())}"
        )
    # A shallow .copy() would still share the per-parameter dicts (and their
    # "choices" lists) with the module-level constants.
    return copy.deepcopy(SPACE_REGISTRY[name])
def get_lgbm_space(size: str = "standard") -> dict[str, dict[str, Any]]:
    """Return the LightGBM search space registered for the given size.

    Parameters
    ----------
    size : str, default='standard'
        Space size: 'standard', 'large', 'fast'.

    Returns
    -------
    Dict
        Search space configuration.

    Raises
    ------
    ValueError
        Propagated from ``get_space`` when ``lgbm_<size>`` is not registered.
    """
    registry_key = f"lgbm_{size}"
    return get_space(registry_key)
def get_xgb_space(size: str = "standard") -> dict[str, dict[str, Any]]:
    """Return the XGBoost search space registered for the given size.

    Parameters
    ----------
    size : str, default='standard'
        Space size: 'standard', 'large'.

    Returns
    -------
    Dict
        Search space configuration.

    Raises
    ------
    ValueError
        Propagated from ``get_space`` when ``xgb_<size>`` is not registered.
    """
    registry_key = f"xgb_{size}"
    return get_space(registry_key)
def get_catboost_space(size: str = "standard") -> dict[str, dict[str, Any]]:
    """Return the CatBoost search space registered for the given size.

    Parameters
    ----------
    size : str, default='standard'
        Space size: 'standard', 'large'.

    Returns
    -------
    Dict
        Search space configuration.

    Raises
    ------
    ValueError
        Propagated from ``get_space`` when ``catboost_<size>`` is not
        registered.
    """
    registry_key = f"catboost_{size}"
    return get_space(registry_key)