# Source code for endgame.automl.presets

from __future__ import annotations

"""Preset configurations for AutoML.

This module defines preset configurations that control the behavior of
AutoML predictors, including time limits, model selection, ensembling,
and calibration settings.

Presets
-------
- best_quality: Maximum accuracy, no time limit
- high_quality: High accuracy with 4 hour default
- good_quality: Good accuracy with 1 hour default
- medium_quality: Balanced speed/accuracy with 15 min default
- fast: Quick results with 5 min default
- exhaustive: Evolutionary search over ALL models + preprocessing + ensembles,
  no time limit by default
- interpretable: Only interpretable models with 15 min default
"""

from dataclasses import dataclass, field
from typing import Any



[docs]
@dataclass
class PresetConfig:
    """Bundle of settings that defines one AutoML preset.

    All fields except ``time_allocations`` are required. Values are stored
    exactly as passed in; in particular ``model_pool`` is kept by reference
    and is not copied.

    Attributes
    ----------
    name : str
        Name of the preset.
    description : str
        Human-readable description.
    default_time_limit : int or None
        Default time limit in seconds. None means no limit. (The
        "exhaustive" preset in this module passes 0 and labels it as
        unlimited as well.)
    cv_folds : int
        Number of cross-validation folds.
    num_bag_folds : int
        Number of bagging folds (0 to disable).
    num_stack_levels : int
        Number of stacking levels (0 to disable).
    hyperparameter_tune : bool
        Whether to tune hyperparameters.
    tune_trials : int
        Number of tuning trials per model.
    ensemble_method : str
        Ensemble method, e.g. "none", "hill_climbing", "stacking". The
        presets defined in this module also use "auto".
    calibrate : bool
        Whether to calibrate probabilities.
    use_holdout : bool
        Whether to use a holdout set.
    holdout_frac : float
        Fraction of data for holdout (if use_holdout=True).
    feature_engineering : str
        Feature engineering level: "none", "light", "moderate", "aggressive".
    model_pool : list of str
        Names of the models to consider.
    time_allocations : dict
        Fraction of the time budget per pipeline stage, keyed by stage name.
        When omitted or empty it is replaced by a copy of
        ``DEFAULT_TIME_ALLOCATIONS``.
    """

    name: str
    description: str
    default_time_limit: int | None
    cv_folds: int
    num_bag_folds: int
    num_stack_levels: int
    hyperparameter_tune: bool
    tune_trials: int
    ensemble_method: str
    calibrate: bool
    use_holdout: bool
    holdout_frac: float
    feature_engineering: str
    model_pool: list[str]
    time_allocations: dict[str, float] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Fall back to the module-wide stage budget when none was supplied."""
        if self.time_allocations:
            # Caller provided a non-empty table: keep that exact object.
            return
        # Looked up at call time, so it is fine that the constant is defined
        # further down in the module. Copy so instances never share the table.
        self.time_allocations = DEFAULT_TIME_ALLOCATIONS.copy()



# Fallback share of the time budget for each pipeline stage. PresetConfig
# copies this table whenever a config is created without its own allocations.
# NOTE(review): these fractions add up to 0.95 rather than 1.00 -- confirm
# whether the remaining 5% is deliberate headroom.
DEFAULT_TIME_ALLOCATIONS: dict[str, float] = dict(
    profiling=0.01,
    quality_guardrails=0.02,
    data_cleaning=0.02,
    preprocessing=0.05,
    feature_engineering=0.03,
    data_augmentation=0.02,
    model_selection=0.04,
    model_training=0.40,
    constraint_check=0.01,
    hyperparameter_tuning=0.20,
    ensembling=0.06,
    threshold_opt=0.02,
    calibration=0.03,
    post_training=0.02,
    explainability=0.02,
)

# Every model name this module knows about, grouped by family. Held as a tuple
# so that each pool built from it below is a fresh, independent list.
_FULL_MODEL_POOL: tuple[str, ...] = (
    # GBDTs
    "lgbm", "xgb", "catboost", "ngboost",
    # Deep tabular
    "ft_transformer", "saint", "tabnet", "node", "nam", "tabular_resnet",
    "tabm", "realmlp", "grande", "gandalf", "tabr", "tabdpt", "modern_nca",
    "tab_transformer", "mlp", "embedding_mlp",
    # Custom trees
    "rotation_forest", "c50", "c50_ensemble", "oblique_forest",
    "extra_oblique_forest", "patch_oblique_forest", "honest_forest",
    "evolutionary_tree", "rf", "extra_trees", "adtree", "model_tree", "cubist",
    # Linear
    "linear", "elm", "mars",
    # Kernel
    "svm", "gp",
    # Rules
    "rulefit", "furia", "prim",
    # Bayesian
    "tan", "eskdb", "kdb", "bart", "naive_bayes", "neural_kdb", "auto_sle",
    "ebmc_classifier",
    # Interpretable
    "ebm", "ebmc", "ebmr", "gam", "node_gam", "gami_net", "corels", "slim",
    "fasterrisk", "gosdt",
    # Ordinal
    "ordinal", "logistic_at", "logistic_it", "logistic_se", "ordinal_ridge",
    "ordinal_lad",
    # Discriminant
    "lda", "qda", "knn",
    # Foundation
    "tabpfn", "tabpfn_v2", "tabpfn_25", "xrfm",
    # Symbolic
    "symbolic_regression", "symbolic_regressor",
    # Quantile
    "quantile_forest",
)

# Candidate model names per pool. "all" and "best_quality" currently hold the
# same names but remain two separate list objects.
MODEL_POOLS: dict[str, list[str]] = {
    "all": list(_FULL_MODEL_POOL),
    "best_quality": list(_FULL_MODEL_POOL),
    "high_quality": [
        "lgbm", "xgb", "catboost", "ft_transformer", "tabm", "tabnet",
        "tabpfn_v2", "tabpfn_25", "realmlp", "tabr", "gandalf",
        "rotation_forest", "rf", "extra_oblique_forest", "honest_forest",
        "linear",
    ],
    "good_quality": [
        "lgbm", "xgb", "catboost", "tabm", "realmlp", "rf", "ebm", "linear",
        "elm",
    ],
    "medium_quality": ["lgbm", "xgb", "catboost", "rf", "linear"],
    "fast": ["lgbm"],
    "interpretable": [
        # GAM-style models
        "ebm", "gam", "nam", "node_gam", "gami_net",
        # Rule-based models
        "rulefit", "furia", "corels", "prim",
        # Sparse linear / scorecards
        "linear", "mars", "slim", "fasterrisk",
        # Interpretable trees
        "c50", "gosdt", "cubist",
        # Symbolic
        "symbolic_regression",
        # Probabilistic
        "ngboost",
        # Bayesian
        "naive_bayes", "tan", "lda",
        # Foundation
        "xrfm",
        # Ordinal
        "ordinal",
    ],
}


# Built-in presets, keyed by each configuration's own ``name``.
# Every stage-budget table below adds up to 1.00.
PRESETS: dict[str, PresetConfig] = {
    preset.name: preset
    for preset in (
        PresetConfig(
            name="best_quality",
            description="Maximum accuracy, no time limit",
            default_time_limit=None,  # no limit
            cv_folds=8,
            num_bag_folds=5,
            num_stack_levels=2,
            hyperparameter_tune=True,
            tune_trials=100,
            ensemble_method="auto",
            calibrate=True,
            use_holdout=True,
            holdout_frac=0.1,
            feature_engineering="aggressive",
            model_pool=MODEL_POOLS["best_quality"],
            time_allocations=dict(
                profiling=0.01,
                quality_guardrails=0.02,
                data_cleaning=0.02,
                preprocessing=0.08,
                feature_engineering=0.04,
                data_augmentation=0.02,
                model_selection=0.04,
                model_training=0.35,
                constraint_check=0.01,
                hyperparameter_tuning=0.25,
                ensembling=0.05,
                threshold_opt=0.02,
                calibration=0.03,
                post_training=0.03,
                explainability=0.03,
            ),
        ),
        PresetConfig(
            name="high_quality",
            description="High accuracy with 4 hour default",
            default_time_limit=14400,  # 4 h in seconds
            cv_folds=5,
            num_bag_folds=5,
            num_stack_levels=1,
            hyperparameter_tune=True,
            tune_trials=50,
            ensemble_method="auto",
            calibrate=True,
            use_holdout=True,
            holdout_frac=0.1,
            feature_engineering="moderate",
            model_pool=MODEL_POOLS["high_quality"],
            time_allocations=dict(
                profiling=0.02,
                quality_guardrails=0.02,
                data_cleaning=0.02,
                preprocessing=0.08,
                feature_engineering=0.03,
                data_augmentation=0.02,
                model_selection=0.04,
                model_training=0.38,
                constraint_check=0.02,
                hyperparameter_tuning=0.20,
                ensembling=0.05,
                threshold_opt=0.03,
                calibration=0.03,
                post_training=0.03,
                explainability=0.03,
            ),
        ),
        PresetConfig(
            name="good_quality",
            description="Good accuracy with 1 hour default",
            default_time_limit=3600,  # 1 h in seconds
            cv_folds=5,
            num_bag_folds=3,
            num_stack_levels=0,
            hyperparameter_tune=True,
            tune_trials=25,
            ensemble_method="auto",
            calibrate=True,
            use_holdout=False,
            holdout_frac=0.0,
            feature_engineering="moderate",
            model_pool=MODEL_POOLS["good_quality"],
            time_allocations=dict(
                profiling=0.02,
                quality_guardrails=0.02,
                data_cleaning=0.02,
                preprocessing=0.08,
                feature_engineering=0.03,
                data_augmentation=0.02,
                model_selection=0.04,
                model_training=0.42,
                constraint_check=0.02,
                hyperparameter_tuning=0.15,
                ensembling=0.05,
                threshold_opt=0.03,
                calibration=0.03,
                post_training=0.03,
                explainability=0.04,
            ),
        ),
        PresetConfig(
            name="medium_quality",
            description="Balanced speed/accuracy with 15 min default",
            default_time_limit=900,  # 15 min in seconds
            cv_folds=5,
            num_bag_folds=0,
            num_stack_levels=0,
            hyperparameter_tune=True,
            tune_trials=10,
            ensemble_method="auto",
            calibrate=False,
            use_holdout=False,
            holdout_frac=0.0,
            feature_engineering="light",
            model_pool=MODEL_POOLS["medium_quality"],
            # NOTE(review): calibrate is False here, yet "calibration" still
            # receives 3% of the budget -- confirm this is intended.
            time_allocations=dict(
                profiling=0.02,
                quality_guardrails=0.02,
                data_cleaning=0.01,
                preprocessing=0.05,
                feature_engineering=0.02,
                data_augmentation=0.01,
                model_selection=0.04,
                model_training=0.55,
                constraint_check=0.01,
                hyperparameter_tuning=0.12,
                ensembling=0.05,
                threshold_opt=0.03,
                calibration=0.03,
                post_training=0.01,
                explainability=0.03,
            ),
        ),
        PresetConfig(
            name="fast",
            description="Quick results with 5 min default",
            default_time_limit=300,  # 5 min in seconds
            cv_folds=3,
            num_bag_folds=0,
            num_stack_levels=0,
            hyperparameter_tune=False,
            tune_trials=0,
            ensemble_method="none",
            calibrate=False,
            use_holdout=False,
            holdout_frac=0.0,
            feature_engineering="none",
            model_pool=MODEL_POOLS["fast"],
            # Key order differs from the other presets from "calibration" on;
            # it is kept as-is because dict order is observable.
            time_allocations=dict(
                profiling=0.02,
                quality_guardrails=0.02,
                data_cleaning=0.0,
                preprocessing=0.10,
                feature_engineering=0.0,
                data_augmentation=0.0,
                model_selection=0.05,
                model_training=0.76,
                constraint_check=0.0,
                hyperparameter_tuning=0.0,
                ensembling=0.0,
                calibration=0.0,
                post_training=0.0,
                threshold_opt=0.05,
                explainability=0.0,
            ),
        ),
        PresetConfig(
            name="exhaustive",
            description="Evolutionary search over ALL models, preprocessing, and ensembles",
            # Intended as "unlimited by default" (stop via patience or a
            # keyboard interrupt).
            # NOTE(review): PresetConfig documents None as "no limit"; confirm
            # that consumers treat 0 the same way.
            default_time_limit=0,
            cv_folds=3,
            num_bag_folds=3,
            num_stack_levels=1,
            hyperparameter_tune=True,
            tune_trials=30,
            ensemble_method="auto",
            calibrate=True,
            use_holdout=False,
            holdout_frac=0.0,
            feature_engineering="moderate",
            model_pool=MODEL_POOLS["all"],
            # NOTE(review): hyperparameter_tune is True but the
            # "hyperparameter_tuning" stage gets no budget -- presumably
            # tuning happens inside model_training for this preset; verify.
            time_allocations=dict(
                profiling=0.01,
                quality_guardrails=0.01,
                data_cleaning=0.01,
                preprocessing=0.02,
                feature_engineering=0.02,
                data_augmentation=0.01,
                model_selection=0.01,
                model_training=0.80,
                constraint_check=0.0,
                hyperparameter_tuning=0.0,
                ensembling=0.05,
                threshold_opt=0.01,
                calibration=0.02,
                post_training=0.01,
                explainability=0.02,
            ),
        ),
        PresetConfig(
            name="interpretable",
            description="Only interpretable models",
            default_time_limit=900,  # 15 min in seconds
            cv_folds=3,
            num_bag_folds=0,
            num_stack_levels=0,
            hyperparameter_tune=True,
            tune_trials=25,
            ensemble_method="none",  # ensembling is disabled for interpretability
            calibrate=True,
            use_holdout=False,
            holdout_frac=0.0,
            feature_engineering="light",
            model_pool=MODEL_POOLS["interpretable"],
            # Same key order as the "fast" preset ("calibration" before
            # "threshold_opt"); kept as-is because dict order is observable.
            time_allocations=dict(
                profiling=0.02,
                quality_guardrails=0.02,
                data_cleaning=0.0,
                preprocessing=0.05,
                feature_engineering=0.0,
                data_augmentation=0.0,
                model_selection=0.05,
                model_training=0.60,
                constraint_check=0.02,
                hyperparameter_tuning=0.10,
                ensembling=0.0,
                calibration=0.05,
                post_training=0.0,
                threshold_opt=0.03,
                explainability=0.06,
            ),
        ),
    )
}



[docs]
def get_preset(name: str) -> PresetConfig:
    """Look up a preset configuration by its name.

    The object returned is the entry stored in ``PRESETS`` itself, not a copy.

    Parameters
    ----------
    name : str
        Name of the preset to fetch.

    Returns
    -------
    PresetConfig
        The configuration registered under ``name``.

    Raises
    ------
    ValueError
        If ``name`` is not a key of ``PRESETS``; the message lists the
        available preset names.
    """
    preset = PRESETS.get(name)
    if preset is None:
        known = ", ".join(PRESETS)
        raise ValueError(f"Unknown preset '{name}'. Available: {known}")
    return preset




[docs]
def list_presets() -> list[str]:
    """Return the names of all registered presets.

    Returns
    -------
    list of str
        A new list holding the keys of ``PRESETS`` in their insertion order.
    """
    return list(PRESETS)



def get_preset_summary() -> dict[str, dict[str, Any]]:
    """Build a compact overview of every registered preset.

    Returns
    -------
    dict
        Maps each preset name to a dict with the keys ``description``,
        ``time_limit`` (the preset's ``default_time_limit``), ``cv_folds``,
        ``n_models`` (length of the preset's model pool), ``ensemble`` (the
        preset's ``ensemble_method``) and ``calibrate``.
    """
    summary: dict[str, dict[str, Any]] = {}
    for preset_name, cfg in PRESETS.items():
        summary[preset_name] = {
            "description": cfg.description,
            "time_limit": cfg.default_time_limit,
            "cv_folds": cfg.cv_folds,
            "n_models": len(cfg.model_pool),
            "ensemble": cfg.ensemble_method,
            "calibrate": cfg.calibrate,
        }
    return summary