from __future__ import annotations
"""Preset configurations for AutoML.
This module defines preset configurations that control the behavior of
AutoML predictors, including time limits, model selection, ensembling,
and calibration settings.
Presets
-------
- best_quality: Maximum accuracy, no time limit
- high_quality: High accuracy with 4 hour default
- good_quality: Good accuracy with 1 hour default
- medium_quality: Balanced speed/accuracy with 15 min default
- fast: Quick results with 5 min default
- exhaustive: Evolutionary search over ALL models + preprocessing + ensembles
- interpretable: Only interpretable models
"""
from dataclasses import dataclass, field
from typing import Any
# [docs]
@dataclass
class PresetConfig:
    """Bundle of settings that defines one AutoML preset.

    Every field except ``time_allocations`` is required. Values are stored
    exactly as passed; this class performs no validation.

    Attributes
    ----------
    name : str
        Identifier of the preset.
    description : str
        Short human-readable summary.
    default_time_limit : int or None
        Default time budget in seconds. ``None`` means no limit.
    cv_folds : int
        Number of cross-validation folds.
    num_bag_folds : int
        Number of bagging folds (0 disables bagging).
    num_stack_levels : int
        Number of stacking levels (0 disables stacking).
    hyperparameter_tune : bool
        Whether hyperparameters are tuned.
    tune_trials : int
        Number of tuning trials per model.
    ensemble_method : str
        Ensemble method: "none", "hill_climbing", "stacking". The presets
        defined in this module also pass "auto".
    calibrate : bool
        Whether probabilities are calibrated.
    use_holdout : bool
        Whether a holdout set is used.
    holdout_frac : float
        Fraction of the data reserved for the holdout set (relevant when
        ``use_holdout`` is True).
    feature_engineering : str
        Feature engineering level: "none", "light", "moderate", "aggressive".
    model_pool : list of str
        Names of the models to consider. The list is stored by reference,
        not copied.
    time_allocations : dict
        Time allocation fraction for each pipeline stage, keyed by stage
        name. When left empty it is replaced by a copy of
        ``DEFAULT_TIME_ALLOCATIONS``.
    """

    name: str
    description: str
    default_time_limit: int | None
    cv_folds: int
    num_bag_folds: int
    num_stack_levels: int
    hyperparameter_tune: bool
    tune_trials: int
    ensemble_method: str
    calibrate: bool
    use_holdout: bool
    holdout_frac: float
    feature_engineering: str
    model_pool: list[str]
    time_allocations: dict[str, float] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Fall back to the module-wide stage allocations when none were given."""
        if self.time_allocations:
            return
        # Copy so that edits made through this instance never touch the
        # shared default table.
        self.time_allocations = DEFAULT_TIME_ALLOCATIONS.copy()
# Fallback time allocation fraction for each pipeline stage, keyed by stage
# name. PresetConfig copies this table when a preset is created without its
# own ``time_allocations``.
# NOTE(review): these fractions add up to 0.95, whereas every table in PRESETS
# adds up to 1.00 -- confirm whether the remaining 0.05 is intentional.
DEFAULT_TIME_ALLOCATIONS: dict[str, float] = {
    "profiling": 0.01,
    "quality_guardrails": 0.02,
    "data_cleaning": 0.02,
    "preprocessing": 0.05,
    "feature_engineering": 0.03,
    "data_augmentation": 0.02,
    "model_selection": 0.04,
    "model_training": 0.40,
    "constraint_check": 0.01,
    "hyperparameter_tuning": 0.20,
    "ensembling": 0.06,
    "threshold_opt": 0.02,
    "calibration": 0.03,
    "post_training": 0.02,
    "explainability": 0.02,
}
# Complete model roster. The "all" and "best_quality" pools carry exactly this
# content, so it is written once here and materialised into two independent
# lists below.
_FULL_MODEL_POOL = (
    # GBDTs (always strong)
    "lgbm", "xgb", "catboost", "ngboost",
    # Deep tabular (modern)
    "ft_transformer", "saint", "tabnet", "node", "nam",
    "tabular_resnet", "tabm", "realmlp", "grande",
    "gandalf", "tabr", "tabdpt", "modern_nca",
    "tab_transformer", "mlp", "embedding_mlp",
    # Custom trees
    "rotation_forest", "c50", "c50_ensemble", "oblique_forest",
    "extra_oblique_forest", "patch_oblique_forest", "honest_forest",
    "evolutionary_tree", "rf", "extra_trees",
    "adtree", "model_tree", "cubist",
    # Linear
    "linear", "elm", "mars",
    # Kernel
    "svm", "gp",
    # Rules
    "rulefit", "furia", "prim",
    # Bayesian
    "tan", "eskdb", "kdb", "bart", "naive_bayes",
    "neural_kdb", "auto_sle", "ebmc_classifier",
    # Interpretable
    "ebm", "ebmc", "ebmr",
    "gam", "node_gam", "gami_net",
    "corels", "slim", "fasterrisk", "gosdt",
    # Ordinal
    "ordinal", "logistic_at", "logistic_it", "logistic_se",
    "ordinal_ridge", "ordinal_lad",
    # Discriminant
    "lda", "qda", "knn",
    # Foundation
    "tabpfn", "tabpfn_v2", "tabpfn_25", "xrfm",
    # Symbolic
    "symbolic_regression", "symbolic_regressor",
    # Quantile
    "quantile_forest",
)

# Candidate model names per pool. Each value is its own list object, so
# mutating one pool never changes another.
MODEL_POOLS = {
    "all": list(_FULL_MODEL_POOL),
    "best_quality": list(_FULL_MODEL_POOL),
    "high_quality": [
        "lgbm",
        "xgb",
        "catboost",
        "ft_transformer",
        "tabm",
        "tabnet",
        "tabpfn_v2",
        "tabpfn_25",
        "realmlp",
        "tabr",
        "gandalf",
        "rotation_forest",
        "rf",
        "extra_oblique_forest",
        "honest_forest",
        "linear",
    ],
    "good_quality": [
        "lgbm",
        "xgb",
        "catboost",
        "tabm",
        "realmlp",
        "rf",
        "ebm",
        "linear",
        "elm",
    ],
    "medium_quality": ["lgbm", "xgb", "catboost", "rf", "linear"],
    "fast": ["lgbm"],
    "interpretable": [
        # GAM-style models
        "ebm", "gam", "nam", "node_gam", "gami_net",
        # Rule-based models
        "rulefit", "furia", "corels", "prim",
        # Sparse linear / scorecards
        "linear", "mars", "slim", "fasterrisk",
        # Interpretable trees
        "c50", "gosdt", "cubist",
        # Symbolic
        "symbolic_regression",
        # Probabilistic (feature importances + uncertainty)
        "ngboost",
        # Bayesian (interpretable)
        "naive_bayes", "tan", "lda",
        # Foundation (interpretable)
        "xrfm",
        # Ordinal
        "ordinal",
    ],
}
# Built-in presets. The mapping is keyed by each configuration's own ``name``
# and keeps the order in which the configurations are listed below.
PRESETS: dict[str, PresetConfig] = {
    preset.name: preset
    for preset in (
        PresetConfig(
            name="best_quality",
            description="Maximum accuracy, no time limit",
            default_time_limit=None,  # no limit
            cv_folds=8,
            num_bag_folds=5,
            num_stack_levels=2,
            hyperparameter_tune=True,
            tune_trials=100,
            ensemble_method="auto",
            calibrate=True,
            use_holdout=True,
            holdout_frac=0.1,
            feature_engineering="aggressive",
            model_pool=MODEL_POOLS["best_quality"],
            time_allocations={
                "profiling": 0.01,
                "quality_guardrails": 0.02,
                "data_cleaning": 0.02,
                "preprocessing": 0.08,
                "feature_engineering": 0.04,
                "data_augmentation": 0.02,
                "model_selection": 0.04,
                "model_training": 0.35,
                "constraint_check": 0.01,
                "hyperparameter_tuning": 0.25,
                "ensembling": 0.05,
                "threshold_opt": 0.02,
                "calibration": 0.03,
                "post_training": 0.03,
                "explainability": 0.03,
            },
        ),
        PresetConfig(
            name="high_quality",
            description="High accuracy with 4 hour default",
            default_time_limit=14400,  # 4 hours, in seconds
            cv_folds=5,
            num_bag_folds=5,
            num_stack_levels=1,
            hyperparameter_tune=True,
            tune_trials=50,
            ensemble_method="auto",
            calibrate=True,
            use_holdout=True,
            holdout_frac=0.1,
            feature_engineering="moderate",
            model_pool=MODEL_POOLS["high_quality"],
            time_allocations={
                "profiling": 0.02,
                "quality_guardrails": 0.02,
                "data_cleaning": 0.02,
                "preprocessing": 0.08,
                "feature_engineering": 0.03,
                "data_augmentation": 0.02,
                "model_selection": 0.04,
                "model_training": 0.38,
                "constraint_check": 0.02,
                "hyperparameter_tuning": 0.20,
                "ensembling": 0.05,
                "threshold_opt": 0.03,
                "calibration": 0.03,
                "post_training": 0.03,
                "explainability": 0.03,
            },
        ),
        PresetConfig(
            name="good_quality",
            description="Good accuracy with 1 hour default",
            default_time_limit=3600,  # 1 hour, in seconds
            cv_folds=5,
            num_bag_folds=3,
            num_stack_levels=0,
            hyperparameter_tune=True,
            tune_trials=25,
            ensemble_method="auto",
            calibrate=True,
            use_holdout=False,
            holdout_frac=0.0,
            feature_engineering="moderate",
            model_pool=MODEL_POOLS["good_quality"],
            time_allocations={
                "profiling": 0.02,
                "quality_guardrails": 0.02,
                "data_cleaning": 0.02,
                "preprocessing": 0.08,
                "feature_engineering": 0.03,
                "data_augmentation": 0.02,
                "model_selection": 0.04,
                "model_training": 0.42,
                "constraint_check": 0.02,
                "hyperparameter_tuning": 0.15,
                "ensembling": 0.05,
                "threshold_opt": 0.03,
                "calibration": 0.03,
                "post_training": 0.03,
                "explainability": 0.04,
            },
        ),
        PresetConfig(
            name="medium_quality",
            description="Balanced speed/accuracy with 15 min default",
            default_time_limit=900,  # 15 minutes, in seconds
            cv_folds=5,
            num_bag_folds=0,
            num_stack_levels=0,
            hyperparameter_tune=True,
            tune_trials=10,
            ensemble_method="auto",
            calibrate=False,
            use_holdout=False,
            holdout_frac=0.0,
            feature_engineering="light",
            model_pool=MODEL_POOLS["medium_quality"],
            time_allocations={
                "profiling": 0.02,
                "quality_guardrails": 0.02,
                "data_cleaning": 0.01,
                "preprocessing": 0.05,
                "feature_engineering": 0.02,
                "data_augmentation": 0.01,
                "model_selection": 0.04,
                "model_training": 0.55,
                "constraint_check": 0.01,
                "hyperparameter_tuning": 0.12,
                "ensembling": 0.05,
                "threshold_opt": 0.03,
                "calibration": 0.03,
                "post_training": 0.01,
                "explainability": 0.03,
            },
        ),
        PresetConfig(
            name="fast",
            description="Quick results with 5 min default",
            default_time_limit=300,  # 5 minutes, in seconds
            cv_folds=3,
            num_bag_folds=0,
            num_stack_levels=0,
            hyperparameter_tune=False,
            tune_trials=0,
            ensemble_method="none",
            calibrate=False,
            use_holdout=False,
            holdout_frac=0.0,
            feature_engineering="none",
            model_pool=MODEL_POOLS["fast"],
            time_allocations={
                "profiling": 0.02,
                "quality_guardrails": 0.02,
                "data_cleaning": 0.0,
                "preprocessing": 0.10,
                "feature_engineering": 0.0,
                "data_augmentation": 0.0,
                "model_selection": 0.05,
                "model_training": 0.76,
                "constraint_check": 0.0,
                "hyperparameter_tuning": 0.0,
                "ensembling": 0.0,
                "calibration": 0.0,
                "post_training": 0.0,
                "threshold_opt": 0.05,
                "explainability": 0.0,
            },
        ),
        PresetConfig(
            name="exhaustive",
            description="Evolutionary search over ALL models, preprocessing, and ensembles",
            # Unlimited by default -- use patience / keyboard interrupt.
            # NOTE(review): the PresetConfig docs name None as the "no limit"
            # value; confirm that consumers treat 0 the same way.
            default_time_limit=0,
            cv_folds=3,
            num_bag_folds=3,
            num_stack_levels=1,
            hyperparameter_tune=True,
            tune_trials=30,
            ensemble_method="auto",
            calibrate=True,
            use_holdout=False,
            holdout_frac=0.0,
            feature_engineering="moderate",
            model_pool=MODEL_POOLS["all"],
            time_allocations={
                "profiling": 0.01,
                "quality_guardrails": 0.01,
                "data_cleaning": 0.01,
                "preprocessing": 0.02,
                "feature_engineering": 0.02,
                "data_augmentation": 0.01,
                "model_selection": 0.01,
                "model_training": 0.80,
                "constraint_check": 0.0,
                "hyperparameter_tuning": 0.0,
                "ensembling": 0.05,
                "threshold_opt": 0.01,
                "calibration": 0.02,
                "post_training": 0.01,
                "explainability": 0.02,
            },
        ),
        PresetConfig(
            name="interpretable",
            description="Only interpretable models",
            default_time_limit=900,  # 15 minutes, in seconds
            cv_folds=3,
            num_bag_folds=0,
            num_stack_levels=0,
            hyperparameter_tune=True,
            tune_trials=25,
            ensemble_method="none",  # No ensemble for interpretability
            calibrate=True,
            use_holdout=False,
            holdout_frac=0.0,
            feature_engineering="light",
            model_pool=MODEL_POOLS["interpretable"],
            time_allocations={
                "profiling": 0.02,
                "quality_guardrails": 0.02,
                "data_cleaning": 0.0,
                "preprocessing": 0.05,
                "feature_engineering": 0.0,
                "data_augmentation": 0.0,
                "model_selection": 0.05,
                "model_training": 0.60,
                "constraint_check": 0.02,
                "hyperparameter_tuning": 0.10,
                "ensembling": 0.0,
                "calibration": 0.05,
                "post_training": 0.0,
                "threshold_opt": 0.03,
                "explainability": 0.06,
            },
        ),
    )
}
# [docs]
def get_preset(name: str) -> PresetConfig:
    """Look up a built-in preset by its name.

    Parameters
    ----------
    name : str
        Key of the preset in ``PRESETS``.

    Returns
    -------
    PresetConfig
        The configuration object stored in ``PRESETS`` (the stored instance
        itself, not a copy).

    Raises
    ------
    ValueError
        If ``name`` is not a key of ``PRESETS``. The message lists the
        available preset names.
    """
    preset = PRESETS.get(name)
    if preset is None:
        available = ", ".join(PRESETS)
        raise ValueError(f"Unknown preset '{name}'. Available: {available}")
    return preset
# [docs]
def list_presets() -> list[str]:
    """Return the names of all built-in presets.

    Returns
    -------
    list of str
        The keys of ``PRESETS`` in definition order, as a new list.
    """
    return list(PRESETS)
def get_preset_summary() -> dict[str, dict[str, Any]]:
    """Build a compact overview of every preset in ``PRESETS``.

    Returns
    -------
    dict
        Maps each preset name to a dict with the keys ``"description"``,
        ``"time_limit"`` (the preset's ``default_time_limit``),
        ``"cv_folds"``, ``"n_models"`` (length of the preset's
        ``model_pool``), ``"ensemble"`` (its ``ensemble_method``) and
        ``"calibrate"``.
    """
    summary: dict[str, dict[str, Any]] = {}
    for preset_name, config in PRESETS.items():
        summary[preset_name] = {
            "description": config.description,
            "time_limit": config.default_time_limit,
            "cv_folds": config.cv_folds,
            "n_models": len(config.model_pool),
            "ensemble": config.ensemble_method,
            "calibrate": config.calibrate,
        }
    return summary