# Source code for endgame.automl.predictor

"""Unified AutoML predictor entry point.

This module provides the AutoMLPredictor class - the primary interface
for automated machine learning in Endgame.
"""

# The docstring must be the first statement to become the module's __doc__;
# a __future__ import is still legal directly after it.
from __future__ import annotations

import logging
import warnings
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd

from endgame.automl.base import BasePredictor, DataInput
from endgame.automl.tabular import TabularPredictor

logger = logging.getLogger(__name__)


class AutoMLPredictor:
    """Unified AutoML predictor that automatically selects the right domain.

    This is the main entry point for AutoML in Endgame. It provides a simple
    3-line interface that matches AutoGluon's simplicity while leveraging
    Endgame's full capabilities.

    Parameters
    ----------
    label : str
        Name of the target column.
    problem_type : str, default="auto"
        Type of problem: "classification", "regression", "multiclass",
        or "auto".
    eval_metric : str, default="auto"
        Evaluation metric. "auto" selects based on problem type.
    presets : str, default="medium_quality"
        Quality preset: "best_quality", "high_quality", "good_quality",
        "medium_quality", "fast", "interpretable".
    time_limit : int, optional
        Time limit in seconds. If None, uses preset default.
    search_strategy : str, default="portfolio"
        Search strategy: "portfolio", "heuristic", "genetic", "random",
        "bayesian".
    track_experiments : bool, default=True
        Whether to track experiments to the meta-learning database.
    output_path : str, optional
        Path to save outputs (models, logs, etc.).
    random_state : int, default=42
        Random seed for reproducibility.
    verbosity : int, default=2
        Verbosity level (0=silent, 1=progress, 2=detailed, 3=debug).

    Attributes
    ----------
    predictor_ : BasePredictor
        The underlying domain-specific predictor.
    domain_ : str
        The detected or specified data domain.

    Examples
    --------
    3-line usage (matches AutoGluon):

    >>> from endgame.automl import AutoMLPredictor
    >>> predictor = AutoMLPredictor(label="target").fit("train.csv")
    >>> predictions = predictor.predict("test.csv")

    With more options:

    >>> predictor = AutoMLPredictor(
    ...     label="price",
    ...     presets="best_quality",
    ...     time_limit=3600,
    ... )
    >>> predictor.fit(train_df)
    >>> predictions = predictor.predict(test_df)

    Using different presets:

    >>> # Fast training for prototyping
    >>> predictor = AutoMLPredictor(label="target", presets="fast")
    >>>
    >>> # High quality for production
    >>> predictor = AutoMLPredictor(label="target", presets="high_quality")
    >>>
    >>> # Best quality for competitions
    >>> predictor = AutoMLPredictor(label="target", presets="best_quality")

    Different search strategies:

    >>> # Default portfolio search
    >>> predictor = AutoMLPredictor(label="target", search_strategy="portfolio")
    >>>
    >>> # Genetic algorithm search
    >>> predictor = AutoMLPredictor(label="target", search_strategy="genetic")
    >>>
    >>> # Bayesian optimization
    >>> predictor = AutoMLPredictor(label="target", search_strategy="bayesian")
    """

    # Lower-cased file suffixes used by the path-based domain detection.
    _IMAGE_SUFFIXES = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff")
    _AUDIO_SUFFIXES = (".wav", ".mp3", ".flac", ".ogg", ".m4a")

    # Text heuristic: a string column whose first non-null values average
    # more than this many characters marks the frame as text data.
    _TEXT_SAMPLE_SIZE = 10
    _TEXT_AVG_LENGTH_THRESHOLD = 100

    # Domains whose predictor lives in an optionally importable module.
    # Kept as a tuple so membership tests never hash the (untrusted) value.
    _OPTIONAL_DOMAINS = ("text", "vision", "timeseries", "audio")

    # Error messages raised when an optional domain predictor cannot be
    # imported ("text" is absent on purpose: it falls back to tabular).
    _UNAVAILABLE_MESSAGES = {
        "vision": (
            "VisionPredictor is not yet implemented. "
            "Please use TabularPredictor with image features."
        ),
        "timeseries": "TimeSeriesPredictor is not yet implemented.",
        "audio": "AudioPredictor is not yet implemented.",
    }

    def __init__(
        self,
        label: str,
        problem_type: str = "auto",
        eval_metric: str = "auto",
        presets: str = "medium_quality",
        time_limit: int | None = None,
        search_strategy: str = "portfolio",
        track_experiments: bool = True,
        output_path: str | None = None,
        random_state: int = 42,
        verbosity: int = 2,
    ):
        self.label = label
        self.problem_type = problem_type
        self.eval_metric = eval_metric
        self.presets = presets
        self.time_limit = time_limit
        self.search_strategy = search_strategy
        self.track_experiments = track_experiments
        self.output_path = output_path
        self.random_state = random_state
        self.verbosity = verbosity

        # State populated by fit() / load().
        self.predictor_: BasePredictor | None = None
        self.domain_: str | None = None

    def fit(
        self,
        train_data: DataInput,
        tuning_data: DataInput | None = None,
        time_limit: int | None = None,
        presets: str | None = None,
        hyperparameters: dict[str, Any] | None = None,
        domain: str | None = None,
        **kwargs,
    ) -> AutoMLPredictor:
        """Fit the AutoML predictor.

        Parameters
        ----------
        train_data : str, Path, DataFrame, or ndarray
            Training data. Can be a file path, DataFrame, or array.
        tuning_data : optional
            Validation/tuning data, forwarded to the domain predictor.
        time_limit : int, optional
            Override the time limit. Only ``None`` falls back to the value
            given to the constructor; an explicit ``0`` is forwarded as-is.
        presets : str, optional
            Override the preset. ``None`` (or an empty string) falls back to
            the value given to the constructor.
        hyperparameters : dict, optional
            Override hyperparameters for specific models.
        domain : str, optional
            Data domain: "tabular", "text", "vision", "timeseries", "audio".
            If None, auto-detects from data.
        **kwargs
            Additional arguments passed to the domain-specific predictor.

        Returns
        -------
        AutoMLPredictor
            The fitted predictor (``self``).

        Raises
        ------
        ValueError
            If ``domain`` is not one of the known domains.
        NotImplementedError
            If the predictor for the requested domain cannot be imported.
        """
        if domain is None:
            domain = self._detect_domain(train_data)
        self.domain_ = domain

        self.predictor_ = self._create_predictor(domain)

        # `is not None` rather than `or`: a caller-supplied 0 is a real value
        # and must not be silently replaced by the constructor default.
        effective_time_limit = (
            time_limit if time_limit is not None else self.time_limit
        )
        self.predictor_.fit(
            train_data=train_data,
            tuning_data=tuning_data,
            time_limit=effective_time_limit,
            presets=presets or self.presets,
            hyperparameters=hyperparameters,
            **kwargs,
        )
        return self

    def predict(
        self,
        data: DataInput,
        model: str | None = None,
    ) -> np.ndarray:
        """Generate predictions.

        Parameters
        ----------
        data : str, Path, DataFrame, or ndarray
            Input data to predict on.
        model : str, optional
            Specific model to use; forwarded to the underlying predictor.

        Returns
        -------
        np.ndarray
            Predictions.

        Raises
        ------
        RuntimeError
            If the predictor has not been fitted.
        """
        self._check_is_fitted()
        return self.predictor_.predict(data, model=model)

    def predict_proba(
        self,
        data: DataInput,
        model: str | None = None,
    ) -> np.ndarray:
        """Generate probability predictions (classification only).

        Parameters
        ----------
        data : str, Path, DataFrame, or ndarray
            Input data.
        model : str, optional
            Specific model to use; forwarded to the underlying predictor.

        Returns
        -------
        np.ndarray
            Probability predictions with shape (n_samples, n_classes).

        Raises
        ------
        RuntimeError
            If the predictor has not been fitted.
        """
        self._check_is_fitted()
        return self.predictor_.predict_proba(data, model=model)

    def evaluate(
        self,
        data: DataInput,
        metrics: list[str] | None = None,
        silent: bool = False,
    ) -> dict[str, float]:
        """Evaluate the predictor on data.

        Parameters
        ----------
        data : str, Path, DataFrame, or ndarray
            Data to evaluate on. Must contain the target column.
        metrics : list of str, optional
            Metrics to compute; forwarded to the underlying predictor.
        silent : bool, default=False
            Whether to suppress output.

        Returns
        -------
        dict
            Dictionary mapping metric names to scores.

        Raises
        ------
        RuntimeError
            If the predictor has not been fitted.
        """
        self._check_is_fitted()
        return self.predictor_.evaluate(data, metrics=metrics, silent=silent)

    def leaderboard(
        self,
        extra_info: bool = False,
        silent: bool = False,
    ) -> pd.DataFrame:
        """Get the model leaderboard.

        Parameters
        ----------
        extra_info : bool, default=False
            Whether to include extra information (fit time, etc.).
        silent : bool, default=False
            Whether to suppress output.

        Returns
        -------
        pd.DataFrame
            Leaderboard with model names and scores.

        Raises
        ------
        RuntimeError
            If the predictor has not been fitted.
        """
        self._check_is_fitted()
        return self.predictor_.leaderboard(extra_info=extra_info, silent=silent)

    def feature_importance(
        self,
        model: str | None = None,
        importance_type: str = "split",
    ) -> pd.DataFrame:
        """Get feature importance scores.

        Parameters
        ----------
        model : str, optional
            Specific model; forwarded to the underlying predictor.
        importance_type : str, default="split"
            Type of importance: "split", "gain", "permutation".

        Returns
        -------
        pd.DataFrame
            Feature importance scores.

        Raises
        ------
        RuntimeError
            If the predictor has not been fitted.
        """
        self._check_is_fitted()
        return self.predictor_.feature_importance(
            model=model, importance_type=importance_type
        )

    def save(self, path: str | None = None) -> str:
        """Save the predictor to disk.

        Parameters
        ----------
        path : str, optional
            Path to save to. Passed unchanged to the underlying predictor's
            ``save``. NOTE(review): the earlier documentation states that
            ``None`` falls back to ``output_path``; that default is resolved
            by the underlying predictor, not here - confirm there.

        Returns
        -------
        str
            Path where the predictor was saved.

        Raises
        ------
        RuntimeError
            If the predictor has not been fitted.
        """
        self._check_is_fitted()
        return self.predictor_.save(path)

    @classmethod
    def load(cls, path: str) -> AutoMLPredictor:
        """Load a predictor from disk.

        Only tabular predictors are supported at the moment: the saved
        artefact is always loaded through ``TabularPredictor.load``.

        Parameters
        ----------
        path : str
            Path to load from.

        Returns
        -------
        AutoMLPredictor
            A wrapper around the loaded predictor, configured from the
            loaded predictor's own settings.
        """
        # Determine domain from saved predictor.
        # For now, assume tabular.
        predictor = TabularPredictor.load(str(Path(path)))

        wrapper = cls(
            label=predictor.label,
            problem_type=predictor.problem_type,
            eval_metric=predictor.eval_metric,
            presets=predictor.presets,
            time_limit=predictor.time_limit,
            search_strategy=predictor.search_strategy,
            # Carried over when the loaded predictor exposes them; otherwise
            # the constructor defaults apply, exactly as before.
            track_experiments=getattr(predictor, "track_experiments", True),
            output_path=getattr(predictor, "output_path", None),
            random_state=predictor.random_state,
            verbosity=predictor.verbosity,
        )
        wrapper.predictor_ = predictor
        wrapper.domain_ = "tabular"
        return wrapper

    def _detect_domain(self, data: DataInput) -> str:
        """Detect the data domain from the input.

        Detection rules, in order:

        * path (``str``/``Path``): image suffix -> "vision", audio suffix ->
          "audio", ``.csv`` -> "text" if its first rows look like text,
          otherwise "tabular"; any other suffix -> "tabular";
        * ``DataFrame``: "text" if it looks like text, else "tabular";
        * ``ndarray``: 4-dimensional -> "vision", else "tabular";
        * anything else -> "tabular".

        Parameters
        ----------
        data : various
            Input data.

        Returns
        -------
        str
            Detected domain.
        """
        if isinstance(data, (str, Path)):
            suffix = Path(data).suffix.lower()
            if suffix in self._IMAGE_SUFFIXES:
                return "vision"
            if suffix in self._AUDIO_SUFFIXES:
                return "audio"
            if suffix == ".csv":
                # A CSV can hold tabular or text data - peek at the first rows.
                try:
                    preview = pd.read_csv(data, nrows=5)
                except Exception:
                    # Best effort: an unreadable file is reported later by
                    # the predictor itself, so fall back to tabular here but
                    # leave a trace instead of swallowing silently.
                    logging.getLogger(__name__).debug(
                        "Could not preview %s for domain detection; "
                        "assuming tabular.",
                        data,
                        exc_info=True,
                    )
                    return "tabular"
                return "text" if self._looks_like_text_data(preview) else "tabular"
            return "tabular"

        if isinstance(data, pd.DataFrame):
            return "text" if self._looks_like_text_data(data) else "tabular"

        if isinstance(data, np.ndarray) and data.ndim == 4:
            # Likely image data (N, H, W, C) or (N, C, H, W).
            return "vision"

        return "tabular"

    def _looks_like_text_data(self, df: pd.DataFrame) -> bool:
        """Check if DataFrame looks like text data.

        A frame counts as text when any non-label string-like column has an
        average length above 100 characters over its first 10 non-null
        values.

        Parameters
        ----------
        df : DataFrame
            Data to check.

        Returns
        -------
        bool
            True if data appears to be text.
        """
        for col in df.columns:
            if col == self.label:
                continue
            column = df[col]
            # Check the dtype (not the values): this accepts plain object
            # columns as before and additionally pandas' dedicated string
            # dtypes, which `dtype == object` used to miss.
            if not pd.api.types.is_string_dtype(column.dtype):
                continue
            sample = column.dropna().head(self._TEXT_SAMPLE_SIZE)
            if len(sample) == 0:
                continue
            avg_len = sample.astype(str).str.len().mean()
            if avg_len > self._TEXT_AVG_LENGTH_THRESHOLD:
                return True
        return False

    def _predictor_kwargs(self) -> dict[str, Any]:
        """Return the constructor arguments shared by every domain predictor."""
        return {
            "label": self.label,
            "problem_type": self.problem_type,
            "eval_metric": self.eval_metric,
            "presets": self.presets,
            "time_limit": self.time_limit,
            "track_experiments": self.track_experiments,
            "output_path": self.output_path,
            "random_state": self.random_state,
            "verbosity": self.verbosity,
        }

    def _create_tabular_predictor(self) -> BasePredictor:
        """Build a TabularPredictor (the only one taking ``search_strategy``)."""
        return TabularPredictor(
            search_strategy=self.search_strategy,
            **self._predictor_kwargs(),
        )

    @staticmethod
    def _import_optional_predictor(domain: str) -> type:
        """Import and return the predictor class of an optional domain.

        Raises ImportError when the domain module is not available and
        ValueError for a domain without an optional predictor.
        """
        if domain == "text":
            from endgame.automl.text import TextPredictor

            return TextPredictor
        if domain == "vision":
            from endgame.automl.vision import VisionPredictor

            return VisionPredictor
        if domain == "timeseries":
            from endgame.automl.timeseries import TimeSeriesPredictor

            return TimeSeriesPredictor
        if domain == "audio":
            from endgame.automl.audio import AudioPredictor

            return AudioPredictor
        raise ValueError(f"Unknown domain: {domain}")

    def _create_predictor(self, domain: str) -> BasePredictor:
        """Create a domain-specific predictor.

        Parameters
        ----------
        domain : str
            Data domain.

        Returns
        -------
        BasePredictor
            Domain-specific predictor. For "text", a TabularPredictor is
            returned (with a UserWarning) when TextPredictor cannot be
            imported.

        Raises
        ------
        ValueError
            If ``domain`` is unknown.
        NotImplementedError
            If the vision, timeseries or audio predictor cannot be imported;
            the triggering ImportError is chained as ``__cause__``.
        """
        if domain == "tabular":
            return self._create_tabular_predictor()
        if domain not in self._OPTIONAL_DOMAINS:
            raise ValueError(f"Unknown domain: {domain}")

        try:
            # Construction stays inside the try on purpose: an ImportError
            # raised while instantiating is treated the same as a missing
            # module, as it always has been.
            predictor_cls = self._import_optional_predictor(domain)
            return predictor_cls(**self._predictor_kwargs())
        except ImportError as err:
            if domain == "text":
                warnings.warn(
                    "TextPredictor not available. Using TabularPredictor with text features.",
                    UserWarning,
                    # Attribute the warning to the caller of fit().
                    stacklevel=3,
                )
                return self._create_tabular_predictor()
            raise NotImplementedError(self._UNAVAILABLE_MESSAGES[domain]) from err

    def _check_is_fitted(self) -> None:
        """Raise RuntimeError unless the predictor is fitted."""
        if not self.is_fitted:
            raise RuntimeError(
                "Predictor is not fitted. Call fit() before making predictions."
            )

    @property
    def is_fitted(self) -> bool:
        """Check if the predictor is fitted."""
        return self.predictor_ is not None and self.predictor_.is_fitted_

    @property
    def problem_type_(self) -> str | None:
        """Get the detected problem type (None before a predictor exists)."""
        if self.predictor_ is None:
            return None
        return self.predictor_.problem_type_

    @property
    def classes_(self) -> np.ndarray | None:
        """Get the class labels for classification (None before a predictor exists)."""
        if self.predictor_ is None:
            return None
        return self.predictor_.classes_

    @property
    def feature_names_(self) -> list[str] | None:
        """Get the feature names (None before a predictor exists)."""
        if self.predictor_ is None:
            return None
        return self.predictor_.feature_names_

    @property
    def fit_summary_(self):
        """Get the fit summary (None before a predictor exists)."""
        if self.predictor_ is None:
            return None
        return self.predictor_.fit_summary_

    def __repr__(self) -> str:
        fitted_str = "fitted" if self.is_fitted else "not fitted"
        return (
            f"AutoMLPredictor("
            f"label='{self.label}', "
            f"presets='{self.presets}', "
            f"domain='{self.domain_}', "
            f"{fitted_str})"
        )