Source code for endgame.automl.predictor

from __future__ import annotations

"""Unified AutoML predictor entry point.

This module provides the AutoMLPredictor class - the primary interface
for automated machine learning in Endgame.
"""

import logging
import warnings
from pathlib import Path
from typing import Any

import numpy as np
import pandas as pd

from endgame.automl.base import BasePredictor, DataInput
from endgame.automl.tabular import TabularPredictor

logger = logging.getLogger(__name__)



[docs]
class AutoMLPredictor:
    """Unified AutoML predictor that automatically selects the right domain.

    This is the main entry point for AutoML in Endgame. It provides a simple
    3-line interface that matches AutoGluon's simplicity while leveraging
    Endgame's full capabilities. The class itself is a thin wrapper: ``fit``
    picks a data domain, builds the matching domain-specific predictor and
    every other public method delegates to that predictor.

    Parameters
    ----------
    label : str
        Name of the target column.
    problem_type : str, default="auto"
        Type of problem: "classification", "regression", "multiclass", or "auto".
    eval_metric : str, default="auto"
        Evaluation metric. "auto" selects based on problem type.
    presets : str, default="medium_quality"
        Quality preset: "best_quality", "high_quality", "good_quality",
        "medium_quality", "fast", "interpretable".
    time_limit : int, optional
        Time limit in seconds. If None, uses preset default.
    search_strategy : str, default="portfolio"
        Search strategy: "portfolio", "heuristic", "genetic", "random", "bayesian".
    track_experiments : bool, default=True
        Whether to track experiments to the meta-learning database.
    output_path : str, optional
        Path to save outputs (models, logs, etc.).
    random_state : int, default=42
        Random seed for reproducibility.
    verbosity : int, default=2
        Verbosity level (0=silent, 1=progress, 2=detailed, 3=debug).

    Attributes
    ----------
    predictor_ : BasePredictor or None
        The underlying domain-specific predictor. ``None`` until ``fit`` or
        ``load`` assigns it.
    domain_ : str or None
        The detected or specified data domain ("tabular", "text", "vision",
        "timeseries" or "audio"). ``None`` until ``fit`` or ``load`` sets it.

    Notes
    -----
    ``load`` currently always restores a ``TabularPredictor``. When the
    "text" domain is selected but ``TextPredictor`` cannot be imported, a
    ``UserWarning`` is issued and a ``TabularPredictor`` is used instead.

    Examples
    --------
    3-line usage (matches AutoGluon):

    >>> from endgame.automl import AutoMLPredictor
    >>> predictor = AutoMLPredictor(label="target").fit("train.csv")
    >>> predictions = predictor.predict("test.csv")

    With more options:

    >>> predictor = AutoMLPredictor(
    ...     label="price",
    ...     presets="best_quality",
    ...     time_limit=3600,
    ... )
    >>> predictor.fit(train_df)
    >>> predictions = predictor.predict(test_df)

    Using different presets:

    >>> # Fast training for prototyping
    >>> predictor = AutoMLPredictor(label="target", presets="fast")
    >>>
    >>> # High quality for production
    >>> predictor = AutoMLPredictor(label="target", presets="high_quality")
    >>>
    >>> # Best quality for competitions
    >>> predictor = AutoMLPredictor(label="target", presets="best_quality")

    Different search strategies:

    >>> # Default portfolio search
    >>> predictor = AutoMLPredictor(label="target", search_strategy="portfolio")
    >>>
    >>> # Genetic algorithm search
    >>> predictor = AutoMLPredictor(label="target", search_strategy="genetic")
    >>>
    >>> # Bayesian optimization
    >>> predictor = AutoMLPredictor(label="target", search_strategy="bayesian")
    """

    def __init__(
        self,
        label: str,
        problem_type: str = "auto",
        eval_metric: str = "auto",
        presets: str = "medium_quality",
        time_limit: int | None = None,
        search_strategy: str = "portfolio",
        track_experiments: bool = True,
        output_path: str | None = None,
        random_state: int = 42,
        verbosity: int = 2,
    ):
        self.label = label
        self.problem_type = problem_type
        self.eval_metric = eval_metric
        self.presets = presets
        self.time_limit = time_limit
        self.search_strategy = search_strategy
        self.track_experiments = track_experiments
        self.output_path = output_path
        self.random_state = random_state
        self.verbosity = verbosity

        # State
        self.predictor_: BasePredictor | None = None
        self.domain_: str | None = None


[docs]
    def fit(
        self,
        train_data: DataInput,
        tuning_data: DataInput | None = None,
        time_limit: int | None = None,
        presets: str | None = None,
        hyperparameters: dict[str, Any] | None = None,
        domain: str | None = None,
        **kwargs,
    ) -> AutoMLPredictor:
        """Fit the AutoML predictor.

        Parameters
        ----------
        train_data : str, Path, DataFrame, or ndarray
            Training data. Can be a file path, DataFrame, or array.
        tuning_data : optional
            Validation/tuning data. If None, uses internal holdout.
        time_limit : int, optional
            Override the time limit.
        presets : str, optional
            Override the preset.
        hyperparameters : dict, optional
            Override hyperparameters for specific models.
        domain : str, optional
            Data domain: "tabular", "text", "vision", "timeseries", "audio".
            If None, auto-detects from data.
        **kwargs
            Additional arguments passed to the domain-specific predictor.

        Returns
        -------
        AutoMLPredictor
            The fitted predictor.
        """
        # Auto-detect domain if not specified
        if domain is None:
            domain = self._detect_domain(train_data)

        self.domain_ = domain

        # Create domain-specific predictor
        self.predictor_ = self._create_predictor(domain)

        # Fit the predictor
        self.predictor_.fit(
            train_data=train_data,
            tuning_data=tuning_data,
            time_limit=time_limit or self.time_limit,
            presets=presets or self.presets,
            hyperparameters=hyperparameters,
            **kwargs,
        )

        return self



[docs]
    def predict(
        self,
        data: DataInput,
        model: str | None = None,
    ) -> np.ndarray:
        """Generate predictions.

        Parameters
        ----------
        data : str, Path, DataFrame, or ndarray
            Input data to predict on.
        model : str, optional
            Specific model to use. If None, uses the ensemble.

        Returns
        -------
        np.ndarray
            Predictions.
        """
        self._check_is_fitted()
        return self.predictor_.predict(data, model=model)



[docs]
    def predict_proba(
        self,
        data: DataInput,
        model: str | None = None,
    ) -> np.ndarray:
        """Generate probability predictions (classification only).

        Parameters
        ----------
        data : str, Path, DataFrame, or ndarray
            Input data.
        model : str, optional
            Specific model to use.

        Returns
        -------
        np.ndarray
            Probability predictions with shape (n_samples, n_classes).
        """
        self._check_is_fitted()
        return self.predictor_.predict_proba(data, model=model)



[docs]
    def evaluate(
        self,
        data: DataInput,
        metrics: list[str] | None = None,
        silent: bool = False,
    ) -> dict[str, float]:
        """Evaluate the predictor on data.

        Parameters
        ----------
        data : str, Path, DataFrame, or ndarray
            Data to evaluate on. Must contain the target column.
        metrics : list of str, optional
            Metrics to compute. If None, uses default metrics.
        silent : bool, default=False
            Whether to suppress output.

        Returns
        -------
        dict
            Dictionary mapping metric names to scores.
        """
        self._check_is_fitted()
        return self.predictor_.evaluate(data, metrics=metrics, silent=silent)



[docs]
    def leaderboard(
        self,
        extra_info: bool = False,
        silent: bool = False,
    ) -> pd.DataFrame:
        """Get the model leaderboard.

        Parameters
        ----------
        extra_info : bool, default=False
            Whether to include extra information (fit time, etc.).
        silent : bool, default=False
            Whether to suppress output.

        Returns
        -------
        pd.DataFrame
            Leaderboard with model names and scores.
        """
        self._check_is_fitted()
        return self.predictor_.leaderboard(extra_info=extra_info, silent=silent)



[docs]
    def feature_importance(
        self,
        model: str | None = None,
        importance_type: str = "split",
    ) -> pd.DataFrame:
        """Get feature importance scores.

        Parameters
        ----------
        model : str, optional
            Specific model. If None, uses best model.
        importance_type : str, default="split"
            Type of importance: "split", "gain", "permutation".

        Returns
        -------
        pd.DataFrame
            Feature importance scores.
        """
        self._check_is_fitted()
        return self.predictor_.feature_importance(
            model=model, importance_type=importance_type
        )



[docs]
    def save(self, path: str | None = None) -> str:
        """Save the predictor to disk.

        Parameters
        ----------
        path : str, optional
            Path to save to. If None, uses output_path.

        Returns
        -------
        str
            Path where the predictor was saved.
        """
        self._check_is_fitted()
        return self.predictor_.save(path)



[docs]
    @classmethod
    def load(cls, path: str) -> AutoMLPredictor:
        """Load a predictor from disk.

        Parameters
        ----------
        path : str
            Path to load from.

        Returns
        -------
        AutoMLPredictor
            The loaded predictor.
        """
        path = Path(path)

        # Determine domain from saved predictor
        # For now, assume tabular
        predictor = TabularPredictor.load(str(path))

        # Create wrapper
        wrapper = cls(
            label=predictor.label,
            problem_type=predictor.problem_type,
            eval_metric=predictor.eval_metric,
            presets=predictor.presets,
            time_limit=predictor.time_limit,
            search_strategy=predictor.search_strategy,
            random_state=predictor.random_state,
            verbosity=predictor.verbosity,
        )
        wrapper.predictor_ = predictor
        wrapper.domain_ = "tabular"

        return wrapper


    def _detect_domain(self, data: DataInput) -> str:
        """Detect the data domain from the input.

        Parameters
        ----------
        data : various
            Input data.

        Returns
        -------
        str
            Detected domain.
        """
        # For now, default to tabular
        # Future: implement detection for text, vision, etc.

        if isinstance(data, (str, Path)):
            path = Path(data)
            suffix = path.suffix.lower()

            # Image formats
            if suffix in (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff"):
                return "vision"

            # Audio formats
            if suffix in (".wav", ".mp3", ".flac", ".ogg", ".m4a"):
                return "audio"

            # Check file contents for text vs tabular
            if suffix == ".csv":
                # Could be tabular or text - check first rows
                try:
                    df = pd.read_csv(data, nrows=5)
                    if self._looks_like_text_data(df):
                        return "text"
                    return "tabular"
                except Exception:
                    return "tabular"

            return "tabular"

        elif isinstance(data, pd.DataFrame):
            if self._looks_like_text_data(data):
                return "text"
            return "tabular"

        elif isinstance(data, np.ndarray):
            # Check shape for images
            if data.ndim == 4:
                # Likely image data (N, H, W, C) or (N, C, H, W)
                return "vision"
            return "tabular"

        return "tabular"

    def _looks_like_text_data(self, df: pd.DataFrame) -> bool:
        """Check if DataFrame looks like text data.

        Parameters
        ----------
        df : DataFrame
            Data to check.

        Returns
        -------
        bool
            True if data appears to be text.
        """
        # Check for text-like columns (long strings)
        for col in df.columns:
            if col == self.label:
                continue

            if df[col].dtype == object:
                sample = df[col].dropna().head(10)
                if len(sample) > 0:
                    avg_len = sample.astype(str).str.len().mean()
                    # Text columns typically have long strings
                    if avg_len > 100:
                        return True

        return False

    def _create_predictor(self, domain: str) -> BasePredictor:
        """Create a domain-specific predictor.

        Parameters
        ----------
        domain : str
            Data domain.

        Returns
        -------
        BasePredictor
            Domain-specific predictor.
        """
        if domain == "tabular":
            return TabularPredictor(
                label=self.label,
                problem_type=self.problem_type,
                eval_metric=self.eval_metric,
                presets=self.presets,
                time_limit=self.time_limit,
                search_strategy=self.search_strategy,
                track_experiments=self.track_experiments,
                output_path=self.output_path,
                random_state=self.random_state,
                verbosity=self.verbosity,
            )

        elif domain == "text":
            try:
                from endgame.automl.text import TextPredictor

                return TextPredictor(
                    label=self.label,
                    problem_type=self.problem_type,
                    eval_metric=self.eval_metric,
                    presets=self.presets,
                    time_limit=self.time_limit,
                    track_experiments=self.track_experiments,
                    output_path=self.output_path,
                    random_state=self.random_state,
                    verbosity=self.verbosity,
                )
            except ImportError:
                warnings.warn(
                    "TextPredictor not available. Using TabularPredictor with text features.",
                    UserWarning,
                )
                return TabularPredictor(
                    label=self.label,
                    problem_type=self.problem_type,
                    eval_metric=self.eval_metric,
                    presets=self.presets,
                    time_limit=self.time_limit,
                    search_strategy=self.search_strategy,
                    track_experiments=self.track_experiments,
                    output_path=self.output_path,
                    random_state=self.random_state,
                    verbosity=self.verbosity,
                )

        elif domain == "vision":
            try:
                from endgame.automl.vision import VisionPredictor

                return VisionPredictor(
                    label=self.label,
                    problem_type=self.problem_type,
                    eval_metric=self.eval_metric,
                    presets=self.presets,
                    time_limit=self.time_limit,
                    track_experiments=self.track_experiments,
                    output_path=self.output_path,
                    random_state=self.random_state,
                    verbosity=self.verbosity,
                )
            except ImportError:
                raise NotImplementedError(
                    "VisionPredictor is not yet implemented. "
                    "Please use TabularPredictor with image features."
                )

        elif domain == "timeseries":
            try:
                from endgame.automl.timeseries import TimeSeriesPredictor

                return TimeSeriesPredictor(
                    label=self.label,
                    problem_type=self.problem_type,
                    eval_metric=self.eval_metric,
                    presets=self.presets,
                    time_limit=self.time_limit,
                    track_experiments=self.track_experiments,
                    output_path=self.output_path,
                    random_state=self.random_state,
                    verbosity=self.verbosity,
                )
            except ImportError:
                raise NotImplementedError(
                    "TimeSeriesPredictor is not yet implemented."
                )

        elif domain == "audio":
            try:
                from endgame.automl.audio import AudioPredictor

                return AudioPredictor(
                    label=self.label,
                    problem_type=self.problem_type,
                    eval_metric=self.eval_metric,
                    presets=self.presets,
                    time_limit=self.time_limit,
                    track_experiments=self.track_experiments,
                    output_path=self.output_path,
                    random_state=self.random_state,
                    verbosity=self.verbosity,
                )
            except ImportError:
                raise NotImplementedError(
                    "AudioPredictor is not yet implemented."
                )

        else:
            raise ValueError(f"Unknown domain: {domain}")

    def _check_is_fitted(self) -> None:
        """Check if the predictor is fitted."""
        if self.predictor_ is None or not self.predictor_.is_fitted_:
            raise RuntimeError(
                "Predictor is not fitted. Call fit() before making predictions."
            )

    @property
    def is_fitted(self) -> bool:
        """Check if the predictor is fitted."""
        return self.predictor_ is not None and self.predictor_.is_fitted_

    @property
    def problem_type_(self) -> str | None:
        """Get the detected problem type."""
        if self.predictor_ is None:
            return None
        return self.predictor_.problem_type_

    @property
    def classes_(self) -> np.ndarray | None:
        """Get the class labels for classification."""
        if self.predictor_ is None:
            return None
        return self.predictor_.classes_

    @property
    def feature_names_(self) -> list[str] | None:
        """Get the feature names."""
        if self.predictor_ is None:
            return None
        return self.predictor_.feature_names_

    @property
    def fit_summary_(self):
        """Get the fit summary."""
        if self.predictor_ is None:
            return None
        return self.predictor_.fit_summary_

    def __repr__(self) -> str:
        if self.predictor_ is None:
            fitted_str = "not fitted"
        else:
            fitted_str = "fitted" if self.predictor_.is_fitted_ else "not fitted"

        return (
            f"AutoMLPredictor("
            f"label='{self.label}', "
            f"presets='{self.presets}', "
            f"domain='{self.domain_}', "
            f"{fitted_str})"
        )