Source code for endgame.automl.search.base

from __future__ import annotations

"""Base classes for search strategies.

This module defines the interfaces and data structures used by
all search strategy implementations.
"""

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any

import numpy as np



[docs]
@dataclass
class PipelineConfig:
    """Represents a complete ML pipeline configuration.

    A pipeline config specifies everything needed to train a model:
    preprocessing steps, model choice, and hyperparameters.

    Attributes
    ----------
    model_name : str
        Name of the model (key in model registry).
    model_params : dict
        Hyperparameters for the model.
    preprocessing : list of tuple
        Preprocessing steps as (name, params) tuples.
    feature_engineering : list of tuple
        Feature engineering steps as (name, params) tuples.
    ensemble_weight : float
        Initial weight for ensembling (may be adjusted later).
    config_id : str, optional
        Unique identifier for this configuration. When omitted, a
        12-character hexadecimal digest is derived from the model name,
        model parameters, preprocessing and feature engineering steps.
    metadata : dict
        Additional metadata about this configuration.

    Examples
    --------
    >>> config = PipelineConfig(
    ...     model_name="lgbm",
    ...     model_params={"n_estimators": 1000, "learning_rate": 0.05},
    ...     preprocessing=[
    ...         ("imputer", {"strategy": "median"}),
    ...         ("encoder", {"method": "target"}),
    ...     ],
    ... )
    """

    model_name: str
    model_params: dict[str, Any] = field(default_factory=dict)
    preprocessing: list[tuple[str, dict[str, Any]]] = field(default_factory=list)
    feature_engineering: list[tuple[str, dict[str, Any]]] = field(default_factory=list)
    ensemble_weight: float = 1.0
    config_id: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        """Generate ``config_id`` if the caller did not provide one."""
        if self.config_id is not None:
            return

        import hashlib
        import json

        # Only the fields that define the pipeline itself take part in the
        # hash; ``ensemble_weight`` and ``metadata`` do not influence the id.
        # ``sort_keys`` makes the digest independent of dict insertion order
        # and ``default=str`` stringifies values json cannot encode natively.
        config_str = json.dumps(
            {
                "model": self.model_name,
                "params": self.model_params,
                "preproc": self.preprocessing,
                "fe": self.feature_engineering,
            },
            sort_keys=True,
            default=str,
        )
        # md5 is used purely as a short fingerprint, not for security.
        self.config_id = hashlib.md5(config_str.encode()).hexdigest()[:12]

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary representation.

        Returns
        -------
        dict
            Mapping with one entry per dataclass field. The containers are
            the instance's own objects, not copies.
        """
        return {
            "model_name": self.model_name,
            "model_params": self.model_params,
            "preprocessing": self.preprocessing,
            "feature_engineering": self.feature_engineering,
            "ensemble_weight": self.ensemble_weight,
            "config_id": self.config_id,
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, d: dict[str, Any]) -> PipelineConfig:
        """Create from dictionary representation.

        Parameters
        ----------
        d : dict
            Mapping as produced by :meth:`to_dict`. Only ``"model_name"``
            is required; missing keys (or keys explicitly set to ``None``
            for the container fields) fall back to the field defaults.

        Returns
        -------
        PipelineConfig
            A new configuration. Step entries are normalized to tuples, so
            a dictionary that went through JSON (where tuples become lists)
            yields a config equal to the one it was created from. The
            top-level containers are copied, so the new config does not
            share them with ``d``.

        Raises
        ------
        KeyError
            If ``"model_name"`` is missing from ``d``.
        """

        def as_steps(raw: Any) -> list[tuple[str, dict[str, Any]]]:
            # JSON has no tuple type: restore the (name, params) tuple shape.
            return [tuple(step) for step in (raw or [])]

        return cls(
            model_name=d["model_name"],
            model_params=dict(d.get("model_params") or {}),
            preprocessing=as_steps(d.get("preprocessing")),
            feature_engineering=as_steps(d.get("feature_engineering")),
            ensemble_weight=d.get("ensemble_weight", 1.0),
            config_id=d.get("config_id"),
            metadata=dict(d.get("metadata") or {}),
        )





[docs]
@dataclass
class SearchResult:
    """Result from evaluating a pipeline configuration.

    Attributes
    ----------
    config : PipelineConfig
        The configuration that was evaluated.
    score : float
        Primary evaluation metric score.
    scores : dict
        All evaluation metric scores.
    fit_time : float
        Time taken to fit the model in seconds.
    predict_time : float
        Time taken for predictions in seconds.
    oof_predictions : np.ndarray, optional
        Out-of-fold predictions. Not taken into account when two results
        are compared with ``==``.
    feature_importances : dict, optional
        Feature importance scores.
    success : bool
        Whether the evaluation completed successfully.
    error : str, optional
        Error message if evaluation failed.
    metadata : dict
        Additional metadata about this result.
    """

    config: PipelineConfig
    score: float
    scores: dict[str, float] = field(default_factory=dict)
    fit_time: float = 0.0
    predict_time: float = 0.0
    # compare=False: the dataclass-generated __eq__ would otherwise evaluate
    # ``array == array`` in a boolean context, which raises ValueError for
    # arrays with more than one element.
    oof_predictions: np.ndarray | None = field(default=None, compare=False)
    feature_importances: dict[str, float] | None = None
    success: bool = True
    error: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)




[docs]
class BaseSearchStrategy(ABC):
    """Base class for pipeline search strategies.

    A search strategy is responsible for suggesting pipeline configurations
    to try and updating its internal state based on the results.

    Parameters
    ----------
    task_type : str
        Task type ("classification" or "regression").
    eval_metric : str or callable
        Evaluation metric to optimize.
    random_state : int, optional
        Random seed for reproducibility. When given, NumPy's global random
        generator is seeded with it at construction time.
    verbose : int, default=0
        Verbosity level.
    excluded_models : set of str, optional
        Stored on the instance as ``excluded_models``; an empty set is used
        when omitted. The base class itself does not read it.

    Attributes
    ----------
    results_ : list of SearchResult
        Results from all evaluated configurations.
    best_result_ : SearchResult or None
        Best result found so far.
    n_evaluated_ : int
        Number of configurations evaluated.

    Notes
    -----
    Scores are compared as "higher is better". A successful result whose
    score is NaN is recorded but never becomes the best result, is ranked
    last by :meth:`get_best` and is left out of the score statistics.
    """

    def __init__(
        self,
        task_type: str = "classification",
        eval_metric: str = "auto",
        random_state: int | None = None,
        verbose: int = 0,
        excluded_models: set[str] | None = None,
    ):
        self.task_type = task_type
        self.eval_metric = eval_metric
        self.random_state = random_state
        self.verbose = verbose
        self.excluded_models = excluded_models or set()

        # State
        self.results_: list[SearchResult] = []
        self.best_result_: SearchResult | None = None
        self.n_evaluated_: int = 0

        # NOTE: this seeds NumPy's *global* generator, so it affects every
        # other user of ``np.random`` in the process as well.
        if random_state is not None:
            np.random.seed(random_state)

    @staticmethod
    def _has_valid_score(result: SearchResult) -> bool:
        """Return True when the result's score is not NaN."""
        return not np.isnan(result.score)

    @abstractmethod
    def suggest(
        self,
        meta_features: dict[str, float] | None = None,
        n_suggestions: int = 1,
    ) -> list[PipelineConfig]:
        """Suggest pipeline configurations to try.

        Parameters
        ----------
        meta_features : dict, optional
            Dataset meta-features for informed suggestions.
        n_suggestions : int, default=1
            Number of configurations to suggest.

        Returns
        -------
        list of PipelineConfig
            Suggested configurations.
        """
        pass

    def update(self, result: SearchResult) -> None:
        """Update the search strategy with a new result.

        The result is always recorded. It replaces ``best_result_`` only
        when it succeeded, its score is not NaN, and the score is strictly
        greater than the current best.

        Parameters
        ----------
        result : SearchResult
            Result from evaluating a configuration.
        """
        self.results_.append(result)
        self.n_evaluated_ += 1

        # NaN compares False against everything: if it were accepted as the
        # best result, no later score could ever replace it.
        if not result.success or not self._has_valid_score(result):
            return

        current_best = self.best_result_
        if current_best is None or result.score > current_best.score:
            self.best_result_ = result

            if self.verbose > 0:
                print(
                    f"New best: {result.config.model_name} "
                    f"score={result.score:.4f}"
                )

    def get_best(self, n: int = 1) -> list[SearchResult]:
        """Get the best results found so far.

        Parameters
        ----------
        n : int, default=1
            Number of best results to return.

        Returns
        -------
        list of SearchResult
            Top n successful results sorted by score (descending). Results
            with a NaN score are placed after all results with a real score.
        """

        def rank(result: SearchResult) -> tuple[int, float]:
            # The leading flag keeps NaN out of the float comparison, which
            # would otherwise make the sort order unreliable.
            if self._has_valid_score(result):
                return (1, result.score)
            return (0, 0.0)

        successful = [r for r in self.results_ if r.success]
        return sorted(successful, key=rank, reverse=True)[:n]

    def get_results_summary(self) -> dict[str, Any]:
        """Get a summary of search results.

        Returns
        -------
        dict
            Summary statistics with the keys ``n_evaluated``,
            ``n_successful``, ``n_failed``, ``best_score``, ``mean_score``,
            ``std_score``, ``best_model`` and ``total_fit_time``. The three
            score statistics are computed over successful results with a
            non-NaN score and are ``None`` when there are none.
        """
        successful = [r for r in self.results_ if r.success]
        n_failed = len(self.results_) - len(successful)
        valid_scores = [r.score for r in successful if self._has_valid_score(r)]

        if valid_scores:
            best_score = max(valid_scores)
            mean_score = float(np.mean(valid_scores))
            std_score = float(np.std(valid_scores))
        else:
            best_score = mean_score = std_score = None

        best = self.best_result_
        return {
            "n_evaluated": self.n_evaluated_,
            "n_successful": len(successful),
            "n_failed": n_failed,
            "best_score": best_score,
            "mean_score": mean_score,
            "std_score": std_score,
            "best_model": best.config.model_name if successful and best else None,
            "total_fit_time": sum((r.fit_time for r in successful), 0.0),
        }

    def should_stop(self, max_iterations: int | None = None) -> bool:
        """Check if search should stop.

        Parameters
        ----------
        max_iterations : int, optional
            Maximum number of iterations. ``None`` means no limit; ``0``
            is a real limit and stops the search immediately.

        Returns
        -------
        bool
            Whether to stop searching.
        """
        # Compare against None explicitly so that a limit of 0 is honoured.
        return max_iterations is not None and self.n_evaluated_ >= max_iterations

    def reset(self) -> None:
        """Reset the search strategy state."""
        self.results_ = []
        self.best_result_ = None
        self.n_evaluated_ = 0




class SearchCallback:
    """Hook interface for observing the progress of a search.

    Every hook is a no-op here; subclasses override the ones they need.
    """

    def on_search_start(self, strategy: BaseSearchStrategy) -> None:
        """Hook invoked when the search begins; does nothing by default."""
        return None

    def on_config_evaluated(
        self,
        strategy: BaseSearchStrategy,
        result: SearchResult,
    ) -> None:
        """Hook invoked after each evaluated configuration; does nothing by default."""
        return None

    def on_search_end(self, strategy: BaseSearchStrategy) -> None:
        """Hook invoked when the search completes; does nothing by default."""
        return None


class ProgressCallback(SearchCallback):
    """Callback that prints one progress line per evaluated configuration.

    Parameters
    ----------
    total_configs : int, optional
        Expected number of configurations. When set (and non-zero) the
        progress counter is printed as ``[done/total]``, otherwise as
        ``[done]``.
    """

    def __init__(self, total_configs: int | None = None):
        self.total_configs = total_configs

    def on_config_evaluated(
        self,
        strategy: BaseSearchStrategy,
        result: SearchResult,
    ) -> None:
        """Print the counter, model name, status, score and fit time."""
        done = strategy.n_evaluated_

        # The score is only formatted for successful evaluations.
        if result.success:
            outcome, shown_score = "OK", f"{result.score:.4f}"
        else:
            outcome, shown_score = "FAIL", "N/A"

        # Both output variants differ only in the bracketed counter.
        counter = f"{done}/{self.total_configs}" if self.total_configs else f"{done}"
        print(
            f"[{counter}] {result.config.model_name}: "
            f"{outcome} score={shown_score} time={result.fit_time:.1f}s"
        )