Source code for endgame.benchmark.report

"""HTML Report Generator for Benchmark Results.

Generates beautiful, interactive HTML reports with:
- Performance comparison charts (accuracy, F1, AUC, etc.)
- Training time comparisons
- Model interpretability outputs (rules, trees, equations)
- Interactive Plotly charts
- Sortable tables
"""

from __future__ import annotations

import html
import json
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING

import numpy as np

if TYPE_CHECKING:
    from endgame.benchmark.tracker import ExperimentTracker

try:
    import plotly.express as px
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots
    HAS_PLOTLY = True
except ImportError:
    HAS_PLOTLY = False

try:
    import polars as pl
    HAS_POLARS = True
except ImportError:
    HAS_POLARS = False


def _escape_html(text: str) -> str:
    """Escape HTML special characters."""
    return html.escape(str(text))


def _format_number(value: float, precision: int = 4) -> str:
    """Format a number for display."""
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return "N/A"
    if abs(value) < 0.0001 and value != 0:
        return f"{value:.2e}"
    return f"{value:.{precision}f}"


def _get_color_scale(n_colors: int) -> list[str]:
    """Get a list of distinct colors for charts."""
    colors = px.colors.qualitative.Set3 + px.colors.qualitative.Pastel1
    return colors[:n_colors] if n_colors <= len(colors) else colors * (n_colors // len(colors) + 1)


class BenchmarkReportGenerator:
    """Generate HTML reports from benchmark results.

    Parameters
    ----------
    tracker : ExperimentTracker
        The experiment tracker with benchmark results.
    title : str, optional
        Report title.

    Examples
    --------
    >>> from endgame.benchmark import BenchmarkRunner, BenchmarkReportGenerator
    >>> runner = BenchmarkRunner(suite="sklearn-classic")
    >>> tracker = runner.run(models)
    >>> report = BenchmarkReportGenerator(tracker)
    >>> report.generate("benchmark_report.html")
    """

    # Stylesheet embedded in every report. Kept as a plain (non f-) string so
    # CSS braces do not need to be doubled.
    _STYLES = """
        :root {
            --primary-color: #6366f1;
            --secondary-color: #8b5cf6;
            --success-color: #22c55e;
            --warning-color: #f59e0b;
            --danger-color: #ef4444;
            --bg-color: #0f172a;
            --card-bg: #1e293b;
            --text-color: #e2e8f0;
            --text-muted: #94a3b8;
            --border-color: #334155;
        }

        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }

        body {
            font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            background: var(--bg-color);
            color: var(--text-color);
            line-height: 1.6;
        }

        .container {
            max-width: 1400px;
            margin: 0 auto;
            padding: 2rem;
        }

        header {
            background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
            padding: 3rem 2rem;
            text-align: center;
            margin-bottom: 2rem;
            border-radius: 0 0 1rem 1rem;
        }

        header h1 {
            font-size: 2.5rem;
            font-weight: 700;
            margin-bottom: 0.5rem;
        }

        header .subtitle {
            color: rgba(255, 255, 255, 0.8);
            font-size: 1.1rem;
        }

        .card {
            background: var(--card-bg);
            border-radius: 1rem;
            padding: 1.5rem;
            margin-bottom: 1.5rem;
            border: 1px solid var(--border-color);
            box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.3);
        }

        .card h2 {
            font-size: 1.5rem;
            margin-bottom: 1rem;
            color: var(--primary-color);
            display: flex;
            align-items: center;
            gap: 0.5rem;
        }

        .card h3 {
            font-size: 1.2rem;
            margin: 1.5rem 0 1rem 0;
            color: var(--text-color);
        }

        .stats-grid {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 1rem;
            margin-bottom: 1.5rem;
        }

        .stat-card {
            background: rgba(99, 102, 241, 0.1);
            border-radius: 0.75rem;
            padding: 1.25rem;
            text-align: center;
            border: 1px solid rgba(99, 102, 241, 0.2);
        }

        .stat-card .value {
            font-size: 2rem;
            font-weight: 700;
            color: var(--primary-color);
        }

        .stat-card .label {
            color: var(--text-muted);
            font-size: 0.875rem;
            margin-top: 0.25rem;
        }

        .chart-container {
            width: 100%;
            min-height: 400px;
            margin: 1rem 0;
        }

        table {
            width: 100%;
            border-collapse: collapse;
            margin: 1rem 0;
            font-size: 0.9rem;
        }

        th, td {
            padding: 0.75rem 1rem;
            text-align: left;
            border-bottom: 1px solid var(--border-color);
        }

        th {
            background: rgba(99, 102, 241, 0.1);
            font-weight: 600;
            color: var(--primary-color);
            cursor: pointer;
            user-select: none;
        }

        th:hover {
            background: rgba(99, 102, 241, 0.2);
        }

        tr:hover {
            background: rgba(255, 255, 255, 0.02);
        }

        .badge {
            display: inline-block;
            padding: 0.25rem 0.75rem;
            border-radius: 9999px;
            font-size: 0.75rem;
            font-weight: 600;
        }

        .badge-success {
            background: rgba(34, 197, 94, 0.2);
            color: var(--success-color);
        }

        .badge-warning {
            background: rgba(245, 158, 11, 0.2);
            color: var(--warning-color);
        }

        .badge-danger {
            background: rgba(239, 68, 68, 0.2);
            color: var(--danger-color);
        }

        .interpretability-output {
            background: #0d1117;
            border-radius: 0.5rem;
            padding: 1rem;
            margin: 1rem 0;
            overflow-x: auto;
            font-family: 'Fira Code', 'Monaco', monospace;
            font-size: 0.85rem;
            line-height: 1.5;
            white-space: pre-wrap;
            border: 1px solid var(--border-color);
        }

        .model-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 0.5rem;
        }

        .model-name {
            font-weight: 600;
            color: var(--secondary-color);
        }

        .dataset-name {
            color: var(--text-muted);
            font-size: 0.875rem;
        }

        .tabs {
            display: flex;
            gap: 0.5rem;
            margin-bottom: 1rem;
            border-bottom: 1px solid var(--border-color);
            padding-bottom: 0.5rem;
        }

        .tab {
            padding: 0.5rem 1rem;
            border-radius: 0.5rem 0.5rem 0 0;
            cursor: pointer;
            transition: all 0.2s;
            color: var(--text-muted);
        }

        .tab:hover {
            background: rgba(99, 102, 241, 0.1);
        }

        .tab.active {
            background: var(--primary-color);
            color: white;
        }

        .tab-content {
            display: none;
        }

        .tab-content.active {
            display: block;
        }

        .rank-1 { color: #ffd700; font-weight: 700; }
        .rank-2 { color: #c0c0c0; font-weight: 600; }
        .rank-3 { color: #cd7f32; font-weight: 600; }

        footer {
            text-align: center;
            padding: 2rem;
            color: var(--text-muted);
            font-size: 0.875rem;
        }

        .collapsible {
            cursor: pointer;
            padding: 0.75rem;
            background: rgba(99, 102, 241, 0.1);
            border-radius: 0.5rem;
            margin: 0.5rem 0;
            display: flex;
            justify-content: space-between;
            align-items: center;
        }

        .collapsible:hover {
            background: rgba(99, 102, 241, 0.2);
        }

        .collapsible-content {
            display: none;
            padding: 1rem;
            animation: fadeIn 0.3s ease;
        }

        .collapsible-content.show {
            display: block;
        }

        @keyframes fadeIn {
            from { opacity: 0; transform: translateY(-10px); }
            to { opacity: 1; transform: translateY(0); }
        }

        .progress-bar {
            width: 100%;
            height: 8px;
            background: var(--border-color);
            border-radius: 4px;
            overflow: hidden;
        }

        .progress-bar-fill {
            height: 100%;
            background: linear-gradient(90deg, var(--primary-color), var(--secondary-color));
            transition: width 0.3s ease;
        }

        @media (max-width: 768px) {
            .container {
                padding: 1rem;
            }

            header h1 {
                font-size: 1.75rem;
            }

            .stats-grid {
                grid-template-columns: repeat(2, 1fr);
            }
        }
    """

    # Client-side behaviour: sortable tables, collapsible panels and tabs.
    _SCRIPTS = """
        // Table sorting
        document.querySelectorAll('th[data-sort]').forEach(th => {
            th.addEventListener('click', () => {
                const table = th.closest('table');
                const tbody = table.querySelector('tbody');
                const rows = Array.from(tbody.querySelectorAll('tr'));
                const idx = Array.from(th.parentNode.children).indexOf(th);
                const asc = th.dataset.order !== 'asc';

                rows.sort((a, b) => {
                    const aVal = a.children[idx].textContent;
                    const bVal = b.children[idx].textContent;
                    const aNum = parseFloat(aVal);
                    const bNum = parseFloat(bVal);

                    if (!isNaN(aNum) && !isNaN(bNum)) {
                        return asc ? aNum - bNum : bNum - aNum;
                    }
                    return asc ? aVal.localeCompare(bVal) : bVal.localeCompare(aVal);
                });

                th.dataset.order = asc ? 'asc' : 'desc';
                rows.forEach(row => tbody.appendChild(row));
            });
        });

        // Collapsible sections
        document.querySelectorAll('.collapsible').forEach(el => {
            el.addEventListener('click', () => {
                const content = el.nextElementSibling;
                content.classList.toggle('show');
                el.querySelector('.arrow').textContent = content.classList.contains('show') ? '▼' : '▶';
            });
        });

        // Tabs
        document.querySelectorAll('.tab').forEach(tab => {
            tab.addEventListener('click', () => {
                const tabGroup = tab.closest('.tabs-container');
                tabGroup.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
                tabGroup.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
                tab.classList.add('active');
                tabGroup.querySelector(tab.dataset.target).classList.add('active');
            });
        });
    """

    def __init__(
        self,
        tracker: ExperimentTracker,
        title: str = "Endgame Benchmark Report",
    ):
        self.tracker = tracker
        self.title = title
        # Snapshot of the results taken once at construction time; every
        # section below reads from this dataframe (polars or pandas).
        self._df = tracker.to_dataframe()
        self._interpretability_outputs: dict[str, dict[str, str]] = {}

    def add_interpretability_output(
        self,
        model_name: str,
        dataset_name: str,
        output: str,
        output_type: str = "text",
    ) -> None:
        """Add interpretability output for a model.

        A later call with the same model/dataset pair replaces the earlier
        entry.

        Parameters
        ----------
        model_name : str
            Name of the model.
        dataset_name : str
            Name of the dataset.
        output : str
            The interpretability output (rules, tree structure, equation, etc.)
        output_type : str
            Type of output: "text", "html", "latex", "code". "html" output is
            inserted into the report verbatim (not escaped).
        """
        key = f"{model_name}_{dataset_name}"
        self._interpretability_outputs[key] = {
            "model_name": model_name,
            "dataset_name": dataset_name,
            "output": output,
            "output_type": output_type,
        }

    def generate(
        self,
        output_path: str,
        include_interpretability: bool = True,
        include_meta_features: bool = False,
    ) -> str:
        """Generate the HTML report.

        Parameters
        ----------
        output_path : str
            Path to save the HTML report. Missing parent directories are
            created.
        include_interpretability : bool
            Include interpretability outputs section (only emitted when at
            least one output has been added).
        include_meta_features : bool
            Include dataset meta-features section.

        Returns
        -------
        str
            Path to the generated report (``output_path`` as given).

        Raises
        ------
        ImportError
            If Plotly is not installed.
        """
        if not HAS_PLOTLY:
            raise ImportError("Plotly is required for report generation. Install with: pip install plotly")

        # Sections appear in the report in exactly this order.
        sections = [
            self._generate_header(),
            self._generate_summary_section(),
            self._generate_performance_section(),
            self._generate_speed_section(),
            self._generate_dataset_section(),
            self._generate_rankings_section(),
        ]
        if include_interpretability and self._interpretability_outputs:
            sections.append(self._generate_interpretability_section())
        if include_meta_features:
            sections.append(self._generate_meta_features_section())
        sections.append(self._generate_failures_section())
        sections.append(self._generate_footer())

        destination = Path(output_path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(self._wrap_html(sections), encoding="utf-8")

        return output_path

    @staticmethod
    def _json_for_script(payload) -> str:
        """Serialize ``payload`` to JSON that is safe inside a ``<script>`` tag.

        ``json.dumps`` leaves ``<`` untouched, so a string such as a model
        name containing ``</script>`` would terminate the inline script.
        Replacing ``<`` with its ``\\u003c`` escape keeps the decoded value
        identical while removing every tag-like sequence from the markup.
        """
        return json.dumps(payload).replace("<", "\\u003c")

    @staticmethod
    def _format_seconds(value) -> str:
        """Format a duration as ``"<number>s"``, or ``"N/A"`` when missing."""
        text = _format_number(value, 2)
        return text if text == "N/A" else f"{text}s"

    def _successful_rows(self):
        """Return the rows of the results dataframe whose status is 'success'."""
        df = self._df
        if HAS_POLARS:
            return df.filter(pl.col('status') == 'success')
        return df[df['status'] == 'success']

    def _wrap_html(self, sections: list[str]) -> str:
        """Wrap sections in HTML document structure."""
        return f"""<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{_escape_html(self.title)}</title>
    <script src="https://cdn.plot.ly/plotly-2.27.0.min.js"></script>
    <style>
{self._STYLES}
    </style>
</head>
<body>
    <div class="container">
        {''.join(sections)}
    </div>

    <script>
{self._SCRIPTS}
    </script>
</body>
</html>"""

    def _generate_header(self) -> str:
        """Generate the report header."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        n_experiments = len(self.tracker)
        n_successful = len(self.tracker.get_successful())

        return f"""
        <header>
            <h1>🏆 {_escape_html(self.title)}</h1>
            <p class="subtitle">Generated on {timestamp} • {n_experiments} experiments • {n_successful} successful</p>
        </header>
        """

    def _generate_summary_section(self) -> str:
        """Generate summary statistics section."""
        df = self._df
        successful = self._successful_rows()

        n_total = len(df)
        n_success = len(successful)
        if HAS_POLARS:
            n_datasets = df['dataset_name'].n_unique()
            n_models = df['model_name'].n_unique()
        else:
            n_datasets = df['dataset_name'].nunique()
            n_models = df['model_name'].nunique()

        # Defaults shown when no accuracy value is available.
        best_acc = 0
        best_acc_model = "N/A"
        if 'metric_accuracy' in successful.columns and n_success > 0:
            if HAS_POLARS:
                # nulls_last: polars puts nulls first by default, even when
                # sorting descending, which would pick a row with no score.
                top = successful.sort('metric_accuracy', descending=True, nulls_last=True).head(1)
                candidate = top['metric_accuracy'][0]
                if candidate is not None:
                    best_acc = candidate
                    best_acc_model = top['model_name'][0]
            else:
                scores = successful['metric_accuracy']
                if scores.notna().any():
                    best_idx = scores.idxmax()
                    best_acc = scores.loc[best_idx]
                    best_acc_model = successful.loc[best_idx, 'model_name']

        avg_fit_time = successful['fit_time'].mean() if 'fit_time' in successful.columns else 0
        success_rate = (n_success / n_total * 100) if n_total > 0 else 0

        return f"""
        <section class="card">
            <h2>📊 Summary</h2>
            <div class="stats-grid">
                <div class="stat-card">
                    <div class="value">{n_total}</div>
                    <div class="label">Total Experiments</div>
                </div>
                <div class="stat-card">
                    <div class="value">{n_success}</div>
                    <div class="label">Successful</div>
                </div>
                <div class="stat-card">
                    <div class="value">{n_datasets}</div>
                    <div class="label">Datasets</div>
                </div>
                <div class="stat-card">
                    <div class="value">{n_models}</div>
                    <div class="label">Models</div>
                </div>
                <div class="stat-card">
                    <div class="value">{_format_number(best_acc)}</div>
                    <div class="label">Best Accuracy ({_escape_html(best_acc_model)})</div>
                </div>
                <div class="stat-card">
                    <div class="value">{self._format_seconds(avg_fit_time)}</div>
                    <div class="label">Avg Fit Time</div>
                </div>
            </div>
            <div class="progress-bar">
                <div class="progress-bar-fill" style="width: {success_rate}%"></div>
            </div>
            <p style="text-align: center; color: var(--text-muted); margin-top: 0.5rem;">
                {_format_number(success_rate, 1)}% success rate
            </p>
        </section>
        """

    def _generate_performance_section(self) -> str:
        """Generate performance comparison charts."""
        successful = self._successful_rows()

        if len(successful) == 0:
            return """<section class="card"><h2>📈 Performance</h2><p>No successful experiments to display.</p></section>"""

        # (candidate columns in preference order, display label, div id).
        # The div id is passed to the chart builder explicitly so the Plotly
        # call always targets the element emitted here.
        chart_specs = [
            (('metric_accuracy',), 'Accuracy', 'accuracy-chart'),
            (('metric_f1', 'metric_f1_weighted'), 'F1 Score', 'f1-chart'),
            (('metric_roc_auc', 'metric_roc_auc_ovr_weighted'), 'ROC-AUC', 'auc-chart'),
        ]

        columns = successful.columns
        charts_html = []
        for candidates, label, div_id in chart_specs:
            metric_col = next((c for c in candidates if c in columns), None)
            if metric_col is None:
                continue
            chart_script = self._create_metric_comparison_chart(
                successful, metric_col, label, chart_id=div_id
            )
            if chart_script:
                charts_html.append(f'<div class="chart-container" id="{div_id}"></div>')
                charts_html.append(f'<script>{chart_script}</script>')

        return f"""
        <section class="card">
            <h2>📈 Performance Comparison</h2>
            {''.join(charts_html)}
        </section>
        """

    def _create_metric_comparison_chart(
        self,
        df,
        metric_col: str,
        metric_name: str,
        chart_id: str | None = None,
    ) -> str | None:
        """Create a bar chart comparing model performance on a metric.

        Parameters
        ----------
        df
            Dataframe of successful experiments (polars or pandas, matching
            ``HAS_POLARS``).
        metric_col : str
            Column holding the metric values.
        metric_name : str
            Human-readable metric name used in titles and hover text.
        chart_id : str, optional
            Id of the DOM element to draw into. Defaults to ``metric_col``
            without its ``metric_`` prefix, followed by ``-chart``.

        Returns
        -------
        str or None
            A JavaScript ``Plotly.newPlot(...)`` statement, or ``None`` when
            ``metric_col`` is not a column of ``df``.
        """
        if metric_col not in df.columns:
            return None

        # Aggregate by model
        if HAS_POLARS:
            agg = df.group_by('model_name').agg([
                pl.col(metric_col).mean().alias('mean'),
                pl.col(metric_col).std().alias('std'),
            ]).sort('mean', descending=True, nulls_last=True)

            models = agg['model_name'].to_list()
            means = agg['mean'].to_list()
            # std is null for single-observation groups; draw no error bar.
            stds = [s if s is not None else 0 for s in agg['std'].to_list()]
        else:
            agg = df.groupby('model_name')[metric_col].agg(['mean', 'std']).reset_index()
            agg = agg.sort_values('mean', ascending=False)

            models = agg['model_name'].tolist()
            means = agg['mean'].tolist()
            stds = agg['std'].fillna(0).tolist()

        colors = _get_color_scale(len(models))
        target_id = chart_id if chart_id is not None else metric_col.replace('metric_', '') + '-chart'

        data = [{
            'type': 'bar',
            'x': models,
            'y': means,
            'error_y': {
                'type': 'data',
                'array': stds,
                'visible': True,
            },
            'marker': {'color': colors[:len(models)]},
            'hovertemplate': '<b>%{x}</b><br>' + metric_name + ': %{y:.4f}<extra></extra>',
        }]
        layout = {
            'title': {'text': f'{metric_name} by Model (Mean ± Std)', 'font': {'color': '#e2e8f0'}},
            'xaxis': {'title': 'Model', 'tickangle': -45, 'color': '#94a3b8'},
            'yaxis': {'title': metric_name, 'color': '#94a3b8'},
            'paper_bgcolor': 'rgba(0,0,0,0)',
            'plot_bgcolor': 'rgba(0,0,0,0)',
            'font': {'color': '#e2e8f0'},
            'margin': {'b': 150},
        }

        to_js = self._json_for_script
        return f"Plotly.newPlot({to_js(target_id)}, {to_js(data)}, {to_js(layout)});"

    def _generate_speed_section(self) -> str:
        """Generate training speed comparison section."""
        successful = self._successful_rows()
        if 'fit_time' not in successful.columns or len(successful) == 0:
            return ""

        if HAS_POLARS:
            agg = successful.group_by('model_name').agg(
                pl.col('fit_time').mean().alias('mean_time')
            ).sort('mean_time', nulls_last=True)
            models = agg['model_name'].to_list()
            mean_times = agg['mean_time'].to_list()
        else:
            agg = successful.groupby('model_name')['fit_time'].agg(['mean']).reset_index()
            agg = agg.sort_values('mean')
            models = agg['model_name'].tolist()
            mean_times = agg['mean'].tolist()

        # Models with no recorded time cannot be placed on the chart (and a
        # None would break the threshold comparisons below).
        timed = [(m, t) for m, t in zip(models, mean_times) if t is not None]
        if not timed:
            return ""
        models = [m for m, _ in timed]
        mean_times = [t for _, t in timed]

        # Traffic-light colouring; thresholds match the legend below.
        colors = ['#22c55e' if t < 10 else '#f59e0b' if t < 60 else '#ef4444' for t in mean_times]
        chart_height = max(400, len(models) * 25)

        data = [{
            'type': 'bar',
            'x': mean_times,
            'y': models,
            'orientation': 'h',
            'marker': {'color': colors},
            'hovertemplate': '<b>%{y}</b><br>Avg: %{x:.2f}s<extra></extra>',
        }]
        layout = {
            'title': {'text': 'Training Time by Model (seconds)', 'font': {'color': '#e2e8f0'}},
            'xaxis': {'title': 'Time (seconds)', 'color': '#94a3b8', 'type': 'log'},
            'yaxis': {'title': '', 'color': '#94a3b8', 'autorange': 'reversed'},
            'paper_bgcolor': 'rgba(0,0,0,0)',
            'plot_bgcolor': 'rgba(0,0,0,0)',
            'font': {'color': '#e2e8f0'},
            'margin': {'l': 200},
            'height': chart_height,
        }

        to_js = self._json_for_script
        return f"""
        <section class="card">
            <h2>⚡ Training Speed</h2>
            <div class="chart-container" id="speed-chart" style="min-height: {chart_height}px"></div>
            <script>Plotly.newPlot('speed-chart', {to_js(data)}, {to_js(layout)});</script>
            <p style="color: var(--text-muted); text-align: center; margin-top: 1rem;">
                <span style="color: #22c55e;">●</span> Fast (&lt;10s) &nbsp;
                <span style="color: #f59e0b;">●</span> Medium (&lt;60s) &nbsp;
                <span style="color: #ef4444;">●</span> Slow (&gt;60s)
            </p>
        </section>
        """

    def _generate_dataset_section(self) -> str:
        """Generate per-dataset results section."""
        successful = self._successful_rows()
        if HAS_POLARS:
            datasets = successful['dataset_name'].unique().to_list()
        else:
            datasets = successful['dataset_name'].unique().tolist()

        if not datasets:
            return ""

        # Only rank by accuracy when the column exists (it may be absent,
        # e.g. when no experiment reported that metric).
        has_accuracy = 'metric_accuracy' in successful.columns

        dataset_sections = []
        for dataset in sorted(datasets):
            if HAS_POLARS:
                dataset_df = successful.filter(pl.col('dataset_name') == dataset)
                if has_accuracy:
                    dataset_df = dataset_df.sort('metric_accuracy', descending=True, nulls_last=True)
                records = dataset_df.iter_rows(named=True)
            else:
                dataset_df = successful[successful['dataset_name'] == dataset]
                if has_accuracy:
                    dataset_df = dataset_df.sort_values('metric_accuracy', ascending=False)
                records = dataset_df.to_dict('records')

            # Create table rows
            rows = []
            for position, row in enumerate(records, start=1):
                f1 = row.get('metric_f1')
                if f1 is None:
                    f1 = row.get('metric_f1_weighted')

                rank_class = f"rank-{position}" if position <= 3 else ""
                rows.append(f"""
                    <tr>
                        <td class="{rank_class}">{position}</td>
                        <td>{_escape_html(row['model_name'])}</td>
                        <td>{_format_number(row.get('metric_accuracy'))}</td>
                        <td>{_format_number(f1)}</td>
                        <td>{self._format_seconds(row.get('fit_time'))}</td>
                    </tr>
                """)

            dataset_sections.append(f"""
                <div class="collapsible">
                    <span><strong>{_escape_html(dataset)}</strong></span>
                    <span class="arrow">▶</span>
                </div>
                <div class="collapsible-content">
                    <table>
                        <thead>
                            <tr>
                                <th data-sort>Rank</th>
                                <th data-sort>Model</th>
                                <th data-sort>Accuracy</th>
                                <th data-sort>F1</th>
                                <th data-sort>Fit Time</th>
                            </tr>
                        </thead>
                        <tbody>
                            {''.join(rows)}
                        </tbody>
                    </table>
                </div>
            """)

        return f"""
        <section class="card">
            <h2>📁 Results by Dataset</h2>
            {''.join(dataset_sections)}
        </section>
        """

    def _generate_rankings_section(self) -> str:
        """Generate model rankings section.

        Models are ordered by mean accuracy over their successful
        experiments. Returns an empty string when there are no successful
        experiments or no accuracy column to rank by.
        """
        successful = self._successful_rows()
        if len(successful) == 0 or 'metric_accuracy' not in successful.columns:
            return ""

        has_time = 'fit_time' in successful.columns

        if HAS_POLARS:
            aggregations = [
                pl.col('metric_accuracy').mean().alias('mean_accuracy'),
                pl.col('metric_accuracy').count().alias('n_experiments'),
            ]
            if has_time:
                aggregations.append(pl.col('fit_time').mean().alias('mean_time'))
            rankings = successful.group_by('model_name').agg(aggregations).sort(
                'mean_accuracy', descending=True, nulls_last=True
            )
            rows_data = list(rankings.iter_rows(named=True))
        else:
            spec = {'metric_accuracy': ['mean', 'count']}
            flat_names = ['model_name', 'mean_accuracy', 'n_experiments']
            if has_time:
                spec['fit_time'] = 'mean'
                flat_names.append('mean_time')
            rankings = successful.groupby('model_name').agg(spec).reset_index()
            # Flatten the two-level column index produced by the dict spec.
            rankings.columns = flat_names
            rankings = rankings.sort_values('mean_accuracy', ascending=False)
            rows_data = rankings.to_dict('records')

        medals = ("🥇", "🥈", "🥉")
        rows = []
        for i, row in enumerate(rows_data):
            rank_class = f"rank-{i+1}" if i < 3 else ""
            badge = medals[i] if i < 3 else ""

            rows.append(f"""
                <tr>
                    <td class="{rank_class}">{badge} {i+1}</td>
                    <td><strong>{_escape_html(row['model_name'])}</strong></td>
                    <td>{_format_number(row['mean_accuracy'])}</td>
                    <td>{row['n_experiments']}</td>
                    <td>{self._format_seconds(row.get('mean_time'))}</td>
                </tr>
            """)

        return f"""
        <section class="card">
            <h2>🏅 Model Rankings</h2>
            <table>
                <thead>
                    <tr>
                        <th data-sort>Rank</th>
                        <th data-sort>Model</th>
                        <th data-sort>Mean Accuracy</th>
                        <th data-sort>Experiments</th>
                        <th data-sort>Mean Fit Time</th>
                    </tr>
                </thead>
                <tbody>
                    {''.join(rows)}
                </tbody>
            </table>
        </section>
        """

    def _generate_interpretability_section(self) -> str:
        """Generate interpretability outputs section."""
        if not self._interpretability_outputs:
            return ""

        outputs_html = []
        for data in self._interpretability_outputs.values():
            output = data['output']
            output_type = data['output_type']

            if output_type == "latex":
                # Wrap in math delimiters for potential MathJax rendering
                formatted_output = f"$${_escape_html(output)}$$"
            elif output_type == "html":
                # Caller-supplied markup is trusted and inserted verbatim.
                formatted_output = output
            else:
                formatted_output = _escape_html(output)

            outputs_html.append(f"""
                <div class="collapsible">
                    <span>
                        <span class="model-name">{_escape_html(data['model_name'])}</span>
                        <span class="dataset-name"> on {_escape_html(data['dataset_name'])}</span>
                    </span>
                    <span class="arrow">▶</span>
                </div>
                <div class="collapsible-content">
                    <div class="interpretability-output">{formatted_output}</div>
                </div>
            """)

        return f"""
        <section class="card">
            <h2>🔍 Model Interpretability</h2>
            <p style="color: var(--text-muted); margin-bottom: 1rem;">
                Learned rules, trees, equations, and other interpretable model outputs.
            </p>
            {''.join(outputs_html)}
        </section>
        """

    def _generate_meta_features_section(self) -> str:
        """Generate dataset meta-features section.

        One table row per dataset (the first successful record carrying
        meta-features wins), limited to the first 10 meta-feature columns in
        alphabetical order.
        """
        unique_by_dataset: dict = {}
        for record in self.tracker.get_successful():
            if not record.meta_features:
                continue
            entry = {'dataset': record.dataset_name, **record.meta_features}
            # setdefault keeps the first entry seen for each dataset.
            unique_by_dataset.setdefault(entry['dataset'], entry)

        if not unique_by_dataset:
            return ""

        unique_data = list(unique_by_dataset.values())

        all_cols = set()
        for entry in unique_data:
            all_cols.update(entry.keys())
        all_cols.discard('dataset')

        cols = ['dataset'] + sorted(all_cols)[:10]  # Limit to 10 columns

        header = ''.join(f'<th data-sort>{_escape_html(c)}</th>' for c in cols)

        rows = []
        for entry in unique_data:
            cells = []
            for c in cols:
                val = entry.get(c, 'N/A')
                if isinstance(val, float):
                    val = _format_number(val, 2)
                cells.append(f'<td>{_escape_html(str(val))}</td>')
            rows.append(f'<tr>{"".join(cells)}</tr>')

        return f"""
        <section class="card">
            <h2>📐 Dataset Meta-Features</h2>
            <div style="overflow-x: auto;">
                <table>
                    <thead><tr>{header}</tr></thead>
                    <tbody>{''.join(rows)}</tbody>
                </table>
            </div>
        </section>
        """

    def _generate_failures_section(self) -> str:
        """Generate failed experiments section.

        Lists at most 50 failures, with each error message truncated to 200
        characters; the heading reports the total number of failures.
        """
        df = self._df
        wanted = ['model_name', 'dataset_name', 'error_message']

        if HAS_POLARS:
            failed = df.filter(pl.col('status') == 'failed')
            if len(failed) == 0:
                return ""
            rows_data = list(failed.select(wanted).iter_rows(named=True))
        else:
            failed = df[df['status'] == 'failed']
            if len(failed) == 0:
                return ""
            rows_data = failed[wanted].to_dict('records')

        rows = []
        for row in rows_data[:50]:  # Limit to 50 failures
            message = row.get('error_message')
            # A null (polars) or NaN (pandas) message would otherwise be
            # rendered literally as "None" / "nan".
            is_missing = message is None or (isinstance(message, float) and np.isnan(message))
            message_text = 'Unknown error' if is_missing else str(message)

            rows.append(f"""
                <tr>
                    <td>{_escape_html(row['model_name'])}</td>
                    <td>{_escape_html(row['dataset_name'])}</td>
                    <td style="color: var(--danger-color); font-size: 0.8rem;">
                        {_escape_html(message_text[:200])}
                    </td>
                </tr>
            """)

        return f"""
        <section class="card">
            <h2>❌ Failed Experiments ({len(rows_data)} total)</h2>
            <table>
                <thead>
                    <tr>
                        <th>Model</th>
                        <th>Dataset</th>
                        <th>Error</th>
                    </tr>
                </thead>
                <tbody>
                    {''.join(rows)}
                </tbody>
            </table>
        </section>
        """

    def _generate_footer(self) -> str:
        """Generate report footer."""
        return f"""
        <footer>
            <p>Generated by Endgame Benchmark Suite</p>
            <p>© {datetime.now().year} • Built with 🔬 for ML research</p>
        </footer>
        """
def _format_rule(index: int, rule) -> str:
    """Render one rule as ``Rule <index>: <text> (coef=<value>)``.

    ``rule`` may be a dict with optional ``'rule'`` and ``'coef'`` keys, or
    any other object (rendered via ``str``). A missing or non-numeric
    coefficient is shown as-is / ``N/A`` instead of raising.
    """
    if isinstance(rule, dict):
        text = rule.get('rule', rule)
        coef = rule.get('coef')
    else:
        text, coef = rule, None
    try:
        coef_text = f"{coef:.4f}"
    except (TypeError, ValueError):
        # None or a non-numeric coefficient cannot take a float format spec.
        coef_text = "N/A" if coef is None else str(coef)
    return f"Rule {index}: {text} (coef={coef_text})"


def extract_interpretability_outputs(
    models: list[tuple],
    X_sample: np.ndarray,
    y_sample: np.ndarray,
    dataset_name: str,
    feature_names: list[str] | None = None,
) -> dict[str, dict[str, str]]:
    """Extract interpretability outputs from fitted models.

    Each model is probed, in order, for ``get_rules``, ``get_best_equation``
    (optionally ``latex``), ``get_structure``, ``get_rules_str``,
    ``term_importances``, ``summary`` and finally ``feature_importances_``;
    the first attribute found decides how the output is produced.

    Parameters
    ----------
    models : List[Tuple]
        List of (name, fitted_model) tuples. Entries whose model is ``None``
        are skipped.
    X_sample : np.ndarray
        Sample data used for fitting. Not read by this function.
    y_sample : np.ndarray
        Sample targets. Not read by this function.
    dataset_name : str
        Name of the dataset.
    feature_names : List[str], optional
        Feature names for better output. When omitted, names of the form
        ``"Feature <i>"`` are generated separately for each model.

    Returns
    -------
    Dict[str, Dict[str, str]]
        Maps each model name that produced output to a dict with keys
        ``"output"``, ``"type"`` (``"text"`` or ``"latex"``) and
        ``"dataset"``. If extraction raises, ``"output"`` holds an error
        message instead.
    """
    outputs = {}

    for name, model in models:
        if model is None:
            continue

        output = None
        output_type = "text"

        try:
            # RuleFit
            if hasattr(model, 'get_rules'):
                rules = model.get_rules()
                if rules:
                    # Top 20 rules
                    output = "\n".join(
                        _format_rule(i, rule) for i, rule in enumerate(rules[:20], start=1)
                    )

            # Symbolic Regression
            elif hasattr(model, 'get_best_equation'):
                output = model.get_best_equation()
                if hasattr(model, 'latex'):
                    try:
                        output = model.latex()
                        output_type = "latex"
                    except Exception:
                        # Best effort: keep the plain-text equation.
                        pass

            # C5.0 / Decision Trees
            elif hasattr(model, 'get_structure'):
                output = model.get_structure()

            # FURIA
            elif hasattr(model, 'get_rules_str'):
                output = model.get_rules_str()

            # EBM
            elif hasattr(model, 'term_importances'):
                try:
                    importances = model.term_importances()
                    if hasattr(model, 'get_term_names'):
                        term_names = model.get_term_names()
                    else:
                        term_names = [f"Term {i}" for i in range(len(importances))]
                    sorted_terms = sorted(
                        zip(term_names, importances), key=lambda pair: abs(pair[1]), reverse=True
                    )
                    output = "Top Feature Contributions:\n" + "\n".join(
                        f" {term}: {imp:.4f}" for term, imp in sorted_terms[:15]
                    )
                except Exception:
                    # Best effort: a model without usable importances is skipped.
                    pass

            # MARS
            elif hasattr(model, 'summary') and 'MARS' in type(model).__name__:
                output = model.summary()

            # Generic summary method
            elif hasattr(model, 'summary'):
                try:
                    output = model.summary()
                except Exception:
                    # Best effort: unknown summary() signatures are skipped.
                    pass

            # Feature importances fallback
            elif hasattr(model, 'feature_importances_'):
                importances = model.feature_importances_
                # Use a per-model local so generated names never leak to the
                # next model, which may have a different number of features.
                if feature_names is not None:
                    names = feature_names
                else:
                    names = [f"Feature {i}" for i in range(len(importances))]
                sorted_features = sorted(
                    zip(names, importances), key=lambda pair: pair[1], reverse=True
                )
                output = "Feature Importances:\n" + "\n".join(
                    f" {feature}: {imp:.4f}" for feature, imp in sorted_features[:15]
                )

        except Exception as e:
            output = f"Error extracting interpretability: {str(e)}"

        if output:
            outputs[name] = {"output": output, "type": output_type, "dataset": dataset_name}

    return outputs