"""HTML Report Generator for Benchmark Results.
Generates beautiful, interactive HTML reports with:
- Performance comparison charts (accuracy, F1, AUC, etc.)
- Training time comparisons
- Model interpretability outputs (rules, trees, equations)
- Interactive Plotly charts
- Sortable tables
"""
from __future__ import annotations
import html
import json
from datetime import datetime
from pathlib import Path
from typing import TYPE_CHECKING
import numpy as np
if TYPE_CHECKING:
from endgame.benchmark.tracker import ExperimentTracker
try:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
HAS_PLOTLY = True
except ImportError:
HAS_PLOTLY = False
try:
import polars as pl
HAS_POLARS = True
except ImportError:
HAS_POLARS = False
def _escape_html(text: str) -> str:
"""Escape HTML special characters."""
return html.escape(str(text))
def _format_number(value: float, precision: int = 4) -> str:
"""Format a number for display."""
if value is None or (isinstance(value, float) and np.isnan(value)):
return "N/A"
if abs(value) < 0.0001 and value != 0:
return f"{value:.2e}"
return f"{value:.{precision}f}"
def _get_color_scale(n_colors: int) -> list[str]:
"""Get a list of distinct colors for charts."""
colors = px.colors.qualitative.Set3 + px.colors.qualitative.Pastel1
return colors[:n_colors] if n_colors <= len(colors) else colors * (n_colors // len(colors) + 1)
[docs]
class BenchmarkReportGenerator:
"""Generate HTML reports from benchmark results.
Parameters
----------
tracker : ExperimentTracker
The experiment tracker with benchmark results.
title : str, optional
Report title.
Examples
--------
>>> from endgame.benchmark import BenchmarkRunner, BenchmarkReportGenerator
>>> runner = BenchmarkRunner(suite="sklearn-classic")
>>> tracker = runner.run(models)
>>> report = BenchmarkReportGenerator(tracker)
>>> report.generate("benchmark_report.html")
"""
def __init__(
self,
tracker: ExperimentTracker,
title: str = "Endgame Benchmark Report",
):
self.tracker = tracker
self.title = title
self._df = tracker.to_dataframe()
self._interpretability_outputs: dict[str, dict[str, str]] = {}
[docs]
def add_interpretability_output(
self,
model_name: str,
dataset_name: str,
output: str,
output_type: str = "text",
) -> None:
"""Add interpretability output for a model.
Parameters
----------
model_name : str
Name of the model.
dataset_name : str
Name of the dataset.
output : str
The interpretability output (rules, tree structure, equation, etc.)
output_type : str
Type of output: "text", "html", "latex", "code"
"""
key = f"{model_name}_{dataset_name}"
self._interpretability_outputs[key] = {
"model_name": model_name,
"dataset_name": dataset_name,
"output": output,
"output_type": output_type,
}
[docs]
def generate(
self,
output_path: str,
include_interpretability: bool = True,
include_meta_features: bool = False,
) -> str:
"""Generate the HTML report.
Parameters
----------
output_path : str
Path to save the HTML report.
include_interpretability : bool
Include interpretability outputs section.
include_meta_features : bool
Include dataset meta-features section.
Returns
-------
str
Path to the generated report.
"""
if not HAS_PLOTLY:
raise ImportError("Plotly is required for report generation. Install with: pip install plotly")
sections = []
# Header
sections.append(self._generate_header())
# Summary statistics
sections.append(self._generate_summary_section())
# Performance comparison charts
sections.append(self._generate_performance_section())
# Speed comparison
sections.append(self._generate_speed_section())
# Per-dataset results
sections.append(self._generate_dataset_section())
# Model rankings
sections.append(self._generate_rankings_section())
# Interpretability outputs
if include_interpretability and self._interpretability_outputs:
sections.append(self._generate_interpretability_section())
# Meta-features
if include_meta_features:
sections.append(self._generate_meta_features_section())
# Failed experiments
sections.append(self._generate_failures_section())
# Footer
sections.append(self._generate_footer())
# Combine all sections
html_content = self._wrap_html(sections)
# Write to file
Path(output_path).write_text(html_content, encoding="utf-8")
return output_path
def _wrap_html(self, sections: list[str]) -> str:
"""Wrap sections in HTML document structure."""
return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{_escape_html(self.title)}</title>
<script src="https://cdn.plot.ly/plotly-2.27.0.min.js"></script>
<style>
:root {{
--primary-color: #6366f1;
--secondary-color: #8b5cf6;
--success-color: #22c55e;
--warning-color: #f59e0b;
--danger-color: #ef4444;
--bg-color: #0f172a;
--card-bg: #1e293b;
--text-color: #e2e8f0;
--text-muted: #94a3b8;
--border-color: #334155;
}}
* {{
margin: 0;
padding: 0;
box-sizing: border-box;
}}
body {{
font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: var(--bg-color);
color: var(--text-color);
line-height: 1.6;
}}
.container {{
max-width: 1400px;
margin: 0 auto;
padding: 2rem;
}}
header {{
background: linear-gradient(135deg, var(--primary-color), var(--secondary-color));
padding: 3rem 2rem;
text-align: center;
margin-bottom: 2rem;
border-radius: 0 0 1rem 1rem;
}}
header h1 {{
font-size: 2.5rem;
font-weight: 700;
margin-bottom: 0.5rem;
}}
header .subtitle {{
color: rgba(255, 255, 255, 0.8);
font-size: 1.1rem;
}}
.card {{
background: var(--card-bg);
border-radius: 1rem;
padding: 1.5rem;
margin-bottom: 1.5rem;
border: 1px solid var(--border-color);
box-shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.3);
}}
.card h2 {{
font-size: 1.5rem;
margin-bottom: 1rem;
color: var(--primary-color);
display: flex;
align-items: center;
gap: 0.5rem;
}}
.card h3 {{
font-size: 1.2rem;
margin: 1.5rem 0 1rem 0;
color: var(--text-color);
}}
.stats-grid {{
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 1rem;
margin-bottom: 1.5rem;
}}
.stat-card {{
background: rgba(99, 102, 241, 0.1);
border-radius: 0.75rem;
padding: 1.25rem;
text-align: center;
border: 1px solid rgba(99, 102, 241, 0.2);
}}
.stat-card .value {{
font-size: 2rem;
font-weight: 700;
color: var(--primary-color);
}}
.stat-card .label {{
color: var(--text-muted);
font-size: 0.875rem;
margin-top: 0.25rem;
}}
.chart-container {{
width: 100%;
min-height: 400px;
margin: 1rem 0;
}}
table {{
width: 100%;
border-collapse: collapse;
margin: 1rem 0;
font-size: 0.9rem;
}}
th, td {{
padding: 0.75rem 1rem;
text-align: left;
border-bottom: 1px solid var(--border-color);
}}
th {{
background: rgba(99, 102, 241, 0.1);
font-weight: 600;
color: var(--primary-color);
cursor: pointer;
user-select: none;
}}
th:hover {{
background: rgba(99, 102, 241, 0.2);
}}
tr:hover {{
background: rgba(255, 255, 255, 0.02);
}}
.badge {{
display: inline-block;
padding: 0.25rem 0.75rem;
border-radius: 9999px;
font-size: 0.75rem;
font-weight: 600;
}}
.badge-success {{
background: rgba(34, 197, 94, 0.2);
color: var(--success-color);
}}
.badge-warning {{
background: rgba(245, 158, 11, 0.2);
color: var(--warning-color);
}}
.badge-danger {{
background: rgba(239, 68, 68, 0.2);
color: var(--danger-color);
}}
.interpretability-output {{
background: #0d1117;
border-radius: 0.5rem;
padding: 1rem;
margin: 1rem 0;
overflow-x: auto;
font-family: 'Fira Code', 'Monaco', monospace;
font-size: 0.85rem;
line-height: 1.5;
white-space: pre-wrap;
border: 1px solid var(--border-color);
}}
.model-header {{
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 0.5rem;
}}
.model-name {{
font-weight: 600;
color: var(--secondary-color);
}}
.dataset-name {{
color: var(--text-muted);
font-size: 0.875rem;
}}
.tabs {{
display: flex;
gap: 0.5rem;
margin-bottom: 1rem;
border-bottom: 1px solid var(--border-color);
padding-bottom: 0.5rem;
}}
.tab {{
padding: 0.5rem 1rem;
border-radius: 0.5rem 0.5rem 0 0;
cursor: pointer;
transition: all 0.2s;
color: var(--text-muted);
}}
.tab:hover {{
background: rgba(99, 102, 241, 0.1);
}}
.tab.active {{
background: var(--primary-color);
color: white;
}}
.tab-content {{
display: none;
}}
.tab-content.active {{
display: block;
}}
.rank-1 {{ color: #ffd700; font-weight: 700; }}
.rank-2 {{ color: #c0c0c0; font-weight: 600; }}
.rank-3 {{ color: #cd7f32; font-weight: 600; }}
footer {{
text-align: center;
padding: 2rem;
color: var(--text-muted);
font-size: 0.875rem;
}}
.collapsible {{
cursor: pointer;
padding: 0.75rem;
background: rgba(99, 102, 241, 0.1);
border-radius: 0.5rem;
margin: 0.5rem 0;
display: flex;
justify-content: space-between;
align-items: center;
}}
.collapsible:hover {{
background: rgba(99, 102, 241, 0.2);
}}
.collapsible-content {{
display: none;
padding: 1rem;
animation: fadeIn 0.3s ease;
}}
.collapsible-content.show {{
display: block;
}}
@keyframes fadeIn {{
from {{ opacity: 0; transform: translateY(-10px); }}
to {{ opacity: 1; transform: translateY(0); }}
}}
.progress-bar {{
width: 100%;
height: 8px;
background: var(--border-color);
border-radius: 4px;
overflow: hidden;
}}
.progress-bar-fill {{
height: 100%;
background: linear-gradient(90deg, var(--primary-color), var(--secondary-color));
transition: width 0.3s ease;
}}
@media (max-width: 768px) {{
.container {{
padding: 1rem;
}}
header h1 {{
font-size: 1.75rem;
}}
.stats-grid {{
grid-template-columns: repeat(2, 1fr);
}}
}}
</style>
</head>
<body>
<div class="container">
{''.join(sections)}
</div>
<script>
// Table sorting
document.querySelectorAll('th[data-sort]').forEach(th => {{
th.addEventListener('click', () => {{
const table = th.closest('table');
const tbody = table.querySelector('tbody');
const rows = Array.from(tbody.querySelectorAll('tr'));
const idx = Array.from(th.parentNode.children).indexOf(th);
const asc = th.dataset.order !== 'asc';
rows.sort((a, b) => {{
const aVal = a.children[idx].textContent;
const bVal = b.children[idx].textContent;
const aNum = parseFloat(aVal);
const bNum = parseFloat(bVal);
if (!isNaN(aNum) && !isNaN(bNum)) {{
return asc ? aNum - bNum : bNum - aNum;
}}
return asc ? aVal.localeCompare(bVal) : bVal.localeCompare(aVal);
}});
th.dataset.order = asc ? 'asc' : 'desc';
rows.forEach(row => tbody.appendChild(row));
}});
}});
// Collapsible sections
document.querySelectorAll('.collapsible').forEach(el => {{
el.addEventListener('click', () => {{
const content = el.nextElementSibling;
content.classList.toggle('show');
el.querySelector('.arrow').textContent = content.classList.contains('show') ? '▼' : '▶';
}});
}});
// Tabs
document.querySelectorAll('.tab').forEach(tab => {{
tab.addEventListener('click', () => {{
const tabGroup = tab.closest('.tabs-container');
tabGroup.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
tabGroup.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
tab.classList.add('active');
tabGroup.querySelector(tab.dataset.target).classList.add('active');
}});
}});
</script>
</body>
</html>"""
def _generate_header(self) -> str:
"""Generate the report header."""
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
n_experiments = len(self.tracker)
n_successful = len(self.tracker.get_successful())
return f"""
<header>
<h1>🏆 {_escape_html(self.title)}</h1>
<p class="subtitle">Generated on {timestamp} • {n_experiments} experiments • {n_successful} successful</p>
</header>
"""
def _generate_summary_section(self) -> str:
"""Generate summary statistics section."""
df = self._df
if HAS_POLARS:
successful = df.filter(pl.col('status') == 'success')
n_total = len(df)
n_success = len(successful)
n_failed = n_total - n_success
n_datasets = df['dataset_name'].n_unique()
n_models = df['model_name'].n_unique()
# Best accuracy
if 'metric_accuracy' in successful.columns and len(successful) > 0:
best_acc_row = successful.sort('metric_accuracy', descending=True).head(1)
best_acc = best_acc_row['metric_accuracy'][0]
best_acc_model = best_acc_row['model_name'][0]
else:
best_acc = 0
best_acc_model = "N/A"
# Average fit time
if 'fit_time' in successful.columns:
avg_fit_time = successful['fit_time'].mean()
else:
avg_fit_time = 0
else:
successful = df[df['status'] == 'success']
n_total = len(df)
n_success = len(successful)
n_failed = n_total - n_success
n_datasets = df['dataset_name'].nunique()
n_models = df['model_name'].nunique()
if 'metric_accuracy' in successful.columns and len(successful) > 0:
best_acc = successful['metric_accuracy'].max()
best_acc_model = successful.loc[successful['metric_accuracy'].idxmax(), 'model_name']
else:
best_acc = 0
best_acc_model = "N/A"
avg_fit_time = successful['fit_time'].mean() if 'fit_time' in successful.columns else 0
success_rate = (n_success / n_total * 100) if n_total > 0 else 0
return f"""
<section class="card">
<h2>📊 Summary</h2>
<div class="stats-grid">
<div class="stat-card">
<div class="value">{n_total}</div>
<div class="label">Total Experiments</div>
</div>
<div class="stat-card">
<div class="value">{n_success}</div>
<div class="label">Successful</div>
</div>
<div class="stat-card">
<div class="value">{n_datasets}</div>
<div class="label">Datasets</div>
</div>
<div class="stat-card">
<div class="value">{n_models}</div>
<div class="label">Models</div>
</div>
<div class="stat-card">
<div class="value">{_format_number(best_acc)}</div>
<div class="label">Best Accuracy ({_escape_html(best_acc_model)})</div>
</div>
<div class="stat-card">
<div class="value">{_format_number(avg_fit_time, 2)}s</div>
<div class="label">Avg Fit Time</div>
</div>
</div>
<div class="progress-bar">
<div class="progress-bar-fill" style="width: {success_rate}%"></div>
</div>
<p style="text-align: center; color: var(--text-muted); margin-top: 0.5rem;">
{_format_number(success_rate, 1)}% success rate
</p>
</section>
"""
def _generate_performance_section(self) -> str:
"""Generate performance comparison charts."""
df = self._df
if HAS_POLARS:
successful = df.filter(pl.col('status') == 'success')
else:
successful = df[df['status'] == 'success']
if len(successful) == 0:
return """<section class="card"><h2>📈 Performance</h2><p>No successful experiments to display.</p></section>"""
charts_html = []
# Accuracy comparison chart
accuracy_chart = self._create_metric_comparison_chart(successful, 'metric_accuracy', 'Accuracy')
if accuracy_chart:
charts_html.append('<div class="chart-container" id="accuracy-chart"></div>')
charts_html.append(f'<script>{accuracy_chart}</script>')
# F1 score chart (if available)
f1_col = 'metric_f1' if 'metric_f1' in (successful.columns if HAS_POLARS else successful.columns) else 'metric_f1_weighted'
if f1_col in (successful.columns if HAS_POLARS else successful.columns):
f1_chart = self._create_metric_comparison_chart(successful, f1_col, 'F1 Score')
if f1_chart:
charts_html.append('<div class="chart-container" id="f1-chart"></div>')
charts_html.append(f'<script>{f1_chart}</script>')
# ROC-AUC chart (if available)
auc_col = 'metric_roc_auc' if 'metric_roc_auc' in (successful.columns if HAS_POLARS else successful.columns) else 'metric_roc_auc_ovr_weighted'
if auc_col in (successful.columns if HAS_POLARS else successful.columns):
auc_chart = self._create_metric_comparison_chart(successful, auc_col, 'ROC-AUC')
if auc_chart:
charts_html.append('<div class="chart-container" id="auc-chart"></div>')
charts_html.append(f'<script>{auc_chart}</script>')
return f"""
<section class="card">
<h2>📈 Performance Comparison</h2>
{''.join(charts_html)}
</section>
"""
def _create_metric_comparison_chart(self, df, metric_col: str, metric_name: str) -> str | None:
"""Create a bar chart comparing model performance on a metric."""
if metric_col not in (df.columns if HAS_POLARS else df.columns):
return None
# Aggregate by model
if HAS_POLARS:
agg = df.group_by('model_name').agg([
pl.col(metric_col).mean().alias('mean'),
pl.col(metric_col).std().alias('std'),
pl.col(metric_col).count().alias('count'),
]).sort('mean', descending=True)
models = agg['model_name'].to_list()
means = agg['mean'].to_list()
stds = [s if s is not None else 0 for s in agg['std'].to_list()]
else:
agg = df.groupby('model_name')[metric_col].agg(['mean', 'std', 'count']).reset_index()
agg = agg.sort_values('mean', ascending=False)
models = agg['model_name'].tolist()
means = agg['mean'].tolist()
stds = agg['std'].fillna(0).tolist()
# Create Plotly chart
colors = _get_color_scale(len(models))
chart_id = metric_col.replace('metric_', '') + '-chart'
fig_json = {
'data': [{
'type': 'bar',
'x': models,
'y': means,
'error_y': {
'type': 'data',
'array': stds,
'visible': True,
},
'marker': {'color': colors[:len(models)]},
'hovertemplate': '<b>%{x}</b><br>' + metric_name + ': %{y:.4f}<extra></extra>',
}],
'layout': {
'title': {'text': f'{metric_name} by Model (Mean ± Std)', 'font': {'color': '#e2e8f0'}},
'xaxis': {'title': 'Model', 'tickangle': -45, 'color': '#94a3b8'},
'yaxis': {'title': metric_name, 'color': '#94a3b8'},
'paper_bgcolor': 'rgba(0,0,0,0)',
'plot_bgcolor': 'rgba(0,0,0,0)',
'font': {'color': '#e2e8f0'},
'margin': {'b': 150},
}
}
return f"Plotly.newPlot('{chart_id}', {json.dumps(fig_json['data'])}, {json.dumps(fig_json['layout'])});"
def _generate_speed_section(self) -> str:
"""Generate training speed comparison section."""
df = self._df
if HAS_POLARS:
successful = df.filter(pl.col('status') == 'success')
if 'fit_time' not in successful.columns or len(successful) == 0:
return ""
agg = successful.group_by('model_name').agg([
pl.col('fit_time').mean().alias('mean_time'),
pl.col('fit_time').min().alias('min_time'),
pl.col('fit_time').max().alias('max_time'),
]).sort('mean_time')
models = agg['model_name'].to_list()
mean_times = agg['mean_time'].to_list()
min_times = agg['min_time'].to_list()
max_times = agg['max_time'].to_list()
else:
successful = df[df['status'] == 'success']
if 'fit_time' not in successful.columns or len(successful) == 0:
return ""
agg = successful.groupby('model_name')['fit_time'].agg(['mean', 'min', 'max']).reset_index()
agg = agg.sort_values('mean')
models = agg['model_name'].tolist()
mean_times = agg['mean'].tolist()
min_times = agg['min'].tolist()
max_times = agg['max'].tolist()
# Create speed chart
colors = ['#22c55e' if t < 10 else '#f59e0b' if t < 60 else '#ef4444' for t in mean_times]
fig_json = {
'data': [{
'type': 'bar',
'x': mean_times,
'y': models,
'orientation': 'h',
'marker': {'color': colors},
'hovertemplate': '<b>%{y}</b><br>Avg: %{x:.2f}s<extra></extra>',
}],
'layout': {
'title': {'text': 'Training Time by Model (seconds)', 'font': {'color': '#e2e8f0'}},
'xaxis': {'title': 'Time (seconds)', 'color': '#94a3b8', 'type': 'log'},
'yaxis': {'title': '', 'color': '#94a3b8', 'autorange': 'reversed'},
'paper_bgcolor': 'rgba(0,0,0,0)',
'plot_bgcolor': 'rgba(0,0,0,0)',
'font': {'color': '#e2e8f0'},
'margin': {'l': 200},
'height': max(400, len(models) * 25),
}
}
return f"""
<section class="card">
<h2>⚡ Training Speed</h2>
<div class="chart-container" id="speed-chart" style="min-height: {max(400, len(models) * 25)}px"></div>
<script>Plotly.newPlot('speed-chart', {json.dumps(fig_json['data'])}, {json.dumps(fig_json['layout'])});</script>
<p style="color: var(--text-muted); text-align: center; margin-top: 1rem;">
<span style="color: #22c55e;">●</span> Fast (<10s)
<span style="color: #f59e0b;">●</span> Medium (<60s)
<span style="color: #ef4444;">●</span> Slow (>60s)
</p>
</section>
"""
def _generate_dataset_section(self) -> str:
"""Generate per-dataset results section."""
df = self._df
if HAS_POLARS:
successful = df.filter(pl.col('status') == 'success')
datasets = successful['dataset_name'].unique().to_list()
else:
successful = df[df['status'] == 'success']
datasets = successful['dataset_name'].unique().tolist()
if not datasets:
return ""
dataset_sections = []
for dataset in sorted(datasets):
if HAS_POLARS:
dataset_df = successful.filter(pl.col('dataset_name') == dataset)
dataset_df = dataset_df.sort('metric_accuracy', descending=True, nulls_last=True)
else:
dataset_df = successful[successful['dataset_name'] == dataset]
dataset_df = dataset_df.sort_values('metric_accuracy', ascending=False)
# Create table rows
rows = []
for i, row in enumerate(dataset_df.iter_rows(named=True) if HAS_POLARS else dataset_df.itertuples()):
if HAS_POLARS:
model_name = row['model_name']
accuracy = row.get('metric_accuracy', None)
f1 = row.get('metric_f1', row.get('metric_f1_weighted', None))
fit_time = row.get('fit_time', None)
else:
model_name = row.model_name
accuracy = getattr(row, 'metric_accuracy', None)
f1 = getattr(row, 'metric_f1', getattr(row, 'metric_f1_weighted', None))
fit_time = getattr(row, 'fit_time', None)
rank_class = f"rank-{i+1}" if i < 3 else ""
rows.append(f"""
<tr>
<td class="{rank_class}">{i+1}</td>
<td>{_escape_html(model_name)}</td>
<td>{_format_number(accuracy)}</td>
<td>{_format_number(f1)}</td>
<td>{_format_number(fit_time, 2)}s</td>
</tr>
""")
dataset_sections.append(f"""
<div class="collapsible">
<span><strong>{_escape_html(dataset)}</strong></span>
<span class="arrow">▶</span>
</div>
<div class="collapsible-content">
<table>
<thead>
<tr>
<th data-sort>Rank</th>
<th data-sort>Model</th>
<th data-sort>Accuracy</th>
<th data-sort>F1</th>
<th data-sort>Fit Time</th>
</tr>
</thead>
<tbody>
{''.join(rows)}
</tbody>
</table>
</div>
""")
return f"""
<section class="card">
<h2>📁 Results by Dataset</h2>
{''.join(dataset_sections)}
</section>
"""
def _generate_rankings_section(self) -> str:
"""Generate model rankings section."""
df = self._df
if HAS_POLARS:
successful = df.filter(pl.col('status') == 'success')
if len(successful) == 0:
return ""
# Compute mean rank per model
rankings = successful.group_by('model_name').agg([
pl.col('metric_accuracy').mean().alias('mean_accuracy'),
pl.col('metric_accuracy').count().alias('n_experiments'),
pl.col('fit_time').mean().alias('mean_time'),
]).sort('mean_accuracy', descending=True)
rows_data = list(rankings.iter_rows(named=True))
else:
successful = df[df['status'] == 'success']
if len(successful) == 0:
return ""
rankings = successful.groupby('model_name').agg({
'metric_accuracy': ['mean', 'count'],
'fit_time': 'mean',
}).reset_index()
rankings.columns = ['model_name', 'mean_accuracy', 'n_experiments', 'mean_time']
rankings = rankings.sort_values('mean_accuracy', ascending=False)
rows_data = rankings.to_dict('records')
# Build table
rows = []
for i, row in enumerate(rows_data):
model_name = row['model_name']
mean_acc = row['mean_accuracy']
n_exp = row['n_experiments']
mean_time = row['mean_time']
rank_class = f"rank-{i+1}" if i < 3 else ""
badge = "🥇" if i == 0 else "🥈" if i == 1 else "🥉" if i == 2 else ""
rows.append(f"""
<tr>
<td class="{rank_class}">{badge} {i+1}</td>
<td><strong>{_escape_html(model_name)}</strong></td>
<td>{_format_number(mean_acc)}</td>
<td>{n_exp}</td>
<td>{_format_number(mean_time, 2)}s</td>
</tr>
""")
return f"""
<section class="card">
<h2>🏅 Model Rankings</h2>
<table>
<thead>
<tr>
<th data-sort>Rank</th>
<th data-sort>Model</th>
<th data-sort>Mean Accuracy</th>
<th data-sort>Experiments</th>
<th data-sort>Mean Fit Time</th>
</tr>
</thead>
<tbody>
{''.join(rows)}
</tbody>
</table>
</section>
"""
def _generate_interpretability_section(self) -> str:
"""Generate interpretability outputs section."""
if not self._interpretability_outputs:
return ""
outputs_html = []
for key, data in self._interpretability_outputs.items():
model_name = data['model_name']
dataset_name = data['dataset_name']
output = data['output']
output_type = data['output_type']
if output_type == "latex":
# Wrap in math delimiters for potential MathJax rendering
formatted_output = f"$${_escape_html(output)}$$"
elif output_type == "html":
formatted_output = output # Already HTML
else:
formatted_output = _escape_html(output)
outputs_html.append(f"""
<div class="collapsible">
<span>
<span class="model-name">{_escape_html(model_name)}</span>
<span class="dataset-name"> on {_escape_html(dataset_name)}</span>
</span>
<span class="arrow">▶</span>
</div>
<div class="collapsible-content">
<div class="interpretability-output">{formatted_output}</div>
</div>
""")
return f"""
<section class="card">
<h2>🔍 Model Interpretability</h2>
<p style="color: var(--text-muted); margin-bottom: 1rem;">
Learned rules, trees, equations, and other interpretable model outputs.
</p>
{''.join(outputs_html)}
</section>
"""
def _generate_meta_features_section(self) -> str:
"""Generate dataset meta-features section."""
# Get meta-features from tracker records
meta_features_data = []
for record in self.tracker.get_successful():
if record.meta_features:
meta_features_data.append({
'dataset': record.dataset_name,
**record.meta_features,
})
if not meta_features_data:
return ""
# Deduplicate by dataset
seen = set()
unique_data = []
for mf in meta_features_data:
if mf['dataset'] not in seen:
seen.add(mf['dataset'])
unique_data.append(mf)
# Build table
if not unique_data:
return ""
# Get common columns
all_cols = set()
for d in unique_data:
all_cols.update(d.keys())
all_cols.discard('dataset')
cols = ['dataset'] + sorted(all_cols)[:10] # Limit to 10 columns
header = ''.join(f'<th data-sort>{_escape_html(c)}</th>' for c in cols)
rows = []
for d in unique_data:
cells = []
for c in cols:
val = d.get(c, 'N/A')
if isinstance(val, float):
val = _format_number(val, 2)
cells.append(f'<td>{_escape_html(str(val))}</td>')
rows.append(f'<tr>{"".join(cells)}</tr>')
return f"""
<section class="card">
<h2>📐 Dataset Meta-Features</h2>
<div style="overflow-x: auto;">
<table>
<thead><tr>{header}</tr></thead>
<tbody>{''.join(rows)}</tbody>
</table>
</div>
</section>
"""
def _generate_failures_section(self) -> str:
"""Generate failed experiments section."""
df = self._df
if HAS_POLARS:
failed = df.filter(pl.col('status') == 'failed')
if len(failed) == 0:
return ""
rows_data = list(failed.select(['model_name', 'dataset_name', 'error_message']).iter_rows(named=True))
else:
failed = df[df['status'] == 'failed']
if len(failed) == 0:
return ""
rows_data = failed[['model_name', 'dataset_name', 'error_message']].to_dict('records')
rows = []
for row in rows_data[:50]: # Limit to 50 failures
rows.append(f"""
<tr>
<td>{_escape_html(row['model_name'])}</td>
<td>{_escape_html(row['dataset_name'])}</td>
<td style="color: var(--danger-color); font-size: 0.8rem;">
{_escape_html(str(row.get('error_message', 'Unknown error'))[:200])}
</td>
</tr>
""")
return f"""
<section class="card">
<h2>❌ Failed Experiments ({len(rows_data)} total)</h2>
<table>
<thead>
<tr>
<th>Model</th>
<th>Dataset</th>
<th>Error</th>
</tr>
</thead>
<tbody>
{''.join(rows)}
</tbody>
</table>
</section>
"""
def _generate_footer(self) -> str:
"""Generate report footer."""
return f"""
<footer>
<p>Generated by Endgame Benchmark Suite</p>
<p>© {datetime.now().year} • Built with 🔬 for ML research</p>
</footer>
"""