Source code for endgame.calibration.analysis

from __future__ import annotations

"""Calibration analysis and visualization tools.

Provides metrics and diagnostic plots for assessing probability calibration.

References
----------
- Naeini et al. "Obtaining Well Calibrated Probabilities Using Bayesian Binning" (2015)
- Guo et al. "On Calibration of Modern Neural Networks" (2017)
- Nixon et al. "Measuring Calibration in Deep Learning" (2019)
"""

from dataclasses import dataclass

import numpy as np


@dataclass
class CalibrationReport:
    """Record holding the outcome of a calibration analysis.

    Groups scalar metrics, the three Brier-decomposition terms, the per-bin
    arrays needed to draw a reliability diagram, and a few summary
    statistics.
    """

    # --- Core metrics ---
    ece: float  # Expected Calibration Error
    mce: float  # Maximum Calibration Error
    brier_score: float
    log_loss: float

    # --- Brier decomposition ---
    reliability: float  # calibration term
    resolution: float  # spread of per-bin outcomes around the base rate
    uncertainty: float  # base-rate term

    # --- Reliability diagram data (one entry per bin; edges has one more) ---
    bin_edges: np.ndarray
    bin_accuracies: np.ndarray
    bin_confidences: np.ndarray
    bin_counts: np.ndarray

    # --- Summary statistics ---
    mean_confidence: float
    accuracy: float
    overconfidence: float  # mean(confidence - accuracy)

    def __repr__(self) -> str:
        """Return a multi-line summary of the headline metrics (4 decimals)."""
        fragments = [
            "CalibrationReport(\n",
            f" ECE={self.ece:.4f},\n",
            f" MCE={self.mce:.4f},\n",
            f" Brier={self.brier_score:.4f},\n",
            f" Log Loss={self.log_loss:.4f},\n",
            f" Reliability={self.reliability:.4f},\n",
            f" Resolution={self.resolution:.4f},\n",
            f" Overconfidence={self.overconfidence:.4f}\n",
            ")",
        ]
        return "".join(fragments)
def expected_calibration_error(
    y_true: np.ndarray,
    y_proba: np.ndarray,
    n_bins: int = 10,
    strategy: str = "uniform",
) -> float:
    """Compute Expected Calibration Error (ECE).

    ECE is the sample-weighted mean, over probability bins, of the absolute
    gap between the observed positive rate and the mean predicted
    probability in each bin.

    ECE = Σ (|B_m| / n) × |acc(B_m) - conf(B_m)|

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_proba : array-like
        Predicted probabilities for positive class.
    n_bins : int, default=10
        Number of bins.
    strategy : str, default='uniform'
        Binning strategy: 'uniform' (equal-width bins on [0, 1]) or
        'quantile' (edges at percentiles of ``y_proba``).

    Returns
    -------
    float
        Expected Calibration Error (lower is better, 0 is perfectly
        calibrated). Returns 0.0 for empty input.

    Raises
    ------
    ValueError
        If ``strategy`` is neither 'uniform' nor 'quantile' (only checked
        for non-empty input).

    Examples
    --------
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_proba = np.array([0.2, 0.4, 0.6, 0.9])
    >>> ece = expected_calibration_error(y_true, y_proba)
    """
    y_true = np.asarray(y_true).ravel()
    y_proba = np.asarray(y_proba).ravel()

    n_samples = len(y_true)
    if n_samples == 0:
        return 0.0

    # Determine bin edges
    if strategy == "uniform":
        bin_edges = np.linspace(0, 1, n_bins + 1)
    elif strategy == "quantile":
        bin_edges = np.unique(
            np.percentile(y_proba, np.linspace(0, 100, n_bins + 1))
        )
        if bin_edges.size == 1:
            # All predictions are identical, so the percentiles collapse to a
            # single edge. Without this, there would be no bins at all and
            # the error would be reported as 0 regardless of the labels.
            bin_edges = np.repeat(bin_edges, 2)
    else:
        raise ValueError(f"Unknown strategy: {strategy}")

    last_bin = len(bin_edges) - 2
    ece = 0.0
    for i, (lo, hi) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
        # Bins are half-open [lo, hi) except the last, which is closed so
        # that the maximum edge value is included.
        below_hi = (y_proba <= hi) if i == last_bin else (y_proba < hi)
        mask = (y_proba >= lo) & below_hi

        n_bin = np.count_nonzero(mask)
        if n_bin == 0:
            continue

        # Accuracy (positive rate) and confidence (mean prediction) in bin
        accuracy = np.mean(y_true[mask])
        confidence = np.mean(y_proba[mask])

        # Weighted absolute difference
        ece += (n_bin / n_samples) * abs(accuracy - confidence)

    return float(ece)
def maximum_calibration_error(
    y_true: np.ndarray,
    y_proba: np.ndarray,
    n_bins: int = 10,
    strategy: str = "uniform",
) -> float:
    """Compute Maximum Calibration Error (MCE).

    MCE is the largest absolute gap between observed positive rate and mean
    predicted probability over all non-empty bins. Useful for identifying
    worst-case miscalibration.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_proba : array-like
        Predicted probabilities.
    n_bins : int, default=10
        Number of bins.
    strategy : str, default='uniform'
        Binning strategy: 'uniform' (equal-width bins on [0, 1]) or
        'quantile' (edges at percentiles of ``y_proba``).

    Returns
    -------
    float
        Maximum Calibration Error. Returns 0.0 for empty input.

    Raises
    ------
    ValueError
        If ``strategy`` is neither 'uniform' nor 'quantile' (only checked
        for non-empty input).
    """
    y_true = np.asarray(y_true).ravel()
    y_proba = np.asarray(y_proba).ravel()

    if len(y_true) == 0:
        return 0.0

    if strategy == "uniform":
        bin_edges = np.linspace(0, 1, n_bins + 1)
    elif strategy == "quantile":
        bin_edges = np.unique(
            np.percentile(y_proba, np.linspace(0, 100, n_bins + 1))
        )
        if bin_edges.size == 1:
            # Identical predictions collapse every percentile to one value;
            # keep a single closed bin so the gap is still measured.
            bin_edges = np.repeat(bin_edges, 2)
    else:
        # Reject unknown strategies explicitly instead of silently binning
        # by quantile.
        raise ValueError(f"Unknown strategy: {strategy}")

    last_bin = len(bin_edges) - 2
    mce = 0.0
    for i, (lo, hi) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
        # Half-open bins, except the last one which includes its upper edge.
        below_hi = (y_proba <= hi) if i == last_bin else (y_proba < hi)
        mask = (y_proba >= lo) & below_hi

        if not mask.any():
            continue

        accuracy = np.mean(y_true[mask])
        confidence = np.mean(y_proba[mask])
        mce = max(mce, abs(accuracy - confidence))

    return float(mce)
def brier_score_decomposition(
    y_true: np.ndarray,
    y_proba: np.ndarray,
    n_bins: int = 10,
) -> dict[str, float]:
    """Split the Brier score into reliability, resolution and uncertainty.

    Predictions are grouped into ``n_bins`` equal-width bins on [0, 1] and
    the three terms of the decomposition

        Brier Score = Reliability - Resolution + Uncertainty

    are accumulated per bin:

    - Reliability: weighted squared gap between mean prediction and observed
      positive rate in each bin (lower is better).
    - Resolution: weighted squared gap between each bin's observed positive
      rate and the overall base rate (higher is better).
    - Uncertainty: ``base_rate * (1 - base_rate)``, which depends only on
      the labels.

    The returned ``brier_score`` is computed directly from the raw
    predictions, not from the binned terms.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_proba : array-like
        Predicted probabilities.
    n_bins : int, default=10
        Number of bins for decomposition.

    Returns
    -------
    dict
        Dictionary with 'brier_score', 'reliability', 'resolution',
        'uncertainty'.

    Examples
    --------
    >>> decomp = brier_score_decomposition(y_true, y_proba)
    >>> print(f"Reliability: {decomp['reliability']:.4f}")
    >>> print(f"Resolution: {decomp['resolution']:.4f}")
    """
    y_true = np.asarray(y_true).ravel()
    y_proba = np.asarray(y_proba).ravel()

    total = len(y_true)
    prevalence = np.mean(y_true)

    # Terms that do not depend on the binning
    squared_error = np.mean((y_proba - y_true) ** 2)
    label_spread = prevalence * (1 - prevalence)

    edges = np.linspace(0, 1, n_bins + 1)
    final_bin = n_bins - 1

    rel_sum = 0.0
    res_sum = 0.0
    for idx, (lower, upper) in enumerate(zip(edges[:-1], edges[1:])):
        # Only the final bin includes its right edge.
        if idx == final_bin:
            members = (y_proba >= lower) & (y_proba <= upper)
        else:
            members = (y_proba >= lower) & (y_proba < upper)

        size = np.sum(members)
        if size == 0:
            continue

        mean_pred = np.mean(y_proba[members])
        mean_outcome = np.mean(y_true[members])
        weight = size / total

        rel_sum += weight * (mean_pred - mean_outcome) ** 2
        res_sum += weight * (mean_outcome - prevalence) ** 2

    return {
        "brier_score": squared_error,
        "reliability": rel_sum,
        "resolution": res_sum,
        "uncertainty": label_spread,
    }
def log_loss(y_true: np.ndarray, y_proba: np.ndarray, eps: float = 1e-15) -> float:
    """Return the mean binary cross-entropy of the predictions.

    Parameters
    ----------
    y_true : array-like
        True binary labels.
    y_proba : array-like
        Predicted probabilities.
    eps : float
        Probabilities are clipped to ``[eps, 1 - eps]`` so that neither
        logarithm is evaluated at 0.

    Returns
    -------
    float
        Log loss.
    """
    labels = np.asarray(y_true).ravel()
    clipped = np.clip(np.asarray(y_proba).ravel(), eps, 1 - eps)

    positive_term = labels * np.log(clipped)
    negative_term = (1 - labels) * np.log(1 - clipped)
    return -np.mean(positive_term + negative_term)
class CalibrationAnalyzer:
    """Analyze and visualize model calibration.

    Computes comprehensive calibration metrics and generates diagnostic plots.

    Parameters
    ----------
    n_bins : int, default=10
        Number of bins for binning-based metrics.
    strategy : str, default='uniform'
        Binning strategy: 'uniform' or 'quantile'.

    Examples
    --------
    >>> analyzer = CalibrationAnalyzer(n_bins=15)
    >>> report = analyzer.analyze(y_true, y_proba)
    >>> print(report)
    >>>
    >>> # Visualize
    >>> analyzer.plot_reliability_diagram(y_true, y_proba)
    >>> analyzer.plot_confidence_histogram(y_proba)
    """

    def __init__(
        self,
        n_bins: int = 10,
        strategy: str = "uniform",
    ):
        self.n_bins = n_bins
        self.strategy = strategy

    @staticmethod
    def _import_pyplot():
        """Import and return ``matplotlib.pyplot``.

        Raises
        ------
        ImportError
            If matplotlib is not installed; the original error is chained.
        """
        try:
            import matplotlib.pyplot as plt
        except ImportError as err:
            raise ImportError("matplotlib is required for plotting") from err
        return plt

    def analyze(
        self,
        y_true: np.ndarray,
        y_proba: np.ndarray,
    ) -> CalibrationReport:
        """Compute comprehensive calibration metrics.

        Parameters
        ----------
        y_true : array-like
            True binary labels.
        y_proba : array-like
            Predicted probabilities for positive class.

        Returns
        -------
        CalibrationReport
            Comprehensive calibration analysis results. ``accuracy`` is the
            mean of ``y_true`` and ``overconfidence`` is
            ``mean(y_proba) - mean(y_true)``.
        """
        y_true = np.asarray(y_true).ravel()
        y_proba = np.asarray(y_proba).ravel()

        # Core metrics
        ece = expected_calibration_error(y_true, y_proba, self.n_bins, self.strategy)
        mce = maximum_calibration_error(y_true, y_proba, self.n_bins, self.strategy)
        ll = log_loss(y_true, y_proba)

        # Brier decomposition (its 'brier_score' entry is the plain mean
        # squared error, so it is not recomputed here)
        decomp = brier_score_decomposition(y_true, y_proba, self.n_bins)

        # Reliability diagram data
        bin_data = self._compute_bin_data(y_true, y_proba)

        # Summary statistics
        mean_conf = np.mean(y_proba)
        accuracy = np.mean(y_true)

        # Overconfidence: positive means overconfident, negative means
        # underconfident
        overconf = mean_conf - accuracy

        return CalibrationReport(
            ece=ece,
            mce=mce,
            brier_score=decomp["brier_score"],
            log_loss=ll,
            reliability=decomp["reliability"],
            resolution=decomp["resolution"],
            uncertainty=decomp["uncertainty"],
            bin_edges=bin_data["edges"],
            bin_accuracies=bin_data["accuracies"],
            bin_confidences=bin_data["confidences"],
            bin_counts=bin_data["counts"],
            mean_confidence=mean_conf,
            accuracy=accuracy,
            overconfidence=overconf,
        )

    def _compute_bin_data(
        self,
        y_true: np.ndarray,
        y_proba: np.ndarray,
    ) -> dict[str, np.ndarray]:
        """Compute binned data for reliability diagram.

        Returns a dict with 'edges' (length ``k + 1``) and 'accuracies',
        'confidences', 'counts' (length ``k``), where ``k`` is the number of
        bins actually used. Empty bins get ``nan`` accuracy and the bin
        midpoint as confidence.

        Raises
        ------
        ValueError
            If ``self.strategy`` is neither 'uniform' nor 'quantile'.
        """
        if self.strategy == "uniform":
            bin_edges = np.linspace(0, 1, self.n_bins + 1)
        elif self.strategy == "quantile":
            bin_edges = np.unique(
                np.percentile(y_proba, np.linspace(0, 100, self.n_bins + 1))
            )
            if bin_edges.size == 1:
                # Identical predictions collapse every percentile to one
                # value; keep a single closed bin instead of returning no
                # bins at all.
                bin_edges = np.repeat(bin_edges, 2)
        else:
            raise ValueError(f"Unknown strategy: {self.strategy}")

        n_bins_actual = len(bin_edges) - 1
        accuracies = np.zeros(n_bins_actual)
        confidences = np.zeros(n_bins_actual)
        counts = np.zeros(n_bins_actual, dtype=int)

        for i in range(n_bins_actual):
            lo, hi = bin_edges[i], bin_edges[i + 1]
            # Half-open bins, except the last one which includes its upper
            # edge.
            if i == n_bins_actual - 1:
                mask = (y_proba >= lo) & (y_proba <= hi)
            else:
                mask = (y_proba >= lo) & (y_proba < hi)

            counts[i] = np.sum(mask)
            if counts[i] > 0:
                accuracies[i] = np.mean(y_true[mask])
                confidences[i] = np.mean(y_proba[mask])
            else:
                # Use bin midpoint for empty bins
                accuracies[i] = np.nan
                confidences[i] = (lo + hi) / 2

        return {
            "edges": bin_edges,
            "accuracies": accuracies,
            "confidences": confidences,
            "counts": counts,
        }

    def plot_reliability_diagram(
        self,
        y_true: np.ndarray,
        y_proba: np.ndarray,
        ax=None,
        show_histogram: bool = True,
        show_ece: bool = True,
        title: str = "Reliability Diagram",
    ):
        """Plot reliability (calibration) diagram.

        A well-calibrated model has points close to the diagonal.

        Parameters
        ----------
        y_true : array-like
            True binary labels.
        y_proba : array-like
            Predicted probabilities.
        ax : matplotlib axes, optional
            Axes to plot on. A new figure is created when omitted.
        show_histogram : bool, default=True
            Show histogram of predictions at bottom.
        show_ece : bool, default=True
            Show ECE value on plot.
        title : str, default='Reliability Diagram'
            Plot title.

        Returns
        -------
        matplotlib axes

        Raises
        ------
        ImportError
            If matplotlib is not installed.
        """
        plt = self._import_pyplot()

        y_true = np.asarray(y_true).ravel()
        y_proba = np.asarray(y_proba).ravel()

        # Get bin data
        bin_data = self._compute_bin_data(y_true, y_proba)
        edges = bin_data["edges"]

        if ax is None:
            _, ax = plt.subplots(figsize=(8, 6))

        # Plot perfect calibration line
        ax.plot([0, 1], [0, 1], 'k--', label='Perfect calibration', linewidth=2)

        # Plot bins. Each bar spans its own bin, so the widths come from the
        # edges themselves; quantile bins are generally not equally wide.
        bin_midpoints = (edges[:-1] + edges[1:]) / 2
        bin_widths = np.diff(edges)

        # Only plot non-empty bins
        valid_mask = bin_data["counts"] > 0

        ax.bar(
            bin_midpoints[valid_mask],
            bin_data["accuracies"][valid_mask],
            width=bin_widths[valid_mask],
            alpha=0.7,
            edgecolor='black',
            label='Actual accuracy',
        )

        # Gap visualization (miscalibration): red when the mean prediction
        # exceeds the observed rate, blue otherwise
        for conf, acc, count in zip(
            bin_data["confidences"], bin_data["accuracies"], bin_data["counts"]
        ):
            if count > 0 and not np.isnan(acc):
                color = 'red' if conf > acc else 'blue'
                ax.plot(
                    [conf, conf], [acc, conf],
                    color=color, alpha=0.5, linewidth=2
                )

        ax.set_xlabel('Mean Predicted Probability', fontsize=12)
        ax.set_ylabel('Fraction of Positives', fontsize=12)
        ax.set_xlim([0, 1])
        ax.set_ylim([0, 1])
        ax.set_title(title, fontsize=14)
        ax.legend(loc='upper left')

        if show_ece:
            ece = expected_calibration_error(y_true, y_proba, self.n_bins, self.strategy)
            ax.text(
                0.95, 0.05,
                f'ECE = {ece:.4f}',
                transform=ax.transAxes,
                fontsize=11,
                verticalalignment='bottom',
                horizontalalignment='right',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
            )

        if show_histogram:
            # Add histogram at bottom
            ax2 = ax.twinx()
            ax2.hist(
                y_proba, bins=self.n_bins, range=(0, 1),
                alpha=0.3, color='gray', edgecolor='gray'
            )
            ax2.set_ylabel('Count', fontsize=10, color='gray')
            ax2.tick_params(axis='y', labelcolor='gray')
            ax2.set_ylim(0, ax2.get_ylim()[1] * 3)  # Make histogram smaller

        plt.tight_layout()
        return ax

    def plot_confidence_histogram(
        self,
        y_proba: np.ndarray,
        y_true: np.ndarray | None = None,
        ax=None,
        title: str = "Confidence Distribution",
    ):
        """Plot histogram of prediction confidences.

        Parameters
        ----------
        y_proba : array-like
            Predicted probabilities.
        y_true : array-like, optional
            True labels for coloring by correctness. A prediction counts as
            correct when ``(y_proba >= 0.5)`` equals the label.
        ax : matplotlib axes, optional
            Axes to plot on. A new figure is created when omitted.
        title : str
            Plot title.

        Returns
        -------
        matplotlib axes

        Raises
        ------
        ImportError
            If matplotlib is not installed.
        """
        plt = self._import_pyplot()

        y_proba = np.asarray(y_proba).ravel()

        if ax is None:
            _, ax = plt.subplots(figsize=(8, 5))

        if y_true is not None:
            y_true = np.asarray(y_true).ravel()
            # Color by correctness
            predictions = (y_proba >= 0.5).astype(int)
            correct = predictions == y_true

            ax.hist(
                y_proba[correct], bins=self.n_bins, range=(0, 1),
                alpha=0.7, label='Correct', color='green', edgecolor='darkgreen'
            )
            ax.hist(
                y_proba[~correct], bins=self.n_bins, range=(0, 1),
                alpha=0.7, label='Incorrect', color='red', edgecolor='darkred'
            )
        else:
            ax.hist(
                y_proba, bins=self.n_bins, range=(0, 1),
                alpha=0.7, edgecolor='black'
            )

        ax.set_xlabel('Predicted Probability', fontsize=12)
        ax.set_ylabel('Count', fontsize=12)
        ax.set_title(title, fontsize=14)
        ax.set_xlim([0, 1])

        # Add statistics
        mean_conf = np.mean(y_proba)
        ax.axvline(mean_conf, color='orange', linestyle='--', linewidth=2,
                   label=f'Mean: {mean_conf:.3f}')

        # Build the legend only after every labelled artist exists, so the
        # mean line's label is actually displayed.
        ax.legend()

        plt.tight_layout()
        return ax

    def compare_calibrations(
        self,
        y_true: np.ndarray,
        probas_dict: dict[str, np.ndarray],
        ax=None,
        title: str = "Calibration Comparison",
    ):
        """Compare calibration of multiple models.

        Parameters
        ----------
        y_true : array-like
            True labels.
        probas_dict : dict
            Dictionary mapping model names to predicted probabilities.
        ax : matplotlib axes, optional
            Axes to plot on. A new figure is created when omitted.
        title : str
            Plot title.

        Returns
        -------
        matplotlib axes

        Raises
        ------
        ImportError
            If matplotlib is not installed.
        """
        plt = self._import_pyplot()

        y_true = np.asarray(y_true).ravel()

        if ax is None:
            _, ax = plt.subplots(figsize=(8, 6))

        # Perfect calibration line
        ax.plot([0, 1], [0, 1], 'k--', label='Perfect', linewidth=2)

        colors = plt.cm.Set1(np.linspace(0, 1, len(probas_dict)))

        for (name, y_proba), color in zip(probas_dict.items(), colors):
            y_proba = np.asarray(y_proba).ravel()
            bin_data = self._compute_bin_data(y_true, y_proba)
            # Use the analyzer's strategy so the ECE in the legend is
            # computed on the same bins as the plotted curve.
            ece = expected_calibration_error(
                y_true, y_proba, self.n_bins, self.strategy
            )

            valid_mask = bin_data["counts"] > 0
            ax.plot(
                bin_data["confidences"][valid_mask],
                bin_data["accuracies"][valid_mask],
                'o-',
                color=color,
                label=f'{name} (ECE={ece:.4f})',
                linewidth=2,
                markersize=6
            )

        ax.set_xlabel('Mean Predicted Probability', fontsize=12)
        ax.set_ylabel('Fraction of Positives', fontsize=12)
        ax.set_xlim([0, 1])
        ax.set_ylim([0, 1])
        ax.set_title(title, fontsize=14)
        ax.legend(loc='upper left')

        plt.tight_layout()
        return ax