# Source code for endgame.utils.submission

from __future__ import annotations

"""Submission file generation and validation."""

import csv
from typing import Any

import numpy as np

try:
    import pandas as pd
    HAS_PANDAS = True
except ImportError:
    HAS_PANDAS = False

try:
    import polars as pl
    HAS_POLARS = True
except ImportError:
    HAS_POLARS = False


class SubmissionHelper:
    """Helper for generating properly formatted submission files.

    Handles common submission formats for Kaggle competitions. CSV files are
    read and written with pandas when it is importable, otherwise with polars,
    otherwise with the standard-library ``csv`` module.

    Parameters
    ----------
    id_col : str, default='id'
        Name of the ID column.
    target_col : str or list of str, default='target'
        Name(s) of the target column(s). A list selects multi-target mode.
    float_precision : int, default=6
        Decimal places for float values.

    Examples
    --------
    >>> helper = SubmissionHelper(id_col='Id', target_col='Prediction')
    >>> helper.to_csv(predictions, ids, 'submission.csv')  # doctest: +SKIP
    >>> helper.validate('submission.csv', 'sample_submission.csv')  # doctest: +SKIP
    """

    # Cell contents (lower-cased, stripped) that the pure-Python validator
    # reports as missing values.
    _MISSING_TOKENS = frozenset({"", "na", "nan", "null"})

    def __init__(
        self,
        id_col: str = "id",
        target_col: str | list[str] = "target",
        float_precision: int = 6,
    ):
        self.id_col = id_col
        self.target_col = target_col
        self.float_precision = float_precision

    def to_csv(
        self,
        predictions: np.ndarray,
        ids: np.ndarray | None = None,
        filepath: str = "submission.csv",
        sample_submission: str | None = None,
    ) -> str:
        """Generate submission CSV file.

        Parameters
        ----------
        predictions : array-like
            Predicted values. 1D or ``(n, 1)`` for a single target; 2D with at
            least one column per target name for multi-target.
        ids : array-like, optional
            Sample IDs. If None, they are read from ``sample_submission`` when
            given, otherwise 0, 1, 2, ... is used.
        filepath : str, default='submission.csv'
            Output file path.
        sample_submission : str, optional
            Path to sample submission for ID extraction.

        Returns
        -------
        str
            Path to generated submission file.

        Raises
        ------
        ValueError
            If the number of IDs differs from the number of predictions, or
            the shape of ``predictions`` does not fit ``target_col``.
        """
        predictions = np.asarray(predictions)

        if ids is None:
            if sample_submission is not None:
                ids = self._load_ids_from_sample(sample_submission)
            else:
                ids = np.arange(len(predictions))
        ids = np.asarray(ids)

        if len(ids) != len(predictions):
            raise ValueError(
                f"Length mismatch: {len(ids)} IDs vs {len(predictions)} predictions"
            )

        # Validate the shape once, before choosing a backend, so every backend
        # rejects the same inputs with the same exception type.
        columns = self._target_columns(predictions)

        if HAS_PANDAS:
            df = pd.DataFrame({self.id_col: ids})
            for name, values in columns:
                df[name] = values

            # Round every float column (this includes a float ID column).
            float_cols = df.select_dtypes(include=[np.float64, np.float32]).columns
            for col in float_cols:
                df[col] = df[col].round(self.float_precision)

            df.to_csv(filepath, index=False)
        elif HAS_POLARS:
            data = {self.id_col: ids}
            for name, values in columns:
                data[name] = np.round(values, self.float_precision)
            pl.DataFrame(data).write_csv(filepath)
        else:
            # Standard-library fallback. csv.writer quotes IDs that contain
            # commas or quotes; newline="" plus an explicit terminator keeps
            # the line endings under our control.
            spec = f".{self.float_precision}f"
            names = [name for name, _ in columns]
            arrays = [values for _, values in columns]
            with open(filepath, "w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f, lineterminator="\n")
                writer.writerow([self.id_col, *names])
                for row_id, *values in zip(ids, *arrays):
                    writer.writerow(
                        [str(row_id), *(format(value, spec) for value in values)]
                    )

        return filepath

    def _target_columns(
        self, predictions: np.ndarray
    ) -> list[tuple[str, np.ndarray]]:
        """Split ``predictions`` into ``(column name, 1D values)`` pairs.

        Raises ``ValueError`` when the array shape does not fit ``target_col``.
        In multi-target mode, prediction columns beyond the number of target
        names are ignored.
        """
        if isinstance(self.target_col, list):
            if predictions.ndim == 1:
                raise ValueError("Multi-target specified but predictions is 1D")
            if predictions.ndim != 2 or predictions.shape[1] < len(self.target_col):
                raise ValueError(
                    f"Multi-target expects 2D predictions with at least "
                    f"{len(self.target_col)} columns, got shape {predictions.shape}"
                )
            return [(col, predictions[:, i]) for i, col in enumerate(self.target_col)]

        if predictions.ndim == 2 and predictions.shape[1] == 1:
            predictions = predictions.ravel()
        if predictions.ndim != 1:
            raise ValueError(
                f"Single target expects 1D or (n, 1) predictions, "
                f"got shape {predictions.shape}"
            )
        return [(self.target_col, predictions)]

    @staticmethod
    def _read_csv_rows(path: str) -> tuple[list[str], list[list[str]]]:
        """Read a CSV file with the stdlib parser; return ``(header, rows)``.

        Blank lines are skipped. ``utf-8-sig`` is used so a leading byte-order
        mark does not end up inside the first column name.
        """
        with open(path, newline="", encoding="utf-8-sig") as f:
            reader = csv.reader(f)
            header = next(reader, [])
            rows = [row for row in reader if row]
        return header, rows

    @staticmethod
    def _column_values(
        header: list[str], rows: list[list[str]], name: str
    ) -> list[str] | None:
        """Return the cells of column ``name``, or None if it is not in ``header``.

        Rows too short to contain the column contribute an empty string.
        """
        if name not in header:
            return None
        idx = header.index(name)
        return [row[idx] if idx < len(row) else "" for row in rows]

    def _load_ids_from_sample(self, sample_path: str) -> np.ndarray:
        """Load IDs from sample submission file.

        The pure-Python fallback returns the IDs as strings and raises
        ``ValueError`` if the ID column is not present in the header.
        """
        if HAS_PANDAS:
            return pd.read_csv(sample_path)[self.id_col].values
        if HAS_POLARS:
            return pl.read_csv(sample_path)[self.id_col].to_numpy()

        header, rows = self._read_csv_rows(sample_path)
        values = self._column_values(header, rows, self.id_col)
        if values is None:
            raise ValueError(
                f"ID column {self.id_col!r} not found in {sample_path}; "
                f"columns are {header}"
            )
        return np.array(values)

    @staticmethod
    def _compare_to_sample(
        result: dict[str, Any],
        sub_cols: list[str],
        sample_cols: list[str],
        n_sub: int,
        n_sample: int,
        sub_ids: Any,
        sample_ids: Any,
    ) -> None:
        """Record row-count, column and ID mismatches in ``result`` (in place).

        The ID comparison is skipped when either ``sub_ids`` or ``sample_ids``
        is None, i.e. when the ID column is absent from one of the files.
        """
        result["n_rows"] = n_sub

        if n_sub != n_sample:
            result["valid"] = False
            result["errors"].append(f"Row count mismatch: {n_sub} vs {n_sample}")

        if sub_cols != sample_cols:
            result["valid"] = False
            result["errors"].append(f"Column mismatch: {sub_cols} vs {sample_cols}")

        if sub_ids is not None and sample_ids is not None:
            if not np.array_equal(sub_ids, sample_ids):
                result["valid"] = False
                result["errors"].append("ID values don't match sample submission")

    def validate(
        self,
        submission_path: str,
        sample_submission_path: str,
    ) -> dict[str, Any]:
        """Validate submission against sample submission.

        Checks row count, column names/order and ID values against the sample
        file, and warns when the submission contains missing values.

        Parameters
        ----------
        submission_path : str
            Path to submission file.
        sample_submission_path : str
            Path to sample submission file.

        Returns
        -------
        Dict[str, Any]
            Validation results with keys:
            - valid: bool
            - errors: List[str]
            - warnings: List[str]
            - n_rows: int
        """
        result: dict[str, Any] = {
            "valid": True,
            "errors": [],
            "warnings": [],
            "n_rows": 0,
        }

        if HAS_PANDAS:
            # Broad catch is deliberate: any read failure is reported in the
            # result instead of being raised to the caller.
            try:
                sub = pd.read_csv(submission_path)
                sample = pd.read_csv(sample_submission_path)
            except Exception as e:
                result["valid"] = False
                result["errors"].append(f"Failed to read CSV: {e}")
                return result

            both_have_ids = self.id_col in sub.columns and self.id_col in sample.columns
            self._compare_to_sample(
                result,
                list(sub.columns),
                list(sample.columns),
                len(sub),
                len(sample),
                sub[self.id_col].values if both_have_ids else None,
                sample[self.id_col].values if both_have_ids else None,
            )

            if sub.isna().any().any():
                result["warnings"].append("Submission contains NaN values")
        elif HAS_POLARS:
            try:
                sub = pl.read_csv(submission_path)
                sample = pl.read_csv(sample_submission_path)
            except Exception as e:
                result["valid"] = False
                result["errors"].append(f"Failed to read CSV: {e}")
                return result

            both_have_ids = self.id_col in sub.columns and self.id_col in sample.columns
            self._compare_to_sample(
                result,
                list(sub.columns),
                list(sample.columns),
                len(sub),
                len(sample),
                sub[self.id_col].to_numpy() if both_have_ids else None,
                sample[self.id_col].to_numpy() if both_have_ids else None,
            )

            if sub.null_count().sum_horizontal()[0] > 0:
                result["warnings"].append("Submission contains null values")
        else:
            # Standard-library fallback so validation never silently passes
            # just because no dataframe library is installed.
            try:
                sub_header, sub_rows = self._read_csv_rows(submission_path)
                sample_header, sample_rows = self._read_csv_rows(
                    sample_submission_path
                )
            except (OSError, UnicodeError, csv.Error) as e:
                result["valid"] = False
                result["errors"].append(f"Failed to read CSV: {e}")
                return result

            self._compare_to_sample(
                result,
                sub_header,
                sample_header,
                len(sub_rows),
                len(sample_rows),
                self._column_values(sub_header, sub_rows, self.id_col),
                self._column_values(sample_header, sample_rows, self.id_col),
            )

            width = len(sub_header)
            has_missing = any(
                len(row) < width
                or any(cell.strip().lower() in self._MISSING_TOKENS for cell in row)
                for row in sub_rows
            )
            if has_missing:
                result["warnings"].append("Submission contains missing values")

        return result

    def from_oof_predictions(
        self,
        oof_models: list[Any],
        X_test: np.ndarray,
        weights: dict[int, float] | None = None,
        ids: np.ndarray | None = None,
        filepath: str = "submission.csv",
    ) -> str:
        """Generate submission from OOF models.

        Each model with a positive weight predicts on ``X_test`` and the
        predictions are combined as a weighted average. Models exposing
        ``predict_proba`` contribute the second probability column (or the
        only column when there is just one); other models contribute
        ``predict`` output.

        Parameters
        ----------
        oof_models : List
            List of trained models (from cross-validation).
        X_test : array-like
            Test features.
        weights : Dict[int, float], optional
            Model weights keyed by position in ``oof_models``. If None, uses
            uniform weights. Models with a missing or non-positive weight are
            skipped.
        ids : array-like, optional
            Test sample IDs.
        filepath : str
            Output file path.

        Returns
        -------
        str
            Path to submission file.

        Raises
        ------
        ValueError
            If ``oof_models`` is empty or no model has a positive weight.
        """
        X_test = np.asarray(X_test)
        n_models = len(oof_models)
        if n_models == 0:
            raise ValueError("oof_models must contain at least one model")

        if weights is None:
            weights = {i: 1.0 / n_models for i in range(n_models)}

        predictions = np.zeros(len(X_test))
        applied_weight = 0.0
        for i, model in enumerate(oof_models):
            weight = weights.get(i, 0.0)
            if weight <= 0:
                continue

            if hasattr(model, "predict_proba"):
                pred = np.asarray(model.predict_proba(X_test))
                if pred.ndim == 2:
                    # Second column when available; a single-column output is
                    # used as-is instead of raising IndexError.
                    pred = pred[:, 1] if pred.shape[1] > 1 else pred[:, 0]
            else:
                pred = np.asarray(model.predict(X_test))
                if pred.ndim == 2 and pred.shape[1] == 1:
                    pred = pred.ravel()

            predictions += weight * pred
            applied_weight += weight

        # Normalize by the weight that was actually applied. Summing every
        # entry of ``weights`` would also count skipped (non-positive or
        # out-of-range) entries and mis-scale the average.
        if applied_weight <= 0:
            raise ValueError("No model has a positive weight")
        predictions /= applied_weight

        return self.to_csv(predictions, ids, filepath)