Source code for endgame.utils.metrics

from __future__ import annotations

"""Competition-specific metrics not in sklearn."""

from collections.abc import Callable

import numpy as np
from sklearn.metrics import cohen_kappa_score



[docs]
def quadratic_weighted_kappa(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    labels: list[int] | None = None,
) -> float:
    """Quadratic Weighted Kappa (QWK) metric.

    Used in education competitions (e.g., essay scoring).
    Measures agreement between two ratings with quadratic weighting.
    The computation is delegated to ``sklearn.metrics.cohen_kappa_score``
    with ``weights="quadratic"``.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_pred : array-like
        Predicted labels. Floating-point predictions of any precision are
        rounded to the nearest integer before scoring.
    labels : List[int], optional
        List of labels to use for the confusion matrix.

    Returns
    -------
    float
        QWK score in range [-1, 1], where 1 is perfect agreement.

    Examples
    --------
    >>> y_true = [1, 2, 3, 4, 5]
    >>> y_pred = [1, 2, 3, 4, 4]
    >>> qwk = quadratic_weighted_kappa(y_true, y_pred)
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Regression-style outputs must be snapped to the nearest rating. Test the
    # dtype *kind* rather than a fixed list so float16 / longdouble inputs are
    # rounded too, not only float32 and float64.
    if np.issubdtype(y_pred.dtype, np.floating):
        y_pred = np.round(y_pred).astype(int)

    # Convert to a builtin float so the return value matches the annotation.
    return float(
        cohen_kappa_score(y_true, y_pred, weights="quadratic", labels=labels)
    )



def mean_average_precision(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    k: int | None = None,
) -> float:
    """Mean Average Precision (MAP).

    Computes the mean of average precision scores for each sample.
    For each sample the classes are ranked by descending predicted score;
    the precision at every rank holding a relevant class is summed and
    divided by ``min(number of relevant classes, number of ranks considered)``,
    the same normalisation used by ``map_at_k`` and ``apk``.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_classes) or (n_samples,)
        True relevance labels (binary). Entries equal to 1 are relevant.
        1-D input is treated as ``n_samples`` rows with a single class each.
    y_pred : array-like of shape (n_samples, n_classes) or (n_samples,)
        Predicted scores.
    k : int, optional
        Consider only top k predictions.

    Returns
    -------
    float
        MAP score. Samples without any relevant class contribute 0.0, and
        an input with zero samples yields 0.0.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape(-1, 1)

    n_samples = y_true.shape[0]
    if n_samples == 0:
        # np.mean([]) would return NaN and emit a RuntimeWarning.
        return 0.0

    avg_precisions = []
    for i in range(n_samples):
        relevant = y_true[i] == 1

        # Rank classes from highest to lowest predicted score.
        ranking = np.argsort(y_pred[i])[::-1]
        if k is not None:
            ranking = ranking[:k]

        hits = relevant[ranking]
        if not hits.any():
            avg_precisions.append(0.0)
            continue

        # precision@r for every rank r, then keep only ranks that are hits.
        precision_at_rank = np.cumsum(hits) / np.arange(1, len(ranking) + 1)
        precision_sum = float(precision_at_rank[hits].sum())

        # Normalise by the number of relevant classes that *could* have been
        # retrieved, not by the number actually retrieved in the top-k;
        # otherwise a truncated ranking is never penalised for misses.
        n_retrievable = min(int(relevant.sum()), len(ranking))
        avg_precisions.append(precision_sum / n_retrievable)

    return float(np.mean(avg_precisions))



[docs]
def map_at_k(
    y_true: list[list[int]] | np.ndarray,
    y_pred: list[list[int]] | np.ndarray,
    k: int = 5,
) -> float:
    """Mean Average Precision @ K.

    For ranking competitions where each sample has multiple relevant items.
    Each sample's AP@K is the sum of precision values at the ranks where a
    relevant item first appears, divided by ``min(number of distinct relevant
    items, k)``. A relevant item that is predicted more than once is only
    credited at its first occurrence, so the score cannot exceed 1.

    Parameters
    ----------
    y_true : List[List[int]]
        List of relevant item indices for each sample.
    y_pred : List[List[int]]
        List of predicted item indices (ranked) for each sample.
    k : int, default=5
        Number of predictions to consider.

    Returns
    -------
    float
        MAP@K score. Samples with no relevant items contribute 0.0, and an
        input with no samples yields 0.0.

    Examples
    --------
    >>> y_true = [[1, 2, 3], [4, 5]]
    >>> y_pred = [[1, 3, 5, 2, 4], [4, 1, 5, 2, 3]]
    >>> score = map_at_k(y_true, y_pred, k=5)
    """
    avg_precisions = []

    for true_items, pred_items in zip(y_true, y_pred):
        true_set = set(true_items)

        if not true_set:
            avg_precisions.append(0.0)
            continue

        # Relevant items already credited; repeated predictions of the same
        # item must not count as additional hits.
        credited = set()
        precision_sum = 0.0

        for rank, item in enumerate(list(pred_items)[:k], start=1):
            if item in true_set and item not in credited:
                credited.add(item)
                precision_sum += len(credited) / rank

        avg_precisions.append(precision_sum / min(len(true_set), k))

    if not avg_precisions:
        # np.mean([]) would return NaN and emit a RuntimeWarning.
        return 0.0

    return float(np.mean(avg_precisions))



def apk(actual: list[int], predicted: list[int], k: int = 10) -> float:
    """Average Precision @ K for a single sample.

    Sums the precision at every rank (within the first ``k`` predictions)
    where a relevant item appears for the first time, then divides by
    ``min(len(actual), k)``.

    Parameters
    ----------
    actual : List[int]
        List of relevant items. Any sized sequence (including a NumPy
        array) is accepted.
    predicted : List[int]
        List of predicted items (ranked).
    k : int, default=10
        Number of predictions to consider.

    Returns
    -------
    float
        AP@K score; 0.0 when ``actual`` is empty or None.
    """
    # Use an explicit length check: truth-testing a multi-element NumPy
    # array raises ValueError.
    if actual is None or len(actual) == 0:
        return 0.0

    actual_set = set(actual)

    # Relevant items already credited. Tracking them in a set replaces the
    # per-item scan of the preceding predictions while still ignoring
    # duplicate predictions of the same item.
    credited = set()
    score = 0.0

    for rank, item in enumerate(predicted[:k], start=1):
        if item in actual_set and item not in credited:
            credited.add(item)
            score += len(credited) / rank

    return score / min(len(actual), k)



[docs]
def ndcg_at_k(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    k: int = 10,
) -> float:
    """Normalized Discounted Cumulative Gain @ K.

    Used in ranking competitions. Items are ranked by descending predicted
    score; the gain of an item with relevance ``r`` is ``2**r - 1`` and the
    discount at 1-based rank ``i`` is ``log2(i + 1)``. The resulting DCG is
    divided by the DCG of the ideal (relevance-sorted) ranking.

    Parameters
    ----------
    y_true : array-like
        True relevance scores.
    y_pred : array-like
        Predicted scores, one per entry of ``y_true``.
    k : int, default=10
        Number of predictions to consider.

    Returns
    -------
    float
        NDCG@K score in [0, 1]; 0.0 when the ideal DCG is zero.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    # Work in floating point: ``2 ** scores`` on an integer array silently
    # wraps around for large relevance values and raises for negative ones.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    def dcg(scores: np.ndarray) -> float:
        """Discounted cumulative gain of the first ``k`` entries of *scores*."""
        top = scores[:k]
        gains = np.power(2.0, top) - 1.0
        discounts = np.log2(np.arange(len(top)) + 2)
        return float(np.sum(gains / discounts))

    # Relevance in the order induced by the predictions (best score first).
    ranked_true = y_true[np.argsort(y_pred)[::-1]]

    # Relevance in the best achievable order.
    ideal_true = np.sort(y_true)[::-1]

    dcg_score = dcg(ranked_true)
    idcg_score = dcg(ideal_true)

    if idcg_score == 0:
        return 0.0

    return dcg_score / idcg_score



def mcrmse(
    y_true: np.ndarray,
    y_pred: np.ndarray,
) -> float:
    """Mean Columnwise Root Mean Squared Error.

    Used in multi-target regression competitions. The RMSE is computed
    independently for every target column and the per-column values are
    then averaged.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_targets)
        True values. A 1-D input is treated as a single target column.
    y_pred : array-like of shape (n_samples, n_targets)
        Predicted values. A 1-D input is treated as a single target column.

    Returns
    -------
    float
        MCRMSE score.
    """

    def _as_columns(values) -> np.ndarray:
        # Promote a flat vector to one column; leave other ranks untouched.
        arr = np.asarray(values)
        return arr[:, np.newaxis] if arr.ndim == 1 else arr

    residuals = _as_columns(y_true) - _as_columns(y_pred)
    mse_by_target = np.mean(np.square(residuals), axis=0)
    rmse_by_target = np.sqrt(mse_by_target)
    return np.mean(rmse_by_target)



[docs]
def competition_metric(metric_name: str) -> Callable:
    """Get metric function by name.

    Handles both sklearn metrics and competition-specific metrics.
    Custom metric names are matched case-insensitively; any other name is
    passed unchanged to ``sklearn.metrics.get_scorer``.

    Parameters
    ----------
    metric_name : str
        Metric name. Custom names: 'qwk', 'quadratic_weighted_kappa', 'map',
        'map_at_k', 'ndcg', 'ndcg_at_k', 'mcrmse'. Anything else is looked
        up as an sklearn scorer name.

    Returns
    -------
    Callable
        Metric function. For sklearn scorers this is the scorer's underlying
        score function (not the scorer object itself), with the keyword
        arguments the scorer was registered with already bound.

    Raises
    ------
    ValueError
        If the name is neither a custom metric nor resolvable via sklearn.
        The underlying error is attached as ``__cause__``.
    """
    import functools

    custom_metrics = {
        "qwk": quadratic_weighted_kappa,
        "quadratic_weighted_kappa": quadratic_weighted_kappa,
        "map": mean_average_precision,
        "map_at_k": map_at_k,
        "ndcg": ndcg_at_k,
        "ndcg_at_k": ndcg_at_k,
        "mcrmse": mcrmse,
    }

    key = metric_name.lower()
    if key in custom_metrics:
        return custom_metrics[key]

    # Try sklearn metrics
    try:
        from sklearn.metrics import get_scorer

        scorer = get_scorer(metric_name)
        score_func = scorer._score_func
        # NOTE(review): `_score_func` / `_kwargs` are private scorer
        # attributes; confirm they exist on the sklearn versions supported.
        score_kwargs = getattr(scorer, "_kwargs", None)
    except Exception as err:
        raise ValueError(
            f"Unknown metric: {metric_name}. "
            f"Available custom metrics: {list(custom_metrics.keys())}"
        ) from err

    if not score_kwargs:
        return score_func

    # Scorers such as "f1_macro" carry keyword arguments (average="macro");
    # returning the bare function would silently compute a different metric.
    bound = functools.partial(score_func, **score_kwargs)
    # Keep __name__ / __doc__ of the underlying metric available to callers.
    functools.update_wrapper(bound, score_func)
    return bound