# Source code for endgame.utils.sharpe

from __future__ import annotations

"""Sharpe Ratio utilities for backtesting and multiple testing correction.

This module implements the Deflated Sharpe Ratio (DSR) and Probabilistic Sharpe
Ratio (PSR) from Bailey & López de Prado's work on addressing the multiple
testing problem in quantitative finance.

References
----------
- Bailey, D.H. and López de Prado, M. (2014). "The Deflated Sharpe Ratio:
  Correcting for Selection Bias, Backtest Overfitting, and Non-Normality."
  The Journal of Portfolio Management, 40(5), 94-107.
- López de Prado, M. (2018). "Advances in Financial Machine Learning."
  John Wiley & Sons, Chapter 14.
"""

from dataclasses import dataclass

import numpy as np
from scipy import stats

# Euler-Mascheroni constant (gamma). Used by expected_max_sharpe as the
# weight that blends the two normal quantiles in the expected-maximum
# approximation.
EULER_MASCHERONI = 0.5772156649015329



[docs]
@dataclass
class SharpeAnalysis:
    """Container for the results of a Sharpe ratio analysis.

    The descriptions below reflect how ``analyze_sharpe`` populates the
    fields; other producers may use them differently.

    Attributes
    ----------
    sharpe_ratio : float
        The estimated (annualized) Sharpe ratio.
    probabilistic_sharpe : float
        PSR - probability that the true SR exceeds a benchmark of zero.
    deflated_sharpe : float
        DSR - PSR evaluated against the expected maximum SR, i.e. adjusted
        for multiple testing.
    expected_max_sharpe : float
        Expected maximum SR under the null hypothesis.
    p_value : float
        ``1 - deflated_sharpe``, i.e. the p-value of the test against the
        expected maximum SR (not against SR = 0).
    is_significant : bool
        Whether DSR is at least ``1 - significance_level``.
    n_trials : int
        Number of trials considered.
    skewness : float
        Skewness of the excess returns.
    kurtosis : float
        Kurtosis of the excess returns. ``analyze_sharpe`` stores the raw
        (non-excess) kurtosis, which equals 3 for a normal distribution.
    track_record_length : int
        Number of observations.
    """
    sharpe_ratio: float
    probabilistic_sharpe: float
    deflated_sharpe: float
    expected_max_sharpe: float
    p_value: float
    is_significant: bool
    n_trials: int
    skewness: float
    kurtosis: float
    track_record_length: int




[docs]
def sharpe_ratio(
    returns: np.ndarray,
    risk_free_rate: float = 0.0,
    annualization_factor: float = 252.0,
) -> float:
    """Compute the annualized Sharpe ratio of a return series.

    The ratio is the mean excess return divided by the sample standard
    deviation (``ddof=1``) of the excess returns, scaled by
    ``sqrt(annualization_factor)``.

    Parameters
    ----------
    returns : np.ndarray
        Array of periodic returns.
    risk_free_rate : float, default=0.0
        Per-period risk-free rate subtracted from every return.
    annualization_factor : float, default=252.0
        Number of periods per year (252 for daily, 52 for weekly,
        12 for monthly).

    Returns
    -------
    float
        Annualized Sharpe ratio, or ``0.0`` when the excess returns have
        (numerically) zero volatility.

    Examples
    --------
    >>> returns = np.random.randn(252) * 0.01 + 0.0005  # Daily returns
    >>> sr = sharpe_ratio(returns)
    """
    excess = np.asarray(returns) - risk_free_rate
    volatility = np.std(excess, ddof=1)

    # A flat series has no meaningful risk-adjusted return; report zero
    # instead of dividing by (almost) nothing.
    if volatility == 0 or np.isclose(volatility, 0, atol=1e-10):
        return 0.0

    per_period_ratio = np.mean(excess) / volatility
    return per_period_ratio * np.sqrt(annualization_factor)




[docs]
def sharpe_ratio_std(
    sharpe: float,
    n_obs: int,
    skewness: float = 0.0,
    kurtosis: float = 3.0,
) -> float:
    """Calculate the standard error of the Sharpe ratio estimate.

    Uses the Lo (2002) / Mertens (2002) correction for non-normality:

        Var(SR) = (1 - skew * SR + ((kurtosis - 1) / 4) * SR^2) / (n - 1)

    Parameters
    ----------
    sharpe : float
        Estimated Sharpe ratio. For the formula to be statistically
        meaningful it must be expressed at the same frequency as the
        observations counted by ``n_obs`` (i.e. not annualized).
    n_obs : int
        Number of observations; must be at least 2.
    skewness : float, default=0.0
        Skewness of returns.
    kurtosis : float, default=3.0
        Kurtosis of returns (not excess kurtosis; 3 for a normal).

    Returns
    -------
    float
        Standard error of the Sharpe ratio. If the moment inputs make the
        variance expression negative, the variance is clamped to zero and
        ``0.0`` is returned.

    Raises
    ------
    ValueError
        If ``n_obs`` is less than 2, since the ``n - 1`` denominator is
        then zero or negative and no standard error can be estimated.

    References
    ----------
    Lo, A. (2002). "The Statistics of Sharpe Ratios."
    Financial Analysts Journal, 58(4), 36-52.
    """
    if n_obs < 2:
        raise ValueError(
            f"n_obs must be at least 2 to estimate a standard error, got {n_obs}."
        )

    # (kurtosis - 1) / 4 is the same as (excess_kurtosis + 2) / 4.
    variance = (
        1.0
        - skewness * sharpe
        + ((kurtosis - 1.0) / 4.0) * sharpe ** 2
    ) / (n_obs - 1)

    # Extreme skewness/kurtosis inputs can push the expression below zero;
    # clamp so the square root stays real.
    return float(np.sqrt(max(variance, 0.0)))




[docs]
def probabilistic_sharpe_ratio(
    sharpe: float,
    benchmark_sharpe: float,
    n_obs: int,
    skewness: float = 0.0,
    kurtosis: float = 3.0,
) -> float:
    """Compute the Probabilistic Sharpe Ratio (PSR).

    The PSR is ``Phi((sharpe - benchmark_sharpe) / se)``, where ``se`` is the
    standard error returned by ``sharpe_ratio_std`` and ``Phi`` is the
    standard normal CDF.

    Parameters
    ----------
    sharpe : float
        Estimated Sharpe ratio.
    benchmark_sharpe : float
        Benchmark Sharpe ratio to compare against.
    n_obs : int
        Number of observations.
    skewness : float, default=0.0
        Skewness of returns.
    kurtosis : float, default=3.0
        Kurtosis of returns (not excess kurtosis).

    Returns
    -------
    float
        Probability in [0, 1] that the true SR exceeds the benchmark SR.
        When the standard error is exactly zero the result collapses to
        ``1.0`` if ``sharpe > benchmark_sharpe`` and ``0.0`` otherwise.

    Examples
    --------
    >>> # Test if strategy beats SR = 0
    >>> psr = probabilistic_sharpe_ratio(sharpe=1.5, benchmark_sharpe=0,
    ...                                   n_obs=252, skewness=-0.2, kurtosis=4.0)
    >>> print(f"Probability true SR > 0: {psr:.2%}")

    Notes
    -----
    PSR adjusts for the finite track record and for non-normal returns
    (skewness and fat tails). It does NOT adjust for multiple testing;
    use the Deflated Sharpe Ratio for that.

    References
    ----------
    Bailey, D.H. and López de Prado, M. (2012). "The Sharpe Ratio
    Efficient Frontier." Journal of Risk, 15(2), 3-44.
    """
    std_err = sharpe_ratio_std(sharpe, n_obs, skewness, kurtosis)

    if std_err != 0:
        # Standardized distance from the benchmark, mapped through Phi.
        z_score = (sharpe - benchmark_sharpe) / std_err
        return float(stats.norm.cdf(z_score))

    # No sampling uncertainty: the comparison is deterministic.
    if sharpe > benchmark_sharpe:
        return 1.0
    return 0.0




[docs]
def expected_max_sharpe(
    n_trials: int,
    sharpe_std: float,
    mean_sharpe: float = 0.0,
) -> float:
    """Approximate the expected maximum Sharpe ratio across several trials.

    Gives the largest SR one should expect to observe purely by chance when
    ``n_trials`` strategies are tested and every one of them has a true SR
    equal to ``mean_sharpe``.

    Parameters
    ----------
    n_trials : int
        Number of independent trials/strategies tested.
    sharpe_std : float
        Standard deviation of Sharpe ratio estimates across trials.
    mean_sharpe : float, default=0.0
        Mean Sharpe ratio under the null (typically 0).

    Returns
    -------
    float
        Expected maximum Sharpe ratio E[max{SR_i}]. With one trial or fewer
        there is no selection effect and ``mean_sharpe`` is returned as is.

    Notes
    -----
    Implements the approximation from Bailey & López de Prado (2014):

    E[max{SR}] ≈ μ + σ * [(1-γ)*Φ^(-1)(1-1/N) + γ*Φ^(-1)(1-1/(N*e))]

    where γ is the Euler-Mascheroni constant.

    Examples
    --------
    >>> # After 100 trials, what SR do we expect by chance?
    >>> e_max = expected_max_sharpe(n_trials=100, sharpe_std=0.5)
    >>> print(f"Expected max SR: {e_max:.2f}")
    """
    if n_trials <= 1:
        return mean_sharpe

    # The two normal quantiles that the approximation interpolates between.
    quantile_n = stats.norm.ppf(1 - 1 / n_trials)
    quantile_ne = stats.norm.ppf(1 - 1 / (n_trials * np.e))

    # Blend them using the Euler-Mascheroni constant as the weight.
    blended_quantile = (
        (1 - EULER_MASCHERONI) * quantile_n + EULER_MASCHERONI * quantile_ne
    )

    return mean_sharpe + sharpe_std * blended_quantile




[docs]
def deflated_sharpe_ratio(
    sharpe: float,
    n_trials: int,
    sharpe_std_trials: float,
    n_obs: int,
    skewness: float = 0.0,
    kurtosis: float = 3.0,
    mean_sharpe_null: float = 0.0,
) -> float:
    """Compute the Deflated Sharpe Ratio (DSR).

    The DSR is the Probabilistic Sharpe Ratio evaluated against a benchmark
    equal to the expected maximum Sharpe ratio obtained from ``n_trials``
    trials under the null hypothesis.

    Parameters
    ----------
    sharpe : float
        Estimated Sharpe ratio of the selected strategy.
    n_trials : int
        Number of independent trials/strategies tested.
    sharpe_std_trials : float
        Standard deviation of Sharpe ratios across all trials.
    n_obs : int
        Number of observations (track record length).
    skewness : float, default=0.0
        Skewness of returns.
    kurtosis : float, default=3.0
        Kurtosis of returns (not excess kurtosis).
    mean_sharpe_null : float, default=0.0
        Mean Sharpe ratio under the null hypothesis.

    Returns
    -------
    float
        Deflated Sharpe Ratio in [0, 1].

    Examples
    --------
    >>> # Tested 100 strategies, best has SR = 2.0
    >>> dsr = deflated_sharpe_ratio(
    ...     sharpe=2.0,
    ...     n_trials=100,
    ...     sharpe_std_trials=0.5,
    ...     n_obs=252,
    ...     skewness=-0.3,
    ...     kurtosis=4.5,
    ... )
    >>> print(f"DSR: {dsr:.2%}")
    >>> # If DSR < 0.95, the strategy may be a statistical fluke

    Notes
    -----
    DSR answers: "How likely is it that this strategy beats what pure
    chance would have produced, given that N strategies were tried?"

    References
    ----------
    Bailey, D.H. and López de Prado, M. (2014). "The Deflated Sharpe Ratio:
    Correcting for Selection Bias, Backtest Overfitting, and Non-Normality."
    The Journal of Portfolio Management, 40(5), 94-107.
    """
    # The hurdle is the best SR expected from luck alone across the trials.
    selection_hurdle = expected_max_sharpe(
        n_trials, sharpe_std_trials, mean_sharpe_null
    )

    # DSR is simply the PSR measured against that hurdle.
    return probabilistic_sharpe_ratio(
        sharpe, selection_hurdle, n_obs, skewness, kurtosis
    )




[docs]
def analyze_sharpe(
    returns: np.ndarray,
    n_trials: int = 1,
    sharpe_std_trials: float | None = None,
    all_sharpes: np.ndarray | None = None,
    risk_free_rate: float = 0.0,
    annualization_factor: float = 252.0,
    significance_level: float = 0.05,
) -> SharpeAnalysis:
    """Comprehensive Sharpe ratio analysis with multiple testing correction.

    The reported ``sharpe_ratio`` and ``expected_max_sharpe`` are annualized.
    The PSR and DSR, however, are computed from the per-period Sharpe ratio,
    because the standard-error formula they rely on relates a Sharpe ratio
    to the number of observations at the *same* frequency. Feeding the
    annualized ratio together with the per-period observation count would
    overstate the z-score by roughly ``sqrt(annualization_factor)``.

    Parameters
    ----------
    returns : np.ndarray
        Array of periodic returns for the selected strategy (at least two
        observations).
    n_trials : int, default=1
        Number of independent trials/strategies tested.
    sharpe_std_trials : float, optional
        Standard deviation of Sharpe ratios across all trials, in the same
        annualized units as the reported Sharpe ratio.
        If not provided and all_sharpes is given, computed from all_sharpes.
        If neither is provided, estimated as
        ``sqrt(annualization_factor) / sqrt(n_obs)``.
    all_sharpes : np.ndarray, optional
        Annualized Sharpe ratios of all tested strategies (used only to
        compute ``sharpe_std_trials`` when that is not given).
    risk_free_rate : float, default=0.0
        Risk-free rate (same period as returns).
    annualization_factor : float, default=252.0
        Number of periods per year; must be positive.
    significance_level : float, default=0.05
        Significance level; the result is flagged significant when
        ``DSR >= 1 - significance_level``.

    Returns
    -------
    SharpeAnalysis
        Comprehensive analysis results. ``p_value`` is ``1 - DSR`` and
        ``kurtosis`` is the raw (non-excess) kurtosis of the excess returns.

    Raises
    ------
    ValueError
        If fewer than two returns are supplied or ``annualization_factor``
        is not positive.

    Examples
    --------
    >>> # Single strategy analysis
    >>> returns = np.random.randn(252) * 0.01 + 0.0005
    >>> analysis = analyze_sharpe(returns)
    >>> print(f"SR: {analysis.sharpe_ratio:.2f}")
    >>> print(f"PSR (SR > 0): {analysis.probabilistic_sharpe:.2%}")

    >>> # Multiple testing scenario: 100 strategies tested, analyze the best
    >>> all_returns = np.random.randn(100, 252) * 0.01
    >>> all_sharpes = np.array([sharpe_ratio(r) for r in all_returns])
    >>> best_returns = all_returns[np.argmax(all_sharpes)]
    >>> analysis = analyze_sharpe(
    ...     returns=best_returns,
    ...     n_trials=100,
    ...     all_sharpes=all_sharpes,
    ... )
    >>> print(f"DSR: {analysis.deflated_sharpe:.2%}")
    >>> print(f"Significant: {analysis.is_significant}")
    """
    returns = np.asarray(returns)
    n_obs = len(returns)

    if n_obs < 2:
        raise ValueError(
            f"At least 2 return observations are required, got {n_obs}."
        )
    if annualization_factor <= 0:
        raise ValueError(
            f"annualization_factor must be positive, got {annualization_factor}."
        )

    # Higher moments of the excess returns (raw kurtosis, 3 for a normal).
    excess_returns = returns - risk_free_rate
    skewness = float(stats.skew(excess_returns))
    kurtosis = float(stats.kurtosis(excess_returns, fisher=False))

    # Annualized Sharpe ratio: this is the value reported to the caller.
    sr = sharpe_ratio(returns, risk_free_rate, annualization_factor)

    # Dispersion of (annualized) Sharpe ratios across the trials.
    if sharpe_std_trials is None:
        if all_sharpes is not None:
            sharpe_std_trials = float(np.std(all_sharpes, ddof=1))
        else:
            # Fallback: sampling std of a zero-SR strategy, ~1/sqrt(T) per
            # period, scaled to annualized units.
            sharpe_std_trials = 1.0 / np.sqrt(n_obs) * np.sqrt(annualization_factor)

    # Convert to per-period units so the Sharpe ratio and n_obs refer to the
    # same sampling frequency inside the PSR/DSR standard-error formula.
    annual_scale = np.sqrt(annualization_factor)
    sr_per_period = sr / annual_scale
    std_trials_per_period = sharpe_std_trials / annual_scale

    # PSR: probability that the true SR exceeds zero.
    psr = probabilistic_sharpe_ratio(
        sharpe=sr_per_period,
        benchmark_sharpe=0.0,
        n_obs=n_obs,
        skewness=skewness,
        kurtosis=kurtosis,
    )

    # Expected maximum SR under the null, reported in annualized units so it
    # is directly comparable with `sharpe_ratio`.
    e_max_sr = expected_max_sharpe(n_trials, sharpe_std_trials, 0.0)

    # DSR: PSR against the (per-period) expected maximum SR.
    dsr = deflated_sharpe_ratio(
        sharpe=sr_per_period,
        n_trials=n_trials,
        sharpe_std_trials=std_trials_per_period,
        n_obs=n_obs,
        skewness=skewness,
        kurtosis=kurtosis,
    )

    # P-value of the test against the expected maximum SR.
    p_value = 1.0 - dsr

    return SharpeAnalysis(
        sharpe_ratio=sr,
        probabilistic_sharpe=psr,
        deflated_sharpe=dsr,
        expected_max_sharpe=e_max_sr,
        p_value=p_value,
        is_significant=bool(dsr >= (1 - significance_level)),
        n_trials=n_trials,
        skewness=skewness,
        kurtosis=kurtosis,
        track_record_length=n_obs,
    )




[docs]
def minimum_track_record_length(
    sharpe: float,
    benchmark_sharpe: float = 0.0,
    confidence: float = 0.95,
    skewness: float = 0.0,
    kurtosis: float = 3.0,
) -> int:
    """Calculate minimum track record length needed for statistical significance.

    Answers: "How many observations do we need to be confident that the
    strategy's Sharpe ratio exceeds the benchmark?"

    Parameters
    ----------
    sharpe : float
        Target Sharpe ratio, expressed per observation period (the result
        is a count of observations at that same frequency).
    benchmark_sharpe : float, default=0.0
        Benchmark to beat, in the same units as ``sharpe``.
    confidence : float, default=0.95
        Required confidence level, strictly between 0 and 1.
    skewness : float, default=0.0
        Expected skewness of returns.
    kurtosis : float, default=3.0
        Expected kurtosis of returns (not excess kurtosis).

    Returns
    -------
    int
        Minimum number of observations needed (at least 1). When ``sharpe``
        does not exceed ``benchmark_sharpe`` no track record is long enough
        and ``numpy.inf`` (a float) is returned instead.

    Raises
    ------
    ValueError
        If ``confidence`` is not strictly between 0 and 1.

    Examples
    --------
    >>> # How long to verify SR = 1.0 strategy?
    >>> n_min = minimum_track_record_length(sharpe=1.0)
    >>> print(f"Need at least {n_min} observations")

    Notes
    -----
    This is the "MinTRL" from Bailey & López de Prado (2012), obtained by
    solving the PSR formula for n:

        n >= 1 + (z / (SR - SR_0))^2 * (1 - skew*SR + ((kurtosis-1)/4)*SR^2)

    With normal returns, a zero benchmark and 95% confidence, a per-period
    SR of 1.0 needs 6 observations, while a per-period SR of 0.1 needs 273.
    """
    # Validate first: confidence >= 1 would give an infinite/NaN z-score and
    # fail later with an opaque conversion error.
    if not 0.0 < confidence < 1.0:
        raise ValueError(
            f"confidence must be strictly between 0 and 1, got {confidence}."
        )

    if sharpe <= benchmark_sharpe:
        # NOTE: a float sentinel despite the `int` annotation; kept for
        # backward compatibility with existing callers.
        return np.inf

    # One-sided z-score for the desired confidence.
    z = stats.norm.ppf(confidence)

    sr_diff = sharpe - benchmark_sharpe

    # Numerator of the SR-estimator variance; (kurtosis - 1) / 4 is the same
    # as (excess_kurtosis + 2) / 4. Clamped at zero, like the variance in
    # sharpe_ratio_std, so extreme moments cannot yield a negative length.
    variance_factor = max(
        1 - skewness * sharpe + ((kurtosis - 1.0) / 4) * sharpe ** 2,
        0.0,
    )

    n_min = 1 + (z / sr_diff) ** 2 * variance_factor

    return int(np.ceil(n_min))




[docs]
def haircut_sharpe_ratio(
    sharpe: float,
    n_trials: int,
    sharpe_std_trials: float = 0.5,
) -> tuple[float, float]:
    """Discount an observed Sharpe ratio for the number of strategies tried.

    The discount ("haircut") is the expected maximum Sharpe ratio that
    ``n_trials`` strategies with zero true SR would produce by chance.

    Parameters
    ----------
    sharpe : float
        Observed Sharpe ratio.
    n_trials : int
        Number of strategies tested.
    sharpe_std_trials : float, default=0.5
        Standard deviation of SR estimates across trials.

    Returns
    -------
    Tuple[float, float]
        (haircut_sharpe, haircut_percent)
        - haircut_sharpe: ``sharpe`` minus the expected maximum SR under
          the null
        - haircut_percent: expected maximum SR as a fraction of ``sharpe``
          (``0.0`` when ``sharpe`` is exactly zero)

    Examples
    --------
    >>> sr_adj, haircut = haircut_sharpe_ratio(sharpe=2.0, n_trials=100)
    >>> print(f"Adjusted SR: {sr_adj:.2f} (haircut: {haircut:.1%})")

    Notes
    -----
    SR_adjusted = SR_observed - E[max{SR}|null]
    """
    null_max = expected_max_sharpe(n_trials, sharpe_std_trials, 0.0)

    # A relative haircut is undefined for a zero Sharpe ratio; report 0.0.
    if sharpe != 0:
        relative_cut = null_max / sharpe
    else:
        relative_cut = 0.0

    return sharpe - null_max, relative_cut




[docs]
def estimate_n_independent_trials(
    sharpe_ratios: np.ndarray,
    method: str = "variance",
    expected_std: float = 0.5,
) -> int:
    """Estimate effective number of independent trials from correlated strategies.

    When strategies are correlated, the effective number of independent trials
    is less than the total number tested.

    Parameters
    ----------
    sharpe_ratios : np.ndarray
        Array of Sharpe ratios from all tested strategies.
    method : str, default="variance"
        Method to estimate N:
        - "variance": scale the raw count by the ratio of the observed
          variance of the Sharpe ratios to ``expected_std ** 2``, capped
          at the raw count and floored at 1
        - "count": just use the raw count
    expected_std : float, default=0.5
        Standard deviation the Sharpe ratios are assumed to have if the
        strategies were independent. Only used by the "variance" method;
        must be positive.

    Returns
    -------
    int
        Estimated number of independent trials. With the "variance" method
        the result is always at least 1; with fewer than two Sharpe ratios
        no variance can be computed and 1 is returned.

    Raises
    ------
    ValueError
        If ``method`` is unknown, ``expected_std`` is not positive, or the
        variance of ``sharpe_ratios`` is not finite (e.g. NaN entries).

    Notes
    -----
    López de Prado (2018) recommends using clustering (ONC algorithm) for
    more accurate estimation. This function provides simpler heuristics.
    """
    n_total = len(sharpe_ratios)

    if method == "count":
        return n_total
    if method != "variance":
        raise ValueError(f"Unknown method: {method}. Use 'variance' or 'count'.")

    if expected_std <= 0:
        raise ValueError(f"expected_std must be positive, got {expected_std}.")

    # A sample variance needs at least two points; with fewer there is at
    # most one trial anyway.
    if n_total < 2:
        return 1

    observed_var = float(np.std(sharpe_ratios, ddof=1)) ** 2
    if not np.isfinite(observed_var):
        raise ValueError("sharpe_ratios must contain only finite values.")

    # Low dispersion relative to the independent-trials assumption suggests
    # correlated strategies, so shrink the count proportionally (never
    # inflate it above the raw count).
    shrink = min(observed_var / expected_std ** 2, 1.0)

    return max(1, int(np.ceil(n_total * shrink)))




[docs]
def multiple_testing_summary(
    sharpe_ratios: np.ndarray,
    returns_list: list[np.ndarray] | None = None,
    n_obs: int = 252,
    significance_level: float = 0.05,
) -> dict:
    """Build a summary report for a multiple-testing analysis.

    Parameters
    ----------
    sharpe_ratios : np.ndarray
        Sharpe ratios of all tested strategies.
    returns_list : List[np.ndarray], optional
        Return arrays, one per strategy. When it contains an entry for the
        best strategy, that entry's skewness, kurtosis and length are used
        for every DSR computed here (overriding ``n_obs``).
    n_obs : int, default=252
        Number of observations per strategy, used when the best strategy's
        returns are not available.
    significance_level : float, default=0.05
        Significance level; a strategy counts as significant when its DSR
        is at least ``1 - significance_level``.

    Returns
    -------
    dict
        Summary with the following keys:
        - n_trials: Total strategies tested
        - n_effective: Estimated independent trials
        - sharpe_mean: Mean of the Sharpe ratios
        - sharpe_std: Sample standard deviation of the Sharpe ratios
        - best_sharpe: Highest observed SR
        - expected_max_sharpe: Expected max SR under null
        - best_dsr: DSR of best strategy
        - haircut_sharpe: Best SR minus the expected max SR
        - haircut_percent: Haircut as a fraction of the best SR
        - n_significant: Number of strategies passing the DSR threshold
        - significance_level: The significance level that was applied
    """
    n_trials = len(sharpe_ratios)
    sharpe_std = np.std(sharpe_ratios, ddof=1)
    best_idx = np.argmax(sharpe_ratios)
    best_sharpe = sharpe_ratios[best_idx]

    # Effective number of independent trials (reported only).
    n_eff = estimate_n_independent_trials(sharpe_ratios)

    # Sharpe ratio expected from the luckiest of n_trials null strategies.
    e_max = expected_max_sharpe(n_trials, sharpe_std, 0.0)

    # Moments default to a normal distribution unless the best strategy's
    # returns are on hand.
    skewness, kurtosis = 0.0, 3.0
    if returns_list is not None and len(returns_list) > best_idx:
        best_returns = returns_list[best_idx]
        skewness = float(stats.skew(best_returns))
        kurtosis = float(stats.kurtosis(best_returns, fisher=False))
        n_obs = len(best_returns)

    def _dsr_for(candidate_sr):
        # Every strategy is scored with the same trial count, dispersion,
        # track-record length and moments.
        return deflated_sharpe_ratio(
            sharpe=candidate_sr,
            n_trials=n_trials,
            sharpe_std_trials=sharpe_std,
            n_obs=n_obs,
            skewness=skewness,
            kurtosis=kurtosis,
        )

    best_dsr = _dsr_for(best_sharpe)

    # Tally the strategies whose DSR clears the significance threshold.
    threshold = 1 - significance_level
    n_significant = sum(1 for sr in sharpe_ratios if _dsr_for(sr) >= threshold)

    haircut_sr, haircut_pct = haircut_sharpe_ratio(best_sharpe, n_trials, sharpe_std)

    summary = {
        "n_trials": n_trials,
        "n_effective": n_eff,
        "sharpe_mean": float(np.mean(sharpe_ratios)),
        "sharpe_std": sharpe_std,
        "best_sharpe": best_sharpe,
        "expected_max_sharpe": e_max,
        "best_dsr": best_dsr,
        "haircut_sharpe": haircut_sr,
        "haircut_percent": haircut_pct,
        "n_significant": n_significant,
        "significance_level": significance_level,
    }
    return summary