from __future__ import annotations
"""Sharpe Ratio utilities for backtesting and multiple testing correction.
This module implements the Deflated Sharpe Ratio (DSR) and Probabilistic Sharpe
Ratio (PSR) from Bailey & López de Prado's work on addressing the multiple
testing problem in quantitative finance.
References
----------
- Bailey, D.H. and López de Prado, M. (2014). "The Deflated Sharpe Ratio:
Correcting for Selection Bias, Backtest Overfitting, and Non-Normality."
The Journal of Portfolio Management, 40(5), 94-107.
- López de Prado, M. (2018). "Advances in Financial Machine Learning."
John Wiley & Sons, Chapter 14.
"""
from dataclasses import dataclass
import numpy as np
from scipy import stats
# Euler-Mascheroni constant
EULER_MASCHERONI = 0.5772156649015329
[docs]
@dataclass
class SharpeAnalysis:
"""Results from Sharpe ratio analysis.
Attributes
----------
sharpe_ratio : float
The estimated Sharpe ratio.
probabilistic_sharpe : float
PSR - probability that true SR > benchmark.
deflated_sharpe : float
DSR - PSR adjusted for multiple testing.
expected_max_sharpe : float
Expected maximum SR under null hypothesis.
p_value : float
P-value for the null hypothesis that true SR = 0.
is_significant : bool
Whether DSR exceeds significance threshold.
n_trials : int
Number of trials considered.
skewness : float
Skewness of returns.
kurtosis : float
Excess kurtosis of returns.
track_record_length : int
Number of observations.
"""
sharpe_ratio: float
probabilistic_sharpe: float
deflated_sharpe: float
expected_max_sharpe: float
p_value: float
is_significant: bool
n_trials: int
skewness: float
kurtosis: float
track_record_length: int
[docs]
def sharpe_ratio(
returns: np.ndarray,
risk_free_rate: float = 0.0,
annualization_factor: float = 252.0,
) -> float:
"""Calculate the annualized Sharpe ratio.
Parameters
----------
returns : np.ndarray
Array of periodic returns.
risk_free_rate : float, default=0.0
Risk-free rate (same period as returns).
annualization_factor : float, default=252.0
Factor to annualize (252 for daily, 12 for monthly, 52 for weekly).
Returns
-------
float
Annualized Sharpe ratio.
Examples
--------
>>> returns = np.random.randn(252) * 0.01 + 0.0005 # Daily returns
>>> sr = sharpe_ratio(returns)
"""
returns = np.asarray(returns)
excess_returns = returns - risk_free_rate
mean_excess = np.mean(excess_returns)
std_excess = np.std(excess_returns, ddof=1)
if std_excess == 0 or np.isclose(std_excess, 0, atol=1e-10):
return 0.0
# Annualize
sr = (mean_excess / std_excess) * np.sqrt(annualization_factor)
return sr
[docs]
def sharpe_ratio_std(
sharpe: float,
n_obs: int,
skewness: float = 0.0,
kurtosis: float = 3.0,
) -> float:
"""Calculate the standard error of the Sharpe ratio estimate.
Uses the Lo (2002) / Mertens (2002) correction for non-normality.
Parameters
----------
sharpe : float
Estimated Sharpe ratio.
n_obs : int
Number of observations.
skewness : float, default=0.0
Skewness of returns.
kurtosis : float, default=3.0
Kurtosis of returns (not excess kurtosis).
Returns
-------
float
Standard error of the Sharpe ratio.
Notes
-----
The formula accounts for:
- Sampling variability
- Non-normal returns (skewness and fat tails)
References
----------
Lo, A. (2002). "The Statistics of Sharpe Ratios."
Financial Analysts Journal, 58(4), 36-52.
"""
# Excess kurtosis (kurtosis - 3 for normal)
excess_kurt = kurtosis - 3.0
# Variance of SR estimator under non-normality
# Var(SR) = (1 - skew*SR + ((kurtosis-1)/4)*SR^2) / (n-1)
variance = (
1
- skewness * sharpe
+ ((excess_kurt + 2) / 4) * sharpe ** 2
) / (n_obs - 1)
return np.sqrt(max(variance, 0))
[docs]
def probabilistic_sharpe_ratio(
sharpe: float,
benchmark_sharpe: float,
n_obs: int,
skewness: float = 0.0,
kurtosis: float = 3.0,
) -> float:
"""Calculate the Probabilistic Sharpe Ratio (PSR).
PSR is the probability that the true Sharpe ratio exceeds the benchmark,
accounting for non-normality of returns.
Parameters
----------
sharpe : float
Estimated Sharpe ratio.
benchmark_sharpe : float
Benchmark Sharpe ratio to compare against.
n_obs : int
Number of observations.
skewness : float, default=0.0
Skewness of returns.
kurtosis : float, default=3.0
Kurtosis of returns (not excess kurtosis).
Returns
-------
float
Probability in [0, 1] that true SR > benchmark SR.
Examples
--------
>>> # Test if strategy beats SR = 0
>>> psr = probabilistic_sharpe_ratio(sharpe=1.5, benchmark_sharpe=0,
... n_obs=252, skewness=-0.2, kurtosis=4.0)
>>> print(f"Probability true SR > 0: {psr:.2%}")
Notes
-----
PSR corrects for:
- Sample length (finite track record)
- Non-normal returns (skewness and fat tails)
It does NOT correct for multiple testing - use DSR for that.
References
----------
Bailey, D.H. and López de Prado, M. (2012). "The Sharpe Ratio
Efficient Frontier." Journal of Risk, 15(2), 3-44.
"""
# Standard error of SR
sr_std = sharpe_ratio_std(sharpe, n_obs, skewness, kurtosis)
if sr_std == 0:
return 1.0 if sharpe > benchmark_sharpe else 0.0
# Z-score
z = (sharpe - benchmark_sharpe) / sr_std
# PSR = Phi(z)
return float(stats.norm.cdf(z))
[docs]
def expected_max_sharpe(
n_trials: int,
sharpe_std: float,
mean_sharpe: float = 0.0,
) -> float:
"""Calculate expected maximum Sharpe ratio under null hypothesis.
This is the expected maximum SR when all strategies have true SR = mean_sharpe,
but we observe inflated values due to multiple testing.
Parameters
----------
n_trials : int
Number of independent trials/strategies tested.
sharpe_std : float
Standard deviation of Sharpe ratio estimates across trials.
mean_sharpe : float, default=0.0
Mean Sharpe ratio under null (typically 0).
Returns
-------
float
Expected maximum Sharpe ratio E[max{SR_i}].
Notes
-----
Uses the approximation from Bailey & López de Prado (2014):
E[max{SR}] ≈ μ + σ * [(1-γ)*Φ^(-1)(1-1/N) + γ*Φ^(-1)(1-1/(N*e))]
where γ is the Euler-Mascheroni constant.
Examples
--------
>>> # After 100 trials, what SR do we expect by chance?
>>> e_max = expected_max_sharpe(n_trials=100, sharpe_std=0.5)
>>> print(f"Expected max SR: {e_max:.2f}")
"""
if n_trials <= 1:
return mean_sharpe
gamma = EULER_MASCHERONI
# Quantiles
# Φ^(-1)(1 - 1/N)
q1 = stats.norm.ppf(1 - 1 / n_trials)
# Φ^(-1)(1 - 1/(N*e))
q2 = stats.norm.ppf(1 - 1 / (n_trials * np.e))
# Expected maximum
e_max = mean_sharpe + sharpe_std * ((1 - gamma) * q1 + gamma * q2)
return e_max
[docs]
def deflated_sharpe_ratio(
sharpe: float,
n_trials: int,
sharpe_std_trials: float,
n_obs: int,
skewness: float = 0.0,
kurtosis: float = 3.0,
mean_sharpe_null: float = 0.0,
) -> float:
"""Calculate the Deflated Sharpe Ratio (DSR).
DSR corrects for multiple testing by computing the probability that the
observed Sharpe ratio exceeds the expected maximum SR under the null
hypothesis that all strategies have zero true SR.
Parameters
----------
sharpe : float
Estimated Sharpe ratio of the selected strategy.
n_trials : int
Number of independent trials/strategies tested.
sharpe_std_trials : float
Standard deviation of Sharpe ratios across all trials.
n_obs : int
Number of observations (track record length).
skewness : float, default=0.0
Skewness of returns.
kurtosis : float, default=3.0
Kurtosis of returns (not excess kurtosis).
mean_sharpe_null : float, default=0.0
Mean Sharpe ratio under null hypothesis.
Returns
-------
float
Deflated Sharpe Ratio in [0, 1].
Examples
--------
>>> # Tested 100 strategies, best has SR = 2.0
>>> dsr = deflated_sharpe_ratio(
... sharpe=2.0,
... n_trials=100,
... sharpe_std_trials=0.5,
... n_obs=252,
... skewness=-0.3,
... kurtosis=4.5,
... )
>>> print(f"DSR: {dsr:.2%}")
>>> # If DSR < 0.95, the strategy may be a statistical fluke
Notes
-----
DSR answers: "What is the probability that this strategy would have
beaten random chance, given that we tested N strategies?"
A DSR of 0.95 means there's a 95% probability that the strategy's
performance is real and not due to overfitting from multiple testing.
References
----------
Bailey, D.H. and López de Prado, M. (2014). "The Deflated Sharpe Ratio:
Correcting for Selection Bias, Backtest Overfitting, and Non-Normality."
The Journal of Portfolio Management, 40(5), 94-107.
"""
# Calculate expected max SR under null
sr0 = expected_max_sharpe(n_trials, sharpe_std_trials, mean_sharpe_null)
# DSR is just PSR with benchmark = expected max SR
dsr = probabilistic_sharpe_ratio(
sharpe=sharpe,
benchmark_sharpe=sr0,
n_obs=n_obs,
skewness=skewness,
kurtosis=kurtosis,
)
return dsr
[docs]
def analyze_sharpe(
returns: np.ndarray,
n_trials: int = 1,
sharpe_std_trials: float | None = None,
all_sharpes: np.ndarray | None = None,
risk_free_rate: float = 0.0,
annualization_factor: float = 252.0,
significance_level: float = 0.05,
) -> SharpeAnalysis:
"""Comprehensive Sharpe ratio analysis with multiple testing correction.
Parameters
----------
returns : np.ndarray
Array of periodic returns for the selected strategy.
n_trials : int, default=1
Number of independent trials/strategies tested.
sharpe_std_trials : float, optional
Standard deviation of Sharpe ratios across all trials.
If not provided and all_sharpes is given, computed from all_sharpes.
If neither provided, estimated as 1/sqrt(n_obs).
all_sharpes : np.ndarray, optional
Sharpe ratios of all tested strategies (for computing variance).
risk_free_rate : float, default=0.0
Risk-free rate (same period as returns).
annualization_factor : float, default=252.0
Factor to annualize Sharpe ratio.
significance_level : float, default=0.05
Significance level for hypothesis testing.
Returns
-------
SharpeAnalysis
Comprehensive analysis results.
Examples
--------
>>> # Single strategy analysis
>>> returns = np.random.randn(252) * 0.01 + 0.0005
>>> analysis = analyze_sharpe(returns)
>>> print(f"SR: {analysis.sharpe_ratio:.2f}")
>>> print(f"PSR (SR > 0): {analysis.probabilistic_sharpe:.2%}")
>>> # Multiple testing scenario
>>> all_sharpes = np.random.randn(100) * 0.5 # 100 strategies tested
>>> best_idx = np.argmax(all_sharpes)
>>> analysis = analyze_sharpe(
... returns=best_returns,
... n_trials=100,
... all_sharpes=all_sharpes,
... )
>>> print(f"DSR: {analysis.deflated_sharpe:.2%}")
>>> print(f"Significant: {analysis.is_significant}")
"""
returns = np.asarray(returns)
n_obs = len(returns)
# Compute basic statistics
excess_returns = returns - risk_free_rate
skewness = float(stats.skew(excess_returns))
kurtosis = float(stats.kurtosis(excess_returns, fisher=False)) # Not excess
# Compute Sharpe ratio
sr = sharpe_ratio(returns, risk_free_rate, annualization_factor)
# Estimate SR standard deviation across trials
if sharpe_std_trials is None:
if all_sharpes is not None:
sharpe_std_trials = float(np.std(all_sharpes, ddof=1))
else:
# Conservative estimate: assume SR ~ N(0, 1/sqrt(T))
sharpe_std_trials = 1.0 / np.sqrt(n_obs) * np.sqrt(annualization_factor)
# Compute PSR (probability true SR > 0)
psr = probabilistic_sharpe_ratio(
sharpe=sr,
benchmark_sharpe=0.0,
n_obs=n_obs,
skewness=skewness,
kurtosis=kurtosis,
)
# Compute expected max SR under null
e_max_sr = expected_max_sharpe(n_trials, sharpe_std_trials, 0.0)
# Compute DSR
dsr = deflated_sharpe_ratio(
sharpe=sr,
n_trials=n_trials,
sharpe_std_trials=sharpe_std_trials,
n_obs=n_obs,
skewness=skewness,
kurtosis=kurtosis,
)
# P-value (1 - DSR when testing against expected max)
p_value = 1.0 - dsr
return SharpeAnalysis(
sharpe_ratio=sr,
probabilistic_sharpe=psr,
deflated_sharpe=dsr,
expected_max_sharpe=e_max_sr,
p_value=p_value,
is_significant=dsr >= (1 - significance_level),
n_trials=n_trials,
skewness=skewness,
kurtosis=kurtosis,
track_record_length=n_obs,
)
[docs]
def minimum_track_record_length(
sharpe: float,
benchmark_sharpe: float = 0.0,
confidence: float = 0.95,
skewness: float = 0.0,
kurtosis: float = 3.0,
) -> int:
"""Calculate minimum track record length needed for statistical significance.
Answers: "How many observations do we need to be confident that the
strategy's Sharpe ratio is real?"
Parameters
----------
sharpe : float
Target Sharpe ratio.
benchmark_sharpe : float, default=0.0
Benchmark to beat.
confidence : float, default=0.95
Required confidence level.
skewness : float, default=0.0
Expected skewness of returns.
kurtosis : float, default=3.0
Expected kurtosis of returns.
Returns
-------
int
Minimum number of observations needed.
Examples
--------
>>> # How long to verify SR = 1.0 strategy?
>>> n_min = minimum_track_record_length(sharpe=1.0)
>>> print(f"Need at least {n_min} observations")
Notes
-----
This is the "MinTRL" from Bailey & López de Prado (2012).
A strategy with SR = 2.0 and normal returns needs only ~16 observations.
A strategy with SR = 0.5 needs ~256 observations!
"""
if sharpe <= benchmark_sharpe:
return np.inf
# z-score for desired confidence
z = stats.norm.ppf(confidence)
# Excess kurtosis
excess_kurt = kurtosis - 3.0
# Solve for n in the PSR formula
# We need: (SR - SR_0) * sqrt(n-1) / sqrt(1 - skew*SR + ...) >= z
# Rearranging: n >= 1 + (z / (SR - SR_0))^2 * (1 - skew*SR + ...)
sr_diff = sharpe - benchmark_sharpe
variance_factor = 1 - skewness * sharpe + ((excess_kurt + 2) / 4) * sharpe ** 2
n_min = 1 + (z / sr_diff) ** 2 * variance_factor
return int(np.ceil(n_min))
[docs]
def haircut_sharpe_ratio(
sharpe: float,
n_trials: int,
sharpe_std_trials: float = 0.5,
) -> tuple[float, float]:
"""Apply haircut to Sharpe ratio for multiple testing.
Returns an adjusted Sharpe ratio that accounts for data mining.
Parameters
----------
sharpe : float
Observed Sharpe ratio.
n_trials : int
Number of strategies tested.
sharpe_std_trials : float, default=0.5
Standard deviation of SR estimates across trials.
Returns
-------
Tuple[float, float]
(haircut_sharpe, haircut_percent)
- haircut_sharpe: Adjusted Sharpe ratio
- haircut_percent: Percentage reduction applied
Examples
--------
>>> sr_adj, haircut = haircut_sharpe_ratio(sharpe=2.0, n_trials=100)
>>> print(f"Adjusted SR: {sr_adj:.2f} (haircut: {haircut:.1%})")
Notes
-----
The haircut is the expected maximum SR under null hypothesis.
The adjusted SR is: SR_adjusted = SR_observed - E[max{SR}|null]
"""
e_max = expected_max_sharpe(n_trials, sharpe_std_trials, 0.0)
haircut_sr = sharpe - e_max
haircut_pct = e_max / sharpe if sharpe != 0 else 0.0
return haircut_sr, haircut_pct
[docs]
def estimate_n_independent_trials(
sharpe_ratios: np.ndarray,
method: str = "variance",
) -> int:
"""Estimate effective number of independent trials from correlated strategies.
When strategies are correlated, the effective number of independent trials
is less than the total number tested.
Parameters
----------
sharpe_ratios : np.ndarray
Array of Sharpe ratios from all tested strategies.
method : str, default="variance"
Method to estimate N:
- "variance": Use variance ratio (conservative)
- "count": Just use the raw count (anti-conservative)
Returns
-------
int
Estimated number of independent trials.
Notes
-----
López de Prado (2018) recommends using clustering (ONC algorithm) for
more accurate estimation. This function provides simpler heuristics.
"""
n_total = len(sharpe_ratios)
if method == "count":
return n_total
elif method == "variance":
# If SRs are highly correlated, their variance will be lower
# than expected for independent trials
sr_std = np.std(sharpe_ratios, ddof=1)
# Expected std for independent trials with mean=0
# is roughly 1/sqrt(T) where T is track record length
# Here we use a heuristic based on observed variance
# If variance is low, strategies are correlated
# Conservative estimate: N_eff = N * (observed_var / expected_var)
# We cap at n_total
expected_var = 0.5 ** 2 # Assume SR ~ N(0, 0.5) for independent strategies
observed_var = sr_std ** 2
n_eff = int(np.ceil(n_total * min(observed_var / expected_var, 1.0)))
return max(1, n_eff)
else:
raise ValueError(f"Unknown method: {method}. Use 'variance' or 'count'.")
[docs]
def multiple_testing_summary(
sharpe_ratios: np.ndarray,
returns_list: list[np.ndarray] | None = None,
n_obs: int = 252,
significance_level: float = 0.05,
) -> dict:
"""Generate a summary report for multiple testing analysis.
Parameters
----------
sharpe_ratios : np.ndarray
Sharpe ratios of all tested strategies.
returns_list : List[np.ndarray], optional
List of return arrays for each strategy (for detailed stats).
n_obs : int, default=252
Number of observations per strategy.
significance_level : float, default=0.05
Significance level for testing.
Returns
-------
dict
Summary statistics including:
- n_trials: Total strategies tested
- n_effective: Estimated independent trials
- best_sharpe: Highest observed SR
- expected_max: Expected max SR under null
- best_dsr: DSR of best strategy
- haircut: Haircut percentage
- n_significant: Number passing DSR threshold
"""
n_trials = len(sharpe_ratios)
sharpe_std = np.std(sharpe_ratios, ddof=1)
best_idx = np.argmax(sharpe_ratios)
best_sharpe = sharpe_ratios[best_idx]
# Estimate effective number of trials
n_eff = estimate_n_independent_trials(sharpe_ratios)
# Expected max under null
e_max = expected_max_sharpe(n_trials, sharpe_std, 0.0)
# Compute DSR for best strategy
if returns_list is not None and len(returns_list) > best_idx:
best_returns = returns_list[best_idx]
skewness = float(stats.skew(best_returns))
kurtosis = float(stats.kurtosis(best_returns, fisher=False))
n_obs = len(best_returns)
else:
skewness = 0.0
kurtosis = 3.0
best_dsr = deflated_sharpe_ratio(
sharpe=best_sharpe,
n_trials=n_trials,
sharpe_std_trials=sharpe_std,
n_obs=n_obs,
skewness=skewness,
kurtosis=kurtosis,
)
# Count significant strategies
threshold = 1 - significance_level
n_significant = 0
for sr in sharpe_ratios:
dsr = deflated_sharpe_ratio(
sharpe=sr,
n_trials=n_trials,
sharpe_std_trials=sharpe_std,
n_obs=n_obs,
skewness=skewness,
kurtosis=kurtosis,
)
if dsr >= threshold:
n_significant += 1
# Haircut
haircut_sr, haircut_pct = haircut_sharpe_ratio(best_sharpe, n_trials, sharpe_std)
return {
"n_trials": n_trials,
"n_effective": n_eff,
"sharpe_mean": float(np.mean(sharpe_ratios)),
"sharpe_std": sharpe_std,
"best_sharpe": best_sharpe,
"expected_max_sharpe": e_max,
"best_dsr": best_dsr,
"haircut_sharpe": haircut_sr,
"haircut_percent": haircut_pct,
"n_significant": n_significant,
"significance_level": significance_level,
}