Performance Evaluation¶
Evaluating FDR control methods requires careful measurement of statistical performance, computational efficiency, and robustness. This guide covers the evaluation utilities and best practices for assessing method performance.
Core Evaluation Metrics¶
False Discovery Rate (FDR)¶
The primary metric for multiple testing correction:

\[
\text{FDR} = \mathbb{E}\left[\frac{V}{\max(R, 1)}\right]
\]

where \(V\) = number of false discoveries and \(R\) = total number of discoveries.
from online_fdr.utils.evaluation import calculate_sfdr
# Example evaluation
true_positives = 45 # Correctly rejected nulls
false_positives = 5 # Incorrectly rejected nulls
total_discoveries = true_positives + false_positives
# Calculate smoothed FDR (handles R=0 case)
empirical_fdr = calculate_sfdr(true_positives, false_positives)
print(f"Discoveries: {total_discoveries}")
print(f"False discoveries: {false_positives}")
print(f"Empirical FDR: {empirical_fdr:.3f}")
print(f"Target FDR: 0.05")
print(f"FDR controlled: {'✓' if empirical_fdr <= 0.05 else '✗'}")
Statistical Power¶
The proportion of true alternatives correctly identified:

\[
\text{Power} = \frac{TP}{TP + FN}
\]

where \(TP\) = correctly rejected alternatives (true positives) and \(FN\) = missed alternatives (false negatives).
from online_fdr.utils.evaluation import calculate_power
# Power calculation
true_positives = 45
false_negatives = 15 # Missed true alternatives
total_alternatives = true_positives + false_negatives
power = calculate_power(true_positives, false_negatives)
print(f"True alternatives: {total_alternatives}")
print(f"Correctly identified: {true_positives}")
print(f"Statistical power: {power:.3f}")
Comprehensive Performance Assessment¶
def evaluate_method_performance(decisions, true_labels, alpha=0.05):
"""Comprehensive performance evaluation."""
from online_fdr.utils.evaluation import calculate_sfdr, calculate_power
# Convert to boolean arrays if needed
decisions = [bool(d) for d in decisions]
true_labels = [bool(t) for t in true_labels]
# Calculate confusion matrix elements
true_positives = sum(d and t for d, t in zip(decisions, true_labels))
false_positives = sum(d and not t for d, t in zip(decisions, true_labels))
true_negatives = sum(not d and not t for d, t in zip(decisions, true_labels))
false_negatives = sum(not d and t for d, t in zip(decisions, true_labels))
# Primary metrics
total_discoveries = true_positives + false_positives
total_alternatives = true_positives + false_negatives
total_nulls = false_positives + true_negatives
empirical_fdr = calculate_sfdr(true_positives, false_positives)
power = calculate_power(true_positives, false_negatives)
# Additional metrics
precision = true_positives / max(total_discoveries, 1)
specificity = true_negatives / max(total_nulls, 1)
results = {
'total_tests': len(decisions),
'total_discoveries': total_discoveries,
'total_alternatives': total_alternatives,
'true_positives': true_positives,
'false_positives': false_positives,
'true_negatives': true_negatives,
'false_negatives': false_negatives,
'empirical_fdr': empirical_fdr,
'power': power,
'precision': precision,
'specificity': specificity,
'fdr_controlled': empirical_fdr <= alpha * 1.1, # 10% tolerance
'discovery_rate': total_discoveries / len(decisions)
}
return results
# Example usage
decisions = [True, False, True, True, False, False, True, False]
true_labels = [True, False, False, True, True, False, True, False]
performance = evaluate_method_performance(decisions, true_labels, alpha=0.1)
print("Performance Evaluation Results:")
print("=" * 40)
for metric, value in performance.items():
if isinstance(value, float):
print(f"{metric:>20}: {value:.3f}")
elif isinstance(value, bool):
print(f"{metric:>20}: {'✓' if value else '✗'}")
else:
print(f"{metric:>20}: {value}")
Online-Specific Metrics¶
Temporal FDR Control¶
For sequential testing, we can monitor FDR over time:
class OnlineFDRTracker:
"""Track FDR control during sequential testing."""
def __init__(self):
self.decisions = []
self.true_labels = []
self.fdr_history = []
self.power_history = []
def update(self, decision, true_label):
"""Update with new test result."""
self.decisions.append(decision)
self.true_labels.append(true_label)
# Calculate current FDR and power
tp = sum(d and t for d, t in zip(self.decisions, self.true_labels))
fp = sum(d and not t for d, t in zip(self.decisions, self.true_labels))
fn = sum(not d and t for d, t in zip(self.decisions, self.true_labels))
current_fdr = fp / max(tp + fp, 1)
current_power = tp / max(tp + fn, 1)
self.fdr_history.append(current_fdr)
self.power_history.append(current_power)
return current_fdr, current_power
def plot_history(self):
"""Plot FDR and power over time."""
try:
import matplotlib.pyplot as plt
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))
# FDR over time
ax1.plot(self.fdr_history, 'b-', label='Empirical FDR')
ax1.axhline(y=0.05, color='r', linestyle='--', label='Target FDR')
ax1.set_ylabel('FDR')
ax1.set_title('FDR Control Over Time')
ax1.legend()
ax1.grid(True, alpha=0.3)
# Power over time
ax2.plot(self.power_history, 'g-', label='Power')
ax2.set_xlabel('Test Number')
ax2.set_ylabel('Power')
ax2.set_title('Power Over Time')
ax2.legend()
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
except ImportError:
print("Matplotlib not available for plotting")
# Example usage with sequential testing
from online_fdr.investing.addis.addis import Addis
from online_fdr.utils.generation import DataGenerator, GaussianLocationModel
# Set up simulation
dgp = GaussianLocationModel(alt_mean=2.0, alt_std=1.0, one_sided=True)
generator = DataGenerator(n=200, pi0=0.9, dgp=dgp)
addis = Addis(alpha=0.05, wealth=0.025, lambda_=0.25, tau=0.5)
# Track performance over time
tracker = OnlineFDRTracker()
print("Sequential FDR Tracking:")
for i in range(50):
p_value, is_alternative = generator.sample_one()
decision = addis.test_one(p_value)
current_fdr, current_power = tracker.update(decision, is_alternative)
if decision:
print(f"Test {i+1}: Discovery! FDR={current_fdr:.3f}, Power={current_power:.3f}")
print(f"\nFinal FDR: {tracker.fdr_history[-1]:.3f}")
print(f"Final Power: {tracker.power_history[-1]:.3f}")
Memory-Decay FDR¶
For non-stationary sequences, use memory-decay FDR, which down-weights the contribution of older tests.
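As a rough sketch of the idea (the exact weighting implemented by MemoryDecayFDR may differ), a decay parameter \(\delta \in (0, 1]\) geometrically discounts past tests:

\[
\text{mem-FDR}_{\delta}(T) = \mathbb{E}\left[\frac{\sum_{t \le T} \delta^{T-t} V_t}{\max\!\left(\sum_{t \le T} \delta^{T-t} R_t,\ 1\right)}\right]
\]

where \(V_t\) and \(R_t\) indicate a false discovery and a discovery at test \(t\). Setting \(\delta = 1\) recovers the ordinary FDR: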
from online_fdr.utils.evaluation import MemoryDecayFDR
# Memory-decay FDR with forgetting factor
memory_fdr = MemoryDecayFDR(delta=0.99, offset=0)
print("Memory-Decay FDR Evaluation:")
# Simulate a sequence with time-varying signal
decisions = [True, False, True, False, False, True, True, False]
true_labels = [True, False, False, False, True, True, True, False]
for i, (decision, true_label) in enumerate(zip(decisions, true_labels)):
current_fdr = memory_fdr.score_one(decision, true_label)
print(f"Test {i+1}: Decision={decision}, FDR={current_fdr:.4f}")
print(f"Final memory-decay FDR: {current_fdr:.4f}")
Method Comparison Framework¶
Simulation-Based Comparison¶
def compare_methods_simulation(methods, simulation_configs, n_reps=100):
"""Comprehensive method comparison via simulation."""
from online_fdr.utils.generation import DataGenerator, GaussianLocationModel
import random
results = {name: [] for name in methods.keys()}
print(f"Method Comparison Study ({n_reps} replications)")
print("=" * 50)
for rep in range(n_reps):
if (rep + 1) % 20 == 0:
print(f"Replication {rep + 1}/{n_reps}")
# Generate data for this replication
dgp = GaussianLocationModel(**simulation_configs['dgp_params'])
generator = DataGenerator(**simulation_configs['generator_params'], dgp=dgp)
p_values = []
true_labels = []
for _ in range(simulation_configs['n_tests']):
p_val, is_alt = generator.sample_one()
p_values.append(p_val)
true_labels.append(is_alt)
# Test each method
for method_name, method_class in methods.items():
# Create fresh instance for each replication
method = method_class()
# Apply method
if hasattr(method, 'test_batch'):
decisions = method.test_batch(p_values)
else:
decisions = [method.test_one(p) for p in p_values]
# Evaluate performance
performance = evaluate_method_performance(
decisions, true_labels,
alpha=simulation_configs.get('alpha', 0.05)
)
results[method_name].append(performance)
# Aggregate results
print("\nAggregated Results:")
print("=" * 50)
summary = {}
for method_name, rep_results in results.items():
# Calculate means and standard errors
fdr_values = [r['empirical_fdr'] for r in rep_results]
power_values = [r['power'] for r in rep_results]
discovery_values = [r['total_discoveries'] for r in rep_results]
mean_fdr = sum(fdr_values) / len(fdr_values)
se_fdr = (sum((x - mean_fdr)**2 for x in fdr_values) / len(fdr_values))**0.5 / len(fdr_values)**0.5
mean_power = sum(power_values) / len(power_values)
se_power = (sum((x - mean_power)**2 for x in power_values) / len(power_values))**0.5 / len(power_values)**0.5
mean_discoveries = sum(discovery_values) / len(discovery_values)
fdr_control_rate = sum(1 for r in rep_results if r['fdr_controlled']) / len(rep_results)
summary[method_name] = {
'mean_fdr': mean_fdr,
'se_fdr': se_fdr,
'mean_power': mean_power,
'se_power': se_power,
'mean_discoveries': mean_discoveries,
'fdr_control_rate': fdr_control_rate
}
print(f"{method_name:>12}: FDR={mean_fdr:.3f}±{se_fdr:.3f}, "
f"Power={mean_power:.3f}±{se_power:.3f}, "
f"Discoveries={mean_discoveries:.1f}, "
f"Control={fdr_control_rate:.0%}")
return summary
# Example comparison
from online_fdr.investing.addis.addis import Addis
from online_fdr.investing.lord.three import LordThree
from online_fdr.batching.bh import BatchBH
methods_to_compare = {
'ADDIS': lambda: Addis(alpha=0.05, wealth=0.025, lambda_=0.25, tau=0.5),
'LORD3': lambda: LordThree(alpha=0.05, wealth=0.025, reward=0.05),
'BatchBH': lambda: BatchBH(alpha=0.05)
}
sim_config = {
'n_tests': 500,
'alpha': 0.05,
'dgp_params': {'alt_mean': 2.0, 'alt_std': 1.0, 'one_sided': True},
'generator_params': {'n': 500, 'pi0': 0.9}
}
comparison_results = compare_methods_simulation(
methods_to_compare, sim_config, n_reps=50
)
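The summary returned by compare_methods_simulation can be post-processed to shortlist methods. Below is a minimal sketch using only the keys defined above; the 95% control threshold and the power-based ranking are illustrative choices, not library conventions:

# Keep methods that controlled FDR in at least 95% of replications,
# then rank the survivors by mean power (highest first)
controlled = {
    name: res for name, res in comparison_results.items()
    if res['fdr_control_rate'] >= 0.95
}
ranked = sorted(controlled.items(), key=lambda kv: kv[1]['mean_power'], reverse=True)
for name, res in ranked:
    print(f"{name}: power={res['mean_power']:.3f}, FDR={res['mean_fdr']:.3f}")
if ranked:
    print(f"Preferred method: {ranked[0][0]}")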
Computational Performance Evaluation¶
Speed and Memory Benchmarks¶
import time
import sys
def benchmark_method_performance(method, p_values, n_runs=10):
"""Benchmark computational performance."""
print(f"Benchmarking {method.__class__.__name__}:")
# Time benchmarking
times = []
for run in range(n_runs):
        # Create a fresh instance for each run; this assumes the method's
        # constructor accepts the instance's attribute dict as keyword
        # arguments, which may not hold for every class
        fresh_method = method.__class__(**method.__dict__)
start_time = time.time()
# Run method
if hasattr(fresh_method, 'test_batch'):
results = fresh_method.test_batch(p_values)
else:
results = [fresh_method.test_one(p) for p in p_values]
end_time = time.time()
times.append(end_time - start_time)
# Calculate statistics
mean_time = sum(times) / len(times)
std_time = (sum((t - mean_time)**2 for t in times) / len(times))**0.5
    # Memory usage (shallow object sizes only; list elements are not counted)
    memory_usage = sys.getsizeof(method) + sys.getsizeof(p_values)
# Throughput
throughput = len(p_values) / mean_time
print(f" Mean time: {mean_time:.4f} ± {std_time:.4f} seconds")
print(f" Throughput: {throughput:.0f} tests/second")
print(f" Memory usage: {memory_usage/1024:.1f} KB")
print(f" Time per test: {mean_time/len(p_values)*1000:.3f} ms")
return {
'mean_time': mean_time,
'std_time': std_time,
'throughput': throughput,
'memory_usage': memory_usage
}
# Benchmark different methods
import random
random.seed(42)
test_p_values = [random.uniform(0, 1) for _ in range(1000)]
print("Performance Benchmarks:")
print("=" * 30)
# Benchmark ADDIS
addis = Addis(alpha=0.05, wealth=0.025, lambda_=0.25, tau=0.5)
addis_perf = benchmark_method_performance(addis, test_p_values)
print()
# Benchmark BatchBH
bh = BatchBH(alpha=0.05)
bh_perf = benchmark_method_performance(bh, test_p_values)
print(f"\nSpeedup: {addis_perf['mean_time'] / bh_perf['mean_time']:.1f}x "
f"({'BatchBH' if bh_perf['mean_time'] < addis_perf['mean_time'] else 'ADDIS'} faster)")
Robustness Testing¶
Parameter Sensitivity Analysis¶
def parameter_sensitivity_analysis(method_class, param_grid, base_params):
"""Analyze method sensitivity to parameter changes."""
from online_fdr.utils.generation import DataGenerator, GaussianLocationModel
print("Parameter Sensitivity Analysis:")
print("=" * 40)
# Generate fixed test data
dgp = GaussianLocationModel(alt_mean=2.0, alt_std=1.0, one_sided=True)
generator = DataGenerator(n=200, pi0=0.9, dgp=dgp)
p_values = []
true_labels = []
for _ in range(200):
p_val, is_alt = generator.sample_one()
p_values.append(p_val)
true_labels.append(is_alt)
results = {}
for param_name, param_values in param_grid.items():
print(f"\nTesting {param_name}:")
param_results = []
for param_value in param_values:
# Create method with modified parameter
test_params = base_params.copy()
test_params[param_name] = param_value
method = method_class(**test_params)
# Test method
decisions = [method.test_one(p) for p in p_values]
# Evaluate
performance = evaluate_method_performance(decisions, true_labels)
param_results.append({
'param_value': param_value,
'fdr': performance['empirical_fdr'],
'power': performance['power'],
'discoveries': performance['total_discoveries']
})
print(f" {param_name}={param_value}: "
f"FDR={performance['empirical_fdr']:.3f}, "
f"Power={performance['power']:.3f}")
results[param_name] = param_results
return results
# Example: ADDIS parameter sensitivity
addis_param_grid = {
'wealth': [0.01, 0.025, 0.05, 0.075],
'lambda_': [0.1, 0.25, 0.5, 0.75],
'tau': [0.3, 0.5, 0.7, 0.9]
}
addis_base_params = {
'alpha': 0.05,
'wealth': 0.025,
'lambda_': 0.25,
'tau': 0.5
}
sensitivity_results = parameter_sensitivity_analysis(
Addis, addis_param_grid, addis_base_params
)
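The nested dictionary returned by parameter_sensitivity_analysis can be condensed into per-parameter ranges to see which parameter moves FDR and power the most. A small sketch using only the keys defined above:

# Report the spread of FDR and power across each parameter's grid
for param_name, runs in sensitivity_results.items():
    fdrs = [r['fdr'] for r in runs]
    powers = [r['power'] for r in runs]
    print(f"{param_name:>8}: FDR in [{min(fdrs):.3f}, {max(fdrs):.3f}], "
          f"power in [{min(powers):.3f}, {max(powers):.3f}]")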
Dependency Structure Robustness¶
def test_dependency_robustness(method, dependency_scenarios):
"""Test method robustness under different dependency structures."""
    import numpy as np
    from scipy.stats import norm
print("Dependency Robustness Testing:")
print("=" * 35)
n_tests = 500
n_alternatives = 50
results = {}
for scenario_name, correlation in dependency_scenarios.items():
print(f"\nTesting {scenario_name} (ρ={correlation}):")
# Generate correlated test statistics
if correlation == 0:
# Independent case
test_stats = np.random.normal(0, 1, n_tests)
else:
# Correlated case - use AR(1) structure
test_stats = np.zeros(n_tests)
test_stats[0] = np.random.normal(0, 1)
for i in range(1, n_tests):
test_stats[i] = (correlation * test_stats[i-1] +
np.sqrt(1 - correlation**2) * np.random.normal(0, 1))
# Add signal to some tests (alternatives)
alt_indices = np.random.choice(n_tests, n_alternatives, replace=False)
test_stats[alt_indices] += 2.0 # Add signal
        # Convert test statistics to two-sided p-values
        p_values = 2 * (1 - norm.cdf(np.abs(test_stats)))
true_labels = np.zeros(n_tests, dtype=bool)
true_labels[alt_indices] = True
# Test method
if hasattr(method, 'test_batch'):
decisions = method.test_batch(p_values.tolist())
else:
            # Sequential methods need a fresh instance per scenario; this
            # assumes constructor kwargs mirror the instance attributes
            fresh_method = method.__class__(**method.__dict__)
decisions = [fresh_method.test_one(p) for p in p_values]
# Evaluate
performance = evaluate_method_performance(decisions, true_labels.tolist())
results[scenario_name] = performance
print(f" FDR: {performance['empirical_fdr']:.3f}")
print(f" Power: {performance['power']:.3f}")
print(f" FDR controlled: {'✓' if performance['fdr_controlled'] else '✗'}")
return results
# Test dependency robustness
dependency_scenarios = {
'Independent': 0.0,
'Weak positive': 0.3,
'Moderate positive': 0.6,
'Strong positive': 0.9
}
bh = BatchBH(alpha=0.05)
dependency_results = test_dependency_robustness(bh, dependency_scenarios)
Evaluation Best Practices¶
Comprehensive Evaluation Checklist¶
Statistical Performance
- ✅ FDR control under null (empirical FDR ≤ α)
- ✅ Power comparison across effect sizes
- ✅ Robustness to parameter misspecification
- ✅ Performance under different dependency structures
Computational Performance
- ✅ Scalability to large numbers of tests
- ✅ Memory efficiency
- ✅ Real-time processing capability (for online methods)
Practical Considerations
- ✅ Parameter sensitivity analysis
- ✅ Robustness to model misspecification
- ✅ Performance with realistic effect sizes
- ✅ Behavior in edge cases (no discoveries, all discoveries; see the sketch below)
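Edge cases are easy to sanity-check by passing degenerate decision vectors through evaluate_method_performance defined earlier. A minimal sketch with made-up labels; the FDR reported when nothing is rejected depends on how calculate_sfdr handles \(R = 0\):

# Hypothetical ground truth, for illustration only
edge_labels = [True, False, True, False, False, True]
# Edge case 1: the method rejects nothing
none_rejected = evaluate_method_performance([False] * len(edge_labels), edge_labels)
# Edge case 2: the method rejects everything
all_rejected = evaluate_method_performance([True] * len(edge_labels), edge_labels)
print(f"No discoveries : FDR={none_rejected['empirical_fdr']:.3f}, power={none_rejected['power']:.3f}")
print(f"All discoveries: FDR={all_rejected['empirical_fdr']:.3f}, power={all_rejected['power']:.3f}")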
Reporting Guidelines¶
def generate_evaluation_report(results, method_name, simulation_config):
"""Generate standardized evaluation report."""
report = f"""
Method Evaluation Report: {method_name}
{'='*50}
Simulation Configuration:
- Number of tests: {simulation_config.get('n_tests', 'Unknown')}
- Null proportion (π₀): {simulation_config.get('pi0', 'Unknown')}
- Effect size: {simulation_config.get('effect_size', 'Unknown')}
- Target FDR (α): {simulation_config.get('alpha', 'Unknown')}
- Replications: {simulation_config.get('n_reps', 'Unknown')}
Performance Results:
- Empirical FDR: {results.get('mean_fdr', 0):.3f} ± {results.get('se_fdr', 0):.3f}
- Statistical Power: {results.get('mean_power', 0):.3f} ± {results.get('se_power', 0):.3f}
- Average Discoveries: {results.get('mean_discoveries', 0):.1f}
- FDR Control Rate: {results.get('fdr_control_rate', 0):.1%}
Interpretation:
"""
# Add interpretation
fdr_controlled = results.get('mean_fdr', 1) <= simulation_config.get('alpha', 0.05) * 1.1
if fdr_controlled:
report += "✓ FDR is successfully controlled\n"
else:
report += "✗ FDR control violation detected\n"
power = results.get('mean_power', 0)
if power >= 0.8:
report += "✓ High statistical power achieved\n"
elif power >= 0.5:
report += "~ Moderate statistical power\n"
else:
report += "✗ Low statistical power\n"
control_rate = results.get('fdr_control_rate', 0)
if control_rate >= 0.95:
report += "✓ Consistent FDR control across replications\n"
else:
report += f"⚠ FDR control inconsistent ({control_rate:.0%} of replications)\n"
return report
# Example report generation
sample_results = {
'mean_fdr': 0.047,
'se_fdr': 0.012,
'mean_power': 0.73,
'se_power': 0.05,
'mean_discoveries': 42.3,
'fdr_control_rate': 0.96
}
sample_config = {
'n_tests': 500,
'pi0': 0.9,
'effect_size': 2.0,
'alpha': 0.05,
'n_reps': 100
}
print(generate_evaluation_report(sample_results, "ADDIS", sample_config))
Next Steps¶
- Apply these evaluation techniques in the examples
- Learn about data generation for simulation studies
- Explore the theory section for the theoretical foundations behind these guarantees