Skip to content

Cross-validation Conformal Detection

This example demonstrates how to use k-fold cross-validation for conformal anomaly detection.

Setup

import numpy as np
from pyod.models.lof import LOF
from sklearn.datasets import load_breast_cancer
from scipy.stats import false_discovery_control
from nonconform.estimation import ConformalDetector
from nonconform.strategy import CrossValidation
from nonconform.utils.func import Aggregation

# Load example data
data = load_breast_cancer()
X = data.data
y = data.target

Basic Usage

# Initialize base detector
base_detector = LOF()

# Create cross-validation strategy
cv_strategy = CrossValidation(k=5)

# Initialize detector with cross-validation strategy
detector = ConformalDetector(
    detector=base_detector,
    strategy=cv_strategy,
    aggregation=Aggregation.MEDIAN,
    seed=42,
)

# Fit and predict
detector.fit(X)
p_values = detector.predict(X, raw=False)

# Detect anomalies
anomalies = p_values < 0.05
print(f"Number of anomalies detected: {anomalies.sum()}")

Cross-Validation Plus Mode

# Use plus mode to retain all fold models
cv_plus_strategy = CrossValidation(k=5, plus=True)

detector_plus = ConformalDetector(
    detector=base_detector,
    strategy=cv_plus_strategy,
    aggregation=Aggregation.MEDIAN,
    seed=42
)

# Fit and predict with ensemble
detector_plus.fit(X)
p_values_plus = detector_plus.predict(X, raw=False)

print(f"CV detections: {(p_values < 0.05).sum()}")
print(f"CV+ detections: {(p_values_plus < 0.05).sum()}")

Comparing Different Numbers of Folds

# Try different numbers of folds
fold_options = [3, 5, 10]

results = {}
for n_folds in fold_options:
    strategy = CrossValidation(k=n_folds)
    detector = ConformalDetector(
        detector=base_detector,
        strategy=strategy,
        aggregation=Aggregation.MEDIAN,
        seed=42,
    )
    detector.fit(X)
    p_vals = detector.predict(X, raw=False)

    results[f"{n_folds}-fold"] = (p_vals < 0.05).sum()
    print(f"{n_folds}-fold CV: {results[f'{n_folds}-fold']} detections")

FDR Control

# Apply FDR control to cross-validation p-values
adjusted_p_values = false_discovery_control(p_values, method='bh')
discoveries = adjusted_p_values < 0.05

print(f"\nFDR Control Results:")
print(f"Discoveries: {discoveries.sum()}")
print(f"Original detections: {(p_values < 0.05).sum()}")
print(f"FDR-controlled precision improvement: {(discoveries.sum() / max(1, (p_values < 0.05).sum())):.2%}")

Stratified Cross-Validation

from sklearn.model_selection import StratifiedKFold

# Create stratified CV strategy (useful when you have class labels)
# Note: In anomaly detection, we typically don't have labels during training,
# but this example shows how to use it if you do have some labeled data
stratified_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# For demonstration, create synthetic labels based on anomaly scores
# In practice, you might have some labeled normal/anomaly data
temp_detector = LOF(contamination=0.1)
temp_detector.fit(X)
synthetic_labels = (temp_detector.decision_function(X) > np.percentile(temp_detector.decision_function(X), 90)).astype(int)

# Use stratified splits
for fold, (train_idx, val_idx) in enumerate(stratified_cv.split(X, synthetic_labels)):
    print(f"Fold {fold + 1}: Train size = {len(train_idx)}, Val size = {len(val_idx)}")

Cross-Validation Stability Analysis

import matplotlib.pyplot as plt

# Analyze stability across different random seeds
seeds = range(10)
cv_results = []

for seed in seeds:
    detector = ConformalDetector(
        detector=base_detector,
        strategy=CrossValidation(k=5),
        aggregation=Aggregation.MEDIAN,
        seed=seed,
    )
    detector.fit(X)
    p_vals = detector.predict(X, raw=False)
    cv_results.append((p_vals < 0.05).sum())

plt.figure(figsize=(10, 5))

plt.subplot(1, 2, 1)
plt.plot(seeds, cv_results, 'o-')
plt.xlabel('Random Seed')
plt.ylabel('Number of Detections')
plt.title('CV Detection Stability Across Seeds')
plt.grid(True, alpha=0.3)

plt.subplot(1, 2, 2)
plt.boxplot(cv_results)
plt.ylabel('Number of Detections')
plt.title('CV Detection Distribution')
plt.xticks([1], ['5-fold CV'])

plt.tight_layout()
plt.show()

print(f"Mean detections: {np.mean(cv_results):.1f}")
print(f"Std detections: {np.std(cv_results):.1f}")

Comparison with Other Strategies

from nonconform.strategy.split import Split
from nonconform.strategy.bootstrap import Bootstrap
from nonconform.strategy.jackknife import Jackknife

# Compare cross-validation with other strategies
strategies = {
    'Split': Split(n_calib=0.2),
    '5-fold CV': CrossValidation(k=5),
    '10-fold CV': CrossValidation(k=10),
    'Bootstrap': Bootstrap(n_bootstraps=100, resampling_ratio=0.8),
    'Jackknife': Jackknife()
}

comparison_results = {}
for name, strategy in strategies.items():
    detector = ConformalDetector(
        detector=base_detector,
        strategy=strategy,
        aggregation=Aggregation.MEDIAN,
        seed=42,
    )
    detector.fit(X)
    p_vals = detector.predict(X, raw=False)

    # Apply FDR control
    adj_p_vals = false_discovery_control(p_vals, method='bh')

    comparison_results[name] = {
        'raw_detections': (p_vals < 0.05).sum(),
        'fdr_detections': (adj_p_vals < 0.05).sum(),
        'min_p': p_vals.min(),
        'calibration_size': len(detector.calibration_set)
    }

print("\nStrategy Comparison:")
print("-" * 70)
print(f"{'Strategy':<15} {'Raw Det.':<10} {'FDR Det.':<10} {'Min p-val':<12} {'Cal. Size':<10}")
print("-" * 70)
for name, results in comparison_results.items():
    print(f"{name:<15} {results['raw_detections']:<10} {results['fdr_detections']:<10} "
          f"{results['min_p']:<12.4f} {results['calibration_size']:<10}")

Next Steps