"""

Elo ratings for olmOCR vs baselines.

See data at scripts/elo/ratings.csv

    MethodA,MethodB,A_wins,B_wins,A_rate(%),B_rate(%)
    marker,mineru,53,26,67.1,32.9
    mineru,pdelf,22,55,28.6,71.4
    gotocr_format,marker,26,45,36.6,63.4
    marker,pdelf,31,49,38.8,61.3
    gotocr_format,pdelf,29,41,41.4,58.6
    gotocr_format,mineru,38,37,50.7,49.3

Invoke via
    python calculate_elo_ratings.py ratings.csv --num-bootstrap 5000 --num-elo-sims 100 --confidence-level 95 --seed 123

Output:

    Bootstrapped Elo Ratings (95% CI):
    --------------------------------------------------
    pdelf        1813.0 ± 84.9 [1605.9, 1930.0]
    mineru       1545.2 ± 99.7 [1336.7, 1714.1]
    marker       1429.1 ± 100.7 [1267.6, 1645.5]
    gotocr_format 1212.7 ± 82.0 [1097.3, 1408.3]

    Pairwise Significance Tests:
    --------------------------------------------------
    gotocr_format vs marker       Δ = -216.3 [-470.8,  135.0] p = 0.218
    gotocr_format vs mineru       Δ = -332.5 [-567.5,   19.3] p = 0.051
    gotocr_format vs pdelf        Δ = -600.3 [-826.1, -344.3] p = 0.000*
    marker       vs mineru       Δ = -116.1 [-365.4,  246.5] p = 0.430
    marker       vs pdelf        Δ = -383.9 [-610.6,  -10.9] p = 0.044*
    mineru       vs pdelf        Δ = -267.8 [-517.3,  104.0] p = 0.135


@kylel

"""

import random
from itertools import combinations

import click
import numpy as np
import pandas as pd
from tqdm import tqdm


def calculate_elo(matches_data, all_methods, k_factor=32, initial_rating=1500, n_replications=10, random_state=None):
    """Calculate Elo ratings averaged over several replications, each replaying
    the matches in a fresh random order (Elo updates are order-dependent)."""
    all_ratings = {method: [] for method in all_methods}

    for rep in range(n_replications):
        # Offset the seed per replication so a fixed random_state still yields
        # a different shuffle each time (a single fixed seed would repeat it).
        rs = None if random_state is None else random_state + rep
        matches = matches_data.sample(frac=1, replace=False, random_state=rs).reset_index(drop=True)
        ratings = {method: initial_rating for method in all_methods}

        for _, row in matches.iterrows():
            method_a, method_b = row["MethodA"], row["MethodB"]
            a_wins, b_wins = row["A_wins"], row["B_wins"]

            # Replay each recorded win as an individual Elo update.
            for _ in range(int(a_wins)):
                ra, rb = update_single_match(ratings[method_a], ratings[method_b], 1, k_factor)
                ratings[method_a], ratings[method_b] = ra, rb

            for _ in range(int(b_wins)):
                ra, rb = update_single_match(ratings[method_a], ratings[method_b], 0, k_factor)
                ratings[method_a], ratings[method_b] = ra, rb

        for method in all_methods:
            all_ratings[method].append(ratings[method])

    return {method: np.mean(ratings) for method, ratings in all_ratings.items()}
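
# Illustrative usage (a sketch, not part of the CLI below): point-estimate
# ratings on the full dataset, assuming `df` holds the CSV from the module
# docstring:
#
#   methods = set(df["MethodA"]) | set(df["MethodB"])
#   point_ratings = calculate_elo(df, methods, n_replications=50)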


def update_single_match(rating_a, rating_b, actual_score, k_factor):
    """Update ratings for a single match (actual_score is 1 if A won, 0 if B won)"""
    # Standard Elo expected score: logistic in the rating gap, 400-point scale.
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    new_rating_a = rating_a + k_factor * (actual_score - expected_a)
    # Zero-sum update: B gains exactly what A loses.
    new_rating_b = rating_b + k_factor * ((1 - actual_score) - (1 - expected_a))
    return new_rating_a, new_rating_b
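
# Worked example: with both methods at the initial 1500 rating, expected_a is
# 0.5, so a single A win moves each rating by k_factor / 2 = 16 points:
#
#   update_single_match(1500, 1500, 1, 32)  # -> (1516.0, 1484.0)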


def bootstrap_elo_and_tests(df, num_bootstrap=1000, num_elo_sims=10, confidence_level=95, k_factor=32, initial_rating=1500, random_state=None):
    """Calculate bootstrapped Elo ratings with confidence intervals and perform pairwise significance tests"""

    ci_lower = (100 - confidence_level) / 2
    ci_upper = 100 - ci_lower

    all_methods = set(df["MethodA"].unique()) | set(df["MethodB"].unique())
    bootstrap_ratings = {method: [] for method in all_methods}

    for i in tqdm(range(num_bootstrap)):
        # Resample the match pairs with replacement; offset the seed per
        # iteration so a fixed random_state still gives distinct resamples.
        rs = None if random_state is None else random_state + i
        bootstrap_sample = df.sample(n=len(df), replace=True, random_state=rs)
        ratings = calculate_elo(bootstrap_sample, all_methods, k_factor, initial_rating, num_elo_sims)

        for method in all_methods:
            bootstrap_ratings[method].append(ratings[method])

    # Calculate statistics and perform significance tests
    results = {}

    # Basic statistics
    for method in all_methods:
        ratings_array = np.array(bootstrap_ratings[method])
        results[method] = {
            "mean": np.mean(ratings_array),
            "std": np.std(ratings_array),
            "ci_lower": np.percentile(ratings_array, ci_lower),
            "ci_upper": np.percentile(ratings_array, ci_upper),
            "bootstrap_samples": ratings_array,  # Store for significance testing
        }

    # Pairwise significance tests
    significance_tests = {}
    for method1, method2 in combinations(all_methods, 2):
        # Calculate difference distribution
        diff_distribution = results[method1]["bootstrap_samples"] - results[method2]["bootstrap_samples"]

        # Two-tailed bootstrap p-value: twice the smaller tail fraction of the
        # bootstrap difference distribution, capped at 1.
        p_value = min(1.0, 2 * min(np.mean(diff_distribution >= 0), np.mean(diff_distribution <= 0)))

        # Store results
        significance_tests[(method1, method2)] = {
            "diff_mean": np.mean(diff_distribution),
            "diff_ci_lower": np.percentile(diff_distribution, ci_lower),
            "diff_ci_upper": np.percentile(diff_distribution, ci_upper),
            "p_value": p_value,
        }

    return results, significance_tests
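
# Sketch of consuming the return values (assumed usage, mirroring main() below):
#
#   results, tests = bootstrap_elo_and_tests(df, num_bootstrap=200, num_elo_sims=5)
#   for (m1, m2), stats in tests.items():  # keys are (method1, method2) tuples
#       print(m1, m2, stats["diff_mean"], stats["p_value"])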


@click.command()
@click.argument("ratings_file", type=click.Path(exists=True))
@click.option("--num-bootstrap", default=1000, help="Number of bootstrap iterations")
@click.option("--num-elo-sims", default=10, help="Number of ELO simulations per bootstrap")
@click.option("--confidence-level", default=95, help="Confidence level for intervals (in percent)")
@click.option("--seed", default=42, help="Random seed for reproducibility")
def main(ratings_file, num_bootstrap, num_elo_sims, confidence_level, seed):
    # Seed the global RNGs for reproducibility; the pandas .sample() calls in
    # the functions above draw from NumPy's global state when random_state is None.
    random.seed(seed)
    np.random.seed(seed)

    # Load data
    df = pd.read_csv(ratings_file)

    # Calculate bootstrapped Elo ratings
    results, significance_tests = bootstrap_elo_and_tests(df, num_bootstrap=num_bootstrap, num_elo_sims=num_elo_sims, confidence_level=confidence_level)

    # Sort and display results
    print(f"\nBootstrapped Elo Ratings ({confidence_level}% CI):")
    print("-" * 50)
    sorted_results = dict(sorted(results.items(), key=lambda x: x[1]["mean"], reverse=True))
    for method, stats in sorted_results.items():
        print(f"{method:12} {stats['mean']:6.1f} ± {stats['std']:4.1f} [{stats['ci_lower']:6.1f}, {stats['ci_upper']:6.1f}]")

    # Display pairwise significance tests
    print("\nPairwise Significance Tests:")
    print("-" * 50)
    for (method1, method2), stats in significance_tests.items():
        # Flag pairs significant at alpha = 1 - confidence_level/100 (0.05 for 95%).
        sig_marker = "*" if stats["p_value"] < (1 - confidence_level / 100) else " "
        print(
            f"{method1:12} vs {method2:12} Δ = {stats['diff_mean']:6.1f} "
            + f"[{stats['diff_ci_lower']:6.1f}, {stats['diff_ci_upper']:6.1f}] "
            + f"p = {stats['p_value']:.3f}{sig_marker}"
        )


if __name__ == "__main__":
    main()