OpenDAS / opencompass · Commits

Commit 8a8987be (unverified)
Authored May 15, 2024 by bittersweet1999
Committed by GitHub on May 15, 2024

fix arenahard summarizer (#1154)

Co-authored-by: Leymore <zfz-960727@163.com>

Parent: 62dbf047
Showing 1 changed file with 47 additions and 49 deletions.

opencompass/summarizers/subjective/arenahard.py (+47, -49)
@@ -50,8 +50,7 @@ def compute_mle_elo(df, SCALE=400, BASE=10, INIT_RATING=1000):
     tie_idx = (df['winner'] == 'tie') | (df['winner'] == 'tie (bothbad)')
     tie_idx[len(tie_idx) // 2:] = False
     Y[tie_idx] = 1.0
-    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
+    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)  # May need to set a small value when not use GPT4 as judge model
     lr.fit(X, Y)
     elo_scores = SCALE * lr.coef_[0] + INIT_RATING
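For readers unfamiliar with this function: compute_mle_elo fits a Bradley-Terry-style model with scikit-learn's LogisticRegression and rescales the coefficients to Elo-like scores (SCALE * lr.coef_[0] + INIT_RATING, as in the hunk above). Below is a minimal, self-contained sketch of that fit on toy data. The helper name fit_mle_elo and the toy battles frame are illustrative, not opencompass code, and the design-matrix construction (+log(BASE) for model_a, -log(BASE) for model_b) is an assumption about the standard Arena-Hard setup rather than something shown in this diff.

# Illustrative sketch only (not opencompass code): the Bradley-Terry MLE fit
# behind compute_mle_elo. Assumed setup: one row per battle, +log(BASE) in the
# model_a column, -log(BASE) in the model_b column, label 1 if model_a won.
import math

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression


def fit_mle_elo(battles, SCALE=400, BASE=10, INIT_RATING=1000):
    names = pd.unique(battles[['model_a', 'model_b']].values.ravel())
    idx = {name: i for i, name in enumerate(names)}
    n = len(battles)
    X = np.zeros((n, len(names)))
    X[np.arange(n), battles['model_a'].map(idx).to_numpy()] = +math.log(BASE)
    X[np.arange(n), battles['model_b'].map(idx).to_numpy()] = -math.log(BASE)
    Y = (battles['winner'] == 'model_a').astype(float).to_numpy()
    # penalty=None mirrors the changed line above (requires scikit-learn >= 1.2)
    lr = LogisticRegression(fit_intercept=False, penalty=None, tol=1e-8)
    lr.fit(X, Y)
    return pd.Series(SCALE * lr.coef_[0] + INIT_RATING, index=names)


# Toy data: model_x beats model_y twice and loses once.
toy = pd.DataFrame({'model_a': ['model_x'] * 3,
                    'model_b': ['model_y'] * 3,
                    'winner': ['model_a', 'model_a', 'model_b']})
print(fit_mle_elo(toy))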
@@ -134,10 +133,8 @@ def get_win_rate_column(df, column, baseline='gpt4-0314'):
     return win_rate_table[baseline].fillna(0.5).apply(lambda x: round(x * 100, 2))


-def get_battles_from_judgment(dataset, subdir_path, post_process, first_game_only=False, WEIGHT=3):
+def get_battles_from_judgment(dataset, subdir_path, post_process, WEIGHT=3):
     arena_hard_battles = pd.DataFrame()
     print('Turning judgment results into battles...')
     dataset_abbr = dataset_abbr_from_cfg(dataset)
     filename = osp.join(subdir_path, dataset_abbr + '.json')
     partial_filename = osp.join(subdir_path, dataset_abbr + '_0.json')
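The hunk above drops the unused first_game_only parameter from get_battles_from_judgment; the surrounding context also shows get_win_rate_column turning a rating column into a win rate against the gpt4-0314 baseline (fillna(0.5), scaled to a percentage). The predict_win_rate helper it relies on is not shown in this diff, so the sketch below only illustrates the standard Elo expectation formula that such a conversion is based on; the function name expected_win_rate is ours.

# Illustrative only: expected win rate of a model against a baseline under the
# standard Elo/Bradley-Terry formula (SCALE and BASE as in compute_mle_elo).
def expected_win_rate(rating, baseline_rating, SCALE=400, BASE=10):
    return 1 / (1 + BASE ** ((baseline_rating - rating) / SCALE))


# A model rated 1100 against a 1000-rated baseline wins about 64% of the time:
print(round(expected_win_rate(1100, 1000) * 100, 2))  # 64.01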
@@ -197,7 +194,6 @@ def get_battles_from_judgment(dataset, subdir_path, post_process, first_game_onl
             if weight:
                 arena_hard_battles = pd.concat([arena_hard_battles, pd.DataFrame([output] * weight)])

-    arena_hard_battles.to_json(os.path.join(subdir_path, 'arena_hard_battles.jsonl'), lines=True, orient='records')
     return arena_hard_battles


 class ArenaHardSummarizer:
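The removed line stops get_battles_from_judgment from writing a per-subdirectory arena_hard_battles.jsonl; with this commit the battle log is written once per judge model by the summarizer instead (see the last hunk). The weighting pattern in the retained context is worth spelling out: a judged battle, stored as a plain dict, is repeated `weight` times before being appended, so it contributes that many rows to the log. A small, self-contained illustration (the field names are made up):

# Illustrative only: one judged battle repeated `weight` times before append.
import pandas as pd

output = {'model_a': 'model_x', 'model_b': 'model_y', 'winner': 'model_a'}
weight = 3  # WEIGHT=3 is the default in get_battles_from_judgment

weighted_rows = pd.DataFrame([output] * weight)
print(len(weighted_rows))  # 3 identical rows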
@@ -240,6 +236,8 @@ class ArenaHardSummarizer:
         judge_model = model_abbr_from_cfg(judge_model_cfg)
         for dataset in self.cfg['datasets']:
             dataset_abbr = dataset_abbr_from_cfg(dataset)
+            battles = pd.DataFrame()
+            print('Turning judgment results into battles...')
             for model_pair in unique_combinations:
                 model1 = model_pair[0]['abbr']
                 model2 = model_pair[1]['abbr']
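These two added lines appear to be the heart of the fix: `battles` is now initialized once per dataset, before the loop over model pairs, so judgments from every pair can be pooled into a single DataFrame and the Elo model fitted once on the pooled log (the next hunk does the pooling and the fit). A minimal sketch of that accumulation pattern, with illustrative per-pair frames:

# Illustrative only: pool per-pair battle frames into one log before fitting.
import pandas as pd

per_pair = [
    pd.DataFrame([{'model_a': 'model_x', 'model_b': 'gpt4-0314', 'winner': 'model_b'}]),
    pd.DataFrame([{'model_a': 'model_y', 'model_b': 'gpt4-0314', 'winner': 'model_a'}]),
]

battles = pd.DataFrame()
for new_battle in per_pair:
    battles = pd.concat([battles, new_battle], ignore_index=True)
print(len(battles))  # 2 pooled battles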
@@ -252,49 +250,49 @@ class ArenaHardSummarizer:
                     print(subdir_path + ' is not exist! please check!')
                     continue
-                battles = get_battles_from_judgment(dataset, subdir_path, self.judge_function)
-                bootstrap_online_elo = compute_mle_elo(battles)
-
-                np.random.seed(42)
-                bootstrap_elo_lu = get_bootstrap_result(battles, compute_mle_elo, 100)
-                bootstrap_elo_lu.to_json(os.path.join(subdir_path, 'bootstrapping_results.jsonl'), lines=True, orient='records')
-
-                stats = pd.DataFrame()
-                stats['results'] = None
-                stats['results'] = stats['results'].astype('object')
-
-                for i, model in enumerate(bootstrap_online_elo.index):
-                    assert model in bootstrap_elo_lu.columns
-                    stats.at[i, 'model'] = model
-                    stats.at[i, 'score'] = bootstrap_online_elo[model]
-                    stats.at[i, 'lower'] = np.percentile(bootstrap_elo_lu[model], 2.5)
-                    stats.at[i, 'upper'] = np.percentile(bootstrap_elo_lu[model], 97.5)
-                    if model == 'gpt4-0314':
-                        stats.at[i, 'avg_tokens'] = 423
-                    else:
-                        with open(os.path.join(output_dir.split('summary')[0], 'predictions', model, dataset_abbr + '.json'), 'r') as f:
-                            model_preds = json.load(f)
-                        pred_length = 0
-                        for k, v in model_preds.items():
-                            pred_length += len(tiktoken.encoding_for_model('gpt-3.5-turbo').encode(v['prediction']))
-                        pred_length /= len(model_preds)
-                        stats.at[i, 'avg_tokens'] = pred_length
-                    stats.at[i, 'results'] = bootstrap_elo_lu[model].tolist()
-                stats.sort_values(by='model', inplace=True)
-                stats['score'] = get_win_rate_column(stats, 'score', 'gpt4-0314').tolist()
-                stats['lower'] = get_win_rate_column(stats, 'lower', 'gpt4-0314').tolist()
-                stats['upper'] = get_win_rate_column(stats, 'upper', 'gpt4-0314').tolist()
-                decimal = 1
-                stats.sort_values(by='score', ascending=False, inplace=True)
-                for _, row in stats.iterrows():
-                    interval = str((round(row['lower'] - row['score'], decimal), round(row['upper'] - row['score'], decimal)))
-                    print(f"{row['model'] : <30} | score: {round(row['score'], decimal) : ^5} | 95% CI: {interval : ^12} | average #tokens: {int(row['avg_tokens'])}")
-                stats.to_json(os.path.join(output_dir, 'arena_hard_leaderboard.json'), orient='records', indent=4)
+                new_battle = get_battles_from_judgment(dataset, subdir_path, self.judge_function)
+                battles = pd.concat([battles, new_battle], ignore_index=True)
+            battles.to_json(os.path.join(output_dir, 'arena_hard_battles_judged-by--' + judge_model + '.jsonl'), lines=True, orient='records')
+            bootstrap_online_elo = compute_mle_elo(battles)
+
+            np.random.seed(42)
+            bootstrap_elo_lu = get_bootstrap_result(battles, compute_mle_elo, 100)
+            bootstrap_elo_lu.to_json(os.path.join(output_dir, 'bootstrapping_results' + judge_model + '.jsonl'), lines=True, orient='records')
+
+            stats = pd.DataFrame()
+            stats['results'] = None
+            stats['results'] = stats['results'].astype('object')
+
+            for i, model in enumerate(bootstrap_online_elo.index):
+                assert model in bootstrap_elo_lu.columns
+                stats.at[i, 'model'] = model
+                stats.at[i, 'score'] = bootstrap_online_elo[model]
+                stats.at[i, 'lower'] = np.percentile(bootstrap_elo_lu[model], 2.5)
+                stats.at[i, 'upper'] = np.percentile(bootstrap_elo_lu[model], 97.5)
+                if model == 'gpt4-0314':
+                    stats.at[i, 'avg_tokens'] = 423
+                else:
+                    with open(os.path.join(output_dir.split('summary')[0], 'predictions', model, dataset_abbr + '.json'), 'r') as f:
+                        model_preds = json.load(f)
+                    pred_length = 0
+                    for k, v in model_preds.items():
+                        pred_length += len(tiktoken.encoding_for_model('gpt-3.5-turbo').encode(v['prediction']))
+                    pred_length /= len(model_preds)
+                    stats.at[i, 'avg_tokens'] = pred_length
+                stats.at[i, 'results'] = bootstrap_elo_lu[model].tolist()
+            stats.sort_values(by='model', inplace=True)
+            stats['score'] = get_win_rate_column(stats, 'score', 'gpt4-0314').tolist()
+            stats['lower'] = get_win_rate_column(stats, 'lower', 'gpt4-0314').tolist()
+            stats['upper'] = get_win_rate_column(stats, 'upper', 'gpt4-0314').tolist()
+            decimal = 1
+            stats.sort_values(by='score', ascending=False, inplace=True)
+            for _, row in stats.iterrows():
+                interval = str((round(row['lower'] - row['score'], decimal), round(row['upper'] - row['score'], decimal)))
+                print(f"{row['model'] : <30} | score: {round(row['score'], decimal) : ^5} | 95% CI: {interval : ^12} | average #tokens: {int(row['avg_tokens'])}")
+            stats.to_json(os.path.join(output_dir, 'arena_hard_leaderboard_judged-by--' + judge_model + '.json'), orient='records', indent=4)
+            stats.to_csv(os.path.join(output_dir, 'arena_hard_leaderboard_judged-by--' + judge_model + '.csv'))

     def summarize(self,
...
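The rewritten block above pools the battles, fits the Elo model once per dataset and judge, and reports a bootstrapped 95% confidence interval per model (np.percentile at 2.5 and 97.5 over 100 get_bootstrap_result rounds) alongside an average prediction length in GPT-3.5-turbo tokens. The internals of get_bootstrap_result are not part of this diff; the resample-with-replacement loop below is an assumption about what it does, and bootstrap_ratings is our own name, but the percentile step mirrors the added code.

# Illustrative sketch only: bootstrap the rating fit by resampling battles
# with replacement, then take the 2.5th/97.5th percentiles per model as the
# 95% CI, as in the np.percentile calls in the hunk above.
import numpy as np
import pandas as pd


def bootstrap_ratings(battles, fit_fn, num_rounds=100, seed=42):
    rng = np.random.default_rng(seed)
    rounds = []
    for _ in range(num_rounds):
        resampled = battles.sample(frac=1.0, replace=True,
                                   random_state=int(rng.integers(1 << 31)))
        rounds.append(fit_fn(resampled))  # one pd.Series of ratings per round
    return pd.DataFrame(rounds)  # rows = bootstrap rounds, columns = models


# Usage with the fit_mle_elo sketch shown earlier:
# bootstrap_elo_lu = bootstrap_ratings(battles, fit_mle_elo, 100)
# lower = np.percentile(bootstrap_elo_lu['model_x'], 2.5)
# upper = np.percentile(bootstrap_elo_lu['model_x'], 97.5)

The avg_tokens column is simply the mean prediction length; a standalone equivalent of the token-counting loop in the hunk (the predictions-file layout follows the diff, the sample dict is made up):

# Illustrative only: average prediction length in GPT-3.5-turbo tokens.
import tiktoken

enc = tiktoken.encoding_for_model('gpt-3.5-turbo')
model_preds = {'q1': {'prediction': 'Paris is the capital of France.'},
               'q2': {'prediction': 'The answer is 42.'}}
pred_length = sum(len(enc.encode(v['prediction'])) for v in model_preds.values())
pred_length /= len(model_preds)
print(pred_length)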