OpenDAS / opencompass / Commits

Commit 814b3f73 (unverified)
reorganize subject files (#801)

Authored Jan 16, 2024 by bittersweet1999; committed by GitHub on Jan 16, 2024
Parent commit: 2cd09164
Showing 14 changed files with 10 additions and 988 deletions (+10, -988)
opencompass/datasets/subjective/information_retrival.py          +0   -0
opencompass/datasets/subjective/multiround.py                     +1   -1
opencompass/datasets/subjective/subjective_cmp.py                 +1   -1
opencompass/summarizers/__init__.py                               +1   -7
opencompass/summarizers/creationv01.py                            +0   -125
opencompass/summarizers/subjective.py                             +0   -853
opencompass/summarizers/subjective/__init__.py                    +6   -0
opencompass/summarizers/subjective/alignmentbench.py              +0   -0
opencompass/summarizers/subjective/corev2.py                      +1   -1
opencompass/summarizers/subjective/creationbench.py               +0   -0
opencompass/summarizers/subjective/information_retrival.py        +0   -0
opencompass/summarizers/subjective/multiround.py                  +0   -0
opencompass/summarizers/subjective/subjective_post_process.py     +0   -0
opencompass/summarizers/subjective/utils.py                       +0   -0
opencompass/datasets/subject_ir.py → opencompass/datasets/subjective/information_retrival.py
File moved
opencompass/datasets/subject_multiround.py → opencompass/datasets/subjective/multiround.py

@@ -8,7 +8,7 @@ from datasets import Dataset, DatasetDict
 from opencompass.registry import LOAD_DATASET

-from .base import BaseDataset
+from ..base import BaseDataset

 base_prefix_en = """
 You are a helper who will help me to evaluate the quality of AI assistants.
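Because the dataset modules move one directory deeper (from opencompass/datasets/ into opencompass/datasets/subjective/), their relative imports have to climb one extra package level to keep pointing at the shared base module, which is all the one-line change above does. A minimal sketch of the resolution after the move (paths taken from this diff):

# opencompass/datasets/subjective/multiround.py, after the move:
# a single-dot import would now look for opencompass/datasets/subjective/base.py,
# so the module goes up one package level instead.
from ..base import BaseDataset  # resolves to opencompass/datasets/base.py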
opencompass/datasets/subjective_cmp.py → opencompass/datasets/subjective/subjective_cmp.py

@@ -5,7 +5,7 @@ from datasets import Dataset, DatasetDict
 from opencompass.registry import LOAD_DATASET

-from .base import BaseDataset
+from ..base import BaseDataset

 @LOAD_DATASET.register_module()
opencompass/summarizers/__init__.py  (+1, -7)

 # flake8: noqa: F401, E501
-from .alignmentbench import AlignmentBenchSummarizer
 from .circular import CircularSummarizer  # noqa: F401
-from .corev2 import Corev2Summarizer  # noqa: F401
-from .creationbench import CreationBenchSummarizer
-from .creationv01 import Creationv01Summarizer  # noqa: F401
 from .default import DefaultSummarizer  # noqa: F401
-from .information_retrival import IRSummarizer  # noqa: F401
-from .multiround import MultiroundSummarizer  # noqa: F401
-from .subjective import SubjectiveSummarizer  # noqa: F401
+from .subjective import *  # noqa: F401
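The per-module imports are replaced by a single wildcard re-export from the new subjective subpackage, so code that previously imported these summarizers from the top-level package should keep resolving. A small sketch of the two equivalent import paths after this commit (assuming the names listed in the new opencompass/summarizers/subjective/__init__.py shown further down):

# Both of these should refer to the same class after this commit:
from opencompass.summarizers import Corev2Summarizer             # via the wildcard re-export
from opencompass.summarizers.subjective import Corev2Summarizer  # new canonical location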
opencompass/summarizers/creationv01.py  deleted (100644 → 0)

# flake8: noqa: E501
import csv
import os
import os.path as osp
import re
from collections import defaultdict
from datetime import datetime

import mmengine
from mmengine import ConfigDict

try:
    from prettytable import from_csv
except ImportError:
    from_csv = None

from opencompass.utils import dataset_abbr_from_cfg


def match_general_answer(s):
    temp = s[0]
    if temp in ['A', 'B', 'C', 'D']:
        return temp
    else:
        return None


def match_GPT4_answer(s):
    result = re.search(r'分数:(.)', s)
    if result:
        return int(result.group(1))
    else:
        return None


judge_map = {'smart': match_GPT4_answer, 'other': match_general_answer}


def call_function(name, arg):
    if name in judge_map:
        return judge_map[name](arg)
    else:
        print('Function not found in the map.')


class Creationv01Summarizer:
    """Do the subjectivity analyze based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
    """

    def __init__(self, config: ConfigDict, match_method='smart') -> None:
        self.tasks = []
        self.cfg = config
        self.match_method = match_method

    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            pd.DataFrame: The summary results.
        """
        dataset_cfgs = self.cfg['datasets']
        work_dir = self.cfg['work_dir']
        self.work_dir = work_dir

        self.time_str = time_str
        output_path = osp.join(self.work_dir, 'summary',
                               f'summary_{self.time_str}.txt')
        output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
        mmengine.mkdir_or_exist(output_dir)
        results_folder = osp.join(work_dir, 'results')

        for subdir in os.listdir(results_folder):
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model, judge_model = subdir.split('_')
                fout = osp.join(output_dir, judge_model + '-report.csv')
                for dataset in dataset_cfgs:
                    dataset_abbr = dataset_abbr_from_cfg(dataset)
                    filepath = os.path.join(subdir_path,
                                            dataset_abbr + '.json')
                    result = mmengine.load(filepath)
                    judged_answers = []
                    references = []
                    for k, v in result.items():
                        judged_answers.append(
                            call_function(self.match_method, v['prediction']))
                        references.append(v['gold'])
                    print(
                        f'Among {len(judged_answers)} judgements, successfully extracted {len(judged_answers)-judged_answers.count(None)} judgements.'
                    )
                    model_scores, categories = defaultdict(float), defaultdict(float)
                    for prediction, reference in zip(judged_answers,
                                                     references):
                        categories[reference['capability']] += 1
                        if prediction is not None:
                            model_scores[reference['capability']] += prediction
                    for capability in categories:
                        if capability not in model_scores:
                            model_scores[capability] = 0.0
                        else:
                            model_scores[capability] = round(
                                model_scores[capability] /
                                categories[capability], 2)
                    scores = {model: model_scores}
                    rows = list(scores.keys())
                    columns = list(scores[rows[0]].keys())
                    with open(fout, 'a+', newline='') as csvfile:
                        writer = csv.writer(csvfile)
                        writer.writerow([''] + columns)
                        for row in rows:
                            writer.writerow(
                                [row] +
                                [scores[row][column] for column in columns])
        with open(fout, 'r') as f:
            x = from_csv(f)
        print(x)
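For reference, the deleted summarizer extracted scores by matching the single character after the Chinese label 分数 ("score") in the judge model's reply. A minimal usage sketch of that regex; the reply text below is made up for illustration:

import re

def match_GPT4_answer(s):
    # Same regex as the deleted file: one character captured after '分数:'
    result = re.search(r'分数:(.)', s)
    return int(result.group(1)) if result else None

print(match_GPT4_answer('评价:回答完整。\n分数:7'))  # -> 7
print(match_GPT4_answer('no score line here'))       # -> None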
opencompass/summarizers/subjective.py  deleted (100644 → 0)

import copy as cp
import io
import json
import math
import multiprocessing as mp
import os
import os.path as osp
import pickle
import random as rd
from collections import defaultdict
from datetime import datetime
from typing import List, Optional

try:
    import cv2
except ImportError:
    import traceback
    traceback.print_exc()
    raise ImportError(
        'Import cv2 failed. Please install it with '
        '"pip install opencv-python-headless" and try again.\n\n'
        'If the prompt `ImportError: libGL.so.1` appears,'
        ' you may consider one of the following two methods:\n'
        'Method 1 - Uninstall opencv and then install opencv-headless\n'
        'pip uninstall opencv-python; pip install opencv-python-headless\n\n'
        'Method 2: Install the missing dynamic link libraries\n'
        'sudo apt-get update; sudo apt-get install -y libgl1 libglib2.0-0')

import mmengine
import numpy as np
import pandas as pd
from mmengine import ConfigDict
from tabulate import tabulate
from tqdm import tqdm

from opencompass.utils import build_dataset_from_cfg, dataset_abbr_from_cfg


def dump(data, f):
    """Dump data to file."""

    def dump_pkl(data, pth):
        pickle.dump(data, open(pth, 'wb'))

    def dump_json(data, pth):
        json.dump(data, open(pth, 'w'), indent=4)

    def dump_jsonl(data, f):
        lines = [json.dumps(x, ensure_ascii=False) for x in data]
        with open(f, 'w', encoding='utf8') as fout:
            fout.write('\n'.join(lines))

    def dump_xlsx(data, f):
        data.to_excel(f, index=False)

    def dump_csv(data, f):
        data.to_csv(f, index=False)

    def dump_tsv(data, f):
        data.to_csv(f, sep='\t', index=False)

    handlers = dict(pkl=dump_pkl,
                    json=dump_json,
                    jsonl=dump_jsonl,
                    xlsx=dump_xlsx,
                    csv=dump_csv,
                    tsv=dump_tsv)
    suffix = f.split('.')[-1]
    return handlers[suffix](data, f)


def load(f):
    """Load data from file."""

    def load_pkl(pth):
        return pickle.load(open(pth, 'rb'))

    def load_json(pth):
        return json.load(open(pth, 'r', encoding='utf-8'))

    def load_jsonl(f):
        lines = open(f, encoding='utf-8').readlines()
        lines = [x.strip() for x in lines]
        if lines[-1] == '':
            lines = lines[:-1]
        data = [json.loads(x) for x in lines]
        return data

    def load_xlsx(f):
        return pd.read_excel(f)

    def load_csv(f):
        return pd.read_csv(f)

    def load_tsv(f):
        return pd.read_csv(f, sep='\t')

    handlers = dict(pkl=load_pkl,
                    json=load_json,
                    jsonl=load_jsonl,
                    xlsx=load_xlsx,
                    csv=load_csv,
                    tsv=load_tsv)
    suffix = f.split('.')[-1]
    return handlers[suffix](f)


def double_log(msg, fout=None):
    """Prints a message and optionally writes it to a file.

    Args:
        msg (str): The message to be printed and, if fout is provided,
            written to the file.
        fout (file, optional): A file object to write the message
            to (default is None).

    Returns:
        None
    """
    print(msg)
    if fout is not None:
        fout.write(str(msg) + '\n')
        fout.flush()


def stack_image(imgs, shape=(1, 3)):
    """Stacks a list of images into a grid.

    Args:
        imgs (list): A list of image arrays to be stacked.
        shape (tuple): A tuple specifying the grid shape
            (rows, columns) for the stacked images (default is (1, 3)).

    Returns:
        numpy.ndarray: The stacked image grid.
    """
    total_imgs = shape[0] * shape[1]
    assert len(imgs) <= total_imgs
    h, w, _ = imgs[0].shape
    imgs = [cv2.resize(im, dsize=(w, h)) for im in imgs]
    for i in range(total_imgs - len(imgs)):
        imgs.append(np.ones((h, w, 3)).astype(np.uint8) * 127)
    rows = []
    for i in range(shape[0]):
        if shape[1] == 1:
            rows.append(imgs[i])
        else:
            rows.append(np.hstack(imgs[i * shape[1]:(i + 1) * shape[1]]))
    if shape[0] == 1:
        return rows[0]
    else:
        return np.vstack(rows)


def simple_count(data_in, lang=None, capa=None):
    """Counts occurrences of outcomes (win, lose, both, neither) in a dataset.

    Args:
        data_in (dict): The input data containing 'A', 'B', 'extracted' fields.
        lang (str, optional): Filter by language (default is None).
        capa (str, optional): Filter by capability (default is None).

    Returns:
        dict: A dictionary containing outcome counts for each
            entry in 'A' and 'B'.
    """
    data = cp.deepcopy(data_in)
    if lang is not None and 'lang' in data:
        data = data[data['lang'] == lang]
    if capa is not None and 'capability' in data:
        flag = [(capa in x) for x in data['capability']]
        data = data[flag]

    A, B, ext = data['A'], data['B'], data['extracted']
    res = {}
    for a, b, choice in zip(A, B, ext):
        if a not in res:
            res[a] = defaultdict(lambda: 0)
        if b not in res:
            res[b] = defaultdict(lambda: 0)
        ans_map = dict(A=['win', 'lose'],
                       B=['lose', 'win'],
                       C=['both', 'both'],
                       D=['neither', 'neither'])
        ak, bk = ans_map[choice]
        res[a][ak] += 1
        res[b][bk] += 1
    return res


def calc_win_rate(data_copy, models, lang=None, capa=None):
    """Calculates win rates, tie rates, and loss rates between models based on
    given data.

    Args:
        data_copy (pd.DataFrame): The input data containing
            'A', 'B', 'extracted', 'lang', and 'capability' columns.
        models (list): List of model names to calculate rates for.
        lang (str, optional): Filter data by language (default is None).
        capa (str, optional): Filter data by capability (default is None).

    Returns:
        pd.DataFrame, pd.DataFrame: DataFrames containing win rates
            (cnt) and tie rates (ff) between models.
    """
    data = cp.deepcopy(data_copy)
    if lang is not None and 'lang' in data:
        data = data[data['lang'] == lang]
    if capa is not None and 'capability' in data:
        flag = [(capa in x) for x in data['capability']]
        data = data[flag]

    win = defaultdict(lambda: 0)
    tie = defaultdict(lambda: 0)
    lose = defaultdict(lambda: 0)

    for i in range(len(data)):
        v = data.iloc[i]
        o = v['extracted']
        key = v['A'] + ';' + v['B']
        if o == 'A':
            win[key] += 1
        if o == 'B':
            lose[key] += 1
        if o in ['C', 'D']:
            tie[key] += 1

    nmodel = len(models)
    cnt = pd.DataFrame({k: [0] * nmodel for k in models}, index=models)
    ff = pd.DataFrame({k: [0] * nmodel for k in models}, index=models)
    tot = pd.DataFrame({k: [0] * nmodel for k in models}, index=models)
    for i, k in enumerate(win):
        m1, m2 = k.split(';')
        cnt.at[m1, m2] += win[k]
        cnt.at[m2, m1] += lose[k]
        ff.at[m1, m2] += tie[k]
        ff.at[m2, m1] += tie[k]
        tot.at[m1, m2] += tie[k] + win[k] + lose[k]
        tot.at[m2, m1] += tie[k] + win[k] + lose[k]

    for m1 in models:
        for m2 in models:
            if tot.at[m1, m2]:
                cnt.at[m1, m2] /= tot.at[m1, m2]
                ff.at[m1, m2] /= tot.at[m1, m2]
    return cnt, ff


def find_inconsistent(data, vals=['A', 'B', 'C', 'D']):
    """Finds inconsistent data entries based on specified values.

    Args:
        data (pd.DataFrame): The input data containing
            'cmp_index' and 'extracted' columns.
        vals (list, optional): List of possible values
            (default is ['A', 'B', 'C', 'D']).

    Returns:
        pd.DataFrame, pd.DataFrame: DataFrames containing
            consistent (cons) and inconsistent (incons) data entries.
    """
    assert 'extracted' in data
    cons, incons = [], []
    pred_map = {x: y for x, y in zip(data['cmp_index'], data['extracted'])}
    for k in data['cmp_index']:
        parts = k.split(';')
        kct = ';'.join([parts[0], parts[2], parts[1]])
        if kct not in pred_map:
            cons.append(k)
            continue
        cons_tups = [(vals[0], vals[1]), (vals[1], vals[0]),
                     (vals[2], vals[2]), (vals[3], vals[3])]
        flag = True
        for tup in cons_tups:
            if pred_map[k] == tup[0] and pred_map[kct] == tup[1]:
                flag = False
                cons.append(k)
                break
        if flag:
            incons.append(k)
    cons, incons = data[data['cmp_index'].isin(cons)], data[
        data['cmp_index'].isin(incons)]
    return cons, incons


def extract_vispair(data, vals='ABCD', vispair=None):
    """Extracts specific data pairs and writes them to Excel files.

    Args:
        data (pd.DataFrame): The input data containing
            'A', 'B', and 'extracted' columns.
        vals (str, optional): A string of possible
            values (default is 'ABCD').
        vispair (tuple, optional): A tuple specifying the pair
            of values to extract (e.g., ('A', 'B')).

    Returns:
        None
    """
    assert vispair is not None
    ma, mb = vispair
    indices_map = defaultdict(list)
    lt = len(data)
    for i in range(lt):
        item = data.iloc[i]
        if (item['A'] == ma and item['B'] == mb
                and item['extracted'] == vals[0]):
            indices_map[f'{ma}_win_{mb}'].append(i)

        if (item['A'] == mb and item['B'] == ma
                and item['extracted'] == vals[1]):
            indices_map[f'{ma}_win_{mb}'].append(i)

        if (item['A'] == ma and item['B'] == mb
                and item['extracted'] == vals[1]):
            indices_map[f'{ma}_lose_{mb}'].append(i)

        if (item['A'] == mb and item['B'] == ma
                and item['extracted'] == vals[0]):
            indices_map[f'{ma}_lose_{mb}'].append(i)

        if (set([item['A'], item['B']]) == set([ma, mb])
                and item['extracted'] == vals[2]):
            indices_map[f'{ma}_both_{mb}'].append(i)

        if (set([item['A'], item['B']]) == set([ma, mb])
                and item['extracted'] == vals[3]):
            indices_map[f'{ma}_neither_{mb}'].append(i)

    for k in indices_map:
        data_sub = data.iloc[indices_map[k]]
        dump(data_sub, f'{k}.xlsx')


def get_shape(lt):
    """Calculates the shape (rows, columns) for a grid based on the number of
    elements.

    Args:
        lt (int): The total number of elements in the grid.

    Returns:
        tuple: A tuple containing the calculated number
            of rows and columns.
    """
    h = int(math.sqrt(lt))
    w = lt // h
    if h * w < lt:
        w += 1
    return h, w


def compute_elo_score(data,
                      K=32,
                      SCALE=400,
                      BASE=10,
                      INIT_RATING=1000,
                      seed=2680,
                      vals='ABCD'):
    """Computes Elo ratings for models based on provided data.

    Args:
        data (pd.DataFrame): The input data containing
            'A', 'B', and 'extracted' columns.
        K (float, optional): The K factor for Elo
            calculation (default is 32).
        SCALE (float, optional): The Elo scale factor (default is 400).
        BASE (float, optional): The Elo base factor (default is 10).
        INIT_RATING (float, optional): The initial rating
            for models (default is 1000).
        seed (int, optional): Random seed for shuffling
            battles (default is 2680).
        vals (str, optional): A string of possible values
            (default is 'ABCD').

    Returns:
        dict: A dictionary containing model ratings.
    """
    rating = defaultdict(lambda: INIT_RATING)
    battles = []
    lt = len(data)
    for i in range(lt):
        item = data.iloc[i]
        score_map = {vals[0]: 1, vals[1]: 0, vals[2]: 0.5, vals[3]: 0.5}
        score = score_map[
            item['extracted']] if item['extracted'] in score_map else 0.5
        battles.append((item['A'], item['B'], score))

    rd.seed(seed)
    rd.shuffle(battles)

    for m0, m1, v in battles:
        ra = rating[m0]
        rb = rating[m1]
        ea = 1 / (1 + BASE**((rb - ra) / SCALE))
        eb = 1 / (1 + BASE**((ra - rb) / SCALE))
        sa = v
        rating[m0] += K * (sa - ea)
        rating[m1] += K * (1 - sa - eb)
    return {k: v for k, v in rating.items()}


def compute_elo_score_pack(tup):
    return compute_elo_score(tup[0], seed=tup[1], vals=tup[2])


def mrlines(fname, sp='\n'):
    f = open(fname).read().split(sp)
    while f != [] and f[-1] == '':
        f = f[:-1]
    return f


def get_bootstrap_result(data,
                         num_round,
                         base_seed=1000,
                         num_thread=20,
                         vals='ABCD'):
    """Computes Elo scores with bootstrapping and returns the results as a
    DataFrame.

    Args:
        data (pd.DataFrame): The input data containing 'A', 'B',
            and 'extracted' columns.
        num_round (int): The number of bootstrap rounds to perform.
        base_seed (int, optional): The base seed for randomization
            (default is 1000).
        num_thread (int, optional): The number of threads to use
            for parallel processing (default is 20).
        vals (str, optional): A string of possible values
            (default is 'ABCD').

    Returns:
        pd.DataFrame: A DataFrame containing Elo scores for
            models based on bootstrapping.
    """
    rows = []
    tups = [(data, base_seed + i, vals) for i in range(num_round)]
    pool = mp.Pool(num_thread)
    rets = pool.map(compute_elo_score_pack, tups)
    for ret in rets:
        rows.append(ret)
    df = pd.DataFrame(rows)
    return df[df.median().sort_values(ascending=False).index]


def bootstrap_elo(data, num_round=1000, times=10, vals='ABCD'):
    """Computes Elo scores with bootstrapping over multiple runs and returns
    aggregated results.

    Args:
        data (pd.DataFrame): The input data containing 'A', 'B',
            and 'extracted' columns.
        num_round (int, optional): The number of bootstrap rounds
            to perform in each run (default is 1000).
        times (int, optional): The number of runs to perform
            (default is 10).
        vals (str, optional): A string of possible values
            (default is 'ABCD').

    Returns:
        pd.DataFrame: A DataFrame containing aggregated Elo
            scores with mean and standard deviation.
    """
    results = defaultdict(list)
    for i in tqdm(range(times)):
        bootstrap_elo_lu = get_bootstrap_result(data,
                                                num_round,
                                                base_seed=num_round * i,
                                                num_thread=20,
                                                vals=vals)
        bootstrap_lu_median = bootstrap_elo_lu.median().reset_index().set_axis(
            ['model', 'rating'], axis=1)
        for m, r in zip(bootstrap_lu_median['model'],
                        bootstrap_lu_median['rating']):
            results[m].append(r)

    res_dict = {}
    keys = list(results.keys())
    keys.sort()
    for k in keys:
        res_dict[k] = [np.mean(results[k]), np.std(results[k])]

    df = pd.DataFrame(res_dict, index=['elo_score [Mean]', 'elo_score [Std]'])
    return df


FONT_FILE = os.environ.get('FONT_FILE', None)


def match_answer(s):
    """Match the selected answer (A, B, C, or D) in a given string.

    Args:
        s (str): The input string to search for the selected answer.

    Returns:
        str or None: The matched answer ('A', 'B', 'C', or 'D')
            or None if not found.
    """

    def match_char(s, chars):
        cin = [c in s for c in chars]
        if sum(cin) == 1:
            return chars[cin.index(True)]
        else:
            return None

    lines = s.split('\n')
    for _, line in enumerate(lines):
        if line.startswith('选择:'):
            return match_char(line, 'ABCD')
    return None


def draw_heatmap(hmap, title):
    """Draw a heatmap using the given data.

    Args:
        hmap (pd.DataFrame): The data for the heatmap.
        title (str): The title for the heatmap.

    Returns:
        np.ndarray: An image of the heatmap.
    """
    from matplotlib import font_manager
    if FONT_FILE is None:
        fontP = font_manager.FontProperties()
    else:
        fontP = font_manager.FontProperties(fname=FONT_FILE)
    fontP.set_size(18)
    import matplotlib.pyplot as plt
    import seaborn as sns
    ax = sns.heatmap(hmap,
                     annot=True,
                     cmap='Blues',
                     annot_kws={'size': 35 / np.sqrt(len(hmap))})
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0, fontsize=12)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=12)
    plt.yticks(rotation=0)
    ax.xaxis.tick_top()  # x axis on top
    ax.xaxis.set_label_position('top')
    plt.title(title, color='Blue', fontproperties=fontP)
    plt.tight_layout()
    buffer = io.BytesIO()
    plt.savefig(buffer, format='png', dpi=100)
    plt.close()
    buffer.seek(0)
    image_data = buffer.getvalue()
    image = cv2.imdecode(np.frombuffer(image_data, np.uint8),
                         cv2.IMREAD_COLOR)
    return image


def proc_capa(capas):
    capa_lists = [capa_str for capa_str in capas]
    capa_set = set(capa_lists)
    capa_set = list(capa_set)
    return capa_set


class SubjectiveSummarizer:
    """Do the subjectivity analyze based on evaluation results.

    Args:
        config (ConfigDict): The configuration object of the evaluation task.
            It's expected to be filled out at runtime.
        vispair (List[str], optional): List of
            two models to visualize.
        refm (str, optional): Reference model
            for win rate comparison.
        col_name (str): Name of the column
            containing evaluation results.
        fout (str): Output file name.
        ignore (str, optional): Ignore certain
            comparisons based on a file.
    """

    def __init__(
        self,
        config: ConfigDict,
        vispair: Optional[List[str]] = None,
        refm: Optional[str] = None,
        col_name: str = 'gpt4',
        fout: str = 'report.md',
        ignore: Optional[str] = None,
    ) -> None:
        self.tasks = []
        self.cfg = config
        self.vispair = vispair
        self.refm = refm
        self.col_name = col_name
        self.fout = fout
        self.ignore = ignore

    def summarize(self,
                  time_str: str = datetime.now().strftime('%Y%m%d_%H%M%S')):
        """Summarize the subjectivity analysis based on evaluation results.

        Args:
            time_str (str): Timestamp for file naming.

        Returns:
            pd.DataFrame: The summary results.
        """
        dataset_cfgs = self.cfg['datasets']
        eval_cfg = self.cfg['eval']
        work_dir = self.cfg['work_dir']
        self.work_dir = work_dir

        self.time_str = time_str
        output_path = osp.join(self.work_dir, 'summary',
                               f'summary_{self.time_str}.txt')
        output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
        mmengine.mkdir_or_exist(output_dir)
        fout = open(osp.join(output_dir, self.fout), 'w')
        results_folder = osp.join(work_dir, 'results')
        data_list = []
        for subdir in os.listdir(results_folder):
            subdir_path = os.path.join(results_folder, subdir)
            if os.path.isdir(subdir_path):
                model1, model2 = subdir.split('_')
                for dataset in dataset_cfgs:
                    origin_dataset = build_dataset_from_cfg(dataset)
                    dataset_abbr = dataset_abbr_from_cfg(dataset)
                    filepath = os.path.join(subdir_path,
                                            dataset_abbr + '.json')
                    result = mmengine.load(filepath)
                    if eval_cfg['partitioner']['mode'] == 'all':
                        for key, value in result.items():
                            prediction = value.get('prediction', None)
                            q_index = origin_dataset.test[
                                int(key) % len(origin_dataset.test)]['index']
                            cmp_index = f'{q_index};{model1};{model2}'
                            data_list.append(
                                [cmp_index, model1, model2, prediction])

        data = pd.DataFrame(data_list, columns=['cmp_index', 'A', 'B', 'gpt4'])
        meta = pd.read_excel(
            osp.join(dataset_cfgs[0]['path'],
                     dataset_cfgs[0]['name'] + '.xlsx'))

        if self.ignore is not None:
            q_index = [x.split(';')[0] for x in data['cmp_index']]
            to_ignore = set(mrlines(self.ignore))
            flag = [x not in to_ignore for x in q_index]
            data = data[flag]

        double_log('# Subjective Analysis', fout)
        capas = proc_capa(meta['capability'])
        capa_map = {i: c for i, c in zip(meta['index'], meta['capability'])}

        nonem = [x != 'EM' for x in data[self.col_name]]
        double_log(
            f'A total of {len(data)} comparisons, of which {sum(nonem)} '
            f'comparisons are meaningful (A / B answers inconsistent)', fout)
        data = data[nonem]

        data['capability'] = [
            capa_map[str(i).split(';')[0]] for i in data['cmp_index']
        ]
        data['extracted'] = [match_answer(ans) for ans in data[self.col_name]]

        succeed = [not pd.isna(x) for x in data['extracted']]
        succeed_rate = np.mean(succeed)
        double_log(
            f'A total of {len(succeed)} answer comparisons, successfully '
            f'extracted {sum(succeed)} answers from GPT-4 replies, with '
            f'an extraction success rate of {succeed_rate * 100:.2f}%', fout)
        data = data[succeed]

        cons, incons = find_inconsistent(data, 'ABCD')
        if len(cons) != len(data):
            double_log(
                f'A total of {len(data)} answer comparisons, {len(cons)} '
                f'pairs (A vs. B <-> B vs. A) are consistent,consistent '
                f'rate is {len(cons) / len(data) * 100:.2f}%', fout)

        dump(cons, osp.join(output_dir, 'consistent_cmp.xlsx'))
        dump(incons, osp.join(output_dir, 'inconsistent_cmp.xlsx'))

        data = cons
        if self.vispair is not None and len(self.vispair) == 2:
            extract_vispair(data, vispair=self.vispair)

        data['lang'] = [x.split('-')[0] for x in data['cmp_index']]
        langs = [None, 'cn', 'en']
        return self.analyze(data, self.refm, langs, capas, fout)

    def analyze(self, data, refm, langs, capas, fout):
        """Do the subjectivity analysis based on evaluation results.

        Args:
            data (pd.DataFrame): The evaluation data.
            refm (str): Reference model for win rate comparison.
            langs (List[str]): List of languages to analyze.
            capas (List[str]): List of capabilities to analyze.
            fout (str): Output file name.

        Returns:
            None
        """
        output_path = osp.join(self.work_dir, 'summary',
                               f'summary_{self.time_str}.txt')
        output_dir = osp.join(osp.split(output_path)[0], f'{self.time_str}')
        mmengine.mkdir_or_exist(output_dir)

        stats = defaultdict(list)
        scores = defaultdict(list)

        dim_key = 'Dimension \\ Stat [W / T / L / NB]'
        scores_dim_key = 'Dimension \\ Score'

        for lang in langs:
            name = (lang.upper() if lang is not None else 'Overall')
            stats[dim_key].append(f'LANG: {name}')
            scores[scores_dim_key].append(f'LANG: {name}')

            count_stat = simple_count(data, lang=lang)
            if count_stat == {}:
                for k, v in stats.items():
                    if k != dim_key:
                        v.append('N/A')
                for k, v in scores.items():
                    if k != scores_dim_key:
                        v.append('N/A')

            for k in count_stat:
                stat = count_stat[k]
                winr = stat['win'] / sum(stat.values())
                tier = (stat['both'] + stat['neither']) / sum(stat.values())
                loser = stat['lose'] / sum(stat.values())
                not_bad = (stat['win'] + stat['both']) / sum(stat.values())
                msg = f'{winr * 100:.1f}% / {tier * 100:.1f}% / {loser * 100:.1f}% / {not_bad * 100:.1f}%'  # noqa
                stats[k].append(msg)
                score = 3 * stat['win'] + stat['both'] - stat[
                    'neither'] - 3 * stat['lose']
                scores[k].append(score)

        for capa in capas:
            stats[dim_key].append(f'CAPA: {capa}')
            scores[scores_dim_key].append(f'CAPA: {capa}')

            count_stat = simple_count(data, capa=capa)
            if count_stat == {}:
                for k, v in stats.items():
                    if k != dim_key:
                        v.append('N/A')
                for k, v in scores.items():
                    if k != scores_dim_key:
                        v.append('N/A')

            for k in count_stat:
                stat = count_stat[k]
                winr = stat['win'] / sum(stat.values())
                tier = (stat['both'] + stat['neither']) / sum(stat.values())
                loser = stat['lose'] / sum(stat.values())
                not_bad = (stat['win'] + stat['both']) / sum(stat.values())
                msg = f'{winr * 100:.1f}% / {tier * 100:.1f}% / {loser * 100:.1f}% / {not_bad * 100:.1f}%'  # noqa
                stats[k].append(msg)
                score = 3 * stat['win'] + stat['both'] - stat[
                    'neither'] - 3 * stat['lose']
                scores[k].append(score)

        double_log(
            '### Basic statistics (4 stats: win / tie / lose / not bad)',
            fout)
        all_models = list(stats.keys())
        all_models.remove(dim_key)
        table_width = 3
        num_tables = len(all_models) // table_width + (
            len(all_models) % table_width != 0)
        for i in range(num_tables):
            cur_keys = [dim_key
                        ] + all_models[i * table_width:(i + 1) * table_width]
            sub_stats = {k: stats[k] for k in cur_keys}
            double_log(tabulate(sub_stats, headers='keys', tablefmt='github'),
                       fout)

        image_url1 = 'by_capa.png'
        image_url2 = 'by_lang.png'
        double_log(f'\n\n'
                   f'\n\n', fout)

        double_log(
            '\n\n### Model scores (base score is 0, win +3,'
            ' both +1, neither -1, lose -3)', fout)
        double_log(tabulate(scores, headers='keys', tablefmt='github'), fout)

        double_log('### Bootstrap ELO, Median of n=1000 times ', fout)
        elo_table = bootstrap_elo(data)
        double_log(tabulate(elo_table, headers='keys', tablefmt='github'),
                   fout)

        models = list(count_stat.keys())
        models.sort()

        images = []
        for lang in langs:
            wr, dr = calc_win_rate(data, models, lang=lang)
            lang_name = lang.upper() if lang is not None else 'Overall'

            wr_table = defaultdict(list)
            if refm is not None:
                for m in models:
                    if m == refm:
                        continue
                    wr_table['model'].append(m)
                    wr_table['win_rate'].append(wr.at[m, refm])
                    wr_table['draw_rate'].append(dr.at[m, refm])
                    wr_table['win + draw'].append(dr.at[m, refm] +
                                                  wr.at[m, refm])
                double_log(
                    f'By language {lang_name}, calculate '
                    f'the win rate against {refm}:', fout)
                double_log(
                    tabulate(wr_table, headers='keys', tablefmt='github'),
                    fout)

            im = draw_heatmap(
                wr, f'Language: {lang if lang is not None else "All"}')
            images.append(im)
        image = stack_image(images, shape=(1, 3))
        cv2.imwrite(osp.join(output_dir, 'by_lang.png'), image)

        images = []
        for capa in capas:
            wr, dr = calc_win_rate(data, models, capa=capa)

            wr_table = defaultdict(list)
            if refm is not None:
                for m in models:
                    if m == refm:
                        continue
                    wr_table['model'].append(m)
                    wr_table['win_rate'].append(wr.at[m, refm])
                    wr_table['draw_rate'].append(dr.at[m, refm])
                    wr_table['win + draw'].append(dr.at[m, refm] +
                                                  wr.at[m, refm])
                double_log(
                    f'By capability {capa}, calculate the '
                    f'win rate against {refm}:', fout)
                double_log(
                    tabulate(wr_table, headers='keys', tablefmt='github'),
                    fout)

            im = draw_heatmap(wr, f'Capability: {capa}')
            images.append(im)

        lt = len(capas)
        h, w = get_shape(lt)
        image = stack_image(images, shape=(h, w))
        cv2.imwrite(osp.join(output_dir, 'by_capa.png'), image)

        dump(data, osp.join(output_dir, 'tmp.xlsx'))
        fout.close()
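The deleted compute_elo_score follows the standard Elo update: with ratings R_A and R_B, the expected score of A is 1 / (1 + BASE**((R_B - R_A) / SCALE)), and after a battle with observed score S_A (1 win, 0 loss, 0.5 tie) each rating moves by K times the difference between observed and expected score. A small self-contained sketch of one update step with the file's defaults (K=32, SCALE=400, BASE=10, INIT_RATING=1000); the model names are made up:

K, SCALE, BASE, INIT_RATING = 32, 400, 10, 1000
ratings = {'model_a': INIT_RATING, 'model_b': INIT_RATING}

def elo_update(ra, rb, sa):
    """One battle: sa is 1 (A wins), 0 (A loses) or 0.5 (tie)."""
    ea = 1 / (1 + BASE ** ((rb - ra) / SCALE))  # expected score of A
    eb = 1 / (1 + BASE ** ((ra - rb) / SCALE))  # expected score of B
    return ra + K * (sa - ea), rb + K * ((1 - sa) - eb)

# model_a beats model_b once: both expectations are 0.5, so A gains 16 points.
ratings['model_a'], ratings['model_b'] = elo_update(ratings['model_a'],
                                                    ratings['model_b'], 1)
print(ratings)  # {'model_a': 1016.0, 'model_b': 984.0}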
opencompass/summarizers/subjective/__init__.py  new file (0 → 100644)

# flake8: noqa: F401, E501
from .alignmentbench import AlignmentBenchSummarizer
from .corev2 import Corev2Summarizer
from .creationbench import CreationBenchSummarizer
from .information_retrival import IRSummarizer
from .multiround import MultiroundSummarizer
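These five names are what the top-level wildcard import (`from .subjective import *`) now re-exports. As a purely hypothetical usage sketch, not taken from this diff, an evaluation config would reference a summarizer through the new subpackage like this:

# Hypothetical config fragment; the exact summarizer arguments depend on the benchmark.
from opencompass.summarizers.subjective import AlignmentBenchSummarizer

summarizer = dict(type=AlignmentBenchSummarizer)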
opencompass/summarizers/alignmentbench.py → opencompass/summarizers/subjective/alignmentbench.py
File moved
opencompass/summarizers/corev2.py → opencompass/summarizers/subjective/corev2.py

@@ -124,7 +124,7 @@ class Corev2Summarizer:
             print('There are no results for ' + filename + ' or ' + partial_filename)
             print('*' * 100)
-            assert len(result > 0)
+            assert len(result) > 0
             judged_answers = []
             references = []
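Moving the closing parenthesis here is a genuine bug fix: in the old line the comparison `result > 0` is evaluated before `len()`, and since `result` at this point is the dict-like mapping of judge outputs loaded from disk, comparing it with an int raises a TypeError in Python 3 instead of checking that any results exist. A minimal illustration; the sample dict below is made up:

result = {'0': {'prediction': '...', 'gold': '...'}}  # illustrative shape
assert len(result) > 0   # fixed form: passes for a non-empty mapping
# Old form: `len(result > 0)` first evaluates `result > 0`, which for a dict raises
# TypeError: '>' not supported between instances of 'dict' and 'int'.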
opencompass/summarizers/creationbench.py → opencompass/summarizers/subjective/creationbench.py
File moved

opencompass/summarizers/information_retrival.py → opencompass/summarizers/subjective/information_retrival.py
File moved

opencompass/summarizers/multiround.py → opencompass/summarizers/subjective/multiround.py
File moved

opencompass/summarizers/subjective_post_process.py → opencompass/summarizers/subjective/subjective_post_process.py
File moved

opencompass/summarizers/utils.py → opencompass/summarizers/subjective/utils.py
File moved