Unverified Commit b4afe3e7 authored by Fengzhe Zhou, committed by GitHub

[Sync] Add InternLM2 Keyset Evaluation Demo (#807)


Co-authored-by: zhangyifan1 <zhangyifan1@pjlab.org.cn>
parent acae5609
summarizer = dict(
dataset_abbrs = [
'--------- LongBench Single-Document QA ---------', # category
"LongBench_narrativeqa",
'LongBench_narrativeqa',
'LongBench_qasper',
'LongBench_multifieldqa_en',
"LongBench_multifieldqa_zh",
'LongBench_multifieldqa_zh',
'--------- LongBench Multi-Document QA ---------', # category
'LongBench_hotpotqa',
'LongBench_2wikimqa',
@@ -28,5 +28,5 @@ summarizer = dict(
'LongBench_lcc',
'LongBench_repobench-p',
],
- summary_groups=sum([v for k, v in locals().items() if k.endswith("_summary_groups")], []),
+ summary_groups=sum([v for k, v in locals().items() if k.endswith('_summary_groups')], []),
)
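As an aside on the `summary_groups` line above: OpenCompass configs define group variables whose names end in `_summary_groups`, and the `sum(..., [])` over `locals()` flattens all of them into one list. A minimal sketch of that aggregation; the group contents below are made-up placeholders, not part of this config:

```python
# Hypothetical *_summary_groups variables, placeholders only.
longbench_summary_groups = [
    {'name': 'LongBench_single_doc_qa',
     'subsets': ['LongBench_narrativeqa', 'LongBench_qasper']},
]
code_summary_groups = [
    {'name': 'LongBench_code', 'subsets': ['LongBench_lcc']},
]

# sum(list_of_lists, []) concatenates every *_summary_groups list found in
# the module namespace into a single flat list.
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
print(len(summary_groups))  # -> 2
```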
@@ -13,7 +13,9 @@ from .commonsenseqa import commonsenseqaDataset
from .hellaswag import hellaswagDataset_V2
from .mmlu import MMLUDataset
from .obqa import OBQADataset
from .piqa import piqaDataset_V2
from .race import RaceDataset
from .siqa import siqaDataset_V3
from .xiezhi import XiezhiDataset
@@ -273,6 +275,24 @@ class CircularXiezhiDataset(XiezhiDataset, metaclass=CircularDatasetMeta):
default_answer_key = 'answer'
class CircularsiqaDataset(siqaDataset_V3, metaclass=CircularDatasetMeta):
dataset_class = siqaDataset_V3
default_circular_splits = ['validation']
default_option_keys = ['A', 'B', 'C']
default_answer_key = 'answer'
class CircularpiqaDataset(piqaDataset_V2, metaclass=CircularDatasetMeta):
dataset_class = piqaDataset_V2
default_circular_splits = ['validation']
default_option_keys = ['sol1', 'sol2']
def default_answer_key_switch_method(item, circular_pattern):
circular_pattern = tuple(int(i[-1]) - 1 for i in circular_pattern)
item['answer'] = 'AB'[circular_pattern['AB'.index(item['answer'])]]
return item
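For a two-option dataset like PIQA, the switch method above remaps the stored answer letter when the circular shift swaps `sol1` and `sol2`. A standalone sketch of that remapping (not the class itself):

```python
def switch_answer(item, circular_pattern):
    # e.g. circular_pattern == ('sol2', 'sol1') after a circular shift
    positions = tuple(int(key[-1]) - 1 for key in circular_pattern)  # -> (1, 0)
    # when the two options are swapped, the gold letter flips as well
    item['answer'] = 'AB'[positions['AB'.index(item['answer'])]]
    return item

print(switch_answer({'answer': 'A'}, ('sol2', 'sol1')))  # {'answer': 'B'}
print(switch_answer({'answer': 'A'}, ('sol1', 'sol2')))  # {'answer': 'A'}
```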
class CircularEvaluator(BaseEvaluator):
"""This Evaluator assesses datasets post-Circular processing, generating
the following evaluation metrics:
@@ -378,6 +378,8 @@ class DS1000ServiceEvaluator(BaseEvaluator):
processed_predictions = {}
assert len(predictions) == len(references)
for i, (pred, gold) in enumerate(zip(predictions, references)):
if len(pred) > 10000:
pred = ''
processed_predictions[str(i)] = {'prediction': pred, 'gold': gold}
with tempfile.TemporaryDirectory() as tmp_dir:
@@ -155,6 +155,11 @@ def humaneval_postprocess(text: str) -> str:
def humaneval_postprocess_v2(text: str) -> str:
"""This is an advanced version of previous postprocess to handle more
situations, better to use this one."""
try:
# for chatGLM raw text
text = eval(text)
except Exception:
pass
text = text.lstrip('\n')
if '```' in text:
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
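The `try`/`eval` guard above handles models (the comment cites ChatGLM) whose raw reply is itself a quoted Python string literal: `eval` unwraps it, and any failure leaves ordinary text untouched. A rough standalone sketch of that unwrapping plus the fenced-block extraction, run on a made-up reply (not the full postprocessor):

````python
import re

def sketch_postprocess(text: str) -> str:
    try:
        text = eval(text)  # unwrap a reply that is a quoted string literal
    except Exception:
        pass
    text = text.lstrip('\n')
    if '```' in text:
        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
        text = blocks[0] if blocks else text.split('```')[1]
        if not text.startswith('\n'):  # drop a leading "python" tag
            text = text[max(text.find('\n') + 1, 0):]
    return text

raw_reply = "'```python\\n    return a + b\\n```'"  # string-literal-wrapped reply
print(sketch_postprocess(raw_reply))                # '    return a + b'
````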
@@ -173,11 +178,11 @@ def humaneval_postprocess_v2(text: str) -> str:
text = text.lstrip('\n')
if text.strip().startswith('def'):
text = '\n'.join(text.split('\n')[1:])
- if not text.startswith('    '):
- if text.startswith(' '):
- text = '    ' + text.lstrip()
- else:
- text = '\n'.join(['    ' + line for line in text.split('\n')])
+ # deal with the indentation error
+ if text.startswith(' '):
+ text = '    ' + text.lstrip()
+ else:
+ text = '\n'.join(['    ' + line for line in text.split('\n')])
text = text.split('\n')
# If number of leading space reduces, we assume that the code block ends.
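The replacement block above normalizes the completion's indentation so it can sit under the original `def` header: a partially indented first line is forced to four spaces, otherwise every line is indented by four spaces. A small illustration under that assumption:

```python
def normalize_indent(body: str) -> str:
    if body.startswith(' '):
        # partially indented first line: force a clean 4-space indent
        return '    ' + body.lstrip()
    # no indentation at all: indent every line by 4 spaces
    return '\n'.join(['    ' + line for line in body.split('\n')])

print(normalize_indent('return a + b'))    # '    return a + b'
print(normalize_indent('  return a + b'))  # '    return a + b'
```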
@@ -14,6 +14,7 @@ from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator
from .base import BaseDataset
from .humaneval import humaneval_postprocess_v2
_LANGUAGE_NAME_DICT = {
'cpp': 'CPP',
@@ -89,9 +90,11 @@ class HumanevalXEvaluator(BaseEvaluator):
def score(self, predictions, references):
predictions = [{
- 'task_id': f'{_LANGUAGE_NAME_DICT[self.language]}/{i}',
- 'generation': _clean_up_code(pred, self.language),
- } for i, pred in enumerate(predictions)]
+ 'task_id':
+ f'{_LANGUAGE_NAME_DICT[self.language]}/{i}',
+ 'generation':
+ _clean_up_code(pred, self.language, refer),
+ } for i, (pred, refer) in enumerate(zip(predictions, references))]
with tempfile.TemporaryDirectory() as tmp_dir:
tmp_out_path = osp.join(tmp_dir,
f'humanevalx_{self.language}.json')
@@ -161,15 +164,28 @@ class HumanevalXEvaluator(BaseEvaluator):
return False, err
- def _clean_up_code(text: str, language_type: str) -> str:
+ def _clean_up_code(text: str, language_type: str, reference) -> str:
"""Cleans up the generated code."""
try:
# for chatGLM related text
text = eval(text)
except Exception:
pass
# extract code from code block
text = text.lstrip('\n')
if '```' in text:
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
if len(blocks) == 0:
text = text.split('```')[1] # fall back to default strategy
else:
text = blocks[0] # fetch the first code block
if not text.startswith('\n'): # in case starting with ```xxx
text = text[max(text.find('\n') + 1, 0):]
if language_type.lower() == 'python':
text = humaneval_postprocess_v2(text)
# we need to take care of the first line
# append extra space for first line for correct indentation
for c_index, c in enumerate(text[:5]):
if c != ' ':
text = ' ' * (4 - c_index) + text
break
text = '    ' + text.lstrip()
text_splits = text.split('\n')
is_empty_line = False
@@ -189,7 +205,13 @@ def _clean_up_code(text: str, language_type: str) -> str:
for w in end_words:
if w in text:
text = text[:text.rfind(w)]
- elif language_type.lower() == 'java':
+ # strip the function head for all other languages
+ func_name = reference.strip().split('\n')[-1]
+ if func_name:
+ func_name = func_name.strip().strip('{')
+ if func_name in text:
+ text = '\n'.join(text[text.find(func_name):].split('\n')[1:])
+ if language_type.lower() == 'java':
main_pos = text.find('public static void main')
if main_pos != -1:
text = text[:main_pos] + '}'
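The new `reference` argument lets `_clean_up_code` assume that the last line of the reference prompt is the function signature; if the model repeats that signature, everything up to and including the duplicated line is dropped. A rough standalone sketch with made-up Java-style snippets (not the evaluator itself):

```python
def strip_function_head(text: str, reference: str) -> str:
    func_name = reference.strip().split('\n')[-1]  # last prompt line = signature
    if func_name:
        func_name = func_name.strip().strip('{')
        if func_name in text:
            # keep only what follows the repeated signature line
            text = '\n'.join(text[text.find(func_name):].split('\n')[1:])
    return text

reference = 'class Solution {\n    public int add(int a, int b) {'
completion = '    public int add(int a, int b) {\n        return a + b;\n    }'
print(strip_function_head(completion, reference))
# ->
#         return a + b;
#     }
```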
@@ -200,30 +200,28 @@ class MBPPEvaluator(BaseEvaluator):
def score(self, predictions, references):
assert len(predictions) == len(references)
- predictions = [self._process_answer(pred) for pred in predictions]
if self.metric == 'MBPP':
result = {'pass': 0, 'timeout': 0, 'failed': 0, 'wrong_answer': 0}
details = {}
- for index, (test_case,
- pred) in enumerate(zip(references, predictions)):
- programs = self._process_test(test_case, pred)
- try:
- # Add exec globals to prevent the exec to raise
- # unnecessary NameError for correct answer
- exec_globals = {}
- with swallow_io():
- with time_limit(2):
- exec(programs, exec_globals)
- r = 'pass'
- except TimeOutException:
- r = 'timeout'
- except AssertionError:
- r = 'wrong_answer'
- except BaseException:
- r = 'failed'
- result[r] += 1
- details[str(index)] = {'programs': programs, 'result': r}
+ # changed to a thread pool so blocked instances can be killed more reliably
+ with ThreadPoolExecutor() as executor:
+ futures = []
+ for i, (refer, pred) in enumerate(zip(references,
+ predictions)):
+ pred = self._process_answer(pred)
+ programs = self._process_test(refer, pred)
+ future = executor.submit(execution, programs, i, 3)
+ futures.append(future)
+ from tqdm import tqdm
+ for future in tqdm(as_completed(futures), total=len(futures)):
+ index, key = future.result()
+ result[key] += 1
+ details[str(index)] = {
+ 'programs': predictions[index],
+ 'result': key
+ }
result['score'] = result['pass'] / len(predictions) * 100
result['details'] = details
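For reference, the rewritten loop submits each candidate program to a `ThreadPoolExecutor` and collects verdicts with `as_completed`, so one hung execution no longer stalls the whole evaluation. A minimal self-contained sketch of the pattern; `run_case` below is a stand-in for the real `execution` helper and omits the timeout handling:

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_case(program: str, index: int, timeout: int):
    # stand-in for the real `execution` helper (timeout handling omitted)
    try:
        exec(program, {})
        return index, 'pass'
    except AssertionError:
        return index, 'wrong_answer'
    except BaseException:
        return index, 'failed'

programs = ['assert 1 + 1 == 2', 'assert 1 + 1 == 3', '1 / 0']
result = {'pass': 0, 'wrong_answer': 0, 'failed': 0}

with ThreadPoolExecutor() as executor:
    futures = [executor.submit(run_case, p, i, 3)
               for i, p in enumerate(programs)]
    for future in as_completed(futures):
        index, key = future.result()
        result[key] += 1

print(result)  # {'pass': 1, 'wrong_answer': 1, 'failed': 1}
```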
@@ -263,6 +261,20 @@ class MBPPEvaluator(BaseEvaluator):
return {f'mbpp_plus_{k}': score[k] * 100 for k in score}
def _process_answer(self, text):
try:
# for chatGLM related text
text = eval(text)
except Exception:
pass
# deal with code block
if '```' in text:
blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
if len(blocks) == 0:
text = text.split('```')[1] # fall back to default strategy
else:
text = blocks[0] # fetch the first code block
if not text.startswith('\n'): # in case starting with ```xxx
text = text[max(text.find('\n') + 1, 0):]
text = text.strip()
match = re.search(r"('\s*|)(\[DONE\]|DONE)", text)
if match:
@@ -275,6 +287,10 @@ class MBPPEvaluator(BaseEvaluator):
text = text[1:]
if text.endswith("'"):
text = text[:-1]
text = text.replace('\\', '')
match = re.search(r'```python(.*)```', text, re.DOTALL)
if match:
text = match.group(1).strip().split('```')[0].strip()
return text
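The final fallback added above targets replies where earlier cleanup left a ```python fence embedded in surrounding chatter. A hedged illustration on a made-up completion:

````python
import re

text = 'Sure! ```python\ndef add(a, b):\n    return a + b\n``` Hope this helps.'
match = re.search(r'```python(.*)```', text, re.DOTALL)
if match:
    text = match.group(1).strip().split('```')[0].strip()
print(text)
# def add(a, b):
#     return a + b
````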
def _process_test(self, test_case, pred):
@@ -78,3 +78,37 @@ class siqaDataset_V2(BaseDataset):
val_dataset = siqaDataset_V2.load_single(path, 'dev.jsonl',
'dev-labels.lst')
return DatasetDict({'train': train_dataset, 'validation': val_dataset})
@LOAD_DATASET.register_module()
class siqaDataset_V3(BaseDataset):
"""Disconnect from HuggingFace version of HFDataset."""
@staticmethod
def load_single(path, data_filename, label_filename):
data_path = os.path.join(path, data_filename)
label_path = os.path.join(path, label_filename)
dataset = []
with open(data_path, 'r', encoding='utf-8') as f:
data_lines = f.readlines()
with open(label_path, 'r', encoding='utf-8') as f:
label_lines = f.readlines()
assert len(data_lines) == len(label_lines)
for data, label in zip(data_lines, label_lines):
i = json.loads(data.strip())
i['A'] = i.pop('answerA')
i['B'] = i.pop('answerB')
i['C'] = i.pop('answerC')
i['answer'] = 'ABC'[int(label.strip()) - 1]
dataset.append(i)
return Dataset.from_list(dataset)
@staticmethod
def load(path):
train_dataset = siqaDataset_V3.load_single(path, 'train.jsonl',
'train-labels.lst')
val_dataset = siqaDataset_V3.load_single(path, 'dev.jsonl',
'dev-labels.lst')
return DatasetDict({'train': train_dataset, 'validation': val_dataset})
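In short, `load_single` pairs each JSON line from the data file with the matching line from the labels file, renames `answerA/B/C` to `A/B/C`, and turns the 1/2/3 label into a letter. One made-up record walked through the same steps:

```python
import json

# one hypothetical line from dev.jsonl and its matching line from dev-labels.lst
data_line = ('{"context": "Tracy skipped lunch.", "question": "Why?", '
             '"answerA": "was busy", "answerB": "was full", "answerC": "was away"}')
label_line = '1\n'

item = json.loads(data_line.strip())
item['A'] = item.pop('answerA')
item['B'] = item.pop('answerB')
item['C'] = item.pop('answerC')
item['answer'] = 'ABC'[int(label_line.strip()) - 1]
print(item['answer'])  # 'A'
```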
@@ -57,6 +57,8 @@ class IPythonInterpreter(BaseAction):
user_data_dir (str): Specifies the user data directory for file
loading. If set to `ENV`, the `USER_DATA_DIR` environment variable
is used. Defaults to `ENV`.
force_user_data (bool): Whether to require that a user data directory
is set. Defaults to True.
"""
_KERNEL_CLIENTS = {}
@@ -68,7 +70,8 @@
disable_description: Optional[str] = None,
timeout: int = 20,
trim_output: Optional[int] = 1024,
- user_data_dir: str = 'ENV') -> None:
+ user_data_dir: str = 'ENV',
+ force_user_data: bool = True) -> None:
super().__init__(description, name, enable, disable_description)
self.timeout = timeout
@@ -82,6 +85,11 @@
f'{user_data_dir} does not exist.'
user_data_dir = os.path.abspath(user_data_dir)
user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
else:
if force_user_data:
raise ValueError('user_data_dir is not set. Please '
'set force_user_data to False if '
'no extra data is needed.')
self.user_data_dir = user_data_dir
self._initialized = False
self.trim_output = trim_output
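In other words, the interpreter now refuses to start without a user data directory unless the caller opts out. A hedged usage sketch; the import path below is an assumption about where the class lives, adjust it to your checkout:

```python
# Import path is an assumption; adjust to your checkout.
from opencompass.lagent.actions.ipython_interpreter import IPythonInterpreter

# No extra data needed: opt out explicitly, otherwise construction raises
# ValueError when no user_data_dir can be resolved.
interpreter = IPythonInterpreter(user_data_dir='', force_user_data=False)

# With a directory, the kernel chdirs into it before running user code.
# interpreter = IPythonInterpreter(user_data_dir='/path/to/eval_files')
```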
@@ -225,6 +225,7 @@ class HuggingFace(BaseModel):
def generate(self,
inputs: List[str],
max_out_len: int,
min_out_len: Optional[int] = None,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Generate results given a list of inputs.
@@ -232,6 +233,7 @@ class HuggingFace(BaseModel):
Args:
inputs (List[str]): A list of strings.
max_out_len (int): The maximum length of the output.
min_out_len (Optional[int]): The minimum length of the output.
Returns:
List[str]: A list of generated strings.
@@ -241,12 +243,14 @@ class HuggingFace(BaseModel):
if self.batch_padding and len(inputs) > 1:
return self._batch_generate(inputs=inputs,
max_out_len=max_out_len,
min_out_len=min_out_len,
stopping_criteria=stopping_criteria,
**generation_kwargs)
else:
return sum(
(self._single_generate(inputs=[input_],
max_out_len=max_out_len,
min_out_len=min_out_len,
stopping_criteria=stopping_criteria,
**generation_kwargs)
for input_ in inputs), [])
@@ -254,6 +258,7 @@ class HuggingFace(BaseModel):
def _batch_generate(self,
inputs: List[str],
max_out_len: int,
min_out_len: Optional[int] = None,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Support for batch prompts inference.
@@ -308,6 +313,9 @@
])
kwargs['stopping_criteria'] = stopping_criteria
if min_out_len is not None:
kwargs['min_new_tokens'] = min_out_len
# step-2: conduct model forward to generate output
outputs = self.model.generate(**tokens,
max_new_tokens=max_out_len,
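Here `min_out_len` is passed through to HuggingFace `generate` as `min_new_tokens`, which keeps the model from stopping before that many new tokens are produced. A rough transformers-level sketch of the resulting call; the model name is a placeholder:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('gpt2')  # placeholder model
model = AutoModelForCausalLM.from_pretrained('gpt2')

tokens = tokenizer(['The capital of France is'], return_tensors='pt')
outputs = model.generate(**tokens,
                         max_new_tokens=32,  # upper bound, like max_out_len
                         min_new_tokens=4)   # lower bound, like min_out_len
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```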
@@ -331,6 +339,7 @@ class HuggingFace(BaseModel):
def _single_generate(self,
inputs: List[str],
max_out_len: int,
min_out_len: Optional[int] = None,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]:
"""Support for single prompt inference.
@@ -390,6 +399,9 @@
])
kwargs['stopping_criteria'] = stopping_criteria
if min_out_len is not None:
kwargs['min_new_tokens'] = min_out_len
# To accommodate the PeftModel, parameters should be passed in
# key-value format for generate.
outputs = self.model.generate(input_ids=input_ids,
@@ -502,7 +514,7 @@ class HuggingFace(BaseModel):
self.tokenizer.pad_token_id).sum(-1).cpu().numpy()
if mask_length is not None:
lens -= np.array(mask_length)
- ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
+ ce_loss = loss.float().sum(-1).cpu().detach().numpy() / lens
return ce_loss
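The added `.float()` matters when the model runs in half or bfloat16 precision: `Tensor.numpy()` cannot convert bfloat16, so the loss must be upcast before leaving torch. A small CPU-only demonstration of the failure mode:

```python
import torch

loss = torch.tensor([[0.5, 1.5], [2.0, 1.0]], dtype=torch.bfloat16)
lens = torch.tensor([2, 2])

try:
    loss.sum(-1).cpu().detach().numpy() / lens.numpy()
except TypeError as err:
    print('without .float():', err)  # numpy cannot represent bfloat16

ce_loss = loss.float().sum(-1).cpu().detach().numpy() / lens.numpy()
print('with .float():', ce_loss)  # [1.  1.5]
```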
def get_loglikelihood(
@@ -554,7 +566,6 @@ class HuggingFace(BaseModel):
input_ids = input_tokenizer_out['input_ids'][:, :self.max_seq_len]
input_length = input_tokenizer_out['length']
- attention_mask = input_tokenizer_out['attention_mask']
context_ids = [
self.tokenizer(inputs[i].replace(conts[i], ''),
padding=False,
@@ -563,7 +574,7 @@
for i in range(len(inputs))
]
# forward
- outputs = self.model(input_ids, attention_mask)['logits']
+ outputs = self.model(input_ids)['logits']
outputs = torch.nn.functional.log_softmax(outputs, dim=-1)
# calculate loglikelihood
answer = np.zeros(len(inputs))
@@ -609,9 +620,10 @@ class HuggingFace(BaseModel):
self.tokenizer.pad_token_id).sum(-1).cpu().numpy()
mink_percent = []
for nloss, nlen in zip(loss, lens):
- nlen = max(int(nlen) * k // 100, 1)
- nloss = torch.topk(loss, nlen, dim=-1)[0]
- nloss = -nloss.mean().cpu().detach().numpy()
+ nlen = int(nlen)
+ minklen = max(nlen * k // 100, 1)
+ nloss = torch.topk(loss[-nlen:], minklen, dim=-1)[0]
+ nloss = -nloss.float().mean().cpu().detach().numpy()
mink_percent.append(nloss)
return np.array(mink_percent)
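The reworked loop computes a Min-K%-style score per sample: restrict to that sample's valid tokens, take the k% largest losses (the least likely tokens), and average their negative value. A compact sketch with made-up numbers, padding handling omitted:

```python
import torch

# made-up per-token losses for one sample (higher = less likely token)
token_loss = torch.tensor([0.2, 3.1, 0.4, 2.7, 0.1, 0.3, 2.9, 0.5])
k = 25  # keep the 25% least likely tokens

nlen = token_loss.numel()
minklen = max(nlen * k // 100, 1)           # here: 2 tokens
worst = torch.topk(token_loss, minklen)[0]  # the largest losses
score = -worst.float().mean().item()
print(minklen, score)  # 2 -3.0
```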
......
@@ -29,6 +29,8 @@ class GenInferencer(BaseInferencer):
model (:obj:`BaseModelWrapper`, optional): The module to inference.
max_seq_len (:obj:`int`, optional): Maximum number of tokenized words
allowed by the LM.
min_out_len (:obj:`int`, optional): Minimum number of tokens to be
generated by the LM.
batch_size (:obj:`int`, optional): Batch size for the
:obj:`DataLoader`.
output_json_filepath (:obj:`str`, optional): File path for output
@@ -49,6 +51,7 @@ class GenInferencer(BaseInferencer):
max_out_len: int,
stopping_criteria: List[str] = [],
max_seq_len: Optional[int] = None,
min_out_len: Optional[int] = None,
batch_size: Optional[int] = 1,
gen_field_replace_token: Optional[str] = '',
output_json_filepath: Optional[str] = './icl_inference_output',
@@ -66,6 +69,7 @@ class GenInferencer(BaseInferencer):
self.gen_field_replace_token = gen_field_replace_token
self.max_out_len = max_out_len
self.min_out_len = min_out_len
self.stopping_criteria = stopping_criteria
if self.model.is_api and save_every is None:
@@ -135,6 +139,8 @@
sig = inspect.signature(self.model.generate)
if 'stopping_criteria' in sig.parameters:
extra_gen_kwargs['stopping_criteria'] = self.stopping_criteria
if 'min_out_len' in sig.parameters:
extra_gen_kwargs['min_out_len'] = self.min_out_len
with torch.no_grad():
parsed_entries = self.model.parse_template(entry, mode='gen')
results = self.model.generate_from_template(
@@ -116,7 +116,7 @@ class DLCRunner(BaseRunner):
' --worker_count 1'
f' --worker_cpu {max(num_gpus * 6, 8)}'
f' --worker_gpu {num_gpus}'
- f' --worker_memory {max(num_gpus * 32, 48)}'
+ f' --worker_memory {max(num_gpus * 64, 48)}'
f" --worker_image {self.aliyun_cfg['worker_image']}"
' --interactive')
get_cmd = partial(task.get_command,
@@ -61,6 +61,7 @@ class OpenICLInferTask(BaseTask):
for model_cfg, dataset_cfgs in zip(self.model_cfgs, self.dataset_cfgs):
self.max_out_len = model_cfg.get('max_out_len', None)
self.batch_size = model_cfg.get('batch_size', None)
self.min_out_len = model_cfg.get('min_out_len', None)
self.model = build_model_from_cfg(model_cfg)
for dataset_cfg in dataset_cfgs:
@@ -102,6 +103,8 @@
inferencer_cfg['model'] = self.model
self._set_default_value(inferencer_cfg, 'max_out_len',
self.max_out_len)
self._set_default_value(inferencer_cfg, 'min_out_len',
self.min_out_len)
self._set_default_value(inferencer_cfg, 'batch_size', self.batch_size)
inferencer_cfg['max_seq_len'] = self.model_cfg.get('max_seq_len')
inferencer = ICL_INFERENCERS.build(inferencer_cfg)
@@ -21,4 +21,5 @@ def build_model_from_cfg(model_cfg: ConfigDict):
model_cfg.pop('abbr', None)
model_cfg.pop('summarizer_abbr', None)
model_cfg.pop('pred_postprocessor', None)
model_cfg.pop('min_out_len', None)
return MODELS.build(model_cfg)
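Taken together, these changes let a model config carry `min_out_len`: `OpenICLInferTask` copies it into the inferencer config, `build_model_from_cfg` pops it so the model constructor never sees it, and `GenInferencer` forwards it to the model's `generate`. A hedged sketch of a model config using the new key; the path and abbr are placeholders and many usual fields are omitted:

```python
from opencompass.models import HuggingFace

models = [
    dict(
        type=HuggingFace,
        abbr='internlm2-7b-hf',        # placeholder abbr
        path='internlm/internlm2-7b',  # placeholder HF path
        max_seq_len=2048,
        max_out_len=100,
        min_out_len=1,                 # forwarded to min_new_tokens
        batch_size=8,
        run_cfg=dict(num_gpus=1),
    )
]
```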
@@ -5,7 +5,8 @@ from typing import Dict
from mmengine.config import Config, ConfigDict
from opencompass.openicl.icl_inferencer import (CLPInferencer, GenInferencer,
- PPLInferencer)
+ PPLInferencer,
+ PPLOnlyInferencer)
from opencompass.registry import ICL_PROMPT_TEMPLATES, ICL_RETRIEVERS
from opencompass.utils import (Menu, build_dataset_from_cfg,
build_model_from_cfg, dataset_abbr_from_cfg,
@@ -77,7 +78,8 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
ice_idx_list = retriever.retrieve()
- assert infer_cfg.inferencer.type in [PPLInferencer, GenInferencer], \
+ assert infer_cfg.inferencer.type in [
+ PPLInferencer, GenInferencer, CLPInferencer, PPLOnlyInferencer], \
'Only PPLInferencer and GenInferencer are supported'
for idx in range(min(count, len(ice_idx_list))):
@@ -127,7 +129,9 @@ def print_prompts(model_cfg, dataset_cfg, count=1):
print('-' * 100)
print(prompt)
print('-' * 100)
- elif infer_cfg.inferencer.type in [GenInferencer, CLPInferencer]:
+ elif infer_cfg.inferencer.type in [
+ GenInferencer, CLPInferencer, PPLOnlyInferencer
+ ]:
ice_idx = ice_idx_list[idx]
ice = retriever.generate_ice(ice_idx, ice_template=ice_template)
prompt = retriever.generate_prompt_for_generate_task(