Unverified commit e78857ac authored by Hubert, committed by GitHub

[Sync] minor test (#683)

parent dd4318f6
...@@ -29,5 +29,6 @@ models = [ ...@@ -29,5 +29,6 @@ models = [
batch_size=8, batch_size=8,
meta_template=_meta_template, meta_template=_meta_template,
run_cfg=dict(num_gpus=2, num_procs=1), run_cfg=dict(num_gpus=2, num_procs=1),
end_str='<eoa>',
) )
] ]
...@@ -29,5 +29,6 @@ models = [ ...@@ -29,5 +29,6 @@ models = [
batch_size=8, batch_size=8,
meta_template=_meta_template, meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
) )
] ]
...@@ -29,5 +29,6 @@ models = [ ...@@ -29,5 +29,6 @@ models = [
batch_size=8, batch_size=8,
meta_template=_meta_template, meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<eoa>',
) )
] ]
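The `end_str='<eoa>'` entries above (and the `'<|im_end|>'` entries in the Qwen configs below) tell the runner where a chat model's answer ends. A minimal sketch of how such a terminator is typically applied to decoded text, mirroring the `token.split(self.end_str)[0]` logic in the HuggingFace wrapper later in this commit; the helper name is illustrative only:

```python
# Illustrative helper (not part of the commit): cut a decoded generation at the
# model's end-of-answer marker, keeping only the text before it.
def trim_at_end_str(decoded: str, end_str: str = '<eoa>') -> str:
    return decoded.split(end_str)[0] if end_str else decoded

print(trim_at_end_str('The answer is 42.<eoa>stray continuation'))
# -> The answer is 42.
```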
...@@ -22,12 +22,14 @@ models = [ ...@@ -22,12 +22,14 @@ models = [
padding_side='left', padding_side='left',
truncation_side='left', truncation_side='left',
trust_remote_code=True, trust_remote_code=True,
use_fast=False,), use_fast=False,
),
pad_token_id=151643, pad_token_id=151643,
max_out_len=100, max_out_len=100,
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
meta_template=_meta_template, meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
) )
] ]
...@@ -22,12 +22,14 @@ models = [ ...@@ -22,12 +22,14 @@ models = [
padding_side='left', padding_side='left',
truncation_side='left', truncation_side='left',
trust_remote_code=True, trust_remote_code=True,
use_fast=False,), use_fast=False,
),
pad_token_id=151643, pad_token_id=151643,
max_out_len=100, max_out_len=100,
max_seq_len=2048, max_seq_len=2048,
batch_size=8, batch_size=8,
meta_template=_meta_template, meta_template=_meta_template,
run_cfg=dict(num_gpus=1, num_procs=1), run_cfg=dict(num_gpus=1, num_procs=1),
end_str='<|im_end|>',
) )
] ]
_cibench = ['Pandas', 'Matplotlib', 'Opencv', 'SciPy', 'Seaborn', 'PyTorch']
_cibench = ['cibench_' + i for i in _cibench]
cibench_summary_groups = [{'name': 'cibench', 'subsets': _cibench}]
mathbench_summary_groups = [
{
'name': 'mathbench-college',
'subsets': [
['mathbench-college-single_choice_cn', 'acc_1'],
['mathbench-college-cloze_en', 'accuracy'],
]
},
{
'name': 'mathbench-high',
'subsets': [
['mathbench-high-single_choice_cn', 'acc_1'],
['mathbench-high-single_choice_en', 'acc_1'],
]
},
{
'name': 'mathbench-middle',
'subsets': [
['mathbench-middle-single_choice_cn', 'acc_1'],
]
},
{
'name': 'mathbench-primary',
'subsets': [
['mathbench-primary-cloze_cn', 'accuracy'],
]
},
{
'name': 'mathbench',
'subsets': [
'mathbench-college',
'mathbench-high',
'mathbench-middle',
'mathbench-primary',
],
},
{
'name': 'mathbench-college-circular',
'subsets': [
['mathbench-college-single_choice_cn', 'perf_4'],
]
},
{
'name': 'mathbench-high-circular',
'subsets': [
['mathbench-high-single_choice_cn', 'perf_4'],
['mathbench-high-single_choice_en', 'perf_4'],
]
},
{
'name': 'mathbench-middle-circular',
'subsets': [
['mathbench-middle-single_choice_cn', 'perf_4'],
]
},
{
'name': 'mathbench-circular',
'subsets': [
'mathbench-college-circular',
'mathbench-high-circular',
'mathbench-middle-circular',
],
},
{
'name': 'mathbench-circular-and-cloze',
'subsets': [
'mathbench-high-circular',
'mathbench-middle-circular',
'mathbench-circular',
'mathbench-college-cloze_en',
'mathbench-primary-cloze_cn',
],
}
]
summarizer = dict(
dataset_abbrs=[
'######## GSM8K-Agent Accuracy ########', # category
['gsm8k-agent', 'follow_acc'],
['gsm8k-agent', 'reasoning_acc'],
['gsm8k-agent', 'code_acc'],
['gsm8k-agent', 'action_pct'],
'######## MATH-Agent Accuracy ########', # category
['math-agent', 'follow_acc'],
['math-agent', 'reasoning_acc'],
['math-agent', 'code_acc'],
['math-agent', 'action_pct'],
'######## MathBench-Agent Accuracy ########', # category
['mathbench-college-single_choice_cn-agent', 'acc_1'],
['mathbench-college-cloze_en-agent', 'accuracy'],
['mathbench-high-single_choice_cn-agent', 'acc_1'],
['mathbench-high-single_choice_en-agent', 'acc_1'],
['mathbench-middle-single_choice_cn-agent', 'acc_1'],
['mathbench-primary-cloze_cn-agent', 'accuracy'],
'######## MathBench-Agent CircularEval ########', # category
['mathbench-college-single_choice_cn-agent', 'perf_4'],
['mathbench-high-single_choice_cn-agent', 'perf_4'],
['mathbench-high-single_choice_en-agent', 'perf_4'],
['mathbench-middle-single_choice_cn-agent', 'perf_4'],
],
summary_groups=sum(
[v for k, v in locals().items() if k.endswith("_summary_groups")], [])
)
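The `summary_groups` line above gathers every `*_summary_groups` list defined earlier in the config module. A minimal standalone sketch of that collection idiom (the two sample groups are invented for illustration):

```python
# Two sample groups standing in for the real cibench/mathbench definitions.
cibench_summary_groups = [{'name': 'cibench', 'subsets': ['cibench_Pandas']}]
mathbench_summary_groups = [{'name': 'mathbench', 'subsets': ['mathbench-college']}]

# Same idiom as in the summarizer: concatenate every list whose name ends
# with `_summary_groups` into one list of group definitions.
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])
print([g['name'] for g in summary_groups])  # -> ['cibench', 'mathbench']
```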
...@@ -2,13 +2,15 @@ import json ...@@ -2,13 +2,15 @@ import json
import os import os
import os.path as osp import os.path as osp
import re import re
import subprocess
from collections import defaultdict
from typing import List, Optional from typing import List, Optional
import numpy as np import numpy as np
from datasets import Dataset from datasets import Dataset
from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import LOAD_DATASET from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from .base import BaseDataset from .base import BaseDataset
...@@ -18,16 +20,29 @@ def load_experiment(file: str) -> dict: ...@@ -18,16 +20,29 @@ def load_experiment(file: str) -> dict:
with open(file, 'r') as f: with open(file, 'r') as f:
notebook = json.load(f) notebook = json.load(f)
example = notebook['cells'] example = notebook['cells']
metadata = notebook['metadata']
modules = metadata.get('modules', [])
if modules:
# these two annotations should have the same length
assert len(modules) == len(metadata.get('step_types'))
# reformat annotations
modules = [[_m.strip() for _m in _modules.split('&')]
for _modules in modules]
questions = [] questions = []
source_codes = []
outputs = [] outputs = []
tags = [] tags = []
for cell in example: for cell in example:
if cell['cell_type'] == 'markdown': if cell['cell_type'] == 'markdown':
text = ''.join(cell['source']) text = ''.join(cell['source']).strip()
if modules:
_modules = modules.pop(0)
text += f"Please use {' and '.join(_modules)} modules."
text = text.strip() + '\n'
# append the formatted text # append the formatted text
questions.append(text) questions.append(text)
elif cell['cell_type'] == 'code': elif cell['cell_type'] == 'code':
source_codes.append(''.join(cell['source']))
if cell['outputs'] and 'data' in cell['outputs'][-1]: if cell['outputs'] and 'data' in cell['outputs'][-1]:
if 'image/png' in cell['outputs'][-1]['data']: if 'image/png' in cell['outputs'][-1]['data']:
# skip vis temporarily due to lack of evaluation # skip vis temporarily due to lack of evaluation
...@@ -39,15 +54,18 @@ def load_experiment(file: str) -> dict: ...@@ -39,15 +54,18 @@ def load_experiment(file: str) -> dict:
outputs.append(''.join( outputs.append(''.join(
cell['outputs'][-1]['data']['text/plain'])) cell['outputs'][-1]['data']['text/plain']))
else: else:
tags.append('executable') tags.append('exec')
outputs.append(None) outputs.append(None)
return dict( return dict(
experiment=file, experiment=file,
questions=sum(([ questions=sum(([
dict(role='user', content=question), dict(role='user', content=question),
dict(role='assistant', content=output) dict(role='assistant', content=source_code)
] for question, output in zip(questions, outputs)), []), ] for question, source_code in zip(questions, source_codes)), []),
references=dict(outputs=outputs, tags=tags, experiment=file), references=dict(outputs=outputs,
tags=tags,
metadata=metadata,
experiment=file),
) )
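For reference, a hypothetical minimal notebook that the updated `load_experiment` would accept, inferred from the code above: `metadata.modules` and `metadata.step_types` must have equal length, and each `modules` entry may join several libraries with `&`. The cell contents are invented for illustration:

```python
example_notebook = {
    'metadata': {
        # equal-length annotations; '&' joins multiple required modules
        'modules': ['pandas', 'matplotlib & numpy'],
        'step_types': ['num', 'vis'],
    },
    'cells': [
        {'cell_type': 'markdown', 'source': ['Load the CSV file.']},
        {'cell_type': 'code', 'source': ["df = pd.read_csv('a.csv')"], 'outputs': []},
        {'cell_type': 'markdown', 'source': ['Plot column x.']},
        {'cell_type': 'code', 'source': ["df['x'].plot()"], 'outputs': []},
    ],
}
# Each markdown cell becomes a user turn ending with
# "Please use <modules> modules."; each code cell becomes the assistant turn.
```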
...@@ -58,6 +76,7 @@ class CIBenchDataset(BaseDataset): ...@@ -58,6 +76,7 @@ class CIBenchDataset(BaseDataset):
@staticmethod @staticmethod
def load(path: str): def load(path: str):
"""Load whole dataset.""" """Load whole dataset."""
assert os.path.exists(path), f'Path {path} does not exist.'
data_list = [] data_list = []
for cwd, dirs, files in os.walk(path): for cwd, dirs, files in os.walk(path):
dirs.sort() dirs.sort()
...@@ -79,21 +98,57 @@ class CIBenchEvaluator(BaseEvaluator): ...@@ -79,21 +98,57 @@ class CIBenchEvaluator(BaseEvaluator):
"""Evaluator for CI dataset. """Evaluator for CI dataset.
Args: Args:
text_evaluator (optional, dict): The text evaluator for text result
comparison. Defaults to None, in which case Rouge is used.
Please note that an extra key `metric_name` should be set
to get the exact metric result, such as `rouge1`.
output_dir (optional, str): The directory to save experiment output_dir (optional, str): The directory to save experiment
files in a markdown or notebook format. files in a markdown or notebook format.
with_ipynb (bool): Whether to additionally convert the saved markdown into an executed ipynb.
Defaults to False.
user_data_dir (str): The directory to load local files. user_data_dir (str): The directory to load local files.
Defaults to 'ENV', which means use environment variable Defaults to 'ENV', which means use environment variable
`USER_DATA_DIR` to get the data dir. `USER_DATA_DIR` to get the data dir.
""" """
def __init__(self, def __init__(self,
text_evaluator: Optional[dict] = None,
output_dir: Optional[str] = None, output_dir: Optional[str] = None,
with_ipynb: bool = False,
user_data_dir: str = 'ENV') -> None: user_data_dir: str = 'ENV') -> None:
if text_evaluator is None:
from opencompass.openicl.icl_evaluator import RougeEvaluator
self.text_evaluator = ICL_EVALUATORS.build(
dict(type=RougeEvaluator))
self.text_eval_metric = 'rouge1'
else:
self.text_eval_metric = text_evaluator.pop('metric_name')
self.text_evaluator = ICL_EVALUATORS.build(text_evaluator)
# TODO: should use work dir for this task. # TODO: should use work dir for this task.
self.output_dir = output_dir self.output_dir = output_dir
self.user_data_dir = self.check_user_data_dir(user_data_dir)
self.with_ipynb = with_ipynb
self.TAG_MAPPING = {
'exec': ('executable', self.valid_step),
'general': ('general_correct', self.correct_step),
'num': ('numeric_correct', self.correct_step),
'text': ('text_score', self.text_step),
'vis': ('vis_sim', self.vis_similarity_step),
}
def check_user_data_dir(self, user_data_dir):
if user_data_dir == 'ENV': if user_data_dir == 'ENV':
user_data_dir = os.environ.get('USER_DATA_DIR', '') user_data_dir = os.environ.get('USER_DATA_DIR', '')
self.user_data_dir = user_data_dir user_data_dir = user_data_dir.rstrip('/')
basename = osp.basename(user_data_dir)
if basename and basename != 'data':
user_data_dir = osp.join(user_data_dir, 'data')
assert osp.exists(user_data_dir), \
f'a subfolder named `data` should exist under {user_data_dir}.'
elif basename:
assert osp.exists(user_data_dir), \
f'{user_data_dir} does not exist.'
return user_data_dir
@staticmethod @staticmethod
def valid_step(step): def valid_step(step):
...@@ -126,6 +181,24 @@ class CIBenchEvaluator(BaseEvaluator): ...@@ -126,6 +181,24 @@ class CIBenchEvaluator(BaseEvaluator):
# Fall back to False # Fall back to False
return False return False
def text_step(self, step, target):
"""Whether the step output is correct."""
# Found the latest code interpreter to determine correct
for action in step[::-1]:
if action['type'] == 'IPythonInterpreter':
if action['result']:
try:
pred = action['result']['text']
match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
if match:
out = match.group(1)
score = self.text_evaluator.score([out], [target])
return score[self.text_eval_metric] / 100
except Exception:
return False
# Fall back to False
return False
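A short sketch of the extraction step inside `text_step` above: the interpreter reply is expected to wrap its textual result in a fenced block, which is pulled out before being scored by the text evaluator (Rouge `rouge1` by default). The sample reply is invented:

```python
import re

pred = "Here is the result:\n```\nmean sepal length is 5.84\n```\n"
match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
if match:
    out = match.group(1)
    print(out)  # -> mean sepal length is 5.84
    # `out` would then be passed to self.text_evaluator.score([out], [target])
    # and the chosen metric divided by 100.
```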
@staticmethod @staticmethod
def vis_similarity_step(step, target): def vis_similarity_step(step, target):
"""Whether the step output image has the same structure similarity with """Whether the step output image has the same structure similarity with
...@@ -174,6 +247,7 @@ class CIBenchEvaluator(BaseEvaluator): ...@@ -174,6 +247,7 @@ class CIBenchEvaluator(BaseEvaluator):
'the conversion processes.') 'the conversion processes.')
check_jupytext() check_jupytext()
p_list = []
from opencompass.lagent.actions.ipython_interpreter import extract_code from opencompass.lagent.actions.ipython_interpreter import extract_code
for idx, (example_origin_prompt, for idx, (example_origin_prompt,
example_steps) in enumerate(zip(origin_prompt, steps)): example_steps) in enumerate(zip(origin_prompt, steps)):
...@@ -198,20 +272,25 @@ class CIBenchEvaluator(BaseEvaluator): ...@@ -198,20 +272,25 @@ class CIBenchEvaluator(BaseEvaluator):
f.writelines(markdown_lines) f.writelines(markdown_lines)
# TODO: be careful for this # TODO: be careful for this
# The result might differ from the inference process,
# so please check it carefully.
# convert markdown to ipynb and execute with error tolerance # convert markdown to ipynb and execute with error tolerance
# subprocess.Popen( if self.with_ipynb:
# "jupytext --to ipynb --pipe-fmt ipynb " p = subprocess.Popen(
# "--pipe 'jupyter nbconvert --to ipynb --execute " 'jupytext --to ipynb --pipe-fmt ipynb '
# f"--allow-errors --stdin --stdout' {md_file}", "--pipe 'jupyter nbconvert --to ipynb --execute "
# shell=True) f"--allow-errors --stdin --stdout' {md_file}",
shell=True)
p_list.append(p)
# TODO: async wait
for p in p_list:
p.wait()
def set_data_dir(self, work_dir): def set_data_dir(self, work_dir):
"""Set work directory and link data files for save notebook results.""" """Set work directory and link data files for save notebook results."""
if self.user_data_dir: if self.user_data_dir:
if self.user_data_dir.endswith('/'): basename = osp.basename(self.user_data_dir)
basename = osp.basename(osp.split(self.user_data_dir)[0])
else:
basename = osp.basename(self.user_data_dir)
if not osp.exists(osp.join(self.output_dir, basename)): if not osp.exists(osp.join(self.output_dir, basename)):
os.symlink(self.user_data_dir, os.symlink(self.user_data_dir,
osp.join(self.output_dir, basename)) osp.join(self.output_dir, basename))
...@@ -221,10 +300,54 @@ class CIBenchEvaluator(BaseEvaluator): ...@@ -221,10 +300,54 @@ class CIBenchEvaluator(BaseEvaluator):
"""Change work directory and keep the symlink.""" """Change work directory and keep the symlink."""
os.chdir(work_dir) os.chdir(work_dir)
def single_exp(self, gold, steps):
tags = gold['tags']
outputs = gold['outputs']
metadata = gold['metadata']
hard_tags = metadata.get('step_types', [])
if hard_tags:
tags = hard_tags
# executable: exec succeed
# general_correct: general correct
# numeric_correct: numerical correct
# text_score: text score
# vis_sim: visual similarity
result = defaultdict(list)
for tag, step, output in zip(tags, steps, outputs):
# check whether this step is valid
result['executable'].append(self.valid_step(step))
if tag != 'exec':
key, func = self.TAG_MAPPING[tag]
result[key].append(func(step, output))
# add missing metrics for better analysis if they are absent
if hard_tags:
check_tags = ['exec', 'num', 'text', 'vis']
else:
check_tags = ['exec', 'general', 'vis']
for tag in check_tags:
key = self.TAG_MAPPING[tag][0]
if key not in result:
result[key] = []
return result
def get_output_dir(self):
"""Get output dir from eval task.
Notice: the output dir should be in the format xxx/data.
All the needed data files should be placed under it.
"""
# hard hack for get output dir from eval task
if hasattr(self, '_out_dir') and self.output_dir is None:
self.output_dir = self._out_dir
def score(self, predictions: List, references: List, steps: List, def score(self, predictions: List, references: List, steps: List,
origin_prompt: List): origin_prompt: List):
"""Calculate accuracy.""" """Calculate accuracy."""
cwd = os.getcwd() cwd = os.getcwd()
self.get_output_dir()
if self.output_dir: if self.output_dir:
if not osp.exists(self.output_dir): if not osp.exists(self.output_dir):
os.makedirs(self.output_dir) os.makedirs(self.output_dir)
...@@ -232,56 +355,20 @@ class CIBenchEvaluator(BaseEvaluator): ...@@ -232,56 +355,20 @@ class CIBenchEvaluator(BaseEvaluator):
self.save_results(origin_prompt, steps) self.save_results(origin_prompt, steps)
self.unset_data_dir(cwd) self.unset_data_dir(cwd)
num_cells_list = [] total_results = defaultdict(float)
num_general_list = [] total_scores = defaultdict(float)
passed_list = [] total_nums = defaultdict(int)
correct_list = []
vis_list = []
for gold, single_steps in zip(references, steps): for gold, single_steps in zip(references, steps):
tags = gold['tags'] result = self.single_exp(gold, single_steps)
outputs = gold['outputs']
num_cells = len(tags)
num_general = sum([tag == 'general' for tag in tags])
passed = sum([self.valid_step(step) for step in single_steps])
correct = 0
vis_sim = []
for tag, step, output in zip(tags, single_steps, outputs):
if tag == 'general':
correct += self.correct_step(step, output)
elif tag == 'vis':
vis_sim.append(self.vis_similarity_step(step, output))
num_cells_list.append(num_cells)
num_general_list.append(num_general)
passed_list.append(passed)
correct_list.append(correct)
if vis_sim:
vis_list.append(sum(vis_sim) / len(vis_sim))
else:
vis_list.append(-1)
if len([v for v in vis_list if v >= 0]) > 0: for k, v in result.items():
visualize_similarity = sum([v for v in vis_list if v >= 0]) / len( total_scores[k] += sum(v)
[v for v in vis_list if v >= 0]) total_nums[k] += len(v)
else:
# not valid
visualize_similarity = -1
if sum(num_general_list) > 0: for k, v in total_scores.items():
general_accuracy = sum(correct_list) / sum(num_general_list) if total_nums[k] > 0:
else: total_results[k] = total_scores[k] / total_nums[k] * 100
# not valid else:
general_accuracy = -1 total_results[k] = -1
result = dict( return total_results
executable_rate=sum(passed_list) / sum(num_cells_list) * 100,
general_accuracy=general_accuracy * 100,
visualize_similarity=visualize_similarity * 100,
num_cells_list=num_cells_list,
num_general_list=num_general_list,
passed_list=passed_list,
correct_list=correct_list,
vis_list=vis_list,
)
return result
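A compact sketch of the new aggregation in `score`: per-tag score lists from every experiment are pooled, converted to percentages, and metrics with no samples fall back to -1. The two result dicts below are invented stand-ins for `single_exp` outputs:

```python
from collections import defaultdict

total_scores = defaultdict(float)
total_nums = defaultdict(int)
for result in [
        {'executable': [True, True, False], 'numeric_correct': [True]},
        {'executable': [True], 'numeric_correct': [], 'vis_sim': []},
]:
    for k, v in result.items():
        total_scores[k] += sum(v)
        total_nums[k] += len(v)

total_results = {
    k: total_scores[k] / total_nums[k] * 100 if total_nums[k] > 0 else -1
    for k in total_scores
}
print(total_results)
# -> {'executable': 75.0, 'numeric_correct': 100.0, 'vis_sim': -1}
```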
...@@ -16,6 +16,8 @@ class cmnliDataset(BaseDataset): ...@@ -16,6 +16,8 @@ class cmnliDataset(BaseDataset):
with open(path, 'r', encoding='utf-8') as f: with open(path, 'r', encoding='utf-8') as f:
for line in f: for line in f:
line = json.loads(line) line = json.loads(line)
if line['label'] == '-':
continue
data.append(line) data.append(line)
return Dataset.from_list(data) return Dataset.from_list(data)
......
...@@ -143,6 +143,17 @@ def ds1000_postprocess(text: str) -> str: ...@@ -143,6 +143,17 @@ def ds1000_postprocess(text: str) -> str:
return text return text
@TEXT_POSTPROCESSORS.register_module('ds1000_completion')
def ds1000_completion_postprocess(text: str) -> str:
text += '</code>'
match = re.search('(.*?)</code>', text, re.DOTALL)
if match:
text = match.group(1)
return text
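A worked example of the new `ds1000_completion_postprocess` above: appending `</code>` guarantees the regex always finds a terminator, so only the code before the first `</code>` survives, and completions without the tag pass through unchanged:

```python
import re

def ds1000_completion_postprocess(text: str) -> str:
    text += '</code>'
    match = re.search('(.*?)</code>', text, re.DOTALL)
    if match:
        text = match.group(1)
    return text

print(repr(ds1000_completion_postprocess('result = df.mean()\n</code>\nEND')))
# -> 'result = df.mean()\n'
print(repr(ds1000_completion_postprocess('result = df.mean()')))  # no tag at all
# -> 'result = df.mean()'
```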
@TEXT_POSTPROCESSORS.register_module('ds1000_matplotlib') @TEXT_POSTPROCESSORS.register_module('ds1000_matplotlib')
def ds1000_matplotlib_postprocess(text: str) -> str: def ds1000_matplotlib_postprocess(text: str) -> str:
text = ds1000_postprocess(text) text = ds1000_postprocess(text)
......
...@@ -142,6 +142,6 @@ class Gsm8kAgentEvaluator(BaseEvaluator): ...@@ -142,6 +142,6 @@ class Gsm8kAgentEvaluator(BaseEvaluator):
reasoning_acc=100 * reasoning_acc=100 *
(reasoning_scope + final_scope + row_reasoning_scope) / total, (reasoning_scope + final_scope + row_reasoning_scope) / total,
code_acc=100 * (code_scope + final_scope) / total, code_acc=100 * (code_scope + final_scope) / total,
action_acc=100 * (action_scope + final_scope) / total, action_pct=100 * (action_scope + final_scope) / total,
) )
return result return result
...@@ -25,7 +25,7 @@ class WikiBenchDataset(BaseDataset): ...@@ -25,7 +25,7 @@ class WikiBenchDataset(BaseDataset):
circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC'] circular_patterns = ['ABCD', 'BCDA', 'CDAB', 'DABC']
data = [] data = []
with open(path, 'r') as infile: with open(path, 'r', encoding='utf-8') as infile:
for id, line in enumerate(infile): for id, line in enumerate(infile):
entry = json.loads(line) entry = json.loads(line)
if 'cloze' in name: if 'cloze' in name:
......
...@@ -20,14 +20,14 @@ class winograndeDataset(BaseDataset): ...@@ -20,14 +20,14 @@ class winograndeDataset(BaseDataset):
for line in f: for line in f:
line = json.loads(line) line = json.loads(line)
prompt = line['sentence'] prompt = line['sentence']
dataset_list.append({ continue_prompt = prompt.split('_')
'opt1': data_item = {
prompt.replace('_', line['option1']), 'opt1': prompt.replace('_', line['option1']),
'opt2': 'opt2': prompt.replace('_', line['option2']),
prompt.replace('_', line['option2']), 'answer': line['answer'],
'answer': 'cont': continue_prompt[1]
line['answer'] }
}) dataset_list.append(data_item)
dataset_list = Dataset.from_list(dataset_list) dataset_list = Dataset.from_list(dataset_list)
return dataset_list return dataset_list
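The refactored loader above also records a new `cont` field: the slice of the Winogrande sentence after the `_` blank. A tiny worked example (the sentence is a standard Winogrande-style item, not taken from the dataset file):

```python
prompt = 'The trophy does not fit in the suitcase because _ is too big.'
continue_prompt = prompt.split('_')
print(repr(continue_prompt[1]))            # -> ' is too big.'
print(prompt.replace('_', 'the trophy'))   # opt1-style full sentence
```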
...@@ -46,13 +46,11 @@ class winograndeDataset_V2(BaseDataset): ...@@ -46,13 +46,11 @@ class winograndeDataset_V2(BaseDataset):
prompt = line['sentence'] prompt = line['sentence']
answer = line['answer'] answer = line['answer']
answer = ' AB'[int(answer)] if answer != '' else 'NULL' answer = ' AB'[int(answer)] if answer != '' else 'NULL'
dataset_list.append({ data_item = {
'opt1': 'opt1': prompt.replace('_', line['option1']),
prompt.replace('_', line['option1']), 'opt2': prompt.replace('_', line['option2']),
'opt2': 'answer': answer,
prompt.replace('_', line['option2']), }
'answer': dataset_list.append(data_item)
answer
})
dataset_list = Dataset.from_list(dataset_list) dataset_list = Dataset.from_list(dataset_list)
return dataset_list return dataset_list
...@@ -47,6 +47,10 @@ class IPythonInterpreter(BaseAction): ...@@ -47,6 +47,10 @@ class IPythonInterpreter(BaseAction):
it is disabled. Defaults to None. it is disabled. Defaults to None.
timeout (int): Upper bound of waiting time for Python script execution. timeout (int): Upper bound of waiting time for Python script execution.
Defaults to 20. Defaults to 20.
trim_output (int, optional): Maximum number of characters kept in the
ipython output. If None, no trimming is performed.
TODO: Notice that this is a character limit, not a token length; other
trim strategies might be added later. Defaults to 1024.
user_data_dir (str): Specified the user data directory for files user_data_dir (str): Specified the user data directory for files
loading. If set to `ENV`, use `USER_DATA_DIR` environment variable. loading. If set to `ENV`, use `USER_DATA_DIR` environment variable.
Defaults to `ENV`. Defaults to `ENV`.
...@@ -60,6 +64,7 @@ class IPythonInterpreter(BaseAction): ...@@ -60,6 +64,7 @@ class IPythonInterpreter(BaseAction):
enable: bool = True, enable: bool = True,
disable_description: Optional[str] = None, disable_description: Optional[str] = None,
timeout: int = 20, timeout: int = 20,
trim_output: Optional[int] = 1024,
user_data_dir: str = 'ENV') -> None: user_data_dir: str = 'ENV') -> None:
super().__init__(description, name, enable, disable_description) super().__init__(description, name, enable, disable_description)
...@@ -68,10 +73,11 @@ class IPythonInterpreter(BaseAction): ...@@ -68,10 +73,11 @@ class IPythonInterpreter(BaseAction):
user_data_dir = os.environ.get('USER_DATA_DIR', '') user_data_dir = os.environ.get('USER_DATA_DIR', '')
if user_data_dir: if user_data_dir:
user_data_dir = os.path.dirname(user_data_dir) # user_data_dir = os.path.dirname(user_data_dir)
user_data_dir = f"import os\nos.chdir('{user_data_dir}')" user_data_dir = f"import os\nos.chdir('{user_data_dir}')"
self.user_data_dir = user_data_dir self.user_data_dir = user_data_dir
self._initialized = False self._initialized = False
self.trim_output = trim_output
if not os.path.exists(WORK_DIR): if not os.path.exists(WORK_DIR):
os.mkdir(WORK_DIR) os.mkdir(WORK_DIR)
...@@ -178,6 +184,12 @@ class IPythonInterpreter(BaseAction): ...@@ -178,6 +184,12 @@ class IPythonInterpreter(BaseAction):
if image: if image:
result += f'\n\n{image}' result += f'\n\n{image}'
if finished: if finished:
# in case output text too long
# might need better design later
if self.trim_output and len(result) > self.trim_output:
ellip = '......'
half_len = int((self.trim_output - len(ellip)) / 2)
result = result[:half_len] + ellip + result[-half_len:]
return succeed, result return succeed, result
try: try:
...@@ -204,13 +216,20 @@ class IPythonInterpreter(BaseAction): ...@@ -204,13 +216,20 @@ class IPythonInterpreter(BaseAction):
command: str, command: str,
timeout: Optional[int] = None) -> ActionReturn: timeout: Optional[int] = None) -> ActionReturn:
tool_return = ActionReturn(url=None, args=None, type=self.name) tool_return = ActionReturn(url=None, args=None, type=self.name)
tool_return.args = dict(text=command) extracted_command = extract_code(command)
succeed, result = self._call(command, timeout) tool_return.args = dict(text=command, extract_code=extracted_command)
if succeed: if extracted_command:
tool_return.result = dict(text=result) succeed, result = self._call(extracted_command, timeout)
tool_return.state = ActionStatusCode.SUCCESS if succeed:
if not result:
result = 'The code ran successfully without any output.'
tool_return.result = dict(text=result)
tool_return.state = ActionStatusCode.SUCCESS
else:
tool_return.errmsg = repr(result)
tool_return.state = ActionStatusCode.API_ERROR
else: else:
tool_return.errmsg = repr(result) tool_return.errmsg = 'The input code is empty. Please follow the format.' # noqa
tool_return.state = ActionStatusCode.API_ERROR tool_return.state = ActionStatusCode.API_ERROR
return tool_return return tool_return
......
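A minimal sketch of the new `trim_output` behaviour added to `IPythonInterpreter` above: an over-long interpreter result keeps only head and tail halves around a `......` ellipsis (a character limit, not a token limit). The helper wrapper is illustrative only:

```python
def trim_result(result: str, trim_output: int = 1024) -> str:
    # mirror the truncation applied before a finished result is returned
    if trim_output and len(result) > trim_output:
        ellip = '......'
        half_len = int((trim_output - len(ellip)) / 2)
        result = result[:half_len] + ellip + result[-half_len:]
    return result

print(len(trim_result('x' * 3000)))  # -> 1024 (509 + 6 + 509 characters)
```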
...@@ -115,6 +115,20 @@ class BaseModel: ...@@ -115,6 +115,20 @@ class BaseModel:
inputs = self.parse_template(templates, mode='ppl') inputs = self.parse_template(templates, mode='ppl')
return self.get_ppl(inputs, mask_length) return self.get_ppl(inputs, mask_length)
def get_loglikelihood_from_template(self,
templates: List[PromptType],
conts: List[str],
mask_length=None):
"""Get perplexity given a list of templates.
Args:
templates (List[PromptType]): A list of templates.
mask_length (List[int]): A list of mask lengths. If provided, the
perplexity will be calculated only on the unmasked tokens.
"""
inputs = self.parse_template(templates, mode='ppl')
return self.get_loglikelihood(inputs, conts, mask_length)
def generate_from_template(self, templates: List[PromptType], def generate_from_template(self, templates: List[PromptType],
max_out_len: int, **kwargs): max_out_len: int, **kwargs):
"""Generate completion from a list of templates. """Generate completion from a list of templates.
......
import re import re
import sys import sys
import threading import threading
import time
import warnings import warnings
from abc import abstractmethod from abc import abstractmethod
from copy import deepcopy from copy import deepcopy
from queue import Queue
from time import sleep from time import sleep
from typing import Dict, List, Optional, Tuple, Union from typing import Dict, List, Optional, Tuple, Union
...@@ -37,6 +39,7 @@ class BaseAPIModel(BaseModel): ...@@ -37,6 +39,7 @@ class BaseAPIModel(BaseModel):
def __init__(self, def __init__(self,
path: str, path: str,
query_per_second: int = 1, query_per_second: int = 1,
rpm_verbose: bool = False,
retry: int = 2, retry: int = 2,
max_seq_len: int = 2048, max_seq_len: int = 2048,
meta_template: Optional[Dict] = None, meta_template: Optional[Dict] = None,
...@@ -46,7 +49,7 @@ class BaseAPIModel(BaseModel): ...@@ -46,7 +49,7 @@ class BaseAPIModel(BaseModel):
self.meta_template = meta_template self.meta_template = meta_template
self.retry = retry self.retry = retry
self.query_per_second = query_per_second self.query_per_second = query_per_second
self.token_bucket = TokenBucket(query_per_second) self.token_bucket = TokenBucket(query_per_second, rpm_verbose)
self.template_parser = APITemplateParser(meta_template) self.template_parser = APITemplateParser(meta_template)
self.logger = get_logger() self.logger = get_logger()
self.generation_kwargs = generation_kwargs self.generation_kwargs = generation_kwargs
...@@ -422,10 +425,13 @@ class TokenBucket: ...@@ -422,10 +425,13 @@ class TokenBucket:
query_per_second (float): The rate of the token bucket. query_per_second (float): The rate of the token bucket.
""" """
def __init__(self, rate): def __init__(self, rate, verbose=False):
self._rate = rate self._rate = rate
self._tokens = threading.Semaphore(0) self._tokens = threading.Semaphore(0)
self.started = False self.started = False
self._request_queue = Queue()
self.logger = get_logger()
self.verbose = verbose
def _add_tokens(self): def _add_tokens(self):
"""Add tokens to the bucket.""" """Add tokens to the bucket."""
...@@ -440,3 +446,12 @@ class TokenBucket: ...@@ -440,3 +446,12 @@ class TokenBucket:
self.started = True self.started = True
threading.Thread(target=self._add_tokens, daemon=True).start() threading.Thread(target=self._add_tokens, daemon=True).start()
self._tokens.acquire() self._tokens.acquire()
if self.verbose:
cur_time = time.time()
while not self._request_queue.empty():
if cur_time - self._request_queue.queue[0] > 60:
self._request_queue.get()
else:
break
self._request_queue.put(cur_time)
self.logger.info(f'Current RPM {self._request_queue.qsize()}.')
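A standalone sketch of the RPM bookkeeping that `rpm_verbose` enables in `TokenBucket.get_token` above: timestamps older than 60 seconds are evicted before the new request is recorded, so the queue size is the requests-per-minute figure that gets logged. The function wrapper is illustrative:

```python
import time
from queue import Queue

request_queue = Queue()

def record_and_count(now: float) -> int:
    # drop timestamps outside the 60-second window, then record this request
    while not request_queue.empty():
        if now - request_queue.queue[0] > 60:
            request_queue.get()
        else:
            break
    request_queue.put(now)
    return request_queue.qsize()

for _ in range(3):
    print('Current RPM', record_and_count(time.time()))  # prints 1, 2, 3
```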
...@@ -3,6 +3,7 @@ from typing import Dict, List, Optional, Union ...@@ -3,6 +3,7 @@ from typing import Dict, List, Optional, Union
import numpy as np import numpy as np
import torch import torch
import transformers
from opencompass.models.base import BaseModel from opencompass.models.base import BaseModel
from opencompass.models.base_api import APITemplateParser from opencompass.models.base_api import APITemplateParser
...@@ -13,6 +14,33 @@ from opencompass.utils.prompt import PromptList ...@@ -13,6 +14,33 @@ from opencompass.utils.prompt import PromptList
PromptType = Union[PromptList, str] PromptType = Union[PromptList, str]
class MultiTokenEOSCriteria(transformers.StoppingCriteria):
"""Criteria to stop on the specified multi-token sequence."""
def __init__(
self,
sequence: str,
tokenizer: transformers.PreTrainedTokenizer,
batch_size: int,
):
self.done_tracker = [False] * batch_size
self.sequence = sequence
self.sequence_ids = tokenizer.encode(sequence,
add_special_tokens=False)
self.sequence_id_len = len(self.sequence_ids)
self.tokenizer = tokenizer
def __call__(self, input_ids, scores, **kwargs) -> bool:
# compare the last len(stop) tokens
lookback_ids_batch = input_ids[:, -self.sequence_id_len:]
lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
for i, done in enumerate(self.done_tracker):
if done:
continue
self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
return False not in self.done_tracker
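A hedged usage sketch of `MultiTokenEOSCriteria` with a plain `transformers` `generate()` call, which is what the wrapper's `_single_generate` does further down in this diff. `'gpt2'` and the stop words are placeholders, and the class definition from the hunk above is assumed to be in scope:

```python
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
model = transformers.AutoModelForCausalLM.from_pretrained('gpt2')

input_ids = tokenizer('Question: 1 + 1 = ?\nAnswer:',
                      return_tensors='pt').input_ids
stop_words = ['\nQuestion:', tokenizer.eos_token]
stopping_criteria = transformers.StoppingCriteriaList([
    MultiTokenEOSCriteria(seq, tokenizer, input_ids.shape[0])
    for seq in stop_words
])
outputs = model.generate(input_ids=input_ids,
                         max_new_tokens=32,
                         stopping_criteria=stopping_criteria)
print(tokenizer.decode(outputs[0][input_ids.shape[1]:]))
```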
@MODELS.register_module() @MODELS.register_module()
class HuggingFace(BaseModel): class HuggingFace(BaseModel):
"""Model wrapper around HuggingFace models. """Model wrapper around HuggingFace models.
...@@ -194,7 +222,10 @@ class HuggingFace(BaseModel): ...@@ -194,7 +222,10 @@ class HuggingFace(BaseModel):
self.model.config.eos_token_id = 2 self.model.config.eos_token_id = 2
self.model.config.pad_token_id = self.tokenizer.pad_token_id self.model.config.pad_token_id = self.tokenizer.pad_token_id
def generate(self, inputs: List[str], max_out_len: int, def generate(self,
inputs: List[str],
max_out_len: int,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]: **kwargs) -> List[str]:
"""Generate results given a list of inputs. """Generate results given a list of inputs.
...@@ -212,9 +243,12 @@ class HuggingFace(BaseModel): ...@@ -212,9 +243,12 @@ class HuggingFace(BaseModel):
max_out_len=max_out_len, max_out_len=max_out_len,
**generation_kwargs) **generation_kwargs)
else: else:
return sum((self._single_generate( return sum(
inputs=[input_], max_out_len=max_out_len, **generation_kwargs) (self._single_generate(inputs=[input_],
for input_ in inputs), []) max_out_len=max_out_len,
stopping_criteria=stopping_criteria,
**generation_kwargs)
for input_ in inputs), [])
def _batch_generate(self, inputs: List[str], max_out_len: int, def _batch_generate(self, inputs: List[str], max_out_len: int,
**kwargs) -> List[str]: **kwargs) -> List[str]:
...@@ -275,7 +309,10 @@ class HuggingFace(BaseModel): ...@@ -275,7 +309,10 @@ class HuggingFace(BaseModel):
decodeds = [token.split(self.end_str)[0] for token in decodeds] decodeds = [token.split(self.end_str)[0] for token in decodeds]
return decodeds return decodeds
def _single_generate(self, inputs: List[str], max_out_len: int, def _single_generate(self,
inputs: List[str],
max_out_len: int,
stopping_criteria: List[str] = [],
**kwargs) -> List[str]: **kwargs) -> List[str]:
"""Support for single prompt inference. """Support for single prompt inference.
...@@ -319,6 +356,19 @@ class HuggingFace(BaseModel): ...@@ -319,6 +356,19 @@ class HuggingFace(BaseModel):
max_length=self.max_seq_len - max_length=self.max_seq_len -
max_out_len)['input_ids'] max_out_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device) input_ids = torch.tensor(input_ids, device=self.model.device)
if stopping_criteria:
# Construct huggingface stopping criteria
stopping_criteria = stopping_criteria + [self.tokenizer.eos_token]
stopping_criteria = transformers.StoppingCriteriaList([
*[
MultiTokenEOSCriteria(sequence, self.tokenizer,
input_ids.shape[0])
for sequence in stopping_criteria
],
])
kwargs['stopping_criteria'] = stopping_criteria
# To accommodate the PeftModel, parameters should be passed in # To accommodate the PeftModel, parameters should be passed in
# key-value format for generate. # key-value format for generate.
outputs = self.model.generate(input_ids=input_ids, outputs = self.model.generate(input_ids=input_ids,
...@@ -434,6 +484,71 @@ class HuggingFace(BaseModel): ...@@ -434,6 +484,71 @@ class HuggingFace(BaseModel):
ce_loss = loss.sum(-1).cpu().detach().numpy() / lens ce_loss = loss.sum(-1).cpu().detach().numpy() / lens
return ce_loss return ce_loss
def get_loglikelihood(
self,
inputs: List[str],
conts: List[str],
mask_length: Optional[List[int]] = None) -> List[float]:
"""Get loglikelihood scores given a list of inputs.
Args:
inputs (List[str]): A list of strings.
conts (List[str]): A list of continuation strings (the slice of each
input after its context).
mask_length is NOT supported yet.
mask_length (Optional[List[int]]): A list of mask lengths. If
provided, the perplexity scores will be calculated with the
first mask_length[i] tokens masked out. It's okay to skip
its implementation if advanced features in PPLInferencer are
not needed.
Returns:
List[float]: A list of loglikelihood scores.
"""
assert mask_length is None, 'Not support mask_length yet.'
if self.batch_padding and len(inputs) > 1:
raise NotImplementedError('Batch padding is not supported yet.')
# assert self.tokenizer.pad_token
# return self._get_loglikelihood(inputs, mask_length=mask_length)
return np.array([
self._get_loglikelihood(inputs=inputs[idx], conts=conts[idx])
for idx in range(len(inputs))
])
def _get_loglikelihood(self, inputs: str, conts: str) -> float:
"""Get loglikelihood scores given input string and continuation string.
Args:
inputs (str): The full input string (context plus continuation).
conts (str): The continuation slice of the input.
Returns:
float: loglikelihood scores.
"""
input_ids = self.tokenizer(inputs,
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
input_ids = torch.tensor(input_ids, device=self.model.device)
context_ids = self.tokenizer(inputs.replace(conts, ''),
padding=False,
truncation=True,
max_length=self.max_seq_len)['input_ids']
cont_ids = input_ids[len(context_ids):]
output = self.model(input_ids.unsqueeze(0))
logits = output['logits'][:, :-1]
logits = torch.nn.functional.log_softmax(logits, dim=-1)
contlen = cont_ids.shape[0]
logits = logits[:, -contlen:, :]
# Reducing the dimension will lead to a wrong outcome
logits_gather = torch.gather(
logits, 2,
cont_ids.unsqueeze(0).unsqueeze(-1)) # [1, seq]
# Answer: sum the log-likelihood of each token in the continuation
answer = float(logits_gather.detach().cpu().sum())
return answer
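A toy sketch of the gather-and-sum at the heart of `_get_loglikelihood` above: with a fake three-token vocabulary, the continuation score is simply the sum of the log-probabilities the model assigns to each continuation token given everything before it (the numbers are made up):

```python
import torch

logits = torch.tensor([[[2.0, 0.5, 0.1],    # predicts the token at position 1
                        [0.2, 3.0, 0.3]]])  # predicts the token at position 2
log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
cont_ids = torch.tensor([0, 1])             # the two continuation tokens
gathered = torch.gather(log_probs, 2, cont_ids.unsqueeze(0).unsqueeze(-1))
print(float(gathered.sum()))                # log P(continuation | context)
```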
def get_token_len(self, prompt: str) -> int: def get_token_len(self, prompt: str) -> int:
"""Get lengths of the tokenized strings. """Get lengths of the tokenized strings.
...@@ -554,8 +669,8 @@ class HuggingFaceChatGLM3(HuggingFace): ...@@ -554,8 +669,8 @@ class HuggingFaceChatGLM3(HuggingFace):
'role': { 'role': {
'HUMAN': 'user', 'HUMAN': 'user',
'BOT': 'assistant', 'BOT': 'assistant',
'SYSTEM': 'system' 'SYSTEM': 'system',
}[item['role']] }[item['role'].upper()]
} }
history.append(msg) history.append(msg)
user_content = history[-1]['content'] user_content = history[-1]['content']
...@@ -578,6 +693,9 @@ class HuggingFaceChatGLM3(HuggingFace): ...@@ -578,6 +693,9 @@ class HuggingFaceChatGLM3(HuggingFace):
response, history = self.model.chat(self.tokenizer, response, history = self.model.chat(self.tokenizer,
user_content, user_content,
history=history) history=history)
# the response is sometimes a dict
if isinstance(response, dict):
response = response.get('content', '')
responses.append(response) responses.append(response)
except Exception: except Exception:
responses.append('') responses.append('')
......
...@@ -52,7 +52,7 @@ class LagentAgent: ...@@ -52,7 +52,7 @@ class LagentAgent:
def chat(self, def chat(self,
user_input: str, user_input: str,
history: List[dict] = None) -> Tuple[str, List[dict]]: history: List[dict] = None) -> Tuple[str, List[dict], List[dict]]:
"""Chat with agent.""" """Chat with agent."""
if history: if history:
self.agent._session_history = history self.agent._session_history = history
...@@ -60,6 +60,7 @@ class LagentAgent: ...@@ -60,6 +60,7 @@ class LagentAgent:
from lagent.schema import ActionReturn, AgentReturn from lagent.schema import ActionReturn, AgentReturn
generation: AgentReturn = self.agent.chat(user_input) generation: AgentReturn = self.agent.chat(user_input)
inner_steps = generation.inner_steps
answer = generation.response answer = generation.response
steps = [] steps = []
...@@ -76,7 +77,7 @@ class LagentAgent: ...@@ -76,7 +77,7 @@ class LagentAgent:
valid=int(step.valid), valid=int(step.valid),
)) ))
return answer, steps return answer, steps, inner_steps
FORCE_STOP_PROMPT_EN = ( FORCE_STOP_PROMPT_EN = (
......
...@@ -179,12 +179,14 @@ class Llama2Chat(BaseModel): ...@@ -179,12 +179,14 @@ class Llama2Chat(BaseModel):
dialog = [] dialog = []
for item in input: for item in input:
msg = {'content': item['prompt']} msg = {'content': item['prompt']}
if item['role'] == 'HUMAN': if item['role'].upper() == 'HUMAN':
msg['role'] = 'user' msg['role'] = 'user'
elif item['role'] == 'BOT': elif item['role'].upper() == 'BOT':
msg['role'] = 'assistant' msg['role'] = 'assistant'
elif item['role'] == 'SYSTEM': elif item['role'].upper() == 'SYSTEM':
msg['role'] = 'system' msg['role'] = 'system'
else:
raise ValueError(f'Unknown role: {item["role"]}')
dialog.append(msg) dialog.append(msg)
dialogs.append(dialog) dialogs.append(dialog)
......