"vscode:/vscode.git/clone" did not exist on "5d386d5b77aad51a976af27cb9a3a1f1a3ed5f7f"
Unverified Commit d4d1330a authored by Fengzhe Zhou, committed by GitHub

[Sync] Fix cmnli, fix vicuna meta template, fix longbench postprocess and other minor fixes (#625)

parent 5329724b
 from datasets import Dataset, load_dataset
-from opencompass.registry import LOAD_DATASET
+from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
 from ..base import BaseDataset
@@ -24,3 +24,9 @@ class LongBenchtriviaqaDataset(BaseDataset):
             })
         dataset[split] = Dataset.from_list(raw_data)
         return dataset
+
+
+@TEXT_POSTPROCESSORS.register_module()
+def triviaqa_postprocess(text: str) -> str:
+    text = text.lstrip('\n').split('\n')[0]
+    return text
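As a quick illustration of the postprocessor added above: it strips leading newlines and keeps only the first line of a completion. The snippet below is a standalone sketch (registry decorator omitted) and the sample completion string is invented for demonstration.

def triviaqa_postprocess(text: str) -> str:
    # Drop leading newlines, then keep only the first line of the answer.
    return text.lstrip('\n').split('\n')[0]


# Hypothetical raw completion from a long-context model.
raw = '\nParis\nThe question asks for the capital of France, so ...'
print(triviaqa_postprocess(raw))  # -> 'Paris'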
@@ -46,6 +46,9 @@ class HuggingFace(BaseModel):
         mode (str, optional): The method of input truncation when input length
             exceeds max_seq_len. 'mid' represents the part of input to
             truncate. Defaults to 'none'.
+        use_fastchat_template (bool, optional): Whether to use fastchat to get
+            the conversation template. If True, fastchat must be installed
+            first. Defaults to False.

     Note:
         About ``extract_pred_after_decode``: Commonly, we should extract the
@@ -68,7 +71,8 @@ class HuggingFace(BaseModel):
                  extract_pred_after_decode: bool = False,
                  batch_padding: bool = False,
                  pad_token_id: Optional[int] = None,
-                 mode: str = 'none'):
+                 mode: str = 'none',
+                 use_fastchat_template: bool = False):
         super().__init__(path=path,
                          max_seq_len=max_seq_len,
                          tokenizer_only=tokenizer_only,
@@ -91,6 +95,7 @@ class HuggingFace(BaseModel):
                              model_kwargs=model_kwargs,
                              peft_path=peft_path)
         self.generation_kwargs = generation_kwargs
+        self.use_fastchat_template = use_fastchat_template

     def _load_tokenizer(self, path: str, tokenizer_path: Optional[str],
                         tokenizer_kwargs: dict):
@@ -220,6 +225,20 @@ class HuggingFace(BaseModel):
         if self.extract_pred_after_decode:
             prompt_lens = [len(input_) for input_ in inputs]

+        if self.use_fastchat_template:
+            try:
+                from fastchat.model import get_conversation_template
+            except ModuleNotFoundError:
+                raise ModuleNotFoundError(
+                    'fastchat is not installed. You can install it '
+                    'with \'pip install "fschat[model_worker,webui]"\' '
+                    'and retry.')
+            for i in range(len(inputs)):
+                conv = get_conversation_template('vicuna')
+                conv.append_message(conv.roles[0], inputs[i])
+                conv.append_message(conv.roles[1], None)
+                inputs[i] = conv.get_prompt()
+
         # step-1: tokenize the input with batch_encode_plus
         tokens = self.tokenizer.batch_encode_plus(inputs,
                                                   padding=True,
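For context on what the fastchat branch above does to each prompt, here is a minimal standalone sketch, assuming fschat is installed via pip install "fschat[model_worker,webui]"; the user message is illustrative only.

from fastchat.model import get_conversation_template

# Wrap a plain prompt in the vicuna conversation template, as the hunk above
# does for every element of `inputs`.
conv = get_conversation_template('vicuna')
conv.append_message(conv.roles[0], 'What is the capital of France?')
conv.append_message(conv.roles[1], None)
print(conv.get_prompt())
# Roughly: '<vicuna system prompt> USER: What is the capital of France? ASSISTANT:'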
@@ -263,6 +282,19 @@ class HuggingFace(BaseModel):
         if self.extract_pred_after_decode:
             prompt_lens = [len(input_) for input_ in inputs]

+        if self.use_fastchat_template:
+            try:
+                from fastchat.model import get_conversation_template
+            except ModuleNotFoundError:
+                raise ModuleNotFoundError(
+                    'fastchat is not installed. You can install it '
+                    'with \'pip install "fschat[model_worker,webui]"\' '
+                    'and retry.')
+            conv = get_conversation_template('vicuna')
+            conv.append_message(conv.roles[0], inputs[0])
+            conv.append_message(conv.roles[1], None)
+            inputs = [conv.get_prompt()]
+
         if self.mode == 'mid':
             input_ids = self.tokenizer(inputs, truncation=False)['input_ids']
             input_ids = torch.tensor(input_ids, device=self.model.device)
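To enable the new flag end to end, a model config would pass use_fastchat_template through to the wrapper. The sketch below is an assumption about typical usage, not a config shipped with this commit; the abbreviation, model path, and other field values are placeholders.

from opencompass.models import HuggingFaceCausalLM

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='vicuna-7b-v1.5-hf',               # placeholder abbreviation
        path='lmsys/vicuna-7b-v1.5',            # placeholder model path
        tokenizer_path='lmsys/vicuna-7b-v1.5',
        max_seq_len=2048,
        max_out_len=100,
        batch_size=8,
        use_fastchat_template=True,             # wrap prompts with the vicuna template
        run_cfg=dict(num_gpus=1),
    )
]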
@@ -491,7 +523,8 @@ class HuggingFaceChatGLM3(HuggingFace):
     def generate(self,
                  inputs: List[str or PromptList],
                  max_out_len: int = 512,
-                 temperature: float = 0.6) -> str:
+                 temperature: float = 0.6,
+                 skip_overlength: bool = False) -> str:
         """Generate response from input prompt.

         Args:
@@ -518,6 +551,20 @@ class HuggingFaceChatGLM3(HuggingFace):
                     history.append(msg)
             user_content = history[-1]['content']
             history = history[:-1]
+
+            if skip_overlength:
+                # The model raises an error when the prompt exceeds its
+                # maximum context length, e.g.:
+                # "Input length of input_ids is {INPUT_IDS},
+                # but `max_length` is set to 8192.
+                # This can lead to unexpected behavior.
+                # You should consider increasing `max_new_tokens`."
+                # Skip such prompts and return an empty response instead.
+                len_user_content = len(self.tokenizer.encode(user_content))
+                if len_user_content > 8192:
+                    responses.append('')
+                    continue
+
             try:
                 response, history = self.model.chat(self.tokenizer,
                                                     user_content,
......
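The skip_overlength guard above boils down to measuring the prompt in tokens before calling model.chat. A minimal standalone sketch of the same check follows; the 8192 limit mirrors the hardcoded value in the hunk, and the tokenizer argument is assumed to expose an encode() method as HuggingFace tokenizers do.

MAX_CONTEXT_LEN = 8192  # context limit assumed by the hunk above


def should_skip(tokenizer, user_content: str, max_len: int = MAX_CONTEXT_LEN) -> bool:
    # Skip prompts whose tokenized length exceeds the model's context window,
    # so the caller can append an empty response instead of letting
    # model.chat() raise.
    return len(tokenizer.encode(user_content)) > max_len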
@@ -141,12 +141,19 @@ class Llama2Chat(BaseModel):
                     path: str,
                     max_seq_len: int,
                     max_batch_size: int,
-                    tokenizer_path: Optional[str] = None):
+                    tokenizer_path: Optional[str] = None,
+                    force_bf16: bool = False):
         from llama import Llama
         self.generator = Llama.build(path, tokenizer_path, max_seq_len,
                                      max_batch_size)
         self.tokenizer = self.generator.tokenizer
         self.model = self.generator.model
+        if force_bf16:
+            # Force the model weights to `bfloat16` to work around
+            # 'RuntimeError: probability tensor contains either `inf`,
+            # `nan` or element < 0', which was observed during
+            # llama2-7b inference.
+            self.model = self.model.bfloat16()

     def _load_tokenizer(self, tokenizer_path: str):
         from llama import Tokenizer
......
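The force_bf16 workaround is simply a dtype cast applied after the model is built; .bfloat16() is available on any torch.nn.Module. A small self-contained sketch, where the Linear layer stands in for the real Llama model:

import torch

# Casting weights to bfloat16 sidesteps the fp16 sampling failure
# ('probability tensor contains either `inf`, `nan` or element < 0')
# reported for llama2-7b inference.
model = torch.nn.Linear(4, 4)          # stand-in for the Llama model
model = model.bfloat16()
print(next(model.parameters()).dtype)  # torch.bfloat16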
@@ -108,7 +108,11 @@ class GenInferencer(BaseInferencer):
                                      'tmp_' + output_json_filename)
         if osp.exists(tmp_json_filepath):
             # TODO: move resume to output handler
-            tmp_result_dict = mmengine.load(tmp_json_filepath)
-            output_handler.results_dict = tmp_result_dict
-            index = len(tmp_result_dict)
+            try:
+                tmp_result_dict = mmengine.load(tmp_json_filepath)
+            except Exception:
+                pass
+            else:
+                output_handler.results_dict = tmp_result_dict
+                index = len(tmp_result_dict)
......
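The GenInferencer change makes resuming tolerant of a truncated or corrupted temporary results file: if loading fails, inference restarts from index 0 instead of crashing. Below is a self-contained sketch of the same try/except/else pattern, using plain json in place of mmengine.load and a hypothetical file name.

import json
from typing import Tuple


def load_partial_results(tmp_path: str = 'tmp_results.json') -> Tuple[dict, int]:
    # Try to resume from a previous run; fall back to a fresh start if the
    # temporary file is missing or unreadable.
    try:
        with open(tmp_path) as f:
            results = json.load(f)
    except Exception:
        return {}, 0
    else:
        return results, len(results)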
@@ -96,7 +96,7 @@ class SlurmSequentialRunner(BaseRunner):
         try:
             parent_conns = []
-            num_workers = min(self.max_num_workers, len(tasks))
+            num_workers = max(min(self.max_num_workers, len(tasks)), 1)
             with Pool(processes=num_workers) as pool:
                 for task in tasks:
                     parent_conn, child_conn = Pipe()
......
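The SlurmSequentialRunner fix guards against an empty task list: multiprocessing.Pool requires processes >= 1, so clamping with max(..., 1) avoids the ValueError that Pool(processes=0) would raise. A quick standalone demonstration, with an arbitrary worker cap of 32:

from multiprocessing import Pool


def main():
    tasks = []  # an empty task list previously yielded num_workers == 0
    max_num_workers = 32
    num_workers = max(min(max_num_workers, len(tasks)), 1)
    with Pool(processes=num_workers) as pool:  # Pool(processes=0) raises ValueError
        results = pool.map(str, tasks)         # no-op for an empty task list
    print(results)                             # []


if __name__ == '__main__':
    main()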