Unverified commit ddb81972 authored by Hubert, committed by GitHub

[Feat] support wizardcoder series (#344)

* [Feat] support wizardcoder series

* minor fix
parent 2c71b0f6
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import HumanevalXDataset, HumanevalXEvaluator
humanevalx_reader_cfg = dict(
input_columns=['prompt'], output_column='task_id', train_split='test')
# This prompt is used for the WizardCoder series
# You can use the 620cfa config for basic generation
humanevalx_infer_cfg = {
lang: dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Create a {lang} script for this problem:
{{prompt}}
### Response:"""),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024))
for lang in ['python', 'cpp', 'go', 'java', 'js']
}
humanevalx_eval_cfg_dict = {
lang: dict(
evaluator=dict(
type=HumanevalXEvaluator,
language=lang,
            ip_address="localhost",  # replace with your code_eval_server IP address and port
            port=5000,
        ),  # refer to https://github.com/Ezra-Yu/code-evaluator to launch a server
pred_role='BOT')
    for lang in ['python', 'cpp', 'go', 'java', 'js']  # rust is not supported yet
}
humanevalx_datasets = [
dict(
type=HumanevalXDataset,
abbr=f'humanevalx-{lang}',
language=lang,
path='./backup_data/humanevalx',
reader_cfg=humanevalx_reader_cfg,
infer_cfg=humanevalx_infer_cfg[lang],
eval_cfg=humanevalx_eval_cfg_dict[lang])
for lang in ['python', 'cpp', 'go', 'java', 'js']
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPDataset, MBPPEvaluator2
mbpp_reader_cfg = dict(
input_columns=['text', 'test_list'], output_column='test_list_2')
# This prompt is used for the WizardCoder series
# You can use another config file for basic 3-shot generation
mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
"""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Create a Python script for this problem:
{text}
Test examples:
{test_list}
### Response:"""),
])),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator2), pred_role="BOT")
mbpp_datasets = [
dict(
type=MBPPDataset,
abbr='mbpp',
path='./data/mbpp/mbpp.jsonl',
reader_cfg=mbpp_reader_cfg,
infer_cfg=mbpp_infer_cfg,
eval_cfg=mbpp_eval_cfg)
]
from opencompass.models import HuggingFaceCausalLM
models = [
# WizardCoder 15B
dict(
type=HuggingFaceCausalLM,
abbr='WizardCoder-15B-V1.0',
path="WizardLM/WizardCoder-15B-V1.0",
tokenizer_path='WizardLM/WizardCoder-15B-V1.0',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=1024,
max_seq_len=2048,
batch_size=8,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
run_cfg=dict(num_gpus=2, num_procs=1),
),
]
from opencompass.models import HuggingFaceCausalLM
models = [
# WizardCoder 1B
dict(
type=HuggingFaceCausalLM,
abbr='WizardCoder-1B-V1.0',
path="WizardLM/WizardCoder-1B-V1.0",
tokenizer_path='WizardLM/WizardCoder-1B-V1.0',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=1024,
max_seq_len=2048,
batch_size=8,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
run_cfg=dict(num_gpus=1, num_procs=1),
),
]
from opencompass.models import HuggingFaceCausalLM
models = [
    # WizardCoder 3B
    dict(
        type=HuggingFaceCausalLM,
        abbr='WizardCoder-3B-V1.0',
        path="WizardLM/WizardCoder-3B-V1.0",
        tokenizer_path='WizardLM/WizardCoder-3B-V1.0',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        max_out_len=1024,
        max_seq_len=2048,
        batch_size=8,
        model_kwargs=dict(trust_remote_code=True, device_map='auto'),
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
]
from opencompass.models import HuggingFaceCausalLM
models = [
# WizardCoder Python 13B
dict(
type=HuggingFaceCausalLM,
abbr='WizardCoder-Python-13B-V1.0',
path="WizardLM/WizardCoder-Python-13B-V1.0",
tokenizer_path='WizardLM/WizardCoder-Python-13B-V1.0',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=1024,
max_seq_len=2048,
batch_size=8,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
run_cfg=dict(num_gpus=2, num_procs=1),
),
]
from opencompass.models import HuggingFaceCausalLM
models = [
# WizardCoder Python 34B
dict(
type=HuggingFaceCausalLM,
abbr='WizardCoder-Python-34B-V1.0',
path="WizardLM/WizardCoder-Python-34B-V1.0",
tokenizer_path='WizardLM/WizardCoder-Python-34B-V1.0',
tokenizer_kwargs=dict(
padding_side='left',
truncation_side='left',
trust_remote_code=True,
),
max_out_len=1024,
max_seq_len=2048,
batch_size=8,
model_kwargs=dict(trust_remote_code=True, device_map='auto'),
run_cfg=dict(num_gpus=4, num_procs=1),
),
]
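The dataset and model configs above get combined in a top-level run config. Below is a minimal sketch of such a file, assuming the standard OpenCompass read_base mechanism; the relative import paths are placeholders and are not taken from this commit, so adjust them to wherever these files actually land under configs/. OpenCompass's run.py can then be pointed at this file.

# hypothetical eval_wizardcoder.py -- the import paths below are assumptions
from mmengine.config import read_base

with read_base():
    from .datasets.humanevalx.humanevalx_wizardcoder_gen import humanevalx_datasets
    from .datasets.mbpp.mbpp_wizardcoder_gen import mbpp_datasets
    from .models.wizardcoder.hf_wizardcoder_15b import models

datasets = [*humanevalx_datasets, *mbpp_datasets]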
@@ -45,9 +45,12 @@ class MBPPEvaluator(BaseEvaluator):
        for test_case, pred in zip(references, predictions):
            programs = self._process_test(test_case, pred)
            try:
                # Pass an explicit globals dict so that exec does not raise an
                # unnecessary NameError for a correct answer
                exec_globals = {}
                with self.swallow_io():
                    with self.time_limit(2):
                        exec(programs, exec_globals)
                result['pass'] += 1
            except TimeOutException:
                result['timeout'] += 1
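The new comment above is the heart of this hunk. The following standalone sketch (not part of the patch, with hypothetical helper names) shows why an explicit globals dict matters for otherwise correct answers.

# A correct MBPP answer often defines a helper and calls it from the solution.
# Inside a method, exec(code) gives those functions the module's globals, so
# the helper lookup fails with NameError; exec(code, {}) puts every definition
# into one shared dict and the same program passes.
program = """
def helper():
    return 41

def solution():
    return helper() + 1

assert solution() == 42
"""

def run_without_globals():
    exec(program)  # 'helper' ends up invisible to solution() -> NameError

def run_with_globals():
    exec(program, {})  # shared globals/locals dict -> assertion passes

try:
    run_without_globals()
except NameError as err:
    print('default namespaces:', err)

run_with_globals()
print('explicit globals: ok')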
@@ -118,3 +121,41 @@ class MBPPEvaluator(BaseEvaluator):
    class redirect_stdin(contextlib._RedirectStream):  # type: ignore
        _stream = 'stdin'
@ICL_EVALUATORS.register_module()
class MBPPEvaluator2(MBPPEvaluator):
    """Better suited for WizardCoder evaluation."""

    def _process_answer(self, text):
        if '```' in text:
            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
            if len(blocks) == 0:
                text = text.split('```')[1]  # fall back to default strategy
            else:
                text = blocks[0]  # fetch the first code block
                if not text.startswith('\n'):  # in case starting with ```python
                    text = text[max(text.find('\n') + 1, 0):]
        else:
            match = re.search(r'Here(.*?)\n', text)
            if match:
                text = re.sub('Here(.*?)\n', '', text, count=1)

        # remove tests appended to the generation
        test_list = ['# Test', '#Test', '#test', '# test']
        for s in test_list:
            if s in text:
                text = text[:text.find(s)]

        text = text.strip()
        match = re.search(r"('\s*|)(\[DONE\]|DONE)", text)
        if match:
            text = text[:match.start()]
        match = re.search(r"(\[BEGIN\]|BEGIN)('\s*|)", text)
        if match:
            text = text[match.end():]
        text = text.strip()
        if text.startswith("'"):
            text = text[1:]
        return text
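For reference, a quick sanity check of the extraction above on a made-up WizardCoder-style reply; the evaluator is constructed with no arguments, just as the mbpp config's evaluator=dict(type=MBPPEvaluator2) builds it.

# Hypothetical completion; only the fenced code, minus the language tag and
# the trailing tests, should survive postprocessing.
from opencompass.datasets import MBPPEvaluator2

sample = ("Here is a Python solution:\n"
          "```python\n"
          "def add(a, b):\n"
          "    return a + b\n"
          "\n"
          "# Test\n"
          "assert add(1, 2) == 3\n"
          "```\n")

print(MBPPEvaluator2()._process_answer(sample))
# def add(a, b):
#     return a + b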
@@ -70,6 +70,24 @@ def gpt_python_postprocess(ori_prompt: str, text: str) -> str:
    return text
def wizardcoder_postprocess(text: str) -> str:
    """Postprocess for WizardCoder models."""
    if '```' in text:
        blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
        if len(blocks) == 0:
            text = text.split('```')[1]  # fall back to default strategy
        else:
            text = blocks[0]  # fetch the first code block
            if not text.startswith('\n'):  # in case starting with ```python
                text = text[max(text.find('\n') + 1, 0):]
    else:
        match = re.search(r'Here(.*?)\n', text)
        if match:
            text = re.sub('Here(.*?)\n', '', text, count=1)

    return text
def collect_preds(filename: str):
    # in case the prediction is partial
    root, ext = osp.splitext(filename)
@@ -147,7 +165,18 @@ def main():
                break

        # special postprocess for GPT
        if model_abbr in [
                'WizardCoder-1B-V1.0',
                'WizardCoder-3B-V1.0',
                'WizardCoder-15B-V1.0',
                'WizardCoder-Python-13B-V1.0',
                'WizardCoder-Python-34B-V1.0',
        ]:
            predictions = [{
                'task_id': f'{task}/{i}',
                'generation': wizardcoder_postprocess(pred),
            } for i, pred in enumerate(pred_strs)]
        elif 'CodeLlama' not in model_abbr and lang == 'python':
            predictions = [{
                'task_id':
                f'{task}/{i}',
...