Commit f314e457 authored by dengjb

first commit

parent 50406f0b
Pipeline #1018 failed with stages in 0 seconds
MODEL_NAME_OR_PATH="deepseek/deepseek-coder-1b"
DATASET_ROOT="data/"
LANGUAGE="python"
CUDA_VISIBLE_DEVICES=1,2,3 python -m accelerate.commands.launch --config_file test_config.yaml eval_pal.py --logdir ${MODEL_NAME_OR_PATH} --language ${LANGUAGE} --dataroot ${DATASET_ROOT}
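# Notes on the launch line above (descriptive only):
#   - CUDA_VISIBLE_DEVICES=1,2,3 restricts the run to three GPUs, matching
#     num_processes: 3 in test_config.yaml.
#   - accelerate.commands.launch reads test_config.yaml, starts one process per GPU,
#     and forwards --logdir/--language/--dataroot to eval_pal.py.
#   - MODEL_NAME_OR_PATH is passed as --logdir and loaded with
#     AutoModelForCausalLM.from_pretrained; point it at a real local or Hub path.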
import argparse
import json
import os
import torch
from pathlib import Path
from tqdm import tqdm
data_abs_dir = Path(__file__).parent / "data"
from utils.utils import extract_generation_code, languge_settings
from transformers import AutoTokenizer, AutoModelForCausalLM
from human_eval.evaluation import evaluate_functional_correctness
def build_deepseekcoder_instruction(languge: str, question: str):
return '''
Please continue to complete the function. You are not allowed to modify the given code and do the completion only. Please return all completed function in a codeblock. Here is the given code to do completion:
```{}
{}
```
'''.strip().format(languge.lower(), question.strip())
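# Example (illustrative): build_deepseekcoder_instruction("Python", "def add(a, b):")
# returns the instruction above with a fenced python code block containing the
# unfinished function; generate_one() then wraps that text in the chat template.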
def generate_one(example, lang, tokenizer, model):
prompt = build_deepseekcoder_instruction(languge_settings[lang]['full_name'], example['prompt'])
inputs = tokenizer.apply_chat_template(
[{'role': 'user', 'content': prompt }],
return_tensors="pt",
add_generation_prompt=True
).to(model.device)
stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
assert isinstance(stop_id, int), "Invalid tokenizer, EOT id not found"
outputs = model.generate(
inputs,
max_new_tokens=1024,
do_sample=False,
# top_p=0.95,
# temperature=temperature,
pad_token_id=stop_id,
eos_token_id=stop_id
)
output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
example['output'] = output
return extract_generation_code(example, lang_code=lang)
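# Decoding notes for generate_one (descriptive only): do_sample=False gives greedy
# decoding (the commented-out top_p/temperature are unused), and the <|EOT|> id is
# used as both eos_token_id and pad_token_id so generation stops at the model's
# end-of-turn marker; the decoded text is sliced past the prompt tokens before
# extract_generation_code pulls the code block out of the completion.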
def generate_main(args):
model_name_or_path = args.model
lang = args.language
saved_path = args.output_path
temp_dir = args.temp_dir
os.makedirs(temp_dir, exist_ok=True)
problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
print("model", model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
print("load tokenizer {} from {} over.".format(tokenizer.__class__, model_name_or_path))
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
torch_dtype=torch.bfloat16,
device_map="auto",
#use_flash_attention_2=True
)
model.eval()
examples = [json.loads(x) for x in open(problem_file) if x.strip()]
print("Read {} examples for evaluation over.".format(len(examples)))
generated_examples = []
for ex in tqdm(examples, desc='Generating'):
gen_example = generate_one(ex, args.language, tokenizer, model)
generated_examples.append(gen_example)
print("Generate all over!!!")
with open(saved_path, 'w', encoding='utf-8') as fw:
for ex in generated_examples:
fw.write(json.dumps(ex) + '\n')
print("Save {} processed examples into {} over!".format(len(generated_examples), saved_path))
result = evaluate_functional_correctness(
input_file=saved_path,
tmp_dir=temp_dir,
n_workers=8,
timeout=3.0,
problem_file=problem_file,
language=lang
)
print(lang, result, model_name_or_path)
pass
def evaluation_only(args):
lang = args.language
temp_dir = args.temp_dir
    assert os.path.exists(args.output_path), "Output file not found: {}".format(args.output_path)
os.makedirs(temp_dir, exist_ok=True)
output_name = os.path.basename(args.output_path)
output_examples = [json.loads(x) for x in open(args.output_path) if x.strip()]
processed_examples = [extract_generation_code(ex, lang) for ex in tqdm(output_examples, "Processing")]
processed_path = os.path.join(temp_dir, output_name)
with open(processed_path, 'w', encoding='utf-8') as fw:
for ex in processed_examples:
fw.write(json.dumps(ex) + '\n')
print("Save {} processed examples into {} over!".format(len(processed_examples), processed_path))
problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
from human_eval.evaluation import evaluate_functional_correctness
result = evaluate_functional_correctness(
input_file=processed_path,
tmp_dir=temp_dir,
n_workers=8,
timeout=3.0,
problem_file=problem_file,
language=lang
)
print(lang, result)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, help="model name or path")
parser.add_argument('--output_path', type=str, help="output path of your generation")
    parser.add_argument('--language', type=str, help="language")
parser.add_argument('--temp_dir', type=str, help="temp dir for evaluation", default="tmp")
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
generate_main(args)
pass
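# Invocation sketch (assumes this script is saved as eval_instruct.py; paths are
# illustrative only):
#   python eval_instruct.py --model <model_name_or_path> --language python \
#       --output_path results/humaneval-python.jsonl --temp_dir tmp
# generate_main() writes one JSON object per task to --output_path and then scores
# it with evaluate_functional_correctness; evaluation_only() can re-score an
# existing output file but is not wired to the command line here.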
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import json
import torch.distributed as dist
import subprocess
import sys
from accelerate import Accelerator
from accelerate import DistributedDataParallelKwargs
from pathlib import Path
from argparse import ArgumentParser
from humaneval import HumanEval
from transformers import AutoTokenizer, AutoModelForCausalLM
if __name__ == '__main__':
kwargs_handlers = [DistributedDataParallelKwargs(find_unused_parameters=True)]
accelerator = Accelerator(mixed_precision="bf16", kwargs_handlers=kwargs_handlers)
parser = ArgumentParser()
parser.add_argument("--logdir", type=str, default="")
parser.add_argument("--language", type=str, default="")
parser.add_argument("--dataroot", type=str, default="")
args = parser.parse_args()
logdir = args.logdir
language = args.language
if logdir == "":
logdir = "tmp/"
tokenizer = dict(
cls=AutoTokenizer,
model_path=logdir,)
dataroot = args.dataroot
    evaluator = HumanEval(data_root=dataroot, max_seq_len=4096, tokenizer_cfg=tokenizer, log_dir=logdir, n_sample=1, batch_size=1, language=language, max_gen_len=500)
model = AutoModelForCausalLM.from_pretrained(logdir, device_map=accelerator.device, trust_remote_code=True, torch_dtype=torch.bfloat16)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
evaluator.eval_model(model, accelerator)
from typing import Iterable, Dict
import gzip
import json
import os
ROOT = os.path.dirname(os.path.abspath(__file__))
HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")
def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
def stream_jsonl(filename: str) -> Iterable[Dict]:
"""
Parses each jsonl line and yields it as a dictionary
"""
if filename.endswith(".gz"):
with open(filename, "rb") as gzfp:
with gzip.open(gzfp, 'rt') as fp:
for line in fp:
if any(not x.isspace() for x in line):
yield json.loads(line)
else:
with open(filename, "r", encoding="utf-8") as fp:
for line in fp:
if any(not x.isspace() for x in line):
yield json.loads(line)
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
"""
Writes an iterable of dictionaries to jsonl
"""
if append:
mode = 'ab'
else:
mode = 'wb'
filename = os.path.expanduser(filename)
if filename.endswith(".gz"):
with open(filename, mode) as fp:
with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp:
for x in data:
gzfp.write((json.dumps(x) + "\n").encode('utf-8'))
else:
with open(filename, mode) as fp:
for x in data:
fp.write((json.dumps(x) + "\n").encode('utf-8'))
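# Usage sketch (illustrative; assumes this module is importable as human_eval.data):
#   from human_eval.data import write_jsonl, stream_jsonl
#   write_jsonl("samples.jsonl", [{"task_id": "Python/0", "generation": "pass"}])
#   records = list(stream_jsonl("samples.jsonl"))
# Both helpers transparently handle a ".gz" suffix via the gzip module.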
import fire
import sys
from .data import HUMAN_EVAL
from .evaluation import evaluate_functional_correctness
def entry_point(
sample_file: str,
k: str = "1,10,100",
n_workers: int = 4,
timeout: float = 3.0,
problem_file: str = "",
is_mbpp: bool = False,
):
"""
Evaluates the functional correctness of generated samples, and writes
results to f"{sample_file}_results.jsonl.gz"
"""
k = list(map(int, k.split(",")))
    # Pass arguments by keyword: this repo's evaluate_functional_correctness has a
    # different parameter order than upstream human-eval, so positional passing misplaces k.
    results = evaluate_functional_correctness(
        input_file=sample_file, k=k, n_workers=n_workers,
        timeout=timeout, problem_file=problem_file, is_mbpp=is_mbpp)
print(results)
def main():
fire.Fire(entry_point)
sys.exit(main())
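# CLI sketch (assumes this file lives at human_eval/evaluate_functional_correctness.py
# and the package is on PYTHONPATH; paths are illustrative):
#   python -m human_eval.evaluate_functional_correctness samples.jsonl \
#       --problem_file=data/humaneval-python.jsonl --n_workers=8 --timeout=3.0
# fire.Fire maps the command-line flags onto entry_point's keyword arguments.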
import os
import sys
import fire
import json
import gzip
import regex
import numpy as np
import itertools
from typing import *
from tqdm.auto import tqdm
from collections import defaultdict, Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from .data import stream_jsonl
from .execution import check_correctness
IMPORT_HELPER = {
"python": [
"import math",
"import re",
"import sys",
"import copy",
"import datetime",
"import itertools",
"import collections",
"import heapq",
"import functools",
"import hashlib",
"import numpy",
"import numpy as np",
"import string",
"from typing import *",
"from collections import *",
],
"go" : [
"math",
"strings",
"fmt",
"strconv",
"time",
"bytes",
"regexp",
"sort",
"math/rand",
"crypto/md5",
],
"cpp" : [
"#include<stdlib.h>",
"#include<algorithm>",
"#include<math.h>",
"#include<stdio.h>",
"#include<vector>",
"#include<string>",
"#include<climits>",
"#include<cstring>",
"#include<iostream>",
"#include<cassert>"
],
"cs": ["using System.Numerics;", "using System.Diagnostics;", "using System.Collections.Generic;", "using System.Linq;", "using System.Text;", "using System.Security.Cryptography;", "using System.Collections.Generic;"]
}
LANGUAGE_NAME = {
"cpp" : "CPP",
"go" : "Go",
"java" : "Java",
"js" : "JavaScript",
"python": "Python",
}
def read_dataset(
data_file: str = None,
dataset_type: str = "humaneval",
num_shot=None,
) -> Dict:
"""
Reads a dataset and returns a dictionary of tasks.
"""
if num_shot is not None:
print(f"{num_shot}-shot setting...")
if "humaneval" in dataset_type.lower():
if data_file is None:
current_path = os.path.dirname(os.path.abspath(__file__))
data_file = os.path.join(current_path, "..", "humaneval-x", "python", "data", "humaneval_python.jsonl.gz")
dataset = {task["task_id"]: task for task in stream_jsonl(data_file)}
else:
        raise ValueError(f"Dataset: {dataset_type} not supported.")
return dataset
def estimate_pass_at_k(
num_samples: Union[int, List[int], np.ndarray],
num_correct: Union[List[int], np.ndarray],
k: int
) -> np.ndarray:
"""
Estimates pass@k of each problem and returns them in an array.
"""
def estimator(n: int, c: int, k: int) -> float:
"""
Calculates 1 - comb(n - c, k) / comb(n, k).
"""
if n - c < k:
return 1.0
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
if isinstance(num_samples, int):
num_samples_it = itertools.repeat(num_samples, len(num_correct))
else:
assert len(num_samples) == len(num_correct)
num_samples_it = iter(num_samples)
return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
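# Worked example (illustrative): with n = 5 samples per task and c = 2 correct,
# pass@1 = 1 - C(3, 1) / C(5, 1) = 1 - 3/5 = 0.4 and pass@3 = 1 - C(3, 3) / C(5, 3)
# = 1 - 1/10 = 0.9, i.e. estimate_pass_at_k(5, [2], 1) returns array([0.4]).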
def process_humaneval_test(sample, problems, example_test=False, is_mbpp=False, language="python"):
    """
    Builds the full test program for one sample: language-specific imports/headers,
    the generated code, and the benchmark's test code.
    """
task_id = sample["task_id"]
if is_mbpp:
return sample["generation"] + "\n" + "\n".join(problems[task_id]["test"])
prompt = sample["prompt"]
if example_test and "example_test" in problems[task_id] and problems[task_id]["example_test"] != "":
test = problems[task_id]["example_test"]
else:
test = problems[task_id]["test"]
code = sample["generation"]
# Pre-process for different languages
if language == "python":
test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n"
test_string = test_setup + code + "\n" + test + "\n"
elif language == "cpp":
test_set_up = ""
for s in IMPORT_HELPER["cpp"]:
if s not in prompt:
test_set_up += s + "\n"
test_string = test_set_up + "\n" + code + "\n" + test
elif language == "java":
test_string = code + "\n" + test
elif language == "cs":
test_set_up = ""
for s in IMPORT_HELPER["cs"]:
test_set_up += s + "\n"
test_string = test_set_up + "\n" + code + "\n" + test
elif language in ["js", "javascript", "ts", "sh", "go"]:
test_string = code + "\n" + test
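    # Note: "go232" never matches a real language name, so the dedicated Go handling
    # below is effectively dead; Go samples currently take the simple branch above.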
elif language == "go232":
import_string = problems[task_id]["import"]
prompt = prompt.replace(import_string, "")
if example_test and "example_test" in problems[task_id]:
test = problems[task_id]["example_test"]
else:
test = problems[task_id]["test"]
test_setup = problems[task_id]["test_setup"]
other_pkgs = []
for pkg in IMPORT_HELPER["go"]:
if pkg not in test_setup:
p = pkg.split("/")[-1]
if p + "." in code:
other_pkgs.append(f"\"{pkg}\"")
if other_pkgs:
import_other_pkgs = "import (\n" + " ".join([p + "\n" for p in other_pkgs]) + ")"
test_string = test_setup + "\n" + import_other_pkgs + "\n" + prompt + code + "\n" + test
else:
test_string = test_setup + "\n" + prompt + code + "\n" + test
elif language == "rust":
main = "\nfn main(){ \n } \n"
declaration = problems[task_id]["declaration"]
test_string = main + declaration + prompt + code + test
elif language == "php":
if code[:5] != "<?php":
code = "<?php\n" + code
test_string = code + "\n" + test + "?>"
return test_string
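# In short, process_humaneval_test returns one self-contained program per sample:
# language-specific imports/headers (IMPORT_HELPER), the generated code, and the
# benchmark's test code, which check_correctness (from .execution) is expected to
# compile and run.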
def stream_jsonl_all(filename: str) -> Iterable[Dict]:
    """
    Reads an entire (optionally gzip-compressed) JSONL file into a list of dicts.
    """
results = []
if filename.endswith(".gz"):
fp = gzip.open(open(filename, "rb"), "rt")
else:
fp = open(filename, "r")
for line in fp:
if any(not x.isspace() for x in line):
results.append(json.loads(line))
fp.close()
return results
def evaluate_functional_correctness(
input_file: str = None,
tmp_dir: str = "./",
n_workers: int = 32,
timeout: float = 10.0,
problem_file: str = "../data/humaneval_python.jsonl.gz",
out_dir: str = None,
k: List[int] = [1, 10, 100],
test_groundtruth: bool = False,
example_test: bool = False,
is_mbpp: bool = False,
language: str = "python",
):
    """
    Evaluates the functional correctness of generated samples and computes pass@k.
    """
if example_test:
print("Example test...")
problems = read_dataset(problem_file,
dataset_type="humaneval")
sample_jsonl = stream_jsonl_all(input_file)
with ThreadPoolExecutor(max_workers=n_workers) as executor:
futures = []
completion_id = Counter()
n_samples = 0
results = defaultdict(list)
if test_groundtruth:
print("Testing ground truth...")
for sample in tqdm(problems.values()):
task_id = sample["task_id"]
lang = task_id.split("/")[0].lower()
if lang == "javascript":
lang = "js"
tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
sample["generation"] = sample["canonical_solution"]
                sample["test_code"] = process_humaneval_test(sample, problems, example_test, language=language)
if sample["test_code"] is None:
continue
args = (task_id, sample, lang, timeout, tmp_dir_, completion_id[task_id])
future = executor.submit(check_correctness, *args)
futures.append(future)
completion_id[task_id] += 1
n_samples += 1
else:
print("Reading samples...")
for sample in tqdm(sample_jsonl):
task_id = sample["task_id"]
if not is_mbpp:
lang = language
if not is_mbpp and lang == "javascript":
lang = "js"
if is_mbpp:
lang = "python"
tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
sample["task_id"] = task_id
sample["test_code"] = process_humaneval_test(sample, problems, example_test, is_mbpp, language)
if sample["test_code"] is None:
continue
if "completion_id" in sample:
completion_id_ = sample["completion_id"]
else:
completion_id_ = completion_id[task_id]
args = (task_id, sample, lang, timeout, tmp_dir_, completion_id_)
future = executor.submit(check_correctness, *args)
futures.append(future)
completion_id[task_id] += 1
n_samples += 1
if len(completion_id) == len(problems):
evaluate_pass_at_k = True
else:
evaluate_pass_at_k = False
print("Running test suites...")
for future in tqdm(as_completed(futures), total=len(futures)):
result = future.result()
results[result["task_id"]].append((result["completion_id"], result))
# Calculate pass@k.
total, correct = [], []
for result in results.values():
passed = [r[1]["passed"] for r in result]
total.append(len(passed))
correct.append(sum(passed))
total = np.array(total)
correct = np.array(correct)
if evaluate_pass_at_k:
ks = k
pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
for k in ks if (total >= k).all()}
print(pass_at_k)
    else:
        print("Total:", np.sum(total))
        print("Correct:", np.sum(correct))
        # Fall back to raw counts so the return below never hits an undefined name.
        pass_at_k = {"total": int(np.sum(total)), "correct": int(np.sum(correct))}

    return pass_at_k
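# Typical call from the generation script above: pass the generated-samples JSONL as
# input_file and the matching humaneval-<lang>.jsonl as problem_file; with a single
# sample per task, (total >= k).all() only holds for k = 1, so the returned dict
# contains just pass@1.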
import time
import string
import multiprocessing
import os
import numpy as np
import json
import re
import torch
import datetime
import subprocess
import torch.distributed as dist
from attrdict import AttrDict
from human_eval.evaluation import evaluate_functional_correctness
from transformers import AutoTokenizer
from utils.dataset import HumanEvalDataset
from utils.utils import cleanup_code
class HumanEval:
"""
HumanEval evaluation class.
"""
def __init__(self, data_root, max_seq_len=2048,
language="python", max_gen_len=200, batch_size=512,
log_dir=None, temperature=0, issft=False, top_p=0.95,
model_name="", inference_increment=True,
tokenizer_cfg=None, n_sample=40, k_sample=1):
self.data_root = data_root
self.max_seq_len = max_seq_len
self.max_gen_len = max_gen_len
self.batch_size = batch_size
self.k = k_sample
self.n_sample = n_sample
self.language = language
self.log_dir = log_dir
self.sft = issft
self.temperature = temperature
self.top_p = top_p
self.model_name = tokenizer_cfg["model_path"].replace("/", "_")
self.inference_increment = inference_increment
os.makedirs(self.log_dir, exist_ok=True)
tokenizer_cls = tokenizer_cfg.pop('cls')
try:
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_cfg.pop("model_path"), trust_remote_code=True)
except Exception as e:
print(e)
assert False
@torch.no_grad()
def eval_model(self, gpt, accelerator):
"""
Evaluate the model on HumanEval.
"""
assert self.log_dir is not None, "log_dir should not be None when evaluating humaneval"
dataset = HumanEvalDataset(self.data_root, sample_num=self.n_sample, language=self.language, issft=self.sft)
nprompt = len(dataset) // self.n_sample
dp_rank = accelerator.process_index
dp_size = accelerator.num_processes
if self.k > 1:
assert self.n_sample >= 100, "HumanEval PASS@100 needs n_sample >= 100"
gpt.eval()
# each process will process a subset of the dataset
prompt_indices_split = np.array_split(range(nprompt), dp_size)
prompt_indices = prompt_indices_split[dp_rank]
indices = [x * self.n_sample + j for x in prompt_indices for j in range(self.n_sample)]
all_num = len(indices)
processed_num = 0
log_file = os.path.join(self.log_dir,
f'{self.model_name}_rank{dp_rank}_bs{self.batch_size}_shot_log_{self.language}.json')
tmpfile = open(log_file, "w")
start_time = time.time()
# split the dataset into batches and construct a list of inputs
for idx in range(0, len(indices), self.batch_size):
prompt_list = []
prompt_lens = []
            original_prompt_list = []
tokenized_prompt_lens = []
taskid = []
# get the prompts from the dataset
for j in indices[idx:idx + self.batch_size]:
data = dataset[j]
fprompt = data["prompt"].strip()
prompt_list.append(fprompt)
tmp = self.tokenizer.encode(fprompt)
                original_prompt_list.append(data["original_prompt"])
prompt_lens.append(len(fprompt))
tokenized_prompt_lens.append(tmp)
taskid.append(data["task_id"])
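            # Note: tokenized_prompt_lens holds token-id lists (not lengths), and
            # torch.tensor() on it only works when every prompt in the batch has the
            # same length, e.g. batch_size=1 as configured in eval_pal.py.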
input_ids = torch.tensor(tokenized_prompt_lens).to(accelerator.device)
# generate the code
if self.temperature != 0:
decoded = gpt.generate(
input_ids=input_ids,
max_new_tokens=self.max_gen_len,
do_sample=True,
eos_token_id=self.tokenizer.eos_token_id,
temperature=self.temperature,
top_p=self.top_p,
pad_token_id=self.tokenizer.eos_token_id,
)
else:
decoded = gpt.generate(
input_ids=input_ids,
max_new_tokens=self.max_gen_len,
do_sample=False,
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=self.tokenizer.eos_token_id,
)
# save the results to a file
for local_idx, text in enumerate(decoded):
prediction = decoded[local_idx]
prediction = self.tokenizer.decode(prediction, skip_special_tokens=True)
suffixprediction = prediction[prompt_lens[local_idx]:]
suffixprediction = cleanup_code(suffixprediction, self.language, "humaneval", self.sft, dataset.stopwords)
# sft mode does not need original prompt
                if not self.sft:
                    suffixprediction = original_prompt_list[local_idx] + "\n" + suffixprediction
                res = {"task_id": taskid[local_idx], "generation": suffixprediction, "prompt": original_prompt_list[local_idx], "wholecode": prediction}
tmpfile.write(json.dumps(res) + "\n")
tmpfile.flush()
processed_num += 1
self.log_score(dp_rank, processed_num, all_num, start_time, self.batch_size)
tmpfile.close()
accelerator.wait_for_everyone()
# calculate the final score of pass@k
self._calculate_final_score(accelerator)
accelerator.wait_for_everyone()
return
    def log_score(self, dp_rank, processed_num, all_num, start_time, bs):
        """
        Log generation progress and throughput for this data-parallel rank.
        """
mem = torch.cuda.max_memory_allocated() / (1 << 30)
avg_time = (time.time() - start_time) / processed_num * bs
print(
f'DP RANK:{dp_rank} process_num/all_num:{int(processed_num)}/{all_num} '
f'avg_time_per_batch:{avg_time:.2f} s '
f'still_need:{((all_num - processed_num) // bs + 1) * avg_time / 60:.2f} m',
f'mem:{mem:.3f} GiB bs:{bs}',
flush=True
)
if processed_num == all_num:
print(f'EVAL DONE! Process time {(time.time() - start_time) / 60:.2f} m', flush=True)
def _calculate_final_score(self, accelerator):
"""
Calculate the final score.
"""
if accelerator.is_local_main_process:
logfilepath = os.path.join(self.log_dir, f'final_{self.model_name}.jsonl')
logfile = open(logfilepath, "w")
for i in range(accelerator.num_processes):
tmplogfile = os.path.join(self.log_dir, f'{self.model_name}_rank{i}_bs{self.batch_size}_shot_log_{self.language}.json')
logfile.write(open(tmplogfile).read().strip() + "\n")
os.remove(tmplogfile)
logfile.close()
timeout = 10
runlang = self.language
res = evaluate_functional_correctness(input_file=logfilepath, problem_file=os.path.join(self.data_root, f"humaneval-{self.language}.jsonl"), tmp_dir=self.log_dir, timeout=timeout, language=runlang)
print("score is", res['pass@%d' % self.k])
os.remove(logfilepath)
return
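# Usage sketch (mirrors eval_pal.py above): build HumanEval(data_root=..., language=...,
# tokenizer_cfg=dict(cls=AutoTokenizer, model_path=...), n_sample=1, batch_size=1,
# log_dir=...) and call eval_model(model, accelerator); each rank writes a per-rank
# JSONL of generations, and the local main process merges them and runs
# evaluate_functional_correctness to report pass@k.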
compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 3
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
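# num_processes: 3 matches the three GPUs exposed via CUDA_VISIBLE_DEVICES=1,2,3 in
# the launch script above; eval_pal.py additionally requests bf16 mixed precision
# when constructing its Accelerator.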