first commit

f314e457 · dengjb · 50406f0b · f314e457 · f314e457 · f314e457
Commit f314e457 authored May 24, 2024 by dengjb
20 changed files
--- a/Evaluation/MBPP/human_eval/evaluation.py
+++ b/Evaluation/MBPP/human_eval/evaluation.py
+import os
+import json
+import gzip
+import numpy as np
+import itertools
+
+from typing import *
+from tqdm.auto import tqdm
+from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from .data import stream_jsonl
+from .execution import check_correctness
+# Per-language lists of import/include/using snippets. process_humaneval_test
+# reads the "python", "cpp", "cs" and "go" entries when it assembles the text
+# of a test program. The "go" entries are bare package paths, not statements.
+IMPORT_HELPER = {
+    "python": [
+        "import math",
+        "import re",
+        "import sys",
+        "import copy",
+        "import datetime",
+        "import itertools",
+        "import collections",
+        "import heapq",
+        "import functools",
+        "import hashlib",
+        "import numpy",
+        "import numpy as np",
+        "import string",
+        "from typing import *",
+        "from collections import *",
+    ],
+    "go": [
+        "math",
+        "strings",
+        "fmt",
+        "strconv",
+        "time",
+        "bytes",
+        "regexp",
+        "sort",
+        "math/rand",
+        "crypto/md5",
+    ],
+    "cpp": [
+        "#include<stdlib.h>",
+        "#include<algorithm>",
+        "#include<math.h>",
+        "#include<stdio.h>",
+        "#include<vector>",
+        "#include<string>",
+        "#include<climits>",
+        "#include<cstring>",
+        "#include<iostream>",
+        "#include<cassert>",
+    ],
+    # NOTE: "using System.Collections.Generic;" appears twice; the list is
+    # reproduced verbatim so the generated C# preamble is unchanged.
+    "cs": [
+        "using System.Numerics;",
+        "using System.Diagnostics;",
+        "using System.Collections.Generic;",
+        "using System.Linq;",
+        "using System.Text;",
+        "using System.Security.Cryptography;",
+        "using System.Collections.Generic;",
+    ],
+}
+
+
+# Maps short language identifiers to display names.
+# NOTE(review): nothing in the visible part of this file reads this mapping.
+LANGUAGE_NAME = {
+    "cpp"   : "CPP",
+    "go"    : "Go",
+    "java"  : "Java",
+    "js"    : "JavaScript",
+    "python": "Python",
+}
+
+
+def read_dataset(
+    data_file: str = None,
+    dataset_type: str = "humaneval",
+    num_shot=None,
+) -> Dict:
+    """
+    Read a benchmark file and index its tasks by ``task_id``.
+
+    Args:
+        data_file: Path to a JSONL file readable by ``stream_jsonl``. When
+            ``None``, the HumanEval-X Python file located relative to this
+            module is used.
+        dataset_type: Dataset family. Only names containing "humaneval"
+            (case-insensitive) are supported.
+        num_shot: Only echoed to stdout when given; it does not influence
+            what is loaded.
+
+    Returns:
+        A dict mapping every task's ``task_id`` to the task record.
+
+    Raises:
+        ValueError: If ``dataset_type`` is not supported.
+    """
+    if num_shot is not None:
+        print(f"{num_shot}-shot setting...")
+
+    if "humaneval" not in dataset_type.lower():
+        # Raising a plain string is itself a TypeError in Python 3, which
+        # would hide the real message, so raise a proper exception instead.
+        raise ValueError(f"Dataset: {dataset_type} not supported.")
+
+    if data_file is None:
+        current_path = os.path.dirname(os.path.abspath(__file__))
+        data_file = os.path.join(current_path, "..", "humaneval-x", "python", "data", "humaneval_python.jsonl.gz")
+
+    return {task["task_id"]: task for task in stream_jsonl(data_file)}
+
+def estimate_pass_at_k(
+        num_samples: Union[int, List[int], np.ndarray],
+        num_correct: Union[List[int], np.ndarray],
+        k: int
+) -> np.ndarray:
+    """
+    Compute the pass@k estimate for every problem.
+
+    Args:
+        num_samples: Number of generated samples per problem, either one int
+            shared by all problems or one value per problem.
+        num_correct: Number of passing samples per problem.
+        k: The k in pass@k.
+
+    Returns:
+        An array with one estimate per entry of ``num_correct``.
+    """
+
+    def _pass_probability(total: int, passed: int, k: int) -> float:
+        """
+        Evaluate 1 - comb(total - passed, k) / comb(total, k) as a running product.
+        """
+        failed = total - passed
+        if failed < k:
+            # Fewer than k failing samples: every k-subset contains a pass.
+            return 1.0
+        return 1.0 - np.prod(1.0 - k / np.arange(failed + 1, total + 1))
+
+    if isinstance(num_samples, int):
+        sample_counts = itertools.repeat(num_samples, len(num_correct))
+    else:
+        assert len(num_samples) == len(num_correct)
+        sample_counts = iter(num_samples)
+
+    estimates = []
+    for total, passed in zip(sample_counts, num_correct):
+        estimates.append(_pass_probability(int(total), int(passed), k))
+    return np.array(estimates)
+
+def process_humaneval_test(sample, problems, example_test=False, is_mbpp=False, language="python"):
+    """
+    Assemble the source text of the program that tests one generated sample.
+
+    Args:
+        sample: Dict with at least ``task_id`` and ``generation``. An optional
+            ``prompt`` entry is used by the cpp, go232 and rust layouts.
+        problems: Mapping from task id to the problem record (``test`` and,
+            depending on the language, ``example_test``, ``import``,
+            ``test_setup`` or ``declaration``).
+        example_test: Prefer the problem's non-empty ``example_test`` over
+            ``test`` when it exists.
+        is_mbpp: When true, return the generation followed by the
+            newline-joined ``test`` list and ignore ``language``.
+        language: Selects the layout of the assembled program.
+
+    Returns:
+        The program text, or ``None`` when ``language`` is not supported
+        (both callers in this file skip samples whose test code is ``None``).
+    """
+    task_id = sample["task_id"]
+    problem = problems[task_id]
+    if is_mbpp:
+        return sample["generation"] + "\n" + "\n".join(problem["test"])
+
+    # Not every sample carries a prompt; fall back to an empty string so the
+    # layouts that reference it do not fail with a NameError.
+    prompt = sample.get("prompt", "")
+    if example_test and "example_test" in problem and problem["example_test"] != "":
+        test = problem["example_test"]
+    else:
+        test = problem["test"]
+        # Joining a plain string would put a newline between every character.
+        if not isinstance(test, str):
+            test = "\n".join(test)
+
+    code = sample["generation"]
+
+    # Stays None for languages without a layout below.
+    test_string = None
+
+    # Pre-process for different languages
+    if language == "python":
+        test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n"
+        test_string = test_setup + code + "\n" + test + "\n"
+    elif language == "cpp":
+        # Only add headers the prompt does not already include.
+        test_set_up = "".join(s + "\n" for s in IMPORT_HELPER["cpp"] if s not in prompt)
+        test_string = test_set_up + "\n" + code + "\n" + test
+    elif language == "java":
+        test_string = code + "\n" + test
+    elif language == "cs":
+        test_set_up = "".join(s + "\n" for s in IMPORT_HELPER["cs"])
+        test_string = test_set_up + "\n" + code + "\n" + test
+    elif language in ["js", "javascript", "ts", "sh", "go"]:
+        test_string = code + "\n" + test
+    elif language == "go232":
+        import_string = problem["import"]
+        prompt = prompt.replace(import_string, "")
+        if example_test and "example_test" in problem:
+            test = problem["example_test"]
+        else:
+            test = problem["test"]
+        test_setup = problem["test_setup"]
+        # Import extra packages only when the generated code appears to use them.
+        other_pkgs = []
+        for pkg in IMPORT_HELPER["go"]:
+            if pkg not in test_setup:
+                p = pkg.split("/")[-1]
+                if p + "." in code:
+                    other_pkgs.append(f"\"{pkg}\"")
+        if other_pkgs:
+            import_other_pkgs = "import (\n" + "    ".join([p + "\n" for p in other_pkgs]) + ")"
+            test_string = test_setup + "\n" + import_other_pkgs + "\n" + prompt + code + "\n" + test
+        else:
+            test_string = test_setup + "\n" + prompt + code + "\n" + test
+    elif language == "rust":
+        main = "\nfn main(){ \n } \n"
+        declaration = problem["declaration"]
+        test_string = main + declaration + prompt + code + test
+    elif language == "php":
+        if not code.startswith("<?php"):
+            code = "<?php\n" + code
+        test_string = code + "\n" + test + "?>"
+    return test_string
+
+
+def stream_jsonl_all(filename: str) -> Iterable[Dict]:
+    """
+    Read a whole JSONL file into memory.
+
+    Files whose name ends in ".gz" are read through gzip; lines that contain
+    only whitespace are skipped.
+
+    Args:
+        filename: Path to a ``.jsonl`` or ``.jsonl.gz`` file.
+
+    Returns:
+        A list with one decoded JSON value per non-blank line, in file order.
+    """
+    opener = gzip.open if filename.endswith(".gz") else open
+    # The context manager closes the file even when a line fails to parse.
+    # Opening by name also lets gzip own and close the underlying handle.
+    with opener(filename, "rt") as fp:
+        return [json.loads(line) for line in fp if line.strip()]
+
+
+def evaluate_functional_correctness(
+        input_file: str = None,
+        tmp_dir: str = "./",
+        n_workers: int = 32,
+        timeout: float = 10.0,
+        problem_file: str = "../data/humaneval_python.jsonl.gz",
+        out_dir: str = None,
+        k: List[int] = [1, 10, 100],
+        test_groundtruth: bool = False,
+        example_test: bool = False,
+        is_mbpp: bool = False,
+        language: str = "python",
+):
+    """
+    Run the test suites for a file of generated samples and report pass@k.
+
+    Args:
+        input_file: JSONL file of samples (``task_id`` and ``generation``,
+            optionally ``completion_id``). Not read when ``test_groundtruth``
+            is true.
+        tmp_dir: Base directory; ``<tmp_dir>/<lang>/evaluation`` is handed to
+            ``check_correctness``.
+        n_workers: Size of the thread pool that runs ``check_correctness``.
+        timeout: Timeout forwarded to ``check_correctness``.
+        problem_file: Problem file loaded through ``read_dataset``.
+        out_dir: Accepted for compatibility; not used by this function.
+        k: The k values for which pass@k is reported (never mutated here).
+        test_groundtruth: Evaluate each problem's ``canonical_solution``
+            instead of the samples in ``input_file``.
+        example_test: Forwarded to ``process_humaneval_test``.
+        is_mbpp: Forwarded to ``process_humaneval_test``; also forces the
+            execution language to "python" for samples from ``input_file``.
+        language: Language of the generated code.
+
+    Returns:
+        A dict such as ``{"pass@1": 0.5}`` containing every requested k for
+        which all problems have at least k results. The dict is empty when
+        the samples do not cover every problem; in that case only the total
+        and correct counts are printed.
+    """
+    # Imported explicitly: the module-level ``from typing import *`` only
+    # happens to expose a callable ``Counter`` alias.
+    from collections import Counter
+
+    if example_test:
+        print("Example test...")
+
+    problems = read_dataset(problem_file,
+                            dataset_type="humaneval")
+
+    # Stays empty unless every problem received at least one sample.
+    pass_at_k = {}
+
+    with ThreadPoolExecutor(max_workers=n_workers) as executor:
+
+        futures = []
+        completion_id = Counter()
+        results = defaultdict(list)
+
+        if test_groundtruth:
+            print("Testing ground truth...")
+            for sample in tqdm(problems.values()):
+                task_id = sample["task_id"]
+                # Task ids are expected to look like "<Language>/<n>" here.
+                lang = task_id.split("/")[0].lower()
+                if lang == "javascript":
+                    lang = "js"
+                tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
+                sample["generation"] = sample["canonical_solution"]
+                # Keyword arguments keep ``language`` from landing in the
+                # ``is_mbpp`` slot, as it did when passed positionally.
+                sample["test_code"] = process_humaneval_test(
+                    sample, problems, example_test, is_mbpp=is_mbpp, language=language)
+                if sample["test_code"] is None:
+                    continue
+                args = (task_id, sample, lang, timeout, tmp_dir_, completion_id[task_id])
+                futures.append(executor.submit(check_correctness, *args))
+                completion_id[task_id] += 1
+        else:
+            print("Reading samples...")
+            # Only read the sample file in the mode that actually uses it.
+            sample_jsonl = stream_jsonl_all(input_file)
+            lang = "python" if is_mbpp else language
+            if lang == "javascript":
+                lang = "js"
+            tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
+            for sample in tqdm(sample_jsonl):
+                task_id = sample["task_id"]
+                sample["test_code"] = process_humaneval_test(
+                    sample, problems, example_test, is_mbpp=is_mbpp, language=language)
+                if sample["test_code"] is None:
+                    continue
+                if "completion_id" in sample:
+                    completion_id_ = sample["completion_id"]
+                else:
+                    completion_id_ = completion_id[task_id]
+                args = (task_id, sample, lang, timeout, tmp_dir_, completion_id_)
+                futures.append(executor.submit(check_correctness, *args))
+                completion_id[task_id] += 1
+
+        # pass@k is only meaningful when every problem was attempted.
+        evaluate_pass_at_k = len(completion_id) == len(problems)
+
+        print("Running test suites...")
+        for future in tqdm(as_completed(futures), total=len(futures)):
+            result = future.result()
+            results[result["task_id"]].append((result["completion_id"], result))
+
+    # Calculate pass@k.
+    total, correct = [], []
+    for result in results.values():
+        passed = [r[1]["passed"] for r in result]
+        total.append(len(passed))
+        correct.append(sum(passed))
+    total = np.array(total)
+    correct = np.array(correct)
+    if evaluate_pass_at_k:
+        pass_at_k = {f"pass@{k_value}": estimate_pass_at_k(total, correct, k_value).mean()
+                     for k_value in k if (total >= k_value).all()}
+        print(pass_at_k)
+    else:
+        print("Total:", np.sum(total))
+        print("Correct:", np.sum(correct))
+    return pass_at_k
--- a/Evaluation/MBPP/human_eval/execution.py
+++ b/Evaluation/MBPP/human_eval/execution.py
--- a/Evaluation/MBPP/mbpp.py
+++ b/Evaluation/MBPP/mbpp.py
+import time
+import string
+import multiprocessing
+import os
+import numpy as np
+import json
+import re
+import torch
+import datetime
+import subprocess
+import torch.distributed as dist
+from attrdict import AttrDict
+from tqdm import tqdm
+from human_eval.evaluation import evaluate_functional_correctness
+from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList
+from utils.dataset import MBPPDataset
+from utils.utils import cleanup_code
+
+class KeywordsStoppingCriteria(StoppingCriteria):
+    """
+    Stop generation once every sequence in the batch has produced a keyword.
+
+    Checking only the first row would end generation for the whole batch as
+    soon as that single row emits a keyword, cutting the other rows short.
+    Use a fresh instance for every ``generate`` call, because the prompt
+    length is captured on the first invocation.
+
+    Args:
+        keywords_str: Iterable of strings; a row counts as finished when its
+            decoded continuation contains any of them.
+        tokenizer: Object with a ``decode`` method for token ids.
+    """
+    def __init__(self, keywords_str, tokenizer):
+        StoppingCriteria.__init__(self)
+        self.tokenizer = tokenizer
+        self.keywords_str = keywords_str
+        # Number of leading (prompt) tokens to ignore; set on the first call.
+        self.prompt_length = None
+        # Rows already known to contain a keyword. A row's prefix never
+        # changes, so it does not need to be decoded again.
+        self.finished_rows = set()
+
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        if self.prompt_length is None:
+            # Assumes the first call happens right after the first new token
+            # was appended, as in the transformers generation loop.
+            self.prompt_length = input_ids.shape[1] - 1
+        for row_index in range(input_ids.shape[0]):
+            if row_index in self.finished_rows:
+                continue
+            generated_text = self.tokenizer.decode(input_ids[row_index][self.prompt_length:])
+            if any(word in generated_text for word in self.keywords_str):
+                self.finished_rows.add(row_index)
+            else:
+                # At least one row still needs more tokens.
+                return False
+        return True
+
+
+class MBPP:
+    """
+    MBPP evaluation class.
+
+    Builds few-shot prompts from ``MBPPDataset``, generates completions with
+    a causal language model, writes them to per-rank log files and finally
+    scores them with ``evaluate_functional_correctness``.
+
+    Args:
+        data_root: Directory holding the MBPP data files.
+        max_seq_len: Maximum number of prompt tokens fed to the model.
+        language: Language tag used in log file names and for evaluation.
+        max_gen_len: Maximum number of newly generated tokens.
+        batch_size: Number of prompts per ``generate`` call.
+        log_dir: Directory for log files; required by ``eval_model``.
+        temperature: ``0`` selects greedy decoding, anything else sampling.
+        issft: Stored as ``self.sft``; not read by this class.
+        top_p: Nucleus sampling parameter forwarded to ``generate``.
+        model_name: Ignored; the name is derived from
+            ``tokenizer_cfg["model_path"]``.
+        inference_increment: Stored; not read by this class.
+        tokenizer_cfg: Dict with a ``model_path`` entry (an optional ``cls``
+            entry is ignored). The dict is not modified.
+        n_sample: Number of samples generated per problem.
+        k_sample: The k of the pass@k score that is reported.
+    """
+    def __init__(self, data_root, max_seq_len=2048,
+                language="python", max_gen_len=200, batch_size=512,
+                log_dir=None, temperature=0, issft=False, top_p=0.95,
+                model_name="", inference_increment=True,
+                tokenizer_cfg=None, n_sample=40, k_sample=1):
+        if tokenizer_cfg is None:
+            raise ValueError("tokenizer_cfg with a 'model_path' entry is required")
+        self.data_root = data_root
+        self.max_seq_len = max_seq_len
+        self.max_gen_len = max_gen_len
+        self.batch_size = batch_size
+        self.k = k_sample
+        self.n_sample = n_sample
+        self.language = language
+        self.log_dir = log_dir
+        self.sft = issft
+        self.temperature = temperature
+        self.top_p = top_p
+        model_path = tokenizer_cfg["model_path"]
+        self.model_name = model_path.replace("/", "_")
+        self.inference_increment = inference_increment
+        if self.log_dir is not None:
+            os.makedirs(self.log_dir, exist_ok=True)
+        # Let loading errors propagate with their traceback. ``assert False``
+        # would hide the cause and is removed entirely under ``python -O``.
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+    def _build_prompt(self, data, few_shot_prefixes):
+        """
+        Return the longest few-shot prompt for ``data`` that fits the budget.
+
+        The budget is ``max_seq_len - max_gen_len`` tokens. Prefixes are tried
+        from the last (longest) to the first. If none fits, the bare task
+        prompt is truncated to the budget.
+        """
+        budget = self.max_seq_len - self.max_gen_len
+        task_text = data["prompt"]
+        tests = "\n".join(data["test"])
+        prompt_curr = f"You are an expert Python programmer, and here is your task: {task_text} Your code should pass these tests:\n\n{tests}\n[BEGIN]"
+        for prefix in reversed(few_shot_prefixes):
+            candidate = prefix + prompt_curr
+            if len(self.tokenizer.encode(candidate)) < budget:
+                return candidate
+
+        token_ids = list(self.tokenizer.encode(prompt_curr))[:budget]
+        while True:
+            try:
+                # Special tokens are dropped so the prompt text lines up with
+                # the prediction, which is decoded the same way.
+                return self.tokenizer.decode(token_ids, skip_special_tokens=True)
+            except Exception:
+                # Decoding can fail when the cut lands inside a multi-token
+                # character; shorten the truncated ids and retry.
+                if not token_ids:
+                    raise
+                token_ids.pop()
+
+    @torch.no_grad()
+    def eval_model(self, gpt, accelerator):
+        """
+        Generate completions for this rank's share of problems and score them.
+
+        Args:
+            gpt: Model exposing ``eval()``, ``device`` and ``generate(...)``.
+            accelerator: Object exposing ``process_index``, ``num_processes``,
+                ``wait_for_everyone()`` and ``is_local_main_process``.
+        """
+        assert self.log_dir is not None, "log_dir should not be None when evaluating MBPP"
+        dataset = MBPPDataset(self.data_root, samplenum=self.n_sample)
+        nprompt = len(dataset) // self.n_sample
+        dp_rank = accelerator.process_index
+        dp_size = accelerator.num_processes
+        if self.k > 1:
+            assert self.n_sample >= 80, "MBPP PASS@80 needs n_sample >= 80"
+        gpt.eval()
+        # Split the problems across ranks, then expand each problem into its
+        # n_sample consecutive dataset entries.
+        prompt_indices = np.array_split(range(nprompt), dp_size)[dp_rank]
+        indices = [x * self.n_sample + j for x in prompt_indices for j in range(self.n_sample)]
+        all_num = len(indices)
+        log_file = os.path.join(self.log_dir,
+                                    f'{self.model_name}_rank{dp_rank}_bs{self.batch_size}_shot_log_{self.language}.json')
+
+        total_num = 0
+        start_time = time.time()
+        few_shot_prefixes = dataset.prompt
+
+        with open(log_file, "w") as tmpfile:
+            for idx in tqdm(range(0, len(indices), self.batch_size)):
+                prompt_list = []
+                prompt_lens = []
+                taskid = []
+                for j in indices[idx:idx + self.batch_size]:
+                    data = dataset[j]
+                    fprompt = self._build_prompt(data, few_shot_prefixes)
+                    prompt_list.append(fprompt)
+                    prompt_lens.append(len(fprompt))
+                    taskid.append(data["task_id"])
+                tokenized_prompt = self.tokenizer(prompt_list, padding=True, return_tensors="pt")
+                inputids = tokenized_prompt["input_ids"].to(gpt.device)[:, -self.max_seq_len:]
+                attention_mask = tokenized_prompt["attention_mask"].to(gpt.device)[:, -self.max_seq_len:]
+                # A fresh criteria object per batch. Both decoding modes need
+                # it, so it is built before branching.
+                stop_criteria = KeywordsStoppingCriteria(["[DONE]"], self.tokenizer)
+                generate_kwargs = dict(
+                    input_ids=inputids,
+                    attention_mask=attention_mask,
+                    max_new_tokens=self.max_gen_len,
+                    top_p=self.top_p,
+                    eos_token_id=self.tokenizer.eos_token_id,
+                    stopping_criteria=StoppingCriteriaList([stop_criteria]),
+                    pad_token_id=self.tokenizer.eos_token_id,
+                )
+                if self.temperature == 0:
+                    decoded = gpt.generate(do_sample=False, **generate_kwargs)
+                else:
+                    decoded = gpt.generate(
+                        do_sample=True,
+                        temperature=self.temperature,
+                        **generate_kwargs,
+                    )
+
+                for local_idx, sequence in enumerate(decoded):
+                    prediction = self.tokenizer.decode(sequence, skip_special_tokens=True)
+                    # NOTE(review): the prompt is removed by character count,
+                    # which assumes decoding reproduces the prompt text exactly.
+                    suffixprediction = prediction[prompt_lens[local_idx]:]
+                    suffixprediction = suffixprediction.split("[DONE]")[0].strip()
+                    res = {"task_id": taskid[local_idx], "generation": suffixprediction}
+                    tmpfile.write(json.dumps(res) + "\n")
+                    tmpfile.flush()
+                    total_num += 1
+
+                self.log_score(dp_rank, total_num, all_num, start_time, self.batch_size)
+        accelerator.wait_for_everyone()
+        self._calculate_final_score(accelerator)
+
+    def log_score(self, dp_rank, processed_num, all_num, start_time, bs):
+        """
+        Print progress: processed count, average batch time, ETA and peak memory.
+        """
+        mem = torch.cuda.max_memory_allocated() / (1 << 30)
+        avg_time = (time.time() - start_time) / processed_num * bs
+        print(
+            f'DP RANK:{dp_rank} process_num/all_num:{int(processed_num)}/{all_num} '
+            f'avg_time_per_batch:{avg_time:.2f} s '
+            f'still_need:{((all_num - processed_num) // bs + 1) * avg_time / 60:.2f} m',
+            f'mem:{mem:.3f} GiB bs:{bs}',
+            flush=True
+        )
+        if processed_num == all_num:
+            print(f'EVAL DONE! Process time {(time.time() - start_time) / 60:.2f} m', flush=True)
+
+    def _calculate_final_score(self, accelerator):
+        """
+        Merge the per-rank logs, run the functional tests and print pass@k.
+
+        Only the process for which ``accelerator.is_local_main_process`` is
+        true does any work. The per-rank files and the merged file are
+        deleted once they have been consumed.
+        """
+        if not accelerator.is_local_main_process:
+            return
+        logfilepath = os.path.join(self.log_dir, f'final_{self.model_name}.jsonl')
+        with open(logfilepath, "w") as logfile:
+            for i in range(accelerator.num_processes):
+                tmplogfile = os.path.join(self.log_dir, f'{self.model_name}_rank{i}_bs{self.batch_size}_shot_log_{self.language}.json')
+                with open(tmplogfile) as rank_log:
+                    logfile.write(rank_log.read().strip() + "\n")
+                os.remove(tmplogfile)
+        timeout = 10
+        runlang = self.language
+        # Always request the configured k alongside the evaluator's defaults;
+        # otherwise any k outside {1, 10, 100} could never be reported.
+        res = evaluate_functional_correctness(
+            input_file=logfilepath,
+            problem_file=os.path.join(self.data_root, f"mbpp_test.jsonl"),
+            tmp_dir=self.log_dir,
+            timeout=timeout,
+            language=runlang,
+            k=sorted({1, 10, 100, self.k}),
+        )
+        print("score is", res['pass@%d' % self.k])
+        os.remove(logfilepath)
+        return
+            
\ No newline at end of file
--- a/Evaluation/MBPP/test_config.yaml
+++ b/Evaluation/MBPP/test_config.yaml
+# Launcher configuration: one local machine, multi-GPU, 3 processes, no mixed
+# precision. NOTE(review): presumably consumed by the launcher that supplies
+# the `accelerator` object used in mbpp.py — confirm before relying on it.
+compute_environment: LOCAL_MACHINE
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: 'no'
+num_machines: 1
+num_processes: 3
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/Evaluation/MBPP/utils/dataset.py
+++ b/Evaluation/MBPP/utils/dataset.py
+import os
+import numpy as np
+import json
+
+class MBPPDataset:
+    """
+    In-memory view of ``mbpp.jsonl`` with few-shot prompts and test samples.
+
+    Attributes set by ``__init__``:
+        data: Raw lines of the file.
+        clean_data: One dict per record with the keys ``prompt``, ``test``,
+            ``code`` and ``task_id``.
+        prompt: Three cumulative few-shot prefixes built from records 1-3;
+            entry ``i`` contains the first ``i + 1`` solved examples.
+        testdata: Records 10-509, each repeated ``samplenum`` times in a row.
+    """
+
+    def __init__(self, root, samplenum=1):
+        """
+        root: root directory of the data files (must contain ``mbpp.jsonl``)
+        samplenum: how many consecutive copies of each test record to expose
+        """
+        self.root = root
+        # Close the file deterministically instead of leaking the handle.
+        with open(os.path.join(root, "mbpp.jsonl"), encoding="utf-8") as data_file:
+            self.data = data_file.readlines()
+
+        self.clean_data = self.get_qa_only_data(self.data)
+        self.prompt = []
+        for i in range(1, 4):
+            example = self.clean_data[i]
+            prompt = example["prompt"]
+            tests = "\n".join(example["test"])
+            # Normalise line endings and tabs in the reference solution.
+            code = example["code"].replace("\r", "").replace("\t", "    ")
+            prompt1 = f"You are an expert Python programmer, and here is your task: {prompt} Your code should pass these tests:\n\n{tests}\n[BEGIN]\n{code}\n[DONE]\n"
+            previous = self.prompt[-1] if self.prompt else ""
+            self.prompt.append(previous + prompt1)
+        self.testdata = [
+            self.clean_data[i]
+            for i in range(10, 510)
+            for _ in range(samplenum)
+        ]
+        np.random.seed(1234)
+        print(f"Read MBPP from {root}, number of samples {len(self.testdata)}")
+
+    def get_qa_only_data(self, data_json):
+        """
+        Parse JSONL lines and keep only the fields needed for evaluation.
+
+        Blank lines (for example a trailing newline at the end of the file)
+        are skipped rather than passed to the JSON decoder.
+        """
+        ans = []
+        for line in data_json:
+            if not line.strip():
+                continue
+            record = json.loads(line)
+            ans.append({
+                "prompt": record["text"],
+                "test": record["test_list"],
+                "code": record["code"],
+                "task_id": record["task_id"],
+            })
+        return ans
+
+    def __len__(self):
+        return len(self.testdata)
+
+    def __getitem__(self, index):
+        return self.testdata[index]
+
+
--- a/Evaluation/MBPP/utils/utils.py
+++ b/Evaluation/MBPP/utils/utils.py
--- a/Evaluation/PAL-Math/README.md
+++ b/Evaluation/PAL-Math/README.md
--- a/Evaluation/PAL-Math/datasets/asdiv/test.json
+++ b/Evaluation/PAL-Math/datasets/asdiv/test.json
--- a/Evaluation/PAL-Math/datasets/gsm-hard/test.json
+++ b/Evaluation/PAL-Math/datasets/gsm-hard/test.json
--- a/Evaluation/PAL-Math/datasets/gsm8k/test.json
+++ b/Evaluation/PAL-Math/datasets/gsm8k/test.json
--- a/Evaluation/PAL-Math/datasets/math/test.json
+++ b/Evaluation/PAL-Math/datasets/math/test.json
--- a/Evaluation/PAL-Math/datasets/mawps/addsub.jsonl
+++ b/Evaluation/PAL-Math/datasets/mawps/addsub.jsonl
--- a/Evaluation/PAL-Math/datasets/mawps/multiarith.jsonl
+++ b/Evaluation/PAL-Math/datasets/mawps/multiarith.jsonl
--- a/Evaluation/PAL-Math/datasets/mawps/singleeq.jsonl
+++ b/Evaluation/PAL-Math/datasets/mawps/singleeq.jsonl
--- a/Evaluation/PAL-Math/datasets/mawps/singleop.jsonl
+++ b/Evaluation/PAL-Math/datasets/mawps/singleop.jsonl
--- a/Evaluation/PAL-Math/datasets/mawps/test.json
+++ b/Evaluation/PAL-Math/datasets/mawps/test.json
--- a/Evaluation/PAL-Math/datasets/svamp/test.json
+++ b/Evaluation/PAL-Math/datasets/svamp/test.json
--- a/Evaluation/PAL-Math/datasets/tabmwp/test.json
+++ b/Evaluation/PAL-Math/datasets/tabmwp/test.json
--- a/Evaluation/PAL-Math/prompts/gsm8k.md
+++ b/Evaluation/PAL-Math/prompts/gsm8k.md
--- a/Evaluation/PAL-Math/prompts/math.md
+++ b/Evaluation/PAL-Math/prompts/math.md