Commit f314e457 authored by dengjb

first commit

parent 50406f0b
Pipeline #1018 failed with stages in 0 seconds
MODEL_NAME_OR_PATH="deepseek/deepseek-coder-1b"
DATASET_ROOT="data/"
LANGUAGE="python"
CUDA_VISIBLE_DEVICES=1,2,3 python -m accelerate.commands.launch --config_file test_config.yaml eval_pal.py --logdir ${MODEL_NAME_OR_PATH} --language ${LANGUAGE} --dataroot ${DATASET_ROOT}
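# Notes on the launch line above (descriptive only):
#   - CUDA_VISIBLE_DEVICES=1,2,3 restricts the run to three GPUs, matching
#     num_processes: 3 in test_config.yaml.
#   - accelerate.commands.launch reads test_config.yaml, starts one process per GPU,
#     and forwards --logdir/--language/--dataroot to eval_pal.py.
#   - MODEL_NAME_OR_PATH is passed as --logdir and loaded with
#     AutoModelForCausalLM.from_pretrained; point it at a real local or Hub path.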
import argparse
import json
import os
import torch
from pathlib import Path
from tqdm import tqdm
data_abs_dir = Path(__file__).parent / "data"
from utils.utils import extract_generation_code, languge_settings
from transformers import AutoTokenizer, AutoModelForCausalLM
from human_eval.evaluation import evaluate_functional_correctness
def build_deepseekcoder_instruction(languge: str, question: str):
return '''
Please continue to complete the function. You are not allowed to modify the given code and do the completion only. Please return all completed function in a codeblock. Here is the given code to do completion:
```{}
{}
```
'''.strip().format(languge.lower(), question.strip())
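# Example (illustrative): build_deepseekcoder_instruction("Python", "def add(a, b):")
# returns the instruction above with a fenced python code block containing the
# unfinished function; generate_one() then wraps that text in the chat template.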
def generate_one(example, lang, tokenizer, model):
prompt = build_deepseekcoder_instruction(languge_settings[lang]['full_name'], example['prompt'])
inputs = tokenizer.apply_chat_template(
[{'role': 'user', 'content': prompt }],
return_tensors="pt",
add_generation_prompt=True
).to(model.device)
stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
assert isinstance(stop_id, int), "Invalid tokenizer, EOT id not found"
outputs = model.generate(
inputs,
max_new_tokens=1024,
do_sample=False,
# top_p=0.95,
# temperature=temperature,
pad_token_id=stop_id,
eos_token_id=stop_id
)
output = tokenizer.decode(outputs[0][len(inputs[0]):], skip_special_tokens=True)
example['output'] = output
return extract_generation_code(example, lang_code=lang)
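# Decoding notes for generate_one (descriptive only): do_sample=False gives greedy
# decoding (the commented-out top_p/temperature are unused), and the <|EOT|> id is
# used as both eos_token_id and pad_token_id so generation stops at the model's
# end-of-turn marker; the decoded text is sliced past the prompt tokens before
# extract_generation_code pulls the code block out of the completion.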
def generate_main(args):
model_name_or_path = args.model
lang = args.language
saved_path = args.output_path
temp_dir = args.temp_dir
os.makedirs(temp_dir, exist_ok=True)
problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
print("model", model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
print("load tokenizer {} from {} over.".format(tokenizer.__class__, model_name_or_path))
model = AutoModelForCausalLM.from_pretrained(
model_name_or_path,
torch_dtype=torch.bfloat16,
device_map="auto",
#use_flash_attention_2=True
)
model.eval()
examples = [json.loads(x) for x in open(problem_file) if x.strip()]
print("Read {} examples for evaluation over.".format(len(examples)))
generated_examples = []
for ex in tqdm(examples, desc='Generating'):
gen_example = generate_one(ex, args.language, tokenizer, model)
generated_examples.append(gen_example)
print("Generate all over!!!")
with open(saved_path, 'w', encoding='utf-8') as fw:
for ex in generated_examples:
fw.write(json.dumps(ex) + '\n')
print("Save {} processed examples into {} over!".format(len(generated_examples), saved_path))
result = evaluate_functional_correctness(
input_file=saved_path,
tmp_dir=temp_dir,
n_workers=8,
timeout=3.0,
problem_file=problem_file,
language=lang
)
print(lang, result, model_name_or_path)
pass
def evaluation_only(args):
lang = args.language
temp_dir = args.temp_dir
    assert os.path.exists(args.output_path), "Output file not found: {}".format(args.output_path)
os.makedirs(temp_dir, exist_ok=True)
output_name = os.path.basename(args.output_path)
output_examples = [json.loads(x) for x in open(args.output_path) if x.strip()]
processed_examples = [extract_generation_code(ex, lang) for ex in tqdm(output_examples, "Processing")]
processed_path = os.path.join(temp_dir, output_name)
with open(processed_path, 'w', encoding='utf-8') as fw:
for ex in processed_examples:
fw.write(json.dumps(ex) + '\n')
print("Save {} processed examples into {} over!".format(len(processed_examples), processed_path))
problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
from human_eval.evaluation import evaluate_functional_correctness
result = evaluate_functional_correctness(
input_file=processed_path,
tmp_dir=temp_dir,
n_workers=8,
timeout=3.0,
problem_file=problem_file,
language=lang
)
print(lang, result)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, help="model name or path")
parser.add_argument('--output_path', type=str, help="output path of your generation")
    parser.add_argument('--language', type=str, help="language")
parser.add_argument('--temp_dir', type=str, help="temp dir for evaluation", default="tmp")
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
generate_main(args)
pass
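# Invocation sketch (assumes this script is saved as eval_instruct.py; paths are
# illustrative only):
#   python eval_instruct.py --model <model_name_or_path> --language python \
#       --output_path results/humaneval-python.jsonl --temp_dir tmp
# generate_main() writes one JSON object per task to --output_path and then scores
# it with evaluate_functional_correctness; evaluation_only() can re-score an
# existing output file but is not wired to the command line here.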
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import json
import torch.distributed as dist
import subprocess
import sys
from accelerate import Accelerator
from accelerate import DistributedDataParallelKwargs
from pathlib import Path
from argparse import ArgumentParser
from humaneval import HumanEval
from transformers import AutoTokenizer, AutoModelForCausalLM
if __name__ == '__main__':
kwargs_handlers = [DistributedDataParallelKwargs(find_unused_parameters=True)]
accelerator = Accelerator(mixed_precision="bf16", kwargs_handlers=kwargs_handlers)
parser = ArgumentParser()
parser.add_argument("--logdir", type=str, default="")
parser.add_argument("--language", type=str, default="")
parser.add_argument("--dataroot", type=str, default="")
args = parser.parse_args()
logdir = args.logdir
language = args.language
if logdir == "":
logdir = "tmp/"
tokenizer = dict(
cls=AutoTokenizer,
model_path=logdir,)
dataroot = args.dataroot
    evaluator = HumanEval(data_root=dataroot, max_seq_len=4096, tokenizer_cfg=tokenizer, log_dir=logdir, n_sample=1, batch_size=1, language=language, max_gen_len=500)
model = AutoModelForCausalLM.from_pretrained(logdir, device_map=accelerator.device, trust_remote_code=True, torch_dtype=torch.bfloat16)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
evaluator.eval_model(model, accelerator)
from typing import Iterable, Dict
import gzip
import json
import os
ROOT = os.path.dirname(os.path.abspath(__file__))
HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")
def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
def stream_jsonl(filename: str) -> Iterable[Dict]:
"""
Parses each jsonl line and yields it as a dictionary
"""
if filename.endswith(".gz"):
with open(filename, "rb") as gzfp:
with gzip.open(gzfp, 'rt') as fp:
for line in fp:
if any(not x.isspace() for x in line):
yield json.loads(line)
else:
with open(filename, "r", encoding="utf-8") as fp:
for line in fp:
if any(not x.isspace() for x in line):
yield json.loads(line)
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
"""
Writes an iterable of dictionaries to jsonl
"""
if append:
mode = 'ab'
else:
mode = 'wb'
filename = os.path.expanduser(filename)
if filename.endswith(".gz"):
with open(filename, mode) as fp:
with gzip.GzipFile(fileobj=fp, mode='wb') as gzfp:
for x in data:
gzfp.write((json.dumps(x) + "\n").encode('utf-8'))
else:
with open(filename, mode) as fp:
for x in data:
fp.write((json.dumps(x) + "\n").encode('utf-8'))
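# Usage sketch (illustrative; assumes this module is importable as human_eval.data):
#   from human_eval.data import write_jsonl, stream_jsonl
#   write_jsonl("samples.jsonl", [{"task_id": "Python/0", "generation": "pass"}])
#   records = list(stream_jsonl("samples.jsonl"))
# Both helpers transparently handle a ".gz" suffix via the gzip module.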
import fire
import sys
from .data import HUMAN_EVAL
from .evaluation import evaluate_functional_correctness
def entry_point(
sample_file: str,
k: str = "1,10,100",
n_workers: int = 4,
timeout: float = 3.0,
problem_file: str = "",
is_mbpp: bool = False,
):
"""
Evaluates the functional correctness of generated samples, and writes
results to f"{sample_file}_results.jsonl.gz"
"""
k = list(map(int, k.split(",")))
    # Pass arguments by keyword: this repo's evaluate_functional_correctness has a
    # different parameter order than upstream human-eval, so positional passing misplaces k.
    results = evaluate_functional_correctness(
        input_file=sample_file, k=k, n_workers=n_workers,
        timeout=timeout, problem_file=problem_file, is_mbpp=is_mbpp)
print(results)
def main():
fire.Fire(entry_point)
sys.exit(main())
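# CLI sketch (assumes this file lives at human_eval/evaluate_functional_correctness.py
# and the package is on PYTHONPATH; paths are illustrative):
#   python -m human_eval.evaluate_functional_correctness samples.jsonl \
#       --problem_file=data/humaneval-python.jsonl --n_workers=8 --timeout=3.0
# fire.Fire maps the command-line flags onto entry_point's keyword arguments.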
import os
import sys
import fire
import json
import gzip
import regex
import numpy as np
import itertools
from typing import *
from tqdm.auto import tqdm
from collections import defaultdict, Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from .data import stream_jsonl
from .execution import check_correctness
IMPORT_HELPER = {
"python": [
"import math",
"import re",
"import sys",
"import copy",
"import datetime",
"import itertools",
"import collections",
"import heapq",
"import functools",
"import hashlib",
"import numpy",
"import numpy as np",
"import string",
"from typing import *",
"from collections import *",
],
"go" : [
"math",
"strings",
"fmt",
"strconv",
"time",
"bytes",
"regexp",
"sort",
"math/rand",
"crypto/md5",
],
"cpp" : [
"#include<stdlib.h>",
"#include<algorithm>",
"#include<math.h>",
"#include<stdio.h>",
"#include<vector>",
"#include<string>",
"#include<climits>",
"#include<cstring>",
"#include<iostream>",
"#include<cassert>"
],
"cs": ["using System.Numerics;", "using System.Diagnostics;", "using System.Collections.Generic;", "using System.Linq;", "using System.Text;", "using System.Security.Cryptography;", "using System.Collections.Generic;"]
}
LANGUAGE_NAME = {
"cpp" : "CPP",
"go" : "Go",
"java" : "Java",
"js" : "JavaScript",
"python": "Python",
}
def read_dataset(
data_file: str = None,
dataset_type: str = "humaneval",
num_shot=None,
) -> Dict:
"""
Reads a dataset and returns a dictionary of tasks.
"""
if num_shot is not None:
print(f"{num_shot}-shot setting...")
if "humaneval" in dataset_type.lower():
if data_file is None:
current_path = os.path.dirname(os.path.abspath(__file__))
data_file = os.path.join(current_path, "..", "humaneval-x", "python", "data", "humaneval_python.jsonl.gz")
dataset = {task["task_id"]: task for task in stream_jsonl(data_file)}
else:
        raise ValueError(f"Dataset: {dataset_type} not supported.")
return dataset
def estimate_pass_at_k(
num_samples: Union[int, List[int], np.ndarray],
num_correct: Union[List[int], np.ndarray],
k: int
) -> np.ndarray:
"""
Estimates pass@k of each problem and returns them in an array.
"""
def estimator(n: int, c: int, k: int) -> float:
"""
Calculates 1 - comb(n - c, k) / comb(n, k).
"""
if n - c < k:
return 1.0
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
if isinstance(num_samples, int):
num_samples_it = itertools.repeat(num_samples, len(num_correct))
else:
assert len(num_samples) == len(num_correct)
num_samples_it = iter(num_samples)
return np.array([estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)])
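# Worked example (illustrative): with n = 5 samples per task and c = 2 correct,
# pass@1 = 1 - C(3, 1) / C(5, 1) = 1 - 3/5 = 0.4 and pass@3 = 1 - C(3, 3) / C(5, 3)
# = 1 - 1/10 = 0.9, i.e. estimate_pass_at_k(5, [2], 1) returns array([0.4]).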
def process_humaneval_test(sample, problems, example_test=False, is_mbpp=False, language="python"):
    """
    Builds the full test program for one sample: language-specific imports/headers,
    the generated code, and the benchmark's test code.
    """
task_id = sample["task_id"]
if is_mbpp:
return sample["generation"] + "\n" + "\n".join(problems[task_id]["test"])
prompt = sample["prompt"]
if example_test and "example_test" in problems[task_id] and problems[task_id]["example_test"] != "":
test = problems[task_id]["example_test"]
else:
test = problems[task_id]["test"]
code = sample["generation"]
# Pre-process for different languages
if language == "python":
test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n"
test_string = test_setup + code + "\n" + test + "\n"
elif language == "cpp":
test_set_up = ""
for s in IMPORT_HELPER["cpp"]:
if s not in prompt:
test_set_up += s + "\n"
test_string = test_set_up + "\n" + code + "\n" + test
elif language == "java":
test_string = code + "\n" + test
elif language == "cs":
test_set_up = ""
for s in IMPORT_HELPER["cs"]:
test_set_up += s + "\n"
test_string = test_set_up + "\n" + code + "\n" + test
elif language in ["js", "javascript", "ts", "sh", "go"]:
test_string = code + "\n" + test
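    # Note: "go232" never matches a real language name, so the dedicated Go handling
    # below is effectively dead; Go samples currently take the simple branch above.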
elif language == "go232":
import_string = problems[task_id]["import"]
prompt = prompt.replace(import_string, "")
if example_test and "example_test" in problems[task_id]:
test = problems[task_id]["example_test"]
else:
test = problems[task_id]["test"]
test_setup = problems[task_id]["test_setup"]
other_pkgs = []
for pkg in IMPORT_HELPER["go"]:
if pkg not in test_setup:
p = pkg.split("/")[-1]
if p + "." in code:
other_pkgs.append(f"\"{pkg}\"")
if other_pkgs:
import_other_pkgs = "import (\n" + " ".join([p + "\n" for p in other_pkgs]) + ")"
test_string = test_setup + "\n" + import_other_pkgs + "\n" + prompt + code + "\n" + test
else:
test_string = test_setup + "\n" + prompt + code + "\n" + test
elif language == "rust":
main = "\nfn main(){ \n } \n"
declaration = problems[task_id]["declaration"]
test_string = main + declaration + prompt + code + test
elif language == "php":
if code[:5] != "<?php":
code = "<?php\n" + code
test_string = code + "\n" + test + "?>"
return test_string
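# In short, process_humaneval_test returns one self-contained program per sample:
# language-specific imports/headers (IMPORT_HELPER), the generated code, and the
# benchmark's test code, which check_correctness (from .execution) is expected to
# compile and run.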
def stream_jsonl_all(filename: str) -> Iterable[Dict]:
    """
    Reads an entire (optionally gzip-compressed) JSONL file into a list of dicts.
    """
results = []
if filename.endswith(".gz"):
fp = gzip.open(open(filename, "rb"), "rt")
else:
fp = open(filename, "r")
for line in fp:
if any(not x.isspace() for x in line):
results.append(json.loads(line))
fp.close()
return results
def evaluate_functional_correctness(
input_file: str = None,
tmp_dir: str = "./",
n_workers: int = 32,
timeout: float = 10.0,
problem_file: str = "../data/humaneval_python.jsonl.gz",
out_dir: str = None,
k: List[int] = [1, 10, 100],
test_groundtruth: bool = False,
example_test: bool = False,
is_mbpp: bool = False,
language: str = "python",
):
    """
    Evaluates the functional correctness of generated samples and computes pass@k.
    """
if example_test:
print("Example test...")
problems = read_dataset(problem_file,
dataset_type="humaneval")
sample_jsonl = stream_jsonl_all(input_file)
with ThreadPoolExecutor(max_workers=n_workers) as executor:
futures = []
completion_id = Counter()
n_samples = 0
results = defaultdict(list)
if test_groundtruth:
print("Testing ground truth...")
for sample in tqdm(problems.values()):
task_id = sample["task_id"]
lang = task_id.split("/")[0].lower()
if lang == "javascript":
lang = "js"
tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
sample["generation"] = sample["canonical_solution"]
                sample["test_code"] = process_humaneval_test(sample, problems, example_test, language=language)
if sample["test_code"] is None:
continue
args = (task_id, sample, lang, timeout, tmp_dir_, completion_id[task_id])
future = executor.submit(check_correctness, *args)
futures.append(future)
completion_id[task_id] += 1
n_samples += 1
else:
print("Reading samples...")
for sample in tqdm(sample_jsonl):
task_id = sample["task_id"]
if not is_mbpp:
lang = language
if not is_mbpp and lang == "javascript":
lang = "js"
if is_mbpp:
lang = "python"
tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
sample["task_id"] = task_id
sample["test_code"] = process_humaneval_test(sample, problems, example_test, is_mbpp, language)
if sample["test_code"] is None:
continue
if "completion_id" in sample:
completion_id_ = sample["completion_id"]
else:
completion_id_ = completion_id[task_id]
args = (task_id, sample, lang, timeout, tmp_dir_, completion_id_)
future = executor.submit(check_correctness, *args)
futures.append(future)
completion_id[task_id] += 1
n_samples += 1
if len(completion_id) == len(problems):
evaluate_pass_at_k = True
else:
evaluate_pass_at_k = False
print("Running test suites...")
for future in tqdm(as_completed(futures), total=len(futures)):
result = future.result()
results[result["task_id"]].append((result["completion_id"], result))
# Calculate pass@k.
total, correct = [], []
for result in results.values():
passed = [r[1]["passed"] for r in result]
total.append(len(passed))
correct.append(sum(passed))
total = np.array(total)
correct = np.array(correct)
if evaluate_pass_at_k:
ks = k
pass_at_k = {f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
for k in ks if (total >= k).all()}
print(pass_at_k)
    else:
        print("Total:", np.sum(total))
        print("Correct:", np.sum(correct))
        # Fall back to raw counts so the return below never hits an undefined name.
        pass_at_k = {"total": int(np.sum(total)), "correct": int(np.sum(correct))}

    return pass_at_k
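# Typical call from the generation script above: pass the generated-samples JSONL as
# input_file and the matching humaneval-<lang>.jsonl as problem_file; with a single
# sample per task, (total >= k).all() only holds for k = 1, so the returned dict
# contains just pass@1.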
import time
import string
import multiprocessing
import os
import numpy as np
import json
import re
import torch
import datetime
import subprocess
import torch.distributed as dist
from attrdict import AttrDict
from human_eval.evaluation import evaluate_functional_correctness
from transformers import AutoTokenizer
from utils.dataset import HumanEvalDataset
from utils.utils import cleanup_code
class HumanEval:
"""
HumanEval evaluation class.
"""
def __init__(self, data_root, max_seq_len=2048,
language="python", max_gen_len=200, batch_size=512,
log_dir=None, temperature=0, issft=False, top_p=0.95,
model_name="", inference_increment=True,
tokenizer_cfg=None, n_sample=40, k_sample=1):
self.data_root = data_root
self.max_seq_len = max_seq_len
self.max_gen_len = max_gen_len
self.batch_size = batch_size
self.k = k_sample
self.n_sample = n_sample
self.language = language
self.log_dir = log_dir
self.sft = issft
self.temperature = temperature
self.top_p = top_p
self.model_name = tokenizer_cfg["model_path"].replace("/", "_")
self.inference_increment = inference_increment
os.makedirs(self.log_dir, exist_ok=True)
tokenizer_cls = tokenizer_cfg.pop('cls')
try:
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_cfg.pop("model_path"), trust_remote_code=True)
except Exception as e:
print(e)
assert False
@torch.no_grad()
def eval_model(self, gpt, accelerator):
"""
Evaluate the model on HumanEval.
"""
assert self.log_dir is not None, "log_dir should not be None when evaluating humaneval"
dataset = HumanEvalDataset(self.data_root, sample_num=self.n_sample, language=self.language, issft=self.sft)
nprompt = len(dataset) // self.n_sample
dp_rank = accelerator.process_index
dp_size = accelerator.num_processes
if self.k > 1:
assert self.n_sample >= 100, "HumanEval PASS@100 needs n_sample >= 100"
gpt.eval()
# each process will process a subset of the dataset
prompt_indices_split = np.array_split(range(nprompt), dp_size)
prompt_indices = prompt_indices_split[dp_rank]
indices = [x * self.n_sample + j for x in prompt_indices for j in range(self.n_sample)]
all_num = len(indices)
processed_num = 0
log_file = os.path.join(self.log_dir,
f'{self.model_name}_rank{dp_rank}_bs{self.batch_size}_shot_log_{self.language}.json')
tmpfile = open(log_file, "w")
start_time = time.time()
# split the dataset into batches and construct a list of inputs
for idx in range(0, len(indices), self.batch_size):
prompt_list = []
prompt_lens = []
            original_prompt_list = []
tokenized_prompt_lens = []
taskid = []
# get the prompts from the dataset
for j in indices[idx:idx + self.batch_size]:
data = dataset[j]
fprompt = data["prompt"].strip()
prompt_list.append(fprompt)
tmp = self.tokenizer.encode(fprompt)
                original_prompt_list.append(data["original_prompt"])
prompt_lens.append(len(fprompt))
tokenized_prompt_lens.append(tmp)
taskid.append(data["task_id"])
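            # Note: tokenized_prompt_lens holds token-id lists (not lengths), and
            # torch.tensor() on it only works when every prompt in the batch has the
            # same length, e.g. batch_size=1 as configured in eval_pal.py.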
input_ids = torch.tensor(tokenized_prompt_lens).to(accelerator.device)
# generate the code
if self.temperature != 0:
decoded = gpt.generate(
input_ids=input_ids,
max_new_tokens=self.max_gen_len,
do_sample=True,
eos_token_id=self.tokenizer.eos_token_id,
temperature=self.temperature,
top_p=self.top_p,
pad_token_id=self.tokenizer.eos_token_id,
)
else:
decoded = gpt.generate(
input_ids=input_ids,
max_new_tokens=self.max_gen_len,
do_sample=False,
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=self.tokenizer.eos_token_id,
)
# save the results to a file
for local_idx, text in enumerate(decoded):
prediction = decoded[local_idx]
prediction = self.tokenizer.decode(prediction, skip_special_tokens=True)
suffixprediction = prediction[prompt_lens[local_idx]:]
suffixprediction = cleanup_code(suffixprediction, self.language, "humaneval", self.sft, dataset.stopwords)
# sft mode does not need original prompt
                if not self.sft:
                    suffixprediction = original_prompt_list[local_idx] + "\n" + suffixprediction
                res = {"task_id": taskid[local_idx], "generation": suffixprediction, "prompt": original_prompt_list[local_idx], "wholecode": prediction}
tmpfile.write(json.dumps(res) + "\n")
tmpfile.flush()
processed_num += 1
self.log_score(dp_rank, processed_num, all_num, start_time, self.batch_size)
tmpfile.close()
accelerator.wait_for_everyone()
# calculate the final score of pass@k
self._calculate_final_score(accelerator)
accelerator.wait_for_everyone()
return
    def log_score(self, dp_rank, processed_num, all_num, start_time, bs):
        """
        Log generation progress and throughput for this data-parallel rank.
        """
mem = torch.cuda.max_memory_allocated() / (1 << 30)
avg_time = (time.time() - start_time) / processed_num * bs
print(
f'DP RANK:{dp_rank} process_num/all_num:{int(processed_num)}/{all_num} '
f'avg_time_per_batch:{avg_time:.2f} s '
f'still_need:{((all_num - processed_num) // bs + 1) * avg_time / 60:.2f} m',
f'mem:{mem:.3f} GiB bs:{bs}',
flush=True
)
if processed_num == all_num:
print(f'EVAL DONE! Process time {(time.time() - start_time) / 60:.2f} m', flush=True)
def _calculate_final_score(self, accelerator):
"""
Calculate the final score.
"""
if accelerator.is_local_main_process:
logfilepath = os.path.join(self.log_dir, f'final_{self.model_name}.jsonl')
logfile = open(logfilepath, "w")
for i in range(accelerator.num_processes):
tmplogfile = os.path.join(self.log_dir, f'{self.model_name}_rank{i}_bs{self.batch_size}_shot_log_{self.language}.json')
logfile.write(open(tmplogfile).read().strip() + "\n")
os.remove(tmplogfile)
logfile.close()
timeout = 10
runlang = self.language
res = evaluate_functional_correctness(input_file=logfilepath, problem_file=os.path.join(self.data_root, f"humaneval-{self.language}.jsonl"), tmp_dir=self.log_dir, timeout=timeout, language=runlang)
print("score is", res['pass@%d' % self.k])
os.remove(logfilepath)
return
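# Usage sketch (mirrors eval_pal.py above): build HumanEval(data_root=..., language=...,
# tokenizer_cfg=dict(cls=AutoTokenizer, model_path=...), n_sample=1, batch_size=1,
# log_dir=...) and call eval_model(model, accelerator); each rank writes a per-rank
# JSONL of generations, and the local main process merges them and runs
# evaluate_functional_correctness to report pass@k.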
compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 3
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
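# num_processes: 3 matches the three GPUs exposed via CUDA_VISIBLE_DEVICES=1,2,3 in
# the launch script above; eval_pal.py additionally requests bf16 mixed precision
# when constructing its Accelerator.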