Commit 53b3977b authored by dongchy920

Initial commit
from pathlib import Path
from .safe_subprocess import run
LANG_NAME = "bash"
LANG_EXT = ".sh"
def eval_script(path: Path):
    # Capture output; output is produced whether the run succeeds, fails, or hits a syntax error.
p = run(["bash", path])
if p.timeout:
status = "Timeout"
elif p.exit_code == 0:
status = "OK"
elif "syntax error" in p.stderr:
status = "SyntaxError"
else:
status = "Exception"
return {
"status": status,
"exit_code": p.exit_code,
"stdout": p.stdout,
"stderr": p.stderr,
}
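# A minimal usage sketch (the script filename below is an illustrative
# assumption, not a file shipped with this repository). Every eval_script
# in this repo returns the same dict shape:
# {"status": ..., "exit_code": ..., "stdout": ..., "stderr": ...}
if __name__ == "__main__":
    result = eval_script(Path("HumanEval_0_has_close_elements.sh"))
    print(result["status"])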
import os
from pathlib import Path
from .safe_subprocess import run
def eval_script(path: Path):
basename = ".".join(str(path).split(".")[:-1])
r = run(["swiftc", path, "-o", basename], timeout_seconds=45)
if r.timeout:
status = "Timeout"
elif r.exit_code != 0:
        # It's a compile error, possibly a type error rather than a true
        # syntax error, but we keep the established convention and report
        # it as SyntaxError.
        status = "SyntaxError"
else:
r = run([basename], timeout_seconds=5)
if r.timeout:
status = "Timeout"
elif r.exit_code != 0:
# Well, it's a panic
status = "Exception"
else:
status = "OK"
os.remove(basename)
return {
"status": status,
"exit_code": r.exit_code,
"stdout": r.stdout,
"stderr": r.stderr,
}
from pathlib import Path
import subprocess
from .safe_subprocess import run
def eval_script(path: Path):
    """
    Requires the TypeScript compiler: npm install -g typescript
    """
    try:
        r = run(["tsc", "--target", "esnext", str(path)], timeout_seconds=15)
    except FileNotFoundError:
        # tsc is not installed: install it globally, then retry once.
        subprocess.run(["npm", "install", "-g", "typescript"])
        r = run(["tsc", "--target", "esnext", str(path)], timeout_seconds=15)
if r.exit_code != 0:
return {
"status": "SyntaxError",
"exit_code": r.exit_code,
"stdout": r.stdout,
"stderr": r.stderr,
}
r = run(["node", str(path).replace(".ts", ".js")], timeout_seconds=15)
if r.timeout:
status = "Timeout"
elif r.exit_code == 0:
status = "OK"
elif "ERR_ASSERTION" in r.stderr:
status = "AssertionError"
elif "SyntaxError" in r.stderr:
status = "SyntaxError"
elif "ReferenceError" in r.stderr:
status = "ReferenceError"
else:
status = "Exception"
return {
"status": status,
"exit_code": r.exit_code,
"stdout": r.stdout,
"stderr": r.stderr,
}
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from threading import Lock
from typing import Optional
from .containerized_eval import eval_string_script
# Project root: two directories above this file
WORKING_DIR = Path(__file__).parent.parent
# program: str => Result
CACHE = dict()
CACHE_LOCK = Lock()
def cache_get(program: str) -> Optional[dict]:
if program in CACHE:
result = CACHE[program]
return result
else:
return None
def cache_set(program: str, result: dict):
    if program in CACHE:
        print("Warning: overwriting an existing cache entry")
    CACHE[program] = result
def cached_eval_script(problem, index) -> dict:
    # The prompt is already included in each completion.
program = problem["completions"][index] + "\n" + problem["tests"]
CACHE_LOCK.acquire(True)
cached = cache_get(program)
if cached is not None:
CACHE_LOCK.release()
return cached
else:
result_yaml = dict()
cache_set(program, result_yaml)
CACHE_LOCK.release()
result_dict = eval_string_script(problem["language"], program)
for k in result_dict.keys():
result_yaml[k] = result_dict[k]
result_yaml["timestamp"] = int(time.time())
return result_yaml
def get_test_results_json_path(
    output_dir: str, problem_json_path: str, input_dir: Path
) -> Path:
    suffix = ".results.json"
    problem_name = str(problem_json_path)[: -len(".json")]
    if input_dir:
        # Mirror the problem's directory structure under output_dir.
        return (
            Path(output_dir)
            / Path(problem_json_path).relative_to(input_dir).parent
            / (Path(problem_name).name + suffix)
        )
    return Path(output_dir) / (problem_name + suffix)
def evaluate_problem(
    output_dir: str, problem_json_path: str, max_workers: int, input_dir: Optional[Path] = None
):
with open(problem_json_path, "r") as f:
problem = json.load(f)
test_results_path = get_test_results_json_path(
output_dir, problem_json_path, input_dir
)
test_results_path.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
test_results = problem.copy()
del test_results["completions"]
test_results["results"] = []
num_problems = len(problem["completions"])
min_problem = len(test_results["results"])
with ThreadPoolExecutor(max_workers=max_workers) as executor:
for j in executor.map(
lambda index: cached_eval_script(problem, index),
range(min_problem, num_problems),
):
test_results["results"].append(j)
test_results["results"][-1].update(problem)
with open(test_results_path, "w") as f:
f.write(json.dumps(test_results, indent=2))
# This is a helper script for evaluating benchmarks that have been translated to
# different languages.
#
# To use this script, call a per-language wrapper (e.g. eval_php.py) that passes
# its eval_script to main(); a sketch appears at the end of this file.
# The --directory argument is required, and tells the script where the benchmarks are located.
# The --files argument is optional, and takes a list of numbers corresponding to the files to be evaluated.
#
# The script will print the results on each benchmark, and also write to results/lang.csv.
# When the script completes, it will print a summary.
#
# Examples
#
# To run the entire benchmark suite:
# python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/
#
# To run benchmarks 1, 2, and 3:
# python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ --files 1 2 3
import argparse
import sys
from pathlib import Path
from sys import exit as sysexit
def list_files(directory, ext):
files_unsorted = directory.glob(f"HumanEval_*{ext}")
# assumption: base filenames are in the format of HumanEval_X_*
# Where X is a valid number
def key(s):
return int(str(s.name).split("_")[1])
    files_sorted = sorted(files_unsorted, key=key)
# assumption: there may be missing files, but no extra files
# so we build files_array where the index corresponds to the file's number,
# and a missing file is represented by None
size = key(files_sorted[-1]) + 1
files_array = [None] * size
for f in files_sorted:
k = key(f)
files_array[k] = f
return files_array
def main(eval_script, language, extension):
args = argparse.ArgumentParser()
args.add_argument(
"--directory", type=str, required=True, help="Directory to read benchmarks from"
)
args.add_argument(
"--files",
type=int,
nargs="*",
default=[],
help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2",
)
args = args.parse_args()
directory = Path(args.directory).resolve()
files_sorted = list_files(directory, extension)
# the directory you specified does not contain the right language
if len(files_sorted) == 0:
print(f"The specified directory does not contain files of type {extension}")
sysexit(1)
files_index = []
if len(args.files) > 0:
files_index = args.files
else:
files_index = range(len(files_sorted))
total = 0
passed = 0
syntax_error = 0
results_file = Path(
Path(__file__).parent, "..", "results", language.lower() + ".csv"
).resolve()
with open(results_file, "w") as f:
for i in files_index:
filepath = files_sorted[i]
if filepath is None:
print("File {} does not exist!".format(i))
continue
res = eval_script(filepath)
output = f"{language},{filepath.stem},{res['status']}\n"
f.write(output)
print(output, end="")
total += 1
if res["status"] == "OK":
passed += 1
elif res["status"] == "SyntaxError":
syntax_error += 1
print(f"Total {total}, Syntax Error {syntax_error}, Passed {passed}")
def main_check_stubs(check_script, language, extension):
args = argparse.ArgumentParser()
args.add_argument(
"--directory", type=str, required=True, help="Directory to read benchmarks from"
)
args.add_argument(
"--files",
type=int,
nargs="*",
default=[],
help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2",
)
args = args.parse_args()
directory = Path(args.directory).resolve()
files_sorted = list_files(directory, extension)
# the directory you specified does not contain the right language
if len(files_sorted) == 0:
print(f"The specified directory does not contain files of type {extension}")
sysexit(1)
files_index = []
if len(args.files) > 0:
files_index = args.files
else:
files_index = range(len(files_sorted))
total = 0
passed = 0
results_file = Path(
Path(__file__).parent, "..", "check_results", language.lower() + ".csv"
).resolve()
with open(results_file, "w") as f:
for i in files_index:
filepath = files_sorted[i]
if filepath is None:
print("File {} does not exist!".format(i))
continue
res = check_script(filepath)
output = f"{language},{filepath.stem},{res['status']}\n"
f.write(output)
print(output, end="")
total += 1
if res["status"] == "OK":
passed += 1
print(f"Total {total}, Passed {passed}")
if total != passed:
sys.exit(1)
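# A hypothetical sketch of such a per-language wrapper. The module name
# "eval_bash" is an assumption: it stands for a language module like the
# bash one above, which exposes eval_script, LANG_NAME, and LANG_EXT.
def _wrapper_example():
    from eval_bash import eval_script, LANG_NAME, LANG_EXT
    main(eval_script, LANG_NAME, LANG_EXT)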
sudo apt-get update
# Install the C# environment (Mono)
sudo apt-get install mono-runtime mono-complete -y
# Install PHP
sudo apt-get install php -y
# Install JavaScript (the apt package is disabled in favor of the bundled tarball below)
#sudo apt-get install nodejs -y
# Install TypeScript (Node.js from a bundled tarball, then tsc via npm)
mkdir -p /usr/lib/npm/
tar -xzf zips/node-v16.14.0-linux-x64.tar.gz -C /usr/lib/npm/
export PATH=$PATH:/usr/lib/npm/node-v16.14.0-linux-x64/bin/
npm install -g typescript
# Install Java (OpenJDK 8 from a bundled tarball)
mkdir -p /usr/lib/jvm
sudo tar -xzf zips/openlogic-openjdk-8u412-b08-linux-x64.tar.gz -C /usr/lib/jvm
export JAVA_HOME=/usr/lib/jvm/openlogic-openjdk-8u412-b08-linux-x64
export PATH=$JAVA_HOME/bin:$PATH
# C++: build Boost from source (the apt package is disabled)
#sudo apt-get install libboost-all-dev -y
cd zips/boost_1_76_0
./bootstrap.sh --prefix=/usr/local
sudo ./b2 install
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
import os
import signal
import subprocess
from typing import List
from . import generic_eval
def testing_main(x, y, z):
    # Thin wrapper around the shared generic evaluation entry point.
    generic_eval.gmain(x, y, z)
def run_without_exn(args: List[str]):
"""
Runs the given program with a five second timeout. Does not throw an exception
no matter what happens. The output is a dictionary of the format that we expect
for our evaluation scripts. The "status" field is "OK" when the exit code is
zero. If that isn't enough, you may want to tweak the status based on the
captured stderr and stdout.
"""
p = subprocess.Popen(
args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, start_new_session=True
)
try:
stdout, stderr = p.communicate(timeout=5)
exit_code = p.returncode
status = "OK" if exit_code == 0 else "Exception"
    except subprocess.TimeoutExpired:
stdout, stderr = p.stdout.read(), p.stderr.read()
os.killpg(os.getpgid(p.pid), signal.SIGTERM)
exit_code = -1
status = "Timeout"
if stdout is None:
stdout = b""
if stderr is None:
stderr = b""
return {
"status": status,
"exit_code": exit_code,
"stdout": stdout.decode("utf-8", errors="ignore"),
"stderr": stderr.decode("utf-8", errors="ignore"),
}
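# A minimal sanity-check sketch, assuming python3 is on PATH:
if __name__ == "__main__":
    out = run_without_exn(["python3", "-c", "print('hello')"])
    assert out["status"] == "OK" and out["stdout"].strip() == "hello"
    # Sleeping past the five second limit should be reported as a Timeout.
    out = run_without_exn(["python3", "-c", "import time; time.sleep(60)"])
    assert out["status"] == "Timeout"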
import fcntl
import os
import signal
import subprocess
import time
from typing import List
MAX_BYTES_PER_READ = 1024
SLEEP_BETWEEN_READS = 0.1
class Result:
timeout: int
exit_code: int
stdout: str
stderr: str
def __init__(self, timeout, exit_code, stdout, stderr):
self.timeout = timeout
self.exit_code = exit_code
self.stdout = stdout
self.stderr = stderr
def set_nonblocking(reader):
fd = reader.fileno()
fl = fcntl.fcntl(fd, fcntl.F_GETFL)
fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
def run(
args: List[str],
timeout_seconds: int = 15,
max_output_size: int = 2048,
env=None,
) -> Result:
"""
Runs the given program with arguments. After the timeout elapses, kills the process
and all other processes in the process group. Captures at most max_output_size bytes
of stdout and stderr each, and discards any output beyond that.
"""
p = subprocess.Popen(
args,
env=env,
stdin=subprocess.DEVNULL,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
start_new_session=True,
bufsize=MAX_BYTES_PER_READ,
)
set_nonblocking(p.stdout)
set_nonblocking(p.stderr)
process_group_id = os.getpgid(p.pid)
# We sleep for 0.1 seconds in each iteration.
max_iterations = timeout_seconds * 10
stdout_saved_bytes = []
stderr_saved_bytes = []
stdout_bytes_read = 0
stderr_bytes_read = 0
for _ in range(max_iterations):
this_stdout_read = p.stdout.read(MAX_BYTES_PER_READ)
this_stderr_read = p.stderr.read(MAX_BYTES_PER_READ)
        # this_stdout_read and this_stderr_read may be None if stdout or stderr
        # are closed. Without these checks, test_close_outputs fails.
if this_stdout_read is not None and stdout_bytes_read < max_output_size:
stdout_saved_bytes.append(this_stdout_read)
stdout_bytes_read += len(this_stdout_read)
if this_stderr_read is not None and stderr_bytes_read < max_output_size:
stderr_saved_bytes.append(this_stderr_read)
stderr_bytes_read += len(this_stderr_read)
exit_code = p.poll()
if exit_code is not None:
break
time.sleep(SLEEP_BETWEEN_READS)
try:
# Kills the process group. Without this line, test_fork_once fails.
os.killpg(process_group_id, signal.SIGKILL)
except ProcessLookupError:
pass
timeout = exit_code is None
exit_code = exit_code if exit_code is not None else -1
stdout = b"".join(stdout_saved_bytes).decode("utf-8", errors="ignore")
stderr = b"".join(stderr_saved_bytes).decode("utf-8", errors="ignore")
return Result(timeout=timeout, exit_code=exit_code, stdout=stdout, stderr=stderr)
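# A minimal usage sketch, assuming a Unix-like system with python3 on PATH:
if __name__ == "__main__":
    r = run(["python3", "-c", "print('hello')"])
    assert not r.timeout and r.exit_code == 0 and r.stdout.strip() == "hello"
    # A process that outlives the timeout is killed and flagged.
    r = run(["python3", "-c", "import time; time.sleep(60)"], timeout_seconds=1)
    assert r.timeout and r.exit_code == -1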
import sys
print("This is the end")
sys.stdout.close()
sys.stderr.close()
while True:
pass
import os
import time
if os.fork() == 0:
while True:
time.sleep(60)
import time
from pathlib import Path
from safe_subprocess import run
ROOT = Path(__file__).resolve().parent / "evil_programs"
def assert_no_running_evil():
result = run(["pgrep", "-f", ROOT], timeout_seconds=1, max_output_size=1024)
assert (
result.exit_code == 1
), f"There are still evil processes running: {result.stdout}"
assert len(result.stderr) == 0
assert len(result.stdout) == 0
def test_fork_once():
# The program exits cleanly and immediately. But, it forks a child that runs
# forever.
result = run(
["python3", ROOT / "fork_once.py"],
timeout_seconds=2,
max_output_size=1024,
)
assert result.exit_code == 0
assert result.timeout == False
assert len(result.stderr) == 0
assert len(result.stdout) == 0
assert_no_running_evil()
def test_close_outputs():
# The program prints to stdout, closes its output, and then runs forever.
result = run(
["python3", ROOT / "close_outputs.py"],
timeout_seconds=2,
max_output_size=1024,
)
assert result.exit_code == -1
assert result.timeout == True
assert len(result.stderr) == 0
assert result.stdout == "This is the end\n"
assert_no_running_evil()
def test_unbounded_output():
result = run(
["python3", ROOT / "unbounded_output.py"],
timeout_seconds=3,
max_output_size=1024,
)
assert result.exit_code == -1
assert result.timeout == True
assert len(result.stderr) == 0
assert len(result.stdout) == 1024
assert_no_running_evil()
def test_sleep_forever():
result = run(
["python3", ROOT / "sleep_forever.py"],
timeout_seconds=2,
max_output_size=1024,
)
assert result.exit_code == -1
assert result.timeout == True
assert len(result.stderr) == 0
assert len(result.stdout) == 0
assert_no_running_evil()
def test_fork_bomb():
result = run(
["python3", ROOT / "fork_bomb.py"],
timeout_seconds=2,
max_output_size=1024,
)
assert result.exit_code == -1
assert result.timeout == True
assert len(result.stderr) == 0
assert len(result.stdout) == 0
# Unfortunately, this sleep seems to be necessary. My theories:
# 1. os.killpg doesn't block until the whole process group is dead.
# 2. pgrep can produce stale output
time.sleep(2)
assert_no_running_evil()
def test_block_on_inputs():
# We run the subprocess with /dev/null as input. So, any program that tries
# to read input will error.
result = run(
["python3", ROOT / "block_on_inputs.py"],
timeout_seconds=2,
max_output_size=1024,
)
assert result.exit_code == 1
assert result.timeout == False
assert len(result.stdout) == 0
assert "EOF when reading a line" in result.stderr
assert_no_running_evil()
import json
import numpy as np
def estimator(n: int, c: int, k: int) -> float:
"""
Calculates 1 - comb(n - c, k) / comb(n, k).
"""
if n - c < k:
return 1.0
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
def for_file(path):
with open(path, "r") as f:
data = json.load(f)
n = len(data["results"])
c = len(
[True for r in data["results"] if r["status"] == "OK" and r["exit_code"] == 0]
)
return np.array([estimator(n, c, 1), estimator(n, c, 10), estimator(n, c, 100)])
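# A quick consistency check (not part of the module): the stable product
# form used by estimator() equals the direct binomial formula
# 1 - comb(n - c, k) / comb(n, k). The sample values are arbitrary.
if __name__ == "__main__":
    import math
    n, c, k = 20, 5, 10
    direct = 1.0 - math.comb(n - c, k) / math.comb(n, k)
    assert abs(estimator(n, c, k) - direct) < 1e-9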
import json
import torch
import random
import os
from torch.utils.data import IterableDataset, Dataset
from typing import Dict, Optional, Sequence
import transformers
import logging
import numpy as np
import utils
logging.basicConfig(level=logging.DEBUG)
class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, args):
super(SupervisedDataset, self).__init__()
logging.warning("Loading data...")
if data_path.endswith(".npy"):
self.input_ids = np.load(data_path, allow_pickle=True)
else:
self.input_ids = utils.read_jsonl_file(data_path)
original_data_num = len(self.input_ids)
logging.info("Completely Loading tokenized sentences...")
def truncate(sentence):
return torch.tensor(sentence[:args.model_max_length] + [tokenizer.eos_token_id] if len(sentence) > args.model_max_length else sentence, dtype=torch.long)
if args.truncate_source:
self.labels = [truncate(example["label"]) for example in self.input_ids]
self.input_ids = [truncate(example["input_ids"]) for example in self.input_ids]
else:
self.labels = [torch.tensor(example["label"], dtype=torch.long) for example in self.input_ids if len(example["input_ids"]) < args.model_max_length]
self.input_ids = [torch.tensor(example["input_ids"], dtype=torch.long) for example in self.input_ids if len(example["input_ids"]) < args.model_max_length]
print(f"Samples: {original_data_num} -> {len(self.input_ids)}")
def __len__(self):
return len(self.input_ids)
def __getitem__(self, i) -> Dict[str, torch.Tensor]:
return dict(input_ids=self.input_ids[i], labels=self.labels[i])
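# A hypothetical construction sketch: the tokenizer checkpoint, data path,
# and args fields below are illustrative assumptions, not repo defaults.
def _supervised_dataset_example():
    from types import SimpleNamespace
    tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
    args = SimpleNamespace(model_max_length=1024, truncate_source=True)
    # data/train.jsonl is assumed to hold pre-tokenized rows with
    # "input_ids" and "label" keys, as read by SupervisedDataset above.
    dataset = SupervisedDataset("data/train.jsonl", tokenizer, args)
    return dataset[0]["input_ids"], dataset[0]["labels"]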
class MMAPSupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, args):
super(Dataset, self).__init__()
logging.warning("Loading data...")
input_ids_path = data_path
labels_path = data_path.replace(".input_ids.mmap", ".labels.mmap")
lengths_path = data_path.replace(".input_ids.mmap", ".lengths.mmap")
input_ids_shape_path = input_ids_path + ".shape.json"
labels_shape_path = labels_path + ".shape.json"
lengths_shape_path = lengths_path + ".shape.json"
with open(input_ids_shape_path, 'r') as f:
input_ids_shape_info = json.load(f)
with open(labels_shape_path, 'r') as f:
labels_shape_info = json.load(f)
with open(lengths_shape_path, 'r') as f:
lengths_shape_info = json.load(f)
self.input_ids = np.memmap(
input_ids_path,
dtype=np.int32,
mode='r',
shape=(input_ids_shape_info['n_samples'], input_ids_shape_info['max_len'])
)
self.labels = np.memmap(
labels_path,
dtype=np.int32,
mode='r',
shape=(labels_shape_info['n_samples'], labels_shape_info['max_len'])
)
self.lengths = np.memmap(
lengths_path,
dtype=np.int32,
mode='r',
shape=(lengths_shape_info['n_samples'], lengths_shape_info['max_len'])
)
logging.info(f"Loaded {len(self.input_ids)} samples using mmap")
def __len__(self):
return len(self.input_ids)
    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        # Trim padding from both fields using the stored true length.
        length = int(self.lengths[i])
        input_ids = self.input_ids[i][:length]
        return dict(input_ids=input_ids, labels=self.labels[i][:length])
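# A sketch of producing files in the layout MMAPSupervisedDataset expects:
# <stem>.input_ids.mmap / .labels.mmap / .lengths.mmap, each with a sibling
# "<name>.shape.json" holding {"n_samples": ..., "max_len": ...}. The helper
# name and the toy shapes are assumptions for illustration.
def _write_mmap(path: str, array: np.ndarray):
    mm = np.memmap(path, dtype=np.int32, mode="w+", shape=array.shape)
    mm[:] = array
    mm.flush()
    with open(path + ".shape.json", "w") as f:
        json.dump({"n_samples": array.shape[0], "max_len": array.shape[1]}, f)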
class BufferedJsonlDataset(IterableDataset):
def __init__(
self,
data_path: str,
        buffer_size: int = 1000,  # size of the shuffle buffer
seed: Optional[int] = None,
shuffle: bool = True
):
super().__init__()
self.data_path = data_path
self.buffer_size = buffer_size
self.shuffle = shuffle
self.seed = seed
self.file_size = os.path.getsize(data_path)
logging.info(f"Reading from {data_path}: {len(self.file_size)}")
    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        if self.seed is not None:
            random_seed = self.seed
            if worker_info is not None:
                random_seed += worker_info.id
            np.random.seed(random_seed)
        start_pos = np.random.randint(0, self.file_size) if self.shuffle else 0
        buffer = []
        with open(self.data_path, 'r', encoding='utf-8') as f:
            # Seek once and discard the (probably partial) line at the seek
            # position; later reads continue from here, so refills do not
            # re-read the same chunk forever.
            f.seek(start_pos)
            if not f.readline():  # at end of file: wrap to the start
                f.seek(0)
            while True:
                if not buffer:
                    for _ in range(self.buffer_size):
                        line = f.readline()
                        if not line:  # end of file: wrap around
                            f.seek(0)
                            line = f.readline()
                        try:
                            data = json.loads(line.strip())
                            if "input_ids" in data:
                                buffer.append(data["input_ids"])
                        except json.JSONDecodeError:
                            logging.info("Invalid json line")
                            continue
                    if self.shuffle:
                        random.shuffle(buffer)
                if buffer:
                    yield buffer.pop()
                else:
                    break
    def __len__(self):
        # Nominal length only: the file size in bytes, not the sample count.
        return self.file_size
import json
import mmap
import os
import torch
import random
from torch.utils.data import IterableDataset
class JSONLDataset(IterableDataset):
def __init__(self, data_path, buffer_size=1000):
"""
Args:
data_path: jsonl文件路径
buffer_size: 缓存大小
"""
super().__init__()
self.data_path = data_path
self.buffer_size = buffer_size
self.file_size = os.path.getsize(data_path)
def get_random_start_pos(self, mm):
"""获取随机起始位置"""
# 随机选择一个文件位置
pos = random.randint(0, self.file_size - 1)
# 调整到最近的行首
while pos > 0 and mm[pos-1] != ord('\n'):
pos -= 1
return pos
def read_lines(self, mm, start_pos):
"""从指定位置读取数据"""
buffer = []
current_pos = start_pos
while len(buffer) < self.buffer_size and current_pos < self.file_size:
line_start = current_pos
            # Scan to the end of the current line
while current_pos < self.file_size and mm[current_pos] != ord('\n'):
current_pos += 1
if current_pos < self.file_size:
line = mm[line_start:current_pos].decode('utf-8')
try:
data = json.loads(line)
if "input_ids" in data:
buffer.append(data["input_ids"])
except json.JSONDecodeError:
                    pass  # skip invalid JSON lines
            current_pos += 1  # skip the newline character
return buffer, current_pos
def __iter__(self):
worker_info = torch.utils.data.get_worker_info()
with open(self.data_path, 'rb') as f:
mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
if worker_info is not None:
start_pos = self.get_random_start_pos(mm)
else:
start_pos = 0
current_pos = start_pos
while True:
buffer, next_pos = self.read_lines(mm, current_pos)
if not buffer and next_pos >= self.file_size:
current_pos = 0
continue
elif not buffer:
current_pos = next_pos
continue
random.shuffle(buffer)
for item in buffer:
yield torch.tensor(item)
current_pos = next_pos
    def __len__(self):
        # Rough estimate, assuming an average of ~100 bytes per line.
        return int(self.file_size / 100)
if __name__ == "__main__":
from torch.utils.data import DataLoader
dataset = BufferedJsonlDataset(
data_path="path/to/your/large.jsonl",
buffer_size=1000,
seed=42,
shuffle=True
)
dataloader = DataLoader(
dataset,
batch_size=32,
num_workers=4,
pin_memory=True
)
for batch in dataloader:
pass