Commit 53b3977b authored by dongchy920

Initial commit
from pathlib import Path
from .safe_subprocess import run
LANG_NAME = "bash"
LANG_EXT = ".sh"
def eval_script(path: Path):
    # Capture output; output is produced whether the run succeeds, fails, or hits a syntax error.
p = run(["bash", path])
if p.timeout:
status = "Timeout"
elif p.exit_code == 0:
status = "OK"
elif "syntax error" in p.stderr:
status = "SyntaxError"
else:
status = "Exception"
return {
"status": status,
"exit_code": p.exit_code,
"stdout": p.stdout,
"stderr": p.stderr,
}
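# A minimal usage sketch (the script filename below is an illustrative
# assumption, not a file shipped with this repository). Every eval_script
# in this repo returns the same dict shape:
# {"status": ..., "exit_code": ..., "stdout": ..., "stderr": ...}
if __name__ == "__main__":
    result = eval_script(Path("HumanEval_0_has_close_elements.sh"))
    print(result["status"])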
import os
from pathlib import Path
from .safe_subprocess import run
def eval_script(path: Path):
basename = ".".join(str(path).split(".")[:-1])
r = run(["swiftc", path, "-o", basename], timeout_seconds=45)
if r.timeout:
status = "Timeout"
elif r.exit_code != 0:
        # It's a compile error, possibly a type error rather than a true
        # syntax error, but we keep the established convention and report
        # it as SyntaxError.
        status = "SyntaxError"
else:
r = run([basename], timeout_seconds=5)
if r.timeout:
status = "Timeout"
elif r.exit_code != 0:
# Well, it's a panic
status = "Exception"
else:
status = "OK"
os.remove(basename)
return {
"status": status,
"exit_code": r.exit_code,
"stdout": r.stdout,
"stderr": r.stderr,
}
from pathlib import Path
import subprocess
from .safe_subprocess import run
def eval_script(path: Path):
    """
    Requires the TypeScript compiler: npm install -g typescript
    """
    try:
        r = run(["tsc", "--target", "esnext", str(path)], timeout_seconds=15)
    except FileNotFoundError:
        # tsc is not installed: install it globally, then retry once.
        subprocess.run(["npm", "install", "-g", "typescript"])
        r = run(["tsc", "--target", "esnext", str(path)], timeout_seconds=15)
if r.exit_code != 0:
return {
"status": "SyntaxError",
"exit_code": r.exit_code,
"stdout": r.stdout,
"stderr": r.stderr,
}
r = run(["node", str(path).replace(".ts", ".js")], timeout_seconds=15)
if r.timeout:
status = "Timeout"
elif r.exit_code == 0:
status = "OK"
elif "ERR_ASSERTION" in r.stderr:
status = "AssertionError"
elif "SyntaxError" in r.stderr:
status = "SyntaxError"
elif "ReferenceError" in r.stderr:
status = "ReferenceError"
else:
status = "Exception"
return {
"status": status,
"exit_code": r.exit_code,
"stdout": r.stdout,
"stderr": r.stderr,
}
import json
import os
import time
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from threading import Lock
from typing import Optional
from .containerized_eval import eval_string_script
# Project root: two directories above this file
WORKING_DIR = Path(__file__).parent.parent
# program: str => Result
CACHE = dict()
CACHE_LOCK = Lock()
def cache_get(program: str) -> Optional[dict]:
if program in CACHE:
result = CACHE[program]
return result
else:
return None
def cache_set(program: str, result: dict):
    if program in CACHE:
        print("Warning: overwriting an existing cache entry")
    CACHE[program] = result
def cached_eval_script(problem, index) -> dict:
    # The prompt is already included in each completion.
program = problem["completions"][index] + "\n" + problem["tests"]
CACHE_LOCK.acquire(True)
cached = cache_get(program)
if cached is not None:
CACHE_LOCK.release()
return cached
else:
result_yaml = dict()
cache_set(program, result_yaml)
CACHE_LOCK.release()
result_dict = eval_string_script(problem["language"], program)
for k in result_dict.keys():
result_yaml[k] = result_dict[k]
result_yaml["timestamp"] = int(time.time())
return result_yaml
def get_test_results_json_path(
    output_dir: str, problem_json_path: str, input_dir: Path
) -> Path:
    suffix = ".results.json"
    problem_name = str(problem_json_path)[: -len(".json")]
    if input_dir:
        # Mirror the problem's directory structure under output_dir.
        return (
            Path(output_dir)
            / Path(problem_json_path).relative_to(input_dir).parent
            / (Path(problem_name).name + suffix)
        )
    return Path(output_dir) / (problem_name + suffix)
def evaluate_problem(
    output_dir: str, problem_json_path: str, max_workers: int, input_dir: Optional[Path] = None
):
with open(problem_json_path, "r") as f:
problem = json.load(f)
test_results_path = get_test_results_json_path(
output_dir, problem_json_path, input_dir
)
test_results_path.parent.mkdir(mode=0o755, parents=True, exist_ok=True)
test_results = problem.copy()
del test_results["completions"]
test_results["results"] = []
num_problems = len(problem["completions"])
min_problem = len(test_results["results"])
with ThreadPoolExecutor(max_workers=max_workers) as executor:
for j in executor.map(
lambda index: cached_eval_script(problem, index),
range(min_problem, num_problems),
):
test_results["results"].append(j)
test_results["results"][-1].update(problem)
with open(test_results_path, "w") as f:
f.write(json.dumps(test_results, indent=2))
# This is a helper script for evaluating benchmarks that have been translated to
# different languages.
#
# To use this script, call a per-language wrapper (e.g. eval_php.py) that passes
# its eval_script to main(); a sketch appears at the end of this file.
# The --directory argument is required, and tells the script where the benchmarks are located.
# The --files argument is optional, and takes a list of numbers corresponding to the files to be evaluated.
#
# The script will print the results on each benchmark, and also write to results/lang.csv.
# When the script completes, it will print a summary.
#
# Examples
#
# To run the entire benchmark suite:
# python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/
#
# To run benchmarks 1, 2, and 3:
# python3 src/eval_php.py --directory datasets/php-keep-code_davinci_001_temp_0.2-0/ --files 1 2 3
import argparse
import sys
from pathlib import Path
from sys import exit as sysexit
def list_files(directory, ext):
files_unsorted = directory.glob(f"HumanEval_*{ext}")
# assumption: base filenames are in the format of HumanEval_X_*
# Where X is a valid number
def key(s):
return int(str(s.name).split("_")[1])
    files_sorted = sorted(files_unsorted, key=key)
# assumption: there may be missing files, but no extra files
# so we build files_array where the index corresponds to the file's number,
# and a missing file is represented by None
size = key(files_sorted[-1]) + 1
files_array = [None] * size
for f in files_sorted:
k = key(f)
files_array[k] = f
return files_array
def main(eval_script, language, extension):
args = argparse.ArgumentParser()
args.add_argument(
"--directory", type=str, required=True, help="Directory to read benchmarks from"
)
args.add_argument(
"--files",
type=int,
nargs="*",
default=[],
help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2",
)
args = args.parse_args()
directory = Path(args.directory).resolve()
files_sorted = list_files(directory, extension)
# the directory you specified does not contain the right language
if len(files_sorted) == 0:
print(f"The specified directory does not contain files of type {extension}")
sysexit(1)
files_index = []
if len(args.files) > 0:
files_index = args.files
else:
files_index = range(len(files_sorted))
total = 0
passed = 0
syntax_error = 0
results_file = Path(
Path(__file__).parent, "..", "results", language.lower() + ".csv"
).resolve()
with open(results_file, "w") as f:
for i in files_index:
filepath = files_sorted[i]
if filepath is None:
print("File {} does not exist!".format(i))
continue
res = eval_script(filepath)
output = f"{language},{filepath.stem},{res['status']}\n"
f.write(output)
print(output, end="")
total += 1
if res["status"] == "OK":
passed += 1
elif res["status"] == "SyntaxError":
syntax_error += 1
print(f"Total {total}, Syntax Error {syntax_error}, Passed {passed}")
def main_check_stubs(check_script, language, extension):
args = argparse.ArgumentParser()
args.add_argument(
"--directory", type=str, required=True, help="Directory to read benchmarks from"
)
args.add_argument(
"--files",
type=int,
nargs="*",
default=[],
help="Specify the benchmarks to evaluate by their number, e.g. --files 0 1 2",
)
args = args.parse_args()
directory = Path(args.directory).resolve()
files_sorted = list_files(directory, extension)
# the directory you specified does not contain the right language
if len(files_sorted) == 0:
print(f"The specified directory does not contain files of type {extension}")
sysexit(1)
files_index = []
if len(args.files) > 0:
files_index = args.files
else:
files_index = range(len(files_sorted))
total = 0
passed = 0
results_file = Path(
Path(__file__).parent, "..", "check_results", language.lower() + ".csv"
).resolve()
with open(results_file, "w") as f:
for i in files_index:
filepath = files_sorted[i]
if filepath is None:
print("File {} does not exist!".format(i))
continue
res = check_script(filepath)
output = f"{language},{filepath.stem},{res['status']}\n"
f.write(output)
print(output, end="")
total += 1
if res["status"] == "OK":
passed += 1
print(f"Total {total}, Passed {passed}")
if total != passed:
sys.exit(1)
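# A hypothetical sketch of such a per-language wrapper. The module name
# "eval_bash" is an assumption: it stands for a language module like the
# bash one above, which exposes eval_script, LANG_NAME, and LANG_EXT.
def _wrapper_example():
    from eval_bash import eval_script, LANG_NAME, LANG_EXT
    main(eval_script, LANG_NAME, LANG_EXT)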
sudo apt-get update
# Install the C# environment (Mono)
sudo apt-get install mono-runtime mono-complete -y
# Install PHP
sudo apt-get install php -y
# Install JavaScript (the apt package is disabled in favor of the bundled tarball below)
#sudo apt-get install nodejs -y
# Install TypeScript (Node.js from a bundled tarball, then tsc via npm)
mkdir -p /usr/lib/npm/
tar -xzf zips/node-v16.14.0-linux-x64.tar.gz -C /usr/lib/npm/
export PATH=$PATH:/usr/lib/npm/node-v16.14.0-linux-x64/bin/
npm install -g typescript
# Install Java (OpenJDK 8 from a bundled tarball)
mkdir -p /usr/lib/jvm
sudo tar -xzf zips/openlogic-openjdk-8u412-b08-linux-x64.tar.gz -C /usr/lib/jvm
export JAVA_HOME=/usr/lib/jvm/openlogic-openjdk-8u412-b08-linux-x64
export PATH=$JAVA_HOME/bin:$PATH
# C++: build Boost from source (the apt package is disabled)
#sudo apt-get install libboost-all-dev -y
cd zips/boost_1_76_0
./bootstrap.sh --prefix=/usr/local
sudo ./b2 install
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
import os
import signal
import subprocess
from typing import List
from . import generic_eval
def testing_main(x, y, z):
    # Thin wrapper around the shared generic evaluation entry point.
    generic_eval.gmain(x, y, z)
def run_without_exn(args: List[str]):
"""
Runs the given program with a five second timeout. Does not throw an exception
no matter what happens. The output is a dictionary of the format that we expect
for our evaluation scripts. The "status" field is "OK" when the exit code is
zero. If that isn't enough, you may want to tweak the status based on the
captured stderr and stdout.
"""
p = subprocess.Popen(
args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, start_new_session=True
)
try:
stdout, stderr = p.communicate(timeout=5)
exit_code = p.returncode
status = "OK" if exit_code == 0 else "Exception"
    except subprocess.TimeoutExpired:
stdout, stderr = p.stdout.read(), p.stderr.read()
os.killpg(os.getpgid(p.pid), signal.SIGTERM)
exit_code = -1
status = "Timeout"
if stdout is None:
stdout = b""
if stderr is None:
stderr = b""
return {
"status": status,
"exit_code": exit_code,
"stdout": stdout.decode("utf-8", errors="ignore"),
"stderr": stderr.decode("utf-8", errors="ignore"),
}
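# A minimal sanity-check sketch, assuming python3 is on PATH:
if __name__ == "__main__":
    out = run_without_exn(["python3", "-c", "print('hello')"])
    assert out["status"] == "OK" and out["stdout"].strip() == "hello"
    # Sleeping past the five second limit should be reported as a Timeout.
    out = run_without_exn(["python3", "-c", "import time; time.sleep(60)"])
    assert out["status"] == "Timeout"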
import fcntl
import os
import signal
import subprocess
import time
from typing import List
MAX_BYTES_PER_READ = 1024
SLEEP_BETWEEN_READS = 0.1
class Result:
timeout: int
exit_code: int
stdout: str
stderr: str
def __init__(self, timeout, exit_code, stdout, stderr):
self.timeout = timeout
self.exit_code = exit_code
self.stdout = stdout
self.stderr = stderr
def set_nonblocking(reader):
fd = reader.fileno()
fl = fcntl.fcntl(fd, fcntl.F_GETFL)
fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
def run(
args: List[str],
timeout_seconds: int = 15,
max_output_size: int = 2048,
env=None,
) -> Result:
"""
Runs the given program with arguments. After the timeout elapses, kills the process
and all other processes in the process group. Captures at most max_output_size bytes
of stdout and stderr each, and discards any output beyond that.
"""
p = subprocess.Popen(
args,
env=env,
stdin=subprocess.DEVNULL,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
start_new_session=True,
bufsize=MAX_BYTES_PER_READ,
)
set_nonblocking(p.stdout)
set_nonblocking(p.stderr)
process_group_id = os.getpgid(p.pid)
# We sleep for 0.1 seconds in each iteration.
max_iterations = timeout_seconds * 10
stdout_saved_bytes = []
stderr_saved_bytes = []
stdout_bytes_read = 0
stderr_bytes_read = 0
for _ in range(max_iterations):
this_stdout_read = p.stdout.read(MAX_BYTES_PER_READ)
this_stderr_read = p.stderr.read(MAX_BYTES_PER_READ)
        # this_stdout_read and this_stderr_read may be None if stdout or stderr
        # are closed. Without these checks, test_close_outputs fails.
if this_stdout_read is not None and stdout_bytes_read < max_output_size:
stdout_saved_bytes.append(this_stdout_read)
stdout_bytes_read += len(this_stdout_read)
if this_stderr_read is not None and stderr_bytes_read < max_output_size:
stderr_saved_bytes.append(this_stderr_read)
stderr_bytes_read += len(this_stderr_read)
exit_code = p.poll()
if exit_code is not None:
break
time.sleep(SLEEP_BETWEEN_READS)
try:
# Kills the process group. Without this line, test_fork_once fails.
os.killpg(process_group_id, signal.SIGKILL)
except ProcessLookupError:
pass
timeout = exit_code is None
exit_code = exit_code if exit_code is not None else -1
stdout = b"".join(stdout_saved_bytes).decode("utf-8", errors="ignore")
stderr = b"".join(stderr_saved_bytes).decode("utf-8", errors="ignore")
return Result(timeout=timeout, exit_code=exit_code, stdout=stdout, stderr=stderr)
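# A minimal usage sketch, assuming a Unix-like system with python3 on PATH:
if __name__ == "__main__":
    r = run(["python3", "-c", "print('hello')"])
    assert not r.timeout and r.exit_code == 0 and r.stdout.strip() == "hello"
    # A process that outlives the timeout is killed and flagged.
    r = run(["python3", "-c", "import time; time.sleep(60)"], timeout_seconds=1)
    assert r.timeout and r.exit_code == -1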
import sys
print("This is the end")
sys.stdout.close()
sys.stderr.close()
while True:
pass
import os
import time
if os.fork() == 0:
while True:
time.sleep(60)
import time
from pathlib import Path
from safe_subprocess import run
ROOT = Path(__file__).resolve().parent / "evil_programs"
def assert_no_running_evil():
result = run(["pgrep", "-f", ROOT], timeout_seconds=1, max_output_size=1024)
assert (
result.exit_code == 1
), f"There are still evil processes running: {result.stdout}"
assert len(result.stderr) == 0
assert len(result.stdout) == 0
def test_fork_once():
# The program exits cleanly and immediately. But, it forks a child that runs
# forever.
result = run(
["python3", ROOT / "fork_once.py"],
timeout_seconds=2,
max_output_size=1024,
)
assert result.exit_code == 0
assert result.timeout == False
assert len(result.stderr) == 0
assert len(result.stdout) == 0
assert_no_running_evil()
def test_close_outputs():
# The program prints to stdout, closes its output, and then runs forever.
result = run(
["python3", ROOT / "close_outputs.py"],
timeout_seconds=2,
max_output_size=1024,
)
assert result.exit_code == -1
assert result.timeout == True
assert len(result.stderr) == 0
assert result.stdout == "This is the end\n"
assert_no_running_evil()
def test_unbounded_output():
result = run(
["python3", ROOT / "unbounded_output.py"],
timeout_seconds=3,
max_output_size=1024,
)
assert result.exit_code == -1
assert result.timeout == True
assert len(result.stderr) == 0
assert len(result.stdout) == 1024
assert_no_running_evil()
def test_sleep_forever():
result = run(
["python3", ROOT / "sleep_forever.py"],
timeout_seconds=2,
max_output_size=1024,
)
assert result.exit_code == -1
assert result.timeout == True
assert len(result.stderr) == 0
assert len(result.stdout) == 0
assert_no_running_evil()
def test_fork_bomb():
result = run(
["python3", ROOT / "fork_bomb.py"],
timeout_seconds=2,
max_output_size=1024,
)
assert result.exit_code == -1
assert result.timeout == True
assert len(result.stderr) == 0
assert len(result.stdout) == 0
# Unfortunately, this sleep seems to be necessary. My theories:
# 1. os.killpg doesn't block until the whole process group is dead.
# 2. pgrep can produce stale output
time.sleep(2)
assert_no_running_evil()
def test_block_on_inputs():
# We run the subprocess with /dev/null as input. So, any program that tries
# to read input will error.
result = run(
["python3", ROOT / "block_on_inputs.py"],
timeout_seconds=2,
max_output_size=1024,
)
assert result.exit_code == 1
assert result.timeout == False
assert len(result.stdout) == 0
assert "EOF when reading a line" in result.stderr
assert_no_running_evil()
import json
import numpy as np
def estimator(n: int, c: int, k: int) -> float:
"""
Calculates 1 - comb(n - c, k) / comb(n, k).
"""
if n - c < k:
return 1.0
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
def for_file(path):
with open(path, "r") as f:
data = json.load(f)
n = len(data["results"])
c = len(
[True for r in data["results"] if r["status"] == "OK" and r["exit_code"] == 0]
)
return np.array([estimator(n, c, 1), estimator(n, c, 10), estimator(n, c, 100)])
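# A quick consistency check (not part of the module): the stable product
# form used by estimator() equals the direct binomial formula
# 1 - comb(n - c, k) / comb(n, k). The sample values are arbitrary.
if __name__ == "__main__":
    import math
    n, c, k = 20, 5, 10
    direct = 1.0 - math.comb(n - c, k) / math.comb(n, k)
    assert abs(estimator(n, c, k) - direct) < 1e-9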
import json
import torch
import random
import os
from torch.utils.data import IterableDataset, Dataset
from typing import Dict, Optional, Sequence
import transformers
import logging
import numpy as np
import utils
logging.basicConfig(level=logging.DEBUG)
class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, args):
super(SupervisedDataset, self).__init__()
logging.warning("Loading data...")
if data_path.endswith(".npy"):
self.input_ids = np.load(data_path, allow_pickle=True)
else:
self.input_ids = utils.read_jsonl_file(data_path)
original_data_num = len(self.input_ids)
logging.info("Completely Loading tokenized sentences...")
def truncate(sentence):
return torch.tensor(sentence[:args.model_max_length] + [tokenizer.eos_token_id] if len(sentence) > args.model_max_length else sentence, dtype=torch.long)
if args.truncate_source:
self.labels = [truncate(example["label"]) for example in self.input_ids]
self.input_ids = [truncate(example["input_ids"]) for example in self.input_ids]
else:
self.labels = [torch.tensor(example["label"], dtype=torch.long) for example in self.input_ids if len(example["input_ids"]) < args.model_max_length]
self.input_ids = [torch.tensor(example["input_ids"], dtype=torch.long) for example in self.input_ids if len(example["input_ids"]) < args.model_max_length]
print(f"Samples: {original_data_num} -> {len(self.input_ids)}")
def __len__(self):
return len(self.input_ids)
def __getitem__(self, i) -> Dict[str, torch.Tensor]:
return dict(input_ids=self.input_ids[i], labels=self.labels[i])
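# A hypothetical construction sketch: the tokenizer checkpoint, data path,
# and args fields below are illustrative assumptions, not repo defaults.
def _supervised_dataset_example():
    from types import SimpleNamespace
    tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
    args = SimpleNamespace(model_max_length=1024, truncate_source=True)
    # data/train.jsonl is assumed to hold pre-tokenized rows with
    # "input_ids" and "label" keys, as read by SupervisedDataset above.
    dataset = SupervisedDataset("data/train.jsonl", tokenizer, args)
    return dataset[0]["input_ids"], dataset[0]["labels"]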
class MMAPSupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, args):
super(Dataset, self).__init__()
logging.warning("Loading data...")
input_ids_path = data_path
labels_path = data_path.replace(".input_ids.mmap", ".labels.mmap")
lengths_path = data_path.replace(".input_ids.mmap", ".lengths.mmap")
input_ids_shape_path = input_ids_path + ".shape.json"
labels_shape_path = labels_path + ".shape.json"
lengths_shape_path = lengths_path + ".shape.json"
with open(input_ids_shape_path, 'r') as f:
input_ids_shape_info = json.load(f)
with open(labels_shape_path, 'r') as f:
labels_shape_info = json.load(f)
with open(lengths_shape_path, 'r') as f:
lengths_shape_info = json.load(f)
self.input_ids = np.memmap(
input_ids_path,
dtype=np.int32,
mode='r',
shape=(input_ids_shape_info['n_samples'], input_ids_shape_info['max_len'])
)
self.labels = np.memmap(
labels_path,
dtype=np.int32,
mode='r',
shape=(labels_shape_info['n_samples'], labels_shape_info['max_len'])
)
self.lengths = np.memmap(
lengths_path,
dtype=np.int32,
mode='r',
shape=(lengths_shape_info['n_samples'], lengths_shape_info['max_len'])
)
logging.info(f"Loaded {len(self.input_ids)} samples using mmap")
def __len__(self):
return len(self.input_ids)
    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        # Trim padding from both fields using the stored true length.
        length = int(self.lengths[i])
        input_ids = self.input_ids[i][:length]
        return dict(input_ids=input_ids, labels=self.labels[i][:length])
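# A sketch of producing files in the layout MMAPSupervisedDataset expects:
# <stem>.input_ids.mmap / .labels.mmap / .lengths.mmap, each with a sibling
# "<name>.shape.json" holding {"n_samples": ..., "max_len": ...}. The helper
# name and the toy shapes are assumptions for illustration.
def _write_mmap(path: str, array: np.ndarray):
    mm = np.memmap(path, dtype=np.int32, mode="w+", shape=array.shape)
    mm[:] = array
    mm.flush()
    with open(path + ".shape.json", "w") as f:
        json.dump({"n_samples": array.shape[0], "max_len": array.shape[1]}, f)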
class BufferedJsonlDataset(IterableDataset):
def __init__(
self,
data_path: str,
        buffer_size: int = 1000,  # size of the shuffle buffer
seed: Optional[int] = None,
shuffle: bool = True
):
super().__init__()
self.data_path = data_path
self.buffer_size = buffer_size
self.shuffle = shuffle
self.seed = seed
self.file_size = os.path.getsize(data_path)
logging.info(f"Reading from {data_path}: {len(self.file_size)}")
    def __iter__(self):
        worker_info = torch.utils.data.get_worker_info()
        if self.seed is not None:
            random_seed = self.seed
            if worker_info is not None:
                random_seed += worker_info.id
            np.random.seed(random_seed)
        start_pos = np.random.randint(0, self.file_size) if self.shuffle else 0
        buffer = []
        with open(self.data_path, 'r', encoding='utf-8') as f:
            # Seek once and discard the (probably partial) line at the seek
            # position; later reads continue from here, so refills do not
            # re-read the same chunk forever.
            f.seek(start_pos)
            if not f.readline():  # at end of file: wrap to the start
                f.seek(0)
            while True:
                if not buffer:
                    for _ in range(self.buffer_size):
                        line = f.readline()
                        if not line:  # end of file: wrap around
                            f.seek(0)
                            line = f.readline()
                        try:
                            data = json.loads(line.strip())
                            if "input_ids" in data:
                                buffer.append(data["input_ids"])
                        except json.JSONDecodeError:
                            logging.info("Invalid json line")
                            continue
                    if self.shuffle:
                        random.shuffle(buffer)
                if buffer:
                    yield buffer.pop()
                else:
                    break
    def __len__(self):
        # Nominal length only: the file size in bytes, not the sample count.
        return self.file_size
import json
import mmap
import os
import torch
import random
from torch.utils.data import IterableDataset
class JSONLDataset(IterableDataset):
def __init__(self, data_path, buffer_size=1000):
"""
Args:
data_path: jsonl文件路径
buffer_size: 缓存大小
"""
super().__init__()
self.data_path = data_path
self.buffer_size = buffer_size
self.file_size = os.path.getsize(data_path)
def get_random_start_pos(self, mm):
"""获取随机起始位置"""
# 随机选择一个文件位置
pos = random.randint(0, self.file_size - 1)
# 调整到最近的行首
while pos > 0 and mm[pos-1] != ord('\n'):
pos -= 1
return pos
def read_lines(self, mm, start_pos):
"""从指定位置读取数据"""
buffer = []
current_pos = start_pos
while len(buffer) < self.buffer_size and current_pos < self.file_size:
line_start = current_pos
            # Scan to the end of the current line
while current_pos < self.file_size and mm[current_pos] != ord('\n'):
current_pos += 1
if current_pos < self.file_size:
line = mm[line_start:current_pos].decode('utf-8')
try:
data = json.loads(line)
if "input_ids" in data:
buffer.append(data["input_ids"])
except json.JSONDecodeError:
                    pass  # skip invalid JSON lines
            current_pos += 1  # skip the newline character
return buffer, current_pos
def __iter__(self):
worker_info = torch.utils.data.get_worker_info()
with open(self.data_path, 'rb') as f:
mm = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
if worker_info is not None:
start_pos = self.get_random_start_pos(mm)
else:
start_pos = 0
current_pos = start_pos
while True:
buffer, next_pos = self.read_lines(mm, current_pos)
if not buffer and next_pos >= self.file_size:
current_pos = 0
continue
elif not buffer:
current_pos = next_pos
continue
random.shuffle(buffer)
for item in buffer:
yield torch.tensor(item)
current_pos = next_pos
    def __len__(self):
        # Rough estimate, assuming an average of ~100 bytes per line.
        return int(self.file_size / 100)
if __name__ == "__main__":
from torch.utils.data import DataLoader
dataset = BufferedJsonlDataset(
data_path="path/to/your/large.jsonl",
buffer_size=1000,
seed=42,
shuffle=True
)
dataloader = DataLoader(
dataset,
batch_size=32,
num_workers=4,
pin_memory=True
)
for batch in dataloader:
pass