ModelZoo / qwen2.5-coder_pytorch · Commits

Commit 53b3977b, authored Jul 11, 2025 by dongchy920

Initial commit

Pipeline #2841 failed in 0 seconds. Changes: 350 · Pipelines: 1

Showing 20 changed files with 1086 additions and 0 deletions (+1086 / -0).
finetuning/sft/utils/decont.py (+315 / -0)
finetuning/sft/utils/multiple_metrics/.gitkeep (+0 / -0)
finetuning/sft/utils/multiple_metrics/__init__.py (+0 / -0)
finetuning/sft/utils/multiple_metrics/containerized_eval.py (+77 / -0)
finetuning/sft/utils/multiple_metrics/eval_cpp.py (+41 / -0)
finetuning/sft/utils/multiple_metrics/eval_cs.py (+83 / -0)
finetuning/sft/utils/multiple_metrics/eval_dlang.py (+73 / -0)
finetuning/sft/utils/multiple_metrics/eval_go.py (+45 / -0)
finetuning/sft/utils/multiple_metrics/eval_java.py (+53 / -0)
finetuning/sft/utils/multiple_metrics/eval_javascript.py (+52 / -0)
finetuning/sft/utils/multiple_metrics/eval_julia.py (+23 / -0)
finetuning/sft/utils/multiple_metrics/eval_lua.py (+19 / -0)
finetuning/sft/utils/multiple_metrics/eval_php.py (+22 / -0)
finetuning/sft/utils/multiple_metrics/eval_pl.py (+22 / -0)
finetuning/sft/utils/multiple_metrics/eval_python.py (+22 / -0)
finetuning/sft/utils/multiple_metrics/eval_r.py (+50 / -0)
finetuning/sft/utils/multiple_metrics/eval_racket.py (+49 / -0)
finetuning/sft/utils/multiple_metrics/eval_ruby.py (+45 / -0)
finetuning/sft/utils/multiple_metrics/eval_rust.py (+56 / -0)
finetuning/sft/utils/multiple_metrics/eval_scala.py (+39 / -0)
finetuning/sft/utils/decont.py (new file, mode 100644)
import argparse
import collections
import copy
import hashlib
import itertools
import json
import math
import multiprocessing as mp
import os
import re
import subprocess
import sys
import tempfile
import time
import traceback
from pathlib import Path

import ahocorasick
import datasets
import jsonlines
import nltk  # used by the if_tokenize branches of the n-gram helpers below
import numpy as np
import pandas as pd
import requests
import tqdm
import xlsxwriter
from datasketch import MinHash, MinHashLSH
from func_timeout import func_set_timeout
from sacrebleu import metrics

from utils import utils

DATA_DIR = "./test_data/"

def has_n_gram_overlap(string1, string2, n_gram=10, if_tokenize=False):
    if if_tokenize:
        string1 = nltk.tokenize.word_tokenize(string1)
        string2 = nltk.tokenize.word_tokenize(string2)
        string1 = " ".join(string1)
        string2 = " ".join(string2)
    tokens1 = string1.split()
    tokens2 = string2.split()
    grams1 = set([" ".join(tokens1[i:i + n_gram]) for i in range(len(tokens1) - (n_gram - 1))])
    grams2 = set([" ".join(tokens2[i:i + n_gram]) for i in range(len(tokens2) - (n_gram - 1))])
    overlap = grams1.intersection(grams2)
    return len(overlap) > 0
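
# Example (hypothetical strings, for illustration only; relies on
# has_n_gram_overlap defined above):
# shared = "def add ( a , b ) : return a + b  # simple helper used in the tests"
# has_n_gram_overlap("prefix text " + shared, shared + " trailing text")  # -> True (10-gram shared)
# has_n_gram_overlap(shared, "completely different text")                 # -> False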

def has_n_gram_overlap_with_testset(string1, testset, n_gram=10, if_tokenize=False, overlaps=None, verbose=False):
    # `overlaps` collects the matching n-grams for the caller; default to a
    # fresh list rather than a shared mutable default.
    if overlaps is None:
        overlaps = []
    if if_tokenize:
        string1 = nltk.tokenize.word_tokenize(string1)
        string1 = " ".join(string1)
    tokens1 = string1.split()
    grams1 = set([" ".join(tokens1[i:i + n_gram]) for i in range(len(tokens1) - (n_gram - 1))])
    overlap = grams1.intersection(testset)
    overlaps.extend(list(overlap))
    if len(overlap) > 0 and verbose:
        print(overlap)
    return len(overlap) > 0

def get_n_gram(string, n_gram=10, if_tokenize=False):
    # Optionally tokenize first, then build whitespace-delimited n-grams.
    if if_tokenize:
        string = " ".join(nltk.tokenize.word_tokenize(string))
    tokens1 = string.split()
    return set([" ".join(tokens1[i:i + n_gram]) for i in range(len(tokens1) - (n_gram - 1))])

def load_leetcode_test_data():
    data = utils.read_jsonl_file("livecodebench.jsonl")
    samples = []
    for obj in data:
        samples.append({"prompt": obj["prompt"]})
    return samples


def load_humanevalpack_test_data(data_path=f"{DATA_DIR}/humanevalpack"):
    ds1 = datasets.load_dataset(data_path, "python", trust_remote_code=True)["test"]
    ds2 = datasets.load_dataset(data_path, "js", trust_remote_code=True)["test"]
    ds3 = datasets.load_dataset(data_path, "java", trust_remote_code=True)["test"]
    ds4 = datasets.load_dataset(data_path, "go", trust_remote_code=True)["test"]
    ds5 = datasets.load_dataset(data_path, "cpp", trust_remote_code=True)["test"]
    ds6 = datasets.load_dataset(data_path, "rust", trust_remote_code=True)["test"]
    combined_dataset = datasets.concatenate_datasets([ds1, ds2, ds3, ds4, ds5, ds6])
    data = []
    for j, sample in enumerate(combined_dataset):
        data.append(sample)
    return data


def load_multiply_e():
    data = {}
    samples = []
    for lg in ["sh", "ts", "cs", "php", "java", "cpp", "js", "go", "rs"]:
        objs = utils.read_jsonl_file(f"{DATA_DIR}/multiple/data/humaneval-{lg}.jsonl")
        for j, sample in enumerate(objs):
            samples.append({"prompt": sample["prompt"]})
    return samples


def load_mbpp_test_data(data_path=f"{DATA_DIR}/mbpp/"):
    ds = utils.read_jsonl_file(f"{data_path}/mbpp.jsonl")
    data = []
    for j, sample in enumerate(ds):
        data.append(sample)
    return data

def load_ds1000_test_data(data_path="data/ds1000_data/"):
    def extract_ds_1000_prompt(prompt: str):
        if "SOLUTION START" in prompt:
            assert prompt.count("SOLUTION START") == 1
            return prompt.split("SOLUTION START")[0]
        elif "BEGIN SOLUTION" in prompt:
            assert prompt.count("BEGIN SOLUTION") == 1
            return prompt.split("BEGIN SOLUTION")[0]
        else:
            return prompt

    def load_ds_1000(data_path):
        data = []
        for prompt_file in Path(data_path).glob("*/Insertion/q*/prompt.txt"):
            with open(prompt_file) as f:
                # Keep only the prompt portion; downstream reads it via obj["insertion"].
                data.append({"insertion": extract_ds_1000_prompt(f.read())})
        return data

    return load_ds_1000(data_path)

def load_codeapex_data():
    ds = utils.read_jsonl_file("data/eval/eval_codeapex_v1.jsonl")
    data = []
    for j, sample in enumerate(ds):
        data.append(sample)
    return data


def get_testset_n_gram(n_gram=10, test_set=["mbpp", "multiple", "humanevalpack"]):
    print("Start Loading decont test set")
    mbpp_data = load_mbpp_test_data()
    humaneval_data = load_humanevalpack_test_data()
    multiply_e_data = load_multiply_e()
    ds1000_data = load_ds1000_test_data()
    codeapex_data = load_codeapex_data()
    # leetcode_data = load_leetcode_test_data()
    print("Successfully Loading decont test set")
    all_grams = set([])
    if "mbpp" in test_set:
        for obj in mbpp_data:
            n_grams = get_n_gram(obj["text"] + "\n" + obj["code"] + "\n".join(obj["test_list"]), n_gram=n_gram)
            all_grams.update(n_grams)
    if "humanevalpack" in test_set:
        for obj in humaneval_data:
            n_grams = get_n_gram(obj["instruction"] + obj["prompt"] + obj["canonical_solution"] + obj["test"], n_gram=n_gram)
            all_grams.update(n_grams)
    if "multiple" in test_set:
        for obj in multiply_e_data:
            n_grams = get_n_gram(obj["prompt"], n_gram=n_gram)
            all_grams.update(n_grams)
    if "ds1000" in test_set:
        for obj in ds1000_data:
            n_grams = get_n_gram(obj["insertion"], n_gram=n_gram)
            all_grams.update(n_grams)
    if "codeapex" in test_set:
        for obj in codeapex_data:
            n_grams = get_n_gram(obj["prompt"], n_gram=n_gram)
            all_grams.update(n_grams)
    # for obj in leetcode_data:
    #     n_grams = get_n_gram(obj["prompt"], n_gram=n_gram)
    #     all_grams.update(n_grams)
    return all_grams

def decontaminate_for_cpt(text, testset_n_gram, testset_func_names, n_gram=10, if_tokenize=False, verbose=False):
    """
    True denotes contamination
    """
    if has_n_gram_overlap_with_testset(text, testset_n_gram, n_gram=n_gram, if_tokenize=if_tokenize, verbose=verbose):
        return True
    if contain_func_name(text, testset_func_names):
        return True
    return False


def contain_func_name(text, testset_func_names):
    if f'{extract_func_name(text)}' in testset_func_names:
        return True
    else:
        return False


def extract_func_name(text):
    if re.search(r"def (.*?)\(().*?\)", text) is not None:
        return re.search(r"def (.*?)\(().*?\)", text).group(1).strip()
    if re.search(r"public \w+ \w+ (.*?)\(().*?\)", text) is not None:
        return re.search(r"public \w+ \w+ (.*?)\(().*?\)", text).group(1).strip()
    else:
        return None


def extract_class_name(text):
    if re.search(r"public\w+(\s+)\w*{", text) is not None:
        return re.search(r"def (.*?)\(().*?\)", text).group(1).strip()
    else:
        return None

def get_testset_func_name(datasets=["humaneval", "mbpp"]):
    test_func_names = set()
    if "humaneval" in datasets:
        humaneval_data = load_humanevalpack_test_data()
        test_func_names.update(set([obj["entry_point"] for obj in humaneval_data]))
    if "mbpp" in datasets:
        mbpp_data = load_mbpp_test_data()
        test_func_names.update(set([extract_func_name(obj["code"]) for obj in mbpp_data]))
    return test_func_names

def deduplicate_similar_strings(objs, num_perm=512, jaccard_threshold=0.8):
    """
    # # Example usage
    # strings = ["hello", "h3llo", "helloo", "world", "w0rld", "word", "whirled"]
    # deduplicated = deduplicate_similar_strings(strings, jaccard_threshold=0.8)
    # print(deduplicated)
    """
    # Create an LSH index with a given Jaccard similarity threshold
    lsh = MinHashLSH(threshold=jaccard_threshold, num_perm=num_perm)
    # Create MinHash objects for each string and add to the LSH index
    signatures = {}
    for i, obj in tqdm.tqdm(enumerate(objs)):
        minhash = MinHash(num_perm=num_perm)
        for word in obj["text"].split():
            minhash.update(word.encode('utf8'))
        lsh.insert(f'string_{i}', minhash)
        signatures[f'string_{i}'] = minhash
    unique_strings = []
    processed = set()
    for i, obj in enumerate(objs):
        key = f'string_{i}'
        if key in processed:
            continue
        similar_keys = lsh.query(signatures[key])
        for sim_key in similar_keys:
            processed.add(sim_key)
        unique_strings.append(obj)
    print(f"{len(objs)} -> {len(unique_strings)}")
    return unique_strings
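
# Example (hypothetical records, for illustration): unlike the docstring above,
# the implementation expects dicts carrying a "text" field rather than raw strings.
# records = [
#     {"text": "def add(a, b): return a + b  # adds two numbers together"},
#     {"text": "def add(a, b): return a + b  # adds two  numbers together"},
#     {"text": "def greet(name): return 'hello ' + name"},
# ]
# deduplicate_similar_strings(records, num_perm=128, jaccard_threshold=0.8)
# # -> keeps 2 records: the two near-identical snippets collapse to one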

def deduplicate_similar_strings_chatml(objs, num_perm=512, jaccard_threshold=0.6):
    """
    # # Example usage
    # strings = ["hello", "h3llo", "helloo", "world", "w0rld", "word", "whirled"]
    # deduplicated = deduplicate_similar_strings(strings, jaccard_threshold=0.8)
    # print(deduplicated)
    """
    # Create an LSH index with a given Jaccard similarity threshold
    lsh = MinHashLSH(threshold=jaccard_threshold, num_perm=num_perm)
    # Create MinHash objects for each string and add to the LSH index
    signatures = {}
    for i, obj in tqdm.tqdm(enumerate(objs)):
        minhash = MinHash(num_perm=num_perm)
        for word in (obj["messages"][1]["content"] + "\n" + obj["messages"][1]["content"]).split():
            minhash.update(word.encode('utf8'))
        lsh.insert(f'string_{i}', minhash)
        signatures[f'string_{i}'] = minhash
    unique_strings = []
    processed = set()
    for i, obj in enumerate(objs):
        key = f'string_{i}'
        if key in processed:
            continue
        similar_keys = lsh.query(signatures[key])
        for sim_key in similar_keys:
            processed.add(sim_key)
        unique_strings.append(obj)
    return unique_strings

def multi_tasks(objs, workers=64, path="data/system_role/log_gpt.jsonl", task=None, prompt_template=None, chunk_size=None, language=None):
    p = mp.Pool(workers)
    if chunk_size:
        results = []
        job_num = math.ceil(len(objs) / chunk_size)
        print(f"job num: {job_num}")
        for worker_id in range(job_num):
            results.append(
                p.apply_async(
                    MPLogExceptions(task),
                    args=(
                        objs[worker_id * chunk_size:(worker_id + 1) * chunk_size],
                        worker_id,
                        workers,
                        None,
                        path,
                        prompt_template,
                        language,
                    ),
                )
            )
    else:
        chunk_size = math.ceil(len(objs) / float(workers))
        results = []
        for worker_id in range(workers):
            results.append(
                p.apply_async(
                    MPLogExceptions(task),
                    args=(
                        objs[worker_id * chunk_size:(worker_id + 1) * chunk_size],
                        worker_id,
                        workers,
                        None,
                        path,
                        prompt_template,
                        language,
                    ),
                )
            )
    p.close()
    p.join()
    output_objs = []
    for result in results:
        output_objs.extend(result.get())
    return output_objs

if __name__ == "__main__":
    test_n_grams = get_testset_n_gram()
    objs = utils.read_jsonl_file("./sft.jsonl")
    cnt = 0
    for obj in tqdm.tqdm(objs):
        overlaps = []
        dialog = "\n".join([obj["messages"][i]["content"] for i in range(1, len(obj["messages"]))])
        if has_n_gram_overlap_with_testset(obj["text"], test_n_grams, n_gram=10, if_tokenize=False, overlaps=overlaps):
            print(obj["text"])
            cnt += 1
    print(cnt)
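
# Sketch of a possible end-to-end pass built from the helpers above, assuming the
# test data paths configured earlier are available locally:
# test_n_grams = get_testset_n_gram(test_set=["mbpp", "multiple", "humanevalpack"])
# test_func_names = get_testset_func_name()
# kept = [obj for obj in utils.read_jsonl_file("./sft.jsonl")
#         if not decontaminate_for_cpt(obj["text"], test_n_grams, test_func_names, n_gram=10)]
# print(f"kept {len(kept)} uncontaminated samples")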

finetuning/sft/utils/multiple_metrics/.gitkeep (new empty file, mode 100644)

finetuning/sft/utils/multiple_metrics/__init__.py (new empty file, mode 100644)

finetuning/sft/utils/multiple_metrics/containerized_eval.py (new file, mode 100644)

"""
NOTE: Nothing containerized about this any more. This is just a helper
for problem_evaluator.py.
"""
import tempfile
from pathlib import Path

from . import (
    eval_cpp,
    eval_dlang,
    eval_java,
    eval_javascript,
    eval_julia,
    eval_lua,
    eval_php,
    eval_python,
    eval_r,
    eval_racket,
    eval_ruby,
    eval_rust,
    eval_swift,
    eval_ts,
    eval_go,
    eval_pl,
    eval_sh,
    eval_scala,
    eval_cs,
)

EVALUATORS = {
    "rb": (eval_ruby.eval_script, ".rb"),
    "lua": (eval_lua.eval_script, ".lua"),
    "python": (eval_python.eval_script, ".py"),
    "py": (eval_python.eval_script, ".py"),
    "notypes.py": (eval_python.eval_script, ".py"),
    "julia": (eval_julia.eval_script, ".jl"),
    "java": (eval_java.eval_script, ".java"),
    "rust": (eval_rust.eval_script, ".rs"),
    "rs": (eval_rust.eval_script, ".rs"),
    "swift": (eval_swift.eval_script, ".swift"),
    "racket": (eval_racket.eval_script, ".rkt"),
    "rkt": (eval_racket.eval_script, ".rkt"),
    "javascript": (eval_javascript.eval_script, ".js"),
    "js": (eval_javascript.eval_script, ".js"),
    "cpp": (eval_cpp.eval_script, ".cpp"),
    "cs": (eval_cs.eval_script, ".cs"),
    "php": (eval_php.eval_script, ".php"),
    "humaneval_to_dlang.py": (eval_dlang.eval_script, ".d"),
    "d": (eval_dlang.eval_script, ".d"),
    "r": (eval_r.eval_script, ".r"),
    "humaneval_to_r.py": (eval_r.eval_script, ".r"),
    "jl": (eval_julia.eval_script, ".jl"),
    "ts": (eval_ts.eval_script, ".ts"),
    "go": (eval_go.eval_script, ".go"),
    "pl": (eval_pl.eval_script, ".pl"),
    "sh": (eval_sh.eval_script, ".sh"),
    "scala": (eval_scala.eval_script, ".scala"),
}

def eval_string_script(language, program):
    if language in EVALUATORS:
        (eval_script, file_ext) = EVALUATORS[language]
    else:
        eval_module = __import__(f"eval_{language}" if language != "go_test.go" else "eval_go")
        eval_script = eval_module.eval_script
        file_ext = f".{language}" if language != "go_test.go" else "_test.go"
    with tempfile.NamedTemporaryFile(suffix=file_ext, delete=True) as f:
        f.write(program.encode("utf-8"))
        f.flush()
        result = eval_script(Path(f.name))
        # Only save the first 2K of output from the running program. Any further
        # output is very likely an exceptionally long stack trace or a long
        # series of prints.
        if type(result["stdout"]) == bytes:
            result["stdout"] = result["stdout"].decode("utf-8", errors="ignore")
        if result["stdout"] is None:
            result["stdout"] = ""
        if result["stderr"] is None:
            result["stderr"] = ""
        if type(result["stderr"]) == bytes:
            result["stderr"] = result["stderr"].decode("utf-8", errors="ignore")
        assert type(result["stdout"]) == str
        assert type(result["stderr"]) == str
        return {
            "program": program,
            "stdout": result["stdout"].replace("!!int", "")[:2048],
            "stderr": result["stderr"][:2048],
            "exit_code": result["exit_code"],
            "status": result["status"],
        }
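
# Example (sketch): dispatch a small Python program through the table above;
# assumes a `python` interpreter is available on PATH.
# result = eval_string_script("py", "assert 1 + 1 == 2\nprint('ok')\n")
# result["status"], result["exit_code"]   # expected ("OK", 0) when the assert passes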

finetuning/sft/utils/multiple_metrics/eval_cpp.py (new file, mode 100644)

from pathlib import Path

from .generic_eval import main
from .safe_subprocess import run

LANG_NAME = "C++"
LANG_EXT = ".cpp"


def eval_script(path: Path):
    basename = ".".join(str(path).split(".")[:-1])
    build_result = run(["g++", path, "-o", basename, "-std=c++17"])
    if build_result.exit_code != 0:
        return {
            "status": "SyntaxError",
            "exit_code": build_result.exit_code,
            "stdout": build_result.stdout,
            "stderr": build_result.stderr,
        }
    run_result = run([basename])
    if "In file included from /shared/centos7/gcc/9.2.0-skylake/" in run_result.stderr:
        raise Exception("Skylake bug encountered")
    if "/4.8.2" in run_result.stderr:
        raise Exception("Ancient compiler encountered")
    if run_result.timeout:
        status = "Timeout"
    elif run_result.exit_code != 0:
        status = "Exception"
    else:
        status = "OK"
    return {
        "status": status,
        "exit_code": run_result.exit_code,
        "stdout": run_result.stdout,
        "stderr": run_result.stderr,
    }


if __name__ == "__main__":
    main(eval_script, LANG_NAME, LANG_EXT)

finetuning/sft/utils/multiple_metrics/eval_cs.py (new file, mode 100644)

import os
import subprocess

from .generic_eval import main

LANG_NAME = "CSharp"
LANG_EXT = ".cs"

# Following files have problems:
# 137,
# 22: Any
# 148: Elipsis
def eval_script(path: str):
    if ".cs" not in path.name:
        return
    basename = ".".join(str(path).split(".")[:-1])
    binaryname = basename + ".exe"
    try:
        # build = subprocess.run(
        #     ["csc", "/d:DEBUG", "-r:System.Numerics.dll", path, f"/out:{binaryname}"],
        #     capture_output=True,
        # )
        build = subprocess.run(
            ["mcs", "/d:DEBUG", "-r:System.Numerics.dll", path, f"/out:{binaryname}"],
            capture_output=True,
        )
    except:
        subprocess.run("apt-get install mono-devel")
    status = None
    returncode = -1
    output = None
    if build.returncode != 0:
        # Well, it's a compile error. May be a type error or
        # something. But, why break the set convention
        status = "SyntaxError"
        returncode = build.returncode
        output = build
    else:
        try:
            output = subprocess.run(
                ["mono", binaryname],
                env={"PATH": os.getenv("PATH"), "MONO_TRACE_LISTENER": "Console.Error"},
                capture_output=True,
                timeout=10,
            )
            returncode = output.returncode
            output.stderr = str(output.stderr, "utf-8")
            # mono return 0 even when failing
            fail = (
                "System.Diagnostics.DefaultTraceListener.Fail" in output.stderr
                or "Unhandled Exception" in output.stderr
            )
            output.returncode = 1 if fail else 0
            if output.returncode == 0:
                status = "OK"
            else:
                # Well, it's a panic
                status = "Exception"
        except subprocess.TimeoutExpired as exc:
            status = "Timeout"
            output = exc
        os.remove(binaryname)
    if output.stdout is not None:
        output.stdout = output.stdout.decode("utf-8")
    else:
        output.stdout = "None"
    if output.stderr == "":
        output.stderr = "None"
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": output.stdout,
        "stderr": output.stderr,
    }


if __name__ == "__main__":
    main(eval_script, LANG_NAME, LANG_EXT)

finetuning/sft/utils/multiple_metrics/eval_dlang.py (new file, mode 100644)

import os
import re
from pathlib import Path

from .safe_subprocess import run

ENABLE_SYNTAX_CHECK = False


def eval_script(path: Path):
    result = run(["rdmd", "-unittest", str(path)], timeout_seconds=15)
    if "might not be correctly installed" in result.stderr:
        raise Exception("D is not correctly installed")
    if result.timeout:
        status = "Timeout"
    elif result.exit_code == 0:
        status = "OK"
    elif "Error:" in result.stderr:
        status = "SyntaxError"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": result.exit_code,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }


DIR = "d-keep-code_davinci_001_temp_0.2"


def main():
    directory = Path(Path(__file__).parent, "..", "datasets", DIR).resolve()
    count = {"OK": 0, "Timeout": 0, "Exception": 0, "SyntaxError": 0}
    for filename in os.listdir(directory):
        path = Path.joinpath(directory, filename)
        r = eval_script(path)
        status = r["status"]
        count[status] += 1
        if ENABLE_SYNTAX_CHECK and status == "SyntaxError":
            error_msgs = r["stderr"].split("\n")
            with open(path) as source_file:
                lines = source_file.readlines()
                unittest_line_start = lines.index("unittest\n")
                unittest_line_end = len(lines)
                for err_msg_line in error_msgs:
                    matched_parts = re.match(
                        r"(\/?.*?\.[\w:]+\/.*.d)\(([0-9]+)\): Error: (.*)",
                        err_msg_line[2:-1],
                    )
                    _file, line_num = matched_parts[1], int(matched_parts[2])
                    if (unittest_line_start <= line_num and line_num <= unittest_line_end):
                        print("===============")
                        print(path, "contains error in unit test part")
                        print(error_msgs)
                        print("===============")
        filename = filename.split(".")[0]
        print(f"Dlang, {filename}, {status}")
    print(DIR + ":" + str(count))


if __name__ == "__main__":
    main()

finetuning/sft/utils/multiple_metrics/eval_go.py (new file, mode 100644)

import argparse
import subprocess
from pathlib import Path
from sys import exit

from .generic_eval import main as gmain


def eval_script(path: Path):
    status = None
    stdout = None
    stderr = None
    exit_code = None
    try:
        build = subprocess.run(
            ["go", "test", path],
            timeout=30,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout = build.stdout.decode("utf-8", errors="ignore")
        stderr = build.stderr.decode("utf-8", errors="ignore")
        exit_code = build.returncode
        # write to stderr just so that we can redirect stdout to a csv
        if "[setup failed]" in stdout or "[build failed]" in stdout:
            status = "SyntaxError"
        elif "FAIL" in stdout:
            status = "Exception"
        else:
            status = "OK"
    except subprocess.TimeoutExpired:
        status = "Timeout"
    return {
        "status": status,
        "exit_code": exit_code,
        "stdout": stdout,
        "stderr": stderr,
    }


if __name__ == "__main__":
    gmain(eval_script, "Go", ".go")

finetuning/sft/utils/multiple_metrics/eval_java.py (new file, mode 100644)

import os
import tempfile
from pathlib import Path

from .generic_eval import main
from .safe_subprocess import run

LANG_NAME = "Java"
LANG_EXT = ".java"

# Following files have problems:
# 137,
# 22: Any
# 148: Elipsis
def eval_script(path: Path):
    sys_env = os.environ.copy()
    javatuples_path = Path(f"{os.path.dirname(__file__)}/javatuples-1.2.jar")
    sys_env["CLASSPATH"] = f"{javatuples_path}"
    with tempfile.TemporaryDirectory() as outdir:
        # Each Java file contains the class with same name `JAVA_CLASS_NAME`
        # Hence, javac will write the same JAVA_CLASS_NAME.class file for each problem
        # Write class for each problem to a different temp dir
        # Use UTF8 encoding with javac
        result = run(["javac", "-encoding", "UTF8", "-d", outdir, path], env=sys_env)
        if result.exit_code != 0:
            # Well, it's a compile error. May be a type error or
            # something. But, why break the set convention
            status = "SyntaxError"
        else:
            result = run(["java", "-ea", "-cp", f"{outdir}", "Problem"], env=sys_env)
            if result.timeout:
                status = "Timeout"
            elif result.exit_code == 0:
                status = "OK"
            else:
                status = "Exception"
    return {
        "status": status,
        "exit_code": result.exit_code,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }


if __name__ == "__main__":
    main(eval_script, LANG_NAME, LANG_EXT)

finetuning/sft/utils/multiple_metrics/eval_javascript.py (new file, mode 100644)

import os
import subprocess
from pathlib import Path


def eval_script(path: Path):
    try:
        # Assumes exit-code 0 is all okay
        output = subprocess.run(["node", str(path)], capture_output=True, timeout=5)
        if output.returncode == 0:
            status = "OK"
        else:
            outmessage = str(output)
            if "ERR_ASSERTION" in outmessage:
                status = "AssertionError"
            elif "SyntaxError" in outmessage:
                status = "SyntaxError"
            elif "ReferenceError" in outmessage:
                status = "ReferenceError"
            else:
                status = "Exception"
        returncode = output.returncode
    except subprocess.TimeoutExpired as exc:
        status = "Timeout"
        output = exc
        returncode = -1
    except subprocess.CalledProcessError as exc:
        status = "Exception"
        returncode = exc.returncode
        output = exc
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
        "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
    }


def main():
    directory = Path(Path(__file__).parent, "..", "datasets", "js-keep-code_davinci_001_temp_0.2").resolve()
    for filename in os.listdir(directory):
        r = eval_script(Path.joinpath(directory, filename))
        filename = filename.split(".")[0]
        print(f"JavaScript, {filename}, {r['status']}")


if __name__ == "__main__":
    main()

finetuning/sft/utils/multiple_metrics/eval_julia.py (new file, mode 100644)

from pathlib import Path

from .safe_subprocess import run


def eval_script(path: Path):
    result = run(["julia", str(path)], timeout_seconds=5)
    if result.timeout:
        status = "Timeout"
    elif result.exit_code == 0:
        status = "OK"
    # TODO(arjun): I would like this to be reviewed more carefully by John.
    elif len(result.stderr) < 1:
        status = "Exception"
    else:
        status = "SyntaxError"
    return {
        "status": status,
        "exit_code": result.exit_code,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }

finetuning/sft/utils/multiple_metrics/eval_lua.py (new file, mode 100644)

from pathlib import Path

from .safe_subprocess import run


def eval_script(path: Path):
    r = run(["lua", str(path)])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0:
        status = "OK"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }

finetuning/sft/utils/multiple_metrics/eval_php.py (new file, mode 100644)

from pathlib import Path

from .safe_subprocess import run

LANG_NAME = "PHP"
LANG_EXT = ".php"


def eval_script(path: Path):
    r = run(["php", path])
    if "PHP Parse error" in r.stdout:
        status = "SyntaxError"
    elif r.exit_code != 0:
        status = "Exception"
    else:
        status = "OK"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }

finetuning/sft/utils/multiple_metrics/eval_pl.py (new file, mode 100644)

from pathlib import Path

from .safe_subprocess import run


def eval_script(path: Path):
    r = run(["perl", path])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code != 0:
        status = "Exception"
    elif "ERROR" in r.stdout or "ERROR" in r.stderr:
        status = "Exception"
    else:
        status = "OK"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }

finetuning/sft/utils/multiple_metrics/eval_python.py (new file, mode 100644)

import os
from pathlib import Path

from .safe_subprocess import run


def eval_script(path: Path):
    os.environ["PATH"] = "miniconda3/envs/qwen/bin/:" + os.environ["PATH"]
    r = run(["python", str(path)])
    if r.timeout:
        status = "Timeout"
    elif r.exit_code == 0:
        status = "OK"
    elif "SyntaxError" in r.stderr:
        status = "SyntaxError"
    else:
        status = "Exception"
    return {
        "status": status,
        "exit_code": r.exit_code,
        "stdout": r.stdout,
        "stderr": r.stderr,
    }

finetuning/sft/utils/multiple_metrics/eval_r.py (new file, mode 100644)

import os
import subprocess
from pathlib import Path


def eval_script(path: Path):
    try:
        # Assumes exit-code 0 is all okay
        # Run R on the file, capturing stderr
        output = subprocess.run(["Rscript", str(path)], capture_output=True, timeout=5)
        if output.returncode == 0:
            status = "OK"
        else:
            outmessage = str(output)
            if "unexpected" in outmessage:
                status = "SyntaxError"
            elif "err=b''" in outmessage:
                status = "AssertionError"
            else:
                status = "Exception"
        returncode = output.returncode
    except subprocess.TimeoutExpired as exc:
        status = "Timeout"
        output = exc
        returncode = -1
    except subprocess.CalledProcessError as exc:
        status = "Exception"
        returncode = exc.returncode
        output = exc
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": output.stdout,
        "stderr": output.stderr,
    }


def main():
    directory = Path(Path(__file__).parent, "..", "datasets", "R-keep-code_davinci_001_temp_0.2").resolve()
    for filename in os.listdir(directory):
        r = eval_script(Path.joinpath(directory, filename))
        filename = filename.split(".")[0]
        print(f"R, {filename}, {r['status']}")


if __name__ == "__main__":
    main()

finetuning/sft/utils/multiple_metrics/eval_racket.py (new file, mode 100644)

"""
Evaluates a generated Racket program (.rkt).
"""
import os
from pathlib import Path

from .safe_subprocess import run


def eval_script(path: Path):
    result = run(["racket", str(path)])
    if (
        "standard-module-name-resolver: collection not found\nfor module path: rackunit"
        in result.stderr
    ):
        print(f"Failed to run evaluation for {path}: rackunit is not installed")
        return None
    # rackunit produces exit code 0 even if tests fail.
    if len(result.stderr) > 0 or result.exit_code != 0:
        if "read-syntax" in result.stderr:
            status = "SyntaxError"
        else:
            status = "Exception"
    else:
        status = "OK"
    return {
        "status": status,
        "exit_code": result.exit_code,
        "stdout": result.stdout,
        "stderr": result.stderr,
    }


def main():
    directory = Path(Path(__file__).parent, "..", "datasets", "racket-keep-code_davinci_001_temp_0.2").resolve()
    for filename in os.listdir(directory):
        r = eval_script(Path.joinpath(directory, filename))
        filename = filename.split(".")[0]
        print(f"Racket, {filename}, {r['status']}")


if __name__ == "__main__":
    main()

finetuning/sft/utils/multiple_metrics/eval_ruby.py (new file, mode 100644)

import argparse
import subprocess
from pathlib import Path

from .generic_eval import main as gmain


def eval_script(path: Path):
    try:
        # Assumes exit-code 0 is all okay
        # Need check=True for Ruby to pass errors to CalledProcessError
        output = subprocess.run(
            ["ruby", path], check=True, capture_output=True, timeout=5
        )
        if output.returncode == 0:
            status = "OK"
            out = output.stderr
            error = output.stdout
            returncode = 0
        else:
            raise Exception("there's an issue with check = True for Ruby, INVESTIGATE!")
    except subprocess.TimeoutExpired as exc:
        status = "Timeout"
        out = exc.stdout
        error = exc.stderr
        returncode = -1
    except subprocess.CalledProcessError as exc:
        returncode = exc.returncode
        out = exc.stdout
        error = exc.stderr
        # failure with code 1 but no error message is an Exception from Failed tests
        if len(error) < 1:
            status = "Exception"
        else:
            # everything that prints out an error message is a SyntaxError
            status = "SyntaxError"
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": out,
        "stderr": error,
    }


if __name__ == "__main__":
    gmain(eval_script, "Ruby", ".rb")

finetuning/sft/utils/multiple_metrics/eval_rust.py (new file, mode 100644)

import os
import subprocess
from pathlib import Path

from .generic_eval import main

LANG_NAME = "Rust"
LANG_EXT = ".rs"


def eval_script(path: Path):
    basename = ".".join(str(path).split(".")[:-1])
    try:
        build = subprocess.run(
            ["rustc", path, "-o", basename], capture_output=True, timeout=15
        )
    except subprocess.TimeoutExpired as exc:
        return {
            "status": "Timeout",
            "exit_code": -1,
            "stdout": "Compiler timeout",
            "stderr": "Compiler timeout",
        }
    status = None
    returncode = -1
    output = None
    if build.returncode != 0:
        # Well, it's a compile error. May be a type error or
        # something. But, why break the set convention
        status = "SyntaxError"
        returncode = build.returncode
        output = build
    else:
        try:
            # Assumes exit-code 0 is all okay
            output = subprocess.run([basename], capture_output=True, timeout=5)
            returncode = output.returncode
            if output.returncode == 0:
                status = "OK"
            else:
                # Well, it's a panic
                status = "Exception"
        except subprocess.TimeoutExpired as exc:
            status = "Timeout"
            output = exc
        os.remove(basename)
    return {
        "status": status,
        "exit_code": returncode,
        "stdout": "" if output.stdout is None else output.stdout.decode("utf-8"),
        "stderr": "" if output.stderr is None else output.stderr.decode("utf-8"),
    }


if __name__ == "__main__":
    main(eval_script, LANG_NAME, LANG_EXT)

finetuning/sft/utils/multiple_metrics/eval_scala.py (new file, mode 100644)

import tempfile
from pathlib import Path

from .safe_subprocess import run

LANG_NAME = "Scala"
LANG_EXT = ".scala"


def eval_script(path: Path):
    with tempfile.TemporaryDirectory() as outdir:
        # Each Scala file contains the class with same name `JAVA_CLASS_NAME`
        # Hence, scalac will write the same JAVA_CLASS_NAME.class file for each problem
        # Write class for each problem to a different temp dir
        build = run(["scalac", "-d", outdir, path], timeout_seconds=45)
        if build.exit_code != 0:
            # Well, it's a compile error. May be a type error or
            # something. But, why break the set convention
            return {
                "status": "SyntaxError",
                "exit_code": build.exit_code,
                "stdout": build.stdout,
                "stderr": build.stderr,
            }
        # "Problem" is the name of the class we emit.
        r = run(["scala", "-cp", f"{outdir}", "Problem"])
        if r.timeout:
            status = "Timeout"
        elif r.exit_code == 0 and r.stderr == "":
            status = "OK"
        else:
            # Well, it's a panic
            status = "Exception"
        return {
            "status": status,
            "exit_code": r.exit_code,
            "stdout": r.stdout,
            "stderr": r.stderr,
        }