"backend/apps/webui/internal/migrations/001_initial_schema.py" did not exist on "0d78b638057c80f09c47599f56d91dbab40ce5d0"
Commit b6edc328 authored by chenzk's avatar chenzk
Browse files

v1.0

parents
Pipeline #2229 canceled with stages
import os
from time import sleep
try:
from anthropic import Anthropic
except ImportError as e:
pass
from lcb_runner.runner.base_runner import BaseRunner
class Claude3Runner(BaseRunner):
client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))
def __init__(self, args, model):
super().__init__(args, model)
self.client_kwargs: dict[str, str] = {
"model": args.model,
"temperature": args.temperature,
"max_tokens": args.max_tokens,
"top_p": args.top_p,
}
def _run_single(self, prompt: tuple[str, str]) -> list[str]:
def __run_single(counter):
try:
response = self.client.messages.create(
system=prompt[0],
messages=prompt[1],
**self.client_kwargs,
)
content = "\n".join([x.text for x in response.content])
return content
except Exception as e:
print("Exception: ", repr(e), "Sleeping for 20 seconds...")
sleep(20 * (11 - counter))
counter = counter - 1
if counter == 0:
print(f"Failed to run model for {prompt}!")
print("Exception: ", repr(e))
raise e
return __run_single(counter)
outputs = []
try:
for _ in range(self.args.n):
outputs.append(__run_single(10))
except Exception as e:
raise e
return outputs
import os
from time import sleep
try:
from anthropic import Anthropic
except ImportError as e:
pass
from lcb_runner.runner.base_runner import BaseRunner
class ClaudeRunner(BaseRunner):
client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))
def __init__(self, args, model):
super().__init__(args, model)
self.client_kwargs: dict[str, str] = {
"model": args.model,
"temperature": args.temperature,
"max_tokens_to_sample": args.max_tokens,
"top_p": args.top_p,
}
def _run_single(self, prompt: str) -> list[str]:
def __run_single(counter):
try:
response = self.client.completions.create(
prompt=prompt,
**self.client_kwargs,
)
content = response.completion
return content
except Exception as e:
print("Exception: ", repr(e), "Sleeping for 20 seconds...")
sleep(20 * (11 - counter))
counter = counter - 1
if counter == 0:
print(f"Failed to run model for {prompt}!")
print("Exception: ", repr(e))
raise e
return __run_single(counter)
outputs = []
try:
for _ in range(self.args.n):
outputs.append(__run_single(10))
except Exception as e:
raise e
return outputs
import os
from time import sleep
try:
import cohere
except ImportError as e:
pass
from lcb_runner.runner.base_runner import BaseRunner
class CohereRunner(BaseRunner):
client = cohere.Client(os.getenv("COHERE_API_KEY"))
def __init__(self, args, model):
super().__init__(args, model)
self.client_kwargs: dict[str, str] = {
"model": args.model,
"temperature": args.temperature,
"max_tokens": args.max_tokens,
"p": args.top_p,
}
def _run_single(self, prompt: tuple[dict[str,str], str]) -> list[str]:
chat_history, message = prompt
def __run_single(counter):
try:
response = self.client.chat(
message=message,
chat_history=chat_history,
**self.client_kwargs,
)
content = response.text
return content
except Exception as e:
print("Exception: ", repr(e), "Sleeping for 20 seconds...")
sleep(20 * (11 - counter))
counter = counter - 1
if counter == 0:
print(f"Failed to run model for {prompt}!")
print("Exception: ", repr(e))
raise e
return __run_single(counter)
outputs = []
try:
for _ in range(self.args.n):
outputs.append(__run_single(10))
except Exception as e:
raise e
return outputs
import os
import json
from lcb_runner.runner.parser import get_args
from lcb_runner.utils.scenarios import Scenario
from lcb_runner.utils.path_utils import get_output_path
from lcb_runner.evaluation import extract_instance_results
from lcb_runner.runner.scenario_router import (
build_prompt_benchmark,
sort_and_extract_save_results,
get_metrics,
)
def main():
args = get_args()
benchmark, _ = build_prompt_benchmark(args)
with open(args.custom_output_file, "r") as f:
custom_outputs = json.load(f)
assert isinstance(custom_outputs, list)
assert len(custom_outputs) == len(benchmark), f"{len(custom_outputs)} != {len(benchmark)}"
if isinstance(custom_outputs[0], list):
## custom outputs must be list[list[str]]
## list of extracted outputs per question
## sorted by the benchmark question_id, test_id, id depending on the scenario
assert all(
isinstance(custom_output, list) for custom_output in custom_outputs
)
elif isinstance(custom_outputs[0], dict):
## custom outputs must be list[dict[str, Any]] (see the example sketch after main() below)
## list of extracted outputs per question
## for codegeneration and selfrepair scenario -- `code_list` and `question_id` are required
## for testoutputprediction -- `pred_list`, `question_id`, `test_id` are required
## for codeexecution -- `pred_list`, `id` are required
## code_list/pred_list is a list of extracted answers (code or assertions) for a question
assert all(
isinstance(custom_output, dict) for custom_output in custom_outputs
)
if args.scenario in [Scenario.codegeneration, Scenario.selfrepair]:
custom_outputs = [
custom_output["code_list"]
for custom_output in sorted(
custom_outputs, key=lambda x: str(x["question_id"])
)
]
elif args.scenario == Scenario.testoutputprediction:
custom_outputs = [
custom_output['pred_list']
for custom_output in sorted(
custom_outputs, key=lambda x: (str(x["question_id"]), str(x['test_id']))
)
]
elif args.scenario == Scenario.codeexecution:
custom_outputs = [
custom_output['pred_list']
for custom_output in sorted(
custom_outputs, key=lambda x: int(x["id"].split("_")[1])
)
]
save_results = [
instance.insert_output(custom_output, custom_output)
for instance, custom_output in zip(benchmark, custom_outputs)
]
save_results, combined_results = sort_and_extract_save_results(
args.scenario, save_results
)
metrics = get_metrics(args.scenario, args, benchmark, combined_results)
graded = extract_instance_results(metrics[1])
if args.scenario == Scenario.codegeneration:
metadatas = metrics[2]
save_eval_results = [
instance.insert_output_evaluation(
outputs_list, extracted_list, graded_list, metadata=meta
)
for instance, (outputs_list, extracted_list), graded_list, meta in zip(
benchmark, combined_results, graded, metadatas
)
]
else:
save_eval_results = [
instance.insert_output_evaluation(
outputs_list, extracted_list, graded_list
)
for instance, (outputs_list, extracted_list), graded_list in zip(
benchmark, combined_results, graded
)
]
if args.custom_output_save_name is None:
output_path = args.custom_output_file[:-5] + f"_{args.scenario.value}_output.json"
else:
output_path = get_output_path(args.custom_output_save_name, args)
with open(output_path, "w") as f:
json.dump(save_results, f, indent=4)
with open(output_path.replace(".json", "_eval.json"), "w") as f:
json.dump(metrics, f, indent=4)
with open(output_path.replace(".json", "_eval_all.json"), "w") as f:
json.dump(save_eval_results, f, indent=4)
if __name__ == "__main__":
main()
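# Hedged example (not part of the original file): a minimal sketch of a custom
# output file accepted by the evaluator above for the code generation scenario.
# The question ids and code below are hypothetical; only the structure (one dict
# per question with "question_id" and "code_list") follows the asserts and
# comments in main().
#
#   [
#       {"question_id": "q_0001", "code_list": ["def solve():\n    ..."]},
#       {"question_id": "q_0002", "code_list": ["def solve():\n    ..."]}
#   ]
#
# Assuming this file lives at lcb_runner/runner/custom_evaluator.py, it could
# then be invoked roughly as:
#
#   python -m lcb_runner.runner.custom_evaluator \
#       --custom_output_file my_outputs.json \
#       --scenario codegeneration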
import os
from time import sleep
try:
import openai
from openai import OpenAI
except ImportError as e:
pass
from lcb_runner.runner.base_runner import BaseRunner
class DeepSeekRunner(BaseRunner):
client = OpenAI(
api_key=os.getenv("DEEPSEEK_API"), base_url="https://api.deepseek.com"
)
def __init__(self, args, model):
super().__init__(args, model)
self.client_kwargs: dict[str, str] = {
"model": args.model,
"temperature": args.temperature,
"max_tokens": args.max_tokens,
"top_p": args.top_p,
"frequency_penalty": 0,
"presence_penalty": 0,
"n": 1,
"timeout": args.openai_timeout,
# "stop": args.stop, --> stop is only used for base models currently
}
def _run_single(self, prompt: list[dict[str, str]]) -> list[str]:
assert isinstance(prompt, list)
def __run_single(counter):
try:
response = self.client.chat.completions.create(
messages=prompt,
**self.client_kwargs,
)
content = response.choices[0].message.content
return content
except (
openai.APIError,
openai.RateLimitError,
openai.InternalServerError,
openai.OpenAIError,
openai.APIStatusError,
openai.APITimeoutError,
openai.InternalServerError,
openai.APIConnectionError,
) as e:
print("Exception: ", repr(e))
print("Sleeping for 30 seconds...")
print("Consider reducing the number of parallel processes.")
sleep(30)
return __run_single(counter)
except Exception as e:
print(f"Failed to run the model for {prompt}!")
print("Exception: ", repr(e))
raise e
outputs = []
try:
for _ in range(self.args.n):
outputs.append(__run_single(10))
except Exception as e:
raise e
return outputs
import os
from time import sleep
try:
import google.generativeai as genai
from google.generativeai import GenerationConfig
except ImportError as e:
pass
from lcb_runner.runner.base_runner import BaseRunner
class GeminiRunner(BaseRunner):
client = genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
safety_settings = [
{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_NONE",
},
]
def __init__(self, args, model):
super().__init__(args, model)
self.client = genai.GenerativeModel(model.model_name)
self.generation_config = GenerationConfig(
candidate_count=1,
max_output_tokens=args.max_tokens,
temperature=args.temperature,
top_p=args.top_p,
)
def _run_single(self, prompt: str) -> list[str]:
def __run_single(counter):
try:
return self.client.generate_content(
prompt,
generation_config=self.generation_config,
safety_settings=GeminiRunner.safety_settings,
)
except Exception as e:
print("Exception: ", repr(e), "Sleeping for 20 seconds...")
sleep(20 * (11 - counter))
counter = counter - 1
if counter == 0:
print(f"Failed to run model for {prompt}!")
print("Exception: ", repr(e))
raise e
return __run_single(counter)
outputs = []
try:
for _ in range(self.args.n):
outputs.append(__run_single(10))
except Exception as e:
raise e
new_outputs = []
for output in outputs:
try:
new_outputs.append(output.text)
except Exception as e:
print("Cannot extract text exception: ", repr(e))
print(output.__dict__)
new_outputs.append("")
outputs = new_outputs
return outputs
import os
import json
from lcb_runner.runner.parser import get_args
from lcb_runner.utils.scenarios import Scenario
from lcb_runner.lm_styles import LanguageModelStore
from lcb_runner.runner.runner_utils import build_runner
from lcb_runner.utils.path_utils import get_output_path
from lcb_runner.evaluation import extract_instance_results
from lcb_runner.runner.scenario_router import (
build_prompt_benchmark,
combine_results,
sort_and_extract_save_results,
get_metrics,
)
def main():
args = get_args()
model = LanguageModelStore[args.model]
benchmark, format_prompt = build_prompt_benchmark(args)
if args.debug:
print(f"Running with {len(benchmark)} instances in debug mode")
benchmark = benchmark[:5]
output_path = get_output_path(model.model_repr, args)
eval_file = output_path.replace(".json", "_eval.json")
eval_all_file = output_path.replace(".json", "_eval_all.json")
if args.continue_existing or args.continue_existing_with_eval:
if os.path.exists(output_path):
with open(output_path, "r") as f:
old_save_results = json.load(f)
elif os.path.exists(eval_all_file):
with open(eval_all_file, "r") as f:
old_save_results = json.load(f)
else:
print(
f"File {output_path} does not exist in --continue_existing, starting from scratch"
)
old_save_results = []
old_save_results = [
instance
for instance in old_save_results
if instance["output_list"] and [x for x in instance["output_list"] if x]
]
old_save_results_question_ids = [
instance["question_id"] for instance in old_save_results
]
remaining_benchmark = [
instance
for instance in benchmark
if instance.question_id not in old_save_results_question_ids
]
print(
f"Found {len(old_save_results)} existing generations, continuing with {len(remaining_benchmark)} remaining"
)
else:
old_save_results = []
remaining_benchmark = benchmark
if len(remaining_benchmark) > 0:
runner = build_runner(args, model)
results: list[list[str]] = runner.run_main(remaining_benchmark, format_prompt)
else:
results = []
combined_results = combine_results(
args.scenario, results, model, args.cot_code_execution
)
save_results = [
instance.insert_output(outputs_list, extracted_list)
for instance, (outputs_list, extracted_list) in zip(
remaining_benchmark, combined_results
)
]
if args.continue_existing or args.continue_existing_with_eval:
save_results += old_save_results
save_results, combined_results = sort_and_extract_save_results(
args.scenario, save_results
)
with open(output_path, "w") as f:
json.dump(save_results, f, indent=4)
if args.evaluate:
if args.continue_existing_with_eval and os.path.exists(eval_all_file):
with open(eval_all_file) as fp:
old_eval_all_results = json.load(fp)
if os.path.exists(eval_file):
with open(eval_file) as fp:
old_eval_results = json.load(fp)
else:
old_eval_results = None
old_eval_results_question_ids = [
instance["question_id"] for instance in old_eval_all_results
]
remaining_indices = [
idx
for idx in range(len(benchmark))
if benchmark[idx].question_id not in old_eval_results_question_ids
]
benchmark = [benchmark[idx] for idx in remaining_indices]
combined_results = [combined_results[idx] for idx in remaining_indices]
old_eval_size = len(old_eval_results_question_ids)
new_eval_size = len(benchmark)
if new_eval_size == 0:
return
print(f"Found {old_eval_size}, running evals for {new_eval_size} problems")
metrics = get_metrics(args.scenario, args, benchmark, combined_results)
graded = extract_instance_results(metrics[1])
if old_eval_results:
for key in metrics[0]:
if key in old_eval_results[0]:
if key != "detail":
metrics[0][key] = (
old_eval_size * old_eval_results[0][key]
+ new_eval_size * metrics[0][key]
)
metrics[0][key] /= old_eval_size + new_eval_size
for key in metrics[0]["detail"]:
if key in old_eval_results[0]["detail"]:
metrics[0]["detail"][key] = {
**metrics[0]["detail"][key],
**old_eval_results[0]["detail"][key],
}
metrics[1] = {**metrics[1], **old_eval_results[1]}
else:
print("Old eval file not present, cannot update eval file")
metrics = {}
else:
metrics = get_metrics(args.scenario, args, benchmark, combined_results)
graded = extract_instance_results(metrics[1])
old_eval_all_results = []
old_eval_results = []
if args.scenario == Scenario.codegeneration:
if metrics:
metadatas = metrics[2]
else:
metadatas = [[] for _ in benchmark]
save_eval_results = [
instance.insert_output_evaluation(
outputs_list, extracted_list, graded_list, metadata=meta
)
for instance, (outputs_list, extracted_list), graded_list, meta in zip(
benchmark, combined_results, graded, metadatas
)
]
if metrics and old_eval_results:
metrics[2] = old_eval_results[2] + metrics[2]
elif args.scenario == Scenario.selfrepair:
metadatas = metrics[2]
with open(
f"output/{model.model_repr}/{Scenario.codegeneration}_{args.codegen_n}_{args.temperature}_eval_all.json"
) as f:
code_gen_evals = json.load(f)
original_code_lists = [
code_gen_eval["code_list"] for code_gen_eval in code_gen_evals
]
save_eval_results = [
instance.insert_output_evaluation(
outputs_list,
extracted_list,
graded_list,
metadata=meta,
original_code_list=original_code_list,
)
for instance, (
outputs_list,
extracted_list,
), graded_list, meta, original_code_list in zip(
benchmark, combined_results, graded, metadatas, original_code_lists
)
]
else:
save_eval_results = [
instance.insert_output_evaluation(
outputs_list, extracted_list, graded_list
)
for instance, (outputs_list, extracted_list), graded_list in zip(
benchmark, combined_results, graded
)
]
save_eval_results = old_eval_all_results + save_eval_results
with open(eval_file, "w") as f:
json.dump(metrics, f, indent=4)
with open(eval_all_file, "w") as f:
json.dump(save_eval_results, f, indent=4)
if __name__ == "__main__":
main()
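# Hedged usage sketch (not part of the original file): a typical invocation of
# this driver, assuming it lives at lcb_runner/runner/main.py as the imports
# suggest. The model name and sampling options below are illustrative only.
#
#   python -m lcb_runner.runner.main \
#       --model gpt-3.5-turbo-0301 \
#       --scenario codegeneration \
#       --n 10 --temperature 0.2 \
#       --evaluate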
import os
from time import sleep
try:
from mistralai.client import MistralClient
except ImportError as e:
pass
from lcb_runner.runner.base_runner import BaseRunner
class MistralRunner(BaseRunner):
client = MistralClient(
api_key=os.environ["MISTRAL_API_KEY"],
)
def __init__(self, args, model):
super().__init__(args, model)
self.client_kwargs: dict[str, str] = {
"model": args.model,
"temperature": args.temperature,
"max_tokens": args.max_tokens,
"top_p": args.top_p,
}
def _run_single(self, prompt: list[dict[str, str]]) -> list[str]:
def __run_single(counter):
try:
response = self.client.chat(
messages=prompt,
**self.client_kwargs,
)
content = response.choices[0].message.content
return content
except Exception as e:
print("Exception: ", repr(e), "Sleeping for 20 seconds...")
sleep(20 * (11 - counter))
counter = counter - 1
if counter == 0:
print(f"Failed to run model for {prompt}!")
print("Exception: ", repr(e))
raise e
return __run_single(counter)
outputs = []
try:
for _ in range(self.args.n):
outputs.append(__run_single(10))
except Exception as e:
raise e
return outputs
import os
from time import sleep
try:
import openai
from openai import OpenAI
except ImportError as e:
pass
from lcb_runner.lm_styles import LMStyle
from lcb_runner.runner.base_runner import BaseRunner
class OpenAIRunner(BaseRunner):
client = OpenAI(
api_key=os.getenv("OPENAI_KEY"),
)
def __init__(self, args, model):
super().__init__(args, model)
if model.model_style == LMStyle.OpenAIReason:
self.client_kwargs: dict[str, str] = {
"model": args.model,
"max_completion_tokens": 25000,
}
else:
self.client_kwargs: dict[str, str] = {
"model": args.model,
"temperature": args.temperature,
"max_tokens": args.max_tokens,
"top_p": args.top_p,
"frequency_penalty": 0,
"presence_penalty": 0,
"n": args.n,
"timeout": args.openai_timeout,
# "stop": args.stop, --> stop is only used for base models currently
}
def _run_single(self, prompt: list[dict[str, str]]) -> list[str]:
assert isinstance(prompt, list)
try:
response = OpenAIRunner.client.chat.completions.create(
messages=prompt,
**self.client_kwargs,
)
except (
openai.APIError,
openai.RateLimitError,
openai.InternalServerError,
openai.OpenAIError,
openai.APIStatusError,
openai.APITimeoutError,
openai.InternalServerError,
openai.APIConnectionError,
) as e:
print("Exception: ", repr(e))
print("Sleeping for 30 seconds...")
print("Consider reducing the number of parallel processes.")
sleep(30)
return self._run_single(prompt)
except Exception as e:
print(f"Failed to run the model for {prompt}!")
print("Exception: ", repr(e))
raise e
return [c.message.content for c in response.choices]
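# Hedged setup note (not part of the original file): the API-backed runners
# above read their credentials from environment variables at class-definition
# time, so the relevant variable generally needs to be exported before the
# corresponding module is imported, e.g.
#
#   export OPENAI_KEY=...        # OpenAIRunner
#   export ANTHROPIC_KEY=...     # ClaudeRunner / Claude3Runner
#   export GOOGLE_API_KEY=...    # GeminiRunner
#   export MISTRAL_API_KEY=...   # MistralRunner
#   export COHERE_API_KEY=...    # CohereRunner
#   export DEEPSEEK_API=...      # DeepSeekRunner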
import os
import torch
import argparse
from lcb_runner.utils.scenarios import Scenario
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--model",
type=str,
default="gpt-3.5-turbo-0301",
help="Name of the model to use matching `lm_styles.py`",
)
parser.add_argument(
"--local_model_path",
type=str,
default=None,
help="If you have a local model, specify it here in conjunction with --model",
)
parser.add_argument(
"--trust_remote_code",
action="store_true",
help="trust_remote_code option used in huggingface models",
)
parser.add_argument(
"--scenario",
type=Scenario,
default=Scenario.codegeneration,
help="Type of scenario to run",
)
parser.add_argument(
"--not_fast",
action="store_true",
help="whether to use full set of tests (slower and more memory intensive evaluation)",
)
parser.add_argument(
"--release_version",
type=str,
default="release_latest",
help="whether to use full set of tests (slower and more memory intensive evaluation)",
)
parser.add_argument(
"--cot_code_execution",
action="store_true",
help="whether to use CoT in code execution scenario",
)
parser.add_argument(
"--n", type=int, default=10, help="Number of samples to generate"
)
parser.add_argument(
"--codegen_n",
type=int,
default=10,
help="Number of samples for which code generation was run (used to map the code generation file during self-repair)",
)
parser.add_argument(
"--temperature", type=float, default=0.2, help="Temperature for sampling"
)
parser.add_argument("--top_p", type=float, default=0.95, help="Top p for sampling")
parser.add_argument(
"--max_tokens", type=int, default=2000, help="Max tokens for sampling"
)
parser.add_argument(
"--multiprocess",
default=0,
type=int,
help="Number of processes to use for generation (vllm runs do not use this)",
)
parser.add_argument(
"--stop",
default="###",
type=str,
help="Stop token (use `,` to separate multiple tokens)",
)
parser.add_argument("--continue_existing", action="store_true")
parser.add_argument("--continue_existing_with_eval", action="store_true")
parser.add_argument(
"--use_cache", action="store_true", help="Use cache for generation"
)
parser.add_argument(
"--cache_batch_size", type=int, default=100, help="Batch size for caching"
)
parser.add_argument("--debug", action="store_true", help="Debug mode")
parser.add_argument("--evaluate", action="store_true", help="Evaluate the results")
parser.add_argument(
"--num_process_evaluate",
type=int,
default=12,
help="Number of processes to use for evaluation",
)
parser.add_argument("--timeout", type=int, default=6, help="Timeout for evaluation")
parser.add_argument(
"--openai_timeout", type=int, default=90, help="Timeout for requests to OpenAI"
)
parser.add_argument(
"--tensor_parallel_size",
type=int,
default=-1,
help="Tensor parallel size for vllm",
)
parser.add_argument(
"--enable_prefix_caching",
action="store_true",
help="Enable prefix caching for vllm",
)
parser.add_argument(
"--custom_output_file",
type=str,
default=None,
help="Path to the custom output file used in `custom_evaluator.py`",
)
parser.add_argument(
"--custom_output_save_name",
type=str,
default=None,
help="Folder name to save the custom output results (output file folder modified if None)",
)
parser.add_argument("--dtype", type=str, default="bfloat16", help="Dtype for vllm")
args = parser.parse_args()
args.stop = args.stop.split(",")
if args.tensor_parallel_size == -1:
args.tensor_parallel_size = torch.cuda.device_count()
if args.multiprocess == -1:
args.multiprocess = os.cpu_count()
return args
def test():
args = get_args()
print(args)
if __name__ == "__main__":
test()
from lcb_runner.lm_styles import LMStyle, LanguageModel
def build_runner(args, model: LanguageModel):
if model.model_style == LMStyle.OpenAIChat:
from lcb_runner.runner.oai_runner import OpenAIRunner
return OpenAIRunner(args, model)
if model.model_style == LMStyle.OpenAIReason:
from lcb_runner.runner.oai_runner import OpenAIRunner
return OpenAIRunner(args, model)
if model.model_style == LMStyle.Gemini:
from lcb_runner.runner.gemini_runner import GeminiRunner
return GeminiRunner(args, model)
if model.model_style == LMStyle.Claude3:
from lcb_runner.runner.claude3_runner import Claude3Runner
return Claude3Runner(args, model)
if model.model_style == LMStyle.Claude:
from lcb_runner.runner.claude_runner import ClaudeRunner
return ClaudeRunner(args, model)
if model.model_style == LMStyle.MistralWeb:
from lcb_runner.runner.mistral_runner import MistralRunner
return MistralRunner(args, model)
if model.model_style == LMStyle.CohereCommand:
from lcb_runner.runner.cohere_runner import CohereRunner
return CohereRunner(args, model)
if model.model_style == LMStyle.DeepSeekAPI:
from lcb_runner.runner.deepseek_runner import DeepSeekRunner
return DeepSeekRunner(args, model)
elif model.model_style in []:
raise NotImplementedError(
f"Runner for language model style {model.model_style} not implemented yet"
)
else:
from lcb_runner.runner.vllm_runner import VLLMRunner
return VLLMRunner(args, model)
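# Hedged usage sketch (not part of the original file): build_runner dispatches
# on the model's LMStyle and returns the matching runner instance, mirroring
# how main.py drives it. The model is whatever --model resolves to in
# LanguageModelStore.
#
#   from lcb_runner.lm_styles import LanguageModelStore
#   from lcb_runner.runner.parser import get_args
#   from lcb_runner.runner.scenario_router import build_prompt_benchmark
#
#   args = get_args()
#   model = LanguageModelStore[args.model]
#   benchmark, format_prompt = build_prompt_benchmark(args)
#   runner = build_runner(args, model)   # e.g. OpenAIRunner for LMStyle.OpenAIChat
#   results = runner.run_main(benchmark, format_prompt)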
from typing import Union
from lcb_runner.utils.scenarios import Scenario
from lcb_runner.lm_styles import LanguageModel
from lcb_runner.evaluation import (
codegen_metrics,
test_output_metrics,
code_execution_metrics,
)
from lcb_runner.prompts import (
format_prompt_generation,
format_prompt_test_output,
format_prompt_execution,
format_prompt_execution_cot,
format_prompt_self_repair,
)
from lcb_runner.utils.extraction_utils import (
extract_code,
extract_test_output_code,
extract_execution_code,
)
from lcb_runner.benchmarks import (
CodeGenerationProblem,
TestOutputPredictionProblem,
CodeExecutionProblem,
load_code_generation_dataset,
load_code_generation_dataset_not_fast,
load_test_prediction_dataset,
load_code_execution_dataset,
)
# BenchMarkType = list[CodeGenerationProblem | TestOutputPredictionProblem]
BenchMarkType = list[
Union[CodeGenerationProblem, CodeExecutionProblem, TestOutputPredictionProblem]
]
def build_prompt_benchmark(
args,
) -> tuple[
list[CodeExecutionProblem]
| list[CodeGenerationProblem]
| list[TestOutputPredictionProblem],
callable,
]:
scenario: Scenario = args.scenario
if scenario == Scenario.codegeneration:
not_fast: bool = args.not_fast
if not_fast:
benchmark = load_code_generation_dataset_not_fast(args.release_version)
else:
benchmark = load_code_generation_dataset(args.release_version)
benchmark = sorted(benchmark, key=lambda x: x.question_id)
format_prompt = format_prompt_generation
elif scenario == Scenario.testoutputprediction:
benchmark = load_test_prediction_dataset(args.release_version)
benchmark = sorted(benchmark, key=lambda x: (x.question_id, x.test_id))
format_prompt = format_prompt_test_output
elif scenario == Scenario.selfrepair:
benchmark = load_code_generation_dataset(args.release_version)
benchmark = sorted(benchmark, key=lambda x: x.question_id)
format_prompt = format_prompt_self_repair
elif scenario == Scenario.codeexecution:
cot_code_execution: bool = args.cot_code_execution
benchmark = load_code_execution_dataset(args.release_version)
benchmark = sorted(benchmark, key=lambda x: int(x.id.split("_")[1]))
if cot_code_execution:
format_prompt = format_prompt_execution_cot
else:
format_prompt = format_prompt_execution
else:
raise ValueError(f"Scenario {scenario} not implemented")
return benchmark, format_prompt
def combine_results(
scenario: Scenario,
results: list[list[str]],
model: LanguageModel,
cot_code_execution: bool = False,
):
if scenario == Scenario.codegeneration:
combined_results = [
(
outputs_list,
[extract_code(output, model.model_style) for output in outputs_list],
)
for outputs_list in results
]
elif scenario == Scenario.testoutputprediction:
combined_results = [
(
outputs_list,
[
extract_test_output_code(output, model.model_style)
for output in outputs_list
],
)
for outputs_list in results
]
elif scenario == Scenario.selfrepair:
combined_results = [
(
[
output[0] if type(output) is list else output
for output in outputs_list
],
[
(
extract_code(output[0], model.model_style)
if type(output) is list
else extract_code(output, model.model_style)
)
for output in outputs_list
],
)
for outputs_list in results
]
elif scenario == Scenario.codeexecution:
combined_results = [
(
outputs_list,
[
extract_execution_code(
output, model.model_style, cot=cot_code_execution
)
for output in outputs_list
],
)
for outputs_list in results
]
else:
raise ValueError(f"Scenario {scenario} not implemented")
return combined_results
def sort_and_extract_save_results(scenario: Scenario, save_results: list[dict]):
if scenario == Scenario.codegeneration:
save_results = sorted(save_results, key=lambda x: x["question_id"])
combined_results = [
(save_result_instance["output_list"], save_result_instance["code_list"])
for save_result_instance in save_results
]
elif scenario == Scenario.testoutputprediction:
save_results = sorted(
save_results, key=lambda x: (x["question_id"], x["test_id"])
)
combined_results = [
(save_result_instance["output_list"], save_result_instance["pred_list"])
for save_result_instance in save_results
]
elif scenario == Scenario.selfrepair:
save_results = sorted(save_results, key=lambda x: x["question_id"])
combined_results = [
(save_result_instance["output_list"], save_result_instance["code_list"])
for save_result_instance in save_results
]
elif scenario == Scenario.codeexecution:
save_results = sorted(save_results, key=lambda x: int(x["id"].split("_")[1]))
combined_results = [
(save_result_instance["output_list"], save_result_instance["pred_list"])
for save_result_instance in save_results
]
else:
raise ValueError(f"Scenario {scenario} not implemented")
return save_results, combined_results
def get_metrics(
scenario: Scenario,
args,
benchmark: list[
CodeGenerationProblem | CodeExecutionProblem | TestOutputPredictionProblem
],
combined_results,
):
eval_samples = [instance.get_evaluation_sample() for instance in benchmark]
generations = [extracted for _, extracted in combined_results]
if scenario == Scenario.codegeneration or scenario == Scenario.selfrepair:
metrics = codegen_metrics(
eval_samples,
generations,
num_process_evaluate=args.num_process_evaluate,
timeout=args.timeout,
)
elif args.scenario == Scenario.testoutputprediction:
metrics = test_output_metrics(
eval_samples,
generations,
k_list=[1, 5],
)
elif args.scenario == Scenario.codeexecution:
metrics = code_execution_metrics(
eval_samples,
generations,
)
else:
raise ValueError(f"Scenario {scenario} not implemented")
print(metrics[0]["pass@1"])
return metrics
try:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
except ImportError as e:
# print("Cannot import vllm")
pass
from lcb_runner.runner.base_runner import BaseRunner
class VLLMRunner(BaseRunner):
def __init__(self, args, model):
super().__init__(args, model)
model_tokenizer_path = (
model.model_name if args.local_model_path is None else args.local_model_path
)
self.llm = LLM(
model=model_tokenizer_path,
tokenizer=model_tokenizer_path,
tensor_parallel_size=args.tensor_parallel_size,
# dtype=args.dtype,
enforce_eager=True,
max_model_len=4096,
disable_custom_all_reduce=True,
enable_prefix_caching=args.enable_prefix_caching,
trust_remote_code=args.trust_remote_code,
)
self.sampling_params = SamplingParams(
n=self.args.n,
max_tokens=self.args.max_tokens,
temperature=self.args.temperature,
top_p=self.args.top_p,
frequency_penalty=0,
presence_penalty=0,
stop=self.args.stop,
)
def _run_single(self, prompt: str) -> list[str]:
pass
def run_batch(self, prompts: list[str]) -> list[list[str]]:
outputs = [None for _ in prompts]
remaining_prompts = []
remaining_indices = []
for prompt_index, prompt in enumerate(prompts):
if self.args.use_cache and prompt in self.cache:
if len(self.cache[prompt]) == self.args.n:
outputs[prompt_index] = self.cache[prompt]
continue
remaining_prompts.append(prompt)
remaining_indices.append(prompt_index)
if remaining_prompts:
vllm_outputs = self.llm.generate(remaining_prompts, self.sampling_params)
if self.args.use_cache:
assert len(remaining_prompts) == len(vllm_outputs)
for index, remaining_prompt, vllm_output in zip(
remaining_indices, remaining_prompts, vllm_outputs
):
self.cache[remaining_prompt] = [o.text for o in vllm_output.outputs]
outputs[index] = [o.text for o in vllm_output.outputs]
else:
for index, vllm_output in zip(remaining_indices, vllm_outputs):
outputs[index] = [o.text for o in vllm_output.outputs]
return outputs
from lcb_runner.lm_styles import LMStyle
def extract_code(model_output: str, lmstyle: LMStyle):
outputlines = model_output.split("\n")
if lmstyle == LMStyle.CodeLLaMaInstruct:
indexlines = [i for i, line in enumerate(outputlines) if "PYTHON]" in line]
if len(indexlines) < 2:
indexlines = [i for i, line in enumerate(outputlines) if "```" in line]
elif lmstyle == LMStyle.GenericBase:
return model_output.strip()
else:
indexlines = [i for i, line in enumerate(outputlines) if "```" in line]
if len(indexlines) < 2:
return ""
return "\n".join(outputlines[indexlines[0] + 1 : indexlines[1]])
def extract_test_output_code(model_output: str, lmstyle: LMStyle = None):
outputlines = model_output.split("\n")
# find the last line starting with assert...
indexlines = [i for i, line in enumerate(outputlines) if line.startswith("assert")]
if indexlines:
return outputlines[indexlines[-1]]
if lmstyle and lmstyle == LMStyle.CodeLLaMaInstruct:
indexlines = [i for i, line in enumerate(outputlines) if "PYTHON]" in line]
else:
# first try to extract ```python if not then try ```
indexlines = [
i
for i, line in enumerate(outputlines)
if "```python" in line or "```Python" in line
]
if indexlines:
start_index = indexlines[0]
else:
start_index = None
indexlines = [i for i, line in enumerate(outputlines) if "```" in line]
if start_index is not None:
indexlines = [i for i in indexlines if i > start_index]
indexlines = [start_index] + indexlines
if len(indexlines) < 2:
return ""
return "\n".join(outputlines[indexlines[0] + 1 : indexlines[1]])
def extract_execution_code(model_output: str, lmstyle: LMStyle, cot: bool = False):
if cot:
if "[ANSWER]" in model_output:
model_output = model_output.split("[ANSWER]")[1].strip()
if "==" in model_output:
model_output = model_output.split("==")[1].strip()
if "[/ANSWER]" in model_output:
model_output = model_output.split("[/ANSWER]")[0].strip()
else:
model_output = model_output.split("\n")[0].strip()
return model_output.strip()
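# Hedged examples (not part of the original file) of how the extractors above
# behave, based only on the logic in this module:
#
#   extract_code("Here you go:\n```python\nprint('hi')\n```", LMStyle.OpenAIChat)
#   # -> "print('hi')"            (text between the first pair of ``` fences)
#
#   extract_test_output_code("reasoning...\nassert f(1) == 2")
#   # -> "assert f(1) == 2"       (the last line starting with `assert` wins)
#
#   extract_execution_code("... [ANSWER] f(2) == 5 [/ANSWER]", LMStyle.OpenAIChat, cot=True)
#   # -> "5"                      (takes the part after "==" inside the [ANSWER] tags)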
""" Utilities for running functions in parallel processes. """
import sys
import resource
import multiprocessing as mp
import queue
import traceback
from enum import Enum
from typing import Callable, Optional, Dict, Any, List, Iterator
from concurrent.futures import TimeoutError
import attrs
import tqdm
from pebble import concurrent, ProcessPool, ProcessExpired
class FuncTimeoutError(TimeoutError):
pass
def generate_queue() -> mp.Queue:
"""
Generates a queue that can be shared amongst processes
Returns:
(multiprocessing.Queue): A queue instance
"""
manager = mp.Manager()
return manager.Queue()
QueueEmptyException = queue.Empty
def run_func_in_process(
func: Callable,
*args,
_timeout: Optional[int] = None,
_use_spawn: bool = True,
**kwargs,
):
"""
Runs the provided function in a separate process with the supplied args
and kwargs. The args, kwargs, and
return values must all be pickle-able.
Args:
func: The function to run.
*args: Positional args, if any.
_timeout: A timeout to use for the function.
_use_spawn: If True, the 'spawn' multiprocessing context is used; 'fork' otherwise.
**kwargs: Keyword args, if any.
Returns:
The result of executing the function.
"""
mode = "spawn" if _use_spawn else "fork"
c_func = concurrent.process(timeout=_timeout, context=mp.get_context(mode))(func)
future = c_func(*args, **kwargs)
try:
result = future.result()
return result
except TimeoutError:
raise FuncTimeoutError
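# Hedged usage sketch (not part of the original file): running a picklable,
# module-level function in a separate process with a timeout, per the
# docstring above. `square` is hypothetical.
#
#   def square(x):
#       return x * x
#
#   result = run_func_in_process(square, 7, _timeout=5)   # -> 49
#   # FuncTimeoutError is raised instead if the call exceeds the timeout.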
class TaskRunStatus(Enum):
SUCCESS = 0
EXCEPTION = 1
TIMEOUT = 2
PROCESS_EXPIRED = 3
@attrs.define(eq=False, repr=False)
class TaskResult:
status: TaskRunStatus
result: Optional[Any] = None
exception_tb: Optional[str] = None
def is_success(self) -> bool:
return self.status == TaskRunStatus.SUCCESS
def is_timeout(self) -> bool:
return self.status == TaskRunStatus.TIMEOUT
def is_exception(self) -> bool:
return self.status == TaskRunStatus.EXCEPTION
def is_process_expired(self) -> bool:
return self.status == TaskRunStatus.PROCESS_EXPIRED
def initializer(limit):
"""Set maximum amount of memory each worker process can allocate."""
soft, hard = resource.getrlimit(resource.RLIMIT_AS)
resource.setrlimit(resource.RLIMIT_AS, (limit, hard))
def run_tasks_in_parallel_iter(
func: Callable,
tasks: List[Any],
num_workers: int = 2,
timeout_per_task: Optional[int] = None,
use_progress_bar: bool = False,
progress_bar_desc: Optional[str] = None,
max_tasks_per_worker: Optional[int] = None,
use_spawn: bool = True,
max_mem: int = 1024 * 1024 * 1024 * 4,
) -> Iterator[TaskResult]:
"""
Args:
func: The function to run. The function must accept a single argument.
tasks: A list of tasks i.e. arguments to func.
num_workers: Maximum number of parallel workers.
timeout_per_task: The timeout, in seconds, to use per task.
use_progress_bar: Whether to use a progress bar. Default False.
progress_bar_desc: String to display in the progress bar. Default None.
max_tasks_per_worker: Maximum number of tasks assigned
to a single process / worker. None means infinite.
Use 1 to force a restart.
use_spawn: The 'spawn' multiprocess context is used. 'fork' otherwise.
Yields:
TaskResult objects, one per task.
"""
mode = "spawn" if use_spawn else "fork"
with ProcessPool(
max_workers=num_workers,
max_tasks=0 if max_tasks_per_worker is None else max_tasks_per_worker,
context=mp.get_context(mode),
) as pool:
future = pool.map(func, tasks, timeout=timeout_per_task)
iterator = future.result()
if use_progress_bar:
pbar = tqdm.tqdm(
desc=progress_bar_desc,
total=len(tasks),
dynamic_ncols=True,
file=sys.stdout,
)
else:
pbar = None
succ = timeouts = exceptions = expirations = 0
while True:
try:
result = next(iterator)
except StopIteration:
break
except TimeoutError as error:
yield TaskResult(
status=TaskRunStatus.TIMEOUT,
)
timeouts += 1
except ProcessExpired as error:
yield TaskResult(
status=TaskRunStatus.PROCESS_EXPIRED,
)
expirations += 1
except Exception as error:
exception_tb = traceback.format_exc()
yield TaskResult(
status=TaskRunStatus.EXCEPTION,
exception_tb=exception_tb,
)
exceptions += 1
else:
yield TaskResult(
status=TaskRunStatus.SUCCESS,
result=result,
)
succ += 1
if pbar is not None:
pbar.update(1)
pbar.set_postfix(
succ=succ, timeouts=timeouts, exc=exceptions, p_exp=expirations
)
sys.stdout.flush()
sys.stderr.flush()
def run_tasks_in_parallel(
func: Callable,
tasks: List[Any],
num_workers: int = 2,
timeout_per_task: Optional[int] = None,
use_progress_bar: bool = False,
progress_bar_desc: Optional[str] = None,
max_tasks_per_worker: Optional[int] = None,
use_spawn: bool = True,
) -> List[TaskResult]:
"""
Args:
func: The function to run. The function must accept a single argument.
tasks: A list of tasks i.e. arguments to func.
num_workers: Maximum number of parallel workers.
timeout_per_task: The timeout, in seconds, to use per task.
use_progress_bar: Whether to use a progress bar. Defaults False.
progress_bar_desc: String to display in the progress bar. Default None.
max_tasks_per_worker: Maximum number of tasks assigned to a single
process / worker. None means infinite.
Use 1 to force a restart.
use_spawn: The 'spawn' multiprocess context is used. 'fork' otherwise.
Returns:
A list of TaskResult objects, one per task.
"""
task_results: List[TaskResult] = list(
run_tasks_in_parallel_iter(
func=func,
tasks=tasks,
num_workers=num_workers,
timeout_per_task=timeout_per_task,
use_progress_bar=use_progress_bar,
progress_bar_desc=progress_bar_desc,
max_tasks_per_worker=max_tasks_per_worker,
use_spawn=use_spawn,
)
)
return task_results
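# Hedged usage sketch (not part of the original file): mapping a single-argument
# function over a list of tasks with a per-task timeout, per the docstring
# above. `slow_square` is hypothetical.
#
#   def slow_square(x):
#       return x * x
#
#   results = run_tasks_in_parallel(
#       slow_square, tasks=[1, 2, 3], num_workers=2, timeout_per_task=5
#   )
#   values = [r.result for r in results if r.is_success()]   # -> [1, 4, 9]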
import pathlib
from lcb_runner.lm_styles import LanguageModel, LMStyle
from lcb_runner.utils.scenarios import Scenario
def ensure_dir(path: str, is_file=True):
if is_file:
pathlib.Path(path).parent.mkdir(parents=True, exist_ok=True)
else:
pathlib.Path(path).mkdir(parents=True, exist_ok=True)
return
def get_cache_path(model_repr: str, args) -> str:
scenario: Scenario = args.scenario
n = args.n
temperature = args.temperature
path = f"cache/{model_repr}/{scenario}_{n}_{temperature}.json"
ensure_dir(path)
return path
def get_output_path(model_repr: str, args) -> str:
scenario: Scenario = args.scenario
n = args.n
temperature = args.temperature
cot_suffix = "_cot" if args.cot_code_execution else ""
path = f"output/{model_repr}/{scenario}_{n}_{temperature}{cot_suffix}.json"
ensure_dir(path)
return path
def get_eval_all_output_path(model_repr: str, args) -> str:
scenario: Scenario = args.scenario
n = args.n
temperature = args.temperature
cot_suffix = "_cot" if args.cot_code_execution else ""
path = f"output/{model_repr}/{scenario}_{n}_{temperature}{cot_suffix}_eval_all.json"
return path
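# Hedged example (not part of the original file): with a hypothetical
# model_repr of "GPT-4", scenario codegeneration, n=10, temperature=0.2 and
# no --cot_code_execution, get_output_path would produce something like
#
#   output/GPT-4/Scenario.codegeneration_10_0.2.json
#
# (the Scenario member is interpolated directly, so "Scenario.codegeneration"
# appears in the path), creating the parent directory if needed.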
from enum import Enum
class Scenario(Enum):
codegeneration = "codegeneration"
selfrepair = "selfrepair"
testoutputprediction = "testoutputprediction"
codeexecution = "codeexecution"
def get_gpqa_search_o1_instruction(MAX_SEARCH_LIMIT):
return (
"You are a reasoning assistant with the ability to perform web searches to help "
"you answer the user's question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will search and analyze relevant web pages, then provide you with helpful information in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"What is the energy range of pp III neutrinos?\"\n"
"Assistant thinking steps:\n"
"- I might need to look up details about pp III neutrinos.\n\n"
"Assistant:\n"
"<|begin_search_query|>pp III neutrino energy spectrum<|end_search_query|>\n\n"
"(System returns processed information from relevant web pages)\n\n"
"Assistant continues reasoning with the new information...\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- When done searching, continue your reasoning.\n\n"
)
def get_math_search_o1_instruction(MAX_SEARCH_LIMIT):
return (
"You are a reasoning assistant with the ability to perform web searches to help "
"you answer the user's question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will search and analyze relevant web pages, then provide you with helpful information in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"How do you compute the integral of e^(x^2) dx?\"\n"
"Assistant thinking steps:\n"
"- I might need to look up techniques for integrating e^(x^2).\n\n"
"Assistant:\n"
"<|begin_search_query|>methods to integrate e^(x^2)<|end_search_query|>\n\n"
"(System returns processed information from relevant web pages)\n\n"
"Assistant continues reasoning with the new information...\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- When done searching, continue your reasoning.\n\n"
)
def get_code_search_o1_instruction(MAX_SEARCH_LIMIT):
return (
"You are a reasoning assistant with the ability to perform web searches to help "
"you answer the user's question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will search and analyze relevant web pages, then provide you with helpful information in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"Find the minimum number of vertices in a Steiner tree that includes all specified vertices in a given tree.\"\n"
"Assistant thinking steps:\n"
"- I need to understand what a Steiner tree is and how to compute the minimum number of vertices required to include all specified vertices in a given tree.\n\n"
"Assistant:\n"
"<|begin_search_query|>Minimum Steiner Tree problem in trees<|end_search_query|>\n\n"
"(System returns processed information from relevant web pages)\n\n"
"Assistant continues reasoning with the new information...\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- When done searching, continue your reasoning.\n\n"
)
def get_webpage_to_reasonchain_instruction(prev_reasoning, search_query, document):
return f"""**Task Instruction:**
You are tasked with reading and analyzing web pages based on the following inputs: **Previous Reasoning Steps**, **Current Search Query**, and **Searched Web Pages**. Your objective is to extract relevant and helpful information for **Current Search Query** from the **Searched Web Pages** and seamlessly integrate this information into the **Previous Reasoning Steps** to continue reasoning for the original question.
**Guidelines:**
1. **Analyze the Searched Web Pages:**
- Carefully review the content of each searched web page.
- Identify factual information that is relevant to the **Current Search Query** and can aid in the reasoning process for the original question.
2. **Extract Relevant Information:**
- Select the information from the Searched Web Pages that directly contributes to advancing the **Previous Reasoning Steps**.
- Ensure that the extracted information is accurate and relevant.
3. **Output Format:**
- **If the web pages provide helpful information for current search query:** Present the information beginning with `**Final Information**` as shown below.
**Final Information**
[Helpful information]
- **If the web pages do not provide any helpful information for current search query:** Output the following text.
**Final Information**
No helpful information found.
**Inputs:**
- **Previous Reasoning Steps:**
{prev_reasoning}
- **Current Search Query:**
{search_query}
- **Searched Web Pages:**
{document}
Now you should analyze each web page and find helpful information based on the current search query "{search_query}" and previous reasoning steps.
"""
def get_singleqa_search_o1_instruction(MAX_SEARCH_LIMIT):
return (
"You are a reasoning assistant with the ability to perform web searches to help "
"you answer the user's question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will search and analyze relevant web pages, then provide you with helpful information in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"Who got the first Nobel Prize in Physics?\"\n"
"Assistant thinking steps:\n"
"- I need to find out who was awarded the first Nobel Prize in Physics.\n\n"
"Assistant:\n"
"<|begin_search_query|>first Nobel Prize in Physics winner<|end_search_query|>\n\n"
"(System returns processed information from relevant web pages)\n\n"
"Assistant continues reasoning with the new information...\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- When done searching, continue your reasoning.\n\n"
)
def get_multiqa_search_o1_instruction(MAX_SEARCH_LIMIT):
return (
"You are a reasoning assistant with the ability to perform web searches to help "
"you answer the user's question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will search and analyze relevant web pages, then provide you with helpful information in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"Alice David is the voice of Lara Croft in a video game developed by which company?\"\n"
"Assistant thinking steps:\n"
"- I need to find out who voices Lara Croft in the video game.\n"
"- Then, I need to determine which company developed that video game.\n\n"
"Assistant:\n"
"<|begin_search_query|>Alice David Lara Croft voice<|end_search_query|>\n\n"
"(System returns processed information from relevant web pages)\n\n"
"Assistant thinks: The search results indicate that Alice David is the voice of Lara Croft in a specific video game. Now, I need to find out which company developed that game.\n\n"
"Assistant:\n"
"<|begin_search_query|>video game developed by Alice David Lara Croft<|end_search_query|>\n\n"
"(System returns processed information from relevant web pages)\n\n"
"Assistant continues reasoning with the new information...\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- When done searching, continue your reasoning.\n\n"
)
def get_singleqa_rag_agent_instruction(MAX_SEARCH_LIMIT, MAX_URL_FETCH):
return (
"You are a reasoning assistant with the ability to perform web searches and retrieve webpage content to help "
"you answer the user’s question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will call the web search API with your query and return the search results to you in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n"
" The search results will contain a list of webpages with titles, URLs, and snippets (but not full content).\n\n"
"- After receiving the search results, if you need more detailed information from one or more specific URLs, write <|begin_url|> url1, url2, ... <|end_url|>.\n"
" The system will fetch the full page content of those URLs and return it to you as <|begin_full_page|> ...full page content... <|end_full_page|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n"
f"You can fetch up to {MAX_URL_FETCH} URLs for detailed information.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"Who got the first Nobel Prize in Physics?\"\n"
"Assistant thinking steps:\n"
"- I need to find out who was awarded the first Nobel Prize in Physics.\n\n"
"Assistant:\n"
"<|begin_search_query|>first Nobel Prize in Physics winner<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results mention several URLs. I want full details from one of them.\n\n"
"Assistant:\n"
"<|begin_url|>http://example.com/first_nobel_physics.html<|end_url|>\n\n"
"(System returns full page content)\n\n"
"Assistant:\n"
"<|begin_full_page|> ...full page content... <|end_full_page|>\n\n"
"Now the assistant has enough info and can continue reasoning.\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- Use <|begin_url|> to request full page content and end with <|end_url|>.\n"
"- When done retrieving information, continue your reasoning.\n\n"
)
def get_multiqa_rag_agent_instruction(MAX_SEARCH_LIMIT, MAX_URL_FETCH):
return (
"You are a reasoning assistant with the ability to perform web searches and retrieve webpage content to help "
"you answer the user’s question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will call the web search API with your query and return the search results to you in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n"
" The search results will contain a list of webpages with titles, URLs, and snippets (but not full content).\n\n"
"- After receiving the search results, if you need more detailed information from one or more specific URLs, write <|begin_url|> url1, url2, ... <|end_url|>.\n"
" The system will fetch the full page content of those URLs and return it to you as <|begin_full_page|> ...full page content... <|end_full_page|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n"
f"You can fetch up to {MAX_URL_FETCH} URLs for detailed information.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"Alice David is the voice of Lara Croft in a video game developed by which company?\"\n"
"Assistant thinking steps:\n"
"- I need to find out who voices Lara Croft in the video game.\n"
"- Then, I need to determine which company developed that video game.\n\n"
"Assistant:\n"
"<|begin_search_query|>voice actor of Lara Croft<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results provide names of voice actors for Lara Croft. I need to confirm if Alice David is one of them.\n\n"
"Assistant:\n"
"<|begin_search_query|>Alice David Lara Croft voice<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results indicate that Alice David is the voice of Lara Croft in a specific video game. Now, I need to find out which company developed that game.\n\n"
"Assistant:\n"
"<|begin_search_query|>video game developed by Alice David Lara Croft<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results mention the company that developed the video game featuring Alice David as Lara Croft.\n\n"
"Assistant:\n"
"<|begin_url|>http://example.com/lara_croft_voice_actor.html, http://example.com/game_developer.html<|end_url|>\n\n"
"(System returns full page content)\n\n"
"Assistant:\n"
"<|begin_full_page|> ...full page content... <|end_full_page|>\n\n"
"Now the assistant has enough info and can continue reasoning.\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- Use <|begin_url|> to request full page content and end with <|end_url|>.\n"
"- When done retrieving information, continue your reasoning.\n\n"
)
def get_gpqa_rag_agent_instruction(MAX_SEARCH_LIMIT, MAX_URL_FETCH):
return (
"You are a reasoning assistant with the ability to perform web searches and retrieve webpage content to help "
"you answer the user’s question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will call the web search API with your query and return the search results to you in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n"
" The search results will contain a list of webpages with titles, URLs, and snippets (but not full content).\n\n"
"- After receiving the search results, if you need more detailed information from one or more specific URLs, write <|begin_url|> url1, url2, ... <|end_url|>.\n"
" The system will fetch the full page content of those URLs and return it to you as <|begin_full_page|> ...full page content... <|end_full_page|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n"
f"You can fetch up to {MAX_URL_FETCH} URLs for detailed information.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"What is the energy range of pp III neutrinos?\"\n"
"Assistant thinking steps:\n"
"- I might need to look up details about pp III neutrinos.\n\n"
"Assistant:\n"
"<|begin_search_query|>pp III neutrino energy spectrum<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results mention some URLs. I want full details from one of them.\n\n"
"Assistant:\n"
"<|begin_url|>http://example.com/ppIII_neutrino.html<|end_url|>\n\n"
"(System returns full page content)\n\n"
"Assistant:\n"
"<|begin_full_page|> ...full page content... <|end_full_page|>\n\n"
"Now the assistant has enough info and can continue reasoning.\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- Use <|begin_url|> to request full page content and end with <|end_url|>.\n"
"- When done retrieving information, continue your reasoning.\n\n"
)
def get_math_rag_agent_instruction(MAX_SEARCH_LIMIT, MAX_URL_FETCH):
return (
"You are a reasoning assistant with the ability to perform web searches and retrieve webpage content to help "
"you answer the user’s math-related question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will call the web search API with your query and return the search results to you in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n"
" The search results will contain a list of webpages with titles, URLs, and snippets (but not full content).\n\n"
"- After receiving the search results, if you need more detailed information from one or more specific URLs, write <|begin_url|> url1, url2, ... <|end_url|>.\n"
" The system will fetch the full page content of those URLs and return it to you as <|begin_full_page|> ...full page content... <|end_full_page|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n"
f"You can fetch up to {MAX_URL_FETCH} URLs for detailed information.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"How do you compute the integral of e^(x^2) dx?\"\n"
"Assistant thinking steps:\n"
"- I might need to look up techniques for integrating e^(x^2).\n\n"
"Assistant:\n"
"<|begin_search_query|>methods to integrate e^(x^2)<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results mention some URLs. I want full details from one of them.\n\n"
"Assistant:\n"
"<|begin_url|>http://example.com/integration_e_x_squared.html<|end_url|>\n\n"
"(System returns full page content)\n\n"
"Assistant:\n"
"<|begin_full_page|> ...full page content... <|end_full_page|>\n\n"
"Now the assistant has enough info and can continue reasoning.\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- Use <|begin_url|> to request full page content and end with <|end_url|>.\n"
"- When done retrieving information, continue your reasoning.\n\n"
)
def get_code_rag_agent_instruction(MAX_SEARCH_LIMIT, MAX_URL_FETCH):
return (
"You are a reasoning assistant with the ability to perform web searches and retrieve webpage content to help "
"you answer the user’s programming-related question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will call the web search API with your query and return the search results to you in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n"
" The search results will contain a list of webpages with titles, URLs, and snippets (but not full content).\n\n"
"- After receiving the search results, if you need more detailed information from one or more specific URLs, write <|begin_url|> url1, url2, ... <|end_url|>.\n"
" The system will fetch the full page content of those URLs and return it to you as <|begin_full_page|> ...full page content... <|end_full_page|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n"
f"You can fetch up to {MAX_URL_FETCH} URLs for detailed information.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"How do I implement a binary search algorithm in Python?\"\n"
"Assistant thinking steps:\n"
"- I might need to look up the implementation details of binary search in Python.\n\n"
"Assistant:\n"
"<|begin_search_query|>binary search algorithm implementation in Python<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results mention some URLs. I want full details from one of them.\n\n"
"Assistant:\n"
"<|begin_url|>http://example.com/python_binary_search.html<|end_url|>\n\n"
"(System returns full page content)\n\n"
"Assistant:\n"
"<|begin_full_page|> ...full page content... <|end_full_page|>\n\n"
"Now the assistant has enough info and can continue reasoning.\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- Use <|begin_url|> to request full page content and end with <|end_url|>.\n"
"- When done retrieving information, continue your reasoning.\n\n"
)
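# All three agent instructions above share the same marker protocol:
# <|begin_search_query|> ... <|end_search_query|> requests a web search, and
# <|begin_url|> ... <|end_url|> requests full page content. Below is a minimal
# sketch of how an inference loop might detect a pending search request in the
# generated text; this helper is illustrative only and is not used elsewhere in
# this repository.
import re

def extract_search_query_demo(text):
    # Return the most recent search query emitted between the markers, or None.
    matches = re.findall(r"<\|begin_search_query\|>(.*?)<\|end_search_query\|>", text, re.DOTALL)
    return matches[-1].strip() if matches else None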
def get_naive_rag_instruction(question, documents):
return (
"You are a knowledgeable assistant that uses the provided documents to answer the user's question.\n\n"
"Question:\n"
f"{question}\n"
"Documents:\n"
f"{documents}\n"
)
def get_task_instruction_openqa(question, model_name=None):
if model_name == 'qwq':
user_prompt = (
'Please answer the following question. '
'You should provide your final answer in the format \\boxed{YOUR_ANSWER}.\n\n'
f'Question:\n{question}\n\n'
)
else:
user_prompt = (
'Please answer the following question. You should think step by step to solve it.\n\n'
'Provide your final answer in the format \\boxed{YOUR_ANSWER}.\n\n'
f'Question:\n{question}\n\n'
)
return user_prompt
def get_task_instruction_math(question, model_name=None):
if model_name == 'qwq':
user_prompt = (
'Please answer the following math question. '
'You should provide your final answer in the format \\boxed{YOUR_ANSWER}.\n\n'
f'Question:\n{question}\n\n'
)
else:
user_prompt = (
'Please answer the following math question. You should think step by step to solve it.\n\n'
'Provide your final answer in the format \\boxed{YOUR_ANSWER}.\n\n'
f'Question:\n{question}\n\n'
)
return user_prompt
def get_task_instruction_multi_choice(question, model_name=None):
if model_name == 'qwq':
user_prompt = (
'Please answer the following multiple-choice question. '
'You should provide your final choice in the format \\boxed{YOUR_CHOICE}.\n\n'
f'Question:\n{question}\n\n'
)
elif model_name == 'llama':
user_prompt = (
'Please answer the following multiple-choice question. You should think step by step to solve it.\n\n'
'Provide your final choice in the format \\boxed{YOUR_CHOICE}. Your final choice should be one of the letters A, B, C, or D; DO NOT include any answer content.\n\n'
f'Question:\n{question}\n\n'
)
else:
user_prompt = (
'Please answer the following multiple-choice question. You should think step by step to solve it.\n\n'
'Provide your final choice in the format \\boxed{YOUR_CHOICE}.\n\n'
f'Question:\n{question}\n\n'
)
return user_prompt
def get_task_instruction_code(question, question_title=None, model_name=None):
if model_name == 'qwq':
user_prompt = (
'Generate a correct Python program that passes all tests for the given problem. '
'You should provide your final code within a Python code block using triple backticks (```python\n'
'YOUR_CODE\n'
'```).\n\n'
f'Problem Title: {question_title}\n\n'
f'Problem Statement:\n{question}\n\n'
)
else:
user_prompt = (
'You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. '
f'You should think step by step to solve it.\n\nQuestion:\n{question}\n\n'
'Read the inputs from stdin, solve the problem, and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows.\n\n'
"```python\n# YOUR CODE HERE\n```\n\n"
)
return user_prompt
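# A small, self-contained demo of how these helpers compose a final user prompt.
# The question text, documents string, and the 'qwq' model name are illustrative
# assumptions; the real pipelines (direct generation and naive RAG below) pass
# actual dataset items and then wrap the result with the tokenizer's chat template.
if __name__ == "__main__":
    sample_question = "Who proposed the theory of general relativity?"
    sample_documents = "**Document 1:**\n**Title:** ...\n**Content:** ...\n"
    rag_instruction = get_naive_rag_instruction(sample_question, sample_documents)
    task_prompt = get_task_instruction_openqa(sample_question, model_name='qwq')
    print(rag_instruction + "\n\n" + task_prompt)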
import csv
import json
import random
import torch
import re
import os, time
import numpy as np
from tqdm import tqdm
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from evaluate import run_evaluation
from prompts import (
get_task_instruction_openqa,
get_task_instruction_math,
get_task_instruction_multi_choice,
get_task_instruction_code,
)
import argparse
def parse_args():
parser = argparse.ArgumentParser(description="Run direct generation for various datasets and models.")
parser.add_argument(
'--dataset_name',
type=str,
required=True,
choices=['gpqa', 'math500', 'aime', 'amc', 'livecode', 'nq', 'triviaqa', 'hotpotqa', '2wiki', 'musique', 'bamboogle', 'medmcqa', 'pubhealth'],
help="Name of the dataset to use."
)
parser.add_argument(
'--split',
type=str,
required=True,
choices=['test', 'diamond', 'main', 'extended'],
help="Dataset split to use."
)
parser.add_argument(
'--subset_num',
type=int,
default=-1,
help="Number of examples to process. Defaults to all if not specified."
)
parser.add_argument(
'--model_path',
type=str,
required=True,
help="Path to the pre-trained model."
)
parser.add_argument(
'--temperature',
type=float,
default=0.7,
help="Sampling temperature."
)
parser.add_argument(
'--top_p',
type=float,
default=0.8,
help="Top-p sampling parameter."
)
parser.add_argument(
'--top_k',
type=int,
default=20,
help="Top-k sampling parameter."
)
parser.add_argument(
'--repetition_penalty',
type=float,
default=None,
help="Repetition penalty. If not set, defaults based on the model."
)
parser.add_argument(
'--max_tokens',
type=int,
default=None,
help="Maximum number of tokens to generate. If not set, defaults based on the model and dataset."
)
return parser.parse_args()
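# Example invocation (the script name and model path are assumptions, shown only
# for illustration; any supported --dataset_name/--split combination works):
#   python run_direct_gen.py --dataset_name math500 --split test \
#       --model_path <path-or-hub-id-of-model> --temperature 0.7 --top_p 0.8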
def main():
args = parse_args()
dataset_name = args.dataset_name
split = args.split
subset_num = args.subset_num
model_path = args.model_path
temperature = args.temperature
top_p = args.top_p
top_k = args.top_k
repetition_penalty = args.repetition_penalty
max_tokens = args.max_tokens
# Set default repetition_penalty if not provided
if repetition_penalty is None:
repetition_penalty = 1.05 if 'qwq' in model_path.lower() else 1.0
# Paths to datasets
if dataset_name == 'math500':
data_path = f'./data/MATH500/{split}.json'
elif dataset_name == 'gpqa':
data_path = f'./data/GPQA/{split}.json'
elif dataset_name == 'aime':
data_path = f'./data/AIME/{split}.json'
elif dataset_name == 'amc':
data_path = f'./data/AMC/{split}.json'
elif dataset_name == 'livecode':
data_path = f'./data/LiveCodeBench/{split}.json'
elif dataset_name in ['nq', 'triviaqa', 'hotpotqa', 'musique', 'bamboogle', '2wiki', 'medmcqa', 'pubhealth']:
data_path = f'./data/QA_Datasets/{dataset_name}.json'
else:
raise ValueError(f"Unsupported dataset_name: {dataset_name}")
# Load the model
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
if 'qwq' in model_path.lower():
if dataset_name in ['math500', 'gpqa', 'aime', 'amc', 'livecode']:
output_dir = f'./outputs/{dataset_name}.qwq.direct'
else:
output_dir = f'./outputs/runs.qa/{dataset_name}.qwq.direct'
else:
model_short_name = model_path.split('/')[-1].lower().replace('-instruct', '')
output_dir = f'./outputs/runs.baselines/{dataset_name}.{model_short_name}.direct'
os.makedirs(output_dir, exist_ok=True)
llm = LLM(
model=model_path,
tensor_parallel_size=torch.cuda.device_count(),
gpu_memory_utilization=0.95,
)
# Load data
with open(data_path, mode='r', encoding='utf-8') as json_file:
filtered_data = json.load(json_file)
# prepare input
input_list = []
for item in filtered_data:
question = item['Question']
if dataset_name in ['nq', 'triviaqa', 'hotpotqa', 'musique', 'bamboogle', '2wiki']:
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_openqa(question, model_name='qwq')
else:
user_prompt = get_task_instruction_openqa(question)
elif dataset_name in ['math500', 'aime', 'amc']:
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_math(question, model_name='qwq')
else:
user_prompt = get_task_instruction_math(question)
elif dataset_name in ['gpqa']:
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_multi_choice(question, model_name='qwq')
elif 'llama' in model_path.lower():
user_prompt = get_task_instruction_multi_choice(question, model_name='llama')
else:
user_prompt = get_task_instruction_multi_choice(question)
elif dataset_name == 'livecode':
question_title = item.get('question_title', '')
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_code(question, question_title=question_title, model_name='qwq')
else:
user_prompt = get_task_instruction_code(question)
else:
user_prompt = "" # Default to empty if dataset not matched
prompt = [{"role": "user", "content": user_prompt}]
prompt = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
input_list.append(prompt)
if subset_num != -1:
input_list = input_list[:subset_num]
filtered_data = filtered_data[:subset_num]
# Set default max_tokens if not provided
if max_tokens is None:
if 'qwq' in model_path.lower():
if dataset_name in ['aime', 'amc', 'livecode']:
max_tokens = 32768
else:
max_tokens = 25600
else:
max_tokens = 3096
t_start = time.time()
# Generate model outputs
output_list = llm.generate(
input_list,
sampling_params=SamplingParams(
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k,
repetition_penalty=repetition_penalty,
)
)
total_time = time.time() - t_start
# Run evaluation
run_evaluation(
filtered_data,
input_list,
output_list,
dataset_name,
output_dir,
total_time,
split,
)
if __name__ == "__main__":
main()
# run_naive_rag.py
import os
import json
import time
from tqdm import tqdm
from typing import List, Dict, Optional, Tuple
import argparse
from bing_search import (
bing_web_search,
extract_relevant_info,
fetch_page_content,
extract_snippet_with_context,
)
from evaluate import run_evaluation, extract_answer
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import re
import string
from nltk.tokenize import sent_tokenize
import torch
from prompts import (
get_task_instruction_openqa,
get_task_instruction_math,
get_task_instruction_multi_choice,
get_task_instruction_code,
get_naive_rag_instruction,
)
def parse_args():
parser = argparse.ArgumentParser(description="Run Naive RAG for various datasets and models.")
# Dataset and split configuration
parser.add_argument(
'--dataset_name',
type=str,
required=True,
choices=['gpqa', 'math500', 'aime', 'amc', 'livecode', 'nq', 'triviaqa', 'hotpotqa', '2wiki', 'musique', 'bamboogle', 'medmcqa', 'pubhealth'],
help="Name of the dataset to use."
)
parser.add_argument(
'--split',
type=str,
required=True,
choices=['test', 'diamond', 'main', 'extended'],
help="Dataset split to use."
)
parser.add_argument(
'--subset_num',
type=int,
default=None,
help="Number of examples to process. Defaults to all if not specified."
)
# Search and document retrieval configuration
parser.add_argument(
'--top_k',
type=int,
default=10,
help="Number of top search results to retrieve."
)
parser.add_argument(
'--max_doc_len',
type=int,
default=3000,
help="Maximum length of each searched document."
)
# Model configuration
parser.add_argument(
'--model_path',
type=str,
required=True,
help="Path to the pre-trained model."
)
parser.add_argument(
'--use_jina',
type=lambda x: str(x).lower() in ('true', '1', 'yes'),  # plain type=bool would treat any non-empty string (e.g. "False") as True
default=True,
help="Whether to use Jina API for document fetching."
)
parser.add_argument(
'--jina_api_key',
type=str,
default='None',
help="Your Jina API Key to Fetch URL Content."
)
# Sampling parameters
parser.add_argument(
'--temperature',
type=float,
default=0.7,
help="Sampling temperature."
)
parser.add_argument(
'--top_p',
type=float,
default=0.8,
help="Top-p sampling parameter."
)
parser.add_argument(
'--top_k_sampling',
type=int,
default=20,
help="Top-k sampling parameter."
)
parser.add_argument(
'--repetition_penalty',
type=float,
default=None,
help="Repetition penalty. If not set, defaults based on the model."
)
parser.add_argument(
'--max_tokens',
type=int,
default=None,
help="Maximum number of tokens to generate. If not set, defaults based on the model and dataset."
)
# Bing API Configuration
parser.add_argument(
'--bing_subscription_key',
type=str,
required=True,
help="Bing Search API subscription key."
)
parser.add_argument(
'--bing_endpoint',
type=str,
default="https://api.bing.microsoft.com/v7.0/search",
help="Bing Search API endpoint."
)
return parser.parse_args()
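# Example invocation (all values illustrative; the model path and Bing key are placeholders):
#   python run_naive_rag.py --dataset_name hotpotqa --split test \
#       --model_path <path-or-hub-id-of-model> --bing_subscription_key <YOUR_BING_KEY> \
#       --top_k 10 --max_doc_len 3000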
def main():
args = parse_args()
# Extract arguments
dataset_name = args.dataset_name
split = args.split
subset_num = args.subset_num
top_k = args.top_k
max_doc_len = args.max_doc_len
model_path = args.model_path
temperature = args.temperature
top_p = args.top_p
top_k_sampling = args.top_k_sampling
repetition_penalty = args.repetition_penalty
max_tokens = args.max_tokens
bing_subscription_key = args.bing_subscription_key
bing_endpoint = args.bing_endpoint
use_jina = args.use_jina
jina_api_key = args.jina_api_key
# Set default repetition_penalty if not provided
if repetition_penalty is None:
repetition_penalty = 1.05 if 'qwq' in model_path.lower() else 1.0
if args.jina_api_key == 'None':
jina_api_key = None
# Paths to datasets
if dataset_name == 'livecode':
data_path = f'./data/LiveCodeBench/{split}.json'
elif dataset_name in ['math500', 'gpqa', 'aime', 'amc']:
data_path = f'./data/{dataset_name.upper()}/{split}.json'
else:
data_path = f'./data/QA_Datasets/{dataset_name}.json'
# ---------------------- Caching Mechanism ----------------------
# Define cache directories and file paths
cache_dir = './cache'
search_cache_path = os.path.join(cache_dir, 'search_cache.json')
url_cache_path = os.path.join(cache_dir, 'url_cache.json')
# Ensure cache directory exists
os.makedirs(cache_dir, exist_ok=True)
# Load existing caches or initialize empty dictionaries
if os.path.exists(search_cache_path):
with open(search_cache_path, 'r', encoding='utf-8') as f:
search_cache = json.load(f)
else:
search_cache = {}
if os.path.exists(url_cache_path):
with open(url_cache_path, 'r', encoding='utf-8') as f:
url_cache = json.load(f)
else:
url_cache = {}
# Function to save caches
def save_caches():
with open(search_cache_path, 'w', encoding='utf-8') as f:
json.dump(search_cache, f, ensure_ascii=False, indent=2)
with open(url_cache_path, 'w', encoding='utf-8') as f:
json.dump(url_cache, f, ensure_ascii=False, indent=2)
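# For reference: search_cache maps each raw question string to its Bing API
# response, and url_cache maps each URL to its fetched page text. Both live as
# plain JSON under ./cache, so repeated runs can reuse earlier searches and fetches.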
# ---------------------- Model Loading ----------------------
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
# Define output directory based on model and dataset
if 'qwq' in model_path.lower():
if dataset_name in ['math500', 'gpqa', 'aime', 'amc', 'livecode']:
output_dir = f'./outputs/{dataset_name}.qwq.naive_rag'
else:
output_dir = f'./outputs/runs.qa/{dataset_name}.qwq.naive_rag'
else:
model_short_name = model_path.split('/')[-1].lower().replace('-instruct', '')
output_dir = f'./outputs/runs.baselines/{dataset_name}.{model_short_name}.naive_rag'
os.makedirs(output_dir, exist_ok=True)
# ---------------------- Data Loading ----------------------
with open(data_path, 'r', encoding='utf-8') as json_file:
data = json.load(json_file)
if subset_num is not None:
data = data[:subset_num]
# ---------------------- Search and Document Retrieval ----------------------
print("Performing Bing Web Searches for all questions...")
# Initialize a list to hold relevant information for each question
all_relevant_info = []
for item in tqdm(data, desc="Searching"):
question = item['Question']
# Check if the question has already been searched and cached
if question in search_cache:
results = search_cache[question]
# print(f"Using cached search results for question: {question}")
else:
if dataset_name == 'livecode':
search_question = question[:500]
else:
search_question = question
results = bing_web_search(search_question, bing_subscription_key, bing_endpoint, market='en-US', language='en')
search_cache[question] = results
# print(f"Executed and cached search for question: {question}")
# Extract relevant information from search results
relevant_info = extract_relevant_info(results)[:top_k]
all_relevant_info.append(relevant_info)
# Save search cache after retrieval
save_caches()
print("Search cache saved.")
# Collect all unique URLs to fetch
unique_urls = set()
url_snippets_map = {}
for relevant_info in all_relevant_info:
for info in relevant_info:
url = info['url']
snippet = info.get('snippet', "")
unique_urls.add(url)
url_snippets_map[url] = snippet
# Determine which URLs need to be fetched
urls_to_fetch = [url for url in unique_urls if url not in url_cache]
print(f"Fetching {len(urls_to_fetch)} unique URLs...")
fetched_contents = fetch_page_content(
urls_to_fetch,
use_jina=use_jina,
jina_api_key=jina_api_key,
# snippets=url_snippets_map
)
# Update URL cache with fetched contents
for url, content in fetched_contents.items():
url_cache[url] = content
# Save URL cache after fetching
save_caches()
print("URL cache saved.")
# ---------------------- Prompt Construction ----------------------
print("Constructing prompts for generation...")
input_prompts = []
for idx, item in enumerate(tqdm(data, desc="Constructing Prompts")):
question = item['Question']
formatted_documents = ""
relevant_info = all_relevant_info[idx]
for i, doc_info in enumerate(relevant_info):
url = doc_info['url']
snippet = doc_info.get('snippet', "")
raw_context = url_cache.get(url, "")
success, context = extract_snippet_with_context(raw_context, snippet, context_chars=max_doc_len)
if not success:
    context = raw_context[:2 * max_doc_len]
# Clean snippet from HTML tags if any
clean_snippet = re.sub('<[^<]+?>', '', snippet) # Removes HTML tags
formatted_documents += f"**Document {i + 1}:**\n"
formatted_documents += f"**Title:** {doc_info.get('title', '')}\n"
formatted_documents += f"**URL:** {url}\n"
formatted_documents += f"**Snippet:** {clean_snippet}\n"
formatted_documents += f"**Content:** {context}\n\n"
# Construct the instruction with documents and question
instruction = get_naive_rag_instruction(question, formatted_documents)
# Construct dataset and model-specific prompts
if dataset_name in ['nq', 'triviaqa', 'hotpotqa', 'musique', 'bamboogle', '2wiki']:
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_openqa(question, model_name='qwq')
else:
user_prompt = get_task_instruction_openqa(question)
elif dataset_name in ['math500', 'aime', 'amc']:
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_math(question, model_name='qwq')
else:
user_prompt = get_task_instruction_math(question)
elif dataset_name == 'gpqa':
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_multi_choice(question, model_name='qwq')
elif 'llama' in model_path.lower():
user_prompt = get_task_instruction_multi_choice(question, model_name='llama')
else:
user_prompt = get_task_instruction_multi_choice(question)
elif dataset_name == 'livecode':
question_title = item.get('question_title', '')
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_code(question, question_title=question_title, model_name='qwq')
else:
user_prompt = get_task_instruction_code(question)
else:
user_prompt = "" # Default to empty if dataset not matched
# Combine instruction and user prompt
full_prompt = instruction + "\n\n" + user_prompt
# Apply tokenizer and chat template
prompt = [{"role": "user", "content": full_prompt}]
prompt = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
input_prompts.append(prompt)
# ---------------------- Generation ----------------------
# Initialize the LLM
llm = LLM(
model=model_path,
tensor_parallel_size=torch.cuda.device_count(),
gpu_memory_utilization=0.95,
)
print("Generating answers with LLM...")
# Set default max_tokens if not provided
if max_tokens is None:
if 'qwq' in model_path.lower():
max_tokens = 20480
else:
max_tokens = 10240
start_time = time.time()
# Generate model outputs
output_list = llm.generate(
input_prompts,
sampling_params=SamplingParams(
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k_sampling,
repetition_penalty=repetition_penalty,
)
)
total_time = time.time() - start_time
# ---------------------- Evaluation ----------------------
print("Evaluating generated answers...")
run_evaluation(
filtered_data=data,
input_list=input_prompts,
output_list=output_list,
dataset_name=dataset_name,
output_dir=output_dir,
total_time=total_time,
split=split,
)
# ---------------------- Update Search and URL Cache ----------------------
print('Updating Search and URL Cache...')
# Load existing caches or initialize empty dictionaries
if os.path.exists(search_cache_path):
with open(search_cache_path, 'r', encoding='utf-8') as f:
search_cache_new = json.load(f)
else:
search_cache_new = {}
if os.path.exists(url_cache_path):
with open(url_cache_path, 'r', encoding='utf-8') as f:
url_cache_new = json.load(f)
else:
url_cache_new = {}
search_cache.update(search_cache_new)
url_cache.update(url_cache_new)
save_caches()
print("Process completed.")
if __name__ == "__main__":
main()