"backend/apps/webui/internal/migrations/001_initial_schema.py" did not exist on "0d78b638057c80f09c47599f56d91dbab40ce5d0"
Commit b6edc328 authored by chenzk's avatar chenzk
Browse files

v1.0

parents
Pipeline #2229 canceled with stages
import os
from time import sleep
try:
from anthropic import Anthropic
except ImportError as e:
pass
from lcb_runner.runner.base_runner import BaseRunner
class Claude3Runner(BaseRunner):
client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))
def __init__(self, args, model):
super().__init__(args, model)
self.client_kwargs: dict[str, str] = {
"model": args.model,
"temperature": args.temperature,
"max_tokens": args.max_tokens,
"top_p": args.top_p,
}
def _run_single(self, prompt: tuple[str, str]) -> list[str]:
def __run_single(counter):
try:
response = self.client.messages.create(
system=prompt[0],
messages=prompt[1],
**self.client_kwargs,
)
content = "\n".join([x.text for x in response.content])
return content
except Exception as e:
print("Exception: ", repr(e), "Sleeping for 20 seconds...")
sleep(20 * (11 - counter))
counter = counter - 1
if counter == 0:
print(f"Failed to run model for {prompt}!")
print("Exception: ", repr(e))
raise e
return __run_single(counter)
outputs = []
try:
for _ in range(self.args.n):
outputs.append(__run_single(10))
except Exception as e:
raise e
return outputs
import os
from time import sleep
try:
from anthropic import Anthropic
except ImportError as e:
pass
from lcb_runner.runner.base_runner import BaseRunner
class ClaudeRunner(BaseRunner):
client = Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))
def __init__(self, args, model):
super().__init__(args, model)
self.client_kwargs: dict[str, str] = {
"model": args.model,
"temperature": args.temperature,
"max_tokens_to_sample": args.max_tokens,
"top_p": args.top_p,
}
def _run_single(self, prompt: str) -> list[str]:
def __run_single(counter):
try:
response = self.client.completions.create(
prompt=prompt,
**self.client_kwargs,
)
content = response.completion
return content
except Exception as e:
print("Exception: ", repr(e), "Sleeping for 20 seconds...")
sleep(20 * (11 - counter))
counter = counter - 1
if counter == 0:
print(f"Failed to run model for {prompt}!")
print("Exception: ", repr(e))
raise e
return __run_single(counter)
outputs = []
try:
for _ in range(self.args.n):
outputs.append(__run_single(10))
except Exception as e:
raise e
return outputs
import os
from time import sleep
try:
import cohere
except ImportError as e:
pass
from lcb_runner.runner.base_runner import BaseRunner
class CohereRunner(BaseRunner):
client = cohere.Client(os.getenv("COHERE_API_KEY"))
def __init__(self, args, model):
super().__init__(args, model)
self.client_kwargs: dict[str, str] = {
"model": args.model,
"temperature": args.temperature,
"max_tokens": args.max_tokens,
"p": args.top_p,
}
def _run_single(self, prompt: tuple[dict[str,str], str]) -> list[str]:
chat_history, message = prompt
def __run_single(counter):
try:
response = self.client.chat(
message=message,
chat_history=chat_history,
**self.client_kwargs,
)
content = response.text
return content
except Exception as e:
print("Exception: ", repr(e), "Sleeping for 20 seconds...")
sleep(20 * (11 - counter))
counter = counter - 1
if counter == 0:
print(f"Failed to run model for {prompt}!")
print("Exception: ", repr(e))
raise e
return __run_single(counter)
outputs = []
try:
for _ in range(self.args.n):
outputs.append(__run_single(10))
except Exception as e:
raise e
return outputs
import os
import json
from lcb_runner.runner.parser import get_args
from lcb_runner.utils.scenarios import Scenario
from lcb_runner.utils.path_utils import get_output_path
from lcb_runner.evaluation import extract_instance_results
from lcb_runner.runner.scenario_router import (
build_prompt_benchmark,
sort_and_extract_save_results,
get_metrics,
)
def main():
args = get_args()
benchmark, _ = build_prompt_benchmark(args)
with open(args.custom_output_file, "r") as f:
custom_outputs = json.load(f)
assert isinstance(custom_outputs, list)
assert len(custom_outputs) == len(benchmark), f"{len(custom_outputs)} != {len(benchmark)}"
if isinstance(custom_outputs[0], list):
## custom outputs must be list[list[str]]
## list of extracted outputs per question
## sorted by the benchmark question_id, test_id, id depending on the scenario
assert all(
isinstance(custom_output, list) for custom_output in custom_outputs
)
elif isinstance(custom_outputs[0], dict):
## custom outputs must be list[dict[str, Any]] (see the example sketch after main() below)
## list of extracted outputs per question
## for codegeneration and selfrepair scenario -- `code_list` and `question_id` are required
## for testoutputprediction -- `pred_list`, `question_id`, `test_id` are required
## for codeexecution -- `pred_list`, `id` are required
## code_list/pred_list is a list of extracted answers (code or assertions) for a question
assert all(
isinstance(custom_output, dict) for custom_output in custom_outputs
)
if args.scenario in [Scenario.codegeneration, Scenario.selfrepair]:
custom_outputs = [
custom_output["code_list"]
for custom_output in sorted(
custom_outputs, key=lambda x: str(x["question_id"])
)
]
elif args.scenario == Scenario.testoutputprediction:
custom_outputs = [
custom_output['pred_list']
for custom_output in sorted(
custom_outputs, key=lambda x: (str(x["question_id"]), str(x['test_id']))
)
]
elif args.scenario == Scenario.codeexecution:
custom_outputs = [
custom_output['pred_list']
for custom_output in sorted(
custom_outputs, key=lambda x: int(x["id"].split("_")[1])
)
]
save_results = [
instance.insert_output(custom_output, custom_output)
for instance, custom_output in zip(benchmark, custom_outputs)
]
save_results, combined_results = sort_and_extract_save_results(
args.scenario, save_results
)
metrics = get_metrics(args.scenario, args, benchmark, combined_results)
graded = extract_instance_results(metrics[1])
if args.scenario == Scenario.codegeneration:
metadatas = metrics[2]
save_eval_results = [
instance.insert_output_evaluation(
outputs_list, extracted_list, graded_list, metadata=meta
)
for instance, (outputs_list, extracted_list), graded_list, meta in zip(
benchmark, combined_results, graded, metadatas
)
]
else:
save_eval_results = [
instance.insert_output_evaluation(
outputs_list, extracted_list, graded_list
)
for instance, (outputs_list, extracted_list), graded_list in zip(
benchmark, combined_results, graded
)
]
if args.custom_output_save_name is None:
output_path = args.custom_output_file[:-5] + f"_{args.scenario.value}_output.json"
else:
output_path = get_output_path(args.custom_output_save_name, args)
with open(output_path, "w") as f:
json.dump(save_results, f, indent=4)
with open(output_path.replace(".json", "_eval.json"), "w") as f:
json.dump(metrics, f, indent=4)
with open(output_path.replace(".json", "_eval_all.json"), "w") as f:
json.dump(save_eval_results, f, indent=4)
if __name__ == "__main__":
main()
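# Hedged example (not part of the original file): a minimal sketch of a custom
# output file accepted by the evaluator above for the code generation scenario.
# The question ids and code below are hypothetical; only the structure (one dict
# per question with "question_id" and "code_list") follows the asserts and
# comments in main().
#
#   [
#       {"question_id": "q_0001", "code_list": ["def solve():\n    ..."]},
#       {"question_id": "q_0002", "code_list": ["def solve():\n    ..."]}
#   ]
#
# Assuming this file lives at lcb_runner/runner/custom_evaluator.py, it could
# then be invoked roughly as:
#
#   python -m lcb_runner.runner.custom_evaluator \
#       --custom_output_file my_outputs.json \
#       --scenario codegeneration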
import os
from time import sleep
try:
import openai
from openai import OpenAI
except ImportError as e:
pass
from lcb_runner.runner.base_runner import BaseRunner
class DeepSeekRunner(BaseRunner):
client = OpenAI(
api_key=os.getenv("DEEPSEEK_API"), base_url="https://api.deepseek.com"
)
def __init__(self, args, model):
super().__init__(args, model)
self.client_kwargs: dict[str, str] = {
"model": args.model,
"temperature": args.temperature,
"max_tokens": args.max_tokens,
"top_p": args.top_p,
"frequency_penalty": 0,
"presence_penalty": 0,
"n": 1,
"timeout": args.openai_timeout,
# "stop": args.stop, --> stop is only used for base models currently
}
def _run_single(self, prompt: list[dict[str, str]]) -> list[str]:
assert isinstance(prompt, list)
def __run_single(counter):
try:
response = self.client.chat.completions.create(
messages=prompt,
**self.client_kwargs,
)
content = response.choices[0].message.content
return content
except (
openai.APIError,
openai.RateLimitError,
openai.InternalServerError,
openai.OpenAIError,
openai.APIStatusError,
openai.APITimeoutError,
openai.InternalServerError,
openai.APIConnectionError,
) as e:
print("Exception: ", repr(e))
print("Sleeping for 30 seconds...")
print("Consider reducing the number of parallel processes.")
sleep(30)
return __run_single(counter)
except Exception as e:
print(f"Failed to run the model for {prompt}!")
print("Exception: ", repr(e))
raise e
outputs = []
try:
for _ in range(self.args.n):
outputs.append(__run_single(10))
except Exception as e:
raise e
return outputs
import os
from time import sleep
try:
import google.generativeai as genai
from google.generativeai import GenerationConfig
except ImportError as e:
pass
from lcb_runner.runner.base_runner import BaseRunner
class GeminiRunner(BaseRunner):
client = genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
safety_settings = [
{
"category": "HARM_CATEGORY_HARASSMENT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_HATE_SPEECH",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
"threshold": "BLOCK_NONE",
},
{
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
"threshold": "BLOCK_NONE",
},
]
def __init__(self, args, model):
super().__init__(args, model)
self.client = genai.GenerativeModel(model.model_name)
self.generation_config = GenerationConfig(
candidate_count=1,
max_output_tokens=args.max_tokens,
temperature=args.temperature,
top_p=args.top_p,
)
def _run_single(self, prompt: str) -> list[str]:
def __run_single(counter):
try:
return self.client.generate_content(
prompt,
generation_config=self.generation_config,
safety_settings=GeminiRunner.safety_settings,
)
except Exception as e:
print("Exception: ", repr(e), "Sleeping for 20 seconds...")
sleep(20 * (11 - counter))
counter = counter - 1
if counter == 0:
print(f"Failed to run model for {prompt}!")
print("Exception: ", repr(e))
raise e
return __run_single(counter)
outputs = []
try:
for _ in range(self.args.n):
outputs.append(__run_single(10))
except Exception as e:
raise e
new_outputs = []
for output in outputs:
try:
new_outputs.append(output.text)
except Exception as e:
print("Cannot extract text exception: ", repr(e))
print(output.__dict__)
new_outputs.append("")
outputs = new_outputs
return outputs
import os
import json
from lcb_runner.runner.parser import get_args
from lcb_runner.utils.scenarios import Scenario
from lcb_runner.lm_styles import LanguageModelStore
from lcb_runner.runner.runner_utils import build_runner
from lcb_runner.utils.path_utils import get_output_path
from lcb_runner.evaluation import extract_instance_results
from lcb_runner.runner.scenario_router import (
build_prompt_benchmark,
combine_results,
sort_and_extract_save_results,
get_metrics,
)
def main():
args = get_args()
model = LanguageModelStore[args.model]
benchmark, format_prompt = build_prompt_benchmark(args)
if args.debug:
print(f"Running with {len(benchmark)} instances in debug mode")
benchmark = benchmark[:5]
output_path = get_output_path(model.model_repr, args)
eval_file = output_path.replace(".json", "_eval.json")
eval_all_file = output_path.replace(".json", "_eval_all.json")
if args.continue_existing or args.continue_existing_with_eval:
if os.path.exists(output_path):
with open(output_path, "r") as f:
old_save_results = json.load(f)
elif os.path.exists(eval_all_file):
with open(eval_all_file, "r") as f:
old_save_results = json.load(f)
else:
print(
f"File {output_path} does not exist in --continue_existing, starting from scratch"
)
old_save_results = []
old_save_results = [
instance
for instance in old_save_results
if instance["output_list"] and [x for x in instance["output_list"] if x]
]
old_save_results_question_ids = [
instance["question_id"] for instance in old_save_results
]
remaining_benchmark = [
instance
for instance in benchmark
if instance.question_id not in old_save_results_question_ids
]
print(
f"Found {len(old_save_results)} existing generations, continuing with {len(remaining_benchmark)} remaining"
)
else:
old_save_results = []
remaining_benchmark = benchmark
if len(remaining_benchmark) > 0:
runner = build_runner(args, model)
results: list[list[str]] = runner.run_main(remaining_benchmark, format_prompt)
else:
results = []
combined_results = combine_results(
args.scenario, results, model, args.cot_code_execution
)
save_results = [
instance.insert_output(outputs_list, extracted_list)
for instance, (outputs_list, extracted_list) in zip(
remaining_benchmark, combined_results
)
]
if args.continue_existing or args.continue_existing_with_eval:
save_results += old_save_results
save_results, combined_results = sort_and_extract_save_results(
args.scenario, save_results
)
with open(output_path, "w") as f:
json.dump(save_results, f, indent=4)
if args.evaluate:
if args.continue_existing_with_eval and os.path.exists(eval_all_file):
with open(eval_all_file) as fp:
old_eval_all_results = json.load(fp)
if os.path.exists(eval_file):
with open(eval_file) as fp:
old_eval_results = json.load(fp)
else:
old_eval_results = None
old_eval_results_question_ids = [
instance["question_id"] for instance in old_eval_all_results
]
remaining_indices = [
idx
for idx in range(len(benchmark))
if benchmark[idx].question_id not in old_eval_results_question_ids
]
benchmark = [benchmark[idx] for idx in remaining_indices]
combined_results = [combined_results[idx] for idx in remaining_indices]
old_eval_size = len(old_eval_results_question_ids)
new_eval_size = len(benchmark)
if new_eval_size == 0:
return
print(f"Found {old_eval_size}, running evals for {new_eval_size} problems")
metrics = get_metrics(args.scenario, args, benchmark, combined_results)
graded = extract_instance_results(metrics[1])
if old_eval_results:
for key in metrics[0]:
if key in old_eval_results[0]:
if key != "detail":
metrics[0][key] = (
old_eval_size * old_eval_results[0][key]
+ new_eval_size * metrics[0][key]
)
metrics[0][key] /= old_eval_size + new_eval_size
for key in metrics[0]["detail"]:
if key in old_eval_results[0]["detail"]:
metrics[0]["detail"][key] = {
**metrics[0]["detail"][key],
**old_eval_results[0]["detail"][key],
}
metrics[1] = {**metrics[1], **old_eval_results[1]}
else:
print("Old eval file not present, cannot update eval file")
metrics = {}
else:
metrics = get_metrics(args.scenario, args, benchmark, combined_results)
graded = extract_instance_results(metrics[1])
old_eval_all_results = []
old_eval_results = []
if args.scenario == Scenario.codegeneration:
if metrics:
metadatas = metrics[2]
else:
metadatas = [[] for _ in benchmark]
save_eval_results = [
instance.insert_output_evaluation(
outputs_list, extracted_list, graded_list, metadata=meta
)
for instance, (outputs_list, extracted_list), graded_list, meta in zip(
benchmark, combined_results, graded, metadatas
)
]
if metrics and old_eval_results:
metrics[2] = old_eval_results[2] + metrics[2]
elif args.scenario == Scenario.selfrepair:
metadatas = metrics[2]
with open(
f"output/{model.model_repr}/{Scenario.codegeneration}_{args.codegen_n}_{args.temperature}_eval_all.json"
) as f:
code_gen_evals = json.load(f)
original_code_lists = [
code_gen_eval["code_list"] for code_gen_eval in code_gen_evals
]
save_eval_results = [
instance.insert_output_evaluation(
outputs_list,
extracted_list,
graded_list,
metadata=meta,
original_code_list=original_code_list,
)
for instance, (
outputs_list,
extracted_list,
), graded_list, meta, original_code_list in zip(
benchmark, combined_results, graded, metadatas, original_code_lists
)
]
else:
save_eval_results = [
instance.insert_output_evaluation(
outputs_list, extracted_list, graded_list
)
for instance, (outputs_list, extracted_list), graded_list in zip(
benchmark, combined_results, graded
)
]
save_eval_results = old_eval_all_results + save_eval_results
with open(eval_file, "w") as f:
json.dump(metrics, f, indent=4)
with open(eval_all_file, "w") as f:
json.dump(save_eval_results, f, indent=4)
if __name__ == "__main__":
main()
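# Hedged usage sketch (not part of the original file): a typical invocation of
# this driver, assuming it lives at lcb_runner/runner/main.py as the imports
# suggest. The model name and sampling options below are illustrative only.
#
#   python -m lcb_runner.runner.main \
#       --model gpt-3.5-turbo-0301 \
#       --scenario codegeneration \
#       --n 10 --temperature 0.2 \
#       --evaluate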
import os
from time import sleep
try:
from mistralai.client import MistralClient
except ImportError as e:
pass
from lcb_runner.runner.base_runner import BaseRunner
class MistralRunner(BaseRunner):
client = MistralClient(
api_key=os.environ["MISTRAL_API_KEY"],
)
def __init__(self, args, model):
super().__init__(args, model)
self.client_kwargs: dict[str, str] = {
"model": args.model,
"temperature": args.temperature,
"max_tokens": args.max_tokens,
"top_p": args.top_p,
}
def _run_single(self, prompt: list[dict[str, str]]) -> list[str]:
def __run_single(counter):
try:
response = self.client.chat(
messages=prompt,
**self.client_kwargs,
)
content = response.choices[0].message.content
return content
except Exception as e:
print("Exception: ", repr(e), "Sleeping for 20 seconds...")
sleep(20 * (11 - counter))
counter = counter - 1
if counter == 0:
print(f"Failed to run model for {prompt}!")
print("Exception: ", repr(e))
raise e
return __run_single(counter)
outputs = []
try:
for _ in range(self.args.n):
outputs.append(__run_single(10))
except Exception as e:
raise e
return outputs
import os
from time import sleep
try:
import openai
from openai import OpenAI
except ImportError as e:
pass
from lcb_runner.lm_styles import LMStyle
from lcb_runner.runner.base_runner import BaseRunner
class OpenAIRunner(BaseRunner):
client = OpenAI(
api_key=os.getenv("OPENAI_KEY"),
)
def __init__(self, args, model):
super().__init__(args, model)
if model.model_style == LMStyle.OpenAIReason:
self.client_kwargs: dict[str, str] = {
"model": args.model,
"max_completion_tokens": 25000,
}
else:
self.client_kwargs: dict[str, str] = {
"model": args.model,
"temperature": args.temperature,
"max_tokens": args.max_tokens,
"top_p": args.top_p,
"frequency_penalty": 0,
"presence_penalty": 0,
"n": args.n,
"timeout": args.openai_timeout,
# "stop": args.stop, --> stop is only used for base models currently
}
def _run_single(self, prompt: list[dict[str, str]]) -> list[str]:
assert isinstance(prompt, list)
try:
response = OpenAIRunner.client.chat.completions.create(
messages=prompt,
**self.client_kwargs,
)
except (
openai.APIError,
openai.RateLimitError,
openai.InternalServerError,
openai.OpenAIError,
openai.APIStatusError,
openai.APITimeoutError,
openai.InternalServerError,
openai.APIConnectionError,
) as e:
print("Exception: ", repr(e))
print("Sleeping for 30 seconds...")
print("Consider reducing the number of parallel processes.")
sleep(30)
return self._run_single(prompt)
except Exception as e:
print(f"Failed to run the model for {prompt}!")
print("Exception: ", repr(e))
raise e
return [c.message.content for c in response.choices]
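# Hedged setup note (not part of the original file): the API-backed runners
# above read their credentials from environment variables at class-definition
# time, so the relevant variable generally needs to be exported before the
# corresponding module is imported, e.g.
#
#   export OPENAI_KEY=...        # OpenAIRunner
#   export ANTHROPIC_KEY=...     # ClaudeRunner / Claude3Runner
#   export GOOGLE_API_KEY=...    # GeminiRunner
#   export MISTRAL_API_KEY=...   # MistralRunner
#   export COHERE_API_KEY=...    # CohereRunner
#   export DEEPSEEK_API=...      # DeepSeekRunner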
import os
import torch
import argparse
from lcb_runner.utils.scenarios import Scenario
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--model",
type=str,
default="gpt-3.5-turbo-0301",
help="Name of the model to use matching `lm_styles.py`",
)
parser.add_argument(
"--local_model_path",
type=str,
default=None,
help="If you have a local model, specify it here in conjunction with --model",
)
parser.add_argument(
"--trust_remote_code",
action="store_true",
help="trust_remote_code option used in huggingface models",
)
parser.add_argument(
"--scenario",
type=Scenario,
default=Scenario.codegeneration,
help="Type of scenario to run",
)
parser.add_argument(
"--not_fast",
action="store_true",
help="whether to use full set of tests (slower and more memory intensive evaluation)",
)
parser.add_argument(
"--release_version",
type=str,
default="release_latest",
help="whether to use full set of tests (slower and more memory intensive evaluation)",
)
parser.add_argument(
"--cot_code_execution",
action="store_true",
help="whether to use CoT in code execution scenario",
)
parser.add_argument(
"--n", type=int, default=10, help="Number of samples to generate"
)
parser.add_argument(
"--codegen_n",
type=int,
default=10,
help="Number of samples for which code generation was run (used to map the code generation file during self-repair)",
)
parser.add_argument(
"--temperature", type=float, default=0.2, help="Temperature for sampling"
)
parser.add_argument("--top_p", type=float, default=0.95, help="Top p for sampling")
parser.add_argument(
"--max_tokens", type=int, default=2000, help="Max tokens for sampling"
)
parser.add_argument(
"--multiprocess",
default=0,
type=int,
help="Number of processes to use for generation (vllm runs do not use this)",
)
parser.add_argument(
"--stop",
default="###",
type=str,
help="Stop token (use `,` to separate multiple tokens)",
)
parser.add_argument("--continue_existing", action="store_true")
parser.add_argument("--continue_existing_with_eval", action="store_true")
parser.add_argument(
"--use_cache", action="store_true", help="Use cache for generation"
)
parser.add_argument(
"--cache_batch_size", type=int, default=100, help="Batch size for caching"
)
parser.add_argument("--debug", action="store_true", help="Debug mode")
parser.add_argument("--evaluate", action="store_true", help="Evaluate the results")
parser.add_argument(
"--num_process_evaluate",
type=int,
default=12,
help="Number of processes to use for evaluation",
)
parser.add_argument("--timeout", type=int, default=6, help="Timeout for evaluation")
parser.add_argument(
"--openai_timeout", type=int, default=90, help="Timeout for requests to OpenAI"
)
parser.add_argument(
"--tensor_parallel_size",
type=int,
default=-1,
help="Tensor parallel size for vllm",
)
parser.add_argument(
"--enable_prefix_caching",
action="store_true",
help="Enable prefix caching for vllm",
)
parser.add_argument(
"--custom_output_file",
type=str,
default=None,
help="Path to the custom output file used in `custom_evaluator.py`",
)
parser.add_argument(
"--custom_output_save_name",
type=str,
default=None,
help="Folder name to save the custom output results (output file folder modified if None)",
)
parser.add_argument("--dtype", type=str, default="bfloat16", help="Dtype for vllm")
args = parser.parse_args()
args.stop = args.stop.split(",")
if args.tensor_parallel_size == -1:
args.tensor_parallel_size = torch.cuda.device_count()
if args.multiprocess == -1:
args.multiprocess = os.cpu_count()
return args
def test():
args = get_args()
print(args)
if __name__ == "__main__":
test()
from lcb_runner.lm_styles import LMStyle, LanguageModel
def build_runner(args, model: LanguageModel):
if model.model_style == LMStyle.OpenAIChat:
from lcb_runner.runner.oai_runner import OpenAIRunner
return OpenAIRunner(args, model)
if model.model_style == LMStyle.OpenAIReason:
from lcb_runner.runner.oai_runner import OpenAIRunner
return OpenAIRunner(args, model)
if model.model_style == LMStyle.Gemini:
from lcb_runner.runner.gemini_runner import GeminiRunner
return GeminiRunner(args, model)
if model.model_style == LMStyle.Claude3:
from lcb_runner.runner.claude3_runner import Claude3Runner
return Claude3Runner(args, model)
if model.model_style == LMStyle.Claude:
from lcb_runner.runner.claude_runner import ClaudeRunner
return ClaudeRunner(args, model)
if model.model_style == LMStyle.MistralWeb:
from lcb_runner.runner.mistral_runner import MistralRunner
return MistralRunner(args, model)
if model.model_style == LMStyle.CohereCommand:
from lcb_runner.runner.cohere_runner import CohereRunner
return CohereRunner(args, model)
if model.model_style == LMStyle.DeepSeekAPI:
from lcb_runner.runner.deepseek_runner import DeepSeekRunner
return DeepSeekRunner(args, model)
elif model.model_style in []:
raise NotImplementedError(
f"Runner for language model style {model.model_style} not implemented yet"
)
else:
from lcb_runner.runner.vllm_runner import VLLMRunner
return VLLMRunner(args, model)
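# Hedged usage sketch (not part of the original file): build_runner dispatches
# on the model's LMStyle and returns the matching runner instance, mirroring
# how main.py drives it. The model is whatever --model resolves to in
# LanguageModelStore.
#
#   from lcb_runner.lm_styles import LanguageModelStore
#   from lcb_runner.runner.parser import get_args
#   from lcb_runner.runner.scenario_router import build_prompt_benchmark
#
#   args = get_args()
#   model = LanguageModelStore[args.model]
#   benchmark, format_prompt = build_prompt_benchmark(args)
#   runner = build_runner(args, model)   # e.g. OpenAIRunner for LMStyle.OpenAIChat
#   results = runner.run_main(benchmark, format_prompt)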
from typing import Union
from lcb_runner.utils.scenarios import Scenario
from lcb_runner.lm_styles import LanguageModel
from lcb_runner.evaluation import (
codegen_metrics,
test_output_metrics,
code_execution_metrics,
)
from lcb_runner.prompts import (
format_prompt_generation,
format_prompt_test_output,
format_prompt_execution,
format_prompt_execution_cot,
format_prompt_self_repair,
)
from lcb_runner.utils.extraction_utils import (
extract_code,
extract_test_output_code,
extract_execution_code,
)
from lcb_runner.benchmarks import (
CodeGenerationProblem,
TestOutputPredictionProblem,
CodeExecutionProblem,
load_code_generation_dataset,
load_code_generation_dataset_not_fast,
load_test_prediction_dataset,
load_code_execution_dataset,
)
# BenchMarkType = list[CodeGenerationProblem | TestOutputPredictionProblem]
BenchMarkType = list[
Union[CodeGenerationProblem, CodeExecutionProblem, TestOutputPredictionProblem]
]
def build_prompt_benchmark(
args,
) -> tuple[
list[CodeExecutionProblem]
| list[CodeGenerationProblem]
| list[TestOutputPredictionProblem],
callable,
]:
scenario: Scenario = args.scenario
if scenario == Scenario.codegeneration:
not_fast: bool = args.not_fast
if not_fast:
benchmark = load_code_generation_dataset_not_fast(args.release_version)
else:
benchmark = load_code_generation_dataset(args.release_version)
benchmark = sorted(benchmark, key=lambda x: x.question_id)
format_prompt = format_prompt_generation
elif scenario == Scenario.testoutputprediction:
benchmark = load_test_prediction_dataset(args.release_version)
benchmark = sorted(benchmark, key=lambda x: (x.question_id, x.test_id))
format_prompt = format_prompt_test_output
elif scenario == Scenario.selfrepair:
benchmark = load_code_generation_dataset(args.release_version)
benchmark = sorted(benchmark, key=lambda x: x.question_id)
format_prompt = format_prompt_self_repair
elif scenario == Scenario.codeexecution:
cot_code_execution: bool = args.cot_code_execution
benchmark = load_code_execution_dataset(args.release_version)
benchmark = sorted(benchmark, key=lambda x: int(x.id.split("_")[1]))
if cot_code_execution:
format_prompt = format_prompt_execution_cot
else:
format_prompt = format_prompt_execution
else:
raise ValueError(f"Scenario {scenario} not implemented")
return benchmark, format_prompt
def combine_results(
scenario: Scenario,
results: list[list[str]],
model: LanguageModel,
cot_code_execution: bool = False,
):
if scenario == Scenario.codegeneration:
combined_results = [
(
outputs_list,
[extract_code(output, model.model_style) for output in outputs_list],
)
for outputs_list in results
]
elif scenario == Scenario.testoutputprediction:
combined_results = [
(
outputs_list,
[
extract_test_output_code(output, model.model_style)
for output in outputs_list
],
)
for outputs_list in results
]
elif scenario == Scenario.selfrepair:
combined_results = [
(
[
output[0] if type(output) is list else output
for output in outputs_list
],
[
(
extract_code(output[0], model.model_style)
if type(output) is list
else extract_code(output, model.model_style)
)
for output in outputs_list
],
)
for outputs_list in results
]
elif scenario == Scenario.codeexecution:
combined_results = [
(
outputs_list,
[
extract_execution_code(
output, model.model_style, cot=cot_code_execution
)
for output in outputs_list
],
)
for outputs_list in results
]
else:
raise ValueError(f"Scenario {scenario} not implemented")
return combined_results
def sort_and_extract_save_results(scenario: Scenario, save_results: list[dict]):
if scenario == Scenario.codegeneration:
save_results = sorted(save_results, key=lambda x: x["question_id"])
combined_results = [
(save_result_instance["output_list"], save_result_instance["code_list"])
for save_result_instance in save_results
]
elif scenario == Scenario.testoutputprediction:
save_results = sorted(
save_results, key=lambda x: (x["question_id"], x["test_id"])
)
combined_results = [
(save_result_instance["output_list"], save_result_instance["pred_list"])
for save_result_instance in save_results
]
elif scenario == Scenario.selfrepair:
save_results = sorted(save_results, key=lambda x: x["question_id"])
combined_results = [
(save_result_instance["output_list"], save_result_instance["code_list"])
for save_result_instance in save_results
]
elif scenario == Scenario.codeexecution:
save_results = sorted(save_results, key=lambda x: int(x["id"].split("_")[1]))
combined_results = [
(save_result_instance["output_list"], save_result_instance["pred_list"])
for save_result_instance in save_results
]
else:
raise ValueError(f"Scenario {scenario} not implemented")
return save_results, combined_results
def get_metrics(
scenario: Scenario,
args,
benchmark: list[
CodeGenerationProblem | CodeExecutionProblem | TestOutputPredictionProblem
],
combined_results,
):
eval_samples = [instance.get_evaluation_sample() for instance in benchmark]
generations = [extracted for _, extracted in combined_results]
if scenario == Scenario.codegeneration or scenario == Scenario.selfrepair:
metrics = codegen_metrics(
eval_samples,
generations,
num_process_evaluate=args.num_process_evaluate,
timeout=args.timeout,
)
elif args.scenario == Scenario.testoutputprediction:
metrics = test_output_metrics(
eval_samples,
generations,
k_list=[1, 5],
)
elif args.scenario == Scenario.codeexecution:
metrics = code_execution_metrics(
eval_samples,
generations,
)
else:
raise ValueError(f"Scenario {scenario} not implemented")
print(metrics[0]["pass@1"])
return metrics
try:
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
except ImportError as e:
# print("Cannot import vllm")
pass
from lcb_runner.runner.base_runner import BaseRunner
class VLLMRunner(BaseRunner):
def __init__(self, args, model):
super().__init__(args, model)
model_tokenizer_path = (
model.model_name if args.local_model_path is None else args.local_model_path
)
self.llm = LLM(
model=model_tokenizer_path,
tokenizer=model_tokenizer_path,
tensor_parallel_size=args.tensor_parallel_size,
# dtype=args.dtype,
enforce_eager=True,
max_model_len=4096,
disable_custom_all_reduce=True,
enable_prefix_caching=args.enable_prefix_caching,
trust_remote_code=args.trust_remote_code,
)
self.sampling_params = SamplingParams(
n=self.args.n,
max_tokens=self.args.max_tokens,
temperature=self.args.temperature,
top_p=self.args.top_p,
frequency_penalty=0,
presence_penalty=0,
stop=self.args.stop,
)
def _run_single(self, prompt: str) -> list[str]:
pass
def run_batch(self, prompts: list[str]) -> list[list[str]]:
outputs = [None for _ in prompts]
remaining_prompts = []
remaining_indices = []
for prompt_index, prompt in enumerate(prompts):
if self.args.use_cache and prompt in self.cache:
if len(self.cache[prompt]) == self.args.n:
outputs[prompt_index] = self.cache[prompt]
continue
remaining_prompts.append(prompt)
remaining_indices.append(prompt_index)
if remaining_prompts:
vllm_outputs = self.llm.generate(remaining_prompts, self.sampling_params)
if self.args.use_cache:
assert len(remaining_prompts) == len(vllm_outputs)
for index, remaining_prompt, vllm_output in zip(
remaining_indices, remaining_prompts, vllm_outputs
):
self.cache[remaining_prompt] = [o.text for o in vllm_output.outputs]
outputs[index] = [o.text for o in vllm_output.outputs]
else:
for index, vllm_output in zip(remaining_indices, vllm_outputs):
outputs[index] = [o.text for o in vllm_output.outputs]
return outputs
from lcb_runner.lm_styles import LMStyle
def extract_code(model_output: str, lmstyle: LMStyle):
outputlines = model_output.split("\n")
if lmstyle == LMStyle.CodeLLaMaInstruct:
indexlines = [i for i, line in enumerate(outputlines) if "PYTHON]" in line]
if len(indexlines) < 2:
indexlines = [i for i, line in enumerate(outputlines) if "```" in line]
elif lmstyle == LMStyle.GenericBase:
return model_output.strip()
else:
indexlines = [i for i, line in enumerate(outputlines) if "```" in line]
if len(indexlines) < 2:
return ""
return "\n".join(outputlines[indexlines[0] + 1 : indexlines[1]])
def extract_test_output_code(model_output: str, lmstyle: LMStyle = None):
outputlines = model_output.split("\n")
# find the last line starting with assert...
indexlines = [i for i, line in enumerate(outputlines) if line.startswith("assert")]
if indexlines:
return outputlines[indexlines[-1]]
if lmstyle and lmstyle == LMStyle.CodeLLaMaInstruct:
indexlines = [i for i, line in enumerate(outputlines) if "PYTHON]" in line]
else:
# first try to extract ```python if not then try ```
indexlines = [
i
for i, line in enumerate(outputlines)
if "```python" in line or "```Python" in line
]
if indexlines:
start_index = indexlines[0]
else:
start_index = None
indexlines = [i for i, line in enumerate(outputlines) if "```" in line]
if start_index is not None:
indexlines = [i for i in indexlines if i > start_index]
indexlines = [start_index] + indexlines
if len(indexlines) < 2:
return ""
return "\n".join(outputlines[indexlines[0] + 1 : indexlines[1]])
def extract_execution_code(model_output: str, lmstyle: LMStyle, cot: bool = False):
if cot:
if "[ANSWER]" in model_output:
model_output = model_output.split("[ANSWER]")[1].strip()
if "==" in model_output:
model_output = model_output.split("==")[1].strip()
if "[/ANSWER]" in model_output:
model_output = model_output.split("[/ANSWER]")[0].strip()
else:
model_output = model_output.split("\n")[0].strip()
return model_output.strip()
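# Hedged examples (not part of the original file) of how the extractors above
# behave, based only on the logic in this module:
#
#   extract_code("Here you go:\n```python\nprint('hi')\n```", LMStyle.OpenAIChat)
#   # -> "print('hi')"            (text between the first pair of ``` fences)
#
#   extract_test_output_code("reasoning...\nassert f(1) == 2")
#   # -> "assert f(1) == 2"       (the last line starting with `assert` wins)
#
#   extract_execution_code("... [ANSWER] f(2) == 5 [/ANSWER]", LMStyle.OpenAIChat, cot=True)
#   # -> "5"                      (takes the part after "==" inside the [ANSWER] tags)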
""" Utilities for running functions in parallel processes. """
import sys
import resource
import multiprocessing as mp
import queue
import traceback
from enum import Enum
from typing import Callable, Optional, Dict, Any, List, Iterator
from concurrent.futures import TimeoutError
import attrs
import tqdm
from pebble import concurrent, ProcessPool, ProcessExpired
class FuncTimeoutError(TimeoutError):
pass
def generate_queue() -> mp.Queue:
"""
Generates a queue that can be shared amongst processes
Returns:
(multiprocessing.Queue): A queue instance
"""
manager = mp.Manager()
return manager.Queue()
QueueEmptyException = queue.Empty
def run_func_in_process(
func: Callable,
*args,
_timeout: Optional[int] = None,
_use_spawn: bool = True,
**kwargs,
):
"""
Runs the provided function in a separate process with the supplied args
and kwargs. The args, kwargs, and
return values must all be pickle-able.
Args:
func: The function to run.
*args: Positional args, if any.
_timeout: A timeout to use for the function.
_use_spawn: If True, the 'spawn' multiprocessing context is used; 'fork' otherwise.
**kwargs: Keyword args, if any.
Returns:
The result of executing the function.
"""
mode = "spawn" if _use_spawn else "fork"
c_func = concurrent.process(timeout=_timeout, context=mp.get_context(mode))(func)
future = c_func(*args, **kwargs)
try:
result = future.result()
return result
except TimeoutError:
raise FuncTimeoutError
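# Hedged usage sketch (not part of the original file): running a picklable,
# module-level function in a separate process with a timeout, per the
# docstring above. `square` is hypothetical.
#
#   def square(x):
#       return x * x
#
#   result = run_func_in_process(square, 7, _timeout=5)   # -> 49
#   # FuncTimeoutError is raised instead if the call exceeds the timeout.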
class TaskRunStatus(Enum):
SUCCESS = 0
EXCEPTION = 1
TIMEOUT = 2
PROCESS_EXPIRED = 3
@attrs.define(eq=False, repr=False)
class TaskResult:
status: TaskRunStatus
result: Optional[Any] = None
exception_tb: Optional[str] = None
def is_success(self) -> bool:
return self.status == TaskRunStatus.SUCCESS
def is_timeout(self) -> bool:
return self.status == TaskRunStatus.TIMEOUT
def is_exception(self) -> bool:
return self.status == TaskRunStatus.EXCEPTION
def is_process_expired(self) -> bool:
return self.status == TaskRunStatus.PROCESS_EXPIRED
def initializer(limit):
"""Set maximum amount of memory each worker process can allocate."""
soft, hard = resource.getrlimit(resource.RLIMIT_AS)
resource.setrlimit(resource.RLIMIT_AS, (limit, hard))
def run_tasks_in_parallel_iter(
func: Callable,
tasks: List[Any],
num_workers: int = 2,
timeout_per_task: Optional[int] = None,
use_progress_bar: bool = False,
progress_bar_desc: Optional[str] = None,
max_tasks_per_worker: Optional[int] = None,
use_spawn: bool = True,
max_mem: int = 1024 * 1024 * 1024 * 4,
) -> Iterator[TaskResult]:
"""
Args:
func: The function to run. The function must accept a single argument.
tasks: A list of tasks i.e. arguments to func.
num_workers: Maximum number of parallel workers.
timeout_per_task: The timeout, in seconds, to use per task.
use_progress_bar: Whether to use a progress bar. Default False.
progress_bar_desc: String to display in the progress bar. Default None.
max_tasks_per_worker: Maximum number of tasks assigned
to a single process / worker. None means infinite.
Use 1 to force a restart.
use_spawn: The 'spawn' multiprocess context is used. 'fork' otherwise.
Yields:
TaskResult objects, one per task.
"""
mode = "spawn" if use_spawn else "fork"
with ProcessPool(
max_workers=num_workers,
max_tasks=0 if max_tasks_per_worker is None else max_tasks_per_worker,
context=mp.get_context(mode),
) as pool:
future = pool.map(func, tasks, timeout=timeout_per_task)
iterator = future.result()
if use_progress_bar:
pbar = tqdm.tqdm(
desc=progress_bar_desc,
total=len(tasks),
dynamic_ncols=True,
file=sys.stdout,
)
else:
pbar = None
succ = timeouts = exceptions = expirations = 0
while True:
try:
result = next(iterator)
except StopIteration:
break
except TimeoutError as error:
yield TaskResult(
status=TaskRunStatus.TIMEOUT,
)
timeouts += 1
except ProcessExpired as error:
yield TaskResult(
status=TaskRunStatus.PROCESS_EXPIRED,
)
expirations += 1
except Exception as error:
exception_tb = traceback.format_exc()
yield TaskResult(
status=TaskRunStatus.EXCEPTION,
exception_tb=exception_tb,
)
exceptions += 1
else:
yield TaskResult(
status=TaskRunStatus.SUCCESS,
result=result,
)
succ += 1
if pbar is not None:
pbar.update(1)
pbar.set_postfix(
succ=succ, timeouts=timeouts, exc=exceptions, p_exp=expirations
)
sys.stdout.flush()
sys.stderr.flush()
def run_tasks_in_parallel(
func: Callable,
tasks: List[Any],
num_workers: int = 2,
timeout_per_task: Optional[int] = None,
use_progress_bar: bool = False,
progress_bar_desc: Optional[str] = None,
max_tasks_per_worker: Optional[int] = None,
use_spawn: bool = True,
) -> List[TaskResult]:
"""
Args:
func: The function to run. The function must accept a single argument.
tasks: A list of tasks i.e. arguments to func.
num_workers: Maximum number of parallel workers.
timeout_per_task: The timeout, in seconds, to use per task.
use_progress_bar: Whether to use a progress bar. Defaults False.
progress_bar_desc: String to display in the progress bar. Default None.
max_tasks_per_worker: Maximum number of tasks assigned to a single
process / worker. None means infinite.
Use 1 to force a restart.
use_spawn: The 'spawn' multiprocess context is used. 'fork' otherwise.
Returns:
A list of TaskResult objects, one per task.
"""
task_results: List[TaskResult] = list(
run_tasks_in_parallel_iter(
func=func,
tasks=tasks,
num_workers=num_workers,
timeout_per_task=timeout_per_task,
use_progress_bar=use_progress_bar,
progress_bar_desc=progress_bar_desc,
max_tasks_per_worker=max_tasks_per_worker,
use_spawn=use_spawn,
)
)
return task_results
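# Hedged usage sketch (not part of the original file): mapping a single-argument
# function over a list of tasks with a per-task timeout, per the docstring
# above. `slow_square` is hypothetical.
#
#   def slow_square(x):
#       return x * x
#
#   results = run_tasks_in_parallel(
#       slow_square, tasks=[1, 2, 3], num_workers=2, timeout_per_task=5
#   )
#   values = [r.result for r in results if r.is_success()]   # -> [1, 4, 9]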
import pathlib
from lcb_runner.lm_styles import LanguageModel, LMStyle
from lcb_runner.utils.scenarios import Scenario
def ensure_dir(path: str, is_file=True):
if is_file:
pathlib.Path(path).parent.mkdir(parents=True, exist_ok=True)
else:
pathlib.Path(path).mkdir(parents=True, exist_ok=True)
return
def get_cache_path(model_repr: str, args) -> str:
scenario: Scenario = args.scenario
n = args.n
temperature = args.temperature
path = f"cache/{model_repr}/{scenario}_{n}_{temperature}.json"
ensure_dir(path)
return path
def get_output_path(model_repr: str, args) -> str:
scenario: Scenario = args.scenario
n = args.n
temperature = args.temperature
cot_suffix = "_cot" if args.cot_code_execution else ""
path = f"output/{model_repr}/{scenario}_{n}_{temperature}{cot_suffix}.json"
ensure_dir(path)
return path
def get_eval_all_output_path(model_repr: str, args) -> str:
scenario: Scenario = args.scenario
n = args.n
temperature = args.temperature
cot_suffix = "_cot" if args.cot_code_execution else ""
path = f"output/{model_repr}/{scenario}_{n}_{temperature}{cot_suffix}_eval_all.json"
return path
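# Hedged example (not part of the original file): with a hypothetical
# model_repr of "GPT-4", scenario codegeneration, n=10, temperature=0.2 and
# no --cot_code_execution, get_output_path would produce something like
#
#   output/GPT-4/Scenario.codegeneration_10_0.2.json
#
# (the Scenario member is interpolated directly, so "Scenario.codegeneration"
# appears in the path), creating the parent directory if needed.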
from enum import Enum
class Scenario(Enum):
codegeneration = "codegeneration"
selfrepair = "selfrepair"
testoutputprediction = "testoutputprediction"
codeexecution = "codeexecution"
def get_gpqa_search_o1_instruction(MAX_SEARCH_LIMIT):
return (
"You are a reasoning assistant with the ability to perform web searches to help "
"you answer the user's question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will search and analyze relevant web pages, then provide you with helpful information in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"What is the energy range of pp III neutrinos?\"\n"
"Assistant thinking steps:\n"
"- I might need to look up details about pp III neutrinos.\n\n"
"Assistant:\n"
"<|begin_search_query|>pp III neutrino energy spectrum<|end_search_query|>\n\n"
"(System returns processed information from relevant web pages)\n\n"
"Assistant continues reasoning with the new information...\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- When done searching, continue your reasoning.\n\n"
)
def get_math_search_o1_instruction(MAX_SEARCH_LIMIT):
return (
"You are a reasoning assistant with the ability to perform web searches to help "
"you answer the user's question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will search and analyze relevant web pages, then provide you with helpful information in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"How do you compute the integral of e^(x^2) dx?\"\n"
"Assistant thinking steps:\n"
"- I might need to look up techniques for integrating e^(x^2).\n\n"
"Assistant:\n"
"<|begin_search_query|>methods to integrate e^(x^2)<|end_search_query|>\n\n"
"(System returns processed information from relevant web pages)\n\n"
"Assistant continues reasoning with the new information...\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- When done searching, continue your reasoning.\n\n"
)
def get_code_search_o1_instruction(MAX_SEARCH_LIMIT):
return (
"You are a reasoning assistant with the ability to perform web searches to help "
"you answer the user's question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will search and analyze relevant web pages, then provide you with helpful information in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"Find the minimum number of vertices in a Steiner tree that includes all specified vertices in a given tree.\"\n"
"Assistant thinking steps:\n"
"- I need to understand what a Steiner tree is and how to compute the minimum number of vertices required to include all specified vertices in a given tree.\n\n"
"Assistant:\n"
"<|begin_search_query|>Minimum Steiner Tree problem in trees<|end_search_query|>\n\n"
"(System returns processed information from relevant web pages)\n\n"
"Assistant continues reasoning with the new information...\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- When done searching, continue your reasoning.\n\n"
)
def get_webpage_to_reasonchain_instruction(prev_reasoning, search_query, document):
return f"""**Task Instruction:**
You are tasked with reading and analyzing web pages based on the following inputs: **Previous Reasoning Steps**, **Current Search Query**, and **Searched Web Pages**. Your objective is to extract relevant and helpful information for **Current Search Query** from the **Searched Web Pages** and seamlessly integrate this information into the **Previous Reasoning Steps** to continue reasoning for the original question.
**Guidelines:**
1. **Analyze the Searched Web Pages:**
- Carefully review the content of each searched web page.
- Identify factual information that is relevant to the **Current Search Query** and can aid in the reasoning process for the original question.
2. **Extract Relevant Information:**
- Select the information from the Searched Web Pages that directly contributes to advancing the **Previous Reasoning Steps**.
- Ensure that the extracted information is accurate and relevant.
3. **Output Format:**
- **If the web pages provide helpful information for current search query:** Present the information beginning with `**Final Information**` as shown below.
**Final Information**
[Helpful information]
- **If the web pages do not provide any helpful information for current search query:** Output the following text.
**Final Information**
No helpful information found.
**Inputs:**
- **Previous Reasoning Steps:**
{prev_reasoning}
- **Current Search Query:**
{search_query}
- **Searched Web Pages:**
{document}
Now you should analyze each web page and find helpful information based on the current search query "{search_query}" and previous reasoning steps.
"""
def get_singleqa_search_o1_instruction(MAX_SEARCH_LIMIT):
return (
"You are a reasoning assistant with the ability to perform web searches to help "
"you answer the user's question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will search and analyze relevant web pages, then provide you with helpful information in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"Who got the first Nobel Prize in Physics?\"\n"
"Assistant thinking steps:\n"
"- I need to find out who was awarded the first Nobel Prize in Physics.\n\n"
"Assistant:\n"
"<|begin_search_query|>first Nobel Prize in Physics winner<|end_search_query|>\n\n"
"(System returns processed information from relevant web pages)\n\n"
"Assistant continues reasoning with the new information...\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- When done searching, continue your reasoning.\n\n"
)
def get_multiqa_search_o1_instruction(MAX_SEARCH_LIMIT):
return (
"You are a reasoning assistant with the ability to perform web searches to help "
"you answer the user's question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will search and analyze relevant web pages, then provide you with helpful information in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"Alice David is the voice of Lara Croft in a video game developed by which company?\"\n"
"Assistant thinking steps:\n"
"- I need to find out who voices Lara Croft in the video game.\n"
"- Then, I need to determine which company developed that video game.\n\n"
"Assistant:\n"
"<|begin_search_query|>Alice David Lara Croft voice<|end_search_query|>\n\n"
"(System returns processed information from relevant web pages)\n\n"
"Assistant thinks: The search results indicate that Alice David is the voice of Lara Croft in a specific video game. Now, I need to find out which company developed that game.\n\n"
"Assistant:\n"
"<|begin_search_query|>video game developed by Alice David Lara Croft<|end_search_query|>\n\n"
"(System returns processed information from relevant web pages)\n\n"
"Assistant continues reasoning with the new information...\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- When done searching, continue your reasoning.\n\n"
)
def get_singleqa_rag_agent_instruction(MAX_SEARCH_LIMIT, MAX_URL_FETCH):
return (
"You are a reasoning assistant with the ability to perform web searches and retrieve webpage content to help "
"you answer the user’s question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will call the web search API with your query and return the search results to you in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n"
" The search results will contain a list of webpages with titles, URLs, and snippets (but not full content).\n\n"
"- After receiving the search results, if you need more detailed information from one or more specific URLs, write <|begin_url|> url1, url2, ... <|end_url|>.\n"
" The system will fetch the full page content of those URLs and return it to you as <|begin_full_page|> ...full page content... <|end_full_page|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n"
f"You can fetch up to {MAX_URL_FETCH} URLs for detailed information.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"Who got the first Nobel Prize in Physics?\"\n"
"Assistant thinking steps:\n"
"- I need to find out who was awarded the first Nobel Prize in Physics.\n\n"
"Assistant:\n"
"<|begin_search_query|>first Nobel Prize in Physics winner<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results mention several URLs. I want full details from one of them.\n\n"
"Assistant:\n"
"<|begin_url|>http://example.com/first_nobel_physics.html<|end_url|>\n\n"
"(System returns full page content)\n\n"
"Assistant:\n"
"<|begin_full_page|> ...full page content... <|end_full_page|>\n\n"
"Now the assistant has enough info and can continue reasoning.\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- Use <|begin_url|> to request full page content and end with <|end_url|>.\n"
"- When done retrieving information, continue your reasoning.\n\n"
)
def get_multiqa_rag_agent_instruction(MAX_SEARCH_LIMIT, MAX_URL_FETCH):
return (
"You are a reasoning assistant with the ability to perform web searches and retrieve webpage content to help "
"you answer the user’s question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will call the web search API with your query and return the search results to you in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n"
" The search results will contain a list of webpages with titles, URLs, and snippets (but not full content).\n\n"
"- After receiving the search results, if you need more detailed information from one or more specific URLs, write <|begin_url|> url1, url2, ... <|end_url|>.\n"
" The system will fetch the full page content of those URLs and return it to you as <|begin_full_page|> ...full page content... <|end_full_page|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n"
f"You can fetch up to {MAX_URL_FETCH} URLs for detailed information.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"Alice David is the voice of Lara Croft in a video game developed by which company?\"\n"
"Assistant thinking steps:\n"
"- I need to find out who voices Lara Croft in the video game.\n"
"- Then, I need to determine which company developed that video game.\n\n"
"Assistant:\n"
"<|begin_search_query|>voice actor of Lara Croft<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results provide names of voice actors for Lara Croft. I need to confirm if Alice David is one of them.\n\n"
"Assistant:\n"
"<|begin_search_query|>Alice David Lara Croft voice<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results indicate that Alice David is the voice of Lara Croft in a specific video game. Now, I need to find out which company developed that game.\n\n"
"Assistant:\n"
"<|begin_search_query|>video game developed by Alice David Lara Croft<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results mention the company that developed the video game featuring Alice David as Lara Croft.\n\n"
"Assistant:\n"
"<|begin_url|>http://example.com/lara_croft_voice_actor.html, http://example.com/game_developer.html<|end_url|>\n\n"
"(System returns full page content)\n\n"
"Assistant:\n"
"<|begin_full_page|> ...full page content... <|end_full_page|>\n\n"
"Now the assistant has enough info and can continue reasoning.\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- Use <|begin_url|> to request full page content and end with <|end_url|>.\n"
"- When done retrieving information, continue your reasoning.\n\n"
)
def get_gpqa_rag_agent_instruction(MAX_SEARCH_LIMIT, MAX_URL_FETCH):
return (
"You are a reasoning assistant with the ability to perform web searches and retrieve webpage content to help "
"you answer the user’s question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will call the web search API with your query and return the search results to you in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n"
" The search results will contain a list of webpages with titles, URLs, and snippets (but not full content).\n\n"
"- After receiving the search results, if you need more detailed information from one or more specific URLs, write <|begin_url|> url1, url2, ... <|end_url|>.\n"
" The system will fetch the full page content of those URLs and return it to you as <|begin_full_page|> ...full page content... <|end_full_page|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n"
f"You can fetch up to {MAX_URL_FETCH} URLs for detailed information.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"What is the energy range of pp III neutrinos?\"\n"
"Assistant thinking steps:\n"
"- I might need to look up details about pp III neutrinos.\n\n"
"Assistant:\n"
"<|begin_search_query|>pp III neutrino energy spectrum<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results mention some URLs. I want full details from one of them.\n\n"
"Assistant:\n"
"<|begin_url|>http://example.com/ppIII_neutrino.html<|end_url|>\n\n"
"(System returns full page content)\n\n"
"Assistant:\n"
"<|begin_full_page|> ...full page content... <|end_full_page|>\n\n"
"Now the assistant has enough info and can continue reasoning.\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- Use <|begin_url|> to request full page content and end with <|end_url|>.\n"
"- When done retrieving information, continue your reasoning.\n\n"
)
def get_math_rag_agent_instruction(MAX_SEARCH_LIMIT, MAX_URL_FETCH):
return (
"You are a reasoning assistant with the ability to perform web searches and retrieve webpage content to help "
"you answer the user’s math-related question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will call the web search API with your query and return the search results to you in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n"
" The search results will contain a list of webpages with titles, URLs, and snippets (but not full content).\n\n"
"- After receiving the search results, if you need more detailed information from one or more specific URLs, write <|begin_url|> url1, url2, ... <|end_url|>.\n"
" The system will fetch the full page content of those URLs and return it to you as <|begin_full_page|> ...full page content... <|end_full_page|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n"
f"You can fetch up to {MAX_URL_FETCH} URLs for detailed information.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"How do you compute the integral of e^(x^2) dx?\"\n"
"Assistant thinking steps:\n"
"- I might need to look up techniques for integrating e^(x^2).\n\n"
"Assistant:\n"
"<|begin_search_query|>methods to integrate e^(x^2)<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results mention some URLs. I want full details from one of them.\n\n"
"Assistant:\n"
"<|begin_url|>http://example.com/integration_e_x_squared.html<|end_url|>\n\n"
"(System returns full page content)\n\n"
"Assistant:\n"
"<|begin_full_page|> ...full page content... <|end_full_page|>\n\n"
"Now the assistant has enough info and can continue reasoning.\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- Use <|begin_url|> to request full page content and end with <|end_url|>.\n"
"- When done retrieving information, continue your reasoning.\n\n"
)
def get_code_rag_agent_instruction(MAX_SEARCH_LIMIT, MAX_URL_FETCH):
return (
"You are a reasoning assistant with the ability to perform web searches and retrieve webpage content to help "
"you answer the user’s programming-related question accurately. You have special tools:\n\n"
"- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
"Then, the system will call the web search API with your query and return the search results to you in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n"
" The search results will contain a list of webpages with titles, URLs, and snippets (but not full content).\n\n"
"- After receiving the search results, if you need more detailed information from one or more specific URLs, write <|begin_url|> url1, url2, ... <|end_url|>.\n"
" The system will fetch the full page content of those URLs and return it to you as <|begin_full_page|> ...full page content... <|end_full_page|>.\n\n"
f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n"
f"You can fetch up to {MAX_URL_FETCH} URLs for detailed information.\n\n"
"Once you have all the information you need, continue your reasoning.\n\n"
"Example:\n"
"Question: \"How do I implement a binary search algorithm in Python?\"\n"
"Assistant thinking steps:\n"
"- I might need to look up the implementation details of binary search in Python.\n\n"
"Assistant:\n"
"<|begin_search_query|>binary search algorithm implementation in Python<|end_search_query|>\n\n"
"(System returns search results)\n\n"
"Assistant:\n"
"<|begin_search_result|> ...search results without full page... <|end_search_result|>\n\n"
"Assistant thinks: The search results mention some URLs. I want full details from one of them.\n\n"
"Assistant:\n"
"<|begin_url|>http://example.com/python_binary_search.html<|end_url|>\n\n"
"(System returns full page content)\n\n"
"Assistant:\n"
"<|begin_full_page|> ...full page content... <|end_full_page|>\n\n"
"Now the assistant has enough info and can continue reasoning.\n\n"
"Remember:\n"
"- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
"- Use <|begin_url|> to request full page content and end with <|end_url|>.\n"
"- When done retrieving information, continue your reasoning.\n\n"
)
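# All three agent instructions above share the same marker protocol:
# <|begin_search_query|> ... <|end_search_query|> requests a web search, and
# <|begin_url|> ... <|end_url|> requests full page content. Below is a minimal
# sketch of how an inference loop might detect a pending search request in the
# generated text; this helper is illustrative only and is not used elsewhere in
# this repository.
import re

def extract_search_query_demo(text):
    # Return the most recent search query emitted between the markers, or None.
    matches = re.findall(r"<\|begin_search_query\|>(.*?)<\|end_search_query\|>", text, re.DOTALL)
    return matches[-1].strip() if matches else None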
def get_naive_rag_instruction(question, documents):
return (
"You are a knowledgeable assistant that uses the provided documents to answer the user's question.\n\n"
"Question:\n"
f"{question}\n"
"Documents:\n"
f"{documents}\n"
)
def get_task_instruction_openqa(question, model_name=None):
if model_name == 'qwq':
user_prompt = (
'Please answer the following question. '
'You should provide your final answer in the format \\boxed{YOUR_ANSWER}.\n\n'
f'Question:\n{question}\n\n'
)
else:
user_prompt = (
'Please answer the following question. You should think step by step to solve it.\n\n'
'Provide your final answer in the format \\boxed{YOUR_ANSWER}.\n\n'
f'Question:\n{question}\n\n'
)
return user_prompt
def get_task_instruction_math(question, model_name=None):
if model_name == 'qwq':
user_prompt = (
'Please answer the following math question. '
'You should provide your final answer in the format \\boxed{YOUR_ANSWER}.\n\n'
f'Question:\n{question}\n\n'
)
else:
user_prompt = (
'Please answer the following math question. You should think step by step to solve it.\n\n'
'Provide your final answer in the format \\boxed{YOUR_ANSWER}.\n\n'
f'Question:\n{question}\n\n'
)
return user_prompt
def get_task_instruction_multi_choice(question, model_name=None):
if model_name == 'qwq':
user_prompt = (
'Please answer the following multiple-choice question. '
'You should provide your final choice in the format \\boxed{YOUR_CHOICE}.\n\n'
f'Question:\n{question}\n\n'
)
elif model_name == 'llama':
user_prompt = (
'Please answer the following multiple-choice question. You should think step by step to solve it.\n\n'
'Provide your final choice in the format \\boxed{YOUR_CHOICE}. Your final choice should be one of the letters A, B, C, or D; DO NOT include any answer content.\n\n'
f'Question:\n{question}\n\n'
)
else:
user_prompt = (
'Please answer the following multiple-choice question. You should think step by step to solve it.\n\n'
'Provide your final choice in the format \\boxed{YOUR_CHOICE}.\n\n'
f'Question:\n{question}\n\n'
)
return user_prompt
def get_task_instruction_code(question, question_title=None, model_name=None):
if model_name == 'qwq':
user_prompt = (
'Generate a correct Python program that passes all tests for the given problem. '
'You should provide your final code within a Python code block using triple backticks (```python\n'
'YOUR_CODE\n'
'```).\n\n'
f'Problem Title: {question_title}\n\n'
f'Problem Statement:\n{question}\n\n'
)
else:
user_prompt = (
'You will be given a question (problem specification) and will generate a correct Python program that matches the specification and passes all tests. '
f'You should think step by step to solve it.\n\nQuestion:\n{question}\n\n'
'Read the inputs from stdin, solve the problem, and write the answer to stdout (do not directly test on the sample inputs). Enclose your code within delimiters as follows.\n\n'
"```python\n# YOUR CODE HERE\n```\n\n"
)
return user_prompt
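# A small, self-contained demo of how these helpers compose a final user prompt.
# The question text, documents string, and the 'qwq' model name are illustrative
# assumptions; the real pipelines (direct generation and naive RAG below) pass
# actual dataset items and then wrap the result with the tokenizer's chat template.
if __name__ == "__main__":
    sample_question = "Who proposed the theory of general relativity?"
    sample_documents = "**Document 1:**\n**Title:** ...\n**Content:** ...\n"
    rag_instruction = get_naive_rag_instruction(sample_question, sample_documents)
    task_prompt = get_task_instruction_openqa(sample_question, model_name='qwq')
    print(rag_instruction + "\n\n" + task_prompt)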
import csv
import json
import random
import torch
import re
import os, time
import numpy as np
from tqdm import tqdm
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from evaluate import run_evaluation
from prompts import (
get_task_instruction_openqa,
get_task_instruction_math,
get_task_instruction_multi_choice,
get_task_instruction_code,
)
import argparse
def parse_args():
parser = argparse.ArgumentParser(description="Run direct generation for various datasets and models.")
parser.add_argument(
'--dataset_name',
type=str,
required=True,
choices=['gpqa', 'math500', 'aime', 'amc', 'livecode', 'nq', 'triviaqa', 'hotpotqa', '2wiki', 'musique', 'bamboogle', 'medmcqa', 'pubhealth'],
help="Name of the dataset to use."
)
parser.add_argument(
'--split',
type=str,
required=True,
choices=['test', 'diamond', 'main', 'extended'],
help="Dataset split to use."
)
parser.add_argument(
'--subset_num',
type=int,
default=-1,
help="Number of examples to process. Defaults to all if not specified."
)
parser.add_argument(
'--model_path',
type=str,
required=True,
help="Path to the pre-trained model."
)
parser.add_argument(
'--temperature',
type=float,
default=0.7,
help="Sampling temperature."
)
parser.add_argument(
'--top_p',
type=float,
default=0.8,
help="Top-p sampling parameter."
)
parser.add_argument(
'--top_k',
type=int,
default=20,
help="Top-k sampling parameter."
)
parser.add_argument(
'--repetition_penalty',
type=float,
default=None,
help="Repetition penalty. If not set, defaults based on the model."
)
parser.add_argument(
'--max_tokens',
type=int,
default=None,
help="Maximum number of tokens to generate. If not set, defaults based on the model and dataset."
)
return parser.parse_args()
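# Example invocation (the script name and model path are assumptions, shown only
# for illustration; any supported --dataset_name/--split combination works):
#   python run_direct_gen.py --dataset_name math500 --split test \
#       --model_path <path-or-hub-id-of-model> --temperature 0.7 --top_p 0.8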
def main():
args = parse_args()
dataset_name = args.dataset_name
split = args.split
subset_num = args.subset_num
model_path = args.model_path
temperature = args.temperature
top_p = args.top_p
top_k = args.top_k
repetition_penalty = args.repetition_penalty
max_tokens = args.max_tokens
# Set default repetition_penalty if not provided
if repetition_penalty is None:
repetition_penalty = 1.05 if 'qwq' in model_path.lower() else 1.0
# Paths to datasets
if dataset_name == 'math500':
data_path = f'./data/MATH500/{split}.json'
elif dataset_name == 'gpqa':
data_path = f'./data/GPQA/{split}.json'
elif dataset_name == 'aime':
data_path = f'./data/AIME/{split}.json'
elif dataset_name == 'amc':
data_path = f'./data/AMC/{split}.json'
elif dataset_name == 'livecode':
data_path = f'./data/LiveCodeBench/{split}.json'
elif dataset_name in ['nq', 'triviaqa', 'hotpotqa', 'musique', 'bamboogle', '2wiki', 'medmcqa', 'pubhealth']:
data_path = f'./data/QA_Datasets/{dataset_name}.json'
else:
raise ValueError(f"Unsupported dataset_name: {dataset_name}")
# Load the model
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
if 'qwq' in model_path.lower():
if dataset_name in ['math500', 'gpqa', 'aime', 'amc', 'livecode']:
output_dir = f'./outputs/{dataset_name}.qwq.direct'
else:
output_dir = f'./outputs/runs.qa/{dataset_name}.qwq.direct'
else:
model_short_name = model_path.split('/')[-1].lower().replace('-instruct', '')
output_dir = f'./outputs/runs.baselines/{dataset_name}.{model_short_name}.direct'
os.makedirs(output_dir, exist_ok=True)
llm = LLM(
model=model_path,
tensor_parallel_size=torch.cuda.device_count(),
gpu_memory_utilization=0.95,
)
# Load data
with open(data_path, mode='r', encoding='utf-8') as json_file:
filtered_data = json.load(json_file)
# prepare input
input_list = []
for item in filtered_data:
question = item['Question']
if dataset_name in ['nq', 'triviaqa', 'hotpotqa', 'musique', 'bamboogle', '2wiki']:
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_openqa(question, model_name='qwq')
else:
user_prompt = get_task_instruction_openqa(question)
elif dataset_name in ['math500', 'aime', 'amc']:
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_math(question, model_name='qwq')
else:
user_prompt = get_task_instruction_math(question)
elif dataset_name in ['gpqa']:
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_multi_choice(question, model_name='qwq')
elif 'llama' in model_path.lower():
user_prompt = get_task_instruction_multi_choice(question, model_name='llama')
else:
user_prompt = get_task_instruction_multi_choice(question)
elif dataset_name == 'livecode':
question_title = item.get('question_title', '')
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_code(question, question_title=question_title, model_name='qwq')
else:
user_prompt = get_task_instruction_code(question)
else:
user_prompt = "" # Default to empty if dataset not matched
prompt = [{"role": "user", "content": user_prompt}]
prompt = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
input_list.append(prompt)
if subset_num != -1:
input_list = input_list[:subset_num]
filtered_data = filtered_data[:subset_num]
# Set default max_tokens if not provided
if max_tokens is None:
if 'qwq' in model_path.lower():
if dataset_name in ['aime', 'amc', 'livecode']:
max_tokens = 32768
else:
max_tokens = 25600
else:
max_tokens = 3096
t_start = time.time()
# Generate model outputs
output_list = llm.generate(
input_list,
sampling_params=SamplingParams(
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k,
repetition_penalty=repetition_penalty,
)
)
total_time = time.time() - t_start
# Run evaluation
run_evaluation(
filtered_data,
input_list,
output_list,
dataset_name,
output_dir,
total_time,
split,
)
if __name__ == "__main__":
main()
# run_naive_rag.py
import os
import json
import time
from tqdm import tqdm
from typing import List, Dict, Optional, Tuple
import argparse
from bing_search import (
bing_web_search,
extract_relevant_info,
fetch_page_content,
extract_snippet_with_context,
)
from evaluate import run_evaluation, extract_answer
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import re
import string
from nltk.tokenize import sent_tokenize
import torch
from prompts import (
get_task_instruction_openqa,
get_task_instruction_math,
get_task_instruction_multi_choice,
get_task_instruction_code,
get_naive_rag_instruction,
)
def parse_args():
parser = argparse.ArgumentParser(description="Run Naive RAG for various datasets and models.")
# Dataset and split configuration
parser.add_argument(
'--dataset_name',
type=str,
required=True,
choices=['gpqa', 'math500', 'aime', 'amc', 'livecode', 'nq', 'triviaqa', 'hotpotqa', '2wiki', 'musique', 'bamboogle', 'medmcqa', 'pubhealth'],
help="Name of the dataset to use."
)
parser.add_argument(
'--split',
type=str,
required=True,
choices=['test', 'diamond', 'main', 'extended'],
help="Dataset split to use."
)
parser.add_argument(
'--subset_num',
type=int,
default=None,
help="Number of examples to process. Defaults to all if not specified."
)
# Search and document retrieval configuration
parser.add_argument(
'--top_k',
type=int,
default=10,
help="Number of top search results to retrieve."
)
parser.add_argument(
'--max_doc_len',
type=int,
default=3000,
help="Maximum length of each searched document."
)
# Model configuration
parser.add_argument(
'--model_path',
type=str,
required=True,
help="Path to the pre-trained model."
)
parser.add_argument(
'--use_jina',
type=lambda x: str(x).lower() in ('true', '1', 'yes'),  # plain type=bool would treat any non-empty string (e.g. "False") as True
default=True,
help="Whether to use Jina API for document fetching."
)
parser.add_argument(
'--jina_api_key',
type=str,
default='None',
help="Your Jina API Key to Fetch URL Content."
)
# Sampling parameters
parser.add_argument(
'--temperature',
type=float,
default=0.7,
help="Sampling temperature."
)
parser.add_argument(
'--top_p',
type=float,
default=0.8,
help="Top-p sampling parameter."
)
parser.add_argument(
'--top_k_sampling',
type=int,
default=20,
help="Top-k sampling parameter."
)
parser.add_argument(
'--repetition_penalty',
type=float,
default=None,
help="Repetition penalty. If not set, defaults based on the model."
)
parser.add_argument(
'--max_tokens',
type=int,
default=None,
help="Maximum number of tokens to generate. If not set, defaults based on the model and dataset."
)
# Bing API Configuration
parser.add_argument(
'--bing_subscription_key',
type=str,
required=True,
help="Bing Search API subscription key."
)
parser.add_argument(
'--bing_endpoint',
type=str,
default="https://api.bing.microsoft.com/v7.0/search",
help="Bing Search API endpoint."
)
return parser.parse_args()
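# Example invocation (all values illustrative; the model path and Bing key are placeholders):
#   python run_naive_rag.py --dataset_name hotpotqa --split test \
#       --model_path <path-or-hub-id-of-model> --bing_subscription_key <YOUR_BING_KEY> \
#       --top_k 10 --max_doc_len 3000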
def main():
args = parse_args()
# Extract arguments
dataset_name = args.dataset_name
split = args.split
subset_num = args.subset_num
top_k = args.top_k
max_doc_len = args.max_doc_len
model_path = args.model_path
temperature = args.temperature
top_p = args.top_p
top_k_sampling = args.top_k_sampling
repetition_penalty = args.repetition_penalty
max_tokens = args.max_tokens
bing_subscription_key = args.bing_subscription_key
bing_endpoint = args.bing_endpoint
use_jina = args.use_jina
jina_api_key = args.jina_api_key
# Set default repetition_penalty if not provided
if repetition_penalty is None:
repetition_penalty = 1.05 if 'qwq' in model_path.lower() else 1.0
if args.jina_api_key == 'None':
jina_api_key = None
# Paths to datasets
if dataset_name == 'livecode':
data_path = f'./data/LiveCodeBench/{split}.json'
elif dataset_name in ['math500', 'gpqa', 'aime', 'amc']:
data_path = f'./data/{dataset_name.upper()}/{split}.json'
else:
data_path = f'./data/QA_Datasets/{dataset_name}.json'
# ---------------------- Caching Mechanism ----------------------
# Define cache directories and file paths
cache_dir = './cache'
search_cache_path = os.path.join(cache_dir, 'search_cache.json')
url_cache_path = os.path.join(cache_dir, 'url_cache.json')
# Ensure cache directory exists
os.makedirs(cache_dir, exist_ok=True)
# Load existing caches or initialize empty dictionaries
if os.path.exists(search_cache_path):
with open(search_cache_path, 'r', encoding='utf-8') as f:
search_cache = json.load(f)
else:
search_cache = {}
if os.path.exists(url_cache_path):
with open(url_cache_path, 'r', encoding='utf-8') as f:
url_cache = json.load(f)
else:
url_cache = {}
# Function to save caches
def save_caches():
with open(search_cache_path, 'w', encoding='utf-8') as f:
json.dump(search_cache, f, ensure_ascii=False, indent=2)
with open(url_cache_path, 'w', encoding='utf-8') as f:
json.dump(url_cache, f, ensure_ascii=False, indent=2)
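# For reference: search_cache maps each raw question string to its Bing API
# response, and url_cache maps each URL to its fetched page text. Both live as
# plain JSON under ./cache, so repeated runs can reuse earlier searches and fetches.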
# ---------------------- Model Loading ----------------------
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'
# Define output directory based on model and dataset
if 'qwq' in model_path.lower():
if dataset_name in ['math500', 'gpqa', 'aime', 'amc', 'livecode']:
output_dir = f'./outputs/{dataset_name}.qwq.naive_rag'
else:
output_dir = f'./outputs/runs.qa/{dataset_name}.qwq.naive_rag'
else:
model_short_name = model_path.split('/')[-1].lower().replace('-instruct', '')
output_dir = f'./outputs/runs.baselines/{dataset_name}.{model_short_name}.naive_rag'
os.makedirs(output_dir, exist_ok=True)
# ---------------------- Data Loading ----------------------
with open(data_path, 'r', encoding='utf-8') as json_file:
data = json.load(json_file)
if subset_num is not None:
data = data[:subset_num]
# ---------------------- Search and Document Retrieval ----------------------
print("Performing Bing Web Searches for all questions...")
# Initialize a list to hold relevant information for each question
all_relevant_info = []
for item in tqdm(data, desc="Searching"):
question = item['Question']
# Check if the question has already been searched and cached
if question in search_cache:
results = search_cache[question]
# print(f"Using cached search results for question: {question}")
else:
if dataset_name == 'livecode':
search_question = question[:500]
else:
search_question = question
results = bing_web_search(search_question, bing_subscription_key, bing_endpoint, market='en-US', language='en')
search_cache[question] = results
# print(f"Executed and cached search for question: {question}")
# Extract relevant information from search results
relevant_info = extract_relevant_info(results)[:top_k]
all_relevant_info.append(relevant_info)
# Save search cache after retrieval
save_caches()
print("Search cache saved.")
# Collect all unique URLs to fetch
unique_urls = set()
url_snippets_map = {}
for relevant_info in all_relevant_info:
for info in relevant_info:
url = info['url']
snippet = info.get('snippet', "")
unique_urls.add(url)
url_snippets_map[url] = snippet
# Determine which URLs need to be fetched
urls_to_fetch = [url for url in unique_urls if url not in url_cache]
print(f"Fetching {len(urls_to_fetch)} unique URLs...")
fetched_contents = fetch_page_content(
urls_to_fetch,
use_jina=use_jina,
jina_api_key=jina_api_key,
# snippets=url_snippets_map
)
# Update URL cache with fetched contents
for url, content in fetched_contents.items():
url_cache[url] = content
# Save URL cache after fetching
save_caches()
print("URL cache saved.")
# ---------------------- Prompt Construction ----------------------
print("Constructing prompts for generation...")
input_prompts = []
for idx, item in enumerate(tqdm(data, desc="Constructing Prompts")):
question = item['Question']
formatted_documents = ""
relevant_info = all_relevant_info[idx]
for i, doc_info in enumerate(relevant_info):
url = doc_info['url']
snippet = doc_info.get('snippet', "")
raw_context = url_cache.get(url, "")
success, context = extract_snippet_with_context(raw_context, snippet, context_chars=max_doc_len)
if not success:
    context = raw_context[:2 * max_doc_len]
# Clean snippet from HTML tags if any
clean_snippet = re.sub('<[^<]+?>', '', snippet) # Removes HTML tags
formatted_documents += f"**Document {i + 1}:**\n"
formatted_documents += f"**Title:** {doc_info.get('title', '')}\n"
formatted_documents += f"**URL:** {url}\n"
formatted_documents += f"**Snippet:** {clean_snippet}\n"
formatted_documents += f"**Content:** {context}\n\n"
# Construct the instruction with documents and question
instruction = get_naive_rag_instruction(question, formatted_documents)
# Construct dataset and model-specific prompts
if dataset_name in ['nq', 'triviaqa', 'hotpotqa', 'musique', 'bamboogle', '2wiki']:
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_openqa(question, model_name='qwq')
else:
user_prompt = get_task_instruction_openqa(question)
elif dataset_name in ['math500', 'aime', 'amc']:
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_math(question, model_name='qwq')
else:
user_prompt = get_task_instruction_math(question)
elif dataset_name == 'gpqa':
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_multi_choice(question, model_name='qwq')
elif 'llama' in model_path.lower():
user_prompt = get_task_instruction_multi_choice(question, model_name='llama')
else:
user_prompt = get_task_instruction_multi_choice(question)
elif dataset_name == 'livecode':
question_title = item.get('question_title', '')
if 'qwq' in model_path.lower():
user_prompt = get_task_instruction_code(question, question_title=question_title, model_name='qwq')
else:
user_prompt = get_task_instruction_code(question)
else:
user_prompt = "" # Default to empty if dataset not matched
# Combine instruction and user prompt
full_prompt = instruction + "\n\n" + user_prompt
# Apply tokenizer and chat template
prompt = [{"role": "user", "content": full_prompt}]
prompt = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
input_prompts.append(prompt)
# ---------------------- Generation ----------------------
# Initialize the LLM
llm = LLM(
model=model_path,
tensor_parallel_size=torch.cuda.device_count(),
gpu_memory_utilization=0.95,
)
print("Generating answers with LLM...")
# Set default max_tokens if not provided
if max_tokens is None:
if 'qwq' in model_path.lower():
max_tokens = 20480
else:
max_tokens = 10240
start_time = time.time()
# Generate model outputs
output_list = llm.generate(
input_prompts,
sampling_params=SamplingParams(
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k_sampling,
repetition_penalty=repetition_penalty,
)
)
total_time = time.time() - start_time
# ---------------------- Evaluation ----------------------
print("Evaluating generated answers...")
run_evaluation(
filtered_data=data,
input_list=input_prompts,
output_list=output_list,
dataset_name=dataset_name,
output_dir=output_dir,
total_time=total_time,
split=split,
)
# ---------------------- Update Search and URL Cache ----------------------
print('Updating Search and URL Cache...')
# Load existing caches or initialize empty dictionaries
if os.path.exists(search_cache_path):
with open(search_cache_path, 'r', encoding='utf-8') as f:
search_cache_new = json.load(f)
else:
search_cache_new = {}
if os.path.exists(url_cache_path):
with open(url_cache_path, 'r', encoding='utf-8') as f:
url_cache_new = json.load(f)
else:
url_cache_new = {}
search_cache.update(search_cache_new)
url_cache.update(url_cache_new)
save_caches()
print("Process completed.")
if __name__ == "__main__":
main()