Commit b6edc328 authored by chenzk

v1.0
# torch
transformers
sentencepiece
# vllm
tqdm
nltk
pyext
bs4
pdfplumber
icon.png (53.8 KB)

# Model code
modelCode=1233
# Model name
modelName=search-o1_pytorch
# Model description
modelDescription=Dynamically retrieves and integrates external knowledge, giving open-source models CoT "slow thinking" capability without any training; a reasoning-oriented counterpart of o1.
# Application scenarios
appScenario=Reasoning,dialogue QA,manufacturing,broadcast media,finance,energy,healthcare,smart home,education
# Framework type
frameType=pytorch
import os
import json
import requests
from requests.exceptions import Timeout
from bs4 import BeautifulSoup
from tqdm import tqdm
import time
import concurrent
from concurrent.futures import ThreadPoolExecutor
import pdfplumber
from io import BytesIO
import re
import string
from typing import Optional, Tuple
from nltk.tokenize import sent_tokenize
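# Note: sent_tokenize relies on NLTK's 'punkt' tokenizer data; if it is not
# installed yet, download it once with:  import nltk; nltk.download('punkt')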
# ----------------------- Custom Headers -----------------------
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/58.0.3029.110 Safari/537.36',
'Referer': 'https://www.google.com/',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
# Initialize session
session = requests.Session()
session.headers.update(headers)
def remove_punctuation(text: str) -> str:
"""Remove punctuation from the text."""
return text.translate(str.maketrans("", "", string.punctuation))
def f1_score(true_set: set, pred_set: set) -> float:
"""Calculate the F1 score between two sets of words."""
intersection = len(true_set.intersection(pred_set))
if not intersection:
return 0.0
precision = intersection / float(len(pred_set))
recall = intersection / float(len(true_set))
return 2 * (precision * recall) / (precision + recall)
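# Illustrative example (not part of the original module): with
# true_set={'dimethyl', 'fumarate'} and pred_set={'fumarate', 'ester'},
# precision = recall = 0.5, so f1_score(...) returns 0.5.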
def extract_snippet_with_context(full_text: str, snippet: str, context_chars: int = 2500) -> Tuple[bool, str]:
"""
Extract the sentence that best matches the snippet and its context from the full text.
Args:
full_text (str): The full text extracted from the webpage.
snippet (str): The snippet to match.
context_chars (int): Number of characters to include before and after the snippet.
Returns:
Tuple[bool, str]: The first element indicates whether extraction was successful, the second element is the extracted context.
"""
try:
full_text = full_text[:50000]
snippet = snippet.lower()
snippet = remove_punctuation(snippet)
snippet_words = set(snippet.split())
best_sentence = None
best_f1 = 0.2
# sentences = re.split(r'(?<=[.!?]) +', full_text) # Split sentences using regex, supporting ., !, ? endings
sentences = sent_tokenize(full_text) # Split sentences using nltk's sent_tokenize
for sentence in sentences:
key_sentence = sentence.lower()
key_sentence = remove_punctuation(key_sentence)
sentence_words = set(key_sentence.split())
f1 = f1_score(snippet_words, sentence_words)
if f1 > best_f1:
best_f1 = f1
best_sentence = sentence
if best_sentence:
para_start = full_text.find(best_sentence)
para_end = para_start + len(best_sentence)
start_index = max(0, para_start - context_chars)
end_index = min(len(full_text), para_end + context_chars)
context = full_text[start_index:end_index]
return True, context
else:
# If no matching sentence is found, return the first context_chars*2 characters of the full text
return False, full_text[:context_chars * 2]
except Exception as e:
return False, f"Failed to extract snippet context due to {str(e)}"
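# Illustrative usage (page_text and the snippet string are placeholders):
#   ok, ctx = extract_snippet_with_context(page_text, "dimethyl fumarate is the methyl ester")
#   context = ctx if ok else ctx[:2500]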
def extract_text_from_url(url, use_jina=False, jina_api_key=None, snippet: Optional[str] = None):
"""
Extract text from a URL. If a snippet is provided, extract the context related to it.
Args:
url (str): URL of a webpage or PDF.
use_jina (bool): Whether to use Jina for extraction.
snippet (Optional[str]): The snippet to search for.
Returns:
str: Extracted text or context.
"""
try:
if use_jina:
jina_headers = {
'Authorization': f'Bearer {jina_api_key}',
'X-Return-Format': 'markdown',
# 'X-With-Links-Summary': 'true'
}
response = requests.get(f'https://r.jina.ai/{url}', headers=jina_headers).text
# Remove URLs
pattern = r"\(https?:.*?\)|\[https?:.*?\]"
            text = re.sub(pattern, "", response).replace('---', '-').replace('===', '=').replace('   ', ' ').replace('  ', ' ')
else:
response = session.get(url, timeout=20) # Set timeout to 20 seconds
response.raise_for_status() # Raise HTTPError if the request failed
# Determine the content type
content_type = response.headers.get('Content-Type', '')
if 'pdf' in content_type:
# If it's a PDF file, extract PDF text
return extract_pdf_text(url)
# Try using lxml parser, fallback to html.parser if unavailable
try:
soup = BeautifulSoup(response.text, 'lxml')
except Exception:
print("lxml parser not found or failed, falling back to html.parser")
soup = BeautifulSoup(response.text, 'html.parser')
text = soup.get_text(separator=' ', strip=True)
if snippet:
success, context = extract_snippet_with_context(text, snippet)
if success:
return context
else:
return text
else:
# If no snippet is provided, return directly
return text[:8000]
except requests.exceptions.HTTPError as http_err:
return f"HTTP error occurred: {http_err}"
except requests.exceptions.ConnectionError:
return "Error: Connection error occurred"
except requests.exceptions.Timeout:
return "Error: Request timed out after 20 seconds"
except Exception as e:
return f"Unexpected error: {str(e)}"
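# Illustrative usage (URL and snippet are placeholders):
#   text = extract_text_from_url("https://example.com", use_jina=False,
#                                snippet="Example Domain")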
def fetch_page_content(urls, max_workers=4, use_jina=False, snippets: Optional[dict] = None):
"""
Concurrently fetch content from multiple URLs.
Args:
urls (list): List of URLs to scrape.
max_workers (int): Maximum number of concurrent threads.
use_jina (bool): Whether to use Jina for extraction.
snippets (Optional[dict]): A dictionary mapping URLs to their respective snippets.
Returns:
dict: A dictionary mapping URLs to the extracted content or context.
"""
results = {}
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Use tqdm to display a progress bar
futures = {
executor.submit(extract_text_from_url, url, use_jina, snippets.get(url) if snippets else None): url
for url in urls
}
for future in tqdm(concurrent.futures.as_completed(futures), desc="Fetching URLs", total=len(urls)):
url = futures[future]
try:
data = future.result()
results[url] = data
except Exception as exc:
results[url] = f"Error fetching {url}: {exc}"
time.sleep(0.2) # Simple rate limiting
return results
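# Illustrative usage (URLs and snippet text are placeholders):
#   urls = ["https://example.com", "https://example.org"]
#   snippets = {u: "Example Domain" for u in urls}
#   pages = fetch_page_content(urls, max_workers=2, snippets=snippets)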
def bing_web_search(query, subscription_key, endpoint, market='en-US', language='en', timeout=20):
"""
Perform a search using the Bing Web Search API with a set timeout.
Args:
query (str): Search query.
subscription_key (str): Subscription key for the Bing Search API.
endpoint (str): Endpoint for the Bing Search API.
market (str): Market, e.g., "en-US" or "zh-CN".
language (str): Language of the results, e.g., "en".
timeout (int or float or tuple): Request timeout in seconds.
Can be a float representing the total timeout,
or a tuple (connect timeout, read timeout).
Returns:
dict: JSON response of the search results. Returns None or raises an exception if the request times out.
"""
headers = {
"Ocp-Apim-Subscription-Key": subscription_key
}
params = {
"q": query,
"mkt": market,
"setLang": language,
"textDecorations": True,
"textFormat": "HTML"
}
try:
response = requests.get(endpoint, headers=headers, params=params, timeout=timeout)
response.raise_for_status() # Raise exception if the request failed
search_results = response.json()
return search_results
except Timeout:
print(f"Bing Web Search request timed out ({timeout} seconds) for query: {query}")
return {} # Or you can choose to raise an exception
except requests.exceptions.RequestException as e:
print(f"Error occurred during Bing Web Search request: {e}")
return {}
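# Illustrative usage (subscription key is a placeholder; the endpoint matches
# the one used in the __main__ example below):
#   results = bing_web_search(
#       "Structure of dimethyl fumarate",
#       subscription_key="<YOUR_KEY>",
#       endpoint="https://api.bing.microsoft.com/v7.0/search",
#       timeout=(5, 20),  # (connect timeout, read timeout)
#   )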
def extract_pdf_text(url):
"""
Extract text from a PDF.
Args:
url (str): URL of the PDF file.
Returns:
str: Extracted text content or error message.
"""
try:
response = session.get(url, timeout=20) # Set timeout to 20 seconds
if response.status_code != 200:
return f"Error: Unable to retrieve the PDF (status code {response.status_code})"
# Open the PDF file using pdfplumber
with pdfplumber.open(BytesIO(response.content)) as pdf:
full_text = ""
for page in pdf.pages:
text = page.extract_text()
if text:
full_text += text
# Limit the text length
cleaned_text = ' '.join(full_text.split()[:600])
return cleaned_text
except requests.exceptions.Timeout:
return "Error: Request timed out after 20 seconds"
except Exception as e:
return f"Error: {str(e)}"
def extract_relevant_info(search_results):
"""
Extract relevant information from Bing search results.
Args:
search_results (dict): JSON response from the Bing Web Search API.
Returns:
list: A list of dictionaries containing the extracted information.
"""
useful_info = []
if 'webPages' in search_results and 'value' in search_results['webPages']:
for id, result in enumerate(search_results['webPages']['value']):
info = {
'id': id + 1, # Increment id for easier subsequent operations
'title': result.get('name', ''),
'url': result.get('url', ''),
'site_name': result.get('siteName', ''),
'date': result.get('datePublished', '').split('T')[0],
                'snippet': re.sub(r'<[^<]+?>', '', result.get('snippet', '')),  # Remove HTML tags
# Add context content to the information
'context': '' # Reserved field to be filled later
}
useful_info.append(info)
return useful_info
# ------------------------------------------------------------
if __name__ == "__main__":
# Example usage
# Define the query to search
query = "Structure of dimethyl fumarate"
# Subscription key and endpoint for Bing Search API
    BING_SUBSCRIPTION_KEY = os.environ.get("BING_SEARCH_V7_SUBSCRIPTION_KEY", "")
if not BING_SUBSCRIPTION_KEY:
raise ValueError("Please set the BING_SEARCH_V7_SUBSCRIPTION_KEY environment variable.")
bing_endpoint = "https://api.bing.microsoft.com/v7.0/search"
# Perform the search
print("Performing Bing Web Search...")
search_results = bing_web_search(query, BING_SUBSCRIPTION_KEY, bing_endpoint)
print("Extracting relevant information from search results...")
extracted_info = extract_relevant_info(search_results)
print("Fetching and extracting context for each snippet...")
for info in tqdm(extracted_info, desc="Processing Snippets"):
full_text = extract_text_from_url(info['url'], use_jina=True) # Get full webpage text
if full_text and not full_text.startswith("Error"):
success, context = extract_snippet_with_context(full_text, info['snippet'])
if success:
info['context'] = context
else:
info['context'] = f"Could not extract context. Returning first 8000 chars: {full_text[:8000]}"
else:
info['context'] = f"Failed to fetch full text: {full_text}"
# print("Your Search Query:", query)
# print("Final extracted information with context:")
# print(json.dumps(extracted_info, indent=2, ensure_ascii=False))
import re
import json
import numpy as np
from collections import Counter
import string
import os, time
from collections import defaultdict
from lcb_runner.evaluation import codegen_metrics
from utils.math_equivalence import is_equiv
def extract_answer(output, mode='gen'):
extracted_text = ''
if mode == 'codegen':
# Extract the code between ```python and ```
pattern = r'```python\s*(.*?)\s*```'
matches = re.findall(pattern, output, re.DOTALL | re.IGNORECASE)
if matches:
extracted_text = matches[-1].strip() # Take the last match
elif mode == 'infogen':
# Extract content after **Final Information** or **Modified Reasoning Steps**
pattern_info = "\n**Final Information**"
pattern_step = "\n**Modified Reasoning Steps**"
if pattern_info in output:
extracted_text = output.split(pattern_info)[-1].replace("\n","").strip("```").strip()
elif pattern_step in output:
extracted_text = output.split(pattern_step)[-1].strip("```").strip()
else:
extracted_text = "No helpful information found."
else:
# Existing extraction logic for 'gen' and 'choose' modes
pattern = r'\\boxed\{(.*)\}'
matches = re.findall(pattern, output)
if matches:
extracted_text = matches[-1] # Take the last match
if mode in ['choose', 'qa']:
# Handle 'choose' mode
inner_pattern = r'\\text\{(.*)\}'
inner_matches = re.findall(inner_pattern, extracted_text)
if inner_matches:
extracted_text = inner_matches[-1] # Take the last match
extracted_text = extracted_text.strip("()")
return extracted_text
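# Illustrative behavior (not part of the original module):
#   an output containing "\boxed{42}" yields '42' in 'gen' mode;
#   "```python ... ```" blocks yield the last code block in 'codegen' mode;
#   "\boxed{\text{(A)}}" yields 'A' in 'choose' mode (outer parentheses stripped).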
def normalize_answer(text):
text = text.lower()
text = " ".join(text.strip().split())
return text
def normalize_answer_qa(s):
def remove_articles(text):
return re.sub(r"\b(a|an|the)\b", " ", text)
def white_space_fix(text):
return " ".join(text.strip().split())
def remove_punc(text):
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
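# Illustrative example (not part of the original module):
#   normalize_answer_qa("The Eiffel Tower!") -> "eiffel tower"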
def evaluate_predictions(output, labeled_answer, mode='gen'):
final_metric = {"is_valid_answer": False, "acc": 0, "em": 0, "f1": 0, 'math_equal': 0}
pred_answer = extract_answer(output, mode=mode)
if pred_answer != '':
final_metric["is_valid_answer"] = True
if mode == 'qa':
normalized_pred_answer = normalize_answer_qa(pred_answer)
for answer in labeled_answer:
normalized_ground_truth = normalize_answer_qa(answer)
em = int(normalized_pred_answer == normalized_ground_truth)
acc = int(normalized_ground_truth in normalized_pred_answer)
prediction_tokens = normalized_pred_answer.split()
ground_truth_tokens = normalized_ground_truth.split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
continue
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
for k in ["em", "acc", "f1"]:
final_metric[k] = max(eval(k), final_metric[k])
else:
normalized_pred_answer = normalize_answer(pred_answer)
normalized_ground_truth = normalize_answer(labeled_answer)
em = int(normalized_pred_answer == normalized_ground_truth)
acc = int(normalized_ground_truth in normalized_pred_answer)
prediction_tokens = normalized_pred_answer.split()
ground_truth_tokens = normalized_ground_truth.split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
f1 = 0
else:
precision = 1.0 * num_same / len(prediction_tokens) if len(prediction_tokens) > 0 else 0
recall = 1.0 * num_same / len(ground_truth_tokens) if len(ground_truth_tokens) > 0 else 0
if (precision + recall) == 0:
f1 = 0
else:
f1 = (2 * precision * recall) / (precision + recall)
final_metric["em"] = em
final_metric["acc"] = acc
final_metric["f1"] = f1
final_metric["math_equal"] = is_equiv(normalized_pred_answer, normalized_ground_truth)
# print(em, acc, f1, normalized_pred_answer, '|', normalized_ground_truth)
return final_metric, pred_answer
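# Illustrative usage (the output string is a made-up example):
#   metrics, pred = evaluate_predictions("So the answer is \\boxed{\\text{Paris}}",
#                                        ["Paris"], mode='qa')
#   -> pred == 'Paris' and metrics['em'] == metrics['acc'] == metrics['f1'] == 1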
def run_evaluation(filtered_data, input_list, output_list, dataset_name, output_dir, total_time, split, apply_backoff=False):
if dataset_name == 'livecode':
# Prepare samples and generations for codegen_metrics
samples_list = []
generations_list = []
# Collect difficulty levels for per-domain metrics
difficulties = []
per_difficulty_count = {}
num_valid_answer = 0
for item, input_prompt, result in zip(filtered_data, input_list, output_list):
            if isinstance(result, str):
item['Output'] = result
else:
item['Output'] = result.outputs[0].text
difficulty = item.get("difficulty", "Unknown")
difficulties.append(difficulty)
# Track metrics per domain
if difficulty not in per_difficulty_count.keys():
per_difficulty_count[difficulty] = 0
pred_code = extract_answer(item['Output'], mode='codegen')
if pred_code != '':
num_valid_answer += 1
per_difficulty_count[difficulty] += 1
# Assuming each item has 'input_output' with 'inputs' and 'outputs'
            public_test_cases = json.loads(item.get("public_test_cases", "[]"))
inputs, outputs = [], []
for case in public_test_cases:
inputs.append(case["input"])
outputs.append(case["output"])
sample = {
"input_output": json.dumps({
"inputs": inputs,
"outputs": outputs
}),
}
samples_list.append(sample)
generations_list.append([pred_code])
item['Pred_Answer'] = pred_code
item['Question'] = input_prompt
# Call codegen_metrics with pass@1
metrics, results, final_metadata = codegen_metrics(
samples_list,
generations_list,
k_list=[1], # Evaluate the top 1 generated result
num_process_evaluate=2, # Parallel evaluation
timeout=10, # Set timeout to 10 seconds
            debug=False,  # Debug mode disabled
)
# print('samples_list', samples_list)
# print('generations_list', generations_list)
# print('metrics', metrics)
# Extract pass@1
pass_at_1 = metrics.get('pass@1', 0.0)
detail_pass_at_1 = metrics['detail']['pass@1']
for item, pass1, res, meta in zip(filtered_data, detail_pass_at_1.values(), results.values(), final_metadata):
item['Metrics'] = {'pass@1': pass1}
item['Results'] = res
item['Final_metadata'] = meta
# Initialize per-difficulty metrics
difficulty_metrics = defaultdict(list)
for idx, difficulty in enumerate(difficulties):
pass1 = detail_pass_at_1[idx]
difficulty_metrics[difficulty].append(pass1)
# Compute overall pass@1
overall_metrics = {
'pass@1': pass_at_1, # / num_valid_answer * len(input_list),
'num_valid_answer': f'{num_valid_answer} of {len(input_list)}',
'query_latency': f'{(total_time / len(input_list) * 1000):.0f} ms',
}
# Compute per-difficulty pass@1
per_difficulty_metrics = {}
for difficulty, passes in difficulty_metrics.items():
avg_pass = np.mean(passes) if len(passes) > 0 else 0.0
num_valid_answer = per_difficulty_count[difficulty]
per_difficulty_metrics[difficulty] = {
'pass@1': avg_pass,
'num_valid_answer': f'{num_valid_answer} of {len(passes)}'
}
# Save the metrics
final_metrics = {
'overall': overall_metrics,
'per_domain': per_difficulty_metrics
}
else:
# Existing evaluation for other datasets
avg_em, avg_acc, avg_f1, avg_math = [], [], [], []
num_valid_answer = 0
# If the dataset is GPQA, track metrics per domain
domain_metrics = {}
for item, input_prompt, result in zip(filtered_data, input_list, output_list):
            if isinstance(result, str):
item['Output'] = result
else:
item['Output'] = result.outputs[0].text
if dataset_name in ['gpqa', 'medmcqa']:
labeled_answer = item["Correct Choice"]
# labeled_choice_answer = item["Correct Answer"]
mode = 'choose'
elif dataset_name in ['math500', 'aime', 'amc']:
labeled_answer = item["answer"]
mode = 'gen'
elif dataset_name in ['nq', 'triviaqa', 'hotpotqa', 'musique', 'bamboogle', '2wiki']:
labeled_answer = item["answer"]
mode = 'qa'
elif dataset_name in ['pubhealth']:
labeled_answer = item["answer"]
mode = 'choose'
else:
raise ValueError(f"Unknown dataset_name: {dataset_name}")
metric, pred_answer = evaluate_predictions(output=item['Output'], labeled_answer=labeled_answer, mode=mode)
item['Pred_Answer'] = pred_answer
item['Metrics'] = metric
item['Question'] = input_prompt
# Determine the validity of the predicted answer
my_method_valid = (pred_answer != '' and not (mode == 'choose' and dataset_name == 'gpqa' and len(pred_answer) > 1))
avg_em.append(metric['em'])
avg_acc.append(metric['acc'])
avg_f1.append(metric['f1'])
avg_math.append(metric['math_equal'])
if my_method_valid:
num_valid_answer += 1
# If the dataset is GPQA, attempt to track metrics per domain
if dataset_name == 'gpqa':
domain = item.get("High-level domain", "Unknown")
if domain not in domain_metrics:
domain_metrics[domain] = {'em': [], 'acc': [], 'f1': [], 'math_equal': [], 'num_valid_answer': 0, 'total_num': 0}
domain_metrics[domain]['total_num'] += 1
domain_metrics[domain]['em'].append(metric['em'])
domain_metrics[domain]['acc'].append(metric['acc'])
domain_metrics[domain]['f1'].append(metric['f1'])
domain_metrics[domain]['math_equal'].append(metric['math_equal'])
if my_method_valid:
domain_metrics[domain]['num_valid_answer'] += 1
t = time.localtime()
result_json_name = f'{split}.{t.tm_mon}.{t.tm_mday},{t.tm_hour}:{t.tm_min}.json'
metrics_json_name = f'{split}.{t.tm_mon}.{t.tm_mday},{t.tm_hour}:{t.tm_min}.metrics.json'
# Compute overall metrics
overall_results = {
'em': np.mean(avg_em) if len(avg_em) > 0 else 0.0,
'acc': np.mean(avg_acc) if len(avg_acc) > 0 else 0.0,
'f1': np.mean(avg_f1) if len(avg_f1) > 0 else 0.0,
            'math_equal': np.mean(avg_math) if len(avg_math) > 0 else 0.0,
'num_valid_answer': f'{num_valid_answer} of {len(input_list)}',
'query_latency': f'{(total_time / len(input_list) * 1000):.0f} ms',
}
# If the dataset is GPQA, output average metrics per domain
domain_avg_metrics = {}
if dataset_name == 'gpqa':
for dm, m in domain_metrics.items():
domain_avg_metrics[dm] = {
'em': np.mean(m['em']) if len(m['em']) > 0 else 0,
'acc': np.mean(m['acc']) if len(m['acc']) > 0 else 0,
'f1': np.mean(m['f1']) if len(m['f1']) > 0 else 0,
'math_equal': np.mean(m['math_equal']) if len(m['math_equal']) > 0 else 0,
'num_valid_answer': f'{m["num_valid_answer"]} of {m["total_num"]}'
}
        # Save overall and per-domain metrics
final_metrics = {'overall': overall_results}
if dataset_name == 'gpqa':
final_metrics['per_domain'] = domain_avg_metrics
t = time.localtime()
result_json_name = f'{split}.{t.tm_mon}.{t.tm_mday},{t.tm_hour}:{t.tm_min}.json'
metrics_json_name = f'{split}.{t.tm_mon}.{t.tm_mday},{t.tm_hour}:{t.tm_min}.metrics.json'
if apply_backoff:
result_json_name = output_dir
metrics_json_name = output_dir.replace('.json', '.metrics.backoff.json')
# Save prediction results and metrics
with open(os.path.join(output_dir, result_json_name), mode='w', encoding='utf-8') as json_file:
json.dump(filtered_data, json_file, indent=4, ensure_ascii=False)
with open(os.path.join(output_dir, metrics_json_name), mode='w', encoding='utf-8') as json_file:
json.dump(final_metrics, json_file, indent=4, ensure_ascii=False)
if __name__ == "__main__":
import argparse
# Parse command-line arguments for flexibility
parser = argparse.ArgumentParser(description="Evaluate model outputs with optional backoff.")
parser.add_argument('--output_path', type=str, required=True, help='Path to the model output JSON file.')
parser.add_argument('--output_metrics_path', type=str, help='Path to save the evaluation metrics.')
parser.add_argument('--apply_backoff', action='store_true', help='Enable backoff to normal outputs if main output is invalid.')
args = parser.parse_args()
output_path = args.output_path
if args.output_metrics_path:
output_metrics_path = args.output_metrics_path
else:
output_metrics_path = output_path.replace('.json', '.metrics.json')
# Determine dataset name based on the output path
# NOTE: To apply back off strategy for retrieval-augmented reasoning methods, please replace normal_output_path with your actual path for results with run_direct_gen.
if 'gpqa' in output_path:
dataset_name = 'gpqa'
normal_output_path = './outputs/gpqa.qwq.direct/diamond.12.13,18:23.json'
if 'extended' in output_path:
normal_output_path = './outputs/gpqa.qwq.direct/extended.12.28,15:44.json'
if 'qwq' not in output_path:
normal_output_path = './outputs/runs.baselines/gpqa.qwen2.5-32b-instruct.direct/diamond.12.14,20:34.json'
elif 'math500' in output_path:
dataset_name = 'math500'
normal_output_path = './outputs/math500.qwq.direct/test.12.13,18:26.json'
if 'qwq' not in output_path:
normal_output_path = './outputs/runs.baselines/math500.qwen2.5-32b-instruct.direct/test.12.15,10:43.json'
elif 'aime' in output_path:
dataset_name = 'aime'
normal_output_path = './outputs/aime.qwq.direct/2024.12.13,19:36.json'
if 'qwq' not in output_path:
normal_output_path = './outputs/runs.baselines/aime.qwen2.5-32b-instruct.direct/test.12.14,20:28.json'
elif 'amc' in output_path:
dataset_name = 'amc'
normal_output_path = './outputs/amc.qwq.direct/test.12.14,14:31.json'
if 'qwq' not in output_path:
normal_output_path = './outputs/runs.baselines/amc.qwen2.5-32b-instruct.direct/test.12.14,20:26.json'
elif 'livecode' in output_path:
dataset_name = 'livecode'
normal_output_path = './outputs/livecode.qwq.direct/test.12.13,21:24.json'
if 'qwq' not in output_path:
normal_output_path = './outputs/runs.baselines/livecode.qwen2.5-32b-instruct.direct/test.12.14,20:32.json'
elif 'nq' in output_path:
dataset_name = 'nq'
normal_output_path = './outputs/runs.qa/nq.qwq.direct/test.12.15,14:50.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif 'triviaqa' in output_path:
dataset_name = 'triviaqa'
normal_output_path = './outputs/runs.qa/triviaqa.qwq.direct/test.12.15,15:35.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif 'hotpotqa' in output_path:
dataset_name = 'hotpotqa'
normal_output_path = './outputs/runs.qa/hotpotqa.qwq.direct/test.12.15,14:52.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif 'musique' in output_path:
dataset_name = 'musique'
normal_output_path = './outputs/runs.qa/musique.qwq.direct/test.12.27,16:44.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif 'bamboogle' in output_path:
dataset_name = 'bamboogle'
normal_output_path = './outputs/runs.qa/bamboogle.qwq.direct/test.12.28,9:51.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif '2wiki' in output_path:
dataset_name = '2wiki'
normal_output_path = './outputs/runs.qa/2wiki.qwq.direct/test.12.15,15:32.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif 'medmcqa' in output_path:
dataset_name = 'medmcqa'
normal_output_path = './outputs/runs.qa/medmcqa.qwq.direct/test.12.15,16:57.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif 'pubhealth' in output_path:
dataset_name = 'pubhealth'
normal_output_path = './outputs/runs.qa/pubhealth.qwq.direct/test.12.15,20:32.json'
if 'qwq' not in output_path:
normal_output_path = ''
# Load main output data
with open(output_path, mode='r', encoding='utf-8') as file:
data = json.load(file)
# Load main metrics data
with open(output_metrics_path, mode='r', encoding='utf-8') as file:
metrics = json.load(file)
# Extract existing metrics
if 'overall' in metrics:
query_latency = metrics['overall']['query_latency']
original_num_valid_answer = metrics['overall']['num_valid_answer']
else:
query_latency = metrics.get('query_latency', 'N/A')
original_num_valid_answer = metrics.get('num_valid_answer', 'N/A')
# Load normal output data if backoff is enabled
normal_data = None
if args.apply_backoff:
if not os.path.exists(normal_output_path):
raise FileNotFoundError(f"Normal output file not found at: {normal_output_path}")
with open(normal_output_path, mode='r', encoding='utf-8') as file:
normal_data = json.load(file)
if dataset_name != 'livecode':
# Existing evaluation for non-livecode datasets
avg_em, avg_acc, avg_f1, avg_math = [], [], [], []
num_valid_answer = 0
# Initialize per-domain metrics
domain_metrics = {}
for i, item in enumerate(data):
if dataset_name in ['gpqa', 'medmcqa']:
labeled_answer = item["Correct Choice"]
domain = item.get("High-level domain", "Unknown")
mode = 'choose'
elif dataset_name == 'math500':
labeled_answer = item["answer"]
domain = item.get("level", "Unknown")
mode = 'gen'
elif dataset_name in ['aime', 'amc']:
labeled_answer = item["answer"]
mode = 'gen'
domain = 'Unknown'
elif dataset_name in ['nq', 'triviaqa', 'hotpotqa', 'musique', 'bamboogle', '2wiki']:
labeled_answer = item["answer"]
mode = 'qa'
domain = 'Unknown'
elif dataset_name in ['pubhealth']:
labeled_answer = item["answer"]
mode = 'choose'
domain = 'Unknown'
else:
raise ValueError(f"Unsupported dataset: {dataset_name}")
output = item['Output']
metric, pred_answer = evaluate_predictions(
output=output,
labeled_answer=labeled_answer,
mode=mode,
)
# Determine if the main method's answer is valid
my_method_valid = (pred_answer != '' and not (mode == 'choose' and dataset_name == 'gpqa' and len(pred_answer) > 1))
# If invalid and backoff is enabled, use normal method's output
if args.apply_backoff and not my_method_valid and normal_data is not None:
normal_item = normal_data[i]
if dataset_name in ['gpqa', 'medmcqa']:
normal_labeled_answer = normal_item["Correct Choice"]
normal_mode = 'choose'
elif dataset_name == 'math500':
normal_labeled_answer = normal_item["answer"]
normal_mode = 'gen'
elif dataset_name in ['aime', 'amc']:
normal_labeled_answer = normal_item["answer"]
normal_mode = 'gen'
elif dataset_name in ['nq', 'triviaqa', 'hotpotqa', 'musique', 'bamboogle', '2wiki']:
normal_labeled_answer = normal_item["answer"]
normal_mode = 'qa'
elif dataset_name in ['pubhealth']:
normal_labeled_answer = normal_item["answer"]
normal_mode = 'choose'
else:
raise ValueError(f"Unsupported dataset for backoff: {dataset_name}")
normal_output = normal_item['Output']
normal_metric, normal_pred_answer = evaluate_predictions(
output=normal_output,
labeled_answer=normal_labeled_answer,
mode=normal_mode,
)
normal_valid = (normal_pred_answer != '' and not (normal_mode == 'choose' and dataset_name == 'gpqa' and len(normal_pred_answer) > 1))
# Use normal method's result if valid
if normal_valid:
metric = normal_metric
pred_answer = normal_pred_answer
my_method_valid = True
# Track metrics per domain
if domain not in domain_metrics:
domain_metrics[domain] = {'em': [], 'acc': [], 'f1': [], 'math_equal': [], 'num_valid_answer': 0, 'total_num': 0}
domain_metrics[domain]['total_num'] += 1
avg_em.append(metric['em'])
avg_acc.append(metric['acc'])
avg_f1.append(metric['f1'])
avg_math.append(metric['math_equal'])
domain_metrics[domain]['em'].append(metric['em'])
domain_metrics[domain]['acc'].append(metric['acc'])
domain_metrics[domain]['f1'].append(metric['f1'])
domain_metrics[domain]['math_equal'].append(metric['math_equal'])
if my_method_valid:
num_valid_answer += 1
domain_metrics[domain]['num_valid_answer'] += 1
# Compute overall metrics
overall_metrics = {
'em': np.mean(avg_em) if len(avg_em) > 0 else 0,
'acc': np.mean(avg_acc) if len(avg_acc) > 0 else 0,
'f1': np.mean(avg_f1) if len(avg_f1) > 0 else 0,
'math_equal': np.mean(avg_math) if len(avg_math) > 0 else 0,
'num_valid_answer': f'{num_valid_answer} of {len(data)}',
'query_latency': query_latency,
}
if args.apply_backoff:
overall_metrics['original_num_valid_answer'] = original_num_valid_answer
# Compute per-domain metrics
domain_avg_metrics = {}
for dm, m in domain_metrics.items():
domain_avg_metrics[dm] = {
'em': np.mean(m['em']) if len(m['em']) > 0 else 0,
'acc': np.mean(m['acc']) if len(m['acc']) > 0 else 0,
'f1': np.mean(m['f1']) if len(m['f1']) > 0 else 0,
'math_equal': np.mean(m['math_equal']) if len(m['math_equal']) > 0 else 0,
'num_valid_answer': f'{m["num_valid_answer"]} of {m["total_num"]}',
}
# Prepare final metrics
final_metrics = {'overall': overall_metrics}
if dataset_name == 'gpqa':
final_metrics['per_domain'] = domain_avg_metrics
else:
# Evaluation and backoff for livecode dataset
split = 'test' # Modify as needed or extract from output_path
if args.apply_backoff and normal_data is not None:
# Apply backoff by replacing invalid outputs with normal outputs
for i, item in enumerate(data):
# Extract Pred_Answer from main output
pred_answer = item['Pred_Answer']
# Check if Pred_Answer is invalid
if pred_answer == '':
# Replace Output with normal output
item['Output'] = normal_data[i]['Output']
# Prepare input_list and output_list for run_evaluation
input_list = [item['Question'] for item in data]
output_list = [item['Output'] for item in data]
# Estimate total_time (if available). Here, set to 0 as a placeholder.
total_time = 0 # Modify if timing information is available
# Run evaluation
run_evaluation(
filtered_data=data,
input_list=input_list,
output_list=output_list,
dataset_name=dataset_name,
output_dir=output_path,
total_time=total_time,
split=split,
apply_backoff=True,
)
# run_evaluation handles saving the metrics for livecode
# Save metrics for non-livecode datasets
if dataset_name != 'livecode' or not args.apply_backoff:
# If dataset is livecode and backoff was applied, metrics are already saved by run_evaluation
if args.apply_backoff:
output_metrics_path = output_metrics_path.replace('.json', '.backoff.json')
with open(output_metrics_path, mode='w', encoding='utf-8') as json_file:
json.dump(final_metrics, json_file, indent=4, ensure_ascii=False)
print(f"Evaluation completed. Metrics saved to {output_metrics_path}")
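# Example invocation (illustrative; the script name and paths are placeholders):
#   python evaluate.py --output_path ./outputs/math500.qwq.search_o1/test.json --apply_backoff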
from lcb_runner.benchmarks.code_generation import (
CodeGenerationProblem,
load_code_generation_dataset,
load_code_generation_dataset_not_fast,
)
from lcb_runner.benchmarks.test_output_prediction import (
TestOutputPredictionProblem,
load_test_prediction_dataset,
)
from lcb_runner.benchmarks.code_execution import (
CodeExecutionProblem,
load_code_execution_dataset,
)
import json
from enum import Enum
from datetime import datetime
from dataclasses import dataclass
from datasets import load_dataset
@dataclass
class CodeExecutionProblem:
question_id: str
contest_id: str
contest_date: datetime
difficulty: str
function_name: str
code: str
input: str
output: str
id: str
problem_id: str
numsteps: int
def __post_init__(self):
pass
def insert_output(self, output_list: list[str], pred_list: list[str]) -> dict:
return {
"question_id": self.question_id,
"contest_id": self.contest_id,
"contest_date": self.contest_date.isoformat(),
"difficulty": self.difficulty,
"function_name": self.function_name,
"code": self.code,
"input": self.input,
"output": self.output,
"id": self.id,
"problem_id": self.problem_id,
"numsteps": self.numsteps,
"output_list": output_list,
"pred_list": pred_list,
}
def insert_output_evaluation(
self, output_list: list[str], code_list: list[str], graded_list: list[bool]
) -> dict:
output = self.insert_output(output_list, code_list)
output["graded_list"] = graded_list
output["pass@1"] = graded_list.count(True) / len(graded_list)
return output
def get_evaluation_sample(self) -> dict:
return {
"code": self.code,
"input": self.input,
"output": self.output,
}
def load_code_execution_dataset(release_version="release_v1") -> list[CodeExecutionProblem]:
dataset = load_dataset("livecodebench/execution-v2", split="test")
dataset = [CodeExecutionProblem(**p) for p in dataset] # type: ignore
print(f"Loaded {len(dataset)} problems")
return dataset
if __name__ == "__main__":
dataset = load_code_execution_dataset()
import json
import zlib
import pickle
import base64
from enum import Enum
from datetime import datetime
from dataclasses import dataclass
from datasets import load_dataset
class Platform(Enum):
LEETCODE = "leetcode"
CODEFORCES = "codeforces"
ATCODER = "atcoder"
class Difficulty(Enum):
EASY = "easy"
MEDIUM = "medium"
HARD = "hard"
class TestType(Enum):
STDIN = "stdin"
FUNCTIONAL = "functional"
@dataclass
class Test:
input: str
output: str
testtype: TestType
def __post_init__(self):
self.testtype = TestType(self.testtype)
# if self.testtype == TestType.FUNCTIONAL:
# self.input = json.loads(self.input)
# self.output = json.loads(self.output)
@dataclass
class CodeGenerationProblem:
question_title: str
question_content: str
platform: Platform
question_id: str
contest_id: str
contest_date: datetime
starter_code: str
difficulty: Difficulty
public_test_cases: list[Test]
private_test_cases: list[Test]
metadata: dict
def __post_init__(self):
self.platform = Platform(self.platform)
self.difficulty = Difficulty(self.difficulty)
self.contest_date = datetime.fromisoformat(self.contest_date)
self.public_test_cases = json.loads(self.public_test_cases) # type: ignore
self.public_test_cases = [Test(**t) for t in self.public_test_cases]
try:
self.private_test_cases = json.loads(self.private_test_cases) # type: ignore
except:
self.private_test_cases = json.loads(
pickle.loads(
zlib.decompress(
base64.b64decode(self.private_test_cases.encode("utf-8")) # type: ignore
)
)
) # type: ignore
self.private_test_cases = [Test(**t) for t in self.private_test_cases]
self.metadata = json.loads(self.metadata) # type: ignore
def insert_output(self, output_list: list[str], code_list: list[str]) -> dict:
return {
"question_title": self.question_title,
"question_content": self.question_content,
"platform": self.platform.value,
"question_id": self.question_id,
"contest_id": self.contest_id,
"contest_date": self.contest_date.isoformat(),
"starter_code": self.starter_code,
"difficulty": self.difficulty.value,
"output_list": output_list,
"code_list": code_list,
}
def insert_output_evaluation(
self,
output_list: list[str],
code_list: list[str],
graded_list: list[bool],
**kwargs,
) -> dict:
output = self.insert_output(output_list, code_list)
output["graded_list"] = graded_list
output["pass@1"] = graded_list.count(True) / len(graded_list)
for k, v in kwargs.items():
output[k] = v
return output
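    # Builds the sample dict consumed by codegen_metrics; "fn_name" comes from
    # the problem metadata and stays None for stdin-style problems.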
def get_evaluation_sample(self):
return {
"input_output": json.dumps(
{
"inputs": [
t.input
for t in self.public_test_cases + self.private_test_cases
],
"outputs": [
t.output
for t in self.public_test_cases + self.private_test_cases
],
"fn_name": self.metadata.get("func_name", None),
}
),
}
def load_code_generation_dataset(release_version="release_v1") -> list[CodeGenerationProblem]:
dataset = load_dataset("livecodebench/code_generation_lite", split="test", version_tag=release_version, trust_remote_code=True)
dataset = [CodeGenerationProblem(**p) for p in dataset] # type: ignore
print(f"Loaded {len(dataset)} problems")
return dataset
def load_code_generation_dataset_not_fast(release_version="release_v1") -> list[CodeGenerationProblem]:
dataset = load_dataset("livecodebench/code_generation", split="test")
dataset = [CodeGenerationProblem(**p) for p in dataset] # type: ignore
print(f"Loaded {len(dataset)} problems")
return dataset
if __name__ == "__main__":
dataset = load_code_generation_dataset()
import json
from enum import Enum
from datetime import datetime
from dataclasses import dataclass
from datasets import load_dataset
@dataclass
class Test:
input: str
output: str
testtype: str
@dataclass
class TestOutputPredictionProblem:
question_title: str
question_content: str
question_id: str
contest_id: str
contest_date: datetime
difficulty: str
test: list[Test]
starter_code: str
function_name: str
test_id: int
def __post_init__(self):
self.test = [Test(**t) for t in json.loads(self.test)] # type: ignore
def insert_output(self, output_list: list[str], pred_list: list[str]) -> dict:
return {
"question_title": self.question_title,
"question_content": self.question_content,
"question_id": self.question_id,
"contest_id": self.contest_id,
"contest_date": self.contest_date.isoformat(),
"difficulty": self.difficulty,
"output_list": output_list,
"pred_list": pred_list,
"test_id": self.test_id,
"function_name": self.function_name,
"starter_code": self.starter_code,
}
def insert_output_evaluation(
self, output_list: list[str], code_list: list[str], graded_list: list[bool]
) -> dict:
output = self.insert_output(output_list, code_list)
output["graded_list"] = graded_list
output["pass@1"] = graded_list.count(True) / len(graded_list)
return output
def get_evaluation_sample(self) -> dict:
return {
"input": self.question_content,
"output": self.test[0].output,
}
def load_test_prediction_dataset(release_version="release_v1") -> list[TestOutputPredictionProblem]:
dataset = load_dataset("livecodebench/test_generation", split="test") # type: ignore
dataset = [TestOutputPredictionProblem(**d) for d in dataset]
print(f"Loaded {len(dataset)} prediction problems")
return dataset
if __name__ == "__main__":
dataset = load_test_prediction_dataset()
from lcb_runner.evaluation.compute_code_generation_metrics import codegen_metrics
from lcb_runner.evaluation.compute_code_execution_metrics import code_execution_metrics
from lcb_runner.evaluation.compute_test_output_prediction_metrics import (
test_output_metrics,
)
from lcb_runner.evaluation.pass_k_utils import extract_instance_results