Commit b6edc328 authored by chenzk

v1.0
# torch
transformers
sentencepiece
# vllm
tqdm
nltk
pyext
bs4
pdfplumber
icon.png (53.8 KB)

# Model code
modelCode=1233
# Model name
modelName=search-o1_pytorch
# Model description
modelDescription=Dynamically retrieves and integrates external knowledge, giving open-source models CoT "slow thinking" capability without any training; a reasoning-oriented counterpart of o1.
# Application scenarios
appScenario=Reasoning,dialogue QA,manufacturing,broadcast media,finance,energy,healthcare,smart home,education
# Framework type
frameType=pytorch
import os
import json
import requests
from requests.exceptions import Timeout
from bs4 import BeautifulSoup
from tqdm import tqdm
import time
import concurrent
from concurrent.futures import ThreadPoolExecutor
import pdfplumber
from io import BytesIO
import re
import string
from typing import Optional, Tuple
from nltk.tokenize import sent_tokenize
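# Note: sent_tokenize relies on NLTK's 'punkt' tokenizer data; if it is not
# installed yet, download it once with:  import nltk; nltk.download('punkt')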
# ----------------------- Custom Headers -----------------------
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/58.0.3029.110 Safari/537.36',
'Referer': 'https://www.google.com/',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1'
}
# Initialize session
session = requests.Session()
session.headers.update(headers)
def remove_punctuation(text: str) -> str:
"""Remove punctuation from the text."""
return text.translate(str.maketrans("", "", string.punctuation))
def f1_score(true_set: set, pred_set: set) -> float:
"""Calculate the F1 score between two sets of words."""
intersection = len(true_set.intersection(pred_set))
if not intersection:
return 0.0
precision = intersection / float(len(pred_set))
recall = intersection / float(len(true_set))
return 2 * (precision * recall) / (precision + recall)
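# Illustrative example (not part of the original module): with
# true_set={'dimethyl', 'fumarate'} and pred_set={'fumarate', 'ester'},
# precision = recall = 0.5, so f1_score(...) returns 0.5.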
def extract_snippet_with_context(full_text: str, snippet: str, context_chars: int = 2500) -> Tuple[bool, str]:
"""
Extract the sentence that best matches the snippet and its context from the full text.
Args:
full_text (str): The full text extracted from the webpage.
snippet (str): The snippet to match.
context_chars (int): Number of characters to include before and after the snippet.
Returns:
Tuple[bool, str]: The first element indicates whether extraction was successful, the second element is the extracted context.
"""
try:
full_text = full_text[:50000]
snippet = snippet.lower()
snippet = remove_punctuation(snippet)
snippet_words = set(snippet.split())
best_sentence = None
best_f1 = 0.2
# sentences = re.split(r'(?<=[.!?]) +', full_text) # Split sentences using regex, supporting ., !, ? endings
sentences = sent_tokenize(full_text) # Split sentences using nltk's sent_tokenize
for sentence in sentences:
key_sentence = sentence.lower()
key_sentence = remove_punctuation(key_sentence)
sentence_words = set(key_sentence.split())
f1 = f1_score(snippet_words, sentence_words)
if f1 > best_f1:
best_f1 = f1
best_sentence = sentence
if best_sentence:
para_start = full_text.find(best_sentence)
para_end = para_start + len(best_sentence)
start_index = max(0, para_start - context_chars)
end_index = min(len(full_text), para_end + context_chars)
context = full_text[start_index:end_index]
return True, context
else:
# If no matching sentence is found, return the first context_chars*2 characters of the full text
return False, full_text[:context_chars * 2]
except Exception as e:
return False, f"Failed to extract snippet context due to {str(e)}"
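# Illustrative usage (page_text and the snippet string are placeholders):
#   ok, ctx = extract_snippet_with_context(page_text, "dimethyl fumarate is the methyl ester")
#   context = ctx if ok else ctx[:2500]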
def extract_text_from_url(url, use_jina=False, jina_api_key=None, snippet: Optional[str] = None):
"""
Extract text from a URL. If a snippet is provided, extract the context related to it.
Args:
url (str): URL of a webpage or PDF.
use_jina (bool): Whether to use Jina for extraction.
snippet (Optional[str]): The snippet to search for.
Returns:
str: Extracted text or context.
"""
try:
if use_jina:
jina_headers = {
'Authorization': f'Bearer {jina_api_key}',
'X-Return-Format': 'markdown',
# 'X-With-Links-Summary': 'true'
}
response = requests.get(f'https://r.jina.ai/{url}', headers=jina_headers).text
# Remove URLs
pattern = r"\(https?:.*?\)|\[https?:.*?\]"
            text = re.sub(pattern, "", response).replace('---', '-').replace('===', '=').replace('   ', ' ').replace('  ', ' ')
else:
response = session.get(url, timeout=20) # Set timeout to 20 seconds
response.raise_for_status() # Raise HTTPError if the request failed
# Determine the content type
content_type = response.headers.get('Content-Type', '')
if 'pdf' in content_type:
# If it's a PDF file, extract PDF text
return extract_pdf_text(url)
# Try using lxml parser, fallback to html.parser if unavailable
try:
soup = BeautifulSoup(response.text, 'lxml')
except Exception:
print("lxml parser not found or failed, falling back to html.parser")
soup = BeautifulSoup(response.text, 'html.parser')
text = soup.get_text(separator=' ', strip=True)
if snippet:
success, context = extract_snippet_with_context(text, snippet)
if success:
return context
else:
return text
else:
# If no snippet is provided, return directly
return text[:8000]
except requests.exceptions.HTTPError as http_err:
return f"HTTP error occurred: {http_err}"
except requests.exceptions.ConnectionError:
return "Error: Connection error occurred"
except requests.exceptions.Timeout:
return "Error: Request timed out after 20 seconds"
except Exception as e:
return f"Unexpected error: {str(e)}"
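# Illustrative usage (URL and snippet are placeholders):
#   text = extract_text_from_url("https://example.com", use_jina=False,
#                                snippet="Example Domain")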
def fetch_page_content(urls, max_workers=4, use_jina=False, snippets: Optional[dict] = None):
"""
Concurrently fetch content from multiple URLs.
Args:
urls (list): List of URLs to scrape.
max_workers (int): Maximum number of concurrent threads.
use_jina (bool): Whether to use Jina for extraction.
snippets (Optional[dict]): A dictionary mapping URLs to their respective snippets.
Returns:
dict: A dictionary mapping URLs to the extracted content or context.
"""
results = {}
with ThreadPoolExecutor(max_workers=max_workers) as executor:
# Use tqdm to display a progress bar
futures = {
executor.submit(extract_text_from_url, url, use_jina, snippets.get(url) if snippets else None): url
for url in urls
}
for future in tqdm(concurrent.futures.as_completed(futures), desc="Fetching URLs", total=len(urls)):
url = futures[future]
try:
data = future.result()
results[url] = data
except Exception as exc:
results[url] = f"Error fetching {url}: {exc}"
time.sleep(0.2) # Simple rate limiting
return results
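# Illustrative usage (URLs and snippet text are placeholders):
#   urls = ["https://example.com", "https://example.org"]
#   snippets = {u: "Example Domain" for u in urls}
#   pages = fetch_page_content(urls, max_workers=2, snippets=snippets)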
def bing_web_search(query, subscription_key, endpoint, market='en-US', language='en', timeout=20):
"""
Perform a search using the Bing Web Search API with a set timeout.
Args:
query (str): Search query.
subscription_key (str): Subscription key for the Bing Search API.
endpoint (str): Endpoint for the Bing Search API.
market (str): Market, e.g., "en-US" or "zh-CN".
language (str): Language of the results, e.g., "en".
timeout (int or float or tuple): Request timeout in seconds.
Can be a float representing the total timeout,
or a tuple (connect timeout, read timeout).
Returns:
dict: JSON response of the search results. Returns None or raises an exception if the request times out.
"""
headers = {
"Ocp-Apim-Subscription-Key": subscription_key
}
params = {
"q": query,
"mkt": market,
"setLang": language,
"textDecorations": True,
"textFormat": "HTML"
}
try:
response = requests.get(endpoint, headers=headers, params=params, timeout=timeout)
response.raise_for_status() # Raise exception if the request failed
search_results = response.json()
return search_results
except Timeout:
print(f"Bing Web Search request timed out ({timeout} seconds) for query: {query}")
return {} # Or you can choose to raise an exception
except requests.exceptions.RequestException as e:
print(f"Error occurred during Bing Web Search request: {e}")
return {}
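# Illustrative usage (subscription key is a placeholder; the endpoint matches
# the one used in the __main__ example below):
#   results = bing_web_search(
#       "Structure of dimethyl fumarate",
#       subscription_key="<YOUR_KEY>",
#       endpoint="https://api.bing.microsoft.com/v7.0/search",
#       timeout=(5, 20),  # (connect timeout, read timeout)
#   )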
def extract_pdf_text(url):
"""
Extract text from a PDF.
Args:
url (str): URL of the PDF file.
Returns:
str: Extracted text content or error message.
"""
try:
response = session.get(url, timeout=20) # Set timeout to 20 seconds
if response.status_code != 200:
return f"Error: Unable to retrieve the PDF (status code {response.status_code})"
# Open the PDF file using pdfplumber
with pdfplumber.open(BytesIO(response.content)) as pdf:
full_text = ""
for page in pdf.pages:
text = page.extract_text()
if text:
full_text += text
# Limit the text length
cleaned_text = ' '.join(full_text.split()[:600])
return cleaned_text
except requests.exceptions.Timeout:
return "Error: Request timed out after 20 seconds"
except Exception as e:
return f"Error: {str(e)}"
def extract_relevant_info(search_results):
"""
Extract relevant information from Bing search results.
Args:
search_results (dict): JSON response from the Bing Web Search API.
Returns:
list: A list of dictionaries containing the extracted information.
"""
useful_info = []
if 'webPages' in search_results and 'value' in search_results['webPages']:
for id, result in enumerate(search_results['webPages']['value']):
info = {
'id': id + 1, # Increment id for easier subsequent operations
'title': result.get('name', ''),
'url': result.get('url', ''),
'site_name': result.get('siteName', ''),
'date': result.get('datePublished', '').split('T')[0],
                'snippet': re.sub(r'<[^<]+?>', '', result.get('snippet', '')),  # Remove HTML tags
# Add context content to the information
'context': '' # Reserved field to be filled later
}
useful_info.append(info)
return useful_info
# ------------------------------------------------------------
if __name__ == "__main__":
# Example usage
# Define the query to search
query = "Structure of dimethyl fumarate"
# Subscription key and endpoint for Bing Search API
    BING_SUBSCRIPTION_KEY = os.environ.get("BING_SEARCH_V7_SUBSCRIPTION_KEY", "")
if not BING_SUBSCRIPTION_KEY:
raise ValueError("Please set the BING_SEARCH_V7_SUBSCRIPTION_KEY environment variable.")
bing_endpoint = "https://api.bing.microsoft.com/v7.0/search"
# Perform the search
print("Performing Bing Web Search...")
search_results = bing_web_search(query, BING_SUBSCRIPTION_KEY, bing_endpoint)
print("Extracting relevant information from search results...")
extracted_info = extract_relevant_info(search_results)
print("Fetching and extracting context for each snippet...")
for info in tqdm(extracted_info, desc="Processing Snippets"):
full_text = extract_text_from_url(info['url'], use_jina=True) # Get full webpage text
if full_text and not full_text.startswith("Error"):
success, context = extract_snippet_with_context(full_text, info['snippet'])
if success:
info['context'] = context
else:
info['context'] = f"Could not extract context. Returning first 8000 chars: {full_text[:8000]}"
else:
info['context'] = f"Failed to fetch full text: {full_text}"
# print("Your Search Query:", query)
# print("Final extracted information with context:")
# print(json.dumps(extracted_info, indent=2, ensure_ascii=False))
import re
import json
import numpy as np
from collections import Counter
import string
import os, time
from collections import defaultdict
from lcb_runner.evaluation import codegen_metrics
from utils.math_equivalence import is_equiv
def extract_answer(output, mode='gen'):
extracted_text = ''
if mode == 'codegen':
# Extract the code between ```python and ```
pattern = r'```python\s*(.*?)\s*```'
matches = re.findall(pattern, output, re.DOTALL | re.IGNORECASE)
if matches:
extracted_text = matches[-1].strip() # Take the last match
elif mode == 'infogen':
# Extract content after **Final Information** or **Modified Reasoning Steps**
pattern_info = "\n**Final Information**"
pattern_step = "\n**Modified Reasoning Steps**"
if pattern_info in output:
extracted_text = output.split(pattern_info)[-1].replace("\n","").strip("```").strip()
elif pattern_step in output:
extracted_text = output.split(pattern_step)[-1].strip("```").strip()
else:
extracted_text = "No helpful information found."
else:
# Existing extraction logic for 'gen' and 'choose' modes
pattern = r'\\boxed\{(.*)\}'
matches = re.findall(pattern, output)
if matches:
extracted_text = matches[-1] # Take the last match
if mode in ['choose', 'qa']:
# Handle 'choose' mode
inner_pattern = r'\\text\{(.*)\}'
inner_matches = re.findall(inner_pattern, extracted_text)
if inner_matches:
extracted_text = inner_matches[-1] # Take the last match
extracted_text = extracted_text.strip("()")
return extracted_text
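# Illustrative behavior (not part of the original module):
#   an output containing "\boxed{42}" yields '42' in 'gen' mode;
#   "```python ... ```" blocks yield the last code block in 'codegen' mode;
#   "\boxed{\text{(A)}}" yields 'A' in 'choose' mode (outer parentheses stripped).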
def normalize_answer(text):
text = text.lower()
text = " ".join(text.strip().split())
return text
def normalize_answer_qa(s):
def remove_articles(text):
return re.sub(r"\b(a|an|the)\b", " ", text)
def white_space_fix(text):
return " ".join(text.strip().split())
def remove_punc(text):
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
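# Illustrative example (not part of the original module):
#   normalize_answer_qa("The Eiffel Tower!") -> "eiffel tower"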
def evaluate_predictions(output, labeled_answer, mode='gen'):
final_metric = {"is_valid_answer": False, "acc": 0, "em": 0, "f1": 0, 'math_equal': 0}
pred_answer = extract_answer(output, mode=mode)
if pred_answer != '':
final_metric["is_valid_answer"] = True
if mode == 'qa':
normalized_pred_answer = normalize_answer_qa(pred_answer)
for answer in labeled_answer:
normalized_ground_truth = normalize_answer_qa(answer)
em = int(normalized_pred_answer == normalized_ground_truth)
acc = int(normalized_ground_truth in normalized_pred_answer)
prediction_tokens = normalized_pred_answer.split()
ground_truth_tokens = normalized_ground_truth.split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
continue
precision = 1.0 * num_same / len(prediction_tokens)
recall = 1.0 * num_same / len(ground_truth_tokens)
f1 = (2 * precision * recall) / (precision + recall)
for k in ["em", "acc", "f1"]:
final_metric[k] = max(eval(k), final_metric[k])
else:
normalized_pred_answer = normalize_answer(pred_answer)
normalized_ground_truth = normalize_answer(labeled_answer)
em = int(normalized_pred_answer == normalized_ground_truth)
acc = int(normalized_ground_truth in normalized_pred_answer)
prediction_tokens = normalized_pred_answer.split()
ground_truth_tokens = normalized_ground_truth.split()
common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
num_same = sum(common.values())
if num_same == 0:
f1 = 0
else:
precision = 1.0 * num_same / len(prediction_tokens) if len(prediction_tokens) > 0 else 0
recall = 1.0 * num_same / len(ground_truth_tokens) if len(ground_truth_tokens) > 0 else 0
if (precision + recall) == 0:
f1 = 0
else:
f1 = (2 * precision * recall) / (precision + recall)
final_metric["em"] = em
final_metric["acc"] = acc
final_metric["f1"] = f1
final_metric["math_equal"] = is_equiv(normalized_pred_answer, normalized_ground_truth)
# print(em, acc, f1, normalized_pred_answer, '|', normalized_ground_truth)
return final_metric, pred_answer
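# Illustrative usage (the output string is a made-up example):
#   metrics, pred = evaluate_predictions("So the answer is \\boxed{\\text{Paris}}",
#                                        ["Paris"], mode='qa')
#   -> pred == 'Paris' and metrics['em'] == metrics['acc'] == metrics['f1'] == 1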
def run_evaluation(filtered_data, input_list, output_list, dataset_name, output_dir, total_time, split, apply_backoff=False):
if dataset_name == 'livecode':
# Prepare samples and generations for codegen_metrics
samples_list = []
generations_list = []
# Collect difficulty levels for per-domain metrics
difficulties = []
per_difficulty_count = {}
num_valid_answer = 0
for item, input_prompt, result in zip(filtered_data, input_list, output_list):
            if isinstance(result, str):
item['Output'] = result
else:
item['Output'] = result.outputs[0].text
difficulty = item.get("difficulty", "Unknown")
difficulties.append(difficulty)
# Track metrics per domain
if difficulty not in per_difficulty_count.keys():
per_difficulty_count[difficulty] = 0
pred_code = extract_answer(item['Output'], mode='codegen')
if pred_code != '':
num_valid_answer += 1
per_difficulty_count[difficulty] += 1
# Assuming each item has 'input_output' with 'inputs' and 'outputs'
            public_test_cases = json.loads(item.get("public_test_cases", "[]"))
inputs, outputs = [], []
for case in public_test_cases:
inputs.append(case["input"])
outputs.append(case["output"])
sample = {
"input_output": json.dumps({
"inputs": inputs,
"outputs": outputs
}),
}
samples_list.append(sample)
generations_list.append([pred_code])
item['Pred_Answer'] = pred_code
item['Question'] = input_prompt
# Call codegen_metrics with pass@1
metrics, results, final_metadata = codegen_metrics(
samples_list,
generations_list,
k_list=[1], # Evaluate the top 1 generated result
num_process_evaluate=2, # Parallel evaluation
timeout=10, # Set timeout to 10 seconds
            debug=False,  # Debug mode disabled
)
# print('samples_list', samples_list)
# print('generations_list', generations_list)
# print('metrics', metrics)
# Extract pass@1
pass_at_1 = metrics.get('pass@1', 0.0)
detail_pass_at_1 = metrics['detail']['pass@1']
for item, pass1, res, meta in zip(filtered_data, detail_pass_at_1.values(), results.values(), final_metadata):
item['Metrics'] = {'pass@1': pass1}
item['Results'] = res
item['Final_metadata'] = meta
# Initialize per-difficulty metrics
difficulty_metrics = defaultdict(list)
for idx, difficulty in enumerate(difficulties):
pass1 = detail_pass_at_1[idx]
difficulty_metrics[difficulty].append(pass1)
# Compute overall pass@1
overall_metrics = {
'pass@1': pass_at_1, # / num_valid_answer * len(input_list),
'num_valid_answer': f'{num_valid_answer} of {len(input_list)}',
'query_latency': f'{(total_time / len(input_list) * 1000):.0f} ms',
}
# Compute per-difficulty pass@1
per_difficulty_metrics = {}
for difficulty, passes in difficulty_metrics.items():
avg_pass = np.mean(passes) if len(passes) > 0 else 0.0
num_valid_answer = per_difficulty_count[difficulty]
per_difficulty_metrics[difficulty] = {
'pass@1': avg_pass,
'num_valid_answer': f'{num_valid_answer} of {len(passes)}'
}
# Save the metrics
final_metrics = {
'overall': overall_metrics,
'per_domain': per_difficulty_metrics
}
else:
# Existing evaluation for other datasets
avg_em, avg_acc, avg_f1, avg_math = [], [], [], []
num_valid_answer = 0
# If the dataset is GPQA, track metrics per domain
domain_metrics = {}
for item, input_prompt, result in zip(filtered_data, input_list, output_list):
            if isinstance(result, str):
item['Output'] = result
else:
item['Output'] = result.outputs[0].text
if dataset_name in ['gpqa', 'medmcqa']:
labeled_answer = item["Correct Choice"]
# labeled_choice_answer = item["Correct Answer"]
mode = 'choose'
elif dataset_name in ['math500', 'aime', 'amc']:
labeled_answer = item["answer"]
mode = 'gen'
elif dataset_name in ['nq', 'triviaqa', 'hotpotqa', 'musique', 'bamboogle', '2wiki']:
labeled_answer = item["answer"]
mode = 'qa'
elif dataset_name in ['pubhealth']:
labeled_answer = item["answer"]
mode = 'choose'
else:
raise ValueError(f"Unknown dataset_name: {dataset_name}")
metric, pred_answer = evaluate_predictions(output=item['Output'], labeled_answer=labeled_answer, mode=mode)
item['Pred_Answer'] = pred_answer
item['Metrics'] = metric
item['Question'] = input_prompt
# Determine the validity of the predicted answer
my_method_valid = (pred_answer != '' and not (mode == 'choose' and dataset_name == 'gpqa' and len(pred_answer) > 1))
avg_em.append(metric['em'])
avg_acc.append(metric['acc'])
avg_f1.append(metric['f1'])
avg_math.append(metric['math_equal'])
if my_method_valid:
num_valid_answer += 1
# If the dataset is GPQA, attempt to track metrics per domain
if dataset_name == 'gpqa':
domain = item.get("High-level domain", "Unknown")
if domain not in domain_metrics:
domain_metrics[domain] = {'em': [], 'acc': [], 'f1': [], 'math_equal': [], 'num_valid_answer': 0, 'total_num': 0}
domain_metrics[domain]['total_num'] += 1
domain_metrics[domain]['em'].append(metric['em'])
domain_metrics[domain]['acc'].append(metric['acc'])
domain_metrics[domain]['f1'].append(metric['f1'])
domain_metrics[domain]['math_equal'].append(metric['math_equal'])
if my_method_valid:
domain_metrics[domain]['num_valid_answer'] += 1
t = time.localtime()
result_json_name = f'{split}.{t.tm_mon}.{t.tm_mday},{t.tm_hour}:{t.tm_min}.json'
metrics_json_name = f'{split}.{t.tm_mon}.{t.tm_mday},{t.tm_hour}:{t.tm_min}.metrics.json'
# Compute overall metrics
overall_results = {
'em': np.mean(avg_em) if len(avg_em) > 0 else 0.0,
'acc': np.mean(avg_acc) if len(avg_acc) > 0 else 0.0,
'f1': np.mean(avg_f1) if len(avg_f1) > 0 else 0.0,
            'math_equal': np.mean(avg_math) if len(avg_math) > 0 else 0.0,
'num_valid_answer': f'{num_valid_answer} of {len(input_list)}',
'query_latency': f'{(total_time / len(input_list) * 1000):.0f} ms',
}
# If the dataset is GPQA, output average metrics per domain
domain_avg_metrics = {}
if dataset_name == 'gpqa':
for dm, m in domain_metrics.items():
domain_avg_metrics[dm] = {
'em': np.mean(m['em']) if len(m['em']) > 0 else 0,
'acc': np.mean(m['acc']) if len(m['acc']) > 0 else 0,
'f1': np.mean(m['f1']) if len(m['f1']) > 0 else 0,
'math_equal': np.mean(m['math_equal']) if len(m['math_equal']) > 0 else 0,
'num_valid_answer': f'{m["num_valid_answer"]} of {m["total_num"]}'
}
        # Save overall and per-domain metrics
final_metrics = {'overall': overall_results}
if dataset_name == 'gpqa':
final_metrics['per_domain'] = domain_avg_metrics
t = time.localtime()
result_json_name = f'{split}.{t.tm_mon}.{t.tm_mday},{t.tm_hour}:{t.tm_min}.json'
metrics_json_name = f'{split}.{t.tm_mon}.{t.tm_mday},{t.tm_hour}:{t.tm_min}.metrics.json'
if apply_backoff:
result_json_name = output_dir
metrics_json_name = output_dir.replace('.json', '.metrics.backoff.json')
# Save prediction results and metrics
with open(os.path.join(output_dir, result_json_name), mode='w', encoding='utf-8') as json_file:
json.dump(filtered_data, json_file, indent=4, ensure_ascii=False)
with open(os.path.join(output_dir, metrics_json_name), mode='w', encoding='utf-8') as json_file:
json.dump(final_metrics, json_file, indent=4, ensure_ascii=False)
if __name__ == "__main__":
import argparse
# Parse command-line arguments for flexibility
parser = argparse.ArgumentParser(description="Evaluate model outputs with optional backoff.")
parser.add_argument('--output_path', type=str, required=True, help='Path to the model output JSON file.')
parser.add_argument('--output_metrics_path', type=str, help='Path to save the evaluation metrics.')
parser.add_argument('--apply_backoff', action='store_true', help='Enable backoff to normal outputs if main output is invalid.')
args = parser.parse_args()
output_path = args.output_path
if args.output_metrics_path:
output_metrics_path = args.output_metrics_path
else:
output_metrics_path = output_path.replace('.json', '.metrics.json')
# Determine dataset name based on the output path
# NOTE: To apply back off strategy for retrieval-augmented reasoning methods, please replace normal_output_path with your actual path for results with run_direct_gen.
if 'gpqa' in output_path:
dataset_name = 'gpqa'
normal_output_path = './outputs/gpqa.qwq.direct/diamond.12.13,18:23.json'
if 'extended' in output_path:
normal_output_path = './outputs/gpqa.qwq.direct/extended.12.28,15:44.json'
if 'qwq' not in output_path:
normal_output_path = './outputs/runs.baselines/gpqa.qwen2.5-32b-instruct.direct/diamond.12.14,20:34.json'
elif 'math500' in output_path:
dataset_name = 'math500'
normal_output_path = './outputs/math500.qwq.direct/test.12.13,18:26.json'
if 'qwq' not in output_path:
normal_output_path = './outputs/runs.baselines/math500.qwen2.5-32b-instruct.direct/test.12.15,10:43.json'
elif 'aime' in output_path:
dataset_name = 'aime'
normal_output_path = './outputs/aime.qwq.direct/2024.12.13,19:36.json'
if 'qwq' not in output_path:
normal_output_path = './outputs/runs.baselines/aime.qwen2.5-32b-instruct.direct/test.12.14,20:28.json'
elif 'amc' in output_path:
dataset_name = 'amc'
normal_output_path = './outputs/amc.qwq.direct/test.12.14,14:31.json'
if 'qwq' not in output_path:
normal_output_path = './outputs/runs.baselines/amc.qwen2.5-32b-instruct.direct/test.12.14,20:26.json'
elif 'livecode' in output_path:
dataset_name = 'livecode'
normal_output_path = './outputs/livecode.qwq.direct/test.12.13,21:24.json'
if 'qwq' not in output_path:
normal_output_path = './outputs/runs.baselines/livecode.qwen2.5-32b-instruct.direct/test.12.14,20:32.json'
elif 'nq' in output_path:
dataset_name = 'nq'
normal_output_path = './outputs/runs.qa/nq.qwq.direct/test.12.15,14:50.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif 'triviaqa' in output_path:
dataset_name = 'triviaqa'
normal_output_path = './outputs/runs.qa/triviaqa.qwq.direct/test.12.15,15:35.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif 'hotpotqa' in output_path:
dataset_name = 'hotpotqa'
normal_output_path = './outputs/runs.qa/hotpotqa.qwq.direct/test.12.15,14:52.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif 'musique' in output_path:
dataset_name = 'musique'
normal_output_path = './outputs/runs.qa/musique.qwq.direct/test.12.27,16:44.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif 'bamboogle' in output_path:
dataset_name = 'bamboogle'
normal_output_path = './outputs/runs.qa/bamboogle.qwq.direct/test.12.28,9:51.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif '2wiki' in output_path:
dataset_name = '2wiki'
normal_output_path = './outputs/runs.qa/2wiki.qwq.direct/test.12.15,15:32.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif 'medmcqa' in output_path:
dataset_name = 'medmcqa'
normal_output_path = './outputs/runs.qa/medmcqa.qwq.direct/test.12.15,16:57.json'
if 'qwq' not in output_path:
normal_output_path = ''
elif 'pubhealth' in output_path:
dataset_name = 'pubhealth'
normal_output_path = './outputs/runs.qa/pubhealth.qwq.direct/test.12.15,20:32.json'
if 'qwq' not in output_path:
normal_output_path = ''
# Load main output data
with open(output_path, mode='r', encoding='utf-8') as file:
data = json.load(file)
# Load main metrics data
with open(output_metrics_path, mode='r', encoding='utf-8') as file:
metrics = json.load(file)
# Extract existing metrics
if 'overall' in metrics:
query_latency = metrics['overall']['query_latency']
original_num_valid_answer = metrics['overall']['num_valid_answer']
else:
query_latency = metrics.get('query_latency', 'N/A')
original_num_valid_answer = metrics.get('num_valid_answer', 'N/A')
# Load normal output data if backoff is enabled
normal_data = None
if args.apply_backoff:
if not os.path.exists(normal_output_path):
raise FileNotFoundError(f"Normal output file not found at: {normal_output_path}")
with open(normal_output_path, mode='r', encoding='utf-8') as file:
normal_data = json.load(file)
if dataset_name != 'livecode':
# Existing evaluation for non-livecode datasets
avg_em, avg_acc, avg_f1, avg_math = [], [], [], []
num_valid_answer = 0
# Initialize per-domain metrics
domain_metrics = {}
for i, item in enumerate(data):
if dataset_name in ['gpqa', 'medmcqa']:
labeled_answer = item["Correct Choice"]
domain = item.get("High-level domain", "Unknown")
mode = 'choose'
elif dataset_name == 'math500':
labeled_answer = item["answer"]
domain = item.get("level", "Unknown")
mode = 'gen'
elif dataset_name in ['aime', 'amc']:
labeled_answer = item["answer"]
mode = 'gen'
domain = 'Unknown'
elif dataset_name in ['nq', 'triviaqa', 'hotpotqa', 'musique', 'bamboogle', '2wiki']:
labeled_answer = item["answer"]
mode = 'qa'
domain = 'Unknown'
elif dataset_name in ['pubhealth']:
labeled_answer = item["answer"]
mode = 'choose'
domain = 'Unknown'
else:
raise ValueError(f"Unsupported dataset: {dataset_name}")
output = item['Output']
metric, pred_answer = evaluate_predictions(
output=output,
labeled_answer=labeled_answer,
mode=mode,
)
# Determine if the main method's answer is valid
my_method_valid = (pred_answer != '' and not (mode == 'choose' and dataset_name == 'gpqa' and len(pred_answer) > 1))
# If invalid and backoff is enabled, use normal method's output
if args.apply_backoff and not my_method_valid and normal_data is not None:
normal_item = normal_data[i]
if dataset_name in ['gpqa', 'medmcqa']:
normal_labeled_answer = normal_item["Correct Choice"]
normal_mode = 'choose'
elif dataset_name == 'math500':
normal_labeled_answer = normal_item["answer"]
normal_mode = 'gen'
elif dataset_name in ['aime', 'amc']:
normal_labeled_answer = normal_item["answer"]
normal_mode = 'gen'
elif dataset_name in ['nq', 'triviaqa', 'hotpotqa', 'musique', 'bamboogle', '2wiki']:
normal_labeled_answer = normal_item["answer"]
normal_mode = 'qa'
elif dataset_name in ['pubhealth']:
normal_labeled_answer = normal_item["answer"]
normal_mode = 'choose'
else:
raise ValueError(f"Unsupported dataset for backoff: {dataset_name}")
normal_output = normal_item['Output']
normal_metric, normal_pred_answer = evaluate_predictions(
output=normal_output,
labeled_answer=normal_labeled_answer,
mode=normal_mode,
)
normal_valid = (normal_pred_answer != '' and not (normal_mode == 'choose' and dataset_name == 'gpqa' and len(normal_pred_answer) > 1))
# Use normal method's result if valid
if normal_valid:
metric = normal_metric
pred_answer = normal_pred_answer
my_method_valid = True
# Track metrics per domain
if domain not in domain_metrics:
domain_metrics[domain] = {'em': [], 'acc': [], 'f1': [], 'math_equal': [], 'num_valid_answer': 0, 'total_num': 0}
domain_metrics[domain]['total_num'] += 1
avg_em.append(metric['em'])
avg_acc.append(metric['acc'])
avg_f1.append(metric['f1'])
avg_math.append(metric['math_equal'])
domain_metrics[domain]['em'].append(metric['em'])
domain_metrics[domain]['acc'].append(metric['acc'])
domain_metrics[domain]['f1'].append(metric['f1'])
domain_metrics[domain]['math_equal'].append(metric['math_equal'])
if my_method_valid:
num_valid_answer += 1
domain_metrics[domain]['num_valid_answer'] += 1
# Compute overall metrics
overall_metrics = {
'em': np.mean(avg_em) if len(avg_em) > 0 else 0,
'acc': np.mean(avg_acc) if len(avg_acc) > 0 else 0,
'f1': np.mean(avg_f1) if len(avg_f1) > 0 else 0,
'math_equal': np.mean(avg_math) if len(avg_math) > 0 else 0,
'num_valid_answer': f'{num_valid_answer} of {len(data)}',
'query_latency': query_latency,
}
if args.apply_backoff:
overall_metrics['original_num_valid_answer'] = original_num_valid_answer
# Compute per-domain metrics
domain_avg_metrics = {}
for dm, m in domain_metrics.items():
domain_avg_metrics[dm] = {
'em': np.mean(m['em']) if len(m['em']) > 0 else 0,
'acc': np.mean(m['acc']) if len(m['acc']) > 0 else 0,
'f1': np.mean(m['f1']) if len(m['f1']) > 0 else 0,
'math_equal': np.mean(m['math_equal']) if len(m['math_equal']) > 0 else 0,
'num_valid_answer': f'{m["num_valid_answer"]} of {m["total_num"]}',
}
# Prepare final metrics
final_metrics = {'overall': overall_metrics}
if dataset_name == 'gpqa':
final_metrics['per_domain'] = domain_avg_metrics
else:
# Evaluation and backoff for livecode dataset
split = 'test' # Modify as needed or extract from output_path
if args.apply_backoff and normal_data is not None:
# Apply backoff by replacing invalid outputs with normal outputs
for i, item in enumerate(data):
# Extract Pred_Answer from main output
pred_answer = item['Pred_Answer']
# Check if Pred_Answer is invalid
if pred_answer == '':
# Replace Output with normal output
item['Output'] = normal_data[i]['Output']
# Prepare input_list and output_list for run_evaluation
input_list = [item['Question'] for item in data]
output_list = [item['Output'] for item in data]
# Estimate total_time (if available). Here, set to 0 as a placeholder.
total_time = 0 # Modify if timing information is available
# Run evaluation
run_evaluation(
filtered_data=data,
input_list=input_list,
output_list=output_list,
dataset_name=dataset_name,
output_dir=output_path,
total_time=total_time,
split=split,
apply_backoff=True,
)
# run_evaluation handles saving the metrics for livecode
# Save metrics for non-livecode datasets
if dataset_name != 'livecode' or not args.apply_backoff:
# If dataset is livecode and backoff was applied, metrics are already saved by run_evaluation
if args.apply_backoff:
output_metrics_path = output_metrics_path.replace('.json', '.backoff.json')
with open(output_metrics_path, mode='w', encoding='utf-8') as json_file:
json.dump(final_metrics, json_file, indent=4, ensure_ascii=False)
print(f"Evaluation completed. Metrics saved to {output_metrics_path}")
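# Example invocation (illustrative; the script name and paths are placeholders):
#   python evaluate.py --output_path ./outputs/math500.qwq.search_o1/test.json --apply_backoff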
from lcb_runner.benchmarks.code_generation import (
CodeGenerationProblem,
load_code_generation_dataset,
load_code_generation_dataset_not_fast,
)
from lcb_runner.benchmarks.test_output_prediction import (
TestOutputPredictionProblem,
load_test_prediction_dataset,
)
from lcb_runner.benchmarks.code_execution import (
CodeExecutionProblem,
load_code_execution_dataset,
)
import json
from enum import Enum
from datetime import datetime
from dataclasses import dataclass
from datasets import load_dataset
@dataclass
class CodeExecutionProblem:
question_id: str
contest_id: str
contest_date: datetime
difficulty: str
function_name: str
code: str
input: str
output: str
id: str
problem_id: str
numsteps: int
def __post_init__(self):
pass
def insert_output(self, output_list: list[str], pred_list: list[str]) -> dict:
return {
"question_id": self.question_id,
"contest_id": self.contest_id,
"contest_date": self.contest_date.isoformat(),
"difficulty": self.difficulty,
"function_name": self.function_name,
"code": self.code,
"input": self.input,
"output": self.output,
"id": self.id,
"problem_id": self.problem_id,
"numsteps": self.numsteps,
"output_list": output_list,
"pred_list": pred_list,
}
def insert_output_evaluation(
self, output_list: list[str], code_list: list[str], graded_list: list[bool]
) -> dict:
output = self.insert_output(output_list, code_list)
output["graded_list"] = graded_list
output["pass@1"] = graded_list.count(True) / len(graded_list)
return output
def get_evaluation_sample(self) -> dict:
return {
"code": self.code,
"input": self.input,
"output": self.output,
}
def load_code_execution_dataset(release_version="release_v1") -> list[CodeExecutionProblem]:
dataset = load_dataset("livecodebench/execution-v2", split="test")
dataset = [CodeExecutionProblem(**p) for p in dataset] # type: ignore
print(f"Loaded {len(dataset)} problems")
return dataset
if __name__ == "__main__":
dataset = load_code_execution_dataset()
import json
import zlib
import pickle
import base64
from enum import Enum
from datetime import datetime
from dataclasses import dataclass
from datasets import load_dataset
class Platform(Enum):
LEETCODE = "leetcode"
CODEFORCES = "codeforces"
ATCODER = "atcoder"
class Difficulty(Enum):
EASY = "easy"
MEDIUM = "medium"
HARD = "hard"
class TestType(Enum):
STDIN = "stdin"
FUNCTIONAL = "functional"
@dataclass
class Test:
input: str
output: str
testtype: TestType
def __post_init__(self):
self.testtype = TestType(self.testtype)
# if self.testtype == TestType.FUNCTIONAL:
# self.input = json.loads(self.input)
# self.output = json.loads(self.output)
@dataclass
class CodeGenerationProblem:
question_title: str
question_content: str
platform: Platform
question_id: str
contest_id: str
contest_date: datetime
starter_code: str
difficulty: Difficulty
public_test_cases: list[Test]
private_test_cases: list[Test]
metadata: dict
def __post_init__(self):
self.platform = Platform(self.platform)
self.difficulty = Difficulty(self.difficulty)
self.contest_date = datetime.fromisoformat(self.contest_date)
self.public_test_cases = json.loads(self.public_test_cases) # type: ignore
self.public_test_cases = [Test(**t) for t in self.public_test_cases]
try:
self.private_test_cases = json.loads(self.private_test_cases) # type: ignore
except:
self.private_test_cases = json.loads(
pickle.loads(
zlib.decompress(
base64.b64decode(self.private_test_cases.encode("utf-8")) # type: ignore
)
)
) # type: ignore
self.private_test_cases = [Test(**t) for t in self.private_test_cases]
self.metadata = json.loads(self.metadata) # type: ignore
def insert_output(self, output_list: list[str], code_list: list[str]) -> dict:
return {
"question_title": self.question_title,
"question_content": self.question_content,
"platform": self.platform.value,
"question_id": self.question_id,
"contest_id": self.contest_id,
"contest_date": self.contest_date.isoformat(),
"starter_code": self.starter_code,
"difficulty": self.difficulty.value,
"output_list": output_list,
"code_list": code_list,
}
def insert_output_evaluation(
self,
output_list: list[str],
code_list: list[str],
graded_list: list[bool],
**kwargs,
) -> dict:
output = self.insert_output(output_list, code_list)
output["graded_list"] = graded_list
output["pass@1"] = graded_list.count(True) / len(graded_list)
for k, v in kwargs.items():
output[k] = v
return output
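    # Builds the sample dict consumed by codegen_metrics; "fn_name" comes from
    # the problem metadata and stays None for stdin-style problems.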
def get_evaluation_sample(self):
return {
"input_output": json.dumps(
{
"inputs": [
t.input
for t in self.public_test_cases + self.private_test_cases
],
"outputs": [
t.output
for t in self.public_test_cases + self.private_test_cases
],
"fn_name": self.metadata.get("func_name", None),
}
),
}
def load_code_generation_dataset(release_version="release_v1") -> list[CodeGenerationProblem]:
dataset = load_dataset("livecodebench/code_generation_lite", split="test", version_tag=release_version, trust_remote_code=True)
dataset = [CodeGenerationProblem(**p) for p in dataset] # type: ignore
print(f"Loaded {len(dataset)} problems")
return dataset
def load_code_generation_dataset_not_fast(release_version="release_v1") -> list[CodeGenerationProblem]:
dataset = load_dataset("livecodebench/code_generation", split="test")
dataset = [CodeGenerationProblem(**p) for p in dataset] # type: ignore
print(f"Loaded {len(dataset)} problems")
return dataset
if __name__ == "__main__":
dataset = load_code_generation_dataset()
import json
from enum import Enum
from datetime import datetime
from dataclasses import dataclass
from datasets import load_dataset
@dataclass
class Test:
input: str
output: str
testtype: str
@dataclass
class TestOutputPredictionProblem:
question_title: str
question_content: str
question_id: str
contest_id: str
contest_date: datetime
difficulty: str
test: list[Test]
starter_code: str
function_name: str
test_id: int
def __post_init__(self):
self.test = [Test(**t) for t in json.loads(self.test)] # type: ignore
def insert_output(self, output_list: list[str], pred_list: list[str]) -> dict:
return {
"question_title": self.question_title,
"question_content": self.question_content,
"question_id": self.question_id,
"contest_id": self.contest_id,
"contest_date": self.contest_date.isoformat(),
"difficulty": self.difficulty,
"output_list": output_list,
"pred_list": pred_list,
"test_id": self.test_id,
"function_name": self.function_name,
"starter_code": self.starter_code,
}
def insert_output_evaluation(
self, output_list: list[str], code_list: list[str], graded_list: list[bool]
) -> dict:
output = self.insert_output(output_list, code_list)
output["graded_list"] = graded_list
output["pass@1"] = graded_list.count(True) / len(graded_list)
return output
def get_evaluation_sample(self) -> dict:
return {
"input": self.question_content,
"output": self.test[0].output,
}
def load_test_prediction_dataset(release_version="release_v1") -> list[TestOutputPredictionProblem]:
dataset = load_dataset("livecodebench/test_generation", split="test") # type: ignore
dataset = [TestOutputPredictionProblem(**d) for d in dataset]
print(f"Loaded {len(dataset)} prediction problems")
return dataset
if __name__ == "__main__":
dataset = load_test_prediction_dataset()
from lcb_runner.evaluation.compute_code_generation_metrics import codegen_metrics
from lcb_runner.evaluation.compute_code_execution_metrics import code_execution_metrics
from lcb_runner.evaluation.compute_test_output_prediction_metrics import (
test_output_metrics,
)
from lcb_runner.evaluation.pass_k_utils import extract_instance_results