# Data preprocess for GPQA: turn the raw CSV into shuffled multiple-choice JSON.
import csv
import json
import random

try:
    from tqdm import tqdm
except ImportError:  # fallback so the cell still runs where tqdm is unavailable
    def tqdm(iterable, **kwargs):
        return iterable

# Paths to data
GPQA_DATA_PATH = './GPQA/original_data/gpqa_extended.csv'
GPQA_OUTPUT_PATH = './GPQA/extended.json'
SHUFFLE_SEED = 42  # seed the shuffle so the generated dataset is reproducible

# Columns carried over from the raw CSV into each output record.
KEYS_TO_KEEP = [
    'id',
    'Question',
    'Subdomain',
    'High-level domain',
    'Correct Answer',
    'Incorrect Answer 1',
    'Incorrect Answer 2',
    'Incorrect Answer 3',
]


def format_multiple_choice(row):
    """Build one multiple-choice record from a raw CSV row.

    Shuffles the four answers into choices (A)-(D), appends the formatted
    choices to the question text, and records which letter ended up holding
    the correct answer.

    Args:
        row (dict): A CSV row that already carries an 'id' field plus every
            column named in KEYS_TO_KEEP.

    Returns:
        dict: The filtered row with a formatted 'Question' string and a
            'Correct Choice' letter in {'A', 'B', 'C', 'D'}.
    """
    item = {key: row[key] for key in KEYS_TO_KEEP}

    # Shuffle the answers so the correct one is not always in position A.
    answers = [
        ('Correct Answer', item['Correct Answer']),
        ('Incorrect Answer 1', item['Incorrect Answer 1']),
        ('Incorrect Answer 2', item['Incorrect Answer 2']),
        ('Incorrect Answer 3', item['Incorrect Answer 3']),
    ]
    random.shuffle(answers)

    # Assign letters in shuffled order and remember where the correct answer landed.
    correct_choice = None
    formatted_answers = []
    for letter, (label, answer) in zip('ABCD', answers):
        formatted_answers.append((letter, answer))
        if label == 'Correct Answer':
            correct_choice = letter

    formatted_choices = "\n".join(f"({letter}) {answer}" for letter, answer in formatted_answers)
    item['Question'] = f"{item['Question']} Choices:\n{formatted_choices}\n"
    item['Correct Choice'] = correct_choice
    return item


def preprocess_gpqa(data_path=GPQA_DATA_PATH, output_path=GPQA_OUTPUT_PATH):
    """Read the GPQA CSV, format every row, and dump the result as JSON."""
    random.seed(SHUFFLE_SEED)  # deterministic choice order across runs
    filtered_data = []
    with open(data_path, mode='r', encoding='utf-8') as csv_file:
        for idx, row in enumerate(tqdm(csv.DictReader(csv_file))):
            row['id'] = idx  # sequential id, assigned before filtering columns
            filtered_data.append(format_multiple_choice(row))

    with open(output_path, mode='w', encoding='utf-8') as json_file:
        json.dump(filtered_data, json_file, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    preprocess_gpqa()
# Data preprocess for MATH500
import json

try:
    from tqdm import tqdm
except ImportError:  # fallback so the cell still runs where tqdm is unavailable
    def tqdm(iterable, **kwargs):
        return iterable

MATH500_TEST_PATH = './MATH500/original_data/test.jsonl'
MATH500_OUTPUT_PATH = './MATH500/test.json'


def preprocess_math500(test_path=MATH500_TEST_PATH, output_path=MATH500_OUTPUT_PATH):
    """Convert the MATH500 JSONL test split into a JSON list with sequential ids.

    Args:
        test_path (str): Input JSONL file, one problem per line.
        output_path (str): Destination JSON file (indented, UTF-8).
    """
    data_list = []
    with open(test_path, 'r') as file:
        # Iterate the file lazily; no need to materialize every line up front.
        for idx, raw_line in enumerate(file):
            record = json.loads(raw_line)
            data_list.append({
                'id': idx,
                'Question': record['problem'],
                'solution': record['solution'],
                'answer': record['answer'],
                'subject': record['subject'],
                'level': record['level'],
                'unique_id': record['unique_id'],
            })

    with open(output_path, mode='w', encoding='utf-8') as json_file:
        json.dump(data_list, json_file, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    preprocess_math500()


# Data preprocess for AIME
AIME_TEST_PATH = './AIME/original_data/aime_2024.json'
AIME_OUTPUT_PATH = './AIME/2024.json'


def preprocess_aime(test_path=AIME_TEST_PATH, output_path=AIME_OUTPUT_PATH):
    """Convert the AIME 2024 JSON problems into a JSON list with sequential ids.

    Args:
        test_path (str): Input JSON file containing a list of problem records.
        output_path (str): Destination JSON file (indented, UTF-8).
    """
    with open(test_path, 'r') as file:
        data = json.load(file)

    data_list = []
    for idx, record in enumerate(tqdm(data)):
        data_list.append({
            'id': idx,
            'Problem_ID': record['ID'],
            'Question': record['Problem'],
            'Solution': record['Solution'],
            # Answers may be stored as ints in the source; normalize to strings.
            'answer': str(record['Answer']),
        })

    with open(output_path, mode='w', encoding='utf-8') as json_file:
        json.dump(data_list, json_file, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    preprocess_aime()
# Data preprocess for AMC (keep 2023 problems only)
import json

try:
    from tqdm import tqdm
except ImportError:  # fallback so the cell still runs where tqdm is unavailable
    def tqdm(iterable, **kwargs):
        return iterable

AMC_INPUT_PATH = './AMC/original_data/amc_2022_2023.json'
AMC_OUTPUT_PATH = './AMC/test.json'


def preprocess_amc(input_path=AMC_INPUT_PATH, output_path=AMC_OUTPUT_PATH):
    """Filter the AMC 2022/2023 dump down to 2023 problems and dump as JSON.

    Records whose 'url' does not mention 2023 are dropped; kept records get
    sequential ids and answers normalized to integer strings.
    """
    with open(input_path, 'r') as file:
        data = json.load(file)

    data_list = []
    for record in tqdm(data):
        if '2023' not in record['url']:
            continue  # keep only the 2023 competition problems
        data_list.append({
            'id': len(data_list),  # id counts kept records, not scanned ones
            'Question': record['problem'],
            # Answers are stored as floats (e.g. 25.0); normalize to '25'.
            'answer': str(int(record['answer'])),
            'url': record['url'],
        })

    with open(output_path, mode='w', encoding='utf-8') as json_file:
        json.dump(data_list, json_file, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    preprocess_amc()


# Data preprocess for LiveCodeBench
from datetime import datetime

# Valid contest window: Aug 1, 2024 through all of Nov 30, 2024.
VALID_START = datetime(2024, 8, 1)
VALID_END = datetime(2024, 12, 1)  # exclusive bound so Nov 30 times count too


def is_valid_date(date_str):
    """
    Check if the given date string is within the range from August 1, 2024,
    to November 30, 2024 (inclusive of the whole final day).

    Args:
        date_str (str): The date string in the format "%Y-%m-%dT%H:%M:%S".

    Returns:
        bool: True if the date is within the specified range, False otherwise
            (including when the string does not parse).
    """
    try:
        date = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S")
    except ValueError:
        # A malformed date string is treated as out of range, not an error.
        return False

    # Exclusive upper bound: previously `<= datetime(2024, 11, 30)` silently
    # rejected any contest on Nov 30 with a nonzero time-of-day.
    return VALID_START <= date < VALID_END
# Input JSONL shards to aggregate into one deduplicated dataset.
test_paths = [
    './LiveCodeBench/test.jsonl',
    './LiveCodeBench/test2.jsonl',
    './LiveCodeBench/test3.jsonl',
    './LiveCodeBench/test4.jsonl'
]

# Path of the aggregated output JSON file (written by the next step).
output_path = './LiveCodeBench/test.json'

data_list = []           # collected unique questions across all shards
seen_questions = set()   # 'question_content' values already emitted
current_id = 0           # next unique id to hand out

for test_path in test_paths:
    try:
        with open(test_path, 'r', encoding='utf-8') as file:
            for line in tqdm(file, desc=f'Processing {test_path}'):
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue  # skip lines that are not valid JSON

                contest_date = record.get('contest_date')
                question_content = record.get('question_content')

                # Keep only dated, in-window questions with content we have
                # not seen before.
                if not contest_date or not is_valid_date(contest_date):
                    continue
                if not question_content or question_content in seen_questions:
                    continue

                seen_questions.add(question_content)
                data_list.append({
                    'id': current_id,
                    'Question': question_content,
                    'question_title': record.get('question_title', ''),
                    'contest_date': contest_date,
                    'difficulty': record.get('difficulty', ''),
                    'public_test_cases': record.get('public_test_cases', [])
                })
                current_id += 1

    except FileNotFoundError:
        print(f"File not found: {test_path}")
    except Exception as e:
        # Best-effort: report and move on to the next shard.
        print(f"An error occurred while processing {test_path}: {e}")
# Write the aggregated and deduplicated LiveCodeBench data to the output file.
if __name__ == '__main__':
    try:
        with open(output_path, mode='w', encoding='utf-8') as json_file:
            json.dump(data_list, json_file, indent=4, ensure_ascii=False)
        print(f"Data successfully written to {output_path}")
    except Exception as e:
        print(f"Failed to write data to {output_path}: {e}")


# Data preprocess for FlashRAG ODQA datasets
import json

try:
    from tqdm import tqdm
except ImportError:  # fallback so the cell still runs where tqdm is unavailable
    def tqdm(iterable, **kwargs):
        return iterable


def preprocess_flashrag(dataset_name, split, data_num,
                        test_path=None, output_path=None):
    """Convert one FlashRAG QA JSONL split into the unified QA JSON format.

    Args:
        dataset_name (str): FlashRAG dataset directory name (e.g. 'bamboogle').
        split (str): Split file stem (e.g. 'test', 'dev').
        data_num (int): Keep at most this many records.
        test_path (str, optional): Override for the input JSONL path.
        output_path (str, optional): Override for the output JSON path.
    """
    test_path = test_path or f'./FlashRAG_datasets/{dataset_name}/{split}.jsonl'
    output_path = output_path or f'./QA_Datasets/{dataset_name}.json'

    data_list = []
    with open(test_path, 'r') as file:
        for idx, raw_line in enumerate(tqdm(file)):
            record = json.loads(raw_line)
            data_list.append({
                'id': idx,
                'Question': record['question'],
                'answer': record["golden_answers"],
            })
            if len(data_list) >= data_num:
                break  # cap the dataset size

    with open(output_path, mode='w', encoding='utf-8') as json_file:
        json.dump(data_list, json_file, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    preprocess_flashrag(dataset_name='bamboogle', split='test', data_num=200)
# Data preprocess for FlashRAG ODQA datasets (All): same conversion as the
# previous cell, configured here for the full musique dev split.
import json

try:
    from tqdm import tqdm
except ImportError:  # fallback so the cell still runs where tqdm is unavailable
    def tqdm(iterable, **kwargs):
        return iterable


def preprocess_flashrag_all(dataset_name, split, data_num,
                            test_path=None, output_path=None):
    """Convert a FlashRAG QA JSONL split into the unified QA JSON format.

    Args:
        dataset_name (str): FlashRAG dataset directory name (e.g. 'musique').
        split (str): Split file stem (e.g. 'dev').
        data_num (int): Keep at most this many records.
        test_path (str, optional): Override for the input JSONL path.
        output_path (str, optional): Override for the output JSON path.
    """
    test_path = test_path or f'./FlashRAG_datasets/{dataset_name}/{split}.jsonl'
    output_path = output_path or f'./QA_Datasets/{dataset_name}.json'

    data_list = []
    with open(test_path, 'r') as file:
        for idx, raw_line in enumerate(tqdm(file)):
            record = json.loads(raw_line)
            data_list.append({
                'id': idx,
                'Question': record['question'],
                'answer': record["golden_answers"],
            })
            if len(data_list) >= data_num:
                break  # cap the dataset size

    with open(output_path, mode='w', encoding='utf-8') as json_file:
        json.dump(data_list, json_file, indent=4, ensure_ascii=False)


if __name__ == '__main__':
    preprocess_flashrag_all(dataset_name='musique', split='dev', data_num=100000)