data_pre_precess.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data preprocess for GPQA\n",
    "import csv\n",
    "import json\n",
    "import random\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Paths to data\n",
    "data_path = './GPQA/original_data/gpqa_extended.csv'\n",
    "output_path = './GPQA/extended.json'\n",
    "\n",
    "# Fixed seed: the A-D ordering (and therefore 'Correct Choice') must be the same\n",
    "# on every run, otherwise each execution writes a different benchmark file.\n",
    "# A private Random instance is used so the global random state is left untouched.\n",
    "SHUFFLE_SEED = 42\n",
    "rng = random.Random(SHUFFLE_SEED)\n",
    "\n",
    "# CSV columns holding the four answer options\n",
    "answer_keys = [\n",
    "    'Correct Answer',\n",
    "    'Incorrect Answer 1',\n",
    "    'Incorrect Answer 2',\n",
    "    'Incorrect Answer 3'\n",
    "]\n",
    "\n",
    "# Define the keys we want to keep ('id' is added to each row below)\n",
    "keys_to_keep = ['id', 'Question', 'Subdomain', 'High-level domain'] + answer_keys\n",
    "\n",
    "# Letters assigned, in order, to the shuffled answers\n",
    "choices = ['A', 'B', 'C', 'D']\n",
    "\n",
    "filtered_data = []\n",
    "# newline='' is what the csv module expects, so that line breaks inside quoted\n",
    "# fields are handled by the csv parser rather than by the file object.\n",
    "with open(data_path, mode='r', encoding='utf-8', newline='') as csv_file:\n",
    "    csv_reader = csv.DictReader(csv_file)\n",
    "    for idx, row in enumerate(tqdm(csv_reader)):\n",
    "        # Add a 0-based id field, then keep only the desired keys\n",
    "        row['id'] = idx\n",
    "        filtered_row = {key: row[key] for key in keys_to_keep}\n",
    "\n",
    "        # Pair each answer with its source column and shuffle the pairs\n",
    "        answers = [(key, filtered_row[key]) for key in answer_keys]\n",
    "        rng.shuffle(answers)\n",
    "\n",
    "        # Assign choices A, B, C, D in shuffled order and record the correct one\n",
    "        formatted_answers = []\n",
    "        correct_choice = None\n",
    "        for choice, (label, answer) in zip(choices, answers):\n",
    "            formatted_answers.append((choice, answer))\n",
    "            if label == 'Correct Answer':\n",
    "                correct_choice = choice\n",
    "\n",
    "        # Append the lettered choices to the Question field\n",
    "        formatted_choices = \"\\n\".join([f\"({choice}) {answer}\" for choice, answer in formatted_answers])\n",
    "        filtered_row['Question'] = f\"{filtered_row['Question']} Choices:\\n{formatted_choices}\\n\"\n",
    "\n",
    "        # Add the Correct Choice field\n",
    "        filtered_row['Correct Choice'] = correct_choice\n",
    "\n",
    "        # Append the updated row to filtered_data\n",
    "        filtered_data.append(filtered_row)\n",
    "\n",
    "# Write the updated data to JSON\n",
    "with open(output_path, mode='w', encoding='utf-8') as json_file:\n",
    "    json.dump(filtered_data, json_file, indent=4, ensure_ascii=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data preprocess for MATH500\n",
    "import csv\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "\n",
    "test_path = './MATH500/original_data/test.jsonl'\n",
    "output_path = './MATH500/test.json'\n",
    "\n",
    "data_list = []\n",
    "# Read explicitly as UTF-8 (same as the writer below) instead of relying on the\n",
    "# platform default encoding.\n",
    "with open(test_path, 'r', encoding='utf-8') as file:\n",
    "    # Iterate the file lazily; one JSON object is expected per line\n",
    "    for raw_line in file:\n",
    "        if not raw_line.strip():\n",
    "            continue  # tolerate blank lines instead of failing in json.loads\n",
    "        record = json.loads(raw_line)\n",
    "        data_list.append({\n",
    "            # 0-based position among the records kept so far\n",
    "            'id': len(data_list),\n",
    "            'Question': record['problem'],\n",
    "            'solution': record['solution'],\n",
    "            'answer': record['answer'],\n",
    "            'subject': record['subject'],\n",
    "            'level': record['level'],\n",
    "            'unique_id': record['unique_id'],\n",
    "        })\n",
    "\n",
    "# Write the updated data to JSON\n",
    "with open(output_path, mode='w', encoding='utf-8') as json_file:\n",
    "    json.dump(data_list, json_file, indent=4, ensure_ascii=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data preprocess for AIME\n",
    "import csv\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "\n",
    "test_path = './AIME/original_data/aime_2024.json'\n",
    "output_path = './AIME/2024.json'\n",
    "\n",
    "# Read explicitly as UTF-8 (same as the writer below) instead of relying on the\n",
    "# platform default encoding.\n",
    "with open(test_path, 'r', encoding='utf-8') as file:\n",
    "    data = json.load(file)\n",
    "\n",
    "data_list = []\n",
    "for idx, record in enumerate(tqdm(data)):\n",
    "    data_list.append({\n",
    "        'id': idx,  # 0-based position in the source list\n",
    "        'Problem_ID': record['ID'],\n",
    "        'Question': record['Problem'],\n",
    "        'Solution': record['Solution'],\n",
    "        # The answer is always stored as a string\n",
    "        'answer': str(record['Answer']),\n",
    "    })\n",
    "\n",
    "# Write the updated data to JSON\n",
    "with open(output_path, mode='w', encoding='utf-8') as json_file:\n",
    "    json.dump(data_list, json_file, indent=4, ensure_ascii=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data preprocess for AMC\n",
    "import csv\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "\n",
    "test_path = './AMC/original_data/amc_2022_2023.json'\n",
    "output_path = './AMC/test.json'\n",
    "\n",
    "# Read explicitly as UTF-8 (same as the writer below) instead of relying on the\n",
    "# platform default encoding.\n",
    "with open(test_path, 'r', encoding='utf-8') as file:\n",
    "    data = json.load(file)\n",
    "\n",
    "data_list = []\n",
    "for record in tqdm(data):\n",
    "    # Keep only the problems whose URL contains '2023'\n",
    "    if '2023' not in record['url']:\n",
    "        continue\n",
    "    data_list.append({\n",
    "        # ids are consecutive over the kept problems only\n",
    "        'id': len(data_list),\n",
    "        'Question': record['problem'],\n",
    "        # Converted through int() and stored as a string (e.g. 27.0 -> '27');\n",
    "        # assumes every answer is integral -- TODO confirm against the source data\n",
    "        'answer': str(int(record['answer'])),\n",
    "        'url': record['url'],\n",
    "    })\n",
    "\n",
    "# Write the updated data to JSON\n",
    "with open(output_path, mode='w', encoding='utf-8') as json_file:\n",
    "    json.dump(data_list, json_file, indent=4, ensure_ascii=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data preprocess for LiveCodeBench\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "from datetime import datetime\n",
    "\n",
    "def is_valid_date(date_str):\n",
    "    \"\"\"\n",
    "    Check if the given date string is within the range from August 1, 2024, to November 30, 2024.\n",
    "\n",
    "    Both boundary days are included in full: any timestamp from\n",
    "    2024-08-01T00:00:00 through 2024-11-30T23:59:59 is accepted.\n",
    "\n",
    "    Args:\n",
    "        date_str (str): The date string in the format \"%Y-%m-%dT%H:%M:%S\".\n",
    "\n",
    "    Returns:\n",
    "        bool: True if the date is within the specified range, False otherwise\n",
    "            (including when date_str is not a string in the expected format).\n",
    "    \"\"\"\n",
    "    try:\n",
    "        # Parse the date string into a datetime object\n",
    "        date = datetime.strptime(date_str, \"%Y-%m-%dT%H:%M:%S\")\n",
    "    except (ValueError, TypeError):\n",
    "        # Wrong format (ValueError) or not a string at all (TypeError): invalid\n",
    "        return False\n",
    "\n",
    "    # Inclusive lower bound: first instant of August 1\n",
    "    start_date = datetime(2024, 8, 1)\n",
    "    # Exclusive upper bound: first instant of December 1. Comparing against\n",
    "    # datetime(2024, 11, 30) would reject every time after midnight on Nov 30.\n",
    "    end_exclusive = datetime(2024, 12, 1)\n",
    "\n",
    "    return start_date <= date < end_exclusive\n",
    "\n",
    "# Input JSONL files, processed in the order listed\n",
    "test_paths = [\n",
    "    './LiveCodeBench/test.jsonl',\n",
    "    './LiveCodeBench/test2.jsonl',\n",
    "    './LiveCodeBench/test3.jsonl',\n",
    "    './LiveCodeBench/test4.jsonl'\n",
    "]\n",
    "\n",
    "# Destination of the merged JSON file\n",
    "output_path = './LiveCodeBench/test.json'\n",
    "\n",
    "data_list = []\n",
    "seen_questions = set()  # 'question_content' values that were already emitted\n",
    "current_id = 0  # running id shared by all input files\n",
    "\n",
    "for test_path in test_paths:\n",
    "    try:\n",
    "        with open(test_path, 'r', encoding='utf-8') as jsonl_file:\n",
    "            for raw_line in tqdm(jsonl_file, desc=f'Processing {test_path}'):\n",
    "                # Lines that are not valid JSON are skipped\n",
    "                try:\n",
    "                    record = json.loads(raw_line)\n",
    "                except json.JSONDecodeError:\n",
    "                    continue\n",
    "\n",
    "                # Keep only records with a contest date inside the accepted window\n",
    "                contest_date = record.get('contest_date')\n",
    "                if not (contest_date and is_valid_date(contest_date)):\n",
    "                    continue\n",
    "\n",
    "                # Drop records without question text and questions seen before\n",
    "                question_content = record.get('question_content')\n",
    "                if not question_content or question_content in seen_questions:\n",
    "                    continue\n",
    "                seen_questions.add(question_content)\n",
    "\n",
    "                # Build the output entry; optional fields fall back to defaults\n",
    "                entry = {\n",
    "                    'id': current_id,\n",
    "                    'Question': question_content,\n",
    "                    'question_title': record.get('question_title', ''),\n",
    "                    'contest_date': contest_date,\n",
    "                    'difficulty': record.get('difficulty', ''),\n",
    "                    'public_test_cases': record.get('public_test_cases', [])\n",
    "                }\n",
    "                data_list.append(entry)\n",
    "                current_id += 1\n",
    "\n",
    "    except FileNotFoundError:\n",
    "        print(f\"File not found: {test_path}\")\n",
    "    except Exception as exc:\n",
    "        # Any other failure is reported and processing moves on to the next file\n",
    "        print(f\"An error occurred while processing {test_path}: {exc}\")\n",
    "\n",
    "# Write the merged, deduplicated entries to the output JSON file\n",
    "try:\n",
    "    with open(output_path, mode='w', encoding='utf-8') as json_file:\n",
    "        json.dump(data_list, json_file, indent=4, ensure_ascii=False)\n",
    "    print(f\"Data successfully written to {output_path}\")\n",
    "except Exception as exc:\n",
    "    print(f\"Failed to write data to {output_path}: {exc}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data preprocess for FlashRAG ODQA datasets\n",
    "import csv\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "\n",
    "import os\n",
    "\n",
    "dataset_name = 'bamboogle'\n",
    "split = 'test'\n",
    "data_num = 200  # stop once this many examples have been collected\n",
    "\n",
    "test_path = f'./FlashRAG_datasets/{dataset_name}/{split}.jsonl'\n",
    "output_path = f'./QA_Datasets/{dataset_name}.json'\n",
    "\n",
    "data_list = []\n",
    "# Read explicitly as UTF-8 (same as the writer below) instead of relying on the\n",
    "# platform default encoding.\n",
    "with open(test_path, 'r', encoding='utf-8') as file:\n",
    "    # Iterate lazily so lines after the data_num cut-off are never read\n",
    "    for raw_line in tqdm(file):\n",
    "        if not raw_line.strip():\n",
    "            continue  # tolerate blank lines instead of failing in json.loads\n",
    "        record = json.loads(raw_line)\n",
    "        data_list.append({\n",
    "            # 0-based position among the records kept so far\n",
    "            'id': len(data_list),\n",
    "            'Question': record['question'],\n",
    "            'answer': record[\"golden_answers\"],\n",
    "        })\n",
    "        if len(data_list) >= data_num:\n",
    "            break\n",
    "\n",
    "# The output directory is separate from the input tree, so create it if needed\n",
    "os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
    "\n",
    "# Write the updated data to JSON\n",
    "with open(output_path, mode='w', encoding='utf-8') as json_file:\n",
    "    json.dump(data_list, json_file, indent=4, ensure_ascii=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data preprocess for FlashRAG ODQA datasets (All)\n",
    "import csv\n",
    "import json\n",
    "from tqdm import tqdm\n",
    "\n",
    "import os\n",
    "\n",
    "dataset_name = 'musique'\n",
    "split = 'dev'\n",
    "data_num = 100000  # stop once this many examples have been collected\n",
    "\n",
    "test_path = f'./FlashRAG_datasets/{dataset_name}/{split}.jsonl'\n",
    "output_path = f'./QA_Datasets/{dataset_name}.json'\n",
    "\n",
    "data_list = []\n",
    "# Read explicitly as UTF-8 (same as the writer below) instead of relying on the\n",
    "# platform default encoding.\n",
    "with open(test_path, 'r', encoding='utf-8') as file:\n",
    "    # Iterate lazily so lines after the data_num cut-off are never read\n",
    "    for raw_line in tqdm(file):\n",
    "        if not raw_line.strip():\n",
    "            continue  # tolerate blank lines instead of failing in json.loads\n",
    "        record = json.loads(raw_line)\n",
    "        data_list.append({\n",
    "            # 0-based position among the records kept so far\n",
    "            'id': len(data_list),\n",
    "            'Question': record['question'],\n",
    "            'answer': record[\"golden_answers\"],\n",
    "        })\n",
    "        if len(data_list) >= data_num:\n",
    "            break\n",
    "\n",
    "# The output directory is separate from the input tree, so create it if needed\n",
    "os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
    "\n",
    "# Write the updated data to JSON\n",
    "with open(output_path, mode='w', encoding='utf-8') as json_file:\n",
    "    json.dump(data_list, json_file, indent=4, ensure_ascii=False)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}