# Data preprocess for GPQA
#
# Reads the GPQA CSV export, keeps a fixed subset of columns, shuffles the
# four answer options of every question, appends them to the question text
# as choices (A)-(D), records which letter is the correct one, and writes
# the result as a JSON list.
import csv
import json
import random

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; fall back to a no-op wrapper so
    # the script still runs when tqdm is not installed.
    def tqdm(iterable, *args, **kwargs):
        return iterable

# Paths to data
# NOTE(review): the input is the "extended" CSV while the output file is
# named "diamond" -- confirm this pairing is intended.
data_path = './GPQA/gpqa_extended.csv'
output_path = './GPQA/diamond.json'

# Define the keys we want to keep
keys_to_keep = [
    'id',
    'Question',
    'Subdomain',
    'High-level domain',
    'Correct Answer',
    'Incorrect Answer 1',
    'Incorrect Answer 2',
    'Incorrect Answer 3',
]

# Columns holding the four answer options, and the letters assigned to them
# (in order) after shuffling.
answer_keys = (
    'Correct Answer',
    'Incorrect Answer 1',
    'Incorrect Answer 2',
    'Incorrect Answer 3',
)
choice_labels = ('A', 'B', 'C', 'D')


def build_record(row: dict, idx: int, rng=random) -> dict:
    """Turn one CSV row into an output record.

    Args:
        row: Mapping of column name to value, as produced by
            ``csv.DictReader``. It is not modified.
        idx: Zero-based position of the row; stored as the record's ``id``
            (overriding any ``id`` column present in the CSV).
        rng: Object providing ``shuffle``; the ``random`` module itself or a
            ``random.Random`` instance.

    Returns:
        A dict with the ``keys_to_keep`` fields, whose ``Question`` has the
        shuffled choices appended, plus a ``Correct Choice`` letter.

    Raises:
        KeyError: If the row lacks any of the required columns.
    """
    source = dict(row)
    source['id'] = idx

    missing = [key for key in keys_to_keep if key not in source]
    if missing:
        raise KeyError(
            f"CSV row {idx} is missing required column(s): {', '.join(missing)}"
        )

    record = {key: source[key] for key in keys_to_keep}

    # Shuffle the column names rather than the values so the correct option
    # can be located afterwards even if two options have identical text.
    order = list(answer_keys)
    rng.shuffle(order)

    formatted_choices = "\n".join(
        f"({letter}) {record[key]}" for letter, key in zip(choice_labels, order)
    )
    record['Question'] = f"{record['Question']} Choices:\n{formatted_choices}\n"
    record['Correct Choice'] = choice_labels[order.index('Correct Answer')]
    return record


def preprocess_gpqa(csv_path: str = data_path,
                    json_path: str = output_path,
                    seed=None) -> list:
    """Convert the GPQA CSV at ``csv_path`` into a JSON file at ``json_path``.

    Args:
        csv_path: Input CSV file; must contain every column in
            ``keys_to_keep`` except ``id``.
        json_path: Destination JSON file (overwritten).
        seed: Optional seed for the answer shuffle. With ``None`` (default)
            the global ``random`` state is used, so the choice order differs
            between runs; pass an int to get the same order for the same
            input on every run.

    Returns:
        The list of records that was written to ``json_path``.

    Raises:
        KeyError: If a row lacks a required column.
        OSError: If either file cannot be opened.
    """
    rng = random if seed is None else random.Random(seed)

    # newline='' is required by the csv module: without it, line endings
    # embedded in quoted fields (multi-line questions) get rewritten.
    with open(csv_path, mode='r', encoding='utf-8', newline='') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        records = [
            build_record(row, idx, rng)
            for idx, row in enumerate(tqdm(csv_reader))
        ]

    # Write the updated data to JSON
    with open(json_path, mode='w', encoding='utf-8') as json_file:
        json.dump(records, json_file, indent=4, ensure_ascii=False)

    return records


if __name__ == '__main__':
    preprocess_gpqa()