data_pre_precess_aime.py
# Convert Parquet to JSON (AIME)
import pandas as pd  # used only by the commented-out conversion block below

# The commented-out block below documents how ./aime_2024.json was produced
# from the source Parquet file; it is kept for reference.
'''
# Specify the Parquet file path
# link: https://huggingface.co/datasets/AI-MO/aimo-validation-aime
parquet_file = "./train-00000-of-00001.parquet"

# Use pandas to read the Parquet file
df = pd.read_parquet(parquet_file)

# Filter the DataFrame to keep only rows where '2024_AIME' appears in the 'url' column
filtered_df = df[df['url'].str.contains('2024_AIME', na=False)]

# Print the first few rows of the filtered DataFrame to confirm
print(filtered_df.head())

# Export to a JSON file with indentation
json_file = "./aime_2024.json"
filtered_df.to_json(json_file, orient='records', force_ascii=False, indent=4)

print(f"Filtered data has been saved to {json_file}")
'''
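
# Alternative sketch (an assumption, not part of the original script): the same
# filtered ./aime_2024.json could be produced directly from the Hugging Face Hub
# with the `datasets` library instead of a manual Parquet download. Left
# commented out to avoid adding a dependency.
#
# from datasets import load_dataset
# import json
# ds = load_dataset("AI-MO/aimo-validation-aime", split="train")
# rows = [row for row in ds if "2024_AIME" in row["url"]]
# with open("./aime_2024.json", "w", encoding="utf-8") as f:
#     json.dump(rows, f, indent=4, ensure_ascii=False)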


# Data preprocessing for AIME
import json
from tqdm import tqdm

test_path = './aime_2024.json'
output_path = './test.json'

data_list = []
with open(test_path, 'r', encoding='utf-8') as file:
    data = json.load(file)
    for idx, item in enumerate(tqdm(data)):
        data_list.append({
            'id': idx,
            'Question': item['problem'],
            'Solution': item['solution'],
            # Normalize the answer to a plain integer string (e.g. "073" -> "73").
            'answer': str(int(item['answer'])),
        })

# Write the converted records to the output JSON file
with open(output_path, mode='w', encoding='utf-8') as json_file:
    json.dump(data_list, json_file, indent=4, ensure_ascii=False)
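
# Optional sanity check (illustrative addition, not in the original script):
# reload the exported file and confirm the record count and layout.
with open(output_path, 'r', encoding='utf-8') as check_file:
    exported = json.load(check_file)
print(f"Saved {len(exported)} records to {output_path}")
if exported:
    print("First record:", exported[0])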