import os
import json
from collections import OrderedDict
import re
# Directory holding the freshly annotated JSON files; this script is expected
# to live one level above it.
directory_path = 'saves/'
# Accumulates the content of every merged file.
merged_data = []
# Prefix prepended to every image path. An absolute path to the training
# images is recommended so the Qwen fine-tuning job can locate them.
image_path_prefix = 'pathtoyourimages/'
# Matches an image reference whose path ends in ".jpg" and captures the path.
# NOTE(review): the delimiters around the capture group were missing from the
# copy of the file under review (the literal was split across lines);
# "<img>...</img>" is reconstructed from the Qwen fine-tuning context --
# confirm against the annotation format actually used.
img_pattern = re.compile(r'<img>(.*?\.jpg)</img>')


def _prefix_img_path(match):
    """Rebuild one matched image tag with image_path_prefix before its path."""
    return f'<img>{image_path_prefix}{match.group(1)}</img>'


def update_img_paths(obj):
    """Prepend image_path_prefix to every image path found inside *obj*.

    Walks *obj* recursively and rewrites, in place, each string value that
    contains a match of img_pattern. Strings without a match are left
    unchanged, as are non-string scalars.

    Args:
        obj: A dict (including OrderedDict) or list, possibly nested. Any
            other type is ignored.

    Returns:
        None. *obj* is modified in place.
    """
    if isinstance(obj, dict):
        entries = obj.items()
    elif isinstance(obj, list):
        entries = enumerate(obj)
    else:
        return

    # Only values of existing keys/indices are reassigned, so the container
    # size never changes while it is being iterated.
    for key, value in entries:
        if isinstance(value, (dict, list)):
            update_img_paths(value)
        elif isinstance(value, str):
            # A callable replacement is used instead of a template string so
            # that backslashes in image_path_prefix (e.g. Windows paths) are
            # inserted literally rather than parsed as regex escapes.
            obj[key] = img_pattern.sub(_prefix_img_path, value)
# Walk every file in the annotation directory. The listing is sorted so the
# merged output is reproducible: os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(directory_path)):
    # Only JSON files are merged.
    if not filename.endswith('.json'):
        continue
    # Build the full path of the file.
    file_path = os.path.join(directory_path, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            # OrderedDict preserves the key order of the source file.
            content = json.load(file, object_pairs_hook=OrderedDict)
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            # A malformed or wrongly encoded file is reported and skipped so
            # it does not abort the whole merge.
            print(f"Error reading {filename}: {e}")
            continue
    if not isinstance(content, dict):
        # Only JSON objects can be merged under an 'id' key; anything else
        # (array, string, number) would make the update() below fail.
        print(f"Error reading {filename}: top-level JSON value is not an object")
        continue
    # Start a new OrderedDict with 'id' (the file name without extension) so
    # that it is the first key. An 'id' key already present in the file
    # overrides this value but keeps the first position.
    new_content = OrderedDict()
    new_content['id'] = os.path.splitext(filename)[0]
    new_content.update(content)
    update_img_paths(new_content)
    # Add this record to the merged list.
    merged_data.append(new_content)
# Name of the merged JSON file.
output_filename = 'merged_data.json'
# Full output path; the file is written to the current working directory.
output_filepath = output_filename
# Write the merged records, keeping non-ASCII characters unescaped.
with open(output_filepath, 'w', encoding='utf-8') as output_file:
    json.dump(merged_data, output_file, ensure_ascii=False, indent=4)
print(f"Merge complete. Combined file created at {output_filepath}")