"""Merge per-sample annotation JSON files into a single JSON list.

Every ``*.json`` file found in ``directory_path`` becomes one record in the
output list.  Each record receives an ``id`` key (the file name without its
extension) placed first, and every ``*.jpg`` reference found in any string
value is prefixed with ``image_path_prefix``.
"""
import json
import os
import re
from collections import OrderedDict

# Directory holding the freshly annotated data; this script is meant to live
# one level above it.
directory_path = 'saves/'

# Accumulates the merged records (filled in by main()).
merged_data = []

# Prefix prepended to every image reference.  Setting it to the absolute path
# of the training images makes the paths easy to resolve during fine-tuning.
image_path_prefix = 'pathtoyourimages/'

# Matches a ``*.jpg`` reference.  '<' and '>' are excluded so that, for tagged
# text such as ``<img>a.jpg</img>``, the match starts right after the tag and
# the prefix is inserted in front of the path rather than in front of the tag.
# For strings without angle brackets this behaves like ``(.*?\.jpg)``.
img_pattern = re.compile(r'([^<>\n]*?\.jpg)')

# Name and path of the merged output file (written to the working directory).
output_filename = 'merged_data.json'
output_filepath = output_filename


def _prefix_image_paths(text):
    """Return *text* with ``image_path_prefix`` put before each jpg match."""
    # A callable replacement inserts the prefix literally; a template string
    # would interpret backslashes in the prefix (e.g. Windows paths) as
    # regex escape sequences.
    return img_pattern.sub(lambda m: image_path_prefix + m.group(1), text)


def update_img_paths(obj):
    """Recursively prefix image paths in every string inside *obj*, in place.

    Args:
        obj: A dict (including ``OrderedDict``) or list, possibly nested.
            Dict values and list items that are strings are rewritten;
            nested dicts/lists are descended into; anything else is left
            untouched.  Dict keys are never modified.
    """
    if isinstance(obj, dict):
        # Re-assigning existing keys does not change the dict's size, so it
        # is safe to do while iterating over items().
        for key, value in obj.items():
            if isinstance(value, (dict, list)):
                update_img_paths(value)
            elif isinstance(value, str):
                obj[key] = _prefix_image_paths(value)
    elif isinstance(obj, list):
        for i, item in enumerate(obj):
            if isinstance(item, (dict, list)):
                update_img_paths(item)
            elif isinstance(item, str):
                obj[i] = _prefix_image_paths(item)


def merge_json_files(directory):
    """Load every ``*.json`` file in *directory* and return the merged records.

    Files are processed in sorted name order so the result is reproducible
    (``os.listdir`` itself returns entries in arbitrary order).  Files that
    are not valid JSON, or whose top-level value is not a JSON object, are
    reported on stdout and skipped.

    Args:
        directory: Path of the directory to scan (not recursive).

    Returns:
        A list of ``OrderedDict`` records, each starting with an ``id`` key.
    """
    records = []
    for filename in sorted(os.listdir(directory)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(directory, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = json.load(file, object_pairs_hook=OrderedDict)
        except json.JSONDecodeError as e:
            print(f"Error reading {filename}: {e}")
            continue
        if not isinstance(content, dict):
            # Only JSON objects can be merged key-by-key into a record.
            print(f"Skipping {filename}: top-level JSON value is not an object")
            continue

        # Insert 'id' first so it leads the record; if the file carries its
        # own 'id', update() overwrites the value but keeps the position.
        new_content = OrderedDict()
        new_content['id'] = os.path.splitext(filename)[0]
        new_content.update(content)
        update_img_paths(new_content)
        records.append(new_content)
    return records


def main():
    """Merge the files in ``directory_path`` and write ``output_filepath``."""
    merged_data.extend(merge_json_files(directory_path))

    with open(output_filepath, 'w', encoding='utf-8') as output_file:
        json.dump(merged_data, output_file, ensure_ascii=False, indent=4)

    print(f"Merge complete. Combined file created at {output_filepath}")


if __name__ == '__main__':
    main()