"""Build a JSON-lines index of image/caption pairs found in a directory.

Each ``<name>.jpg`` in the source directory is expected to have a sibling
``<name>.txt`` whose first line is the caption. One JSON object per usable
pair is written to the output file.
"""
from pathlib import Path
import glob
import os
import time
import pdb
import shutil
import tarfile
import json


def find_images(path):
    """Return the paths of all ``*.jpg`` files directly inside *path*.

    Args:
        path: Directory to search (not searched recursively).

    Returns:
        A list of matching file paths, each prefixed with *path*, in the
        order ``glob.glob`` yields them.
    """
    # Glob patterns of the image formats to look for.
    image_formats = ['*.jpg']

    images = []
    for pattern in image_formats:
        images.extend(glob.glob(os.path.join(path, pattern)))
    return images


# Directory holding the paired .jpg / .txt files; replace with your own path.
path_to_search = './laion2B-multi-chinese-data/image-txt-all'


def build_jsonl(image_dir, output_path='data.json'):
    """Write one JSON line per captioned ``.jpg`` in *image_dir*.

    For every ``<name>.jpg`` the first line of ``<name>.txt`` (stripped of
    surrounding whitespace) is used as the caption. Images whose caption file
    is missing or completely empty are reported on stdout and skipped.

    Args:
        image_dir: Directory containing the image and caption files.
        output_path: File to create or overwrite with the JSON lines.

    Returns:
        The number of records written.
    """
    num = 0
    written = 0
    with open(output_path, 'w', encoding='utf-8') as jsonl_file:
        # os.listdir returns entries in arbitrary order; sort so that the
        # generated file is reproducible across runs.
        for img in sorted(os.listdir(image_dir)):
            if not img.endswith('.jpg'):
                continue

            num += 1
            if num % 1000 == 0:
                print(f'Processing {num}')

            text_path = img[:-len('.jpg')] + '.txt'
            try:
                with open(os.path.join(image_dir, text_path), 'r',
                          encoding='utf-8') as text_file:
                    # Only the first line is used as the caption, so there is
                    # no need to read the rest of the file.
                    first_line = text_file.readline()
            except FileNotFoundError:
                # A missing caption must not abort the whole run.
                print(f'Missing caption file, skipping: {text_path}')
                continue

            # readline() returns '' only when the file has no content at all.
            if not first_line:
                print('++' * 20, text_path)
                continue

            data = {
                'image_path': os.path.join(image_dir, img),
                'text': first_line.strip(),
            }
            jsonl_file.write(json.dumps(data) + '\n')
            written += 1
    return written


def main():
    """Index ``path_to_search`` into ``data.json`` and report completion."""
    build_jsonl(path_to_search, 'data.json')
    print("data.json 文件已生成。")


if __name__ == '__main__':
    main()