
from pathlib import Path
import glob
import os
import time
import pdb
import shutil
import tarfile
import json

def find_images(path, image_formats=('*.jpg',)):
    """Return the files directly inside *path* that match the image patterns.

    Args:
        path: Directory to search. The search is not recursive.
        image_formats: Glob pattern, or iterable of glob patterns, matched
            against the entries of *path*. Defaults to JPEG files only;
            pass e.g. ``('*.jpg', '*.png')`` to collect more formats.

    Returns:
        A list of matching paths, each joined onto *path*. Results are
        grouped by pattern in the order the patterns were given; the order
        within one pattern is whatever ``glob.glob`` yields.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(image_formats, str):
        image_formats = (image_formats,)

    images = []
    for pattern in image_formats:
        images.extend(glob.glob(os.path.join(path, pattern)))
    return images

# Build a caption manifest: one JSON object per line, pairing every .jpg in
# the directory with the first line of its sibling .txt caption file.
path_to_search = './laion2B-multi-chinese-data/image-txt-all'  # replace with your directory path
# NOTE: find_images() is not used here; the directory is listed directly.

# Create (or overwrite) data.json. Despite the extension, the content is
# JSON Lines: one object per line.
num = 0
with open('data.json', 'w', encoding='utf-8') as jsonl_file:
    for img in os.listdir(path_to_search):
        # Match the real extension, not just any name that ends in "jpg".
        if not img.endswith('.jpg'):
            continue

        # Progress counter: counts every image seen, including skipped ones.
        num += 1
        if num % 1000 == 0:
            print(f'Processing {num}')

        # The caption lives next to the image: <stem>.jpg -> <stem>.txt
        text_path = img[:-len('.jpg')] + '.txt'
        try:
            with open(os.path.join(path_to_search, text_path), 'r', encoding='utf-8') as text_file:
                # Only the first line is used, so do not read the whole file.
                first_line = text_file.readline()
        except FileNotFoundError:
            # An image without a caption file is skipped instead of aborting
            # the whole run and leaving a partial manifest.
            print('Missing caption file:', text_path)
            continue

        # readline() returns '' only for a completely empty file.
        if not first_line:
            print('++'*20, text_path)
            continue
        description = first_line.strip()

        # Build the JSON object for this image/caption pair.
        data = {
            'image_path': os.path.join(path_to_search, img),
            'text': description
        }

        # Serialize and append as one line. json.dumps escapes non-ASCII
        # characters by default; the output is still valid JSON.
        jsonl_file.write(json.dumps(data) + '\n')

print("data.json 文件已生成。")