"""Pair image files with same-named caption files and export the pairs as a CSV."""
from pathlib import Path
from typing import Optional

import pandas as pd

# Glob patterns for the image types that are collected. The order matters only
# when several images share a stem: the last pattern that matches wins.
_IMAGE_PATTERNS = ("*.png", "*.jpg", "*.jpeg", "*.bmp")


def convert_to_csv(data_root: str,
                   image_subfolder: str = "",
                   text_subfolder: str = "",
                   save_path: Optional[str] = None,
                   counts: Optional[int] = 10000):
    """Write a ``filepath,title`` CSV for images that have a matching text file.

    An image and a ``.txt`` file are paired when their file names share the
    same stem (name without extension). The stripped content of the text file
    becomes the ``title`` column.

    Args:
        data_root: Directory that contains the image and text sub-folders.
        image_subfolder: Sub-folder of ``data_root`` holding the images; an
            empty string means ``data_root`` itself.
        text_subfolder: Sub-folder of ``data_root`` holding the ``.txt``
            files; an empty string means ``data_root`` itself.
        save_path: Destination of the CSV. ``".csv"`` is appended when the
            path does not already end with it.
        counts: Maximum number of pairs to export (applied as a slice bound);
            ``None`` exports every pair.

    Raises:
        ValueError: If ``save_path`` is not provided.
    """
    if save_path is None:
        # The default used to fall through to ``None.endswith`` and fail with
        # an unhelpful AttributeError.
        raise ValueError("save_path must be provided")
    save_path = str(save_path)
    if not save_path.endswith(".csv"):
        save_path += ".csv"

    data_root = Path(data_root)
    images_root = data_root / image_subfolder
    texts_root = data_root / text_subfolder

    images_by_stem = {
        image_file.stem: image_file
        for pattern in _IMAGE_PATTERNS
        for image_file in images_root.glob(pattern)
    }
    texts_by_stem = {
        text_file.stem: text_file for text_file in texts_root.glob("*.txt")
    }

    # Sort before truncating: set iteration order depends on string hashing,
    # so an unsorted slice would select different rows on every run.
    common_stems = sorted(images_by_stem.keys() & texts_by_stem.keys())
    selected = common_stems[:counts]

    rows = [
        {
            "filepath": str(images_by_stem[stem]),
            # Read as UTF-8 explicitly so non-ASCII captions do not depend on
            # the platform's default locale encoding.
            "title": texts_by_stem[stem].read_text(encoding="utf-8").strip(),
        }
        for stem in selected
    ]

    # Explicit columns keep the header row even when no pairs were found.
    df = pd.DataFrame(rows, columns=["filepath", "title"])
    df.to_csv(save_path, index=False)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_root", type=str, required=True,
                        help="存有images和texts文件夹的目录")
    parser.add_argument("--image_subfolder", type=str, default="")
    parser.add_argument("--text_subfolder", type=str, default="")
    parser.add_argument("--save_path", type=str, required=True,
                        help="csv保存路径")
    parser.add_argument("--counts", type=int, default=10000,
                        help="maximum number of image/text pairs to export")
    args = parser.parse_args()
    convert_to_csv(args.data_root, args.image_subfolder, args.text_subfolder,
                   args.save_path, counts=args.counts)