convert_to_csv.py 1.88 KB
Newer Older
mashun1's avatar
mashun1 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import pandas as pd

from pathlib import Path
from typing import Optional


def convert_to_csv(data_root: str,
                   image_subfolder: str = "",
                   text_subfolder: str = "",
                   save_path: Optional[str] = None,
                   counts: Optional[int] = 10000) -> None:
    """Pair images with same-named caption files and write the pairs to a CSV.

    Images (``*.png``, ``*.jpg``, ``*.jpeg``, ``*.bmp``) are looked up directly
    inside ``data_root / image_subfolder`` and captions (``*.txt``) directly
    inside ``data_root / text_subfolder``. An image and a caption are paired
    when their file stems are equal. The resulting CSV has two columns:
    ``filepath`` (path of the image) and ``title`` (caption text with
    surrounding whitespace stripped).

    Args:
        data_root: Directory that contains the image and text subfolders.
        image_subfolder: Subfolder of ``data_root`` holding the images; the
            empty string means ``data_root`` itself.
        text_subfolder: Subfolder of ``data_root`` holding the caption files;
            the empty string means ``data_root`` itself.
        save_path: Destination CSV path. ``".csv"`` is appended when missing.
        counts: Maximum number of pairs to write; ``None`` writes all pairs.

    Raises:
        ValueError: If ``save_path`` is ``None``.
    """
    if save_path is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("save_path is required: pass the destination CSV path")

    save_path = str(save_path)
    if not save_path.endswith(".csv"):
        save_path += ".csv"

    root = Path(data_root)
    images_root = root / image_subfolder
    texts_root = root / text_subfolder

    # When several images share a stem, the extension listed last wins.
    image_by_stem = {
        path.stem: path
        for pattern in ("*.png", "*.jpg", "*.jpeg", "*.bmp")
        for path in images_root.glob(pattern)
    }
    text_by_stem = {path.stem: path for path in texts_root.glob("*.txt")}

    # Sort before truncating: set order depends on string hashing, so an
    # unsorted slice would select a different subset on every run.
    stems = sorted(image_by_stem.keys() & text_by_stem.keys())[:counts]

    rows = []
    for stem in stems:
        # Read as UTF-8 explicitly so captions do not depend on the locale.
        title = text_by_stem[stem].read_text(encoding="utf-8").strip()
        rows.append({"filepath": str(image_by_stem[stem]), "title": title})

    # Explicit columns keep the header row even when no pair was found.
    df = pd.DataFrame(rows, columns=["filepath", "title"])
    df.to_csv(save_path, index=False)


if __name__ == "__main__":
    # Command-line entry point: parse the arguments and build the CSV.
    import argparse

    parser = argparse.ArgumentParser()

    # Both paths are mandatory; marking them required turns a missing option
    # into an argparse usage error instead of a traceback on None.
    parser.add_argument("--data_root", type=str, required=True,
                        help="存有images和texts文件夹的目录")

    parser.add_argument("--image_subfolder", type=str, default="")

    parser.add_argument("--text_subfolder", type=str, default="")

    parser.add_argument("--save_path", type=str, required=True,
                        help="csv保存路径")

    # Exposes the function's `counts` limit; the default matches the
    # function's own default, so existing invocations behave as before.
    parser.add_argument("--counts", type=int, default=10000,
                        help="maximum number of image/text pairs to write")

    args = parser.parse_args()

    convert_to_csv(args.data_root,
                   args.image_subfolder,
                   args.text_subfolder,
                   args.save_path,
                   counts=args.counts)