build_metadata.py 11 KB
Newer Older
weishb's avatar
weishb committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import os
import shutil
import sys
import time
import importlib
import argparse
import pandas as pd
from easydict import EasyDict as edict


def update_metadata(path, opt):
    """Merge record CSVs found under ``path`` into ``path/metadata.csv``.

    Record files are looked up in one of two sub-directories of ``path``
    (both are created if missing):

    * ``new_records/`` (default): every ``part_*.csv`` file is merged and
      then moved to ``merged_records/`` with a ``<unix-timestamp>_`` prefix.
    * ``merged_records/`` (when ``opt.from_merged_records`` is truthy): every
      ``*.csv`` file whose leading ``<timestamp>_`` prefix is
      ``>= opt.record_start`` is merged again. These files are already
      archived, so they are left where they are.

    Records are keyed by their ``sha256`` column. Files are merged in sorted
    name order and each file takes priority over what was merged before it
    (``DataFrame.combine_first``), so the result does not depend on the
    order in which the file system lists the directory.

    Args:
        path: Directory that holds ``metadata.csv`` and the record folders.
        opt: Options object exposing ``from_merged_records`` and, when that
            flag is set, ``record_start`` (``None`` means "no lower bound").

    Returns:
        ``None`` if ``path`` does not exist, or if there is nothing to merge
        and no ``metadata.csv`` yet. If records were merged, the merged
        DataFrame indexed by ``sha256``. If there was nothing to merge, the
        existing ``metadata.csv`` exactly as ``pd.read_csv`` loads it (i.e.
        *not* re-indexed).
    """
    if not os.path.exists(path):
        return None
    timestamp = str(int(time.time()))
    merged_dir = os.path.join(path, 'merged_records')
    new_dir = os.path.join(path, 'new_records')
    metadata_path = os.path.join(path, 'metadata.csv')
    os.makedirs(merged_dir, exist_ok=True)
    os.makedirs(new_dir, exist_ok=True)

    if opt.from_merged_records:
        source_dir = merged_dir
        df_files = [f for f in os.listdir(merged_dir) if f.endswith('.csv')]
        record_start = getattr(opt, 'record_start', None)
        if record_start is not None:
            # Archived files are named "<timestamp>_<original name>".
            df_files = [f for f in df_files if int(f.split('_')[0]) >= record_start]
    else:
        source_dir = new_dir
        df_files = [f for f in os.listdir(new_dir) if f.startswith('part_') and f.endswith('.csv')]
    # os.listdir order is arbitrary, but the merge below is order-sensitive.
    df_files.sort()

    df_parts = []
    for f in df_files:
        try:
            # Read from the directory the file was listed in.
            df_parts.append(pd.read_csv(os.path.join(source_dir, f)))
        except Exception as e:
            # Best effort: one unreadable part must not abort the whole merge.
            print(f"Failed to read {f}: {e}")

    if not df_parts:
        if os.path.exists(metadata_path):
            return pd.read_csv(metadata_path)
        return None

    if os.path.exists(metadata_path):
        metadata = pd.read_csv(metadata_path)
    else:
        # Start from an empty table that has the schema of the first part.
        metadata = pd.DataFrame(columns=df_parts[0].columns)
    metadata.set_index('sha256', inplace=True)
    for df_part in df_parts:
        # Parts without a key column cannot be aligned and are skipped.
        if 'sha256' in df_part.columns:
            df_part.set_index('sha256', inplace=True)
            # Values of the new part win; gaps are filled from `metadata`.
            metadata = df_part.combine_first(metadata)
    metadata.to_csv(metadata_path)

    if not opt.from_merged_records:
        # Archive every listed part (including ones that failed to parse) so
        # it is not picked up again on the next run.
        for f in df_files:
            shutil.move(os.path.join(new_dir, f), os.path.join(merged_dir, f'{timestamp}_{f}'))
    return metadata


if __name__ == '__main__':
    # Usage: python build_metadata.py <dataset> --root <dir> [options]
    #
    # 1. Load (or create via the dataset module) the asset list in
    #    <root>/metadata.csv.
    # 2. For every processing stage, merge its record CSVs into that stage's
    #    own metadata.csv (see update_metadata).
    # 3. Write per-stage counts to <root>/statistics.txt and print them.

    # The first positional argument selects the dataset-specific module.
    dataset_utils = importlib.import_module(f'datasets.{sys.argv[1]}')

    parser = argparse.ArgumentParser()
    parser.add_argument('--root', type=str, required=True,
                        help='Directory to save the metadata')
    parser.add_argument('--download_root', type=str, default=None,
                        help='Directory to save the downloaded files')
    parser.add_argument('--thumbnail_root', type=str, default=None,
                        help='Directory to save the thumbnail files')
    parser.add_argument('--render_cond_root', type=str, default=None,
                        help='Directory to save the render condition files')
    parser.add_argument('--mesh_dump_root', type=str, default=None,
                        help='Directory to save the mesh files')
    parser.add_argument('--pbr_dump_root', type=str, default=None,
                        help='Directory to save the pbr files')
    parser.add_argument('--dual_grid_root', type=str, default=None,
                        help='Directory to save the dual grid files')
    parser.add_argument('--pbr_voxel_root', type=str, default=None,
                        help='Directory to save the pbr voxel files')
    parser.add_argument('--ss_latent_root', type=str, default=None,
                        help='Directory to save the sparse structure latent files')
    parser.add_argument('--shape_latent_root', type=str, default=None,
                        help='Directory to save the shape latent files')
    parser.add_argument('--pbr_latent_root', type=str, default=None,
                        help='Directory to save the pbr latent files')
    parser.add_argument('--field', type=str, default='all',
                        help='Fields to process, separated by commas')
    parser.add_argument('--from_file', action='store_true',
                        help='Build metadata from file instead of from records of processings. ' +
                             'Useful when some processing fail to generate records but file already exists.')
    parser.add_argument('--from_merged_records', action='store_true',
                        help='Build metadata from merged records')
    # Default 0 keeps every archived record, so --from_merged_records works
    # without an explicit --record_start.
    parser.add_argument('--record_start', type=int, default=0,
                        help='With --from_merged_records, only use merged record files whose '
                             'timestamp prefix is at least this value')
    parser.add_argument('--rebuild', action='store_true',
                        help='Rebuild metadata from scratch, ignore existing metadata.')
    dataset_utils.add_args(parser)
    opt = parser.parse_args(sys.argv[2:])
    opt = edict(vars(opt))

    # Every stage-specific root falls back to --root when not given.
    for root_option in ('download_root', 'thumbnail_root', 'render_cond_root',
                        'mesh_dump_root', 'pbr_dump_root', 'dual_grid_root',
                        'pbr_voxel_root', 'ss_latent_root', 'shape_latent_root',
                        'pbr_latent_root'):
        setattr(opt, root_option, getattr(opt, root_option) or opt.root)

    os.makedirs(opt.root, exist_ok=True)

    opt.field = opt.field.split(',')

    # get file list
    if os.path.exists(os.path.join(opt.root, 'metadata.csv')):
        print('Loading previous metadata...')
        metadata = pd.read_csv(os.path.join(opt.root, 'metadata.csv'))
    else:
        metadata = dataset_utils.get_metadata(**opt)
        metadata.to_csv(os.path.join(opt.root, 'metadata.csv'), index=False)

    # merge downloaded
    downloaded_metadata = update_metadata(os.path.join(opt.download_root, 'raw'), opt)

    # merge thumbnails
    thumbnail_metadata = update_metadata(os.path.join(opt.thumbnail_root, 'thumbnails'), opt)

    # merge aesthetic scores
    aesthetic_score_metadata = update_metadata(os.path.join(opt.root, 'aesthetic_scores'), opt)

    # merge render conditions
    render_cond_metadata = update_metadata(os.path.join(opt.render_cond_root, 'renders_cond'), opt)

    # merge mesh dumped
    mesh_dumped_metadata = update_metadata(os.path.join(opt.mesh_dump_root, 'mesh_dumps'), opt)

    # merge pbr dumped
    pbr_dumped_metadata = update_metadata(os.path.join(opt.pbr_dump_root, 'pbr_dumps'), opt)

    # merge asset stats
    asset_stats_metadata = update_metadata(os.path.join(opt.root, 'asset_stats'), opt)

    # merge dual grid: one "dual_grid_<resolution>" directory per resolution.
    # A missing root or a non-numeric suffix is skipped instead of crashing.
    dual_grid_resolutions = set()
    if os.path.isdir(opt.dual_grid_root):
        for entry in os.listdir(opt.dual_grid_root):
            suffix = entry.split('_')[-1]
            if (entry.startswith('dual_grid_') and suffix.isdigit()
                    and os.path.isdir(os.path.join(opt.dual_grid_root, entry))):
                dual_grid_resolutions.add(int(suffix))
    dual_grid_resolutions = sorted(dual_grid_resolutions)
    dual_grid_metadata = {}
    for res in dual_grid_resolutions:
        dual_grid_metadata[res] = update_metadata(os.path.join(opt.dual_grid_root, f'dual_grid_{res}'), opt)

    # merge pbr voxelized: one "pbr_voxels_<resolution>" directory per resolution.
    pbr_voxel_resolutions = set()
    if os.path.isdir(opt.pbr_voxel_root):
        for entry in os.listdir(opt.pbr_voxel_root):
            suffix = entry.split('_')[-1]
            if (entry.startswith('pbr_voxels_') and suffix.isdigit()
                    and os.path.isdir(os.path.join(opt.pbr_voxel_root, entry))):
                pbr_voxel_resolutions.add(int(suffix))
    pbr_voxel_resolutions = sorted(pbr_voxel_resolutions)
    pbr_voxel_metadata = {}
    for res in pbr_voxel_resolutions:
        pbr_voxel_metadata[res] = update_metadata(os.path.join(opt.pbr_voxel_root, f'pbr_voxels_{res}'), opt)

    # merge ss latents: one sub-directory per encoder model. Plain files are
    # ignored because update_metadata needs to create folders below the path.
    ss_latent_models = []
    ss_latent_dir = os.path.join(opt.ss_latent_root, 'ss_latents')
    if os.path.isdir(ss_latent_dir):
        ss_latent_models = sorted(
            name for name in os.listdir(ss_latent_dir)
            if os.path.isdir(os.path.join(ss_latent_dir, name))
        )
    ss_latent_metadata = {}
    for model in ss_latent_models:
        ss_latent_metadata[model] = update_metadata(os.path.join(opt.ss_latent_root, f'ss_latents/{model}'), opt)

    # merge shape latents
    shape_latent_models = []
    shape_latent_dir = os.path.join(opt.shape_latent_root, 'shape_latents')
    if os.path.isdir(shape_latent_dir):
        shape_latent_models = sorted(
            name for name in os.listdir(shape_latent_dir)
            if os.path.isdir(os.path.join(shape_latent_dir, name))
        )
    shape_latent_metadata = {}
    for model in shape_latent_models:
        shape_latent_metadata[model] = update_metadata(os.path.join(opt.shape_latent_root, f'shape_latents/{model}'), opt)

    # merge pbr latents
    pbr_latent_models = []
    pbr_latent_dir = os.path.join(opt.pbr_latent_root, 'pbr_latents')
    if os.path.isdir(pbr_latent_dir):
        pbr_latent_models = sorted(
            name for name in os.listdir(pbr_latent_dir)
            if os.path.isdir(os.path.join(pbr_latent_dir, name))
        )
    pbr_latent_metadata = {}
    for model in pbr_latent_models:
        pbr_latent_metadata[model] = update_metadata(os.path.join(opt.pbr_latent_root, f'pbr_latents/{model}'), opt)

    # statistics
    num_downloaded = downloaded_metadata['local_path'].count() if downloaded_metadata is not None else 0
    with open(os.path.join(opt.root, 'statistics.txt'), 'w') as f:
        f.write('Statistics:\n')
        f.write(f'  - Number of assets: {len(metadata)}\n')
        f.write(f'  - Number of assets downloaded: {num_downloaded}\n')
        if thumbnail_metadata is not None:
            f.write(f'  - Number of assets with thumbnails: {thumbnail_metadata["thumbnailed"].sum()}\n')
        if aesthetic_score_metadata is not None:
            f.write(f'  - Number of assets with aesthetic scores: {aesthetic_score_metadata["aesthetic_score"].count()}\n')
        if render_cond_metadata is not None:
            f.write(f'  - Number of assets with render conditions: {render_cond_metadata["cond_rendered"].count()}\n')
        if mesh_dumped_metadata is not None:
            f.write(f'  - Number of assets with mesh dumped: {mesh_dumped_metadata["mesh_dumped"].sum()}\n')
        if pbr_dumped_metadata is not None:
            f.write(f'  - Number of assets with PBR dumped: {pbr_dumped_metadata["pbr_dumped"].sum()}\n')
        if asset_stats_metadata is not None:
            f.write(f'  - Number of assets with asset stats: {len(asset_stats_metadata)}\n')

        # Stages that exist in several variants (resolution or model name):
        # (title, variant keys, per-variant metadata, flag column to sum).
        variant_groups = [
            ('dual grid', dual_grid_resolutions, dual_grid_metadata, 'dual_grid_converted'),
            ('PBR voxelization', pbr_voxel_resolutions, pbr_voxel_metadata, 'pbr_voxelized'),
            ('sparse structure latents', ss_latent_models, ss_latent_metadata, 'ss_latent_encoded'),
            ('shape latents', shape_latent_models, shape_latent_metadata, 'shape_latent_encoded'),
            ('PBR latents', pbr_latent_models, pbr_latent_metadata, 'pbr_latent_encoded'),
        ]
        for title, variants, tables, column in variant_groups:
            if len(variants) == 0:
                continue
            f.write(f'  - Number of assets with {title}:\n')
            for variant in variants:
                if tables[variant] is not None:
                    f.write(f'    - {variant}: {tables[variant][column].sum()}\n')

    with open(os.path.join(opt.root, 'statistics.txt'), 'r') as f:
        print(f.read())