#!/usr/bin/env python3
"""Command line entry point for IndexKits.

Exposes three sub-commands -- ``show``, ``base`` and ``multireso`` -- and
forwards the parsed options to ``show_index_info``, ``startup`` and
``make_multireso`` respectively.
"""

import argparse

from index_kits.dataset.make_dataset_core import startup, make_multireso
from index_kits.common import show_index_info
from index_kits import __version__


def common_args(parser):
    """Register the options shared by the index-building sub-commands.

    Currently this is only the required ``-t/--target`` save path.

    Args:
        parser: The (sub-)parser to add the options to.
    """
    parser.add_argument('-t', '--target', type=str, required=True, help='Save path')


def get_args(argv=None):
    """Build the command line parser and parse the arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the original behavior.

    Returns:
        The parsed ``argparse.Namespace``. Its ``task`` attribute holds the
        selected sub-command name (``'show'``, ``'base'`` or ``'multireso'``).

    Raises:
        SystemExit: Raised by argparse on ``-h``/``-v``, on invalid arguments,
            or when ``multireso`` is given neither ``--config`` nor ``--src``.
    """
    # NOTE(review): the descriptions are rendered verbatim by
    # RawTextHelpFormatter, so the line breaks inside them are significant.
    parser = argparse.ArgumentParser(description="""
    IndexKits is a tool to build and manage index files for large-scale datasets.
    It supports both base index and multi-resolution index.

    Introduction
    ------------
    This command line tool provides the following functionalities:
    1. Show index v2 information
    2. Build base index v2
    3. Build multi-resolution index v2

    Examples
    --------
    1. Show index v2 information
        index_kits show /path/to/index.json

    2. Build base index v2
        Default usage:
        index_kits base -c /path/to/config.yaml -t /path/to/index.json

        Use multiple processes:
        index_kits base -c /path/to/config.yaml -t /path/to/index.json -w 40

    3. Build multi-resolution index v2
        Build with a configuration file:
        index_kits multireso -c /path/to/config.yaml -t /path/to/index_mb_gt512.json

        Build by specifying arguments without a configuration file:
        index_kits multireso --src /path/to/index.json --base-size 512 --reso-step 32 --min-size 512 -t /path/to/index_mb_gt512.json

        Build by specifying target-ratios:
        index_kits multireso --src /path/to/index.json --base-size 512 --target-ratios 1:1 4:3 3:4 16:9 9:16 --min-size 512 -t /path/to/index_mb_gt512.json

        Build with multiple source index files.
        index_kits multireso --src /path/to/index1.json /path/to/index2.json --base-size 512 --reso-step 32 --min-size 512 -t /path/to/index_mb_gt512.json
    """, formatter_class=argparse.RawTextHelpFormatter)
    sub_parsers = parser.add_subparsers(dest='task', required=True)

    # Show index message
    show_parser = sub_parsers.add_parser('show', description="""
    Show base/multireso index v2 information.

    Example
    -------
    index_kits show /path/to/index.json
    """, formatter_class=argparse.RawTextHelpFormatter)
    show_parser.add_argument('src', type=str, help='Path to a base/multireso index file.')
    show_parser.add_argument('--arrow-files', action='store_true', help='Show arrow files only.')
    show_parser.add_argument('--depth', type=int, default=1,
                             help='Arrow file depth. Default is 1, the level of last folder in the arrow file path. '
                                  'Set it to 0 to show the full path including `xxx/last_folder/*.arrow`.')

    # Single resolution bucket
    base_parser = sub_parsers.add_parser('base', description="""
    Build base index v2.

    Example
    -------
    index_kits base -c /path/to/config.yaml -t /path/to/index.json
    """, formatter_class=argparse.RawTextHelpFormatter)
    base_parser.add_argument('-c', '--config', type=str, required=True, help='Configuration file path')
    common_args(base_parser)
    base_parser.add_argument('-w', '--world-size', type=int, default=1)
    base_parser.add_argument('--work-dir', type=str, default='.', help='Work directory')
    base_parser.add_argument('--use-cache', action='store_true',
                             help='Use cache to avoid reprocessing. '
                                  'Perform merge pkl results directly.')

    # Multi-resolution bucket
    mo_parser = sub_parsers.add_parser('multireso', description="""
    Build multi-resolution index v2

    Example
    -------
    Build with a configuration file:
    index_kits multireso -c /path/to/config.yaml -t /path/to/index_mb_gt512.json

    Build by specifying arguments without a configuration file:
    index_kits multireso --src /path/to/index.json --base-size 512 --reso-step 32 --min-size 512 -t /path/to/index_mb_gt512.json

    Build by specifying target-ratios:
    index_kits multireso --src /path/to/index.json --base-size 512 --target-ratios 1:1 4:3 3:4 16:9 9:16 --min-size 512 -t /path/to/index_mb_gt512.json

    Build with multiple source index files.
    index_kits multireso --src /path/to/index1.json /path/to/index2.json --base-size 512 --reso-step 32 --min-size 512 -t /path/to/index_mb_gt512.json
    """, formatter_class=argparse.RawTextHelpFormatter)
    mo_parser.add_argument('-c', '--config', type=str, default=None,
                           help='Configuration file path in a yaml format. Either --config or --src must be provided.')
    mo_parser.add_argument('-s', '--src', type=str, nargs='+', default=None,
                           help='Source index files. Either --config or --src must be provided.')
    common_args(mo_parser)
    mo_parser.add_argument('--base-size', type=int, default=None,
                           help="Base size. Typically set as 256/512/1024 according to image size you train model.")
    mo_parser.add_argument('--reso-step', type=int, default=None,
                           help="Resolution step. Either reso_step or target_ratios must be provided.")
    mo_parser.add_argument('--target-ratios', type=str, nargs='+', default=None,
                           help="Target ratios. Either reso_step or target_ratios must be provided.")
    mo_parser.add_argument('--md5-file', type=str, default=None,
                           help='You can provide an md5 to height and width file to accelerate the process. '
                                'It is a pickle file that contains a dict, which maps md5 to (height, width) tuple.')
    mo_parser.add_argument('--align', type=int, default=16,
                           help="Used when --target-ratios is provided. Align size of source image height and width.")
    mo_parser.add_argument('--min-size', type=int, default=0,
                           help="Minimum size. Images smaller than this size will be ignored.")

    # Common
    parser.add_argument('-v', '--version', action='version', version=f'%(prog)s {__version__}')

    args = parser.parse_args(argv)

    # The help text promises that one of --config / --src is given; enforce it
    # here so the user gets a usage message instead of a failure further down.
    if args.task == 'multireso' and args.config is None and args.src is None:
        mo_parser.error('Either --config or --src must be provided.')

    return args


def main(argv=None):
    """Parse the command line and run the selected sub-command.

    Args:
        argv: Optional list of argument strings, forwarded to ``get_args``.
            ``None`` means "use ``sys.argv[1:]``".
    """
    args = get_args(argv)
    if args.task == 'show':
        show_index_info(args.src,
                        args.arrow_files,
                        args.depth,
                        )
    elif args.task == 'base':
        startup(args.config,
                args.target,
                args.world_size,
                args.work_dir,
                use_cache=args.use_cache,
                )
    elif args.task == 'multireso':
        make_multireso(args.target,
                       args.config,
                       args.src,
                       args.base_size,
                       args.reso_step,
                       args.target_ratios,
                       args.align,
                       args.min_size,
                       args.md5_file,
                       )


if __name__ == '__main__':
    main()