Commit 71a45080 authored by Shaoshuai Shi's avatar Shaoshuai Shi
Browse files

specify epochs and batch size in the configs

parent e15c16a7
## Getting Started # Getting Started
The dataset configs are located within [tools/cfgs/dataset_configs](tools/cfgs/dataset_configs), The dataset configs are located within [tools/cfgs/dataset_configs](../tools/cfgs/dataset_configs),
and the model configs are located within [tools/cfgs](tools/cfgs) for different datasets, like [tools/cfgs/kitti_models/](tools/cfgs/kitti_models/). and the model configs are located within [tools/cfgs](../tools/cfgs) for different datasets.
## Dataset Preparation ## Dataset Preparation
...@@ -73,34 +73,30 @@ python test.py --cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE} --eval_all ...@@ -73,34 +73,30 @@ python test.py --cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE} --eval_all
* To test with multiple GPUs: * To test with multiple GPUs:
```shell script ```shell script
sh scripts/slurm_test_mgpu.sh ${PARTITION} ${NUM_GPUS} \ sh scripts/dist_test.sh ${NUM_GPUS} \
--cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE} --cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE}
# or # or
sh scripts/dist_test.sh ${NUM_GPUS} \ sh scripts/slurm_test_mgpu.sh ${PARTITION} ${NUM_GPUS} \
--cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE} --cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE}
``` ```
### Train a model ### Train a model
Note that the `--batch_size` depends on the number of your training GPUs, You can optionally add the extra command-line parameters `--batch_size ${BATCH_SIZE}` and `--epochs ${EPOCHS}` to override the `BATCH_SIZE_PER_GPU` and `NUM_EPOCHS` values set in the config file.
please refer to `Model Zoo` of [README.md](../README.md) for the setting of batch_size for different models.
* Train with multiple GPUs:
```shell script
sh scripts/dist_train.sh ${NUM_GPUS} \
--cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE} --epochs 80
```
* Train with multiple machines: * Train with multiple GPUs or multiple machines:
```shell script ```shell script
sh scripts/slurm_train.sh ${PARTITION} ${JOB_NAME} ${NUM_GPUS} \ sh scripts/dist_train.sh ${NUM_GPUS} --cfg_file ${CONFIG_FILE}
--cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE} --epochs 80
# or
sh scripts/slurm_train.sh ${PARTITION} ${JOB_NAME} ${NUM_GPUS} --cfg_file ${CONFIG_FILE}
``` ```
* Train with a single GPU: * Train with a single GPU:
```shell script ```shell script
python train.py --cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE} --epochs 50 python train.py --cfg_file ${CONFIG_FILE}
``` ```
...@@ -10,7 +10,7 @@ All the codes are tested in the following environment: ...@@ -10,7 +10,7 @@ All the codes are tested in the following environment:
### Install `pcdet v0.3` ### Install `pcdet v0.3`
NOTE: Please re-install `pcdet v0.3` by running `python setup.py develop` if you have already installed `pcdet v0.1` previously. NOTE: Please re-install `pcdet v0.3` by running `python setup.py develop` even if you have already installed a previous version.
a. Clone this repository. a. Clone this repository.
```shell ```shell
......
...@@ -110,11 +110,10 @@ def keep_arrays_by_name(gt_names, used_classes): ...@@ -110,11 +110,10 @@ def keep_arrays_by_name(gt_names, used_classes):
return inds return inds
def init_dist_slurm(batch_size, tcp_port, local_rank, backend='nccl'): def init_dist_slurm(tcp_port, local_rank, backend='nccl'):
""" """
modified from https://github.com/open-mmlab/mmdetection modified from https://github.com/open-mmlab/mmdetection
Args: Args:
batch_size:
tcp_port: tcp_port:
backend: backend:
...@@ -134,13 +133,10 @@ def init_dist_slurm(batch_size, tcp_port, local_rank, backend='nccl'): ...@@ -134,13 +133,10 @@ def init_dist_slurm(batch_size, tcp_port, local_rank, backend='nccl'):
dist.init_process_group(backend=backend) dist.init_process_group(backend=backend)
total_gpus = dist.get_world_size() total_gpus = dist.get_world_size()
assert batch_size % total_gpus == 0, 'Batch size should be matched with GPUS: (%d, %d)' % (batch_size, total_gpus) return total_gpus, rank
batch_size_each_gpu = batch_size // total_gpus
rank = dist.get_rank()
return batch_size_each_gpu, rank
def init_dist_pytorch(batch_size, tcp_port, local_rank, backend='nccl'): def init_dist_pytorch(tcp_port, local_rank, backend='nccl'):
if mp.get_start_method(allow_none=True) is None: if mp.get_start_method(allow_none=True) is None:
mp.set_start_method('spawn') mp.set_start_method('spawn')
...@@ -152,10 +148,9 @@ def init_dist_pytorch(batch_size, tcp_port, local_rank, backend='nccl'): ...@@ -152,10 +148,9 @@ def init_dist_pytorch(batch_size, tcp_port, local_rank, backend='nccl'):
rank=local_rank, rank=local_rank,
world_size=num_gpus world_size=num_gpus
) )
assert batch_size % num_gpus == 0, 'Batch size should be matched with GPUS: (%d, %d)' % (batch_size, num_gpus)
batch_size_each_gpu = batch_size // num_gpus
rank = dist.get_rank() rank = dist.get_rank()
return batch_size_each_gpu, rank return num_gpus, rank
def get_dist_info(): def get_dist_info():
if torch.__version__ < '1.0': if torch.__version__ < '1.0':
...@@ -173,6 +168,7 @@ def get_dist_info(): ...@@ -173,6 +168,7 @@ def get_dist_info():
world_size = 1 world_size = 1
return rank, world_size return rank, world_size
def merge_results_dist(result_part, size, tmpdir): def merge_results_dist(result_part, size, tmpdir):
rank, world_size = get_dist_info() rank, world_size = get_dist_info()
os.makedirs(tmpdir, exist_ok=True) os.makedirs(tmpdir, exist_ok=True)
......
...@@ -170,6 +170,9 @@ MODEL: ...@@ -170,6 +170,9 @@ MODEL:
OPTIMIZATION: OPTIMIZATION:
BATCH_SIZE_PER_GPU: 4
NUM_EPOCHS: 80
OPTIMIZER: adam_onecycle OPTIMIZER: adam_onecycle
LR: 0.01 LR: 0.01
WEIGHT_DECAY: 0.01 WEIGHT_DECAY: 0.01
......
...@@ -114,6 +114,9 @@ MODEL: ...@@ -114,6 +114,9 @@ MODEL:
OPTIMIZATION: OPTIMIZATION:
BATCH_SIZE_PER_GPU: 4
NUM_EPOCHS: 80
OPTIMIZER: adam_onecycle OPTIMIZER: adam_onecycle
LR: 0.003 LR: 0.003
WEIGHT_DECAY: 0.01 WEIGHT_DECAY: 0.01
......
...@@ -143,6 +143,9 @@ MODEL: ...@@ -143,6 +143,9 @@ MODEL:
OPTIMIZATION: OPTIMIZATION:
BATCH_SIZE_PER_GPU: 4
NUM_EPOCHS: 80
OPTIMIZER: adam_onecycle OPTIMIZER: adam_onecycle
LR: 0.003 LR: 0.003
WEIGHT_DECAY: 0.01 WEIGHT_DECAY: 0.01
......
...@@ -139,6 +139,9 @@ MODEL: ...@@ -139,6 +139,9 @@ MODEL:
OPTIMIZATION: OPTIMIZATION:
BATCH_SIZE_PER_GPU: 2
NUM_EPOCHS: 80
OPTIMIZER: adam_onecycle OPTIMIZER: adam_onecycle
LR: 0.01 LR: 0.01
WEIGHT_DECAY: 0.01 WEIGHT_DECAY: 0.01
......
...@@ -228,6 +228,9 @@ MODEL: ...@@ -228,6 +228,9 @@ MODEL:
OPTIMIZATION: OPTIMIZATION:
BATCH_SIZE_PER_GPU: 2
NUM_EPOCHS: 80
OPTIMIZER: adam_onecycle OPTIMIZER: adam_onecycle
LR: 0.01 LR: 0.01
WEIGHT_DECAY: 0.01 WEIGHT_DECAY: 0.01
......
...@@ -100,6 +100,9 @@ MODEL: ...@@ -100,6 +100,9 @@ MODEL:
OPTIMIZATION: OPTIMIZATION:
BATCH_SIZE_PER_GPU: 4
NUM_EPOCHS: 80
OPTIMIZER: adam_onecycle OPTIMIZER: adam_onecycle
LR: 0.003 LR: 0.003
WEIGHT_DECAY: 0.01 WEIGHT_DECAY: 0.01
......
...@@ -116,6 +116,9 @@ MODEL: ...@@ -116,6 +116,9 @@ MODEL:
OPTIMIZATION: OPTIMIZATION:
BATCH_SIZE_PER_GPU: 4
NUM_EPOCHS: 80
OPTIMIZER: adam_onecycle OPTIMIZER: adam_onecycle
LR: 0.003 LR: 0.003
WEIGHT_DECAY: 0.01 WEIGHT_DECAY: 0.01
......
...@@ -20,8 +20,8 @@ def parse_config(): ...@@ -20,8 +20,8 @@ def parse_config():
parser = argparse.ArgumentParser(description='arg parser') parser = argparse.ArgumentParser(description='arg parser')
parser.add_argument('--cfg_file', type=str, default=None, help='specify the config for training') parser.add_argument('--cfg_file', type=str, default=None, help='specify the config for training')
parser.add_argument('--batch_size', type=int, default=16, required=False, help='batch size for training') parser.add_argument('--batch_size', type=int, default=None, required=False, help='batch size for training')
parser.add_argument('--epochs', type=int, default=30, required=False, help='number of epochs to train for') parser.add_argument('--epochs', type=int, default=None, required=False, help='number of epochs to train for')
parser.add_argument('--workers', type=int, default=4, help='number of workers for dataloader') parser.add_argument('--workers', type=int, default=4, help='number of workers for dataloader')
parser.add_argument('--extra_tag', type=str, default='default', help='extra tag for this experiment') parser.add_argument('--extra_tag', type=str, default='default', help='extra tag for this experiment')
parser.add_argument('--ckpt', type=str, default=None, help='checkpoint to start from') parser.add_argument('--ckpt', type=str, default=None, help='checkpoint to start from')
...@@ -58,10 +58,14 @@ def main(): ...@@ -58,10 +58,14 @@ def main():
if args.launcher == 'none': if args.launcher == 'none':
dist_train = False dist_train = False
else: else:
args.batch_size, cfg.LOCAL_RANK = getattr(common_utils, 'init_dist_%s' % args.launcher)( total_gpus, cfg.LOCAL_RANK = getattr(common_utils, 'init_dist_%s' % args.launcher)(
args.batch_size, args.tcp_port, args.local_rank, backend='nccl' args.tcp_port, args.local_rank, backend='nccl'
) )
dist_train = True dist_train = True
args.batch_size = cfg.OPTIMIZATION.BATCH_SIZE_PER_GPU if args.batch_size is None else args.batch_size
args.epochs = cfg.OPTIMIZATION.NUM_EPOCHS if args.epochs is None else args.epochs
if args.fix_random_seed: if args.fix_random_seed:
common_utils.set_random_seed(666) common_utils.set_random_seed(666)
...@@ -79,7 +83,6 @@ def main(): ...@@ -79,7 +83,6 @@ def main():
logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list) logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)
if dist_train: if dist_train:
total_gpus = dist.get_world_size()
logger.info('total_batch_size: %d' % (total_gpus * args.batch_size)) logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
for key, val in vars(args).items(): for key, val in vars(args).items():
logger.info('{:16} {}'.format(key, val)) logger.info('{:16} {}'.format(key, val))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment