Commit 71a45080 authored by Shaoshuai Shi

specify epochs and batch size in the configs

parent e15c16a7
-## Getting Started
-The dataset configs are located within [tools/cfgs/dataset_configs](tools/cfgs/dataset_configs),
-and the model configs are located within [tools/cfgs](tools/cfgs) for different datasets, like [tools/cfgs/kitti_models/](tools/cfgs/kitti_models/).
+# Getting Started
+The dataset configs are located within [tools/cfgs/dataset_configs](../tools/cfgs/dataset_configs),
+and the model configs are located within [tools/cfgs](../tools/cfgs) for different datasets.
## Dataset Preparation
@@ -73,34 +73,30 @@ python test.py --cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE} --eval_all
* To test with multiple GPUs:
```shell script
-sh scripts/slurm_test_mgpu.sh ${PARTITION} ${NUM_GPUS} \
+sh scripts/dist_test.sh ${NUM_GPUS} \
    --cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE}
# or
-sh scripts/dist_test.sh ${NUM_GPUS} \
+sh scripts/slurm_test_mgpu.sh ${PARTITION} ${NUM_GPUS} \
    --cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE}
```
### Train a model
-Note that the `--batch_size` depends on the number of your training GPUs,
-please refer to `Model Zoo` of [README.md](../README.md) for the setting of batch_size for different models.
+You can optionally add the command line parameters `--batch_size ${BATCH_SIZE}` and `--epochs ${EPOCHS}` to override the defaults from the config file.
-* Train with multiple GPUs:
+* Train with multiple GPUs or multiple machines:
```shell script
-sh scripts/dist_train.sh ${NUM_GPUS} \
-    --cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE} --epochs 80
-```
+sh scripts/dist_train.sh ${NUM_GPUS} --cfg_file ${CONFIG_FILE}
-* Train with multiple machines:
-```shell script
-sh scripts/slurm_train.sh ${PARTITION} ${JOB_NAME} ${NUM_GPUS} \
-    --cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE} --epochs 80
+# or
+sh scripts/slurm_train.sh ${PARTITION} ${JOB_NAME} ${NUM_GPUS} --cfg_file ${CONFIG_FILE}
```
* Train with a single GPU:
```shell script
-python train.py --cfg_file ${CONFIG_FILE} --batch_size ${BATCH_SIZE} --epochs 50
+python train.py --cfg_file ${CONFIG_FILE}
```
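The commands above now work without `--batch_size` or `--epochs` because the training script falls back to the config's `OPTIMIZATION` block. Below is a minimal, runnable sketch of that fallback pattern; the names and values are illustrative, not the project's actual code.

```python
import argparse

# Stand-in for the loaded YAML config; in OpenPCDet this comes from --cfg_file.
# The values here are illustrative only.
CFG_OPTIMIZATION = {'BATCH_SIZE_PER_GPU': 4, 'NUM_EPOCHS': 80}

parser = argparse.ArgumentParser()
# default=None means "not given on the command line"
parser.add_argument('--batch_size', type=int, default=None)
parser.add_argument('--epochs', type=int, default=None)
args = parser.parse_args([])  # simulate running with no CLI overrides

# A CLI value wins if present; otherwise the config supplies the default
batch_size = CFG_OPTIMIZATION['BATCH_SIZE_PER_GPU'] if args.batch_size is None else args.batch_size
epochs = CFG_OPTIMIZATION['NUM_EPOCHS'] if args.epochs is None else args.epochs
print(batch_size, epochs)  # -> 4 80
```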
@@ -10,7 +10,7 @@ All the codes are tested in the following environment:
### Install `pcdet v0.3`
-NOTE: Please re-install `pcdet v0.3` by running `python setup.py develop` if you have already installed `pcdet v0.1` previously.
+NOTE: Please re-install `pcdet v0.3` by running `python setup.py develop`, even if you have already installed a previous version.
a. Clone this repository.
```shell
......
@@ -110,11 +110,10 @@ def keep_arrays_by_name(gt_names, used_classes):
    return inds


-def init_dist_slurm(batch_size, tcp_port, local_rank, backend='nccl'):
+def init_dist_slurm(tcp_port, local_rank, backend='nccl'):
    """
    modified from https://github.com/open-mmlab/mmdetection
    Args:
-        batch_size:
        tcp_port:
        backend:
@@ -134,13 +133,10 @@ def init_dist_slurm(batch_size, tcp_port, local_rank, backend='nccl'):
    dist.init_process_group(backend=backend)
    total_gpus = dist.get_world_size()
-    assert batch_size % total_gpus == 0, 'Batch size should be matched with GPUS: (%d, %d)' % (batch_size, total_gpus)
-    batch_size_each_gpu = batch_size // total_gpus
    rank = dist.get_rank()
-    return batch_size_each_gpu, rank
+    return total_gpus, rank


-def init_dist_pytorch(batch_size, tcp_port, local_rank, backend='nccl'):
+def init_dist_pytorch(tcp_port, local_rank, backend='nccl'):
    if mp.get_start_method(allow_none=True) is None:
        mp.set_start_method('spawn')
@@ -152,10 +148,9 @@ def init_dist_pytorch(batch_size, tcp_port, local_rank, backend='nccl'):
        rank=local_rank,
        world_size=num_gpus
    )
-    assert batch_size % num_gpus == 0, 'Batch size should be matched with GPUS: (%d, %d)' % (batch_size, num_gpus)
-    batch_size_each_gpu = batch_size // num_gpus
    rank = dist.get_rank()
-    return batch_size_each_gpu, rank
+    return num_gpus, rank
def get_dist_info():
    if torch.__version__ < '1.0':
@@ -173,6 +168,7 @@ def get_dist_info():
        world_size = 1
    return rank, world_size


def merge_results_dist(result_part, size, tmpdir):
    rank, world_size = get_dist_info()
    os.makedirs(tmpdir, exist_ok=True)
......
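The net effect of this refactor: both `init_dist_slurm` and `init_dist_pytorch` now return the world size and rank instead of splitting a global batch size per GPU. A toy sketch of the before/after contract, runnable without any distributed setup (the function names are illustrative):

```python
# Toy sketch of the interface change; no torch.distributed involved.

def old_contract(batch_size, total_gpus):
    # before: the helpers took the global batch size and divided it per GPU
    assert batch_size % total_gpus == 0, 'Batch size should be matched with GPUS: (%d, %d)' % (batch_size, total_gpus)
    return batch_size // total_gpus

def new_contract(total_gpus, batch_size_per_gpu):
    # after: the helpers just report the world size; the config supplies the
    # per-GPU batch size, so the total is derived only for logging
    return total_gpus * batch_size_per_gpu

print(old_contract(16, 8))  # -> 2 samples per GPU under the old scheme
print(new_contract(8, 2))   # -> 16 total samples per step under the new scheme
```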
@@ -170,6 +170,9 @@ MODEL:
OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 4
+    NUM_EPOCHS: 80
+
    OPTIMIZER: adam_onecycle
    LR: 0.01
    WEIGHT_DECAY: 0.01
......
@@ -114,6 +114,9 @@ MODEL:
OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 4
+    NUM_EPOCHS: 80
+
    OPTIMIZER: adam_onecycle
    LR: 0.003
    WEIGHT_DECAY: 0.01
......
@@ -143,6 +143,9 @@ MODEL:
OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 4
+    NUM_EPOCHS: 80
+
    OPTIMIZER: adam_onecycle
    LR: 0.003
    WEIGHT_DECAY: 0.01
......
@@ -139,6 +139,9 @@ MODEL:
OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 2
+    NUM_EPOCHS: 80
+
    OPTIMIZER: adam_onecycle
    LR: 0.01
    WEIGHT_DECAY: 0.01
......
@@ -228,6 +228,9 @@ MODEL:
OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 2
+    NUM_EPOCHS: 80
+
    OPTIMIZER: adam_onecycle
    LR: 0.01
    WEIGHT_DECAY: 0.01
......
@@ -100,6 +100,9 @@ MODEL:
OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 4
+    NUM_EPOCHS: 80
+
    OPTIMIZER: adam_onecycle
    LR: 0.003
    WEIGHT_DECAY: 0.01
......
@@ -116,6 +116,9 @@ MODEL:
OPTIMIZATION:
+    BATCH_SIZE_PER_GPU: 4
+    NUM_EPOCHS: 80
+
    OPTIMIZER: adam_onecycle
    LR: 0.003
    WEIGHT_DECAY: 0.01
......
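All seven config hunks above make the same change: `BATCH_SIZE_PER_GPU` and `NUM_EPOCHS` join the `OPTIMIZATION` block so the scripts can read them as defaults. A minimal sketch of loading such a block, assuming the `pyyaml` and `easydict` packages that OpenPCDet already depends on; the inline YAML here is an illustrative stand-in for one of the config files:

```python
import yaml
from easydict import EasyDict

# Inline stand-in for a model config's OPTIMIZATION section
cfg = EasyDict(yaml.safe_load("""
OPTIMIZATION:
    BATCH_SIZE_PER_GPU: 4
    NUM_EPOCHS: 80

    OPTIMIZER: adam_onecycle
    LR: 0.003
    WEIGHT_DECAY: 0.01
"""))

# train.py can now take its defaults from here
print(cfg.OPTIMIZATION.BATCH_SIZE_PER_GPU)  # -> 4
print(cfg.OPTIMIZATION.NUM_EPOCHS)          # -> 80
```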
@@ -20,8 +20,8 @@ def parse_config():
    parser = argparse.ArgumentParser(description='arg parser')
    parser.add_argument('--cfg_file', type=str, default=None, help='specify the config for training')
-    parser.add_argument('--batch_size', type=int, default=16, required=False, help='batch size for training')
-    parser.add_argument('--epochs', type=int, default=30, required=False, help='number of epochs to train for')
+    parser.add_argument('--batch_size', type=int, default=None, required=False, help='batch size for training')
+    parser.add_argument('--epochs', type=int, default=None, required=False, help='number of epochs to train for')
    parser.add_argument('--workers', type=int, default=4, help='number of workers for dataloader')
    parser.add_argument('--extra_tag', type=str, default='default', help='extra tag for this experiment')
    parser.add_argument('--ckpt', type=str, default=None, help='checkpoint to start from')
@@ -58,10 +58,14 @@ def main():
    if args.launcher == 'none':
        dist_train = False
    else:
-        args.batch_size, cfg.LOCAL_RANK = getattr(common_utils, 'init_dist_%s' % args.launcher)(
-            args.batch_size, args.tcp_port, args.local_rank, backend='nccl'
+        total_gpus, cfg.LOCAL_RANK = getattr(common_utils, 'init_dist_%s' % args.launcher)(
+            args.tcp_port, args.local_rank, backend='nccl'
        )
        dist_train = True
+
+    args.batch_size = cfg.OPTIMIZATION.BATCH_SIZE_PER_GPU if args.batch_size is None else args.batch_size
+    args.epochs = cfg.OPTIMIZATION.NUM_EPOCHS if args.epochs is None else args.epochs

    if args.fix_random_seed:
        common_utils.set_random_seed(666)
@@ -79,7 +83,6 @@ def main():
    logger.info('CUDA_VISIBLE_DEVICES=%s' % gpu_list)

    if dist_train:
-        total_gpus = dist.get_world_size()
        logger.info('total_batch_size: %d' % (total_gpus * args.batch_size))
    for key, val in vars(args).items():
        logger.info('{:16} {}'.format(key, val))
......