benchmark_training.py 2.14 KB
Newer Older
zhenyi's avatar
zhenyi committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Scripts that simplifies running training benchmark """

import argparse
import os
import shlex
import shutil
import subprocess
import sys


def _build_command(flags, main_path):
    """Return the argument vector that launches the training benchmark.

    Args:
        flags: Parsed CLI namespace providing ``gpus``, ``batch_size``,
            ``amp``, ``data_dir``, ``model_dir`` and ``weights_dir``.
        main_path: Path to the ``mask_rcnn_main.py`` entry script.

    Returns:
        A list of strings suitable for ``subprocess.call`` without a shell.
    """
    checkpoint = os.path.join(
        flags.weights_dir, 'resnet/resnet-nhwc-2018-02-07/model.ckpt-112603'
    )
    # The wildcard is passed through literally (no shell is involved), exactly
    # as it was when the pattern was wrapped in double quotes.
    training_files = os.path.join(flags.data_dir, 'train*.tfrecord')

    cmd = [
        'horovodrun', '-np', str(flags.gpus),
        'python', main_path,
        '--mode', 'train',
        '--model_dir', flags.model_dir,
        '--checkpoint', checkpoint,
        '--training_file_pattern', training_files,
        '--init_learning_rate', '0.04',
        '--total_steps', '200',
        '--use_batched_nms',
        '--noeval_after_training',
        '--nouse_custom_box_proposals_op',
        '--train_batch_size', str(flags.batch_size),
    ]
    if flags.amp:
        cmd.append('--amp')
    return cmd


def main():
    """Parse CLI flags, print the training command and run it.

    The command is executed as an argument list rather than through a shell,
    so directory arguments containing spaces, quotes or shell metacharacters
    are passed to the training script verbatim.

    Returns:
        The exit status of the launched command, or 127 when the
        ``horovodrun`` executable cannot be found.
    """
    # CLI flags
    parser = argparse.ArgumentParser(description="MaskRCNN train benchmark")
    parser.add_argument('--gpus', type=int, required=True)
    parser.add_argument('--batch_size', type=int, required=True)
    parser.add_argument('--amp', action='store_true')
    parser.add_argument('--data_dir', type=str, default='/data')
    parser.add_argument('--model_dir', type=str, default='/tmp/model')
    parser.add_argument('--weights_dir', type=str, default='/model')

    flags = parser.parse_args()
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '../mask_rcnn_main.py'))

    # build command
    cmd = _build_command(flags, main_path)

    # print command, shell-quoted so it can be copy-pasted into a terminal;
    # flush so the banner appears before any output of the child process
    line = '-' * shutil.get_terminal_size()[0]
    printable = ' '.join(shlex.quote(arg) for arg in cmd)
    print(line, printable, line, sep='\n', flush=True)

    # run model
    try:
        return subprocess.call(cmd)
    except FileNotFoundError:
        # Same status a shell reports for "command not found".
        print(f'{cmd[0]}: command not found', file=sys.stderr)
        return 127


if __name__ == '__main__':
    # Propagate the training exit status to the caller of this script.
    sys.exit(main())