Change the default number of GPUs and TPUs for the experiment config template

9070109e · Shixin Luo · bf4c3894 · 9070109e · 9070109e
Commit 9070109e authored Sep 28, 2020 by Shixin Luo
2 changed files
--- a/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_1.0_gpu.yaml
+++ b/official/vision/beta/configs/experiments/image_classification/imagenet_mobilenetv2_1.0_gpu.yaml
@@ -19,21 +19,21 @@ task:
  train_data:
    input_path: 'imagenet-2012-tfrecord/train*'
    is_training: True
-    global_batch_size: 96
+    global_batch_size: 768  # 96 * 8
    dtype: 'float32'
  validation_data:
    input_path: 'imagenet-2012-tfrecord/valid*'
    is_training: False
-    global_batch_size: 96
+    global_batch_size: 768  # 96 * 8
    dtype: 'float32'
    drop_remainder: False
 trainer:
-  train_steps: 1201050 # 90 * steps_per_epoch
-  validation_steps: 520
-  validation_interval: 13345
-  steps_per_loop: 13345 # NUM_EXAMPLES (1281167) // global_batch_size
-  summary_interval: 13345
-  checkpoint_interval: 13345
+  train_steps: 150120  # 90 * steps_per_epoch
+  validation_steps: 65
+  validation_interval: 1668
+  steps_per_loop: 1668  # NUM_EXAMPLES (1281167) // global_batch_size
+  summary_interval: 1668
+  checkpoint_interval: 1668
  optimizer_config:
    optimizer:
      type: 'rmsprop'
@@ -44,13 +44,13 @@ trainer:
    learning_rate:
      type: 'exponential'
      exponential:
-        initial_learning_rate: 0.045,
-        decay_steps: 33362, # 2.5 * steps_per_epoch
+        initial_learning_rate: 0.36,  # 0.045 * NUM_GPUS
+        decay_steps: 4170,  # 2.5 * steps_per_epoch
        decay_rate: 0.97,
        staircase: True
    warmup:
      type: 'linear'
      linear:
-        warmup_steps: 66725 # 5 * steps_per_epoch
+        warmup_steps: 8340  # 5 * steps_per_epoch
    ema:
      average_decay: 0.9999
\ No newline at end of file
--- a/official/vision/beta/configs/image_classification.py
+++ b/official/vision/beta/configs/image_classification.py
@@ -218,8 +218,8 @@ def image_classification_imagenet_revnet() -> cfg.ExperimentConfig:
 @exp_factory.register_config_factory('mobilenet_imagenet')
 def image_classification_imagenet_mobilenet() -> cfg.ExperimentConfig:
  """Image classification on imagenet with mobilenet."""
-  train_batch_size = 192
-  eval_batch_size = 192
+  train_batch_size = 1536  # 96 * 16
+  eval_batch_size = 1536  # 96 * 16
  steps_per_epoch = IMAGENET_TRAIN_EXAMPLES // train_batch_size
  config = cfg.ExperimentConfig(
      task=ImageClassificationTask(
@@ -261,9 +261,12 @@ def image_classification_imagenet_mobilenet() -> cfg.ExperimentConfig:
              'learning_rate': {
                  'type': 'exponential',
                  'exponential': {
-                      'initial_learning_rate': 0.045,
-                      'decay_steps': int(2.4 * steps_per_epoch),
-                      'decay_rate': 0.97,
+                      # 0.045 * NUM_GPUS
+                      'initial_learning_rate': 0.045 * (train_batch_size // 96),
+                      # (2.5 / NUM_GPUS) epochs
+                      'decay_steps': int((2.5 / (train_batch_size // 96))
+                                         * steps_per_epoch),
+                      'decay_rate': 0.98,
                      'staircase': True
                  }
              },