currently training darknet

48b412c3 · Vishnu Banna · 6d3cfef4 · 48b412c3 · 48b412c3 · 48b412c3
Commit 48b412c3 authored Nov 01, 2020 by Vishnu Banna
3 changed files
--- a/official/vision/beta/projects/yolo/configs/experiments/darknet53_tfds.yaml
+++ b/official/vision/beta/projects/yolo/configs/experiments/darknet53_tfds.yaml
 runtime:
  distribution_strategy: 'mirrored'
  mixed_precision_dtype: 'float16'
+  loss_scale: 'dynamic'
+  num_gpus: 2
 task:
  model:
    num_classes: 1001
@@ -18,21 +20,23 @@ task:
    tfds_data_dir: '~/tensorflow_datasets/'
    tfds_download: true
    is_training: true
-    global_batch_size: 128
+    global_batch_size: 16 #default = 128
    dtype: 'float16'
+    shuffle_buffer_size: 100
  validation_data:
    tfds_name: 'imagenet2012'
    tfds_split: 'validation'
    tfds_data_dir: '~/tensorflow_datasets/'
    tfds_download: true
    is_training: true
-    global_batch_size: 128
+    global_batch_size: 16 #default = 128
    dtype: 'float16'
    drop_remainder: false
+    shuffle_buffer_size: 100
 trainer:
-  train_steps: 800000 # epochs: 80
+  train_steps: 6400000 # epochs: 80, 800000 * 128/batchsize
-  validation_steps: 400 # size of validation data
+  validation_steps: 3200 # size of validation data, 400 * 128/batchsize
-  validation_interval: 500 #10000
+  validation_interval: 10000 #10000
  steps_per_loop: 10000
  summary_interval: 10000
  checkpoint_interval: 10000
@@ -44,11 +48,11 @@ trainer:
    learning_rate:
      type: 'polynomial'
      polynomial:
-        initial_learning_rate: 0.1
+        initial_learning_rate: 0.0125 # 0.1 * batchsize/128, default = 0.1
-        end_learning_rate: 0.0001
+        end_learning_rate: 0.0000125 # 0.0001 * batchsize/128, default = 0.0001
        power: 4.0
-        decay_steps: 799000
+        decay_steps: 6392000 # 799000 * 128/batchsize, default = 800000 - 1000 = 799000
    warmup:
      type: 'linear'
      linear:
-        warmup_steps: 1000 #learning rate rises from 0 to 0.1 over 1000 steps
+        warmup_steps: 8000 # 0 to 0.0125 over 1000 * 128/batchsize steps, default = 1000
--- a/official/vision/beta/projects/yolo/tasks/image_classification.py
+++ b/official/vision/beta/projects/yolo/tasks/image_classification.py
@@ -141,7 +141,8 @@ class ImageClassificationTask(base_task.Task):
      # Computes per-replica loss.
      loss = self.build_losses(
          model_outputs=outputs, labels=labels, aux_losses=model.losses)
-      # Scales loss as the default gradients allreduce performs sum inside the
+      #Scales loss as the default gradients allreduce performs sum inside the
      # optimizer.
      scaled_loss = loss / num_replicas
@@ -150,7 +151,7 @@ class ImageClassificationTask(base_task.Task):
      if isinstance(
          optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer):
        scaled_loss = optimizer.get_scaled_loss(scaled_loss)
+      tf.print("batch loss: ", loss, end = "\r")
    tvars = model.trainable_variables
    grads = tape.gradient(scaled_loss, tvars)
    # Scales back gradient before apply_gradients when LossScaleOptimizer is

--- a/training_dir/params.yaml
+++ b/training_dir/params.yaml
@@ -6,10 +6,10 @@ runtime:
  distribution_strategy: mirrored
  enable_xla: false
  gpu_thread_mode: null
-  loss_scale: null
+  loss_scale: dynamic
  mixed_precision_dtype: float16
  num_cores_per_replica: 1
-  num_gpus: 0
+  num_gpus: 2
  num_packs: 1
  per_gpu_thread_count: 0
  run_eagerly: false
@@ -46,11 +46,11 @@ task:
    drop_remainder: true
    dtype: float16
    enable_tf_data_service: false
-    global_batch_size: 128
+    global_batch_size: 16
    input_path: ''
    is_training: true
    sharding: true
-    shuffle_buffer_size: 10000
+    shuffle_buffer_size: 100
    tf_data_service_address: null
    tf_data_service_job_name: null
    tfds_as_supervised: false
@@ -67,11 +67,11 @@ task:
    drop_remainder: false
    dtype: float16
    enable_tf_data_service: false
-    global_batch_size: 128
+    global_batch_size: 16
    input_path: ''
    is_training: true
    sharding: true
-    shuffle_buffer_size: 10000
+    shuffle_buffer_size: 100
    tf_data_service_address: null
    tf_data_service_job_name: null
    tfds_as_supervised: false
@@ -94,9 +94,9 @@ trainer:
    learning_rate:
      polynomial:
        cycle: false
-        decay_steps: 799000
+        decay_steps: 6392000
-        end_learning_rate: 0.0001
+        end_learning_rate: 1.25e-05
-        initial_learning_rate: 0.1
+        initial_learning_rate: 0.0125
        name: PolynomialDecay
        power: 4.0
      type: polynomial
@@ -113,12 +113,12 @@ trainer:
      linear:
        name: linear
        warmup_learning_rate: 0
-        warmup_steps: 1000
+        warmup_steps: 8000
      type: linear
  steps_per_loop: 10000
  summary_interval: 10000
-  train_steps: 800000
+  train_steps: 6400000
  train_tf_function: true
  train_tf_while_loop: true
-  validation_interval: 500
+  validation_interval: 10000
-  validation_steps: 400
+  validation_steps: 3200