Commit 48b412c3 authored by Vishnu Banna
Browse files

currently training darknet

parent 6d3cfef4
runtime: runtime:
distribution_strategy: 'mirrored' distribution_strategy: 'mirrored'
mixed_precision_dtype: 'float16' mixed_precision_dtype: 'float16'
loss_scale: 'dynamic'
num_gpus: 2
task: task:
model: model:
num_classes: 1001 num_classes: 1001
...@@ -18,21 +20,23 @@ task: ...@@ -18,21 +20,23 @@ task:
tfds_data_dir: '~/tensorflow_datasets/' tfds_data_dir: '~/tensorflow_datasets/'
tfds_download: true tfds_download: true
is_training: true is_training: true
global_batch_size: 128 global_batch_size: 16 #default = 128
dtype: 'float16' dtype: 'float16'
shuffle_buffer_size: 100
validation_data: validation_data:
tfds_name: 'imagenet2012' tfds_name: 'imagenet2012'
tfds_split: 'validation' tfds_split: 'validation'
tfds_data_dir: '~/tensorflow_datasets/' tfds_data_dir: '~/tensorflow_datasets/'
tfds_download: true tfds_download: true
is_training: true is_training: true
global_batch_size: 128 global_batch_size: 16 #default = 128
dtype: 'float16' dtype: 'float16'
drop_remainder: false drop_remainder: false
shuffle_buffer_size: 100
trainer: trainer:
train_steps: 800000 # epochs: 80 train_steps: 6400000 # epochs: 80, 800000 * 128/batchsize
validation_steps: 400 # size of validation data validation_steps: 3200 # size of validation data, 400 * 128/batchsize
validation_interval: 500 #10000 validation_interval: 10000 #10000
steps_per_loop: 10000 steps_per_loop: 10000
summary_interval: 10000 summary_interval: 10000
checkpoint_interval: 10000 checkpoint_interval: 10000
...@@ -44,11 +48,11 @@ trainer: ...@@ -44,11 +48,11 @@ trainer:
learning_rate: learning_rate:
type: 'polynomial' type: 'polynomial'
polynomial: polynomial:
initial_learning_rate: 0.1 initial_learning_rate: 0.0125 # 0.1 * batchsize/128, default = 0.1
end_learning_rate: 0.0001 end_learning_rate: 0.0000125 # 0.0001 * batchsize/128, default = 0.0001
power: 4.0 power: 4.0
decay_steps: 799000 decay_steps: 6392000 # 799000 * 128/batchsize, default = 800000 - 1000 = 799000
warmup: warmup:
type: 'linear' type: 'linear'
linear: linear:
warmup_steps: 1000 #learning rate rises from 0 to 0.1 over 1000 steps warmup_steps: 8000 # 0 to 0.0125 over 1000 * 128/batchsize, default = 1000
...@@ -141,7 +141,8 @@ class ImageClassificationTask(base_task.Task): ...@@ -141,7 +141,8 @@ class ImageClassificationTask(base_task.Task):
# Computes per-replica loss. # Computes per-replica loss.
loss = self.build_losses( loss = self.build_losses(
model_outputs=outputs, labels=labels, aux_losses=model.losses) model_outputs=outputs, labels=labels, aux_losses=model.losses)
# Scales loss as the default gradients allreduce performs sum inside the
#Scales loss as the default gradients allreduce performs sum inside the
# optimizer. # optimizer.
scaled_loss = loss / num_replicas scaled_loss = loss / num_replicas
...@@ -150,7 +151,7 @@ class ImageClassificationTask(base_task.Task): ...@@ -150,7 +151,7 @@ class ImageClassificationTask(base_task.Task):
if isinstance( if isinstance(
optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer): optimizer, tf.keras.mixed_precision.experimental.LossScaleOptimizer):
scaled_loss = optimizer.get_scaled_loss(scaled_loss) scaled_loss = optimizer.get_scaled_loss(scaled_loss)
tf.print("batch loss: ", loss, end = "\r")
tvars = model.trainable_variables tvars = model.trainable_variables
grads = tape.gradient(scaled_loss, tvars) grads = tape.gradient(scaled_loss, tvars)
# Scales back gradient before apply_gradients when LossScaleOptimizer is # Scales back gradient before apply_gradients when LossScaleOptimizer is
......
...@@ -6,10 +6,10 @@ runtime: ...@@ -6,10 +6,10 @@ runtime:
distribution_strategy: mirrored distribution_strategy: mirrored
enable_xla: false enable_xla: false
gpu_thread_mode: null gpu_thread_mode: null
loss_scale: null loss_scale: dynamic
mixed_precision_dtype: float16 mixed_precision_dtype: float16
num_cores_per_replica: 1 num_cores_per_replica: 1
num_gpus: 0 num_gpus: 2
num_packs: 1 num_packs: 1
per_gpu_thread_count: 0 per_gpu_thread_count: 0
run_eagerly: false run_eagerly: false
...@@ -46,11 +46,11 @@ task: ...@@ -46,11 +46,11 @@ task:
drop_remainder: true drop_remainder: true
dtype: float16 dtype: float16
enable_tf_data_service: false enable_tf_data_service: false
global_batch_size: 128 global_batch_size: 16
input_path: '' input_path: ''
is_training: true is_training: true
sharding: true sharding: true
shuffle_buffer_size: 10000 shuffle_buffer_size: 100
tf_data_service_address: null tf_data_service_address: null
tf_data_service_job_name: null tf_data_service_job_name: null
tfds_as_supervised: false tfds_as_supervised: false
...@@ -67,11 +67,11 @@ task: ...@@ -67,11 +67,11 @@ task:
drop_remainder: false drop_remainder: false
dtype: float16 dtype: float16
enable_tf_data_service: false enable_tf_data_service: false
global_batch_size: 128 global_batch_size: 16
input_path: '' input_path: ''
is_training: true is_training: true
sharding: true sharding: true
shuffle_buffer_size: 10000 shuffle_buffer_size: 100
tf_data_service_address: null tf_data_service_address: null
tf_data_service_job_name: null tf_data_service_job_name: null
tfds_as_supervised: false tfds_as_supervised: false
...@@ -94,9 +94,9 @@ trainer: ...@@ -94,9 +94,9 @@ trainer:
learning_rate: learning_rate:
polynomial: polynomial:
cycle: false cycle: false
decay_steps: 799000 decay_steps: 6392000
end_learning_rate: 0.0001 end_learning_rate: 1.25e-05
initial_learning_rate: 0.1 initial_learning_rate: 0.0125
name: PolynomialDecay name: PolynomialDecay
power: 4.0 power: 4.0
type: polynomial type: polynomial
...@@ -113,12 +113,12 @@ trainer: ...@@ -113,12 +113,12 @@ trainer:
linear: linear:
name: linear name: linear
warmup_learning_rate: 0 warmup_learning_rate: 0
warmup_steps: 1000 warmup_steps: 8000
type: linear type: linear
steps_per_loop: 10000 steps_per_loop: 10000
summary_interval: 10000 summary_interval: 10000
train_steps: 800000 train_steps: 6400000
train_tf_function: true train_tf_function: true
train_tf_while_loop: true train_tf_while_loop: true
validation_interval: 500 validation_interval: 10000
validation_steps: 400 validation_steps: 3200
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment