Commit 72257494 authored by Vishnu Banna

classification task fix

parent 842cdd4d
@@ -51,6 +51,8 @@ def build_classification_model(
norm_activation_config=norm_activation_config,
l2_regularizer=l2_regularizer)
print(backbone)
model = classification_model.ClassificationModel(
backbone=backbone,
num_classes=model_config.num_classes,
@@ -29,13 +29,14 @@ from official.vision.beta.projects.yolo.configs import backbones
@dataclasses.dataclass
class ImageClassificationModel(hyperparams.Config):
num_classes: int = 0
input_size: List[int] = dataclasses.field(default_factory=list)
input_size: List[int] = dataclasses.field(default_factory=lambda: [224, 224])
backbone: backbones.Backbone = backbones.Backbone(
type='darknet', darknet=backbones.Darknet())
dropout_rate: float = 0.0
norm_activation: common.NormActivation = common.NormActivation()
# Adds a Batch Normalization layer pre-GlobalAveragePooling in classification.
add_head_batch_norm: bool = False
kernel_initializer: str = 'VarianceScaling'
@dataclasses.dataclass
@@ -56,7 +57,6 @@ class ImageClassificationTask(cfg.TaskConfig):
gradient_clip_norm: float = 0.0
logging_dir: Optional[str] = None
@exp_factory.register_config_factory('darknet_classification')
def darknet_classification() -> cfg.ExperimentConfig:
"""Image classification general."""
@@ -67,3 +67,4 @@ def darknet_classification() -> cfg.ExperimentConfig:
'task.train_data.is_training != None',
'task.validation_data.is_training != None'
])
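For reference, a factory registered this way is looked up by name through exp_factory; a minimal sketch of pulling the config and overriding fields (this assumes the module above has been imported so the registration has run, and the override values are illustrative only):

from official.core import exp_factory

# Looks up the experiment registered above by its factory name.
config = exp_factory.get_exp_config('darknet_classification')

# Fields declared on the config classes above can be overridden in place;
# these values are illustrative, not defaults.
config.task.model.num_classes = 1001
config.task.model.input_size = [256, 256, 3]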
"""Classification parser."""
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Import libraries
"""Classification decoder and parser."""
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_addons as tfa
from official.vision.beta.dataloaders import parser
from official.vision.beta.dataloaders import classification_input
from official.vision.beta.ops import preprocess_ops
from official.vision.beta.ops import augment
class Parser(parser.Parser):
class Parser(classification_input.Parser):
"""Parser to parse an image and its annotations into a dictionary of tensors."""
def __init__(self,
output_size,
aug_policy,
scale=[128, 448],
dtype='float32'):
"""Initializes parameters for parsing annotations in the dataset.
Args:
output_size: `Tensor` or `list` for [height, width] of output image. The
output_size should be divided by the largest feature stride 2^max_level.
num_classes: `float`, number of classes.
aug_policy: An optional Augmentation object to choose from AutoAugment and
RandAugment.
scale: A `List[int]`, minimum and maximum image shape range.
dtype: `str`, cast output image in dtype. It can be 'float32', 'float16',
or 'bfloat16'.
"""
self._output_size = output_size
if aug_policy:
if aug_policy == 'autoaug':
self._augmenter = augment.AutoAugment()
elif aug_policy == 'randaug':
self._augmenter = augment.RandAugment(num_layers=2, magnitude=20)
else:
raise ValueError(
'Augmentation policy {} not supported.'.format(aug_policy))
else:
self._augmenter = None
self._scale = scale
if dtype == 'float32':
self._dtype = tf.float32
elif dtype == 'float16':
self._dtype = tf.float16
elif dtype == 'bfloat16':
self._dtype = tf.bfloat16
else:
raise ValueError('dtype {!r} is not supported!'.format(dtype))
def _parse_train_data(self, decoded_tensors):
"""Generates images and labels that are usable for model training.
Args:
decoded_tensors: a dict of Tensors produced by the decoder.
Returns:
images: the image tensor.
labels: a dict of Tensors that contains labels.
"""
image = tf.io.decode_image(decoded_tensors['image/encoded'])
image.set_shape((None, None, 3))
image = tf.image.resize_with_pad(
image,
target_width=self._output_size[0],
target_height=self._output_size[1])
scale = tf.random.uniform([],
minval=self._scale[0],
maxval=self._scale[1],
dtype=tf.int32)
if scale > self._output_size[0]:
image = tf.image.resize_with_crop_or_pad(
image, target_height=scale, target_width=scale)
def _parse_train_image(self, decoded_tensors):
"""Parses image data for training."""
image_bytes = decoded_tensors[self._image_field_key]
if self._decode_jpeg_only:
image_shape = tf.image.extract_jpeg_shape(image_bytes)
# Crops image.
cropped_image = preprocess_ops.random_crop_image_v2(
image_bytes, image_shape)
image = tf.cond(
tf.reduce_all(tf.equal(tf.shape(cropped_image), image_shape)),
lambda: preprocess_ops.center_crop_image_v2(image_bytes, image_shape),
lambda: cropped_image)
else:
image = tf.image.random_crop(image, (scale, scale, 3))
# Decodes image.
image = tf.io.decode_image(image_bytes, channels=3)
image.set_shape([None, None, 3])
# Crops image.
cropped_image = preprocess_ops.random_crop_image(image)
image = tf.cond(
tf.reduce_all(tf.equal(tf.shape(cropped_image), tf.shape(image))),
lambda: preprocess_ops.center_crop_image(image),
lambda: cropped_image)
if self._aug_rand_hflip:
image = tf.image.random_flip_left_right(image)
# Resizes image.
image = tf.image.resize(
image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
image.set_shape([self._output_size[0], self._output_size[1], 3])
# Apply autoaug or randaug.
if self._augmenter is not None:
image = self._augmenter.distort(image)
image = tf.image.random_flip_left_right(image)
image = tf.cast(image, tf.float32) / 255
image = tf.image.resize(image, (self._output_size[0], self._output_size[1]))
label = decoded_tensors['image/class/label']
return image, label
def _parse_eval_data(self, decoded_tensors):
"""Generates images and labels that are usable for model evaluation.
Args:
decoded_tensors: a dict of Tensors produced by the decoder.
Returns:
images: the image tensor.
labels: a dict of Tensors that contains labels.
"""
image = tf.io.decode_image(decoded_tensors['image/encoded'])
image.set_shape((None, None, 3))
image = tf.cast(image, tf.float32)
image = tf.image.resize_with_pad(
image,
target_width=self._output_size[0],
target_height=self._output_size[1]) # Final Output Shape
image = image / 255. # Normalize
#label = tf.one_hot(decoded_tensors['image/class/label'], self._num_classes)
label = decoded_tensors['image/class/label']
return image, label
# Convert image to self._dtype.
image = tf.image.convert_image_dtype(image, self._dtype)
image = image / 255.0
return image
def _parse_eval_image(self, decoded_tensors):
"""Parses image data for evaluation."""
image_bytes = decoded_tensors[self._image_field_key]
if self._decode_jpeg_only:
image_shape = tf.image.extract_jpeg_shape(image_bytes)
# Center crops.
image = preprocess_ops.center_crop_image_v2(image_bytes, image_shape)
else:
# Decodes image.
image = tf.io.decode_image(image_bytes, channels=3)
image.set_shape([None, None, 3])
# Center crops.
image = preprocess_ops.center_crop_image(image)
image = tf.image.resize(
image, self._output_size, method=tf.image.ResizeMethod.BILINEAR)
image.set_shape([self._output_size[0], self._output_size[1], 3])
# Convert image to self._dtype.
image = tf.image.convert_image_dtype(image, self._dtype)
image = image / 255.0
return image
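As a rough usage sketch, mirroring the arguments that the task's build_inputs passes further below (all values illustrative; parse_fn(is_training) comes from the dataloader base class):

# Illustrative only: construct the parser and obtain tf.data map functions.
parser_obj = Parser(
    output_size=[256, 256],
    num_classes=1001,
    image_field_key='image/encoded',
    label_field_key='image/class/label',
    decode_jpeg_only=True,
    aug_rand_hflip=True,
    dtype='float16')

train_parse_fn = parser_obj.parse_fn(is_training=True)   # routes to _parse_train_image
eval_parse_fn = parser_obj.parse_fn(is_training=False)   # routes to _parse_eval_image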
# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
# Lint as: python3
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -11,19 +12,22 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
# ==============================================================================
"""Image classification task definition."""
import tensorflow as tf
from official.core import input_reader
from official.core import task_factory
from official.vision.beta.dataloaders import classification_input
from official.vision.beta.projects.yolo.configs import darknet_classification as exp_cfg
from official.vision.beta.projects.yolo.dataloaders import classification_tfds_decoder as cli
from official.common import dataset_fn
from official.vision.beta.dataloaders import input_reader_factory
from official.vision.beta.dataloaders import tfds_factory
from official.vision.beta.dataloaders import classification_input as classification_input_base
from official.vision.beta.projects.yolo.dataloaders import classification_input
from official.vision.beta.tasks import image_classification
@task_factory.register_task_cls(exp_cfg.ImageClassificationTask)
class ImageClassificationTask(image_classification.ImageClassificationTask):
"""A task for image classification."""
@@ -33,82 +37,33 @@ class ImageClassificationTask(image_classification.ImageClassificationTask):
num_classes = self.task_config.model.num_classes
input_size = self.task_config.model.input_size
image_field_key = self.task_config.train_data.image_field_key
label_field_key = self.task_config.train_data.label_field_key
is_multilabel = self.task_config.train_data.is_multilabel
if params.tfds_name:
decoder = cli.Decoder()
decoder = tfds_factory.get_classification_decoder(params.tfds_name)
else:
decoder = classification_input.Decoder()
decoder = classification_input_base.Decoder(
image_field_key=image_field_key, label_field_key=label_field_key,
is_multilabel=is_multilabel)
parser = classification_input.Parser(
output_size=input_size[:2],
num_classes=num_classes,
image_field_key=image_field_key,
label_field_key=label_field_key,
decode_jpeg_only=params.decode_jpeg_only,
aug_rand_hflip=params.aug_rand_hflip,
aug_type=params.aug_type,
is_multilabel=is_multilabel,
dtype=params.dtype)
reader = input_reader.InputReader(
reader = input_reader_factory.input_reader_generator(
params,
dataset_fn=tf.data.TFRecordDataset,
dataset_fn=dataset_fn.pick_dataset_fn(params.file_type),
decoder_fn=decoder.decode,
parser_fn=parser.parse_fn(params.is_training))
dataset = reader.read(input_context=input_context)
return dataset
def train_step(self, inputs, model, optimizer, metrics=None):
"""Does forward and backward.
Args:
inputs: a dictionary of input tensors.
model: the model, forward pass definition.
optimizer: the optimizer for this training step.
metrics: a nested structure of metrics objects.
Returns:
A dictionary of logs.
"""
features, labels = inputs
if self.task_config.losses.one_hot:
labels = tf.one_hot(labels, self.task_config.model.num_classes)
num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
with tf.GradientTape() as tape:
outputs = model(features, training=True)
# Casting the output layer to float32 is necessary when mixed precision
# (mixed_float16 or mixed_bfloat16) is enabled, so that the loss is
# computed in float32.
outputs = tf.nest.map_structure(
lambda x: tf.cast(x, tf.float32), outputs)
# Computes per-replica loss.
loss = self.build_losses(
model_outputs=outputs, labels=labels, aux_losses=model.losses)
# Scales the loss, since the default gradient allreduce performs a sum
# inside the optimizer.
scaled_loss = loss / num_replicas
# For mixed_precision policy, when LossScaleOptimizer is used, loss is
# scaled for numerical stability.
if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
scaled_loss = optimizer.get_scaled_loss(scaled_loss)
tvars = model.trainable_variables
grads = tape.gradient(scaled_loss, tvars)
# Scales back gradient before apply_gradients when LossScaleOptimizer is
# used.
if isinstance(optimizer, tf.keras.mixed_precision.LossScaleOptimizer):
grads = optimizer.get_unscaled_gradients(grads)
# Apply gradient clipping.
if self.task_config.gradient_clip_norm > 0:
grads, _ = tf.clip_by_global_norm(
grads, self.task_config.gradient_clip_norm)
optimizer.apply_gradients(list(zip(grads, tvars)))
logs = {self.loss: loss}
if metrics:
self.process_metrics(metrics, labels, outputs)
logs.update({m.name: m.result() for m in metrics})
elif model.compiled_metrics:
self.process_compiled_metrics(model.compiled_metrics, labels, outputs)
logs.update({m.name: m.result() for m in model.metrics})
return logs
return dataset
\ No newline at end of file
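With the custom train_step deleted, training and evaluation steps now come from the parent image_classification.ImageClassificationTask. A minimal sketch of driving the task end to end (hypothetical driver code, assuming the usual Model Garden entry points and that the registering modules have been imported):

from official.core import exp_factory, task_factory

# Hypothetical driver: resolve the registered task and build its train data.
config = exp_factory.get_exp_config('darknet_classification')
task = task_factory.get_task(config.task, logging_dir='/tmp/darknet_cls')
train_dataset = task.build_inputs(config.task.train_data)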
runtime:
all_reduce_alg: null
batchnorm_spatial_persistent: false
dataset_num_private_threads: null
default_shard_dim: -1
distribution_strategy: mirrored
enable_xla: false
gpu_thread_mode: null
loss_scale: dynamic
mixed_precision_dtype: float16
num_cores_per_replica: 1
num_gpus: 2
num_packs: 1
per_gpu_thread_count: 0
run_eagerly: false
task_index: -1
tpu: null
tpu_enable_xla_dynamic_padder: null
worker_hosts: null
task:
evaluation:
top_k: 5
gradient_clip_norm: 0.0
init_checkpoint: ''
logging_dir: null
losses:
l2_weight_decay: 0.0005
label_smoothing: 0.0
one_hot: true
model:
add_head_batch_norm: false
backbone:
darknet:
depth_scale: 1.0
dilate: false
max_level: 5
min_level: 3
model_id: darknet53
use_reorg_input: false
use_separable_conv: false
width_scale: 1.0
type: darknet
dropout_rate: 0.0
input_size: [256, 256, 3]
kernel_initializer: VarianceScaling
norm_activation:
activation: mish
norm_epsilon: 0.001
norm_momentum: 0.99
use_sync_bn: true
num_classes: 1001
name: null
train_data:
aug_policy: null
aug_rand_hflip: true
aug_type: null
block_length: 1
cache: false
color_jitter: 0.0
cycle_length: 10
decode_jpeg_only: true
decoder:
simple_decoder:
mask_binarize_threshold: null
regenerate_source_id: false
type: simple_decoder
deterministic: null
drop_remainder: true
dtype: float16
enable_tf_data_service: false
file_type: tfrecord
global_batch_size: 16
image_field_key: image/encoded
input_path: ''
is_multilabel: false
is_training: true
label_field_key: image/class/label
mixup_and_cutmix: null
randaug_magnitude: 10
random_erasing: null
seed: null
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ~/tensorflow_datasets/
tfds_name: imagenet2012
tfds_skip_decoding_feature: ''
tfds_split: train
validation_data:
aug_policy: null
aug_rand_hflip: true
aug_type: null
block_length: 1
cache: false
color_jitter: 0.0
cycle_length: 10
decode_jpeg_only: true
decoder:
simple_decoder:
mask_binarize_threshold: null
regenerate_source_id: false
type: simple_decoder
deterministic: null
drop_remainder: false
dtype: float16
enable_tf_data_service: false
file_type: tfrecord
global_batch_size: 16
image_field_key: image/encoded
input_path: ''
is_multilabel: false
is_training: false
label_field_key: image/class/label
mixup_and_cutmix: null
randaug_magnitude: 10
random_erasing: null
seed: null
sharding: true
shuffle_buffer_size: 100
tf_data_service_address: null
tf_data_service_job_name: null
tfds_as_supervised: false
tfds_data_dir: ~/tensorflow_datasets/
tfds_name: imagenet2012
tfds_skip_decoding_feature: ''
tfds_split: validation
trainer:
allow_tpu_summary: false
best_checkpoint_eval_metric: ''
best_checkpoint_export_subdir: ''
best_checkpoint_metric_comp: higher
checkpoint_interval: 10000
continuous_eval_timeout: 3600
eval_tf_function: true
eval_tf_while_loop: false
loss_upper_bound: 1000000.0
max_to_keep: 5
optimizer_config:
ema: null
learning_rate:
polynomial:
cycle: false
decay_steps: 6392000
end_learning_rate: 1.25e-05
initial_learning_rate: 0.0125
name: PolynomialDecay
offset: 0
power: 4.0
type: polynomial
optimizer:
sgd:
clipnorm: null
clipvalue: null
decay: 0.0
global_clipnorm: null
momentum: 0.9
name: SGD
nesterov: false
type: sgd
warmup:
linear:
name: linear
warmup_learning_rate: 0
warmup_steps: 8000
type: linear
recovery_begin_steps: 0
recovery_max_trials: 0
steps_per_loop: 10000
summary_interval: 10000
train_steps: 6400000
train_tf_function: true
train_tf_while_loop: true
validation_interval: 10000
validation_steps: 3200
validation_summary_subdir: validation
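This YAML is applied as an override on top of the registered experiment config; a minimal sketch of merging it in Python (the file path is a placeholder; override_params_dict is the standard Model Garden helper used by the train driver):

from official.core import exp_factory
from official.modeling import hyperparams

config = exp_factory.get_exp_config('darknet_classification')
config = hyperparams.override_params_dict(
    config, 'experiments/darknet53_classification.yaml', is_strict=True)

On the command line, the same file is typically passed to the vision train driver via its --config_file flag.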