Commit c57e975a authored by saberkun

Merge pull request #10338 from srihari-humbarwadi:readme

PiperOrigin-RevId: 413033276
parents 7fb4f3cd acf4156e
......@@ -76,8 +76,8 @@ class EmbeddingSharedWeights(tf.keras.layers.Layer):
with tf.name_scope("embedding"):
# Create binary mask of size [batch_size, length]
embeddings = tf.gather(self.shared_weights, inputs)
mask = tf.cast(tf.not_equal(inputs, 0), embeddings.dtype)
embeddings *= tf.expand_dims(mask, -1)
# mask = tf.cast(tf.not_equal(inputs, 0), embeddings.dtype)
# embeddings *= tf.expand_dims(mask, -1)
# Scale embedding by the sqrt of the hidden size
embeddings *= self.hidden_size**0.5
......
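For context, a minimal sketch of the masking step that this hunk comments out, using made-up shapes (`vocab=6`, `hidden=4`) rather than the layer's real configuration: padded positions (token id 0) are zeroed out before the embeddings are scaled by the square root of the hidden size.
```python
import tensorflow as tf

# Hypothetical shared embedding table and a batch with two padded positions.
shared_weights = tf.random.normal([6, 4])        # [vocab, hidden]
inputs = tf.constant([[3, 5, 0, 0]])             # [batch, length], 0 = padding

embeddings = tf.gather(shared_weights, inputs)   # [batch, length, hidden]
mask = tf.cast(tf.not_equal(inputs, 0), embeddings.dtype)
embeddings *= tf.expand_dims(mask, -1)           # zero out padded positions
embeddings *= 4 ** 0.5                           # scale by sqrt(hidden_size)
```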
......@@ -196,13 +196,12 @@ class Transformer(tf.keras.Model):
with tf.name_scope("decode"):
# Prepare inputs to decoder layers by shifting targets, adding positional
# encoding and applying dropout.
with tf.name_scope("shift_targets"):
# Shift targets to the right, and remove the last element
targets = tf.pad(targets, [[0, 0], [1, 0]])[:, :-1]
decoder_inputs = self.embedding_softmax_layer(targets)
decoder_inputs = tf.cast(decoder_inputs, self.params["dtype"])
attention_bias = tf.cast(attention_bias, self.params["dtype"])
with tf.name_scope("shift_targets"):
# Shift targets to the right, and remove the last element
decoder_inputs = tf.pad(decoder_inputs,
[[0, 0], [1, 0], [0, 0]])[:, :-1, :]
with tf.name_scope("add_pos_encoding"):
length = tf.shape(decoder_inputs)[1]
pos_encoding = self.position_embedding(decoder_inputs)
......
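A minimal sketch of the shift that this hunk moves to after the embedding lookup, using a made-up `[1, 3, 2]` tensor: the embedded targets are shifted one step to the right along the length axis and the last position is dropped.
```python
import tensorflow as tf

# Hypothetical embedded decoder targets: [batch=1, length=3, hidden=2].
decoder_inputs = tf.constant([[[1., 1.], [2., 2.], [3., 3.]]])

# Pad one zero vector at the front of the length axis, then drop the last step.
shifted = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
# shifted == [[[0., 0.], [1., 1.], [2., 2.]]]
```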
......@@ -440,7 +440,6 @@ class TransformerTask(object):
opt = performance.configure_optimizer(
opt,
use_float16=params["dtype"] == tf.float16,
use_graph_rewrite=self.flags_obj.fp16_implementation == "graph_rewrite",
loss_scale=flags_core.get_loss_scale(
self.flags_obj, default_for_fp16="dynamic"))
......
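For reference, a rough sketch of what the remaining float16 path amounts to, using the public Keras mixed-precision API rather than the Model Garden's `configure_optimizer` helper (the flag handling here is illustrative only):
```python
import tensorflow as tf

opt = tf.keras.optimizers.Adam()

use_float16 = True  # would come from params["dtype"] == tf.float16
if use_float16:
  # Dynamic loss scaling keeps float16 gradients from underflowing.
  opt = tf.keras.mixed_precision.LossScaleOptimizer(opt, dynamic=True)
```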
......@@ -53,9 +53,9 @@ class TransformerTaskTest(tf.test.TestCase):
FLAGS.param_set = 'tiny'
FLAGS.use_synthetic_data = True
FLAGS.steps_between_evals = 1
FLAGS.train_steps = 2
FLAGS.train_steps = 1
FLAGS.validation_steps = 1
FLAGS.batch_size = 8
FLAGS.batch_size = 4
FLAGS.max_length = 1
FLAGS.num_gpus = 1
FLAGS.distribution_strategy = 'off'
......
......@@ -16,7 +16,6 @@
"""Utilities used in SQUAD task."""
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
from __future__ import print_function
import collections
......
......@@ -20,7 +20,7 @@ import sys
from setuptools import find_packages
from setuptools import setup
version = '2.5.0'
version = '2.7.0'
project_name = 'tf-models-official'
......@@ -61,8 +61,8 @@ if project_name == 'tf-models-nightly':
install_requires.append('tf-nightly')
install_requires.append('tensorflow-text-nightly')
else:
install_requires.append('tensorflow>=2.4.0')
install_requires.append('tensorflow-text>=2.4.0')
install_requires.append('tensorflow>=2.7.0')
install_requires.append('tensorflow-text>=2.7.0')
print('install_requires: ', install_requires)
print('dependency_links: ', dependency_links)
......@@ -88,5 +88,5 @@ setup(
},
install_requires=install_requires,
dependency_links=dependency_links,
python_requires='>=3.6',
python_requires='>=3.7',
)
# TensorFlow Model Garden Modeling Projects
This directory contains projects using TensorFlow Model Garden Modeling
libraries.
## Projects
* [NHNet](nhnet):
[Generating Representative Headlines for News Stories](https://arxiv.org/abs/2001.09386)
by Gu et al., 2020
......@@ -16,11 +16,10 @@
import dataclasses
import os
from typing import List, Optional, Union
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.modeling.hyperparams import config_definitions as cfg
from official.vision.beta.configs import common
......
......@@ -18,8 +18,8 @@
from absl.testing import parameterized
import tensorflow as tf
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling.hyperparams import config_definitions as cfg
from official.projects.basnet.configs import basnet as exp_cfg
......
# Machine Learning Models Optimized for Google Tensor's Edge TPU
## Requirements
[![TensorFlow 2.4](https://img.shields.io/badge/TensorFlow-2.4-FF6F00?logo=tensorflow)](https://github.com/tensorflow/tensorflow/releases/tag/v2.4.0)
[![Python 3.7](https://img.shields.io/badge/Python-3.7-3776AB)](https://www.python.org/downloads/release/python-379/)
## Overview
<figure align="center">
<img width=70% src=https://storage.googleapis.com/tf_model_garden/models/edgetpu/images/neural%20architecture%20search.gif>
<figcaption><i>An illustration of NAS to find Edge TPU optimized models. Each column represents a stage in the neural network, with dots indicating different options and each color representing a different type of building block. A path from inputs (e.g., an image) to outputs (e.g., per-pixel label predictions) through the matrix represents a candidate neural network. In each iteration of the search, a neural network is formed using the blocks chosen at every stage, and the search algorithm aims to find neural networks that jointly minimize TPU latency and/or energy and maximize accuracy.
</i></figcaption>
</figure>
This repository contains machine learning models optimized for the Edge TPU in
Pixel 6's SoC,
[Google Tensor](https://blog.google/products/pixel/google-tensor-debuts-new-pixel-6-fall/).
We use Neural Architecture Search (NAS) to automate the process of designing ML
models and incentivize the search algorithms to discover models that achieve
higher quality as well as better latency and computing efficiency. This
automation also allows us to scale the development of ML models for a variety of
on-device tasks. We’re making these ML models publicly available through the
TensorFlow Model Garden and [TensorFlow Hub](https://tfhub.dev/s?q=edgetpu) to
enable researchers and developers to bootstrap further use case development on
Pixel 6.
### [Image Classification](https://github.com/tensorflow/models/tree/master/official/projects/edgetpu/vision#edgetpu-optimized-vision-models)
### [Object Detection](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf1_detection_zoo.md#pixel-6-edge-tpu-models)
### [Semantic Segmentation](https://github.com/tensorflow/models/tree/master/official/projects/edgetpu/vision#edgetpu-optimized-vision-models)
### [Natural Language Understanding](https://github.com/tensorflow/models/tree/master/official/projects/edgetpu/nlp#mobilebert-edgetpu)
# EdgeTPU-optimized Vision Models
## Overview
This project includes computer vision models optimized for the Edge TPU featured
in Pixel phones, Coral products, and more. These models significantly improve the
latency- and energy-vs-accuracy pareto-frontier compared to existing SOTA models
when running on Edge TPU devices.
## Image classification task
### Introduction
We present a family of computer vision models, MobileNetEdgeTPUV2, optimized for
the next-generation Edge TPU ML accelerator in the Google Tensor SoC that powers
the Pixel 6 phones. These models improve the latency-accuracy pareto-frontier
compared to existing SOTA on-device models, including their predecessor,
MobileNetEdgeTPU. MobileNetEdgeTPUV2 can be used as a standalone image
classification model or as a backbone for other computer vision tasks such as
object detection or semantic segmentation.
### Search space design
During the design of MobileNetEdgeTPUV2 we crafted a neural-network search space
that includes building blocks which run efficiently on the Edge TPU accelerator
while providing better algorithmic quality, and we leveraged AutoML to find the
optimal architectures. As one of the key optimizations, we introduce Group
Convolution based Inverted Bottleneck (IBN) blocks, which provide great
flexibility in trading off latency against accuracy.
The Inverted Bottleneck (IBN) is a widely used building block in neural networks
for mobile vision tasks. A conventional IBN uses pointwise convolutions for
expansion/projection before/after a depthwise convolution. It has previously been
shown that replacing the pointwise expansion and depthwise convolution with a
single full convolution can provide more trainable parameters while being faster.
However, one big limitation is that such full-convolution IBNs can become very
expensive in terms of latency and memory, especially for the narrow/deep tensors
seen in the later stages of vision models. This limits the use of “fused”
full-convolution IBNs throughout the model and leaves the depthwise IBN as the
only alternative.
<figure align="center">
<img width=70% src=https://storage.googleapis.com/tf_model_garden/models/edgetpu/images/readme-ibn-intro.png>
<figcaption><i>Inverted bottleneck block (IBN) variants: (a) Conventional with depthwise, (b) Fused-IBN, (c) GC-IBN with group convolutions in the expansion phase</i></figcaption>
</figure>
In this work we utilize Group Convolution (GC) as part of the fused expansion
when constructing IBNs (Figure 1). The GC-based IBN becomes a versatile block
that opens up a large design space between conventional depthwise IBNs and fused
full-convolution IBNs, controlled by the group-size parameter. Figure 2
demonstrates the search space enabled by GC-based IBNs, which allows a flexible
tradeoff between latency and the number of trainable parameters. GC-based IBNs
allow the number of trainable parameters to be increased gradually without
incurring the latency cost of full-convolution IBNs. Moreover, they can also be
faster than conventional IBNs with depthwise convolutions while providing more
trainable parameters.
<figure align="center">
<img width=60% src=https://storage.googleapis.com/tf_model_garden/models/edgetpu/images/readme-gc-comparison.png>
</figure>
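As a rough illustration (not the exact searched block), a GC-IBN can be sketched with standard Keras layers by passing a `groups` argument to the expansion convolution; `expansion`, `groups`, and `stride` below are made-up hyperparameters:
```python
import tensorflow as tf

def gc_ibn(x, out_filters, expansion=4, groups=4, stride=1, kernel_size=3):
  """Sketch of a group-convolution inverted bottleneck (GC-IBN) block."""
  in_filters = x.shape[-1]
  # Fused expansion: one grouped KxK convolution replaces the pointwise
  # expansion + depthwise convolution of a conventional IBN.
  y = tf.keras.layers.Conv2D(in_filters * expansion, kernel_size,
                             strides=stride, padding='same',
                             groups=groups, use_bias=False)(x)
  y = tf.keras.layers.BatchNormalization()(y)
  y = tf.keras.layers.ReLU()(y)
  # Pointwise projection back to the output width.
  y = tf.keras.layers.Conv2D(out_filters, 1, use_bias=False)(y)
  y = tf.keras.layers.BatchNormalization()(y)
  if stride == 1 and in_filters == out_filters:
    y = y + x  # residual connection
  return y
```
Setting `groups=1` recovers the fused full-convolution IBN, while larger group counts move the block toward the cost profile of a depthwise IBN.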
### Model performance on Edge TPU
The tradeoffs discussed above and exemplified in Figure 2 are highly dependent on
the tensor shapes and cannot be generalized across the whole neural network.
Hence, we turn to AutoML techniques to find the optimal block decisions and craft
a family of network architectures at different latency targets. Figure 3
demonstrates that the resulting MobileNetEdgeTPUV2 model family improves the
pareto-frontier compared to the existing on-device SOTA models when run on the
Edge TPU.
<figure align="center">
<img width=70% src=https://storage.googleapis.com/tf_model_garden/models/edgetpu/images/readme-edgetpu-classification-plot.png>
<figcaption><i>Comparison of Imagenet top-1 accuracy and Pixel 6 Edge TPU latency of MobileNetEdgeTPUV2 models with other on-device classification models</i></figcaption>
</figure>
#### On-device benchmarking of classification models
Results from on-device benchmarking of various int8-quantized image
classification models at 224x224 input resolution:
Model (Checkpoint) | Accuracy (int8) | Pixel 6 Edge TPU Latency (ms) | tflite
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------: | :---------------------------: | :----:
[MobileNetEdgeTPUv2-Tiny](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v2/tiny/mobilenet-edgetpu-v2-tiny.tar.gz) | 74.66% | 0.78 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v2/tiny/mobilenet_edgetpu_v2_tiny.tflite)
[MobileNetEdgeTPUv2-XS](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v2/tiny/mobilenet-edgetpu-v2-xs.tar.gz) | 75.79% | 0.82 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v2/tiny/mobilenet_edgetpu_v2_xs.tflite)
[MobileNetEdgeTPUv2-S](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v2/tiny/mobilenet-edgetpu-v2-s.tar.gz) | 77.36% | 1.03 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v2/tiny/mobilenet_edgetpu_v2_s.tflite)
[MobileNetEdgeTPUv2-M](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v2/tiny/mobilenet-edgetpu-v2-m.tar.gz) | 78.43% | 1.35 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v2/tiny/mobilenet_edgetpu_v2_m.tflite)
[MobileNetEdgeTPUv2-L](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v2/tiny/mobilenet-edgetpu-v2-l.tar.gz) | 79.00% | 1.64 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v2/tiny/mobilenet_edgetpu_v2_l.tflite)
[MobileNetEdgeTPU dm1.0](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v1/dm1p0/mobilenet-edgetpu-dm1p0.tar.gz) | 75.6% | 0.92 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v1/dm1p0/mobilenet_edgetpu.tflite)
[MobileNetEdgeTPU dm1.25](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v1/dm1p25/mobilenet-edgetpu-dm1p25.tar.gz) | 77.06% | 1.20 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v1/dm1p25/mobilenet_edgetpu_dm1p25.tflite)
[MobileNetEdgeTPU dm1.5](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v1/dm1p5/mobilenet-edgetpu-dm1p5.tar.gz) | 75.9% | 1.42 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v1/dm1p5/mobilenet_edgetpu_dm1p5.tflite)
[MobileNetEdgeTPU dm1.75](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v1/dm1p75/mobilenet-edgetpu-dm1p75.tar.gz) | 78.6% | 1.93 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/mobilenet-edgetpu-v1/dm1p75/mobilenet_edgetpu_dm1p75.tflite)
### Model performance on Pixel 6 CPU
Our primary optimization target is the Edge TPU accelerator; however, our search
space also includes operations that run well on the Pixel 6 CPU, so that the
models can reach a wide range of platforms. Moreover, we implement GC using a
functionally equivalent series of commonly used ML primitives (channelwise slice,
full convolution, concatenation), as shown in Figure 2, since a native GC
operation may not be supported on all target platforms. As a result, the
performance of MobileNetEdgeTPUV2 is also superior to other on-device models when
run on the Pixel 6 CPU, as shown in Figure 4.
<figure align="center">
<img width=70% src=https://storage.googleapis.com/tf_model_garden/models/edgetpu/images/readme-cpu-classification-plot.png>
<figcaption><i>Comparison of Imagenet top-1 accuracy and Pixel 6 latency of MobileNetEdgeTPUV2 models with other on-device classification models</i></figcaption>
</figure>
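The portability decomposition described above can be sketched with standard ops, splitting the channels, running an ordinary convolution per group, and concatenating the results (a sketch of the idea, not the production implementation):
```python
import tensorflow as tf

def grouped_conv_via_primitives(x, out_filters, groups, kernel_size=3):
  """Grouped convolution expressed as channelwise slice + full conv + concat."""
  slices = tf.split(x, num_or_size_splits=groups, axis=-1)  # channelwise slice
  outputs = [
      tf.keras.layers.Conv2D(out_filters // groups, kernel_size,
                             padding='same')(s)             # full convolution
      for s in slices
  ]
  return tf.concat(outputs, axis=-1)                        # concatenation
```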
## Semantic segmentation task
### Using classification models as backbone
We also present segmentation models based on a MobileNetEdgeTPUV2 backbone and
the DeepLabV3+ decoder and head (first used
[here](https://arxiv.org/pdf/1802.02611.pdf)). These models are optimized for the
next-generation Edge TPU accelerator featured in Pixel 6 phones and improve the
latency-accuracy pareto-frontier compared to their predecessors based on
MobileNetV2 and DeepLabV3+.
The segmentation model is built using the pretrained MobileNetEdgeTPUV2 as a
feature encoder and an ASPP decoder in conjunction with a DeepLabV3+ head.
Separable convolutions are used to reduce the size of the model, as sketched
below.
<figure align="center">
<img width=60% src=https://storage.googleapis.com/tf_model_garden/models/edgetpu/images/readme-seg-flow.png>
<figcaption></figcaption>
</figure>
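As a small illustration of that choice (with made-up filter counts), a depthwise-separable 3x3 convolution replaces the `k*k*C_in*C_out` weights of a standard convolution with roughly `k*k*C_in + C_in*C_out`:
```python
import tensorflow as tf

# A standard 3x3 conv mapping 256 -> 256 channels has 3*3*256*256 ≈ 590k weights;
# the separable version below has about 3*3*256 + 256*256 ≈ 68k.
separable = tf.keras.layers.SeparableConv2D(256, 3, padding='same', use_bias=False)
```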
#### Using architecture search to find high-quality, low-latency segmentation models
To further improve the quality of on-device segmentation models, we invoke
architecture search to jointly search for the model's feature extractor and the
segmentation head. Autoseg-EdgeTPU is a set of searched segmentation models
customized for the Edge TPU in Pixel 6. The feature extractor is derived from the
Edge TPU search space, where a mixture of IBN and fused-IBN blocks is used. We
automatically find the optimal kernel size, channel multiplier, expansion ratio,
and group count on a per-layer basis using a reinforcement learning algorithm.
The segmentation head is an optimized version of the
[Bi-FPN](https://arxiv.org/abs/1911.09070) head, with a customized number of
repeats and feature selection.
#### Argmax fusion to improve segmentation model latency
The last two levels of the model (bilinear resizing and argmax) contribute
significantly to the on-device latency. This is due to the large activation size
between these layers (512 x 512 x number of classes). These layers can be merged
without significantly impacting quality by computing the argmax at a smaller
resolution and scaling the resulting class map to the desired size with
nearest-neighbor upsampling, as sketched below.
<figure align="center">
<img width=60% src=https://storage.googleapis.com/tf_model_garden/models/edgetpu/images/readme-seg-fused-argmax.png>
</figure>
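A minimal sketch of the fusion (assuming a 512x512 output and logits at a lower resolution such as 128x128): take the argmax over classes at the low resolution, then upsample the class map with nearest-neighbor interpolation instead of bilinearly resizing the full logits first.
```python
import tensorflow as tf

def fused_argmax(logits, output_size=512):
  """Argmax at low resolution, then nearest-neighbor upsampling of the labels."""
  labels = tf.argmax(logits, axis=-1, output_type=tf.int32)  # [B, h, w]
  labels = tf.image.resize(labels[..., tf.newaxis],
                           [output_size, output_size],
                           method='nearest')                 # [B, 512, 512, 1]
  return tf.squeeze(labels, axis=-1)
```
This mirrors the `resize128,argmax,resize512,squeeze` option listed in the export section below.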
### On-device benchmarking of segmentation models
<figure align="center">
<img src=https://storage.googleapis.com/tf_model_garden/models/edgetpu/images/readme-seg-plot.png width=60%>
<figcaption><i>Performance of AutosegEdgeTPU and MobileNetEdgeTPUV2+DeeplabV3+ models on the 32-class ADE20K semantic segmentation task.</i></figcaption>
</figure>
Model Name (Checkpoint) | Backbone | Segmentation Head| #Parameters (million)| ADE20K 32-class mIOU| Pixel 6 EdgeTPU latency (ms)| Tflite |
|:---|:-----------------------:|:----------------:|:--------------------:|:-------------------:|:---------------------------:|:------:|
deeplabv3plus_mobilenet_edgetpuv2_baseline| MobileNet V2 (baseline)| DeeplabV3+ | 2.34 | 54.06% | 7.5 | link |
[deeplabv3plus_mobilenet_edgetpuv2_xs](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/checkpoints/deeplabv3plus_mobilenet_edgetpuv2_xs_ade20k_32/deeplabv3plus_mobilenet_edgetpuv2_xs_ade20k_32.tar.gz)| MobileNetEdgeTPUV2-XS | DeeplabV3+ | 3.6 | 56.02% | 5.2 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/tflite/default_argmax/deeplabv3plus_mobilenet_edgetpuv2_xs_ade20k_32.tflite) |
[deeplabv3plus_mobilenet_edgetpuv2_s](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/checkpoints/deeplabv3plus_mobilenet_edgetpuv2_s_ade20k_32/deeplabv3plus_mobilenet_edgetpuv2_s_ade20k_32.tar.gz)| MobileNetEdgeTPUV2-S | DeeplabV3+ | 5.2 | 59.43% | 5.9 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/tflite/default_argmax/deeplabv3plus_mobilenet_edgetpuv2_s_ade20k_32.tflite) |
[deeplabv3plus_mobilenet_edgetpuv2_m](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/checkpoints/deeplabv3plus_mobilenet_edgetpuv2_m_ade20k_32/deeplabv3plus_mobilenet_edgetpuv2_m_ade20k_32.tar.gz)| MobileNetEdgeTPUV2-M | DeeplabV3+ | 7.7 | 59.81% | 7.2 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/tflite/default_argmax/deeplabv3plus_mobilenet_edgetpuv2_m_ade20k_32.tflite) |
[autoseg_edgetpu_xs](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/checkpoints/autoseg_edgetpu_xs/autoseg_edgetpu_xs.tar.gz)| AutosegEdgeTPU-XS | BiFPN | 2.9 | 59.64% | 5.4 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/tflite/default_argmax/autoseg_edgetpu_xs.tflite) |
[autoseg_edgetpu_s](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/checkpoints/autoseg_edgetpu_s/autoseg_edgetpu_s.tar.gz)| AutosegEdgeTPU-S | BiFPN | 3.1 | 61.31% | 5.7 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/tflite/default_argmax/autoseg_edgetpu_s.tflite) |
By fusing the argmax with the resize operator as shown above, it is possible to
further improve the on-device latency of the segmentation models without
significantly impacting the quality:
Note: Models with default argmax and fused argmax use the same checkpoint, since
there is no parameter change.
| Model Name | ADE20K 32-class mIOU| Pixel 6 EdgeTPU latency (ms)| tflite |
|----------------------|:------------------:|:----------------------:|:---------------------:|
| deeplabv3plus_mobilenet_edgetpuv2_xs | 56% | 3.4 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/tflite/fused_argmax/deeplabv3plus_mobilenet_edgetpuv2_xs_ade20k_32.tflite) |
| deeplabv3plus_mobilenet_edgetpuv2_s | 59.41% | 4.2 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/tflite/fused_argmax/deeplabv3plus_mobilenet_edgetpuv2_s_ade20k_32.tflite) |
| deeplabv3plus_mobilenet_edgetpuv2_m | 59.79% | 5.5 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/tflite/fused_argmax/deeplabv3plus_mobilenet_edgetpuv2_m_ade20k_32.tflite) |
| autoseg_edgetpu_xs | 59.62% | 3.6 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/tflite/fused_argmax/autoseg_edgetpu_xs.tflite) |
| autoseg_edgetpu_s | 61.28% | 3.9 | [link](https://storage.cloud.google.com/tf_model_garden/models/edgetpu/checkpoint_and_tflite/vision/segmentation-edgetpu/tflite/fused_argmax/autoseg_edgetpu_s.tflite) |
## Object detection task
EdgeTPU-optimized models for object detection are hosted in the [TensorFlow object
detection API](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/tf1_detection_zoo.md#pixel-6-edge-tpu-models).
### Training the models
Note that `EXPERIMENT_TYPE` has to be one of the preregistered classification
configs, such as `mobilenet_edgetpu_xs`, for classification models. If you train
a segmentation model, `EXPERIMENT_TYPE` has to be one of the preregistered
segmentation configs, such as `seg_deeplabv3plus_mobilenet_edgetpuv2_s_ade20k` or
`autoseg_edgetpu_xs`.
```
EXPERIMENT_NAME=xxx # Change this for your run, for example, 'mobilenet-edgetpu-test-run'
EXPERIMENT_TYPE=xxx # Change this for your run, for example, 'mobilenet_edgetpu_v2_xs'
$ python3 train.py \
--experiment_name=${EXPERIMENT_NAME} \
--experiment_type=${EXPERIMENT_TYPE} \
--mode=train_and_eval
```
### From training to quantized inference deployment
To export quantized tflite models using tensorflow post-training quantization:
**For classification models**:
```
$ python3 serving/export_tflite.py \
--model_name=${EXPERIMENT_TYPE} \
--ckpt_path=${CHECKPOINT} \
--dataset_dir=/path/to/calibration/dataset \
--output_dir=/tmp \
--quantize \
--image_size=224
```
Note that `EXPERIMENT_TYPE` has to be one of the preregistered classification
configs, such as `mobilenet_edgetpu_xs`.
**For segmentation models**:
```
$ python3 serving/export_tflite.py \
--model_name=${EXPERIMENT_TYPE} \
--ckpt_path=${CHECKPOINT} \
--dataset_dir=/path/to/calibration/dataset \
--output_dir=/tmp \
--quantize \
--quantize_less_restrictive \
--image_size=512 \
--finalize_method=${ARGMAX_FUSION}
```
`EXPERIMENT_TYPE` has to be one of the preregistered segmentation configs,
such as `deeplabv3plus_mobilenet_edgetpuv2_s_ade20k_32`.
`ARGMAX_FUSION` has to be one of the following:
- `resize512,argmax`: Argmax applied after scaling the output to 512x512.
- `resize256,argmax,resize512,squeeze`: Scale the output to 256x256, apply
argmax, then scale to 512x512 using nearest-neighbor upsampling.
- `resize128,argmax,resize512,squeeze`: Scale the output to 128x128, apply
argmax, then scale to 512x512 using nearest-neighbor upsampling.
### On-device benchmarking
The models in this repository are compatible with NNAPI and can be benchmarked
on Pixel 6 devices using the
[tflite benchmark tool](https://www.tensorflow.org/lite/performance/measurement).
When using the benchmark tool, enable NNAPI by setting the `use_nnapi`
command-line argument to `true` and specifying the `nnapi_accelerator_name` as
`google-edgetpu`:
```shell
$ bazel build -c opt --config=android_arm64 tensorflow/lite/tools/benchmark:benchmark_model
# Push binary to device
$ adb push bazel-bin/tensorflow/lite/tools/benchmark/benchmark_model /data/local/tmp
# Push model to device
$ adb push /path/to/model.tflite /data/local/tmp/
# Run on-device benchmarking
$ adb shell /data/local/tmp/benchmark_model --graph=/data/local/tmp/model.tflite \
    --use_nnapi=true --nnapi_accelerator_name=google-edgetpu
```
......@@ -21,11 +21,10 @@ deeplab v3 segmentation head.
import dataclasses
import os
from typing import Optional
from official.core import config_definitions as cfg
from official.core import exp_factory
from official.modeling import hyperparams
from official.modeling import optimization
from official.modeling.hyperparams import config_definitions as cfg
from official.vision.beta.configs import backbones
from official.vision.beta.configs import common
from official.vision.beta.configs import decoders
......
......@@ -17,11 +17,10 @@
import os
import tensorflow as tf
from official.legacy.image_classification import preprocessing
from official.projects.edgetpu.vision.modeling import common_modules
from official.projects.edgetpu.vision.modeling import mobilenet_edgetpu_v1_model
from official.projects.edgetpu.vision.modeling import mobilenet_edgetpu_v1_model_blocks
from official.vision.image_classification import preprocessing
# TODO(b/151324383): Enable once training is supported for mobilenet-edgetpu
EXAMPLE_IMAGE = ('third_party/tensorflow_models/official/vision/'
......
......@@ -110,6 +110,7 @@ def get_export_config_from_flags():
dataset_split=FLAGS.dataset_split)
export_config = export_util.ExportConfig(
model_name=FLAGS.model_name,
output_layer=FLAGS.output_layer,
ckpt_path=FLAGS.ckpt_path,
ckpt_format=FLAGS.ckpt_format,
output_dir=FLAGS.output_dir,
......
......@@ -69,7 +69,9 @@ class ExportConfig(base_config.Config):
"""Configuration for exporting models as tflite and saved_models.
Attributes:
model_name: One of the registered model names
model_name: One of the registered model names.
output_layer: Layer name to take the output from. Can be used to take the
output from an intermediate layer.
ckpt_path: Path of the training checkpoint. If not provided tflite with
random parameters is exported.
ckpt_format: Format of the checkpoint. tf_checkpoint is for ckpt files from
......@@ -92,7 +94,8 @@ class ExportConfig(base_config.Config):
resize bilinear to 128x128, then argmax then resize nn to 512x512
"""
quantization_config: QuantizationConfig = QuantizationConfig()
model_name: str = None
model_name: Optional[str] = None
output_layer: Optional[str] = None
ckpt_path: Optional[str] = None
ckpt_format: Optional[str] = 'tf_checkpoint'
output_dir: str = '/tmp/'
......
......@@ -112,7 +112,6 @@ class EdgeTPUTask(base_task.Task):
else:
raise ValueError('Model has to be mobilenet-edgetpu model or searched'
'model with given saved model path.')
model.summary()
return model
......
......@@ -19,8 +19,8 @@ from absl import logging
import tensorflow as tf
from official.common import dataset_fn
from official.core import config_definitions as cfg
from official.core import task_factory
from official.modeling.hyperparams import config_definitions as cfg
from official.projects.edgetpu.vision.configs import semantic_segmentation_config as exp_cfg
from official.projects.edgetpu.vision.configs import semantic_segmentation_searched_config as searched_cfg
from official.projects.edgetpu.vision.modeling import mobilenet_edgetpu_v1_model
......
......@@ -12,6 +12,3 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Keras-CV layers package definition."""
from official.vision.keras_cv.losses.focal_loss import FocalLoss
from official.vision.keras_cv.losses.loss_utils import multi_level_flatten