"vscode:/vscode.git/clone" did not exist on "b93fbddca83ee2f31a2dd92b548c97593f800048"
Unverified commit 43178d7f, authored by Ayushman Kumar and committed by GitHub

Merge pull request #1 from tensorflow/master

Updated
parents 8b47aa3d 75d13042
@@ -172,7 +172,7 @@ class BertSquadBenchmarkReal(BertSquadBenchmarkBase):
                     run_eagerly=False,
                     ds_type='mirrored'):
    """Runs the benchmark and reports various metrics."""
-   if FLAGS.train_batch_size <= 4:
+   if FLAGS.train_batch_size <= 4 or run_eagerly:
      FLAGS.input_meta_data_path = SQUAD_MEDIUM_INPUT_META_DATA_PATH
    else:
      FLAGS.input_meta_data_path = SQUAD_LONG_INPUT_META_DATA_PATH
......
@@ -143,12 +143,10 @@ class Config(params_dict.ParamsDict):
    return subconfig_type

  def __post_init__(self, default_params, restrictions, *args, **kwargs):
-   logging.error('DEBUG before init %r', type(self))
    super().__init__(default_params=default_params,
                     restrictions=restrictions,
                     *args,
                     **kwargs)
-   logging.error('DEBUG after init %r', type(self))

  def _set(self, k, v):
    """Overrides same method in ParamsDict.
@@ -246,3 +244,71 @@ class Config(params_dict.ParamsDict):
    default_params = {a: p for a, p in zip(attributes, args)}
    default_params.update(kwargs)
    return cls(default_params)
+
+
+@dataclasses.dataclass
+class RuntimeConfig(Config):
+  """High-level configurations for Runtime.
+
+  These include parameters that are not directly related to the experiment,
+  e.g. directories, accelerator type, etc.
+
+  Attributes:
+    distribution_strategy: e.g. 'mirrored', 'tpu', etc.
+    enable_eager: Whether or not to enable eager mode.
+    enable_xla: Whether or not to enable XLA.
+    per_gpu_thread_count: thread count per GPU.
+    gpu_threads_enabled: Whether or not GPU threads are enabled.
+    gpu_thread_mode: Whether and how the GPU device uses its own threadpool.
+    dataset_num_private_threads: Number of threads for a private threadpool
+      created for all datasets computation.
+    tpu: The address of the TPU to use, if any.
+    num_gpus: The number of GPUs to use, if any.
+    worker_hosts: comma-separated list of worker ip:port pairs for running
+      multi-worker models with DistributionStrategy.
+    task_index: If multi-worker training, the task index of this worker.
+    all_reduce_alg: Defines the algorithm for performing all-reduce.
+  """
+  distribution_strategy: str = 'mirrored'
+  enable_eager: bool = False
+  enable_xla: bool = False
+  gpu_threads_enabled: bool = False
+  gpu_thread_mode: Optional[str] = None
+  dataset_num_private_threads: Optional[int] = None
+  per_gpu_thread_count: int = 0
+  tpu: Optional[str] = None
+  num_gpus: int = 0
+  worker_hosts: Optional[str] = None
+  task_index: int = -1
+  all_reduce_alg: Optional[str] = None
+
+
+@dataclasses.dataclass
+class TensorboardConfig(Config):
+  """Configuration for Tensorboard.
+
+  Attributes:
+    track_lr: Whether or not to track the learning rate in Tensorboard.
+      Defaults to True.
+    write_model_weights: Whether or not to write the model weights as
+      images in Tensorboard. Defaults to False.
+  """
+  track_lr: bool = True
+  write_model_weights: bool = False
+
+
+@dataclasses.dataclass
+class CallbacksConfig(Config):
+  """Configuration for Callbacks.
+
+  Attributes:
+    enable_checkpoint_and_export: Whether or not to enable checkpoints as a
+      Callback. Defaults to True.
+    enable_tensorboard: Whether or not to enable Tensorboard as a Callback.
+      Defaults to True.
+  """
+  enable_checkpoint_and_export: bool = True
+  enable_tensorboard: bool = True
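For readers skimming the diff: the three classes above lean on stdlib dataclasses for typed, defaulted config fields. Below is a minimal self-contained sketch of the pattern; the `RuntimeConfigSketch` name is hypothetical, and the real `Config` base (via `params_dict.ParamsDict`) adds validation and restriction handling that is omitted here.

```python
import dataclasses
from typing import Optional


@dataclasses.dataclass
class RuntimeConfigSketch:
  """Stripped-down illustration of the dataclass-as-config pattern."""
  distribution_strategy: str = 'mirrored'
  enable_eager: bool = False
  num_gpus: int = 0
  tpu: Optional[str] = None


# Defaults apply unless a field is overridden in the constructor.
cfg = RuntimeConfigSketch(num_gpus=8, enable_eager=True)
assert cfg.distribution_strategy == 'mirrored'
assert cfg.num_gpus == 8
```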
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based attention layer."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -45,7 +45,7 @@ class Attention(tf.keras.layers.Layer):
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

-  Attributes:
+  Arguments:
    num_heads: Number of attention heads.
    head_size: Size of each attention head.
    dropout: Dropout probability.
@@ -186,7 +186,7 @@ class Attention(tf.keras.layers.Layer):
class CachedAttention(Attention):
  """Attention layer with cache, used for auto-regressive decoding.

-  Attributes:
+  Arguments:
    num_heads: Number of attention heads.
    head_size: Size of each attention head.
    **kwargs: Other keyword arguments inherited from the `Attention` class.
......
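For context on what `Attention` computes, here is a minimal scaled dot-product attention in plain TensorFlow. It is an illustrative sketch only, not this layer's implementation: the per-head projections and the multi-head split/concat described in the docstring are omitted.

```python
import tensorflow as tf


def scaled_dot_product_attention(query, key, value):
  # query/key/value: [batch, seq_len, head_size]
  head_size = tf.cast(tf.shape(query)[-1], query.dtype)
  scores = tf.matmul(query, key, transpose_b=True) / tf.sqrt(head_size)
  probs = tf.nn.softmax(scores, axis=-1)  # attention probabilities
  return tf.matmul(probs, value)          # values interpolated by the probabilities


q = tf.random.normal([2, 5, 64])
out = scaled_dot_product_attention(q, q, q)  # self-attention
print(out.shape)  # (2, 5, 64)
```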
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based einsum layer."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -30,7 +30,7 @@ class DenseEinsum(tf.keras.layers.Layer):
  This layer can perform einsum calculations of arbitrary dimensionality.

-  Attributes:
+  Arguments:
    output_shape: Positive integer or tuple, dimensionality of the output space.
    num_summed_dimensions: The number of dimensions to sum over. Standard 2D
      matmul should use 1, 3D matmul should use 2, and so forth.
......
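A quick sketch of the computation the docstring describes, using plain `tf.einsum` rather than the layer itself: summing over the last two input axes corresponds to the `num_summed_dimensions=2` case.

```python
import tensorflow as tf

# 'abcd,cde->abe' sums over the last two input axes (c and d);
# a standard 2D matmul would be 'ab,bc->ac' (one summed dimension).
inputs = tf.random.normal([2, 5, 8, 16])  # batch, seq, two feature axes
kernel = tf.random.normal([8, 16, 32])
outputs = tf.einsum('abcd,cde->abe', inputs, kernel)
print(outputs.shape)  # (2, 5, 32)
```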
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based softmax layer with optional masking."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -26,7 +26,7 @@ import tensorflow as tf
class MaskedSoftmax(tf.keras.layers.Layer):
  """Performs a softmax with optional masking on a tensor.

-  Attributes:
+  Arguments:
    mask_expansion_axes: Any axes that should be padded on the mask tensor.
  """
......
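The usual trick behind a masked softmax, shown with plain TensorFlow ops (a sketch of the idea, not this layer's code): add a large negative bias to masked positions so they receive near-zero probability.

```python
import tensorflow as tf

scores = tf.random.normal([2, 4])
mask = tf.constant([[1., 1., 0., 0.],
                    [1., 1., 1., 0.]])  # 1 = keep, 0 = mask out
adder = (1.0 - mask) * -10000.0         # large negative bias on masked slots
probs = tf.nn.softmax(scores + adder, axis=-1)
# Masked positions end up with ~0 probability mass.
```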
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based one-hot embedding layer."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -31,7 +31,7 @@ class OnDeviceEmbedding(tf.keras.layers.Layer):
  This layer uses either tf.gather or tf.one_hot to translate integer indices to
  float embeddings.

-  Attributes:
+  Arguments:
    vocab_size: Number of elements in the vocabulary.
    embedding_width: Output size of the embedding layer.
    initializer: The initializer to use for the embedding weights. Defaults to
......
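The two lookup strategies the docstring mentions, side by side in plain TensorFlow; they produce identical embeddings but favor different hardware (the one-hot matmul is the common choice on TPUs).

```python
import tensorflow as tf

vocab_size, embedding_width = 100, 16
table = tf.random.normal([vocab_size, embedding_width])
ids = tf.constant([[3, 7, 42]])

via_gather = tf.gather(table, ids)
one_hot = tf.one_hot(ids, depth=vocab_size, dtype=table.dtype)
via_one_hot = tf.einsum('bsv,ve->bse', one_hot, table)  # matmul against the table

tf.debugging.assert_near(via_gather, via_one_hot)
```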
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based positional embedding layer."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -37,7 +37,7 @@ class PositionEmbedding(tf.keras.layers.Layer):
  can have a dynamic 1st dimension, while if `use_dynamic_slicing` is False the
  input size must be fixed.

-  Attributes:
+  Arguments:
    use_dynamic_slicing: Whether to use the dynamic slicing path.
    max_sequence_length: The maximum size of the dynamic sequence. Only
      applicable if `use_dynamic_slicing` is True.
......
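A sketch of the dynamic-slicing idea in plain TensorFlow: keep a `max_sequence_length`-sized table and slice it to the actual input length at call time, so the first dimension of the input can vary.

```python
import tensorflow as tf

max_sequence_length, width = 512, 16
table = tf.random.normal([max_sequence_length, width])  # learned in the layer

inputs = tf.random.normal([2, 128, width])              # dynamic length 128
seq_len = tf.shape(inputs)[1]
position_embeddings = table[:seq_len, :]                # slice at call time
outputs = inputs + position_embeddings[tf.newaxis, ...]
print(outputs.shape)  # (2, 128, 16)
```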
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based transformer block layer."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -32,7 +32,7 @@ class Transformer(tf.keras.layers.Layer):
  This layer implements the Transformer from "Attention Is All You Need"
  (https://arxiv.org/abs/1706.03762).

-  Attributes:
+  Arguments:
    num_attention_heads: Number of attention heads.
    intermediate_size: Size of the intermediate layer.
    intermediate_activation: Activation for the intermediate layer.
......
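For orientation, a block with this shape (self-attention plus an "intermediate" feed-forward layer, each followed by a residual connection and layer norm) can be sketched with stock Keras layers. This uses today's `tf.keras.layers.MultiHeadAttention` (available since TF 2.4), not the layer added in this diff.

```python
import tensorflow as tf

hidden_size, intermediate_size, num_heads = 64, 256, 4
x = tf.random.normal([2, 5, hidden_size])

attn = tf.keras.layers.MultiHeadAttention(
    num_heads=num_heads, key_dim=hidden_size // num_heads)
h = tf.keras.layers.LayerNormalization()(x + attn(x, x))  # attention + residual
ffn = tf.keras.layers.Dense(intermediate_size, activation='relu')(h)  # intermediate
out = tf.keras.layers.LayerNormalization()(
    h + tf.keras.layers.Dense(hidden_size)(ffn))          # project back + residual
print(out.shape)  # (2, 5, 64)
```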
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Keras-based transformer scaffold layer."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -35,7 +35,7 @@ class TransformerScaffold(tf.keras.layers.Layer):
  `attention_cfg`, in which case the scaffold will instantiate the class with
  the config, or pass a class instance to `attention_cls`.

-  Attributes:
+  Arguments:
    num_attention_heads: Number of attention heads.
    intermediate_size: Size of the intermediate layer.
    intermediate_activation: Activation for the intermediate layer.
......
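The class-or-instance injection described above is a small pattern worth spelling out. A hypothetical helper (`resolve` is not part of the scaffold's API) makes it concrete:

```python
import tensorflow as tf


def resolve(cls_or_instance, cfg=None):
  """Accept either a class plus a config dict, or a ready-made instance."""
  if isinstance(cls_or_instance, type):   # got a class: build it from cfg
    return cls_or_instance(**(cfg or {}))
  return cls_or_instance                  # got an instance: use as-is


dense_a = resolve(tf.keras.layers.Dense, {'units': 8})  # class + config
dense_b = resolve(tf.keras.layers.Dense(units=8))       # instance passed through
```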
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""ALBERT (https://arxiv.org/abs/1909.11942) text encoder network."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -41,7 +41,7 @@ class AlbertTransformerEncoder(network.Network):
  The default values for this object are taken from the ALBERT-Base
  implementation described in the paper.

-  Attributes:
+  Arguments:
    vocab_size: The size of the token vocabulary.
    embedding_width: The width of the word embeddings. If the embedding width
      is not equal to hidden size, embedding parameters will be factorized into
......
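A sketch of the embedding factorization the docstring refers to: when `embedding_width != hidden_size`, one `[vocab, hidden]` table is replaced by a `[vocab, embedding_width]` table plus an `[embedding_width, hidden]` projection. Plain Keras stand-ins below, not the ALBERT network's own code.

```python
import tensorflow as tf

vocab_size, embedding_width, hidden_size = 30000, 128, 768
ids = tf.constant([[1, 2, 3]])

embed = tf.keras.layers.Embedding(vocab_size, embedding_width)  # [V, E] table
project = tf.keras.layers.Dense(hidden_size, use_bias=False)    # [E, H] projection
hidden_states = project(embed(ids))                             # [1, 3, 768]

# Parameter count: V*E + E*H ~= 3.94M vs. an unfactorized V*H = 23.04M.
```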
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Trainer network for BERT-style models."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -36,7 +36,7 @@ class BertClassifier(tf.keras.Model):
  instantiates a classification network based on the passed `num_classes`
  argument.

-  Attributes:
+  Arguments:
    network: A transformer network. This network should output a sequence output
      and a classification output. Furthermore, it should expose its embedding
      table via a "get_embedding_table" method.
......
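The wrap-a-network pattern these trainer models share can be sketched with stock Keras: a "network" that yields a sequence output and a pooled classification output, and a classifier model that adds a dense head on top. All layer choices below (embedding, average pooling) are illustrative stand-ins, not the BERT encoder.

```python
import tensorflow as tf

seq_len, hidden_size, num_classes = 16, 32, 3

# A stand-in "network" producing a sequence output and a pooled output.
word_ids = tf.keras.Input(shape=(seq_len,), dtype=tf.int32)
sequence_output = tf.keras.layers.Embedding(100, hidden_size)(word_ids)
cls_output = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
network = tf.keras.Model(word_ids, [sequence_output, cls_output])

# The classifier wraps the network and adds a dense head on the pooled output.
_, pooled = network(word_ids)
logits = tf.keras.layers.Dense(num_classes)(pooled)
classifier = tf.keras.Model(word_ids, logits)
classifier.summary()
```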
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Trainer network for BERT-style models."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -37,7 +37,7 @@ class BertPretrainer(tf.keras.Model):
  instantiates the masked language model and classification networks that are
  used to create the training objectives.

-  Attributes:
+  Arguments:
    network: A transformer network. This network should output a sequence output
      and a classification output. Furthermore, it should expose its embedding
      table via a "get_embedding_table" method.
......
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Trainer network for BERT-style models."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -35,7 +35,7 @@ class BertSpanLabeler(tf.keras.Model):
  The BertSpanLabeler allows a user to pass in a transformer stack, and
  instantiates a span labeling network based on a single dense layer.

-  Attributes:
+  Arguments:
    network: A transformer network. This network should output a sequence output
      and a classification output. Furthermore, it should expose its embedding
      table via a "get_embedding_table" method.
......
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Classification network."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -31,7 +31,7 @@ class Classification(network.Network):
  This network implements a simple classifier head based on a dense layer.

-  Attributes:
+  Arguments:
    input_width: The innermost dimension of the input tensor to this network.
    num_classes: The number of classes that this network should classify to.
    activation: The activation, if any, for the dense layer in this network.
......
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Transformer-based text encoder network."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -46,7 +46,7 @@ class EncoderScaffold(network.Network):
  If the hidden_cls is not overridden, a default transformer layer will be
  instantiated.

-  Attributes:
+  Arguments:
    num_output_classes: The output size of the classification layer.
    classification_layer_initializer: The initializer for the classification
      layer.
......
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Masked language model network."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -32,7 +32,7 @@ class MaskedLM(network.Network):
  This network implements a masked language model based on the provided network.
  It assumes that the network being passed has a "get_embedding_table()" method.

-  Attributes:
+  Arguments:
    input_width: The innermost dimension of the input tensor to this network.
    num_predictions: The number of predictions to make per sequence.
    source_network: The network with the embedding layer to use for the
......
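The `get_embedding_table()` contract exists so the LM head can tie its output weights to the input embeddings. A minimal sketch of that scoring step in plain TensorFlow:

```python
import tensorflow as tf

vocab_size, hidden_size = 100, 32
embedding_table = tf.random.normal([vocab_size, hidden_size])  # from get_embedding_table()
transformer_output = tf.random.normal([2, 5, hidden_size])

# Output logits are scored against the same table used for input embeddings.
logits = tf.matmul(transformer_output, embedding_table, transpose_b=True)
print(logits.shape)  # (2, 5, 100)
```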
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Span labeling network."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -31,7 +31,7 @@ class SpanLabeling(network.Network):
  This network implements a simple single-span labeler based on a dense layer.

-  Attributes:
+  Arguments:
    input_width: The innermost dimension of the input tensor to this network.
    activation: The activation, if any, for the dense layer in this network.
    initializer: The initializer for the dense layer in this network. Defaults to
......
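Single-span labeling with one dense layer amounts to producing start and end logits per token; a sketch in plain TensorFlow:

```python
import tensorflow as tf

sequence_output = tf.random.normal([2, 16, 32])         # [batch, seq, hidden]
logits = tf.keras.layers.Dense(2)(sequence_output)      # start/end per token
start_logits, end_logits = tf.unstack(logits, axis=-1)  # each [2, 16]
```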
@@ -13,7 +13,7 @@
# limitations under the License.
# ==============================================================================
"""Transformer-based text encoder network."""
+# pylint: disable=g-classes-have-attributes
from __future__ import absolute_import
from __future__ import division
# from __future__ import google_type_annotations
@@ -40,7 +40,7 @@ class TransformerEncoder(network.Network):
  in "BERT: Pre-training of Deep Bidirectional Transformers for Language
  Understanding".

-  Attributes:
+  Arguments:
    vocab_size: The size of the token vocabulary.
    hidden_size: The size of the transformer hidden layers.
    num_layers: The number of transformer layers.
......
@@ -21,7 +21,9 @@ customization of freeze_bn_delay.
"""
import re
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+from tensorflow.contrib import layers as contrib_layers
+from tensorflow.contrib import quantize as contrib_quantize
from tensorflow.contrib.quantize.python import common
from tensorflow.contrib.quantize.python import input_to_ops
from tensorflow.contrib.quantize.python import quant_ops
@@ -72,17 +74,18 @@ def build(graph_rewriter_config,
    # Quantize the graph by inserting quantize ops for weights and activations
    if is_training:
-      tf.contrib.quantize.experimental_create_training_graph(
+      contrib_quantize.experimental_create_training_graph(
          input_graph=graph,
          quant_delay=graph_rewriter_config.quantization.delay,
          freeze_bn_delay=graph_rewriter_config.quantization.delay)
    else:
-      tf.contrib.quantize.experimental_create_eval_graph(
+      contrib_quantize.experimental_create_eval_graph(
          input_graph=graph,
          quant_delay=graph_rewriter_config.quantization.delay
          if not is_export else 0)
-    tf.contrib.layers.summarize_collection('quant_vars')
+    contrib_layers.summarize_collection('quant_vars')
  return graph_rewrite_fn
......
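For readers unfamiliar with the rewrite being invoked: under TF 1.x, `contrib_quantize.experimental_create_training_graph` inserts fake-quantization ops into an existing graph, with quantization activating after `quant_delay` steps. A minimal TF 1.x-only sketch follows; it will not run on TF 2.x, where `tensorflow.contrib` no longer exists.

```python
import tensorflow.compat.v1 as tf
from tensorflow.contrib import quantize as contrib_quantize  # TF 1.x only

g = tf.Graph()
with g.as_default():
  x = tf.placeholder(tf.float32, [None, 4])
  y = tf.layers.dense(x, 2)  # a trivial graph to rewrite
  # Insert fake-quant ops; quantization kicks in after `quant_delay` steps.
  contrib_quantize.experimental_create_training_graph(
      input_graph=g, quant_delay=10, freeze_bn_delay=10)
```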
@@ -15,7 +15,9 @@
"""Tests for graph_rewriter_builder."""
import mock
-import tensorflow as tf
+import tensorflow.compat.v1 as tf
+from tensorflow.contrib import layers as contrib_layers
+from tensorflow.contrib import quantize as contrib_quantize
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from lstm_object_detection.builders import graph_rewriter_builder
@@ -27,9 +29,9 @@ class QuantizationBuilderTest(tf.test.TestCase):

  def testQuantizationBuilderSetsUpCorrectTrainArguments(self):
    with mock.patch.object(
-        tf.contrib.quantize,
+        contrib_quantize,
        'experimental_create_training_graph') as mock_quant_fn:
-      with mock.patch.object(tf.contrib.layers,
+      with mock.patch.object(contrib_layers,
                             'summarize_collection') as mock_summarize_col:
        graph_rewriter_proto = graph_rewriter_pb2.GraphRewriter()
        graph_rewriter_proto.quantization.delay = 10
@@ -44,9 +46,9 @@ class QuantizationBuilderTest(tf.test.TestCase):
    mock_summarize_col.assert_called_with('quant_vars')

  def testQuantizationBuilderSetsUpCorrectEvalArguments(self):
-    with mock.patch.object(tf.contrib.quantize,
+    with mock.patch.object(contrib_quantize,
                           'experimental_create_eval_graph') as mock_quant_fn:
-      with mock.patch.object(tf.contrib.layers,
+      with mock.patch.object(contrib_layers,
                             'summarize_collection') as mock_summarize_col:
        graph_rewriter_proto = graph_rewriter_pb2.GraphRewriter()
        graph_rewriter_proto.quantization.delay = 10
......
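The test structure above is the standard `mock.patch.object` pattern: temporarily replace an attribute on a module object, call through, and assert on the recorded call. A self-contained stdlib example:

```python
from unittest import mock
import math

# Patch an attribute on a module object for the duration of the block,
# then assert on how it was called.
with mock.patch.object(math, 'sqrt') as mock_sqrt:
  math.sqrt(16)
  mock_sqrt.assert_called_with(16)

assert math.sqrt(16) == 4.0  # the real function is restored on exit
```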