Commit 905f8871 authored by A. Unique TensorFlower

Merge pull request #9575 from SamuelMarks:args-for-google-style-docstrings-official

PiperOrigin-RevId: 348853056
parents 9e006cf7 90979a21
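The change itself is mechanical: every docstring section previously headed `Arguments:` now uses the `Args:` heading expected by Google-style Python docstrings. A minimal before/after sketch with a made-up function (for illustration only, not part of the diff):

```python
# Hypothetical function, for illustration only.

# Before: the section header this PR replaces.
def scale(value, factor):
  """Scales a value.

  Arguments:
    value: The number to scale.
    factor: Multiplier applied to `value`.

  Returns:
    The scaled value.
  """
  return value * factor


# After: Google-style docstrings use `Args:` for the parameter section.
def scale_fixed(value, factor):
  """Scales a value.

  Args:
    value: The number to scale.
    factor: Multiplier applied to `value`.

  Returns:
    The scaled value.
  """
  return value * factor
```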
@@ -43,7 +43,7 @@ class BestCheckpointExporter:
def __init__(self, export_dir: str, metric_name: str, metric_comp: str):
"""Initialization.
-Arguments:
+Args:
export_dir: The directory that will contain exported checkpoints.
metric_name: Indicates which metric to look at, when determining which
result is better.
...
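A usage sketch for the constructor in the hunk above. The import path, the metric name, and the meaning of `metric_comp` are assumptions; only the three parameter names come from the diff:

```python
# Hedged usage sketch; import path assumed.
from official.core.train_utils import BestCheckpointExporter

exporter = BestCheckpointExporter(
    export_dir='/tmp/best_ckpt',   # directory that will hold exported checkpoints
    metric_name='accuracy',        # metric used to decide which result is better
    metric_comp='higher')          # assumed: how metric values are compared
```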
@@ -27,7 +27,7 @@ def export_bert_model(model_export_path: typing.Text,
restore_model_using_load_weights: bool = False) -> None:
"""Export BERT model for serving which does not include the optimizer.
-Arguments:
+Args:
model_export_path: Path to which exported model will be saved.
model: Keras model object to export.
checkpoint_dir: Path from which model weights will be loaded, if
...
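A usage sketch for `export_bert_model` from the hunk above. The module path and the model object are placeholders; the parameter names and the `restore_model_using_load_weights` default come from the diff:

```python
from official.nlp.bert import model_saving_utils  # module path assumed

bert_model = ...  # a tf.keras.Model for BERT, built elsewhere (placeholder)
model_saving_utils.export_bert_model(
    model_export_path='/tmp/bert_export',     # where the exported model is saved
    model=bert_model,
    checkpoint_dir='/tmp/bert_checkpoints',   # weights are restored from here
    restore_model_using_load_weights=False)   # default per the signature above
```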
@@ -132,7 +132,7 @@ def run_customized_training_loop(
allreduce_bytes_per_pack=0):
"""Run BERT pretrain model training using low-level API.
-Arguments:
+Args:
_sentinel: Used to prevent positional parameters. Internal, do not use.
strategy: Distribution strategy on which to run low level training loop.
model_fn: Function that returns a tuple (model, sub_model). Caller of this
...
@@ -944,7 +944,7 @@ class XtremePawsxProcessor(DataProcessor):
only_use_en_dev=True):
"""See base class.
-Arguments:
+Args:
process_text_fn: See base class.
translated_data_dir: If specified, will also include translated data in
the training and testing data.
@@ -1061,7 +1061,7 @@ class XtremeXnliProcessor(DataProcessor):
only_use_en_dev=True):
"""See base class.
-Arguments:
+Args:
process_text_fn: See base class.
translated_data_dir: If specified, will also include translated data in
the training data.
@@ -1350,7 +1350,7 @@ def generate_tf_record_from_data_file(processor,
max_seq_length=128):
"""Generates and saves training data into a tf record file.
-Arguments:
+Args:
processor: Input processor object to be used for generating data. Subclass
of `DataProcessor`.
data_dir: Directory that contains train/eval/test data to process.
...
@@ -390,7 +390,7 @@ def _window(iterable, size):
_window(input, 4) => [1, 2, 3, 4]
_window(input, 5) => None
-Arguments:
+Args:
iterable: elements to iterate over.
size: size of the window.
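For context, `_window` is a sliding-window helper: per the documented cases, with `input = [1, 2, 3, 4]` a window of size 4 yields the single window `[1, 2, 3, 4]` and a window of size 5 yields nothing. A minimal reimplementation sketch consistent with that behaviour (not the repository's actual code):

```python
def window_sketch(iterable, size):
  """Yields consecutive windows of `size` elements; illustrative only."""
  items = list(iterable)
  if size > len(items):
    return  # e.g. window_sketch([1, 2, 3, 4], 5) yields nothing
  for start in range(len(items) - size + 1):
    yield items[start:start + size]


assert list(window_sketch([1, 2, 3, 4], 4)) == [[1, 2, 3, 4]]
assert list(window_sketch([1, 2, 3, 4], 5)) == []
```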
@@ -414,7 +414,7 @@ def _window(iterable, size):
def _contiguous(sorted_grams):
"""Test whether a sequence of grams is contiguous.
-Arguments:
+Args:
sorted_grams: _Grams which are sorted in increasing order.
Returns:
True if `sorted_grams` are touching each other.
@@ -454,7 +454,7 @@ def _masking_ngrams(grams, max_ngram_size, max_masked_tokens, rng):
The length of the selected n-gram follows a zipf weighting to
favor shorter n-gram sizes (weight(1)=1, weight(2)=1/2, weight(3)=1/3, ...).
-Arguments:
+Args:
grams: List of one-grams.
max_ngram_size: Maximum number of contiguous one-grams combined to create
an n-gram.
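The zipf weighting mentioned in that docstring simply downweights longer n-grams in proportion to their length, weight(n) = 1/n. A tiny illustration of the unnormalized weights for `max_ngram_size = 3`:

```python
# Unnormalized zipf weights for n-gram lengths 1..max_ngram_size,
# matching the weight(1)=1, weight(2)=1/2, weight(3)=1/3 pattern above.
max_ngram_size = 3
weights = [1.0 / n for n in range(1, max_ngram_size + 1)]
print(weights)  # [1.0, 0.5, 0.333...] -- shorter n-grams are favored
```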
@@ -542,7 +542,7 @@ def _wordpieces_to_grams(tokens):
tokens: ['[CLS]', 'That', 'lit', '##tle', 'blue', 'tru', '##ck', '[SEP]']
grams: [ [1,2), [2, 4), [4,5) , [5, 6)]
-Arguments:
+Args:
tokens: list of wordpieces
Returns:
List of _Grams representing spans of whole words
...
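The idea behind `_wordpieces_to_grams` is grouping wordpieces back into whole words: `##`-prefixed continuation pieces join the preceding token, and special tokens are skipped. An illustrative sketch over the documented example tokens (the real code returns `_Grams` index spans rather than strings):

```python
def group_wordpieces_sketch(tokens):
  """Groups wordpieces into whole words; illustrative only."""
  words = []
  for tok in tokens:
    if tok in ('[CLS]', '[SEP]'):
      continue                    # special tokens are not part of any word
    if tok.startswith('##') and words:
      words[-1] += tok[2:]        # continuation piece joins the previous word
    else:
      words.append(tok)           # a new whole word starts here
  return words


tokens = ['[CLS]', 'That', 'lit', '##tle', 'blue', 'tru', '##ck', '[SEP]']
assert group_wordpieces_sketch(tokens) == ['That', 'little', 'blue', 'truck']
```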
@@ -96,7 +96,7 @@ class PanxProcessor(classifier_data_lib.DataProcessor):
only_use_en_dev=True):
"""See base class.
-Arguments:
+Args:
process_text_fn: See base class.
only_use_en_train: If True, only use english training data. Otherwise, use
training data from all languages.
@@ -162,7 +162,7 @@ class UdposProcessor(classifier_data_lib.DataProcessor):
only_use_en_dev=True):
"""See base class.
-Arguments:
+Args:
process_text_fn: See base class.
only_use_en_train: If True, only use english training data. Otherwise, use
training data from all languages.
...
@@ -39,7 +39,7 @@ class BertEncoder(tf.keras.Model):
*Note* that the network is constructed by
[Keras Functional API](https://keras.io/guides/functional_api/).
-Arguments:
+Args:
vocab_size: The size of the token vocabulary.
hidden_size: The size of the transformer hidden layers.
num_layers: The number of transformer layers.
...
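An instantiation sketch for the encoder documented above. Only `vocab_size`, `hidden_size`, and `num_layers` appear in the hunk; the module path and the assumption that the remaining constructor arguments have usable defaults are mine:

```python
from official.nlp.modeling import networks  # module path assumed

encoder = networks.BertEncoder(
    vocab_size=30522,   # size of the token vocabulary
    hidden_size=768,    # size of the transformer hidden layers
    num_layers=12)      # number of transformer layers
```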
@@ -31,7 +31,7 @@ class MaskedLM(tf.keras.layers.Layer):
lm_layer=MaskedLM(embedding_table=encoder.get_embedding_table())
```
-Arguments:
+Args:
embedding_table: The embedding table from encoder network.
activation: The activation, if any, for the dense layer.
initializer: The initializer for the dense layer. Defaults to a Glorot
...
@@ -25,7 +25,7 @@ class OnDeviceEmbedding(tf.keras.layers.Layer):
This layer uses either tf.gather or tf.one_hot to translate integer indices to
float embeddings.
-Arguments:
+Args:
vocab_size: Number of elements in the vocabulary.
embedding_width: Output size of the embedding layer.
initializer: The initializer to use for the embedding weights. Defaults to
...
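The two lookup strategies that the `OnDeviceEmbedding` docstring mentions can be shown with plain TensorFlow ops (illustrative only, not the layer's internal code); the one-hot matmul variant is the one typically preferred on TPUs:

```python
import tensorflow as tf

vocab_size, embedding_width = 8, 4
table = tf.random.normal([vocab_size, embedding_width])
ids = tf.constant([1, 3, 5])

# Strategy 1: direct row lookup.
gathered = tf.gather(table, ids)                              # [3, embedding_width]

# Strategy 2: one-hot encode the ids and matmul against the table.
one_hot = tf.one_hot(ids, depth=vocab_size, dtype=table.dtype)  # [3, vocab_size]
matmuled = tf.matmul(one_hot, table)                            # [3, embedding_width]

tf.debugging.assert_near(gathered, matmuled)  # both strategies agree
```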
@@ -29,7 +29,7 @@ class PositionEmbedding(tf.keras.layers.Layer):
```
-Arguments:
+Args:
max_length: The maximum size of the dynamic sequence.
initializer: The initializer to use for the embedding weights. Defaults to
"glorot_uniform".
...
@@ -54,7 +54,7 @@ class TransformerEncoderBlock(tf.keras.layers.Layer):
**kwargs):
"""Initializes `TransformerEncoderBlock`.
-Arguments:
+Args:
num_attention_heads: Number of attention heads.
inner_dim: The output dimension of the first Dense layer in a two-layer
feedforward network.
...
@@ -28,7 +28,7 @@ class DenseEinsum(tf.keras.layers.Layer):
This layer can perform einsum calculations of arbitrary dimensionality.
-Arguments:
+Args:
output_shape: Positive integer or tuple, dimensionality of the output space.
num_summed_dimensions: The number of dimensions to sum over. Standard 2D
matmul should use 1, 3D matmul should use 2, and so forth.
...
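What "summing over 2 dimensions" means in the `DenseEinsum` docstring can be illustrated with a plain `tf.einsum` call (the layer's actual internal equation may differ):

```python
import tensorflow as tf

# Two trailing input dimensions (e.g. heads x head_size) are contracted away
# and replaced by a single output dimension -- the num_summed_dimensions=2 case.
x = tf.random.normal([8, 4, 16])          # [batch, heads, head_size]
kernel = tf.random.normal([4, 16, 32])    # [heads, head_size, output_dim]
y = tf.einsum('abc,bcd->ad', x, kernel)   # sums over both b and c
print(y.shape)                            # (8, 32)
```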
@@ -28,7 +28,7 @@ class GatedFeedforward(tf.keras.layers.Layer):
(https://arxiv.org/abs/2002.05202). In additional, it allows to stack
multiple feedforward blocks and specify the position of dropout layer.
-Arguments:
+Args:
intermediate_size: Size of the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
dropout: Dropout probability for the output dropout.
...
@@ -39,7 +39,7 @@ def _large_compatible_negative(tensor_type):
class MaskedSoftmax(tf.keras.layers.Layer):
"""Performs a softmax with optional masking on a tensor.
-Arguments:
+Args:
mask_expansion_axes: Any axes that should be padded on the mask tensor.
normalization_axes: On which axes the softmax should perform.
"""
...
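The standard masking trick behind a layer like `MaskedSoftmax` (and the `_large_compatible_negative` helper in the hunk header) is to add a very large negative bias wherever the mask is zero, so the softmax assigns those positions essentially no probability. Illustrative only:

```python
import tensorflow as tf

logits = tf.constant([[2.0, 1.0, 0.5, -1.0]])
mask = tf.constant([[1.0, 1.0, 0.0, 0.0]])   # attend only to the first two positions

adder = (1.0 - mask) * -1e9                  # large negative where mask == 0
probs = tf.nn.softmax(logits + adder, axis=-1)
print(probs)  # ~[[0.73, 0.27, 0.0, 0.0]]: masked positions get no probability
```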
@@ -26,7 +26,7 @@ from official.modeling import tf_utils
class MatMulWithMargin(tf.keras.layers.Layer):
"""This layer computs a dot product matrix given two encoded inputs.
-Arguments:
+Args:
logit_scale: The scaling factor of dot products when doing training.
logit_margin: The margin value between the positive and negative examples
when doing training.
...
@@ -42,7 +42,7 @@ class NoNorm(tf.keras.layers.Layer):
def _get_norm_layer(normalization_type='no_norm', name=None):
"""Get normlization layer.
-Arguments:
+Args:
normalization_type: String. The type of normalization_type, only
'no_norm' and 'layer_norm' are supported.
name: Name for the norm layer.
@@ -82,7 +82,7 @@ class MobileBertEmbedding(tf.keras.layers.Layer):
**kwargs):
"""Class initialization.
-Arguments:
+Args:
word_vocab_size: Number of words in the vocabulary.
word_embed_size: Word embedding size.
type_vocab_size: Number of word types.
@@ -192,7 +192,7 @@ class MobileBertTransformer(tf.keras.layers.Layer):
**kwargs):
"""Class initialization.
-Arguments:
+Args:
hidden_size: Hidden size for the Transformer input and output tensor.
num_attention_heads: Number of attention heads in the Transformer.
intermediate_size: The size of the "intermediate" (a.k.a., feed
@@ -346,7 +346,7 @@ class MobileBertTransformer(tf.keras.layers.Layer):
return_attention_scores=False):
"""Implementes the forward pass.
-Arguments:
+Args:
input_tensor: Float tensor of shape [batch_size, seq_length, hidden_size].
attention_mask: (optional) int32 tensor of shape [batch_size, seq_length,
seq_length], with 1 for positions that can be attended to and 0 in
@@ -446,7 +446,7 @@ class MobileBertMaskedLM(tf.keras.layers.Layer):
**kwargs):
"""Class initialization.
-Arguments:
+Args:
embedding_table: The embedding table from encoder network.
activation: The activation, if any, for the dense layer.
initializer: The initializer for the dense layer. Defaults to a Glorot
...
@@ -26,7 +26,7 @@ from official.nlp.modeling.layers import masked_softmax
class VotingAttention(tf.keras.layers.Layer):
"""Voting Attention layer.
-Arguments:
+Args:
num_heads: the number of attention heads.
head_size: per-head hidden size.
kernel_initializer: Initializer for dense layer kernels.
...
@@ -31,7 +31,7 @@ class RelativePositionEmbedding(tf.keras.layers.Layer):
"Attention is All You Need", section 3.5.
(https://arxiv.org/abs/1706.03762).
-Arguments:
+Args:
hidden_size: Size of the hidden layer.
min_timescale: Minimum scale that will be applied at each position
max_timescale: Maximum scale that will be applied at each position.
...
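The docstring points at the sinusoidal position signal of "Attention is All You Need" (section 3.5), where timescales span geometrically from `min_timescale` to `max_timescale`. A sketch of that usual construction (the layer's own parameter handling may differ):

```python
import numpy as np

def sinusoid_positions(length, hidden_size, min_timescale=1.0, max_timescale=1e4):
  """Returns a [length, hidden_size] sinusoidal position signal; sketch only."""
  positions = np.arange(length)[:, None]                       # [length, 1]
  num_timescales = hidden_size // 2
  log_increment = np.log(max_timescale / min_timescale) / max(num_timescales - 1, 1)
  inv_timescales = min_timescale * np.exp(-np.arange(num_timescales) * log_increment)
  scaled = positions * inv_timescales[None, :]                 # [length, hidden/2]
  return np.concatenate([np.sin(scaled), np.cos(scaled)], axis=1)

print(sinusoid_positions(4, 8).shape)  # (4, 8)
```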
@@ -29,7 +29,7 @@ class ReZeroTransformer(tf.keras.layers.Layer):
The residual connection implements the ReZero method.
(https://arxiv.org/abs/2003.04887)
-Arguments:
+Args:
num_attention_heads: Number of attention heads.
intermediate_size: Size of the intermediate layer.
intermediate_activation: Activation for the intermediate layer.
...
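The ReZero residual rule referenced above scales each sublayer output by a learnable scalar initialized to zero, so every block starts out as the identity. A minimal standalone sketch of that rule (not the layer's actual code):

```python
import tensorflow as tf

class ReZeroResidual(tf.keras.layers.Layer):
  """Wraps a sublayer with the ReZero residual: x + alpha * sublayer(x)."""

  def __init__(self, sublayer, **kwargs):
    super().__init__(**kwargs)
    self.sublayer = sublayer
    # alpha == 0 at initialization => the block initially passes x through.
    self.alpha = self.add_weight(name='rezero_alpha', shape=(), initializer='zeros')

  def call(self, x):
    return x + self.alpha * self.sublayer(x)


block = ReZeroResidual(tf.keras.layers.Dense(16))
print(block(tf.random.normal([2, 16])).shape)  # (2, 16)
```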
@@ -35,7 +35,7 @@ class TalkingHeadsAttention(tf.keras.layers.MultiHeadAttention):
See the base class `MultiHeadAttention` for more details.
-Arguments:
+Args:
num_heads: Number of attention heads.
key_dim: Size of each attention head for query and key.
value_dim: Size of each attention head for value.
...