Commit a81e1e7c authored by Hyungjun Lim, committed by Qianli Scott Zhu

Sentiment analysis model initial implementation. (#4494)

* Sentiment analysis model initial implementation.

- This is an implementation of the Sentiment Analysis model.
- The implementation references the Paddle version.

* Sentiment analysis model initial implementation.

- This is an implementation of the Sentiment Analysis model.
- The implementation references the Paddle version.

* addressing comments

* addressing comments

* addressing the comments: using the util function for distribution strategy

* addressing comments

- lint style correction.
- avoid using global FLAG variable.

* modify the format to fit the style guidelines.

* convert from single quote to double quote
parent b1a704d7
# Sentiment Analysis
## Overview
This is an implementation of the Sentiment Analysis model described in [this paper](https://arxiv.org/abs/1412.1058). The implementation references the [paddle version](https://github.com/mlperf/reference/tree/master/sentiment_analysis/paddle).
The model concatenates the outputs of two CNN layers with different kernel sizes. Dropout and batch normalization layers are used to prevent over-fitting. A simplified sketch of the network is shown below.
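The sketch mirrors the layer graph in `sentiment_model.py` with the default flag values; it substitutes `GlobalMaxPooling1D` for the model's dynamic-pooling `Lambda` layer purely to keep the example short and runnable, so treat it as an illustration rather than the exact implementation.
```python
import tensorflow as tf

# Two-branch CNN sketch (defaults: vocab 6000, 200-word sentences,
# 256-dim embeddings, 512 filters per branch, dropout 0.7, 2 classes).
inputs = tf.keras.layers.Input(shape=(200,), dtype=tf.int32)
embedded = tf.keras.layers.Embedding(6000, 256)(inputs)
# Parallel 1-D convolutions with kernel sizes 2 and 3.
branch2 = tf.keras.layers.Conv1D(512, 2, activation="relu")(embedded)
branch3 = tf.keras.layers.Conv1D(512, 3, activation="relu")(embedded)
# The actual model pools each branch to a fixed size with a dynamic-pooling
# Lambda layer; global max pooling stands in here for brevity.
branch2 = tf.keras.layers.GlobalMaxPooling1D()(branch2)
branch3 = tf.keras.layers.GlobalMaxPooling1D()(branch3)
merged = tf.keras.layers.concatenate([branch2, branch3], axis=1)
merged = tf.keras.layers.Dropout(0.7)(merged)
merged = tf.keras.layers.BatchNormalization()(merged)
outputs = tf.keras.layers.Dense(2, activation="softmax")(merged)
model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
```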
## Dataset
The [Keras](https://keras.io) [IMDB Movie reviews sentiment classification](https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification) dataset is used. The dataset download is handled by the Keras module, and the downloaded file is stored in the `~/.keras/datasets/` directory. As of June 15, 2018, the compressed file is 17 MB.
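Under the hood this is a single Keras call (see `data/imdb.py`); a minimal equivalent, using the default vocabulary size, looks roughly like:
```python
import tensorflow as tf

# Downloads imdb.npz into ~/.keras/datasets/ on first use, then loads it.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
    path="imdb.npz", num_words=6000)
```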
## Running Code
### Train and evaluate model
To train and evaluate the model, issue the following command:
```
python sentiment_main.py
```
Arguments:
* `--vocabulary_size`: The number of most-frequent words to keep from the dataset. The default is 6000.
* `--sentence_length`: The number of words in each sentence. Longer sentences are truncated and shorter ones padded. The default is 200.
* `--dataset`: The dataset name to be downloaded and preprocessed. By default, it is `imdb`.
There are other arguments for the model and the training process. Use the `--help` or `-h` flag to get a full list of possible arguments with detailed descriptions. An example invocation is shown below.
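For example, to train with a smaller vocabulary and shorter sentences (the values below are illustrative, not recommended settings):
```
python sentiment_main.py --dataset imdb --vocabulary_size 5000 \
    --sentence_length 100 --train_epochs 30 --batch_size 30
```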
## Benchmarks (TBA)
import data.imdb as imdb

DATASET_IMDB = "imdb"


def construct_input_fns(dataset, batch_size, vocabulary_size,
                        sentence_length, repeat=1):
  """Returns training and evaluation input functions.

  Args:
    dataset: Dataset to be trained and evaluated.
      Currently only imdb is supported.
    batch_size: Number of data in each batch.
    vocabulary_size: The number of the most frequent tokens
      to be used from the corpus.
    sentence_length: The number of words in each sentence.
      Longer sentences get cut, shorter ones padded.
    repeat: The number of epochs to repeat the dataset.

  Raises:
    ValueError: if the dataset value is not valid.

  Returns:
    A tuple of training and evaluation input functions.
  """
  if dataset == DATASET_IMDB:
    train_input_fn, eval_input_fn = imdb.construct_input_fns(
        vocabulary_size, sentence_length, batch_size, repeat=repeat)
    return train_input_fn, eval_input_fn
  else:
    raise ValueError("unsupported dataset: " + dataset)


def get_num_class(dataset):
  """Returns an integer for the number of label classes.

  Args:
    dataset: Dataset to be trained and evaluated.
      Currently only imdb is supported.

  Raises:
    ValueError: if the dataset value is not valid.

  Returns:
    int: The number of label classes in the dataset.
  """
  if dataset == DATASET_IMDB:
    return imdb.NUM_CLASS
  else:
    raise ValueError("unsupported dataset: " + dataset)
import numpy as np
import tensorflow as tf

from data.util import pad_sentence, to_dataset, START_CHAR, OOV_CHAR

NUM_CLASS = 2


def construct_input_fns(vocabulary_size, sentence_length,
                        batch_size, repeat=1):
  """Returns training and evaluation input functions.

  Args:
    vocabulary_size: The number of the most frequent tokens
      to be used from the corpus.
    sentence_length: The number of words in each sentence.
      Longer sentences get cut, shorter ones padded.
    batch_size: Number of data in each batch.
    repeat: The number of epochs to repeat the dataset.

  Returns:
    A tuple of training and evaluation input functions.
  """
  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
      path="imdb.npz",
      num_words=vocabulary_size,
      skip_top=0,
      maxlen=None,
      seed=113,
      start_char=START_CHAR,
      oov_char=OOV_CHAR,
      index_from=OOV_CHAR + 1)

  def train_input_fn():
    dataset = to_dataset(
        np.array([pad_sentence(s, sentence_length) for s in x_train]),
        np.eye(NUM_CLASS)[y_train], batch_size, repeat)
    dataset = dataset.shuffle(len(x_train), reshuffle_each_iteration=True)
    return dataset

  def eval_input_fn():
    dataset = to_dataset(
        np.array([pad_sentence(s, sentence_length) for s in x_test]),
        np.eye(NUM_CLASS)[y_test], batch_size, repeat)
    return dataset

  return train_input_fn, eval_input_fn
import numpy as np
import tensorflow as tf

START_CHAR = 1
END_CHAR = 2
OOV_CHAR = 3


def pad_sentence(sen, sentence_length):
  """Truncates a sentence to `sentence_length` or pads its tail with END_CHAR."""
  sen = sen[:sentence_length]
  if len(sen) < sentence_length:
    sen = np.pad(sen, (0, sentence_length - len(sen)), "constant",
                 constant_values=(START_CHAR, END_CHAR))
  return sen


def to_dataset(x, y, batch_size, repeat):
  """Wraps numpy features and labels into a batched, repeated tf.data.Dataset."""
  dataset = tf.data.Dataset.from_tensor_slices((x, y))
  # Repeat and batch the dataset.
  dataset = dataset.repeat(repeat)
  dataset = dataset.batch(batch_size)
  # Prefetch to improve the speed of the input pipeline.
  dataset = dataset.prefetch(10)
  return dataset
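# Illustrative example (not part of the original module): with the constants
# above, `pad_sentence` truncates to `sentence_length` and pads short
# sentences at the tail with END_CHAR (2), e.g.
#
#   pad_sentence([4, 7, 9], 6)        -> array([4, 7, 9, 2, 2, 2])
#   pad_sentence(list(range(10)), 6)  -> [0, 1, 2, 3, 4, 5]
#
# `to_dataset` then wraps the padded matrix and one-hot labels into batched
# (features, labels) pairs suitable for an Estimator input_fn.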
"""The main module for sentiment analysis.
The model makes use of concatenation of two CNN layers
with different kernel sizes.
See `sentiment_model.py` for more details about the models.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app as absl_app
from absl import flags
from data import dataset
from official.utils.flags import core as flags_core
from official.utils.logs import hooks_helper
from official.utils.logs import logger
from official.utils.misc import distribution_utils
import sentiment_model
import tensorflow as tf
def convert_keras_to_estimator(keras_model, num_gpus, model_dir=None):
"""Convert keras model into tensorflow estimator."""
keras_model.compile(optimizer="rmsprop",
loss="categorical_crossentropy", metrics=["accuracy"])
distribution = distribution_utils.get_distribution_strategy(
num_gpus, all_reduce_alg=None)
run_config = tf.estimator.RunConfig(train_distribute=distribution)
estimator = tf.keras.estimator.model_to_estimator(
keras_model=keras_model, model_dir=model_dir, config=run_config)
return estimator
def run_model(flags_obj):
"""Run training and eval loop."""
num_class = dataset.get_num_class(flags_obj.dataset)
tf.logging.info("Loading the dataset...")
train_input_fn, eval_input_fn = dataset.construct_input_fns(
flags_obj.dataset, flags_obj.batch_size, flags_obj.vocabulary_size,
flags_obj.sentence_length, repeat=flags_obj.epochs_between_evals)
keras_model = sentiment_model.CNN(
flags_obj.embedding_dim, flags_obj.vocabulary_size,
flags_obj.sentence_length,
flags_obj.cnn_filters, num_class, flags_obj.dropout_rate)
num_gpus = flags_core.get_num_gpus(FLAGS)
tf.logging.info("Creating Estimator from Keras model...")
estimator = convert_keras_to_estimator(
keras_model, num_gpus, flags_obj.model_dir)
# Create hooks that log information about the training and metric values
train_hooks = hooks_helper.get_train_hooks(
flags_obj.hooks,
batch_size=flags_obj.batch_size # for ExamplesPerSecondHook
)
run_params = {
"batch_size": flags_obj.batch_size,
"train_epochs": flags_obj.train_epochs,
}
benchmark_logger = logger.get_benchmark_logger()
benchmark_logger.log_run_info(
model_name="sentiment_analysis",
dataset_name=flags_obj.dataset,
run_params=run_params,
test_id=flags_obj.benchmark_test_id)
# Training and evaluation cycle
total_training_cycle = flags_obj.train_epochs\
// flags_obj.epochs_between_evals
for cycle_index in range(total_training_cycle):
tf.logging.info("Starting a training cycle: {}/{}".format(
cycle_index + 1, total_training_cycle))
# Train the model
estimator.train(input_fn=train_input_fn, hooks=train_hooks)
# Evaluate the model
eval_results = estimator.evaluate(input_fn=eval_input_fn)
# Benchmark the evaluation results
benchmark_logger.log_evaluation_result(eval_results)
tf.logging.info("Iteration {}".format(eval_results))
# Clear the session explicitly to avoid session delete error
tf.keras.backend.clear_session()
def main(_):
with logger.benchmark_context(FLAGS):
run_model(FLAGS)
def define_flags():
"""Add flags to run the main function."""
# Add common flags
flags_core.define_base(export_dir=False)
flags_core.define_performance(
num_parallel_calls=False,
inter_op=False,
intra_op=False,
synthetic_data=False,
max_train_steps=False,
dtype=False
)
flags_core.define_benchmark()
flags.adopt_module_key_flags(flags_core)
flags_core.set_defaults(
model_dir=None,
train_epochs=30,
batch_size=30,
hooks="")
# Add domain-specific flags
flags.DEFINE_enum(
name="dataset", default=dataset.DATASET_IMDB,
enum_values=[dataset.DATASET_IMDB], case_sensitive=False,
help=flags_core.help_wrap(
"Dataset to be trained and evaluated."))
flags.DEFINE_integer(
name="vocabulary_size", default=6000,
help=flags_core.help_wrap(
"The number of the most frequent tokens"
"to be used from the corpus."))
flags.DEFINE_integer(
name="sentence_length", default=200,
help=flags_core.help_wrap(
"The number of words in each sentence. Longer sentences get cut,"
"shorter ones padded."))
flags.DEFINE_integer(
name="embedding_dim", default=256,
help=flags_core.help_wrap("The dimension of the Embedding layer."))
flags.DEFINE_integer(
name="cnn_filters", default=512,
help=flags_core.help_wrap("The number of the CNN layer filters."))
flags.DEFINE_float(
name="dropout_rate", default=0.7,
help=flags_core.help_wrap("The rate for the Dropout layer."))
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
define_flags()
FLAGS = flags.FLAGS
absl_app.run(main)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


def _dynamic_pooling(w_embs):
  """Dynamic pooling layer.

  Given the variable-sized output of the convolution layer,
  pooling with a fixed kernel size and stride would produce
  variable-sized output, whereas the following fully-connected
  layer expects a fixed input size.
  Thus we fix the number of pooling units (to 2) and dynamically
  determine the pooling region size for each input.

  Args:
    w_embs: The output tensor of the convolution layer.

  Returns:
    A tensor with 2 pooled values per filter.
  """
  # A Lambda layer maintains a separate context, so tf has to be imported
  # here.
  import tensorflow as tf
  t = tf.expand_dims(w_embs, 2)
  pool_size = w_embs.shape[1].value // 2
  pooled = tf.keras.backend.pool2d(t, (pool_size, 1), strides=(
      pool_size, 1), data_format="channels_last")
  return tf.squeeze(pooled, 2)
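# Worked shape example (illustrative; assumes the default flags, which are an
# assumption of this note rather than part of the module): with
# sentence_length=200 and a kernel size of 3, the Conv1D output has length
# 198, so pool_size = 198 // 2 = 99. Pooling with a 99-wide window and stride
# 99 reduces the 198 positions to exactly 2 pooled values per filter; the
# kernel-size-2 branch (length 199, pool_size 99) likewise yields 2, so both
# branches flatten to the same fixed width.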
def _dynamic_pooling_output_shape(input_shape):
  """Output shape for the dynamic pooling layer.

  This function is used by the keras Lambda layer to indicate
  the output shape of the dynamic pooling layer.

  Args:
    input_shape: A tuple for the input shape.

  Returns:
    The output shape for the dynamic pooling layer.
  """
  shape = list(input_shape)
  assert len(shape) == 2  # only valid for 2D tensors
  shape[1] = 2
  return tuple(shape)


class CNN(tf.keras.models.Model):
  """CNN for sentiment analysis."""

  def __init__(self, emb_dim, num_words, sentence_length, hid_dim,
               class_dim, dropout_rate):
    """Initializes the CNN model.

    Args:
      emb_dim: The dimension of the Embedding layer.
      num_words: The number of the most frequent tokens
        to be used from the corpus.
      sentence_length: The number of words in each sentence.
        Longer sentences get cut, shorter ones padded.
      hid_dim: The number of the CNN layer filters.
      class_dim: The number of label classes.
      dropout_rate: The fraction of the input units to drop
        in the Dropout layer.
    """
    input = tf.keras.layers.Input(shape=(sentence_length,), dtype=tf.int32)
    layer = tf.keras.layers.Embedding(num_words, output_dim=emb_dim)(input)

    layer_conv3 = tf.keras.layers.Conv1D(hid_dim, 3, activation="relu")(layer)
    layer_conv3 = tf.keras.layers.Lambda(
        _dynamic_pooling,
        output_shape=_dynamic_pooling_output_shape)(layer_conv3)
    layer_conv3 = tf.keras.layers.Flatten()(layer_conv3)

    layer_conv2 = tf.keras.layers.Conv1D(hid_dim, 2, activation="relu")(layer)
    layer_conv2 = tf.keras.layers.Lambda(
        _dynamic_pooling,
        output_shape=_dynamic_pooling_output_shape)(layer_conv2)
    layer_conv2 = tf.keras.layers.Flatten()(layer_conv2)

    layer = tf.keras.layers.concatenate([layer_conv2, layer_conv3], axis=1)
    layer = tf.keras.layers.Dropout(dropout_rate)(layer)
    layer = tf.keras.layers.BatchNormalization()(layer)
    output = tf.keras.layers.Dense(class_dim, activation="softmax")(layer)

    super(CNN, self).__init__(inputs=[input], outputs=output)