Commit a81e1e7c authored by Hyungjun Lim, committed by Qianli Scott Zhu

Sentiment analysis model initial implementation. (#4494)

* Sentiment analysis model initial implementation.

- This is an implementation of the Sentiment Analysis model.
- The implementation references the Paddle version.

* Sentiment analysis model initial implementation.

- This is an implementation of the Sentiment Analysis model.
- The implementation references the Paddle version.

* addressing comments

* addressing comments

* addressing the comments: using the util function for distribution strategy

* addressing comments

- lint style correction.
- avoid using global FLAG variable.

* modify the format to fit the style guidelines.

* convert from single quote to double quote
parent b1a704d7
# Sentiment Analysis
## Overview
This is an implementation of the Sentiment Analysis model described in [this paper](https://arxiv.org/abs/1412.1058). The implementation references the [paddle version](https://github.com/mlperf/reference/tree/master/sentiment_analysis/paddle).
The model concatenates the outputs of two CNN layers with different kernel sizes. Dropout and batch normalization layers are used to prevent over-fitting. A simplified sketch of the network is shown below.
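The sketch mirrors the layer graph in `sentiment_model.py` with the default flag values; it substitutes `GlobalMaxPooling1D` for the model's dynamic-pooling `Lambda` layer purely to keep the example short and runnable, so treat it as an illustration rather than the exact implementation.
```python
import tensorflow as tf

# Two-branch CNN sketch (defaults: vocab 6000, 200-word sentences,
# 256-dim embeddings, 512 filters per branch, dropout 0.7, 2 classes).
inputs = tf.keras.layers.Input(shape=(200,), dtype=tf.int32)
embedded = tf.keras.layers.Embedding(6000, 256)(inputs)
# Parallel 1-D convolutions with kernel sizes 2 and 3.
branch2 = tf.keras.layers.Conv1D(512, 2, activation="relu")(embedded)
branch3 = tf.keras.layers.Conv1D(512, 3, activation="relu")(embedded)
# The actual model pools each branch to a fixed size with a dynamic-pooling
# Lambda layer; global max pooling stands in here for brevity.
branch2 = tf.keras.layers.GlobalMaxPooling1D()(branch2)
branch3 = tf.keras.layers.GlobalMaxPooling1D()(branch3)
merged = tf.keras.layers.concatenate([branch2, branch3], axis=1)
merged = tf.keras.layers.Dropout(0.7)(merged)
merged = tf.keras.layers.BatchNormalization()(merged)
outputs = tf.keras.layers.Dense(2, activation="softmax")(merged)
model = tf.keras.models.Model(inputs=inputs, outputs=outputs)
```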
## Dataset
The [Keras](https://keras.io) [IMDB Movie reviews sentiment classification](https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification) dataset is used. The dataset download is handled by the Keras module, and the downloaded file is stored in the `~/.keras/datasets/` directory. As of June 15, 2018, the compressed file is 17 MB.
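Under the hood this is a single Keras call (see `data/imdb.py`); a minimal equivalent, using the default vocabulary size, looks roughly like:
```python
import tensorflow as tf

# Downloads imdb.npz into ~/.keras/datasets/ on first use, then loads it.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
    path="imdb.npz", num_words=6000)
```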
## Running Code
### Train and evaluate model
To train and evaluate the model, issue the following command:
```
python sentiment_main.py
```
Arguments:
* `--vocabulary_size`: The number of most-frequent words to keep from the dataset. The default is 6000.
* `--sentence_length`: The number of words in each sentence. Longer sentences are truncated and shorter ones padded. The default is 200.
* `--dataset`: The dataset name to be downloaded and preprocessed. By default, it is `imdb`.
There are other arguments for the model and the training process. Use the `--help` or `-h` flag to get a full list of possible arguments with detailed descriptions. An example invocation is shown below.
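For example, to train with a smaller vocabulary and shorter sentences (the values below are illustrative, not recommended settings):
```
python sentiment_main.py --dataset imdb --vocabulary_size 5000 \
    --sentence_length 100 --train_epochs 30 --batch_size 30
```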
## Benchmarks (TBA)
import data.imdb as imdb

DATASET_IMDB = "imdb"


def construct_input_fns(dataset, batch_size, vocabulary_size,
                        sentence_length, repeat=1):
  """Returns training and evaluation input functions.

  Args:
    dataset: Dataset to be trained and evaluated.
      Currently only imdb is supported.
    batch_size: Number of data in each batch.
    vocabulary_size: The number of the most frequent tokens
      to be used from the corpus.
    sentence_length: The number of words in each sentence.
      Longer sentences get cut, shorter ones padded.
    repeat: The number of epochs to repeat the dataset.

  Raises:
    ValueError: if the dataset value is not valid.

  Returns:
    A tuple of training and evaluation input functions.
  """
  if dataset == DATASET_IMDB:
    train_input_fn, eval_input_fn = imdb.construct_input_fns(
        vocabulary_size, sentence_length, batch_size, repeat=repeat)
    return train_input_fn, eval_input_fn
  else:
    raise ValueError("unsupported dataset: " + dataset)


def get_num_class(dataset):
  """Returns an integer for the number of label classes.

  Args:
    dataset: Dataset to be trained and evaluated.
      Currently only imdb is supported.

  Raises:
    ValueError: if the dataset value is not valid.

  Returns:
    int: The number of label classes in the dataset.
  """
  if dataset == DATASET_IMDB:
    return imdb.NUM_CLASS
  else:
    raise ValueError("unsupported dataset: " + dataset)
import numpy as np
import tensorflow as tf

from data.util import pad_sentence, to_dataset, START_CHAR, OOV_CHAR

NUM_CLASS = 2


def construct_input_fns(vocabulary_size, sentence_length,
                        batch_size, repeat=1):
  """Returns training and evaluation input functions.

  Args:
    vocabulary_size: The number of the most frequent tokens
      to be used from the corpus.
    sentence_length: The number of words in each sentence.
      Longer sentences get cut, shorter ones padded.
    batch_size: Number of data in each batch.
    repeat: The number of epochs to repeat the dataset.

  Returns:
    A tuple of training and evaluation input functions.
  """
  (x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
      path="imdb.npz",
      num_words=vocabulary_size,
      skip_top=0,
      maxlen=None,
      seed=113,
      start_char=START_CHAR,
      oov_char=OOV_CHAR,
      index_from=OOV_CHAR + 1)

  def train_input_fn():
    dataset = to_dataset(
        np.array([pad_sentence(s, sentence_length) for s in x_train]),
        np.eye(NUM_CLASS)[y_train], batch_size, repeat)
    dataset = dataset.shuffle(len(x_train), reshuffle_each_iteration=True)
    return dataset

  def eval_input_fn():
    dataset = to_dataset(
        np.array([pad_sentence(s, sentence_length) for s in x_test]),
        np.eye(NUM_CLASS)[y_test], batch_size, repeat)
    return dataset

  return train_input_fn, eval_input_fn
import numpy as np
import tensorflow as tf

START_CHAR = 1
END_CHAR = 2
OOV_CHAR = 3


def pad_sentence(sen, sentence_length):
  """Truncates a sentence to `sentence_length` or pads its tail with END_CHAR."""
  sen = sen[:sentence_length]
  if len(sen) < sentence_length:
    sen = np.pad(sen, (0, sentence_length - len(sen)), "constant",
                 constant_values=(START_CHAR, END_CHAR))
  return sen


def to_dataset(x, y, batch_size, repeat):
  """Wraps numpy features and labels into a batched, repeated tf.data.Dataset."""
  dataset = tf.data.Dataset.from_tensor_slices((x, y))
  # Repeat and batch the dataset.
  dataset = dataset.repeat(repeat)
  dataset = dataset.batch(batch_size)
  # Prefetch to improve the speed of the input pipeline.
  dataset = dataset.prefetch(10)
  return dataset
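# Illustrative example (not part of the original module): with the constants
# above, `pad_sentence` truncates to `sentence_length` and pads short
# sentences at the tail with END_CHAR (2), e.g.
#
#   pad_sentence([4, 7, 9], 6)        -> array([4, 7, 9, 2, 2, 2])
#   pad_sentence(list(range(10)), 6)  -> [0, 1, 2, 3, 4, 5]
#
# `to_dataset` then wraps the padded matrix and one-hot labels into batched
# (features, labels) pairs suitable for an Estimator input_fn.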
"""The main module for sentiment analysis.
The model makes use of concatenation of two CNN layers
with different kernel sizes.
See `sentiment_model.py` for more details about the models.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl import app as absl_app
from absl import flags
from data import dataset
from official.utils.flags import core as flags_core
from official.utils.logs import hooks_helper
from official.utils.logs import logger
from official.utils.misc import distribution_utils
import sentiment_model
import tensorflow as tf
def convert_keras_to_estimator(keras_model, num_gpus, model_dir=None):
"""Convert keras model into tensorflow estimator."""
keras_model.compile(optimizer="rmsprop",
loss="categorical_crossentropy", metrics=["accuracy"])
distribution = distribution_utils.get_distribution_strategy(
num_gpus, all_reduce_alg=None)
run_config = tf.estimator.RunConfig(train_distribute=distribution)
estimator = tf.keras.estimator.model_to_estimator(
keras_model=keras_model, model_dir=model_dir, config=run_config)
return estimator
def run_model(flags_obj):
"""Run training and eval loop."""
num_class = dataset.get_num_class(flags_obj.dataset)
tf.logging.info("Loading the dataset...")
train_input_fn, eval_input_fn = dataset.construct_input_fns(
flags_obj.dataset, flags_obj.batch_size, flags_obj.vocabulary_size,
flags_obj.sentence_length, repeat=flags_obj.epochs_between_evals)
keras_model = sentiment_model.CNN(
flags_obj.embedding_dim, flags_obj.vocabulary_size,
flags_obj.sentence_length,
flags_obj.cnn_filters, num_class, flags_obj.dropout_rate)
num_gpus = flags_core.get_num_gpus(FLAGS)
tf.logging.info("Creating Estimator from Keras model...")
estimator = convert_keras_to_estimator(
keras_model, num_gpus, flags_obj.model_dir)
# Create hooks that log information about the training and metric values
train_hooks = hooks_helper.get_train_hooks(
flags_obj.hooks,
batch_size=flags_obj.batch_size # for ExamplesPerSecondHook
)
run_params = {
"batch_size": flags_obj.batch_size,
"train_epochs": flags_obj.train_epochs,
}
benchmark_logger = logger.get_benchmark_logger()
benchmark_logger.log_run_info(
model_name="sentiment_analysis",
dataset_name=flags_obj.dataset,
run_params=run_params,
test_id=flags_obj.benchmark_test_id)
# Training and evaluation cycle
total_training_cycle = flags_obj.train_epochs\
// flags_obj.epochs_between_evals
for cycle_index in range(total_training_cycle):
tf.logging.info("Starting a training cycle: {}/{}".format(
cycle_index + 1, total_training_cycle))
# Train the model
estimator.train(input_fn=train_input_fn, hooks=train_hooks)
# Evaluate the model
eval_results = estimator.evaluate(input_fn=eval_input_fn)
# Benchmark the evaluation results
benchmark_logger.log_evaluation_result(eval_results)
tf.logging.info("Iteration {}".format(eval_results))
# Clear the session explicitly to avoid session delete error
tf.keras.backend.clear_session()
def main(_):
with logger.benchmark_context(FLAGS):
run_model(FLAGS)
def define_flags():
"""Add flags to run the main function."""
# Add common flags
flags_core.define_base(export_dir=False)
flags_core.define_performance(
num_parallel_calls=False,
inter_op=False,
intra_op=False,
synthetic_data=False,
max_train_steps=False,
dtype=False
)
flags_core.define_benchmark()
flags.adopt_module_key_flags(flags_core)
flags_core.set_defaults(
model_dir=None,
train_epochs=30,
batch_size=30,
hooks="")
# Add domain-specific flags
flags.DEFINE_enum(
name="dataset", default=dataset.DATASET_IMDB,
enum_values=[dataset.DATASET_IMDB], case_sensitive=False,
help=flags_core.help_wrap(
"Dataset to be trained and evaluated."))
flags.DEFINE_integer(
name="vocabulary_size", default=6000,
help=flags_core.help_wrap(
"The number of the most frequent tokens"
"to be used from the corpus."))
flags.DEFINE_integer(
name="sentence_length", default=200,
help=flags_core.help_wrap(
"The number of words in each sentence. Longer sentences get cut,"
"shorter ones padded."))
flags.DEFINE_integer(
name="embedding_dim", default=256,
help=flags_core.help_wrap("The dimension of the Embedding layer."))
flags.DEFINE_integer(
name="cnn_filters", default=512,
help=flags_core.help_wrap("The number of the CNN layer filters."))
flags.DEFINE_float(
name="dropout_rate", default=0.7,
help=flags_core.help_wrap("The rate for the Dropout layer."))
if __name__ == "__main__":
tf.logging.set_verbosity(tf.logging.INFO)
define_flags()
FLAGS = flags.FLAGS
absl_app.run(main)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf


def _dynamic_pooling(w_embs):
  """Dynamic pooling layer.

  Given the variable-sized output of the convolution layer,
  pooling with a fixed kernel size and stride would produce
  variable-sized output, whereas the following fully-connected
  layer expects a fixed input size.
  Thus we fix the number of pooling units (to 2) and dynamically
  determine the pooling region size for each input.

  Args:
    w_embs: The output tensor of the convolution layer.

  Returns:
    A tensor with 2 pooled values per filter.
  """
  # A Lambda layer maintains a separate context, so tf has to be imported
  # here.
  import tensorflow as tf
  t = tf.expand_dims(w_embs, 2)
  pool_size = w_embs.shape[1].value // 2
  pooled = tf.keras.backend.pool2d(t, (pool_size, 1), strides=(
      pool_size, 1), data_format="channels_last")
  return tf.squeeze(pooled, 2)
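# Worked shape example (illustrative; assumes the default flags, which are an
# assumption of this note rather than part of the module): with
# sentence_length=200 and a kernel size of 3, the Conv1D output has length
# 198, so pool_size = 198 // 2 = 99. Pooling with a 99-wide window and stride
# 99 reduces the 198 positions to exactly 2 pooled values per filter; the
# kernel-size-2 branch (length 199, pool_size 99) likewise yields 2, so both
# branches flatten to the same fixed width.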
def _dynamic_pooling_output_shape(input_shape):
  """Output shape for the dynamic pooling layer.

  This function is used by the keras Lambda layer to indicate
  the output shape of the dynamic pooling layer.

  Args:
    input_shape: A tuple for the input shape.

  Returns:
    The output shape for the dynamic pooling layer.
  """
  shape = list(input_shape)
  assert len(shape) == 2  # only valid for 2D tensors
  shape[1] = 2
  return tuple(shape)


class CNN(tf.keras.models.Model):
  """CNN for sentiment analysis."""

  def __init__(self, emb_dim, num_words, sentence_length, hid_dim,
               class_dim, dropout_rate):
    """Initializes the CNN model.

    Args:
      emb_dim: The dimension of the Embedding layer.
      num_words: The number of the most frequent tokens
        to be used from the corpus.
      sentence_length: The number of words in each sentence.
        Longer sentences get cut, shorter ones padded.
      hid_dim: The number of the CNN layer filters.
      class_dim: The number of label classes.
      dropout_rate: The fraction of the input units to drop
        in the Dropout layer.
    """
    input = tf.keras.layers.Input(shape=(sentence_length,), dtype=tf.int32)
    layer = tf.keras.layers.Embedding(num_words, output_dim=emb_dim)(input)

    layer_conv3 = tf.keras.layers.Conv1D(hid_dim, 3, activation="relu")(layer)
    layer_conv3 = tf.keras.layers.Lambda(
        _dynamic_pooling,
        output_shape=_dynamic_pooling_output_shape)(layer_conv3)
    layer_conv3 = tf.keras.layers.Flatten()(layer_conv3)

    layer_conv2 = tf.keras.layers.Conv1D(hid_dim, 2, activation="relu")(layer)
    layer_conv2 = tf.keras.layers.Lambda(
        _dynamic_pooling,
        output_shape=_dynamic_pooling_output_shape)(layer_conv2)
    layer_conv2 = tf.keras.layers.Flatten()(layer_conv2)

    layer = tf.keras.layers.concatenate([layer_conv2, layer_conv3], axis=1)
    layer = tf.keras.layers.Dropout(dropout_rate)(layer)
    layer = tf.keras.layers.BatchNormalization()(layer)
    output = tf.keras.layers.Dense(class_dim, activation="softmax")(layer)

    super(CNN, self).__init__(inputs=[input], outputs=output)