Unverified Commit 0f76239b authored by Toby Boyd, committed by GitHub

Move tests_data to gcs and upgrade data_download. (#6722)

parent 59c218f5
...@@ -31,7 +31,9 @@ The model also applies embeddings on the input and output tokens, and adds a con

## Walkthrough

Below are the commands for running the Transformer model. See the [Detailed instructions](#detailed-instructions) for more details on running the model.

```
cd /path/to/models/official/transformer

...@@ -46,13 +48,13 @@ DATA_DIR=$HOME/transformer/data
MODEL_DIR=$HOME/transformer/model_$PARAM_SET
VOCAB_FILE=$DATA_DIR/vocab.ende.32768

# Download training/evaluation/test datasets
python data_download.py --data_dir=$DATA_DIR

# Train the model for 10 epochs, and evaluate after every epoch.
python transformer_main.py --data_dir=$DATA_DIR --model_dir=$MODEL_DIR \
    --vocab_file=$VOCAB_FILE --param_set=$PARAM_SET \
    --bleu_source=$DATA_DIR/newstest2014.en --bleu_ref=$DATA_DIR/newstest2014.de

# Run during training in a separate process to get continuous updates,
# or after training is complete.
...@@ -64,8 +66,8 @@ python translate.py --model_dir=$MODEL_DIR --vocab_file=$VOCAB_FILE \

# Compute model's BLEU score using the newstest2014 dataset.
python translate.py --model_dir=$MODEL_DIR --vocab_file=$VOCAB_FILE \
    --param_set=$PARAM_SET --file=$DATA_DIR/newstest2014.en --file_out=translation.en
python compute_bleu.py --translation=translation.en --reference=$DATA_DIR/newstest2014.de
```

## Benchmarks

...@@ -134,7 +136,7 @@ big | 28.9
Arguments:

* `--data_dir`: This should be set to the same directory given to the `data_download`'s `data_dir` argument.
* `--model_dir`: Directory to save Transformer model training checkpoints.
* `--vocab_file`: Path to subtoken vocabulary file. If data_download was used, you may find the file in `data_dir`.
* `--param_set`: Parameter set to use when creating and training the model. Options are `base` and `big` (default).
* Use the `--help` or `-h` flag to get a full list of possible arguments. (A sample invocation using these flags is shown after this list.)
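As an illustrative sketch only (the directories are placeholders, and `base` is chosen simply to show the flag, not as a recommendation), these arguments combine as:

```
python transformer_main.py --data_dir=$DATA_DIR --model_dir=$MODEL_DIR \
    --vocab_file=$VOCAB_FILE --param_set=base
```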
...@@ -159,9 +161,7 @@ big | 28.9

* `--bleu_ref`: Path to file containing the reference translation.
* `--stop_threshold`: Train until the BLEU score reaches this lower bound. This setting overrides the `--train_steps` and `--train_epochs` flags.
When running `transformer_main.py`, use the flags: `--bleu_source=$DATA_DIR/newstest2014.en --bleu_ref=$DATA_DIR/newstest2014.de`
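For instance, `--stop_threshold` can be combined with these flags; in this sketch the BLEU target of 25 is an arbitrary illustration, not a recommended value:

```
python transformer_main.py --data_dir=$DATA_DIR --model_dir=$MODEL_DIR \
    --vocab_file=$VOCAB_FILE --param_set=$PARAM_SET \
    --bleu_source=$DATA_DIR/newstest2014.en --bleu_ref=$DATA_DIR/newstest2014.de \
    --stop_threshold=25
```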
#### Tensorboard

Training and evaluation metrics (loss, accuracy, approximate BLEU score, etc.) are logged, and can be displayed in the browser using Tensorboard.
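For example, assuming TensorBoard is installed alongside TensorFlow, pointing it at the model directory is sufficient:

```
tensorboard --logdir=$MODEL_DIR
```

Then open the URL it prints (http://localhost:6006 by default) in a browser.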
...@@ -181,7 +181,7 @@ big | 28.9

Arguments for initializing the Subtokenizer and trained model:

* `--model_dir` and `--param_set`: These parameters are used to rebuild the trained model
* `--vocab_file`: Path to subtoken vocabulary file. If data_download was used, you may find the file in `data_dir`.

Arguments for specifying what to translate:

* `--text`: Text to translate
...@@ -191,7 +191,7 @@ big | 28.9

To translate the newstest2014 data, run:

```
python translate.py --model_dir=$MODEL_DIR --vocab_file=$VOCAB_FILE \
    --param_set=$PARAM_SET --file=$DATA_DIR/newstest2014.en --file_out=translation.en
```

Translating the file takes around 15 minutes on a GTX1080, or 5 minutes on a P100.

...@@ -201,14 +201,14 @@ big | 28.9

Command to run:

```
python compute_bleu.py --translation=translation.en --reference=$DATA_DIR/newstest2014.de
```

Arguments:

* `--translation`: Path to file containing generated translations.
* `--reference`: Path to file containing reference translations.
* Use the `--help` or `-h` flag to get a full list of possible arguments.

5. ### TPU

TPU support for this version of Transformer is experimental. Currently it is present for demonstration purposes only, but will be optimized in the coming weeks.
...@@ -337,7 +337,7 @@ Aside from the main file to train the Transformer model, we provide other script

[data_download.py](data_download.py) downloads and extracts data, then uses `Subtokenizer` to tokenize strings into arrays of int IDs. The int arrays are converted to `tf.Examples` and saved in the `tf.RecordDataset` format.

The data is downloaded from the Workshop of Machine Translation (WMT) [news translation task](http://www.statmt.org/wmt17/translation-task.html). The following datasets are used:

* Europarl v7
* Common Crawl corpus
...@@ -356,7 +356,12 @@ Translation is defined in [translate.py](translate.py). First, `Subtokenizer` to

[compute_bleu.py](compute_bleu.py): Implementation from [https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/utils/bleu_hook.py).

### Test dataset
The [newstest2014 files](test_data) are extracted from the [NMT Seq2Seq tutorial](https://google.github.io/seq2seq/nmt/#download-data). The raw text files are converted from the SGM format of the [WMT 2016](http://www.statmt.org/wmt16/translation-task.html) test sets. The [newstest2014 files](https://storage.googleapis.com/tf-perf-public/official_transformer/test_data/newstest2014.tgz)
are extracted from the [NMT Seq2Seq tutorial](https://google.github.io/seq2seq/nmt/#download-data).
The raw text files are converted from the SGM format of the
[WMT 2016](http://www.statmt.org/wmt16/translation-task.html) test sets. The
newstest2014 files are put into the `$DATA_DIR` when executing
`data_download.py`
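`data_download.py` handles this step automatically; if you want to inspect the raw test files yourself, a manual fetch along these lines should work (assuming `curl` and `tar` are available):

```
mkdir -p $DATA_DIR
curl -LO https://storage.googleapis.com/tf-perf-public/official_transformer/test_data/newstest2014.tgz
tar -xzf newstest2014.tgz -C $DATA_DIR
```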
## Term definitions

...
...@@ -69,6 +69,15 @@ _EVAL_DATA_SOURCES = [

    }
]
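# newstest2014 test source/reference files, hosted on GCS; downloaded and
# extracted directly into data_dir (see Step 1/5 in main()).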
_TEST_DATA_SOURCES = [
    {
        "url": ("https://storage.googleapis.com/tf-perf-public/"
                "official_transformer/test_data/newstest2014.tgz"),
        "input": "newstest2014.en",
        "target": "newstest2014.de",
    }
]
# Vocabulary constants
_TARGET_VOCAB_SIZE = 32768  # Number of subtokens in the vocabulary list.
_TARGET_THRESHOLD = 327  # Accept vocabulary if size is within this threshold

...@@ -198,7 +207,7 @@ def download_and_extract(path, url, input_filename, target_filename):

  with tarfile.open(compressed_file, "r:gz") as corpus_tar:
    corpus_tar.extractall(path)

  # Return file paths of the requested files.
  input_file = find_file(path, input_filename)
  target_file = find_file(path, target_filename)

...@@ -367,25 +376,29 @@ def main(unused_argv):

  make_dir(FLAGS.raw_dir)
  make_dir(FLAGS.data_dir)
  # Download and extract the newstest2014 test data directly into data_dir.
  # The returned file paths are not needed here; the download is the point.
  tf.logging.info("Step 1/5: Downloading test data")
  get_raw_files(FLAGS.data_dir, _TEST_DATA_SOURCES)
  # Get paths of download/extracted training and evaluation files.
  tf.logging.info("Step 2/5: Downloading data from source")
  train_files = get_raw_files(FLAGS.raw_dir, _TRAIN_DATA_SOURCES)
  eval_files = get_raw_files(FLAGS.raw_dir, _EVAL_DATA_SOURCES)

  # Create subtokenizer based on the training files.
  tf.logging.info("Step 3/5: Creating subtokenizer and building vocabulary")
  train_files_flat = train_files["inputs"] + train_files["targets"]
  vocab_file = os.path.join(FLAGS.data_dir, VOCAB_FILE)
  subtokenizer = tokenizer.Subtokenizer.init_from_files(
      vocab_file, train_files_flat, _TARGET_VOCAB_SIZE, _TARGET_THRESHOLD,
      min_count=None if FLAGS.search else _TRAIN_DATA_MIN_COUNT)

  tf.logging.info("Step 4/5: Compiling training and evaluation data")
  compiled_train_files = compile_files(FLAGS.raw_dir, train_files, _TRAIN_TAG)
  compiled_eval_files = compile_files(FLAGS.raw_dir, eval_files, _EVAL_TAG)

  # Tokenize and save data as Examples in the TFRecord format.
  tf.logging.info("Step 5/5: Preprocessing and saving data")
  train_tfrecord_files = encode_and_save_files(
      subtokenizer, FLAGS.data_dir, compiled_train_files, _TRAIN_TAG,
      _TRAIN_SHARDS)
...