Commit 63d754ec authored by Allen Wang, committed by A. Unique TensorFlower

Update transformer's data_download.py to use TF 1.x compatibility mode.

PiperOrigin-RevId: 280455983
parent c59cf48d
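
The diff below applies the standard TF2 migration pattern in two parts: TensorFlow is imported through its v1 compatibility shim (import tensorflow.compat.v1 as tf), and log calls move from the deprecated tf.logging module to absl-py's logging module, which replaces it one-for-one. A minimal sketch of that pattern, assuming tensorflow and absl-py are installed; the url, filename, and directory values are illustrative only, not taken from the diff:

from absl import logging
import tensorflow.compat.v1 as tf  # v1 API surface; also works under TF 2.x

# absl logging replaces the deprecated tf.logging module one-for-one:
#   tf.logging.info(...)                       -> logging.info(...)
#   tf.logging.set_verbosity(tf.logging.INFO)  -> logging.set_verbosity(logging.INFO)
logging.set_verbosity(logging.INFO)

url = "https://example.com/corpus.tar.gz"  # illustrative values, not from the diff
filename = "/tmp/corpus.tar.gz"
logging.info("Downloading from %s to %s.", url, filename)

# tf.gfile.* is still reachable through the compat.v1 alias, so the file
# operations used in data_download.py (Exists, MakeDirs, Rename, Remove)
# need no change in this commit.
if not tf.gfile.Exists("/tmp/t2t_data"):
  tf.gfile.MakeDirs("/tmp/t2t_data")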
@@ -27,7 +27,8 @@ import six
 from six.moves import urllib
 from absl import app as absl_app
 from absl import flags
-import tensorflow as tf
+from absl import logging
+import tensorflow.compat.v1 as tf
 # pylint: enable=g-bad-import-order

 from official.transformer.utils import tokenizer
@@ -164,7 +165,7 @@ def download_from_url(path, url):
   found_file = find_file(path, filename, max_depth=0)
   if found_file is None:
     filename = os.path.join(path, filename)
-    tf.logging.info("Downloading from %s to %s." % (url, filename))
+    logging.info("Downloading from %s to %s." % (url, filename))
     inprogress_filepath = filename + ".incomplete"
     inprogress_filepath, _ = urllib.request.urlretrieve(
         url, inprogress_filepath, reporthook=download_report_hook)
@@ -173,7 +174,7 @@ def download_from_url(path, url):
     tf.gfile.Rename(inprogress_filepath, filename)
     return filename
   else:
-    tf.logging.info("Already downloaded: %s (at %s)." % (url, found_file))
+    logging.info("Already downloaded: %s (at %s)." % (url, found_file))
     return found_file
@@ -196,14 +197,14 @@ def download_and_extract(path, url, input_filename, target_filename):
   input_file = find_file(path, input_filename)
   target_file = find_file(path, target_filename)
   if input_file and target_file:
-    tf.logging.info("Already downloaded and extracted %s." % url)
+    logging.info("Already downloaded and extracted %s." % url)
     return input_file, target_file

   # Download archive file if it doesn't already exist.
   compressed_file = download_from_url(path, url)

   # Extract compressed files
-  tf.logging.info("Extracting %s." % compressed_file)
+  logging.info("Extracting %s." % compressed_file)
   with tarfile.open(compressed_file, "r:gz") as corpus_tar:
     corpus_tar.extractall(path)
@@ -239,7 +240,7 @@ def compile_files(raw_dir, raw_files, tag):
   Returns:
     Full path of compiled input and target files.
   """
-  tf.logging.info("Compiling files with tag %s." % tag)
+  logging.info("Compiling files with tag %s." % tag)
   filename = "%s-%s" % (_PREFIX, tag)
   input_compiled_file = os.path.join(raw_dir, filename + ".lang1")
   target_compiled_file = os.path.join(raw_dir, filename + ".lang2")
@@ -250,7 +251,7 @@ def compile_files(raw_dir, raw_files, tag):
         input_file = raw_files["inputs"][i]
         target_file = raw_files["targets"][i]

-        tf.logging.info("Reading files %s and %s." % (input_file, target_file))
+        logging.info("Reading files %s and %s." % (input_file, target_file))
         write_file(input_writer, input_file)
         write_file(target_writer, target_file)

   return input_compiled_file, target_compiled_file
@@ -286,10 +287,10 @@ def encode_and_save_files(
                for n in range(total_shards)]

   if all_exist(filepaths):
-    tf.logging.info("Files with tag %s already exist." % tag)
+    logging.info("Files with tag %s already exist." % tag)
     return filepaths

-  tf.logging.info("Saving files with tag %s." % tag)
+  logging.info("Saving files with tag %s." % tag)
   input_file = raw_files[0]
   target_file = raw_files[1]
@@ -300,7 +301,7 @@ def encode_and_save_files(
   for counter, (input_line, target_line) in enumerate(zip(
       txt_line_iterator(input_file), txt_line_iterator(target_file))):
     if counter > 0 and counter % 100000 == 0:
-      tf.logging.info("\tSaving case %d." % counter)
+      logging.info("\tSaving case %d." % counter)
     example = dict_to_example(
         {"inputs": subtokenizer.encode(input_line, add_eos=True),
          "targets": subtokenizer.encode(target_line, add_eos=True)})
@@ -312,7 +313,7 @@ def encode_and_save_files(
   for tmp_name, final_name in zip(tmp_filepaths, filepaths):
     tf.gfile.Rename(tmp_name, final_name)

-  tf.logging.info("Saved %d Examples", counter + 1)
+  logging.info("Saved %d Examples", counter + 1)
   return filepaths
@@ -324,7 +325,7 @@ def shard_filename(path, tag, shard_num, total_shards):
 def shuffle_records(fname):
   """Shuffle records in a single file."""
-  tf.logging.info("Shuffling records in file %s" % fname)
+  logging.info("Shuffling records in file %s" % fname)

   # Rename file prior to shuffling
   tmp_fname = fname + ".unshuffled"
@@ -335,7 +336,7 @@ def shuffle_records(fname):
   for record in reader:
     records.append(record)
     if len(records) % 100000 == 0:
-      tf.logging.info("\tRead: %d", len(records))
+      logging.info("\tRead: %d", len(records))

   random.shuffle(records)
@@ -344,7 +345,7 @@ def shuffle_records(fname):
     for count, record in enumerate(records):
       w.write(record)
       if count > 0 and count % 100000 == 0:
-        tf.logging.info("\tWriting record: %d" % count)
+        logging.info("\tWriting record: %d" % count)

   tf.gfile.Remove(tmp_fname)
@@ -367,7 +368,7 @@ def all_exist(filepaths):
 def make_dir(path):
   if not tf.gfile.Exists(path):
-    tf.logging.info("Creating directory %s" % path)
+    logging.info("Creating directory %s" % path)
     tf.gfile.MakeDirs(path)
@@ -377,28 +378,28 @@ def main(unused_argv):
   make_dir(FLAGS.data_dir)

   # Download test_data
-  tf.logging.info("Step 1/5: Downloading test data")
+  logging.info("Step 1/5: Downloading test data")
   train_files = get_raw_files(FLAGS.data_dir, _TEST_DATA_SOURCES)

   # Get paths of download/extracted training and evaluation files.
-  tf.logging.info("Step 2/5: Downloading data from source")
+  logging.info("Step 2/5: Downloading data from source")
   train_files = get_raw_files(FLAGS.raw_dir, _TRAIN_DATA_SOURCES)
   eval_files = get_raw_files(FLAGS.raw_dir, _EVAL_DATA_SOURCES)

   # Create subtokenizer based on the training files.
-  tf.logging.info("Step 3/5: Creating subtokenizer and building vocabulary")
+  logging.info("Step 3/5: Creating subtokenizer and building vocabulary")
   train_files_flat = train_files["inputs"] + train_files["targets"]
   vocab_file = os.path.join(FLAGS.data_dir, VOCAB_FILE)
   subtokenizer = tokenizer.Subtokenizer.init_from_files(
       vocab_file, train_files_flat, _TARGET_VOCAB_SIZE, _TARGET_THRESHOLD,
       min_count=None if FLAGS.search else _TRAIN_DATA_MIN_COUNT)

-  tf.logging.info("Step 4/5: Compiling training and evaluation data")
+  logging.info("Step 4/5: Compiling training and evaluation data")
   compiled_train_files = compile_files(FLAGS.raw_dir, train_files, _TRAIN_TAG)
   compiled_eval_files = compile_files(FLAGS.raw_dir, eval_files, _EVAL_TAG)

   # Tokenize and save data as Examples in the TFRecord format.
-  tf.logging.info("Step 5/5: Preprocessing and saving data")
+  logging.info("Step 5/5: Preprocessing and saving data")
   train_tfrecord_files = encode_and_save_files(
       subtokenizer, FLAGS.data_dir, compiled_train_files, _TRAIN_TAG,
       _TRAIN_SHARDS)
@@ -428,7 +429,7 @@ def define_data_download_flags():
 if __name__ == "__main__":
-  tf.logging.set_verbosity(tf.logging.INFO)
+  logging.set_verbosity(logging.INFO)
   define_data_download_flags()
   FLAGS = flags.FLAGS
   absl_app.run(main)
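
For readers unfamiliar with the absl entry-point idiom in the final hunk: absl_app.run parses every defined flag (including those added by define_data_download_flags) before invoking main, which is why set_verbosity can be called ahead of flag parsing. A self-contained sketch of the same idiom; the flag default below is illustrative, not copied from the file:

from absl import app as absl_app
from absl import flags
from absl import logging

flags.DEFINE_string("data_dir", "/tmp/translate_ende",
                    "Directory for the downloaded data (illustrative default).")

FLAGS = flags.FLAGS


def main(unused_argv):
  # By the time main runs, absl_app.run has already parsed sys.argv flags.
  logging.info("Writing data to %s", FLAGS.data_dir)


if __name__ == "__main__":
  logging.set_verbosity(logging.INFO)
  absl_app.run(main)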