Unverified Commit bc557d14 authored by saberkun's avatar saberkun Committed by GitHub
Browse files

Merged commit includes the following changes: (#6969)

251762562  by hongkuny<hongkuny@google.com>:

    Fix BLEU score inconsistency

--

PiperOrigin-RevId: 251762562
parent 152baba5
......@@ -34,6 +34,7 @@ import tensorflow as tf
# pylint: enable=g-bad-import-order
from official.transformer.utils import metrics
from official.transformer.utils import tokenizer
from official.utils.flags import core as flags_core
......@@ -86,8 +87,10 @@ def bleu_tokenize(string):
def bleu_wrapper(ref_filename, hyp_filename, case_sensitive=False):
"""Compute BLEU for two files (reference and hypothesis translation)."""
ref_lines = tf.io.gfile.GFile(ref_filename).read().strip().splitlines()
hyp_lines = tf.io.gfile.GFile(hyp_filename).read().strip().splitlines()
ref_lines = tokenizer.native_to_unicode(
tf.gfile.Open(ref_filename, "r").read()).split("\n")
hyp_lines = tokenizer.native_to_unicode(
tf.gfile.Open(hyp_filename, "r").read()).split("\n")
if len(ref_lines) != len(hyp_lines):
raise ValueError("Reference and translation files have different number of "
......
......@@ -124,7 +124,7 @@ class Subtokenizer(object):
def encode(self, raw_string, add_eos=False):
"""Encodes a string into a list of int subtoken ids."""
ret = []
tokens = _split_string_to_tokens(_native_to_unicode(raw_string))
tokens = _split_string_to_tokens(native_to_unicode(raw_string))
for token in tokens:
ret.extend(self._token_to_subtoken_ids(token))
if add_eos:
......@@ -193,15 +193,15 @@ def _load_vocab_file(vocab_file, reserved_tokens=None):
subtoken_list = []
with tf.io.gfile.GFile(vocab_file, mode="r") as f:
for line in f:
subtoken = _native_to_unicode(line.strip())
subtoken = native_to_unicode(line.strip())
subtoken = subtoken[1:-1] # Remove surrounding single-quotes
if subtoken in reserved_tokens:
continue
subtoken_list.append(_native_to_unicode(subtoken))
subtoken_list.append(native_to_unicode(subtoken))
return reserved_tokens + subtoken_list
def _native_to_unicode(s):
def native_to_unicode(s):
"""Convert string to unicode (required in Python 2)."""
try: # Python 2
return s if isinstance(s, unicode) else s.decode("utf-8")
......@@ -355,7 +355,7 @@ def _count_tokens(files, file_byte_limit=1e6):
counter = 0
# Add words to token counts
for token in _split_string_to_tokens(_native_to_unicode(line)):
for token in _split_string_to_tokens(native_to_unicode(line)):
token_counts[token] += 1
return token_counts
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment