Commit 38f1dee9 authored by Davide Caroselli, committed by Facebook Github Bot

Enforce UTF-8 when opening text files with open() (#460)

Summary:
When opening text files without specifying the encoding (i.e. `open(path, "r")` or `open(path, "w")`), Python 3 uses the preferred locale encoding (`locale.getpreferredencoding()`), so the result is platform dependent and can change from one machine to another.

I believe fairseq should enforce its own standard (UTF-8 seems like the best choice to me). This pull request explicitly specifies UTF-8 encoding when opening text files.
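For illustration only (not part of this patch), here is a minimal sketch of the platform dependence and the fix; the scratch path and sample string are arbitrary:

```python
import locale
import os
import tempfile

# Python 3 falls back to the locale's preferred encoding for text-mode files.
print(locale.getpreferredencoding(False))  # e.g. 'UTF-8' on Linux, 'cp1252' on Windows

text = "→ 日本語"  # characters that many legacy locale encodings cannot represent
path = os.path.join(tempfile.mkdtemp(), "sample.txt")  # hypothetical scratch file

# Explicit encoding: identical behaviour on every machine.
with open(path, "w", encoding="utf-8") as f:
    f.write(text)
with open(path, "r", encoding="utf-8") as f:
    assert f.read() == text

# Without encoding=..., the same write may raise UnicodeEncodeError (or a later
# read may decode differently) on machines whose preferred encoding is not UTF-8.
```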
Pull Request resolved: https://github.com/pytorch/fairseq/pull/460

Differential Revision: D13802525

Pulled By: myleott

fbshipit-source-id: 672fd55707ee559ab36d74bc1c24026166ea2367
parent ef3e6ab5
@@ -168,7 +168,7 @@ class IndexedRawTextDataset(torch.utils.data.Dataset):
         self.size = len(self.tokens_list)
 
     def read_data(self, path, dictionary):
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 self.lines.append(line.strip('\n'))
                 tokens = Tokenizer.tokenize(
...
@@ -36,7 +36,7 @@ class Tokenizer:
     @staticmethod
     def add_file_to_dictionary_single_worker(filename, tokenize, eos_word, worker_id=0, num_workers=1):
         counter = Counter()
-        with open(filename, 'r') as f:
+        with open(filename, 'r', encoding='utf-8') as f:
             size = os.fstat(f.fileno()).st_size
             chunk_size = size // num_workers
             offset = worker_id * chunk_size
@@ -86,7 +86,7 @@ class Tokenizer:
             if idx == dict.unk_index and word != dict.unk_word:
                 replaced.update([word])
 
-        with open(filename, 'r') as f:
+        with open(filename, 'r', encoding='utf-8') as f:
             f.seek(offset)
             # next(f) breaks f.tell(), hence readline() must be used
             line = safe_readline(f)
@@ -110,7 +110,7 @@ class Tokenizer:
 
     @staticmethod
     def find_offsets(filename, num_chunks):
-        with open(filename, 'r') as f:
+        with open(filename, 'r', encoding='utf-8') as f:
             size = os.fstat(f.fileno()).st_size
             chunk_size = size // num_chunks
             offsets = [0 for _ in range(num_chunks + 1)]
...
@@ -232,9 +232,9 @@ def main(args):
         src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
         tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))
         freq_map = {}
-        with open(args.alignfile, "r") as align_file:
-            with open(src_file_name, "r") as src_file:
-                with open(tgt_file_name, "r") as tgt_file:
+        with open(args.alignfile, "r", encoding='utf-8') as align_file:
+            with open(src_file_name, "r", encoding='utf-8') as src_file:
+                with open(tgt_file_name, "r", encoding='utf-8') as tgt_file:
                     for a, s, t in zip_longest(align_file, src_file, tgt_file):
                         si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                         ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
@@ -264,7 +264,7 @@ def main(args):
                 args.destdir,
                 "alignment.{}-{}.txt".format(args.source_lang, args.target_lang),
             ),
-            "w",
+            "w", encoding='utf-8'
         ) as f:
             for k, v in align_dict.items():
                 print("{} {}".format(src_dict[k], tgt_dict[v]), file=f)
...
@@ -59,8 +59,8 @@ def main():
     # create joined file
     joined_file = os.path.join(args.output_dir, 'text.joined')
-    with open(args.source_file, 'r') as src, open(args.target_file, 'r') as tgt:
-        with open(joined_file, 'w') as joined:
+    with open(args.source_file, 'r', encoding='utf-8') as src, open(args.target_file, 'r', encoding='utf-8') as tgt:
+        with open(joined_file, 'w', encoding='utf-8') as joined:
             for s, t in zip_longest(src, tgt):
                 print('{} ||| {}'.format(s.strip(), t.strip()), file=joined)
...