Commit 38f1dee9 authored by Davide Caroselli, committed by Facebook Github Bot

Enforce UTF-8 when opening text files with open() (#460)

Summary:
When opening text files without specifying the encoding (i.e. `open(path, "r")` or `open(path, "w")`), Python 3 uses the preferred locale encoding (`locale.getpreferredencoding()`), so the result is platform dependent and can change from one machine to another.

I believe fairseq should enforce its own standard (UTF-8 seems like the best choice to me). This pull request explicitly specifies UTF-8 encoding when opening text files.
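For illustration only (not part of this patch), here is a minimal sketch of the platform dependence and the fix; the scratch path and sample string are arbitrary:

```python
import locale
import os
import tempfile

# Python 3 falls back to the locale's preferred encoding for text-mode files.
print(locale.getpreferredencoding(False))  # e.g. 'UTF-8' on Linux, 'cp1252' on Windows

text = "→ 日本語"  # characters that many legacy locale encodings cannot represent
path = os.path.join(tempfile.mkdtemp(), "sample.txt")  # hypothetical scratch file

# Explicit encoding: identical behaviour on every machine.
with open(path, "w", encoding="utf-8") as f:
    f.write(text)
with open(path, "r", encoding="utf-8") as f:
    assert f.read() == text

# Without encoding=..., the same write may raise UnicodeEncodeError (or a later
# read may decode differently) on machines whose preferred encoding is not UTF-8.
```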
Pull Request resolved: https://github.com/pytorch/fairseq/pull/460

Differential Revision: D13802525

Pulled By: myleott

fbshipit-source-id: 672fd55707ee559ab36d74bc1c24026166ea2367
parent ef3e6ab5
@@ -168,7 +168,7 @@ class IndexedRawTextDataset(torch.utils.data.Dataset):
         self.size = len(self.tokens_list)
 
     def read_data(self, path, dictionary):
-        with open(path, 'r') as f:
+        with open(path, 'r', encoding='utf-8') as f:
             for line in f:
                 self.lines.append(line.strip('\n'))
                 tokens = Tokenizer.tokenize(
...
@@ -36,7 +36,7 @@ class Tokenizer:
     @staticmethod
     def add_file_to_dictionary_single_worker(filename, tokenize, eos_word, worker_id=0, num_workers=1):
         counter = Counter()
-        with open(filename, 'r') as f:
+        with open(filename, 'r', encoding='utf-8') as f:
             size = os.fstat(f.fileno()).st_size
             chunk_size = size // num_workers
             offset = worker_id * chunk_size
@@ -86,7 +86,7 @@ class Tokenizer:
             if idx == dict.unk_index and word != dict.unk_word:
                 replaced.update([word])
 
-        with open(filename, 'r') as f:
+        with open(filename, 'r', encoding='utf-8') as f:
             f.seek(offset)
             # next(f) breaks f.tell(), hence readline() must be used
             line = safe_readline(f)
@@ -110,7 +110,7 @@ class Tokenizer:
 
     @staticmethod
     def find_offsets(filename, num_chunks):
-        with open(filename, 'r') as f:
+        with open(filename, 'r', encoding='utf-8') as f:
             size = os.fstat(f.fileno()).st_size
             chunk_size = size // num_chunks
             offsets = [0 for _ in range(num_chunks + 1)]
...
@@ -232,9 +232,9 @@ def main(args):
         src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
         tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))
         freq_map = {}
-        with open(args.alignfile, "r") as align_file:
-            with open(src_file_name, "r") as src_file:
-                with open(tgt_file_name, "r") as tgt_file:
+        with open(args.alignfile, "r", encoding='utf-8') as align_file:
+            with open(src_file_name, "r", encoding='utf-8') as src_file:
+                with open(tgt_file_name, "r", encoding='utf-8') as tgt_file:
                     for a, s, t in zip_longest(align_file, src_file, tgt_file):
                         si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                         ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
@@ -264,7 +264,7 @@ def main(args):
                 args.destdir,
                 "alignment.{}-{}.txt".format(args.source_lang, args.target_lang),
             ),
-            "w",
+            "w", encoding='utf-8'
         ) as f:
             for k, v in align_dict.items():
                 print("{} {}".format(src_dict[k], tgt_dict[v]), file=f)
...
@@ -59,8 +59,8 @@ def main():
     # create joined file
     joined_file = os.path.join(args.output_dir, 'text.joined')
-    with open(args.source_file, 'r') as src, open(args.target_file, 'r') as tgt:
-        with open(joined_file, 'w') as joined:
+    with open(args.source_file, 'r', encoding='utf-8') as src, open(args.target_file, 'r', encoding='utf-8') as tgt:
+        with open(joined_file, 'w', encoding='utf-8') as joined:
             for s, t in zip_longest(src, tgt):
                 print('{} ||| {}'.format(s.strip(), t.strip()), file=joined)
...