Make the dictionary class an input to the fairseq preprocess functions (#482)

Summary:
Pull Request resolved: https://github.com/pytorch/fairseq/pull/482

With this change, we can use different dictionary classes when calling build_dictionary and build_and_save_dictionary.

Reviewed By: liaimi

Differential Revision: D13855100

fbshipit-source-id: 62e6db310b5f078e05c547d2671252233be7b7f0

Make the dictionary class an input to the fairseq preprocess functions (#482)
Summary:
Pull Request resolved: https://github.com/pytorch/fairseq/pull/482

With this change, we can use different dictionary classes when calling build_dictionary and build_and_save_dictionary.

Reviewed By: liaimi

Differential Revision: D13855100

fbshipit-source-id: 62e6db310b5f078e05c547d2671252233be7b7f0
66ce2175 · Jingfei Du · Facebook Github Bot · b41c74dc · 66ce2175
Commit 66ce2175 authored Jan 29, 2019 by Jingfei Du Committed by Facebook Github Bot Jan 29, 2019
Hide whitespace changes
Inline Side-by-side

Showing with 19 additions and 6 deletions

preprocess.py preprocess.py +19 -6

No files found.
--- a/preprocess.py
+++ b/preprocess.py
@@ -271,17 +271,21 @@ def main(args):
 def build_and_save_dictionary(
-    train_path, output_path, num_workers, freq_threshold, max_words
+    train_path, output_path, num_workers, freq_threshold, max_words, dict_cls=dictionary.Dictionary,
 ):
-    dict = build_dictionary([train_path], num_workers)
+    dict = build_dictionary([train_path], num_workers, dict_cls)
    dict.finalize(threshold=freq_threshold, nwords=max_words)
    dict_path = os.path.join(output_path, "dict.txt")
    dict.save(dict_path)
    return dict_path
-def build_dictionary(filenames, workers):
+def build_dictionary(
-    d = dictionary.Dictionary()
+    filenames,
+    workers,
+    dict_cls=dictionary.Dictionary,
+):
+    d = dict_cls()
    for filename in filenames:
        Tokenizer.add_file_to_dictionary(filename, d, tokenize_line, workers)
    return d
@@ -300,8 +304,17 @@ def binarize(args, filename, dict, output_prefix, lang, offset, end):
    return res
-def binarize_with_load(args, filename, dict_path, output_prefix, lang, offset, end):
+def binarize_with_load(
-    dict = dictionary.Dictionary.load(dict_path)
+    args,
+    filename,
+    dict_path,
+    output_prefix,
+    lang,
+    offset,
+    end,
+    dict_cls=dictionary.Dictionary,
+):
+    dict = dict_cls.load(dict_path)
    binarize(args, filename, dict, output_prefix, lang, offset, end)
    return dataset_dest_prefix(args, output_prefix, lang)