Commit 66ce2175 authored by Jingfei Du, committed by Facebook Github Bot
Browse files

make dictionary class as input for fairseq preprocess functions (#482)

Summary:
Pull Request resolved: https://github.com/pytorch/fairseq/pull/482

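With this change, callers can supply a different dictionary class through the new `dict_cls` argument (default: `dictionary.Dictionary`) when calling build_dictionary, build_and_save_dictionary, and binarize_with_load.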

Reviewed By: liaimi

Differential Revision: D13855100

fbshipit-source-id: 62e6db310b5f078e05c547d2671252233be7b7f0
parent b41c74dc
...@@ -271,17 +271,21 @@ def main(args): ...@@ -271,17 +271,21 @@ def main(args):
def build_and_save_dictionary( def build_and_save_dictionary(
train_path, output_path, num_workers, freq_threshold, max_words train_path, output_path, num_workers, freq_threshold, max_words, dict_cls=dictionary.Dictionary,
): ):
dict = build_dictionary([train_path], num_workers) dict = build_dictionary([train_path], num_workers, dict_cls)
dict.finalize(threshold=freq_threshold, nwords=max_words) dict.finalize(threshold=freq_threshold, nwords=max_words)
dict_path = os.path.join(output_path, "dict.txt") dict_path = os.path.join(output_path, "dict.txt")
dict.save(dict_path) dict.save(dict_path)
return dict_path return dict_path
def build_dictionary(filenames, workers): def build_dictionary(
d = dictionary.Dictionary() filenames,
workers,
dict_cls=dictionary.Dictionary,
):
d = dict_cls()
for filename in filenames: for filename in filenames:
Tokenizer.add_file_to_dictionary(filename, d, tokenize_line, workers) Tokenizer.add_file_to_dictionary(filename, d, tokenize_line, workers)
return d return d
...@@ -300,8 +304,17 @@ def binarize(args, filename, dict, output_prefix, lang, offset, end): ...@@ -300,8 +304,17 @@ def binarize(args, filename, dict, output_prefix, lang, offset, end):
return res return res
def binarize_with_load(args, filename, dict_path, output_prefix, lang, offset, end): def binarize_with_load(
dict = dictionary.Dictionary.load(dict_path) args,
filename,
dict_path,
output_prefix,
lang,
offset,
end,
dict_cls=dictionary.Dictionary,
):
dict = dict_cls.load(dict_path)
binarize(args, filename, dict, output_prefix, lang, offset, end) binarize(args, filename, dict, output_prefix, lang, offset, end)
return dataset_dest_prefix(args, output_prefix, lang) return dataset_dest_prefix(args, output_prefix, lang)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment