Commit cea0e4b9 authored by Ruty Rinott, committed by Facebook Github Bot

stitch preprocessing pipeline

Summary:
1. Add a call to binarization to complete the preprocessing pipeline
2. Add the ability to specify a task that selects the dictionary, and add a BERT task (a rough sketch of this idea follows the commit metadata below)
3. Remove function calls that are no longer needed after moving functions from fairseq here

Reviewed By: jingfeidu

Differential Revision: D13977842

fbshipit-source-id: ec9bbb4e98e62e12c20ba68bb52b8bcc94aee91d
parent c49c292c
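
Item 2 of the summary (task-based dictionary selection) is not part of the hunks shown below. As a rough illustration only, a minimal sketch of the idea might look like the following; the class names, task keys, and the dictionary_for_task helper are assumptions made here for clarity, not code from this commit.

class Dictionary:
    """Stand-in for the default fairseq-style dictionary."""

class BertDictionary(Dictionary):
    """Stand-in for a BERT/wordpiece-style dictionary used by a 'bert' task."""

# Map each preprocessing task to the dictionary implementation it should use.
DICTIONARY_BY_TASK = {
    "bert": BertDictionary,
    "default": Dictionary,
}

def dictionary_for_task(task):
    # Unknown tasks fall back to the default dictionary.
    dict_cls = DICTIONARY_BY_TASK.get(task, DICTIONARY_BY_TASK["default"])
    return dict_cls()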
@@ -232,7 +232,7 @@ def main(args):
             print("{} {}".format(src_dict[k], tgt_dict[v]), file=f)
 
 
-def binarize(args, filename, dict, output_prefix, lang, offset, end):
+def binarize(args, filename, dict, output_prefix, lang, offset, end, append_eos=True):
     ds = indexed_dataset.IndexedDatasetBuilder(
         dataset_dest_file(args, output_prefix, lang, "bin")
     )
@@ -240,7 +240,14 @@ def binarize(args, filename, dict, output_prefix, lang, offset, end):
     def consumer(tensor):
         ds.add_item(tensor)
 
-    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
+    res = Tokenizer.binarize(
+        filename,
+        dict,
+        consumer,
+        offset=offset,
+        end=end,
+        append_eos=append_eos
+    )
     ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
     return res
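
The new append_eos flag is threaded through to Tokenizer.binarize so callers of binarize can decide whether an end-of-sentence symbol is appended to each binarized line (translation-style data keeps it, while a BERT-style task can turn it off). A minimal, self-contained sketch of what the flag controls; the encode_line function and toy vocabulary below are assumptions for illustration, whereas the actual code delegates to Tokenizer.binarize:

def encode_line(line, vocab, eos_index, append_eos=True):
    # Convert whitespace-separated tokens to ids, optionally ending with EOS.
    ids = [vocab[token] for token in line.split()]
    if append_eos:
        ids.append(eos_index)
    return ids

vocab = {"hello": 0, "world": 1}
print(encode_line("hello world", vocab, eos_index=2))                    # [0, 1, 2]
print(encode_line("hello world", vocab, eos_index=2, append_eos=False))  # [0, 1]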