Commit 851c0226, authored by Naman Goyal, committed by Facebook Github Bot
Browse files

Added a check in TokenBlockDataset for multiple consecutive blank lines

Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/830

Differential Revision: D16861799

fbshipit-source-id: d85deaf78ec5b9c23eafd4145a96252e3901fa22
parent a3cfd51d
......@@ -48,6 +48,13 @@ class TokenBlockDataset(FairseqDataset):
assert len(dataset) == len(sizes)
assert len(dataset) > 0
sizes = np.array(sizes, dtype=int)
assert np.all(np.diff((sizes == document_sep_len).nonzero()) != 1),\
(
"Found multiple blank lines in the dataset, please remove them"
" (eg. cat -s raw.txt) and preprocess the data again."
)
if break_mode is None or break_mode == 'none':
total_size = sum(sizes)
length = math.ceil(total_size / block_size)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment