"src/lib/vscode:/vscode.git/clone" did not exist on "e6918397714b7251004698fe903645f8c0d53a22"
Commit 851c0226 authored by Naman Goyal, committed by Facebook Github Bot
Browse files

added check in token block dataset for multiple consecutive blank lines

Summary: Pull Request resolved: https://github.com/fairinternal/fairseq-py/pull/830

Differential Revision: D16861799

fbshipit-source-id: d85deaf78ec5b9c23eafd4145a96252e3901fa22
parent a3cfd51d
......@@ -48,6 +48,13 @@ class TokenBlockDataset(FairseqDataset):
assert len(dataset) == len(sizes)
assert len(dataset) > 0
sizes = np.array(sizes, dtype=int)
assert np.all(np.diff((sizes == document_sep_len).nonzero()) != 1),\
(
"Found multiple blank lines in the dataset, please remove them"
" (eg. cat -s raw.txt) and preprocess the data again."
)
if break_mode is None or break_mode == 'none':
total_size = sum(sizes)
length = math.ceil(total_size / block_size)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment