Commit 3f4bc91b authored by Jared Casper
Browse files

Skip any empty sentences during preprocessing.

parent 61697cab
......@@ -44,7 +44,8 @@ class Encoder(object):
for sentence in Encoder.splitter.tokenize(text):
tokens = Encoder.tokenizer.tokenize(sentence)
ids = Encoder.tokenizer.convert_tokens_to_ids(tokens)
doc_ids.append(ids)
if len(ids) > 0:
doc_ids.append(ids)
return doc_ids, len(json_line)
def main():
......
......@@ -18,16 +18,18 @@ def test_indexed_dataset(args):
if ds.supports_prefetch:
# just prefetch the whole thing in test (so assume it is small)
ds.prefetch(range(len(ds)))
for i in range(2):
for i in range(len(ds.doc_idx)-1):
start = ds.doc_idx[i]
end = ds.doc_idx[i+1]
ids = ds[start:end]
for s in ids:
assert len(s) > 0
l = s.data.tolist()
print(l)
tokens = tokenizer.convert_ids_to_tokens(l)
print(tokens)
print("******** END DOCUMENT **********")
for t in tokens:
if '\n' in t:
print("Newline in string!")
print(i)
def main():
parser = argparse.ArgumentParser()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment