Commit 09d220cf authored by Jared Casper
Browse files

Handle empty documents in preprocess_data.

parent 1b8e2891
...@@ -85,7 +85,7 @@ class Encoder(object): ...@@ -85,7 +85,7 @@ class Encoder(object):
sentence_ids = Encoder.tokenizer.tokenize(sentence) sentence_ids = Encoder.tokenizer.tokenize(sentence)
if len(sentence_ids) > 0: if len(sentence_ids) > 0:
doc_ids.append(sentence_ids) doc_ids.append(sentence_ids)
if self.args.append_eod: if len(doc_ids) > 0 and self.args.append_eod:
doc_ids[-1].append(Encoder.tokenizer.eod) doc_ids[-1].append(Encoder.tokenizer.eod)
ids[key] = doc_ids ids[key] = doc_ids
return ids, len(json_line) return ids, len(json_line)
...@@ -182,6 +182,8 @@ def main(): ...@@ -182,6 +182,8 @@ def main():
for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1): for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
total_bytes_processed += bytes_processed total_bytes_processed += bytes_processed
for key, sentences in doc.items(): for key, sentences in doc.items():
if len(sentences) == 0:
continue
for sentence in sentences: for sentence in sentences:
builders[key].add_item(torch.IntTensor(sentence)) builders[key].add_item(torch.IntTensor(sentence))
builders[key].end_document() builders[key].end_document()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment