Commit 5c18825a authored by LysandreJik
Browse files

Removed dataset limit

parent 3e3e1454
...@@ -14,7 +14,7 @@ class WikiTextDataset(Dataset): ...@@ -14,7 +14,7 @@ class WikiTextDataset(Dataset):
with open(os.path.join(directory, f"wiki.{file}.raw"), encoding="utf-8") as f: with open(os.path.join(directory, f"wiki.{file}.raw"), encoding="utf-8") as f:
text = f.read() text = f.read()
spans = list(filter(lambda item: len(item) > 120, text.split("\n")[:20])) spans = list(filter(lambda item: len(item) > 120, text.split("\n")))
for span in spans: for span in spans:
span = tokenizer.encode(span) span = tokenizer.encode(span)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment