Commit fd43d570 authored by cardy20
Browse files

ngram for polyglot-ko

parent f740d5a3
......@@ -49,10 +49,7 @@ def get_pile(directory):
reader = Reader()
# for file in glob.glob(os.path.join(directory, f"*.jsonl.zst*")):
for dir in os.listdir(directory):
print(os.path.join(directory + dir, f".jsonl"))
for file in glob.glob(os.path.join(directory + dir)):
# for document in open(file).read():
for file in glob.glob(os.path.join(directory + dir, "*.jsonl")):
for document in reader.read(file):
yield document
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment