Commit fd43d570 authored by cardy20
Browse files

ngram for polyglot-ko

parent f740d5a3
...@@ -49,10 +49,7 @@ def get_pile(directory): ...@@ -49,10 +49,7 @@ def get_pile(directory):
reader = Reader() reader = Reader()
# for file in glob.glob(os.path.join(directory, f"*.jsonl.zst*")): # for file in glob.glob(os.path.join(directory, f"*.jsonl.zst*")):
for dir in os.listdir(directory): for dir in os.listdir(directory):
print(os.path.join(directory + dir, f".jsonl")) for file in glob.glob(os.path.join(directory + dir, "*.jsonl")):
for file in glob.glob(os.path.join(directory + dir)):
# for document in open(file).read():
for document in reader.read(file): for document in reader.read(file):
yield document yield document
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment