Commit f740d5a3 authored by cardy20

ngram modified

parent 122b8526
INFO - 05/25/23 13:54:55 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 13:54:55 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:17:37 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:17:37 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:17:52 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:17:52 - 0:00:00 - ngrams already generated and bucketed, skipping
INFO - 05/25/23 14:22:13 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:22:13 - 0:00:00 - ngrams already generated and bucketed, skipping
INFO - 05/25/23 14:22:37 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:22:37 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:23:04 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:23:04 - 0:00:00 - ngrams already generated and bucketed, skipping
INFO - 05/25/23 14:23:29 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:23:29 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:24:56 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:24:56 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:25:47 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:25:47 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:28:28 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:28:28 - 0:00:00 - ngrams already generated and bucketed, skipping
INFO - 05/25/23 14:28:40 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:28:40 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:29:28 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:29:28 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:36:03 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:36:03 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:37:46 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:37:46 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:52:10 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:52:10 - 0:00:00 - Starting at pile document index 0
@@ -39,9 +39,12 @@ class Reader:
     def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner='\n\n'):
         with open(file, 'rb') as fh:
             self.fh = fh
-            cctx = zstandard.ZstdDecompressor()
-            reader = io.BufferedReader(cctx.stream_reader(fh))
+            #cctx = zstandard.ZstdDecompressor()
+            # reader = io.BufferedReader(cctx.stream_reader(fh))
+            reader = io.BufferedReader(fh)
             rdr = jsonlines.Reader(reader)
             for ob in rdr:
                 # naive jsonl where each object is just the string itself, with no meta. For legacy compatibility.
                 if isinstance(ob, str):
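With the zstd layer commented out, read now streams uncompressed JSONL straight through io.BufferedReader. For reference, a minimal runnable sketch of the method after this change; the hunk is truncated below the isinstance(ob, str) check, so the text/meta handling here follows the usual Pile JSONL record shape ({"text": ..., "meta": ...}) and is an assumption, not the committed code:

import io
import jsonlines

class Reader:
    def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner='\n\n'):
        # The zstd decompression step is bypassed, so `file` must be an
        # uncompressed .jsonl file rather than a .jsonl.zst archive.
        with open(file, 'rb') as fh:
            self.fh = fh
            reader = io.BufferedReader(fh)
            rdr = jsonlines.Reader(reader)
            for ob in rdr:
                # Naive jsonl where each object is just the string itself,
                # with no meta. For legacy compatibility.
                if isinstance(ob, str):
                    yield ob
                    continue
                # Assumed continuation: standard Pile records are dicts with
                # a 'text' field and an optional 'meta' field.
                text = ob['text']
                if autojoin_paragraphs and isinstance(text, list):
                    text = para_joiner.join(text)
                if get_meta:
                    yield text, ob.get('meta', {})
                else:
                    yield text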
@@ -46,12 +46,14 @@ def handler(signal_received, frame):
     terminate = True

 def get_pile(directory):
-    # reader = Reader()
+    reader = Reader()
     # for file in glob.glob(os.path.join(directory, f"*.jsonl.zst*")):
     for dir in os.listdir(directory):
-        for file in glob.glob(os.path.join(directory + dir, f"*.jsonl")):
-            for document in open(file).read():
-            # for document in reader.read(file):
+        print(os.path.join(directory + dir, f".jsonl"))
+        for file in glob.glob(os.path.join(directory + dir)):
+            # for document in open(file).read():
+            for document in reader.read(file):
             yield document

 def close_buckets(buckets):
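The get_pile change switches from raw open(file).read() back to Reader.read, but note that the committed glob pattern, os.path.join(directory + dir) with no wildcard, matches only the subdirectory path itself. Below is a hedged sketch of the traversal this hunk appears to intend, with the *.jsonl wildcard restored and os.path.join used instead of string concatenation; both adjustments are assumptions about intent, not the committed code:

import glob
import os

def get_pile(directory):
    # Walk each subdirectory of the Pile root and yield documents from
    # every uncompressed .jsonl shard found there.
    reader = Reader()
    for sub in os.listdir(directory):
        # Assumed pattern: the committed code drops the "*.jsonl" wildcard,
        # which would pass the directory path itself to reader.read().
        pattern = os.path.join(directory, sub, "*.jsonl")
        for file in glob.glob(pattern):
            for document in reader.read(file):
                yield document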