Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
f740d5a3
Commit
f740d5a3
authored
May 29, 2023
by
cardy20
Browse files
ngram modified
parent
122b8526
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
12 additions
and
37 deletions
+12
-37
ngrams.log
ngrams.log
+0
-30
scripts/clean_training_data/archiver.py
scripts/clean_training_data/archiver.py
+5
-2
scripts/clean_training_data/generate_13_grams.py
scripts/clean_training_data/generate_13_grams.py
+7
-5
No files found.
ngrams.log
deleted
100644 → 0
View file @
122b8526
INFO - 05/25/23 13:54:55 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 13:54:55 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:17:37 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:17:37 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:17:52 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:17:52 - 0:00:00 - ngrams already generated and bucketed, skipping
INFO - 05/25/23 14:22:13 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:22:13 - 0:00:00 - ngrams already generated and bucketed, skipping
INFO - 05/25/23 14:22:37 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:22:37 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:23:04 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:23:04 - 0:00:00 - ngrams already generated and bucketed, skipping
INFO - 05/25/23 14:23:29 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:23:29 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:24:56 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:24:56 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:25:47 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:25:47 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:28:28 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:28:28 - 0:00:00 - ngrams already generated and bucketed, skipping
INFO - 05/25/23 14:28:40 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:28:40 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:29:28 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:29:28 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:36:03 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:36:03 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:37:46 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:37:46 - 0:00:00 - Starting at pile document index 0
INFO - 05/25/23 14:52:10 - 0:00:00 - Generating 13-grams and bucketing.
INFO - 05/25/23 14:52:10 - 0:00:00 - Starting at pile document index 0
scripts/clean_training_data/archiver.py
View file @
f740d5a3
...
@@ -39,9 +39,12 @@ class Reader:
...
@@ -39,9 +39,12 @@ class Reader:
def
read
(
self
,
file
,
get_meta
=
False
,
autojoin_paragraphs
=
True
,
para_joiner
=
'
\n\n
'
):
def
read
(
self
,
file
,
get_meta
=
False
,
autojoin_paragraphs
=
True
,
para_joiner
=
'
\n\n
'
):
with
open
(
file
,
'rb'
)
as
fh
:
with
open
(
file
,
'rb'
)
as
fh
:
self
.
fh
=
fh
self
.
fh
=
fh
cctx
=
zstandard
.
ZstdDecompressor
()
reader
=
io
.
BufferedReader
(
cctx
.
stream_reader
(
fh
))
#cctx = zstandard.ZstdDecompressor()
# reader = io.BufferedReader(cctx.stream_reader(fh))
reader
=
io
.
BufferedReader
(
fh
)
rdr
=
jsonlines
.
Reader
(
reader
)
rdr
=
jsonlines
.
Reader
(
reader
)
for
ob
in
rdr
:
for
ob
in
rdr
:
# naive jsonl where each object is just the string itself, with no meta. For legacy compatibility.
# naive jsonl where each object is just the string itself, with no meta. For legacy compatibility.
if
isinstance
(
ob
,
str
):
if
isinstance
(
ob
,
str
):
...
...
scripts/clean_training_data/generate_13_grams.py
View file @
f740d5a3
...
@@ -46,12 +46,14 @@ def handler(signal_received, frame):
...
@@ -46,12 +46,14 @@ def handler(signal_received, frame):
terminate
=
True
terminate
=
True
def
get_pile
(
directory
):
def
get_pile
(
directory
):
#
reader = Reader()
reader
=
Reader
()
# for file in glob.glob(os.path.join(directory, f"*.jsonl.zst*")):
# for file in glob.glob(os.path.join(directory, f"*.jsonl.zst*")):
for
dir
in
os
.
listdir
(
directory
):
for
dir
in
os
.
listdir
(
directory
):
for
file
in
glob
.
glob
(
os
.
path
.
join
(
directory
+
dir
,
f
"*.jsonl"
)):
print
(
os
.
path
.
join
(
directory
+
dir
,
f
".jsonl"
))
for
document
in
open
(
file
).
read
():
for
file
in
glob
.
glob
(
os
.
path
.
join
(
directory
+
dir
)):
# for document in reader.read(file):
# for document in open(file).read():
for
document
in
reader
.
read
(
file
):
yield
document
yield
document
def
close_buckets
(
buckets
):
def
close_buckets
(
buckets
):
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment