gaoqiong / lm-evaluation-harness

Commit 09915adf, authored May 29, 2023 by cardy20 (parent fd43d570)

    add master files

Showing 3 changed files, with 175 additions and 0 deletions:

    scripts/clean_training_data/compress_and_package.py    +73   -0
    scripts/clean_training_data/investigate_pile.py         +101  -0
    scripts/clean_training_data/sort_13_gram_buckets.py     +1    -0
scripts/clean_training_data/compress_and_package.py (new file, mode 100644)
# Compresses each sorted 13-gram bucket file with zstd, moves the result to the
# output directory, and copies info.json along with it.

import glob
import argparse
import os
import subprocess
import shutil
import logging

from tqdm import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool
from tqdm_multiprocess.logger import setup_logger_tqdm

logger = logging.getLogger(__name__)


def process_task(working_directory, output_directory, bucket_file_path, tqdm_func, global_tqdm):
    # Compress the bucket in place, move the .zst to the output directory,
    # then delete the uncompressed original.
    command = f"zstd {bucket_file_path}"
    logger.info(command)
    subprocess.call(command, shell=True)

    compressed_file = bucket_file_path + ".zst"
    if output_directory:
        shutil.move(compressed_file, output_directory)

    os.remove(bucket_file_path)
    global_tqdm.update()


def compress_and_move(working_directory, output_directory, process_count):
    os.makedirs(output_directory, exist_ok=True)

    original_info_file_path = os.path.join(working_directory, "info.json")
    assert os.path.exists(original_info_file_path)

    tasks = []
    bucket_file_paths = glob.glob(
        os.path.join(working_directory, "output", "*.bkt.txt.sorted")
    )
    for bucket_file_path in bucket_file_paths:
        task = (process_task, (working_directory, output_directory, bucket_file_path))
        tasks.append(task)

    pool = TqdmMultiProcessPool(process_count)

    def on_done(_):
        return None

    def on_error(_):
        return None

    global_progress = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="file")
    _ = pool.map(global_progress, tasks, on_error, on_done)

    shutil.copy(original_info_file_path, os.path.join(output_directory, "info.json"))


parser = argparse.ArgumentParser(
    description="Compress sorted 13-gram bucket files and package them with info.json."
)
parser.add_argument("-dir", "--working_directory", required=True)
parser.add_argument("-output", "--output_directory", required=True)
parser.add_argument("-procs", "--process_count", type=int, default=8)

if __name__ == "__main__":
    version = 1.00
    print(f"Running version {version}")

    logfile_path = "compress_and_package.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    compress_and_move(args.working_directory, args.output_directory, args.process_count)
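For orientation, here is a minimal usage sketch of compress_and_package.py, calling compress_and_move() directly rather than going through the argparse CLI. The directory names are hypothetical, and importing the script as a module assumes the repository root is on PYTHONPATH; running the script with -dir/-output/-procs achieves the same thing.

# Minimal usage sketch (hypothetical paths). The working directory must contain
# info.json and output/*.bkt.txt.sorted buckets produced by earlier pipeline stages.
from scripts.clean_training_data.compress_and_package import compress_and_move

compress_and_move(
    working_directory="13gram_buckets",     # hypothetical: holds info.json and output/
    output_directory="13gram_buckets_zst",  # hypothetical: destination for the .zst files
    process_count=8,                        # same as the script's --process_count default
)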
scripts/clean_training_data/investigate_pile.py (new file, mode 100644)
# Scans the Pile jsonl files and records the total document count, total character
# count, and per-file document start offsets into pile_statistics.json.

import os
import json
import glob
from functools import reduce

import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool

from lm_eval.decontamination.archiver import Reader


def get_file_stats(file_path, tqdm_func, global_tqdm):
    reader = Reader()
    total_documents = 0
    total_size = 0
    update_frequency = 10000
    current_file_position = 0

    with tqdm_func(
        total=os.path.getsize(file_path), dynamic_ncols=True, unit="byte", unit_scale=1
    ) as progress:
        for document in reader.read(file_path, get_meta=True):
            total_size += len(document)
            total_documents += 1

            if total_documents % update_frequency == 0:
                new_file_pos = reader.fh.tell()
                bytes_read = new_file_pos - current_file_position
                current_file_position = new_file_pos

                progress.update(bytes_read)
                global_tqdm.update(bytes_read)

    return (total_documents, total_size)


def get_files_zst():
    directory = "pile"
    files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*"))))
    print(files)
    return files


def get_files():
    """jsonl files in directory"""
    directory = "pile"
    files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl"))))
    print(files)
    return files


def get_stats():
    files = get_files()
    total_size_bytes = sum(map(lambda x: os.path.getsize(x), files))

    pool = TqdmMultiProcessPool(4)
    global_tqdm = tqdm.tqdm(
        total=total_size_bytes, dynamic_ncols=True, unit="byte", unit_scale=1
    )

    # Collect per-file (document_count, character_count) stats with the pool.
    tasks = [(get_file_stats, (file,)) for file in files]

    def on_done(_):
        return None

    def on_error(_):
        return None

    results = pool.map(global_tqdm, tasks, on_error, on_done)

    total_documents, total_size = reduce(
        lambda x, y: (x[0] + y[0], x[1] + y[1]), results
    )

    start_offsets = []
    current_offset = 0
    for file_document_count, _ in results:
        start_offsets.append(current_offset)
        current_offset += file_document_count

    return (total_documents, total_size, start_offsets)


if __name__ == "__main__":
    version = 1.01
    print(f"Running version {version}")

    stats_file_path = "pile_statistics.json"
    if os.path.exists(stats_file_path):
        stats = json.load(open(stats_file_path, "r"))
    else:
        document_count, total_document_size_chars, start_offsets = get_stats()
        stats = {
            "Data": "Pile statistics",
            "Document Count": document_count,
            "Total Pile Characters": total_document_size_chars,
            "File Start Offsets": start_offsets,
        }
        json.dump(stats, open(stats_file_path, "w"), indent=4)

    print(f"document_count: {stats['Document Count']}")
    print(f"total_chars: {stats['Total Pile Characters']}")
    print(f"start_offsets: {stats['File Start Offsets']}")

\ No newline at end of file
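For reference, the pile_statistics.json written by this script has the following shape; the numbers below are round placeholders, not real Pile statistics.

# Illustrative shape of pile_statistics.json (keys match the script above; values are placeholders).
example_stats = {
    "Data": "Pile statistics",
    "Document Count": 1_000_000,                  # hypothetical total document count
    "Total Pile Characters": 5_000_000_000,       # hypothetical total character count
    "File Start Offsets": [0, 300_000, 650_000],  # document index at which each input file starts
}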
scripts/clean_training_data/sort_13_gram_buckets.py
@@ -34,6 +34,7 @@ def sort_13_gram_buckets(working_directory):
    for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True):
        bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path))
        done_file = os.path.join(working_directory, f"ngram_bucket_sorting_{bucket_id}.done")
        if os.path.exists(done_file):
            logger.info(f"bucket {bucket_id} already processed, skipping")
            return
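The hunk above checks a per-bucket .done marker so that an interrupted sorting run can skip buckets it has already handled. A rough sketch of that convention, assuming the marker is simply an empty file created once a bucket finishes (the directory and bucket id below are hypothetical):

import os
from pathlib import Path

working_directory = "13grams"  # hypothetical working directory
bucket_id = "7"                # hypothetical bucket id pulled from the bucket file name

# Marker path matching the pattern checked in sort_13_gram_buckets above.
done_file = os.path.join(working_directory, f"ngram_bucket_sorting_{bucket_id}.done")

# Presumably written once the bucket has been sorted, so a rerun skips it.
Path(done_file).touch()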