OpenDAS / Megatron-LM · Commits

Commit 5655f076
Authored Apr 12, 2020 by Mohammad
Parent: 20764e12

cleaned up old gpt2 dataset stuff from openwebtext

Showing 4 changed files with 0 additions and 159 deletions
openwebtext/make_gpt2_dataset.py      +0  -77
openwebtext/make_gpt2_sizes.py        +0  -38
openwebtext/run_make_gpt2_dataset.sh  +0  -8
openwebtext/tokenizer.py              +0  -36
openwebtext/make_gpt2_dataset.py  (deleted, 100644 → 0)
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import numpy as np
import time
import os
import sys

from tokenizer import Tokenizer


def tokenize_corpus(filename, np_filename, print_interval=10000):

    print(' > tokenizing {}'.format(filename))

    tokenizer = Tokenizer(cache_dir='./cache')

    tokenized_docs = []
    num_docs = 0
    num_tokens = 0
    start_time = time.time()
    with open(filename, 'r') as f:
        for line in f:
            try:
                myjson = json.loads(line)
                url = myjson['url']
                sample = myjson['text']
                tokens = tokenizer.tokenize_document(sample)
                tokenized_docs.append(np.array(tokens, dtype=np.uint16))
                num_docs += 1
                num_tokens += len(tokens)
                if num_docs % print_interval == 0:
                    print('    processed {:9d} documents in {:.2f} (s) so far'.
                          format(num_docs, time.time() - start_time),
                          flush=True)
            except Exception as e:
                print('    skipping ', line, e)

    print(' >> processed {} documents with a total of {} tokens ...'.format(
        num_docs, num_tokens))

    tokenized_docs = np.array(tokenized_docs, dtype=object)
    np.save(np_filename, tokenized_docs, allow_pickle=True)
    print(' >> saved the tokenized documents to {} ...'.format(np_filename))


if __name__ == '__main__':

    print('building gpt2 dataset ...')

    path = sys.argv[1]
    shard = sys.argv[2]

    input_filename = os.path.join(path,
                                  'shards/shard_{:04d}'.format(int(shard)))
    output_filename = os.path.join(path,
                                   'npys/shard_{:04d}.npy'.format(int(shard)))
    print('will be reading {}'.format(input_filename))
    print('and will write the results to {}'.format(output_filename))

    tokenize_corpus(input_filename, output_filename)
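For orientation (not part of the diff): this deleted script expects a loose-JSON shard, one {"url": ..., "text": ...} object per line, under <path>/shards/, and writes <path>/npys/shard_NNNN.npy. A minimal sketch of preparing such a shard is below; the data/ directory and document contents are made up for illustration.

    # Hypothetical example: write a tiny loose-JSON shard in the layout the
    # deleted script expected, then tokenize it with make_gpt2_dataset.py.
    # Directory names and document texts are illustrative only.
    import json
    import os

    os.makedirs('data/shards', exist_ok=True)
    os.makedirs('data/npys', exist_ok=True)

    docs = [
        {'url': 'http://example.com/a', 'text': 'first toy document'},
        {'url': 'http://example.com/b', 'text': 'second toy document'},
    ]
    with open('data/shards/shard_0000', 'w') as f:
        for doc in docs:
            f.write(json.dumps(doc) + '\n')

    # Then, from the openwebtext directory:
    #   python make_gpt2_dataset.py data 0
    # which reads data/shards/shard_0000 and writes data/npys/shard_0000.npy.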
openwebtext/make_gpt2_sizes.py  (deleted, 100644 → 0)
import glob
import json
import os
import time
import sys

import numpy as np


if __name__ == '__main__':

    print('building the shard sizes ...')

    path = sys.argv[1]
    print('> reading numpy files from {}'.format(path))

    npy_files = glob.glob(path + '/*.npy')
    npy_files.sort()
    print('  found {} numpy files'.format(len(npy_files)))

    size_dict = {}
    counter = 0
    start_time = time.time()
    for filename in npy_files:
        data = np.load(filename, allow_pickle=True)
        size = np.hstack(data).size
        np_filename = os.path.basename(filename)
        size_dict[np_filename] = size
        counter += 1
        if counter % 10 == 0:
            print('  processed {} files in {:.2f} seconds'.format(
                counter, time.time() - start_time))

    output_filename = os.path.join(path, 'sizes.txt')
    with open(output_filename, 'w') as f:
        json.dump(size_dict, f)
    print('> wrote sizes to {}'.format(output_filename))
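Note that sizes.txt is JSON despite the extension. A small sketch of reading it back, with the directory name purely illustrative:

    # Hypothetical follow-up: load the sizes.txt written by make_gpt2_sizes.py
    # and report the total token count. The 'data/npys' path is illustrative.
    import json

    with open('data/npys/sizes.txt', 'r') as f:
        size_dict = json.load(f)

    print('{} shards, {} tokens in total'.format(
        len(size_dict), sum(size_dict.values())))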
openwebtext/run_make_gpt2_dataset.sh  (deleted, 100755 → 0)
#!/bin/bash

echo "processing gpt2 data ..."

DIR="/raid/mpatwary/redownload_v0/0-21"
for thread in {0..3}; do
    echo "  launching thread " $thread && \
        python make_gpt2_dataset.py $DIR $thread > $DIR/logs/shard_$thread.log 2>&1 &
done
openwebtext/tokenizer.py  (deleted, 100644 → 0)
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
sys.path.append('..')

from megatron.data_utils.tokenization_gpt2 import GPT2Tokenizer


class Tokenizer:

    def __init__(self, cache_dir=None):
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                                       cache_dir=cache_dir)
        self.tokenizer.max_len = int(1e12)
        self.eod_token = self.tokenizer.encoder['<|endoftext|>']
        assert self.eod_token < 65535, \
            'vocab size will not fit in uint16'
        print('> GPT2 tokenizer with {} vocab size and eod token {} ...'.format(
            len(self.tokenizer.encoder), self.eod_token))

    def tokenize_document(self, document):
        tokens = self.tokenizer.encode(document)
        tokens.append(self.eod_token)
        return tokens
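For reference, a minimal usage sketch of this wrapper, assuming it is run from the openwebtext directory so the relative import resolves; the sample text is made up:

    # Hypothetical usage of the deleted Tokenizer wrapper; the sample document
    # is made up. Each document ends with the <|endoftext|> id, which is what
    # lets shards be concatenated and split back on document boundaries.
    from tokenizer import Tokenizer

    tokenizer = Tokenizer(cache_dir='./cache')
    tokens = tokenizer.tokenize_document('Hello world.')
    assert tokens[-1] == tokenizer.eod_token
    print('{} tokens: {}'.format(len(tokens), tokens))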