OpenDAS / Megatron-LM

Commit 43d5d84b
Authored Apr 19, 2020 by Neel Kant

    Merge staging-realm into hashing

Parents: 787914ff, 4abd7ce2
Changes: 51. Showing 11 changed files on this page, with 269 additions and 21 deletions (+269 / -21).
tools/generate_samples_gpt2.py                  +5    -0
tools/linter.py                                 +0    -0
tools/merge_mp_partitions.py                    +64   -21
tools/openwebtext/README.md                     +0    -0
tools/openwebtext/blacklist_urls.py             +0    -0
tools/openwebtext/cleanup_dataset.py            +0    -0
tools/openwebtext/find_duplicates.py            +0    -0
tools/openwebtext/group_duplicates_url.py       +0    -0
tools/openwebtext/merge_jsons.py                +0    -0
tools/openwebtext/remove_group_duplicates.py    +0    -0
tools/preprocess_data.py                        +200  -0
generate_samples_gpt2.py → tools/generate_samples_gpt2.py
...
@@ -15,6 +15,11 @@
"""Sample Generate GPT2"""

+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                             os.path.pardir)))
+
from megatron import get_args
from megatron import get_tokenizer
from megatron import print_rank_0
...
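The five added lines are the relocation shim used throughout this commit: now that the script lives under tools/, the repository root (its parent directory) is appended to sys.path so the megatron package imports without an install step. A minimal standalone sketch of the same pattern; the repo_root variable and the final import are illustrative, not lines from the commit:

# Append this script's parent directory (assumed to be the repo root that
# contains the `megatron` package) to sys.path, then import as usual.
import os
import sys

repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
sys.path.append(repo_root)

from megatron import get_args  # now resolves against <repo_root>/megatron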
scripts/linter.py → tools/linter.py  (file moved)
merge_mp_partitions.py → tools/merge_mp_partitions.py
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Merge model parallel partitions."""
+import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
+                                             os.path.pardir)))

import torch

-from arguments import get_args
from megatron import mpu
-from megatron.utils import ensure_directory_exists
-from megatron.utils import get_checkpoint_name
-from megatron.utils import get_checkpoint_tracker_filename
-from megatron.utils import vocab_size_with_padding
+from megatron.checkpointing import ensure_directory_exists
+from megatron.checkpointing import get_checkpoint_name
+from megatron.checkpointing import get_checkpoint_tracker_filename
+from megatron.global_vars import rebuild_tokenizer
+from megatron.global_vars import _parse_args


def split_into_partitions(tensor, num_partitions, partition_dim, stride):
...
@@ -84,21 +104,26 @@ def merge_partitions(merged, partitions, partition_dim, stride):
    return


-def get_model(model_type, args):
+def get_model(model_type):

    if model_type == 'BERT':
-        from pretrain_albert import model_provider
-        args.tokentype_size = 2
-    elif model_type == 'GPT':
+        from pretrain_bert import model_provider
+    elif model_type == 'GPT2':
        from pretrain_gpt2 import model_provider
    elif model_type == 'RACE':
        from tasks.race.finetune import model_provider
    elif model_type == ['MNLI', 'QQP']:
        num_classes = 2
        if model_type == 'MNLI':
            num_classes = 3
        from megatron.model.classification import Classification
        def model_provider():
            return Classification(num_classes=num_classes,
                                  num_tokentypes=2)
    else:
        raise Exception('unrecognized model type: {}'.format(model_type))

-    orig_vocab_size = args.vocab_size
-    args.vocab_size = vocab_size_with_padding(args.vocab_size, args)
-    model = model_provider(args)
+    model = model_provider()
    model = model.half()
-    args.vocab_size = orig_vocab_size

    return model
...
@@ -147,17 +172,32 @@ def test_split_merge():
    print(' > max error (should be zero): {}'.format(max_error))


-def main(model_type):
+def get_mp_merge_args(parser):
+    """Provide extra arguments required for merging."""
+    group = parser.add_argument_group(title='mp merge')
+    group.add_argument('--model-type', type=str, required=True,
+                       choices=['BERT', 'GPT2', 'RACE', 'MNLI', 'QQP'],
+                       help='Type of the mdoel.')
+    return parser
+
+
+def main():

    # Args
-    args = get_args()
+    args = _parse_args(extra_args_provider=get_mp_merge_args)
+    model_type = args.model_type

    orig_model_parallel_size = args.model_parallel_size
    args.model_parallel_size = 1
+    tokenizer = rebuild_tokenizer(args)

    print('\nmerging model parallel partitions ...')
-    assert args.vocab_size is not None
-    print(' > number of partitions: {}'.format(args.model_parallel_size))
+    print(' > number of partitions: {}'.format(orig_model_parallel_size))
    print(' > checkpoint path: {}'.format(args.load))
    print(' > model parameters:')
-    print('    number of tokens ................ {} '.format(args.vocab_size))
+    print('    number of tokens ................ {} '.format(tokenizer.vocab_size))
    print('    number of layers ................ {}'.format(args.num_layers))
    print('    hidden sise ..................... {}'.format(args.hidden_size))
    print('    number of attention heads ....... {}'.format(
...
@@ -169,17 +209,19 @@ def main(model_type):
    print('> building the full model ...')
    mpu.initialize.set_model_parallel_world_size(1)
    mpu.initialize.set_model_parallel_rank(0)
-    merged_model = get_model(model_type, args)
+    merged_model = get_model(model_type)

    # Build and load partitions.
    partitions = []
    iteration = 0
    args.model_parallel_size = orig_model_parallel_size
+    tokenizer = rebuild_tokenizer(args)
    mpu.initialize.set_model_parallel_world_size(args.model_parallel_size)
    for rank in range(args.model_parallel_size):
        mpu.initialize.set_model_parallel_rank(rank)
        checkpoint_name, iteration = get_parallel_checkpoint_name(args.load)
        print('> loading {} ...'.format(checkpoint_name))
-        model_ = get_model(model_type, args)
+        model_ = get_model(model_type)
        sd = torch.load(checkpoint_name, map_location='cpu')
        model_.load_state_dict(sd['model'])
        partitions.append(model_)
...
@@ -225,6 +267,7 @@ def main(model_type):
    # Save the model.
    args.model_parallel_size = 1
    mpu.initialize.set_model_parallel_rank(0)
    sd = {}
    sd['model'] = merged_model.state_dict_for_save_checkpoint()
...
@@ -240,4 +283,4 @@ def main(model_type):
if __name__ == '__main__':

-    main('BERT')
+    main()
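With these changes the script no longer hard-codes main('BERT'): the model type now comes from a new --model-type flag, attached to Megatron's argument parser through _parse_args(extra_args_provider=get_mp_merge_args). Below is a minimal, self-contained sketch of that provider pattern using plain argparse; the parse_args stand-in and its base arguments are illustrative, not Megatron's actual _parse_args:

import argparse

def get_mp_merge_args(parser):
    # Mirrors the provider added in this commit: attach an extra argument group.
    group = parser.add_argument_group(title='mp merge')
    group.add_argument('--model-type', type=str, required=True,
                       choices=['BERT', 'GPT2', 'RACE', 'MNLI', 'QQP'])
    return parser

def parse_args(extra_args_provider=None):
    # Stand-in for megatron.global_vars._parse_args: build a base parser,
    # then let the caller-supplied provider register its own arguments.
    parser = argparse.ArgumentParser(description='mp merge sketch')
    parser.add_argument('--load', type=str, default=None)              # placeholder base arg
    parser.add_argument('--model-parallel-size', type=int, default=2)  # placeholder base arg
    if extra_args_provider is not None:
        parser = extra_args_provider(parser)
    return parser.parse_args()

if __name__ == '__main__':
    args = parse_args(extra_args_provider=get_mp_merge_args)
    print(args.model_type)  # chosen on the command line instead of hard-coded 'BERT'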
openwebtext/README.md → tools/openwebtext/README.md  (file moved)
openwebtext/blacklist_urls.py → tools/openwebtext/blacklist_urls.py  (file moved)
openwebtext/cleanup_dataset.py → tools/openwebtext/cleanup_dataset.py  (file moved)
openwebtext/find_duplicates.py → tools/openwebtext/find_duplicates.py  (file moved)
openwebtext/group_duplicates_url.py → tools/openwebtext/group_duplicates_url.py  (file moved)
openwebtext/merge_jsons.py → tools/openwebtext/merge_jsons.py  (file moved)
openwebtext/remove_group_duplicates.py → tools/openwebtext/remove_group_duplicates.py  (file moved)
tools/preprocess_data.py  (new file, mode 0 → 100644)
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processing data for pretraining."""
import argparse
import json
import multiprocessing
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
                                             os.path.pardir)))
import time

import torch
try:
    import nltk
    nltk_available = True
except ImportError:
    nltk_available = False

from megatron.tokenizer import build_tokenizer
from megatron.data import indexed_dataset


# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars):

    _period_context_fmt = r"""
        \S*                          # some word material
        %(SentEndChars)s             # a potential sentence ending
        \s*                       #  <-- THIS is what I changed
        (?=(?P<after_tok>
            %(NonWord)s              # either other punctuation
            |
            (?P<next_tok>\S+)     #  <-- Normally you would have \s+ here
        ))"""


class IdentitySplitter(object):
    def tokenize(self, *text):
        return text


class Encoder(object):
    def __init__(self, args):
        self.args = args

    def initializer(self):
        # Use Encoder class as a container for global data
        Encoder.tokenizer = build_tokenizer(self.args)
        if self.args.split_sentences:
            if not nltk_available:
                print("NLTK is not available to split sentences.")
                exit()
            splitter = nltk.load("tokenizers/punkt/english.pickle")
            if self.args.keep_newlines:
                # this prevents punkt from eating newlines after sentences
                Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
                    train_text=splitter._params,
                    lang_vars=CustomLanguageVars())
            else:
                Encoder.splitter = splitter
        else:
            Encoder.splitter = IdentitySplitter()

    def encode(self, json_line):
        data = json.loads(json_line)
        ids = {}
        for key in self.args.json_keys:
            text = data[key]
            doc_ids = []
            for sentence in Encoder.splitter.tokenize(text):
                sentence_ids = Encoder.tokenizer.tokenize(sentence)
                if len(sentence_ids) > 0:
                    doc_ids.append(sentence_ids)
            if self.args.append_eod:
                doc_ids[-1].append(Encoder.tokenizer.eod)
            ids[key] = doc_ids
        return ids, len(json_line)


def get_args():
    parser = argparse.ArgumentParser()
    group = parser.add_argument_group(title='input data')
    group.add_argument('--input', type=str, required=True,
                       help='Path to input JSON')
    group.add_argument('--json-keys', nargs='+', default=['text'],
                       help='space separate listed of keys to extract from json')
    group.add_argument('--split-sentences', action='store_true',
                       help='Split documents into sentences.')
    group.add_argument('--keep-newlines', action='store_true',
                       help='Keep newlines between sentences when splitting.')

    group = parser.add_argument_group(title='tokenizer')
    group.add_argument('--tokenizer-type', type=str, required=True,
                       choices=['BertWordPieceLowerCase',
                                'GPT2BPETokenizer'],
                       help='What type of tokenizer to use.')
    group.add_argument('--vocab-file', type=str, default=None,
                       help='Path to the vocab file')
    group.add_argument('--merge-file', type=str, default=None,
                       help='Path to the BPE merge file (if necessary).')
    group.add_argument('--append-eod', action='store_true',
                       help='Append an <eod> token to the end of a document.')

    group = parser.add_argument_group(title='output data')
    group.add_argument('--output-prefix', type=str, required=True,
                       help='Path to binary output file without suffix')
    group.add_argument('--dataset-impl', type=str, default='mmap',
                       choices=['lazy', 'cached', 'mmap'])

    group = parser.add_argument_group(title='runtime')
    group.add_argument('--workers', type=int, default=1,
                       help='Number of worker processes to launch')
    group.add_argument('--log-interval', type=int, default=100,
                       help='Interval between progress updates')
    args = parser.parse_args()
    args.keep_empty = False

    if args.tokenizer_type.lower().startswith('bert'):
        if not args.split_sentences:
            print("Bert tokenizer detected, are you sure you don't want to split sentences?")

    # some default/dummy values for the tokenizer
    args.rank = 0
    args.make_vocab_size_divisible_by = 128
    args.model_parallel_size = 1

    return args


def main():
    args = get_args()
    startup_start = time.time()

    print("Opening", args.input)
    fin = open(args.input, 'r', encoding='utf-8')

    if nltk_available and args.split_sentences:
        nltk.download("punkt", quiet=True)

    encoder = Encoder(args)
    tokenizer = build_tokenizer(args)
    pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
    encoded_docs = pool.imap(encoder.encode, fin, 25)
    #encoded_docs = map(encoder.encode, fin)

    level = "document"
    if args.split_sentences:
        level = "sentence"

    print(f"Vocab size: {tokenizer.vocab_size}")
    print(f"Output prefix: {args.output_prefix}")
    output_bin_files = {}
    output_idx_files = {}
    builders = {}
    for key in args.json_keys:
        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
                                                      key, level)
        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
                                                      key, level)
        builders[key] = indexed_dataset.make_builder(output_bin_files[key],
                                                     impl=args.dataset_impl,
                                                     vocab_size=tokenizer.vocab_size)

    startup_end = time.time()
    proc_start = time.time()
    total_bytes_processed = 0
    print("Time to startup:", startup_end - startup_start)

    for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
        total_bytes_processed += bytes_processed
        for key, sentences in doc.items():
            for sentence in sentences:
                builders[key].add_item(torch.IntTensor(sentence))
            builders[key].end_document()
        if i % args.log_interval == 0:
            current = time.time()
            elapsed = current - proc_start
            mbs = total_bytes_processed/elapsed/1024/1024
            print(f"Processed {i} documents",
                  f"({i/elapsed} docs/s, {mbs} MB/s).",
                  file=sys.stderr)

    for key in args.json_keys:
        builders[key].finalize(output_idx_files[key])


if __name__ == '__main__':
    main()
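The keep-newlines path in Encoder.initializer is worth a closer look: rather than retraining punkt, it reuses the stock English model's parameters and swaps in CustomLanguageVars, whose modified _period_context_fmt keeps the whitespace (including newlines) after each sentence attached to that sentence instead of being discarded. A small standalone sketch of that behaviour, assuming nltk and its punkt model are installed, and reusing the CustomLanguageVars class defined in the file above:

import nltk
import nltk.tokenize.punkt

nltk.download("punkt", quiet=True)

stock = nltk.load("tokenizers/punkt/english.pickle")
keep_newlines = nltk.tokenize.punkt.PunktSentenceTokenizer(
    train_text=stock._params,          # reuse the trained parameters, no retraining
    lang_vars=CustomLanguageVars())    # class from tools/preprocess_data.py above

text = "First sentence.\nSecond sentence.  Third."
print(stock.tokenize(text))          # default punkt drops the whitespace between sentences
print(keep_newlines.tokenize(text))  # trailing whitespace/newlines stay attached to each sentence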