OpenDAS / Megatron-LM · Commit 1a1cfcff

Commit 1a1cfcff, authored Nov 18, 2019 by Jared Casper
Parent: 0058b1a5

Plumbing for new albert dataset, including mods to arguments for data loaders.

Showing 16 changed files with 521 additions and 101 deletions.
arguments.py                              +43   -33
configure_data.py                          +2    -2
gpt2_data_loader.py                        +3    -3
megatron/data/__init__.py                  +1    -1
megatron/data/albert_dataset.py           +12    -2
megatron/data/dataset_utils.py             +2    -2
megatron/data/helpers.cpp                  +7    -6
megatron/data/indexed_dataset.py          +22   -36
megatron/data/split_dataset.py           +112    -0
megatron/data_utils/__init__.py           +26    -2
megatron/training.py                       +0    -1
pretrain_albert.py                       +200    -0
pretrain_bert.py                          +17   -11
pretrain_gpt2.py                           +2    -2
scripts/pretrain_albert.sh                +32    -0
scripts/pretrain_albert_distributed.sh    +40    -0
arguments.py

@@ -267,23 +267,52 @@ def add_data_args(parser):
     group.add_argument('--shuffle', action='store_true',
                        help='Shuffle data. Shuffling is deterministic '
                        'based on seed and current epoch.')
+    group.add_argument('--data-loader', type=str, default=None,
+                       choices=['raw', 'lazy', 'tfrecords', 'numpy', 'binary'],
+                       help='Which data loader to use. Default varies by model.')
     group.add_argument('--train-data', nargs='+', default=None,
-                       help='Whitespace separated filenames or corpora names '
+                       help='Whitespace separated paths or corpora names '
                        'for training.')
+    group.add_argument('--valid-data', nargs='*', default=None,
+                       help='path(s) to the validation data.')
+    group.add_argument('--test-data', nargs='*', default=None,
+                       help='path(s) to the testing data.')
+    group.add_argument('--data-path', type=str, default=None,
+                       help='path to combined dataset to split')
+    group.add_argument('--split', default='1000,1,1',
+                       help='comma-separated list of proportions for training,'
+                       ' validation, and test split')
-    group.add_argument('--use-npy-data-loader', action='store_true',
-                       help='Use the numpy data loader. If set, then'
-                       'train-data-path, val-data-path, and test-data-path'
-                       'should also be provided.')
-    group.add_argument('--train-data-path', type=str, default='',
-                       help='path to the training data')
-    group.add_argument('--val-data-path', type=str, default='',
-                       help='path to the validation data')
-    group.add_argument('--test-data-path', type=str, default='',
-                       help='path to the test data')
+    group.add_argument('--seq-length', type=int, default=512,
+                       help="Maximum sequence length to process")
+    group.add_argument('--max-preds-per-seq', type=int, default=None,
+                       help='Maximum number of predictions to use per sequence.'
+                       'Defaults to math.ceil(`--seq-length`*.15/10)*10.'
+                       'MUST BE SPECIFIED IF `--data-loader tfrecords`.')
+    # arguments for binary data loader
+    parser.add_argument('--vocab', type=str, default='vocab.txt',
+                        help='path to vocab file')
+    parser.add_argument('--data-impl', type=str, default='infer',
+                        help='implementation of indexed datasets',
+                        choices=['lazy', 'cached', 'mmap', 'infer'])
+    parser.add_argument('--max-num-samples', type=int, default=None,
+                        help='Maximum number of samples to plan for, defaults to total iters * batch-size.')
+    parser.add_argument('--data-epochs', type=int, default=None,
+                        help='Number of epochs to plan for, defaults to using --max-num-samples')
+    parser.add_argument('--mask-prob', default=0.15, type=float,
+                        help='probability of replacing a token with mask')
+    parser.add_argument('--short-seq-prob', default=0.1, type=float,
+                        help='probability of producing a short sequence')
+    parser.add_argument('--skip-mmap-warmup', action='store_true',
+                        help='skip warming up mmap files')
+    # arguments for numpy data loader
     group.add_argument('--input-data-sizes-file', type=str, default='sizes.txt',
-                       help='the filename containing all the shards sizes')
+                       help='the filename containing all the shards sizes '
+                       'for numpy data loader')
+    # arguments for raw/tfrecords data loader
     group.add_argument('--delim', default=',',
                        help='delimiter used to parse csv data files')
     group.add_argument('--text-key', default='sentence',

@@ -291,16 +320,6 @@ def add_data_args(parser):
     group.add_argument('--eval-text-key', default=None,
                        help='key to use to extract text from '
                        'json/csv evaluation datasets')
-    group.add_argument('--valid-data', nargs='*', default=None,
-                       help="""Filename for validation data.""")
-    group.add_argument('--split', default='1000,1,1',
-                       help='comma-separated list of proportions for training,'
-                       ' validation, and test split')
-    group.add_argument('--test-data', nargs='*', default=None,
-                       help="""Filename for testing""")
-    group.add_argument('--lazy-loader', action='store_true',
-                       help='whether to lazy read the data set')
     group.add_argument('--loose-json', action='store_true',
                        help='Use loose json (one json-formatted string per '
                        'newline), instead of tight json (data file is one '

@@ -308,6 +327,7 @@ def add_data_args(parser):
     group.add_argument('--presplit-sentences', action='store_true',
                        help='Dataset content consists of documents where '
                        'each document consists of newline separated sentences')
     group.add_argument('--num-workers', type=int, default=2,
                        help="""Number of workers to use for dataloading""")
     group.add_argument('--tokenizer-model-type', type=str,

@@ -328,16 +348,6 @@ def add_data_args(parser):
                        help='what type of tokenizer to use')
     group.add_argument("--cache-dir", default=None, type=str,
                        help="Where to store pre-trained BERT downloads")
-    group.add_argument('--use-tfrecords', action='store_true',
-                       help='load `--train-data`, `--valid-data`, '
-                       '`--test-data` from BERT tf records instead of '
-                       'normal data pipeline')
-    group.add_argument('--seq-length', type=int, default=512,
-                       help="Maximum sequence length to process")
-    group.add_argument('--max-preds-per-seq', type=int, default=None,
-                       help='Maximum number of predictions to use per sequence.'
-                       'Defaults to math.ceil(`--seq-length`*.15/10)*10.'
-                       'MUST BE SPECIFIED IF `--use-tfrecords` is True.')

     return parser

@@ -355,7 +365,7 @@ def get_args():
     args = parser.parse_args()

-    if not args.train_data and not args.train_data_path:
+    if not args.train_data and not args.data_path:
         print('WARNING: No training data specified')

     args.cuda = torch.cuda.is_available()
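Not part of the commit: a minimal, standalone argparse sketch of the reworked data arguments above, showing how the single --data-loader choice plus --data-path/--split replace the old per-loader boolean flags. Only flags visible in this diff are used; the parsed values in the final comment are illustrative.

import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group('data')
group.add_argument('--data-loader', type=str, default=None,
                   choices=['raw', 'lazy', 'tfrecords', 'numpy', 'binary'])
group.add_argument('--data-path', type=str, default=None)
group.add_argument('--split', default='1000,1,1')
group.add_argument('--shuffle', action='store_true')
parser.add_argument('--vocab', type=str, default='vocab.txt')
parser.add_argument('--data-impl', type=str, default='infer',
                    choices=['lazy', 'cached', 'mmap', 'infer'])
parser.add_argument('--skip-mmap-warmup', action='store_true')

# Roughly the invocation used by scripts/pretrain_albert.sh; pretrain_albert.py
# later defaults a missing --data-loader to 'binary'.
args = parser.parse_args(['--data-path', 'data/megatron/bc_rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_mmap',
                          '--vocab', 'data/megatron/vocab.txt',
                          '--split', '949,50,1',
                          '--skip-mmap-warmup'])
print(args.data_loader, args.data_impl, args.split)  # None infer 949,50,1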
configure_data.py

@@ -116,7 +116,7 @@ def make_tfrecord_loaders(args):
 def make_loaders(args):
     """makes training/val/test"""

-    if args.use_tfrecords:
+    if args.data_loader == 'tfrecords':
         return make_tfrecord_loaders(args)
     world_size = torch.distributed.get_world_size(
         group=mpu.get_data_parallel_group())

@@ -134,7 +134,7 @@ def make_loaders(args):
     data_set_args = {
         'path': args.train_data,
         'seq_length': seq_length,
-        'lazy': args.lazy_loader,
+        'lazy': args.data_loader == 'lazy',
         'delim': args.delim,
         'text_key': args.text_key,
         'label_key': 'label',
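An illustration (not repo code) of the consolidation the two changes above rely on: the old per-loader booleans (use_tfrecords, lazy_loader, use_npy_data_loader) become comparisons against the single args.data_loader string. The helper below is hypothetical; the real code simply inlines the comparisons.

def loader_flags(data_loader):
    # data_loader is one of 'raw', 'lazy', 'tfrecords', 'numpy', 'binary', or None.
    return {
        'tfrecords': data_loader == 'tfrecords',   # was args.use_tfrecords
        'lazy': data_loader == 'lazy',             # was args.lazy_loader
        'numpy': data_loader == 'numpy',           # was args.use_npy_data_loader
    }

print(loader_flags('lazy'))  # {'tfrecords': False, 'lazy': True, 'numpy': False}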
gpt2_data_loader.py

@@ -56,9 +56,9 @@ def make_gpt2_dataloaders(args):
                                            num_workers=num_workers,
                                            pin_memory=True)

-    train = make_data_loader_(args.train_data_path)
-    valid = make_data_loader_(args.val_data_path)
-    test = make_data_loader_(args.test_data_path)
+    train = make_data_loader_(args.train_data)
+    valid = make_data_loader_(args.val_data)
+    test = make_data_loader_(args.test_data)

     args.do_train = False
     args.do_valid = False
megatron/data/__init__.py

 from . import indexed_dataset
 from .bert_tokenization import FullTokenizer as FullBertTokenizer
-from .dataset import AlbertDataset
+from .albert_dataset import AlbertDataset
megatron/data/dataset.py → megatron/data/albert_dataset.py (renamed)

@@ -29,8 +29,13 @@ class AlbertDataset(Dataset):
         self.indexed_dataset = indexed_dataset

         # Build the samples mapping.
+        if not num_epochs:
+            if not max_num_samples:
+                raise ValueError("Need to specify either max_num_samples "
+                                 "or num_epochs")
+            num_epochs = int(max_num_samples / len(indexed_dataset)) + 1
+        if not max_num_samples:
+            max_num_samples = len(indexed_dataset) * num_epochs
+        print(f"Building the sample map for {num_epochs} epochs or "
+              f"{max_num_samples} samples.")
         self.samples_mapping = helpers.build_mapping(
             indexed_dataset.doc_idx,
             indexed_dataset.sizes,

@@ -52,12 +57,17 @@ class AlbertDataset(Dataset):
     @classmethod
     def from_paths(cls, vocab, data_prefix, data_impl, num_epochs,
                    max_num_samples, masked_lm_prob,
-                   max_seq_length, short_seq_prob, seed):
+                   max_seq_length, short_seq_prob, seed, skip_warmup=False):
         tokenizer = FullBertTokenizer(vocab, do_lower_case=True)
-        idx_ds = indexed_dataset.make_dataset(data_prefix, data_impl)
+        print("> Reading dataset index")
+        idx_ds = indexed_dataset.make_dataset(data_prefix, data_impl,
+                                              skip_warmup)
+        print("> Finished creating indexed dataset")
         return cls(idx_ds, tokenizer, num_epochs, max_num_samples,
                    masked_lm_prob, max_seq_length, short_seq_prob, seed)

     def num_tokens(self):
         return self.tokenizer.vocab_size()

     def __len__(self):
         return self.samples_mapping.shape[0]
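A worked example (not repo code) of the planning logic the renamed AlbertDataset gains above: the caller supplies either num_epochs or max_num_samples, and the other is derived from the length of the indexed dataset.

def plan_epochs_and_samples(dataset_len, num_epochs=None, max_num_samples=None):
    # Mirrors the branch added to AlbertDataset.__init__.
    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples or num_epochs")
        num_epochs = int(max_num_samples / dataset_len) + 1
    if not max_num_samples:
        max_num_samples = dataset_len * num_epochs
    return num_epochs, max_num_samples

# pretrain_albert.py plans max_num_samples = (train_iters + 2 * eval_iters) * batch_size;
# with 10000 train iters, an assumed 100 eval iters, batch size 4 and a 300000-sentence index:
print(plan_epochs_and_samples(300_000, max_num_samples=(10_000 + 2 * 100) * 4))  # (1, 40800)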
megatron/data/dataset_utils.py

@@ -357,7 +357,7 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
     tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)

     # Padding mask.
-    padding_mask = np.array([1] * num_tokens + [0] * padding_length, dtype=np.int64)
+    padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, dtype=np.int64)

     # Lables and loss mask.
     labels = [-1] * max_seq_length

@@ -369,7 +369,7 @@ def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
     labels_np = np.array(labels, dtype=np.int64)
     loss_mask_np = np.array(loss_mask, dtype=np.int64)

-    return tokens_np, tokentypes_np, labels, padding_mask, loss_mask
+    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np
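The fix above makes pad_and_convert_to_numpy return the numpy arrays it builds rather than the raw Python lists. A self-contained sketch of that padding pattern (hypothetical helper, not the repo function):

import numpy as np

def pad_example(tokens, max_seq_length, pad_id=0):
    num_tokens = len(tokens)
    padding_length = max_seq_length - num_tokens
    assert padding_length >= 0
    tokens_np = np.array(tokens + [pad_id] * padding_length, dtype=np.int64)
    padding_mask_np = np.array([1] * num_tokens + [0] * padding_length, dtype=np.int64)
    return tokens_np, padding_mask_np  # return the arrays, not the lists

toks, mask = pad_example([101, 7592, 102], max_seq_length=6)
print(toks.dtype, mask.tolist())  # int64 [1, 1, 1, 0, 0, 0]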
megatron/data/helpers.cpp

@@ -30,8 +30,9 @@ py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
                              const double short_seq_prob,
                              const int seed) {

-    cout << "> building dataset mapping for " << docs_.shape(0) - 1
-         << " documents with " << sizes_.shape(0) << " sentences ..." << endl;
+    cout << "> building dataset mapping for " << docs_.shape(0) - 1 \
+         << " documents with " << sizes_.shape(0) << " sentences ..."
+         << std::flush << endl;

     // For efficiency, convert probability to ratio.
     const auto short_seq_ratio = static_cast<int>(round(1.0 / short_seq_prob));

@@ -72,8 +73,8 @@ py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
     // For each epoch:
     for (int epoch = 0; epoch < num_epochs; ++epoch) {
         if (map_index >= max_num_samples && !second) {
-            cout << " > reached " << max_num_samples << " samples after "
-                 << epoch << " epochs ..." << endl;
+            cout << " > reached " << max_num_samples << " samples after "
+                 << epoch << " epochs ..." << std::flush << endl;
             break;
         }
         // For each document:

@@ -96,8 +97,8 @@ py::array build_mapping_impl(const py::array_t<uint32_t>& docs_,
                 empty_docs += 1;
             }
             if (num_remain_sent == 1) {
-                cout << "***WARNING*** document " << doc <<
-                     " has one sentence" << endl;
+                // cout << "***WARNING*** document " << doc <<
+                //      " has one sentence" << endl;
                 one_sent_docs += 1;
             }
         }
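For reference, a Python rendering (not repo code) of the probability-to-ratio conversion that build_mapping_impl performs; roughly one in every short_seq_ratio sampled sequences is truncated to a short length.

def short_seq_ratio(short_seq_prob):
    # C++: static_cast<int>(round(1.0 / short_seq_prob))
    return int(round(1.0 / short_seq_prob))

print(short_seq_ratio(0.1))   # 10: about one in ten sequences is shortened
print(short_seq_ratio(0.15))  # 7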
megatron/data/indexed_dataset.py

@@ -51,13 +51,15 @@ def make_builder(out_file, impl, vocab_size=None):
         return IndexedDatasetBuilder(out_file)


-def make_dataset(path, impl):
+def make_dataset(path, impl, skip_warmup=False):
     if impl == 'infer':
         impl = infer_dataset_impl(path)
     if impl == 'lazy' and IndexedDataset.exists(path):
         return IndexedDataset(path)
     elif impl == 'cached' and IndexedDataset.exists(path):
         return IndexedCachedDataset(path)
     elif impl == 'mmap' and MMapIndexedDataset.exists(path):
-        return MMapIndexedDataset(path)
+        return MMapIndexedDataset(path, skip_warmup)
     return None

@@ -315,7 +317,7 @@ class IndexedDatasetBuilder(object):

 def _warmup_mmap_file(path):
     with open(path, 'rb') as stream:
-        while stream.read(100 * 1024 * 1024):
+        while stream.read(1 * 1024 * 1024):
             pass

@@ -369,7 +371,7 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
                 return _Writer()

-        def __init__(self, path):
+        def __init__(self, path, skip_warmup=False):
             with open(path, 'rb') as stream:
                 magic_test = stream.read(9)
                 assert self._HDR_MAGIC == magic_test, (

@@ -387,13 +389,18 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
                 self._doc_count = struct.unpack('<Q', stream.read(8))[0]
                 offset = stream.tell()

-            _warmup_mmap_file(path)
+            if not skip_warmup:
+                print("> Warming up index mmap file...")
+                _warmup_mmap_file(path)

             self._bin_buffer_mmap = np.memmap(path, mode='r', order='C')
             self._bin_buffer = memoryview(self._bin_buffer_mmap)
+            print("> Reading sizes...")
             self._sizes = np.frombuffer(self._bin_buffer, dtype=np.int32,
                                         count=self._len, offset=offset)
+            print("> Reading pointers...")
             self._pointers = np.frombuffer(self._bin_buffer, dtype=np.int64,
                                            count=self._len,
                                            offset=offset + self._sizes.nbytes)
+            print("> Reading document index...")
             self._doc_idx = np.frombuffer(self._bin_buffer, dtype=np.int64,
                                           count=self._doc_count,
                                           offset=offset + self._sizes.nbytes + self._pointers.nbytes)

         def __del__(self):

@@ -419,14 +426,14 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
         def __len__(self):
             return self._len

-    def __init__(self, path):
+    def __init__(self, path, skip_warmup=False):
         super().__init__()

         self._path = None
         self._index = None
         self._bin_buffer = None

-        self._do_init(path)
+        self._do_init(path, skip_warmup)

     def __getstate__(self):
         return self._path

@@ -434,13 +441,18 @@ class MMapIndexedDataset(torch.utils.data.Dataset):
     def __setstate__(self, state):
         self._do_init(state)

-    def _do_init(self, path):
+    def _do_init(self, path, skip_warmup):
         self._path = path
-        self._index = self.Index(index_file_path(self._path))
+        self._index = self.Index(index_file_path(self._path), skip_warmup)

-        _warmup_mmap_file(data_file_path(self._path))
+        if not skip_warmup:
+            print("> Warming up data mmap file...")
+            _warmup_mmap_file(data_file_path(self._path))
+        print("> Creating numpy buffer of mmap...")
         self._bin_buffer_mmap = np.memmap(data_file_path(self._path), mode='r', order='C')
+        print("> Creating memory view of numpy buffer...")
         self._bin_buffer = memoryview(self._bin_buffer_mmap)
+        print("> Done")

     def __del__(self):
         self._bin_buffer_mmap._mmap.close()

@@ -522,29 +534,3 @@ class MMapIndexedDatasetBuilder(object):
         with MMapIndexedDataset.Index.writer(index_file, self._dtype) as index:
             index.write(self._sizes, self._doc_idx)
-
-
-class indexed_doc_dataset(torch.utils.data.Dataset):
-    def __init__(self, path):
-        impl = infer_dataset_impl(path)
-        self.ds = make_dataset(path, impl)
-        self._docs = []
-        doc_idxs = []
-        for i, s in enumerate(self._sizes):
-            if s > 0:
-                doc_idxs.append(i)
-            else:
-                self._docs.append(doc_idxs)
-                doc_idxs = []
-
-    def __getitem__(self, i):
-        if not isinstance(i, tuple):
-            raise ValueError("Index into indexed_doc_dataset must be a tuple")
-        idx = self._docs[i[0]][i[1]]
-        return self.ds[idx]
-
-    def __len__(self):
-        """Returns number of documents, not number of sentences"""
-        return len(self._docs)
-
-    def doc_len(self, d):
-        return len(self._docs[d])
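A standalone sketch of the warmup idea the changes above make optional and cheaper (1 MB reads instead of 100 MB): a sequential pass pulls the file into the OS page cache so later random reads through np.memmap mostly hit memory rather than disk. This mirrors _warmup_mmap_file but is not the repo code; passing --skip-mmap-warmup bypasses the pass entirely.

import numpy as np

def warmup_file(path, chunk_bytes=1024 * 1024):
    # Read the whole file sequentially in 1 MB chunks, discarding the data.
    with open(path, 'rb') as stream:
        while stream.read(chunk_bytes):
            pass

def open_binary(path, skip_warmup=False):
    if not skip_warmup:
        warmup_file(path)
    return np.memmap(path, mode='r', order='C')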
megatron/data/split_dataset.py (new file, mode 100644)

# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""dataset to split one large one into multiple smaller datasets"""

import torch
import numpy as np


def should_split(split):
    """
    given split proportions checks if should split
    Examples:
    >>> should_split([10,0,0])
    False
    >>> should_split([1,.1,.2])
    True
    """
    return max(split) / sum(split) != 1.


def get_split(args):
    """
    Get dataset splits from comma separated string list
    """
    splits = []
    if args.split.find(',') != -1:
        splits = [float(s) for s in args.split.split(',')]
    elif args.split.find('/') != -1:
        splits = [float(s) for s in args.split.split('/')]
    else:
        splits = [float(args.split)]
    split_total = sum(splits)
    if split_total < 1.:
        splits.append(1 - split_total)
    while len(splits) < 3:
        splits.append(0.)
    splits = splits[:3]
    if args.valid_data is not None:
        splits[1] = 0.
    if args.test_data is not None:
        splits[2] = 0.
    final_sum = sum(splits)
    return [s / final_sum for s in splits]


class SplitDataset(torch.utils.data.Dataset):
    """
    Dataset wrapper to access a subset of another dataset.
    Purpose: useful to index into existing datasets, possibly
    large-scale datasets as the subindexing operation is done in an
    on-the-fly manner.
    Arguments:
        ds (Dataset or array-like): List of datasets to be subindexed
        split_inds (1D array-like): List of indices part of subset
    """
    def __init__(self, ds, split_inds, **kwargs):
        self.split_inds = list(split_inds)
        self.wrapped_data = ds

    def __len__(self):
        return len(self.split_inds)

    def __getitem__(self, index):
        return self.wrapped_data[self.split_inds[index]]

    def num_tokens(self):
        return self.wrapped_data.num_tokens()

    def __iter__(self):
        for idx in self.split_inds:
            yield self.wrapped_data[idx]


def split_ds(ds, split=[.8, .2, .0], shuffle=True):
    """
    Split a dataset into subsets given proportions of how
    much to allocate per split. If a split is 0% returns None for that split.
    Purpose: Useful for creating train/val/test splits
    Arguments:
        ds (Dataset or array-like): Data to be split.
        split (1D array-like): proportions to split `ds`. `sum(splits) != 0`
        shuffle (boolean): Randomly split dataset. Default: True
    """
    split_sum = sum(split)
    if split_sum == 0:
        raise Exception('Split cannot sum to 0.')
    split = np.array(split)
    split /= split_sum
    ds_len = len(ds)
    inds = np.arange(ds_len)
    if shuffle:
        np.random.shuffle(inds)
    start_idx = 0
    residual_idx = 0
    rtn_ds = [None] * len(split)
    for i, f in enumerate(split):
        if f != 0:
            proportion = ds_len * split[i]
            residual_idx += proportion % 1
            split_ = int(int(proportion) + residual_idx)
            split_inds = inds[start_idx:start_idx + max(split_, 1)]
            rtn_ds[i] = SplitDataset(ds, split_inds)
            start_idx += split_
            residual_idx %= 1
    return rtn_ds
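A usage sketch (not part of the commit) for the new helpers, using the --split 949,50,1 value from the scripts below and a plain Python list standing in for the indexed dataset; subset sizes are approximate because split_ds rounds.

from types import SimpleNamespace
from megatron.data import split_dataset

args = SimpleNamespace(split='949,50,1', valid_data=None, test_data=None)
split = split_dataset.get_split(args)        # ~[0.949, 0.05, 0.001]
print(split_dataset.should_split(split))     # True

full_data = list(range(1000))                # any indexable dataset works
train, val, test = split_dataset.split_ds(full_data, split, shuffle=True)
print(len(train), len(val), len(test))       # roughly 949 50 1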
megatron/data_utils/__init__.py

@@ -32,13 +32,37 @@ def should_split(split):
     """
     given split proportions checks if should split
     Examples:
     >>> should_split([10,0,0])
     False
     >>> should_split([1,.1,.2])
     True
     """
     return max(split) / sum(split) != 1.


+def get_split(args):
+    """
+    Get dataset splits from comma separated string list
+    """
+    splits = []
+    if args.split.find(',') != -1:
+        splits = [float(s) for s in args.split.split(',')]
+    elif args.split.find('/') != -1:
+        splits = [float(s) for s in args.split.split('/')]
+    else:
+        splits = [float(args.split)]
+    split_total = sum(splits)
+    if split_total < 1.:
+        splits.append(1 - split_total)
+    while len(splits) < 3:
+        splits.append(0.)
+    splits = splits[:3]
+    if args.valid_data is not None:
+        splits[1] = 0.
+    if args.test_data is not None:
+        splits[2] = 0.
+    final_sum = sum(splits)
+    return [s / final_sum for s in splits]
+
+
 def get_ext(path):
     """gets path extension"""
     return os.path.splitext(path)[1]

@@ -108,7 +132,7 @@ def make_dataset(path, seq_length, text_key, label_key, lazy=False, process_fn=None):
         ds = ConcatDataset(datasets)

     # make tokenizer for dataset
     if tokenizer is None:
-        tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type,
+        tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path, vocab_size, model_type,
                                    pad_token, character_converage, **kwargs)

     ds_type = ''
megatron/training.py

@@ -381,7 +381,6 @@ def train(forward_step_func, model, optimizer, lr_scheduler,
     timers('interval time').start()
     report_memory_flag = True
-
     while iteration < args.train_iters:

         loss_dict, skipped_iter = train_step(forward_step_func,
                                              train_data_iterator,
                                              model,
pretrain_albert.py (new file, mode 100644)

# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Pretrain BERT"""

import torch
import torch.nn.functional as F

from configure_data import configure_data
from megatron import mpu
from megatron.model import BertModel
from megatron.utils import print_rank_0
from megatron.utils import reduce_losses
from megatron.utils import vocab_size_with_padding
from megatron.training import run
from megatron.data import AlbertDataset, split_dataset
from megatron.data_utils.samplers import DistributedBatchSampler


def model_provider(args):
    """Build the model."""

    print_rank_0('building BERT model ...')

    model = BertModel(
        num_layers=args.num_layers,
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        embedding_dropout_prob=args.hidden_dropout,
        attention_dropout_prob=args.attention_dropout,
        output_dropout_prob=args.hidden_dropout,
        max_sequence_length=args.max_position_embeddings,
        checkpoint_activations=args.checkpoint_activations,
        checkpoint_num_layers=args.checkpoint_num_layers,
        add_binary_head=True,
        layernorm_epsilon=args.layernorm_epsilon,
        num_tokentypes=args.tokentype_size,
        parallel_output=True)

    return model


def get_batch(data_iterator, timers):

    # Items and their type.
    keys = ['text', 'types', 'labels', 'is_random', 'loss_mask', 'padding_mask']
    datatype = torch.int64

    # Broadcast data.
    timers('data loader').start()
    if data_iterator is not None:
        data = next(data_iterator)
    else:
        data = None
    timers('data loader').stop()
    data_b = mpu.broadcast_data(keys, data, datatype)

    # Unpack.
    tokens = data_b['text'].long()
    types = data_b['types'].long()
    sentence_order = data_b['is_random'].long()
    loss_mask = data_b['loss_mask'].float()
    lm_labels = data_b['labels'].long()
    padding_mask = data_b['padding_mask'].byte()

    return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask


def forward_step(data_iterator, model, args, timers):
    """Forward step."""

    # Get the batch.
    timers('batch generator').start()
    tokens, types, sentence_order, loss_mask, lm_labels, padding_mask \
        = get_batch(data_iterator, timers)
    timers('batch generator').stop()

    # Forward model.
    lm_logits, sop_logits = model(tokens, 1 - padding_mask, tokentype_ids=types)

    sop_loss = F.cross_entropy(sop_logits.view(-1, 2).contiguous().float(),
                               sentence_order.view(-1).contiguous(),
                               ignore_index=-1)

    lm_loss_ = mpu.vocab_parallel_cross_entropy(lm_logits.contiguous().float(),
                                                lm_labels.contiguous())
    lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()

    loss = lm_loss + sop_loss

    reduced_losses = reduce_losses([lm_loss, sop_loss])

    return loss, {'lm loss': reduced_losses[0], 'sop loss': reduced_losses[1]}


def get_train_val_test_data(args):
    """Load the data on rank zero and boradcast number of tokens to all GPUS."""

    (train_data, val_data, test_data) = (None, None, None)

    # Data loader only on rank 0 of each model parallel group.
    if mpu.get_model_parallel_rank() == 0:
        if args.data_loader == None:
            args.data_loader = 'binary'
        if args.data_loader == 'binary':
            if not args.max_num_samples:
                args.max_num_samples = (args.train_iters + 2 * args.eval_iters) * args.batch_size
            if not args.data_path:
                print("Albert currently only supports a unified dataset specified with --data-path")
                exit(1)
            print("Creating AlbertDataset...")
            full_data = AlbertDataset.from_paths(args.vocab, args.data_path,
                                                 args.data_impl,
                                                 args.data_epochs,
                                                 args.max_num_samples,
                                                 args.mask_prob, args.seq_length,
                                                 args.short_seq_prob, args.seed,
                                                 args.skip_mmap_warmup)
            print("Finished creating AlbertDataset...")
            split = split_dataset.get_split(args)
            if split_dataset.should_split(split):
                train_ds, val_ds, test_ds = split_dataset.split_ds(full_data, split,
                                                                   args.shuffle)
            else:
                train_ds = full_data
            num_tokens = train_ds.num_tokens()

            world_size = mpu.get_data_parallel_world_size()
            rank = mpu.get_data_parallel_rank()
            global_batch_size = args.batch_size * world_size
            num_workers = args.num_workers

            def make_data_loader_(dataset):
                if not dataset:
                    return None
                # Use a simple sampler with distributed batch sampler.
                sampler = torch.utils.data.SequentialSampler(dataset)
                batch_sampler = DistributedBatchSampler(sampler=sampler,
                                                        batch_size=global_batch_size,
                                                        drop_last=True,
                                                        rank=rank,
                                                        world_size=world_size)
                # Torch dataloader.
                return torch.utils.data.DataLoader(dataset,
                                                   batch_sampler=batch_sampler,
                                                   num_workers=num_workers,
                                                   pin_memory=True)

            train_data = make_data_loader_(train_ds)
            valid_data = make_data_loader_(val_ds)
            test_data = make_data_loader_(test_ds)

            do_train = train_data is not None and args.train_iters > 0
            do_valid = valid_data is not None and args.eval_iters > 0
            do_test = test_data is not None and args.eval_iters > 0

            # Need to broadcast num_tokens and num_type_tokens.
            token_counts = torch.cuda.LongTensor([num_tokens,
                                                  2,  # hard coded num_type_tokens for now
                                                  int(do_train),
                                                  int(do_valid),
                                                  int(do_test)])
        else:
            print("Unsupported data loader for BERT.")
            exit(1)
    else:
        token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])

    # Broadcast num tokens.
    torch.distributed.broadcast(token_counts,
                                mpu.get_model_parallel_src_rank(),
                                group=mpu.get_model_parallel_group())
    num_tokens = token_counts[0].item()
    num_type_tokens = token_counts[1].item()
    args.do_train = token_counts[2].item()
    args.do_valid = token_counts[3].item()
    args.do_test = token_counts[4].item()

    args.vocab_size = num_tokens
    args.tokentype_size = num_type_tokens

    return train_data, val_data, test_data


if __name__ == "__main__":

    run('Pretrain BERT model', get_train_val_test_data,
        model_provider, forward_step)
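A shape-level sketch of the loss computed in forward_step above, with plain F.cross_entropy standing in for mpu.vocab_parallel_cross_entropy (which the real script uses so the vocabulary can be split across model-parallel ranks); the tensors here are random stand-ins, not real batches.

import torch
import torch.nn.functional as F

batch, seq, vocab = 2, 8, 32
lm_logits = torch.randn(batch, seq, vocab)
sop_logits = torch.randn(batch, 2)
lm_labels = torch.randint(0, vocab, (batch, seq))
sentence_order = torch.randint(0, 2, (batch,))     # 0 = in order, 1 = swapped
loss_mask = torch.zeros(batch, seq)
loss_mask[:, ::4] = 1.                              # pretend some positions were masked

# Masked-LM loss, averaged only over masked positions.
lm_loss_ = F.cross_entropy(lm_logits.view(-1, vocab), lm_labels.view(-1), reduction='none')
lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()

# Sentence-order prediction loss.
sop_loss = F.cross_entropy(sop_logits.view(-1, 2), sentence_order.view(-1), ignore_index=-1)

loss = lm_loss + sop_loss
print(loss.item())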
pretrain_bert.py

@@ -112,17 +112,23 @@ def get_train_val_test_data(args):
     # Data loader only on rank 0 of each model parallel group.
     if mpu.get_model_parallel_rank() == 0:
-        data_config = configure_data()
-        ds_type = 'BERT'
-        data_config.set_defaults(data_set_type=ds_type, transpose=False)
-        (train_data, val_data, test_data), tokenizer = data_config.apply(args)
-        num_tokens = vocab_size_with_padding(tokenizer.num_tokens, args)
-        # Need to broadcast num_tokens and num_type_tokens.
-        token_counts = torch.cuda.LongTensor([num_tokens,
-                                              tokenizer.num_type_tokens,
-                                              int(args.do_train),
-                                              int(args.do_valid),
-                                              int(args.do_test)])
+        if (args.data_loader == 'raw' or args.data_loader == 'lazy'
+                or args.data_loader == 'tfrecords'):
+            data_config = configure_data()
+            ds_type = 'BERT'
+            data_config.set_defaults(data_set_type=ds_type, transpose=False)
+            (train_data, val_data, test_data), tokenizer = data_config.apply(args)
+            num_tokens = vocab_size_with_padding(tokenizer.num_tokens, args)
+
+            # Need to broadcast num_tokens and num_type_tokens.
+            token_counts = torch.cuda.LongTensor([num_tokens,
+                                                  tokenizer.num_type_tokens,
+                                                  int(args.do_train),
+                                                  int(args.do_valid),
+                                                  int(args.do_test)])
+        else:
+            print("Unsupported data loader for BERT.")
+            exit(1)
     else:
         token_counts = torch.cuda.LongTensor([0, 0, 0, 0, 0])
pretrain_gpt2.py

@@ -168,10 +168,10 @@ def get_train_val_test_data(args):
     # Data loader only on rank 0 of each model parallel group.
     if mpu.get_model_parallel_rank() == 0:
-        if args.use_npy_data_loader:
+        if args.data_loader == 'numpy':
             (train_data, val_data, test_data), num_tokens, \
                 eod_token = make_gpt2_dataloaders(args)
-        else:
+        elif args.data_loader == 'raw' or args.data_loader == 'tfrecords':
             data_config = configure_data()
             data_config.set_defaults(data_set_type='GPT2', transpose=False)
             (train_data, val_data, test_data), tokenizer = data_config.apply(
scripts/pretrain_albert.sh (new file, mode 100755)

#!/bin/bash

RANK=0
WORLD_SIZE=1

python pretrain_albert.py \
       --num-layers 12 \
       --hidden-size 768 \
       --num-attention-heads 12 \
       --batch-size 4 \
       --seq-length 512 \
       --max-preds-per-seq 80 \
       --max-position-embeddings 512 \
       --train-iters 10000 \
       --save checkpoints/albert_117m \
       --load checkpoints/albert_117m \
       --resume-dataloader \
       --data-path data/megatron/bc_rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_mmap \
       --vocab data/megatron/vocab.txt \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
       --lr-decay-style linear \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --fp16 \
       --fp32-layernorm \
       --fp32-embedding \
       --skip-mmap-warmup \
       --num-workers 0
scripts/pretrain_albert_distributed.sh (new file, mode 100755)

#!/bin/bash

GPUS_PER_NODE=2
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NNODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))

DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"

python -m torch.distributed.launch $DISTRIBUTED_ARGS \
       pretrain_albert.py \
       --num-layers 12 \
       --hidden-size 768 \
       --num-attention-heads 12 \
       --batch-size 4 \
       --seq-length 512 \
       --max-preds-per-seq 80 \
       --max-position-embeddings 512 \
       --train-iters 10000 \
       --save checkpoints/albert_117m \
       --load checkpoints/albert_117m \
       --resume-dataloader \
       --data-path data/megatron/bc_rn_owt_sto_wiki_dedup_shuf_cleaned_0.7_mmap \
       --vocab data/megatron/vocab.txt \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.0001 \
       --lr-decay-style linear \
       --lr-decay-iters 990000 \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --fp16 \
       --fp32-layernorm \
       --fp32-embedding \
       --skip-mmap-warmup \
       --num-workers 0