OpenDAS / Megatron-LM

Commit 91d4a605, authored Dec 02, 2020 by Jared Casper

    Merge branch 'main' into pipeline_parallel_main

Parents: 63c340ec, 75bd9b54
Changes: 26
Showing 20 changed files with 370 additions and 2514 deletions (+370, -2514).
megatron/arguments.py                                               +14   -6
megatron/checkpointing.py                                           +4    -1
megatron/data/blendable_dataset.py                                  +75   -0
megatron/data/data_loaders.py                                       +95   -0
megatron/data/dataset_utils.py                                      +74   -0
megatron/data/gpt2_dataset.py                                       +42   -0
megatron/data/helpers.cpp                                           +64   -0
megatron/data/realm_dataset_utils.py                                +2    -1
megatron/data/samplers.py                                           +0    -148
megatron/deprecated_data_utils/__init__.py                          +0    -141
megatron/deprecated_data_utils/configure_data.py                    +0    -252
megatron/deprecated_data_utils/corpora.py                           +0    -61
megatron/deprecated_data_utils/datasets.py                          +0    -883
megatron/deprecated_data_utils/file_utils.py                        +0    -253
megatron/deprecated_data_utils/lazy_loader.py                       +0    -202
megatron/deprecated_data_utils/samplers.py                          +0    -143
megatron/deprecated_data_utils/scripts/presplit_sentences_json.py   +0    -27
megatron/deprecated_data_utils/scripts/split_gpt2_json.py           +0    -141
megatron/deprecated_data_utils/scripts/split_json.py                +0    -126
megatron/deprecated_data_utils/tf_dl.py                             +0    -129
megatron/arguments.py

@@ -82,6 +82,9 @@ def parse_args(extra_args_provider=None, defaults={},
     print('using {} for parameters ...'.format(args.params_dtype),
           flush=True)
+    # Consumed tokens.
+    args.consumed_train_samples = 0
+    args.consumed_valid_samples = 0

     # Set input defaults.
     for key in defaults:
@@ -135,14 +138,16 @@ def parse_args(extra_args_provider=None, defaults={},
 def _print_args(args):
     """Print arguments."""
     if args.rank == 0:
-        print('-------------------- arguments --------------------', flush=True)
+        print('------------------------ arguments ------------------------',
+              flush=True)
         str_list = []
         for arg in vars(args):
-            dots = '.' * (32 - len(arg))
+            dots = '.' * (48 - len(arg))
             str_list.append(' {} {} {}'.format(arg, dots, getattr(args, arg)))
         for arg in sorted(str_list, key=lambda x: x.lower()):
             print(arg, flush=True)
-        print('---------------- end of arguments ----------------', flush=True)
+        print('-------------------- end of arguments ---------------------',
+              flush=True)

 def _check_arg_is_not_none(args, arg):
@@ -269,7 +274,7 @@ def _add_learning_rate_args(parser):
                        'and initial warmup, the learing rate at each '
                        'iteration would be different.')
     group.add_argument('--lr-decay-style', type=str, default='linear',
-                       choices=['constant', 'linear', 'cosine', 'exponential'],
+                       choices=['constant', 'linear', 'cosine'],
                        help='Learning rate decay function.')
     group.add_argument('--lr-decay-iters', type=int, default=None,
                        help='number of iterations to decay learning rate over,'
@@ -393,8 +398,11 @@ def _add_validation_args(parser):
 def _add_data_args(parser):
     group = parser.add_argument_group(title='data and dataloader')

-    group.add_argument('--data-path', type=str, default=None,
-                       help='Path to combined dataset to split.')
+    group.add_argument('--data-path', nargs='*', default=None,
+                       help='Path to the training dataset. Accepted format:'
+                       '1) a single data path, 2) multiple datasets in the'
+                       'form: dataset1-weight dataset1-path dataset2-weight '
+                       'dataset2-path ...')
     group.add_argument('--split', type=str, default='969, 30, 1',
                        help='Comma-separated list of proportions for training,'
                        ' validation, and test split. For example the split '
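The switch of `--data-path` from `type=str` to `nargs='*'` is what makes weighted multi-corpus training expressible on the command line. A minimal standalone sketch of how such a flag parses (a throwaway parser, not Megatron's own `parse_args`; the dataset prefixes are placeholders):

import argparse

# Hypothetical stand-alone parser, only to illustrate nargs='*' behaviour.
parser = argparse.ArgumentParser()
parser.add_argument('--data-path', nargs='*', default=None,
                    help='single path, or alternating weight/path tokens')

# Single dataset: a one-element list.
single = parser.parse_args(['--data-path', 'my-corpus_text_document'])
print(single.data_path)   # ['my-corpus_text_document']

# Blended datasets: alternating weight and prefix tokens.
blended = parser.parse_args(['--data-path', '0.7', 'books_text_document',
                             '0.3', 'wiki_text_document'])
print(blended.data_path)  # ['0.7', 'books_text_document', '0.3', 'wiki_text_document']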
megatron/checkpointing.py

@@ -227,12 +227,15 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load'):
                 'iteration from checkpoint {}, exiting'.format(
                     checkpoint_name))
             sys.exit()

     # Check arguments.
+    assert args.consumed_train_samples == 0
+    assert args.consumed_valid_samples == 0
     if 'args' in state_dict:
         checkpoint_args = state_dict['args']
         check_checkpoint_args(checkpoint_args)
+        args.consumed_train_samples = getattr(args, 'consumed_train_samples', 0)
+        args.consumed_valid_samples = getattr(args, 'consumed_valid_samples', 0)
     else:
         print_rank_0('could not find arguments in the checkpoint ...')
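The `getattr(..., 0)` pattern used above falls back to 0 when the attribute is absent, so older checkpoints without consumed-sample counters still load. A tiny standalone illustration of just that Python pattern (the namespaces are stand-ins, not real Megatron checkpoints):

from types import SimpleNamespace

old_args = SimpleNamespace(hidden_size=1024)                 # no counter present
new_args = SimpleNamespace(consumed_train_samples=4096)      # counter present

print(getattr(old_args, 'consumed_train_samples', 0))  # 0    -> start counting fresh
print(getattr(new_args, 'consumed_train_samples', 0))  # 4096 -> resume from the stored value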
megatron/data/blendable_dataset.py (new file, 0 → 100644)

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Blendable dataset."""

import time

import numpy as np
import torch

from megatron import print_rank_0
from megatron import mpu


class BlendableDataset(torch.utils.data.Dataset):

    def __init__(self, datasets, weights):

        self.datasets = datasets
        num_datasets = len(datasets)
        assert num_datasets == len(weights)

        self.size = 0
        for dataset in self.datasets:
            self.size += len(dataset)

        # Normalize weights.
        weights = np.array(weights, dtype=np.float64)
        sum_weights = np.sum(weights)
        assert sum_weights > 0.0
        weights /= sum_weights

        # Build indices.
        start_time = time.time()
        assert num_datasets < 255
        self.dataset_index = np.zeros(self.size, dtype=np.uint8)
        self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)

        if torch.distributed.get_rank() == 0:
            from megatron.data.dataset_utils import compile_helper
            compile_helper()
        # Simple barrier
        tmp = torch.cuda.LongTensor([1])
        torch.distributed.all_reduce(tmp, group=mpu.get_data_parallel_group())

        from megatron.data import helpers
        helpers.build_blending_indices(self.dataset_index,
                                       self.dataset_sample_index,
                                       weights, num_datasets, self.size,
                                       torch.distributed.get_rank() == 0)
        print_rank_0('> elapsed time for building blendable dataset indices: '
                     '{:.2f} (sec)'.format(time.time() - start_time))

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        dataset_idx = self.dataset_index[idx]
        sample_idx = self.dataset_sample_index[idx]
        return self.datasets[dataset_idx][sample_idx]
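The lookup in `__getitem__` is just two array reads followed by a delegation to the underlying dataset. A toy sketch with hand-built index arrays standing in for what `helpers.build_blending_indices` would normally fill (no distributed setup or compiled helpers; all names and values are illustrative only):

import numpy as np

# Two toy "datasets" and hand-built blending indices.
datasets = [['a0', 'a1', 'a2'], ['b0', 'b1']]
dataset_index = np.array([0, 1, 0, 0, 1], dtype=np.uint8)         # which dataset
dataset_sample_index = np.array([0, 0, 1, 2, 1], dtype=np.int64)  # which sample in it

def blended_getitem(idx):
    # Mirrors BlendableDataset.__getitem__: two lookups, then delegate.
    d = dataset_index[idx]
    s = dataset_sample_index[idx]
    return datasets[d][s]

print([blended_getitem(i) for i in range(5)])  # ['a0', 'b0', 'a1', 'a2', 'b1']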
megatron/data/data_loaders.py (new file, 0 → 100644)

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
# (Apache License, Version 2.0 header, identical to the one in blendable_dataset.py above.)

"""Dataloaders."""

import torch

from megatron import get_args
from megatron import mpu


def build_pretraining_data_loader(dataset, consumed_samples):
    """Build dataloader given an input dataset."""

    if dataset is None:
        return None
    args = get_args()

    world_size = mpu.get_data_parallel_world_size()
    global_batch_size = args.batch_size * world_size

    # Megatron sampler
    batch_sampler = MegatronPretrainingSampler(
        total_samples=len(dataset),
        consumed_samples=consumed_samples,
        global_batch_size=global_batch_size,
        rank=mpu.get_data_parallel_rank(),
        world_size=world_size)

    # Torch dataloader.
    return torch.utils.data.DataLoader(dataset,
                                       batch_sampler=batch_sampler,
                                       num_workers=args.num_workers,
                                       pin_memory=True)


class MegatronPretrainingSampler:

    def __init__(self, total_samples, consumed_samples, global_batch_size,
                 rank, world_size):
        # Keep a copy of input params for later use.
        self.total_samples = total_samples
        self.consumed_samples = consumed_samples
        self.global_batch_size = global_batch_size
        self.rank = rank

        # Sanity checks.
        assert self.total_samples > 0, \
            'no sample to consume: {}'.format(self.total_samples)
        assert self.consumed_samples < self.total_samples, \
            'no samples left to consume: {}, {}'.format(self.consumed_samples,
                                                        self.total_samples)
        assert self.global_batch_size > 0, \
            'Unexpected global batch size: {}'.format(self.global_batch_size)
        assert world_size > 0, \
            'non zero world size is expected: {}'.format(world_size)
        assert self.rank < world_size, \
            'rank should be smaller than world size: {}, {}'.format(
                self.rank, world_size)

        # Batch size per rank.
        assert self.global_batch_size % world_size == 0, \
            'global batch size must be divisible by world size: {}, {}'.format(
                self.global_batch_size, world_size)
        self.batch_size_per_rank = self.global_batch_size // world_size

    def __len__(self):
        return self.total_samples

    def __iter__(self):
        batch = []
        # Last batch if not complete will be dropped.
        for idx in range(self.consumed_samples, self.total_samples):
            batch.append(idx)
            if len(batch) == self.global_batch_size:
                start_idx = self.rank * self.batch_size_per_rank
                end_idx = start_idx + self.batch_size_per_rank
                yield batch[start_idx:end_idx]
                batch = []
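The per-rank slice and the resume-from-`consumed_samples` behaviour are easiest to see with small numbers. A standalone re-trace of the same arithmetic (all values are made up):

total_samples = 12
consumed_samples = 4        # e.g. restored from a checkpoint
global_batch_size = 4
world_size = 2
batch_size_per_rank = global_batch_size // world_size  # 2

for rank in range(world_size):
    batches = []
    batch = []
    # Same loop as MegatronPretrainingSampler.__iter__.
    for idx in range(consumed_samples, total_samples):
        batch.append(idx)
        if len(batch) == global_batch_size:
            start = rank * batch_size_per_rank
            batches.append(batch[start:start + batch_size_per_rank])
            batch = []
    print(rank, batches)
# 0 [[4, 5], [8, 9]]
# 1 [[6, 7], [10, 11]]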
megatron/data/dataset_utils.py

@@ -18,11 +18,13 @@
 # https://github.com/google-research/albert/blob/master/create_pretraining_data.py
 # with some modifications.

+import math
 import time
 import collections

 import numpy as np

 from megatron import get_args, print_rank_0
+from megatron.data.blendable_dataset import BlendableDataset
 from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset

 DSET_TYPE_STD = 'standard_bert'
@@ -31,6 +33,38 @@ DSET_TYPE_ICT = 'ict'
 DSET_TYPES = [DSET_TYPE_ICT, DSET_TYPE_STD]

+
+def get_datasets_weights_and_num_samples(data_prefix,
+                                         train_valid_test_num_samples):
+
+    # The data prefix should be in the format of:
+    #   weight-1, data-prefix-1, weight-2, data-prefix-2, ..
+    assert len(data_prefix) % 2 == 0
+    num_datasets = len(data_prefix) // 2
+    weights = [0] * num_datasets
+    prefixes = [0] * num_datasets
+    for i in range(num_datasets):
+        weights[i] = float(data_prefix[2 * i])
+        prefixes[i] = (data_prefix[2 * i + 1]).strip()
+    # Normalize weights
+    weight_sum = 0.0
+    for weight in weights:
+        weight_sum += weight
+    assert weight_sum > 0.0
+    weights = [weight / weight_sum for weight in weights]
+
+    # Add 0.5% (the 1.005 factor) so in case the blending dataset does
+    # not uniformly distribute the number of samples, we still have
+    # samples left to feed to the network.
+    datasets_train_valid_test_num_samples = []
+    for weight in weights:
+        datasets_train_valid_test_num_samples.append(
+            [int(math.ceil(val * weight * 1.005))
+             for val in train_valid_test_num_samples])
+
+    return prefixes, weights, datasets_train_valid_test_num_samples
+
+
 def compile_helper():
     """Compile helper function at runtime. Make sure this
     is invoked on a single process."""
@@ -360,6 +394,46 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                                     short_seq_prob, seed, skip_warmup,
                                     dataset_type='standard_bert'):

+    if len(data_prefix) == 1:
+        return _build_train_valid_test_datasets(data_prefix[0],
+                                                data_impl, splits_string,
+                                                train_valid_test_num_samples,
+                                                max_seq_length, masked_lm_prob,
+                                                short_seq_prob, seed,
+                                                skip_warmup,
+                                                dataset_type=dataset_type)
+    # Blending dataset.
+    # Parse the values.
+    output = get_datasets_weights_and_num_samples(
+        data_prefix, train_valid_test_num_samples)
+    prefixes, weights, datasets_train_valid_test_num_samples = output
+
+    # Build individual datasets.
+    train_datasets = []
+    valid_datasets = []
+    test_datasets = []
+    for i in range(len(prefixes)):
+        train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
+            prefixes[i], data_impl, splits_string,
+            datasets_train_valid_test_num_samples[i],
+            max_seq_length, masked_lm_prob, short_seq_prob,
+            seed, skip_warmup, dataset_type=dataset_type)
+        train_datasets.append(train_ds)
+        valid_datasets.append(valid_ds)
+        test_datasets.append(test_ds)
+
+    # Blend.
+    blending_train_dataset = BlendableDataset(train_datasets, weights)
+    blending_valid_dataset = BlendableDataset(valid_datasets, weights)
+    blending_test_dataset = BlendableDataset(test_datasets, weights)
+
+    return (blending_train_dataset, blending_valid_dataset,
+            blending_test_dataset)
+
+
+def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
+                                     train_valid_test_num_samples,
+                                     max_seq_length, masked_lm_prob,
+                                     short_seq_prob, seed, skip_warmup,
+                                     dataset_type='standard_bert'):
+
     if dataset_type not in DSET_TYPES:
         raise ValueError("Invalid dataset_type: ", dataset_type)
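A quick worked example of what `get_datasets_weights_and_num_samples` produces for a hypothetical two-corpus `--data-path` (the prefixes and sample counts are placeholders):

import math

data_prefix = ['0.7', 'books_text_document', '0.3', 'wiki_text_document']
train_valid_test_num_samples = [1000, 100, 10]

num_datasets = len(data_prefix) // 2
weights = [float(data_prefix[2 * i]) for i in range(num_datasets)]
prefixes = [data_prefix[2 * i + 1].strip() for i in range(num_datasets)]
weights = [w / sum(weights) for w in weights]  # already normalized here: [0.7, 0.3]

# The 1.005 factor over-requests samples slightly so blending never runs dry.
per_dataset = [[int(math.ceil(n * w * 1.005)) for n in train_valid_test_num_samples]
               for w in weights]
print(prefixes)     # ['books_text_document', 'wiki_text_document']
print(per_dataset)  # [[704, 71, 8], [302, 31, 4]]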
megatron/data/gpt2_dataset.py

@@ -22,6 +22,8 @@ import numpy as np
 import torch

 from megatron import mpu, print_rank_0
+from megatron.data.blendable_dataset import BlendableDataset
+from megatron.data.dataset_utils import get_datasets_weights_and_num_samples
 from megatron.data.dataset_utils import get_train_valid_test_split_
 from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset
@@ -31,6 +33,46 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                                     seq_length, seed, skip_warmup):
     """Build train, valid, and test datasets."""

+    # Single dataset.
+    if len(data_prefix) == 1:
+        return _build_train_valid_test_datasets(data_prefix[0],
+                                                data_impl, splits_string,
+                                                train_valid_test_num_samples,
+                                                seq_length, seed, skip_warmup)
+
+    # Blending dataset.
+    # Parse the values.
+    output = get_datasets_weights_and_num_samples(data_prefix,
+                                                  train_valid_test_num_samples)
+    prefixes, weights, datasets_train_valid_test_num_samples = output
+
+    # Build individual datasets.
+    train_datasets = []
+    valid_datasets = []
+    test_datasets = []
+    for i in range(len(prefixes)):
+        train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
+            prefixes[i], data_impl, splits_string,
+            datasets_train_valid_test_num_samples[i],
+            seq_length, seed, skip_warmup)
+        train_datasets.append(train_ds)
+        valid_datasets.append(valid_ds)
+        test_datasets.append(test_ds)
+
+    # Blend.
+    blending_train_dataset = BlendableDataset(train_datasets, weights)
+    blending_valid_dataset = BlendableDataset(valid_datasets, weights)
+    blending_test_dataset = BlendableDataset(test_datasets, weights)
+
+    return (blending_train_dataset, blending_valid_dataset,
+            blending_test_dataset)
+
+
+def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
+                                     train_valid_test_num_samples,
+                                     seq_length, seed, skip_warmup):
+    """Build train, valid, and test datasets."""
+
     # Indexed dataset.
     indexed_dataset = get_indexed_dataset_(data_prefix,
                                            data_impl,
megatron/data/helpers.cpp

@@ -33,6 +33,69 @@ using namespace std;
 const int32_t LONG_SENTENCE_LEN = 512;

+
+void build_blending_indices(py::array_t<uint8_t>& dataset_index,
+                            py::array_t<int64_t>& dataset_sample_index,
+                            const py::array_t<double>& weights,
+                            const int32_t num_datasets,
+                            const int64_t size, const bool verbose) {
+  /* Given multiple datasets and a weighting array, build samples
+     such that it follows those weights.*/
+
+  if (verbose) {
+    std::cout << "> building indices for blendable datasets ..." << std::endl;
+  }
+
+  // Get the pointer access without the checks.
+  auto dataset_index_ptr = dataset_index.mutable_unchecked<1>();
+  auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>();
+  auto weights_ptr = weights.unchecked<1>();
+
+  // Initialize buffer for number of samples used for each dataset.
+  int64_t current_samples[num_datasets];
+  for (int64_t i = 0; i < num_datasets; ++i) {
+    current_samples[i] = 0;
+  }
+
+  // For each sample:
+  for (int64_t sample_idx = 0; sample_idx < size; ++sample_idx) {
+
+    // Determine where the max error in sampling is happening.
+    auto sample_idx_double = std::max(static_cast<double>(sample_idx), 1.0);
+    int64_t max_error_index = 0;
+    double max_error = weights_ptr[0] * sample_idx_double -
+      static_cast<double>(current_samples[0]);
+    for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) {
+      double error = weights_ptr[dataset_idx] * sample_idx_double -
+        static_cast<double>(current_samples[dataset_idx]);
+      if (error > max_error) {
+        max_error = error;
+        max_error_index = dataset_idx;
+      }
+    }
+
+    // Populate the indices.
+    dataset_index_ptr[sample_idx] = static_cast<uint8_t>(max_error_index);
+    dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index];
+
+    // Update the total samples.
+    current_samples[max_error_index] += 1;
+  }
+
+  // print info
+  if (verbose) {
+    std::cout << " > sample ratios:" << std::endl;
+    for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) {
+      auto ratio = static_cast<double>(current_samples[dataset_idx]) /
+        static_cast<double>(size);
+      std::cout << "   dataset " << dataset_idx << ", input: " <<
+        weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl;
+    }
+  }
+
+}
+
+
 py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
                            const py::array_t<int32_t>& doc_idx_,
                            const int32_t seq_length,
@@ -640,4 +703,5 @@ PYBIND11_MODULE(helpers, m) {
   m.def("build_mapping", &build_mapping);
   m.def("build_blocks_mapping", &build_blocks_mapping);
   m.def("build_sample_idx", &build_sample_idx);
+  m.def("build_blending_indices", &build_blending_indices);
 }
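The C++ routine is a greedy "largest sampling error first" loop. The same logic in plain Python, as a reference sketch (not the code Megatron actually runs; the pybind11 version exists for speed):

import numpy as np

def build_blending_indices_py(weights, size):
    """Pure-Python reference of helpers.build_blending_indices."""
    num_datasets = len(weights)
    dataset_index = np.zeros(size, dtype=np.uint8)
    dataset_sample_index = np.zeros(size, dtype=np.int64)
    current_samples = [0] * num_datasets
    for sample_idx in range(size):
        x = max(float(sample_idx), 1.0)
        # Pick the dataset whose achieved count lags its target weight the most.
        errors = [weights[d] * x - current_samples[d] for d in range(num_datasets)]
        chosen = int(np.argmax(errors))
        dataset_index[sample_idx] = chosen
        dataset_sample_index[sample_idx] = current_samples[chosen]
        current_samples[chosen] += 1
    return dataset_index, dataset_sample_index

idx, sample = build_blending_indices_py([0.7, 0.3], 10)
print(idx.tolist())     # [0, 1, 0, 0, 1, 0, 0, 1, 0, 0] -- a 7:3 mix
print(sample.tolist())  # per-dataset running sample counters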
megatron/data/realm_dataset_utils.py

@@ -6,7 +6,6 @@ import torch
 from megatron import mpu, print_rank_0
 from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy
-from megatron.data.samplers import DistributedBatchSampler
 from megatron import get_args, get_tokenizer, print_rank_0, mpu
@@ -23,6 +22,8 @@ def get_one_epoch_dataloader(dataset, batch_size=None):
     sampler = torch.utils.data.SequentialSampler(dataset)
     # importantly, drop_last must be False to get all the data.
+    assert False, 'DistributedBatchSampler deprecated, change the implementation'
+    from megatron.data.samplers import DistributedBatchSampler
     batch_sampler = DistributedBatchSampler(sampler,
                                             batch_size=global_batch_size,
                                             drop_last=False,
megatron/data/samplers.py (deleted, 100644 → 0)

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
# (Apache License, Version 2.0 header, identical to the one in blendable_dataset.py above.)

"""Batch samplers that work with either random or sequential data samplers."""

import torch
from torch.utils import data


class RandomSampler(data.sampler.Sampler):
    """Based off of pytorch RandomSampler and DistributedSampler. Essentially
    a RandomSampler, but this class lets the user set an epoch like
    DistributedSampler. Samples elements randomly. If without replacement, then
    sample from a shuffled dataset. If with replacement, then user can
    specify ``num_samples`` to draw.
    Arguments:
        data_source (Dataset): dataset to sample from
        num_samples (int): number of samples to draw, default=len(dataset)
        replacement (bool): samples are drawn with replacement if ``True``,
            default=False
    """

    def __init__(self, data_source, replacement=False, num_samples=None):
        self.data_source = data_source
        self.replacement = replacement
        self._num_samples = num_samples
        self.epoch = -1

        if self._num_samples is not None and replacement is False:
            raise ValueError("With replacement=False, num_samples should not "
                             "be specified, since a random permute will be "
                             "performed.")

        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
            raise ValueError("num_samples should be a positive integer "
                             "value, but got num_samples={}".format(
                                 self.num_samples))
        if not isinstance(self.replacement, bool):
            raise ValueError("replacement should be a boolean value, but got "
                             "replacement={}".format(self.replacement))

    @property
    def num_samples(self):
        # dataset size might change at runtime
        if self._num_samples is None:
            return len(self.data_source)
        return self._num_samples

    def __iter__(self):
        n = len(self.data_source)
        g = torch.Generator()
        if self.epoch >= 0:
            g.manual_seed(self.epoch)
        if self.replacement:
            return iter(torch.randint(high=n, size=(self.num_samples,),
                                      dtype=torch.int64, generator=g).tolist())
        return iter(torch.randperm(n, generator=g).tolist())

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        self.epoch = epoch


class DistributedBatchSampler(data.sampler.BatchSampler):
    """Similar to normal implementation of distributed sampler, except
    implementation is at the batch sampler level, instead of just the
    sampler level. This allows wrapping of arbitrary data samplers
    (sequential, random, WeightedRandomSampler, etc.) with this batch
    sampler.

    The `interleave` argument specifies how to distribute a batch. A value
    of True combined with the above random sampler is equivalent to pytorch's
    torch.utils.data.distributed.DistributedSampler.

    For the following batch [0,1,2,3,4,5,6,7] and data parallelism of 2
    specifying True will result in the following samples for each gpu:
        GPU0: [0,2,4,6] GPU1: [1,3,5,7]
    specifying False will result in the following samples:
        GPU0: [0,1,2,3] GPU1: [4,5,6,7]"""

    def __init__(self, sampler, batch_size, drop_last, rank=-1,
                 world_size=2, wrap_last=False, interleave=False):
        super(DistributedBatchSampler, self).__init__(sampler, batch_size,
                                                      drop_last)
        if rank == -1:
            assert False, 'should not be here'
            rank = torch.distributed.get_rank()
        self.rank = rank
        self.world_size = world_size
        self.sampler.wrap_around = 0
        self.wrap_around = 0
        self.wrap_last = wrap_last
        self.start_iter = 0
        self.interleave = interleave

    def __iter__(self):
        batch = []
        i = 0
        for idx in self.data_iterator(self.sampler, wrap_around=False):
            batch.append(idx)
            if len(batch) == self.batch_size:
                tbatch = self._batch(batch)
                if i >= self.start_iter:
                    yield tbatch
                    self.start_iter = 0
                i += 1
                batch = []
        batch_len = len(batch)
        if batch_len > 0 and not self.drop_last:
            if self.wrap_last:
                self.sampler.wrap_around -= (self.batch_size)
                self.wrap_around += (len(batch))
                self.wrap_around %= self.batch_size
            yield self._batch(batch)
        if self.wrap_last:
            self.sampler.wrap_around += self.batch_size

    def data_iterator(self, _iter, wrap_around=False):
        """iterates through data and handles wrap around"""
        for i, idx in enumerate(_iter):
            if i < self.wrap_around % self.batch_size:
                continue
            if wrap_around:
                self.wrap_around += 1
                self.wrap_around %= self.batch_size
            yield idx

    def _batch(self, batch):
        """extracts samples only pertaining to this worker's batch"""
        if self.interleave:
            return batch[self.rank:self.batch_size:self.world_size]
        start = self.rank * self.batch_size // self.world_size
        end = (self.rank + 1) * self.batch_size // self.world_size
        return batch[start:end]
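The docstring's GPU0/GPU1 example can be checked directly against the slicing done by the removed sampler's `_batch`. A standalone sketch of the two modes, using plain lists and no sampler objects:

batch = [0, 1, 2, 3, 4, 5, 6, 7]
batch_size, world_size = 8, 2

def split(batch, rank, interleave):
    # Same slicing as DistributedBatchSampler._batch.
    if interleave:
        return batch[rank:batch_size:world_size]
    start = rank * batch_size // world_size
    end = (rank + 1) * batch_size // world_size
    return batch[start:end]

print(split(batch, 0, True), split(batch, 1, True))    # [0, 2, 4, 6] [1, 3, 5, 7]
print(split(batch, 0, False), split(batch, 1, False))  # [0, 1, 2, 3] [4, 5, 6, 7]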
megatron/deprecated_data_utils/__init__.py (deleted, 100644 → 0)

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
# (Apache License, Version 2.0 header, identical to the one in blendable_dataset.py above.)

"""utils for creating datasets"""
import os
import math

import torch

from .samplers import DistributedBatchSampler
from .datasets import json_dataset, csv_dataset, split_ds, ConcatDataset, SplitDataset, bert_sentencepair_dataset, GPT2Dataset
from .lazy_loader import exists_lazy, make_lazy, lazy_array_loader
from .tokenization import Tokenization, CommandToken, Tokenizer, CharacterLevelTokenizer, BertWordPieceTokenizer, GPT2BPETokenizer, make_tokenizer
from . import corpora

TRAIN_DATA = 0
VAL_DATA = 1
TEST_DATA = 2


def should_split(split):
    """
    given split proportions checks if should split
    Examples:
    >>> should_split([10,0,0])
    False
    >>> should_split([1,.1,.2])
    True
    """
    return max(split) / sum(split) != 1.


def get_ext(path):
    """gets path extension"""
    return os.path.splitext(path)[1]


def get_dataset(path, **kwargs):
    """gets dataset object based on keyword args and file at `path`"""
    if supported_corpus(path):
        return corpora.NAMED_CORPORA[path](**kwargs)
    ext = get_ext(path)
    if '.json' in ext:
        text = json_dataset(path, **kwargs)
    elif ext in ['.csv', '.tsv']:
        text = csv_dataset(path, **kwargs)
    else:
        raise NotImplementedError('data file type %s is not supported' % (ext))
    return text


def supported_corpus(corpus_name):
    """checks if corpus name is defined in `corpora.py`"""
    return corpus_name in corpora.NAMED_CORPORA


def make_dataset(path, seq_length, text_key, label_key, lazy=False,
                 process_fn=None, split=[1.], delim=',', loose=False,
                 binarize_sent=False, drop_unlabeled=False, tokenizer=None,
                 tokenizer_type='CharacterLevelTokenizer',
                 tokenizer_model_path=None, vocab_size=None, model_type='bpe',
                 pad_token=0, character_converage=1.0, non_binary_cols=None,
                 parallel_group=None, **kwargs):
    """function to create datasets+tokenizers for common options"""
    if isinstance(process_fn, str):
        process_fn = eval(process_fn)
    if non_binary_cols is not None:
        # multilabel dataset support (only for csvs)
        label_key = non_binary_cols

    def get_dataset_from_path(path_):
        if lazy:
            # get lazily loaded dataset
            named_corpora = False
            if supported_corpus(path_):
                named_corpora = True
                name = path_
                path_ = corpora.NAMED_CORPORA[path_].PATH
            if torch.distributed.get_rank() == 0 and \
               not exists_lazy(path_, data_type='data'):
                # create cached version of dataset for lazy loading if it doesn't exist
                text = get_dataset(name if named_corpora else path_,
                                   text_key=text_key, label_key=label_key,
                                   binarize_sent=binarize_sent, delim=delim,
                                   drop_unlabeled=drop_unlabeled,
                                   loose_json=loose)
                make_lazy(path_, text.X, data_type='data')
            # This should be a barrier but nccl barrier assumes
            # device_index=rank which is not the case for model
            # parallel case
            counts = torch.cuda.LongTensor([1])
            torch.distributed.all_reduce(counts, group=parallel_group)
            assert counts[0].item() == torch.distributed.get_world_size(
                group=parallel_group)

            text = lazy_array_loader(path_, data_type='data', map_fn=process_fn)
        else:
            # get dataset
            text = get_dataset(path_, text_key=text_key, label_key=label_key,
                               binarize_sent=binarize_sent, delim=delim,
                               drop_unlabeled=drop_unlabeled, loose_json=loose,
                               preprocess_fn=process_fn)
        return text

    # get one or multiple datasets and concatenate
    if isinstance(path, str):
        path = [path]
    datasets = [get_dataset_from_path(p) for p in path]
    if len(datasets) == 1:
        ds = datasets[0]
    else:
        ds = ConcatDataset(datasets)

    # make tokenizer for dataset
    if tokenizer is None:
        tokenizer = make_tokenizer(tokenizer_type, ds, tokenizer_model_path,
                                   vocab_size, model_type, pad_token,
                                   character_converage, **kwargs)

    ds_type = ''
    if 'ds_type' in kwargs:
        ds_type = kwargs['ds_type']
    ds.SetTokenizer(tokenizer)

    # Split dataset into train/val/test (and wrap bert dataset)
    if should_split(split):
        ds = split_ds(ds, split)
        if 'bert' in ds_type.lower():
            presplit_sentences = kwargs['presplit_sentences'] if \
                'presplit_sentences' in kwargs else False
            dstype = bert_sentencepair_dataset
            ds = [dstype(d, max_seq_len=seq_length,
                         presplit_sentences=presplit_sentences)
                  if d is not None else None for d in ds]
        elif ds_type.lower() == 'gpt2':
            ds = [GPT2Dataset(d, max_seq_len=seq_length)
                  if d is not None else None for d in ds]
    else:
        if 'bert' in ds_type.lower():
            presplit_sentences = kwargs['presplit_sentences'] if \
                'presplit_sentences' in kwargs else False
            dstype = bert_sentencepair_dataset
            ds = dstype(ds, max_seq_len=seq_length,
                        presplit_sentences=presplit_sentences)
        elif ds_type.lower() == 'gpt2':
            ds = GPT2Dataset(ds, max_seq_len=seq_length)
    return ds, tokenizer
megatron/deprecated_data_utils/configure_data.py (deleted, 100644 → 0)

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
# (Apache License, Version 2.0 header, identical to the one in blendable_dataset.py above.)

"""parses arguments and preps data loader"""

import copy
import torch

from megatron import data_utils
from megatron import mpu


class DataConfig:

    def __init__(self, defaults={}):
        super(DataConfig, self).__init__()
        self.defaults = defaults

    def apply(self, args):
        if torch.distributed.get_rank() == 0:
            print('configuring data')
        self.apply_defaults(args)
        return make_loaders(args)

    def set_defaults(self, **kwargs):
        for k, v in kwargs.items():
            self.defaults[k] = v

    def apply_defaults(self, args):
        for k, v in self.defaults.items():
            k = k.replace('-', '_')
            if not hasattr(args, k):
                setattr(args, k, v)


def make_data_loader(dataset, batch_size, args):

    shuffle = args.shuffle
    if shuffle:
        sampler = data_utils.samplers.RandomSampler(
            dataset, replacement=True, num_samples=batch_size * args.train_iters)
    else:
        sampler = torch.utils.data.SequentialSampler(dataset)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    rank = torch.distributed.get_rank(group=mpu.get_data_parallel_group())
    distributed = world_size > 1
    drop_last = distributed

    if distributed:
        batch_sampler = data_utils.samplers.DistributedBatchSampler(
            sampler, batch_size, drop_last, rank, world_size)
    else:
        batch_sampler = torch.utils.data.BatchSampler(sampler, batch_size,
                                                      drop_last)

    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_sampler=batch_sampler,
                                              num_workers=args.num_workers,
                                              pin_memory=True)

    return data_loader


def make_tfrecord_loaders(args):
    """Load train/val/test dataset from shuffled TFRecords"""

    import data_utils.tf_dl
    data_set_args = {'batch_size': args.batch_size,
                     'max_seq_len': args.seq_length,
                     'max_preds_per_seq': args.max_preds_per_seq,
                     'train': True,
                     'num_workers': max(args.num_workers, 1),
                     'seed': args.seed + args.rank + 1,
                     'threaded_dl': args.num_workers > 0}
    train = data_utils.tf_dl.TFRecordDataLoader(args.train_data,
                                                **data_set_args)
    data_set_args['train'] = False
    if args.eval_seq_length is not None:
        data_set_args['max_seq_len'] = args.eval_seq_length
    if args.eval_max_preds_per_seq is not None:
        data_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    valid = None
    if args.valid_data is not None:
        valid = data_utils.tf_dl.TFRecordDataLoader(args.valid_data,
                                                    **data_set_args)
    test = None
    if args.test_data is not None:
        test = data_utils.tf_dl.TFRecordDataLoader(args.test_data,
                                                   **data_set_args)
    tokenizer = data_utils.make_tokenizer(args.tokenizer_type, train,
                                          args.tokenizer_path, args.vocab_size,
                                          args.tokenizer_model_type,
                                          cache_dir=args.cache_dir)

    return (train, valid, test), tokenizer


def make_loaders(args):
    """makes training/val/test"""

    if args.data_loader == 'tfrecords':
        return make_tfrecord_loaders(args)
    world_size = torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())
    batch_size = args.batch_size * world_size
    eval_batch_size = batch_size
    if args.eval_batch_size is not None:
        eval_batch_size = args.eval_batch_size * world_size
    seq_length = args.seq_length
    if seq_length < 0:
        seq_length = seq_length * world_size
    eval_seq_length = args.eval_seq_length
    if eval_seq_length is not None and eval_seq_length < 0:
        eval_seq_length = eval_seq_length * world_size
    split = get_split(args)
    if args.data_path is not None:
        args.train_data = args.data_path
    data_set_args = {
        'path': args.train_data,
        'seq_length': seq_length,
        'lazy': args.data_loader == 'lazy',
        'delim': args.delim,
        'text_key': args.text_key,
        'label_key': 'label',
        'non_binary_cols': None,
        'ds_type': args.data_set_type,
        'split': split,
        'loose': args.loose_json,
        'tokenizer_type': args.tokenizer_type,
        'tokenizer_model_path': args.tokenizer_path,
        'vocab_size': args.vocab_size,
        'model_type': args.tokenizer_model_type,
        'cache_dir': args.cache_dir,
        'max_preds_per_seq': args.max_preds_per_seq,
        'presplit_sentences': args.presplit_sentences,
        'parallel_group': mpu.get_data_parallel_group()}

    eval_set_args = copy.copy(data_set_args)
    eval_set_args['split'] = [1.]
    # if optional eval args were set then replace their
    # equivalent values in the arg dict
    if eval_seq_length:
        eval_set_args['seq_length'] = eval_seq_length
    if args.eval_max_preds_per_seq:
        eval_set_args['max_preds_per_seq'] = args.eval_max_preds_per_seq
    if args.eval_text_key is not None:
        eval_set_args['text_key'] = args.eval_text_key

    # make datasets splits and tokenizer
    train = None
    valid = None
    test = None

    if args.train_data is not None:
        train, tokenizer = data_utils.make_dataset(**data_set_args)
        if data_utils.should_split(split):
            train, valid, test = train
        eval_set_args['tokenizer'] = tokenizer

    # make training and val dataset if necessary
    if valid is None and args.valid_data is not None:
        eval_set_args['path'] = args.valid_data
        valid, tokenizer = data_utils.make_dataset(**eval_set_args)
        eval_set_args['tokenizer'] = tokenizer
    if test is None and args.test_data is not None:
        eval_set_args['path'] = args.test_data
        test, tokenizer = data_utils.make_dataset(**eval_set_args)

    # wrap datasets with data loader
    if train is not None and args.batch_size > 0:
        train = make_data_loader(train, batch_size, args)
        args.do_train = True
    else:
        args.do_train = False
    eval_batch_size = eval_batch_size if eval_batch_size != 0 else batch_size
    if valid is not None:
        valid = make_data_loader(valid, eval_batch_size, args)
        args.do_valid = True
    else:
        args.do_valid = False
    if test is not None:
        test = make_data_loader(test, eval_batch_size, args)
        args.do_test = True
    else:
        args.do_test = False

    return (train, valid, test), tokenizer


def get_split(args):
    """
    Get dataset splits from comma separated string list
    """
    splits = []
    if args.split.find(',') != -1:
        splits = [float(s) for s in args.split.split(',')]
    elif args.split.find('/') != -1:
        splits = [float(s) for s in args.split.split('/')]
    else:
        splits = [float(args.split)]
    split_total = sum(splits)
    if split_total < 1.:
        splits.append(1 - split_total)
    while len(splits) < 3:
        splits.append(0.)
    splits = splits[:3]
    if args.valid_data is not None:
        splits[1] = 0.
    if args.test_data is not None:
        splits[2] = 0.
    final_sum = sum(splits)
    return [s / final_sum for s in splits]


def configure_data():
    """add cmdline flags for configuring datasets"""

    # These are options that are used by data_utils, but are either
    # deprecated or not meant to be exposed to the command line user.
    # These options are intended to be set in code by specific scripts.
    defaults = {
        'world_size': 1,
        'rank': -1,
        'persist_state': 0,
        'lazy': False,
        'transpose': False,
        'data_set_type': 'supervised',
        'seq_length': 256,
        'eval_seq_length': 256,
        'samples_per_shard': 100
    }

    return DataConfig(defaults=defaults)
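For reference, the normalisation in the removed `get_split` turns a string like '969,30,1' into fractions. A minimal re-trace of that logic with made-up inputs and without the args object:

def parse_split(split_str):
    # Same parsing/normalisation steps as the deleted get_split.
    if ',' in split_str:
        splits = [float(s) for s in split_str.split(',')]
    elif '/' in split_str:
        splits = [float(s) for s in split_str.split('/')]
    else:
        splits = [float(split_str)]
    if sum(splits) < 1.:
        splits.append(1 - sum(splits))
    while len(splits) < 3:
        splits.append(0.)
    splits = splits[:3]
    total = sum(splits)
    return [s / total for s in splits]

print(parse_split('969,30,1'))  # [0.969, 0.03, 0.001]
print(parse_split('0.5'))       # [0.5, 0.5, 0.0] -- the remainder becomes the second split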
megatron/deprecated_data_utils/corpora.py (deleted, 100755 → 0)

# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
# (Apache License, Version 2.0 header, identical to the one in blendable_dataset.py above.)

"""several datasets with preset arguments"""
from .datasets import json_dataset, csv_dataset
import os


class wikipedia(json_dataset):
    """
    dataset for wikipedia with arguments configured for convenience

    command line usage: `--train-data wikipedia`
    """
    PATH = 'data/wikipedia/wikidump_lines.json'
    assert_str = "make sure to set PATH for wikipedia data_utils/corpora.py"

    def __init__(self, **kwargs):
        assert os.path.exists(wikipedia.PATH), \
            wikipedia.assert_str
        if not kwargs:
            kwargs = {}
        kwargs['text_key'] = 'text'
        kwargs['loose_json'] = True
        super(wikipedia, self).__init__(wikipedia.PATH, **kwargs)


class webtext(json_dataset):
    """
    dataset for webtext with arguments configured for convenience

    command line usage: `--train-data webtext`
    """
    PATH = 'data/webtext/data.json'
    assert_str = "make sure to set PATH for webtext data_utils/corpora.py"

    def __init__(self, **kwargs):
        assert os.path.exists(webtext.PATH), \
            webtext.assert_str
        if not kwargs:
            kwargs = {}
        kwargs['text_key'] = 'text'
        kwargs['loose_json'] = True
        super(webtext, self).__init__(webtext.PATH, **kwargs)


NAMED_CORPORA = {
    'wikipedia': wikipedia,
    'webtext': webtext,
}
megatron/deprecated_data_utils/datasets.py (deleted, 100755 → 0)

(Diff collapsed in the original view; 883 deleted lines not shown.)
megatron/deprecated_data_utils/file_utils.py (deleted, 100755 → 0)

# This file is provided as is from:
#   https://github.com/huggingface/pytorch-pretrained-BERT
# Please refer to their repository for copyright.

"""
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
"""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import json
import logging
import os
import shutil
import tempfile
from functools import wraps
from hashlib import sha256
import sys
from io import open

import boto3
import requests
from botocore.exceptions import ClientError
from tqdm import tqdm

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse

try:
    from pathlib import Path
    PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv(
        'PYTORCH_PRETRAINED_BERT_CACHE',
        Path.home() / '.pytorch_pretrained_bert'))
except (AttributeError, ImportError):
    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv(
        'PYTORCH_PRETRAINED_BERT_CACHE',
        os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert'))

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


def url_to_filename(url, etag=None):
    """
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the url's, delimited
    by a period.
    """
    url_bytes = url.encode('utf-8')
    url_hash = sha256(url_bytes)
    filename = url_hash.hexdigest()

    if etag:
        etag_bytes = etag.encode('utf-8')
        etag_hash = sha256(etag_bytes)
        filename += '.' + etag_hash.hexdigest()

    return filename


def filename_to_url(filename, cache_dir=None):
    """
    Return the url and etag (which may be ``None``) stored for `filename`.
    Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    cache_path = os.path.join(cache_dir, filename)
    if not os.path.exists(cache_path):
        raise EnvironmentError("file {} not found".format(cache_path))

    meta_path = cache_path + '.json'
    if not os.path.exists(meta_path):
        raise EnvironmentError("file {} not found".format(meta_path))

    with open(meta_path, encoding="utf-8") as meta_file:
        metadata = json.load(meta_file)
    url = metadata['url']
    etag = metadata['etag']

    return url, etag


def cached_path(url_or_filename, cache_dir=None):
    """
    Given something that might be a URL (or might be a local path),
    determine which. If it's a URL, download the file and cache it, and
    return the path to the cached file. If it's already a local path,
    make sure the file exists and then return the path.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    parsed = urlparse(url_or_filename)

    if parsed.scheme in ('http', 'https', 's3'):
        # URL, so get it from the cache (downloading if necessary)
        return get_from_cache(url_or_filename, cache_dir)
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        return url_or_filename
    elif parsed.scheme == '':
        # File, but it doesn't exist.
        raise EnvironmentError("file {} not found".format(url_or_filename))
    else:
        # Something unknown
        raise ValueError("unable to parse {} as a URL or as a local path".format(
            url_or_filename))


def split_s3_path(url):
    """Split a full s3 path into the bucket name and path."""
    parsed = urlparse(url)
    if not parsed.netloc or not parsed.path:
        raise ValueError("bad s3 path {}".format(url))
    bucket_name = parsed.netloc
    s3_path = parsed.path
    # Remove '/' at beginning of path.
    if s3_path.startswith("/"):
        s3_path = s3_path[1:]
    return bucket_name, s3_path


def s3_request(func):
    """
    Wrapper function for s3 requests in order to create more helpful error
    messages.
    """

    @wraps(func)
    def wrapper(url, *args, **kwargs):
        try:
            return func(url, *args, **kwargs)
        except ClientError as exc:
            if int(exc.response["Error"]["Code"]) == 404:
                raise EnvironmentError("file {} not found".format(url))
            else:
                raise

    return wrapper


@s3_request
def s3_etag(url):
    """Check ETag on S3 object."""
    s3_resource = boto3.resource("s3")
    bucket_name, s3_path = split_s3_path(url)
    s3_object = s3_resource.Object(bucket_name, s3_path)
    return s3_object.e_tag


@s3_request
def s3_get(url, temp_file):
    """Pull a file directly from S3."""
    s3_resource = boto3.resource("s3")
    bucket_name, s3_path = split_s3_path(url)
    s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)


def http_get(url, temp_file):
    req = requests.get(url, stream=True)
    content_length = req.headers.get('Content-Length')
    total = int(content_length) if content_length is not None else None
    progress = tqdm(unit="B", total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()


def get_from_cache(url, cache_dir=None):
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    # Get eTag to add to filename, if it exists.
    if url.startswith("s3://"):
        etag = s3_etag(url)
    else:
        response = requests.head(url, allow_redirects=True)
        if response.status_code != 200:
            raise IOError("HEAD request failed for url {} with status code {}"
                          .format(url, response.status_code))
        etag = response.headers.get("ETag")

    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info("%s not found in cache, downloading to %s",
                        url, temp_file.name)

            # GET file object
            if url.startswith("s3://"):
                s3_get(url, temp_file)
            else:
                http_get(url, temp_file)

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)

            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
            with open(cache_path, 'wb') as cache_file:
                shutil.copyfileobj(temp_file, cache_file)

            logger.info("creating metadata file for %s", cache_path)
            meta = {'url': url, 'etag': etag}
            meta_path = cache_path + '.json'
            with open(meta_path, 'w', encoding="utf-8") as meta_file:
                json.dump(meta, meta_file)

            logger.info("removing temp file %s", temp_file.name)

    return cache_path


def read_set_from_file(filename):
    '''
    Extract a de-duped collection (set) of text from a file.
    Expected file format is one item per line.
    '''
    collection = set()
    with open(filename, 'r', encoding='utf-8') as file_:
        for line in file_:
            collection.add(line.rstrip())
    return collection


def get_file_extension(path, dot=True, lower=True):
    ext = os.path.splitext(path)[1]
    ext = ext if dot else ext[1:]
    return ext.lower() if lower else ext
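Typical use of this module was a single call to `cached_path`. A hedged usage sketch (the URL is a placeholder, and the module itself is removed by this commit, so this only runs against an older checkout):

# Hypothetical usage of the now-removed helper; the URL is a placeholder.
from megatron.deprecated_data_utils.file_utils import cached_path

local_file = cached_path('https://example.com/vocab.txt')
print(local_file)  # e.g. ~/.pytorch_pretrained_bert/<sha256(url)>.<sha256(etag)>

# A plain local path that exists short-circuits the download and is returned as-is.
assert cached_path(__file__) == __file__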
megatron/deprecated_data_utils/lazy_loader.py
deleted
100644 → 0
View file @
63c340ec
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""utils for loading text from disk"""
import
os
import
mmap
import
pickle
as
pkl
import
time
from
itertools
import
accumulate
import
torch
from
torch.multiprocessing
import
Lock
def
get_lazy_path
(
path
):
"""
Gets directory path where lazy files are stored.
"""
return
os
.
path
.
splitext
(
path
)[
0
]
+
'.lazy'
def
exists_lazy
(
path
,
data_type
=
'data'
):
"""
Check if we've already made a lazy version of this file for the `data_type` field.
"""
    if not os.path.exists(get_lazy_path(path)):
        return False
    contents = os.listdir(get_lazy_path(path))
    if data_type not in contents:
        return False
    if data_type + '.len.pkl' not in contents:
        return False
    return True


def make_lazy(path, strs, data_type='data'):
    """
    Make lazy version of `data_type` field of the file. Byte offsets
    corresponding to data indices are stored in a `.len.pkl` data file.
    """
    lazypath = get_lazy_path(path)
    if not os.path.exists(lazypath):
        os.makedirs(lazypath)
    datapath = os.path.join(lazypath, data_type)
    lenpath = os.path.join(lazypath, data_type + '.len.pkl')
    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
        with open(datapath, 'wb') as f:
            str_lens = []
            str_cnt = 0
            for s in strs:
                if isinstance(s, dict):
                    s = s['text']
                encoded = s.encode('utf-8')
                f.write(encoded)
                str_cnt = len(encoded)
                str_lens.append(str_cnt)
        pkl.dump(str_lens, open(lenpath, 'wb'))
    else:
        while not os.path.exists(lenpath):
            time.sleep(1)


def split_strings(strings, start, chr_lens):
    """
    Split strings based on string lengths and given start.
    """
    return [strings[i - start:j - start] for i, j in zip([start] + chr_lens[:-1], chr_lens)]


class ProcessorTokenizer:
    """
    callable class that runs a preprocessing, as well as tokenization step,
    on input text.
    """
    def __init__(self, tokenizer, process_fn=None):
        self.tokenizer = tokenizer
        self.process_fn = process_fn

    def __call__(self, string):
        if self.tokenizer is not None:
            string = self.tokenizer(string, process_fn=self.process_fn)
        elif self.process_fn is not None:
            string = self.process_fn(string)
        return string


class lazy_array_loader(object):
    """
    Arguments:
        path: path to directory where array entries are concatenated into one big string file
            and the .len file are located
        data_type (str): Some datsets have multiple fields that are stored in different paths.
            `data_type` specifies which of these fields to load in this class
        mem_map (boolean): Specifies whether to memory map file `path`
        map_fn (callable): Fetched strings are passed through map_fn before being returned.

    Example of lazy loader directory structure:
        file.json
        file.lazy/
            data_type1
            data_type1.len.pkl
            data_type2
            data_type2.len.pkl
    """
    def __init__(self, path, data_type='data', mem_map=False, map_fn=None):
        lazypath = get_lazy_path(path)
        datapath = os.path.join(lazypath, data_type)
        # get file where array entries are concatenated into one big string
        self._file = open(datapath, 'rb', buffering=0)
        self.file = self._file
        # memory map file if necessary
        self.mem_map = mem_map
        if self.mem_map:
            self.file = mmap.mmap(self.file.fileno(), 0, prot=mmap.PROT_READ)
        lenpath = os.path.join(lazypath, data_type + '.len.pkl')
        self.lens = pkl.load(open(lenpath, 'rb'))
        self.ends = list(accumulate(self.lens))
        self.dumb_ends = list(self.ends)
        self.read_lock = Lock()
        self.process_fn = map_fn
        self.map_fn = map_fn
        self._tokenizer = None

    def SetTokenizer(self, tokenizer):
        """
        logic to set and remove (set to None) tokenizer.
        combines preprocessing/tokenization into one callable.
        """
        if tokenizer is None:
            if not hasattr(self, '_tokenizer'):
                self._tokenizer = tokenizer
        else:
            self._tokenizer = tokenizer
        self.map_fn = ProcessorTokenizer(tokenizer, self.process_fn)

    def GetTokenizer(self):
        return self._tokenizer

    def __getitem__(self, index):
        """
        read file and splice strings based on string ending array `self.ends`
        """
        if not isinstance(index, slice):
            if index == 0:
                start = 0
            else:
                start = self.ends[index - 1]
            end = self.ends[index]
            rtn = self.file_read(start, end)
            if self.map_fn is not None:
                return self.map_fn(rtn)
        else:
            # if slice, fetch strings with 1 diskread and then splice in memory
            chr_lens = self.ends[index]
            if index.start == 0 or index.start is None:
                start = 0
            else:
                start = self.ends[index.start - 1]
            stop = chr_lens[-1]
            strings = self.file_read(start, stop)
            rtn = split_strings(strings, start, chr_lens)
            if self.map_fn is not None:
                return self.map_fn([s for s in rtn])
        return rtn

    def __len__(self):
        return len(self.ends)

    def file_read(self, start=0, end=None):
        """read specified portion of file"""
        # atomic reads to avoid race conditions with multiprocess dataloader
        self.read_lock.acquire()
        # seek to start of file read
        self.file.seek(start)
        # read to end of file if no end point provided
        if end is None:
            rtn = self.file.read()
        # else read amount needed to reach end point
        else:
            rtn = self.file.read(end - start)
        self.read_lock.release()
        # TODO: @raulp figure out mem map byte string bug
        # if mem map'd need to decode byte string to string
        rtn = rtn.decode('utf-8', 'ignore')
        # rtn = str(rtn)
        if self.mem_map:
            rtn = rtn.decode('unicode_escape')
        return rtn
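
For orientation, a minimal usage sketch of the deprecated lazy-loader API removed in this commit; the module path and the `corpus.json` corpus are assumptions for illustration only, not part of the diff.

# Hypothetical example of the deprecated lazy-loader workflow (module removed in this commit).
from megatron.deprecated_data_utils.lazy_loader import exists_lazy, make_lazy, lazy_array_loader

path = 'corpus.json'  # hypothetical loose-json corpus
docs = [{'text': 'first document'}, {'text': 'second document'}]

# Write corpus.lazy/data and corpus.lazy/data.len.pkl once (rank 0 only under torch.distributed).
if not exists_lazy(path, data_type='data'):
    make_lazy(path, docs, data_type='data')

# Random access by index or slice without loading the whole corpus into memory.
loader = lazy_array_loader(path, data_type='data', mem_map=False)
print(len(loader), loader[0], loader[0:2])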
megatron/deprecated_data_utils/samplers.py
deleted 100644 → 0
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""batch samplers that work with either random or sequential data samplers"""
import math
import os
import sys

import torch
from torch.utils import data
import numpy as np


class RandomSampler(data.sampler.Sampler):
    r"""
    Based off of pytorch RandomSampler and DistributedSampler. Essentially a RandomSampler,
    but this class lets the user set an epoch like DistributedSampler
    Samples elements randomly. If without replacement, then sample from a shuffled dataset.
    If with replacement, then user can specify ``num_samples`` to draw.
    Arguments:
        data_source (Dataset): dataset to sample from
        num_samples (int): number of samples to draw, default=len(dataset)
        replacement (bool): samples are drawn with replacement if ``True``, default=False
    """

    def __init__(self, data_source, replacement=False, num_samples=None):
        self.data_source = data_source
        self.replacement = replacement
        self._num_samples = num_samples
        self.epoch = -1

        if self._num_samples is not None and replacement is False:
            raise ValueError("With replacement=False, num_samples should not be specified, "
                             "since a random permute will be performed.")

        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
            raise ValueError("num_samples should be a positive integer "
                             "value, but got num_samples={}".format(self.num_samples))
        if not isinstance(self.replacement, bool):
            raise ValueError("replacement should be a boolean value, but got "
                             "replacement={}".format(self.replacement))

    @property
    def num_samples(self):
        # dataset size might change at runtime
        if self._num_samples is None:
            return len(self.data_source)
        return self._num_samples

    def __iter__(self):
        n = len(self.data_source)
        g = torch.Generator()
        if self.epoch >= 0:
            g.manual_seed(self.epoch)
        if self.replacement:
            return iter(torch.randint(high=n, size=(self.num_samples,),
                                      dtype=torch.int64, generator=g).tolist())
        return iter(torch.randperm(n, generator=g).tolist())

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        self.epoch = epoch


class DistributedBatchSampler(data.sampler.BatchSampler):
    """
    similar to normal implementation of distributed sampler, except implementation is at the
    batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary
    data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler.
    """

    def __init__(self, sampler, batch_size, drop_last, rank=-1, world_size=2, wrap_last=False):
        super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last)
        if rank == -1:
            assert False, 'should not be here'
            rank = torch.distributed.get_rank()
        self.rank = rank
        self.world_size = world_size
        self.sampler.wrap_around = 0
        self.wrap_around = 0
        self.wrap_last = wrap_last
        self.start_iter = 0

    def __iter__(self):
        batch = []
        last_batch = None
        i = 0
        for idx in self.data_iterator(self.sampler, wrap_around=False):
            batch.append(idx)
            if len(batch) == self.batch_size:
                tbatch = self._batch(batch)
                if i >= self.start_iter:
                    yield tbatch
                    self.start_iter = 0
                i += 1
                last_batch = np.array(list(tbatch))
                batch = []
        batch_len = len(batch)
        if batch_len > 0 and not self.drop_last:
            if self.wrap_last:
                self.sampler.wrap_around -= (self.batch_size)
                self.wrap_around += (len(batch))
                self.wrap_around %= self.batch_size
                if isinstance(self.sampler, TransposedSampler):
                    for i, idx in enumerate(self.data_iterator(self.sampler, wrap_around=True)):
                        if i == 0:
                            continue
                        batch.append(idx)
                        new_batch_len = len(batch)
                        if len(batch) == self.batch_size:
                            break
            yield self._batch(batch)
        if self.wrap_last:
            self.sampler.wrap_around += self.batch_size

    def data_iterator(self, _iter, wrap_around=False):
        """iterates through data and handles wrap around"""
        for i, idx in enumerate(_iter):
            if i < self.wrap_around % self.batch_size:
                continue
            if wrap_around:
                self.wrap_around += 1
                self.wrap_around %= self.batch_size
            yield idx

    def _batch(self, batch):
        """extracts samples only pertaining to this worker's batch"""
        start = self.rank * self.batch_size // self.world_size
        end = (self.rank + 1) * self.batch_size // self.world_size
        return batch[start:end]
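
A minimal sketch of how this deprecated sampler pair was meant to be wired into a PyTorch DataLoader; the 2-rank setup and module path are assumptions for illustration only.

# Hypothetical usage of the deprecated RandomSampler + DistributedBatchSampler pair.
import torch
from torch.utils.data import DataLoader, TensorDataset
from megatron.deprecated_data_utils.samplers import RandomSampler, DistributedBatchSampler

dataset = TensorDataset(torch.arange(100))
sampler = RandomSampler(dataset)
sampler.set_epoch(0)  # reseed the shuffle per epoch, like DistributedSampler

# Each rank keeps its slice of every global batch of size 8 (here: rank 0 of 2 ranks).
batch_sampler = DistributedBatchSampler(sampler, batch_size=8, drop_last=True,
                                        rank=0, world_size=2)
loader = DataLoader(dataset, batch_sampler=batch_sampler)
for batch in loader:
    pass  # each batch holds 8 // 2 = 4 samples for this rank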
megatron/deprecated_data_utils/scripts/presplit_sentences_json.py
deleted 100644 → 0
"""
Usage:
python scripts/presplit_sentences_json.py <original loose json file> <output loose json file>
"""
import sys
import json

import nltk

nltk.download('punkt')

input_file = sys.argv[1]
output_file = sys.argv[2]

line_seperator = "\n"

with open(input_file, 'r') as ifile:
    with open(output_file, "w") as ofile:
        for doc in ifile.readlines():
            parsed = json.loads(doc)
            sent_list = []
            for line in parsed['text'].split('\n'):
                if line != '\n':
                    sent_list.extend(nltk.tokenize.sent_tokenize(line))
            parsed['text'] = line_seperator.join(sent_list)
            ofile.write(json.dumps(parsed) + '\n')
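
For illustration, a hypothetical before/after of one loose-json record as the script above would transform it (record contents invented for this sketch).

# Hypothetical single-record walkthrough of the sentence presplitting above.
import json
import nltk
nltk.download('punkt')

record = {'text': 'First sentence. Second sentence.\nA new paragraph here.'}
sents = []
for line in record['text'].split('\n'):
    if line != '\n':
        sents.extend(nltk.tokenize.sent_tokenize(line))
record['text'] = '\n'.join(sents)
print(json.dumps(record))
# roughly: {"text": "First sentence.\nSecond sentence.\nA new paragraph here."}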
megatron/deprecated_data_utils/scripts/split_gpt2_json.py
deleted 100644 → 0
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Takes a corpora of files (specified by `--input_files`) with json data separated
by newlines (loose json). Splits data into train.json, val.json, test.json files
under `output_dir`.
Note: This code has the potential to override files with the names
train.json, val.json, test.json in `--output_dir`.
"""
import os
import argparse
import math
import random

parser = argparse.ArgumentParser('resplit loose json data into train/val/test')
parser.add_argument('--input_files', nargs='+', required=True,
                    help='whitespace separated list of input data files')
parser.add_argument('--output_dir', required=True,
                    help='output directory where to put files')
parser.add_argument('--test_percent', type=float, nargs='+', default=[0.05, 0],
                    help='percentage of available data to use for val/test dataset')
args = parser.parse_args()


def get_lines(filepath):
    lines = []
    with open(filepath, 'r') as f:
        for i, l in enumerate(f.readlines()):
            l = l.strip()
            lines.append(l)
    return lines


def get_splits(lines, line_counts):
    all_lines = []
    line_idx = []
    file_mappings = []
    for i, l in enumerate(lines):
        all_lines.extend(l)
        line_idx.extend(list(range(len(l))))
        file_mappings.extend([i] * len(l))

    indices = list(range(len(all_lines)))
    random.shuffle(indices)
    all_lines = [all_lines[idx] for idx in indices]
    line_idx = [line_idx[idx] for idx in indices]
    file_mappings = [file_mappings[idx] for idx in indices]

    splits = []
    mappings = []
    start = 0
    for end in line_counts:
        end += start
        splits.append(all_lines[start:end])
        mappings.append(format_mappings(line_idx[start:end], file_mappings[start:end]))
        start = end
    return splits, mappings


def format_mappings(line_idx, file_mappings):
    lines = []
    for m, l in zip(file_mappings, line_idx):
        lines.append(str(m).strip() + '\t' + str(l).strip())
    return lines


def get_filepaths(filepaths, output_dir):
    paths = []
    train_path = 'train.json'
    dev_path = 'dev.json'
    test_path = 'test.json'
    paths.append(os.path.join(output_dir, train_path))
    paths.append(os.path.join(output_dir, dev_path))
    paths.append(os.path.join(output_dir, test_path))
    return paths


def write_files(lines, mappings, filepaths):
    for l, m, path in zip(lines, mappings, filepaths):
        write_file(l, path)
        write_mapping_file(m, path)


def write_file(lines, path):
    print('Writing:', path)
    with open(path, 'w') as f:
        for l in lines:
            f.write(l + '\n')


def write_mapping_file(m, path):
    path = path + '.map'
    m = [get_mapping_header()] + m
    write_file(m, path)


def get_mapping_header():
    return 'file\tline #'


if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

lines = []
for filepath in args.input_files:
    _lines = get_lines(filepath)
    lines.append(_lines)

# calculate number of lines to use for each
line_counts = [len(l) for l in lines]
total_lines = sum(line_counts)
dev_percent = args.test_percent[0]
dev_lines = math.ceil(dev_percent * total_lines)
test_percent = 0
if len(args.test_percent) == 2:
    test_percent = args.test_percent[1]
test_lines = math.ceil(test_percent * total_lines)
train_lines = total_lines - (test_lines + dev_lines)
normed_lines = [train_lines, dev_lines, test_lines]
normed_lines = [int(l) for l in normed_lines]

splits, mappings = get_splits(lines, normed_lines)
filepaths = get_filepaths(args.input_files, args.output_dir)
print('Writing output to:', filepaths)
write_files(splits, mappings, filepaths)
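
To make the split arithmetic concrete, a small sketch of how `--test_percent` maps to per-split line counts in the script above; the line counts and percentages are hypothetical.

# Hypothetical split-size computation mirroring the script above, e.g.
#   python split_gpt2_json.py --input_files a.json b.json --output_dir splits --test_percent 0.05 0.01
import math

total_lines = 10000                       # combined line count of all --input_files
dev_percent, test_percent = 0.05, 0.01    # --test_percent VAL TEST

dev_lines = math.ceil(dev_percent * total_lines)      # 500  -> dev.json
test_lines = math.ceil(test_percent * total_lines)    # 100  -> test.json
train_lines = total_lines - (test_lines + dev_lines)  # 9400 -> train.json
print(train_lines, dev_lines, test_lines)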
megatron/deprecated_data_utils/scripts/split_json.py
deleted 100644 → 0
"""
Takes a corpora of files (specified by `--input_files`) with json data separated
by newlines (loose json). Splits data into train.json, val.json, test.json files
under `output_dir`.
Note: This code has the potential to override files with the names
train.json, val.json, test.json in `--output_dir`.
"""
import os
import argparse
import math
import random

parser = argparse.ArgumentParser('resplit loose json data into train/val/test')
parser.add_argument('--input_files', nargs='+', required=True,
                    help='whitespace separated list of input data files')
parser.add_argument('--output_dir', required=True,
                    help='output directory where to put files')
parser.add_argument('--test_percent', type=float, nargs='+', default=[0.05, 0],
                    help='percentage of available data to use for val/test dataset')
args = parser.parse_args()


def get_lines(filepath):
    lines = []
    with open(filepath, 'r') as f:
        for i, l in enumerate(f.readlines()):
            l = l.strip()
            lines.append(l)
    return lines


def get_splits(lines, line_counts):
    all_lines = []
    line_idx = []
    file_mappings = []
    for i, l in enumerate(lines):
        all_lines.extend(l)
        line_idx.extend(list(range(len(l))))
        file_mappings.extend([i] * len(l))

    indices = list(range(len(all_lines)))
    random.shuffle(indices)
    all_lines = [all_lines[idx] for idx in indices]
    line_idx = [line_idx[idx] for idx in indices]
    file_mappings = [file_mappings[idx] for idx in indices]

    splits = []
    mappings = []
    start = 0
    for end in line_counts:
        end += start
        splits.append(all_lines[start:end])
        mappings.append(format_mappings(line_idx[start:end], file_mappings[start:end]))
        start = end
    return splits, mappings


def format_mappings(line_idx, file_mappings):
    lines = []
    for m, l in zip(file_mappings, line_idx):
        lines.append(str(m).strip() + '\t' + str(l).strip())
    return lines


def get_filepaths(filepaths, output_dir):
    paths = []
    train_path = 'train.json'
    dev_path = 'dev.json'
    test_path = 'test.json'
    paths.append(os.path.join(output_dir, train_path))
    paths.append(os.path.join(output_dir, dev_path))
    paths.append(os.path.join(output_dir, test_path))
    return paths


def write_files(lines, mappings, filepaths):
    for l, m, path in zip(lines, mappings, filepaths):
        write_file(l, path)
        write_mapping_file(m, path)


def write_file(lines, path):
    print('Writing:', path)
    with open(path, 'w') as f:
        for l in lines:
            f.write(l + '\n')


def write_mapping_file(m, path):
    path = path + '.map'
    m = [get_mapping_header()] + m
    write_file(m, path)


def get_mapping_header():
    return 'file\tline #'


if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

lines = []
for filepath in args.input_files:
    _lines = get_lines(filepath)
    lines.append(_lines)

# calculate number of lines to use for each
line_counts = [len(l) for l in lines]
total_lines = sum(line_counts)
dev_percent = args.test_percent[0]
dev_lines = math.ceil(dev_percent * total_lines)
test_percent = 0
if len(args.test_percent) == 2:
    test_percent = args.test_percent[1]
test_lines = math.ceil(test_percent * total_lines)
train_lines = total_lines - (test_lines + dev_lines)
normed_lines = [train_lines, dev_lines, test_lines]
normed_lines = [int(l) for l in normed_lines]

splits, mappings = get_splits(lines, normed_lines)
filepaths = get_filepaths(args.input_files, args.output_dir)
print('Writing output to:', filepaths)
write_files(splits, mappings, filepaths)
megatron/deprecated_data_utils/tf_dl.py
deleted 100755 → 0
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch DataLoader for TFRecords"""
import numpy as np
import torch
import queue
import threading

import tensorflow as tf
tf.enable_eager_execution()


class TFRecordDataLoader(object):
    def __init__(self, records, batch_size, max_seq_len, max_preds_per_seq, train,
                 num_workers=2, seed=1, threaded_dl=False):
        assert max_preds_per_seq is not None, \
            "--max-preds-per-seq MUST BE SPECIFIED when using tfrecords"
        tf.set_random_seed(seed)
        if isinstance(records, str):
            records = [records]

        self.record_converter = Record2Example({
            "input_ids": tf.FixedLenFeature([max_seq_len], tf.int64),
            "input_mask": tf.FixedLenFeature([max_seq_len], tf.int64),
            "segment_ids": tf.FixedLenFeature([max_seq_len], tf.int64),
            "masked_lm_positions": tf.FixedLenFeature([max_preds_per_seq], tf.int64),
            "masked_lm_ids": tf.FixedLenFeature([max_preds_per_seq], tf.int64),
            "masked_lm_weights": tf.FixedLenFeature([max_preds_per_seq], tf.float32),
            "next_sentence_labels": tf.FixedLenFeature([1], tf.int64)})

        # Instantiate dataset according to original BERT implementation
        if train:
            self.dataset = tf.data.Dataset.from_tensor_slices(tf.constant(records))
            self.dataset = self.dataset.repeat()
            self.dataset = self.dataset.shuffle(buffer_size=len(records))

            # use sloppy tfrecord dataset
            self.dataset = self.dataset.apply(
                tf.contrib.data.parallel_interleave(
                    tf.data.TFRecordDataset,
                    sloppy=train,
                    cycle_length=min(num_workers, len(records))))
            self.dataset = self.dataset.shuffle(buffer_size=100)
        else:
            self.dataset = tf.data.TFRecordDataset(records)
            self.dataset = self.dataset.repeat()

        # Instantiate dataloader (do not drop remainder for eval)
        loader_args = {'batch_size': batch_size,
                       'num_parallel_batches': num_workers,
                       'drop_remainder': train}
        self.dataloader = self.dataset.apply(
            tf.contrib.data.map_and_batch(self.record_converter, **loader_args))
        self.threaded_dl = threaded_dl
        self.num_workers = num_workers

    def __iter__(self):
        if self.threaded_dl:
            data_iter = iter(MultiprocessLoader(self.dataloader, self.num_workers))
            for item in data_iter:
                yield item
        else:
            data_iter = iter(self.dataloader)
            for item in data_iter:
                yield convert_tf_example_to_torch_tensors(item)


class Record2Example(object):
    def __init__(self, feature_map):
        self.feature_map = feature_map

    def __call__(self, record):
        """Decodes a BERT TF record to a TF example."""
        example = tf.parse_single_example(record, self.feature_map)
        for k, v in list(example.items()):
            if v.dtype == tf.int64:
                example[k] = tf.to_int32(v)
        return example


def convert_tf_example_to_torch_tensors(example):
    item = {k: (v.numpy()) for k, v in example.items()}
    mask = np.zeros_like(item['input_ids'])
    mask_labels = np.ones_like(item['input_ids']) * -1
    for b, row in enumerate(item['masked_lm_positions'].astype(int)):
        for i, idx in enumerate(row):
            if item['masked_lm_weights'][b, i] != 0:
                mask[b, idx] = 1
                mask_labels[b, idx] = item['masked_lm_ids'][b, i]
    output = {'text': item['input_ids'], 'types': item['segment_ids'],
              'is_random': item['next_sentence_labels'], 'pad_mask': 1 - item['input_mask'],
              'mask': mask, 'mask_labels': mask_labels}
    return {k: torch.from_numpy(v) for k, v in output.items()}


class MultiprocessLoader(object):
    def __init__(self, dataloader, num_workers=2):
        self.dl = dataloader
        self.queue_size = 2 * num_workers

    def __iter__(self):
        output_queue = queue.Queue(self.queue_size)
        output_thread = threading.Thread(target=_multiproc_iter,
                                         args=(self.dl, output_queue))
        output_thread.daemon = True
        output_thread.start()

        while output_thread.is_alive():
            yield output_queue.get(block=True)
        else:
            print(RuntimeError('TF record data loader thread exited unexpectedly'))


def _multiproc_iter(dl, output_queue):
    data_iter = iter(dl)
    for item in data_iter:
        tensors = convert_tf_example_to_torch_tensors(item)
        output_queue.put(tensors, block=True)
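
A minimal usage sketch of the deprecated TFRecord loader removed in this commit; it assumes a TF 1.x environment (for `tf.enable_eager_execution` and `tf.contrib`) and a hypothetical `bert_pretrain.tfrecord` file.

# Hypothetical usage of the deprecated TFRecordDataLoader above.
from megatron.deprecated_data_utils.tf_dl import TFRecordDataLoader

loader = TFRecordDataLoader(records=['bert_pretrain.tfrecord'],  # hypothetical file
                            batch_size=32,
                            max_seq_len=512,
                            max_preds_per_seq=80,
                            train=True,
                            num_workers=2)

for batch in loader:
    # each batch is a dict of torch tensors: 'text', 'types', 'is_random',
    # 'pad_mask', 'mask', 'mask_labels'
    break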