ModelZoo / LLama_fastertransformer / Commits / 0211193c

Commit 0211193c authored Aug 17, 2023 by zhuwenwen

    initial llama

Pipeline #509 failed with stages in 0 seconds
Showing 20 changed files with 4739 additions and 0 deletions (+4739, -0)
3rdparty/Megatron-LM/megatron/data/blendable_dataset.py            +68   -0
3rdparty/Megatron-LM/megatron/data/data_samplers.py                +199  -0
3rdparty/Megatron-LM/megatron/data/dataset_utils.py                +719  -0
3rdparty/Megatron-LM/megatron/data/gpt_dataset.py                  +430  -0
3rdparty/Megatron-LM/megatron/data/helpers.cpp                     +717  -0
3rdparty/Megatron-LM/megatron/data/ict_dataset.py                  +156  -0
3rdparty/Megatron-LM/megatron/data/image_folder.py                 +302  -0
3rdparty/Megatron-LM/megatron/data/indexed_dataset.py              +570  -0
3rdparty/Megatron-LM/megatron/data/orqa_wiki_dataset.py            +205  -0
3rdparty/Megatron-LM/megatron/data/realm_dataset_utils.py          +198  -0
3rdparty/Megatron-LM/megatron/data/realm_index.py                  +224  -0
3rdparty/Megatron-LM/megatron/data/t5_dataset.py                   +270  -0
3rdparty/Megatron-LM/megatron/data/test/test_indexed_dataset.py    +125  -0
3rdparty/Megatron-LM/megatron/data/test/test_preprocess_data.sh    +10   -0
3rdparty/Megatron-LM/megatron/data/vit_dataset.py                  +79   -0
3rdparty/Megatron-LM/megatron/dist_signal_handler.py               +81   -0
3rdparty/Megatron-LM/megatron/fp16_deprecated/loss_scaler.py       +39   -0
3rdparty/Megatron-LM/megatron/fused_kernels/__init__.py            +115  -0
3rdparty/Megatron-LM/megatron/fused_kernels/compat.h               +31   -0
3rdparty/Megatron-LM/megatron/fused_kernels/layer_norm_cuda.cpp    +201  -0
Too many changes to show: to preserve performance only 260 of 260+ files are displayed.
3rdparty/Megatron-LM/megatron/data/blendable_dataset.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Blendable dataset."""

import time

import numpy as np
import torch

from megatron import print_rank_0
from megatron import mpu


class BlendableDataset(torch.utils.data.Dataset):

    def __init__(self, datasets, weights):

        self.datasets = datasets
        num_datasets = len(datasets)
        assert num_datasets == len(weights)

        self.size = 0
        for dataset in self.datasets:
            self.size += len(dataset)

        # Normalize weights.
        weights = np.array(weights, dtype=np.float64)
        sum_weights = np.sum(weights)
        assert sum_weights > 0.0
        weights /= sum_weights

        # Build indecies.
        start_time = time.time()
        assert num_datasets < 255
        self.dataset_index = np.zeros(self.size, dtype=np.uint8)
        self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)

        from megatron.data import helpers
        helpers.build_blending_indices(self.dataset_index,
                                       self.dataset_sample_index,
                                       weights, num_datasets, self.size,
                                       torch.distributed.get_rank() == 0)
        print_rank_0('> elapsed time for building blendable dataset indices: '
                     '{:.2f} (sec)'.format(time.time() - start_time))

    def __len__(self):
        return self.size

    def __getitem__(self, idx):
        dataset_idx = self.dataset_index[idx]
        sample_idx = self.dataset_sample_index[idx]
        return self.datasets[dataset_idx][sample_idx]
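
BlendableDataset delegates the index construction to the compiled helpers.build_blending_indices (see helpers.cpp further down in this diff). As a reading aid only, and not part of this commit, the snippet below is a minimal NumPy sketch of the same greedy rule: at step i, pick the dataset whose achieved sample count lags furthest behind weight * i.

    # Minimal NumPy-only sketch of the greedy blending rule implemented in C++.
    import numpy as np

    def build_blending_indices_py(weights, size):
        weights = np.asarray(weights, dtype=np.float64)
        weights = weights / weights.sum()
        dataset_index = np.zeros(size, dtype=np.uint8)
        dataset_sample_index = np.zeros(size, dtype=np.int64)
        current_samples = np.zeros(len(weights), dtype=np.int64)
        for i in range(size):
            # Error of each dataset relative to its target share so far.
            errors = weights * max(i, 1) - current_samples
            chosen = int(np.argmax(errors))
            dataset_index[i] = chosen
            dataset_sample_index[i] = current_samples[chosen]
            current_samples[chosen] += 1
        return dataset_index, dataset_sample_index

    if __name__ == "__main__":
        idx, sample_idx = build_blending_indices_py([0.7, 0.3], 10)
        print(idx.tolist())         # roughly 7 zeros and 3 ones, interleaved
        print(sample_idx.tolist())  # per-dataset running sample counters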
3rdparty/Megatron-LM/megatron/data/data_samplers.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataloaders."""

import random
import torch
import numpy as np
from torch.utils.data import Dataset
from megatron import get_args
from megatron import mpu


def build_pretraining_data_loader(dataset, consumed_samples):
    """Buld dataloader given an input dataset."""

    if dataset is None:
        return None
    args = get_args()

    # Megatron sampler
    if args.dataloader_type == 'single':
        batch_sampler = MegatronPretrainingSampler(
            total_samples=len(dataset),
            consumed_samples=consumed_samples,
            micro_batch_size=args.micro_batch_size,
            data_parallel_rank=mpu.get_data_parallel_rank(),
            data_parallel_size=mpu.get_data_parallel_world_size())
    elif args.dataloader_type == 'cyclic':
        batch_sampler = MegatronPretrainingRandomSampler(
            dataset,
            total_samples=len(dataset),
            consumed_samples=consumed_samples,
            micro_batch_size=args.micro_batch_size,
            data_parallel_rank=mpu.get_data_parallel_rank(),
            data_parallel_size=mpu.get_data_parallel_world_size(),
            data_sharding=args.data_sharding)
    else:
        raise Exception('{} dataloader type is not supported.'.format(
            args.dataloader_type))

    # Torch dataloader.
    return torch.utils.data.DataLoader(dataset,
                                       batch_sampler=batch_sampler,
                                       num_workers=args.num_workers,
                                       pin_memory=True)


class MegatronPretrainingSampler:

    def __init__(self, total_samples, consumed_samples, micro_batch_size,
                 data_parallel_rank, data_parallel_size, drop_last=True):
        # Keep a copy of input params for later use.
        self.total_samples = total_samples
        self.consumed_samples = consumed_samples
        self.micro_batch_size = micro_batch_size
        self.data_parallel_rank = data_parallel_rank
        self.micro_batch_times_data_parallel_size = \
            self.micro_batch_size * data_parallel_size
        self.drop_last = drop_last

        # Sanity checks.
        assert self.total_samples > 0, \
            'no sample to consume: {}'.format(self.total_samples)
        assert self.consumed_samples < self.total_samples, \
            'no samples left to consume: {}, {}'.format(self.consumed_samples,
                                                        self.total_samples)
        assert self.micro_batch_size > 0
        assert data_parallel_size > 0
        assert self.data_parallel_rank < data_parallel_size, \
            'data_parallel_rank should be smaller than data size: {}, ' \
            '{}'.format(self.data_parallel_rank, data_parallel_size)

    def __len__(self):
        return self.total_samples

    def get_start_end_idx(self):
        start_idx = self.data_parallel_rank * self.micro_batch_size
        end_idx = start_idx + self.micro_batch_size
        return start_idx, end_idx

    def __iter__(self):
        batch = []
        # Last batch will be dropped if drop_last is not set False
        for idx in range(self.consumed_samples, self.total_samples):
            batch.append(idx)
            if len(batch) == self.micro_batch_times_data_parallel_size:
                start_idx, end_idx = self.get_start_end_idx()
                yield batch[start_idx:end_idx]
                batch = []

        # Check the last partial batch and see drop_last is set
        if len(batch) > 0 and not self.drop_last:
            start_idx, end_idx = self.get_start_end_idx()
            yield batch[start_idx:end_idx]


class RandomSeedDataset(Dataset):

    def __init__(self, dataset):
        args = get_args()
        self.base_seed = args.seed
        self.curr_seed = args.seed
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def set_epoch(self, epoch):
        self.curr_seed = self.base_seed + epoch

    def __getitem__(self, idx):
        seed = idx + self.curr_seed
        torch.manual_seed(seed)
        random.seed(seed)
        np.random.seed(seed)
        return self.dataset[idx]


class MegatronPretrainingRandomSampler:

    def __init__(self, dataset, total_samples, consumed_samples, micro_batch_size,
                 data_parallel_rank, data_parallel_size, data_sharding):
        # Keep a copy of input params for later use.
        self.dataset = dataset
        self.total_samples = total_samples
        self.consumed_samples = consumed_samples
        self.micro_batch_size = micro_batch_size
        self.data_parallel_rank = data_parallel_rank
        self.data_parallel_size = data_parallel_size
        self.data_sharding = data_sharding
        self.micro_batch_times_data_parallel_size = \
            self.micro_batch_size * data_parallel_size
        self.last_batch_size = \
            self.total_samples % self.micro_batch_times_data_parallel_size

        # Sanity checks.
        assert self.total_samples > 0, \
            'no sample to consume: {}'.format(self.total_samples)
        assert self.micro_batch_size > 0
        assert data_parallel_size > 0
        assert self.data_parallel_rank < data_parallel_size, \
            'data_parallel_rank should be smaller than data size: {}, ' \
            '{}'.format(self.data_parallel_rank, data_parallel_size)

    def __len__(self):
        return self.total_samples

    def __iter__(self):
        active_total_samples = self.total_samples - self.last_batch_size
        self.epoch = self.consumed_samples // active_total_samples
        current_epoch_samples = self.consumed_samples % active_total_samples
        assert current_epoch_samples % self.micro_batch_times_data_parallel_size == 0

        if isinstance(self.dataset, RandomSeedDataset):
            self.dataset.set_epoch(self.epoch)

        # data sharding and random sampling
        if self.data_sharding:
            bucket_size = (self.total_samples // self.micro_batch_times_data_parallel_size) \
                           * self.micro_batch_size
            bucket_offset = current_epoch_samples // self.data_parallel_size
            start_idx = self.data_parallel_rank * bucket_size

            g = torch.Generator()
            g.manual_seed(self.epoch)
            random_idx = torch.randperm(bucket_size, generator=g).tolist()
            idx_range = [start_idx + x for x in random_idx[bucket_offset:]]
        else:
            full_bucket_size = (self.total_samples // self.micro_batch_size) \
                                * self.micro_batch_size
            full_bucket_offset = current_epoch_samples
            g = torch.Generator()
            g.manual_seed(self.epoch)
            idx_range_total = \
                torch.randperm(full_bucket_size, generator=g).tolist()
            idx_range_active = idx_range_total[full_bucket_offset:]
            idx_range = idx_range_active[self.data_parallel_rank::self.data_parallel_size]

        batch = []
        # Last batch if not complete will be dropped.
        for idx in idx_range:
            batch.append(idx)
            if len(batch) == self.micro_batch_size:
                self.consumed_samples += self.micro_batch_times_data_parallel_size
                yield batch
                batch = []
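
The slicing in MegatronPretrainingSampler.get_start_end_idx is easiest to see with toy numbers. The snippet below is a standalone illustration with assumed values (micro_batch_size=4, data_parallel_size=2); it is not Megatron code, it only replays the same arithmetic.

    # Each global batch holds micro_batch_size * data_parallel_size sample indices,
    # and every data-parallel rank keeps only its contiguous micro-batch slice.
    micro_batch_size = 4
    data_parallel_size = 2
    global_batch = list(range(micro_batch_size * data_parallel_size))  # [0 .. 7]

    for data_parallel_rank in range(data_parallel_size):
        start_idx = data_parallel_rank * micro_batch_size
        end_idx = start_idx + micro_batch_size
        print(data_parallel_rank, global_batch[start_idx:end_idx])
    # rank 0 gets [0, 1, 2, 3]; rank 1 gets [4, 5, 6, 7]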
3rdparty/Megatron-LM/megatron/data/dataset_utils.py  0 → 100644
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Most of the code here has been copied from:
# https://github.com/google-research/albert/blob/master/create_pretraining_data.py
# with some modifications.

import math
import os
import time
import collections

import numpy as np
import torch

from megatron import (
    get_args,
    mpu,
    print_rank_0
)
from megatron.data.blendable_dataset import BlendableDataset
from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset

DSET_TYPE_BERT = 'standard_bert'
DSET_TYPE_ICT = 'ict'
DSET_TYPE_T5 = 't5'

DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5]


def get_datasets_weights_and_num_samples(data_prefix,
                                         train_valid_test_num_samples):

    # The data prefix should be in the format of:
    #   weight-1, data-prefix-1, weight-2, data-prefix-2, ..
    assert len(data_prefix) % 2 == 0
    num_datasets = len(data_prefix) // 2
    weights = [0] * num_datasets
    prefixes = [0] * num_datasets
    for i in range(num_datasets):
        weights[i] = float(data_prefix[2 * i])
        prefixes[i] = (data_prefix[2 * i + 1]).strip()
    # Normalize weights
    weight_sum = 0.0
    for weight in weights:
        weight_sum += weight
    assert weight_sum > 0.0
    weights = [weight / weight_sum for weight in weights]

    # Add 0.5% (the 1.005 factor) so in case the bleding dataset does
    # not uniformly distribute the number of samples, we still have
    # samples left to feed to the network.
    datasets_train_valid_test_num_samples = []
    for weight in weights:
        datasets_train_valid_test_num_samples.append(
            [int(math.ceil(val * weight * 1.005))
             for val in train_valid_test_num_samples])

    return prefixes, weights, datasets_train_valid_test_num_samples


def compile_helper():
    """Compile helper function ar runtime. Make sure this
    is invoked on a single process."""
    import os
    import subprocess
    path = os.path.abspath(os.path.dirname(__file__))
    ret = subprocess.run(['make', '-C', path])
    if ret.returncode != 0:
        print("Making C++ dataset helpers module failed, exiting.")
        import sys
        sys.exit(1)


def get_a_and_b_segments(sample, np_rng):
    """Divide sample into a and b segments."""

    # Number of sentences in the sample.
    n_sentences = len(sample)
    # Make sure we always have two sentences.
    assert n_sentences > 1, 'make sure each sample has at least two sentences.'

    # First part:
    # `a_end` is how many sentences go into the `A`.
    a_end = 1
    if n_sentences >= 3:
        # Note that randin in numpy is exclusive.
        a_end = np_rng.randint(1, n_sentences)
    tokens_a = []
    for j in range(a_end):
        tokens_a.extend(sample[j])

    # Second part:
    tokens_b = []
    for j in range(a_end, n_sentences):
        tokens_b.extend(sample[j])

    # Random next:
    is_next_random = False
    if np_rng.random() < 0.5:
        is_next_random = True
        tokens_a, tokens_b = tokens_b, tokens_a

    return tokens_a, tokens_b, is_next_random


def truncate_segments(tokens_a, tokens_b, len_a, len_b, max_num_tokens, np_rng):
    """Truncates a pair of sequences to a maximum sequence length."""
    #print(len_a, len_b, max_num_tokens)
    assert len_a > 0
    if len_a + len_b <= max_num_tokens:
        return False
    while len_a + len_b > max_num_tokens:
        if len_a > len_b:
            len_a -= 1
            tokens = tokens_a
        else:
            len_b -= 1
            tokens = tokens_b
        if np_rng.random() < 0.5:
            del tokens[0]
        else:
            tokens.pop()
    return True


def create_tokens_and_tokentypes(tokens_a, tokens_b, cls_id, sep_id):
    """Merge segments A and B, add [CLS] and [SEP] and build tokentypes."""

    tokens = []
    tokentypes = []
    # [CLS].
    tokens.append(cls_id)
    tokentypes.append(0)
    # Segment A.
    for token in tokens_a:
        tokens.append(token)
        tokentypes.append(0)
    # [SEP].
    tokens.append(sep_id)
    tokentypes.append(0)
    # Segment B.
    for token in tokens_b:
        tokens.append(token)
        tokentypes.append(1)
    if tokens_b:
        # [SEP].
        tokens.append(sep_id)
        tokentypes.append(1)

    return tokens, tokentypes


MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
                                          ["index", "label"])


def is_start_piece(piece):
    """Check if the current word piece is the starting piece (BERT)."""
    # When a word has been split into
    # WordPieces, the first token does not have any marker and any subsequence
    # tokens are prefixed with ##. So whenever we see the ## token, we
    # append it to the previous set of word indexes.
    return not piece.startswith("##")


def create_masked_lm_predictions(tokens,
                                 vocab_id_list, vocab_id_to_token_dict,
                                 masked_lm_prob,
                                 cls_id, sep_id, mask_id,
                                 max_predictions_per_seq,
                                 np_rng,
                                 max_ngrams=3,
                                 do_whole_word_mask=True,
                                 favor_longer_ngram=False,
                                 do_permutation=False,
                                 geometric_dist=False,
                                 masking_style="bert"):
    """Creates the predictions for the masked LM objective.
    Note: Tokens here are vocab ids and not text tokens."""

    cand_indexes = []
    # Note(mingdachen): We create a list for recording if the piece is
    # the starting piece of current token, where 1 means true, so that
    # on-the-fly whole word masking is possible.
    token_boundary = [0] * len(tokens)

    for (i, token) in enumerate(tokens):
        if token == cls_id or token == sep_id:
            token_boundary[i] = 1
            continue
        # Whole Word Masking means that if we mask all of the wordpieces
        # corresponding to an original word.
        #
        # Note that Whole Word Masking does *not* change the training code
        # at all -- we still predict each WordPiece independently, softmaxed
        # over the entire vocabulary.
        if (do_whole_word_mask and len(cand_indexes) >= 1 and
                not is_start_piece(vocab_id_to_token_dict[token])):
            cand_indexes[-1].append(i)
        else:
            cand_indexes.append([i])
            if is_start_piece(vocab_id_to_token_dict[token]):
                token_boundary[i] = 1

    output_tokens = list(tokens)

    masked_lm_positions = []
    masked_lm_labels = []

    if masked_lm_prob == 0:
        return (output_tokens, masked_lm_positions,
                masked_lm_labels, token_boundary)

    num_to_predict = min(max_predictions_per_seq,
                         max(1, int(round(len(tokens) * masked_lm_prob))))

    ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64)
    if not geometric_dist:
        # Note(mingdachen):
        # By default, we set the probilities to favor shorter ngram sequences.
        pvals = 1. / np.arange(1, max_ngrams + 1)
        pvals /= pvals.sum(keepdims=True)
        if favor_longer_ngram:
            pvals = pvals[::-1]

    ngram_indexes = []
    for idx in range(len(cand_indexes)):
        ngram_index = []
        for n in ngrams:
            ngram_index.append(cand_indexes[idx:idx + n])
        ngram_indexes.append(ngram_index)

    np_rng.shuffle(ngram_indexes)

    (masked_lms, masked_spans) = ([], [])
    covered_indexes = set()
    for cand_index_set in ngram_indexes:
        if len(masked_lms) >= num_to_predict:
            break
        if not cand_index_set:
            continue
        # Note(mingdachen):
        # Skip current piece if they are covered in lm masking or previous ngrams.
        for index_set in cand_index_set[0]:
            for index in index_set:
                if index in covered_indexes:
                    continue

        if not geometric_dist:
            n = np_rng.choice(ngrams[:len(cand_index_set)],
                              p=pvals[:len(cand_index_set)] /
                              pvals[:len(cand_index_set)].sum(keepdims=True))
        else:
            # Sampling "n" from the geometric distribution and clipping it to
            # the max_ngrams. Using p=0.2 default from the SpanBERT paper
            # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1)
            n = min(np_rng.geometric(0.2), max_ngrams)

        index_set = sum(cand_index_set[n - 1], [])
        n -= 1
        # Note(mingdachen):
        # Repeatedly looking for a candidate that does not exceed the
        # maximum number of predictions by trying shorter ngrams.
        while len(masked_lms) + len(index_set) > num_to_predict:
            if n == 0:
                break
            index_set = sum(cand_index_set[n - 1], [])
            n -= 1
        # If adding a whole-word mask would exceed the maximum number of
        # predictions, then just skip this candidate.
        if len(masked_lms) + len(index_set) > num_to_predict:
            continue
        is_any_index_covered = False
        for index in index_set:
            if index in covered_indexes:
                is_any_index_covered = True
                break
        if is_any_index_covered:
            continue
        for index in index_set:
            covered_indexes.add(index)
            masked_token = None
            if masking_style == "bert":
                # 80% of the time, replace with [MASK]
                if np_rng.random() < 0.8:
                    masked_token = mask_id
                else:
                    # 10% of the time, keep original
                    if np_rng.random() < 0.5:
                        masked_token = tokens[index]
                    # 10% of the time, replace with random word
                    else:
                        masked_token = vocab_id_list[np_rng.randint(0, len(vocab_id_list))]
            elif masking_style == "t5":
                masked_token = mask_id
            else:
                raise ValueError("invalid value of masking style")

            output_tokens[index] = masked_token
            masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))

        masked_spans.append(MaskedLmInstance(
            index=index_set,
            label=[tokens[index] for index in index_set]))

    assert len(masked_lms) <= num_to_predict

    np_rng.shuffle(ngram_indexes)

    select_indexes = set()
    if do_permutation:
        for cand_index_set in ngram_indexes:
            if len(select_indexes) >= num_to_predict:
                break
            if not cand_index_set:
                continue
            # Note(mingdachen):
            # Skip current piece if they are covered in lm masking or previous ngrams.
            for index_set in cand_index_set[0]:
                for index in index_set:
                    if index in covered_indexes or index in select_indexes:
                        continue

            n = np.random.choice(ngrams[:len(cand_index_set)],
                                 p=pvals[:len(cand_index_set)] /
                                 pvals[:len(cand_index_set)].sum(keepdims=True))
            index_set = sum(cand_index_set[n - 1], [])
            n -= 1

            while len(select_indexes) + len(index_set) > num_to_predict:
                if n == 0:
                    break
                index_set = sum(cand_index_set[n - 1], [])
                n -= 1
            # If adding a whole-word mask would exceed the maximum number of
            # predictions, then just skip this candidate.
            if len(select_indexes) + len(index_set) > num_to_predict:
                continue
            is_any_index_covered = False
            for index in index_set:
                if index in covered_indexes or index in select_indexes:
                    is_any_index_covered = True
                    break
            if is_any_index_covered:
                continue
            for index in index_set:
                select_indexes.add(index)
        assert len(select_indexes) <= num_to_predict

        select_indexes = sorted(select_indexes)
        permute_indexes = list(select_indexes)
        np_rng.shuffle(permute_indexes)
        orig_token = list(output_tokens)

        for src_i, tgt_i in zip(select_indexes, permute_indexes):
            output_tokens[src_i] = orig_token[tgt_i]
            masked_lms.append(MaskedLmInstance(index=src_i, label=orig_token[src_i]))

    masked_lms = sorted(masked_lms, key=lambda x: x.index)
    # Sort the spans by the index of the first span
    masked_spans = sorted(masked_spans, key=lambda x: x.index[0])

    for p in masked_lms:
        masked_lm_positions.append(p.index)
        masked_lm_labels.append(p.label)
    return (output_tokens, masked_lm_positions, masked_lm_labels,
            token_boundary, masked_spans)


def pad_and_convert_to_numpy(tokens, tokentypes, masked_positions,
                             masked_labels, pad_id, max_seq_length):
    """Pad sequences and convert them to numpy."""

    # Some checks.
    num_tokens = len(tokens)
    padding_length = max_seq_length - num_tokens
    assert padding_length >= 0
    assert len(tokentypes) == num_tokens
    assert len(masked_positions) == len(masked_labels)

    # Tokens and token types.
    filler = [pad_id] * padding_length
    tokens_np = np.array(tokens + filler, dtype=np.int64)
    tokentypes_np = np.array(tokentypes + filler, dtype=np.int64)

    # Padding mask.
    padding_mask_np = np.array([1] * num_tokens + [0] * padding_length,
                               dtype=np.int64)

    # Lables and loss mask.
    labels = [-1] * max_seq_length
    loss_mask = [0] * max_seq_length
    for i in range(len(masked_positions)):
        assert masked_positions[i] < num_tokens
        labels[masked_positions[i]] = masked_labels[i]
        loss_mask[masked_positions[i]] = 1
    labels_np = np.array(labels, dtype=np.int64)
    loss_mask_np = np.array(loss_mask, dtype=np.int64)

    return tokens_np, tokentypes_np, labels_np, padding_mask_np, loss_mask_np


def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                                    train_valid_test_num_samples,
                                    max_seq_length, masked_lm_prob,
                                    short_seq_prob, seed, skip_warmup,
                                    binary_head=False,
                                    max_seq_length_dec=None,
                                    dataset_type='standard_bert'):

    if len(data_prefix) == 1:
        return _build_train_valid_test_datasets(data_prefix[0],
                                                data_impl, splits_string,
                                                train_valid_test_num_samples,
                                                max_seq_length, masked_lm_prob,
                                                short_seq_prob, seed,
                                                skip_warmup,
                                                binary_head,
                                                max_seq_length_dec,
                                                dataset_type=dataset_type)
    # Blending dataset.
    # Parse the values.
    output = get_datasets_weights_and_num_samples(data_prefix,
                                                  train_valid_test_num_samples)
    prefixes, weights, datasets_train_valid_test_num_samples = output

    # Build individual datasets.
    train_datasets = []
    valid_datasets = []
    test_datasets = []
    for i in range(len(prefixes)):
        train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
            prefixes[i], data_impl, splits_string,
            datasets_train_valid_test_num_samples[i],
            max_seq_length, masked_lm_prob, short_seq_prob,
            seed, skip_warmup, binary_head, dataset_type=dataset_type)
        if train_ds:
            train_datasets.append(train_ds)
        if valid_ds:
            valid_datasets.append(valid_ds)
        if test_ds:
            test_datasets.append(test_ds)

    # Blend.
    blending_train_dataset = None
    if train_datasets:
        blending_train_dataset = BlendableDataset(train_datasets, weights)
    blending_valid_dataset = None
    if valid_datasets:
        blending_valid_dataset = BlendableDataset(valid_datasets, weights)
    blending_test_dataset = None
    if test_datasets:
        blending_test_dataset = BlendableDataset(test_datasets, weights)

    return (blending_train_dataset, blending_valid_dataset,
            blending_test_dataset)


def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                                     train_valid_test_num_samples,
                                     max_seq_length, masked_lm_prob,
                                     short_seq_prob, seed, skip_warmup,
                                     binary_head, max_seq_length_dec,
                                     dataset_type='standard_bert'):

    if dataset_type not in DSET_TYPES:
        raise ValueError("Invalid dataset_type: ", dataset_type)

    # Indexed dataset.
    indexed_dataset = get_indexed_dataset_(data_prefix,
                                           data_impl,
                                           skip_warmup)

    if dataset_type == DSET_TYPE_ICT:
        args = get_args()
        title_dataset = get_indexed_dataset_(args.titles_data_path,
                                             data_impl,
                                             skip_warmup)

    # Get start and end indices of train/valid/train into doc-idx
    # Note that doc-idx is desinged to be num-docs + 1 so we can
    # easily iterate over it.
    total_num_of_documents = indexed_dataset.doc_idx.shape[0] - 1
    splits = get_train_valid_test_split_(splits_string, total_num_of_documents)

    # Print stats about the splits.
    print_rank_0(' > dataset split:')

    def print_split_stats(name, index):
        print_rank_0('    {}:'.format(name))
        print_rank_0('     document indices in [{}, {}) total of {} '
                     'documents'.format(splits[index], splits[index + 1],
                                        splits[index + 1] - splits[index]))
        start_index = indexed_dataset.doc_idx[splits[index]]
        end_index = indexed_dataset.doc_idx[splits[index + 1]]
        print_rank_0('     sentence indices in [{}, {}) total of {} '
                     'sentences'.format(start_index, end_index,
                                        end_index - start_index))
    print_split_stats('train', 0)
    print_split_stats('validation', 1)
    print_split_stats('test', 2)

    def build_dataset(index, name):
        from megatron.data.bert_dataset import BertDataset
        from megatron.data.ict_dataset import ICTDataset
        from megatron.data.t5_dataset import T5Dataset
        dataset = None
        if splits[index + 1] > splits[index]:
            # Get the pointer to the original doc-idx so we can set it later.
            doc_idx_ptr = indexed_dataset.get_doc_idx()
            # Slice the doc-idx
            start_index = splits[index]
            # Add +1 so we can index into the dataset to get the upper bound.
            end_index = splits[index + 1] + 1
            # New doc_idx view.
            indexed_dataset.set_doc_idx(doc_idx_ptr[start_index:end_index])
            # Build the dataset accordingly.
            kwargs = dict(
                name=name,
                data_prefix=data_prefix,
                num_epochs=None,
                max_num_samples=train_valid_test_num_samples[index],
                max_seq_length=max_seq_length,
                seed=seed,
            )

            if dataset_type == DSET_TYPE_ICT:
                args = get_args()
                dataset = ICTDataset(
                    block_dataset=indexed_dataset,
                    title_dataset=title_dataset,
                    query_in_block_prob=args.query_in_block_prob,
                    use_one_sent_docs=args.use_one_sent_docs,
                    binary_head=binary_head,
                    **kwargs
                )
            elif dataset_type == DSET_TYPE_T5:
                dataset = T5Dataset(
                    indexed_dataset=indexed_dataset,
                    masked_lm_prob=masked_lm_prob,
                    max_seq_length_dec=max_seq_length_dec,
                    short_seq_prob=short_seq_prob,
                    **kwargs
                )
            elif dataset_type == DSET_TYPE_BERT:
                dataset = BertDataset(
                    indexed_dataset=indexed_dataset,
                    masked_lm_prob=masked_lm_prob,
                    short_seq_prob=short_seq_prob,
                    binary_head=binary_head,
                    **kwargs
                )
            else:
                raise NotImplementedError("Dataset type not fully implemented.")

            # Set the original pointer so dataset remains the main dataset.
            indexed_dataset.set_doc_idx(doc_idx_ptr)
            # Checks.
            assert indexed_dataset.doc_idx[0] == 0
            assert indexed_dataset.doc_idx.shape[0] == \
                (total_num_of_documents + 1)
        return dataset

    train_dataset = build_dataset(0, 'train')
    valid_dataset = build_dataset(1, 'valid')
    test_dataset = build_dataset(2, 'test')

    return (train_dataset, valid_dataset, test_dataset)


def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):

    print_rank_0(' > building dataset index ...')

    start_time = time.time()
    indexed_dataset = make_indexed_dataset(data_prefix,
                                           data_impl,
                                           skip_warmup)
    assert indexed_dataset.sizes.shape[0] == indexed_dataset.doc_idx[-1]
    print_rank_0(' > finished creating indexed dataset in {:4f} '
                 'seconds'.format(time.time() - start_time))

    print_rank_0(' > indexed dataset stats:')
    print_rank_0('    number of documents: {}'.format(
        indexed_dataset.doc_idx.shape[0] - 1))
    print_rank_0('    number of sentences: {}'.format(
        indexed_dataset.sizes.shape[0]))

    return indexed_dataset


def get_train_valid_test_split_(splits_string, size):
    """ Get dataset splits from comma or '/' separated string list."""

    splits = []
    if splits_string.find(',') != -1:
        splits = [float(s) for s in splits_string.split(',')]
    elif splits_string.find('/') != -1:
        splits = [float(s) for s in splits_string.split('/')]
    else:
        splits = [float(splits_string)]
    while len(splits) < 3:
        splits.append(0.)
    splits = splits[:3]
    splits_sum = sum(splits)
    assert splits_sum > 0.0
    splits = [split / splits_sum for split in splits]
    splits_index = [0]
    for index, split in enumerate(splits):
        splits_index.append(splits_index[index] +
                            int(round(split * float(size))))
    diff = splits_index[-1] - size
    for index in range(1, len(splits_index)):
        splits_index[index] -= diff
    assert len(splits_index) == 4
    assert splits_index[-1] == size
    return splits_index


def get_samples_mapping(indexed_dataset,
                        data_prefix,
                        num_epochs,
                        max_num_samples,
                        max_seq_length,
                        short_seq_prob,
                        seed,
                        name,
                        binary_head):
    """Get a list that maps a sample index to a starting sentence index,
    end sentence index, and length"""

    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
    indexmap_filename += '_{}s'.format(seed)
    indexmap_filename += '.npy'

    # Build the indexed mapping if not exist.
    if torch.distributed.get_rank() == 0 and \
       not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))

        # Make sure the types match the helpers input types.
        assert indexed_dataset.doc_idx.dtype == np.int64
        assert indexed_dataset.sizes.dtype == np.int32

        # Build samples mapping
        verbose = torch.distributed.get_rank() == 0
        start_time = time.time()
        print_rank_0(' > building samples index mapping for {} ...'.format(
            name))
        # First compile and then import.
        from megatron.data import helpers
        samples_mapping = helpers.build_mapping(
            indexed_dataset.doc_idx,
            indexed_dataset.sizes,
            num_epochs,
            max_num_samples,
            max_seq_length,
            short_seq_prob,
            seed,
            verbose,
            2 if binary_head else 1)
        print_rank_0(' > done building samples index maping')
        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
        print_rank_0(' > saved the index mapping in {}'.format(
            indexmap_filename))
        # Make sure all the ranks have built the mapping
        print_rank_0(' > elasped time to build and save samples mapping '
                     '(seconds): {:4f}'.format(
                         time.time() - start_time))
    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for model
    # parallel case
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group())
    assert counts[0].item() == (
        torch.distributed.get_world_size() //
        torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()))

    # Load indexed dataset.
    print_rank_0(' > loading indexed mapping from {}'.format(
        indexmap_filename))
    start_time = time.time()
    samples_mapping = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
        time.time() - start_time))
    print_rank_0('    total number of samples: {}'.format(
        samples_mapping.shape[0]))

    return samples_mapping
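
The split handling in get_train_valid_test_split_ is easiest to follow with a worked example. The standalone snippet below (not part of the commit; the "949,50,1" split and the document count of 1000 are assumed values) reproduces its arithmetic.

    # Worked example of the train/valid/test split arithmetic.
    splits = [float(s) for s in "949,50,1".split(',')]   # [949.0, 50.0, 1.0]
    splits = [s / sum(splits) for s in splits]           # [0.949, 0.05, 0.001]
    size = 1000                                          # total documents

    splits_index = [0]
    for index, split in enumerate(splits):
        splits_index.append(splits_index[index] + int(round(split * float(size))))
    # Push any rounding error into the boundaries so the last index equals size.
    diff = splits_index[-1] - size
    for index in range(1, len(splits_index)):
        splits_index[index] -= diff

    print(splits_index)  # [0, 949, 999, 1000]: train/valid/test document ranges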
3rdparty/Megatron-LM/megatron/data/gpt_dataset.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPT style dataset."""

import os
import time

import numpy as np
import torch

from megatron import mpu, print_rank_0
from megatron.data.blendable_dataset import BlendableDataset
from megatron.data.dataset_utils import get_datasets_weights_and_num_samples
from megatron.data.dataset_utils import get_train_valid_test_split_
from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset


def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                                    train_valid_test_num_samples,
                                    seq_length, seed, skip_warmup):
    """Build train, valid, and test datasets."""

    # Single dataset.
    if len(data_prefix) == 1:
        return _build_train_valid_test_datasets(data_prefix[0],
                                                data_impl, splits_string,
                                                train_valid_test_num_samples,
                                                seq_length, seed, skip_warmup)

    # Blending dataset.
    # Parse the values.
    output = get_datasets_weights_and_num_samples(data_prefix,
                                                  train_valid_test_num_samples)
    prefixes, weights, datasets_train_valid_test_num_samples = output

    # Build individual datasets.
    train_datasets = []
    valid_datasets = []
    test_datasets = []
    for i in range(len(prefixes)):
        train_ds, valid_ds, test_ds = _build_train_valid_test_datasets(
            prefixes[i], data_impl, splits_string,
            datasets_train_valid_test_num_samples[i],
            seq_length, seed, skip_warmup)
        if train_ds:
            train_datasets.append(train_ds)
        if valid_ds:
            valid_datasets.append(valid_ds)
        if test_ds:
            test_datasets.append(test_ds)

    # Blend.
    blending_train_dataset = None
    if train_datasets:
        blending_train_dataset = BlendableDataset(train_datasets, weights)
    blending_valid_dataset = None
    if valid_datasets:
        blending_valid_dataset = BlendableDataset(valid_datasets, weights)
    blending_test_dataset = None
    if test_datasets:
        blending_test_dataset = BlendableDataset(test_datasets, weights)

    return (blending_train_dataset, blending_valid_dataset,
            blending_test_dataset)


def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
                                     train_valid_test_num_samples,
                                     seq_length, seed, skip_warmup):
    """Build train, valid, and test datasets."""

    # Indexed dataset.
    indexed_dataset = get_indexed_dataset_(data_prefix,
                                           data_impl,
                                           skip_warmup)

    total_num_of_documents = indexed_dataset.sizes.shape[0]
    splits = get_train_valid_test_split_(splits_string, total_num_of_documents)

    # Print stats about the splits.
    print_rank_0(' > dataset split:')

    def print_split_stats(name, index):
        print_rank_0('    {}:'.format(name))
        print_rank_0('     document indices in [{}, {}) total of {} '
                     'documents'.format(splits[index], splits[index + 1],
                                        splits[index + 1] - splits[index]))
    print_split_stats('train', 0)
    print_split_stats('validation', 1)
    print_split_stats('test', 2)

    def build_dataset(index, name):
        dataset = None
        if splits[index + 1] > splits[index]:
            documents = np.arange(start=splits[index], stop=splits[index + 1],
                                  step=1, dtype=np.int32)
            dataset = GPTDataset(name, data_prefix,
                                 documents, indexed_dataset,
                                 train_valid_test_num_samples[index],
                                 seq_length, seed)
        return dataset

    train_dataset = build_dataset(0, 'train')
    valid_dataset = build_dataset(1, 'valid')
    test_dataset = build_dataset(2, 'test')

    return (train_dataset, valid_dataset, test_dataset)


def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
    """Build indexed dataset."""
    print_rank_0(' > building dataset index ...')

    start_time = time.time()
    indexed_dataset = make_indexed_dataset(data_prefix,
                                           data_impl,
                                           skip_warmup)
    print_rank_0(' > finished creating indexed dataset in {:4f} '
                 'seconds'.format(time.time() - start_time))
    print_rank_0('    number of documents: {}'.format(
        indexed_dataset.sizes.shape[0]))

    return indexed_dataset


class GPTDataset(torch.utils.data.Dataset):

    def __init__(self, name, data_prefix, documents, indexed_dataset,
                 num_samples, seq_length, seed):

        self.name = name
        self.indexed_dataset = indexed_dataset

        # Checks
        assert np.min(documents) >= 0
        assert np.max(documents) < indexed_dataset.sizes.shape[0]

        # Build index mappings.
        self.doc_idx, self.sample_idx, self.shuffle_idx = _build_index_mappings(
            self.name, data_prefix, documents, self.indexed_dataset.sizes,
            num_samples, seq_length, seed)

    def __len__(self):
        # -1 is due to data structure used to retieve the index:
        #    sample i --> [sample_idx[i], sample_idx[i+1])
        return self.sample_idx.shape[0] - 1

    def __getitem__(self, idx):
        # Get the shuffled index.
        idx = self.shuffle_idx[idx]
        # Start and end documents and offsets.
        doc_index_f = self.sample_idx[idx][0]
        doc_index_l = self.sample_idx[idx + 1][0]
        offset_f = self.sample_idx[idx][1]
        offset_l = self.sample_idx[idx + 1][1]
        # If we are within the same document, just extract the chunk.
        if doc_index_f == doc_index_l:
            sample = self.indexed_dataset.get(self.doc_idx[doc_index_f],
                                              offset=offset_f,
                                              length=offset_l - offset_f + 1)
        else:
            # Otherwise, get the rest of the initial document.
            sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f],
                                                    offset=offset_f)]
            # Loop over all in between documents and add the entire document.
            for i in range(doc_index_f + 1, doc_index_l):
                sample_list.append(self.indexed_dataset.get(self.doc_idx[i]))
            # And finally add the relevant portion of last document.
            sample_list.append(self.indexed_dataset.get(
                self.doc_idx[doc_index_l],
                length=offset_l + 1))
            sample = np.concatenate(sample_list)

        return {'text': np.array(sample, dtype=np.int64)}


def _build_index_mappings(name, data_prefix, documents, sizes,
                          num_samples, seq_length, seed):
    """Build doc-idx, sample-idx, and shuffle-idx.
    doc-idx: is an array (ordered) of documents to be used in training.
    sample-idx: is the start document index and document offset for each
       training sample.
    shuffle-idx: maps the sample index into a random index into sample-idx.
    """
    # Number of tokens in each epoch and number of required epochs.
    tokens_per_epoch = _num_tokens(documents, sizes)
    num_epochs = _num_epochs(tokens_per_epoch, seq_length, num_samples)
    # rng state
    np_rng = np.random.RandomState(seed=seed)

    # Filename of the index mappings.
    _filename = data_prefix
    _filename += '_{}_indexmap'.format(name)
    _filename += '_{}ns'.format(num_samples)
    _filename += '_{}sl'.format(seq_length)
    _filename += '_{}s'.format(seed)
    doc_idx_filename = _filename + '_doc_idx.npy'
    sample_idx_filename = _filename + '_sample_idx.npy'
    shuffle_idx_filename = _filename + '_shuffle_idx.npy'

    # Build the indexed mapping if not exist.
    if torch.distributed.get_rank() == 0:
        if (not os.path.isfile(doc_idx_filename)) or \
           (not os.path.isfile(sample_idx_filename)) or \
           (not os.path.isfile(shuffle_idx_filename)):

            print_rank_0(' > WARNING: could not find index map files, building '
                         'the indices on rank 0 ...')

            # For the last epoch, decide whether include the entire epoch
            # in the global shuffle or not.

            # If we need only one epoch, then separating last epoch does
            # not mean anything.
            if num_epochs == 1:
                separate_last_epoch = False
                print(' > only one epoch required, setting '
                      'separate_last_epoch to False', flush=True)

            else:
                # Get the number of samples for the last epoch
                num_samples_from_epochs_minus_one = (
                    (num_epochs - 1) * tokens_per_epoch - 1) // seq_length
                last_epoch_num_samples = num_samples - \
                                         num_samples_from_epochs_minus_one
                assert last_epoch_num_samples >= 0, \
                    'last epoch number of samples should be non-negative.'
                num_samples_per_epoch = (tokens_per_epoch - 1) // seq_length
                assert last_epoch_num_samples < (num_samples_per_epoch + 1), \
                    'last epoch number of samples exceeded max value.'
                # If we have less than 80% of the samples for the last epoch,
                # seperate out the epoch and treat it differently.
                # Note: the 80% number is just based on common sense and can
                # be adjusted if needed.
                separate_last_epoch = (last_epoch_num_samples <
                                       int(0.80 * num_samples_per_epoch))
                if separate_last_epoch:
                    string = ' > last epoch number of samples ({}) is smaller ' \
                             'than 80% of number of samples per epoch ({}), ' \
                             'setting separate_last_epoch to True'
                else:
                    string = ' > last epoch number of samples ({}) is larger ' \
                             'than 80% of number of samples per epoch ({}), ' \
                             'setting separate_last_epoch to False'
                print(string.format(last_epoch_num_samples,
                                    num_samples_per_epoch), flush=True)

            # doc-idx.
            start_time = time.time()
            doc_idx = _build_doc_idx(documents, num_epochs, np_rng,
                                     separate_last_epoch)
            np.save(doc_idx_filename, doc_idx, allow_pickle=True)
            print_rank_0(' > elasped time to build and save doc-idx mapping '
                         '(seconds): {:4f}'.format(time.time() - start_time))
            # sample-idx.
            start_time = time.time()
            # Use C++ implementation for speed.
            # First compile and then import.
            from megatron.data import helpers
            assert doc_idx.dtype == np.int32
            assert sizes.dtype == np.int32
            sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
                                                  num_epochs, tokens_per_epoch)
            # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
            #                               num_epochs, tokens_per_epoch)
            np.save(sample_idx_filename, sample_idx, allow_pickle=True)
            print_rank_0(' > elasped time to build and save sample-idx mapping '
                         '(seconds): {:4f}'.format(time.time() - start_time))
            # shuffle-idx.
            start_time = time.time()
            # -1 is due to data structure used to retieve the index:
            #    sample i --> [sample_idx[i], sample_idx[i+1])
            if separate_last_epoch:
                num_samples_ = num_samples_from_epochs_minus_one
            else:
                num_samples_ = sample_idx.shape[0] - 1
            shuffle_idx = _build_shuffle_idx(num_samples_,
                                             sample_idx.shape[0] - 1, np_rng)
            np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
            print_rank_0(' > elasped time to build and save shuffle-idx mapping'
                         ' (seconds): {:4f}'.format(time.time() - start_time))

    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for model
    # parallel case
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    torch.distributed.all_reduce(counts, group=mpu.get_pipeline_model_parallel_group())
    assert counts[0].item() == (
        torch.distributed.get_world_size() //
        torch.distributed.get_world_size(group=mpu.get_tensor_model_parallel_group()))

    # Load mappings.
    start_time = time.time()
    print_rank_0(' > loading doc-idx mapping from {}'.format(
        doc_idx_filename))
    doc_idx = np.load(doc_idx_filename, allow_pickle=True, mmap_mode='r')
    print_rank_0(' > loading sample-idx mapping from {}'.format(
        sample_idx_filename))
    sample_idx = np.load(sample_idx_filename, allow_pickle=True, mmap_mode='r')
    print_rank_0(' > loading shuffle-idx mapping from {}'.format(
        shuffle_idx_filename))
    shuffle_idx = np.load(shuffle_idx_filename, allow_pickle=True, mmap_mode='r')
    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
        time.time() - start_time))
    print_rank_0('    total number of samples: {}'.format(
        sample_idx.shape[0]))
    print_rank_0('    total number of epochs: {}'.format(num_epochs))

    return doc_idx, sample_idx, shuffle_idx


def _num_tokens(documents, sizes):
    """Total number of tokens in the dataset."""
    return np.sum(sizes[documents])


def _num_epochs(tokens_per_epoch, seq_length, num_samples):
    """Based on number of samples and sequence lenght, calculate how many
    epochs will be needed."""
    num_epochs = 0
    total_tokens = 0
    while True:
        num_epochs += 1
        total_tokens += tokens_per_epoch
        # -1 is because we need to retrieve seq_length + 1 token each time
        # but the last token will overlap with the first token of the next
        # sample except for the last sample.
        if ((total_tokens - 1) // seq_length) >= num_samples:
            return num_epochs


def _build_doc_idx(documents, num_epochs, np_rng, separate_last_epoch):
    """Build an array with length = number-of-epochs * number-of-dcuments.
    Each index is mapped to a corresponding document."""
    if not separate_last_epoch or num_epochs == 1:
        doc_idx = np.mgrid[0:num_epochs, 0:len(documents)][1]
        doc_idx[:] = documents
        doc_idx = doc_idx.reshape(-1)
        doc_idx = doc_idx.astype(np.int32)
        np_rng.shuffle(doc_idx)
        return doc_idx

    doc_idx_first = _build_doc_idx(documents, num_epochs - 1, np_rng, False)
    doc_idx_last = _build_doc_idx(documents, 1, np_rng, False)
    return np.concatenate((doc_idx_first, doc_idx_last))


def _build_sample_idx(sizes, doc_idx, seq_length,
                      num_epochs, tokens_per_epoch):
    """Sample index mapping is a 2D array with sizes
    [number-of-samples + 1, 2] where [..., 0] contains
    the index into `doc_idx` and [..., 1] is the
    starting offset in that document."""

    # Total number of samples. For -1 see comments in `_num_epochs`.
    num_samples = (num_epochs * tokens_per_epoch - 1) // seq_length
    sample_idx = np.zeros([num_samples + 1, 2], dtype=np.int32)

    # Index into sample_idx.
    sample_index = 0
    # Index into doc_idx.
    doc_idx_index = 0
    # Begining offset for each document.
    doc_offset = 0
    # Start with first document and no offset.
    sample_idx[sample_index][0] = doc_idx_index
    sample_idx[sample_index][1] = doc_offset
    sample_index += 1
    while sample_index <= num_samples:
        # Start with a fresh sequence.
        remaining_seq_length = seq_length + 1
        while remaining_seq_length != 0:
            # Get the document length.
            doc_id = doc_idx[doc_idx_index]
            doc_length = sizes[doc_id] - doc_offset
            # And add it to the current sequence.
            remaining_seq_length -= doc_length
            # If we have more than a full sequence, adjust offset and set
            # remaining length to zero so we return from the while loop.
            # Note that -1 here is for the same reason we have -1 in
            # `_num_epochs` calculations.
            if remaining_seq_length <= 0:
                doc_offset += (remaining_seq_length + doc_length - 1)
                remaining_seq_length = 0
            else:
                # Otherwise, start from the begining of the next document.
                doc_idx_index += 1
                doc_offset = 0
        # Record the sequence.
        sample_idx[sample_index][0] = doc_idx_index
        sample_idx[sample_index][1] = doc_offset
        sample_index += 1

    return sample_idx


def _build_shuffle_idx(num_samples, total_size, np_rng):
    """Build the range [0, size) and shuffle."""
    print(' > building shuffle index with split [0, {}) and [{}, {}) '
          '...'.format(num_samples, num_samples, total_size), flush=True)

    dtype_ = np.uint32
    if total_size >= (np.iinfo(np.uint32).max - 1):
        dtype_ = np.int64

    shuffle_idx_first = np.arange(start=0, stop=num_samples,
                                  step=1, dtype=dtype_)
    np_rng.shuffle(shuffle_idx_first)
    if num_samples == total_size:
        return shuffle_idx_first

    shuffle_idx_last = np.arange(start=num_samples, stop=total_size,
                                 step=1, dtype=dtype_)
    np_rng.shuffle(shuffle_idx_last)

    return np.concatenate((shuffle_idx_first, shuffle_idx_last))
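
The sample_idx layout built above pairs an index into doc_idx with a token offset, and sample i spans [sample_idx[i], sample_idx[i+1]). The standalone toy below (document sizes are made up for illustration, and it simply replays the packing loop of _build_sample_idx) shows how documents get packed into fixed-length samples.

    import numpy as np

    sizes = np.array([5, 3, 4], dtype=np.int32)    # tokens per document (toy values)
    doc_idx = np.array([0, 1, 2], dtype=np.int32)  # one epoch, unshuffled
    seq_length = 4
    num_samples = (sizes.sum() - 1) // seq_length  # 2 samples of seq_length tokens

    sample_idx = [(0, 0)]
    doc_idx_index, doc_offset = 0, 0
    for _ in range(num_samples):
        remaining = seq_length + 1                 # seq_length + 1 tokens per sample
        while remaining != 0:
            doc_length = sizes[doc_idx[doc_idx_index]] - doc_offset
            remaining -= doc_length
            if remaining <= 0:
                # Stop inside this document; the last token overlaps the next sample.
                doc_offset += remaining + doc_length - 1
                remaining = 0
            else:
                # Continue from the start of the next document.
                doc_idx_index += 1
                doc_offset = 0
        sample_idx.append((doc_idx_index, doc_offset))

    print(sample_idx)  # [(0, 0), (0, 4), (2, 0)]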
3rdparty/Megatron-LM/megatron/data/helpers.cpp  0 → 100644
/*
coding=utf-8
Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
/* Helper methods for fast index mapping builds */
#include <algorithm>
#include <iostream>
#include <limits>
#include <math.h>
#include <stdexcept>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <random>

namespace py = pybind11;
using namespace std;

const int32_t LONG_SENTENCE_LEN = 512;


void build_blending_indices(py::array_t<uint8_t>& dataset_index,
                            py::array_t<int64_t>& dataset_sample_index,
                            const py::array_t<double>& weights,
                            const int32_t num_datasets,
                            const int64_t size, const bool verbose) {
    /* Given multiple datasets and a weighting array, build samples
       such that it follows those wieghts.*/

    if (verbose) {
        std::cout << "> building indices for blendable datasets ..." << std::endl;
    }

    // Get the pointer access without the checks.
    auto dataset_index_ptr = dataset_index.mutable_unchecked<1>();
    auto dataset_sample_index_ptr = dataset_sample_index.mutable_unchecked<1>();
    auto weights_ptr = weights.unchecked<1>();

    // Initialize buffer for number of samples used for each dataset.
    int64_t current_samples[num_datasets];
    for (int64_t i = 0; i < num_datasets; ++i) {
        current_samples[i] = 0;
    }

    // For each sample:
    for (int64_t sample_idx = 0; sample_idx < size; ++sample_idx) {

        // Determine where the max error in sampling is happening.
        auto sample_idx_double = std::max(static_cast<double>(sample_idx), 1.0);
        int64_t max_error_index = 0;
        double max_error = weights_ptr[0] * sample_idx_double -
            static_cast<double>(current_samples[0]);
        for (int64_t dataset_idx = 1; dataset_idx < num_datasets; ++dataset_idx) {
            double error = weights_ptr[dataset_idx] * sample_idx_double -
                static_cast<double>(current_samples[dataset_idx]);
            if (error > max_error) {
                max_error = error;
                max_error_index = dataset_idx;
            }
        }

        // Populate the indices.
        dataset_index_ptr[sample_idx] = static_cast<uint8_t>(max_error_index);
        dataset_sample_index_ptr[sample_idx] = current_samples[max_error_index];

        // Update the total samples.
        current_samples[max_error_index] += 1;

    }

    // print info
    if (verbose) {
        std::cout << " > sample ratios:" << std::endl;
        for (int64_t dataset_idx = 0; dataset_idx < num_datasets; ++dataset_idx) {
            auto ratio = static_cast<double>(current_samples[dataset_idx]) /
                static_cast<double>(size);
            std::cout << "   dataset " << dataset_idx << ", input: " <<
                weights_ptr[dataset_idx] << ", achieved: " << ratio << std::endl;
        }
    }

}


py::array build_sample_idx(const py::array_t<int32_t>& sizes_,
                           const py::array_t<int32_t>& doc_idx_,
                           const int32_t seq_length,
                           const int32_t num_epochs,
                           const int64_t tokens_per_epoch) {
    /* Sample index (sample_idx) is used for gpt2 like dataset for which
       the documents are flattened and the samples are built based on this
       1-D flatten array. It is a 2D array with sizes [number-of-samples + 1, 2]
       where [..., 0] contains the index into `doc_idx` and [..., 1] is the
       starting offset in that document.*/

    // Consistency checks.
    assert(seq_length > 1);
    assert(num_epochs > 0);
    assert(tokens_per_epoch > 1);

    // Remove bound checks.
    auto sizes = sizes_.unchecked<1>();
    auto doc_idx = doc_idx_.unchecked<1>();

    // Mapping and it's length (1D).
    int64_t num_samples = (num_epochs * tokens_per_epoch - 1) / seq_length;
    int32_t* sample_idx = new int32_t[2 * (num_samples + 1)];

    cout << "    using:" << endl << std::flush;
    cout << "     number of documents:     " <<
        doc_idx_.shape(0) / num_epochs << endl << std::flush;
    cout << "     number of epochs:        " << num_epochs << endl << std::flush;
    cout << "     sequence length:         " << seq_length << endl << std::flush;
    cout << "     total number of samples: " << num_samples << endl << std::flush;

    // Index into sample_idx.
    int64_t sample_index = 0;
    // Index into doc_idx.
    int64_t doc_idx_index = 0;
    // Begining offset for each document.
    int32_t doc_offset = 0;
    // Start with first document and no offset.
    sample_idx[2 * sample_index] = doc_idx_index;
    sample_idx[2 * sample_index + 1] = doc_offset;
    ++sample_index;

    while (sample_index <= num_samples) {
        // Start with a fresh sequence.
        int32_t remaining_seq_length = seq_length + 1;
        while (remaining_seq_length != 0) {
            // Get the document length.
            auto doc_id = doc_idx[doc_idx_index];
            auto doc_length = sizes[doc_id] - doc_offset;
            // And add it to the current sequence.
            remaining_seq_length -= doc_length;
            // If we have more than a full sequence, adjust offset and set
            // remaining length to zero so we return from the while loop.
            // Note that -1 here is for the same reason we have -1 in
            // `_num_epochs` calculations.
            if (remaining_seq_length <= 0) {
                doc_offset += (remaining_seq_length + doc_length - 1);
                remaining_seq_length = 0;
            } else {
                // Otherwise, start from the begining of the next document.
                ++doc_idx_index;
                doc_offset = 0;
            }
        }
        // Record the sequence.
        sample_idx[2 * sample_index] = doc_idx_index;
        sample_idx[2 * sample_index + 1] = doc_offset;
        ++sample_index;
    }

    // Method to deallocate memory.
    py::capsule free_when_done(sample_idx, [](void *mem_) {
        int32_t *mem = reinterpret_cast<int32_t*>(mem_);
        delete[] mem;
    });

    // Return the numpy array.
    const auto byte_size = sizeof(int32_t);
    return py::array(std::vector<int64_t>{num_samples + 1, 2}, // shape
                     {2 * byte_size, byte_size}, // C-style contiguous strides
                     sample_idx, // the data pointer
                     free_when_done); // numpy array references

}


inline int32_t get_target_sample_len(const int32_t short_seq_ratio,
                                     const int32_t max_length,
                                     std::mt19937& rand32_gen) {
    /* Training sample length. */
    if (short_seq_ratio == 0) {
        return max_length;
    }
    const auto random_number = rand32_gen();
    if ((random_number % short_seq_ratio) == 0) {
        return 2 + random_number % (max_length - 1);
    }
    return max_length;
}


template<typename DocIdx>
py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
                             const py::array_t<int32_t>& sizes_,
                             const int32_t num_epochs,
                             const uint64_t max_num_samples,
                             const int32_t max_seq_length,
                             const double short_seq_prob,
                             const int32_t seed,
                             const bool verbose,
                             const int32_t min_num_sent) {
    /* Build a mapping of (start-index, end-index, sequence-length) where
       start and end index are the indices of the sentences in the sample
       and sequence-length is the target sequence length.
    */

    // Consistency checks.
    assert(num_epochs > 0);
    assert(max_seq_length > 1);
    assert(short_seq_prob >= 0.0);
    assert(short_seq_prob <= 1.0);
    assert(seed > 0);

    // Remove bound checks.
    auto docs = docs_.unchecked<1>();
    auto sizes = sizes_.unchecked<1>();

    // For efficiency, convert probability to ratio. Note: rand() generates int.
    int32_t short_seq_ratio = 0;
    if (short_seq_prob > 0) {
        short_seq_ratio = static_cast<int32_t>(round(1.0 / short_seq_prob));
    }

    if (verbose) {
        const auto sent_start_index = docs[0];
        const auto sent_end_index = docs[docs_.shape(0) - 1];
        const auto num_sentences = sent_end_index - sent_start_index;
        cout << "    using:" << endl << std::flush;
        cout << "     number of documents:            " << docs_.shape(0) - 1 <<
            endl << std::flush;
        cout << "     sentences range:                [" << sent_start_index <<
            ", " << sent_end_index << ")" << endl << std::flush;
        cout << "     total number of sentences:      " << num_sentences <<
            endl << std::flush;
        cout << "     number of epochs:               " << num_epochs <<
            endl << std::flush;
        cout << "     maximum number of samples:      " << max_num_samples <<
            endl << std::flush;
        cout << "     maximum sequence length:        " << max_seq_length <<
            endl << std::flush;
        cout << "     short sequence probability:     " << short_seq_prob <<
            endl << std::flush;
        cout << "     short sequence ration (1/prob): " << short_seq_ratio <<
            endl << std::flush;
        cout << "     seed:                           " << seed << endl <<
            std::flush;
    }

    // Mapping and it's length (1D).
    int64_t num_samples = -1;
    DocIdx* maps = NULL;

    // Perform two iterations, in the first iteration get the size
    // and allocate memory and in the second iteration populate the map.
    bool second = false;
    for (int32_t iteration = 0; iteration < 2; ++iteration) {

        // Set the seed so both iterations produce the same results.
        std::mt19937 rand32_gen(seed);

        // Set the flag on second iteration.
        second = (iteration == 1);

        // Counters:
        uint64_t empty_docs = 0;
        uint64_t one_sent_docs = 0;
        uint64_t long_sent_docs = 0;

        // Current map index.
        uint64_t map_index = 0;

        // For each epoch:
        for (int32_t epoch = 0; epoch < num_epochs; ++epoch) {
            if (map_index >= max_num_samples) {
                if (verbose && (!second)) {
                    cout << "    reached " << max_num_samples << " samples after "
                         << epoch << " epochs ..." << endl << std::flush;
                }
                break;
            }
            // For each document:
            for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) {

                // Document sentences are in [sent_index_first, sent_index_last)
                const auto sent_index_first = docs[doc];
                const auto sent_index_last = docs[doc + 1];

                // At the begining of the document previous index is the
                // start index.
auto
prev_start_index
=
sent_index_first
;
// Remaining documents.
auto
num_remain_sent
=
sent_index_last
-
sent_index_first
;
// Some bookkeeping
if
((
epoch
==
0
)
&&
(
!
second
))
{
if
(
num_remain_sent
==
0
)
{
++
empty_docs
;
}
if
(
num_remain_sent
==
1
)
{
++
one_sent_docs
;
}
}
// Detect documents with long sentences.
bool
contains_long_sentence
=
false
;
if
(
num_remain_sent
>
1
)
{
for
(
auto
sent_index
=
sent_index_first
;
sent_index
<
sent_index_last
;
++
sent_index
)
{
if
(
sizes
[
sent_index
]
>
LONG_SENTENCE_LEN
){
if
((
epoch
==
0
)
&&
(
!
second
))
{
++
long_sent_docs
;
}
contains_long_sentence
=
true
;
break
;
}
}
}
// If we have more than two sentences.
if
((
num_remain_sent
>=
min_num_sent
)
&&
(
!
contains_long_sentence
))
{
// Set values.
auto
seq_len
=
int32_t
{
0
};
auto
num_sent
=
int32_t
{
0
};
auto
target_seq_len
=
get_target_sample_len
(
short_seq_ratio
,
max_seq_length
,
rand32_gen
);
// Loop through sentences.
for
(
auto
sent_index
=
sent_index_first
;
sent_index
<
sent_index_last
;
++
sent_index
)
{
// Add the size and number of sentences.
seq_len
+=
sizes
[
sent_index
];
++
num_sent
;
--
num_remain_sent
;
// If we have reached the target length.
// and if not only one sentence is left in the document.
// and if we have at least two sentneces.
// and if we have reached end of the document.
if
(((
seq_len
>=
target_seq_len
)
&&
(
num_remain_sent
>
1
)
&&
(
num_sent
>=
min_num_sent
)
)
||
(
num_remain_sent
==
0
))
{
// Check for overflow.
if
((
3
*
map_index
+
2
)
>
std
::
numeric_limits
<
int64_t
>::
max
())
{
cout
<<
"number of samples exceeded maximum "
<<
"allowed by type int64: "
<<
std
::
numeric_limits
<
int64_t
>::
max
()
<<
endl
;
throw
std
::
overflow_error
(
"Number of samples"
);
}
// Populate the map.
if
(
second
)
{
const
auto
map_index_0
=
3
*
map_index
;
maps
[
map_index_0
]
=
static_cast
<
DocIdx
>
(
prev_start_index
);
maps
[
map_index_0
+
1
]
=
static_cast
<
DocIdx
>
(
sent_index
+
1
);
maps
[
map_index_0
+
2
]
=
static_cast
<
DocIdx
>
(
target_seq_len
);
}
// Update indices / counters.
++
map_index
;
prev_start_index
=
sent_index
+
1
;
target_seq_len
=
get_target_sample_len
(
short_seq_ratio
,
max_seq_length
,
rand32_gen
);
seq_len
=
0
;
num_sent
=
0
;
}
}
// for (auto sent_index=sent_index_first; ...
}
// if (num_remain_sent > 1) {
}
// for (int doc=0; doc < num_docs; ++doc) {
}
// for (int epoch=0; epoch < num_epochs; ++epoch) {
if
(
!
second
)
{
if
(
verbose
)
{
cout
<<
" number of empty documents: "
<<
empty_docs
<<
endl
<<
std
::
flush
;
cout
<<
" number of documents with one sentence: "
<<
one_sent_docs
<<
endl
<<
std
::
flush
;
cout
<<
" number of documents with long sentences: "
<<
long_sent_docs
<<
endl
<<
std
::
flush
;
cout
<<
" will create mapping for "
<<
map_index
<<
" samples"
<<
endl
<<
std
::
flush
;
}
assert
(
maps
==
NULL
);
assert
(
num_samples
<
0
);
maps
=
new
DocIdx
[
3
*
map_index
];
num_samples
=
static_cast
<
int64_t
>
(
map_index
);
}
}
// for (int iteration=0; iteration < 2; ++iteration) {
// Shuffle.
// We need a 64 bit random number generator as we might have more
// than 2 billion samples.
std
::
mt19937_64
rand64_gen
(
seed
+
1
);
for
(
auto
i
=
(
num_samples
-
1
);
i
>
0
;
--
i
)
{
const
auto
j
=
static_cast
<
int64_t
>
(
rand64_gen
()
%
(
i
+
1
));
const
auto
i0
=
3
*
i
;
const
auto
j0
=
3
*
j
;
// Swap values.
swap
(
maps
[
i0
],
maps
[
j0
]);
swap
(
maps
[
i0
+
1
],
maps
[
j0
+
1
]);
swap
(
maps
[
i0
+
2
],
maps
[
j0
+
2
]);
}
// Method to deallocate memory.
py
::
capsule
free_when_done
(
maps
,
[](
void
*
mem_
)
{
DocIdx
*
mem
=
reinterpret_cast
<
DocIdx
*>
(
mem_
);
delete
[]
mem
;
});
// Return the numpy array.
const
auto
byte_size
=
sizeof
(
DocIdx
);
return
py
::
array
(
std
::
vector
<
int64_t
>
{
num_samples
,
3
},
// shape
{
3
*
byte_size
,
byte_size
},
// C-style contiguous strides
maps
,
// the data pointer
free_when_done
);
// numpy array references
}
py
::
array
build_mapping
(
const
py
::
array_t
<
int64_t
>&
docs_
,
const
py
::
array_t
<
int
>&
sizes_
,
const
int
num_epochs
,
const
uint64_t
max_num_samples
,
const
int
max_seq_length
,
const
double
short_seq_prob
,
const
int
seed
,
const
bool
verbose
,
const
int32_t
min_num_sent
)
{
if
(
sizes_
.
size
()
>
std
::
numeric_limits
<
uint32_t
>::
max
())
{
if
(
verbose
)
{
cout
<<
" using uint64 for data mapping..."
<<
endl
<<
std
::
flush
;
}
return
build_mapping_impl
<
uint64_t
>
(
docs_
,
sizes_
,
num_epochs
,
max_num_samples
,
max_seq_length
,
short_seq_prob
,
seed
,
verbose
,
min_num_sent
);
}
else
{
if
(
verbose
)
{
cout
<<
" using uint32 for data mapping..."
<<
endl
<<
std
::
flush
;
}
return
build_mapping_impl
<
uint32_t
>
(
docs_
,
sizes_
,
num_epochs
,
max_num_samples
,
max_seq_length
,
short_seq_prob
,
seed
,
verbose
,
min_num_sent
);
}
}
template
<
typename
DocIdx
>
py
::
array
build_blocks_mapping_impl
(
const
py
::
array_t
<
int64_t
>&
docs_
,
const
py
::
array_t
<
int32_t
>&
sizes_
,
const
py
::
array_t
<
int32_t
>&
titles_sizes_
,
const
int32_t
num_epochs
,
const
uint64_t
max_num_samples
,
const
int32_t
max_seq_length
,
const
int32_t
seed
,
const
bool
verbose
,
const
bool
use_one_sent_blocks
)
{
/* Build a mapping of (start-index, end-index, sequence-length) where
start and end index are the indices of the sentences in the sample
and sequence-length is the target sequence length.
*/
// Consistency checks.
assert
(
num_epochs
>
0
);
assert
(
max_seq_length
>
1
);
assert
(
seed
>
0
);
// Remove bound checks.
auto
docs
=
docs_
.
unchecked
<
1
>
();
auto
sizes
=
sizes_
.
unchecked
<
1
>
();
auto
titles_sizes
=
titles_sizes_
.
unchecked
<
1
>
();
if
(
verbose
)
{
const
auto
sent_start_index
=
docs
[
0
];
const
auto
sent_end_index
=
docs
[
docs_
.
shape
(
0
)
-
1
];
const
auto
num_sentences
=
sent_end_index
-
sent_start_index
;
cout
<<
" using:"
<<
endl
<<
std
::
flush
;
cout
<<
" number of documents: "
<<
docs_
.
shape
(
0
)
-
1
<<
endl
<<
std
::
flush
;
cout
<<
" sentences range: ["
<<
sent_start_index
<<
", "
<<
sent_end_index
<<
")"
<<
endl
<<
std
::
flush
;
cout
<<
" total number of sentences: "
<<
num_sentences
<<
endl
<<
std
::
flush
;
cout
<<
" number of epochs: "
<<
num_epochs
<<
endl
<<
std
::
flush
;
cout
<<
" maximum number of samples: "
<<
max_num_samples
<<
endl
<<
std
::
flush
;
cout
<<
" maximum sequence length: "
<<
max_seq_length
<<
endl
<<
std
::
flush
;
cout
<<
" seed: "
<<
seed
<<
endl
<<
std
::
flush
;
}
// Mapping and its length (1D).
int64_t
num_samples
=
-
1
;
DocIdx
*
maps
=
NULL
;
// Acceptable number of sentences per block.
int
min_num_sent
=
2
;
if
(
use_one_sent_blocks
)
{
min_num_sent
=
1
;
}
// Perform two iterations, in the first iteration get the size
// and allocate memory and in the second iteration populate the map.
bool
second
=
false
;
for
(
int32_t
iteration
=
0
;
iteration
<
2
;
++
iteration
)
{
// Set the flag on second iteration.
second
=
(
iteration
==
1
);
// Current map index.
uint64_t
map_index
=
0
;
uint64_t
empty_docs
=
0
;
uint64_t
one_sent_docs
=
0
;
uint64_t
long_sent_docs
=
0
;
// For each epoch:
for
(
int32_t
epoch
=
0
;
epoch
<
num_epochs
;
++
epoch
)
{
// assign every block a unique id
int32_t
block_id
=
0
;
if
(
map_index
>=
max_num_samples
)
{
if
(
verbose
&&
(
!
second
))
{
cout
<<
" reached "
<<
max_num_samples
<<
" samples after "
<<
epoch
<<
" epochs ..."
<<
endl
<<
std
::
flush
;
}
break
;
}
// For each document:
for
(
int32_t
doc
=
0
;
doc
<
(
docs
.
shape
(
0
)
-
1
);
++
doc
)
{
// Document sentences are in [sent_index_first, sent_index_last)
const
auto
sent_index_first
=
docs
[
doc
];
const
auto
sent_index_last
=
docs
[
doc
+
1
];
const
auto
target_seq_len
=
max_seq_length
-
titles_sizes
[
doc
];
// At the begining of the document previous index is the
// start index.
auto
prev_start_index
=
sent_index_first
;
// Remaining documents.
auto
num_remain_sent
=
sent_index_last
-
sent_index_first
;
// Some bookkeeping
if
((
epoch
==
0
)
&&
(
!
second
))
{
if
(
num_remain_sent
==
0
)
{
++
empty_docs
;
}
if
(
num_remain_sent
==
1
)
{
++
one_sent_docs
;
}
}
// Detect documents with long sentences.
bool
contains_long_sentence
=
false
;
if
(
num_remain_sent
>=
min_num_sent
)
{
for
(
auto
sent_index
=
sent_index_first
;
sent_index
<
sent_index_last
;
++
sent_index
)
{
if
(
sizes
[
sent_index
]
>
LONG_SENTENCE_LEN
){
if
((
epoch
==
0
)
&&
(
!
second
))
{
++
long_sent_docs
;
}
contains_long_sentence
=
true
;
break
;
}
}
}
// If we have enough sentences and no long sentences.
if
((
num_remain_sent
>=
min_num_sent
)
&&
(
!
contains_long_sentence
))
{
// Set values.
auto
seq_len
=
int32_t
{
0
};
auto
num_sent
=
int32_t
{
0
};
// Loop through sentences.
for
(
auto
sent_index
=
sent_index_first
;
sent_index
<
sent_index_last
;
++
sent_index
)
{
// Add the size and number of sentences.
seq_len
+=
sizes
[
sent_index
];
++
num_sent
;
--
num_remain_sent
;
// If we have reached the target length.
// and there are an acceptable number of sentences left
// and if we have at least the minimum number of sentences.
// or if we have reached end of the document.
if
(((
seq_len
>=
target_seq_len
)
&&
(
num_remain_sent
>=
min_num_sent
)
&&
(
num_sent
>=
min_num_sent
)
)
||
(
num_remain_sent
==
0
))
{
// Populate the map.
if
(
second
)
{
const
auto
map_index_0
=
4
*
map_index
;
// Each sample has 4 items: the starting sentence index, ending sentence index,
// the index of the document from which the block comes (used for fetching titles)
// and the unique id of the block (used for creating block indexes)
maps
[
map_index_0
]
=
static_cast
<
DocIdx
>
(
prev_start_index
);
maps
[
map_index_0
+
1
]
=
static_cast
<
DocIdx
>
(
sent_index
+
1
);
maps
[
map_index_0
+
2
]
=
static_cast
<
DocIdx
>
(
doc
);
maps
[
map_index_0
+
3
]
=
static_cast
<
DocIdx
>
(
block_id
);
}
// Update indices / counters.
++
map_index
;
++
block_id
;
prev_start_index
=
sent_index
+
1
;
seq_len
=
0
;
num_sent
=
0
;
}
}
// for (auto sent_index=sent_index_first; ...
}
// if (num_remain_sent > 1) {
}
// for (int doc=0; doc < num_docs; ++doc) {
}
// for (int epoch=0; epoch < num_epochs; ++epoch) {
if
(
!
second
)
{
if
(
verbose
)
{
cout
<<
" number of empty documents: "
<<
empty_docs
<<
endl
<<
std
::
flush
;
cout
<<
" number of documents with one sentence: "
<<
one_sent_docs
<<
endl
<<
std
::
flush
;
cout
<<
" number of documents with long sentences: "
<<
long_sent_docs
<<
endl
<<
std
::
flush
;
cout
<<
" will create mapping for "
<<
map_index
<<
" samples"
<<
endl
<<
std
::
flush
;
}
assert
(
maps
==
NULL
);
assert
(
num_samples
<
0
);
maps
=
new
DocIdx
[
4
*
map_index
];
num_samples
=
static_cast
<
int64_t
>
(
map_index
);
}
}
// for (int iteration=0; iteration < 2; ++iteration) {
// Shuffle.
// We need a 64 bit random number generator as we might have more
// than 2 billion samples.
std
::
mt19937_64
rand64_gen
(
seed
+
1
);
for
(
auto
i
=
(
num_samples
-
1
);
i
>
0
;
--
i
)
{
const
auto
j
=
static_cast
<
int64_t
>
(
rand64_gen
()
%
(
i
+
1
));
const
auto
i0
=
4
*
i
;
const
auto
j0
=
4
*
j
;
// Swap values.
swap
(
maps
[
i0
],
maps
[
j0
]);
swap
(
maps
[
i0
+
1
],
maps
[
j0
+
1
]);
swap
(
maps
[
i0
+
2
],
maps
[
j0
+
2
]);
swap
(
maps
[
i0
+
3
],
maps
[
j0
+
3
]);
}
// Method to deallocate memory.
py
::
capsule
free_when_done
(
maps
,
[](
void
*
mem_
)
{
DocIdx
*
mem
=
reinterpret_cast
<
DocIdx
*>
(
mem_
);
delete
[]
mem
;
});
// Return the numpy array.
const
auto
byte_size
=
sizeof
(
DocIdx
);
return
py
::
array
(
std
::
vector
<
int64_t
>
{
num_samples
,
4
},
// shape
{
4
*
byte_size
,
byte_size
},
// C-style contiguous strides
maps
,
// the data pointer
free_when_done
);
// numpy array references
}
py
::
array
build_blocks_mapping
(
const
py
::
array_t
<
int64_t
>&
docs_
,
const
py
::
array_t
<
int
>&
sizes_
,
const
py
::
array_t
<
int
>&
titles_sizes_
,
const
int
num_epochs
,
const
uint64_t
max_num_samples
,
const
int
max_seq_length
,
const
int
seed
,
const
bool
verbose
,
const
bool
use_one_sent_blocks
)
{
if
(
sizes_
.
size
()
>
std
::
numeric_limits
<
uint32_t
>::
max
())
{
if
(
verbose
)
{
cout
<<
" using uint64 for data mapping..."
<<
endl
<<
std
::
flush
;
}
return
build_blocks_mapping_impl
<
uint64_t
>
(
docs_
,
sizes_
,
titles_sizes_
,
num_epochs
,
max_num_samples
,
max_seq_length
,
seed
,
verbose
,
use_one_sent_blocks
);
}
else
{
if
(
verbose
)
{
cout
<<
" using uint32 for data mapping..."
<<
endl
<<
std
::
flush
;
}
return
build_blocks_mapping_impl
<
uint32_t
>
(
docs_
,
sizes_
,
titles_sizes_
,
num_epochs
,
max_num_samples
,
max_seq_length
,
seed
,
verbose
,
use_one_sent_blocks
);
}
}
PYBIND11_MODULE
(
helpers
,
m
)
{
m
.
def
(
"build_mapping"
,
&
build_mapping
);
m
.
def
(
"build_blocks_mapping"
,
&
build_blocks_mapping
);
m
.
def
(
"build_sample_idx"
,
&
build_sample_idx
);
m
.
def
(
"build_blending_indices"
,
&
build_blending_indices
);
}
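The PYBIND11_MODULE block above is what the Python side of the data pipeline imports as megatron.data.helpers once the extension is compiled. As a minimal, illustrative sketch (the array values and the chosen seq_length are made up; only the signature comes from the listing above), build_sample_idx can be exercised like this:

# Hypothetical usage sketch of the compiled `helpers` extension; dtypes follow the
# signature above (int32 sizes and doc_idx), everything else is illustrative.
import numpy as np
from megatron.data import helpers   # assumes the extension has been built

sizes = np.array([5, 3, 9], dtype=np.int32)       # tokens per document
doc_idx = np.arange(len(sizes), dtype=np.int32)   # one epoch, unshuffled document order
tokens_per_epoch = int(sizes.sum())

# Returns a [num_samples + 1, 2] int32 array: column 0 is the index into doc_idx,
# column 1 is the starting token offset inside that document.
sample_idx = helpers.build_sample_idx(sizes, doc_idx,
                                      4,                 # seq_length
                                      1,                 # num_epochs
                                      tokens_per_epoch)
print(sample_idx.shape, sample_idx.dtype)                # (5, 2) int32 for these toy inputs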
3rdparty/Megatron-LM/megatron/data/ict_dataset.py  0 → 100644
import itertools
import random

import numpy as np
from torch.utils.data import Dataset

from megatron import get_tokenizer
from megatron import get_args
from megatron.data.dataset_utils import get_indexed_dataset_
from megatron.data.realm_dataset_utils import get_block_samples_mapping

def make_attention_mask(source_block, target_block):
    """
    Returns a 2-dimensional (2-D) attention mask
    :param source_block: 1-D array
    :param target_block: 1-D array
    """
    mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1)
    mask = mask.astype(np.int64)
    # (source_length, target_length)
    return mask

def get_ict_dataset(use_titles=True, query_in_block_prob=1):
    """Get a dataset which uses block samples mappings to get ICT/block indexing data (via get_block())
    rather than for training, since it is only built with a single epoch sample mapping.
    """
    args = get_args()
    block_dataset = get_indexed_dataset_(args.data_path, 'mmap', True)
    titles_dataset = get_indexed_dataset_(args.titles_data_path, 'mmap', True)

    kwargs = dict(
        name='full',
        block_dataset=block_dataset,
        title_dataset=titles_dataset,
        data_prefix=args.data_path,
        num_epochs=1,
        max_num_samples=None,
        max_seq_length=args.seq_length,
        seed=1,
        query_in_block_prob=query_in_block_prob,
        use_titles=use_titles,
        use_one_sent_docs=args.use_one_sent_docs
    )
    dataset = ICTDataset(**kwargs)
    return dataset

class ICTDataset(Dataset):
    """Dataset containing sentences and their blocks for an inverse cloze task."""
    def __init__(self, name, block_dataset, title_dataset, data_prefix,
                 num_epochs, max_num_samples, max_seq_length, query_in_block_prob,
                 seed, use_titles=True, use_one_sent_docs=False, binary_head=False):
        self.name = name
        self.seed = seed
        self.max_seq_length = max_seq_length
        self.query_in_block_prob = query_in_block_prob
        self.block_dataset = block_dataset
        self.title_dataset = title_dataset
        self.rng = random.Random(self.seed)
        self.use_titles = use_titles
        self.use_one_sent_docs = use_one_sent_docs

        self.samples_mapping = get_block_samples_mapping(
            block_dataset, title_dataset, data_prefix, num_epochs,
            max_num_samples, max_seq_length, seed, name, use_one_sent_docs)
        self.tokenizer = get_tokenizer()
        self.vocab_id_list = list(self.tokenizer.inv_vocab.keys())
        self.vocab_id_to_token_list = self.tokenizer.inv_vocab
        self.cls_id = self.tokenizer.cls
        self.sep_id = self.tokenizer.sep
        self.mask_id = self.tokenizer.mask
        self.pad_id = self.tokenizer.pad

    def __len__(self):
        return len(self.samples_mapping)

    def __getitem__(self, idx):
        """Get an ICT example of a pseudo-query and the block of text from which it was extracted"""
        sample_data = self.samples_mapping[idx]
        start_idx, end_idx, doc_idx, block_idx = sample_data.as_tuple()

        if self.use_titles:
            title = self.title_dataset[int(doc_idx)]
            title_pad_offset = 3 + len(title)
        else:
            title = None
            title_pad_offset = 2
        block = [self.block_dataset[i] for i in range(start_idx, end_idx)]
        assert len(block) > 1 or self.use_one_sent_docs or self.query_in_block_prob == 1

        # randint() is inclusive for Python rng
        rand_sent_idx = self.rng.randint(0, len(block) - 1)

        # keep the query in the context query_in_block_prob fraction of the time.
        if self.rng.random() < self.query_in_block_prob:
            query = block[rand_sent_idx].copy()
        else:
            query = block.pop(rand_sent_idx)

        # still need to truncate because blocks are concluded when
        # the sentence lengths have exceeded max_seq_length.
        query = query[:self.max_seq_length - 2]
        block = list(itertools.chain(*block))[:self.max_seq_length - title_pad_offset]

        query_tokens, query_pad_mask = self.concat_and_pad_tokens(query)
        context_tokens, context_pad_mask = self.concat_and_pad_tokens(block, title)

        query_mask = make_attention_mask(query_tokens, query_tokens)
        context_mask = make_attention_mask(context_tokens, context_tokens)

        block_data = sample_data.as_array()

        sample = {
            'query_tokens': query_tokens,
            'query_mask': query_mask,
            'query_pad_mask': query_pad_mask,
            'context_tokens': context_tokens,
            'context_mask': context_mask,
            'context_pad_mask': context_pad_mask,
            'block_data': block_data,
        }
        return sample

    def get_block(self, start_idx, end_idx, doc_idx):
        """Get the IDs for an evidence block plus the title of the corresponding document"""
        block = [self.block_dataset[i] for i in range(start_idx, end_idx)]
        title = self.title_dataset[int(doc_idx)]

        block = list(itertools.chain(*block))[:self.max_seq_length - (3 + len(title))]
        block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title)

        return block_tokens, block_pad_mask

    def get_null_block(self):
        """Get empty block and title - used in REALM pretraining"""
        block, title = [], []
        block_tokens, block_pad_mask = self.concat_and_pad_tokens(block, title)

        return block_tokens, block_pad_mask

    def concat_and_pad_tokens(self, tokens, title=None):
        """Concat with special tokens and pad sequence to self.max_seq_length"""
        tokens = list(tokens)
        if title is None:
            tokens = [self.cls_id] + tokens + [self.sep_id]
        else:
            title = list(title)
            tokens = [self.cls_id] + title + [self.sep_id] + tokens + [self.sep_id]
        assert len(tokens) <= self.max_seq_length

        num_pad = self.max_seq_length - len(tokens)
        pad_mask = [1] * len(tokens) + [0] * num_pad
        tokens += [self.pad_id] * num_pad

        return np.array(tokens), np.array(pad_mask)
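For reference, make_attention_mask in the listing above treats any id >= 1 as a real token and id 0 as padding. The following toy snippet (made-up ids; it simply mirrors the two-line body of the function) shows the resulting 2-D mask:

# Illustrative only: token id 0 stands for padding under the `>= 1` test above.
import numpy as np

source = np.array([101, 7592, 102, 0, 0])   # 3 real tokens, 2 pads
target = np.array([101, 2088, 102, 0])      # 3 real tokens, 1 pad
mask = ((target[None, :] >= 1) * (source[:, None] >= 1)).astype(np.int64)
print(mask.shape)   # (5, 4): rows for padded source positions and columns for
                    # padded target positions come out all zero.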
3rdparty/Megatron-LM/megatron/data/image_folder.py  0 → 100644
# BSD 3-Clause License
#
# Copyright (c) Soumith Chintala 2016,
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
#   list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# code taken from
# https://github.com/pytorch/vision/blob/main/torchvision/datasets/folder.py
# added support for classes_fraction and data_per_class_fraction

from torchvision.datasets import VisionDataset
from PIL import Image

import os
import os.path
from typing import Any, Callable, cast, Dict, List, Optional, Tuple
import numpy as np

def has_file_allowed_extension(filename: str, extensions: Tuple[str, ...]) -> bool:
    """Checks if a file is an allowed extension.
    Args:
        filename (string): path to a file
        extensions (tuple of strings): extensions to consider (lowercase)
    Returns:
        bool: True if the filename ends with one of given extensions
    """
    return filename.lower().endswith(extensions)

def is_image_file(filename: str) -> bool:
    """Checks if a file is an allowed image extension.
    Args:
        filename (string): path to a file
    Returns:
        bool: True if the filename ends with a known image extension
    """
    return has_file_allowed_extension(filename, IMG_EXTENSIONS)

def make_dataset(
    directory: str,
    class_to_idx: Dict[str, int],
    data_per_class_fraction: float,
    extensions: Optional[Tuple[str, ...]] = None,
    is_valid_file: Optional[Callable[[str], bool]] = None,
) -> List[Tuple[str, int]]:
    """Generates a list of samples of a form (path_to_sample, class).
    Args:
        directory (str): root dataset directory
        class_to_idx (Dict[str, int]): dictionary mapping class name to class index
        extensions (optional): A list of allowed extensions.
            Either extensions or is_valid_file should be passed. Defaults to None.
        is_valid_file (optional): A function that takes path of a file
            and checks if the file is a valid file
            (used to check of corrupt files) both extensions and
            is_valid_file should not be passed. Defaults to None.
    Raises:
        ValueError: In case ``extensions`` and ``is_valid_file`` are None or both are not None.
    Returns:
        List[Tuple[str, int]]: samples of a form (path_to_sample, class)
    """
    instances = []
    directory = os.path.expanduser(directory)
    both_none = extensions is None and is_valid_file is None
    both_something = extensions is not None and is_valid_file is not None
    if both_none or both_something:
        raise ValueError("Both extensions and is_valid_file cannot be None or not None at the same time")
    if extensions is not None:
        def is_valid_file(x: str) -> bool:
            return has_file_allowed_extension(x, cast(Tuple[str, ...], extensions))
    is_valid_file = cast(Callable[[str], bool], is_valid_file)
    for target_class in sorted(class_to_idx.keys()):
        class_index = class_to_idx[target_class]
        target_dir = os.path.join(directory, target_class)
        if not os.path.isdir(target_dir):
            continue
        local_instances = []
        for root, _, fnames in sorted(os.walk(target_dir, followlinks=True)):
            for fname in sorted(fnames):
                path = os.path.join(root, fname)
                if is_valid_file(path):
                    item = path, class_index
                    local_instances.append(item)

        instances.extend(local_instances[0:int(len(local_instances) * data_per_class_fraction)])

    return instances

class DatasetFolder(VisionDataset):
    """A generic data loader where the samples are arranged in this way: ::
        root/class_x/xxx.ext
        root/class_x/xxy.ext
        root/class_x/[...]/xxz.ext
        root/class_y/123.ext
        root/class_y/nsdf3.ext
        root/class_y/[...]/asd932_.ext
    Args:
        root (string): Root directory path.
        loader (callable): A function to load a sample given its path.
        extensions (tuple[string]): A list of allowed extensions.
            both extensions and is_valid_file should not be passed.
        transform (callable, optional): A function/transform that takes in
            a sample and returns a transformed version.
            E.g, ``transforms.RandomCrop`` for images.
        target_transform (callable, optional): A function/transform that takes
            in the target and transforms it.
        is_valid_file (callable, optional): A function that takes path of a file
            and check if the file is a valid file (used to check of corrupt files)
            both extensions and is_valid_file should not be passed.
    Attributes:
        classes (list): List of the class names sorted alphabetically.
        class_to_idx (dict): Dict with items (class_name, class_index).
        samples (list): List of (sample path, class_index) tuples
        targets (list): The class_index value for each image in the dataset
    """

    def __init__(
            self,
            root: str,
            loader: Callable[[str], Any],
            extensions: Optional[Tuple[str, ...]] = None,
            transform: Optional[Callable] = None,
            target_transform: Optional[Callable] = None,
            classes_fraction=1.0,
            data_per_class_fraction=1.0,
            is_valid_file: Optional[Callable[[str], bool]] = None,
    ) -> None:
        super(DatasetFolder, self).__init__(root, transform=transform,
                                            target_transform=target_transform)
        self.classes_fraction = classes_fraction
        self.data_per_class_fraction = data_per_class_fraction
        classes, class_to_idx = self._find_classes(self.root)
        samples = self.make_dataset(self.root,
                                    class_to_idx,
                                    self.data_per_class_fraction,
                                    extensions,
                                    is_valid_file)
        if len(samples) == 0:
            msg = "Found 0 files in subfolders of: {}\n".format(self.root)
            if extensions is not None:
                msg += "Supported extensions are: {}".format(",".join(extensions))
            raise RuntimeError(msg)

        self.loader = loader
        self.extensions = extensions
        self.total = len(samples)
        self.classes = classes
        self.class_to_idx = class_to_idx
        self.samples = samples
        self.targets = [s[1] for s in samples]

    @staticmethod
    def make_dataset(
        directory: str,
        class_to_idx: Dict[str, int],
        data_per_class_fraction: float,
        extensions: Optional[Tuple[str, ...]] = None,
        is_valid_file: Optional[Callable[[str], bool]] = None,
    ) -> List[Tuple[str, int]]:
        return make_dataset(directory,
                            class_to_idx,
                            data_per_class_fraction,
                            extensions=extensions,
                            is_valid_file=is_valid_file)

    def _find_classes(self, dir: str) -> Tuple[List[str], Dict[str, int]]:
        """
        Finds the class folders in a dataset.
        Args:
            dir (string): Root directory path.
        Returns:
            tuple: (classes, class_to_idx) where classes are relative to (dir), and class_to_idx is a dictionary.
        Ensures:
            No class is a subdirectory of another.
        """
        all_classes = [d.name for d in os.scandir(dir) if d.is_dir()]
        classes = all_classes[0:int(len(all_classes) * self.classes_fraction)]
        classes.sort()
        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_to_idx

    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        """
        Args:
            index (int): Index
        Returns:
            tuple: (sample, target) where target is class_index of the target class.
        """
        curr_index = index
        for x in range(self.total):
            try:
                path, target = self.samples[curr_index]
                sample = self.loader(path)
                break
            except Exception as e:
                curr_index = np.random.randint(0, self.total)

        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            target = self.target_transform(target)

        return sample, target

    def __len__(self) -> int:
        return len(self.samples)

IMG_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.ppm', '.bmp', '.pgm', '.tif', '.tiff', '.webp')

def pil_loader(path: str) -> Image.Image:
    # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
    with open(path, 'rb') as f:
        img = Image.open(f)
        return img.convert('RGB')

# TODO: specify the return type
def accimage_loader(path: str) -> Any:
    import accimage
    try:
        return accimage.Image(path)
    except IOError:
        # Potentially a decoding problem, fall back to PIL.Image
        return pil_loader(path)

def default_loader(path: str) -> Any:
    from torchvision import get_image_backend
    if get_image_backend() == 'accimage':
        return accimage_loader(path)
    else:
        return pil_loader(path)

class ImageFolder(DatasetFolder):
    """A generic data loader where the images are arranged in this way: ::
        root/dog/xxx.png
        root/dog/xxy.png
        root/dog/[...]/xxz.png
        root/cat/123.png
        root/cat/nsdf3.png
        root/cat/[...]/asd932_.png
    Args:
        root (string): Root directory path.
        transform (callable, optional): A function/transform that takes in an PIL image
            and returns a transformed version. E.g, ``transforms.RandomCrop``
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        loader (callable, optional): A function to load an image given its path.
        is_valid_file (callable, optional): A function that takes path of an Image file
            and check if the file is a valid file (used to check of corrupt files)
    Attributes:
        classes (list): List of the class names sorted alphabetically.
        class_to_idx (dict): Dict with items (class_name, class_index).
        imgs (list): List of (image path, class_index) tuples
    """

    def __init__(
            self,
            root: str,
            transform: Optional[Callable] = None,
            target_transform: Optional[Callable] = None,
            classes_fraction=1.0,
            data_per_class_fraction=1.0,
            loader: Callable[[str], Any] = default_loader,
            is_valid_file: Optional[Callable[[str], bool]] = None,
    ):
        super(ImageFolder, self).__init__(root,
                                          loader,
                                          IMG_EXTENSIONS if is_valid_file is None else None,
                                          transform=transform,
                                          target_transform=target_transform,
                                          classes_fraction=classes_fraction,
                                          data_per_class_fraction=data_per_class_fraction,
                                          is_valid_file=is_valid_file)
        self.imgs = self.samples
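The two knobs this copy of torchvision's folder loader adds, classes_fraction and data_per_class_fraction, subsample the class folders and the images inside each class. A small usage sketch (root path and fractions are illustrative; it assumes the usual class-per-subfolder layout):

# Illustrative only: directory layout and fractions are made up.
from megatron.data.image_folder import ImageFolder

# Keep the first half of the class folders and a quarter of the images per class,
# which is handy for quick data-scaling experiments.
dataset = ImageFolder(root='/data/imagenet/train',
                      classes_fraction=0.5,
                      data_per_class_fraction=0.25)
print(len(dataset), dataset.classes[:5])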
3rdparty/Megatron-LM/megatron/data/indexed_dataset.py  0 → 100644
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# copied from fairseq/fairseq/data/indexed_dataset.py
# Removed IndexedRawTextDataset since it relied on Fairseq dictionary
# other slight modifications to remove fairseq dependencies
# Added document index to index file and made it accessible.
#    An empty sentence no longer separates documents.

from functools import lru_cache
import os
import shutil
import struct
from itertools import accumulate

import numpy as np
import torch
from megatron import print_rank_0

def __best_fitting_dtype(vocab_size=None):
    if vocab_size is not None and vocab_size < 65500:
        return np.uint16
    else:
        return np.int32

def get_available_dataset_impl():
    return ['lazy', 'cached', 'mmap']

def infer_dataset_impl(path):
    if IndexedDataset.exists(path):
        with open(index_file_path(path), 'rb') as f:
            magic = f.read(8)
            if magic == IndexedDataset._HDR_MAGIC:
                return 'cached'
            elif magic == MMapIndexedDataset.Index._HDR_MAGIC[:8]:
                return 'mmap'
            else:
                return None
    else:
        print(f"Dataset does not exist: {path}")
        print("Path should be a basename that both .idx and .bin can be appended to get full filenames.")
        return None

def make_builder(out_file, impl, vocab_size=None):
    if impl == 'mmap':
        return MMapIndexedDatasetBuilder(out_file, dtype=__best_fitting_dtype(vocab_size))
    else:
        return IndexedDatasetBuilder(out_file)

def make_dataset(path, impl, skip_warmup=False):
    if not IndexedDataset.exists(path):
        print(f"Dataset does not exist: {path}")
        print("Path should be a basename that both .idx and .bin can be appended to get full filenames.")
        return None
    if impl == 'infer':
        impl = infer_dataset_impl(path)
    if impl == 'lazy' and IndexedDataset.exists(path):
        return IndexedDataset(path)
    elif impl == 'cached' and IndexedDataset.exists(path):
        return IndexedCachedDataset(path)
    elif impl == 'mmap' and MMapIndexedDataset.exists(path):
        return MMapIndexedDataset(path, skip_warmup)
    print(f"Unknown dataset implementation: {impl}")
    return None

def dataset_exists(path, impl):
    if impl == 'mmap':
        return MMapIndexedDataset.exists(path)
    else:
        return IndexedDataset.exists(path)

def read_longs(f, n):
    a = np.empty(n, dtype=np.int64)
    f.readinto(a)
    return a

def write_longs(f, a):
    f.write(np.array(a, dtype=np.int64))

dtypes = {
    1: np.uint8,
    2: np.int8,
    3: np.int16,
    4: np.int32,
    5: np.int64,
    6: np.float,
    7: np.double,
    8: np.uint16
}

def code(dtype):
    for k in dtypes.keys():
        if dtypes[k] == dtype:
            return k
    raise ValueError(dtype)

def index_file_path(prefix_path):
    return prefix_path + '.idx'

def data_file_path(prefix_path):
    return prefix_path + '.bin'

def create_doc_idx(sizes):
    doc_idx = [0]
    for i, s in enumerate(sizes):
        if s == 0:
            doc_idx.append(i + 1)
    return doc_idx

class IndexedDataset(torch.utils.data.Dataset):
    """Loader for IndexedDataset"""
    _HDR_MAGIC = b'TNTIDX\x00\x00'

    def __init__(self, path):
        super().__init__()
        self.path = path
        self.data_file = None
        self.read_index(path)

    def read_index(self, path):
        with open(index_file_path(path), 'rb') as f:
            magic = f.read(8)
            assert magic == self._HDR_MAGIC, (
                'Index file doesn\'t match expected format. '
                'Make sure that --dataset-impl is configured properly.'
            )
            version = f.read(8)
            assert struct.unpack('<Q', version) == (1,)
            code, self.element_size = struct.unpack('<QQ', f.read(16))
            self.dtype = dtypes[code]
            self._len, self.s = struct.unpack('<QQ', f.read(16))
            self.doc_count = struct.unpack('<Q', f.read(8))
            self.dim_offsets = read_longs(f, self._len + 1)
            self.data_offsets = read_longs(f, self._len + 1)
            self.sizes = read_longs(f, self.s)
            self.doc_idx = read_longs(f, self.doc_count)

    def read_data(self, path):
        self.data_file = open(data_file_path(path), 'rb', buffering=0)

    def check_index(self, i):
        if i < 0 or i >= self._len:
            raise IndexError('index out of range')

    def __del__(self):
        if self.data_file:
            self.data_file.close()

    # @lru_cache(maxsize=8)
    def __getitem__(self, idx):
        if not self.data_file:
            self.read_data(self.path)
        if isinstance(idx, int):
            i = idx
            self.check_index(i)
            tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]]
            a = np.empty(tensor_size, dtype=self.dtype)
            self.data_file.seek(self.data_offsets[i] * self.element_size)
            self.data_file.readinto(a)
            return a
        elif isinstance(idx, slice):
            start, stop, step = idx.indices(len(self))
            if step != 1:
                raise ValueError("Slices into indexed_dataset must be contiguous")
            sizes = self.sizes[self.dim_offsets[start]:self.dim_offsets[stop]]
            size = sum(sizes)
            a = np.empty(size, dtype=self.dtype)
            self.data_file.seek(self.data_offsets[start] * self.element_size)
            self.data_file.readinto(a)
            offsets = list(accumulate(sizes))
            sents = np.split(a, offsets[:-1])
            return sents

    def __len__(self):
        return self._len

    def num_tokens(self, index):
        return self.sizes[index]

    def size(self, index):
        return self.sizes[index]

    @staticmethod
    def exists(path):
        return (
            os.path.exists(index_file_path(path)) and
            os.path.exists(data_file_path(path))
        )

    @property
    def supports_prefetch(self):
        return False  # avoid prefetching to save memory

class IndexedCachedDataset(IndexedDataset):

    def __init__(self, path):
        super().__init__(path)
        self.cache = None
        self.cache_index = {}

    @property
    def supports_prefetch(self):
        return True

    def prefetch(self, indices):
        if all(i in self.cache_index for i in indices):
            return
        if not self.data_file:
            self.read_data(self.path)
        indices = sorted(set(indices))
        total_size = 0
        for i in indices:
            total_size += self.data_offsets[i + 1] - self.data_offsets[i]
        self.cache = np.empty(total_size, dtype=self.dtype)
        ptx = 0
        self.cache_index.clear()
        for i in indices:
            self.cache_index[i] = ptx
            size = self.data_offsets[i + 1] - self.data_offsets[i]
            a = self.cache[ptx: ptx + size]
            self.data_file.seek(self.data_offsets[i] * self.element_size)
            self.data_file.readinto(a)
            ptx += size
        if self.data_file:
            # close and delete data file after prefetch so we can pickle
            self.data_file.close()
            self.data_file = None

    # @lru_cache(maxsize=8)
    def __getitem__(self, idx):
        if isinstance(idx, int):
            i = idx
            self.check_index(i)
            tensor_size = self.sizes[self.dim_offsets[i]:self.dim_offsets[i + 1]]
            a = np.empty(tensor_size, dtype=self.dtype)
            ptx = self.cache_index[i]
            np.copyto(a, self.cache[ptx: ptx + a.size])
            return a
        elif isinstance(idx, slice):
            # Hack just to make this work, can optimizer later if necessary
            sents = []
            for i in range(*idx.indices(len(self))):
                sents.append(self[i])
            return sents

class IndexedDatasetBuilder(object):
    element_sizes = {
        np.uint8: 1,
        np.int8: 1,
        np.int16: 2,
        np.int32: 4,
        np.int64: 8,
        np.float: 4,
        np.double: 8
    }

    def __init__(self, out_file, dtype=np.int32):
        self.out_file = open(out_file, 'wb')
        self.dtype = dtype
        self.data_offsets = [0]
        self.dim_offsets = [0]
        self.sizes = []
        self.element_size = self.element_sizes[self.dtype]
        self.doc_idx = [0]

    def add_item(self, tensor):
        bytes = self.out_file.write(np.array(tensor.numpy(), dtype=self.dtype))
        self.data_offsets.append(self.data_offsets[-1] + bytes / self.element_size)
        for s in tensor.size():
            self.sizes.append(s)
        self.dim_offsets.append(self.dim_offsets[-1] + len(tensor.size()))

    def end_document(self):
        self.doc_idx.append(len(self.sizes))

    def merge_file_(self, another_file):
        index = IndexedDataset(another_file)
        assert index.dtype == self.dtype

        begin = self.data_offsets[-1]
        for offset in index.data_offsets[1:]:
            self.data_offsets.append(begin + offset)
        self.sizes.extend(index.sizes)
        begin = self.dim_offsets[-1]
        for dim_offset in index.dim_offsets[1:]:
            self.dim_offsets.append(begin + dim_offset)

        with open(data_file_path(another_file), 'rb') as f:
            while True:
                data = f.read(1024)
                if data:
                    self.out_file.write(data)
                else:
                    break

    def finalize(self, index_file):
        self.out_file.close()
        index = open(index_file, 'wb')
        index.write(b'TNTIDX\x00\x00')
        index.write(struct.pack('<Q', 1))
        index.write(struct.pack('<QQ', code(self.dtype), self.element_size))
        index.write(struct.pack('<QQ', len(self.data_offsets) - 1, len(self.sizes)))
        index.write(struct.pack('<Q', len(self.doc_idx)))
        write_longs(index, self.dim_offsets)
        write_longs(index, self.data_offsets)
        write_longs(index, self.sizes)
        write_longs(index, self.doc_idx)
        index.close()

def _warmup_mmap_file(path):
    with open(path, 'rb') as stream:
        while stream.read(100 * 1024 * 1024):
            pass

class MMapIndexedDataset(torch.utils.data.Dataset):
    class Index(object):
        _HDR_MAGIC = b'MMIDIDX\x00\x00'

        @classmethod
        def writer(cls, path, dtype):
            class _Writer(object):
                def __enter__(self):
                    self._file = open(path, 'wb')

                    self._file.write(cls._HDR_MAGIC)
                    self._file.write(struct.pack('<Q', 1))
                    self._file.write(struct.pack('<B', code(dtype)))

                    return self

                @staticmethod
                def _get_pointers(sizes):
                    dtype_size = dtype().itemsize
                    address = 0
                    pointers = []

                    for size in sizes:
                        pointers.append(address)
                        address += size * dtype_size

                    return pointers

                def write(self, sizes, doc_idx):
                    pointers = self._get_pointers(sizes)

                    self._file.write(struct.pack('<Q', len(sizes)))
                    self._file.write(struct.pack('<Q', len(doc_idx)))

                    sizes = np.array(sizes, dtype=np.int32)
                    self._file.write(sizes.tobytes(order='C'))
                    del sizes

                    pointers = np.array(pointers, dtype=np.int64)
                    self._file.write(pointers.tobytes(order='C'))
                    del pointers

                    doc_idx = np.array(doc_idx, dtype=np.int64)
                    self._file.write(doc_idx.tobytes(order='C'))

                def __exit__(self, exc_type, exc_val, exc_tb):
                    self._file.close()

            return _Writer()

        def __init__(self, path, skip_warmup=False):
            with open(path, 'rb') as stream:
                magic_test = stream.read(9)
                assert self._HDR_MAGIC == magic_test, (
                    'Index file doesn\'t match expected format. '
                    'Make sure that --dataset-impl is configured properly.'
                )
                version = struct.unpack('<Q', stream.read(8))
                assert (1,) == version

                dtype_code, = struct.unpack('<B', stream.read(1))
                self._dtype = dtypes[dtype_code]
                self._dtype_size = self._dtype().itemsize

                self._len = struct.unpack('<Q', stream.read(8))[0]
                self._doc_count = struct.unpack('<Q', stream.read(8))[0]
                offset = stream.tell()

            if not skip_warmup:
                print_rank_0("    warming up index mmap file...")
                _warmup_mmap_file(path)

            self._bin_buffer_mmap = np.memmap(path, mode='r', order='C')
            self._bin_buffer = memoryview(self._bin_buffer_mmap)
            print_rank_0("    reading sizes...")
            self._sizes = np.frombuffer(
                self._bin_buffer,
                dtype=np.int32,
                count=self._len,
                offset=offset)
            print_rank_0("    reading pointers...")
            self._pointers = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._len,
                                           offset=offset + self._sizes.nbytes)
            print_rank_0("    reading document index...")
            self._doc_idx = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._doc_count,
                                          offset=offset + self._sizes.nbytes + self._pointers.nbytes)

        def __del__(self):
            self._bin_buffer_mmap._mmap.close()
            del self._bin_buffer_mmap

        @property
        def dtype(self):
            return self._dtype

        @property
        def sizes(self):
            return self._sizes

        @property
        def doc_idx(self):
            return self._doc_idx

        @lru_cache(maxsize=8)
        def __getitem__(self, i):
            return self._pointers[i], self._sizes[i]

        def __len__(self):
            return self._len

    def __init__(self, path, skip_warmup=False):
        super().__init__()

        self._path = None
        self._index = None
        self._bin_buffer = None

        self._do_init(path, skip_warmup)

    def __getstate__(self):
        return self._path

    def __setstate__(self, state):
        self._do_init(state)

    def _do_init(self, path, skip_warmup):
        self._path = path
        self._index = self.Index(index_file_path(self._path), skip_warmup)

        if not skip_warmup:
            print_rank_0("    warming up data mmap file...")
            _warmup_mmap_file(data_file_path(self._path))
        print_rank_0("    creating numpy buffer of mmap...")
        self._bin_buffer_mmap = np.memmap(data_file_path(self._path), mode='r', order='C')
        print_rank_0("    creating memory view of numpy buffer...")
        self._bin_buffer = memoryview(self._bin_buffer_mmap)

    def __del__(self):
        self._bin_buffer_mmap._mmap.close()
        del self._bin_buffer_mmap
        del self._index

    def __len__(self):
        return len(self._index)

    # @lru_cache(maxsize=8)
    def __getitem__(self, idx):
        if isinstance(idx, int):
            ptr, size = self._index[idx]
            np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype,
                                     count=size, offset=ptr)
            return np_array
        elif isinstance(idx, slice):
            start, stop, step = idx.indices(len(self))
            if step != 1:
                raise ValueError("Slices into indexed_dataset must be contiguous")
            ptr = self._index._pointers[start]
            sizes = self._index._sizes[idx]
            offsets = list(accumulate(sizes))
            total_size = sum(sizes)
            np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype,
                                     count=total_size, offset=ptr)
            sents = np.split(np_array, offsets[:-1])
            return sents

    def get(self, idx, offset=0, length=None):
        """ Retrieves a single item from the dataset with the option to only
        return a portion of the item.

        get(idx) is the same as [idx] but get() does not support slicing.
        """
        ptr, size = self._index[idx]
        if length is None:
            length = size - offset
        ptr += offset * np.dtype(self._index.dtype).itemsize
        np_array = np.frombuffer(self._bin_buffer, dtype=self._index.dtype,
                                 count=length, offset=ptr)
        return np_array

    @property
    def sizes(self):
        return self._index.sizes

    @property
    def doc_idx(self):
        return self._index.doc_idx

    def get_doc_idx(self):
        return self._index._doc_idx

    def set_doc_idx(self, doc_idx_):
        self._index._doc_idx = doc_idx_

    @property
    def supports_prefetch(self):
        return False

    @staticmethod
    def exists(path):
        return (
            os.path.exists(index_file_path(path)) and
            os.path.exists(data_file_path(path))
        )

class MMapIndexedDatasetBuilder(object):
    def __init__(self, out_file, dtype=np.int64):
        self._data_file = open(out_file, 'wb')
        self._dtype = dtype
        self._sizes = []
        self._doc_idx = [0]

    def add_item(self, tensor):
        np_array = np.array(tensor.numpy(), dtype=self._dtype)
        self._data_file.write(np_array.tobytes(order='C'))
        self._sizes.append(np_array.size)

    def end_document(self):
        self._doc_idx.append(len(self._sizes))

    def merge_file_(self, another_file):
        # Concatenate index
        index = MMapIndexedDataset.Index(index_file_path(another_file))
        assert index.dtype == self._dtype

        for size in index.sizes:
            self._sizes.append(size)

        # Concatenate data
        with open(data_file_path(another_file), 'rb') as f:
            shutil.copyfileobj(f, self._data_file)

    def finalize(self, index_file):
        self._data_file.close()

        with MMapIndexedDataset.Index.writer(index_file, self._dtype) as index:
            index.write(self._sizes, self._doc_idx)
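A rough sketch of the mmap builder/reader round trip defined above (the prefix path and token ids are made up, and it assumes the megatron package and its dependencies are importable so that print_rank_0 resolves):

# Sketch only: file prefix and token ids are illustrative.
import torch
from megatron.data import indexed_dataset

prefix = '/tmp/my_corpus_text_document'           # produces prefix.bin / prefix.idx
builder = indexed_dataset.make_builder(prefix + '.bin', impl='mmap', vocab_size=32000)
for doc in [[101, 2023, 102], [101, 7592, 2088, 102]]:
    builder.add_item(torch.IntTensor(doc))        # one sentence/document at a time
    builder.end_document()                        # marks a document boundary
builder.finalize(prefix + '.idx')

ds = indexed_dataset.make_dataset(prefix, impl='mmap', skip_warmup=True)
print(len(ds), ds[0], ds.doc_idx)                 # 2 samples; doc_idx is [0, 1, 2]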
3rdparty/Megatron-LM/megatron/data/orqa_wiki_dataset.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Wikipedia dataset from DPR code for ORQA."""

from abc import ABC
import csv
import numpy as np
import random
import torch
from torch.utils.data import Dataset

from megatron import print_rank_0, get_args, get_tokenizer, mpu
from megatron.data.biencoder_dataset_utils import make_attention_mask

def get_open_retrieval_wiki_dataset():
    args = get_args()
    tokenizer = get_tokenizer()

    dataset = OpenRetrievalEvidenceDataset('2018 Wikipedia from DPR codebase',
                                           'evidence',
                                           args.evidence_data_path,
                                           tokenizer,
                                           args.retriever_seq_length)
    return dataset

def get_open_retrieval_batch(data_iterator):
    # Items and their type.
    keys = ['row_id', 'context', 'context_mask', 'context_types',
            'context_pad_mask']
    datatype = torch.int64

    # Broadcast data.
    data = None if data_iterator is None else next(data_iterator)
    data_b = mpu.broadcast_data(keys, data, datatype)

    # Unpack.
    row_id = data_b['row_id'].long()
    context = data_b['context'].long()

    # TODO: make the context mask a binary one
    context_mask = (data_b['context_mask'] < 0.5)

    context_types = data_b['context_types'].long()
    context_pad_mask = data_b['context_pad_mask'].long()

    return row_id, context, context_mask, context_types, context_pad_mask

def build_tokens_types_paddings_from_text(row, tokenizer, max_seq_length):
    """Build token types and paddings, trim if needed, and pad if needed."""

    title_ids = tokenizer.tokenize(row['title'])
    context_ids = tokenizer.tokenize(row['text'])

    # Appending the title of the context at front
    extended_context_ids = title_ids + [tokenizer.sep_id] + context_ids

    context_ids, context_types, context_pad_mask = \
        build_tokens_types_paddings_from_ids(extended_context_ids,
                                             max_seq_length,
                                             tokenizer.cls,
                                             tokenizer.sep,
                                             tokenizer.pad)

    return context_ids, context_types, context_pad_mask

# noinspection DuplicatedCode
def build_tokens_types_paddings_from_ids(text_ids, max_seq_length,
                                         cls_id, sep_id, pad_id):
    """Build token types and paddings, trim if needed, and pad if needed."""
    enc_ids = []
    tokentypes_enc = []

    # [CLS].
    enc_ids.append(cls_id)
    tokentypes_enc.append(0)

    # A.
    len_src = len(text_ids)
    enc_ids.extend(text_ids)
    tokentypes_enc.extend([0] * len_src)

    # Cap the size.
    if len(enc_ids) > max_seq_length - 1:
        enc_ids = enc_ids[0: max_seq_length - 1]
        tokentypes_enc = tokentypes_enc[0: max_seq_length - 1]

    # [SEP].
    enc_ids.append(sep_id)
    tokentypes_enc.append(0)

    num_tokens_enc = len(enc_ids)

    # Padding.
    padding_length = max_seq_length - len(enc_ids)
    if padding_length > 0:
        enc_ids.extend([pad_id] * padding_length)
        tokentypes_enc.extend([pad_id] * padding_length)

    pad_mask = ([1] * num_tokens_enc) + ([0] * padding_length)
    pad_mask = np.array(pad_mask, dtype=np.int64)

    return enc_ids, tokentypes_enc, pad_mask

def build_sample(row_id, context_ids, context_types, context_pad_mask):
    """Convert to numpy and return a sample consumed by the batch producer."""

    context_ids = np.array(context_ids, dtype=np.int64)
    context_types = np.array(context_types, dtype=np.int64)
    context_mask = make_attention_mask(context_ids, context_ids)

    sample = ({
        'row_id': row_id,
        'context': context_ids,
        'context_mask': context_mask,
        'context_types': context_types,
        'context_pad_mask': context_pad_mask
    })
    return sample

class OpenRetrievalEvidenceDataset(ABC, Dataset):
    """Open Retrieval Evidence dataset class."""

    def __init__(self, task_name, dataset_name, datapath, tokenizer,
                 max_seq_length):
        # Store inputs.
        self.task_name = task_name
        self.dataset_name = dataset_name
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        print_rank_0(' > building {} dataset for {}:'.format(self.task_name,
                                                             self.dataset_name))
        # Process the files.
        print_rank_0(datapath)
        self.samples, self.id2text = self.process_samples_from_single_path(datapath)

        args = get_args()
        if args.sample_rate < 1:  # subsample
            k = int(len(self.samples) * args.sample_rate)
            self.samples = random.sample(self.samples, k)

        print_rank_0('  >> total number of samples: {}'.format(len(self.samples)))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        row = self.samples[idx]

        context_ids, context_types, context_pad_mask = \
            build_tokens_types_paddings_from_text(row, self.tokenizer,
                                                  self.max_seq_length)

        sample = build_sample(row['doc_id'],
                              context_ids,
                              context_types,
                              context_pad_mask)
        return sample

    @staticmethod
    def process_samples_from_single_path(filename):
        print_rank_0(' > Processing {} ...'.format(filename))
        total = 0

        rows = []
        id2text = {}

        with open(filename) as tsvfile:
            reader = csv.reader(tsvfile, delimiter='\t')
            next(reader, None)  # skip the headers
            for row in reader:
                # file format: doc_id, doc_text, title
                doc_id = int(row[0])
                text = row[1]
                title = row[2]

                rows.append({'doc_id': doc_id,
                             'text': text,
                             'title': title})

                assert doc_id not in id2text
                id2text[doc_id] = (text, title)

                total += 1
                if total % 100000 == 0:
                    print_rank_0('  > processed {} rows so far ...'.format(total))

        print_rank_0(' >> processed {} samples.'.format(len(rows)))
        return rows, id2text
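As a quick illustration of the padding layout produced by build_tokens_types_paddings_from_ids above (toy ids only; it assumes the megatron package is importable):

# Toy ids: shows the [CLS] ... [SEP] + padding layout described above.
from megatron.data.orqa_wiki_dataset import build_tokens_types_paddings_from_ids

ids, types, pad_mask = build_tokens_types_paddings_from_ids(
    text_ids=[11, 12, 13], max_seq_length=8, cls_id=101, sep_id=102, pad_id=0)
print(ids)       # [101, 11, 12, 13, 102, 0, 0, 0]
print(types)     # [0, 0, 0, 0, 0, 0, 0, 0]
print(pad_mask)  # [1 1 1 1 1 0 0 0]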
3rdparty/Megatron-LM/megatron/data/realm_dataset_utils.py 0 → 100644  View file @ 0211193c

import os
import time

import numpy as np
import torch

from megatron import mpu, print_rank_0
from megatron.data.dataset_utils import create_masked_lm_predictions, pad_and_convert_to_numpy
from megatron import get_args, get_tokenizer, print_rank_0, mpu


def get_one_epoch_dataloader(dataset, micro_batch_size=None):
    """Specifically one epoch to be used in an indexing job."""
    args = get_args()

    world_size = mpu.get_data_parallel_world_size()
    rank = mpu.get_data_parallel_rank()
    if micro_batch_size is None:
        micro_batch_size = args.micro_batch_size
    global_batch_size = micro_batch_size * world_size
    num_workers = args.num_workers

    sampler = torch.utils.data.SequentialSampler(dataset)
    # importantly, drop_last must be False to get all the data.
    assert False, 'DistributedBatchSampler deprecated, change the implementation'
    from megatron.data.samplers import DistributedBatchSampler
    batch_sampler = DistributedBatchSampler(sampler,
                                            batch_size=global_batch_size,
                                            drop_last=False,
                                            rank=rank,
                                            world_size=world_size)

    return torch.utils.data.DataLoader(dataset,
                                       batch_sampler=batch_sampler,
                                       num_workers=num_workers,
                                       pin_memory=True)


def get_ict_batch(data_iterator):
    # Items and their type.
    keys = ['query_tokens', 'query_pad_mask',
            'block_tokens', 'block_pad_mask', 'block_data']
    datatype = torch.int64

    # Broadcast data.
    if data_iterator is None:
        data = None
    else:
        data = next(data_iterator)
    data_b = mpu.broadcast_data(keys, data, datatype)

    # Unpack.
    query_tokens = data_b['query_tokens'].long()
    query_pad_mask = data_b['query_pad_mask'].long()
    block_tokens = data_b['block_tokens'].long()
    block_pad_mask = data_b['block_pad_mask'].long()
    block_indices = data_b['block_data'].long()

    return query_tokens, query_pad_mask, \
        block_tokens, block_pad_mask, block_indices


def join_str_list(str_list):
    """Join a list of strings, handling spaces appropriately"""
    result = ""
    for s in str_list:
        if s.startswith("##"):
            result += s[2:]
        else:
            result += " " + s
    return result


class BlockSampleData(object):
    """A struct for fully describing a fixed-size block of data as used in REALM

    :param start_idx: for first sentence of the block
    :param end_idx: for last sentence of the block (may be partially truncated in sample construction)
    :param doc_idx: the index of the document from which the block comes in the original indexed dataset
    :param block_idx: a unique integer identifier given to every block.
    """
    def __init__(self, start_idx, end_idx, doc_idx, block_idx):
        self.start_idx = start_idx
        self.end_idx = end_idx
        self.doc_idx = doc_idx
        self.block_idx = block_idx

    def as_array(self):
        return np.array([self.start_idx, self.end_idx,
                         self.doc_idx, self.block_idx]).astype(np.int64)

    def as_tuple(self):
        return self.start_idx, self.end_idx, self.doc_idx, self.block_idx


class BlockSamplesMapping(object):
    def __init__(self, mapping_array):
        # make sure that the array is compatible with BlockSampleData
        assert mapping_array.shape[1] == 4
        self.mapping_array = mapping_array

    def __len__(self):
        return self.mapping_array.shape[0]

    def __getitem__(self, idx):
        """Get the data associated with an indexed sample."""
        sample_data = BlockSampleData(*self.mapping_array[idx])
        return sample_data


def get_block_samples_mapping(block_dataset, title_dataset, data_prefix, num_epochs,
                              max_num_samples, max_seq_length, seed, name,
                              use_one_sent_docs=False):
    """Get samples mapping for a dataset over fixed size blocks. This function also requires
    a dataset of the titles for the source documents since their lengths must be taken into account.

    :return: samples_mapping (BlockSamplesMapping)
    """

    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{}s'.format(seed)
    if use_one_sent_docs:
        indexmap_filename += '_1sentok'
    indexmap_filename += '.npy'

    # Build the indexed mapping if not exist.
    if mpu.get_data_parallel_rank() == 0 and \
            not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))

        # Make sure the types match the helpers input types.
        assert block_dataset.doc_idx.dtype == np.int64
        assert block_dataset.sizes.dtype == np.int32

        # Build samples mapping
        verbose = torch.distributed.get_rank() == 0
        start_time = time.time()
        print_rank_0(' > building samples index mapping for {} ...'.format(name))

        from megatron.data import helpers
        mapping_array = helpers.build_blocks_mapping(
            block_dataset.doc_idx,
            block_dataset.sizes,
            title_dataset.sizes,
            num_epochs,
            max_num_samples,
            max_seq_length - 3,  # account for added tokens
            seed,
            verbose,
            use_one_sent_docs)

        print_rank_0(' > done building samples index mapping')
        np.save(indexmap_filename, mapping_array, allow_pickle=True)
        print_rank_0(' > saved the index mapping in {}'.format(indexmap_filename))
        # Make sure all the ranks have built the mapping
        print_rank_0(' > elapsed time to build and save samples mapping '
                     '(seconds): {:4f}'.format(time.time() - start_time))

    # This should be a barrier but nccl barrier assumes
    # device_index=rank which is not the case for model
    # parallel case
    counts = torch.cuda.LongTensor([1])
    torch.distributed.all_reduce(counts, group=mpu.get_data_parallel_group())
    assert counts[0].item() == torch.distributed.get_world_size(
        group=mpu.get_data_parallel_group())

    # Load indexed dataset.
    print_rank_0(' > loading indexed mapping from {}'.format(indexmap_filename))
    start_time = time.time()

    mapping_array = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
    samples_mapping = BlockSamplesMapping(mapping_array)

    print_rank_0('    loaded indexed file in {:3.3f} seconds'.format(
        time.time() - start_time))
    print_rank_0('    total number of samples: {}'.format(
        mapping_array.shape[0]))

    return samples_mapping
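A quick, self-contained sketch of how the small helpers above behave; it relies only on the definitions in this file, and the numbers and WordPiece pieces are made up for illustration:

import numpy as np

# A tiny mapping with the 4 columns BlockSampleData expects:
# [start_idx, end_idx, doc_idx, block_idx].
mapping_array = np.array([[0, 3, 7, 0],
                          [3, 6, 7, 1]], dtype=np.int64)
samples_mapping = BlockSamplesMapping(mapping_array)
print(len(samples_mapping))        # 2
print(samples_mapping[1].as_tuple())  # (3, 6, 7, 1)

# join_str_list glues "##" continuation pieces back onto the previous
# piece and prepends a space otherwise, so the result keeps a leading space.
pieces = ["retrie", "##val", "augment", "##ed", "models"]
assert join_str_list(pieces) == " retrieval augmented models"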
3rdparty/Megatron-LM/megatron/data/realm_index.py 0 → 100644  View file @ 0211193c

import itertools
import os
import pickle
import shutil

import numpy as np
import torch

from megatron import get_args
from megatron import mpu


def detach(tensor):
    return tensor.detach().cpu().numpy()


class OpenRetreivalDataStore(object):
    """
    Serializable data structure for holding data for blocks --
    embeddings and necessary metadata for Retriever
    """
    def __init__(self, embedding_path=None, load_from_path=True, rank=None):
        self.embed_data = dict()
        if embedding_path is None:
            args = get_args()
            embedding_path = args.embedding_path
            rank = args.rank
        self.embedding_path = embedding_path
        self.rank = rank

        if load_from_path:
            self.load_from_file()

        block_data_name = os.path.splitext(self.embedding_path)[0]
        self.temp_dir_name = block_data_name + '_tmp'

    def state(self):
        return {
            'embed_data': self.embed_data,
        }

    def clear(self):
        """
        Clear the embedding data structures to save memory.
        The metadata ends up getting used, and is also much smaller in
        dimensionality so it isn't really worth clearing.
        """
        self.embed_data = dict()

    def load_from_file(self):
        """Populate members from instance saved to file"""

        if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
            print("\n> Unpickling BlockData", flush=True)
        state_dict = pickle.load(open(self.embedding_path, 'rb'))
        if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
            print(">> Finished unpickling BlockData\n", flush=True)

        self.embed_data = state_dict['embed_data']

    def add_block_data(self, row_id, block_embeds, allow_overwrite=False):
        """
        Add data for a set of blocks
        :param row_id: 1D array of unique int ids for the blocks
        :param block_embeds: 2D array of embeddings of the blocks
            In the case of retriever this will be [start_idx, end_idx, doc_idx]
        """
        for idx, embed in zip(row_id, block_embeds):
            if not allow_overwrite and idx in self.embed_data:
                raise ValueError("Unexpectedly tried to overwrite block data")

            self.embed_data[idx] = np.float16(embed)

    def save_shard(self):
        """
        Save the block data that was created in this process
        """
        if not os.path.isdir(self.temp_dir_name):
            os.makedirs(self.temp_dir_name, exist_ok=True)

        # save the data for each shard
        with open('{}/{}.pkl'.format(self.temp_dir_name, self.rank), 'wb') \
                as writer:
            pickle.dump(self.state(), writer)

    def merge_shards_and_save(self):
        # Combine all the shards made using save_shard
        shard_names = os.listdir(self.temp_dir_name)
        seen_own_shard = False

        for fname in os.listdir(self.temp_dir_name):
            shard_rank = int(os.path.splitext(fname)[0])
            if shard_rank == self.rank:
                seen_own_shard = True
                continue

            with open('{}/{}'.format(self.temp_dir_name, fname), 'rb') as f:
                data = pickle.load(f)
                old_size = len(self.embed_data)
                shard_size = len(data['embed_data'])

                # add the shard's data and check to make sure there
                # is no overlap
                self.embed_data.update(data['embed_data'])
                assert len(self.embed_data) == old_size + shard_size

        assert seen_own_shard

        # save the consolidated shards and remove temporary directory
        with open(self.embedding_path, 'wb') as final_file:
            pickle.dump(self.state(), final_file)
        shutil.rmtree(self.temp_dir_name, ignore_errors=True)

        print("Finished merging {} shards for a total of {} embeds".format(
            len(shard_names), len(self.embed_data)), flush=True)


class FaissMIPSIndex(object):
    """
    Wrapper object for a BlockData which does similarity search via FAISS under the hood
    """
    def __init__(self, embed_size, embed_data=None, use_gpu=False):
        self.embed_size = embed_size
        self.embed_data = embed_data
        self.use_gpu = use_gpu

        self.mips_index = None
        self._set_mips_index()

    def _set_mips_index(self):
        """
        Create a Faiss Flat index with inner product as the metric
        to search against
        """
        try:
            import faiss
        except ImportError:
            raise Exception("Error: Please install faiss to use FaissMIPSIndex")

        if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
            print("\n> Building index", flush=True)

        cpu_index = faiss.IndexFlatIP(self.embed_size)

        if self.use_gpu:
            # create resources and config for GpuIndex
            config = faiss.GpuMultipleClonerOptions()
            config.shard = True
            config.useFloat16 = True
            gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=config)
            self.mips_index = faiss.IndexIDMap(gpu_index)
            if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
                print(">> Initialized index on GPU", flush=True)
        else:
            # CPU index supports IDs so wrap with IDMap
            self.mips_index = faiss.IndexIDMap(cpu_index)
            if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
                print(">> Initialized index on CPU", flush=True)

        # if we were constructed with a BlockData, then automatically load it
        # when the FAISS structure is built
        if self.embed_data is not None:
            self.add_embed_data(self.embed_data)

    def reset_index(self):
        """Delete the existing index and create a new one"""
        del self.mips_index

        # reset the block data so that _set_block_index will reload it as well
        if self.embed_data is not None:
            embed_data_path = self.embed_data.embedding_path
            del self.embed_data
            self.embed_data = OpenRetreivalDataStore(embed_data_path)

        self._set_mips_index()

    def update_index(self):
        """Delete the existing index and create a new one"""
        del self.mips_index

        # reset the block data so that _set_mips_index will reload it as well
        if self.embed_data is not None:
            self.embed_data.load_from_file()
        self._set_mips_index()

    def add_embed_data(self, all_embed_data):
        """Add the embedding of each block to the underlying FAISS index"""

        # this assumes the embed_data is a dict : {int: np.array<float>}
        block_indices, block_embeds = zip(*all_embed_data.embed_data.items())

        # the embeddings have to be entered in as float32 even though the math
        # internally is done with float16.
        embeds_arr = np.float32(np.array(block_embeds))
        indices_arr = np.array(block_indices)

        # we no longer need the embedding data since it's in the index now
        all_embed_data.clear()

        self.mips_index.add_with_ids(embeds_arr, indices_arr)

        if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
            print(">>> Finished adding block data to index", flush=True)

    def search_mips_index(self, query_embeds, top_k, reconstruct=True):
        """
        Get the top-k blocks by the index distance metric.

        :param reconstruct: if True: return a [num_queries x k x embed_dim]
                                array of blocks
                            if False: return [num_queries x k] array of
                                distances, and another for indices
        """
        query_embeds = np.float32(detach(query_embeds))

        if reconstruct:
            # get the vectors themselves
            top_k_block_embeds = self.mips_index.search_and_reconstruct(
                query_embeds, top_k)
            return top_k_block_embeds
        else:
            # get distances and indices of closest vectors
            distances, block_indices = self.mips_index.search(query_embeds, top_k)
            return distances, block_indices
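The core of what FaissMIPSIndex does on the CPU path can be sketched with plain faiss and numpy, independently of megatron initialization. This is an illustrative, standalone sketch with random embeddings standing in for real block embeddings:

import numpy as np
import faiss  # required for this sketch, as for FaissMIPSIndex itself

embed_size = 128
rng = np.random.RandomState(0)

# Fake block embeddings keyed by integer block ids, mirroring the
# {int: np.array<float>} layout of OpenRetreivalDataStore.embed_data.
embed_data = {i: rng.rand(embed_size).astype(np.float32) for i in range(1000)}

# Same construction as the CPU branch above: a flat inner-product index
# wrapped in an IDMap so search results come back as block ids.
index = faiss.IndexIDMap(faiss.IndexFlatIP(embed_size))
ids = np.array(list(embed_data.keys()), dtype=np.int64)
embeds = np.stack([embed_data[i] for i in ids]).astype(np.float32)
index.add_with_ids(embeds, ids)

# Top-5 maximum-inner-product search for two queries, analogous to
# search_mips_index(..., reconstruct=False).
queries = rng.rand(2, embed_size).astype(np.float32)
distances, block_ids = index.search(queries, 5)
print(distances.shape, block_ids.shape)  # (2, 5) (2, 5)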
3rdparty/Megatron-LM/megatron/data/t5_dataset.py 0 → 100644  View file @ 0211193c

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""T5 Style dataset."""

import collections

import numpy as np
import torch

from megatron import get_tokenizer
from megatron.data.dataset_utils import (
    create_masked_lm_predictions,
    get_samples_mapping
)


class T5Dataset(torch.utils.data.Dataset):

    def __init__(self, name, indexed_dataset, data_prefix,
                 num_epochs, max_num_samples, masked_lm_prob,
                 max_seq_length, max_seq_length_dec,
                 short_seq_prob, seed):

        # Params to store.
        self.name = name
        self.seed = seed
        self.masked_lm_prob = masked_lm_prob
        self.max_seq_length = max_seq_length
        self.max_seq_length_dec = max_seq_length_dec

        # Dataset.
        self.indexed_dataset = indexed_dataset

        # Build the samples mapping.
        self.samples_mapping = get_samples_mapping(self.indexed_dataset,
                                                   data_prefix,
                                                   num_epochs,
                                                   max_num_samples,
                                                   self.max_seq_length - 2,  # account for added tokens
                                                   short_seq_prob,
                                                   self.seed,
                                                   self.name,
                                                   False)

        # Vocab stuff.
        tokenizer = get_tokenizer()
        self.vocab_id_list = list(tokenizer.inv_vocab.keys())
        self.vocab_id_to_token_dict = tokenizer.inv_vocab
        self.cls_id = tokenizer.cls
        self.sep_id = tokenizer.sep
        self.mask_id = tokenizer.mask
        self.pad_id = tokenizer.pad
        self.bos_id = tokenizer.bos_token_id
        self.eos_id = tokenizer.eos_token_id
        self.sentinel_tokens = tokenizer.additional_special_tokens_ids
        assert len(self.sentinel_tokens) > 0, \
            "Provide the argument --vocab-extra-ids 100 to the script"

    def __len__(self):
        return self.samples_mapping.shape[0]

    def __getitem__(self, idx):

        start_index, end_index, seq_length = self.samples_mapping[idx]
        sample = []
        for index in range(start_index, end_index):
            sample.append(self.indexed_dataset[index])
        # Note that this rng state should be numpy and not python since
        # python randint is inclusive whereas the numpy one is exclusive.
        np_rng = np.random.RandomState(seed=(self.seed + idx))
        return build_training_sample(sample, seq_length,
                                     self.max_seq_length,  # needed for padding
                                     self.max_seq_length_dec,
                                     self.vocab_id_list,
                                     self.vocab_id_to_token_dict,
                                     self.cls_id, self.sep_id,
                                     self.mask_id, self.pad_id,
                                     self.masked_lm_prob, np_rng,
                                     self.bos_id, self.eos_id,
                                     self.sentinel_tokens)


def build_training_sample(sample, target_seq_length,
                          max_seq_length, max_seq_length_dec,
                          vocab_id_list, vocab_id_to_token_dict,
                          cls_id, sep_id, mask_id, pad_id,
                          masked_lm_prob, np_rng, bos_id=None,
                          eos_id=None, sentinel_tokens=None):
    """Build training sample.

    Arguments:
        sample: A list of sentences in which each sentence is a list of token ids.
        target_seq_length: Desired sequence length.
        max_seq_length: Maximum length of the sequence. All values are padded to
            this length.
        vocab_id_list: List of vocabulary ids. Used to pick a random id.
        vocab_id_to_token_dict: A dictionary from vocab ids to text tokens.
        cls_id: Start of example id.
        sep_id: Separator id.
        mask_id: Mask token id.
        pad_id: Padding token id.
        masked_lm_prob: Probability to mask tokens.
        np_rng: Random number generator. Note that this rng state should be
            numpy and not python since python randint is inclusive for
            the upper bound whereas the numpy one is exclusive.
        bos_id: start of decoder example id
        eos_id: end of generation id
        sentinel_tokens: unique value to be substituted for every replaced span
    """

    assert target_seq_length <= max_seq_length

    # flatten sentences into one list
    tokens = [token for sentence in sample for token in sentence]

    # Truncate to `target_sequence_length`.
    max_num_tokens = target_seq_length
    truncated = len(tokens) > max_num_tokens
    tokens = tokens[:max_num_tokens]

    # Masking.
    max_predictions_per_seq = masked_lm_prob * max_num_tokens
    (tokens, masked_positions, masked_labels, _, masked_spans) = \
        create_masked_lm_predictions(
            tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob,
            cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng,
            max_ngrams=10, geometric_dist=True, masking_style="t5")

    # Padding.
    tokens_enc, tokens_dec_in, labels, enc_mask, \
        dec_mask, enc_dec_mask, loss_mask \
        = pad_and_convert_to_numpy(tokens, masked_positions,
                                   masked_labels, pad_id, max_seq_length,
                                   max_seq_length_dec, masked_spans,
                                   bos_id, eos_id, sentinel_tokens)

    train_sample = {
        'text_enc': tokens_enc,
        'text_dec': tokens_dec_in,
        'labels': labels,
        'loss_mask': loss_mask,
        'truncated': int(truncated),
        'enc_mask': enc_mask,
        'dec_mask': dec_mask,
        'enc_dec_mask': enc_dec_mask,
    }
    return train_sample


def pad_and_convert_to_numpy(tokens, masked_positions,
                             masked_labels, pad_id,
                             max_seq_length, max_seq_length_dec,
                             masked_spans=None, bos_id=None,
                             eos_id=None, sentinel_tokens=None):
    """Pad sequences and convert them to numpy."""

    sentinel_tokens = collections.deque(sentinel_tokens)
    t5_input = []
    (t5_decoder_in, t5_decoder_out) = ([bos_id], [])
    (start_index, end_index) = (0, None)
    for span in masked_spans:
        flag = sentinel_tokens.popleft()

        # Append the same tokens in decoder input and output
        t5_decoder_in.append(flag)
        t5_decoder_in.extend(span.label)
        t5_decoder_out.append(flag)
        t5_decoder_out.extend(span.label)

        end_index = span.index[0]
        t5_input.extend(tokens[start_index: end_index])
        t5_input.append(flag)

        # the next start index is the token after the last span token
        start_index = span.index[-1] + 1

    # Add <eos> token to the t5_decoder_out
    t5_decoder_out.append(eos_id)

    # Add the remaining tokens to the t5 input
    t5_input.extend(tokens[start_index:])

    # assert (len(t5_input) - len(masked_spans)) + \
    #        (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens)

    # Some checks.

    # Encoder-side padding mask.
    num_tokens = len(t5_input)
    padding_length = max_seq_length - num_tokens
    assert padding_length >= 0
    assert len(masked_positions) == len(masked_labels)

    # Tokens..
    filler = [pad_id] * padding_length
    tokens_enc = np.array(t5_input + filler, dtype=np.int64)

    # Decoder-side padding mask.
    num_tokens_dec = len(t5_decoder_in)
    padding_length_dec = max_seq_length_dec - num_tokens_dec
    assert padding_length_dec >= 0
    filler_dec = [pad_id] * padding_length_dec
    tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64)

    # Create attention masks
    enc_mask = make_attention_mask(tokens_enc, tokens_enc)
    enc_dec_mask = make_attention_mask(tokens_dec_in, tokens_enc)
    dec_mask = make_attention_mask(tokens_dec_in, tokens_dec_in)
    dec_mask = dec_mask * make_history_mask(tokens_dec_in)

    # Labels mask.
    labels = t5_decoder_out + ([-1] * padding_length_dec)
    labels = np.array(labels, dtype=np.int64)

    # Loss mask
    loss_mask = ([1] * num_tokens_dec) + ([0] * padding_length_dec)
    loss_mask = np.array(loss_mask, dtype=np.int64)

    return tokens_enc, tokens_dec_in, labels, enc_mask, \
        dec_mask, enc_dec_mask, loss_mask


def make_attention_mask(source_block, target_block):
    """
    Returns a 2-dimensional (2-D) attention mask
    :param source_block: 1-D array
    :param target_block: 1-D array
    """
    mask = (target_block[None, :] >= 1) * (source_block[:, None] >= 1)
    mask = mask.astype(np.int64)
    # (source_length, target_length)
    return mask


def make_attention_mask_3d(source_block, target_block):
    """
    Returns a 3-dimensional (3-D) attention mask
    :param source_block: 1-D array
    :param target_block: 1-D array
    """
    mask = (target_block[:, None, :] >= 1) * (source_block[:, :, None] >= 1)
    # (batch, source_length, target_length)
    # mask = mask.astype(np.int64)
    return mask


def make_history_mask(block):
    length = block.shape[0]
    arange = np.arange(length)
    history_mask = (arange[None, ] <= arange[:, None])
    history_mask = history_mask.astype(np.int64)
    return history_mask


def make_history_mask_3d(block):
    batch, length = block.shape
    arange = torch.arange(length, device=block.device)
    history_mask = (arange[None, ] <= arange[:, None])[None, ]
    history_mask = history_mask.expand(batch, length, length)
    return history_mask
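The mask helpers at the end of this file are pure numpy, so their behaviour is easy to check in isolation. A small sketch with made-up token ids (0 is treated as padding because the masks test `>= 1`):

import numpy as np

tokens_enc = np.array([101, 7592, 2088, 102, 0, 0])  # 4 real tokens, 2 pads
tokens_dec = np.array([5, 17, 23, 0])                # 3 real tokens, 1 pad

enc_mask = make_attention_mask(tokens_enc, tokens_enc)
dec_mask = make_attention_mask(tokens_dec, tokens_dec) * make_history_mask(tokens_dec)

print(enc_mask.sum())  # 16 == 4 real tokens x 4 real tokens
print(dec_mask)
# [[1 0 0 0]
#  [1 1 0 0]
#  [1 1 1 0]
#  [0 0 0 0]]   <- causal (lower-triangular) on real tokens, zero on the pad row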
3rdparty/Megatron-LM/megatron/data/test/test_indexed_dataset.py 0 → 100644  View file @ 0211193c

# This file isn't really a formal automated test, it's just a place to
# put some code used during development and manual testing of
# indexed_dataset.

from megatron.data import indexed_dataset
from megatron.tokenizer import build_tokenizer
import argparse
import os
import sys

import torch

script_dir = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(script_dir, "../../../"))


def test_indexed_dataset(args):
    ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
    tokenizer = build_tokenizer(args)
    print(len(ds.doc_idx))
    print(len(ds))
    print(ds.doc_idx[-1])
    if ds.supports_prefetch:
        # just prefetch the whole thing in test (so assume it is small)
        ds.prefetch(range(len(ds)))
    if args.count > len(ds.doc_idx) - 1:
        args.count = len(ds.doc_idx) - 1

    for i in range(args.count):
        start = ds.doc_idx[i]
        end = ds.doc_idx[i + 1]
        ids = ds[start:end]
        print(f"Document {i}:")
        print("--------------")
        for s in ids:
            assert len(s) > 0
            l = s.data.tolist()
            text = tokenizer.detokenize(l)
            print(text)
            print("---")


def test_indexed_dataset_get(args):
    ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
    tokenizer = build_tokenizer(args)
    size = ds.sizes[0]
    print(f"size: {size}")
    full = ds.get(0)
    print(full)
    # print(tokenizer.detokenize(full.data.tolist()))
    print("---")
    end = ds.get(0, offset=size - 10)
    print(end)
    # print(tokenizer.detokenize(end.data.tolist()))
    start = ds.get(0, length=10)
    print(start)
    # print(tokenizer.detokenize(start.data.tolist()))
    part = ds.get(0, offset=2, length=8)
    print(part)
    # print(tokenizer.detokenize(part.data.tolist()))

# def test_albert_dataset(args):
#     # tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True)
#     # idataset = indexed_dataset.make_dataset(args.data, args.dataset_impl)
#     # ds = AlbertDataset(idataset, tokenizer)
#     ds = AlbertDataset.from_paths(args.vocab, args.data, args.dataset_impl,
#                                   args.epochs, args.max_num_samples,
#                                   args.masked_lm_prob, args.seq_length,
#                                   args.short_seq_prob, args.seed)
#     truncated = 0
#     total = 0
#     for i, s in enumerate(ds):
#         ids = s['text']
#         tokens = ds.tokenizer.convert_ids_to_tokens(ids)
#         print(tokens)
#         if i >= args.count-1:
#             exit()


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, help='prefix to data files')
    parser.add_argument('--dataset-impl', type=str, default='infer',
                        choices=['lazy', 'cached', 'mmap', 'infer'])
    parser.add_argument('--count', type=int, default=10,
                        help='Number of samples/documents to print')

    group = parser.add_argument_group(title='tokenizer')
    group.add_argument('--tokenizer-type', type=str, required=True,
                       choices=['BertWordPieceLowerCase', 'GPT2BPETokenizer'],
                       help='What type of tokenizer to use.')
    group.add_argument('--vocab-file', type=str, default=None,
                       help='Path to the vocab file')
    group.add_argument('--merge-file', type=str, default=None,
                       help='Path to the BPE merge file (if necessary).')

    parser.add_argument('--epochs', type=int, default=5,
                        help='Number of epochs to plan for')
    parser.add_argument('--max-num-samples', type=int, default=None,
                        help='Maximum number of samples to plan for')
    parser.add_argument('--masked-lm-prob', type=float, default=0.15,
                        help='probability of masking tokens')
    parser.add_argument('--seq-length', type=int, default=512,
                        help='maximum sequence length')
    parser.add_argument('--short-seq-prob', type=float, default=0.1,
                        help='probability of creating a short sequence')
    parser.add_argument('--seed', type=int, default=1234,
                        help='random seed')
    args = parser.parse_args()
    args.rank = 0
    args.make_vocab_size_divisible_by = 128
    args.tensor_model_parallel_size = 1

    if args.dataset_impl == "infer":
        args.dataset_impl = indexed_dataset.infer_dataset_impl(args.data)

    # test_albert_dataset(args)
    test_indexed_dataset_get(args)


if __name__ == "__main__":
    main()
3rdparty/Megatron-LM/megatron/data/test/test_preprocess_data.sh 0 → 100644  View file @ 0211193c

#!/bin/bash

IMPL=cached
python ../preprocess_data.py \
       --input test_samples.json \
       --vocab vocab.txt \
       --dataset-impl ${IMPL} \
       --output-prefix test_samples_${IMPL} \
       --workers 1 \
       --log-interval 2
3rdparty/Megatron-LM/megatron/data/vit_dataset.py 0 → 100644  View file @ 0211193c

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import random
import numpy as np
import torch
import torchvision.transforms as T
from torchvision import datasets
from megatron import get_args
from megatron.data.image_folder import ImageFolder
from megatron.data.autoaugment import ImageNetPolicy
from megatron.data.data_samplers import RandomSeedDataset


class ClassificationTransform():
    def __init__(self, image_size, train=True):
        args = get_args()
        assert args.fp16 or args.bf16
        self.data_type = torch.half if args.fp16 else torch.bfloat16
        if train:
            self.transform = T.Compose([
                T.RandomResizedCrop(image_size),
                T.RandomHorizontalFlip(),
                T.ColorJitter(0.4, 0.4, 0.4, 0.1),
                ImageNetPolicy(),
                T.ToTensor(),
                T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
                T.ConvertImageDtype(self.data_type)
            ])
        else:
            self.transform = T.Compose([
                T.Resize(image_size),
                T.CenterCrop(image_size),
                T.ToTensor(),
                T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
                T.ConvertImageDtype(self.data_type)
            ])

    def __call__(self, input):
        output = self.transform(input)
        return output


def build_train_valid_datasets(data_path, image_size=224):
    args = get_args()
    train_transform = ClassificationTransform(image_size)
    val_transform = ClassificationTransform(image_size, train=False)

    # training dataset
    train_data_path = data_path[0]
    train_data = ImageFolder(
        root=train_data_path,
        transform=train_transform,
        classes_fraction=args.classes_fraction,
        data_per_class_fraction=args.data_per_class_fraction
    )
    train_data = RandomSeedDataset(train_data)

    # validation dataset
    val_data_path = data_path[1]
    val_data = ImageFolder(
        root=val_data_path,
        transform=val_transform
    )
    val_data = RandomSeedDataset(val_data)

    return train_data, val_data
3rdparty/Megatron-LM/megatron/dist_signal_handler.py 0 → 100644  View file @ 0211193c

import signal

import torch


def get_world_size():
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        world_size = torch.distributed.get_world_size()
    else:
        world_size = 1
    return world_size


def get_device(local_rank=None):
    backend = torch.distributed.get_backend()
    if backend == 'nccl':
        if local_rank is None:
            device = torch.device('cuda')
        else:
            device = torch.device(f'cuda:{local_rank}')
    elif backend == 'gloo':
        device = torch.device('cpu')
    else:
        raise RuntimeError
    return device


def all_gather_item(item, dtype, group=None, async_op=False, local_rank=None):
    if not torch.distributed.is_available() or \
            not torch.distributed.is_initialized():
        return [item]

    device = get_device(local_rank)

    if group is not None:
        group_size = group.size()
    else:
        group_size = get_world_size()

    tensor = torch.tensor([item], device=device, dtype=dtype)
    output_tensors = [
        torch.zeros(1, dtype=tensor.dtype, device=tensor.device)
        for _ in range(group_size)
    ]
    torch.distributed.all_gather(output_tensors, tensor, group, async_op)
    output = [elem.item() for elem in output_tensors]
    return output


class DistributedSignalHandler:
    def __init__(self, sig=signal.SIGTERM):
        self.sig = sig

    def signals_received(self):
        all_received = all_gather_item(
            self._signal_received, dtype=torch.int32
        )
        return all_received

    def __enter__(self):
        self._signal_received = False
        self.released = False
        self.original_handler = signal.getsignal(self.sig)

        def handler(signum, frame):
            self._signal_received = True

        signal.signal(self.sig, handler)

        return self

    def __exit__(self, type, value, tb):
        self.release()

    def release(self):
        if self.released:
            return False

        signal.signal(self.sig, self.original_handler)
        self.released = True
        return True
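DistributedSignalHandler is meant to wrap a training loop so a SIGTERM can be noticed and acted on at a step boundary. A minimal single-process sketch of that usage (with torch.distributed left uninitialized, signals_received() simply returns a one-element list, so no collective is involved):

import signal
import time

with DistributedSignalHandler(signal.SIGTERM) as handler:
    for step in range(1000):
        time.sleep(0.01)  # stand-in for one training step
        if any(handler.signals_received()):
            # In a real job every rank sees the same decision because
            # signals_received() is an all-gather across data-parallel ranks.
            print(f"SIGTERM seen at step {step}, stopping cleanly")
            break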
3rdparty/Megatron-LM/megatron/fp16_deprecated/loss_scaler.py 0 → 100644  View file @ 0211193c

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""For backward compatibility, we need the class definitions to deserialize."""

class LossScaler:
    def __init__(self, scale=1):
        self.cur_scale = scale

class DynamicLossScaler:
    def __init__(self,
                 init_scale=2**32,
                 scale_factor=2.,
                 scale_window=1000,
                 min_scale=1,
                 delayed_shift=1,
                 consecutive_hysteresis=False):
        self.cur_scale = init_scale
        self.cur_iter = 0
        self.last_overflow_iter = -1
        self.scale_factor = scale_factor
        self.scale_window = scale_window
        self.min_scale = min_scale
        self.delayed_shift = delayed_shift
        self.cur_hysteresis = delayed_shift
        self.consecutive_hysteresis = consecutive_hysteresis
3rdparty/Megatron-LM/megatron/fused_kernels/__init__.py 0 → 100644  View file @ 0211193c

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import pathlib
import subprocess

from torch.utils import cpp_extension

# Setting this param to a list has a problem of generating different
# compilation commands (with different order of architectures) and
# leading to recompilation of fused kernels. Set it to empty string
# to avoid recompilation and assign arch flags explicitly in
# extra_cuda_cflags below
os.environ["TORCH_CUDA_ARCH_LIST"] = ""


def load(args):

    # Check if cuda 11 is installed for compute capability 8.0
    cc_flag = []
    _, bare_metal_major, _ = _get_cuda_bare_metal_version(
        cpp_extension.CUDA_HOME)
    if int(bare_metal_major) >= 11:
        cc_flag.append('-gencode')
        cc_flag.append('arch=compute_80,code=sm_80')

    # Build path
    srcpath = pathlib.Path(__file__).parent.absolute()
    buildpath = srcpath / 'build'
    _create_build_dir(buildpath)

    # Helper function to build the kernels.
    def _cpp_extention_load_helper(name, sources, extra_cuda_flags):
        return cpp_extension.load(
            name=name,
            sources=sources,
            build_directory=buildpath,
            extra_cflags=['-O3', ],
            extra_cuda_cflags=['-O3',
                               '-gencode', 'arch=compute_70,code=sm_70',
                               '--use_fast_math'] + extra_cuda_flags + cc_flag,
            verbose=(args.rank == 0)
        )

    # ==============
    # Fused softmax.
    # ==============

    if args.masked_softmax_fusion:
        extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__',
                            '-U__CUDA_NO_HALF_CONVERSIONS__',
                            '--expt-relaxed-constexpr',
                            '--expt-extended-lambda']

        # Upper triangular softmax.
        sources = [srcpath / 'scaled_upper_triang_masked_softmax.cpp',
                   srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu']
        scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper(
            "scaled_upper_triang_masked_softmax_cuda",
            sources, extra_cuda_flags)

        # Masked softmax.
        sources = [srcpath / 'scaled_masked_softmax.cpp',
                   srcpath / 'scaled_masked_softmax_cuda.cu']
        scaled_masked_softmax_cuda = _cpp_extention_load_helper(
            "scaled_masked_softmax_cuda", sources, extra_cuda_flags)

        # Softmax
        sources = [srcpath / 'scaled_softmax.cpp',
                   srcpath / 'scaled_softmax_cuda.cu']
        scaled_softmax_cuda = _cpp_extention_load_helper(
            "scaled_softmax_cuda", sources, extra_cuda_flags)

    # =================================
    # Mixed precision fused layer norm.
    # =================================

    extra_cuda_flags = ['-maxrregcount=50']
    sources = [srcpath / 'layer_norm_cuda.cpp',
               srcpath / 'layer_norm_cuda_kernel.cu']
    fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper(
        "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags)


def _get_cuda_bare_metal_version(cuda_dir):
    raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"],
                                         universal_newlines=True)
    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")
    bare_metal_major = release[0]
    bare_metal_minor = release[1][0]

    return raw_output, bare_metal_major, bare_metal_minor


def _create_build_dir(buildpath):
    try:
        os.mkdir(buildpath)
    except OSError:
        if not os.path.isdir(buildpath):
            print(f"Creation of the build directory {buildpath} failed")
3rdparty/Megatron-LM/megatron/fused_kernels/compat.h 0 → 100644  View file @ 0211193c

/* coding=utf-8
 * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* This code is copied from NVIDIA apex:
 *     https://github.com/NVIDIA/apex
 * with minor changes. */

#ifndef TORCH_CHECK
#define TORCH_CHECK AT_CHECK
#endif

#ifdef VERSION_GE_1_3
#define DATA_PTR data_ptr
#else
#define DATA_PTR data
#endif
3rdparty/Megatron-LM/megatron/fused_kernels/layer_norm_cuda.cpp 0 → 100644  View file @ 0211193c

/* coding=utf-8
 * Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/* This code is copied from NVIDIA apex:
 *     https://github.com/NVIDIA/apex
 * with minor changes. */

#include <torch/extension.h>
#include <vector>
#include <cassert>
#include "compat.h"

namespace {

void compute_n1_n2(
    at::Tensor input,
    at::IntArrayRef normalized_shape,
    int& n1,
    int& n2) {
  int idiff = input.ndimension() - normalized_shape.size();
  n2 = 1;
  for (int i = 0; i < (int)normalized_shape.size(); ++i) {
    assert(input.sizes()[i + idiff] == normalized_shape[i]);
    n2 *= normalized_shape[i];
  }
  n1 = 1;
  for (int i = 0; i < idiff; ++i) {
    n1 *= input.sizes()[i];
  }
}

void check_args(
    at::IntArrayRef normalized_shape,
    at::Tensor gamma,
    at::Tensor beta) {
  TORCH_CHECK(!gamma.defined() || gamma.sizes().equals(normalized_shape));
  TORCH_CHECK(!beta.defined() || beta.sizes().equals(normalized_shape));
}

void check_args(
    at::Tensor input,
    at::IntArrayRef normalized_shape,
    int& n1,
    int& n2) {
  int64_t normalized_ndim = normalized_shape.size();

  if (normalized_ndim < 1) {
    std::stringstream ss;
    ss << "Expected normalized_shape to be at least 1-dimensional, i.e., "
       << "containing at least one element, but got normalized_shape="
       << normalized_shape;
    throw std::runtime_error(ss.str());
  }

  auto input_shape = input.sizes();
  auto input_ndim = input.dim();

  if (input_ndim < normalized_ndim ||
      !input_shape.slice(input_ndim - normalized_ndim).equals(normalized_shape)) {
    std::stringstream ss;
    ss << "Given normalized_shape=" << normalized_shape
       << ", expected input with shape [*";
    for (auto size : normalized_shape) {
      ss << ", " << size;
    }
    ss << "], but got input of size" << input_shape;
    throw std::runtime_error(ss.str());
  }

  compute_n1_n2(input, normalized_shape, n1, n2);
}

void check_args(
    at::Tensor input,
    at::IntArrayRef normalized_shape,
    at::Tensor gamma,
    at::Tensor beta,
    int& n1,
    int& n2) {
  check_args(input, normalized_shape, n1, n2);
  check_args(normalized_shape, gamma, beta);
}

}  // namespace

void cuda_layer_norm(
    at::Tensor* output,
    at::Tensor* mean,
    at::Tensor* invvar,
    at::Tensor* input,
    int n1,
    int n2,
    at::IntArrayRef normalized_shape,
    at::Tensor* gamma,
    at::Tensor* beta,
    double epsilon);

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

std::vector<at::Tensor> layer_norm_affine(
    at::Tensor input,
    at::IntArrayRef normalized_shape,
    at::Tensor gamma,
    at::Tensor beta,
    double epsilon) {
  CHECK_INPUT(input);
  CHECK_INPUT(gamma);
  CHECK_INPUT(beta);
  int n1, n2;
  check_args(input, normalized_shape, gamma, beta, n1, n2);

  at::Tensor output = at::empty_like(
      input, gamma.options().dtype(gamma.scalar_type()));
  at::Tensor mean = at::empty(
      {n1}, input.options().dtype(at::ScalarType::Float));
  at::Tensor invvar = at::empty_like(mean);

  cuda_layer_norm(&output, &mean, &invvar, &input, n1, n2,
                  normalized_shape, &gamma, &beta, epsilon);

  return {output, mean, invvar};
}

void cuda_layer_norm_gradient(
    at::Tensor* dout,
    at::Tensor* mean,
    at::Tensor* invvar,
    at::Tensor* input,
    int n1,
    int n2,
    at::IntArrayRef normalized_shape,
    at::Tensor* gamma,
    at::Tensor* beta,
    double epsilon,
    at::Tensor* grad_input,
    at::Tensor* grad_gamma,
    at::Tensor* grad_beta);

std::vector<at::Tensor> layer_norm_gradient_affine(
    at::Tensor dout,
    at::Tensor mean,
    at::Tensor invvar,
    at::Tensor input,
    at::IntArrayRef normalized_shape,
    at::Tensor gamma,
    at::Tensor beta,
    double epsilon) {
  CHECK_INPUT(dout);
  CHECK_INPUT(mean);
  CHECK_INPUT(invvar);
  CHECK_INPUT(input);
  CHECK_INPUT(gamma);
  CHECK_INPUT(beta);
  int n1, n2;
  check_args(input, normalized_shape, gamma, beta, n1, n2);

  at::Tensor grad_input = at::empty_like(input);
  at::Tensor grad_gamma = at::empty_like(gamma);
  at::Tensor grad_beta = at::empty_like(beta);

  cuda_layer_norm_gradient(&dout, &mean, &invvar, &input, n1, n2,
                           normalized_shape, &gamma, &beta, epsilon,
                           &grad_input, &grad_gamma, &grad_beta);

  return {grad_input, grad_gamma, grad_beta};
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward_affine", &layer_norm_affine,
        "LayerNorm forward (CUDA)");
  m.def("backward_affine", &layer_norm_gradient_affine,
        "LayerNorm backward (CUDA)");
}