dcuai / dlexamples · Commits · 316d3f90

Commit 316d3f90, authored Jul 14, 2022 by Pan,Huiwen
Commit message: Add DeepSpeed framework test models
Parent: aebde649
Changes: 227 files
Showing 20 changed files with 4246 additions and 0 deletions (+4246, -0):

  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/mpu/tests/test_layers.py        +530  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/mpu/tests/test_random.py        +204  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/mpu/utils.py                    +70   -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/package_info.py                 +30   -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/text_generation_utils.py        +412  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/tokenizer/__init__.py           +17   -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/tokenizer/bert_tokenization.py  +402  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/tokenizer/gpt2_tokenization.py  +321  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/tokenizer/tokenizer.py          +220  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py                     +695  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/utils.py                        +177  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/pretrain_bert.py                         +123  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/pretrain_gpt2.py                         +158  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/pretrain_ict.py                          +138  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/requirements.txt                         +5    -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/setup.py                                 +91   -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/tasks/data_utils.py                      +118  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/tasks/ensemble_classifier.py             +149  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/tasks/eval_utils.py                      +127  -0
  Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/tasks/finetune_utils.py                  +259  -0
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/mpu/tests/test_layers.py  (new file, 0 → 100644)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from mpu import layers
from commons import set_random_seed
from commons import print_separator
from commons import initialize_distributed
import mpu
from torch.nn.parameter import Parameter
import torch.nn.init as init
import torch
import random
import sys
sys.path.append("../..")


def test_parallel_embedding(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing parallel embedding with model parallel size {} ...'.
              format(model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    batch_size = 17
    seq_length = 23
    vocab_size = 48
    hidden_size = 16
    seed = 1236

    set_random_seed(123)
    input_data = torch.LongTensor(
        size=(batch_size, seq_length)).random_(0, vocab_size).cuda()
    loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda()

    set_random_seed(seed)
    embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda()

    output = embedding_original(input_data)
    loss_original = torch.mul(output, loss_weight).sum()
    loss_original.backward()

    set_random_seed(seed)
    embedding_parallel = layers.ParallelEmbedding(
        vocab_size, hidden_size, init_method=init.normal_).cuda()
    output = embedding_parallel(input_data)
    loss_parallel = torch.mul(output, loss_weight).sum()
    loss_parallel.backward()

    set_random_seed(seed)
    embedding_vocab_parallel = layers.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init.normal_).cuda()
    output = embedding_vocab_parallel(input_data)
    loss_vocab_parallel = torch.mul(output, loss_weight).sum()
    loss_vocab_parallel.backward()

    torch.distributed.barrier()
    error = loss_parallel.sub(loss_original).abs()
    print(' error in loss (parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    torch.distributed.barrier()
    error = loss_vocab_parallel.sub(loss_original).abs()
    print(' error in loss (vocab parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    weight_grad_orig = torch.split(embedding_original.weight.grad,
                                   hidden_size // model_parallel_size,
                                   1)[mpu.get_model_parallel_rank()]
    error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max()
    print(' error in grad (parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    weight_grad_orig = torch.split(embedding_original.weight.grad,
                                   vocab_size // model_parallel_size,
                                   0)[mpu.get_model_parallel_rank()]
    error = embedding_vocab_parallel.weight.grad.sub(
        weight_grad_orig).abs().max()
    print(' error in grad (vocab parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


def test_initialize_affine_weight(model_parallel_size):

    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing initialize_affine_weight with model parallel '
              'size: {}'.format(model_parallel_size))
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size

    # ---------------
    # Column parallel
    # ---------------
    weight = torch.empty(output_size_coeff, input_size)
    set_random_seed(seed)
    layers._initialize_affine_weight(weight, output_size, input_size,
                                     output_size_coeff, 0,
                                     torch.nn.init.normal_)
    # Target.
    set_random_seed(seed)
    master_weight = torch.empty(output_size, input_size)
    torch.nn.init.normal_(master_weight)
    rank = mpu.get_model_parallel_rank()
    my_weight = torch.split(master_weight, output_size_coeff,
                            dim=0)[rank].contiguous().clone()

    # Compare.
    error = weight.sub(my_weight).abs().max()
    torch.distributed.barrier()
    print(' column parallel max error (should be zero) on global rank '
          '{}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # ------------
    # Row parallel
    # ------------
    weight = torch.empty(output_size, input_size_coeff)
    set_random_seed(seed)
    mpu.layers._initialize_affine_weight(weight, output_size, input_size,
                                         input_size_coeff, 1,
                                         torch.nn.init.normal_)
    # Target.
    set_random_seed(seed)
    master_weight = torch.empty(output_size, input_size)
    torch.nn.init.normal_(master_weight)
    rank = mpu.get_model_parallel_rank()
    my_weight = torch.split(master_weight, input_size_coeff,
                            dim=1)[rank].contiguous().clone()

    # Compare.
    error = weight.sub(my_weight).abs().max()
    torch.distributed.barrier()
    print(' row parallel max error (should be zero) on global rank '
          '{}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')


class IdentityLayer2D(torch.nn.Module):
    def __init__(self, m, n):
        super(IdentityLayer2D, self).__init__()
        self.weight = Parameter(torch.Tensor(m, n))
        torch.nn.init.xavier_normal_(self.weight)

    def forward(self):
        return self.weight


def test_column_parallel_linear(model_parallel_size):

    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing ColumnParallelLinear with model parallel '
              'size: {}'.format(model_parallel_size))
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size
    batch_size = 7

    # Network
    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
    linear_layer = mpu.ColumnParallelLinear(
        input_size, output_size, keep_master_weight_for_test=True).cuda()
    loss_weight = torch.randn([batch_size, output_size]).cuda()
    # Forward
    input_ = identity_layer()
    output = linear_layer(input_)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    # Values.
    dLdY = loss_weight
    X = identity_layer.weight
    A = linear_layer.master_weight.cuda()
    dLdA = torch.matmul(dLdY.t(), X)
    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
    dLdX = torch.matmul(dLdY, A)

    rank = mpu.get_model_parallel_rank()
    my_dLdA = torch.split(dLdA, output_size_coeff,
                          dim=0)[rank].contiguous().clone()
    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdA on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    my_dLdb = torch.split(dLdb, output_size_coeff,
                          dim=0)[rank].contiguous().clone()
    error = my_dLdb.sub(linear_layer.bias.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdb on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdX.sub(identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdX on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')


def test_row_parallel_linear(model_parallel_size):

    mpu.initialize_model_parallel(model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing RowParallelLinear with model parallel '
              'size: {}'.format(model_parallel_size))
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * model_parallel_size
    batch_size = 7

    # Network
    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
    linear_layer = mpu.RowParallelLinear(
        input_size, output_size, keep_master_weight_for_test=True).cuda()
    loss_weight = torch.randn([batch_size, output_size]).cuda()
    # Forward
    input_ = identity_layer()
    output = linear_layer(input_)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    # Values.
    dLdY = loss_weight
    X = identity_layer.weight
    A = linear_layer.master_weight.cuda()
    dLdA = torch.matmul(dLdY.t(), X)
    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
    dLdX = torch.matmul(dLdY, A)

    rank = mpu.get_model_parallel_rank()
    my_dLdA = torch.split(dLdA, input_size_coeff,
                          dim=1)[rank].contiguous().clone()
    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdA on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdb.sub(linear_layer.bias.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdb on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdX.sub(identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' error in dLdX on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')


class IdentityLayer3D(torch.nn.Module):
    def __init__(self, m, n, k):
        super(IdentityLayer3D, self).__init__()
        self.weight = Parameter(torch.Tensor(m, n, k))
        torch.nn.init.xavier_normal_(self.weight)

    def forward(self):
        return self.weight


def parallel_self_attention(model_parallel_size, num_att_heads_per_partition,
                            hidden_size_per_att_head, dropout_prob, batch_size,
                            sequence_length):
    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)

    num_att_heads = num_att_heads_per_partition * \
        torch.distributed.get_world_size()
    hidden_size = hidden_size_per_att_head * num_att_heads

    # Network
    identity_layer = IdentityLayer3D(batch_size, sequence_length,
                                     hidden_size).cuda()
    attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads,
                                                    dropout_prob).cuda()
    loss_weight = torch.randn([batch_size, sequence_length,
                               hidden_size]).cuda()
    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
    # Forward
    input_ = identity_layer()
    output = attention_layer(input_, attention_mask)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    rank = mpu.get_model_parallel_rank()
    mpu.destroy_model_parallel()
    return rank, hidden_size, model_parallel_size, loss, \
        attention_layer, identity_layer


def test_parallel_self_attention(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing ParallelSelfAttention with model parallel '
              'size: {}'.format(model_parallel_size))

    num_att_heads_per_partition = 3
    hidden_size_per_att_head = 7
    dropout_prob = 0.0  # has to be zero
    batch_size = 5
    sequence_length = 13

    rank_1, hidden_size_1, model_parallel_size_1, loss_1, \
        attention_layer_1, identity_layer_1 = parallel_self_attention(
            1, num_att_heads_per_partition,
            hidden_size_per_att_head, dropout_prob, batch_size,
            sequence_length)

    rank, hidden_size, model_parallel_size, loss, \
        attention_layer, identity_layer = parallel_self_attention(
            model_parallel_size, num_att_heads_per_partition,
            hidden_size_per_att_head, dropout_prob, batch_size,
            sequence_length)
    assert hidden_size_1 == hidden_size

    error = loss_1.sub(loss).abs().max()
    torch.distributed.barrier()
    print(' loss error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-6

    my_lin_grad_list = torch.split(
        attention_layer_1.query_key_value.weight.grad,
        hidden_size // model_parallel_size, 0)[rank::model_parallel_size]
    my_lin_grad = torch.cat(my_lin_grad_list, dim=0)
    error = my_lin_grad.sub(
        attention_layer.query_key_value.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' weight gradient error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-6

    error = identity_layer_1.weight.grad.sub(
        identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' input gradient error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-6

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')


def parallel_transformer(model_parallel_size, num_att_heads_per_partition,
                         hidden_size_per_att_head, batch_size,
                         sequence_length):

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)

    num_att_heads = num_att_heads_per_partition * \
        torch.distributed.get_world_size()
    hidden_size = hidden_size_per_att_head * num_att_heads
    intermediate_size = 4 * hidden_size

    # Network
    identity_layer = IdentityLayer3D(batch_size, sequence_length,
                                     hidden_size).cuda()
    transformer_layer = mpu.BertParallelTransformerLayer(
        hidden_size, intermediate_size, num_att_heads, 0.0, 0.0,
        torch.nn.functional.relu, 1.0e-5).cuda()
    loss_weight = torch.randn([batch_size, sequence_length,
                               hidden_size]).cuda()
    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
    # Forward
    input_ = identity_layer()
    output = transformer_layer(input_, attention_mask)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    rank = mpu.get_model_parallel_rank()
    mpu.destroy_model_parallel()
    return rank, hidden_size, model_parallel_size, loss, \
        transformer_layer, identity_layer


def test_parallel_transformer_layer(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing ParallelTransformerLayer with model parallel '
              'size: {}'.format(model_parallel_size))

    num_att_heads_per_partition = 3
    hidden_size_per_att_head = 7
    batch_size = 5
    sequence_length = 13

    rank_1, hidden_size_1, model_parallel_size_1, loss_1, \
        transformer_layer_1, identity_layer_1 = parallel_transformer(
            1, num_att_heads_per_partition,
            hidden_size_per_att_head, batch_size, sequence_length)

    rank, hidden_size, model_parallel_size, loss, \
        transformer_layer, identity_layer = parallel_transformer(
            model_parallel_size, num_att_heads_per_partition,
            hidden_size_per_att_head, batch_size, sequence_length)

    error = loss_1.sub(loss).abs().max()
    torch.distributed.barrier()
    print(' loss error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-5, 'error: {}'.format(error)

    error = identity_layer_1.weight.grad.sub(
        identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print(' input gradient error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-5, 'error: {}'.format(error)

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')


if __name__ == '__main__':

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    print_separator('test initialize affine weight')
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        test_initialize_affine_weight(model_parallel_size)
        model_parallel_size *= 2

    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test parallel embedding')
        test_parallel_embedding(model_parallel_size)
        model_parallel_size *= 2

    print_separator('test column-parallel linear')
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        test_column_parallel_linear(model_parallel_size)
        model_parallel_size *= 2

    print_separator('test row-parallel linear')
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        test_row_parallel_linear(model_parallel_size)
        model_parallel_size *= 2

    print_separator('test parallel self-attention')
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        test_parallel_self_attention(model_parallel_size)
        model_parallel_size *= 2

    print_separator('test parallel transformer')
    model_parallel_size = 1
    while model_parallel_size <= world_size:
        test_parallel_transformer_layer(model_parallel_size)
        model_parallel_size *= 2
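The tests above import initialize_distributed, print_separator, and set_random_seed from a local commons module that is not part of this commit. The sketch below is only an approximation of what such helpers typically look like in the Megatron mpu test suite, assuming an NCCL backend and the usual RANK/WORLD_SIZE/MASTER_ADDR launcher environment variables; the actual file in this repository may differ.

# Hypothetical sketch of the `commons` helpers used by the tests above.
import os
import random

import numpy
import torch


def initialize_distributed(backend='nccl'):
    """Initialize torch.distributed from launcher environment variables."""
    rank = int(os.getenv('RANK', '0'))
    world_size = int(os.getenv('WORLD_SIZE', '1'))
    # Bind this process to one GPU before creating the process group.
    torch.cuda.set_device(rank % torch.cuda.device_count())
    init_method = 'tcp://{}:{}'.format(os.getenv('MASTER_ADDR', 'localhost'),
                                       os.getenv('MASTER_PORT', '6000'))
    torch.distributed.init_process_group(backend=backend,
                                         world_size=world_size,
                                         rank=rank,
                                         init_method=init_method)


def print_separator(message):
    """Print a banner around `message` on rank 0 only."""
    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('-' * 20, message, '-' * 20, flush=True)
    torch.distributed.barrier()


def set_random_seed(seed):
    """Seed Python, NumPy and torch RNGs so parallel and serial runs match."""
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)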
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/mpu/tests/test_random.py  (new file, 0 → 100644)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from commons import print_separator
from commons import initialize_distributed
import mpu
import torch
import sys
sys.path.append("../..")


def test_set_cuda_rng_state(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing set_rng_state with size {} ...'.
              format(model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    size = 123
    seed = 1234
    torch.cuda.manual_seed(1234)
    tensor = torch.cuda.FloatTensor(size)

    # Get the state
    rng_state = torch.cuda.get_rng_state()
    rng_state_copy = rng_state.clone()

    # Do some stuff.
    for _ in range(5):
        torch.randn(size, out=tensor)
    result_1 = tensor.clone()

    assert rng_state.sub(rng_state_copy).max() == 0
    assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0

    # State should be different.
    new_rng_state = torch.cuda.get_rng_state()
    max_diff = new_rng_state.sub(rng_state).max()
    print(' max diff in rng state (should be non-zero) on global rank {}: {}'.
          format(torch.distributed.get_rank(), max_diff))
    assert max_diff > 0

    # Reset the rng state and do the same stuff.
    mpu.random._set_cuda_rng_state(rng_state)
    for _ in range(5):
        torch.randn(size, out=tensor)
    mpu.random._set_cuda_rng_state(rng_state)
    for _ in range(5):
        torch.randn(size, out=tensor)
    result_2 = tensor.clone()

    # Results should be the same
    error = result_2.sub(result_1).abs().max()
    print(' max error in generated tensors (should be zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Input state should have remained intact.
    error = rng_state.sub(rng_state_copy).max()
    print(' max error in rng state (should be zero) on global rank {}: {}'.
          format(torch.distributed.get_rank(), error))
    assert error == 0

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


def test_cuda_rng_tracker(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing cuda rng tracker with size {} ...'.
              format(model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    seed_1 = 1234
    seed_2 = 4321
    size = [12, 21]
    tensor = torch.cuda.FloatTensor(size)

    # Set to seed_1 and generate two tensors.
    torch.cuda.manual_seed(seed_1)
    torch.randn(size, out=tensor)
    target_11 = tensor.clone()
    torch.randn(size, out=tensor)
    target_12 = tensor.clone()

    # Set to seed_2 and generate two tensors.
    torch.cuda.manual_seed(seed_2)
    torch.randn(size, out=tensor)
    target_21 = tensor.clone()
    torch.randn(size, out=tensor)
    target_22 = tensor.clone()

    # Now if we interleave seed_1 and seed_2,
    # we should still get the same tensors
    torch.cuda.manual_seed(seed_1)
    mpu.get_cuda_rng_tracker().add('test', seed_2)

    torch.randn(size, out=tensor)
    result_11 = tensor.clone()

    with mpu.get_cuda_rng_tracker().fork('test'):
        torch.randn(size, out=tensor)
        result_21 = tensor.clone()

    torch.randn(size, out=tensor)
    result_12 = tensor.clone()

    with mpu.get_cuda_rng_tracker().fork('test'):
        torch.randn(size, out=tensor)
        result_22 = tensor.clone()

    diff = result_11.sub(result_21).abs().max()
    diff = min(diff, result_12.sub(result_22).abs().max())
    print(' max diff in generated tensors (should be non-zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), diff))
    assert diff > 1.0e-6
    error = max(result_11.sub(target_11).abs().max(),
                result_12.sub(target_12).abs().max())
    error = max(error, result_21.sub(target_21).abs().max())
    error = max(error, result_22.sub(target_22).abs().max())
    print(' max error in generated tensors (should be zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset the tracker
    mpu.get_cuda_rng_tracker().reset()

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


def test_model_parallel_cuda_manual_seed(model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing model parallel cuda manual seed with size {} ...'.
              format(model_parallel_size))

    mpu.initialize_model_parallel(model_parallel_size)
    model_parallel_size = mpu.get_model_parallel_world_size()

    mpu.model_parallel_cuda_manual_seed(12345)
    assert torch.cuda.initial_seed() == 12345
    with mpu.get_cuda_rng_tracker().fork():
        assert torch.cuda.initial_seed() == (12345 + 2718 +
                                             mpu.get_model_parallel_rank())

    # Reset the tracker
    mpu.get_cuda_rng_tracker().reset()

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test set rng state')
        test_set_cuda_rng_state(model_parallel_size)
        model_parallel_size *= 2

    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test cuda rng tracker')
        test_cuda_rng_tracker(model_parallel_size)
        model_parallel_size *= 2

    model_parallel_size = 1
    while model_parallel_size <= world_size:
        print_separator('test model parallel cuda manual seed')
        test_model_parallel_cuda_manual_seed(model_parallel_size)
        model_parallel_size *= 2
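The RNG tracker exercised above exists so that, under tensor model parallelism, dropout inside partitioned layers can draw a different mask on every model-parallel rank while operations on replicated tensors keep a shared seed. A hypothetical usage pattern is sketched below; it assumes torch.distributed and mpu.initialize_model_parallel have already been set up, as in the tests.

# Hypothetical usage of the CUDA RNG tracker around dropout.
import torch
import mpu

mpu.model_parallel_cuda_manual_seed(12345)

hidden = torch.randn(4, 8, device='cuda')
# Inside the fork, each model-parallel rank uses its own RNG state,
# so the dropout masks differ across ranks; outside, the shared
# default state is restored.
with mpu.get_cuda_rng_tracker().fork():
    hidden = torch.nn.functional.dropout(hidden, p=0.1, training=True)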
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/mpu/utils.py  (new file, 0 → 100644)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch


def ensure_divisibility(numerator, denominator):
    """Ensure that numerator is divisible by the denominator."""
    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
        numerator, denominator)


def divide(numerator, denominator):
    """Ensure that numerator is divisible by the denominator and return
    the division value."""
    ensure_divisibility(numerator, denominator)
    return numerator // denominator


def split_tensor_along_last_dim(tensor, num_partitions,
                                contiguous_split_chunks=False):
    """Split a tensor along its last dimension.
    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.
    """
    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
    # Split.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    # Note: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)

    return tensor_list


class VocabUtility:
    """Split the vocabulary into `world_size` chunks and return the
    first and last index of the vocabulary belonging to the `rank`
    partition. Note that indices are in [first, last)."""

    @staticmethod
    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
                                                  rank, world_size):
        index_f = rank * per_partition_vocab_size
        index_l = index_f + per_partition_vocab_size
        return index_f, index_l

    @staticmethod
    def vocab_range_from_global_vocab_size(global_vocab_size, rank,
                                           world_size):
        per_partition_vocab_size = divide(global_vocab_size, world_size)
        return VocabUtility.vocab_range_from_per_partition_vocab_size(
            per_partition_vocab_size, rank, world_size)
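A small self-contained illustration of how these helpers behave follows. The shapes and the vocabulary size are made-up example values, not anything used elsewhere in this commit.

# Illustrative example of the mpu utility helpers above.
import torch

# split_tensor_along_last_dim partitions e.g. a fused QKV projection output
# into per-head or per-rank chunks along the hidden dimension.
x = torch.randn(2, 5, 12)
chunks = split_tensor_along_last_dim(x, 3)
assert all(c.shape == (2, 5, 4) for c in chunks)

# VocabUtility maps a rank to its half-open [first, last) slice of the vocab.
first, last = VocabUtility.vocab_range_from_global_vocab_size(
    global_vocab_size=50304, rank=1, world_size=4)
assert (first, last) == (12576, 25152)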
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/package_info.py  (new file, 0 → 100644)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MAJOR = 1
MINOR = 1.5

# Use the following formatting: (major, minor)
VERSION = (MAJOR, MINOR)

__version__ = '.'.join(map(str, VERSION))
__package_name__ = 'megatron-lm'
__contact_names__ = 'NVIDIA INC'
__url__ = 'https://github.com/NVIDIA/Megatron-LM'
__download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
__description__ = 'Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.'
__license__ = 'See https://github.com/NVIDIA/Megatron-LM/blob/master/LICENSE'
__keywords__ = 'deep learning, Megatron, gpu, NLP, nvidia, pytorch, torch, language'
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/text_generation_utils.py  (new file, 0 → 100644)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utilities for generating text."""
import
copy
import
json
import
os
import
time
import
torch
import
torch.nn.functional
as
F
from
megatron
import
get_args
from
megatron
import
get_tokenizer
from
megatron
import
mpu
from
megatron.utils
import
get_ltor_masks_and_position_ids
def
get_batch
(
context_tokens
):
"""Generate batch from context tokens."""
args
=
get_args
()
tokenizer
=
get_tokenizer
()
# Move to GPU.
tokens
=
context_tokens
.
view
(
args
.
batch_size
,
-
1
).
contiguous
().
cuda
()
# Get the attention mask and postition ids.
attention_mask
,
_
,
position_ids
=
get_ltor_masks_and_position_ids
(
tokens
,
tokenizer
.
eod
,
args
.
reset_position_ids
,
args
.
reset_attention_mask
,
args
.
eod_mask_loss
)
return
tokens
,
attention_mask
,
position_ids
def
top_k_logits
(
logits
,
top_k
=
0
,
top_p
=
0.0
,
filter_value
=-
float
(
'Inf'
)):
""" This function has been mostly taken from huggingface conversational
ai code at
https://medium.com/huggingface/how-to-build-a-state-of-the-art-
conversational-ai-with-transfer-learning-2d818ac26313 """
if
top_k
>
0
:
# Remove all tokens with a probability less than the
# last token of the top-k
indices_to_remove
=
logits
<
torch
.
topk
(
logits
,
top_k
)[
0
][...,
-
1
,
None
]
logits
[
indices_to_remove
]
=
filter_value
if
top_p
>
0.0
:
# Cconvert to 1D
sorted_logits
,
sorted_indices
=
torch
.
sort
(
logits
,
descending
=
True
,
dim
=-
1
)
cumulative_probs
=
torch
.
cumsum
(
F
.
softmax
(
sorted_logits
,
dim
=-
1
),
dim
=-
1
)
# Remove tokens with cumulative probability above the threshold
sorted_indices_to_remove
=
cumulative_probs
>
top_p
# Shift the indices to the right to keep also the first token
# above the threshold
sorted_indices_to_remove
[...,
1
:]
\
=
sorted_indices_to_remove
[...,
:
-
1
].
clone
()
sorted_indices_to_remove
[...,
0
]
=
0
for
i
in
range
(
sorted_indices
.
size
(
0
)):
indices_to_remove
=
sorted_indices
[
i
][
sorted_indices_to_remove
[
i
]]
logits
[
i
][
indices_to_remove
]
=
filter_value
return
logits
def
generate_samples_input_from_file
(
model
):
args
=
get_args
()
tokenizer
=
get_tokenizer
()
# Read the sample file and open the output file.
assert
args
.
sample_input_file
is
not
None
,
\
'sample input file is not provided.'
if
mpu
.
get_model_parallel_rank
()
==
0
:
fname
=
open
(
args
.
sample_input_file
,
"r"
)
all_raw_text
=
fname
.
readlines
()
input_count
=
len
(
all_raw_text
)
input_pos
=
0
if
args
.
sample_output_file
is
None
:
sample_output_file
=
args
.
sample_input_file
+
".out"
print
(
'could not find `sample-output-file`, setting '
'it to {}'
.
format
(
sample_output_file
))
else
:
sample_output_file
=
args
.
sample_output_file
fname_out
=
open
(
sample_output_file
,
"w+"
)
context_count
=
0
model
.
eval
()
with
torch
.
no_grad
():
while
True
:
torch
.
distributed
.
barrier
(
group
=
mpu
.
get_model_parallel_group
())
terminate_runs
=
0
if
mpu
.
get_model_parallel_rank
()
==
0
:
raw_text
=
all_raw_text
[
input_pos
]
input_pos
+=
1
if
input_pos
==
input_count
:
raw_text
=
"stop"
if
"stop"
in
raw_text
:
terminate_runs
=
1
else
:
context_tokens
=
tokenizer
.
tokenize
(
raw_text
)
context_length
=
len
(
context_tokens
)
if
context_length
>=
(
args
.
seq_length
//
2
):
print
(
"
\n
Context length"
,
context_length
,
"
\n
Please give smaller context (half of the "
"sequence length)!"
,
flush
=
True
)
continue
else
:
context_tokens
=
tokenizer
.
tokenize
(
"EMPTY TEXT"
)
context_length
=
len
(
context_tokens
)
terminate_runs_tensor
=
torch
.
cuda
.
LongTensor
([
terminate_runs
])
torch
.
distributed
.
broadcast
(
terminate_runs_tensor
,
mpu
.
get_model_parallel_src_rank
(),
group
=
mpu
.
get_model_parallel_group
())
terminate_runs
=
terminate_runs_tensor
[
0
].
item
()
if
terminate_runs
==
1
:
return
token_stream
=
get_token_stream
(
model
,
[
context_tokens
])
for
_
,
decode_tokens
in
enumerate
(
token_stream
):
decode_tokens
,
_
=
decode_tokens
decode_tokens
=
decode_tokens
[
0
].
cpu
().
numpy
().
tolist
()
if
mpu
.
get_model_parallel_rank
()
==
0
:
os
.
system
(
'clear'
)
print
(
"
\n
Context:"
,
raw_text
,
flush
=
True
)
trim_decode_tokens
=
tokenizer
.
detokenize
(
decode_tokens
)[
len
(
raw_text
):]
print
(
"
\n
Megatron-LM:"
,
trim_decode_tokens
,
flush
=
True
)
fname_out
.
write
(
"
\n
Context:"
)
fname_out
.
write
(
raw_text
)
fname_out
.
write
(
"
\n\n
Megatron-LM:"
)
fname_out
.
write
(
trim_decode_tokens
)
fname_out
.
write
(
"
\n
"
)
raw_text
=
None
torch
.
distributed
.
barrier
(
group
=
mpu
.
get_model_parallel_group
())
context_count
+=
1
def
generate_samples_interactive
(
model
,
print_frequency
=
24
):
args
=
get_args
()
tokenizer
=
get_tokenizer
()
context_count
=
0
model
.
eval
()
with
torch
.
no_grad
():
while
True
:
torch
.
distributed
.
barrier
(
group
=
mpu
.
get_model_parallel_group
())
terminate_runs
=
0
if
mpu
.
get_model_parallel_rank
()
==
0
:
os
.
system
(
'clear'
)
raw_text
=
input
(
"
\n
Context prompt (stop to exit) >>> "
)
while
not
raw_text
:
print
(
'Prompt should not be empty!'
)
raw_text
=
input
(
"
\n
Context prompt (stop to exit) >>> "
)
if
"stop"
in
raw_text
:
terminate_runs
=
1
else
:
context_tokens
=
tokenizer
.
tokenize
(
raw_text
)
context_length
=
len
(
context_tokens
)
if
context_length
>=
(
args
.
seq_length
//
2
):
print
(
"
\n
Context length"
,
context_length
,
"
\n
Please give smaller context (half of the "
"sequence length)!"
,
flush
=
True
)
continue
else
:
context_tokens
=
tokenizer
.
tokenize
(
"EMPTY TEXT"
)
context_length
=
len
(
context_tokens
)
terminate_runs_tensor
=
torch
.
cuda
.
LongTensor
([
terminate_runs
])
torch
.
distributed
.
broadcast
(
terminate_runs_tensor
,
mpu
.
get_model_parallel_src_rank
(),
group
=
mpu
.
get_model_parallel_group
())
terminate_runs
=
terminate_runs_tensor
[
0
].
item
()
if
terminate_runs
==
1
:
return
token_stream
=
get_token_stream
(
model
,
[
context_tokens
])
for
counter
,
decode_tokens
in
enumerate
(
token_stream
):
decode_tokens
,
_
=
decode_tokens
decode_tokens
=
decode_tokens
[
0
].
cpu
().
numpy
().
tolist
()
if
mpu
.
get_model_parallel_rank
()
==
0
and
\
counter
%
print_frequency
==
0
:
os
.
system
(
'clear'
)
print
(
"
\n
Context:"
,
raw_text
,
flush
=
True
)
trim_decode_tokens
=
tokenizer
.
detokenize
(
decode_tokens
)[
len
(
raw_text
):]
print
(
"
\n
Megatron-LM:"
,
trim_decode_tokens
,
flush
=
True
)
if
mpu
.
get_model_parallel_rank
()
==
0
:
os
.
system
(
'clear'
)
print
(
"
\n
Context:"
,
raw_text
,
flush
=
True
)
trim_decode_tokens
=
tokenizer
.
detokenize
(
decode_tokens
)[
len
(
raw_text
):]
print
(
"
\n
Megatron-LM:"
,
trim_decode_tokens
,
flush
=
True
)
raw_text
=
None
torch
.
distributed
.
barrier
(
group
=
mpu
.
get_model_parallel_group
())
context_count
+=
1
if
mpu
.
get_model_parallel_rank
()
==
0
:
input
(
"
\n
Press any key to continue >>>"
)
def
generate_samples_unconditional
(
model
):
args
=
get_args
()
tokenizer
=
get_tokenizer
()
num_samples
=
args
.
num_samples
context_tokens
=
[[
tokenizer
.
eod
]
for
_
in
range
(
args
.
batch_size
)]
ctr
=
0
while
True
:
start_time
=
time
.
time
()
for
token_stream
in
get_token_stream
(
model
,
copy
.
deepcopy
(
context_tokens
)):
pass
if
ctr
%
args
.
log_interval
==
0
:
print
(
'Avg s/batch:'
,
(
time
.
time
()
-
start_time
)
/
min
(
args
.
log_interval
,
ctr
+
1
))
start_time
=
time
.
time
()
length
=
len
(
token_stream
)
token_batch
=
token_stream
[
0
].
cpu
().
numpy
().
tolist
()
length_batch
=
token_stream
[
1
].
cpu
().
numpy
().
tolist
()
for
tokens
,
length
in
zip
(
token_batch
,
length_batch
):
tokens
=
tokens
[
1
:
length
-
1
]
text
=
tokenizer
.
detokenize
(
tokens
)
is_finished
=
length
<
args
.
seq_length
-
1
datum
=
{
'text'
:
text
,
'length'
:
length
-
1
,
'finished'
:
is_finished
}
yield
datum
ctr
+=
1
if
ctr
>=
num_samples
:
break
if
ctr
>=
num_samples
:
break
def
generate_and_write_samples_unconditional
(
model
):
args
=
get_args
()
assert
args
.
genfile
is
not
None
with
open
(
args
.
genfile
,
'w'
)
as
f
:
for
datum
in
generate_samples_unconditional
(
model
):
f
.
write
(
json
.
dumps
(
datum
)
+
'
\n
'
)
def
pad_batch
(
batch
,
pad_id
,
args
):
context_lengths
=
[]
for
tokens
in
batch
:
context_length
=
len
(
tokens
)
if
context_length
<
args
.
seq_length
:
tokens
.
extend
([
pad_id
]
*
(
args
.
seq_length
-
context_length
))
context_lengths
.
append
(
context_length
)
return
batch
,
context_lengths
def
get_token_stream
(
model
,
context_tokens
):
args
=
get_args
()
tokenizer
=
get_tokenizer
()
context_tokens
,
context_lengths
=
pad_batch
(
context_tokens
,
tokenizer
.
eod
,
args
)
context_tokens_tensor
=
torch
.
cuda
.
LongTensor
(
context_tokens
)
context_length_tensor
=
torch
.
cuda
.
LongTensor
(
context_lengths
)
torch
.
distributed
.
broadcast
(
context_length_tensor
,
mpu
.
get_model_parallel_src_rank
(),
group
=
mpu
.
get_model_parallel_group
())
torch
.
distributed
.
broadcast
(
context_tokens_tensor
,
mpu
.
get_model_parallel_src_rank
(),
group
=
mpu
.
get_model_parallel_group
())
context_length
=
context_length_tensor
.
min
().
item
()
tokens
,
attention_mask
,
position_ids
=
get_batch
(
context_tokens_tensor
)
batch_token_iterator
=
sample_sequence_batch
(
model
,
context_tokens_tensor
,
context_length_tensor
,
attention_mask
,
position_ids
)
for
tokens
,
lengths
in
batch_token_iterator
:
context_length
+=
1
yield
tokens
[:,
:
context_length
],
lengths
def
switch
(
val1
,
val2
,
boolean
):
boolean
=
boolean
.
type_as
(
val1
)
return
(
1
-
boolean
)
*
val1
+
boolean
*
val2
def
sample_sequence_batch
(
model
,
context_tokens
,
context_lengths
,
attention_mask
,
position_ids
,
maxlen
=
None
,
type_ids
=
None
):
args
=
get_args
()
tokenizer
=
get_tokenizer
()
model
.
eval
()
with
torch
.
no_grad
():
context_length
=
context_lengths
.
min
().
item
()
eos_id
=
tokenizer
.
eod
counter
=
0
org_context_length
=
context_length
layer_past
=
None
batch_size
=
context_tokens
.
size
(
0
)
is_done
=
torch
.
zeros
([
batch_size
]).
byte
().
cuda
()
tokens
=
context_tokens
if
maxlen
is
None
:
maxlen
=
args
.
seq_length
-
1
if
maxlen
>
(
org_context_length
+
args
.
out_seq_length
):
maxlen
=
org_context_length
+
args
.
out_seq_length
lengths
=
torch
.
ones
([
batch_size
]).
long
().
cuda
()
*
maxlen
while
context_length
<=
(
maxlen
):
if
args
.
recompute
:
logits
=
model
(
tokens
,
position_ids
,
attention_mask
,
tokentype_ids
=
type_ids
,
forward_method_parallel_output
=
False
)
logits
=
logits
[:,
context_length
-
1
,
:]
else
:
types2use
=
None
if
counter
==
0
:
tokens2use
=
tokens
[:,
:
context_length
]
positions2use
=
position_ids
[:,
:
context_length
]
if
type_ids
is
not
None
:
types2use
=
type_ids
[:,
:
context_length
]
else
:
tokens2use
=
tokens
[:,
context_length
-
1
].
view
(
batch_size
,
-
1
)
positions2use
=
position_ids
[:,
context_length
-
1
].
view
(
batch_size
,
-
1
)
if
type_ids
is
not
None
:
types2use
=
type_ids
[:,
context_length
-
1
].
view
(
batch_size
,
-
1
)
logits
,
layer_past
=
model
(
tokens2use
,
positions2use
,
attention_mask
,
layer_past
=
layer_past
,
get_key_value
=
True
,
tokentype_ids
=
types2use
,
forward_method_parallel_output
=
False
)
logits
=
logits
[:,
-
1
].
view
(
batch_size
,
-
1
).
contiguous
()
if
args
.
greedy
:
prev
=
torch
.
argmax
(
logits
,
dim
=-
1
).
view
(
-
1
)
else
:
logits
=
logits
.
float
()
logits
/=
args
.
temperature
logits
=
top_k_logits
(
logits
,
top_k
=
args
.
top_k
,
top_p
=
args
.
top_p
)
log_probs
=
F
.
softmax
(
logits
,
dim
=-
1
)
prev
=
torch
.
multinomial
(
log_probs
,
num_samples
=
1
).
view
(
-
1
)
print_logits
=
[]
for
p
in
prev
:
print_logits
.
append
([
logits
[
i
,
p
].
item
()
for
i
in
range
(
batch_size
)])
started
=
context_lengths
<=
context_length
tokens
[:,
context_length
]
=
switch
(
tokens
[:,
context_length
].
view
(
-
1
),
prev
,
started
)
context_length
+=
1
counter
+=
1
done_token
=
(
prev
==
eos_id
).
byte
()
&
started
.
byte
()
just_finished
=
(
done_token
&
~
is_done
).
bool
()
lengths
[
just_finished
.
view
(
-
1
)]
=
context_length
is_done
=
is_done
|
done_token
done
=
torch
.
all
(
is_done
)
yield
tokens
,
lengths
if
done
:
break
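Since top_k_logits is a pure tensor transformation, it can be sanity-checked on its own. The snippet below is an illustrative example with made-up logits, not part of the committed code; it relies only on the function defined above and standard PyTorch operations.

# Illustrative check of top_k_logits filtering.
import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0, -3.0]])

# Top-k: only the two largest logits survive; the rest become -inf,
# so softmax assigns them zero probability.
filtered_k = top_k_logits(logits.clone(), top_k=2)
probs = F.softmax(filtered_k, dim=-1)
assert torch.count_nonzero(probs).item() == 2

# Nucleus (top-p): keeps the smallest set of tokens whose cumulative
# probability exceeds p, here the two most likely tokens for p=0.9.
filtered_p = top_k_logits(logits.clone(), top_p=0.9)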
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/tokenizer/__init__.py  (new file, 0 → 100644)
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .tokenizer import build_tokenizer
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/tokenizer/bert_tokenization.py  (new file, 0 → 100644)
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes."""
from
__future__
import
absolute_import
from
__future__
import
division
from
__future__
import
print_function
import
collections
import
re
import
unicodedata
import
six
def
validate_case_matches_checkpoint
(
do_lower_case
,
init_checkpoint
):
"""Checks whether the casing config is consistent with the checkpoint name."""
# The casing has to be passed in by the user and there is no explicit check
# as to whether it matches the checkpoint. The casing information probably
# should have been stored in the bert_config.json file, but it's not, so
# we have to heuristically detect it to validate.
if
not
init_checkpoint
:
return
m
=
re
.
match
(
"^.*?([A-Za-z0-9_-]+)/bert_model.ckpt"
,
init_checkpoint
)
if
m
is
None
:
return
model_name
=
m
.
group
(
1
)
lower_models
=
[
"uncased_L-24_H-1024_A-16"
,
"uncased_L-12_H-768_A-12"
,
"multilingual_L-12_H-768_A-12"
,
"chinese_L-12_H-768_A-12"
]
cased_models
=
[
"cased_L-12_H-768_A-12"
,
"cased_L-24_H-1024_A-16"
,
"multi_cased_L-12_H-768_A-12"
]
is_bad_config
=
False
if
model_name
in
lower_models
and
not
do_lower_case
:
is_bad_config
=
True
actual_flag
=
"False"
case_name
=
"lowercased"
opposite_flag
=
"True"
if
model_name
in
cased_models
and
do_lower_case
:
is_bad_config
=
True
actual_flag
=
"True"
case_name
=
"cased"
opposite_flag
=
"False"
if
is_bad_config
:
raise
ValueError
(
"You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
"However, `%s` seems to be a %s model, so you "
"should pass in `--do_lower_case=%s` so that the fine-tuning matches "
"how the model was pre-training. If this error is wrong, please "
"just comment out this check."
%
(
actual_flag
,
init_checkpoint
,
model_name
,
case_name
,
opposite_flag
))
def
convert_to_unicode
(
text
):
"""Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
if
six
.
PY3
:
if
isinstance
(
text
,
str
):
return
text
elif
isinstance
(
text
,
bytes
):
return
text
.
decode
(
"utf-8"
,
"ignore"
)
else
:
raise
ValueError
(
"Unsupported string type: %s"
%
(
type
(
text
)))
elif
six
.
PY2
:
if
isinstance
(
text
,
str
):
return
text
.
decode
(
"utf-8"
,
"ignore"
)
elif
isinstance
(
text
,
unicode
):
return
text
else
:
raise
ValueError
(
"Unsupported string type: %s"
%
(
type
(
text
)))
else
:
raise
ValueError
(
"Not running on Python2 or Python 3?"
)
def
printable_text
(
text
):
"""Returns text encoded in a way suitable for print or `tf.logging`."""
# These functions want `str` for both Python2 and Python3, but in one case
# it's a Unicode string and in the other it's a byte string.
if
six
.
PY3
:
if
isinstance
(
text
,
str
):
return
text
elif
isinstance
(
text
,
bytes
):
return
text
.
decode
(
"utf-8"
,
"ignore"
)
else
:
raise
ValueError
(
"Unsupported string type: %s"
%
(
type
(
text
)))
elif
six
.
PY2
:
if
isinstance
(
text
,
str
):
return
text
elif
isinstance
(
text
,
unicode
):
return
text
.
encode
(
"utf-8"
)
else
:
raise
ValueError
(
"Unsupported string type: %s"
%
(
type
(
text
)))
else
:
raise
ValueError
(
"Not running on Python2 or Python 3?"
)
def
load_vocab
(
vocab_file
):
"""Loads a vocabulary file into a dictionary."""
vocab
=
collections
.
OrderedDict
()
index
=
0
with
open
(
vocab_file
,
"r"
)
as
reader
:
while
True
:
token
=
convert_to_unicode
(
reader
.
readline
())
if
not
token
:
break
token
=
token
.
strip
()
vocab
[
token
]
=
index
index
+=
1
return
vocab
def
convert_by_vocab
(
vocab
,
items
):
"""Converts a sequence of [tokens|ids] using the vocab."""
output
=
[]
for
item
in
items
:
output
.
append
(
vocab
[
item
])
return
output
def
convert_tokens_to_ids
(
vocab
,
tokens
):
return
convert_by_vocab
(
vocab
,
tokens
)
def
convert_ids_to_tokens
(
inv_vocab
,
ids
):
return
convert_by_vocab
(
inv_vocab
,
ids
)
def
whitespace_tokenize
(
text
):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
text
=
text
.
strip
()
if
not
text
:
return
[]
tokens
=
text
.
split
()
return
tokens
class
FullTokenizer
(
object
):
"""Runs end-to-end tokenziation."""
def
__init__
(
self
,
vocab_file
,
do_lower_case
=
True
):
self
.
vocab
=
load_vocab
(
vocab_file
)
self
.
inv_vocab
=
{
v
:
k
for
k
,
v
in
self
.
vocab
.
items
()}
self
.
basic_tokenizer
=
BasicTokenizer
(
do_lower_case
=
do_lower_case
)
self
.
wordpiece_tokenizer
=
WordpieceTokenizer
(
vocab
=
self
.
vocab
)
def
tokenize
(
self
,
text
):
split_tokens
=
[]
for
token
in
self
.
basic_tokenizer
.
tokenize
(
text
):
for
sub_token
in
self
.
wordpiece_tokenizer
.
tokenize
(
token
):
split_tokens
.
append
(
sub_token
)
return
split_tokens
def
convert_tokens_to_ids
(
self
,
tokens
):
return
convert_by_vocab
(
self
.
vocab
,
tokens
)
def
convert_ids_to_tokens
(
self
,
ids
):
return
convert_by_vocab
(
self
.
inv_vocab
,
ids
)
def
vocab_size
(
self
):
return
len
(
self
.
vocab
)
class
BasicTokenizer
(
object
):
"""Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
def
__init__
(
self
,
do_lower_case
=
True
):
"""Constructs a BasicTokenizer.
Args:
do_lower_case: Whether to lower case the input.
"""
self
.
do_lower_case
=
do_lower_case
def
tokenize
(
self
,
text
):
"""Tokenizes a piece of text."""
text
=
convert_to_unicode
(
text
)
text
=
self
.
_clean_text
(
text
)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
# words in the English Wikipedia.).
text
=
self
.
_tokenize_chinese_chars
(
text
)
orig_tokens
=
whitespace_tokenize
(
text
)
split_tokens
=
[]
for
token
in
orig_tokens
:
if
self
.
do_lower_case
:
token
=
token
.
lower
()
token
=
self
.
_run_strip_accents
(
token
)
split_tokens
.
extend
(
self
.
_run_split_on_punc
(
token
))
output_tokens
=
whitespace_tokenize
(
" "
.
join
(
split_tokens
))
return
output_tokens
def
_run_strip_accents
(
self
,
text
):
"""Strips accents from a piece of text."""
text
=
unicodedata
.
normalize
(
"NFD"
,
text
)
output
=
[]
for
char
in
text
:
cat
=
unicodedata
.
category
(
char
)
if
cat
==
"Mn"
:
continue
output
.
append
(
char
)
return
""
.
join
(
output
)
def
_run_split_on_punc
(
self
,
text
):
"""Splits punctuation on a piece of text."""
chars
=
list
(
text
)
i
=
0
start_new_word
=
True
output
=
[]
while
i
<
len
(
chars
):
char
=
chars
[
i
]
if
_is_punctuation
(
char
):
output
.
append
([
char
])
start_new_word
=
True
else
:
if
start_new_word
:
output
.
append
([])
start_new_word
=
False
output
[
-
1
].
append
(
char
)
i
+=
1
return
[
""
.
join
(
x
)
for
x
in
output
]
def
_tokenize_chinese_chars
(
self
,
text
):
"""Adds whitespace around any CJK character."""
output
=
[]
for
char
in
text
:
cp
=
ord
(
char
)
if
self
.
_is_chinese_char
(
cp
):
output
.
append
(
" "
)
output
.
append
(
char
)
output
.
append
(
" "
)
else
:
output
.
append
(
char
)
return
""
.
join
(
output
)
def
_is_chinese_char
(
self
,
cp
):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
if
((
cp
>=
0x4E00
and
cp
<=
0x9FFF
)
or
#
(
cp
>=
0x3400
and
cp
<=
0x4DBF
)
or
#
(
cp
>=
0x20000
and
cp
<=
0x2A6DF
)
or
#
(
cp
>=
0x2A700
and
cp
<=
0x2B73F
)
or
#
(
cp
>=
0x2B740
and
cp
<=
0x2B81F
)
or
#
(
cp
>=
0x2B820
and
cp
<=
0x2CEAF
)
or
(
cp
>=
0xF900
and
cp
<=
0xFAFF
)
or
#
(
cp
>=
0x2F800
and
cp
<=
0x2FA1F
)):
#
return
True
return
False
def
_clean_text
(
self
,
text
):
"""Performs invalid character removal and whitespace cleanup on text."""
output
=
[]
for
char
in
text
:
cp
=
ord
(
char
)
if
cp
==
0
or
cp
==
0xfffd
or
_is_control
(
char
):
continue
if
_is_whitespace
(
char
):
output
.
append
(
" "
)
else
:
output
.
append
(
char
)
return
""
.
join
(
output
)
class
WordpieceTokenizer
(
object
):
"""Runs WordPiece tokenziation."""
def
__init__
(
self
,
vocab
,
unk_token
=
"[UNK]"
,
max_input_chars_per_word
=
200
):
self
.
vocab
=
vocab
self
.
unk_token
=
unk_token
self
.
max_input_chars_per_word
=
max_input_chars_per_word
def
tokenize
(
self
,
text
):
"""Tokenizes a piece of text into its word pieces.
This uses a greedy longest-match-first algorithm to perform tokenization
using the given vocabulary.
For example:
input = "unaffable"
output = ["un", "##aff", "##able"]
Args:
text: A single token or whitespace separated tokens. This should have
already been passed through `BasicTokenizer.
Returns:
A list of wordpiece tokens.
"""
text
=
convert_to_unicode
(
text
)
output_tokens
=
[]
for
token
in
whitespace_tokenize
(
text
):
chars
=
list
(
token
)
if
len
(
chars
)
>
self
.
max_input_chars_per_word
:
output_tokens
.
append
(
self
.
unk_token
)
continue
is_bad
=
False
start
=
0
sub_tokens
=
[]
while
start
<
len
(
chars
):
end
=
len
(
chars
)
cur_substr
=
None
while
start
<
end
:
substr
=
""
.
join
(
chars
[
start
:
end
])
if
start
>
0
:
substr
=
"##"
+
substr
if
substr
in
self
.
vocab
:
cur_substr
=
substr
break
end
-=
1
if
cur_substr
is
None
:
is_bad
=
True
break
sub_tokens
.
append
(
cur_substr
)
start
=
end
if
is_bad
:
output_tokens
.
append
(
self
.
unk_token
)
else
:
output_tokens
.
extend
(
sub_tokens
)
return
output_tokens
def
_is_whitespace
(
char
):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if
char
==
" "
or
char
==
"
\t
"
or
char
==
"
\n
"
or
char
==
"
\r
"
:
return
True
cat
=
unicodedata
.
category
(
char
)
if
cat
==
"Zs"
:
return
True
return
False
def
_is_control
(
char
):
"""Checks whether `chars` is a control character."""
# These are technically control characters but we count them as whitespace
# characters.
if
char
==
"
\t
"
or
char
==
"
\n
"
or
char
==
"
\r
"
:
return
False
cat
=
unicodedata
.
category
(
char
)
if
cat
in
(
"Cc"
,
"Cf"
):
return
True
return
False
def
_is_punctuation
(
char
):
"""Checks whether `chars` is a punctuation character."""
cp
=
ord
(
char
)
# We treat all non-letter/number ASCII as punctuation.
# Characters such as "^", "$", and "`" are not in the Unicode
# Punctuation class but we treat them as punctuation anyways, for
# consistency.
if
((
cp
>=
33
and
cp
<=
47
)
or
(
cp
>=
58
and
cp
<=
64
)
or
(
cp
>=
91
and
cp
<=
96
)
or
(
cp
>=
123
and
cp
<=
126
)):
return
True
cat
=
unicodedata
.
category
(
char
)
if
cat
.
startswith
(
"P"
):
return
True
return
False
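A minimal usage sketch (not part of this commit) of the greedy longest-match-first WordPiece algorithm above; the import path follows this file's location in the commit, and the toy vocabulary entries are made up for illustration:

# Not part of the commit: toy exercise of WordpieceTokenizer.
from megatron.tokenizer.bert_tokenization import WordpieceTokenizer

toy_vocab = {"[UNK]", "un", "##aff", "##able"}
wp = WordpieceTokenizer(vocab=toy_vocab)
print(wp.tokenize("unaffable"))  # ['un', '##aff', '##able']
print(wp.tokenize("xyz"))        # ['[UNK]'] -- no piece of "xyz" is in the toy vocab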
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/tokenizer/gpt2_tokenization.py
0 → 100644
View file @
316d3f90
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for OpenAI GPT."""
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

import sys
import json
import logging
import os
import regex as re
from io import open

try:
    from functools import lru_cache
except ImportError:
    # Just a dummy decorator to get the checks to run on python2
    # because honestly I don't want to support a byte-level unicode BPE
    # tokenizer on python 2 right now.
    def lru_cache():
        return lambda func: func

logger = logging.getLogger(__name__)

PRETRAINED_VOCAB_ARCHIVE_MAP = {
    'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
}
PRETRAINED_MERGES_ARCHIVE_MAP = {
    'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
}
PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
    'gpt2': 1024,
}

VOCAB_NAME = 'vocab.json'
MERGES_NAME = 'merges.txt'
SPECIAL_TOKENS_NAME = 'special_tokens.txt'


@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    _chr = unichr if sys.version_info[0] == 2 else chr
    bs = list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + \
        list(range(ord("®"), ord("ÿ") + 1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8 + n)
            n += 1
    cs = [_chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


class GPT2Tokenizer(object):
    """
    GPT-2 BPE tokenizer. Peculiarities:
        - Byte-level BPE
    """

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None,
                        *inputs, **kwargs):
        """
        Instantiate a PreTrainedBertModel from a pre-trained model file.
        Download and cache the pre-trained model file if needed.
        """
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
            special_tokens_file = None
        else:
            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
            special_tokens_file = os.path.join(pretrained_model_name_or_path,
                                               SPECIAL_TOKENS_NAME)
            if not os.path.exists(special_tokens_file):
                special_tokens_file = None
            else:
                logger.info("loading special tokens file {}".format(special_tokens_file))
        # redirect to the cache, if necessary
        try:
            from .file_utils import cached_path
            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find files {} and {} "
                "at this path or url.".format(
                    pretrained_model_name_or_path,
                    ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                    pretrained_model_name_or_path,
                    vocab_file, merges_file))
            return None
        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
            logger.info("loading vocabulary file {}".format(vocab_file))
            logger.info("loading merges file {}".format(merges_file))
        else:
            logger.info("loading vocabulary file {} from cache at {}".format(
                vocab_file, resolved_vocab_file))
            logger.info("loading merges file {} from cache at {}".format(
                merges_file, resolved_merges_file))
        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
            # if we're using a pretrained model, ensure the tokenizer won't index
            # sequences longer than the number of positional embeddings
            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
        # Instantiate tokenizer.
        if special_tokens_file and 'special_tokens' not in kwargs:
            special_tokens = open(special_tokens_file,
                                  encoding='utf-8').read().split('\n')[:-1]
        else:
            special_tokens = kwargs.pop('special_tokens', [])
        tokenizer = cls(
            resolved_vocab_file,
            resolved_merges_file,
            special_tokens=special_tokens,
            *inputs,
            **kwargs)
        return tokenizer

    def __init__(self, vocab_file, merges_file, errors='replace',
                 special_tokens=None, max_len=None):
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = json.load(open(vocab_file))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        # Should have added re.IGNORECASE so BPE merges can happen for
        # capitalized versions of contractions
        self.pat = re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")

        self.special_tokens = {}
        self.special_tokens_decoder = {}
        self.set_special_tokens(special_tokens)

    def __len__(self):
        return len(self.encoder) + len(self.special_tokens)

    def set_special_tokens(self, special_tokens):
        """Add a list of additional tokens to the encoder.
        The additional tokens are indexed starting from the last index of the
        current vocabulary in the order of the `special_tokens` list.
        """
        if not special_tokens:
            self.special_tokens = {}
            self.special_tokens_decoder = {}
            return
        self.special_tokens = dict((tok, len(self.encoder) + i)
                                   for i, tok in enumerate(special_tokens))
        self.special_tokens_decoder = {v: k for k, v in self.special_tokens.items()}
        logger.info("Special tokens {}".format(self.special_tokens))

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except BaseException:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def tokenize(self, text):
        """ Tokenize a string. """
        bpe_tokens = []
        for token in re.findall(self.pat, text):
            if sys.version_info[0] == 2:
                token = ''.join(self.byte_encoder[ord(b)] for b in token)
            else:
                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def convert_tokens_to_ids(self, tokens):
        """ Converts a sequence of tokens into ids using the vocab. """
        ids = []
        if isinstance(tokens, str) or (sys.version_info[0] == 2
                                       and isinstance(tokens, unicode)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            else:
                return self.encoder.get(tokens, 0)
        for token in tokens:
            if token in self.special_tokens:
                ids.append(self.special_tokens[token])
            else:
                ids.append(self.encoder.get(token, 0))
        if len(ids) > self.max_len:
            logger.warning(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this OpenAI GPT model ({} > {}). Running this"
                " sequence through the model will result in indexing errors".format(
                    len(ids), self.max_len))
        return ids

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        """Converts a sequence of ids in BPE tokens using the vocab."""
        tokens = []
        for i in ids:
            if i in self.special_tokens_decoder:
                if not skip_special_tokens:
                    tokens.append(self.special_tokens_decoder[i])
            else:
                tokens.append(self.decoder[i])
        return tokens

    def encode(self, text):
        return self.convert_tokens_to_ids(self.tokenize(text))

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            'utf-8', errors=self.errors)
        return text

    def save_vocabulary(self, vocab_path):
        """Save the tokenizer vocabulary and merge files to a directory."""
        if not os.path.isdir(vocab_path):
            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
            return
        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
        merge_file = os.path.join(vocab_path, MERGES_NAME)
        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)

        with open(vocab_file, 'w', encoding='utf-8') as f:
            f.write(json.dumps(self.encoder, ensure_ascii=False))

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write(u'#version: 0.2\n')
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(),
                                                  key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
                    index = token_index
                writer.write(' '.join(bpe_tokens) + u'\n')
                index += 1

        index = len(self.encoder)
        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
            for token, token_index in sorted(self.special_tokens.items(),
                                             key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
                    index = token_index
                writer.write(token + u'\n')
                index += 1

        return vocab_file, merge_file, special_tokens_file
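A rough usage sketch (not part of this commit) of a byte-level BPE encode/decode round trip with the GPT2Tokenizer above; the vocab and merge file paths are placeholders for files obtained separately (for example from the URLs at the top of this module):

# Not part of the commit: encode/decode round trip with placeholder file paths.
from megatron.tokenizer.gpt2_tokenization import GPT2Tokenizer

tok = GPT2Tokenizer('gpt2-vocab.json', 'gpt2-merges.txt', errors='replace')
ids = tok.encode("Hello world")   # list of integer BPE ids
text = tok.decode(ids)            # byte-decoded back to the original string
assert text == "Hello world"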
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/tokenizer/tokenizer.py
0 → 100644
View file @
316d3f90
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Megatron tokenizers."""
from abc import ABC
from abc import abstractmethod

from .bert_tokenization import FullTokenizer as FullBertTokenizer
from .gpt2_tokenization import GPT2Tokenizer


def build_tokenizer(args):
    """Initialize tokenizer."""
    if args.rank == 0:
        print('> building {} tokenizer ...'.format(args.tokenizer_type),
              flush=True)

    # Select and instantiate the tokenizer.
    assert args.vocab_file is not None
    if args.tokenizer_type == 'BertWordPieceLowerCase':
        tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                            lower_case=True)
    elif args.tokenizer_type == 'BertWordPieceCase':
        tokenizer = _BertWordPieceTokenizer(vocab_file=args.vocab_file,
                                            lower_case=False)
    elif args.tokenizer_type == 'GPT2BPETokenizer':
        assert args.merge_file is not None
        tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file)
    else:
        raise NotImplementedError('{} tokenizer is not '
                                  'implemented.'.format(args.tokenizer_type))

    # Add vocab size.
    args.padded_vocab_size = _vocab_size_with_padding(tokenizer.vocab_size,
                                                      args)

    return tokenizer


def _vocab_size_with_padding(orig_vocab_size, args):
    """Pad vocab size so it is divisible by model parallel size and
    still having GPU friendly size."""

    after = orig_vocab_size
    multiple = args.make_vocab_size_divisible_by * \
        args.model_parallel_size
    while (after % multiple) != 0:
        after += 1
    if args.rank == 0:
        print(' > padded vocab (size: {}) with {} dummy tokens '
              '(new size: {})'.format(
                  orig_vocab_size, after - orig_vocab_size, after), flush=True)
    return after


class AbstractTokenizer(ABC):
    """Abstract class for tokenizer."""

    def __init__(self, name):
        self.name = name
        super().__init__()

    @property
    @abstractmethod
    def vocab_size(self):
        pass

    @property
    @abstractmethod
    def vocab(self):
        """Dictionary from vocab text token to id token."""
        pass

    @property
    @abstractmethod
    def inv_vocab(self):
        """Dictionary from vocab id token to text token."""
        pass

    @abstractmethod
    def tokenize(self, text):
        pass

    def detokenize(self, token_ids):
        raise NotImplementedError('detokenizer is not implemented for {} '
                                  'tokenizer'.format(self.name))

    @property
    def cls(self):
        raise NotImplementedError('CLS is not provided for {} '
                                  'tokenizer'.format(self.name))

    @property
    def sep(self):
        raise NotImplementedError('SEP is not provided for {} '
                                  'tokenizer'.format(self.name))

    @property
    def pad(self):
        raise NotImplementedError('PAD is not provided for {} '
                                  'tokenizer'.format(self.name))

    @property
    def eod(self):
        raise NotImplementedError('EOD is not provided for {} '
                                  'tokenizer'.format(self.name))

    @property
    def mask(self):
        raise NotImplementedError('MASK is not provided for {} '
                                  'tokenizer'.format(self.name))


class _BertWordPieceTokenizer(AbstractTokenizer):
    """Original BERT wordpiece tokenizer."""

    def __init__(self, vocab_file, lower_case=True):
        if lower_case:
            name = 'BERT Lower Case'
        else:
            name = 'BERT Upper Case'
        super().__init__(name)
        self.tokenizer = FullBertTokenizer(vocab_file, do_lower_case=lower_case)
        self.cls_id = self.tokenizer.vocab['[CLS]']
        self.sep_id = self.tokenizer.vocab['[SEP]']
        self.pad_id = self.tokenizer.vocab['[PAD]']
        self.mask_id = self.tokenizer.vocab['[MASK]']

    @property
    def vocab_size(self):
        return self.tokenizer.vocab_size()

    @property
    def vocab(self):
        return self.tokenizer.vocab

    @property
    def inv_vocab(self):
        return self.tokenizer.inv_vocab

    def tokenize(self, text):
        text_tokens = self.tokenizer.tokenize(text)
        return self.tokenizer.convert_tokens_to_ids(text_tokens)

    def decode_token_ids(self, token_ids):
        tokens = self.tokenizer.convert_ids_to_tokens(token_ids)
        exclude_list = ['[PAD]', '[CLS]']
        non_pads = [t for t in tokens if t not in exclude_list]

        result = ""
        for s in non_pads:
            if s.startswith("##"):
                result += s[2:]
            else:
                result += " " + s

        return result

    @property
    def cls(self):
        return self.cls_id

    @property
    def sep(self):
        return self.sep_id

    @property
    def pad(self):
        return self.pad_id

    @property
    def mask(self):
        return self.mask_id


class _GPT2BPETokenizer(AbstractTokenizer):
    """Original GPT2 BPE tokenizer."""

    def __init__(self, vocab_file, merge_file):
        name = 'GPT2 BPE'
        super().__init__(name)

        self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace',
                                       special_tokens=[], max_len=None)
        self.eod_id = self.tokenizer.encoder['<|endoftext|>']

    @property
    def vocab_size(self):
        return len(self.tokenizer.encoder)

    @property
    def vocab(self):
        return self.tokenizer.encoder

    @property
    def inv_vocab(self):
        return self.tokenizer.decoder

    def tokenize(self, text):
        return self.tokenizer.encode(text)

    def detokenize(self, token_ids):
        return self.tokenizer.decode(token_ids)

    @property
    def eod(self):
        return self.eod_id
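A back-of-the-envelope sketch (not part of this commit) of the padding rule in _vocab_size_with_padding: the vocabulary grows to the next multiple of make_vocab_size_divisible_by * model_parallel_size. The numbers below are illustrative only:

# Not part of the commit: the padding arithmetic with example values.
def padded_size(orig, divisible_by=128, model_parallel_size=2):
    multiple = divisible_by * model_parallel_size
    after = orig
    while after % multiple != 0:
        after += 1
    return after

print(padded_size(50257))  # 50432 -- the GPT-2 vocab padded up to a multiple of 256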
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py
0 → 100644
View file @
316d3f90
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pretrain utilities."""
from datetime import datetime
import math
import sys

import torch
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP
from apex.optimizers import FusedAdam as Adam

from megatron import get_args
from megatron import get_timers
from megatron import get_tensorboard_writer
from megatron import mpu
from megatron import print_rank_0
from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint
from megatron.fp16 import FP16_Module
from megatron.fp16 import FP16_Optimizer
from megatron.initialize import initialize_megatron
from megatron.learning_rates import AnnealingLR
from megatron.model import DistributedDataParallel as LocalDDP
from megatron.model import get_params_for_weight_decay_optimization
from megatron.model.realm_model import ICTBertModel
from megatron.utils import check_adlr_autoresume_termination
from megatron.utils import make_data_loader
from megatron.utils import report_memory

import deepspeed


def pretrain(train_valid_test_dataset_provider, model_provider,
             forward_step_func, extra_args_provider=None, args_defaults={}):
    """Main training program.

    This function will run the following in the order provided:
        1) initialize Megatron.
        2) setup model, optimizer and lr schedule using the model_provider.
        3) call train_val_test_data_provider to get train/val/test datasets.
        4) train the model using the forward_step_func.

    Arguments:
        train_valid_test_dataset_provider: a function that takes the size of
            train/valid/test dataset and returns `train, valid, test` datasets.
        model_provider: a function that returns a vanilla version of the
            model. By vanilla we mean a simple model on cpu with no fp16 or ddp.
        forward_step_func: a function that takes a `data iterator` and `model`,
            and returns a `loss` scalar with a dictionary with key:values being
            the info we would like to monitor during training, for example
            `lm-loss: value`. We also require that this function add
            `batch generator` to the timers class.
        extra_args_provider: a function that takes a parser and adds arguments
            to it. It is used for programs to add their own arguments.
        args_defaults: a dictionary from argument-name to argument-value. It is
            used to set defaults for already parsed arguments.
    """

    # Initialize and get arguments, timers, and Tensorboard writer.
    initialize_megatron(extra_args_provider=extra_args_provider,
                        args_defaults=args_defaults)

    args = get_args()
    timers = get_timers()

    # Model, optimizer, and learning rate.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
    timers('model and optimizer').stop()

    # Data stuff.
    timers('train/valid/test data iterators').start()
    train_data_iterator, valid_data_iterator, test_data_iterator \
        = build_train_valid_test_data_iterators(
            train_valid_test_dataset_provider)
    timers('train/valid/test data iterators').stop()

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log(['model and optimizer', 'train/valid/test data iterators'])
    print_rank_0('training ...')

    iteration = 0
    if args.do_train and args.train_iters > 0:
        iteration = train(forward_step_func,
                          model, optimizer, lr_scheduler,
                          train_data_iterator, valid_data_iterator)

    if args.do_valid:
        prefix = 'the end of training for val data'
        evaluate_and_print_results(prefix, forward_step_func,
                                   valid_data_iterator, model,
                                   iteration, False)

    if args.save and iteration != 0:
        save_checkpoint(iteration, model, optimizer, lr_scheduler)

    if args.do_test:
        # Run on test data.
        prefix = 'the end of training for test data'
        evaluate_and_print_results(prefix, forward_step_func,
                                   test_data_iterator, model,
                                   0, True)


def get_model(model_provider_func):
    """Build the model."""
    args = get_args()

    # Build model on cpu.
    model = model_provider_func()

    # Print number of parameters.
    if mpu.get_data_parallel_rank() == 0:
        print(' > number of parameters on model parallel rank {}: {}'.format(
            mpu.get_model_parallel_rank(),
            sum([p.nelement() for p in model.parameters()])), flush=True)

    if args.deepspeed:
        # DeepSpeed handles CUDA, FP16, and DDP components.
        return model

    # GPU allocation.
    model.cuda(torch.cuda.current_device())

    # Fp16 conversion.
    if args.fp16:
        model = FP16_Module(model)

    # Wrap model for distributed training.
    if args.DDP_impl == 'torch':
        i = torch.cuda.current_device()
        model = torchDDP(model, device_ids=[i], output_device=i,
                         process_group=mpu.get_data_parallel_group())
        return model
    if args.DDP_impl == 'local':
        model = LocalDDP(model)
        return model

    raise NotImplementedError('Unknown DDP implementation specified: {}. '
                              'Exiting.'.format(args.DDP_impl))


def get_optimizer(model):
    """Set up the optimizer."""
    args = get_args()

    # Build parameter groups (weight decay and non-decay).
    while isinstance(model, (torchDDP, LocalDDP, FP16_Module)):
        model = model.module
    param_groups = get_params_for_weight_decay_optimization(model)

    # Add model parallel attribute if it is not set.
    for param_group in param_groups:
        for param in param_group['params']:
            if not hasattr(param, 'model_parallel'):
                param.model_parallel = False

    if args.cpu_optimizer:
        if args.cpu_torch_adam:
            cpu_adam_optimizer = torch.optim.Adam
        else:
            from deepspeed.ops.adam import DeepSpeedCPUAdam
            cpu_adam_optimizer = DeepSpeedCPUAdam
        optimizer = cpu_adam_optimizer(param_groups,
                                       lr=args.lr,
                                       weight_decay=args.weight_decay)
    else:
        # Use Adam.
        optimizer = Adam(param_groups,
                         lr=args.lr,
                         weight_decay=args.weight_decay,
                         betas=(args.adam_beta1, args.adam_beta2),
                         eps=args.adam_eps)

    if args.deepspeed:
        # fp16 wrapper is not required for DeepSpeed.
        return optimizer

    # Wrap into fp16 optimizer.
    if args.fp16:
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=args.loss_scale,
                                   dynamic_loss_scale=args.dynamic_loss_scale,
                                   dynamic_loss_args={
                                       'scale_window': args.loss_scale_window,
                                       'min_scale': args.min_scale,
                                       'delayed_shift': args.hysteresis})

    return optimizer


def get_learning_rate_scheduler(optimizer):
    """Build the learning rate scheduler."""
    args = get_args()

    # Add linear learning rate scheduler.
    if args.lr_decay_iters is not None:
        num_iters = args.lr_decay_iters
    else:
        num_iters = args.train_iters
    num_iters = max(1, num_iters)
    init_step = 0
    warmup_iter = args.warmup * num_iters
    lr_scheduler = AnnealingLR(
        optimizer,
        start_lr=args.lr,
        warmup_iter=warmup_iter,
        total_iters=num_iters,
        decay_style=args.lr_decay_style,
        last_iter=init_step,
        min_lr=args.min_lr,
        use_checkpoint_lr_scheduler=args.use_checkpoint_lr_scheduler,
        override_lr_scheduler=args.override_lr_scheduler)

    return lr_scheduler


def setup_model_and_optimizer(model_provider_func):
    """Setup model and optimizer."""
    args = get_args()

    model = get_model(model_provider_func)
    optimizer = get_optimizer(model)
    lr_scheduler = get_learning_rate_scheduler(optimizer)

    if args.deepspeed:
        print_rank_0("DeepSpeed is enabled.")

        model, optimizer, _, lr_scheduler = deepspeed.initialize(
            model=model,
            optimizer=optimizer,
            args=args,
            lr_scheduler=lr_scheduler,
            mpu=mpu if args.pipe_parallel_size == 0 else None,
            dist_init_required=False)

        if args.pipe_parallel_size > 0:
            model.set_batch_fn(model.module._megatron_batch_fn)

    if args.load is not None:
        args.iteration = load_checkpoint(model, optimizer, lr_scheduler)
    else:
        args.iteration = 0

    # get model without FP16 and/or TorchDDP wrappers
    unwrapped_model = model
    while hasattr(unwrapped_model, 'module'):
        unwrapped_model = unwrapped_model.module

    if args.iteration == 0 and hasattr(unwrapped_model,
                                       'init_state_dict_from_bert'):
        print("Initializing ICT from pretrained BERT model", flush=True)
        unwrapped_model.init_state_dict_from_bert()

    return model, optimizer, lr_scheduler


def backward_step(optimizer, model, loss):
    """Backward step."""
    args = get_args()
    timers = get_timers()

    # Backward pass.
    timers('backward-backward').start()
    if args.deepspeed:
        model.backward(loss)
    else:
        optimizer.zero_grad(set_grads_to_None=True)
        if args.fp16:
            optimizer.backward(loss, update_master_grads=False)
        else:
            loss.backward()
    timers('backward-backward').stop()

    if args.deepspeed:
        # DeepSpeed backward propagation already addressed all reduce communication.
        # Reset the timer to avoid breaking timer logs below.
        timers('backward-allreduce').reset()
    else:
        # All-reduce if needed.
        if args.DDP_impl == 'local':
            timers('backward-allreduce').start()
            model.allreduce_params(reduce_after=False,
                                   fp32_allreduce=args.fp32_allreduce)
            timers('backward-allreduce').stop()

    if not args.deepspeed:
        # Update master gradients.
        timers('backward-master-grad').start()
        if args.fp16:
            optimizer.update_master_grads()
        timers('backward-master-grad').stop()

        # Clipping gradients helps prevent the exploding gradient.
        timers('backward-clip-grad').start()
        if args.clip_grad > 0:
            if not args.fp16:
                mpu.clip_grad_norm(model.parameters(), args.clip_grad)
            else:
                optimizer.clip_master_grads(args.clip_grad)
        timers('backward-clip-grad').stop()


def train_step(forward_step_func, data_iterator,
               model, optimizer, lr_scheduler):
    """Single training step."""
    args = get_args()
    timers = get_timers()

    # Pipeline parallelism schedules forward/backward/step
    if args.pipe_parallel_size > 0:
        return train_step_pipe(model, data_iterator)

    # Forward model for one step.
    timers('forward').start()
    loss, loss_reduced = forward_step_func(data_iterator, model)
    timers('forward').stop()

    # Calculate gradients, reduce across processes, and clip.
    timers('backward').start()
    backward_step(optimizer, model, loss)
    timers('backward').stop()

    # Update parameters.
    skipped_iter = 0
    timers('optimizer').start()
    if args.deepspeed:
        model.step()
    else:
        optimizer.step()

        # Update learning rate.
        if not (args.fp16 and optimizer.overflow):
            lr_scheduler.step()
        else:
            skipped_iter = 1
    timers('optimizer').stop()

    return loss_reduced, skipped_iter


def train_step_pipe(model, data_iterator):
    """Single training step with DeepSpeed's pipeline parallel engine."""
    args = get_args()
    timers = get_timers()

    assert args.deepspeed
    loss = model.train_batch(data_iter=data_iterator)
    loss_dict = {'lm loss': loss}
    if args.fp16 and model.optimizer.overflow:
        skipped_iter = 1
    else:
        skipped_iter = 0

    # Don't break Megatron's timers because we changed code paths.
    for t in ['forward', 'backward', 'allreduce', 'optimizer',
              'batch generator', 'data loader']:
        timers(t).reset()
    return loss_dict, skipped_iter


def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                 loss_scale, report_memory_flag, skipped_iter):
    """Log training information such as losses, timing, ...."""
    args = get_args()
    timers = get_timers()
    writer = get_tensorboard_writer()

    # Update losses.
    skipped_iters_key = 'skipped iterations'
    total_loss_dict[skipped_iters_key] = total_loss_dict.get(
        skipped_iters_key, 0) + skipped_iter
    got_nan_key = 'got nan'

    got_nan = False
    for key in loss_dict:
        if not skipped_iter:
            total_loss_dict[key] = total_loss_dict.get(key, 0.) + loss_dict[key]
        else:
            value = loss_dict[key].float().sum().item()
            is_nan = value == float('inf') or \
                value == -float('inf') or \
                value != value
            got_nan = got_nan or is_nan

    total_loss_dict[got_nan_key] = total_loss_dict.get(
        got_nan_key, 0) + int(got_nan)

    # Logging.
    timers_to_log = []

    def add_to_logging(name):
        if name in timers.timers:
            timers_to_log.append(name)
    add_to_logging('forward')
    add_to_logging('backward')
    add_to_logging('backward-backward')
    add_to_logging('backward-allreduce')
    add_to_logging('backward-master-grad')
    add_to_logging('backward-clip-grad')
    add_to_logging('optimizer')
    add_to_logging('batch generator')

    # Tensorboard values.
    if writer and torch.distributed.get_rank() == 0:
        writer.add_scalar('learning_rate', learning_rate, iteration)
        for key in loss_dict:
            writer.add_scalar(key, loss_dict[key], iteration)
        if args.fp16:
            writer.add_scalar('loss_scale', loss_scale, iteration)
        normalizer = iteration % args.log_interval
        if normalizer == 0:
            normalizer = args.log_interval
        timers.write(timers_to_log, writer, iteration,
                     normalizer=normalizer)

    if iteration % args.log_interval == 0:
        elapsed_time = timers('interval time').elapsed()
        if writer and torch.distributed.get_rank() == 0:
            writer.add_scalar('iteration_time',
                              elapsed_time / args.log_interval, iteration)
        log_string = ' iteration {:8d}/{:8d} |'.format(iteration,
                                                       args.train_iters)
        log_string += ' elapsed time per iteration (ms): {:.1f} |'.format(
            elapsed_time * 1000.0 / args.log_interval)
        log_string += ' learning rate: {:.3E} |'.format(learning_rate)
        num_iterations = max(
            1, args.log_interval - total_loss_dict[skipped_iters_key])
        for key in total_loss_dict:
            if key not in [skipped_iters_key, got_nan_key]:
                avg = total_loss_dict[key] / float(num_iterations)
                log_string += ' {}: {:.6E} |'.format(key, avg)
                total_loss_dict[key] = 0.0
        if args.fp16:
            log_string += ' loss scale: {:.1f} |'.format(loss_scale)
        log_string += ' number of skipped iterations: {:3d} |'.format(
            total_loss_dict[skipped_iters_key])
        log_string += ' number of nan iterations: {:3d} |'.format(
            total_loss_dict[got_nan_key])
        total_loss_dict[skipped_iters_key] = 0
        total_loss_dict[got_nan_key] = 0
        print_rank_0(log_string)
        if report_memory_flag:
            report_memory('after {} iterations'.format(iteration))
            report_memory_flag = False
        timers.log(timers_to_log, normalizer=args.log_interval)

    return report_memory_flag


def train(forward_step_func, model, optimizer, lr_scheduler,
          train_data_iterator, valid_data_iterator):
    """Train the model function."""
    args = get_args()
    timers = get_timers()

    # Turn on training mode which enables dropout.
    model.train()

    # Tracking loss.
    total_loss_dict = {}

    # Iterations.
    iteration = args.iteration

    timers('interval time').start()
    report_memory_flag = True
    while iteration < args.train_iters:
        loss_dict, skipped_iter = train_step(forward_step_func,
                                             train_data_iterator,
                                             model,
                                             optimizer,
                                             lr_scheduler)
        iteration += 1

        # Logging.
        loss_scale = None
        if args.fp16:
            loss_scale = optimizer.cur_scale if args.deepspeed else optimizer.loss_scale
        report_memory_flag = training_log(loss_dict, total_loss_dict,
                                          optimizer.param_groups[0]['lr'],
                                          iteration, loss_scale,
                                          report_memory_flag, skipped_iter)

        # Autoresume
        if args.adlr_autoresume and \
           (iteration % args.adlr_autoresume_interval == 0):
            check_adlr_autoresume_termination(iteration, model, optimizer,
                                              lr_scheduler)

        # Checkpointing
        if args.save and args.save_interval and \
           iteration % args.save_interval == 0:
            save_checkpoint(iteration, model, optimizer, lr_scheduler)

        # Evaluation
        if args.eval_interval and iteration % args.eval_interval == 0 and \
           args.do_valid:
            prefix = 'iteration {}'.format(iteration)
            evaluate_and_print_results(prefix, forward_step_func,
                                       valid_data_iterator, model,
                                       iteration, False)

        if args.exit_interval and iteration % args.exit_interval == 0:
            torch.distributed.barrier()
            time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            rank = torch.distributed.get_rank()
            print_rank_0('rank: {} | time: {} | exiting the program at '
                         'iteration {}'.format(rank, time_str, iteration))
            sys.exit()

    return iteration


def evaluate(forward_step_func, data_iterator, model, verbose=False):
    """Evaluation."""
    args = get_args()

    # Turn on evaluation mode which disables dropout.
    model.eval()

    total_loss_dict = {}

    with torch.no_grad():
        iteration = 0
        while iteration < args.eval_iters:
            iteration += 1
            if verbose and iteration % args.log_interval == 0:
                print_rank_0('Evaluating iter {}/{}'.format(iteration,
                                                            args.eval_iters))
            # Forward evaluation.
            _, loss_dict = forward_step_func(data_iterator, model)

            # When contiguous memory optimizations are enabled, the buffers
            # allocated by the optimizations are deallocated during backward pass;
            # in the absence of backward pass the buffers should be reset after each
            # forward pass.
            if args.deepspeed and args.deepspeed_activation_checkpointing:
                deepspeed.checkpointing.reset()

            # Reduce across processes.
            for key in loss_dict:
                total_loss_dict[key] = total_loss_dict.get(key, 0.) + \
                    loss_dict[key]
    # Move model back to the train mode.
    model.train()

    for key in total_loss_dict:
        total_loss_dict[key] /= args.eval_iters

    return total_loss_dict


def evaluate_and_print_results(prefix, forward_step_func,
                               data_iterator, model,
                               iteration, verbose=False):
    """Helper function to evaluate and dump results on screen."""
    writer = get_tensorboard_writer()

    # Pipeline parallelism needs eval_batch() instead of a simple forward().
    args = get_args()
    if args.pipe_parallel_size > 0:
        def _eval_helper(data_iter, pipe_model):
            loss = model.eval_batch(data_iter)
            return None, {'lm loss': loss}
        forward_step_func = _eval_helper

    total_loss_dict = evaluate(forward_step_func, data_iterator, model, verbose)
    string = ' validation loss at {} | '.format(prefix)
    for key in total_loss_dict:
        string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item())
        ppl = math.exp(min(20, total_loss_dict[key].item()))
        string += '{} PPL: {:.6E} | '.format(key, ppl)
        if writer and torch.distributed.get_rank() == 0:
            writer.add_scalar('{} value'.format(key),
                              total_loss_dict[key].item(),
                              iteration)
            writer.add_scalar('{} ppl'.format(key), ppl, iteration)

    length = len(string) + 1
    print_rank_0('-' * length)
    print_rank_0(string)
    print_rank_0('-' * length)


def build_train_valid_test_data_iterators(
        build_train_valid_test_datasets_provider):
    """Build train, valid, and test data iterators."""
    args = get_args()

    (train_dataloader, valid_dataloader, test_dataloader) = (None, None, None)

    print_rank_0('> building train, validation, and test datasets ...')

    # Ensure only the first/last pipeline stages have data loaders
    if args.pipe_parallel_size > 0:
        is_first_stage = mpu.get_pipe_parallel_rank() == 0
        is_last_stage = mpu.get_pipe_parallel_rank() == mpu.get_pipe_parallel_world_size() - 1
        pipe_load = is_first_stage or is_last_stage
    else:
        pipe_load = True

    # Data loader only on rank 0 of each model parallel group.
    if mpu.get_model_parallel_rank() == 0 and pipe_load:
        # Rank, size, and global batch size.
        data_parallel_size = mpu.get_data_parallel_world_size()
        global_batch_size = args.batch_size * data_parallel_size * args.gas

        # Number of train/valid/test samples.
        train_iters = args.train_iters
        eval_iters = (train_iters // args.eval_interval + 1) * args.eval_iters
        test_iters = args.eval_iters
        train_val_test_num_samples = [train_iters * global_batch_size,
                                      eval_iters * global_batch_size,
                                      test_iters * global_batch_size]
        print_rank_0(' > datasets target sizes (minimum size):')
        print_rank_0('    train:      {}'.format(train_val_test_num_samples[0]))
        print_rank_0('    validation: {}'.format(train_val_test_num_samples[1]))
        print_rank_0('    test:       {}'.format(train_val_test_num_samples[2]))

        # Build the datasets.
        train_ds, valid_ds, test_ds = build_train_valid_test_datasets_provider(
            train_val_test_num_samples)

        # Build dataloders.
        train_dataloader = make_data_loader(train_ds)
        valid_dataloader = make_data_loader(valid_ds)
        test_dataloader = make_data_loader(test_ds)

        # Flags to know if we need to do training/validation/testing.
        do_train = train_dataloader is not None and args.train_iters > 0
        do_valid = valid_dataloader is not None and args.eval_iters > 0
        do_test = test_dataloader is not None and args.eval_iters > 0
        # Need to broadcast num_tokens and num_type_tokens.
        flags = torch.cuda.LongTensor(
            [int(do_train), int(do_valid), int(do_test)])
    else:
        flags = torch.cuda.LongTensor([0, 0, 0])

    # Broadcast num tokens.
    if args.pipe_parallel_size > 0:
        # Only first/last pipeline stages have data loaders, so pipeline parallelism should
        # broadcast globally instead of just the model parallel group.
        torch.distributed.broadcast(flags, src=0)
    else:
        torch.distributed.broadcast(flags,
                                    mpu.get_model_parallel_src_rank(),
                                    group=mpu.get_model_parallel_group())
    args.do_train = flags[0].item()
    args.do_valid = flags[1].item()
    args.do_test = flags[2].item()

    # Shift the start iterations.
    if train_dataloader is not None:
        train_dataloader.batch_sampler.start_iter = args.iteration % \
            len(train_dataloader)
        print_rank_0('setting training data start iteration to {}'.format(
            train_dataloader.batch_sampler.start_iter))
    if valid_dataloader is not None:
        start_iter_val = (args.iteration // args.eval_interval) * \
            args.eval_iters
        valid_dataloader.batch_sampler.start_iter = start_iter_val % \
            len(valid_dataloader)
        print_rank_0('setting validation data start iteration to {}'.format(
            valid_dataloader.batch_sampler.start_iter))

    # Build iterators.
    if train_dataloader is not None:
        train_data_iterator = iter(train_dataloader)
    else:
        train_data_iterator = None

    if valid_dataloader is not None:
        valid_data_iterator = iter(valid_dataloader)
    else:
        valid_data_iterator = None

    if test_dataloader is not None:
        test_data_iterator = iter(test_dataloader)
    else:
        test_data_iterator = None

    return train_data_iterator, valid_data_iterator, test_data_iterator
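A quick sketch (not part of this commit) of the sample-count arithmetic in build_train_valid_test_data_iterators above; all values are hypothetical, and `gas` stands for gradient accumulation steps:

# Not part of the commit: example numbers for the dataset target sizes.
batch_size = 8              # per-GPU micro batch
data_parallel_size = 4
gas = 2
global_batch_size = batch_size * data_parallel_size * gas   # 64 samples per iteration

train_iters = 1000
eval_interval = 100
eval_iters = 10
train_samples = train_iters * global_batch_size                                      # 64000
eval_samples = (train_iters // eval_interval + 1) * eval_iters * global_batch_size   # 7040
test_samples = eval_iters * global_batch_size                                        # 640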
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/megatron/utils.py
0 → 100644
View file @
316d3f90
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""General utilities for Megatron."""
import sys

import torch

from megatron import get_args
from megatron import print_rank_0
from megatron import get_adlr_autoresume
from megatron import mpu
from megatron.checkpointing import save_checkpoint
from megatron.data.samplers import DistributedBatchSampler
from megatron.fp16 import FP16_Optimizer


def reduce_losses(losses):
    """Reduce a tensor of losses across all GPUs."""
    reduced_losses = torch.cat(
        [loss.clone().detach().view(1) for loss in losses])
    torch.distributed.all_reduce(reduced_losses)
    reduced_losses = reduced_losses / torch.distributed.get_world_size()

    return reduced_losses


def report_memory(name):
    """Simple GPU memory report."""
    mega_bytes = 1024.0 * 1024.0
    string = name + ' memory (MB)'
    string += ' | allocated: {}'.format(
        torch.cuda.memory_allocated() / mega_bytes)
    string += ' | max allocated: {}'.format(
        torch.cuda.max_memory_allocated() / mega_bytes)
    string += ' | reserved: {}'.format(
        torch.cuda.memory_reserved() / mega_bytes)
    string += ' | max reserved: {}'.format(
        torch.cuda.max_memory_reserved() / mega_bytes)
    print_rank_0(string)


def print_params_min_max_norm(optimizer, iteration):
    """Print min, max, and norm of all parameters."""
    index = 0
    rank = torch.distributed.get_rank()
    string = 'iteration, rank, index, model-parallel, min, max, norm\n'
    optimizer_ = optimizer
    if isinstance(optimizer, FP16_Optimizer):
        optimizer_ = optimizer.optimizer
    for param_group in optimizer_.param_groups:
        for param in param_group['params']:
            index += 1
            min_ = param.data.min()
            max_ = param.data.max()
            norm = param.data.norm()
            string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format(
                iteration, rank, index, int(param.model_parallel))
            string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm)
    print(string, flush=True)


def check_adlr_autoresume_termination(iteration, model, optimizer,
                                      lr_scheduler):
    """Check for autoresume signal and exit if it is received."""
    args = get_args()
    autoresume = get_adlr_autoresume()
    # Add barrier to ensure consistency.
    torch.distributed.barrier()
    if autoresume.termination_requested():
        if args.save:
            save_checkpoint(iteration, model, optimizer, lr_scheduler)
        print_rank_0(">>> autoresume termination request found!")
        if torch.distributed.get_rank() == 0:
            autoresume.request_resume()
        print_rank_0(">>> training terminated. Returning")
        sys.exit(0)


def make_data_loader(dataset):
    """Build dataloader given an input dataset."""
    if dataset is None:
        return None
    args = get_args()

    # Data parallel arguments.
    world_size = mpu.get_data_parallel_world_size()
    rank = mpu.get_data_parallel_rank()
    global_batch_size = args.batch_size * world_size
    num_workers = args.num_workers

    # Use a simple sampler with distributed batch sampler.
    sampler = torch.utils.data.SequentialSampler(dataset)
    batch_sampler = DistributedBatchSampler(sampler=sampler,
                                            batch_size=global_batch_size,
                                            drop_last=True,
                                            rank=rank,
                                            world_size=world_size)
    # Torch dataloader.
    return torch.utils.data.DataLoader(dataset,
                                       batch_sampler=batch_sampler,
                                       num_workers=num_workers,
                                       pin_memory=True)


def get_ltor_masks_and_position_ids(data,
                                    eod_token,
                                    reset_position_ids,
                                    reset_attention_mask,
                                    eod_mask_loss):
    """Build masks and position id for left to right model."""

    # Extract batch size and sequence length.
    batch_size, seq_length = data.size()

    # Attention mask (lower triangular).
    if reset_attention_mask:
        att_mask_batch = batch_size
    else:
        att_mask_batch = 1
    attention_mask = torch.tril(torch.ones(
        (att_mask_batch, seq_length, seq_length), device=data.device)).view(
            att_mask_batch, 1, seq_length, seq_length)

    # Loss mask.
    loss_mask = torch.ones(data.size(), dtype=torch.float, device=data.device)
    if eod_mask_loss:
        loss_mask[data == eod_token] = 0.0

    # Position ids.
    position_ids = torch.arange(seq_length, dtype=torch.long,
                                device=data.device)
    position_ids = position_ids.unsqueeze(0).expand_as(data)
    # We need to clone as the ids will be modified based on batch index.
    if reset_position_ids:
        position_ids = position_ids.clone()

    if reset_position_ids or reset_attention_mask:
        # Loop through the batches:
        for b in range(batch_size):

            # Find indices where EOD token is.
            eod_index = position_ids[b, data[b] == eod_token]
            # Detach indices from positions if going to modify positions.
            if reset_position_ids:
                eod_index = eod_index.clone()

            # Loop through EOD indices:
            prev_index = 0
            for j in range(eod_index.size()[0]):
                i = eod_index[j]
                # Mask attention loss.
                if reset_attention_mask:
                    attention_mask[b, 0, (i + 1):, :(i + 1)] = 0
                # Reset positions.
                if reset_position_ids:
                    position_ids[b, (i + 1):] -= (i + 1 - prev_index)
                    prev_index = i + 1

    # Convert attention mask to binary:
    attention_mask = (attention_mask < 0.5)

    return attention_mask, loss_mask, position_ids
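A small sketch (not part of this commit) of what get_ltor_masks_and_position_ids produces for a toy batch; the token ids and the choice of 0 as the EOD id are made up:

# Not part of the commit: toy call showing document-reset behaviour.
import torch
from megatron.utils import get_ltor_masks_and_position_ids

data = torch.tensor([[5, 7, 0, 9, 4]])   # one sequence with an EOD (id 0) in the middle
attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
    data, eod_token=0, reset_position_ids=True,
    reset_attention_mask=True, eod_mask_loss=True)

# loss_mask zeroes the EOD position:   tensor([[1., 1., 0., 1., 1.]])
# position_ids restart after the EOD:  tensor([[0, 1, 2, 0, 1]])
# attention_mask is True where attention is *blocked* (mask < 0.5), so tokens
# after the EOD cannot attend to tokens before it.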
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/pretrain_bert.py
0 → 100644
View file @
316d3f90
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pretrain BERT"""
import torch
import torch.nn.functional as F

from megatron import get_args
from megatron import print_rank_0
from megatron import get_timers
from megatron import mpu
from megatron.data.dataset_utils import build_train_valid_test_datasets
from megatron.model import BertModel
from megatron.training import pretrain
from megatron.utils import reduce_losses


def model_provider():
    """Build the model."""

    print_rank_0('building BERT model ...')

    model = BertModel(
        num_tokentypes=2,
        add_binary_head=True,
        parallel_output=True)

    return model


def get_batch(data_iterator):
    """Build the batch."""

    # Items and their type.
    keys = ['text', 'types', 'labels', 'is_random', 'loss_mask', 'padding_mask']
    datatype = torch.int64

    # Broadcast data.
    if data_iterator is not None:
        data = next(data_iterator)
    else:
        data = None
    data_b = mpu.broadcast_data(keys, data, datatype)

    # Unpack.
    tokens = data_b['text'].long()
    types = data_b['types'].long()
    sentence_order = data_b['is_random'].long()
    loss_mask = data_b['loss_mask'].float()
    lm_labels = data_b['labels'].long()
    padding_mask = data_b['padding_mask'].long()

    return tokens, types, sentence_order, loss_mask, lm_labels, padding_mask


def forward_step(data_iterator, model):
    """Forward step."""
    args = get_args()
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    tokens, types, sentence_order, loss_mask, lm_labels, padding_mask \
        = get_batch(data_iterator)
    timers('batch generator').stop()

    # Forward model. lm_labels
    lm_loss_, sop_logits = model(tokens, padding_mask,
                                 tokentype_ids=types,
                                 lm_labels=lm_labels)

    sop_loss = F.cross_entropy(sop_logits.view(-1, 2).float(),
                               sentence_order.view(-1),
                               ignore_index=-1)

    lm_loss = torch.sum(
        lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()

    loss = lm_loss + sop_loss

    reduced_losses = reduce_losses([lm_loss, sop_loss])

    return loss, {'lm loss': reduced_losses[0], 'sop loss': reduced_losses[1]}


def train_valid_test_datasets_provider(train_val_test_num_samples):
    """Build train, valid, and test datasets."""
    args = get_args()

    print_rank_0('> building train, validation, and test datasets '
                 'for BERT ...')
    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
        data_prefix=args.data_path,
        data_impl=args.data_impl,
        splits_string=args.split,
        train_valid_test_num_samples=train_val_test_num_samples,
        max_seq_length=args.seq_length,
        masked_lm_prob=args.mask_prob,
        short_seq_prob=args.short_seq_prob,
        seed=args.seed,
        skip_warmup=(not args.mmap_warmup))
    print_rank_0("> finished creating BERT datasets ...")

    return train_ds, valid_ds, test_ds


if __name__ == "__main__":

    pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
             args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
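A minimal sketch (not part of this commit) of the loss masking used in forward_step above: per-token LM losses are averaged only over the positions selected by loss_mask. The tensors below are toy values:

# Not part of the commit: toy illustration of masked-LM loss averaging.
import torch

per_token_loss = torch.tensor([2.0, 0.5, 3.0, 1.5])   # one loss value per token
loss_mask = torch.tensor([1.0, 0.0, 1.0, 0.0])        # 1 = masked-LM position, 0 = ignore
lm_loss = torch.sum(per_token_loss * loss_mask) / loss_mask.sum()
print(lm_loss)   # tensor(2.5000) -- mean of the two selected positions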
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/pretrain_gpt2.py
0 → 100644
View file @
316d3f90
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pretrain GPT2"""
import torch

from megatron import get_args
from megatron import print_rank_0
from megatron import get_timers
from megatron import get_tokenizer
from megatron import mpu
from megatron.data.gpt2_dataset import build_train_valid_test_datasets
from megatron.model import GPT2Model, GPT2ModelPipe
from megatron.training import pretrain
from megatron.utils import get_ltor_masks_and_position_ids
from megatron.utils import reduce_losses

from megatron.fp16 import fp32_to_fp16

# pretend this is a great DeepSpeed change too


def model_provider():
    """Build the model."""

    args = get_args()

    print_rank_0('building GPT2 model ...')
    if args.pipe_parallel_size == 0:
        model = GPT2Model(num_tokentypes=0, parallel_output=True)
    else:
        model = GPT2ModelPipe(num_tokentypes=0, parallel_output=True,
                              topology=mpu.get_topology())
        # This is a hack to give us a reference to get_batch_pipe from within training.py
        # We need to call model.set_batch_fn after deepspeed.initialize
        model._megatron_batch_fn = get_batch_pipe

    return model


def get_batch(data_iterator):
    """Generate a batch"""
    args = get_args()
    tokenizer = get_tokenizer()

    # Items and their type.
    keys = ['text']
    datatype = torch.int64

    # Broadcast data.
    if data_iterator is not None:
        data = next(data_iterator)
    else:
        data = None
    data_b = mpu.broadcast_data(keys, data, datatype)

    # Unpack.
    tokens_ = data_b['text'].long()
    labels = tokens_[:, 1:].contiguous()
    tokens = tokens_[:, :-1].contiguous()

    # Get the masks and position ids.
    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        tokens,
        tokenizer.eod,
        args.reset_position_ids,
        args.reset_attention_mask,
        args.eod_mask_loss)

    return tokens, labels, loss_mask, attention_mask, position_ids


def get_batch_pipe(data):
    """A modification of get_batch() to work with the latest batch instead of an iterator."""
    args = get_args()
    tokenizer = get_tokenizer()

    # Items and their type.
    keys = ['text']
    datatype = torch.int64

    # Broadcast data.
    data_b = mpu.broadcast_data(keys, data, datatype)

    # Unpack.
    tokens_ = data_b['text'].long()
    labels = tokens_[:, 1:].contiguous()
    tokens = tokens_[:, :-1].contiguous()

    # Get the masks and position ids.
    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        tokens,
        tokenizer.eod,
        args.reset_position_ids,
        args.reset_attention_mask,
        args.eod_mask_loss)

    # unpack data
    if args.fp16:
        # cast to fp16 because pipeline parallelism skips the FP16 wrapper.
        return fp32_to_fp16((tokens, position_ids, attention_mask)), \
            fp32_to_fp16((labels, loss_mask))
    else:
        return (tokens, position_ids, attention_mask), (labels, loss_mask)


def forward_step(data_iterator, model):
    """Forward step."""
    args = get_args()
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
        data_iterator)
    timers('batch generator').stop()

    # Forward model.
    losses = model(tokens, position_ids, attention_mask, labels=labels)
    loss_mask = loss_mask.view(-1)
    loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum()

    # Reduce loss for logging.
    reduced_loss = reduce_losses([loss])

    return loss, {'lm loss': reduced_loss[0]}


def train_valid_test_datasets_provider(train_val_test_num_samples):
    """Build train, valid, and test datasets."""
    args = get_args()

    print_rank_0('> building train, validation, and test datasets '
                 'for GPT2 ...')
    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
        data_prefix=args.data_path,
        data_impl=args.data_impl,
        splits_string=args.split,
        train_valid_test_num_samples=train_val_test_num_samples,
        seq_length=args.seq_length,
        seed=args.seed,
        skip_warmup=(not args.mmap_warmup))
    print_rank_0("> finished creating GPT2 datasets ...")

    return train_ds, valid_ds, test_ds


if __name__ == "__main__":

    pretrain(train_valid_test_datasets_provider, model_provider, forward_step,
             args_defaults={'tokenizer_type': 'GPT2BPETokenizer'})
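A small sketch (not part of this commit) of the label shift in get_batch above: the model predicts token t+1 from tokens up to t, so the inputs drop the last token and the labels drop the first. Toy ids only:

# Not part of the commit: next-token label shifting.
import torch

tokens_ = torch.tensor([[10, 11, 12, 13]])
tokens = tokens_[:, :-1].contiguous()   # tensor([[10, 11, 12]]) -- model input
labels = tokens_[:, 1:].contiguous()    # tensor([[11, 12, 13]]) -- next-token targets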
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/pretrain_ict.py
0 → 100644
View file @
316d3f90
# coding=utf-8
# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pretrain BERT for Inverse Cloze Task"""
import torch
import torch.distributed as dist
import torch.nn.functional as F

from megatron import get_args
from megatron import print_rank_0
from megatron import get_timers
from megatron import mpu
from megatron.data.dataset_utils import build_train_valid_test_datasets
from megatron.training import pretrain
from megatron.utils import reduce_losses
from megatron.model.realm_model import general_ict_model_provider
from megatron.data.realm_dataset_utils import get_ict_batch


def pretrain_ict_model_provider():
    return general_ict_model_provider(False, False)


def get_group_world_size_rank():
    group = mpu.get_data_parallel_group()
    rank = torch.distributed.get_rank(group=group)
    world_size = torch.distributed.get_world_size(group=group)
    return group, rank, world_size


class AllgatherFromDataParallelRegion(torch.autograd.Function):

    @staticmethod
    def forward(ctx, input_):
        assert input_.dim() == 2
        group, rank, world_size = get_group_world_size_rank()

        tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
        tensor_list[rank] = input_
        torch.distributed.all_gather(tensor_list, input_, group=group)

        output = torch.cat(tensor_list, dim=0).contiguous()
        return output

    @staticmethod
    def backward(ctx, grad_output):
        group, rank, world_size = get_group_world_size_rank()

        assert grad_output.shape[0] % world_size == 0
        dim_size = grad_output.shape[0] // world_size
        output_list = torch.split(grad_output, dim_size, dim=0)

        # get chunk from this rank
        output = output_list[rank].contiguous()
        return output


def forward_step(data_iterator, model):
    """Forward step."""
    args = get_args()
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    query_tokens, query_pad_mask, \
        block_tokens, block_pad_mask, block_indices = get_ict_batch(data_iterator)
    timers('batch generator').stop()

    # Forward model.
    query_logits, block_logits = model(query_tokens, query_pad_mask,
                                       block_tokens, block_pad_mask)

    local_batch_size = query_logits.shape[0]
    global_batch_size = dist.get_world_size() * local_batch_size

    # recall we assert that model_parallel_size == 1
    all_query_logits = AllgatherFromDataParallelRegion.apply(query_logits)
    all_block_logits = AllgatherFromDataParallelRegion.apply(block_logits)

    # scores are inner products between query and block embeddings
    retrieval_scores = all_query_logits.float().matmul(
        torch.transpose(all_block_logits, 0, 1).float())
    softmaxed = F.softmax(retrieval_scores, dim=1)
    sorted_vals, sorted_indices = torch.topk(
        softmaxed, k=softmaxed.shape[1], sorted=True)

    def topk_accuracy(k):
        return torch.cuda.FloatTensor(
            [sum([int(i in sorted_indices[i, :k])
                  for i in range(global_batch_size)]) / global_batch_size])

    topk_accs = [topk_accuracy(int(k)) for k in args.report_topk_accuracies]

    retrieval_loss = torch.nn.CrossEntropyLoss()(
        retrieval_scores, torch.arange(global_batch_size).long().cuda())
    reduced_losses = reduce_losses([retrieval_loss, *topk_accs])

    # create stats_dict with retrieval loss and all specified top-k accuracies
    topk_acc_dict = {'top{}_acc'.format(k): v for k, v in
                     zip(args.report_topk_accuracies, reduced_losses[1:])}
    stats_dict = dict(retrieval_loss=reduced_losses[0], **topk_acc_dict)
    return retrieval_loss, stats_dict


def train_valid_test_datasets_provider(train_val_test_num_samples):
    """Build train, valid and test datasets."""
    args = get_args()
    print_rank_0('> building train, validation, and test datasets '
                 'for BERT ICT...')

    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
        data_prefix=args.data_path,
        data_impl=args.data_impl,
        splits_string=args.split,
        train_valid_test_num_samples=train_val_test_num_samples,
        max_seq_length=args.seq_length,
        masked_lm_prob=args.mask_prob,
        short_seq_prob=args.short_seq_prob,
        seed=args.seed,
        skip_warmup=(not args.mmap_warmup),
        dataset_type='ict')
    print_rank_0("> finished creating BERT ICT datasets ...")

    return train_ds, valid_ds, test_ds


if __name__ == "__main__":
    pretrain(train_valid_test_datasets_provider,
             pretrain_ict_model_provider,
             forward_step,
             args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
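
The ICT forward_step above trains retrieval with in-batch negatives: every query is scored against every block in the gathered batch, and the correct block for query i is column i, which is exactly what the arange target of the cross-entropy encodes. A single-process sketch without the data-parallel all-gather, using random stand-in embeddings (illustrative only, not part of the commit):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
batch_size, hidden = 4, 8

# Stand-ins for the gathered query / block embeddings (single process, so no all_gather).
all_query_logits = torch.randn(batch_size, hidden)
all_block_logits = torch.randn(batch_size, hidden)

# Scores are inner products; row i should prefer column i (its own block).
retrieval_scores = all_query_logits.matmul(all_block_logits.t())
targets = torch.arange(batch_size)
retrieval_loss = torch.nn.CrossEntropyLoss()(retrieval_scores, targets)

# Top-k accuracy as in forward_step: is the true block among the k highest scores?
softmaxed = F.softmax(retrieval_scores, dim=1)
_, sorted_indices = torch.topk(softmaxed, k=batch_size, sorted=True)
top1 = sum(int(i in sorted_indices[i, :1]) for i in range(batch_size)) / batch_size
print(retrieval_loss.item(), top1)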
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/requirements.txt
0 → 100644
View file @
316d3f90
pybind11
torch
six
regex
numpy
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/setup.py
0 → 100644
View file @
316d3f90
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Setup for pip package for Megatron."""
import os
import sys
import setuptools

if sys.version_info < (3,):
    raise Exception("Python 2 is not supported by Megatron.")

from megatron.package_info import (
    __description__,
    __contact_names__,
    __url__,
    __download_url__,
    __keywords__,
    __license__,
    __package_name__,
    __version__,
)

with open("README.md", "r") as fh:
    long_description = fh.read()


###############################################################################
#                             Dependency Loading                             #
# %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #


def req_file(filename):
    with open(filename) as f:
        content = f.readlines()
    return [x.strip() for x in content]


install_requires = req_file("requirements.txt")

setuptools.setup(
    name=__package_name__,
    # Versions should comply with PEP440. For a discussion on single-sourcing
    # the version across setup.py and the project code, see
    # https://packaging.python.org/en/latest/single_source_version.html
    version=__version__,
    description=__description__,
    long_description=long_description,
    long_description_content_type="text/markdown",
    # The project's main homepage.
    url=__url__,
    author=__contact_names__,
    maintainer=__contact_names__,
    # The licence under which the project is released
    license=__license__,
    classifiers=[
        'Intended Audience :: Developers',
        'Intended Audience :: Science/Research',
        'Intended Audience :: Information Technology',
        # Indicate what your project relates to
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'Topic :: Software Development :: Libraries :: Python Modules',
        # Supported python versions
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        # Additional Setting
        'Environment :: Console',
        'Natural Language :: English',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.6',
    packages=setuptools.find_packages(),
    install_requires=install_requires,
    # Add in any packaged data.
    include_package_data=True,
    zip_safe=False,
    # PyPI package information.
    keywords=__keywords__
)
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/tasks/data_utils.py
0 → 100644
View file @
316d3f90
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tasks data utility."""
import re

import numpy as np


def clean_text(text):
    """Remove new lines and multiple spaces and adjust end of sentence dot."""

    text = text.replace("\n", " ")
    text = re.sub(r'\s+', ' ', text)
    for _ in range(3):
        text = text.replace(' . ', '. ')

    return text


def build_sample(ids, types, paddings, label, unique_id):
    """Convert to numpy and return a sample consumed by the batch producer."""

    ids_np = np.array(ids, dtype=np.int64)
    types_np = np.array(types, dtype=np.int64)
    paddings_np = np.array(paddings, dtype=np.int64)
    sample = ({'text': ids_np,
               'types': types_np,
               'padding_mask': paddings_np,
               'label': int(label),
               'uid': int(unique_id)})

    return sample


def build_tokens_types_paddings_from_text(text_a, text_b,
                                          tokenizer, max_seq_length):
    """Build token types and paddings, trim if needed, and pad if needed."""

    text_a_ids = tokenizer.tokenize(text_a)
    text_b_ids = None
    if text_b is not None:
        text_b_ids = tokenizer.tokenize(text_b)

    return build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids,
                                                max_seq_length, tokenizer.cls,
                                                tokenizer.sep, tokenizer.pad)


def build_tokens_types_paddings_from_ids(text_a_ids, text_b_ids, max_seq_length,
                                         cls_id, sep_id, pad_id):
    """Build token types and paddings, trim if needed, and pad if needed."""

    ids = []
    types = []
    paddings = []

    # [CLS].
    ids.append(cls_id)
    types.append(0)
    paddings.append(1)

    # A.
    len_text_a = len(text_a_ids)
    ids.extend(text_a_ids)
    types.extend([0] * len_text_a)
    paddings.extend([1] * len_text_a)

    # [SEP].
    ids.append(sep_id)
    types.append(0)
    paddings.append(1)

    # B.
    if text_b_ids is not None:
        len_text_b = len(text_b_ids)
        ids.extend(text_b_ids)
        types.extend([1] * len_text_b)
        paddings.extend([1] * len_text_b)

    # Cap the size.
    trimmed = False
    if len(ids) >= max_seq_length:
        max_seq_length_m1 = max_seq_length - 1
        ids = ids[0:max_seq_length_m1]
        types = types[0:max_seq_length_m1]
        paddings = paddings[0:max_seq_length_m1]
        trimmed = True

    # [SEP].
    if (text_b_ids is not None) or trimmed:
        ids.append(sep_id)
        if text_b_ids is None:
            types.append(0)
        else:
            types.append(1)
        paddings.append(1)

    # Padding.
    padding_length = max_seq_length - len(ids)
    if padding_length > 0:
        ids.extend([pad_id] * padding_length)
        types.extend([pad_id] * padding_length)
        paddings.extend([0] * padding_length)

    return ids, types, paddings
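
build_tokens_types_paddings_from_ids lays each sample out as [CLS] A [SEP] B [SEP] plus padding, with token-type 0 for segment A, type 1 for segment B, and a padding mask that is 1 on real tokens. A small sketch of the expected output, assuming the Megatron-LM-v1.1.5-3D_parallelism directory is on PYTHONPATH and using made-up vocabulary ids ([CLS]=101, [SEP]=102, [PAD]=0); illustrative only, not part of the commit:

from tasks.data_utils import build_tokens_types_paddings_from_ids

text_a_ids = [7, 8, 9]    # sentence A, already tokenized to ids
text_b_ids = [21, 22]     # sentence B
ids, types, paddings = build_tokens_types_paddings_from_ids(
    text_a_ids, text_b_ids, max_seq_length=10,
    cls_id=101, sep_id=102, pad_id=0)

print(ids)       # [101, 7, 8, 9, 102, 21, 22, 102, 0, 0]
print(types)     # [0, 0, 0, 0, 0, 1, 1, 1, 0, 0]
print(paddings)  # [1, 1, 1, 1, 1, 1, 1, 1, 0, 0]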
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/tasks/ensemble_classifier.py
0 → 100644
View file @
316d3f90
import os
import argparse
import collections

import numpy as np
import torch


def process_files(args):
    all_predictions = collections.OrderedDict()
    all_labels = collections.OrderedDict()
    all_uid = collections.OrderedDict()
    for path in args.paths:
        path = os.path.join(path, args.prediction_name)
        try:
            data = torch.load(path)
            for dataset in data:
                name, d = dataset
                predictions, labels, uid = d
                if name not in all_predictions:
                    all_predictions[name] = np.array(predictions)
                    if args.labels is None:
                        args.labels = [i for i in range(all_predictions[name].shape[1])]
                    if args.eval:
                        all_labels[name] = np.array(labels)
                    all_uid[name] = np.array(uid)
                else:
                    all_predictions[name] += np.array(predictions)
                    assert np.allclose(all_uid[name], np.array(uid))
        except Exception as e:
            print(e)
            continue
    return all_predictions, all_labels, all_uid


def get_threshold(all_predictions, all_labels, one_threshold=False):
    if one_threshold:
        # Fit a single threshold on all datasets combined.
        all_predictions = {'combined': np.concatenate(list(all_predictions.values()))}
        all_labels = {'combined': np.concatenate(list(all_labels.values()))}
    out_thresh = []
    for dataset in all_predictions:
        preds = all_predictions[dataset]
        labels = all_labels[dataset]
        out_thresh.append(calc_threshold(preds, labels))
    return out_thresh


def calc_threshold(p, l):
    trials = [(i) * (1. / 100.) for i in range(100)]
    best_acc = float('-inf')
    best_thresh = 0
    for t in trials:
        acc = ((apply_threshold(p, t).argmax(-1) == l).astype(float)).mean()
        if acc > best_acc:
            best_acc = acc
            best_thresh = t
    return best_thresh


def apply_threshold(preds, t):
    assert (np.allclose(preds.sum(-1), np.ones(preds.shape[0])))
    prob = preds[:, -1]
    thresholded = (prob >= t).astype(int)
    preds = np.zeros_like(preds)
    preds[np.arange(len(thresholded)), thresholded.reshape(-1)] = 1
    return preds


def threshold_predictions(all_predictions, threshold):
    if len(threshold) != len(all_predictions):
        threshold = [threshold[-1]] * (len(all_predictions) - len(threshold))
    for i, dataset in enumerate(all_predictions):
        thresh = threshold[i]
        preds = all_predictions[dataset]
        all_predictions[dataset] = apply_threshold(preds, thresh)
    return all_predictions


def postprocess_predictions(all_predictions, all_labels, args):
    for d in all_predictions:
        all_predictions[d] = all_predictions[d] / len(args.paths)

    if args.calc_threshold:
        args.threshold = get_threshold(all_predictions, all_labels, args.one_threshold)
        print('threshold', args.threshold)

    if args.threshold is not None:
        all_predictions = threshold_predictions(all_predictions, args.threshold)

    return all_predictions, all_labels


def write_predictions(all_predictions, all_labels, all_uid, args):
    all_correct = 0
    count = 0
    for dataset in all_predictions:
        preds = all_predictions[dataset]
        preds = np.argmax(preds, -1)
        if args.eval:
            correct = (preds == all_labels[dataset]).sum()
            num = len(all_labels[dataset])
            accuracy = correct / num
            count += num
            all_correct += correct
            accuracy = (preds == all_labels[dataset]).mean()
            print(accuracy)
        if not os.path.exists(os.path.join(args.outdir, dataset)):
            os.makedirs(os.path.join(args.outdir, dataset))
        outpath = os.path.join(
            args.outdir, dataset, os.path.splitext(
                args.prediction_name)[0] + '.tsv')
        with open(outpath, 'w') as f:
            f.write('id\tlabel\n')
            f.write('\n'.join(str(uid) + '\t' + str(args.labels[p])
                              for uid, p in zip(all_uid[dataset], preds.tolist())))
    if args.eval:
        print(all_correct / count)


def ensemble_predictions(args):
    all_predictions, all_labels, all_uid = process_files(args)
    all_predictions, all_labels = postprocess_predictions(all_predictions, all_labels, args)
    write_predictions(all_predictions, all_labels, all_uid, args)


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--paths', required=True, nargs='+',
                        help='paths to checkpoint directories used in ensemble')
    parser.add_argument('--eval', action='store_true',
                        help='compute accuracy metrics against labels (dev set)')
    parser.add_argument('--outdir',
                        help='directory to place ensembled predictions in')
    parser.add_argument('--prediction-name', default='test_predictions.pt',
                        help='name of predictions in checkpoint directories')
    parser.add_argument('--calc-threshold', action='store_true',
                        help='calculate threshold classification')
    parser.add_argument('--one-threshold', action='store_true',
                        help='use one threshold for all subdatasets')
    parser.add_argument('--threshold', nargs='+', default=None, type=float,
                        help='user supplied threshold for classification')
    parser.add_argument('--labels', nargs='+', default=None,
                        help='whitespace separated list of label names')
    args = parser.parse_args()

    ensemble_predictions(args)


if __name__ == '__main__':
    main()
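
calc_threshold grid-searches a decision threshold over [0.00, 0.99] for the positive-class probability (the last column of the averaged softmax outputs), and apply_threshold turns that threshold back into one-hot predictions. A toy binary example, again assuming the Megatron-LM-v1.1.5-3D_parallelism directory is importable (illustrative only, not part of the commit):

import numpy as np
from tasks.ensemble_classifier import apply_threshold, calc_threshold

# Toy binary predictions: each row sums to 1, column 1 is P(positive).
preds = np.array([[0.9, 0.1],
                  [0.4, 0.6],
                  [0.7, 0.3],
                  [0.2, 0.8]])
labels = np.array([0, 1, 1, 1])

# Grid-search the decision threshold that maximizes accuracy on these labels.
best_t = calc_threshold(preds, labels)
hard_preds = apply_threshold(preds, best_t).argmax(-1)
print(best_t, hard_preds, (hard_preds == labels).mean())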
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/tasks/eval_utils.py
0 → 100644
View file @
316d3f90
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation utilities."""
import os
import time

import torch

from megatron import get_args
from megatron import print_rank_0
from megatron import mpu
from tasks.finetune_utils import build_data_loader
from tasks.finetune_utils import process_batch


def accuracy_func_provider(single_dataset_provider):
    """Provide function that calculates accuracies."""
    args = get_args()

    # Build dataloaders.
    datapaths = args.valid_data
    dataloaders = []
    for datapath in datapaths:
        dataset = single_dataset_provider(datapath)
        dataloader = build_data_loader(
            dataset, args.batch_size, num_workers=args.num_workers,
            drop_last=(mpu.get_data_parallel_world_size() > 1))
        dataloaders.append((dataset.dataset_name, dataloader))

    def metrics_func(model, epoch, output_predictions=False):
        print_rank_0('calculating metrics ...')
        correct = 0
        total = 0
        if output_predictions:
            assert mpu.get_data_parallel_world_size() == 1
            named_predictions = []
            names = 'predictions'
        for name, dataloader in dataloaders:
            output = calculate_correct_answers(name, model, dataloader,
                                               epoch, output_predictions)
            if not output_predictions:
                correct_ans, total_count = output
            else:
                correct_ans, total_count, predictions = output
                named_predictions.append((name, predictions))
                names += '_' + name
            correct += correct_ans
            total += total_count
        percent = float(correct) * 100.0 / float(total)
        print_rank_0(' >> |epoch: {}| overall: correct / total = {} / {} = '
                     '{:.4f} %'.format(epoch, correct, total, percent))

        if output_predictions and torch.distributed.get_rank() == 0:
            assert args.load is not None
            filename = os.path.join(args.load, names + '.pt')
            torch.save(named_predictions, filename)

    return metrics_func


def calculate_correct_answers(name, model, dataloader,
                              epoch, output_predictions):
    """Calculate correct over total answers and return predictions if
    `output_predictions` is true."""

    start_time = time.time()
    model.eval()
    with torch.no_grad():
        # For all the batches in the dataset.
        total = 0
        correct = 0
        if output_predictions:
            # This option is only possible when data parallel size is 1.
            assert mpu.get_data_parallel_world_size() == 1
            softmaxes = []
            labels = []
            ids = []
        for _, batch in enumerate(dataloader):
            # Run the model forward.
            tokens, types, labels_, attention_mask = process_batch(batch)
            logits = model(tokens, attention_mask, types)
            # Add output predictions.
            if output_predictions:
                softmaxes.extend(torch.nn.Softmax(dim=-1)(
                    logits.float()).data.cpu().numpy().tolist())
                labels.extend(labels_.data.cpu().numpy().tolist())
                ids.extend(batch['uid'].cpu().numpy().tolist())
            # Compute the correct answers.
            predicted = torch.argmax(logits, dim=-1)
            corrects = (predicted == labels_)
            # Add to the counters.
            total += labels_.size(0)
            correct += corrects.sum().item()
    model.train()

    # Reduce.
    unreduced = torch.cuda.LongTensor([correct, total])
    torch.distributed.all_reduce(unreduced,
                                 group=mpu.get_data_parallel_group())

    # Print on screen.
    correct_ans = unreduced[0].item()
    total_count = unreduced[1].item()
    percent = float(correct_ans) * 100.0 / float(total_count)
    elapsed_time = time.time() - start_time
    print_rank_0(' > |epoch: {}| metrics for {}: correct / total '
                 '= {} / {} = {:.4f} %, elapsed time (sec): {:.3f}'.format(
                     epoch, name, correct_ans, total_count,
                     percent, elapsed_time))

    if output_predictions:
        return correct_ans, total_count, (softmaxes, labels, ids)
    return correct_ans, total_count
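
calculate_correct_answers keeps a [correct, total] counter per data-parallel rank and sums the counters with torch.distributed.all_reduce before computing the percentage, so the printed accuracy covers the whole validation set rather than one shard. A local stand-in for that reduction, summing hypothetical per-rank counters instead of calling all_reduce (illustrative only, not part of the commit):

import torch

# Toy [correct, total] counters as three data-parallel ranks would hold them.
per_rank = [torch.tensor([37, 50]), torch.tensor([41, 50]), torch.tensor([45, 50])]
# all_reduce with the default SUM op produces the element-wise sum on every rank.
unreduced = torch.stack(per_rank).sum(dim=0)

correct_ans = unreduced[0].item()
total_count = unreduced[1].item()
percent = float(correct_ans) * 100.0 / float(total_count)
print('correct / total = {} / {} = {:.4f} %'.format(correct_ans, total_count, percent))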
Deepspeed/Megatron-LM-v1.1.5-3D_parallelism/tasks/finetune_utils.py
0 → 100644
View file @
316d3f90
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Finetune utilities."""
import torch

from megatron import get_args
from megatron import print_rank_0
from megatron import get_timers
from megatron import mpu
from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint
from megatron.training import evaluate_and_print_results
from megatron.training import setup_model_and_optimizer
from megatron.training import train_step
from megatron.training import training_log
from megatron.utils import check_adlr_autoresume_termination
from megatron.utils import reduce_losses


def process_batch(batch):
    """Process batch and produce inputs for the model."""
    args = get_args()

    tokens = batch['text'].long().cuda().contiguous()
    types = batch['types'].long().cuda().contiguous()
    labels = batch['label'].long().cuda().contiguous()
    attention_mask = batch['padding_mask'].float().cuda().contiguous()
    if args.fp16:
        attention_mask = attention_mask.half()

    return tokens, types, labels, attention_mask


def _cross_entropy_forward_step(batch, model):
    """Simple forward step with cross-entropy loss."""
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
    try:
        batch_ = next(batch)
    except BaseException:
        batch_ = batch
    tokens, types, labels, attention_mask = process_batch(batch_)
    timers('batch generator').stop()

    # Forward model.
    logits = model(tokens, attention_mask, types)

    # Cross-entropy loss.
    loss_func = torch.nn.CrossEntropyLoss()
    loss = loss_func(logits.contiguous().float(), labels)

    # Reduce loss for logging.
    reduced_loss = reduce_losses([loss])

    return loss, {'lm loss': reduced_loss[0]}


def build_data_loader(dataset, batch_size, num_workers, drop_last):
    """Data loader. Note that batch-size is the local (per GPU) batch-size."""

    # Sampler.
    world_size = mpu.get_data_parallel_world_size()
    rank = mpu.get_data_parallel_rank()
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=world_size, rank=rank)

    # Data loader. Note that batch size is the per GPU batch size.
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              sampler=sampler,
                                              shuffle=False,
                                              num_workers=num_workers,
                                              drop_last=drop_last,
                                              pin_memory=True)

    return data_loader


def _build_infinite_size_dataloader(dataloader):
    """Build a looped dataloader with infinite size."""

    iterator = dataloader.__iter__()
    while True:
        try:
            yield iterator.__next__()
        except StopIteration:
            iterator = dataloader.__iter__()


def _build_train_valid_dataloaders(train_dataset, valid_dataset):
    """Training and validation dataloaders."""
    args = get_args()

    print_rank_0('building train and validation dataloaders ...')
    # Training dataset.
    train_dataloader = build_data_loader(train_dataset, args.batch_size,
                                         args.num_workers, not args.keep_last)
    # Set the training iterations.
    args.train_iters_per_epoch = len(train_dataloader)
    args.train_iters = args.epochs * args.train_iters_per_epoch
    # Validation dataset. For this dataset, we do not need to set up
    # shuffling so we can just use a simple infinite loop.
    valid_dataloader_ = build_data_loader(valid_dataset, args.batch_size,
                                          args.num_workers, not args.keep_last)
    valid_dataloader = _build_infinite_size_dataloader(valid_dataloader_)

    return train_dataloader, valid_dataloader


def _train(model, optimizer, lr_scheduler, forward_step,
           train_dataloader, valid_dataloader, end_of_epoch_callback):
    """Train the model."""
    args = get_args()
    timers = get_timers()

    # Turn on training mode which enables dropout.
    model.train()

    # Tracking loss.
    losses_dict_sum = {}

    # Starting epoch and iteration
    start_epoch = args.iteration // args.train_iters_per_epoch
    start_iteration = args.iteration % args.train_iters_per_epoch
    iteration = args.iteration

    # Memory reporting flag.
    report_memory_flag = True

    # For each remaining epoch
    timers('interval time').start()
    for epoch in range(start_epoch, args.epochs):
        print_rank_0('working on epoch {} ...'.format(epoch + 1))

        # Set the data loader epoch to shuffle the index iterator.
        train_dataloader.sampler.set_epoch(args.seed + epoch)

        # For all the batches in the dataset.
        for iteration_, batch in enumerate(train_dataloader):

            # Ignore the iterations before starting value
            if iteration_ < start_iteration:
                continue
            # Set to zero so the next epoch does not skip any batches.
            start_iteration = 0

            # Train for one step.
            losses_dict, _ = train_step(forward_step, batch, model,
                                        optimizer, lr_scheduler)
            iteration += 1

            # Logging.
            report_memory_flag = training_log(losses_dict, losses_dict_sum,
                                              optimizer.param_groups[0]['lr'],
                                              iteration, optimizer.loss_scale,
                                              report_memory_flag)

            # Autoresume
            if args.adlr_autoresume and \
               (iteration % args.adlr_autoresume_interval == 0):
                check_adlr_autoresume_termination(iteration, model, optimizer,
                                                  lr_scheduler)

            # Checkpointing
            if args.save and args.save_interval and \
               iteration % args.save_interval == 0:
                save_checkpoint(iteration, model, optimizer, lr_scheduler)

            # Evaluation
            if args.eval_interval and iteration % args.eval_interval == 0:
                prefix = 'iteration {}'.format(iteration)
                evaluate_and_print_results(prefix, forward_step,
                                           valid_dataloader, model,
                                           iteration, False)

        # Checkpointing at the end of each epoch.
        if args.save:
            save_checkpoint(iteration, model, optimizer, lr_scheduler)

        # Callback at the end of each epoch.
        if end_of_epoch_callback is not None:
            end_of_epoch_callback(model, epoch)


def finetune(train_valid_datasets_provider, model_provider,
             forward_step=_cross_entropy_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    args = get_args()
    timers = get_timers()

    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloder').start()
    if args.epochs > 0:
        train_dataset, valid_dataset = train_valid_datasets_provider()
        train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
            train_dataset, valid_dataset)
    timers('train/valid/test dataset/dataloder').stop()

    # Build callback function.
    timers('callback function').start()
    end_of_epoch_callback = None
    if end_of_epoch_callback_provider is not None:
        end_of_epoch_callback = end_of_epoch_callback_provider()
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
    timers('model and optimizer').stop()

    # If pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.iteration == 0 and args.pretrained_checkpoint is not None:
        original_load = args.load
        args.load = args.pretrained_checkpoint
        _ = load_checkpoint(model, None, None)
        args.load = original_load
        # This is critical when only model is loaded. We should make sure
        # master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    timers('pretrained checkpoint').stop()

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log(['train/valid/test dataset/dataloder', 'callback function',
                'model and optimizer', 'pretrained checkpoint'])
    print_rank_0('training ...')

    # Finetune the model.
    if args.epochs > 0:
        _train(model, optimizer, lr_scheduler, forward_step,
               train_dataloader, valid_dataloader, end_of_epoch_callback)
    # Or just evaluate.
    else:
        if end_of_epoch_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            end_of_epoch_callback(model, epoch=-1, output_predictions=True)

    print_rank_0('done :-)')
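
_build_infinite_size_dataloader wraps the validation loader in a generator that silently restarts the iterator whenever it is exhausted, so evaluation code can keep pulling batches without tracking epochs. A self-contained sketch of the same pattern on a toy TensorDataset (illustrative only, not part of the commit):

import torch
from torch.utils.data import DataLoader, TensorDataset

# Toy dataset of 5 samples; batch_size 2 gives 3 batches per pass.
dataset = TensorDataset(torch.arange(5))
loader = DataLoader(dataset, batch_size=2)

def infinite(dataloader):
    # Same pattern as _build_infinite_size_dataloader: restart on StopIteration.
    iterator = iter(dataloader)
    while True:
        try:
            yield next(iterator)
        except StopIteration:
            iterator = iter(dataloader)

stream = infinite(loader)
for _ in range(5):  # keeps yielding batches past the end of the dataset
    print(next(stream))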