ModelZoo / GPT2_pytorch / Commits

Commit 8ec5d678, authored Apr 03, 2023 by hepj987

GPT2 base on megatron-deepspeed

Changes: 248 files
Showing 20 changed files with 5066 additions and 0 deletions (+5066, −0):
megatron-deepspeed_dtk22.10/megatron/mpu/initialize.py                 +374  −0
megatron-deepspeed_dtk22.10/megatron/mpu/layers.py                     +449  −0
megatron-deepspeed_dtk22.10/megatron/mpu/mappings.py                   +157  −0
megatron-deepspeed_dtk22.10/megatron/mpu/random.py                     +324  −0
megatron-deepspeed_dtk22.10/megatron/mpu/tests/__init__.py             +0    −0
megatron-deepspeed_dtk22.10/megatron/mpu/tests/commons.py              +83   −0
megatron-deepspeed_dtk22.10/megatron/mpu/tests/test_cross_entropy.py   +108  −0
megatron-deepspeed_dtk22.10/megatron/mpu/tests/test_data.py            +88   −0
megatron-deepspeed_dtk22.10/megatron/mpu/tests/test_initialize.py      +95   −0
megatron-deepspeed_dtk22.10/megatron/mpu/tests/test_layers.py          +530  −0
megatron-deepspeed_dtk22.10/megatron/mpu/tests/test_random.py          +204  −0
megatron-deepspeed_dtk22.10/megatron/mpu/utils.py                      +70   −0
megatron-deepspeed_dtk22.10/megatron/optimizer/__init__.py             +133  −0
megatron-deepspeed_dtk22.10/megatron/optimizer/clip_grads.py           +148  −0
megatron-deepspeed_dtk22.10/megatron/optimizer/grad_scaler.py          +133  −0
megatron-deepspeed_dtk22.10/megatron/optimizer/optimizer.py            +523  −0
megatron-deepspeed_dtk22.10/megatron/p2p_communication.py              +264  −0
megatron-deepspeed_dtk22.10/megatron/package_info.py                   +29   −0
megatron-deepspeed_dtk22.10/megatron/schedules.py                      +465  −0
megatron-deepspeed_dtk22.10/megatron/testing_utils.py                  +889  −0
Too many changes to show: to preserve performance, only 248 of 248+ files are displayed.
megatron-deepspeed_dtk22.10/megatron/mpu/initialize.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model and data parallel groups."""

import torch

from .utils import ensure_divisibility


# Intra-layer model parallel group that the current rank belongs to.
_TENSOR_MODEL_PARALLEL_GROUP = None
# Inter-layer model parallel group that the current rank belongs to.
_PIPELINE_MODEL_PARALLEL_GROUP = None
# Model parallel group (both intra- and pipeline) that the current rank belongs to.
_MODEL_PARALLEL_GROUP = None
# Embedding group.
_EMBEDDING_GROUP = None
# Data parallel group that the current rank belongs to.
_DATA_PARALLEL_GROUP = None

_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = None
_VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None

# These values enable us to change the mpu sizes on the fly.
_MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = None
_MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = None
_MPU_TENSOR_MODEL_PARALLEL_RANK = None
_MPU_PIPELINE_MODEL_PARALLEL_RANK = None

# A list of global ranks for each pipeline group to ease calculation of the source
# rank when broadcasting from the first or last pipeline stage
_PIPELINE_GLOBAL_RANKS = None


def is_unitialized():
    """Useful for code segments that may be accessed with or without mpu initialization"""
    return _DATA_PARALLEL_GROUP is None


def initialize_model_parallel(tensor_model_parallel_size_=1,
                              pipeline_model_parallel_size_=1,
                              virtual_pipeline_model_parallel_size_=None):
    """
    Initialize model data parallel groups.

    Arguments:
        tensor_model_parallel_size: number of GPUs used to parallelize model tensor.
        pipeline_model_parallel_size: number of GPUs used to parallelize model pipeline.

    Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we
    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
    the model pipeline. The present function will
    create 8 tensor model-parallel groups, 4 pipeline model-parallel groups
    and 8 data-parallel groups as:
        8 data_parallel groups:
            [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15]
        8 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15]
        4 pipeline model-parallel groups:
            [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.
    """
    if torch.distributed.get_rank() == 0:
        print('> initializing tensor model parallel with size {}'.format(
            tensor_model_parallel_size_))
        print('> initializing pipeline model parallel with size {}'.format(
            pipeline_model_parallel_size_))
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size = torch.distributed.get_world_size()
    tensor_model_parallel_size = min(tensor_model_parallel_size_, world_size)
    pipeline_model_parallel_size = min(pipeline_model_parallel_size_, world_size)
    ensure_divisibility(world_size,
                        tensor_model_parallel_size * pipeline_model_parallel_size)
    data_parallel_size = world_size // (tensor_model_parallel_size *
                                        pipeline_model_parallel_size)

    num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
    num_pipeline_model_parallel_groups = world_size // pipeline_model_parallel_size
    num_data_parallel_groups = world_size // data_parallel_size

    if virtual_pipeline_model_parallel_size_ is not None:
        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
        global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
        _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
        _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = virtual_pipeline_model_parallel_size_

    rank = torch.distributed.get_rank()

    # Build the data-parallel groups.
    global _DATA_PARALLEL_GROUP
    assert _DATA_PARALLEL_GROUP is None, \
        'data parallel group is already initialized'
    all_data_parallel_group_ranks = []
    for i in range(pipeline_model_parallel_size):
        start_rank = i * num_pipeline_model_parallel_groups
        end_rank = (i + 1) * num_pipeline_model_parallel_groups
        for j in range(tensor_model_parallel_size):
            ranks = range(start_rank + j, end_rank,
                          tensor_model_parallel_size)
            all_data_parallel_group_ranks.append(list(ranks))
            group = torch.distributed.new_group(ranks)
            if rank in ranks:
                _DATA_PARALLEL_GROUP = group

    # Build the model-parallel groups.
    global _MODEL_PARALLEL_GROUP
    assert _MODEL_PARALLEL_GROUP is None, \
        'model parallel group is already initialized'
    for i in range(data_parallel_size):
        ranks = [data_parallel_group_ranks[i]
                 for data_parallel_group_ranks in all_data_parallel_group_ranks]
        group = torch.distributed.new_group(ranks)
        if rank in ranks:
            _MODEL_PARALLEL_GROUP = group

    # Build the tensor model-parallel groups.
    global _TENSOR_MODEL_PARALLEL_GROUP
    assert _TENSOR_MODEL_PARALLEL_GROUP is None, \
        'tensor model parallel group is already initialized'
    for i in range(num_tensor_model_parallel_groups):
        ranks = range(i * tensor_model_parallel_size,
                      (i + 1) * tensor_model_parallel_size)
        group = torch.distributed.new_group(ranks)
        if rank in ranks:
            _TENSOR_MODEL_PARALLEL_GROUP = group

    # Build the pipeline model-parallel groups and embedding groups
    # (first and last rank in each pipeline model-parallel group).
    global _PIPELINE_MODEL_PARALLEL_GROUP
    global _PIPELINE_GLOBAL_RANKS
    assert _PIPELINE_MODEL_PARALLEL_GROUP is None, \
        'pipeline model parallel group is already initialized'
    global _EMBEDDING_GROUP
    assert _EMBEDDING_GROUP is None, \
        'embedding group is already initialized'
    for i in range(num_pipeline_model_parallel_groups):
        ranks = range(i, world_size,
                      num_pipeline_model_parallel_groups)
        group = torch.distributed.new_group(ranks)
        if rank in ranks:
            _PIPELINE_MODEL_PARALLEL_GROUP = group
            _PIPELINE_GLOBAL_RANKS = ranks
        # Setup embedding group (to exchange gradients between
        # first and last stages).
        if len(ranks) > 1:
            embedding_ranks = [ranks[0], ranks[-1]]
        else:
            embedding_ranks = ranks
        group = torch.distributed.new_group(embedding_ranks)
        if rank in embedding_ranks:
            _EMBEDDING_GROUP = group


def model_parallel_is_initialized():
    """Check if model and data parallel groups are initialized."""
    if _TENSOR_MODEL_PARALLEL_GROUP is None or \
        _PIPELINE_MODEL_PARALLEL_GROUP is None or \
        _DATA_PARALLEL_GROUP is None:
        return False
    return True


def get_model_parallel_group():
    """Get the model parallel group the caller rank belongs to."""
    assert _MODEL_PARALLEL_GROUP is not None, \
        'model parallel group is not initialized'
    return _MODEL_PARALLEL_GROUP


def get_tensor_model_parallel_group():
    """Get the tensor model parallel group the caller rank belongs to."""
    assert _TENSOR_MODEL_PARALLEL_GROUP is not None, \
        'intra_layer_model parallel group is not initialized'
    return _TENSOR_MODEL_PARALLEL_GROUP


def get_pipeline_model_parallel_group():
    """Get the pipeline model parallel group the caller rank belongs to."""
    assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, \
        'pipeline_model parallel group is not initialized'
    return _PIPELINE_MODEL_PARALLEL_GROUP


def get_data_parallel_group():
    """Get the data parallel group the caller rank belongs to."""
    assert _DATA_PARALLEL_GROUP is not None, \
        'data parallel group is not initialized'
    return _DATA_PARALLEL_GROUP


def get_embedding_group():
    """Get the embedding group the caller rank belongs to."""
    assert _EMBEDDING_GROUP is not None, \
        'embedding group is not initialized'
    return _EMBEDDING_GROUP


def set_tensor_model_parallel_world_size(world_size):
    """Set the tensor model parallel size"""
    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
    _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE = world_size


def set_pipeline_model_parallel_world_size(world_size):
    """Set the pipeline model parallel size"""
    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
    _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE = world_size


def get_tensor_model_parallel_world_size():
    """Return world size for the tensor model parallel group."""
    global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
    if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None:
        return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
    return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())


def get_model_parallel_world_size():
    assert get_pipeline_model_parallel_world_size() == 1, \
        "legacy get_model_parallel_world_size is only supported if PP is disabled"
    return get_tensor_model_parallel_world_size()


def get_pipeline_model_parallel_world_size():
    """Return world size for the pipeline model parallel group."""
    global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
    if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None:
        return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
    return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group())


def set_tensor_model_parallel_rank(rank):
    """Set tensor model parallel rank."""
    global _MPU_TENSOR_MODEL_PARALLEL_RANK
    _MPU_TENSOR_MODEL_PARALLEL_RANK = rank


def set_pipeline_model_parallel_rank(rank):
    """Set pipeline model parallel rank."""
    global _MPU_PIPELINE_MODEL_PARALLEL_RANK
    _MPU_PIPELINE_MODEL_PARALLEL_RANK = rank


def get_tensor_model_parallel_rank():
    """Return my rank for the tensor model parallel group."""
    global _MPU_TENSOR_MODEL_PARALLEL_RANK
    if _MPU_TENSOR_MODEL_PARALLEL_RANK is not None:
        return _MPU_TENSOR_MODEL_PARALLEL_RANK
    return torch.distributed.get_rank(group=get_tensor_model_parallel_group())


def get_model_parallel_rank():
    assert get_pipeline_model_parallel_world_size() == 1, \
        "legacy get_model_parallel_rank is only supported if PP is disabled"
    return get_tensor_model_parallel_rank()


def get_pipeline_model_parallel_rank():
    """Return my rank for the pipeline model parallel group."""
    global _MPU_PIPELINE_MODEL_PARALLEL_RANK
    if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None:
        return _MPU_PIPELINE_MODEL_PARALLEL_RANK
    return torch.distributed.get_rank(group=get_pipeline_model_parallel_group())


def is_pipeline_first_stage(ignore_virtual=False):
    """Return True if in the first pipeline model-parallel stage, False otherwise."""
    if not ignore_virtual:
        if get_virtual_pipeline_model_parallel_world_size() is not None and \
            get_virtual_pipeline_model_parallel_rank() != 0:
            return False
    return get_pipeline_model_parallel_rank() == 0


def is_pipeline_last_stage(ignore_virtual=False):
    """Return True if in the last pipeline model-parallel stage, False otherwise."""
    if not ignore_virtual:
        virtual_pipeline_model_parallel_world_size = \
            get_virtual_pipeline_model_parallel_world_size()
        if virtual_pipeline_model_parallel_world_size is not None and \
            get_virtual_pipeline_model_parallel_rank() != (
                virtual_pipeline_model_parallel_world_size - 1):
            return False
    return get_pipeline_model_parallel_rank() == (
        get_pipeline_model_parallel_world_size() - 1)


def get_virtual_pipeline_model_parallel_rank():
    """Return the virtual pipeline-parallel rank."""
    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
    return _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK


def set_virtual_pipeline_model_parallel_rank(rank):
    """Set the virtual pipeline-parallel rank."""
    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
    _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = rank


def get_virtual_pipeline_model_parallel_world_size():
    """Return the virtual pipeline-parallel world size."""
    global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
    return _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE


def get_tensor_model_parallel_src_rank():
    """Calculate the global rank corresponding to the first local rank
    in the tensor model parallel group."""
    global_rank = torch.distributed.get_rank()
    local_world_size = get_tensor_model_parallel_world_size()
    return (global_rank // local_world_size) * local_world_size


def get_pipeline_model_parallel_first_rank():
    assert _PIPELINE_GLOBAL_RANKS is not None, \
        "Pipeline parallel group is not initialized"
    return _PIPELINE_GLOBAL_RANKS[0]


def get_pipeline_model_parallel_last_rank():
    assert _PIPELINE_GLOBAL_RANKS is not None, \
        "Pipeline parallel group is not initialized"
    last_rank_local = get_pipeline_model_parallel_world_size() - 1
    return _PIPELINE_GLOBAL_RANKS[last_rank_local]


def get_pipeline_model_parallel_next_rank():
    assert _PIPELINE_GLOBAL_RANKS is not None, \
        "Pipeline parallel group is not initialized"
    rank_in_pipeline = get_pipeline_model_parallel_rank()
    world_size = get_pipeline_model_parallel_world_size()
    return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size]


def get_pipeline_model_parallel_prev_rank():
    assert _PIPELINE_GLOBAL_RANKS is not None, \
        "Pipeline parallel group is not initialized"
    rank_in_pipeline = get_pipeline_model_parallel_rank()
    world_size = get_pipeline_model_parallel_world_size()
    return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size]


def get_data_parallel_world_size():
    """Return world size for the data parallel group."""
    return torch.distributed.get_world_size(group=get_data_parallel_group())


def get_data_parallel_rank():
    """Return my rank for the data parallel group."""
    return torch.distributed.get_rank(group=get_data_parallel_group())


def destroy_model_parallel():
    """Set the groups to none."""
    global _TENSOR_MODEL_PARALLEL_GROUP
    _TENSOR_MODEL_PARALLEL_GROUP = None
    global _PIPELINE_MODEL_PARALLEL_GROUP
    _PIPELINE_MODEL_PARALLEL_GROUP = None
    global _DATA_PARALLEL_GROUP
    _DATA_PARALLEL_GROUP = None
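Note (not part of the commit): the group layout described in the `initialize_model_parallel` docstring can be re-derived offline with plain Python, which is a convenient sanity check of which ranks share a tensor-, pipeline-, or data-parallel group before launching a job. The sketch below only mirrors the docstring example (16 GPUs, TP=2, PP=4); it is a hypothetical illustration, not repository code.

# Hypothetical sketch: reproduce the rank grouping from the docstring example.
world_size, tp, pp = 16, 2, 4
num_tp_groups = world_size // tp          # 8 tensor model-parallel groups
num_pp_groups = world_size // pp          # 4 pipeline model-parallel groups

tensor_groups = [list(range(i * tp, (i + 1) * tp)) for i in range(num_tp_groups)]
pipeline_groups = [list(range(i, world_size, num_pp_groups)) for i in range(num_pp_groups)]
data_groups = []
for i in range(pp):
    start, end = i * num_pp_groups, (i + 1) * num_pp_groups
    for j in range(tp):
        data_groups.append(list(range(start + j, end, tp)))

print(tensor_groups)    # [[0, 1], [2, 3], ..., [14, 15]]
print(pipeline_groups)  # [[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]]
print(data_groups)      # [[0, 2], [1, 3], [4, 6], [5, 7], ..., [13, 15]]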
megatron-deepspeed_dtk22.10/megatron/mpu/layers.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch

import math

import torch
import torch.nn.functional as F
import torch.nn.init as init
from torch.nn.parameter import Parameter

from functools import partial

from .initialize import get_tensor_model_parallel_rank
from .initialize import get_tensor_model_parallel_world_size
from .mappings import copy_to_tensor_model_parallel_region
from .mappings import gather_from_tensor_model_parallel_region
from .mappings import reduce_from_tensor_model_parallel_region
from .mappings import scatter_to_tensor_model_parallel_region
from .random import get_cuda_rng_tracker
from .utils import divide
from .utils import split_tensor_along_last_dim
from .utils import VocabUtility
from ..model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm

from megatron import get_args, mpu
import deepspeed.runtime.activation_checkpointing.checkpointing as ds_checkpointing


_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
                                      'partition_dim': -1,
                                      'partition_stride': 1}


def param_is_not_tensor_parallel_duplicate(param):
    return (hasattr(param, 'tensor_model_parallel') and
            param.tensor_model_parallel) or (
                get_tensor_model_parallel_rank() == 0)


def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
    # Make sure the attributes are not set.
    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
        assert not hasattr(tensor, attribute)
    # Set the attributes.
    setattr(tensor, 'tensor_model_parallel', is_parallel)
    setattr(tensor, 'partition_dim', dim)
    setattr(tensor, 'partition_stride', stride)


def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor):
    def maybe_set(attribute, value):
        if not hasattr(tensor, attribute):
            setattr(tensor, attribute, value)
    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
        maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute])


def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor):
    def maybe_copy(attribute):
        if hasattr(source_tensor, attribute):
            setattr(destination_tensor, attribute,
                    getattr(source_tensor, attribute))
    for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
        maybe_copy(attribute)


def _initialize_affine_weight_gpu(weight, init_method,
                                  partition_dim, stride=1):
    """Initialize affine weight for model parallel on GPU."""
    set_tensor_model_parallel_attributes(tensor=weight,
                                         is_parallel=True,
                                         dim=partition_dim,
                                         stride=stride)

    if ds_checkpointing.is_configured():
        global get_cuda_rng_tracker
        get_cuda_rng_tracker = ds_checkpointing.get_cuda_rng_tracker

    with get_cuda_rng_tracker().fork():
        init_method(weight)


def _initialize_affine_weight_cpu(weight, output_size, input_size,
                                  per_partition_size, partition_dim,
                                  init_method, stride=1,
                                  return_master_weight=False):
    """Initialize affine weight for model parallel.

    Build the master weight on all processes and scatter
    the relevant chunk."""
    set_tensor_model_parallel_attributes(tensor=weight,
                                         is_parallel=True,
                                         dim=partition_dim,
                                         stride=stride)

    # Initialize master weight
    master_weight = torch.empty(output_size, input_size,
                                dtype=torch.float,
                                requires_grad=False)
    init_method(master_weight)
    args = get_args()
    master_weight = master_weight.to(dtype=args.params_dtype)

    # Split and copy
    per_partition_per_stride_size = divide(per_partition_size, stride)
    weight_list = torch.split(master_weight, per_partition_per_stride_size,
                              dim=partition_dim)
    rank = get_tensor_model_parallel_rank()
    world_size = get_tensor_model_parallel_world_size()
    my_weight_list = weight_list[rank::world_size]

    with torch.no_grad():
        torch.cat(my_weight_list, dim=partition_dim, out=weight)
    if return_master_weight:
        return master_weight
    return None


def xavier_uniform_tensor_parallel_(tensor, gain=1., tp_degree=1):
    r"""
    This is a modified torch.nn.init.xavier_uniform_ with changes to support
    embeddings partitioned on the vocab size dim with tensor parallel.

    Additional args:
    - tp_degree: degree of tensor parallel

    Note: the code assumes all partitions are equal in size.
    """
    # receptive_field_size=1 as dim==2, so we don't need init._calculate_fan_in_and_fan_out
    fan_out, fan_in = tensor.shape
    fan_out *= tp_degree  # tp splits on num_embeddings dim

    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
    a = math.sqrt(3.0) * std  # Calculate uniform bounds from standard deviation

    return torch.nn.init._no_grad_uniform_(tensor, -a, a)


class VocabParallelEmbedding(torch.nn.Module):
    """Embedding parallelized in the vocabulary dimension.

    This is mainly adapted from torch.nn.Embedding and all the default
    values are kept.
    Arguments:
        num_embeddings: vocabulary size.
        embedding_dim: size of hidden state.
        init_method: method to initialize weights.
    """

    def __init__(self, num_embeddings, embedding_dim,
                 init_method=init.xavier_normal_):
        super(VocabParallelEmbedding, self).__init__()
        # Keep the input dimensions.
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        # Set the defaults for compatibility.
        self.padding_idx = None
        self.max_norm = None
        self.norm_type = 2.
        self.scale_grad_by_freq = False
        self.sparse = False
        self._weight = None
        self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
        # Divide the weight matrix along the vocabulary dimension.
        self.vocab_start_index, self.vocab_end_index = \
            VocabUtility.vocab_range_from_global_vocab_size(
                self.num_embeddings, get_tensor_model_parallel_rank(),
                self.tensor_model_parallel_size)
        self.num_embeddings_per_partition = self.vocab_end_index - \
            self.vocab_start_index

        # Allocate weights and initialize.
        args = get_args()

        # only the first stage embedding runs this class' forward. The head's embedding does its own
        # thing, so don't waste memory allocating LN weights.
        if mpu.is_pipeline_first_stage() and (args.use_bnb_optimizer or args.embed_layernorm):
            self.norm = LayerNorm(embedding_dim)

        if args.use_bnb_optimizer:
            # for BNB we ignore the passed init_method and use torch.nn.init.xavier_uniform_
            # modified to calculate std on the unpartitioned embedding
            init_method = partial(xavier_uniform_tensor_parallel_,
                                  tp_degree=self.tensor_model_parallel_size)

        if args.use_cpu_initialization:
            self.weight = Parameter(torch.empty(
                self.num_embeddings_per_partition, self.embedding_dim,
                dtype=args.params_dtype))
            _initialize_affine_weight_cpu(
                self.weight, self.num_embeddings, self.embedding_dim,
                self.num_embeddings_per_partition, 0, init_method)
        else:
            self.weight = Parameter(torch.empty(
                self.num_embeddings_per_partition, self.embedding_dim,
                device=torch.cuda.current_device(), dtype=args.params_dtype))
            _initialize_affine_weight_gpu(self.weight, init_method,
                                          partition_dim=0, stride=1)

        if args.use_bnb_optimizer:
            from bitsandbytes.optim import GlobalOptimManager
            GlobalOptimManager.get_instance().override_config(self.weight, 'optim_bits', 32)
            GlobalOptimManager.get_instance().register_parameters(self.weight)

    def forward(self, input_):
        if torch.any(input_ >= self.num_embeddings):
            raise ValueError(
                f"There is an input id in the input that is greater than the highest possible input id.\n"
                f"Input: {input_}\n"
                f"num_embeddings: {self.num_embeddings}")

        if self.tensor_model_parallel_size > 1:
            # Build the mask.
            input_mask = (input_ < self.vocab_start_index) | \
                         (input_ >= self.vocab_end_index)
            # Mask the input.
            masked_input = input_.clone() - self.vocab_start_index
            masked_input[input_mask] = 0
        else:
            # input_ is guaranteed to be in the range [0:self.vocab_end_index - self.vocab_start_index] thanks to the first check
            masked_input = input_
        # Get the embeddings.
        output_parallel = F.embedding(masked_input, self.weight,
                                      self.padding_idx, self.max_norm,
                                      self.norm_type, self.scale_grad_by_freq,
                                      self.sparse)
        # Mask the output embedding.
        if self.tensor_model_parallel_size > 1:
            output_parallel[input_mask, :] = 0.0
        # Reduce across all the model parallel GPUs.
        output = reduce_from_tensor_model_parallel_region(output_parallel)

        if hasattr(self, 'norm'):
            output = self.norm(output)

        return output


class ColumnParallelLinear(torch.nn.Module):
    """Linear layer with column parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its second dimension as A = [A_1, ..., A_p].

    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias
        gather_output: If true, call all-gather on output and make Y available
                       to all GPUs, otherwise, every GPU will have its output
                       which is Y_i = XA_i
        init_method: method to initialize weights. Note that bias is always set
                     to zero.
        stride: For the strided linear layers.
        keep_master_weight_for_test: This was added for testing and should be
                                     set to False. It returns the master weights
                                     used for initialization.
        skip_bias_add: This was added to enable performance optimizations where bias
                       can be fused with other elementwise operations. We skip
                       adding bias but instead return it.
    """

    def __init__(self, input_size, output_size, bias=True, gather_output=True,
                 init_method=init.xavier_normal_, stride=1,
                 keep_master_weight_for_test=False,
                 skip_bias_add=False):
        super(ColumnParallelLinear, self).__init__()

        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.gather_output = gather_output
        # Divide the weight matrix along the last dimension.
        world_size = get_tensor_model_parallel_world_size()
        self.output_size_per_partition = divide(output_size, world_size)
        self.skip_bias_add = skip_bias_add

        # Parameters.
        # Note: torch.nn.functional.linear performs XA^T + b and as a result
        # we allocate the transpose.
        # Initialize weight.
        args = get_args()
        if args.use_cpu_initialization:
            self.weight = Parameter(torch.empty(self.output_size_per_partition,
                                                self.input_size,
                                                dtype=args.params_dtype))
            self.master_weight = _initialize_affine_weight_cpu(
                self.weight, self.output_size, self.input_size,
                self.output_size_per_partition, 0, init_method,
                stride=stride, return_master_weight=keep_master_weight_for_test)
        else:
            self.weight = Parameter(torch.empty(
                self.output_size_per_partition, self.input_size,
                device=torch.cuda.current_device(), dtype=args.params_dtype))
            _initialize_affine_weight_gpu(self.weight, init_method,
                                          partition_dim=0, stride=stride)

        if bias:
            if args.use_cpu_initialization:
                self.bias = Parameter(torch.empty(
                    self.output_size_per_partition, dtype=args.params_dtype))
            else:
                self.bias = Parameter(torch.empty(
                    self.output_size_per_partition,
                    device=torch.cuda.current_device(),
                    dtype=args.params_dtype))
            set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
            # Always initialize bias to zero.
            with torch.no_grad():
                self.bias.zero_()
        else:
            self.register_parameter('bias', None)

    def forward(self, input_):
        # Set up backprop all-reduce.
        input_parallel = copy_to_tensor_model_parallel_region(input_)
        # Matrix multiply.
        bias = self.bias if not self.skip_bias_add else None
        output_parallel = F.linear(input_parallel, self.weight, bias)
        if self.gather_output:
            # All-gather across the partitions.
            output = gather_from_tensor_model_parallel_region(output_parallel)
        else:
            output = output_parallel
        output_bias = self.bias if self.skip_bias_add else None
        return output, output_bias


class RowParallelLinear(torch.nn.Module):
    """Linear layer with row parallelism.

    The linear layer is defined as Y = XA + b. A is parallelized along
    its first dimension and X along its second dimension as:
               -   -
              | A_1 |
              | .   |
          A = | .   |        X = [X_1, ..., X_p]
              | .   |
              | A_p |
               -   -
    Arguments:
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias. Note that bias is not parallelized.
        input_is_parallel: If true, we assume that the input is already
                           split across the GPUs and we do not split
                           again.
        init_method: method to initialize weights. Note that bias is always set
                     to zero.
        stride: For the strided linear layers.
        keep_master_weight_for_test: This was added for testing and should be
                                     set to False. It returns the master weights
                                     used for initialization.
        skip_bias_add: This was added to enable performance optimizations where bias
                       can be fused with other elementwise operations. We skip
                       adding bias but instead return it.
    """

    def __init__(self, input_size, output_size, bias=True,
                 input_is_parallel=False,
                 init_method=init.xavier_normal_, stride=1,
                 keep_master_weight_for_test=False,
                 skip_bias_add=False):
        super(RowParallelLinear, self).__init__()

        # Keep input parameters
        self.input_size = input_size
        self.output_size = output_size
        self.input_is_parallel = input_is_parallel
        # Divide the weight matrix along the last dimension.
        world_size = get_tensor_model_parallel_world_size()
        self.input_size_per_partition = divide(input_size, world_size)
        self.skip_bias_add = skip_bias_add

        # Parameters.
        # Note: torch.nn.functional.linear performs XA^T + b and as a result
        # we allocate the transpose.
        # Initialize weight.
        args = get_args()
        if args.use_cpu_initialization:
            self.weight = Parameter(torch.empty(self.output_size,
                                                self.input_size_per_partition,
                                                dtype=args.params_dtype))
            self.master_weight = _initialize_affine_weight_cpu(
                self.weight, self.output_size, self.input_size,
                self.input_size_per_partition, 1, init_method,
                stride=stride, return_master_weight=keep_master_weight_for_test)
        else:
            self.weight = Parameter(torch.empty(
                self.output_size, self.input_size_per_partition,
                device=torch.cuda.current_device(), dtype=args.params_dtype))
            _initialize_affine_weight_gpu(self.weight, init_method,
                                          partition_dim=1, stride=stride)
        if bias:
            if args.use_cpu_initialization:
                self.bias = Parameter(torch.empty(self.output_size,
                                                  dtype=args.params_dtype))
            else:
                self.bias = Parameter(torch.empty(
                    self.output_size, device=torch.cuda.current_device(),
                    dtype=args.params_dtype))
            # Always initialize bias to zero.
            with torch.no_grad():
                self.bias.zero_()
        else:
            self.register_parameter('bias', None)

        self.bias_tp_auto_sync = args.sync_tp_duplicated_parameters

    def forward(self, input_):
        # Set up backprop all-reduce.
        if self.input_is_parallel:
            input_parallel = input_
        else:
            input_parallel = scatter_to_tensor_model_parallel_region(input_)
        # Matrix multiply.
        output_parallel = F.linear(input_parallel, self.weight)
        # All-reduce across all the partitions.
        output_ = reduce_from_tensor_model_parallel_region(output_parallel)

        if self.bias_tp_auto_sync:
            torch.distributed.all_reduce(self.bias, op=torch.distributed.ReduceOp.AVG,
                                         group=mpu.get_tensor_model_parallel_group())

        if not self.skip_bias_add:
            output = output_ + self.bias if self.bias is not None else output_
            output_bias = None
        else:
            output = output_
            output_bias = self.bias
        return output, output_bias
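Note (not part of the commit): a minimal sketch of how the two linear layers are usually paired, assuming the mpu groups and megatron args are already initialized and that the mpu package re-exports these classes as in upstream Megatron. Class and size names below are hypothetical.

# Sketch only: column-parallel projection followed by a row-parallel projection.
# gather_output=False / input_is_parallel=True keeps the intermediate activation
# sharded, so the only cross-GPU traffic is the all-reduce inside RowParallelLinear.
import torch
from megatron import mpu

class TwoLayerParallelMLP(torch.nn.Module):
    def __init__(self, hidden_size, ffn_hidden_size):
        super().__init__()
        self.dense_h_to_4h = mpu.ColumnParallelLinear(
            hidden_size, ffn_hidden_size, gather_output=False)
        self.dense_4h_to_h = mpu.RowParallelLinear(
            ffn_hidden_size, hidden_size, input_is_parallel=True)

    def forward(self, hidden_states):
        intermediate, _ = self.dense_h_to_4h(hidden_states)  # sharded along the last dim
        intermediate = torch.nn.functional.gelu(intermediate)
        output, _ = self.dense_4h_to_h(intermediate)          # all-reduced full output
        return output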
megatron-deepspeed_dtk22.10/megatron/mpu/mappings.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from .initialize import get_tensor_model_parallel_group, get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank
from .utils import split_tensor_along_last_dim


def _reduce(input_):
    """All-reduce the input tensor across the model parallel group."""

    # Bypass the function if we are using only 1 GPU.
    if get_tensor_model_parallel_world_size() == 1:
        return input_

    # All-reduce.
    torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group())

    return input_


def _split(input_):
    """Split the tensor along its last dimension and keep the
    corresponding slice."""

    world_size = get_tensor_model_parallel_world_size()
    # Bypass the function if we are using only 1 GPU.
    if world_size == 1:
        return input_

    # Split along last dimension.
    input_list = split_tensor_along_last_dim(input_, world_size)

    # Note: torch.split does not create contiguous tensors by default.
    rank = get_tensor_model_parallel_rank()
    output = input_list[rank].contiguous()

    return output


def _gather(input_):
    """Gather tensors and concatenate along the last dimension."""

    world_size = get_tensor_model_parallel_world_size()
    # Bypass the function if we are using only 1 GPU.
    if world_size == 1:
        return input_

    # Size and dimension.
    last_dim = input_.dim() - 1
    rank = get_tensor_model_parallel_rank()

    tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
    tensor_list[rank] = input_
    torch.distributed.all_gather(tensor_list, input_, group=get_tensor_model_parallel_group())

    # Note: torch.cat already creates a contiguous tensor.
    output = torch.cat(tensor_list, dim=last_dim).contiguous()

    return output


class _CopyToModelParallelRegion(torch.autograd.Function):
    """Pass the input to the model parallel region."""

    @staticmethod
    def symbolic(graph, input_):
        return input_

    @staticmethod
    def forward(ctx, input_):
        return input_

    @staticmethod
    def backward(ctx, grad_output):
        return _reduce(grad_output)


class _ReduceFromModelParallelRegion(torch.autograd.Function):
    """All-reduce the input from the model parallel region."""

    @staticmethod
    def symbolic(graph, input_):
        return _reduce(input_)

    @staticmethod
    def forward(ctx, input_):
        return _reduce(input_)

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output


class _ScatterToModelParallelRegion(torch.autograd.Function):
    """Split the input and keep only the chunk corresponding to the rank."""

    @staticmethod
    def symbolic(graph, input_):
        return _split(input_)

    @staticmethod
    def forward(ctx, input_):
        return _split(input_)

    @staticmethod
    def backward(ctx, grad_output):
        return _gather(grad_output)


class _GatherFromModelParallelRegion(torch.autograd.Function):
    """Gather the input from the model parallel region and concatenate."""

    @staticmethod
    def symbolic(graph, input_):
        return _gather(input_)

    @staticmethod
    def forward(ctx, input_):
        return _gather(input_)

    @staticmethod
    def backward(ctx, grad_output):
        return _split(grad_output)


# -----------------
# Helper functions.
# -----------------

def copy_to_tensor_model_parallel_region(input_):
    return _CopyToModelParallelRegion.apply(input_)


def reduce_from_tensor_model_parallel_region(input_):
    return _ReduceFromModelParallelRegion.apply(input_)


def scatter_to_tensor_model_parallel_region(input_):
    return _ScatterToModelParallelRegion.apply(input_)


def gather_from_tensor_model_parallel_region(input_):
    return _GatherFromModelParallelRegion.apply(input_)
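Note (not part of the commit): the four autograd functions come in conjugate pairs (copy ↔ reduce, scatter ↔ gather), so the collective that runs in the forward pass is swapped for its counterpart in the backward pass. A minimal sketch of how the helpers wrap a sharded matmul, assuming torch.distributed and the mpu groups are already initialized and that the module path `megatron.mpu.mappings` matches this commit's layout:

# Forward / backward collectives of the four regions:
#   copy_to_...      : forward = identity,   backward = all-reduce
#   reduce_from_...  : forward = all-reduce, backward = identity
#   scatter_to_...   : forward = split,      backward = all-gather
#   gather_from_...  : forward = all-gather, backward = split
import torch.nn.functional as F
from megatron.mpu.mappings import (
    copy_to_tensor_model_parallel_region,
    reduce_from_tensor_model_parallel_region,
)

def column_parallel_matmul(x, weight_shard):
    # Identity now, all-reduce of the gradient on the way back.
    x = copy_to_tensor_model_parallel_region(x)
    return F.linear(x, weight_shard)           # each rank computes its slice of the output

def row_parallel_matmul(x_shard, weight_shard):
    partial = F.linear(x_shard, weight_shard)  # partial sums on every rank
    # All-reduce now, identity for the gradient on the way back.
    return reduce_from_tensor_model_parallel_region(partial)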
megatron-deepspeed_dtk22.10/megatron/mpu/random.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch

import contextlib

import torch
from torch import _C
from torch.cuda import _lazy_call, device as device_ctx_manager
from torch.utils.checkpoint import detach_variable

from megatron import get_args
from megatron.memory import allocate_mem_buff

from .initialize import get_data_parallel_rank
from .initialize import get_tensor_model_parallel_group
from .initialize import get_tensor_model_parallel_rank
from .initialize import get_tensor_model_parallel_world_size


# Default name for the model parallel rng tracker.
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'

# Whether to apply model parallelism to checkpointed hidden states.
_CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = None


def init_checkpointed_activations_memory_buffer():
    """Initialize the memory buffer for the checkpointed activations."""
    args = get_args()

    upper_bound_sequence_length = max(
        args.seq_length if args.seq_length is not None else 0,
        args.decoder_seq_length if args.decoder_seq_length is not None else 0)

    per_layer = args.micro_batch_size * upper_bound_sequence_length * \
        args.hidden_size // args.tensor_model_parallel_size
    assert args.num_layers % args.checkpoint_num_layers == 0, \
        'number of layers is not divisible by checkpoint-num-layers'
    num_checkpointer_layers = args.num_layers // args.checkpoint_num_layers
    numel = per_layer * num_checkpointer_layers
    dtype = torch.half
    if not args.fp16:
        dtype = torch.float

    global _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER
    assert _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is None, \
        'checkpointed activations memory buffer is already allocated.'
    _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER = allocate_mem_buff(
        'checkpointed activations', numel, dtype, track_usage=False)


def reset_checkpointed_activations_memory_buffer():
    """Reset the memory used for checkpointing."""
    if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
        _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.reset()


def _set_cuda_rng_state(new_state, device=-1):
    """Sets the random number generator state of the current GPU.

    Arguments:
        new_state (torch.ByteTensor): The desired state
    This function is adapted from PyTorch repo (torch.cuda.set_rng_state)
    with a single change: the input state is not cloned. Cloning caused
    major performance issues for +4 GPU cases.
    """
    if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState):
        # older PyTorch
        def cb():
            with device_ctx_manager(device):
                _C._cuda_setRNGState(new_state)
    else:
        # newer PyTorch
        if device == -1:
            device = torch.device('cuda')
        elif isinstance(device, str):
            device = torch.device(device)
        elif isinstance(device, int):
            device = torch.device('cuda', device)

        def cb():
            idx = device.index
            if idx is None:
                idx = torch.cuda.current_device()
            default_generator = torch.cuda.default_generators[idx]
            default_generator.set_state(new_state)

    _lazy_call(cb)


def split_tensor_into_1d_equal_chunks(tensor):
    """Break a tensor into equal 1D chunks."""
    data = tensor.view(-1)
    partition_size = torch.numel(data) // get_tensor_model_parallel_world_size()
    start_index = partition_size * get_tensor_model_parallel_rank()
    end_index = start_index + partition_size
    return data[start_index:end_index]


def gather_split_1d_tensor(tensor):
    """Opposite of the above function, gather values from model parallel ranks."""
    world_size = get_tensor_model_parallel_world_size()
    numel = torch.numel(tensor)
    numel_gathered = world_size * numel
    gathered = torch.empty(numel_gathered, dtype=tensor.dtype,
                           device=torch.cuda.current_device(),
                           requires_grad=False)
    chunks = [gathered[i * numel:(i + 1) * numel] for i in range(world_size)]
    torch.distributed.all_gather(chunks, tensor,
                                 group=get_tensor_model_parallel_group())
    return gathered


class CudaRNGStatesTracker:
    """Tracker for the cuda RNG states.

    Using the `add` method, a cuda rng state is initialized based on
    the input `seed` and is assigned to `name`. Later, by forking the
    rng state, we can perform operations and return to our starting
    cuda state.
    """

    def __init__(self):
        # Map from a string name to the cuda rng state.
        self.states_ = {}
        # Seeds are just for book keeping and ensure no seed is set twice.
        self.seeds_ = set()

    def reset(self):
        """Set to the initial state (no tracker)."""
        self.states_ = {}
        self.seeds_ = set()

    def get_states(self):
        """Get rng states. Copy the dictionary so we have direct
        pointers to the states, not just a pointer to the dictionary."""
        states = {}
        for name in self.states_:
            states[name] = self.states_[name]
        return states

    def set_states(self, states):
        """Set the rng states. For efficiency purposes, we do not check
        the size of seed for compatibility."""
        self.states_ = states

    def add(self, name, seed):
        """Track the rng state."""
        # Check seed is not already used.
        if seed in self.seeds_:
            raise Exception('seed {} already exists'.format(seed))
        self.seeds_.add(seed)
        # Check that state is not already defined.
        if name in self.states_:
            raise Exception('cuda rng state {} already exists'.format(name))
        # Get the current rng state.
        orig_rng_state = torch.cuda.get_rng_state()
        # Set the new state and store it.
        torch.cuda.manual_seed(seed)
        self.states_[name] = torch.cuda.get_rng_state()
        # Reset rng state to what it was.
        _set_cuda_rng_state(orig_rng_state)

    @contextlib.contextmanager
    def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME):
        """Fork the cuda rng state, perform operations, and exit with
        the original state."""
        # Check if we have added the state
        if name not in self.states_:
            print(name, self.states_)
            raise Exception('cuda rng state {} is not added'.format(name))
        # Store current rng state.
        orig_cuda_rng_state = torch.cuda.get_rng_state()
        # Set rng state to the desired one
        _set_cuda_rng_state(self.states_[name])
        # Do the stuff we wanted to do.
        try:
            yield
        finally:
            # Update the current rng state for later use.
            self.states_[name] = torch.cuda.get_rng_state()
            # And set the state to the original state we started with.
            _set_cuda_rng_state(orig_cuda_rng_state)


# RNG tracker object.
_CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker()


def get_cuda_rng_tracker():
    """Get cuda rng tracker."""
    return _CUDA_RNG_STATE_TRACKER


def model_parallel_cuda_manual_seed(seed):
    """Initialize model parallel cuda seed.

    This function should be called after the model parallel is
    initialized. Also, no torch.cuda.manual_seed should be called
    after this function. Basically, this is a replacement for that
    function.
    Two sets of RNG states are tracked:
        default state: This is for data parallelism and is the same among a
                       set of model parallel GPUs but different across
                       different model parallel groups. This is used for
                       example for dropout in the non-tensor-model-parallel regions.
        tensor-model-parallel state: This state is different among a set of model
                       parallel GPUs, but the same across data parallel
                       groups. This is used for example for dropout in
                       model parallel regions.
    """
    # 2718 is just for fun and any POSITIVE value will work.
    offset = seed + 2718
    tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank()
    # Data parallel gets the original seed.
    data_parallel_seed = seed

    if torch.distributed.get_rank() == 0:
        print('> initializing model parallel cuda seeds on global rank {}, '
              'model parallel rank {}, and data parallel rank {} with '
              'model parallel seed: {} and data parallel seed: {}'.format(
                  torch.distributed.get_rank(), get_tensor_model_parallel_rank(),
                  get_data_parallel_rank(), tensor_model_parallel_seed,
                  data_parallel_seed), flush=True)
    _CUDA_RNG_STATE_TRACKER.reset()
    # Set the default state.
    torch.cuda.manual_seed(data_parallel_seed)
    # and model parallel state.
    _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME,
                                tensor_model_parallel_seed)


class CheckpointFunction(torch.autograd.Function):
    """This function is adapted from torch.utils.checkpoint with
    two main changes:
        1) torch.cuda.set_rng_state is replaced with `_set_cuda_rng_state`
        2) the states in the model parallel tracker are also properly
           tracked/set/reset.
    """

    @staticmethod
    def forward(ctx, run_function, *args):
        ctx.run_function = run_function

        # Copy the rng states.
        ctx.fwd_cpu_rng_state = torch.get_rng_state()
        ctx.fwd_cuda_rng_state = torch.cuda.get_rng_state()
        ctx.fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()

        with torch.no_grad():
            outputs = run_function(*args)

        # Divide hidden states across model parallel group and only keep
        # the chunk corresponding to the current rank.
        if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
            ctx.input_0_shape = args[0].data.shape
            args[0].data = split_tensor_into_1d_equal_chunks(args[0].data)
            args[0].data = _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.add(
                args[0].data)

        # Store everything.
        ctx.save_for_backward(*args)

        return outputs

    @staticmethod
    def backward(ctx, *args):
        if not torch.autograd._is_checkpoint_valid():
            raise RuntimeError("Checkpointing is not compatible with .grad(), "
                               "please use .backward() if possible")
        inputs = ctx.saved_tensors
        if _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is not None:
            inputs[0].data = gather_split_1d_tensor(inputs[0].data)
            inputs[0].data = inputs[0].data.view(ctx.input_0_shape)

        # Store the current states.
        bwd_cpu_rng_state = torch.get_rng_state()
        bwd_cuda_rng_state = torch.cuda.get_rng_state()
        bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()

        # Set the states to what they used to be before the forward pass.
        torch.set_rng_state(ctx.fwd_cpu_rng_state)
        _set_cuda_rng_state(ctx.fwd_cuda_rng_state)
        get_cuda_rng_tracker().set_states(ctx.fwd_cuda_rng_state_tracker)

        # Compute the forward pass.
        detached_inputs = detach_variable(inputs)
        with torch.enable_grad():
            outputs = ctx.run_function(*detached_inputs)

        # Set the states back to what they were at the start of this function.
        torch.set_rng_state(bwd_cpu_rng_state)
        _set_cuda_rng_state(bwd_cuda_rng_state)
        get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)

        if isinstance(outputs, torch.Tensor):
            outputs = (outputs,)
        torch.autograd.backward(outputs, args)
        grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp
                      for inp in detached_inputs)
        return (None,) + grads


def checkpoint(function, *args):
    """Checkpoint a model or part of the model.
    This has been directly copied from torch.utils.checkpoint."""
    return CheckpointFunction.apply(function, *args)
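Note (not part of the commit): the point of the RNG tracker is that dropout inside tensor-parallel regions must use a different seed on every tensor-parallel rank, while dropout outside those regions must agree across the group. A minimal usage sketch, assuming `initialize_model_parallel(...)` has run, `model_parallel_cuda_manual_seed(seed)` has seeded both tracked states, and the module path matches this commit's layout:

import torch
from megatron.mpu.random import get_cuda_rng_tracker

x = torch.randn(4, 8, device='cuda')

# Inside a tensor-parallel region: fork into the tracked 'model-parallel-rng'
# state so each TP rank drops a different pattern; the default state is
# restored when the context exits.
with get_cuda_rng_tracker().fork():
    y = torch.nn.functional.dropout(x, p=0.1, training=True)

# Outside the fork, the default state is used, which is identical across the
# ranks of one tensor-parallel group (it came from the data-parallel seed).
z = torch.nn.functional.dropout(x, p=0.1, training=True)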
megatron-deepspeed_dtk22.10/megatron/mpu/tests/__init__.py  0 → 100644 (empty file)
megatron-deepspeed_dtk22.10/megatron/mpu/tests/commons.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import os
import random
import numpy
import torch

import mpu


class IdentityLayer(torch.nn.Module):
    def __init__(self, size, scale=1.0):
        super(IdentityLayer, self).__init__()
        self.weight = torch.nn.Parameter(scale * torch.randn(size))

    def forward(self):
        return self.weight


def set_random_seed(seed):
    """Set random seed for reproducibility."""
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    mpu.model_parallel_cuda_manual_seed(seed)


def initialize_distributed(backend='nccl'):
    """Initialize torch.distributed."""
    # Get local rank in case it is provided.
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=None,
                        help='local rank passed from distributed launcher')
    args = parser.parse_args()
    local_rank = args.local_rank

    # Get rank and world size.
    rank = int(os.getenv('RANK', '0'))
    world_size = int(os.getenv("WORLD_SIZE", '1'))

    print('> initializing torch.distributed with local rank: {}, '
          'rank: {}, world size: {}'.format(local_rank, rank, world_size))

    # Set the device id.
    device = rank % torch.cuda.device_count()
    if local_rank is not None:
        device = local_rank
    torch.cuda.set_device(device)

    # Call the init process.
    init_method = 'tcp://'
    master_ip = os.getenv('MASTER_ADDR', 'localhost')
    master_port = os.getenv('MASTER_PORT', '6000')
    init_method += master_ip + ':' + master_port
    torch.distributed.init_process_group(
        backend=backend,
        world_size=world_size,
        rank=rank,
        init_method=init_method)


def print_separator(message):
    torch.distributed.barrier()
    filler_len = (78 - len(message)) // 2
    filler = '-' * filler_len
    string = '\n' + filler + ' {} '.format(message) + filler
    if torch.distributed.get_rank() == 0:
        print(string, flush=True)
    torch.distributed.barrier()
megatron-deepspeed_dtk22.10/megatron/mpu/tests/test_cross_entropy.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from commons import set_random_seed
from commons import IdentityLayer
from commons import print_separator
from commons import initialize_distributed
from mpu.cross_entropy import vocab_parallel_cross_entropy
import mpu
import torch.nn.functional as F
import torch
import random
import sys
sys.path.append("../..")


def torch_cross_entropy(batch_size, seq_length, vocab_size,
                        logits_scale, seed):
    set_random_seed(seed)
    identity = IdentityLayer((batch_size, seq_length, vocab_size),
                             scale=logits_scale).cuda()
    logits = identity()
    target = torch.cuda.LongTensor(
        size=(batch_size, seq_length)).random_(0, vocab_size)
    loss = F.cross_entropy(logits.view(-1, logits.size()[-1]),
                           target.view(-1),
                           reduction='none').view_as(target).mean()
    loss.backward()
    return loss, identity.weight.grad


def mpu_cross_entropy(batch_size, seq_length, vocab_size,
                      logits_scale, seed):
    set_random_seed(seed)
    identity = IdentityLayer((batch_size, seq_length, vocab_size),
                             scale=logits_scale).cuda()
    logits = identity()
    logits_parallel = mpu.scatter_to_tensor_model_parallel_region(logits)
    target = torch.cuda.LongTensor(
        size=(batch_size, seq_length)).random_(0, vocab_size)
    loss = vocab_parallel_cross_entropy(logits_parallel, target).mean()
    loss.backward()
    return loss, identity.weight.grad


def test_cross_entropy(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing cross entropy with model parallel size {} ...'.
              format(tensor_model_parallel_size))

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    batch_size = 13
    seq_length = 17
    vocab_size_per_partition = 11
    logits_scale = 1000.0
    vocab_size = vocab_size_per_partition * tensor_model_parallel_size
    seed = 1234

    loss_torch, grad_torch = torch_cross_entropy(batch_size, seq_length,
                                                 vocab_size, logits_scale,
                                                 seed)
    loss_mpu, grad_mpu = mpu_cross_entropy(batch_size, seq_length,
                                           vocab_size, logits_scale,
                                           seed)

    error = loss_torch.sub_(loss_mpu).abs().max()
    print('   max error in loss on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = grad_torch.sub_(grad_mpu).abs().max()
    print('   max error in grad on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_tensor_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test cross entropy')
        test_cross_entropy(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2
megatron-deepspeed_dtk22.10/megatron/mpu/tests/test_data.py  0 → 100644
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from commons import print_separator
from commons import initialize_distributed
from mpu import data as data_utils
import mpu
import torch
import functools
import operator
import sys
sys.path.append("../..")


def test_broadcast_data(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing broadcast_data with model parallel size {} ...'.
              format(tensor_model_parallel_size))

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    torch.manual_seed(1234 + mpu.get_data_parallel_rank())
    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    key_size_t = {'key1': [7, 11],
                  'key2': [8, 2, 1],
                  'key3': [13],
                  'key4': [5, 1, 2],
                  'key5': [5, 12]}
    keys = list(key_size_t.keys())

    data = {}
    data_t = {}
    for key in key_size_t:
        data[key] = torch.LongTensor(size=key_size_t[key]).random_(0, 1000)
        data_t[key] = data[key].clone()
    data['keyX'] = torch.FloatTensor(size=(5, )).random_(0, 1000)
    data_t['keyX'] = data['keyX'].clone()
    if mpu.get_tensor_model_parallel_rank() != 0:
        data = None

    data_utils._check_data_types(keys, data_t, torch.int64)
    key_size, key_numel, \
        total_numel = data_utils._build_key_size_numel_dictionaries(keys, data)
    for key in keys:
        assert key_size[key] == key_size_t[key]
    total_numel_t = 0
    for key in keys:
        target_size = functools.reduce(operator.mul, key_size_t[key], 1)
        assert key_numel[key] == target_size
        total_numel_t += target_size
    assert total_numel == total_numel_t

    data_b = data_utils.broadcast_data(keys, data, torch.int64)
    for key in keys:
        tensor = data_t[key].cuda()
        assert data_b[key].sub(tensor).abs().max() == 0

    # Reset groups
    mpu.destroy_tensor_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test test broadcast data')
        test_broadcast_data(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2
megatron-deepspeed_dtk22.10/megatron/mpu/tests/test_initialize.py
0 → 100644
View file @ 8ec5d678

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from commons import print_separator
from commons import initialize_distributed
import mpu
import torch
import sys
sys.path.append("../..")


def test_initialize_model_parallel(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing initialize_model_parallel with size {} ...'.format(
            tensor_model_parallel_size))
    tensor_model_parallel_size_ = min(tensor_model_parallel_size,
                                      torch.distributed.get_world_size())
    assert not mpu.model_parallel_is_initialized()
    mpu.initialize_model_parallel(tensor_model_parallel_size_)
    assert mpu.model_parallel_is_initialized()

    # Checks.
    def check(group, world_size, rank):
        assert world_size == torch.distributed.get_world_size(group=group)
        assert rank == torch.distributed.get_rank(group=group)

    # Model parallel.
    world_size = tensor_model_parallel_size_
    rank = torch.distributed.get_rank() % tensor_model_parallel_size_
    assert world_size == mpu.get_tensor_model_parallel_world_size()
    assert rank == mpu.get_tensor_model_parallel_rank()
    check(mpu.get_tensor_model_parallel_group(), world_size, rank)

    # Data parallel.
    world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_
    rank = torch.distributed.get_rank() // tensor_model_parallel_size
    assert world_size == mpu.get_data_parallel_world_size()
    assert rank == mpu.get_data_parallel_rank()
    check(mpu.get_data_parallel_group(), world_size, rank)

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_):

    if torch.distributed.get_rank() == 0:
        print('> testing get_tensor_model_parallel_src_rank with size {} ...'.format(
            tensor_model_parallel_size_))
    tensor_model_parallel_size = min(tensor_model_parallel_size_,
                                     torch.distributed.get_world_size())
    assert not mpu.model_parallel_is_initialized()
    mpu.initialize_model_parallel(tensor_model_parallel_size)
    assert mpu.model_parallel_is_initialized()

    # Checks
    src_rank = torch.distributed.get_rank() - mpu.get_tensor_model_parallel_rank()
    assert mpu.get_tensor_model_parallel_src_rank() == src_rank

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()
    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test initialize model parallel')
        test_initialize_model_parallel(tensor_model_parallel_size)
        print_separator('test model parallel source rank')
        test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2
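
Note on the rank bookkeeping checked above: the same arithmetic can be reproduced without any GPUs. The sketch below is only an illustration (the helper name parallel_coords is hypothetical, not part of the repo); it assumes the default layout in which consecutive global ranks belong to the same tensor-model-parallel group.

# Sketch of the coordinates test_initialize_model_parallel verifies.
def parallel_coords(global_rank, world_size, tensor_model_parallel_size):
    tp_rank = global_rank % tensor_model_parallel_size      # position inside the TP group
    dp_rank = global_rank // tensor_model_parallel_size     # which TP group this rank sits in
    dp_world_size = world_size // tensor_model_parallel_size
    tp_src_rank = global_rank - tp_rank                     # first rank of the TP group
    return tp_rank, dp_rank, dp_world_size, tp_src_rank

# Example: 8 ranks with tensor-model-parallel size 2.
for r in range(8):
    print(r, parallel_coords(r, 8, 2))
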
megatron-deepspeed_dtk22.10/megatron/mpu/tests/test_layers.py
0 → 100644
View file @ 8ec5d678

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from mpu import layers
from commons import set_random_seed
from commons import print_separator
from commons import initialize_distributed
import mpu
from torch.nn.parameter import Parameter
import torch.nn.init as init
import torch
import random
import sys
sys.path.append("../..")


def test_parallel_embedding(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing parallel embedding with model parallel size {} ...'.format(
            tensor_model_parallel_size))

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    batch_size = 17
    seq_length = 23
    vocab_size = 48
    hidden_size = 16
    seed = 1236

    set_random_seed(123)
    input_data = torch.LongTensor(size=(batch_size, seq_length)).random_(0, vocab_size).cuda()
    loss_weight = torch.randn([batch_size, seq_length, hidden_size]).cuda()

    set_random_seed(seed)
    embedding_original = torch.nn.Embedding(vocab_size, hidden_size).cuda()

    output = embedding_original(input_data)
    loss_original = torch.mul(output, loss_weight).sum()
    loss_original.backward()

    set_random_seed(seed)
    embedding_parallel = layers.ParallelEmbedding(
        vocab_size, hidden_size, init_method=init.normal_).cuda()
    output = embedding_parallel(input_data)
    loss_parallel = torch.mul(output, loss_weight).sum()
    loss_parallel.backward()

    set_random_seed(seed)
    embedding_vocab_parallel = layers.VocabParallelEmbedding(
        vocab_size, hidden_size, init_method=init.normal_).cuda()
    output = embedding_vocab_parallel(input_data)
    loss_vocab_parallel = torch.mul(output, loss_weight).sum()
    loss_vocab_parallel.backward()

    torch.distributed.barrier()
    error = loss_parallel.sub(loss_original).abs()
    print('   error in loss (parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    torch.distributed.barrier()
    error = loss_vocab_parallel.sub(loss_original).abs()
    print('   error in loss (vocab parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    weight_grad_orig = torch.split(embedding_original.weight.grad,
                                   hidden_size // tensor_model_parallel_size,
                                   1)[mpu.get_tensor_model_parallel_rank()]
    error = embedding_parallel.weight.grad.sub(weight_grad_orig).abs().max()
    print('   error in grad (parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    weight_grad_orig = torch.split(embedding_original.weight.grad,
                                   vocab_size // tensor_model_parallel_size,
                                   0)[mpu.get_tensor_model_parallel_rank()]
    error = embedding_vocab_parallel.weight.grad.sub(weight_grad_orig).abs().max()
    print('   error in grad (vocab parallel) on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-12, 'error: {}'.format(error)

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


def test_initialize_affine_weight(tensor_model_parallel_size):

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing initialize_affine_weight with model parallel '
              'size: {}'.format(tensor_model_parallel_size))

    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    seed = 12345
    input_size_coeff = 13
    input_size = input_size_coeff * tensor_model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * tensor_model_parallel_size

    # ---------------
    # Column parallel
    # ---------------
    weight = torch.empty(output_size_coeff, input_size)
    set_random_seed(seed)
    layers._initialize_affine_weight(weight, output_size, input_size,
                                     output_size_coeff, 0,
                                     torch.nn.init.normal_)
    # Target.
    set_random_seed(seed)
    master_weight = torch.empty(output_size, input_size)
    torch.nn.init.normal_(master_weight)
    rank = mpu.get_tensor_model_parallel_rank()
    my_weight = torch.split(master_weight, output_size_coeff,
                            dim=0)[rank].contiguous().clone()

    # Compare.
    error = weight.sub(my_weight).abs().max()
    torch.distributed.barrier()
    print('   column parallel max error (should be zero) on global rank '
          '{}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # ------------
    # Row parallel
    # ------------
    weight = torch.empty(output_size, input_size_coeff)
    set_random_seed(seed)
    mpu.layers._initialize_affine_weight(weight, output_size, input_size,
                                         input_size_coeff, 1,
                                         torch.nn.init.normal_)
    # Target.
    set_random_seed(seed)
    master_weight = torch.empty(output_size, input_size)
    torch.nn.init.normal_(master_weight)
    rank = mpu.get_tensor_model_parallel_rank()
    my_weight = torch.split(master_weight, input_size_coeff,
                            dim=1)[rank].contiguous().clone()

    # Compare.
    error = weight.sub(my_weight).abs().max()
    torch.distributed.barrier()
    print('   row parallel max error (should be zero) on global rank '
          '{}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')


class IdentityLayer2D(torch.nn.Module):

    def __init__(self, m, n):
        super(IdentityLayer2D, self).__init__()
        self.weight = Parameter(torch.Tensor(m, n))
        torch.nn.init.xavier_normal_(self.weight)

    def forward(self):
        return self.weight


def test_column_parallel_linear(tensor_model_parallel_size):

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing ColumnParallelLinear with model parallel '
              'size: {}'.format(tensor_model_parallel_size))

    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * tensor_model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * tensor_model_parallel_size
    batch_size = 7

    # Network
    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
    linear_layer = mpu.ColumnParallelLinear(
        input_size, output_size, keep_master_weight_for_test=True).cuda()
    loss_weight = torch.randn([batch_size, output_size]).cuda()
    # Forward
    input_ = identity_layer()
    output = linear_layer(input_)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    # Values.
    dLdY = loss_weight
    X = identity_layer.weight
    A = linear_layer.master_weight.cuda()
    dLdA = torch.matmul(dLdY.t(), X)
    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
    dLdX = torch.matmul(dLdY, A)

    rank = mpu.get_tensor_model_parallel_rank()
    my_dLdA = torch.split(dLdA, output_size_coeff,
                          dim=0)[rank].contiguous().clone()
    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print('   error in dLdA on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    my_dLdb = torch.split(dLdb, output_size_coeff,
                          dim=0)[rank].contiguous().clone()
    error = my_dLdb.sub(linear_layer.bias.grad).abs().max()
    torch.distributed.barrier()
    print('   error in dLdb on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdX.sub(identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print('   error in dLdX on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')


def test_row_parallel_linear(tensor_model_parallel_size):

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    if torch.distributed.get_rank() == 0:
        print('> testing RowParallelLinear with model parallel '
              'size: {}'.format(tensor_model_parallel_size))

    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)
    input_size_coeff = 13
    input_size = input_size_coeff * tensor_model_parallel_size
    output_size_coeff = 17
    output_size = output_size_coeff * tensor_model_parallel_size
    batch_size = 7

    # Network
    identity_layer = IdentityLayer2D(batch_size, input_size).cuda()
    linear_layer = mpu.RowParallelLinear(
        input_size, output_size, keep_master_weight_for_test=True).cuda()
    loss_weight = torch.randn([batch_size, output_size]).cuda()
    # Forward
    input_ = identity_layer()
    output = linear_layer(input_)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    # Values.
    dLdY = loss_weight
    X = identity_layer.weight
    A = linear_layer.master_weight.cuda()
    dLdA = torch.matmul(dLdY.t(), X)
    dLdb = torch.matmul(torch.ones(batch_size, 1).cuda().t(), dLdY).view(-1)
    dLdX = torch.matmul(dLdY, A)

    rank = mpu.get_tensor_model_parallel_rank()
    my_dLdA = torch.split(dLdA, input_size_coeff,
                          dim=1)[rank].contiguous().clone()
    error = my_dLdA.sub(linear_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print('   error in dLdA on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdb.sub(linear_layer.bias.grad).abs().max()
    torch.distributed.barrier()
    print('   error in dLdb on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    error = dLdX.sub(identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print('   error in dLdX on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')


class IdentityLayer3D(torch.nn.Module):

    def __init__(self, m, n, k):
        super(IdentityLayer3D, self).__init__()
        self.weight = Parameter(torch.Tensor(m, n, k))
        torch.nn.init.xavier_normal_(self.weight)

    def forward(self):
        return self.weight


def parallel_self_attention(tensor_model_parallel_size, num_att_heads_per_partition,
                            hidden_size_per_att_head, dropout_prob, batch_size,
                            sequence_length):
    mpu.initialize_model_parallel(tensor_model_parallel_size)
    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)

    num_att_heads = num_att_heads_per_partition * \
        torch.distributed.get_world_size()
    hidden_size = hidden_size_per_att_head * num_att_heads

    # Network
    identity_layer = IdentityLayer3D(batch_size, sequence_length,
                                     hidden_size).cuda()
    attention_layer = mpu.BertParallelSelfAttention(hidden_size, num_att_heads,
                                                    dropout_prob).cuda()
    loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
    # Forward
    input_ = identity_layer()
    output = attention_layer(input_, attention_mask)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    rank = mpu.get_tensor_model_parallel_rank()
    mpu.destroy_model_parallel()
    return rank, hidden_size, tensor_model_parallel_size, loss, \
        attention_layer, identity_layer


def test_parallel_self_attention(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing ParallelSelfAttention with model parallel '
              'size: {}'.format(tensor_model_parallel_size))

    num_att_heads_per_partition = 3
    hidden_size_per_att_head = 7
    dropout_prob = 0.0  # has to be zero
    batch_size = 5
    sequence_length = 13

    rank_1, hideen_size_1, tensor_model_parallel_size_1, loss_1, \
        attention_layer_1, identity_layer_1 = parallel_self_attention(
            1, num_att_heads_per_partition,
            hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)

    rank, hidden_size, tensor_model_parallel_size, loss, \
        attention_layer, identity_layer = parallel_self_attention(
            tensor_model_parallel_size, num_att_heads_per_partition,
            hidden_size_per_att_head, dropout_prob, batch_size, sequence_length)
    assert hideen_size_1 == hidden_size

    error = loss_1.sub(loss).abs().max()
    torch.distributed.barrier()
    print('   loss error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-6

    my_lin_grad_list = torch.split(
        attention_layer_1.query_key_value.weight.grad,
        hidden_size // tensor_model_parallel_size,
        0)[rank::tensor_model_parallel_size]
    my_lin_grad = torch.cat(my_lin_grad_list, dim=0)
    error = my_lin_grad.sub(
        attention_layer.query_key_value.weight.grad).abs().max()
    torch.distributed.barrier()
    print('   weight gradient error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-6

    error = identity_layer_1.weight.grad.sub(
        identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print('   input gradient error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-6

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')


def parallel_transformer(tensor_model_parallel_size, num_att_heads_per_partition,
                         hidden_size_per_att_head, batch_size, sequence_length):

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    seed = 12345
    set_random_seed(seed)

    num_att_heads = num_att_heads_per_partition * \
        torch.distributed.get_world_size()
    hidden_size = hidden_size_per_att_head * num_att_heads
    intermediate_size = 4 * hidden_size

    # Network
    identity_layer = IdentityLayer3D(batch_size, sequence_length,
                                     hidden_size).cuda()
    transformer_layer = mpu.BertParallelTransformerLayer(
        hidden_size, intermediate_size, num_att_heads, 0.0, 0.0,
        torch.nn.functional.relu, 1.0e-5).cuda()

    loss_weight = torch.randn([batch_size, sequence_length, hidden_size]).cuda()
    attention_mask = torch.randn([batch_size, 1, 1, sequence_length]).cuda()
    # Forward
    input_ = identity_layer()
    output = transformer_layer(input_, attention_mask)
    loss = torch.mul(output, loss_weight).sum()
    # Backward
    loss.backward()

    rank = mpu.get_tensor_model_parallel_rank()
    mpu.destroy_model_parallel()
    return rank, hidden_size, tensor_model_parallel_size, loss, \
        transformer_layer, identity_layer


def test_parallel_transformer_layer(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing ParallelTransformerLayer with model parallel '
              'size: {}'.format(tensor_model_parallel_size))

    num_att_heads_per_partition = 3
    hidden_size_per_att_head = 7
    batch_size = 5
    sequence_length = 13

    rank_1, hidden_size_1, tensor_model_parallel_size_1, loss_1, \
        transformer_layer_1, identity_layer_1 = parallel_transformer(
            1, num_att_heads_per_partition,
            hidden_size_per_att_head, batch_size, sequence_length)

    rank, hidden_size, tensor_model_parallel_size, loss, \
        transformer_layer, identity_layer = parallel_transformer(
            tensor_model_parallel_size, num_att_heads_per_partition,
            hidden_size_per_att_head, batch_size, sequence_length)

    error = loss_1.sub(loss).abs().max()
    torch.distributed.barrier()
    print('   loss error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-5, 'error: {}'.format(error)

    error = identity_layer_1.weight.grad.sub(
        identity_layer.weight.grad).abs().max()
    torch.distributed.barrier()
    print('   input gradient error on global rank {}: {}'.format(
        torch.distributed.get_rank(), error))
    assert error < 5.0e-5, 'error: {}'.format(error)

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print(' >> passed the test :-)')


if __name__ == '__main__':

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    print_separator('test initialize affine weight')
    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        test_initialize_affine_weight(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test parallel embedding')
        test_parallel_embedding(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2

    print_separator('test column-parallel linear')
    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        test_column_parallel_linear(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2

    print_separator('test row-parallel linear')
    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        test_row_parallel_linear(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2

    print_separator('test parallel self-attention')
    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        test_parallel_self_attention(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2

    print_separator('test parallel transformer')
    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        test_parallel_transformer_layer(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2
megatron-deepspeed_dtk22.10/megatron/mpu/tests/test_random.py
0 → 100644
View file @ 8ec5d678

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from commons import print_separator
from commons import initialize_distributed
import mpu
import torch
import sys
sys.path.append("../..")


def test_set_cuda_rng_state(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing set_rng_state with size {} ...'.format(
            tensor_model_parallel_size))

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    size = 123
    seed = 1234
    torch.cuda.manual_seed(1234)
    tensor = torch.cuda.FloatTensor(size)

    # Get the state
    rng_state = torch.cuda.get_rng_state()
    rng_state_copy = rng_state.clone()

    # Do some stuff.
    for _ in range(5):
        torch.randn(size, out=tensor)
    result_1 = tensor.clone()

    assert rng_state.sub(rng_state_copy).max() == 0
    assert torch.cuda.get_rng_state().sub(rng_state_copy).max() > 0

    # State should be different.
    new_rng_state = torch.cuda.get_rng_state()
    max_diff = new_rng_state.sub(rng_state).max()
    print('   max diff in rng state (should be non-zero) on global rank {}: {}'.
          format(torch.distributed.get_rank(), max_diff))
    assert max_diff > 0

    # Reset the rng state and do the same stuff.
    mpu.random._set_cuda_rng_state(rng_state)
    for _ in range(5):
        torch.randn(size, out=tensor)
    mpu.random._set_cuda_rng_state(rng_state)
    for _ in range(5):
        torch.randn(size, out=tensor)
    result_2 = tensor.clone()

    # Results should be the same
    error = result_2.sub(result_1).abs().max()
    print('   max error in generated tensors (should be zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Input state should have remained intact.
    error = rng_state.sub(rng_state_copy).max()
    print('   max error in rng state (should be zero) on global rank {}: {}'.
          format(torch.distributed.get_rank(), error))
    assert error == 0

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


def test_cuda_rng_tracker(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing cuda rng tracker with size {} ...'.format(
            tensor_model_parallel_size))

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    seed_1 = 1234
    seed_2 = 4321
    size = [12, 21]
    tensor = torch.cuda.FloatTensor(size)

    # Set to seed_1 and generate two tensors.
    torch.cuda.manual_seed(seed_1)
    torch.randn(size, out=tensor)
    target_11 = tensor.clone()
    torch.randn(size, out=tensor)
    target_12 = tensor.clone()

    # Set to seed_2 and generate two tensors.
    torch.cuda.manual_seed(seed_2)
    torch.randn(size, out=tensor)
    target_21 = tensor.clone()
    torch.randn(size, out=tensor)
    target_22 = tensor.clone()

    # Now if we interleave seed_1 and seed_2,
    # we should still get the same tensors
    torch.cuda.manual_seed(seed_1)
    mpu.get_cuda_rng_tracker().add('test', seed_2)

    torch.randn(size, out=tensor)
    result_11 = tensor.clone()

    with mpu.get_cuda_rng_tracker().fork('test'):
        torch.randn(size, out=tensor)
        result_21 = tensor.clone()

    torch.randn(size, out=tensor)
    result_12 = tensor.clone()

    with mpu.get_cuda_rng_tracker().fork('test'):
        torch.randn(size, out=tensor)
        result_22 = tensor.clone()

    diff = result_11.sub(result_21).abs().max()
    diff = min(diff, result_12.sub(result_22).abs().max())
    print('   max diff in generated tensors (should be non-zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), diff))
    assert diff > 1.0e-6
    error = max(result_11.sub(target_11).abs().max(),
                result_12.sub(target_12).abs().max())
    error = max(error, result_21.sub(target_21).abs().max())
    error = max(error, result_22.sub(target_22).abs().max())
    print('   max error in generated tensors (should be zero) on '
          'global rank {}: {}'.format(torch.distributed.get_rank(), error))
    assert error < 1.0e-6

    # Reset the tracker
    mpu.get_cuda_rng_tracker().reset()

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


def test_model_parallel_cuda_manual_seed(tensor_model_parallel_size):

    if torch.distributed.get_rank() == 0:
        print('> testing model parallel cuda manual seed with size {} ...'.format(
            tensor_model_parallel_size))

    mpu.initialize_model_parallel(tensor_model_parallel_size)
    tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()

    mpu.model_parallel_cuda_manual_seed(12345)
    assert torch.cuda.initial_seed() == 12345
    with mpu.get_cuda_rng_tracker().fork():
        assert torch.cuda.initial_seed() == (12345 + 2718 +
                                             mpu.get_tensor_model_parallel_rank())

    # Reset the tracker
    mpu.get_cuda_rng_tracker().reset()

    # Reset groups
    mpu.destroy_model_parallel()

    torch.distributed.barrier()
    if torch.distributed.get_rank() == 0:
        print('>> passed the test :-)')


if __name__ == '__main__':

    initialize_distributed()
    world_size = torch.distributed.get_world_size()

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test set rng state')
        test_set_cuda_rng_state(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test cuda rng tracker')
        test_cuda_rng_tracker(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2

    tensor_model_parallel_size = 1
    while tensor_model_parallel_size <= world_size:
        print_separator('test model parallel cuda manual seed')
        test_model_parallel_cuda_manual_seed(tensor_model_parallel_size)
        tensor_model_parallel_size *= 2
megatron-deepspeed_dtk22.10/megatron/mpu/utils.py
0 → 100644
View file @ 8ec5d678

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch


def ensure_divisibility(numerator, denominator):
    """Ensure that numerator is divisible by the denominator."""
    assert numerator % denominator == 0, '{} is not divisible by {}'.format(
        numerator, denominator)


def divide(numerator, denominator):
    """Ensure that numerator is divisible by the denominator and return
    the division value."""
    ensure_divisibility(numerator, denominator)
    return numerator // denominator


def split_tensor_along_last_dim(tensor, num_partitions,
                                contiguous_split_chunks=False):
    """Split a tensor along its last dimension.
    Arguments:
        tensor: input tensor.
        num_partitions: number of partitions to split the tensor
        contiguous_split_chunks: If True, make each chunk contiguous
                                 in memory.
    """
    # Get the size and dimension.
    last_dim = tensor.dim() - 1
    last_dim_size = divide(tensor.size()[last_dim], num_partitions)
    # Split.
    tensor_list = torch.split(tensor, last_dim_size, dim=last_dim)
    # Note: torch.split does not create contiguous tensors by default.
    if contiguous_split_chunks:
        return tuple(chunk.contiguous() for chunk in tensor_list)

    return tensor_list


class VocabUtility:
    """Split the vocabulary into `world_size` chunks and return the
    first and last index of the vocabulary belonging to the `rank`
    partition. Note that the indices are in [first, last)."""

    @staticmethod
    def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
                                                  rank, world_size):
        index_f = rank * per_partition_vocab_size
        index_l = index_f + per_partition_vocab_size
        return index_f, index_l

    @staticmethod
    def vocab_range_from_global_vocab_size(global_vocab_size, rank, world_size):
        per_partition_vocab_size = divide(global_vocab_size, world_size)
        return VocabUtility.vocab_range_from_per_partition_vocab_size(
            per_partition_vocab_size, rank, world_size)
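
A short usage sketch of the two helpers above. It assumes the package from this commit is importable (e.g. run from inside the repo checkout); only CPU tensors are needed.

import torch
# Assumption: the module above is on the Python path.
from megatron.mpu.utils import split_tensor_along_last_dim, VocabUtility

x = torch.arange(24).reshape(2, 12)
chunks = split_tensor_along_last_dim(x, 3)             # three views of shape (2, 4)
print([tuple(c.shape) for c in chunks])

# Rank 1 of a 4-way split of a 128-token vocabulary owns indices [32, 64).
print(VocabUtility.vocab_range_from_global_vocab_size(128, rank=1, world_size=4))
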
megatron-deepspeed_dtk22.10/megatron/optimizer/__init__.py
0 → 100644
View file @ 8ec5d678

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from apex.optimizers import FusedAdam as Adam
from apex.optimizers import FusedSGD as SGD

from megatron import get_args
from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm

from .grad_scaler import ConstantGradScaler, DynamicGradScaler
from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer


def _get_params_for_weight_decay_optimization(modules):
    """Divide params into with-weight-decay and without-weight-decay groups.
    Layernorms and biases will have no weight decay but the rest will.
    """
    weight_decay_params = {'params': []}
    no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
    for module in modules:
        for module_ in module.modules():
            if isinstance(module_, LayerNorm):
                no_weight_decay_params['params'].extend(
                    [p for p in list(module_._parameters.values())
                     if p is not None])
            else:
                weight_decay_params['params'].extend(
                    [p for n, p in list(module_._parameters.items())
                     if p is not None and n != 'bias'])
                no_weight_decay_params['params'].extend(
                    [p for n, p in list(module_._parameters.items())
                     if p is not None and n == 'bias'])

    # XXX: temp hack to workaround the crash in apex FusedAdam's multi_tensor_applier
    #
    # it crashes when the param count is larger than a certain size which we hit at 200B over 80
    # A100 gpus - I think around 2.7B per gpu, so halving it works around the issue
    param_count = len(weight_decay_params['params'])
    first_half = weight_decay_params['params'][:param_count // 2]
    second_half = weight_decay_params['params'][param_count // 2:]

    first_half = {'params': first_half}
    second_half = {'params': second_half}

    return first_half, second_half, no_weight_decay_params
    #return weight_decay_params, no_weight_decay_params


def get_megatron_optimizer(model):
    args = get_args()

    if args.cpu_optimizer:
        raise NotImplementedError('need to add cpu adam')

    # Base optimizer.
    param_groups = _get_params_for_weight_decay_optimization(model)

    if args.optimizer == 'adam':
        if args.use_bnb_optimizer:
            import bitsandbytes as bnb
            adam_optimizer = bnb.optim.Adam8bit
        else:
            adam_optimizer = Adam

        optimizer = adam_optimizer(param_groups,
                                   lr=args.lr,
                                   weight_decay=args.weight_decay,
                                   betas=(args.adam_beta1, args.adam_beta2),
                                   eps=args.adam_eps)
    elif args.optimizer == 'sgd':
        optimizer = SGD(param_groups,
                        lr=args.lr,
                        weight_decay=args.weight_decay,
                        momentum=args.sgd_momentum)
    else:
        raise Exception('{} optimizer is not supported.'.format(
            args.optimizer))

    if args.deepspeed:
        return optimizer

    # Determine whether the params have main-grad field.
    params_have_main_grad = False
    if args.DDP_impl == 'local':
        params_have_main_grad = True

    if args.fp16 or args.bf16:
        # Grad scaler:
        #    if loss-scale is provided, instantiate the constant scaler.
        #    if we are using fp16 and loss-scale is not present, use a
        #       dynamic scaler.
        #    otherwise we are running in bf16 with no loss-scale so
        #       leave it as None.
        grad_scaler = None
        # Constant loss scale.
        if args.loss_scale:
            grad_scaler = ConstantGradScaler(args.loss_scale)
        # Dynamic loss scale.
        else:
            if args.fp16:
                grad_scaler = DynamicGradScaler(
                    initial_scale=args.initial_loss_scale,
                    min_scale=args.min_loss_scale,
                    growth_factor=2.0,
                    backoff_factor=0.5,
                    growth_interval=args.loss_scale_window,
                    hysteresis=args.hysteresis)

        # Megatron optimizer.
        return Float16OptimizerWithFloat16Params(optimizer,
                                                 args.clip_grad,
                                                 args.log_num_zeros_in_grad,
                                                 params_have_main_grad,
                                                 args.bf16,
                                                 grad_scaler)

    # FP32.
    return FP32Optimizer(optimizer, args.clip_grad,
                         args.log_num_zeros_in_grad,
                         params_have_main_grad)
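
To illustrate the grouping rule used by _get_params_for_weight_decay_optimization (LayerNorm parameters and all biases get weight_decay=0.0, everything else is decayed), here is a standalone sketch against plain torch.nn modules. It re-implements the rule rather than calling the Megatron helper, which needs the global args and the fused LayerNorm; the helper name split_weight_decay_groups is only for this illustration.

import torch

def split_weight_decay_groups(model):
    decay, no_decay = [], []
    for module in model.modules():
        for name, p in module._parameters.items():
            if p is None:
                continue
            if isinstance(module, torch.nn.LayerNorm) or name == 'bias':
                no_decay.append(p)        # norms and biases: no weight decay
            else:
                decay.append(p)           # everything else: decayed
    return [{'params': decay}, {'params': no_decay, 'weight_decay': 0.0}]

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.LayerNorm(8))
groups = split_weight_decay_groups(model)
print(len(groups[0]['params']), len(groups[1]['params']))   # 1 decayed, 3 not decayed
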
megatron-deepspeed_dtk22.10/megatron/optimizer/clip_grads.py
0 → 100644
View file @ 8ec5d678

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Gradient clipping."""

import torch
from torch._six import inf

from apex.multi_tensor_apply import multi_tensor_applier
import amp_C

from megatron import mpu
from megatron.model.module import param_is_not_shared
from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate


def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
    """Clips gradient norm of an iterable of parameters whose gradients
    are in fp32.

    This is adapted from torch.nn.utils.clip_grad.clip_grad_norm_ and
    added functionality to handle model parallel parameters. Note that
    the gradients are modified in place.

    Arguments:
        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
            single Tensor that will have gradients normalized
        max_norm (float or int): max norm of the gradients
        norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
            infinity norm.

    Returns:
        Total norm of the parameters (viewed as a single vector).
    """

    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]

    # Filter parameters based on:
    #   - grad should not be none
    #   - parameter should not be shared
    #   - should not be a replica due to tensor model parallelism
    grads = []
    grads_for_norm = []
    for param in parameters:
        grad_not_none = param.grad is not None
        is_not_shared = param_is_not_shared(param)
        is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
        grad = param.grad.detach()
        if grad_not_none:
            # Make sure the grads are in fp32
            assert param.grad.type() == 'torch.cuda.FloatTensor'
            grads.append(grad)
        if grad_not_none and is_not_shared and is_not_tp_duplicate:
            grads_for_norm.append(grad)

    # Norm parameters.
    max_norm = float(max_norm)
    norm_type = float(norm_type)
    total_norm = 0.0

    # Calculate norm.
    if norm_type == inf:
        total_norm = max(grad.abs().max() for grad in grads_for_norm)
        total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
        # Take max across all model-parallel GPUs.
        torch.distributed.all_reduce(total_norm_cuda,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=mpu.get_model_parallel_group())
        total_norm = total_norm_cuda[0].item()

    else:
        if norm_type == 2.0:
            dummy_overflow_buf = torch.cuda.IntTensor([0])
            # Use apex's multi-tensor applier for efficiency reasons.
            # Multi-tensor applier takes a function and a list of list
            # and performs the operation on that list all in one kernel.
            grad_norm, _ = multi_tensor_applier(
                amp_C.multi_tensor_l2norm,
                dummy_overflow_buf,
                [grads_for_norm],
                False  # no per-parameter norm
            )
            # Since we will be summing across data parallel groups,
            # we need the pow(norm-type).
            total_norm = grad_norm ** norm_type

        else:
            for grad in grads_for_norm:
                grad_norm = torch.norm(grad, norm_type)
                total_norm += grad_norm ** norm_type

        # Sum across all model-parallel GPUs.
        torch.distributed.all_reduce(total_norm,
                                     op=torch.distributed.ReduceOp.SUM,
                                     group=mpu.get_model_parallel_group())
        total_norm = total_norm.item() ** (1.0 / norm_type)

    # Scale.
    clip_coeff = max_norm / (total_norm + 1.0e-6)
    if clip_coeff < 1.0:
        dummy_overflow_buf = torch.cuda.IntTensor([0])
        multi_tensor_applier(amp_C.multi_tensor_scale,
                             dummy_overflow_buf,
                             [grads, grads],
                             clip_coeff)

    return total_norm


def count_zeros_fp32(parameters):

    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]

    # Filter parameters based on:
    #   - grad should not be none
    #   - parameter should not be shared
    #   - should not be a replica due to tensor model parallelism
    total_num_zeros = 0.0
    for param in parameters:
        grad_not_none = param.grad is not None
        is_not_shared = param_is_not_shared(param)
        is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
        if grad_not_none and is_not_shared and is_not_tp_duplicate:
            grad = param.grad.detach()
            num_zeros = grad.numel() - torch.count_nonzero(grad)
            total_num_zeros = num_zeros + total_num_zeros

    # Sum across all model-parallel GPUs.
    torch.distributed.all_reduce(total_num_zeros,
                                 op=torch.distributed.ReduceOp.SUM,
                                 group=mpu.get_model_parallel_group())
    total_num_zeros = total_num_zeros.item()

    return total_num_zeros
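
A small CPU-only check of the norm math used above: each model-parallel rank contributes sum(|g|^p) for its gradient shards, the reduced total is raised to 1/p, and the result matches the p-norm of the concatenated gradients. The sketch fakes two "ranks" with plain tensors, so no distributed setup or apex is required; it is an illustration of the reduction, not of the clipping kernel itself.

import torch

norm_type = 2.0
shards = [torch.randn(5), torch.randn(7)]              # gradients held by two fake ranks

# Per-rank contribution, then the "all-reduce" (here just a Python sum).
total = sum(torch.norm(g, norm_type) ** norm_type for g in shards)
combined = total ** (1.0 / norm_type)

reference = torch.norm(torch.cat(shards), norm_type)
print(torch.isclose(combined, reference))              # tensor(True)
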
megatron-deepspeed_dtk22.10/megatron/optimizer/grad_scaler.py
0 → 100644
View file @ 8ec5d678

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron grad scaler."""

from abc import ABC
from abc import abstractmethod

import torch


class MegatronGradScaler(ABC):

    def __init__(self, initial_scale):
        """Initialize scale value with the input initial scale."""
        assert initial_scale > 0.0
        self._scale = torch.cuda.FloatTensor([initial_scale])

    @property
    def scale(self):
        return self._scale

    @property
    def inv_scale(self):
        return self._scale.double().reciprocal().float()

    @abstractmethod
    def update(self, found_inf):
        pass

    @abstractmethod
    def state_dict(self):
        pass

    @abstractmethod
    def load_state_dict(self, state_dict):
        pass


class ConstantGradScaler(MegatronGradScaler):

    def update(self, found_inf):
        pass

    def state_dict(self):
        return dict()

    def load_state_dict(self, state_dict):
        pass


class DynamicGradScaler(MegatronGradScaler):

    def __init__(self, initial_scale, min_scale,
                 growth_factor, backoff_factor,
                 growth_interval, hysteresis):
        """Grad scaler with dynamic scale that gets adjusted
        during training."""
        super(DynamicGradScaler, self).__init__(initial_scale)

        # Lower bound on the scale.
        assert min_scale > 0.0
        assert min_scale <= initial_scale
        self.min_scale = torch.cuda.FloatTensor([min_scale])
        # Growth and backoff factors for the scale.
        assert growth_factor > 1.0
        self.growth_factor = torch.cuda.FloatTensor([growth_factor])
        assert backoff_factor < 1.0
        assert backoff_factor > 0.0
        self.backoff_factor = torch.cuda.FloatTensor([backoff_factor])
        # Interval over which if we don't see any inf/nan,
        # we will scale the grad scale by the growth factor.
        assert growth_interval > 0
        self.growth_interval = growth_interval
        # Number of inf/nans we should see before scaling down
        # the grad scale by the backoff factor.
        assert hysteresis > 0
        self.hysteresis = hysteresis

        # Trackers.
        self._growth_tracker = 0
        self._hysteresis_tracker = self.hysteresis

    def update(self, found_inf):

        # If we have an inf/nan, growth tracker is set to 0
        # and hysteresis tracker is reduced by 1.
        if found_inf:
            self._growth_tracker = 0
            self._hysteresis_tracker -= 1
            # Now if we are out of hysteresis count, scale down the loss.
            if self._hysteresis_tracker <= 0:
                self._scale = torch.max(self._scale * self.backoff_factor,
                                        self.min_scale)
        else:
            # If there is no nan/inf, increment the growth tracker.
            self._growth_tracker += 1
            # If we have had enough consecutive intervals with no nan/inf:
            if self._growth_tracker == self.growth_interval:
                # Reset the tracker and hysteresis trackers,
                self._growth_tracker = 0
                self._hysteresis_tracker = self.hysteresis
                # and scale up the loss scale.
                self._scale = self._scale * self.growth_factor

    def state_dict(self):
        state_dict = {}
        state_dict['scale'] = self._scale
        state_dict['growth_tracker'] = self._growth_tracker
        state_dict['hysteresis_tracker'] = self._hysteresis_tracker
        return state_dict

    def load_state_dict(self, state_dict):
        self._scale = state_dict['scale'].cuda(torch.cuda.current_device())
        self._growth_tracker = state_dict['growth_tracker']
        self._hysteresis_tracker = state_dict['hysteresis_tracker']
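
The update rule above can be summarized without a GPU. This sketch mirrors the same growth/backoff/hysteresis logic with plain Python floats; it is only an illustration of the dynamics, not the class itself, which keeps its state in CUDA tensors.

# Mirrors DynamicGradScaler.update() with illustrative hyperparameters.
scale, min_scale = 2.0 ** 16, 1.0
growth_factor, backoff_factor = 2.0, 0.5
growth_interval, hysteresis = 3, 2
growth_tracker, hysteresis_left = 0, hysteresis

for found_inf in [False, False, False, True, True]:
    if found_inf:
        growth_tracker = 0
        hysteresis_left -= 1
        if hysteresis_left <= 0:                # back off only after `hysteresis` bad steps
            scale = max(scale * backoff_factor, min_scale)
    else:
        growth_tracker += 1
        if growth_tracker == growth_interval:   # grow after `growth_interval` clean steps
            growth_tracker = 0
            hysteresis_left = hysteresis
            scale *= growth_factor
    print(scale)    # 65536, 65536, 131072, 131072, 65536
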
megatron-deepspeed_dtk22.10/megatron/optimizer/optimizer.py
0 → 100644
View file @ 8ec5d678

# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Megatron optimizer."""

from abc import ABC
from abc import abstractmethod

import torch

from apex.multi_tensor_apply import multi_tensor_applier
import amp_C

from megatron import get_timers
from megatron import mpu
from megatron import print_rank_0

from .clip_grads import clip_grad_norm_fp32, count_zeros_fp32


def _zero_grad_group_helper(group, set_to_none):
    """Zero out the gradient for a group of parameters.
    Note: copied from torch.optim.optimizer."""
    for param in group:
        if param.grad is not None:
            if set_to_none:
                param.grad = None
            else:
                if param.grad.grad_fn is not None:
                    param.grad.detach_()
                else:
                    param.grad.requires_grad_(False)
                param.grad.zero_()


def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):
    """Use multi-tensor-applier to copy values from one list to another.
    We don't have a bfloat16 implementation so for now if the overflow_buf
    is not provided, we default back to simple loop copy to be compatible
    with bfloat16."""
    if overflow_buf:
        overflow_buf.fill_(0)
        # Scaling with factor `1.0` is equivalent to copy.
        multi_tensor_applier(amp_C.multi_tensor_scale,
                             overflow_buf,
                             [this, that],
                             1.0)
    else:
        for this_, that_ in zip(this, that):
            that_.copy_(this_)


class MegatronOptimizer(ABC):

    def __init__(self, optimizer, clip_grad,
                 log_num_zeros_in_grad,
                 params_have_main_grad):
        """Input optimizer is the base optimizer for example Adam."""
        self.optimizer = optimizer
        assert self.optimizer, 'no optimizer is provided.'
        # Set gradient clipping and logging params.
        self.clip_grad = clip_grad
        self.log_num_zeros_in_grad = log_num_zeros_in_grad
        self.params_have_main_grad = params_have_main_grad

    def get_parameters(self):
        params = []
        for param_group in self.optimizer.param_groups:
            for param in param_group['params']:
                params.append(param)
        return params

    def clip_grad_norm(self, clip_grad):
        params = self.get_parameters()
        return clip_grad_norm_fp32(params, clip_grad)

    def count_zeros(self):
        params = self.get_parameters()
        return count_zeros_fp32(params)

    @abstractmethod
    def zero_grad(self, set_to_none=True):
        pass

    @abstractmethod
    def get_loss_scale(self):
        """The output should be a cuda tensor of size 1."""
        pass

    def scale_loss(self, loss):
        """Simple scaling."""
        return self.get_loss_scale() * loss

    @abstractmethod
    def step(self):
        pass

    @abstractmethod
    def reload_model_params(self):
        """Refreshes any internal state from the current model parameters.
        Call whenever the parameters are changed outside of the optimizer.
        For example, when we load a model from a checkpoint without loading
        the optimizer, the model parameters are updated but for fp16 optimizer
        with main parameters, the main parameters need to also be updated."""
        pass

    @abstractmethod
    def state_dict(self):
        pass

    @abstractmethod
    def load_state_dict(self, state_dict):
        pass

    # Promote state so it can be retrieved or set via
    # "optimizer_instance.state"
    def _get_state(self):
        return self.optimizer.state

    def _set_state(self, value):
        self.optimizer.state = value

    state = property(_get_state, _set_state)

    # Promote param_groups so it can be retrieved or set via
    # "optimizer_instance.param_groups"
    # (for example, to adjust the learning rate)
    def _get_param_groups(self):
        return self.optimizer.param_groups

    def _set_param_groups(self, value):
        self.optimizer.param_groups = value

    param_groups = property(_get_param_groups, _set_param_groups)


class Float16OptimizerWithFloat16Params(MegatronOptimizer):
    """Float16 optimizer for fp16 and bf16 data types.

    Arguments:
        optimizer: base optimizer such as Adam or SGD
        clip_grad: clip gradients with this global L2 norm. Note
            that clipping is ignored if clip_grad == 0
        log_num_zeros_in_grad: return number of zeros in the gradients.
        params_have_main_grad: flag indicating if parameters have
            a `main_grad` field. If this is set, we are assuming
            that the model parameters are stored in the `main_grad`
            field instead of the typical `grad` field. This happens
            for the DDP cases where there is a continuous buffer
            holding the gradients. For example for bfloat16, we want
            to do gradient accumulation and all-reduces in float32
            and as a result we store those gradients in the main_grad.
            Note that main grad is not necessarily in float32.
        bf16: if true, the model is running in bfloat16.
        grad_scaler: used for scaling gradients. Note that this can be
            None. This case happens when `bf16 = True` and we don't
            use any loss scale. Note that for `bf16 = True`, we can have
            a constant gradient scaler. Also for `bf16 = False`, we
            always require a grad scaler.
    """

    def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
                 params_have_main_grad, bf16, grad_scaler):
        super(Float16OptimizerWithFloat16Params, self).__init__(
            optimizer, clip_grad, log_num_zeros_in_grad,
            params_have_main_grad)

        self.bf16 = bf16
        self.grad_scaler = grad_scaler
        # None grad scaler is only supported for bf16.
        if self.grad_scaler is None:
            assert self.bf16, 'fp16 expects a grad scaler.'

        # Tensor used to determine if a nan/inf has happened.
        # Any non-zero value indicates inf/nan.
        # Note that we keep this for the cases that grad scaler is none.
        # We still record nan/inf if we have a bfloat16 with a grad scaler.
        if self.grad_scaler:
            self.found_inf = torch.cuda.FloatTensor([0.0])

        # Dummy tensor needed for apex multi-apply tensor.
        # For bfloat, we don't have multi-tensor apply and for now
        # we set it to none so the multi-tensor apply gets ignored.
        if bf16:
            self._dummy_overflow_buf = None
        else:
            self._dummy_overflow_buf = torch.cuda.IntTensor([0])

        # In case grad scaler is not passed, define the unity scale.
        if self.grad_scaler is None:
            self._scale_one = torch.cuda.FloatTensor([1.0])

        # ======================
        # main parameter stuff
        # ======================

        # Three groups of parameters:
        #   float16_groups: original float16 parameters
        #   fp32_from_float16_groups: fp32 copy of float16 parameters
        #   fp32_from_fp32_groups: original fp32 parameters
        self.float16_groups = []
        self.fp32_from_float16_groups = []
        self.fp32_from_fp32_groups = []

        # For all the groups in the original optimizer:
        for param_group in self.optimizer.param_groups:
            float16_params_this_group = []
            fp32_params_this_group = []
            fp32_from_float16_params_this_group = []
            # For all the parameters in this group:
            for i, param in enumerate(param_group['params']):
                if param.requires_grad:

                    # float16 params:
                    if param.type() in ['torch.cuda.HalfTensor',
                                        'torch.cuda.BFloat16Tensor']:
                        float16_params_this_group.append(param)
                        # Create a copy
                        main_param = param.detach().clone().float()
                        # Copy tensor model parallel attributes.
                        mpu.copy_tensor_model_parallel_attributes(main_param,
                                                                  param)
                        if hasattr(param, 'shared'):
                            main_param.shared = param.shared
                        # Replace the optimizer params with the new fp32 copy.
                        param_group['params'][i] = main_param
                        fp32_from_float16_params_this_group.append(main_param)
                        # Reset existing state dict key to the new main param.
                        if param in self.optimizer.state:
                            self.optimizer.state[main_param] \
                                = self.optimizer.state.pop(param)

                    # fp32 params.
                    elif param.type() == 'torch.cuda.FloatTensor':
                        fp32_params_this_group.append(param)
                        param_group['params'][i] = param

                    else:
                        raise TypeError('Wrapped parameters must be one of '
                                        'torch.cuda.FloatTensor, '
                                        'torch.cuda.HalfTensor, or '
                                        'torch.cuda.BFloat16Tensor. '
                                        'Received {}'.format(param.type()))

            self.float16_groups.append(float16_params_this_group)
            self.fp32_from_float16_groups.append(
                fp32_from_float16_params_this_group)
            self.fp32_from_fp32_groups.append(fp32_params_this_group)

        # Leverage state_dict() and load_state_dict() to
        # recast preexisting per-param state tensors
        self.optimizer.load_state_dict(self.optimizer.state_dict())

    def zero_grad(self, set_to_none=True):
        """We only need to zero the model related parameters, i.e.,
        float16_groups & fp32_from_fp32_groups."""
        for group in self.float16_groups:
            _zero_grad_group_helper(group, set_to_none)
        for group in self.fp32_from_fp32_groups:
            _zero_grad_group_helper(group, set_to_none)

    def get_loss_scale(self):
        if self.grad_scaler is None:
            return self._scale_one
        return self.grad_scaler.scale

    def _copy_model_grads_to_main_grads(self):
        # This only needs to be done for the float16 group.
        for model_group, main_group in zip(self.float16_groups,
                                           self.fp32_from_float16_groups):
            for model_param, main_param in zip(model_group, main_group):
                if self.params_have_main_grad:
                    main_param.grad = model_param.main_grad.float()
                else:
                    if model_param.grad is not None:
                        main_param.grad = model_param.grad.float()

        # For fp32 grads, we need to reset the grads to main grad.
        if self.params_have_main_grad:
            for model_group in self.fp32_from_fp32_groups:
                for model_param in model_group:
                    model_param.grad = model_param.main_grad

    def _unscale_main_grads_and_check_for_nan(self):
        main_grads = []
        # fp32 params from float16 ones.
        for main_group in self.fp32_from_float16_groups:
            for main_param in main_group:
                if main_param.grad is not None:
                    main_grads.append(main_param.grad.data)
        # Append fp32 parameters.
        for main_group in self.fp32_from_fp32_groups:
            for main_param in main_group:
                if main_param.grad is not None:
                    main_grads.append(main_param.grad.data)
        # Reset found inf.
        self.found_inf.fill_(0.0)
        # Unscale and set found inf/nan
        torch._amp_foreach_non_finite_check_and_unscale_(
            main_grads, self.found_inf, self.grad_scaler.inv_scale)
        # Update across all model parallel instances.
        torch.distributed.all_reduce(self.found_inf,
                                     op=torch.distributed.ReduceOp.MAX,
                                     group=mpu.get_model_parallel_group())

        # Check for nan.
        found_inf_flag = (self.found_inf.item() > 0)
        return found_inf_flag

    def _get_model_and_main_params_data_float16(self):
        model_data = []
        main_data = []
        for model_group, main_group in zip(self.float16_groups,
                                           self.fp32_from_float16_groups):
            for model_param, main_param in zip(model_group, main_group):
                model_data.append(model_param.data)
                main_data.append(main_param.data)
        return model_data, main_data

    def _copy_main_params_to_model_params(self):
        # Only needed for the float16 params.
        model_data, main_data = self._get_model_and_main_params_data_float16()
        _multi_tensor_copy_this_to_that(this=main_data, that=model_data,
                                        overflow_buf=self._dummy_overflow_buf)

    def _copy_model_params_to_main_params(self):
        # Only needed for the float16 params.
        model_data, main_data = self._get_model_and_main_params_data_float16()
        _multi_tensor_copy_this_to_that(this=model_data, that=main_data,
                                        overflow_buf=self._dummy_overflow_buf)

    def reload_model_params(self):
        self._copy_model_params_to_main_params()

    @torch.no_grad()
    def step(self):

        timers = get_timers()

        # Copy gradients from model params to main params.
        timers('optimizer-copy-to-main-grad').start()
        self._copy_model_grads_to_main_grads()
        timers('optimizer-copy-to-main-grad').stop()

        # Do unscale, check for inf, and update grad scaler only for
        # the case that grad scaler is provided.
        if self.grad_scaler:

            # Unscale and check for inf/nan.
            timers('optimizer-unscale-and-check-inf').start()
            found_inf_flag = self._unscale_main_grads_and_check_for_nan()
            timers('optimizer-unscale-and-check-inf').stop()

            # We are done with scaling gradients
            # so we can update the loss scale.
            self.grad_scaler.update(found_inf_flag)

            # If we found inf/nan, skip the update.
            if found_inf_flag:
                return False, None, None

        # Clip the main gradients.
        timers('optimizer-clip-main-grad').start()
        grad_norm = None
        if self.clip_grad > 0.0:
            grad_norm = self.clip_grad_norm(self.clip_grad)
        timers('optimizer-clip-main-grad').stop()

        # count the zeros in the grads
        num_zeros_in_grad = self.count_zeros() if \
                            self.log_num_zeros_in_grad else None

        # Step the optimizer.
        self.optimizer.step()

        # Update params from main params.
        timers('optimizer-copy-main-to-model-params').start()
        self._copy_main_params_to_model_params()
        timers('optimizer-copy-main-to-model-params').stop()

        # Successful update.
        return True, grad_norm, num_zeros_in_grad

    def state_dict(self):
        state_dict = {}
        state_dict['optimizer'] = self.optimizer.state_dict()
        if self.grad_scaler:
            state_dict['grad_scaler'] = self.grad_scaler.state_dict()
        state_dict['fp32_from_fp16_params'] = self.fp32_from_float16_groups
        return state_dict

    def load_state_dict(self, state_dict):
        # Optimizer.
        optimizer_key = 'optimizer'
        if optimizer_key not in state_dict:
            optimizer_key = 'optimizer_state_dict'
            print_rank_0('***WARNING*** loading optimizer from '
                         'an old checkpoint ...')
        self.optimizer.load_state_dict(state_dict[optimizer_key])

        # Grad scaler.
        if 'grad_scaler' not in state_dict:
            print_rank_0('***WARNING*** found an old checkpoint, will not '
                         'load grad scaler ...'
)
else
:
if
self
.
grad_scaler
:
self
.
grad_scaler
.
load_state_dict
(
state_dict
[
'grad_scaler'
])
else
:
print_rank_0
(
'***WARNING*** fould the grad scaler in the '
'checkpoint but it is None in the class. '
'Skipping loading grad scaler ...'
)
# Copy data for the main params.
fp32_from_float16_params_key
=
'fp32_from_fp16_params'
if
fp32_from_float16_params_key
not
in
state_dict
:
fp32_from_float16_params_key
=
'fp32_from_fp16'
for
current_group
,
saved_group
in
zip
(
self
.
fp32_from_float16_groups
,
state_dict
[
fp32_from_float16_params_key
]):
for
current_param
,
saved_param
in
zip
(
current_group
,
saved_group
):
current_param
.
data
.
copy_
(
saved_param
.
data
)
class
FP32Optimizer
(
MegatronOptimizer
):
def
__init__
(
self
,
optimizer
,
clip_grad
,
log_num_zeros_in_grad
,
params_have_main_grad
):
super
(
FP32Optimizer
,
self
).
__init__
(
optimizer
,
clip_grad
,
log_num_zeros_in_grad
,
params_have_main_grad
)
self
.
_scale
=
torch
.
cuda
.
FloatTensor
([
1.0
])
def
zero_grad
(
self
,
set_to_none
=
True
):
"""Copied from torch.optim.optimizer"""
for
group
in
self
.
optimizer
.
param_groups
:
_zero_grad_group_helper
(
group
[
'params'
],
set_to_none
)
def
get_loss_scale
(
self
):
"""FP32 optimizer does not do any scaling."""
return
self
.
_scale
@
torch
.
no_grad
()
def
step
(
self
):
"""Clip gradients (if needed) and step the base optimizer.
Always return successful since there is no overflow."""
# Copy main_grads to grads.
if
self
.
params_have_main_grad
:
for
param_group
in
self
.
optimizer
.
param_groups
:
for
param
in
param_group
[
'params'
]:
param
.
grad
=
param
.
main_grad
# Clip gradients.
grad_norm
=
None
if
self
.
clip_grad
>
0.0
:
grad_norm
=
self
.
clip_grad_norm
(
self
.
clip_grad
)
# count the zeros in the grads
num_zeros_in_grad
=
self
.
count_zeros
()
if
\
self
.
log_num_zeros_in_grad
else
None
# Update parameters.
self
.
optimizer
.
step
()
# No overflow for FP32 optimizer.
return
True
,
grad_norm
,
num_zeros_in_grad
def
reload_model_params
(
self
):
pass
def
state_dict
(
self
):
return
self
.
optimizer
.
state_dict
()
def
load_state_dict
(
self
,
state_dict
):
self
.
optimizer
.
load_state_dict
(
state_dict
)
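The class above keeps an fp32 "master" copy of every fp16/bf16 parameter, unscales and checks the gradients in fp32, steps the wrapped optimizer on the masters, and then copies the result back into the model's fp16 weights. The following is a minimal standalone sketch of that pattern, not part of optimizer.py: it assumes a plain torch.optim.SGD optimizer, a fixed loss-scale constant, and a hand-assigned gradient in place of a real forward/backward pass, and it leaves out the dynamic loss-scale update, gradient clipping, and tensor-parallel attributes handled by the real class.

import torch

# fp16 model weight and its fp32 master copy
model_param = torch.nn.Parameter(torch.randn(4, dtype=torch.float16))
main_param = torch.nn.Parameter(model_param.detach().clone().float())
optimizer = torch.optim.SGD([main_param], lr=0.1)  # optimizer only ever sees fp32
loss_scale = 1024.0

# a real run would produce a scaled fp16 gradient via backward(); fabricated here
model_param.grad = torch.randn(4, dtype=torch.float16) * loss_scale

# copy the model gradient to the master param, unscaling in fp32
main_param.grad = model_param.grad.float() / loss_scale

# skip the update if the unscaled gradient contains inf/nan, otherwise step
# the fp32 optimizer and copy the masters back into the fp16 weights
if torch.isfinite(main_param.grad).all():
    optimizer.step()
    model_param.data.copy_(main_param.data)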
megatron-deepspeed_dtk22.10/megatron/p2p_communication.py
0 → 100644
View file @
8ec5d678
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import reduce
import operator
import torch

from megatron import get_args
from megatron import mpu


def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
                 use_ring_exchange=False):
    """Communicate tensors between stages. Used as helper method in other
    communication methods that are used in megatron/schedules.py.

    Takes the following arguments:
        tensor_send_next: tensor to send to next rank (no tensor sent if
                          set to None).
        tensor_send_prev: tensor to send to prev rank (no tensor sent if
                          set to None).
        recv_prev: boolean for whether tensor should be received from
                   previous rank.
        recv_next: boolean for whether tensor should be received from
                   next rank.
        use_ring_exchange: boolean for whether torch.distributed.ring_exchange()
                           API should be used.

    Returns:
        (tensor_recv_prev, tensor_recv_next)
    """
    args = get_args()

    # Create placeholder tensors for receive in forward and backward directions
    # if needed.
    tensor_recv_prev = None
    tensor_recv_next = None
    tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
    if args.scatter_gather_tensors_in_pipeline:
        tensor_chunk_shape = reduce(operator.mul, tensor_shape, 1) // \
            mpu.get_tensor_model_parallel_world_size()
    else:
        tensor_chunk_shape = tensor_shape
    dtype = args.params_dtype
    if args.fp32_residual_connection:
        dtype = torch.float
    if recv_prev:
        tensor_recv_prev = torch.empty(tensor_chunk_shape,
                                       requires_grad=True,
                                       device=torch.cuda.current_device(),
                                       dtype=dtype)
    if recv_next:
        tensor_recv_next = torch.empty(tensor_chunk_shape,
                                       requires_grad=True,
                                       device=torch.cuda.current_device(),
                                       dtype=dtype)

    # Split tensor into smaller chunks if using scatter-gather optimization.
    if args.scatter_gather_tensors_in_pipeline:
        if tensor_send_next is not None:
            tensor_send_next = mpu.split_tensor_into_1d_equal_chunks(tensor_send_next)

        if tensor_send_prev is not None:
            tensor_send_prev = mpu.split_tensor_into_1d_equal_chunks(tensor_send_prev)

    # Send tensors in both the forward and backward directions as appropriate.
    if use_ring_exchange:
        torch.distributed.ring_exchange(tensor_send_prev=tensor_send_prev,
                                        tensor_recv_prev=tensor_recv_prev,
                                        tensor_send_next=tensor_send_next,
                                        tensor_recv_next=tensor_recv_next,
                                        group=mpu.get_pipeline_model_parallel_group())
    else:
        ops = []
        if tensor_send_prev is not None:
            send_prev_op = torch.distributed.P2POp(
                torch.distributed.isend, tensor_send_prev,
                mpu.get_pipeline_model_parallel_prev_rank())
            ops.append(send_prev_op)
        if tensor_recv_prev is not None:
            recv_prev_op = torch.distributed.P2POp(
                torch.distributed.irecv, tensor_recv_prev,
                mpu.get_pipeline_model_parallel_prev_rank())
            ops.append(recv_prev_op)
        if tensor_send_next is not None:
            send_next_op = torch.distributed.P2POp(
                torch.distributed.isend, tensor_send_next,
                mpu.get_pipeline_model_parallel_next_rank())
            ops.append(send_next_op)
        if tensor_recv_next is not None:
            recv_next_op = torch.distributed.P2POp(
                torch.distributed.irecv, tensor_recv_next,
                mpu.get_pipeline_model_parallel_next_rank())
            ops.append(recv_next_op)
        if len(ops) > 0:
            reqs = torch.distributed.batch_isend_irecv(ops)
            for req in reqs:
                req.wait()
        # To protect against race condition when using batch_isend_irecv().
        torch.cuda.synchronize()

    # If using scatter-gather optimization, gather smaller chunks.
    if args.scatter_gather_tensors_in_pipeline:
        if recv_prev:
            tensor_recv_prev = mpu.gather_split_1d_tensor(
                tensor_recv_prev).view(tensor_shape).requires_grad_()

        if recv_next:
            tensor_recv_next = mpu.gather_split_1d_tensor(
                tensor_recv_next).view(tensor_shape).requires_grad_()

    return tensor_recv_prev, tensor_recv_next


def recv_forward(timers=None):
    """Receive tensor from previous rank in pipeline (forward receive)."""
    if mpu.is_pipeline_first_stage():
        input_tensor = None
    else:
        if timers is not None:
            timers('forward-recv').start()
        input_tensor, _ = _communicate(
            tensor_send_next=None,
            tensor_send_prev=None,
            recv_prev=True,
            recv_next=False)
        if timers is not None:
            timers('forward-recv').stop()
    return input_tensor


def recv_backward(timers=None):
    """Receive tensor from next rank in pipeline (backward receive)."""
    if mpu.is_pipeline_last_stage():
        output_tensor_grad = None
    else:
        if timers is not None:
            timers('backward-recv').start()
        _, output_tensor_grad = _communicate(
            tensor_send_next=None,
            tensor_send_prev=None,
            recv_prev=False,
            recv_next=True)
        if timers is not None:
            timers('backward-recv').stop()
    return output_tensor_grad


def send_forward(output_tensor, timers=None):
    """Send tensor to next rank in pipeline (forward send)."""
    if not mpu.is_pipeline_last_stage():
        if timers is not None:
            timers('forward-send').start()
        _communicate(
            tensor_send_next=output_tensor,
            tensor_send_prev=None,
            recv_prev=False,
            recv_next=False)
        if timers is not None:
            timers('forward-send').stop()


def send_backward(input_tensor_grad, timers=None):
    """Send tensor to previous rank in pipeline (backward send)."""
    if not mpu.is_pipeline_first_stage():
        if timers is not None:
            timers('backward-send').start()
        _communicate(
            tensor_send_next=None,
            tensor_send_prev=input_tensor_grad,
            recv_prev=False,
            recv_next=False)
        if timers is not None:
            timers('backward-send').stop()


def send_forward_recv_backward(output_tensor, timers=None):
    """Batched send and recv with next rank in pipeline."""
    if mpu.is_pipeline_last_stage():
        output_tensor_grad = None
    else:
        if timers is not None:
            timers('forward-send-backward-recv').start()
        _, output_tensor_grad = _communicate(
            tensor_send_next=output_tensor,
            tensor_send_prev=None,
            recv_prev=False,
            recv_next=True)
        if timers is not None:
            timers('forward-send-backward-recv').stop()
    return output_tensor_grad


def send_backward_recv_forward(input_tensor_grad, timers=None):
    """Batched send and recv with previous rank in pipeline."""
    if mpu.is_pipeline_first_stage():
        input_tensor = None
    else:
        if timers is not None:
            timers('backward-send-forward-recv').start()
        input_tensor, _ = _communicate(
            tensor_send_next=None,
            tensor_send_prev=input_tensor_grad,
            recv_prev=True,
            recv_next=False)
        if timers is not None:
            timers('backward-send-forward-recv').stop()
    return input_tensor


def send_forward_recv_forward(output_tensor, recv_prev, timers=None):
    """Batched recv from previous rank and send to next rank in pipeline."""
    if timers is not None:
        timers('forward-send-forward-recv').start()
    input_tensor, _ = _communicate(
        tensor_send_next=output_tensor,
        tensor_send_prev=None,
        recv_prev=recv_prev,
        recv_next=False)
    if timers is not None:
        timers('forward-send-forward-recv').stop()
    return input_tensor


def send_backward_recv_backward(input_tensor_grad, recv_next, timers=None):
    """Batched recv from next rank and send to previous rank in pipeline."""
    if timers is not None:
        timers('backward-send-backward-recv').start()
    _, output_tensor_grad = _communicate(
        tensor_send_next=None,
        tensor_send_prev=input_tensor_grad,
        recv_prev=False,
        recv_next=recv_next)
    if timers is not None:
        timers('backward-send-backward-recv').stop()
    return output_tensor_grad


def send_forward_backward_recv_forward_backward(
        output_tensor, input_tensor_grad, recv_prev,
        recv_next, timers=None):
    """Batched send and recv with previous and next ranks in pipeline."""
    if timers is not None:
        timers('forward-backward-send-forward-backward-recv').start()
    input_tensor, output_tensor_grad = _communicate(
        tensor_send_next=output_tensor,
        tensor_send_prev=input_tensor_grad,
        recv_prev=recv_prev,
        recv_next=recv_next)
    if timers is not None:
        timers('forward-backward-send-forward-backward-recv').stop()
    return input_tensor, output_tensor_grad
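At its core, _communicate() builds a list of torch.distributed.P2POp send/receive operations and issues them together with batch_isend_irecv(). The sketch below shows that primitive on its own, outside any Megatron machinery: it assumes a two-process job launched separately (for example with `torchrun --nproc_per_node=2 example.py`) and a process-group backend that supports point-to-point operations (gloo or nccl); the buffer sizes and the gloo choice are illustrative assumptions, not taken from the file above.

import torch
import torch.distributed as dist

def main():
    dist.init_process_group(backend="gloo")  # backend choice is an assumption
    rank = dist.get_rank()
    peer = 1 - rank  # with exactly two ranks, each rank talks to the other

    send_buf = torch.full((4,), float(rank))
    recv_buf = torch.empty(4)

    # queue one send and one receive, then launch them as a single batch
    ops = [
        dist.P2POp(dist.isend, send_buf, peer),
        dist.P2POp(dist.irecv, recv_buf, peer),
    ]
    for req in dist.batch_isend_irecv(ops):
        req.wait()

    print(f"rank {rank} received {recv_buf.tolist()} from rank {peer}")
    dist.destroy_process_group()

if __name__ == "__main__":
    main()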
megatron-deepspeed_dtk22.10/megatron/package_info.py
0 → 100644
View file @
8ec5d678
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
MAJOR = 1
MINOR = 1.5

# Use the following formatting: (major, minor)
VERSION = (MAJOR, MINOR)

__version__ = '.'.join(map(str, VERSION)) + '.bs'
__package_name__ = 'megatron-lm'
__contact_names__ = 'NVIDIA INC'
__url__ = 'https://github.com/NVIDIA/Megatron-LM'
__download_url__ = 'https://github.com/NVIDIA/Megatron-LM/releases'
__description__ = 'Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism.'
__license__ = 'See https://github.com/NVIDIA/Megatron-LM/blob/master/LICENSE'
__keywords__ = 'deep learning, Megatron, gpu, NLP, nvidia, pytorch, torch, language'
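With MAJOR = 1 and MINOR = 1.5, the join above stringifies each component and appends the '.bs' suffix, producing '1.1.5.bs'. A quick check, not part of package_info.py:

MAJOR, MINOR = 1, 1.5
version = '.'.join(map(str, (MAJOR, MINOR))) + '.bs'
assert version == '1.1.5.bs'
print(version)  # 1.1.5.bs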
megatron-deepspeed_dtk22.10/megatron/schedules.py
0 → 100644
View file @
8ec5d678
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from contextlib import contextmanager
import torch
from torch.nn.parallel.distributed import DistributedDataParallel as torchDDP

from megatron import get_args
from megatron import get_num_microbatches
from megatron import get_timers
from megatron import mpu
from megatron import p2p_communication
from megatron.utils import unwrap_model
from megatron.model.distributed import DistributedDataParallel as LocalDDP
from megatron.model.module import Float16Module


def get_forward_backward_func():
    args = get_args()
    if mpu.get_pipeline_model_parallel_world_size() > 1:
        if args.virtual_pipeline_model_parallel_size is not None:
            forward_backward_func = forward_backward_pipelining_with_interleaving
        else:
            forward_backward_func = forward_backward_pipelining_without_interleaving
    else:
        forward_backward_func = forward_backward_no_pipelining
    return forward_backward_func


def forward_step(forward_step_func, data_iterator, model, input_tensor, losses_reduced):
    """Forward step for passed-in model.

    If first stage, input tensor is obtained from data_iterator, otherwise
    passed-in input_tensor is used.

    Returns output tensor."""
    timers = get_timers()
    args = get_args()

    timers('forward-compute').start()
    unwrapped_model = unwrap_model(
        model, (torchDDP, LocalDDP, Float16Module))
    if not args.deepspeed:
        unwrapped_model.set_input_tensor(input_tensor)
    else:
        unwrapped_model.module.set_input_tensor(input_tensor)
    output_tensor, loss_func = forward_step_func(data_iterator, model)
    if mpu.is_pipeline_last_stage():
        output_tensor = loss_func(output_tensor)
        loss, loss_reduced = output_tensor
        output_tensor = loss / get_num_microbatches()
        losses_reduced.append(loss_reduced)
    timers('forward-compute').stop()

    return output_tensor


def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad, model=None):
    """Backward step through passed-in output tensor.

    If last stage, output_tensor_grad is None, otherwise gradient of loss
    with respect to stage's output tensor.

    Returns gradient of loss with respect to input tensor (None if first
    stage)."""
    args = get_args()

    if args.deepspeed:
        assert model is not None

    timers = get_timers()
    timers('backward-compute').start()

    # Retain the grad on the input_tensor.
    if input_tensor is not None:
        input_tensor.retain_grad()

    if args.deepspeed:
        model.backward(output_tensor)
    else:
        # Backward pass.
        if output_tensor_grad is None:
            output_tensor = optimizer.scale_loss(output_tensor)
        torch.autograd.backward(output_tensor, grad_tensors=output_tensor_grad)

    # Collect the grad of the input_tensor.
    input_tensor_grad = None
    if input_tensor is not None:
        input_tensor_grad = input_tensor.grad

    timers('backward-compute').stop()

    return input_tensor_grad


@contextmanager
def dummy_handler():
    try:
        yield
    finally:
        pass


def forward_backward_no_pipelining(forward_step_func, data_iterator, model,
                                   optimizer, timers, forward_only):
    """Run forward and backward passes with no pipeline parallelism
    (no inter-stage communication).

    Returns dictionary with losses."""
    assert len(model) == 1
    model = model[0]

    context_handler = dummy_handler
    if isinstance(model, torchDDP):
        context_handler = model.no_sync

    losses_reduced = []
    input_tensor, output_tensor_grad = None, None
    with context_handler():
        for i in range(get_num_microbatches() - 1):
            output_tensor = forward_step(forward_step_func, data_iterator, model,
                                         input_tensor, losses_reduced)
            if not forward_only:
                backward_step(optimizer, input_tensor, output_tensor,
                              output_tensor_grad, model)

    # Run computation for last microbatch out of context handler (want to
    # synchronize gradients).
    output_tensor = forward_step(forward_step_func, data_iterator, model,
                                 input_tensor, losses_reduced)
    if not forward_only:
        backward_step(optimizer, input_tensor, output_tensor,
                      output_tensor_grad, model)

    return losses_reduced


def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterator, model,
                                                  optimizer, timers, forward_only):
    """Run interleaved 1F1B schedule (model split into model chunks), with
    communication between pipeline stages as needed.

    Returns dictionary with losses if the last stage, empty dict otherwise."""
    input_tensors = [[] for _ in range(len(model))]
    output_tensors = [[] for _ in range(len(model))]
    losses_reduced = []
    if not forward_only:
        output_tensor_grads = [[] for _ in range(len(model))]

    pipeline_parallel_size = mpu.get_pipeline_model_parallel_world_size()
    pipeline_parallel_rank = mpu.get_pipeline_model_parallel_rank()

    # Compute number of warmup and remaining microbatches.
    num_model_chunks = len(model)
    num_microbatches = get_num_microbatches() * num_model_chunks
    all_warmup_microbatches = False
    if forward_only:
        num_warmup_microbatches = num_microbatches
    else:
        # Run all forward passes and then all backward passes if number of
        # microbatches is just the number of pipeline stages.
        # Otherwise, perform (num_model_chunks-1)*pipeline_parallel_size on
        # all workers, followed by more microbatches after depending on
        # stage ID (more forward passes for earlier stages, later stages can
        # immediately start with 1F1B).
        if get_num_microbatches() == pipeline_parallel_size:
            num_warmup_microbatches = num_microbatches
            all_warmup_microbatches = True
        else:
            num_warmup_microbatches = \
                (pipeline_parallel_size - pipeline_parallel_rank - 1) * 2
            num_warmup_microbatches += (
                num_model_chunks - 1) * pipeline_parallel_size
            num_warmup_microbatches = min(num_warmup_microbatches,
                                          num_microbatches)
    num_microbatches_remaining = \
        num_microbatches - num_warmup_microbatches

    def get_model_chunk_id(microbatch_id, forward):
        """Helper method to get the model chunk ID given the iteration number."""
        microbatch_id_in_group = microbatch_id % (pipeline_parallel_size * num_model_chunks)
        model_chunk_id = microbatch_id_in_group // pipeline_parallel_size
        if not forward:
            model_chunk_id = (num_model_chunks - model_chunk_id - 1)
        return model_chunk_id

    def forward_step_helper(microbatch_id):
        """Helper method to run forward step with model split into chunks
        (run set_virtual_pipeline_model_parallel_rank() before calling
        forward_step())."""
        model_chunk_id = get_model_chunk_id(microbatch_id, forward=True)
        mpu.set_virtual_pipeline_model_parallel_rank(model_chunk_id)

        if mpu.is_pipeline_first_stage():
            if len(input_tensors[model_chunk_id]) == \
                    len(output_tensors[model_chunk_id]):
                input_tensors[model_chunk_id].append(None)
        input_tensor = input_tensors[model_chunk_id][-1]
        output_tensor = forward_step(forward_step_func,
                                     data_iterator[model_chunk_id],
                                     model[model_chunk_id],
                                     input_tensor, losses_reduced)
        output_tensors[model_chunk_id].append(output_tensor)

        return output_tensor

    def backward_step_helper(microbatch_id):
        """Helper method to run backward step with model split into chunks
        (run set_virtual_pipeline_model_parallel_rank() before calling
        backward_step())."""
        model_chunk_id = get_model_chunk_id(microbatch_id, forward=False)
        mpu.set_virtual_pipeline_model_parallel_rank(model_chunk_id)

        if mpu.is_pipeline_last_stage():
            if len(output_tensor_grads[model_chunk_id]) == 0:
                output_tensor_grads[model_chunk_id].append(None)
        input_tensor = input_tensors[model_chunk_id].pop(0)
        output_tensor = output_tensors[model_chunk_id].pop(0)
        output_tensor_grad = output_tensor_grads[model_chunk_id].pop(0)
        input_tensor_grad = \
            backward_step(optimizer,
                          input_tensor,
                          output_tensor,
                          output_tensor_grad)

        return input_tensor_grad

    # Run warmup forward passes.
    mpu.set_virtual_pipeline_model_parallel_rank(0)
    input_tensors[0].append(
        p2p_communication.recv_forward(timers))
    for k in range(num_warmup_microbatches):
        output_tensor = forward_step_helper(k)

        # Determine if tensor should be received from previous stage.
        next_forward_model_chunk_id = get_model_chunk_id(k+1, forward=True)
        recv_prev = True
        if mpu.is_pipeline_first_stage(ignore_virtual=True):
            if next_forward_model_chunk_id == 0:
                recv_prev = False
        if k == (num_microbatches - 1):
            recv_prev = False

        # Don't send tensor downstream if on last stage.
        if mpu.is_pipeline_last_stage():
            output_tensor = None

        # Send and receive tensors as appropriate (send tensors computed
        # in this iteration; receive tensors for next iteration).
        if k == (num_warmup_microbatches - 1) and not forward_only and \
                not all_warmup_microbatches:
            input_tensor_grad = None
            recv_next = True
            if mpu.is_pipeline_last_stage(ignore_virtual=True):
                recv_next = False
            input_tensor, output_tensor_grad = \
                p2p_communication.send_forward_backward_recv_forward_backward(
                    output_tensor, input_tensor_grad,
                    recv_prev=recv_prev, recv_next=recv_next,
                    timers=timers)
            output_tensor_grads[num_model_chunks-1].append(output_tensor_grad)
        else:
            input_tensor = \
                p2p_communication.send_forward_recv_forward(
                    output_tensor, recv_prev, timers)
        input_tensors[next_forward_model_chunk_id].append(input_tensor)

    # Run 1F1B in steady state.
    for k in range(num_microbatches_remaining):
        # Forward pass.
        forward_k = k + num_warmup_microbatches
        output_tensor = forward_step_helper(forward_k)

        # Backward pass.
        backward_k = k
        input_tensor_grad = backward_step_helper(backward_k)

        # Send output_tensor and input_tensor_grad, receive input_tensor
        # and output_tensor_grad.

        # Determine if current stage has anything to send in either direction,
        # otherwise set tensor to None.
        forward_model_chunk_id = get_model_chunk_id(forward_k, forward=True)
        mpu.set_virtual_pipeline_model_parallel_rank(forward_model_chunk_id)
        if mpu.is_pipeline_last_stage():
            output_tensor = None

        backward_model_chunk_id = get_model_chunk_id(backward_k, forward=False)
        mpu.set_virtual_pipeline_model_parallel_rank(backward_model_chunk_id)
        if mpu.is_pipeline_first_stage():
            input_tensor_grad = None

        # Determine if peers are sending, and where in data structure to put
        # received tensors.
        recv_prev = True
        if mpu.is_pipeline_first_stage(ignore_virtual=True):
            # First stage is ahead of last stage by (pipeline_parallel_size - 1).
            next_forward_model_chunk_id = get_model_chunk_id(
                forward_k - (pipeline_parallel_size - 1), forward=True)
            if next_forward_model_chunk_id == (num_model_chunks - 1):
                recv_prev = False
            next_forward_model_chunk_id += 1
        else:
            next_forward_model_chunk_id = get_model_chunk_id(forward_k + 1,
                                                             forward=True)

        recv_next = True
        if mpu.is_pipeline_last_stage(ignore_virtual=True):
            # Last stage is ahead of first stage by (pipeline_parallel_size - 1).
            next_backward_model_chunk_id = get_model_chunk_id(
                backward_k - (pipeline_parallel_size - 1), forward=False)
            if next_backward_model_chunk_id == 0:
                recv_next = False
            next_backward_model_chunk_id -= 1
        else:
            next_backward_model_chunk_id = get_model_chunk_id(backward_k + 1,
                                                              forward=False)

        # If last iteration, don't receive; we already received one extra
        # before the start of the for loop.
        if k == (num_microbatches_remaining - 1):
            recv_prev = False

        # Communicate tensors.
        input_tensor, output_tensor_grad = \
            p2p_communication.send_forward_backward_recv_forward_backward(
                output_tensor, input_tensor_grad,
                recv_prev=recv_prev, recv_next=recv_next,
                timers=timers)

        # Put input_tensor and output_tensor_grad in data structures in the
        # right location.
        if recv_prev:
            input_tensors[next_forward_model_chunk_id].append(input_tensor)
        if recv_next:
            output_tensor_grads[next_backward_model_chunk_id].append(
                output_tensor_grad)

    # Run cooldown backward passes (flush out pipeline).
    if not forward_only:
        if all_warmup_microbatches:
            output_tensor_grads[num_model_chunks-1].append(
                p2p_communication.recv_backward(timers))
        for k in range(num_microbatches_remaining, num_microbatches):
            input_tensor_grad = backward_step_helper(k)
            next_backward_model_chunk_id = get_model_chunk_id(k+1, forward=False)
            recv_next = True
            if mpu.is_pipeline_last_stage(ignore_virtual=True):
                if next_backward_model_chunk_id == (num_model_chunks - 1):
                    recv_next = False
            if k == (num_microbatches - 1):
                recv_next = False
            output_tensor_grads[next_backward_model_chunk_id].append(
                p2p_communication.send_backward_recv_backward(
                    input_tensor_grad, recv_next, timers))

    return losses_reduced


def forward_backward_pipelining_without_interleaving(forward_step_func, data_iterator,
                                                     model, optimizer, timers,
                                                     forward_only):
    """Run non-interleaved 1F1B schedule, with communication between pipeline
    stages.

    Returns dictionary with losses if the last stage, empty dict otherwise."""
    timers = get_timers()

    assert len(model) == 1
    model = model[0]

    # Compute number of warmup microbatches.
    num_microbatches = get_num_microbatches()
    num_warmup_microbatches = \
        (mpu.get_pipeline_model_parallel_world_size() -
         mpu.get_pipeline_model_parallel_rank() - 1)
    num_warmup_microbatches = min(
        num_warmup_microbatches,
        num_microbatches)
    num_microbatches_remaining = \
        num_microbatches - num_warmup_microbatches

    input_tensors = []
    output_tensors = []
    losses_reduced = []

    # Run warmup forward passes.
    for i in range(num_warmup_microbatches):
        input_tensor = p2p_communication.recv_forward(timers)
        output_tensor = forward_step(forward_step_func, data_iterator, model,
                                     input_tensor, losses_reduced)
        p2p_communication.send_forward(output_tensor, timers)

        input_tensors.append(input_tensor)
        output_tensors.append(output_tensor)

    # Before running 1F1B, need to receive first forward tensor.
    # If all microbatches are run in warmup / cooldown phase, then no need to
    # receive this tensor here.
    if num_microbatches_remaining > 0:
        input_tensor = p2p_communication.recv_forward(timers)

    # Run 1F1B in steady state.
    for i in range(num_microbatches_remaining):
        last_iteration = (i == (num_microbatches_remaining - 1))

        output_tensor = forward_step(forward_step_func, data_iterator, model,
                                     input_tensor, losses_reduced)
        if forward_only:
            p2p_communication.send_forward(output_tensor, timers)
        else:
            output_tensor_grad = \
                p2p_communication.send_forward_recv_backward(output_tensor,
                                                             timers)

        # Add input_tensor and output_tensor to end of list, then pop from the
        # start of the list for backward pass.
        input_tensors.append(input_tensor)
        output_tensors.append(output_tensor)

        if forward_only:
            if not last_iteration:
                input_tensor = p2p_communication.recv_forward(timers)
        else:
            input_tensor, output_tensor = input_tensors.pop(0), output_tensors.pop(0)

            input_tensor_grad = \
                backward_step(optimizer, input_tensor, output_tensor,
                              output_tensor_grad, model)

            if last_iteration:
                input_tensor = None
                p2p_communication.send_backward(input_tensor_grad, timers)
            else:
                input_tensor = \
                    p2p_communication.send_backward_recv_forward(
                        input_tensor_grad, timers)

    # Run cooldown backward passes.
    if not forward_only:
        for i in range(num_warmup_microbatches):
            input_tensor = input_tensors.pop(0)
            output_tensor = output_tensors.pop(0)

            output_tensor_grad = p2p_communication.recv_backward(timers)

            input_tensor_grad = \
                backward_step(optimizer, input_tensor, output_tensor,
                              output_tensor_grad, model)

            p2p_communication.send_backward(input_tensor_grad, timers)

    return losses_reduced
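In the non-interleaved schedule above, each pipeline stage runs pipeline_world_size - rank - 1 warmup forward passes (capped at the total number of microbatches) before entering the 1F1B steady state, so later stages start alternating forward and backward passes sooner. The snippet below just replays that arithmetic for made-up example values (a 4-stage pipeline with 8 microbatches); it is illustrative and not part of schedules.py.

# Warmup / steady-state split, computed the same way as in
# forward_backward_pipelining_without_interleaving(); sizes are example values.
pipeline_world_size = 4
num_microbatches = 8

for rank in range(pipeline_world_size):
    num_warmup = min(pipeline_world_size - rank - 1, num_microbatches)
    num_remaining = num_microbatches - num_warmup
    print(f"stage {rank}: {num_warmup} warmup, {num_remaining} in 1F1B steady state")
# stage 0: 3 warmup, 5 in 1F1B steady state
# stage 1: 2 warmup, 6 in 1F1B steady state
# stage 2: 1 warmup, 7 in 1F1B steady state
# stage 3: 0 warmup, 8 in 1F1B steady state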
megatron-deepspeed_dtk22.10/megatron/testing_utils.py
0 → 100644
View file @
8ec5d678
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import importlib.util
import inspect
import logging
import numpy as np
import os
import random
import re
import shutil
import sys
import tempfile
import unittest
from distutils.util import strtobool
from io import StringIO
from packaging import version
from pathlib import Path
from typing import Iterator, Union
from unittest import mock
from unittest.case import SkipTest

try:
    import torch
    _torch_available = True
except:
    _torch_available = False


def is_torch_available():
    return _torch_available


def parse_flag_from_env(key, default=False):
    try:
        value = os.environ[key]
    except KeyError:
        # KEY isn't set, default to `default`.
        _value = default
    else:
        # KEY is set, convert it to True or False.
        try:
            _value = strtobool(value)
        except ValueError:
            # More values are supported, but let's keep the message simple.
            raise ValueError(f"If set, {key} must be yes or no.")
    return _value


def parse_int_from_env(key, default=None):
    try:
        value = os.environ[key]
    except KeyError:
        _value = default
    else:
        try:
            _value = int(value)
        except ValueError:
            raise ValueError(f"If set, {key} must be an int.")
    return _value


def require_torch(test_case):
    """
    Decorator marking a test that requires PyTorch.

    These tests are skipped when PyTorch isn't installed.
    """
    if not is_torch_available():
        return unittest.skip("test requires PyTorch")(test_case)
    else:
        return test_case


def require_torch_multi_gpu(test_case):
    """
    Decorator marking a test that requires a multi-GPU setup (in PyTorch). These tests are skipped on a machine without
    multiple GPUs.

    To run *only* the multi_gpu tests, assuming all test names contain multi_gpu: $ pytest -sv ./tests -k "multi_gpu"
    """
    if not is_torch_available():
        return unittest.skip("test requires PyTorch")(test_case)

    import torch

    if torch.cuda.device_count() < 2:
        return unittest.skip("test requires multiple GPUs")(test_case)
    else:
        return test_case


def require_torch_non_multi_gpu(test_case):
    """
    Decorator marking a test that requires 0 or 1 GPU setup (in PyTorch).
    """
    if not is_torch_available():
        return unittest.skip("test requires PyTorch")(test_case)

    import torch

    if torch.cuda.device_count() > 1:
        return unittest.skip("test requires 0 or 1 GPU")(test_case)
    else:
        return test_case


def require_torch_up_to_2_gpus(test_case):
    """
    Decorator marking a test that requires 0 or 1 or 2 GPU setup (in PyTorch).
    """
    if not is_torch_available():
        return unittest.skip("test requires PyTorch")(test_case)

    import torch

    if torch.cuda.device_count() > 2:
        return unittest.skip("test requires 0 or 1 or 2 GPUs")(test_case)
    else:
        return test_case


def require_torch_tpu(test_case):
    """
    Decorator marking a test that requires a TPU (in PyTorch).
    """
    if not is_torch_tpu_available():
        return unittest.skip("test requires PyTorch TPU")(test_case)
    else:
        return test_case


if is_torch_available():
    # Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode
    import torch

    torch_device = "cuda" if torch.cuda.is_available() else "cpu"
else:
    torch_device = None


def require_torch_gpu(test_case):
    """Decorator marking a test that requires CUDA and PyTorch."""
    if torch_device != "cuda":
        return unittest.skip("test requires CUDA")(test_case)
    else:
        return test_case


def require_datasets(test_case):
    """Decorator marking a test that requires datasets."""
    if not is_datasets_available():
        return unittest.skip("test requires `datasets`")(test_case)
    else:
        return test_case


def is_deepspeed_available():
    return importlib.util.find_spec("deepspeed") is not None


def require_deepspeed(test_case):
    """
    Decorator marking a test that requires deepspeed
    """
    if not is_deepspeed_available():
        return unittest.skip("test requires deepspeed")(test_case)
    else:
        return test_case


def is_bnb_available():
    return importlib.util.find_spec("bitsandbytes") is not None


def require_bnb(test_case):
    """
    Decorator marking a test that requires bitsandbytes
    """
    if not is_bnb_available():
        return unittest.skip("test requires bitsandbytes from https://github.com/facebookresearch/bitsandbytes")(test_case)
    else:
        return test_case


def require_bnb_non_decorator():
    """
    Non-Decorator function that would skip a test if bitsandbytes is missing
    """
    if not is_bnb_available():
        raise SkipTest("Test requires bitsandbytes from https://github.com/facebookresearch/bitsandbytes")


def set_seed(seed: int = 42):
    """
    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch``

    Args:
        seed (:obj:`int`): The seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)
    if is_torch_available():
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # ^^ safe to call this function even if cuda is not available


def get_gpu_count():
    """
    Return the number of available gpus (regardless of whether torch or tf is used)
    """
    if is_torch_available():
        import torch

        return torch.cuda.device_count()
    elif is_tf_available():
        import tensorflow as tf

        return len(tf.config.list_physical_devices("GPU"))
    else:
        return 0


def torch_assert_equal(actual, expected, **kwargs):
    # assert_close was added around pt-1.9, it does better checks - e.g will check dimensions match
    if hasattr(torch.testing, "assert_close"):
        return torch.testing.assert_close(actual, expected, rtol=0.0, atol=0.0, **kwargs)
    else:
        return torch.allclose(actual, expected, rtol=0.0, atol=0.0)


def torch_assert_close(actual, expected, **kwargs):
    # assert_close was added around pt-1.9, it does better checks - e.g will check dimensions match
    if hasattr(torch.testing, "assert_close"):
        return torch.testing.assert_close(actual, expected, **kwargs)
    else:
        kwargs.pop("msg", None)  # doesn't have msg arg
        return torch.allclose(actual, expected, **kwargs)


def is_torch_bf16_available():
    # from https://github.com/huggingface/transformers/blob/26eb566e43148c80d0ea098c76c3d128c0281c16/src/transformers/file_utils.py#L301
    if is_torch_available():
        import torch

        if not torch.cuda.is_available() or torch.version.cuda is None:
            return False
        if torch.cuda.get_device_properties(torch.cuda.current_device()).major < 8:
            return False
        if int(torch.version.cuda.split(".")[0]) < 11:
            return False
        if not version.parse(torch.__version__) >= version.parse("1.09"):
            return False
        return True
    else:
        return False


def require_torch_bf16(test_case):
    """Decorator marking a test that requires CUDA hardware supporting bf16 and PyTorch >= 1.9."""
    if not is_torch_bf16_available():
        return unittest.skip("test requires CUDA hardware supporting bf16 and PyTorch >= 1.9")(test_case)
    else:
        return test_case


def get_tests_dir(append_path=None):
    """
    Args:
        append_path: optional path to append to the tests dir path

    Return:
        The full path to the `tests` dir, so that the tests can be invoked from anywhere. Optionally `append_path` is
        joined after the `tests` dir the former is provided.
    """
    # this function caller's __file__
    caller__file__ = inspect.stack()[1][1]
    tests_dir = os.path.abspath(os.path.dirname(caller__file__))
    if append_path:
        return os.path.join(tests_dir, append_path)
    else:
        return tests_dir


#
# Helper functions for dealing with testing text outputs
# The original code came from:
# https://github.com/fastai/fastai/blob/master/tests/utils/text.py

# When any function contains print() calls that get overwritten, like progress bars,
# a special care needs to be applied, since under pytest -s captured output (capsys
# or contextlib.redirect_stdout) contains any temporary printed strings, followed by
# \r's. This helper function ensures that the buffer will contain the same output
# with and without -s in pytest, by turning:
# foo bar\r tar mar\r final message
# into:
# final message
# it can handle a single string or a multiline buffer
def apply_print_resets(buf):
    return re.sub(r"^.*\r", "", buf, 0, re.M)


def assert_screenout(out, what):
    out_pr = apply_print_resets(out).lower()
    match_str = out_pr.find(what.lower())
    assert match_str != -1, f"expecting to find {what} in output: {out_pr}"


class CaptureStd:
    """
    Context manager to capture:

        - stdout: replay it, clean it up and make it available via ``obj.out``
        - stderr: replay it and make it available via ``obj.err``

    init arguments:

        - out - capture stdout: ``True``/``False``, default ``True``
        - err - capture stderr: ``True``/``False``, default ``True``
        - replay - whether to replay or not: ``True``/``False``, default ``True``. By default each
          captured stream gets replayed back on context's exit, so that one can see what the test was
          doing. If this is a not wanted behavior and the captured data shouldn't be replayed, pass
          ``replay=False`` to disable this feature.

    Examples::

        # to capture stdout only with auto-replay
        with CaptureStdout() as cs:
            print("Secret message")
        assert "message" in cs.out

        # to capture stderr only with auto-replay
        import sys

        with CaptureStderr() as cs:
            print("Warning: ", file=sys.stderr)
        assert "Warning" in cs.err

        # to capture both streams with auto-replay
        with CaptureStd() as cs:
            print("Secret message")
            print("Warning: ", file=sys.stderr)
        assert "message" in cs.out
        assert "Warning" in cs.err

        # to capture just one of the streams, and not the other, with auto-replay
        with CaptureStd(err=False) as cs:
            print("Secret message")
        assert "message" in cs.out
        # but best use the stream-specific subclasses

        # to capture without auto-replay
        with CaptureStd(replay=False) as cs:
            print("Secret message")
        assert "message" in cs.out
    """

    def __init__(self, out=True, err=True, replay=True):
        self.replay = replay

        if out:
            self.out_buf = StringIO()
            self.out = "error: CaptureStd context is unfinished yet, called too early"
        else:
            self.out_buf = None
            self.out = "not capturing stdout"

        if err:
            self.err_buf = StringIO()
            self.err = "error: CaptureStd context is unfinished yet, called too early"
        else:
            self.err_buf = None
            self.err = "not capturing stderr"

    def __enter__(self):
        if self.out_buf:
            self.out_old = sys.stdout
            sys.stdout = self.out_buf

        if self.err_buf:
            self.err_old = sys.stderr
            sys.stderr = self.err_buf

        return self

    def __exit__(self, *exc):
        if self.out_buf:
            sys.stdout = self.out_old
            captured = self.out_buf.getvalue()
            if self.replay:
                sys.stdout.write(captured)
            self.out = apply_print_resets(captured)

        if self.err_buf:
            sys.stderr = self.err_old
            captured = self.err_buf.getvalue()
            if self.replay:
                sys.stderr.write(captured)
            self.err = captured

    def __repr__(self):
        msg = ""
        if self.out_buf:
            msg += f"stdout: {self.out}\n"
        if self.err_buf:
            msg += f"stderr: {self.err}\n"
        return msg


# in tests it's the best to capture only the stream that's wanted, otherwise
# it's easy to miss things, so unless you need to capture both streams, use the
# subclasses below (less typing). Or alternatively, configure `CaptureStd` to
# disable the stream you don't need to test.


class CaptureStdout(CaptureStd):
    """Same as CaptureStd but captures only stdout"""

    def __init__(self, replay=True):
        super().__init__(err=False, replay=replay)


class CaptureStderr(CaptureStd):
    """Same as CaptureStd but captures only stderr"""

    def __init__(self, replay=True):
        super().__init__(out=False, replay=replay)


class CaptureLogger:
    """
    Context manager to capture `logging` streams

    Args:

    - logger: 'logging` logger object

    Results:
        The captured output is available via `self.out`

    Example::

        >>> from transformers import logging
        >>> from transformers.testing_utils import CaptureLogger

        >>> msg = "Testing 1, 2, 3"
        >>> logging.set_verbosity_info()
        >>> logger = logging.get_logger("transformers.models.bart.tokenization_bart")
        >>> with CaptureLogger(logger) as cl:
        ...     logger.info(msg)
        >>> assert cl.out, msg+"\n"
    """

    def __init__(self, logger):
        self.logger = logger
        self.io = StringIO()
        self.sh = logging.StreamHandler(self.io)
        self.out = ""

    def __enter__(self):
        self.logger.addHandler(self.sh)
        return self

    def __exit__(self, *exc):
        self.logger.removeHandler(self.sh)
        self.out = self.io.getvalue()

    def __repr__(self):
        return f"captured: {self.out}\n"


@contextlib.contextmanager
# adapted from https://stackoverflow.com/a/64789046/9201239
def ExtendSysPath(path: Union[str, os.PathLike]) -> Iterator[None]:
    """
    Temporarily add given path to `sys.path`.

    Usage ::

       with ExtendSysPath('/path/to/dir'):
           mymodule = importlib.import_module('mymodule')
    """

    path = os.fspath(path)
    try:
        sys.path.insert(0, path)
        yield
    finally:
        sys.path.remove(path)


class TestCasePlus(unittest.TestCase):
    """
    This class extends `unittest.TestCase` with additional features.

    Feature 1: A set of fully resolved important file and dir path accessors.

    In tests often we need to know where things are relative to the current test file, and it's not trivial since the
    test could be invoked from more than one directory or could reside in sub-directories with different depths. This
    class solves this problem by sorting out all the basic paths and provides easy accessors to them:

    * ``pathlib`` objects (all fully resolved):

       - ``test_file_path`` - the current test file path (=``__file__``)
       - ``test_file_dir`` - the directory containing the current test file
       - ``tests_dir`` - the directory of the ``tests`` test suite
       - ``data_dir`` - the directory of the ``tests/data`` test suite
       - ``repo_root_dir`` - the directory of the repository
       - ``src_dir`` - the directory of ``src`` (i.e. where the ``transformers`` sub-dir resides)

    * stringified paths---same as above but these return paths as strings, rather than ``pathlib`` objects:

       - ``test_file_path_str``
       - ``test_file_dir_str``
       - ``tests_dir_str``
       - ``data_dir_str``
       - ``repo_root_dir_str``
       - ``src_dir_str``

    Feature 2: Flexible auto-removable temporary dirs which are guaranteed to get removed at the end of test.

    1. Create a unique temporary dir:

    ::

        def test_whatever(self):
            tmp_dir = self.get_auto_remove_tmp_dir()

    ``tmp_dir`` will contain the path to the created temporary dir. It will be automatically removed at the end of the
    test.

    2. Create a temporary dir of my choice, ensure it's empty before the test starts and don't
    empty it after the test.

    ::

        def test_whatever(self):
            tmp_dir = self.get_auto_remove_tmp_dir("./xxx")

    This is useful for debug when you want to monitor a specific directory and want to make sure the previous tests
    didn't leave any data in there.

    3. You can override the first two options by directly overriding the ``before`` and ``after`` args, leading to the
       following behavior:

    ``before=True``: the temporary dir will always be cleared at the beginning of the test.

    ``before=False``: if the temporary dir already existed, any existing files will remain there.

    ``after=True``: the temporary dir will always be deleted at the end of the test.

    ``after=False``: the temporary dir will always be left intact at the end of the test.

    Note 1: In order to run the equivalent of ``rm -r`` safely, only subdirs of the project repository checkout are
    allowed if an explicit ``tmp_dir`` is used, so that by mistake no ``/tmp`` or similar important part of the
    filesystem will get nuked. i.e. please always pass paths that start with ``./``

    Note 2: Each test can register multiple temporary dirs and they all will get auto-removed, unless requested
    otherwise.

    Feature 3: Get a copy of the ``os.environ`` object that sets up ``PYTHONPATH`` specific to the current test suite.
    This is useful for invoking external programs from the test suite - e.g. distributed training.

    ::

        def test_whatever(self):
            env = self.get_env()
    """

    def setUp(self):
        # get_auto_remove_tmp_dir feature:
        self.teardown_tmp_dirs = []

        # figure out the resolved paths for repo_root, tests, etc.
        self._test_file_path = inspect.getfile(self.__class__)
        path = Path(self._test_file_path).resolve()
        self._test_file_dir = path.parents[0]
        for up in [1, 2, 3]:
            tmp_dir = path.parents[up]
            if (tmp_dir / "megatron").is_dir() and (tmp_dir / "tests").is_dir():
                break
        if tmp_dir:
            self._repo_root_dir = tmp_dir
        else:
            raise ValueError(f"can't figure out the root of the repo from {self._test_file_path}")
        self._tests_dir = self._repo_root_dir / "tests"
        self._data_dir = self._repo_root_dir / "tests" / "data"
        self._src_dir = self._repo_root_dir  # megatron doesn't use "src/" prefix in the repo

    @property
    def test_file_path(self):
        return self._test_file_path

    @property
    def test_file_path_str(self):
        return str(self._test_file_path)

    @property
    def test_file_dir(self):
        return self._test_file_dir

    @property
    def test_file_dir_str(self):
        return str(self._test_file_dir)

    @property
    def tests_dir(self):
        return self._tests_dir

    @property
    def tests_dir_str(self):
        return str(self._tests_dir)

    @property
    def data_dir(self):
        return self._data_dir

    @property
    def data_dir_str(self):
        return str(self._data_dir)

    @property
    def repo_root_dir(self):
        return self._repo_root_dir

    @property
    def repo_root_dir_str(self):
        return str(self._repo_root_dir)

    @property
    def src_dir(self):
        return self._src_dir

    @property
    def src_dir_str(self):
        return str(self._src_dir)

    def get_env(self):
        """
        Return a copy of the ``os.environ`` object that sets up ``PYTHONPATH`` correctly. This is useful
        for invoking external programs from the test suite - e.g. distributed training.

        It always inserts ``.`` first, then ``./tests`` depending on the test suite type and
        finally the preset ``PYTHONPATH`` if any (all full resolved paths).
        """
        env = os.environ.copy()
        paths = [self.src_dir_str]
        paths.append(self.tests_dir_str)
        paths.append(env.get("PYTHONPATH", ""))

        env["PYTHONPATH"] = ":".join(paths)
        return env

    def get_auto_remove_tmp_dir(self, tmp_dir=None, before=None, after=None):
        """
        Args:
            tmp_dir (:obj:`string`, `optional`):
                if :obj:`None`:

                   - a unique temporary path will be created
                   - sets ``before=True`` if ``before`` is :obj:`None`
                   - sets ``after=True`` if ``after`` is :obj:`None`
                else:

                   - :obj:`tmp_dir` will be created
                   - sets ``before=True`` if ``before`` is :obj:`None`
                   - sets ``after=False`` if ``after`` is :obj:`None`

            before (:obj:`bool`, `optional`):
                If :obj:`True` and the :obj:`tmp_dir` already exists, make sure to empty it right away if :obj:`False`
                and the :obj:`tmp_dir` already exists, any existing files will remain there.
            after (:obj:`bool`, `optional`):
                If :obj:`True`, delete the :obj:`tmp_dir` at the end of the test if :obj:`False`, leave the
                :obj:`tmp_dir` and its contents intact at the end of the test.

        Returns:
            tmp_dir(:obj:`string`): either the same value as passed via `tmp_dir` or the path to the auto-selected tmp
            dir
        """
        if tmp_dir is not None:

            # defining the most likely desired behavior for when a custom path is provided.
            # this most likely indicates the debug mode where we want an easily locatable dir that:
            # 1. gets cleared out before the test (if it already exists)
            # 2. is left intact after the test
            if before is None:
                before = True
            if after is None:
                after = False

            # using provided path
            path = Path(tmp_dir).resolve()

            # to avoid nuking parts of the filesystem, only relative paths are allowed
            if not tmp_dir.startswith("./"):
                raise ValueError(
                    f"`tmp_dir` can only be a relative path, i.e. `./some/path`, but received `{tmp_dir}`"
                )

            # ensure the dir is empty to start with
            if before is True and path.exists():
                shutil.rmtree(tmp_dir, ignore_errors=True)

            path.mkdir(parents=True, exist_ok=True)

        else:
            # defining the most likely desired behavior for when a unique tmp path is auto generated
            # (not a debug mode), here we require a unique tmp dir that:
            # 1. is empty before the test (it will be empty in this situation anyway)
            # 2. gets fully removed after the test
            if before is None:
                before = True
            if after is None:
                after = True

            # using unique tmp dir (always empty, regardless of `before`)
            tmp_dir = tempfile.mkdtemp()

        if after is True:
            # register for deletion
            self.teardown_tmp_dirs.append(tmp_dir)

        return tmp_dir

    def tearDown(self):
        # get_auto_remove_tmp_dir feature: remove registered temp dirs
        for path in self.teardown_tmp_dirs:
            shutil.rmtree(path, ignore_errors=True)
        self.teardown_tmp_dirs = []


def mockenv(**kwargs):
    """
    this is a convenience wrapper, that allows this ::

    @mockenv(RUN_SLOW=True, USE_TF=False)
    def test_something():
        run_slow = os.getenv("RUN_SLOW", False)
        use_tf = os.getenv("USE_TF", False)
    """
    return mock.patch.dict(os.environ, kwargs)


# from https://stackoverflow.com/a/34333710/9201239
@contextlib.contextmanager
def mockenv_context(*remove, **update):
    """
    Temporarily updates the ``os.environ`` dictionary in-place. Similar to mockenv

    The ``os.environ`` dictionary is updated in-place so that the modification is sure to work in all situations.

    Args:
      remove: Environment variables to remove.
      update: Dictionary of environment variables and values to add/update.
    """
    env = os.environ
    update = update or {}
    remove = remove or []

    # List of environment variables being updated or removed.
    stomped = (set(update.keys()) | set(remove)) & set(env.keys())
    # Environment variables and values to restore on exit.
    update_after = {k: env[k] for k in stomped}
    # Environment variables and values to remove on exit.
    remove_after = frozenset(k for k in update if k not in env)

    try:
        env.update(update)
        [env.pop(k, None) for k in remove]
        yield
    finally:
        env.update(update_after)
        [env.pop(k) for k in remove_after]


# --- distributed testing functions --- #

# adapted from https://stackoverflow.com/a/59041913/9201239
import asyncio  # noqa


class _RunOutput:
    def __init__(self, returncode, stdout, stderr):
        self.returncode = returncode
        self.stdout = stdout
        self.stderr = stderr


async def _read_stream(stream, callback):
    while True:
        line = await stream.readline()
        if line:
            callback(line)
        else:
            break


async def _stream_subprocess(cmd, env=None, stdin=None, timeout=None, quiet=False, echo=False) -> _RunOutput:
    if echo:
        print("\nRunning: ", " ".join(cmd))

    p = await asyncio.create_subprocess_exec(
        cmd[0],
        *cmd[1:],
        stdin=stdin,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
        env=env,
    )

    # note: there is a warning for a possible deadlock when using `wait` with huge amounts of data in the pipe
    # https://docs.python.org/3/library/asyncio-subprocess.html#asyncio.asyncio.subprocess.Process.wait
    #
    # If it starts hanging, will need to switch to the following code. The problem is that no data
    # will be seen until it's done and if it hangs for example there will be no debug info.
    # out, err = await p.communicate()
    # return _RunOutput(p.returncode, out, err)

    out = []
    err = []

    def tee(line, sink, pipe, label=""):
        line = line.decode("utf-8").rstrip()
        sink.append(line)
        if not quiet:
            print(label, line, file=pipe)

    # XXX: the timeout doesn't seem to make any difference here
    await asyncio.wait(
        [
            _read_stream(p.stdout, lambda l: tee(l, out, sys.stdout, label="stdout:")),
            _read_stream(p.stderr, lambda l: tee(l, err, sys.stderr, label="stderr:")),
        ],
        timeout=timeout,
    )
    return _RunOutput(await p.wait(), out, err)


def execute_subprocess_async(cmd, env=None, stdin=None, timeout=180, quiet=False, echo=True) -> _RunOutput:
    loop = asyncio.get_event_loop()
    result = loop.run_until_complete(
        _stream_subprocess(cmd, env=env, stdin=stdin, timeout=timeout, quiet=quiet, echo=echo)
    )

    cmd_str = " ".join(cmd)
    if result.returncode > 0:
        stderr = "\n".join(result.stderr)
        raise RuntimeError(
            f"'{cmd_str}' failed with returncode {result.returncode}\n\n"
            f"The combined stderr from workers follows:\n{stderr}"
        )

    # check that the subprocess actually did run and produced some output, should the test rely on
    # the remote side to do the testing
    if not result.stdout and not result.stderr:
        raise RuntimeError(f"'{cmd_str}' produced no output.")

    return result


# --- Misc utils --- #

def flatten_arguments(args):
    """
    Converts dictionary argument to a list.

    Note: we add "IGNORED" at the beginning as this value is ignored by the argparser

    Example: {"arg1": "value1", "arg2": "value2"} -> ["IGNORED", "arg1", "value1", "arg2", "value2"]
    """
    return ["IGNORED"] + [item for key_value in args.items() for item in key_value if item != ""]
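A short usage sketch for a few of the helpers above. The test class, the environment variable values, and the argument names are made up for illustration; it assumes the module is importable as megatron.testing_utils and that the test file lives inside the repository checkout so that TestCasePlus.setUp() can locate the repo root.

# Hypothetical test combining some of the helpers defined above.
import os

from megatron.testing_utils import (TestCasePlus, flatten_arguments,
                                    mockenv_context, require_torch)


@require_torch
class ExampleTest(TestCasePlus):
    def test_flatten_and_env(self):
        # dict-style args flattened into an argv-like list ("IGNORED" is the
        # placeholder consumed by the argparser, per flatten_arguments' docstring)
        argv = flatten_arguments({"--micro-batch-size": "2", "--hidden-size": "64"})
        self.assertEqual(argv[0], "IGNORED")

        # temporarily override environment variables for the duration of the block
        with mockenv_context(MASTER_ADDR="localhost", MASTER_PORT="9994"):
            self.assertEqual(os.environ["MASTER_PORT"], "9994")

        # auto-removed scratch dir, registered for cleanup in tearDown()
        tmp_dir = self.get_auto_remove_tmp_dir()
        self.assertTrue(os.path.isdir(tmp_dir))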