OpenDAS / Megatron-LM
Commit 41276b6c, authored Oct 03, 2022 by Vijay Korthikanti
Merge branch 'main' into nmt-main
Parents: a44360ed, fc7f4f03
Showing 20 of 135 changed files, with 191 additions and 382 deletions (+191 -382)
megatron/mpu/tests/test_initialize.py        +1   -14
megatron/mpu/tests/test_layers.py            +1   -14
megatron/mpu/tests/test_random.py            +1   -14
megatron/mpu/utils.py                        +1   -14
megatron/optimizer/__init__.py               +2   -14
megatron/optimizer/clip_grads.py             +1   -14
megatron/optimizer/distrib_optimizer.py      +88  -64
megatron/optimizer/grad_scaler.py            +1   -14
megatron/optimizer/optimizer.py              +38  -35
megatron/optimizer_param_scheduler.py        +1   -14
megatron/p2p_communication.py                +11  -23
megatron/schedules.py                        +28  -33
megatron/static/index.html                   +1   -13
megatron/text_generation/__init__.py         +1   -14
megatron/text_generation/api.py              +1   -14
megatron/text_generation/communication.py    +1   -14
megatron/text_generation/forward_step.py     +1   -14
megatron/text_generation/generation.py       +10  -18
megatron/text_generation/sampling.py         +1   -14
megatron/text_generation/tokenization.py     +1   -14
megatron/mpu/tests/test_initialize.py  (+1 -14)

-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 from commons import print_separator
 from commons import initialize_distributed
 ...
megatron/mpu/tests/test_layers.py  (+1 -14)
Same license-header replacement as above: the 14-line Apache header (2020 copyright) becomes the single 2022 copyright line. Unchanged context: from mpu import layers; from commons import set_random_seed.
megatron/mpu/tests/test_random.py  (+1 -14)
Same license-header replacement as above. Unchanged context: from commons import print_separator; from commons import initialize_distributed.
megatron/mpu/utils.py  (+1 -14)
Same license-header replacement as above. Unchanged context: import torch.
megatron/optimizer/__init__.py  (+2 -14)
Same license-header replacement as above. Unchanged context: from apex.optimizers import FusedAdam as Adam; from apex.optimizers import FusedSGD as SGD. One substantive hunk threads the new params_dtype argument into the optimizer constructor:

@@ -145,6 +132,7 @@ def get_megatron_optimizer(model,
                        args.use_contiguous_buffers_in_local_ddp,
                        args.fp16,
                        args.bf16,
+                       args.params_dtype,
                        grad_scaler,
                        model)
megatron/optimizer/clip_grads.py  (+1 -14)
Same license-header replacement as above. Module docstring "Gradient clipping." unchanged.
megatron/optimizer/distrib_optimizer.py  (+88 -64)
Same license-header replacement as above. Module docstring "Megatron distributed optimizer." unchanged.
@@ -351,7 +338,7 @@ class DistributedOptimizer(MixedPrecisionOptimizer):

     def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
                  params_have_main_grad, use_contiguous_buffers_in_local_ddp,
-                 fp16, bf16, grad_scaler, models):
+                 fp16, bf16, params_dtype, grad_scaler, models):
         """
         See top of class definition for argument descriptions.

@@ -365,7 +352,7 @@ class DistributedOptimizer(MixedPrecisionOptimizer):

         super().__init__(
             optimizer, clip_grad, log_num_zeros_in_grad,
             params_have_main_grad, use_contiguous_buffers_in_local_ddp,
-            fp16, bf16, grad_scaler, models)
+            fp16, bf16, params_dtype, grad_scaler, models)

         # Verify that contiguous buffers are being used.
         # - Note: this should already be checked in arguments.py.
@@ -394,6 +381,21 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
             self.model_param_gbuf_map,
             self.opt_group_ranges)

+        # Initialize param buffers.
+        # - These are views on the DDP model's grad buffers, that share
+        #   storage & have their own dtype. This is safe because the param
+        #   dtype size is always <= grad dtype size.
+        self.param_buffers = []
+        for model_index, model in enumerate(self.models):
+            current_param_buffers = {}
+            for dtype, grad_buffer in model._grad_buffers.items():
+                param_buffer = torch.tensor(grad_buffer.data.storage()._untyped(),
+                                            dtype = params_dtype,
+                                            device = grad_buffer.data.device)
+                param_buffer = param_buffer[:grad_buffer.numel_padded]
+                current_param_buffers[dtype] = param_buffer
+            self.param_buffers.append(current_param_buffers)
+
         # Update optimizer groups.
         # - Also, leverage state_dict() and load_state_dict() to
         #   recast preexisting per-param state tensors.
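The hunk above reinterprets the bytes of each grad buffer as a buffer of params_dtype elements without allocating new memory. A minimal single-tensor sketch of the same idea, assuming a recent PyTorch where Tensor.view(dtype) is the documented non-copying reinterpret (the diff itself goes through the private storage()._untyped() API of the PyTorch release it targets):

```python
import torch

# Stand-in for a DDP grad buffer; the real buffers live in distributed.py.
grad_buffer = torch.zeros(8, dtype=torch.float32)

# Reinterpret the same bytes as fp16. fp16 elements are half the width of
# fp32, so the view exposes twice as many slots; the optimizer keeps only
# the first numel_padded of them.
param_buffer = grad_buffer.view(torch.float16)[:grad_buffer.numel()]

# Shared storage, no copy: this is what makes the later param all-gather
# free of dynamic allocation.
assert param_buffer.data_ptr() == grad_buffer.data_ptr()
```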
@@ -449,8 +451,9 @@ class DistributedOptimizer(MixedPrecisionOptimizer):

         # Grad scaler.
         if 'grad_scaler' not in state_dict:
-            print_rank_0('***WARNING*** found an old checkpoint, will not '
-                         'load grad scaler ...')
+            if self.fp16:
+                print_rank_0('***WARNING*** found an old checkpoint, will not '
+                             'load grad scaler ...')
         else:
             if self.grad_scaler:
                 self.grad_scaler.load_state_dict(state_dict['grad_scaler'])
@@ -487,36 +490,48 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
         _zero_grad_group_helper(group, set_to_none)

-    def get_model_grad_buffer_dp_views(self):
+    @staticmethod
+    def get_model_buffer_dp_views(model_buffers):
         """
-        Get shard views of each of the DDP's grad buffers.
+        Get shard views of each of the DDP's param/grad buffers.

         In this nested list, the top level is grouped by the virtual model
-        index and the grad buffer's data type. The sub-level is a list of
-        shards of that grad buffer, where each shard in the list represents
-        a contiguous view of the grad buffer, that is owned by a data-parallel
+        index and the buffer's data type. The sub-level is a list of
+        shards of that buffer, where each shard in the list represents
+        a contiguous view of the buffer, that is owned by a data-parallel
         rank. The shard boundary does not respect parameter boundaries, and
         so the elements of some parameters are split across data parallel
         ranks.

-        Additionally, return references to the entire grad buffers, for use
+        Additionally, return references to the entire buffers, for use
         in _reduce_scatter_base and _all_gather_base.
         """

         data_parallel_world_size = mpu.get_data_parallel_world_size()

-        # Grad buffer views.
-        gbuf_view_items = []
-        for model_index, model in enumerate(self.models):
-            for dtype, gbuf in model._grad_buffers.items():
-                assert gbuf.numel_padded % data_parallel_world_size == 0
-                shard_size = int(gbuf.numel_padded / data_parallel_world_size)
-                gbuf_views = [gbuf.data[(r*shard_size):((r+1)*shard_size)]
-                              for r in range(data_parallel_world_size)]
-                gbuf_view_items.append((model_index, dtype, gbuf.data, gbuf_views))
-        return gbuf_view_items
+        # Buffer views.
+        view_items = []
+        for model_index, buffers in enumerate(model_buffers):
+            for dtype, buf in buffers.items():
+                assert buf.numel() % data_parallel_world_size == 0
+                shard_size = int(buf.numel() / data_parallel_world_size)
+                buf_views = [buf[(r*shard_size):((r+1)*shard_size)]
+                             for r in range(data_parallel_world_size)]
+                view_items.append((model_index, dtype, buf, buf_views))
+        return view_items
+
+    def get_model_grad_buffer_dp_views(self):
+        return self.get_model_buffer_dp_views([
+            {dtype : mem_buffer.data}
+            for model in self.models
+            for dtype, mem_buffer in model._grad_buffers.items()])
+
+    def get_model_param_buffer_dp_views(self):
+        return self.get_model_buffer_dp_views(self.param_buffers)

     def reduce_model_grads(self, args, timers):
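The new static helper makes the shard computation reusable for grad and param buffers alike. A self-contained sketch of the same slicing logic (hypothetical free function mirroring the body above):

```python
import torch

def buffer_dp_views(buf: torch.Tensor, data_parallel_world_size: int):
    """Split a flat buffer into one contiguous, non-copying view per rank.

    Shard boundaries ignore parameter boundaries, exactly as the docstring
    in the hunk above notes, so a parameter may straddle two ranks.
    """
    assert buf.numel() % data_parallel_world_size == 0
    shard_size = buf.numel() // data_parallel_world_size
    return [buf[r * shard_size:(r + 1) * shard_size]
            for r in range(data_parallel_world_size)]

views = buffer_dp_views(torch.arange(12.0), 4)  # four views, three elements each
assert all(v.numel() == 3 for v in views)
```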
@@ -532,17 +547,20 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
         """

         # All-reduce layer-norm grads (for sequence parallelism).
-        timers('backward-layernorm-all-reduce').start()
+        timers('layernorm-grads-all-reduce', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         self.allreduce_layernorm_grads(args)
-        timers('backward-layernorm-all-reduce').stop()
+        timers('layernorm-grads-all-reduce').stop()

         # All-reduce embedding grads.
-        timers('backward-embedding-all-reduce').start()
+        timers('embedding-grads-all-reduce', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         self.allreduce_embedding_grads(args)
-        timers('backward-embedding-all-reduce').stop()
+        timers('embedding-grads-all-reduce').stop()

         # Reduce-scatter setup.
-        timers('backward-params-all-reduce').start()
+        timers('grads-reduce-scatter', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         data_parallel_rank = mpu.get_data_parallel_rank()
         data_parallel_world_size = mpu.get_data_parallel_world_size()
         data_parallel_group = mpu.get_data_parallel_group()
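Renaming 'backward-params-all-reduce' to 'grads-reduce-scatter' reflects what the distributed optimizer actually does: rather than all-reducing full gradients, each rank receives only the reduced sum for its own shard. A single-process sketch of the semantics (the real code issues torch.distributed._reduce_scatter_base over the views computed above):

```python
import torch

world_size = 4
# Per-rank gradient buffers before communication (stand-in values).
grad_buffers = [torch.full((8,), float(r + 1)) for r in range(world_size)]

# What reduce-scatter computes: the element-wise sum across ranks,
# with rank r keeping only its own shard of the result.
full_sum = torch.stack(grad_buffers).sum(dim=0)
shards = list(full_sum.chunk(world_size))  # shards[r] is what rank r owns

assert len(shards) == world_size and shards[0].numel() == 2
```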
@@ -563,46 +581,49 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
                 group = data_parallel_group,
             )
-        timers('backward-params-all-reduce').stop()
+        timers('grads-reduce-scatter').stop()


     def gather_model_params(self, args, timers):
         """
         All-gather updated model params.

-        The DDP's grad buffer is used for the all-gather, and thus no
+        The DDP's param buffer is used for the all-gather, and thus no
         tensors are dynamically allocated. After the all-gather, the params
-        can be copied from param.main_grad to param.
+        can be copied from the param buffer to the param.
         """

-        timers('backward-params-all-gather').start()
+        timers('params-all-gather', log_level=1).start(
+            barrier=args.barrier_with_L1_time)

         data_parallel_rank = mpu.get_data_parallel_rank()
         data_parallel_group = mpu.get_data_parallel_group()

         # All-gather updated main params.
-        # - All grad buffer views are guaranteed to have the same num elements
-        #   across all data parallel ranks, with grad buffer padding that is done
-        #   in distributed.py. Thus, all sub-views will have consistent start/end
-        #   indexes across data parallel ranks.
-        gbuf_view_items = self.get_model_grad_buffer_dp_views()
-        for index, (model_index, dtype, gbuf, gbuf_views) \
-            in enumerate(gbuf_view_items):
+        # - All param buffer views are guaranteed to have the same num elements
+        #   across all data parallel ranks, due to grad buffer padding that is
+        #   done in distributed.py, and extended to the param buffers. Thus,
+        #   all sub-views will have consistent start/end indexes across data
+        #   parallel ranks.
+        pbuf_view_items = self.get_model_param_buffer_dp_views()
+        for index, (model_index, dtype, pbuf, pbuf_views) \
+            in enumerate(pbuf_view_items):
             torch.distributed._all_gather_base(
-                gbuf,
-                gbuf_views[data_parallel_rank],
+                pbuf,
+                pbuf_views[data_parallel_rank],
                 group = data_parallel_group,
             )

-        # Each model param now contains its updated values in its
-        # '.main_grad' field.
-        for model in self.models:
+        # Copy from param buffer to each param.
+        for model_id, model in enumerate(self.models):
             for dtype, param_map in model._grad_buffer_param_index_map.items():
-                for param in param_map:
-                    param.detach().copy_(param.main_grad)
+                for param, buf_range in param_map.items():
+                    param_buf = self.param_buffers[model_id][dtype]
+                    param_buf_shard = param_buf[buf_range[0]:buf_range[1]]
+                    param.view(-1).detach().copy_(param_buf_shard)

-        timers('backward-params-all-gather').stop()
+        timers('params-all-gather').stop()


     def _collect_main_grad_data_for_unscaling(self):
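After the all-gather, every rank holds the complete updated parameter buffer, and each model parameter copies its flat slice back out. A single-process sketch of that final copy (the buf_range value is illustrative):

```python
import torch

# Gathered, fully-updated param buffer (stand-in values).
param_buffer = torch.arange(10, dtype=torch.float16)

# A 2x3 model parameter whose elements occupy slots [2, 8) of the buffer.
param = torch.zeros(2, 3, dtype=torch.float16)
buf_range = (2, 8)

# Flatten the param and copy its slice in, as the hunk above does.
param.view(-1).detach().copy_(param_buffer[buf_range[0]:buf_range[1]])
assert param[0, 0].item() == 2.0
```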
@@ -680,14 +701,17 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
                                            model_group):
                 param_range_map = self.get_model_param_range_map(model_param)
-                param_range = param_range_map["param"]
-                assert param_range.size == shard_main_param.nelement()
+                world_range = param_range_map["gbuf_world"]

-                model_grad = model_param.main_grad
-                shard_model_grad = model_grad.view(-1) \
-                    [param_range.start:param_range.end]
+                assert world_range.size == shard_main_param.nelement()

-                shard_model_grad.data.copy_(shard_main_param)
+                model_id, dtype = self.model_param_gbuf_map[model_param]
+                model_param_buffer = self.param_buffers[model_id][dtype]
+
+                shard_model_param = model_param_buffer.view(-1) \
+                    [world_range.start:world_range.end]
+
+                shard_model_param.data.copy_(shard_main_param)

         # Copy shard groups to model groups.
         copy_group_params(self.shard_fp32_from_float16_groups,
megatron/optimizer/grad_scaler.py  (+1 -14)
Same license-header replacement as above. Module docstring "Megatron grad scaler." unchanged.
megatron/optimizer/optimizer.py  (+38 -35)
Same license-header replacement as above. Module docstring "Megatron optimizer." unchanged.
@@ -294,21 +281,24 @@ class MegatronOptimizer(ABC):
         """All-reduce all grads, and all-reduce embeddings."""

         # All-reduce layer-norm grads (for sequence parallelism).
-        timers('backward-layernorm-all-reduce').start()
+        timers('layernorm-grads-all-reduce', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         self.allreduce_layernorm_grads(args)
-        timers('backward-layernorm-all-reduce').stop()
+        timers('layernorm-grads-all-reduce').stop()

         # All-reduce if needed.
         if args.DDP_impl == 'local':
-            timers('backward-params-all-reduce').start()
+            timers('grads-all-reduce', log_level=1).start(
+                barrier=args.barrier_with_L1_time)
             for model in self.models:
                 model.allreduce_gradients()
-            timers('backward-params-all-reduce').stop()
+            timers('grads-all-reduce').stop()

         # All-reduce embedding grads.
-        timers('backward-embedding-all-reduce').start()
+        timers('embedding-grads-all-reduce', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         self.allreduce_embedding_grads(args)
-        timers('backward-embedding-all-reduce').stop()
+        timers('embedding-grads-all-reduce').stop()


 class MixedPrecisionOptimizer(MegatronOptimizer):
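Across this commit every timer moves to the pattern timers(name, log_level=N).start(barrier=args.barrier_with_L1_time): timers are renamed after the collective they measure, tagged with a verbosity level, and optionally preceded by a barrier so the measurement is not skewed by rank skew. A minimal, hypothetical stand-in for the timer interface, showing only the calling convention (this is not Megatron's actual megatron/timers implementation):

```python
import time

class _Timer:
    """Hypothetical stand-in illustrating the start(barrier=...)/stop() protocol."""

    def __init__(self, name):
        self.name = name
        self.elapsed = 0.0
        self._start = None

    def start(self, barrier=False):
        if barrier:
            # The real code calls torch.distributed.barrier() here so all
            # ranks begin timing the collective together.
            pass
        self._start = time.perf_counter()

    def stop(self):
        self.elapsed += time.perf_counter() - self._start


class Timers:
    def __init__(self):
        self._timers = {}

    def __call__(self, name, log_level=0):
        # log_level tags the timer's verbosity (the diff uses levels 1-2);
        # lookups without the keyword return the already-registered timer.
        return self._timers.setdefault(name, _Timer(name))


timers = Timers()
timers('layernorm-grads-all-reduce', log_level=1).start(barrier=False)
timers('layernorm-grads-all-reduce').stop()
```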
@@ -332,6 +322,7 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
         is using a contiguous buffer to hold the model grads.
     fp16: if true, the model is running in fp16.
     bf16: if true, the model is running in bfloat16.
+    params_dtype: used by distributed optimizer.
     grad_scaler: used for scaling gradients. Note that this can be
         None. This case happens when `bf16 = True` and we don't
         use any loss scale. Note that for `bf16 = True`, we can have

@@ -343,7 +334,7 @@ class MixedPrecisionOptimizer(MegatronOptimizer):

     def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
                  params_have_main_grad, use_contiguous_buffers_in_local_ddp,
-                 fp16, bf16, grad_scaler,
+                 fp16, bf16, params_dtype, grad_scaler,
                  models):

         super().__init__(

@@ -353,6 +344,7 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
         self.fp16 = fp16
         self.bf16 = bf16
+        self.params_dtype = params_dtype
         self.grad_scaler = grad_scaler

         # None grad scaler is only supported for bf16.
@@ -416,7 +408,8 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
     def step(self, args, timers):

         # Copy gradients from model params to main params.
-        timers('optimizer-copy-to-main-grad').start()
+        timers('optimizer-copy-to-main-grad', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         self._copy_model_grads_to_main_grads()
         timers('optimizer-copy-to-main-grad').stop()

@@ -425,7 +418,8 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
         if self.grad_scaler:

             # Unscale and check for inf/nan.
-            timers('optimizer-unscale-and-check-inf').start()
+            timers('optimizer-unscale-and-check-inf', log_level=1).start(
+                barrier=args.barrier_with_L1_time)
             found_inf_flag = self._unscale_main_grads_and_check_for_nan()
             timers('optimizer-unscale-and-check-inf').stop()

@@ -438,25 +432,29 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
                 return False, None, None

         # Clip the main gradients.
-        timers('optimizer-clip-main-grad').start()
+        timers('optimizer-clip-main-grad', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         grad_norm = None
         if self.clip_grad > 0.0:
             grad_norm = self.clip_grad_norm(self.clip_grad)
         timers('optimizer-clip-main-grad').stop()

         # Count the zeros in the grads.
-        timers('optimizer-count-zeros').start()
+        timers('optimizer-count-zeros', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         num_zeros_in_grad = self.count_zeros() if \
                             self.log_num_zeros_in_grad else None
         timers('optimizer-count-zeros').stop()

         # Step the optimizer.
-        timers('optimizer-inner-step').start()
+        timers('optimizer-inner-step', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         self.optimizer.step()
         timers('optimizer-inner-step').stop()

         # Update params from main params.
-        timers('optimizer-copy-main-to-model-params').start()
+        timers('optimizer-copy-main-to-model-params', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         self._copy_main_params_to_model_params()
         timers('optimizer-copy-main-to-model-params').stop()
@@ -496,12 +494,12 @@ class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
     def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
                  params_have_main_grad, use_contiguous_buffers_in_local_ddp,
-                 fp16, bf16, grad_scaler, models):
+                 fp16, bf16, params_dtype, grad_scaler, models):

         super().__init__(
             optimizer, clip_grad, log_num_zeros_in_grad,
             params_have_main_grad, use_contiguous_buffers_in_local_ddp,
-            fp16, bf16, grad_scaler, models)
+            fp16, bf16, params_dtype, grad_scaler, models)

         # ======================
         # main parameter stuff
@@ -671,8 +669,9 @@ class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
         # Grad scaler.
         if 'grad_scaler' not in state_dict:
-            print_rank_0('***WARNING*** found an old checkpoint, will not '
-                         'load grad scaler ...')
+            if self.fp16:
+                print_rank_0('***WARNING*** found an old checkpoint, will not '
+                             'load grad scaler ...')
         else:
             if self.grad_scaler:
                 self.grad_scaler.load_state_dict(state_dict['grad_scaler'])
@@ -725,7 +724,8 @@ class FP32Optimizer(MegatronOptimizer):
         Always return successful since there is no overflow."""

         # Copy main_grads to grads.
-        timers('optimizer-copy-to-main-grad').start()
+        timers('optimizer-copy-to-main-grad', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         if self.params_have_main_grad:
             for param_group in self.optimizer.param_groups:
                 for param in param_group['params']:

@@ -739,20 +739,23 @@ class FP32Optimizer(MegatronOptimizer):
         timers('optimizer-copy-to-main-grad').stop()

         # Clip gradients.
-        timers('optimizer-clip-main-grad').start()
+        timers('optimizer-clip-main-grad', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         grad_norm = None
         if self.clip_grad > 0.0:
             grad_norm = self.clip_grad_norm(self.clip_grad)
         timers('optimizer-clip-main-grad').stop()

         # count the zeros in the grads
-        timers('optimizer-count-zeros').start()
+        timers('optimizer-count-zeros', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         num_zeros_in_grad = self.count_zeros() if \
                             self.log_num_zeros_in_grad else None
         timers('optimizer-count-zeros').stop()

         # Update parameters.
-        timers('optimizer-inner-step').start()
+        timers('optimizer-inner-step', log_level=1).start(
+            barrier=args.barrier_with_L1_time)
         self.optimizer.step()
         timers('optimizer-inner-step').stop()
megatron/optimizer_param_scheduler.py  (+1 -14)
Same license-header replacement as above. Module docstring "Learning rate decay and weight decay incr functions." unchanged.
megatron/p2p_communication.py  (+11 -23)
Same license-header replacement as above. Unchanged context: from functools import reduce; import operator. Every pipeline communication timer gains log_level=2:
@@ -163,7 +150,7 @@ def recv_forward(tensor_shape=None, dtype_=None, timers=None):
         input_tensor = None
     else:
         if timers is not None:
-            timers('forward-recv').start()
+            timers('forward-recv', log_level=2).start()
         input_tensor, _ = _communicate(
             tensor_send_next=None,
             tensor_send_prev=None,

@@ -182,7 +169,7 @@ def recv_backward(tensor_shape=None, timers=None):
         output_tensor_grad = None
     else:
         if timers is not None:
-            timers('backward-recv').start()
+            timers('backward-recv', log_level=2).start()
         _, output_tensor_grad = _communicate(
             tensor_send_next=None,
             tensor_send_prev=None,

@@ -199,7 +186,7 @@ def send_forward(output_tensor, tensor_shape=None, dtype_=None, timers=None):
     if not mpu.is_pipeline_last_stage():
         if timers is not None:
-            timers('forward-send').start()
+            timers('forward-send', log_level=2).start()
         _communicate(
             tensor_send_next=output_tensor,
             tensor_send_prev=None,

@@ -215,7 +202,7 @@ def send_backward(input_tensor_grad, tensor_shape=None, timers=None):
     """Send tensor to previous rank in pipeline (backward send)."""
     if not mpu.is_pipeline_first_stage():
         if timers is not None:
-            timers('backward-send').start()
+            timers('backward-send', log_level=2).start()
         _communicate(
             tensor_send_next=None,
             tensor_send_prev=input_tensor_grad,

@@ -232,7 +219,7 @@ def send_forward_recv_backward(output_tensor, tensor_shape=None, timers=None):
         output_tensor_grad = None
     else:
         if timers is not None:
-            timers('forward-send-backward-recv').start()
+            timers('forward-send-backward-recv', log_level=2).start()
         _, output_tensor_grad = _communicate(
             tensor_send_next=output_tensor,
             tensor_send_prev=None,

@@ -250,7 +237,7 @@ def send_backward_recv_forward(input_tensor_grad, tensor_shape=None, timers=None):
         input_tensor = None
     else:
         if timers is not None:
-            timers('backward-send-forward-recv').start()
+            timers('backward-send-forward-recv', log_level=2).start()
         input_tensor, _ = _communicate(
             tensor_send_next=None,
             tensor_send_prev=input_tensor_grad,

@@ -265,7 +252,7 @@ def send_backward_recv_forward(input_tensor_grad, tensor_shape=None, timers=None):
 def send_forward_recv_forward(output_tensor, recv_prev, tensor_shape=None, timers=None):
     """Batched recv from previous rank and send to next rank in pipeline."""
     if timers is not None:
-        timers('forward-send-forward-recv').start()
+        timers('forward-send-forward-recv', log_level=2).start()
     input_tensor, _ = _communicate(
         tensor_send_next=output_tensor,
         tensor_send_prev=None,

@@ -280,7 +267,7 @@ def send_forward_recv_forward(output_tensor, recv_prev, tensor_shape=None, timers=None):
 def send_backward_recv_backward(input_tensor_grad, recv_next, tensor_shape=None, timers=None):
     """Batched recv from next rank and send to previous rank in pipeline."""
     if timers is not None:
-        timers('backward-send-backward-recv').start()
+        timers('backward-send-backward-recv', log_level=2).start()
     _, output_tensor_grad = _communicate(
         tensor_send_next=None,
         tensor_send_prev=input_tensor_grad,

@@ -297,7 +284,8 @@ def send_forward_backward_recv_forward_backward(
         recv_next, tensor_shape=None, timers=None):
     """Batched send and recv with previous and next ranks in pipeline."""
     if timers is not None:
-        timers('forward-backward-send-forward-backward-recv').start()
+        timers('forward-backward-send-forward-backward-recv',
+               log_level=2).start()
     input_tensor, output_tensor_grad = _communicate(
         tensor_send_next=output_tensor,
         tensor_send_prev=input_tensor_grad,
megatron/schedules.py  (+28 -33)
Same license-header replacement as above. Unchanged context: from contextlib import contextmanager; import torch. The timers object is now threaded through forward_step and backward_step instead of being fetched via get_timers():
@@ -107,6 +94,7 @@ def forward_step(forward_step_func,
                  model,
                  input_tensor,
                  forward_data_store,
+                 timers,
                  collect_non_loss_data=False):
     """Forward step for passed-in model.

@@ -115,9 +103,9 @@ def forward_step(forward_step_func,
     Returns output tensor."""
     args = get_args()

-    timers = get_timers()
-    timers('forward-compute').start()
+    if timers is not None:
+        timers('forward-compute', log_level=2).start()

     unwrapped_model = unwrap_model(
         model, (torchDDP, LocalDDP, Float16Module))

@@ -138,7 +126,8 @@ def forward_step(forward_step_func,
             data = loss_func(output_tensor, non_loss_data=True)
             forward_data_store.append(data)

-    timers('forward-compute').stop()
+    if timers is not None:
+        timers('forward-compute').stop()

     # If T5 model (or other model with encoder and decoder)
     # and in decoder stack, then send encoder_hidden_state

@@ -151,7 +140,8 @@ def forward_step(forward_step_func,
     return [output_tensor]


-def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
+def backward_step(optimizer, input_tensor, output_tensor,
+                  output_tensor_grad, timers):
     """Backward step through passed-in output tensor.

     If last stage, output_tensor_grad is None, otherwise gradient of loss

@@ -165,8 +155,8 @@ def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
     # connections.
     args = get_args()

-    timers = get_timers()
-    timers('backward-compute').start()
+    if timers is not None:
+        timers('backward-compute', log_level=2).start()

     # Retain the grad on the input_tensor.
     unwrap_input_tensor_grad = False

@@ -207,7 +197,8 @@ def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
     if unwrap_input_tensor_grad:
         input_tensor_grad = input_tensor_grad[0]

-    timers('backward-compute').stop()
+    if timers is not None:
+        timers('backward-compute').stop()

     return input_tensor_grad

@@ -243,18 +234,19 @@ def forward_backward_no_pipelining(forward_step_func,
     for i in range(get_num_microbatches() - 1):
         output_tensor = forward_step(forward_step_func, data_iterator,
                                      model, input_tensor, forward_data_store,
-                                     collect_non_loss_data)
+                                     timers, collect_non_loss_data)
         if not forward_only:
             backward_step(optimizer, input_tensor, output_tensor,
-                          output_tensor_grad)
+                          output_tensor_grad, timers)

     # Run computation for last microbatch out of context handler (want to
     # synchronize gradients).
     output_tensor = forward_step(forward_step_func, data_iterator,
                                  model, input_tensor, forward_data_store,
-                                 collect_non_loss_data)
+                                 timers, collect_non_loss_data)
     if not forward_only:
-        backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad)
+        backward_step(optimizer, input_tensor, output_tensor,
+                      output_tensor_grad, timers)

     return forward_data_store

@@ -269,6 +261,9 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
     communication between pipeline stages as needed.

     Returns dictionary with losses if the last stage, empty dict otherwise."""
+    args = get_args()
+
     input_tensors = [[] for _ in range(len(model))]
     output_tensors = [[] for _ in range(len(model))]
     forward_data_store = []

@@ -278,7 +273,6 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
     pipeline_parallel_size = mpu.get_pipeline_model_parallel_world_size()
     pipeline_parallel_rank = mpu.get_pipeline_model_parallel_rank()

-    args = get_args()
     if args.sequence_parallel:
         seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
     else:

@@ -337,6 +331,7 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
                                      model[model_chunk_id],
                                      input_tensor,
                                      forward_data_store,
+                                     timers,
                                      collect_non_loss_data)
         output_tensors[model_chunk_id].append(output_tensor)

@@ -364,7 +359,8 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
         backward_step(optimizer,
                       input_tensor,
                       output_tensor,
-                      output_tensor_grad)
+                      output_tensor_grad,
+                      timers)

         return input_tensor_grad

@@ -620,8 +616,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
     Returns dictionary with losses if the last stage, empty dict otherwise."""
     args = get_args()
-    timers = get_timers()

     assert len(model) == 1
     model = model[0]

@@ -656,7 +651,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
         input_tensor = recv_forward(recv_tensor_shapes, timers=timers)
         output_tensor = forward_step(forward_step_func, data_iterator, model,
                                      input_tensor, forward_data_store,
-                                     collect_non_loss_data)
+                                     timers, collect_non_loss_data)
         send_forward(output_tensor, send_tensor_shapes, timers=timers)

         if not forward_only:

@@ -676,7 +671,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
         output_tensor = forward_step(forward_step_func, data_iterator, model,
                                      input_tensor, forward_data_store,
-                                     collect_non_loss_data)
+                                     timers, collect_non_loss_data)
         if forward_only:
             send_forward(output_tensor, send_tensor_shapes, timers=timers)

@@ -701,7 +696,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
             input_tensor_grad = \
                 backward_step(optimizer, input_tensor, output_tensor,
-                              output_tensor_grad)
+                              output_tensor_grad, timers)

             if last_iteration:
                 input_tensor = None

@@ -721,7 +716,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
         input_tensor_grad = \
             backward_step(optimizer, input_tensor, output_tensor,
-                          output_tensor_grad)
+                          output_tensor_grad, timers)

         send_backward(input_tensor_grad, recv_tensor_shapes, timers=timers)
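Because callers may now pass timers=None (profiling disabled), every timer access in the schedule functions is guarded. A compact sketch of the optional-timers pattern the hunks above adopt (hypothetical step function, not the repo's):

```python
def forward_step(step_fn, batch, timers=None):
    # timers may be None, so guard every access, mirroring the
    # `if timers is not None:` checks in the diff above.
    if timers is not None:
        timers('forward-compute', log_level=2).start()
    output = step_fn(batch)
    if timers is not None:
        timers('forward-compute').stop()
    return output

out = forward_step(lambda b: sum(b), [1, 2, 3])  # runs fine with timers=None
```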
megatron/static/index.html  (+1 -13)

 <!-- coding=utf-8-->
-<!-- Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.-->
-<!---->
-<!-- Licensed under the Apache License, Version 2.0 (the "License");-->
-<!-- you may not use this file except in compliance with the License.-->
-<!-- You may obtain a copy of the License at-->
-<!---->
-<!-- http://www.apache.org/licenses/LICENSE-2.0-->
-<!---->
-<!-- Unless required by applicable law or agreed to in writing, software-->
-<!-- distributed under the License is distributed on an "AS IS" BASIS,-->
-<!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.-->
-<!-- See the License for the specific language governing permissions and-->
-<!-- limitations under the License.-->
+<!-- Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.-->
 <!DOCTYPE html>
 <html lang="en">
 ...
megatron/text_generation/__init__.py  (+1 -14)
Same license-header replacement as in the Python files above. Unchanged context: from .api import (
megatron/text_generation/api.py  (+1 -14)
Same license-header replacement as above. Module docstring "Inference API." unchanged.
megatron/text_generation/communication.py  (+1 -14)
Same license-header replacement as above. Module docstring "Communications utilities." unchanged.
megatron/text_generation/forward_step.py  (+1 -14)
Same license-header replacement as above. Module docstring "Forward step utilities." unchanged.
megatron/text_generation/generation.py  (+10 -18)
Same license-header replacement as above. Module docstring "Generation utilities." unchanged.
@@ -47,10 +34,15 @@ def score_and_return_on_first_stage(model, tokens, lengths):
     batch_size = tokens.size(0)
     max_prompt_length = lengths.max().item()
     assert max_prompt_length == tokens.size(1)
-    max_sequence_length = min(max_prompt_length, args.max_position_embeddings)
+
+    if max_prompt_length > args.max_position_embeddings:
+        raise ValueError("Length of prompt + tokens_to_generate longer than allowed")
+
+    if max_prompt_length * batch_size >= MAX_TOKENS_TO_OOM:
+        raise ValueError("Too many tokens. " + str(max_prompt_length * batch_size)
+                         + " is greater than " + str(MAX_TOKENS_TO_OOM))

     # forward step.
-    forward_step = ForwardStep(model, batch_size, max_sequence_length)
+    forward_step = ForwardStep(model, batch_size, max_prompt_length)

     # ===================
     # Pre-allocate memory

@@ -58,7 +50,7 @@ def score_and_return_on_first_stage(model, tokens, lengths):
     # Log probability of the sequence (prompt + generated tokens).
     output_log_probs = None
-    output_log_probs_size = (batch_size, max_sequence_length - 1)
+    output_log_probs_size = (batch_size, max_prompt_length - 1)
     if mpu.is_pipeline_last_stage():
         output_log_probs = torch.empty(output_log_probs_size,

@@ -395,7 +387,7 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto
         # if cannot find stop token, add open beams to hyps
         if not done:
             for beam_id in range(beam_size):
-                beam_hyp.add(tokens[beam_id].clone(), scores[beam_id], context_length + 1 - prompt_length)
+                beam_hyp.add(tokens[beam_id].clone(), scores[beam_id].squeeze(), context_length + 1 - prompt_length)

         # rank based on scores
         sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0], reverse=True)
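score_and_return_on_first_stage previously clamped the sequence length with min(); it now fails fast with explicit errors before allocating anything. A standalone sketch of the new pre-flight checks (the MAX_TOKENS_TO_OOM value below is illustrative, not the repo's constant):

```python
MAX_TOKENS_TO_OOM = 64_000  # illustrative threshold, not the repo's setting

def check_generation_budget(max_prompt_length: int, batch_size: int,
                            max_position_embeddings: int) -> None:
    # Reject prompts the position embeddings cannot represent.
    if max_prompt_length > max_position_embeddings:
        raise ValueError(
            "Length of prompt + tokens_to_generate longer than allowed")
    # Reject batches whose total token count would likely OOM the GPU.
    if max_prompt_length * batch_size >= MAX_TOKENS_TO_OOM:
        raise ValueError("Too many tokens. "
                         f"{max_prompt_length * batch_size} is greater than "
                         f"{MAX_TOKENS_TO_OOM}")

check_generation_budget(max_prompt_length=512, batch_size=8,
                        max_position_embeddings=2048)  # passes
```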
megatron/text_generation/sampling.py  (+1 -14)
Same license-header replacement as above. Module docstring "Sampling utilities. Part of this code is inspired by:" unchanged.
megatron/text_generation/tokenization.py  (+1 -14)
Same license-header replacement as above. Module docstring "Tokenization utilities." unchanged.