OpenDAS / Megatron-LM · Commits

Commit 4f4715db
Authored Aug 18, 2021 by Jared Casper

Merge branch 't5_pipeline_parallelism' into t5_pipeline_parallelism_grad_norm_fix

Parents: 1f4966dc, 1c3a1e6a

Showing 9 changed files with 93 additions and 40 deletions (+93 -40)
megatron/initialize.py           +2   -6
megatron/model/transformer.py    +1   -3
megatron/mpu/initialize.py       +4   -0
megatron/mpu/layers.py           +4   -4
megatron/mpu/mappings.py         +1   -1
megatron/optimizer/__init__.py   +3   -1
megatron/optimizer/optimizer.py  +33  -5
megatron/schedules.py            +27  -13
megatron/training.py             +18  -7
megatron/initialize.py

@@ -176,14 +176,10 @@ def _initialize_distributed():
         args.local_rank = device
         torch.cuda.set_device(device)
     # Call the init process
-    init_method = 'tcp://'
-    master_ip = os.getenv('MASTER_ADDR', 'localhost')
-    master_port = os.getenv('MASTER_PORT', '6000')
-    init_method += master_ip + ':' + master_port
     torch.distributed.init_process_group(
         backend=args.distributed_backend,
-        world_size=args.world_size, rank=args.rank,
-        init_method=init_method)
+        world_size=args.world_size, rank=args.rank)
     # Set the tensor model-parallel, pipeline model-parallel, and
     # data-parallel communicators.
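The hand-built 'tcp://host:port' string could go because torch.distributed.init_process_group() defaults to the 'env://' rendezvous when init_method is omitted, and 'env://' reads MASTER_ADDR and MASTER_PORT from the environment -- the same two variables the deleted lines assembled by hand. A minimal sketch of an equivalent single-process launch (gloo backend chosen here only so it runs without a GPU):

import os
import torch

# With init_method omitted, PyTorch falls back to 'env://' and reads
# MASTER_ADDR/MASTER_PORT itself; world_size and rank are still passed
# explicitly, exactly as in the hunk above.
os.environ.setdefault('MASTER_ADDR', 'localhost')
os.environ.setdefault('MASTER_PORT', '6000')
torch.distributed.init_process_group(backend='gloo',  # CPU-only stand-in
                                     world_size=1, rank=0)
print(torch.distributed.get_rank())  # -> 0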
megatron/model/transformer.py

@@ -53,8 +53,7 @@ class ParallelMLP(MegatronModule):
     MLP will take the input with h hidden state, project it to 4*h
     hidden dimension, perform nonlinear transformation, and project the
-    state back into h hidden dimension. At the end, dropout is also
-    applied.
+    state back into h hidden dimension.
     """

     def __init__(self, init_method, output_layer_init_method):
@@ -84,7 +83,6 @@ class ParallelMLP(MegatronModule):
             init_method=output_layer_init_method,
             skip_bias_add=True)

     def forward(self, hidden_states):
         # [s, b, 4hp]
megatron/mpu/initialize.py

@@ -457,9 +457,13 @@ def get_data_parallel_rank():

 def destroy_model_parallel():
     """Set the groups to none."""
+    global _MODEL_PARALLEL_GROUP
+    _MODEL_PARALLEL_GROUP = None
     global _TENSOR_MODEL_PARALLEL_GROUP
     _TENSOR_MODEL_PARALLEL_GROUP = None
     global _PIPELINE_MODEL_PARALLEL_GROUP
     _PIPELINE_MODEL_PARALLEL_GROUP = None
     global _DATA_PARALLEL_GROUP
     _DATA_PARALLEL_GROUP = None
+    global _EMBEDDING_GROUP
+    _EMBEDDING_GROUP = None
megatron/mpu/layers.py

@@ -256,7 +256,7 @@ class ColumnParallelLinear(torch.nn.Module):
                 device=torch.cuda.current_device(), dtype=args.params_dtype))
             _initialize_affine_weight_gpu(self.weight, init_method,
                                           partition_dim=0, stride=stride)
         if bias:
             if args.use_cpu_initialization:
                 self.bias = Parameter(torch.empty(
@@ -286,7 +286,7 @@ class ColumnParallelLinear(torch.nn.Module):
             # All-gather across the partitions.
             output = gather_from_tensor_model_parallel_region(output_parallel)
         else:
             output = output_parallel
         output_bias = self.bias if self.skip_bias_add else None
         return output, output_bias
@@ -316,8 +316,8 @@ class RowParallelLinear(torch.nn.Module):
     keep_master_weight_for_test: This was added for testing and should be
                                  set to False. It returns the master weights
                                  used for initialization.
-    skip_bias_add: This was added to enable performance optimations where bias
-                   can be fused with other elementwise operations. we skip
+    skip_bias_add: This was added to enable performance optimization where bias
+                   can be fused with other elementwise operations. We skip
                    adding bias but instead return it.
     """
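The corrected docstring is the whole idea behind skip_bias_add: the layer returns the un-added bias so the caller can fuse the add into a neighboring elementwise op instead of paying for a separate bias-add kernel. A minimal sketch of the pattern outside Megatron, with a plain torch.nn.Linear standing in for the parallel layer:

import torch
import torch.nn.functional as F

# skip_bias_add pattern: compute x @ W^T without the bias, hand the bias
# back to the caller, and let the caller fuse bias-add with the activation.
linear = torch.nn.Linear(8, 32)
x = torch.randn(4, 8)

output = F.linear(x, linear.weight)   # bias deliberately skipped
output_bias = linear.bias             # returned instead of added

y = F.gelu(output + output_bias)      # bias-add fused into this expression
assert torch.allclose(y, F.gelu(linear(x)), atol=1e-6)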
megatron/mpu/mappings.py

@@ -20,7 +20,7 @@ from .utils import split_tensor_along_last_dim

 def _reduce(input_):
-    """All-reduce the the input tensor across model parallel group."""
+    """All-reduce the input tensor across model parallel group."""

     # Bypass the function if we are using only 1 GPU.
     if get_tensor_model_parallel_world_size() == 1:
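For reference, the function whose docstring is being fixed is the tensor-model-parallel all-reduce, and the context line under the docstring is its key fast path: with a world size of one the collective is skipped entirely. A standalone sketch of the same shape, using the default process group as a stand-in for the model-parallel group:

import torch.distributed as dist

# Shape of _reduce: identity on a single rank, in-place all-reduce otherwise.
# The real code queries the tensor model-parallel group, not the default one.
def reduce_sketch(input_):
    if dist.get_world_size() == 1:   # bypass on 1 GPU, as the comment says
        return input_
    dist.all_reduce(input_)          # sums input_ in place across ranks
    return input_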
megatron/optimizer/__init__.py

@@ -100,10 +100,12 @@ def get_megatron_optimizer(model):
                            args.clip_grad,
                            args.log_num_zeros_in_grad,
                            params_have_main_grad,
+                           args.use_contiguous_buffers_in_ddp,
                            args.bf16,
                            grad_scaler)

     # FP32.
     return FP32Optimizer(optimizer, args.clip_grad,
                          args.log_num_zeros_in_grad,
-                         params_have_main_grad)
+                         params_have_main_grad,
+                         args.use_contiguous_buffers_in_ddp)
megatron/optimizer/optimizer.py

@@ -68,7 +68,9 @@ class MegatronOptimizer(ABC):

     def __init__(self, optimizer, clip_grad,
                  log_num_zeros_in_grad,
-                 params_have_main_grad):
+                 params_have_main_grad,
+                 use_contiguous_buffers_in_ddp):

         """Input optimizer is the base optimizer for example Adam."""
         self.optimizer = optimizer
         assert self.optimizer, 'no optimizer is provided.'
@@ -76,7 +78,11 @@ class MegatronOptimizer(ABC):
         self.clip_grad = clip_grad
         self.log_num_zeros_in_grad = log_num_zeros_in_grad
         self.params_have_main_grad = params_have_main_grad
+        self.use_contiguous_buffers_in_ddp = use_contiguous_buffers_in_ddp
+
+        if self.use_contiguous_buffers_in_ddp:
+            assert self.params_have_main_grad, \
+                "use of contiguous buffer requires that params have main grad"

     def get_parameters(self):
         params = []
@@ -187,11 +193,12 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
     """

     def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
-                 params_have_main_grad, bf16, grad_scaler):
+                 params_have_main_grad, use_contiguous_buffers_in_ddp,
+                 bf16, grad_scaler):

         super(Float16OptimizerWithFloat16Params, self).__init__(
             optimizer, clip_grad, log_num_zeros_in_grad,
-            params_have_main_grad)
+            params_have_main_grad, use_contiguous_buffers_in_ddp)

         self.bf16 = bf16
         self.grad_scaler = grad_scaler
@@ -310,12 +317,26 @@ class Float16OptimizerWithFloat16Params(MegatronOptimizer):
             else:
                 if model_param.grad is not None:
                     main_param.grad = model_param.grad.float()
+
+            # Safe to deallocate model's grad/main_grad after copying.
+            # (If using contiguous buffers, main_grad's memory should
+            # persist and therefore should not be deallocated.)
+            model_param.grad = None
+            if self.params_have_main_grad and \
+               not self.use_contiguous_buffers_in_ddp:
+                model_param.main_grad = None

         # For fp32 grads, we need to reset the grads to main grad.
         if self.params_have_main_grad:
             for model_group in self.fp32_from_fp32_groups:
                 for model_param in model_group:
                     model_param.grad = model_param.main_grad
+
+                    # Safe to de-reference model's main_grad after copying.
+                    # (If using contiguous buffers, main_grad's memory should
+                    # persist and therefore should not be deallocated.)
+                    if not self.use_contiguous_buffers_in_ddp:
+                        model_param.main_grad = None

     def _unscale_main_grads_and_check_for_nan(self):
         main_grads = []
@@ -469,11 +490,12 @@ class FP32Optimizer(MegatronOptimizer):

     def __init__(self, optimizer, clip_grad,
                  log_num_zeros_in_grad,
-                 params_have_main_grad):
+                 params_have_main_grad,
+                 use_contiguous_buffers_in_ddp):

         super(FP32Optimizer, self).__init__(
             optimizer, clip_grad, log_num_zeros_in_grad,
-            params_have_main_grad)
+            params_have_main_grad, use_contiguous_buffers_in_ddp)

         self._scale = torch.cuda.FloatTensor([1.0])
@@ -500,6 +522,12 @@ class FP32Optimizer(MegatronOptimizer):
                 for param in param_group['params']:
                     param.grad = param.main_grad

+                    # Safe to de-reference model's main_grad after copying.
+                    # (If using contiguous buffers, main_grad's memory should
+                    # persist and therefore should not be deallocated.)
+                    if not self.use_contiguous_buffers_in_ddp:
+                        param.main_grad = None

         # Clip gradients.
         grad_norm = None
         if self.clip_grad > 0.0:
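The thread running through all of these hunks: every optimizer now knows whether the local DDP wrapper keeps gradients in one contiguous buffer, because that decides whether .main_grad may be dropped once its values have been copied out. A schematic of the rule with illustrative names (not the Megatron classes):

# Illustrative schematic of the deallocation rule added above. With
# contiguous DDP buffers, each param.main_grad is a *view* into shared
# storage that must survive across iterations; otherwise main_grad owns
# its memory and can be released after its values are copied.
def release_grads(params, params_have_main_grad, use_contiguous_buffers_in_ddp):
    for param in params:
        param.grad = None                   # the copy is no longer needed
        if params_have_main_grad and not use_contiguous_buffers_in_ddp:
            param.main_grad = None          # safe: main_grad owns its storage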
megatron/schedules.py

@@ -243,6 +243,7 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
         model_chunk_id = get_model_chunk_id(microbatch_id, forward=True)
         mpu.set_virtual_pipeline_model_parallel_rank(model_chunk_id)

+        # forward step
         if mpu.is_pipeline_first_stage():
             if len(input_tensors[model_chunk_id]) == \
                     len(output_tensors[model_chunk_id]):
@@ -254,6 +255,11 @@ def forward_backward_pipelining_with_interleaving(forward_step_func, data_iterat
                                      input_tensor, losses_reduced)
         output_tensors[model_chunk_id].append(output_tensor)

+        # if forward-only, no need to save tensors for a backward pass
+        if forward_only:
+            input_tensors[model_chunk_id].pop()
+            output_tensors[model_chunk_id].pop()
+
         return output_tensor

     def backward_step_helper(microbatch_id):
@@ -538,8 +544,12 @@ def forward_backward_pipelining_without_interleaving(forward_step_func, data_ite
     recv_tensor_shapes = get_tensor_shapes(rank-1, model_type)
     send_tensor_shapes = get_tensor_shapes(rank, model_type)

-    input_tensors = []
-    output_tensors = []
+    # Input, output tensors only need to be saved when doing backward passes
+    input_tensors = None
+    output_tensors = None
+    if not forward_only:
+        input_tensors = []
+        output_tensors = []
     losses_reduced = []

     # Run warmup forward passes.
@@ -549,8 +559,9 @@ def forward_backward_pipelining_without_interleaving(forward_step_func, data_ite
                                      input_tensor, losses_reduced)
         send_forward(output_tensor, send_tensor_shapes, timers=timers)

-        input_tensors.append(input_tensor)
-        output_tensors.append(output_tensor)
+        if not forward_only:
+            input_tensors.append(input_tensor)
+            output_tensors.append(output_tensor)

     # Before running 1F1B, need to receive first forward tensor.
     # If all microbatches are run in warmup / cooldown phase, then no need to
@@ -566,21 +577,24 @@ def forward_backward_pipelining_without_interleaving(forward_step_func, data_ite
                                      input_tensor, losses_reduced)
         if forward_only:
             send_forward(output_tensor, send_tensor_shapes, timers=timers)
+
+            if not last_iteration:
+                input_tensor = recv_forward(recv_tensor_shapes, timers=timers)
+
         else:
             output_tensor_grad = \
                 send_forward_recv_backward(output_tensor,
                                            send_tensor_shapes,
                                            timers=timers)

-        # Add input_tensor and output_tensor to end of list, then pop from the
-        # start of the list for backward pass.
-        input_tensors.append(input_tensor)
-        output_tensors.append(output_tensor)
-
-        if forward_only:
-            if not last_iteration:
-                input_tensor = recv_forward(recv_tensor_shapes, timers=timers)
-        else:
-            input_tensor, output_tensor = \
-                input_tensors.pop(0), output_tensors.pop(0)
+            # Add input_tensor and output_tensor to end of list.
+            input_tensors.append(input_tensor)
+            output_tensors.append(output_tensor)
+
+            # Pop input_tensor and output_tensor from the start of the list for
+            # the backward pass.
+            input_tensor = input_tensors.pop(0)
+            output_tensor = output_tensors.pop(0)

             input_tensor_grad = \
                 backward_step(optimizer, input_tensor, output_tensor,
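Both pipeline schedules get the same memory fix: input/output activations are stashed only so a later backward_step can pop them, so when forward_only is set (evaluation), retaining them just pins every microbatch's tensors until the run ends. A schematic of the bookkeeping with illustrative names (not the Megatron API):

# Illustrative schematic of the forward-only change: stash activations only
# when a backward pass will consume them, so evaluation frees each
# microbatch's tensors as soon as its forward send completes.
def forward_microbatch(forward_step, input_tensor, forward_only,
                       input_tensors, output_tensors):
    output_tensor = forward_step(input_tensor)
    if not forward_only:
        input_tensors.append(input_tensor)    # popped later by backward_step
        output_tensors.append(output_tensor)
    return output_tensor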
megatron/training.py

@@ -99,7 +99,7 @@ def pretrain(train_valid_test_dataset_provider,
     # This will be closer to what scheduler will see (outside of
     # image ... launches.
     global _TRAIN_START_TIME
-    start_time_tensor = torch.cuda.FloatTensor([_TRAIN_START_TIME])
+    start_time_tensor = torch.cuda.DoubleTensor([_TRAIN_START_TIME])
    torch.distributed.all_reduce(start_time_tensor,
                                 op=torch.distributed.ReduceOp.MIN)
    _TRAIN_START_TIME = start_time_tensor.item()
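The FloatTensor → DoubleTensor switch matters because _TRAIN_START_TIME is a Unix timestamp. At roughly 1.6 × 10⁹ seconds (mid-2021), float32's 24-bit mantissa spaces representable values 128 seconds apart, so the round trip through the all-reduce could shift the recorded start time by up to about a minute; float64 keeps sub-microsecond precision. A quick check:

import time
import torch

t = time.time()  # ~1.6e9 in 2021; needs more than 24 mantissa bits
err32 = abs(torch.tensor([t], dtype=torch.float32).item() - t)
err64 = abs(torch.tensor([t], dtype=torch.float64).item() - t)
print(err32)  # up to ~64 seconds of rounding error
print(err64)  # 0.0 -- Python floats are already float64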
@@ -391,6 +391,10 @@ def train_step(forward_step_func, data_iterator,
             forward_step_func, data_iterator, model,
             optimizer, timers, forward_only=False)

+    # Empty unused memory
+    if args.empty_unused_memory_each_iter >= 1:
+        torch.cuda.empty_cache()
+
     # All-reduce if needed.
     if args.DDP_impl == 'local':
         timers('backward-params-all-reduce').start()
@@ -438,6 +442,10 @@ def train_step(forward_step_func, data_iterator,
     else:
         skipped_iter = 1

+    # Empty unused memory
+    if args.empty_unused_memory_each_iter >= 2:
+        torch.cuda.empty_cache()
+
     if mpu.is_pipeline_last_stage(ignore_virtual=True):
         # Average loss across microbatches.
         loss_reduced = {}
@@ -571,7 +579,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
     if iteration % args.log_interval == 0:
         elapsed_time = timers('interval-time').elapsed()
         elapsed_time_per_iteration = elapsed_time / total_iterations
-        if writer and torch.distributed.get_rank() == 0:
+        if writer:
             if args.log_timers_to_tensorboard:
                 writer.add_scalar('iteration-time',
                                   elapsed_time_per_iteration, iteration)
@@ -746,6 +754,10 @@ def evaluate(forward_step_func, data_iterator, model, verbose=False):
                 forward_step_func, data_iterator, model, optimizer=None,
                 timers=None, forward_only=True)

+            # Empty unused memory
+            if args.empty_unused_memory_each_iter >= 1:
+                torch.cuda.empty_cache()
+
     if mpu.is_pipeline_last_stage(ignore_virtual=True):
         # Reduce across processes.
         for loss_dict in loss_dicts:
@@ -778,7 +790,7 @@ def evaluate_and_print_results(prefix, forward_step_func,
         string += '{} value: {:.6E} | '.format(key, total_loss_dict[key].item())
         ppl = math.exp(min(20, total_loss_dict[key].item()))
         string += '{} PPL: {:.6E} | '.format(key, ppl)
-        if writer and is_last_rank():
+        if writer:
             writer.add_scalar('{} validation'.format(key),
                               total_loss_dict[key].item(),
                               iteration)
@@ -817,10 +829,9 @@ def build_train_valid_test_data_iterators(
             'only backward compatiblity support for iteration-based training'
         args.consumed_train_samples = args.iteration * args.global_batch_size
     if args.iteration > 0 and args.consumed_valid_samples == 0:
-        assert args.train_samples is None, \
-            'only backward compatiblity support for iteration-based training'
-        args.consumed_valid_samples = (args.iteration // args.eval_interval) * \
-            args.eval_iters * args.global_batch_size
+        if args.train_samples is None:
+            args.consumed_valid_samples = (args.iteration // args.eval_interval) * \
+                args.eval_iters * args.global_batch_size

     # Data loader only on rank 0 of each model parallel group.
     if mpu.get_tensor_model_parallel_rank() == 0:
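The recurring empty_unused_memory_each_iter hunks share one idiom worth spelling out: torch.cuda.empty_cache() returns the caching allocator's unused blocks to the driver, trading allocation speed for headroom, so it is opt-in and leveled -- 1 frees once per train/eval step, 2 frees again after the optimizer update. A sketch of the gate, where args stands for the parsed Megatron argument namespace:

import torch

# Sketch of the gating idiom used in train_step()/evaluate() above.
# Level 0: never free; level 1: free after forward/backward; level 2:
# also free after the optimizer step. empty_cache() releases cached CUDA
# blocks back to the driver at the cost of re-allocating them later.
def maybe_empty_cache(args, level):
    if args.empty_unused_memory_each_iter >= level:
        torch.cuda.empty_cache()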