Commit 41276b6c authored by Vijay Korthikanti

Merge branch 'main' into nmt-main

parents a44360ed fc7f4f03
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
from commons import print_separator
from commons import initialize_distributed
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
from mpu import layers
from commons import set_random_seed
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
from commons import print_separator
from commons import initialize_distributed
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
import torch
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
from apex.optimizers import FusedAdam as Adam
from apex.optimizers import FusedSGD as SGD
......@@ -145,6 +132,7 @@ def get_megatron_optimizer(model,
args.use_contiguous_buffers_in_local_ddp,
args.fp16,
args.bf16,
args.params_dtype,
grad_scaler,
model)
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Gradient clipping."""
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Megatron distributed optimizer."""
......@@ -351,7 +338,7 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
params_have_main_grad, use_contiguous_buffers_in_local_ddp,
fp16, bf16, grad_scaler, models):
fp16, bf16, params_dtype, grad_scaler, models):
"""
See top of class definition for argument descriptions.
......@@ -365,7 +352,7 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
super().__init__(
optimizer, clip_grad, log_num_zeros_in_grad,
params_have_main_grad, use_contiguous_buffers_in_local_ddp,
fp16, bf16, grad_scaler, models)
fp16, bf16, params_dtype, grad_scaler, models)
# Verify that contiguous buffers are being used.
# - Note: this should already be checked in arguments.py.
......@@ -394,6 +381,21 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
self.model_param_gbuf_map,
self.opt_group_ranges)
# Initialize param buffers.
# - These are views on the DDP model's grad buffers, that share
# storage & have their own dtype. This is safe because the param
# dtype size is always <= grad dtype size.
self.param_buffers = []
for model_index, model in enumerate(self.models):
current_param_buffers = {}
for dtype, grad_buffer in model._grad_buffers.items():
param_buffer = torch.tensor(grad_buffer.data.storage()._untyped(),
dtype = params_dtype,
device = grad_buffer.data.device)
param_buffer = param_buffer[:grad_buffer.numel_padded]
current_param_buffers[dtype] = param_buffer
self.param_buffers.append(current_param_buffers)
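The dtype trick above is worth spelling out: the param buffer reuses the grad buffer's bytes rather than allocating new memory, which is safe because an fp16/bf16 param element never needs more bytes than an fp32 grad element. A minimal single-tensor sketch (illustrative only, not part of this diff; it uses Tensor.view(dtype) instead of the raw-storage constructor used above):

import torch

# Illustrative sketch: a bf16 "param buffer" sharing storage with an fp32
# grad buffer. Viewing the fp32 bytes as bf16 gives twice as many elements;
# the first `numel` of them are enough to hold the params.
numel = 1024
grad_buffer = torch.zeros(numel, dtype=torch.float32)
param_buffer = grad_buffer.view(torch.bfloat16)[:numel]
assert param_buffer.data_ptr() == grad_buffer.data_ptr()  # same storage, no new allocation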
# Update optimizer groups.
# - Also, leverage state_dict() and load_state_dict() to
# recast preexisting per-param state tensors.
......@@ -449,8 +451,9 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
# Grad scaler.
if 'grad_scaler' not in state_dict:
print_rank_0('***WARNING*** found an old checkpoint, will not '
'load grad scaler ...')
if self.fp16:
print_rank_0('***WARNING*** found an old checkpoint, will not '
'load grad scaler ...')
else:
if self.grad_scaler:
self.grad_scaler.load_state_dict(state_dict['grad_scaler'])
......@@ -487,36 +490,48 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
_zero_grad_group_helper(group, set_to_none)
def get_model_grad_buffer_dp_views(self):
@staticmethod
def get_model_buffer_dp_views(model_buffers):
"""
Get shard views of each of the DDP's grad buffers.
Get shard views of each of the DDP's param/grad buffers.
In this nested list, the top level is grouped by the virtual model
index and the grad buffer's data type. The sub-level is a list of
shards of that grad buffer, where each shard in the list represents
a contiguous view of the grad buffer, that is owned by a data-parallel
index and the buffer's data type. The sub-level is a list of
shards of that buffer, where each shard in the list represents
a contiguous view of the buffer, that is owned by a data-parallel
rank. The shard boundary does not respect parameter boundaries, and
so the elements of some parameters are split across data parallel
ranks.
Additionally, return references to the entire grad buffers, for use
Additionally, return references to the entire buffers, for use
in _reduce_scatter_base and _all_gather_base.
"""
data_parallel_world_size = mpu.get_data_parallel_world_size()
# Grad buffer views.
gbuf_view_items = []
for model_index, model in enumerate(self.models):
for dtype, gbuf in model._grad_buffers.items():
# Buffer views.
view_items = []
for model_index, buffers in enumerate(model_buffers):
for dtype, buf in buffers.items():
assert buf.numel() % data_parallel_world_size == 0
shard_size = int(buf.numel() / data_parallel_world_size)
buf_views = [buf[(r*shard_size):((r+1)*shard_size)]
for r in range(data_parallel_world_size)]
view_items.append((model_index, dtype, buf, buf_views))
assert gbuf.numel_padded % data_parallel_world_size == 0
shard_size = int(gbuf.numel_padded / data_parallel_world_size)
gbuf_views = [gbuf.data[(r*shard_size):((r+1)*shard_size)]
for r in range(data_parallel_world_size)]
gbuf_view_items.append((model_index, dtype, gbuf.data, gbuf_views))
return view_items
return gbuf_view_items
def get_model_grad_buffer_dp_views(self):
return self.get_model_buffer_dp_views([
{dtype : mem_buffer.data}
for model in self.models
for dtype, mem_buffer in model._grad_buffers.items()])
def get_model_param_buffer_dp_views(self):
return self.get_model_buffer_dp_views(self.param_buffers)
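To make the sharding described in the docstring concrete, here is a minimal single-process sketch (illustrative, not part of this diff) of how one flat, padded buffer is cut into equal contiguous views, one per data-parallel rank; the shard boundaries ignore parameter boundaries:

import torch

# Illustrative sketch: slicing a padded flat buffer into per-rank views.
data_parallel_world_size = 4
buf = torch.arange(12, dtype=torch.float32)  # padded to a multiple of the DP size
assert buf.numel() % data_parallel_world_size == 0
shard_size = buf.numel() // data_parallel_world_size
buf_views = [buf[r * shard_size:(r + 1) * shard_size]
             for r in range(data_parallel_world_size)]
# Each entry of buf_views is a view on buf, so _reduce_scatter_base and
# _all_gather_base can read and write the shards without extra copies.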
def reduce_model_grads(self, args, timers):
......@@ -532,17 +547,20 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
"""
# All-reduce layer-norm grads (for sequence parallelism).
timers('backward-layernorm-all-reduce').start()
timers('layernorm-grads-all-reduce', log_level=1).start(
barrier=args.barrier_with_L1_time)
self.allreduce_layernorm_grads(args)
timers('backward-layernorm-all-reduce').stop()
timers('layernorm-grads-all-reduce').stop()
# All-reduce embedding grads.
timers('backward-embedding-all-reduce').start()
timers('embedding-grads-all-reduce', log_level=1).start(
barrier=args.barrier_with_L1_time)
self.allreduce_embedding_grads(args)
timers('backward-embedding-all-reduce').stop()
timers('embedding-grads-all-reduce').stop()
# Reduce-scatter setup.
timers('backward-params-all-reduce').start()
timers('grads-reduce-scatter', log_level=1).start(
barrier=args.barrier_with_L1_time)
data_parallel_rank = mpu.get_data_parallel_rank()
data_parallel_world_size = mpu.get_data_parallel_world_size()
data_parallel_group = mpu.get_data_parallel_group()
......@@ -563,46 +581,49 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
group = data_parallel_group,
)
timers('backward-params-all-reduce').stop()
timers('grads-reduce-scatter').stop()
def gather_model_params(self, args, timers):
"""
All-gather updated model params.
The DDP's grad buffer is used for the all-gather, and thus no
The DDP's param buffer is used for the all-gather, and thus no
tensors are dynamically allocated. After the all-gather, the params
can be copied from param.main_grad to param.
can be copied from the param buffer to the param.
"""
timers('backward-params-all-gather').start()
timers('params-all-gather', log_level=1).start(
barrier=args.barrier_with_L1_time)
data_parallel_rank = mpu.get_data_parallel_rank()
data_parallel_group = mpu.get_data_parallel_group()
# All-gather updated main params.
# - All grad buffer views are guaranteed to have the same num elements
# across all data parallel ranks, with grad buffer padding that is done
# in distributed.py. Thus, all sub-views will have consistent start/end
# indexes across data parallel ranks.
gbuf_view_items = self.get_model_grad_buffer_dp_views()
for index, (model_index, dtype, gbuf, gbuf_views) \
in enumerate(gbuf_view_items):
# - All param buffer views are guaranteed to have the same num elements
# across all data parallel ranks, due to grad buffer padding that is
# done in distributed.py, and extended to the param buffers. Thus,
# all sub-views will have consistent start/end indexes across data
# parallel ranks.
pbuf_view_items = self.get_model_param_buffer_dp_views()
for index, (model_index, dtype, pbuf, pbuf_views) \
in enumerate(pbuf_view_items):
torch.distributed._all_gather_base(
gbuf,
gbuf_views[data_parallel_rank],
pbuf,
pbuf_views[data_parallel_rank],
group = data_parallel_group,
)
# Each model param now contains its updated values in its
# '.main_grad' field.
for model in self.models:
# Copy from param buffer to each param.
for model_id, model in enumerate(self.models):
for dtype, param_map in model._grad_buffer_param_index_map.items():
for param in param_map:
param.detach().copy_(param.main_grad)
for param, buf_range in param_map.items():
param_buf = self.param_buffers[model_id][dtype]
param_buf_shard = param_buf[buf_range[0]:buf_range[1]]
param.view(-1).detach().copy_(param_buf_shard)
timers('backward-params-all-gather').stop()
timers('params-all-gather').stop()
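A minimal single-process sketch of the copy-back step above (illustrative; the index map and shapes are made up, not taken from the diff): once the all-gather has filled the param buffer on every rank, each model param reads its slice back out using the (start, end) range recorded when the buffer was built:

import torch

# Illustrative sketch: copy gathered values from a flat param buffer into params.
weight = torch.empty(2, 3)
bias = torch.empty(4)
param_index_map = {weight: (0, 6), bias: (6, 10)}      # hypothetical (start, end) ranges
param_buffer = torch.arange(10, dtype=torch.float32)   # pretend this was just all-gathered
for param, (start, end) in param_index_map.items():
    param.view(-1).detach().copy_(param_buffer[start:end])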
def _collect_main_grad_data_for_unscaling(self):
......@@ -680,14 +701,17 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
model_group):
param_range_map = self.get_model_param_range_map(model_param)
param_range = param_range_map["param"]
assert param_range.size == shard_main_param.nelement()
world_range = param_range_map["gbuf_world"]
model_grad = model_param.main_grad
shard_model_grad = model_grad.view(-1) \
[param_range.start:param_range.end]
assert world_range.size == shard_main_param.nelement()
model_id, dtype = self.model_param_gbuf_map[model_param]
model_param_buffer = self.param_buffers[model_id][dtype]
shard_model_param = model_param_buffer.view(-1) \
[world_range.start:world_range.end]
shard_model_grad.data.copy_(shard_main_param)
shard_model_param.data.copy_(shard_main_param)
# Copy shard groups to model groups.
copy_group_params(self.shard_fp32_from_float16_groups,
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Megatron grad scaler."""
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Megatron optimizer."""
......@@ -294,21 +281,24 @@ class MegatronOptimizer(ABC):
"""All-reduce all grads, and all-reduce embeddings."""
# All-reduce layer-norm grads (for sequence parallelism).
timers('backward-layernorm-all-reduce').start()
timers('layernorm-grads-all-reduce', log_level=1).start(
barrier=args.barrier_with_L1_time)
self.allreduce_layernorm_grads(args)
timers('backward-layernorm-all-reduce').stop()
timers('layernorm-grads-all-reduce').stop()
# All-reduce if needed.
if args.DDP_impl == 'local':
timers('backward-params-all-reduce').start()
timers('grads-all-reduce', log_level=1).start(
barrier=args.barrier_with_L1_time)
for model in self.models:
model.allreduce_gradients()
timers('backward-params-all-reduce').stop()
timers('grads-all-reduce').stop()
# All-reduce embedding grads.
timers('backward-embedding-all-reduce').start()
timers('embedding-grads-all-reduce', log_level=1).start(
barrier=args.barrier_with_L1_time)
self.allreduce_embedding_grads(args)
timers('backward-embedding-all-reduce').stop()
timers('embedding-grads-all-reduce').stop()
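The timer calls this diff converts to all follow one pattern: start under a name with a log_level, optionally synchronizing first; stop under the name alone. A tiny stand-in (illustrative only, not Megatron's actual Timers class) that mimics that interface:

import time

# Illustrative stand-in for the timers(...) call pattern used above.
class _Timer:
    def __init__(self):
        self.elapsed = 0.0
        self._start = None
    def start(self, barrier=False):
        # A real implementation would call torch.distributed.barrier() when asked.
        self._start = time.time()
    def stop(self):
        self.elapsed += time.time() - self._start

class Timers:
    def __init__(self):
        self._timers = {}
    def __call__(self, name, log_level=0):
        # log_level only controls which timers get logged; the timer is shared.
        return self._timers.setdefault(name, _Timer())

timers = Timers()
timers('grads-all-reduce', log_level=1).start(barrier=False)
timers('grads-all-reduce').stop()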
class MixedPrecisionOptimizer(MegatronOptimizer):
......@@ -332,6 +322,7 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
is using a contiguous buffer to hold the model grads.
fp16: if true, the model is running in fp16.
bf16: if true, the model is running in bfloat16.
params_dtype: used by distributed optimizer.
grad_scaler: used for scaling gradients. Note that this can be
None. This case happens when `bf16 = True` and we don't
use any loss scale. Note that for `bf16 = True`, we can have
......@@ -343,7 +334,7 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
params_have_main_grad, use_contiguous_buffers_in_local_ddp,
fp16, bf16, grad_scaler,
fp16, bf16, params_dtype, grad_scaler,
models):
super().__init__(
......@@ -353,6 +344,7 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
self.fp16 = fp16
self.bf16 = bf16
self.params_dtype = params_dtype
self.grad_scaler = grad_scaler
# None grad scaler is only supported for bf16.
......@@ -416,7 +408,8 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
def step(self, args, timers):
# Copy gradients from model params to main params.
timers('optimizer-copy-to-main-grad').start()
timers('optimizer-copy-to-main-grad', log_level=1).start(
barrier=args.barrier_with_L1_time)
self._copy_model_grads_to_main_grads()
timers('optimizer-copy-to-main-grad').stop()
......@@ -425,7 +418,8 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
if self.grad_scaler:
# Unscale and check for inf/nan.
timers('optimizer-unscale-and-check-inf').start()
timers('optimizer-unscale-and-check-inf', log_level=1).start(
barrier=args.barrier_with_L1_time)
found_inf_flag = self._unscale_main_grads_and_check_for_nan()
timers('optimizer-unscale-and-check-inf').stop()
......@@ -438,25 +432,29 @@ class MixedPrecisionOptimizer(MegatronOptimizer):
return False, None, None
# Clip the main gradients.
timers('optimizer-clip-main-grad').start()
timers('optimizer-clip-main-grad', log_level=1).start(
barrier=args.barrier_with_L1_time)
grad_norm = None
if self.clip_grad > 0.0:
grad_norm = self.clip_grad_norm(self.clip_grad)
timers('optimizer-clip-main-grad').stop()
# Count the zeros in the grads.
timers('optimizer-count-zeros').start()
timers('optimizer-count-zeros', log_level=1).start(
barrier=args.barrier_with_L1_time)
num_zeros_in_grad = self.count_zeros() if \
self.log_num_zeros_in_grad else None
timers('optimizer-count-zeros').stop()
# Step the optimizer.
timers('optimizer-inner-step').start()
timers('optimizer-inner-step', log_level=1).start(
barrier=args.barrier_with_L1_time)
self.optimizer.step()
timers('optimizer-inner-step').stop()
# Update params from main params.
timers('optimizer-copy-main-to-model-params').start()
timers('optimizer-copy-main-to-model-params', log_level=1).start(
barrier=args.barrier_with_L1_time)
self._copy_main_params_to_model_params()
timers('optimizer-copy-main-to-model-params').stop()
......@@ -496,12 +494,12 @@ class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
def __init__(self, optimizer, clip_grad, log_num_zeros_in_grad,
params_have_main_grad, use_contiguous_buffers_in_local_ddp,
fp16, bf16, grad_scaler, models):
fp16, bf16, params_dtype, grad_scaler, models):
super().__init__(
optimizer, clip_grad, log_num_zeros_in_grad,
params_have_main_grad, use_contiguous_buffers_in_local_ddp,
fp16, bf16, grad_scaler, models)
fp16, bf16, params_dtype, grad_scaler, models)
# ======================
# main parameter stuff
......@@ -671,8 +669,9 @@ class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
# Grad scaler.
if 'grad_scaler' not in state_dict:
print_rank_0('***WARNING*** found an old checkpoint, will not '
'load grad scaler ...')
if self.fp16:
print_rank_0('***WARNING*** found an old checkpoint, will not '
'load grad scaler ...')
else:
if self.grad_scaler:
self.grad_scaler.load_state_dict(state_dict['grad_scaler'])
......@@ -725,7 +724,8 @@ class FP32Optimizer(MegatronOptimizer):
Always return successful since there is no overflow."""
# Copy main_grads to grads.
timers('optimizer-copy-to-main-grad').start()
timers('optimizer-copy-to-main-grad', log_level=1).start(
barrier=args.barrier_with_L1_time)
if self.params_have_main_grad:
for param_group in self.optimizer.param_groups:
for param in param_group['params']:
......@@ -739,20 +739,23 @@ class FP32Optimizer(MegatronOptimizer):
timers('optimizer-copy-to-main-grad').stop()
# Clip gradients.
timers('optimizer-clip-main-grad').start()
timers('optimizer-clip-main-grad', log_level=1).start(
barrier=args.barrier_with_L1_time)
grad_norm = None
if self.clip_grad > 0.0:
grad_norm = self.clip_grad_norm(self.clip_grad)
timers('optimizer-clip-main-grad').stop()
# count the zeros in the grads
timers('optimizer-count-zeros').start()
timers('optimizer-count-zeros', log_level=1).start(
barrier=args.barrier_with_L1_time)
num_zeros_in_grad = self.count_zeros() if \
self.log_num_zeros_in_grad else None
timers('optimizer-count-zeros').stop()
# Update parameters.
timers('optimizer-inner-step').start()
timers('optimizer-inner-step', log_level=1).start(
barrier=args.barrier_with_L1_time)
self.optimizer.step()
timers('optimizer-inner-step').stop()
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Learning rate decay and weight decay incr functions."""
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
from functools import reduce
import operator
......@@ -163,7 +150,7 @@ def recv_forward(tensor_shape=None, dtype_=None, timers=None):
input_tensor = None
else:
if timers is not None:
timers('forward-recv').start()
timers('forward-recv', log_level=2).start()
input_tensor, _ = _communicate(
tensor_send_next=None,
tensor_send_prev=None,
......@@ -182,7 +169,7 @@ def recv_backward(tensor_shape=None, timers=None):
output_tensor_grad = None
else:
if timers is not None:
timers('backward-recv').start()
timers('backward-recv', log_level=2).start()
_, output_tensor_grad = _communicate(
tensor_send_next=None,
tensor_send_prev=None,
......@@ -199,7 +186,7 @@ def send_forward(output_tensor, tensor_shape=None, dtype_=None, timers=None):
if not mpu.is_pipeline_last_stage():
if timers is not None:
timers('forward-send').start()
timers('forward-send', log_level=2).start()
_communicate(
tensor_send_next=output_tensor,
tensor_send_prev=None,
......@@ -215,7 +202,7 @@ def send_backward(input_tensor_grad, tensor_shape=None, timers=None):
"""Send tensor to previous rank in pipeline (backward send)."""
if not mpu.is_pipeline_first_stage():
if timers is not None:
timers('backward-send').start()
timers('backward-send', log_level=2).start()
_communicate(
tensor_send_next=None,
tensor_send_prev=input_tensor_grad,
......@@ -232,7 +219,7 @@ def send_forward_recv_backward(output_tensor, tensor_shape=None, timers=None):
output_tensor_grad = None
else:
if timers is not None:
timers('forward-send-backward-recv').start()
timers('forward-send-backward-recv', log_level=2).start()
_, output_tensor_grad = _communicate(
tensor_send_next=output_tensor,
tensor_send_prev=None,
......@@ -250,7 +237,7 @@ def send_backward_recv_forward(input_tensor_grad, tensor_shape=None, timers=None
input_tensor = None
else:
if timers is not None:
timers('backward-send-forward-recv').start()
timers('backward-send-forward-recv', log_level=2).start()
input_tensor, _ = _communicate(
tensor_send_next=None,
tensor_send_prev=input_tensor_grad,
......@@ -265,7 +252,7 @@ def send_backward_recv_forward(input_tensor_grad, tensor_shape=None, timers=None
def send_forward_recv_forward(output_tensor, recv_prev, tensor_shape=None, timers=None):
"""Batched recv from previous rank and send to next rank in pipeline."""
if timers is not None:
timers('forward-send-forward-recv').start()
timers('forward-send-forward-recv', log_level=2).start()
input_tensor, _ = _communicate(
tensor_send_next=output_tensor,
tensor_send_prev=None,
......@@ -280,7 +267,7 @@ def send_forward_recv_forward(output_tensor, recv_prev, tensor_shape=None, timer
def send_backward_recv_backward(input_tensor_grad, recv_next, tensor_shape=None, timers=None):
"""Batched recv from next rank and send to previous rank in pipeline."""
if timers is not None:
timers('backward-send-backward-recv').start()
timers('backward-send-backward-recv', log_level=2).start()
_, output_tensor_grad = _communicate(
tensor_send_next=None,
tensor_send_prev=input_tensor_grad,
......@@ -297,7 +284,8 @@ def send_forward_backward_recv_forward_backward(
recv_next, tensor_shape=None, timers=None):
"""Batched send and recv with previous and next ranks in pipeline."""
if timers is not None:
timers('forward-backward-send-forward-backward-recv').start()
timers('forward-backward-send-forward-backward-recv',
log_level=2).start()
input_tensor, output_tensor_grad = _communicate(
tensor_send_next=output_tensor,
tensor_send_prev=input_tensor_grad,
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
from contextlib import contextmanager
import torch
......@@ -107,6 +94,7 @@ def forward_step(forward_step_func,
model,
input_tensor,
forward_data_store,
timers,
collect_non_loss_data=False):
"""Forward step for passed-in model.
......@@ -115,9 +103,9 @@ def forward_step(forward_step_func,
Returns output tensor."""
args = get_args()
timers = get_timers()
timers('forward-compute').start()
if timers is not None:
timers('forward-compute', log_level=2).start()
unwrapped_model = unwrap_model(
model, (torchDDP, LocalDDP, Float16Module))
......@@ -138,7 +126,8 @@ def forward_step(forward_step_func,
data = loss_func(output_tensor, non_loss_data=True)
forward_data_store.append(data)
timers('forward-compute').stop()
if timers is not None:
timers('forward-compute').stop()
# If T5 model (or other model with encoder and decoder)
# and in decoder stack, then send encoder_hidden_state
......@@ -151,7 +140,8 @@ def forward_step(forward_step_func,
return [output_tensor]
def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
def backward_step(optimizer, input_tensor, output_tensor,
output_tensor_grad, timers):
"""Backward step through passed-in output tensor.
If last stage, output_tensor_grad is None, otherwise gradient of loss
......@@ -165,8 +155,8 @@ def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
# connections.
args = get_args()
timers = get_timers()
timers('backward-compute').start()
if timers is not None:
timers('backward-compute', log_level=2).start()
# Retain the grad on the input_tensor.
unwrap_input_tensor_grad = False
......@@ -207,7 +197,8 @@ def backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad):
if unwrap_input_tensor_grad:
input_tensor_grad = input_tensor_grad[0]
timers('backward-compute').stop()
if timers is not None:
timers('backward-compute').stop()
return input_tensor_grad
......@@ -243,18 +234,19 @@ def forward_backward_no_pipelining(forward_step_func,
for i in range(get_num_microbatches() - 1):
output_tensor = forward_step(forward_step_func, data_iterator,
model, input_tensor, forward_data_store,
collect_non_loss_data)
timers, collect_non_loss_data)
if not forward_only:
backward_step(optimizer, input_tensor, output_tensor,
output_tensor_grad)
timers, output_tensor_grad)
# Run computation for last microbatch out of context handler (want to
# synchronize gradients).
output_tensor = forward_step(forward_step_func, data_iterator,
model, input_tensor, forward_data_store,
collect_non_loss_data)
timers, collect_non_loss_data)
if not forward_only:
backward_step(optimizer, input_tensor, output_tensor, output_tensor_grad)
backward_step(optimizer, input_tensor, output_tensor,
output_tensor_grad, timers)
return forward_data_store
......@@ -269,6 +261,9 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
communication between pipeline stages as needed.
Returns dictionary with losses if the last stage, empty dict otherwise."""
args = get_args()
input_tensors = [[] for _ in range(len(model))]
output_tensors = [[] for _ in range(len(model))]
forward_data_store = []
......@@ -278,7 +273,6 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
pipeline_parallel_size = mpu.get_pipeline_model_parallel_world_size()
pipeline_parallel_rank = mpu.get_pipeline_model_parallel_rank()
args = get_args()
if args.sequence_parallel:
seq_length = args.seq_length // mpu.get_tensor_model_parallel_world_size()
else:
......@@ -337,6 +331,7 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
model[model_chunk_id],
input_tensor,
forward_data_store,
timers,
collect_non_loss_data)
output_tensors[model_chunk_id].append(output_tensor)
......@@ -364,7 +359,8 @@ def forward_backward_pipelining_with_interleaving(forward_step_func,
backward_step(optimizer,
input_tensor,
output_tensor,
output_tensor_grad)
output_tensor_grad,
timers)
return input_tensor_grad
......@@ -620,8 +616,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
Returns dictionary with losses if the last stage, empty dict otherwise."""
args = get_args()
timers = get_timers()
assert len(model) == 1
model = model[0]
......@@ -656,7 +651,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
input_tensor = recv_forward(recv_tensor_shapes, timers=timers)
output_tensor = forward_step(forward_step_func, data_iterator, model,
input_tensor, forward_data_store,
collect_non_loss_data)
timers, collect_non_loss_data)
send_forward(output_tensor, send_tensor_shapes, timers=timers)
if not forward_only:
......@@ -676,7 +671,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
output_tensor = forward_step(forward_step_func, data_iterator, model,
input_tensor, forward_data_store,
collect_non_loss_data)
timers, collect_non_loss_data)
if forward_only:
send_forward(output_tensor, send_tensor_shapes, timers=timers)
......@@ -701,7 +696,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
input_tensor_grad = \
backward_step(optimizer, input_tensor, output_tensor,
output_tensor_grad)
output_tensor_grad, timers)
if last_iteration:
input_tensor = None
......@@ -721,7 +716,7 @@ def forward_backward_pipelining_without_interleaving(forward_step_func,
input_tensor_grad = \
backward_step(optimizer, input_tensor, output_tensor,
output_tensor_grad)
output_tensor_grad, timers)
send_backward(input_tensor_grad, recv_tensor_shapes, timers=timers)
......
<!-- coding=utf-8-->
<!-- Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.-->
<!---->
<!-- Licensed under the Apache License, Version 2.0 (the "License");-->
<!-- you may not use this file except in compliance with the License.-->
<!-- You may obtain a copy of the License at-->
<!---->
<!-- http://www.apache.org/licenses/LICENSE-2.0-->
<!---->
<!-- Unless required by applicable law or agreed to in writing, software-->
<!-- distributed under the License is distributed on an "AS IS" BASIS,-->
<!-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.-->
<!-- See the License for the specific language governing permissions and-->
<!-- limitations under the License.-->
<!-- Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.-->
<!DOCTYPE html>
<html lang="en">
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
from .api import (
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Inference API."""
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Communications utilities."""
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Forward step utilities."""
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Generation utilities."""
......@@ -47,10 +34,15 @@ def score_and_return_on_first_stage(model, tokens, lengths):
batch_size = tokens.size(0)
max_prompt_length = lengths.max().item()
assert max_prompt_length == tokens.size(1)
max_sequence_length = min(max_prompt_length, args.max_position_embeddings)
if max_prompt_length > args.max_position_embeddings:
raise ValueError("Length of prompt + tokens_to_generate longer than allowed")
if max_prompt_length * batch_size >= MAX_TOKENS_TO_OOM:
raise ValueError("Too many tokens. " + str(max_prompt_length*batch_size)+ " is greater than "+str(MAX_TOKENS_TO_OOM))
# forward step.
forward_step = ForwardStep(model, batch_size, max_sequence_length)
forward_step = ForwardStep(model, batch_size, max_prompt_length)
# ===================
# Pre-allocate memory
......@@ -58,7 +50,7 @@ def score_and_return_on_first_stage(model, tokens, lengths):
# Log probability of the sequence (prompt + generated tokens).
output_log_probs = None
output_log_probs_size = (batch_size, max_sequence_length - 1)
output_log_probs_size = (batch_size, max_prompt_length - 1)
if mpu.is_pipeline_last_stage():
output_log_probs = torch.empty(output_log_probs_size,
......@@ -395,7 +387,7 @@ def beam_search_and_return_on_first_stage(model, tokens, lengths, beam_size, sto
# if cannot find stop token, add open beams to hyps
if not done:
for beam_id in range(beam_size):
beam_hyp.add(tokens[beam_id].clone(), scores[beam_id], context_length + 1 - prompt_length)
beam_hyp.add(tokens[beam_id].clone(), scores[beam_id].squeeze(), context_length + 1 - prompt_length)
# rank based on scores
sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0], reverse=True)
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Sampling utilities.
Part of this code is inspired by:
......
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Tokenization utilities."""
......