Commit 67ea635f authored by aiss's avatar aiss
Browse files

push dsv0.8.2 version

parent 1b2721ad
Pipeline #201 failed with stages
in 0 seconds
import copy
'''Copyright The Microsoft DeepSpeed Team'''
import torch
import deepspeed
def quantize_transformer_layer(orig_layer_impl, model, megatron=False, preln=False):
......
'''
Copyright 2022 The Microsoft DeepSpeed Team
'''
from abc import ABC, abstractmethod
from deepspeed.utils.types import ActivationFuncType
import torch
from deepspeed.accelerator import get_accelerator
# Canonical ordering of the DeepSpeed inference transformer parameter names;
# checkpoint-loading code indexes into this tuple positionally.
transformer_param_names = (
    'attn_qkvw',
    'attn_qkvb',
    'attn_ow',
    'attn_ob',
    'inter_w',
    'inter_b',
    'output_w',
    'output_b',
    'attn_nw',
    'attn_nb',
    'norm_w',
    'norm_b',
)
class DSPolicy(ABC):
    """Abstract base class for DeepSpeed injection policies.

    A policy knows how to pull the relevant parameters out of a specific
    third-party layer implementation so DeepSpeed can replace it.
    """

    # The original (pre-replacement) layer class this policy targets, if any.
    _orig_layer_class = None

    def __init__(self):
        # Policies must opt in to CUDA-graph capture explicitly.
        self.cuda_graph_supported = False

    @abstractmethod
    def attention(self):
        """Return the attention qkv and dense parameters.

        weight: (3*hidden, hidden) and (hidden, hidden)
        bias:   (3*hidden) and (hidden)
        """
        raise NotImplementedError
class TransformerPolicy(DSPolicy):
    """Base policy for mapping transformer-style layers onto DeepSpeed
    inference modules. Subclasses implement the parameter accessors below."""

    # a static class variable containing the HuggingFace model configuration.
    # see e.g., transformers.models.opt.configuration_opt.OPTConfig
    hf_model_config = None

    def __init__(
            self,
            inference=True,
            linear_layer=True,
            scale_attention=True,
            megatron_v2=False,
            use_mup=False,
            # the type of activation function used in MLP
            mlp_act_func_type=ActivationFuncType.GELU,
            # applies layer norm before attention if `pre_attn_norm` is set to True
            pre_attn_norm=True,
            # this flag shows whether or not using prefix in loading the checkpoint
            use_load_prefix=False,
            # whether or not the qkv is stored in the split-format
            split_qkv=True):
        super().__init__()
        # Redundant with DSPolicy.__init__, kept so the attribute is visible here.
        self.cuda_graph_supported = False
        self.inference = inference
        self.linear_layer = linear_layer
        self.scale_attention = scale_attention
        self.is_megatron_v2 = megatron_v2
        self.use_mup = use_mup
        self.mlp_act_func_type = mlp_act_func_type
        self.pre_attn_norm = pre_attn_norm
        self.use_load_prefix = use_load_prefix
        self.split_qkv = split_qkv

    @abstractmethod
    def attention(self):
        """Return the attention qkv and dense parameters.

        weight: (3*hidden, hidden) and (hidden, hidden)
        bias:   (3*hidden) and (hidden)
        """
        raise NotImplementedError

    @abstractmethod
    def get_hidden_heads(self):
        """Return the hidden size and number of attention heads."""
        raise NotImplementedError

    @abstractmethod
    def mlp(self):
        """Return the MLP intermediate and output parameters.

        weight: (intermediate, hidden) and (hidden, intermediate)
        bias:   (intermediate) and (hidden)
        """
        raise NotImplementedError

    @abstractmethod
    def layernorm(self):
        """Return the LayerNorms used in the transformer layer:
        post-attention and pre/post layer norm, gamma/beta of shape (hidden)."""
        raise NotImplementedError
# TODO (lekurile): This function exists in base container as well, consolidate as some point
def transpose(data):
    """Transpose a 2-D tensor *in place*: the transposed values are written
    back into ``data``'s own storage, and a view with swapped dims is returned.

    Callers rely on the in-place mutation (memory reuse for large weights),
    so this must not allocate a fresh result tensor.
    """
    with torch.no_grad():
        data = data.contiguous()
        # reshape(-1) of the non-contiguous transpose view materializes a copy,
        # which is then written back over the original storage.
        flattened = data.transpose(-1, -2).reshape(-1)
        data.reshape(-1).copy_(flattened)
        flattened = None
    return data.reshape(data.shape[-1], data.shape[-2])
# TODO (lekurile): This function exists in megatron feature container as well, consolidate as some point
def _transpose(x, heads=1, mp_replace=None):
heads = heads // mp_replace.mp_size
outer_dim = -1
attention_head_size = x.shape[outer_dim] // heads
new_x_shape = x.size()[:outer_dim] + (heads, attention_head_size)
x_1 = x.view(*new_x_shape)
(q, k, v) = torch.split(x_1, (x_1.shape[-1] // 3), dim=-1)
if len(q.shape) > 2:
new_shape = (q.shape[0], ) + (-1, )
return torch.cat((q.reshape(new_shape),
k.reshape(new_shape),
v.reshape(new_shape)),
dim=outer_dim).reshape(x.shape)
else:
return torch.cat((q.reshape(-1),
k.reshape(-1),
v.reshape(-1)),
dim=-1).reshape(x.shape)
# This checks if the parameter exists in the checkpoint file and maybe copies it into the corresponding destination tensor.
# Note that not all parameters are saved in one checkpoint, that's why we always need to check if they exist!
def maybe_copy(module,
               sd,
               weight_quantizer,
               mp_replace,
               dst_name,
               src_name,
               qkv=False,
               megatron_v2=False,
               split_qkv=False,
               heads=1):
    """If ``src_name`` exists in the state dict ``sd``, copy it (with optional
    tensor-parallel slicing, quantization and Megatron-v2 head reordering)
    into ``module.<dst_name>``; otherwise do nothing.

    qkv:         True when the tensor is a fused query/key/value parameter.
    megatron_v2: True when the checkpoint uses Megatron-v2 head-interleaved layout.
    split_qkv:   True when qkv is stored split, so qkv_copy slicing is used.
    """
    if src_name in sd:
        dst = getattr(module, dst_name)
        tmp = sd[src_name]
        if len(dst.shape) == 1:  # 1-D tensor: a bias / layernorm vector
            if split_qkv:
                dst = mp_replace.qkv_copy(dst, tmp)
            else:
                dst = mp_replace.copy(dst, tmp)
            if qkv and megatron_v2:
                # Megatron-v2 interleaves heads inside the fused qkv bias;
                # reorder it into the layout the inference kernels expect.
                dst = torch.nn.parameter.Parameter(
                    _transpose(dst,
                               heads=heads,
                               mp_replace=mp_replace).contiguous())
        else:  # 2-D weight
            if split_qkv:
                # For int8 the raw tensor is quantized directly; otherwise it is
                # transposed first (the kernels want [input, output] layout).
                dst = mp_replace.qkv_copy(dst, weight_quantizer.quantize(tmp if weight_quantizer.q_int8 else \
                    (transpose(tmp).contiguous())), int8=weight_quantizer.q_int8)
            else:
                if qkv and megatron_v2:
                    # Un-interleave heads; transpose(tmp) mutates tmp in place.
                    tmp = _transpose(transpose(tmp),
                                     heads=heads,
                                     mp_replace=mp_replace).contiguous()
                    if weight_quantizer.q_int8:
                        # Undo the extra transpose so the int8 path quantizes
                        # the tensor in its original orientation.
                        tmp = transpose(tmp)
                dst = mp_replace.copy(dst, weight_quantizer.quantize(tmp if weight_quantizer.q_int8 else \
                    transpose(tmp)), int8=weight_quantizer.q_int8)
        setattr(module, dst_name, dst)
# Extending the maybe_copy function for when the q, k, and v are in separate parameters!
def maybe_copy_qkv(module,
                   sd,
                   weight_quantizer,
                   mp_replace,
                   dst_name,
                   src_names,
                   split_qkv=False):
    """Fuse separate q/k/v checkpoint tensors (named by ``src_names``) into a
    single qkv parameter and copy it into ``module.<dst_name>``.

    Presence of the first source name is used as the existence check for all
    three, matching the single-tensor ``maybe_copy`` behavior.
    """
    if src_names[0] not in sd:
        return
    q = sd[src_names[0]]
    k = sd[src_names[1]]
    v = sd[src_names[2]]
    qkv_data = torch.cat((q, k, v), dim=0)
    dst = getattr(module, dst_name)
    if len(dst.shape) == 1:
        # Bias path: no quantization, only tensor-parallel slicing.
        if split_qkv:
            dst = mp_replace.qkv_copy(dst, qkv_data.contiguous())
        else:
            dst = mp_replace.copy(dst, qkv_data)
    else:
        # Weight path: int8 weights are quantized as-is on the accelerator;
        # otherwise the fused tensor is transposed to [input, output] first.
        if weight_quantizer.q_int8:
            prepared = qkv_data.to(get_accelerator().device_name())
        else:
            prepared = (transpose(qkv_data)).contiguous() if split_qkv else transpose(qkv_data)
        quantized = weight_quantizer.quantize(prepared)
        if split_qkv:
            dst = mp_replace.qkv_copy(dst, quantized, int8=weight_quantizer.q_int8)
        else:
            dst = mp_replace.copy(dst, quantized, int8=weight_quantizer.q_int8)
    setattr(module, dst_name, dst)
import copy
'''Copyright The Microsoft DeepSpeed Team'''
import os
import torch
import tqdm
import deepspeed
import deepspeed.ops.transformer as transformer_inference
from .replace_policy import HFBertLayerPolicy, HFGPT2LayerPolicy, HFGPTJLayerPolicy
from .replace_policy import replace_policies
from ..constants import INFERENCE_GENERIC_MODE, INFERENCE_SPECIALIZED_MODE
from ..runtime.weight_quantizer import WeightQuantization
from deepspeed.ops.transformer.inference.diffusers_attention import DeepSpeedDiffusersAttention
from deepspeed.ops.transformer.inference.diffusers_transformer_block import DeepSpeedDiffusersTransformerBlock
from deepspeed.ops.transformer.inference.diffusers_2d_transformer import Diffusers2DTransformerConfig
from deepspeed.accelerator import get_accelerator
from .replace_policy import HFGPT2LayerPolicy
from .replace_policy import replace_policies, generic_policies
from deepspeed import comm as dist
from torch import nn
from .layers import LinearAllreduce, LinearLayer
from .load_checkpoint import load_model_with_checkpoint
import time
class LinearAllreduce(nn.Module):
    """Linear layer for row-parallel weights: the partial matmul result is
    all-reduced across ``mp_group`` before the (replicated) bias is added.

    ``weight`` is expected in [input, output] orientation (no transpose in
    forward).
    """

    def __init__(self, weight, bias=None, mp_group=None):
        super().__init__()
        self.weight = weight
        self.bias = bias
        self.mp_group = mp_group

    def forward(self, input):
        result = torch.matmul(input, self.weight)
        if self.mp_group is not None:
            # Sum partial results from all model-parallel ranks in place.
            torch.distributed.all_reduce(result, group=self.mp_group)
        if self.bias is not None:
            # Bias is added after the reduction so it is applied exactly once.
            result += self.bias
        return result
class LinearLayer(nn.Module):
    """Minimal linear layer holding pre-sliced tensors.

    Unlike ``torch.nn.Linear``, ``weight`` is used as-is in
    ``input @ weight`` (no transpose), so it must be [input, output] shaped.
    """

    def __init__(self, weight, bias=None):
        super().__init__()
        self.weight = weight
        self.bias = bias

    def forward(self, input):
        result = torch.matmul(input, self.weight)
        if self.bias is not None:
            result += self.bias
        return result
from .utils import policy_to_ds_container
class ReplaceWithTensorSlicing:
    """Copies checkpoint tensors into (possibly tensor-parallel) destination
    parameters, slicing along the appropriate dimension for this rank.

    NOTE(review): this span of the file contained merged old/new diff lines
    (duplicate ``__init__``/``qkv_copy``/``copy`` headers and a stray
    ``@@ ...`` hunk marker) that made the class a syntax error. This is the
    reconstructed post-commit (int8-aware) version built from the "new"
    lines of the diff.
    """

    def __init__(self, mp_group=None, mp_size=1, out_dim=1, in_dim=0):
        # Rank inside the model-parallel group selects which slice this
        # process keeps; rank 0 when not running model-parallel.
        if mp_group is not None:
            self.gpu_index = dist.get_rank(group=mp_group)
        else:
            self.gpu_index = 0
        self.out_dim = out_dim
        self.in_dim = in_dim
        self.mp_size = mp_size

    def merge_assert(self, dim1, dim2):
        # Slicing can only shrink a dimension; merging checkpoint shards must
        # be done through deepspeed's load_checkpoint path instead.
        assert dim1 > dim2, \
            'Merging tensors is not allowed here! Please use deepspeed load_checkpoint\
            for merging your checkpoints before replacing the transformer layer with\
            inference-kernels'

    def qkv_copy(self, dst, src, int8=False):
        """Copy a fused QKV tensor into ``dst``, slicing each of the q/k/v
        thirds separately so the per-rank slice stays q|k|v ordered.

        int8: int8 weights are stored transposed, so the "output" axis flips.
        """
        if src is None:
            return src
        src_shape = src.shape
        dst_shape = dst.shape

        outer_dim = 0 if int8 else -1
        inner_dim = -1 if int8 else 0

        src_split = torch.split(src.data, src.shape[outer_dim] // 3, dim=outer_dim)
        if (len(src_shape) == 2 and len(dst_shape) == 2):
            # 2-D weight tensor.
            if src_shape[outer_dim] == dst_shape[self.out_dim]:
                # No slicing needed; straight copy into dst's storage.
                dst = dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(
                    src.shape)
                dst = torch.nn.parameter.Parameter(dst, requires_grad=False)
                if hasattr(src, 'scale'):
                    dst.scale = src.scale
                return dst
            if self.out_dim == 1:
                self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim])
                qkv_size = dst_shape[self.out_dim] // 3
                qkv_split = [
                    torch.split(src_s,
                                qkv_size,
                                dim=outer_dim) for src_s in src_split
                ]
                # Re-fuse the i-th slice of q, k and v for each rank.
                weight_split = [
                    torch.cat([qkv_s[i] for qkv_s in qkv_split],
                              axis=outer_dim) for i in range(len(qkv_split[0]))
                ]
                dst = dst.reshape(-1).data.copy_(
                    weight_split[self.gpu_index].contiguous().reshape(-1)).reshape(
                        weight_split[self.gpu_index].shape)
            else:
                dst.data.copy_(src_split[self.gpu_index].to(
                    get_accelerator().current_device_name()).contiguous())
        else:
            # 1-D bias tensor.
            if src_shape[0] == dst_shape[0]:
                return torch.nn.parameter.Parameter(src)
            if self.out_dim == 1:
                qkv_size = dst_shape[0] // 3
                qkv_split = [torch.split(src_s, qkv_size, dim=0) for src_s in src_split]
                bias_split = [
                    torch.cat([qkv_s[i] for qkv_s in qkv_split],
                              axis=0) for i in range(len(qkv_split[0]))
                ]
                dst.data.copy_(bias_split[self.gpu_index].contiguous())
            else:
                dst.data.copy_(src_split[self.gpu_index].contiguous())

        dst = torch.nn.parameter.Parameter(dst, requires_grad=False)
        # Propagate the quantization scale when the source carries one.
        if hasattr(src, 'scale'):
            dst.scale = src.scale
        return dst

    def copy(self, dst, src, int8=False):
        """Copy ``src`` into ``dst``, slicing along in/out dimension when the
        shapes differ (tensor-parallel partitioning)."""
        if src is None:
            return src
        assert not dst.data.is_meta  # the torch.Tensor.copy_ method used below will silently fail on meta tensors
        outer_dim = 0 if int8 else 1
        inner_dim = 1 if int8 else 0
        src_shape = src.shape
        dst_shape = dst.shape
        if (len(src_shape) == 2 and len(dst_shape) == 2):
            if src_shape[inner_dim] == dst_shape[
                    self.in_dim] and src_shape[outer_dim] == dst_shape[self.out_dim]:
                # Shapes match: copy straight through dst's storage.
                dst = dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(
                    src.shape)
            else:
                # Exactly one dimension is partitioned; slice it for this rank.
                if src_shape[inner_dim] != dst_shape[self.in_dim]:
                    self.merge_assert(src_shape[inner_dim], dst_shape[self.in_dim])
                    weight_split = torch.split(
                        src,
                        dst_shape[self.in_dim],
                        dim=inner_dim)[self.gpu_index].contiguous()
                else:
                    self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim])
                    weight_split = torch.split(
                        src.data,
                        dst_shape[self.out_dim],
                        dim=outer_dim)[self.gpu_index].contiguous()
                dst = dst.reshape(-1).data.copy_(weight_split.reshape(-1)).reshape(
                    weight_split.shape)
        else:
            # 1-D bias tensor.
            if src_shape[0] == dst_shape[0]:
                dst.data.copy_(src)
            else:
                bias_split = torch.split(src.data,
                                         dst_shape[-1])[self.gpu_index].contiguous()
                dst.data.copy_(bias_split)
        dst = torch.nn.parameter.Parameter(dst, requires_grad=False)
        if hasattr(src, 'scale'):
            dst.scale = src.scale
        return dst
def get_transformer_name(replaced_module):
    """Return the dotted attribute path (e.g. ``"transformer.h"``) of the
    first supported sub-model's ``ModuleList`` of transformer layers, or an
    empty string when no supported model is found."""
    from .containers import supported_models
    from torch.nn import ModuleList

    prefix = ''
    for outer_name, outer_child in replaced_module.named_children():
        if outer_child.__class__ not in supported_models:
            continue
        prefix = outer_name + '.'
        for inner_name, inner_child in outer_child.named_children():
            if inner_child.__class__ is ModuleList:
                prefix += inner_name
                break
        break
    return prefix
class GroupQuantizer:
    """Group-wise symmetric int8 quantizer for inference weights.

    Each of ``num_groups`` row-groups gets its own scale derived from the
    group's absolute max; extra scales for the two halves along
    ``parallel_dim`` are appended for the tensor-parallel kernels.
    """

    def __init__(self, q_int8=True, group_size=1, num_bits=8, num_groups=0):
        self.group_size = group_size
        self.num_bits = num_bits
        self.q_int8 = q_int8
        self.num_groups = num_groups

    def quantize(self, inputs, qkv=True, count=1, parallel_dim=0):
        # Pass-through when int8 is disabled or the tensor is not quantized:
        # return the input wrapped as a frozen Parameter with a dummy scale.
        if not self.q_int8 or not qkv:
            inputs = torch.nn.Parameter(inputs, requires_grad=False)
            inputs.scale = torch.empty(1)
            return inputs

        q_range = 2**self.num_bits
        if self.num_groups > 0:
            num_groups = self.num_groups
        else:
            num_groups = inputs.shape[0] // self.group_size
        inputs = inputs.to(get_accelerator().current_device_name())

        # Per-group symmetric scale from the group's absolute extremum.
        grouped = inputs.reshape(num_groups, -1).contiguous()
        g_min = torch.min(grouped, dim=1, keepdim=True)[0].float()
        g_max = torch.max(grouped, dim=1, keepdim=True)[0].float()
        scale = torch.max(g_min.abs(), g_max.abs()) * 2.0 / (q_range)
        quantized = (grouped / scale).round().clamp(-q_range // 2, q_range // 2 - 1)
        out = torch.nn.Parameter(quantized.reshape(inputs.shape).to(torch.int8).contiguous(),
                                 requires_grad=False)

        # Additional scales for each half of the tensor along parallel_dim,
        # computed from the original (unquantized) values.
        halves = inputs.split(inputs.shape[parallel_dim] // 2, dim=parallel_dim)
        half_scales = []
        for i in range(2):
            half_flat = halves[i].reshape(num_groups, -1).contiguous()
            h_min = torch.min(half_flat, dim=1, keepdim=True)[0].float()
            h_max = torch.max(half_flat, dim=1, keepdim=True)[0].float()
            half_scales.append((torch.max(h_min.abs(),
                                          h_max.abs()) * 2.0 /
                                (q_range)).squeeze().unsqueeze(0))
        out.scale = torch.cat([scale.squeeze().unsqueeze(0)] + half_scales,
                              dim=0).reshape(num_groups, -1).contiguous()
        return out
def _module_match(module):
    """Return an instantiated generic policy that matches ``module``, or
    ``None`` when no policy in ``generic_policies`` claims it."""
    for policy_cls in generic_policies:
        candidate = policy_cls()
        if candidate.match(module):
            return candidate
    return None
def generic_injection(module, fp16=False, enable_cuda_graph=True):
    """Inject DeepSpeed inference kernels into a diffusers-style pipeline.

    Replaces CrossAttention / BasicTransformerBlock children with DeepSpeed
    equivalents and wraps the pipeline's text encoder in a CUDA-graph-capable
    DSClipEncoder. ``module`` is presumably a diffusers pipeline object (not a
    plain ``torch.nn.Module``) — TODO confirm against callers.
    """

    def replace_attn(child, policy):
        # The policy returns either 5 items (fused qkv weight) or 7 items
        # (separate q/k/v weights) plus dense weight/bias and dims.
        policy_attn = policy.attention(child)
        if policy_attn is None:
            return child
        if len(policy_attn) == 5:
            qkvw, attn_ow, attn_ob, hidden_size, heads = policy_attn
        else:
            qw, kw, vw, attn_ow, attn_ob, hidden_size, heads = policy_attn
        config = transformer_inference.DeepSpeedInferenceConfig(
            hidden_size=hidden_size,
            heads=heads,
            fp16=fp16,
            triangular_masking=False,
            max_out_tokens=4096,
        )
        attn_module = DeepSpeedDiffusersAttention(config)

        def transpose(data):
            # In-place transpose through the tensor's own storage, then view
            # with swapped dims.
            data = data.contiguous()
            data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1))
            data = data.reshape(data.shape[-1], data.shape[-2])
            # NOTE(review): the result of `.to(...)` is discarded here — this
            # line is a no-op for CPU tensors; likely intended
            # `data = data.to(...)`. Left unchanged (doc-only edit).
            data.to(get_accelerator().current_device_name())
            return data

        if len(policy_attn) == 5:
            attn_module.attn_qkvw.data = transpose(qkvw.data)
        else:
            # Separate q/k/v: clear the fused slot and fill the split slots.
            attn_module.attn_qkvw = None
            attn_module.attn_qw.data = transpose(qw.data)
            attn_module.attn_kw.data = transpose(kw.data)
            attn_module.attn_vw.data = transpose(vw.data)

        attn_module.attn_qkvb = None
        attn_module.attn_ow.data = transpose(attn_ow.data)
        attn_module.attn_ob.data.copy_(
            attn_ob.data.to(get_accelerator().current_device_name()))
        return attn_module

    def replace_attn_block(child, policy):
        # Wrap a diffusers BasicTransformerBlock with the DeepSpeed version.
        config = Diffusers2DTransformerConfig()
        return DeepSpeedDiffusersTransformerBlock(child, config)

    if isinstance(module, torch.nn.Module):
        # Plain nn.Module: nothing to inject at this level.
        pass
    else:
        # Pipeline-like object (e.g. a StableDiffusion pipeline) — TODO confirm.
        if fp16 is False:
            raise ValueError("Generic injection only supported with FP16")

        try:
            import diffusers
            cross_attention = diffusers.models.attention.CrossAttention
            attention_block = diffusers.models.attention.BasicTransformerBlock
            new_policies = {
                cross_attention: replace_attn,
                attention_block: replace_attn_block,
            }
        except ImportError:
            # diffusers not installed: nothing to replace.
            new_policies = {}

        #replace_transformer_layer(None,
        #                          module.text_encoder,
        #                          training=False,
        #                          replace_with_kernel_inject=True,
        #                          triangular_masking=True,
        #                          max_out_tokens=8192)
        from ..model_implementations.transformers.clip_encoder import DSClipEncoder
        cg_encoder = DSClipEncoder(module.text_encoder,
                                   enable_cuda_graph=enable_cuda_graph)
        setattr(module, 'text_encoder', cg_encoder)
        for name in module.__dict__.keys():
            sub_module = getattr(module, name)
            # Find a generic policy that claims this sub-module (e.g. UNet, VAE).
            policy = _module_match(sub_module)

            if policy is not None:

                def _replace_module(module, policy):
                    # Depth-first: replace grandchildren before checking child.
                    for name, child in module.named_children():
                        _replace_module(child, policy)
                        if child.__class__ in new_policies:
                            replaced_module = new_policies[child.__class__](child,
                                                                            policy)
                            setattr(module, name, replaced_module)

                _replace_module(sub_module, policy)
                new_module = policy.apply(sub_module,
                                          enable_cuda_graph=enable_cuda_graph)
                print(f"**** found and replaced {name} w. {type(new_module)}")
                setattr(module, name, new_module)
# Global handle to the first model-specific container created during layer
# replacement; reused later for generic checkpoint loading.
container_g = None
def replace_transformer_layer(orig_layer_impl,
model,
policy=None,
micro_batch_size=-1,
config=None,
seed=-1,
hidden_size=-1,
num_attention_heads=-1,
mp_size=1,
training_mp_size=1,
mp_group=None,
ep_group=None,
expert_mp_group=None,
preln=True,
fp16=True,
local_rank=-1,
stochastic_mode=True,
training=True,
quantize=False,
quantize_settings=None,
triangular_masking=False,
return_tuple=True,
replace_with_kernel_inject=False,
linear_layer_setting=None,
moe=False,
moe_experts=1,
moe_type='standard'):
checkpoint_dict,
config,
model_config):
""" Replace bert-style transformer layers with DeepSpeed's transformer layer
Arguments:
orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for,
e.g., transformers.modeling_bert.BertLayer.
model (torch.nn.Module): user's nn.module representing their model
policy: shows the policy for mapping from the orig_layer_impl to transformer parameters when
replace_with_kernel_inject is set, otherwise, it provides the names of two linear layers as
a tuple: (attention_output projection, transformer output projection)
micro_batch_size (int): micro batch size per gpu used during training/eval
config (dict): model config containing hidden size, attention heads, etc.
seed (int): random seed value
max_seq_length (int): max sequence length for training
hidden_size (int): hidden dimension
num_attention_heads (int): number of attention heads
mp_size (int): model_parallelism degree
mp_group : model_parallel group initialized on the modeling side
preln (bool): does the original layer implementation do pre or post layer norm?
fp16 (bool): fp16 or fp32
local_rank (int): GPU rank (optional),
stochastic_mode (bool): whether to use stochastic mode
training (bool): specifying whether kernel-injection is done for training/inference (set to false for inference-mode injection)
quantize_settings (tuple): this setting shows how we can quantize a model for running it through the inference kernels.
It includes (quantization_scales, merge_count, mlp_extra_grouping, quantize_groups).
return_tuple (bool): if set, transformer layer returns a tuple as the output.
Note: this flag needs to be set for huggingface models.
replace_with_kernel_inject (bool): injection_mode, if true, kernels will be add along with configuring
Tensor-Parallelism
linear_layer_setting (tuple of modules) [Optional]: shows which two classes are used for linear layers
and embedding layers
attention_params: (list of strings) [Optional]: shows the parameters in the attention part that needs to
be adjusted based on the model-parallelism
checkpoint_dict: Dictionary for checkpoint passed from the Inference Engine
config: top-level DS Inference config defined in inference/config.py
model_config: HuggingFace model config passed from the inference/engine.py
Returns:
Updated nn.module with replaced transformer layers
"""
# defining globals as internally defined functions inherit these everywhere
fp16 = (config.dtype == torch.float16 or config.dtype == torch.int8)
quantize = (config.dtype == torch.int8)
# todo: Refactor later. In future, let's minimize the style used above and use config.** instead
linear_layer_setting = None
'''
linear_layer_setting (tuple of modules) [Optional]: shows which two classes are used for linear layers and embedding layers
'''
micro_batch_size = -1
seed = -1
local_rank = -1
mp_replace = ReplaceWithTensorSlicing(
mp_group=config.tensor_parallel.tp_group,
mp_size=config.tensor_parallel.tp_size) #, out_dim=0, in_dim=1)
def replace_with_policy(child,
policy_cls,
triangular_masking,
inference=False,
preln=True,
layer_id=0):
preln = False if policy_cls is HFBertLayerPolicy else preln
if policy_cls is HFBertLayerPolicy:
policy = policy_cls(child, inference=inference, preln=preln)
else:
policy = policy_cls(child, inference=inference)
policy = policy_cls(child, inference=inference)
if not policy.cuda_graph_supported:
# policy says cuda graph is not supported raise an error if set
assert not config.enable_cuda_graph, "cuda graph is not supported with this model, please disable"
if inference:
hidden_size, num_attention_heads = policy.get_hidden_heads()
assert num_attention_heads % mp_size == 0,\
"To run the model parallel across the GPUs, the attention_heads require to be divisible by the world_size!" +\
"This is because the attention computation is partitioned evenly among the parallel GPUs."
from deepspeed.moe.layer import MoE
moe = False
if hasattr(child, 'mlp') and isinstance(child.mlp, MoE):
num_experts = child.mlp.num_experts
moe = True
attn_linear_layer, qkvw, qkvb, dense_w, dense_b, scale_attention, megatron_v2 = policy.attention()
if not moe or moe_type == 'standard':
mlp_linear_layer, _h4h_w, _h4h_b, _4hh_w, _4hh_b = policy.mlp()
else:
mlp_linear_layer, _h4h_w, _h4h_b, _4hh_w, _4hh_b, \
_res_h4h_w, _res_h4h_b, _res_4hh_w, _res_4hh_b, _res_coef = policy.mlp(moe_type)
attn_nw, attn_nb, input_nw, input_nb = policy.layerNorm()
if quantize:
if policy_cls is not HFBertLayerPolicy:
qkvw = qkvw.to(torch.int8)
dense_w = dense_w.to(torch.int8)
_h4h_w = [moe_w1.to(torch.int8)
for moe_w1 in _h4h_w] if moe else _h4h_w.to(torch.int8)
_4hh_w = [moe_w1.to(torch.int8)
for moe_w1 in _4hh_w] if moe else _4hh_w.to(torch.int8)
elif fp16:
qkvw = qkvw.half()
dense_w = dense_w.half()
_h4h_w = [moe_w1.half() for moe_w1 in _h4h_w] if moe else _h4h_w.half()
_4hh_w = [moe_w1.half() for moe_w1 in _4hh_w] if moe else _4hh_w.half()
if quantize or fp16:
qkvb = qkvb if qkvb is None else qkvb.half()
dense_b = dense_b if dense_b is None else dense_b.half()
_h4h_b = [moe_b1.half() for moe_b1 in _h4h_b] if moe else _h4h_b.half()
_4hh_b = [moe_b1.half() for moe_b1 in _4hh_b] if moe else _4hh_b.half()
attn_nw = attn_nw if attn_nw is None else attn_nw.half()
attn_nb = attn_nb if attn_nb is None else attn_nb.half()
input_nw = input_nw.half()
input_nb = input_nb.half()
if moe and moe_type == 'residual' and fp16:
_res_h4h_b = _res_h4h_b.half()
_res_4hh_b = _res_4hh_b.half()
_res_h4h_w = _res_h4h_w.half()
_res_4hh_w = _res_4hh_w.half()
_res_coef = _res_coef.half()
mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group)
#expert_mp_replace = ReplaceWithTensorSlicing(mp_group=expert_mp_group)
if inference:
if moe:
ep_world_size = torch.distributed.get_world_size()
local_ep_size = 1 if num_experts < ep_world_size else num_experts // ep_world_size
transformer_config = transformer_inference.DeepSpeedMoEInferenceConfig(
hidden_size=hidden_size,
heads=num_attention_heads,
layer_norm_eps=config.layer_norm_eps if hasattr(
config,
'layer_norm_eps') else 1e-12,
fp16=fp16,
pre_layer_norm=preln,
mp_size=mp_size,
q_int8=quantize,
moe_experts=local_ep_size,
global_experts=num_experts,
mlp_type=moe_type)
else:
rotary_dim = config.rotary_dim if hasattr(config, 'rotary_dim') else child.attention.rotary_ndims \
if hasattr(child, 'attention') and hasattr(child.attention,'rotary_ndims') else -1
transformer_config = transformer_inference.DeepSpeedInferenceConfig(
hidden_size=hidden_size,
heads=num_attention_heads,
layer_norm_eps=config.layer_norm_eps if hasattr(
config,
'layer_norm_eps') else
(config.layer_norm_epsilon
if hasattr(config,
'layer_norm_epsilon') else config.layernorm_epsilon
if hasattr(config,
'layernorm_epsilon') else 1.0e-12),
fp16=fp16,
pre_layer_norm=preln,
mp_size=mp_size,
q_int8=quantize,
return_tuple=(return_tuple or (policy_cls is HFBertLayerPolicy)),
triangular_masking=(policy_cls is not HFBertLayerPolicy),
local_attention=((config.attention_layers[layer_id] == "local")
if hasattr(config,
'attention_layers') else False),
window_size=(config.window_size if hasattr(config,
'window_size') else 1),
rotary_dim=rotary_dim,
mlp_after_attn=(rotary_dim is None or rotary_dim < 0),
training_mp_size=training_mp_size)
if quantize and quantize_settings is not None:
(quantization_scales,
merge_count,
mlp_extra_grouping,
quantize_groups) = quantize_settings
if moe:
new_module = transformer_inference.DeepSpeedMoEInference(
transformer_config,
mp_group=mp_group,
ep_group=None if ep_group is None else ep_group[num_experts],
expert_mp_group=None
if expert_mp_group is None else expert_mp_group[num_experts],
quantize_scales=quantization_scales[layer_id],
quantize_groups=quantize_groups,
merge_count=merge_count,
mlp_extra_grouping=mlp_extra_grouping,
qkv_merging=(policy_cls is HFBertLayerPolicy))
# 1. Create a model-specific container object using the policy object.
_container = policy_to_ds_container(policy=policy,
config=config,
model_config=model_config,
layer_id=layer_id,
child=child)
_container.set_dtype(fp16)
_container.set_moe(moe)
else:
new_module = transformer_inference.DeepSpeedTransformerInference(
transformer_config,
mp_group=mp_group,
quantize_scales=quantization_scales[layer_id],
quantize_groups=quantize_groups,
merge_count=merge_count,
mlp_extra_grouping=mlp_extra_grouping,
qkv_merging=(policy_cls is HFBertLayerPolicy))
if quantize and qkvw.dtype != torch.int8:
quantize_bits = 8
quantizer = WeightQuantization()
if policy_cls is HFBertLayerPolicy:
data_quantized, _ = quantizer.quantize_data(qkvw.data, quantize_bits, quantize_groups * 3)
else:
data_quantized, _ = quantizer.quantize_data(qkvw.data, quantize_bits, quantize_groups)
qkvw.data.copy_(data_quantized)
qkvw.data = qkvw.data.to(torch.int8)
else:
# 2. Set the tensor parallelism config
_container.set_tensor_parallel_config(config.tensor_parallel.tp_size,
config.tensor_parallel.tp_group)
if moe:
new_module = transformer_inference.DeepSpeedMoEInference(
transformer_config,
mp_group=mp_group,
ep_group=None if ep_group is None else ep_group[num_experts],
expert_mp_group=None
if expert_mp_group is None else expert_mp_group[num_experts],
)
# 3. Initialize tensors
_container.initialize_tensors()
else:
new_module = transformer_inference.DeepSpeedTransformerInference(
transformer_config,
mp_group=mp_group,
)
new_module.config.scale_attention = scale_attention
# we want the weights in [input, output] shape
# linear layer is created with [input, output] shape
# transpose it here to reduce inference cost!
def transpose(data):
data.view(-1).copy_(data.transpose(-1, -2).contiguous().view(-1))
data = data.reshape(data.shape[-1], data.shape[-2])
return data
if attn_linear_layer:
qkvw.data = transpose(qkvw.data)
dense_w.data = transpose(dense_w.data)
if megatron_v2:
new_module.config.rotate_half = True
new_module.config.rotate_every_two = False
def _transpose(x):
num_attention_heads_per_partition = transformer_config.heads // transformer_config.mp_size
attention_head_size = x.shape[-1] // num_attention_heads_per_partition
new_x_shape = x.size()[:-1] + (num_attention_heads_per_partition,
attention_head_size)
x_1 = x.view(*new_x_shape)
(q,
k,
v) = torch.split(x_1,
(x_1.shape[-1] // 3),
dim=(x_1.dim() - 1))
if len(q.shape) > 2:
return torch.cat((q.reshape(q.shape[0],
-1),
k.reshape(q.shape[0],
-1),
v.reshape(q.shape[0],
-1)),
dim=-1).reshape(x.shape)
else:
return torch.cat((q.reshape(-1),
k.reshape(-1),
v.reshape(-1)),
dim=-1).reshape(x.shape)
qkvw = torch.nn.Parameter(_transpose(qkvw).contiguous())
qkvb = torch.nn.Parameter(_transpose(qkvb).contiguous())
dense_b = dense_b * (transformer_config.training_mp_size /
transformer_config.mp_size)
_4hh_b = _4hh_b * (transformer_config.training_mp_size /
transformer_config.mp_size)
if mlp_linear_layer:
_h4h_w = [transpose(moe_w1.data)
for moe_w1 in _h4h_w] if moe else transpose(_h4h_w.data)
_4hh_w = [transpose(moe_w1.data)
for moe_w1 in _4hh_w] if moe else transpose(_4hh_w.data)
if moe and moe_type == 'residual':
_res_h4h_w.data = transpose(_res_h4h_w.data)
_res_4hh_w.data = transpose(_res_4hh_w.data)
_res_coef.data = transpose(_res_coef.data)
attn_block = new_module.attention
attn_block.attn_qkvw = mp_replace.qkv_copy(attn_block.attn_qkvw, qkvw)
attn_block.attn_qkvb = mp_replace.qkv_copy(attn_block.attn_qkvb, qkvb)
attn_block.attn_ow = mp_replace.copy(attn_block.attn_ow, dense_w)
attn_block.attn_ob = mp_replace.copy(attn_block.attn_ob, dense_b)
mpl_block = new_module.mlp
if moe:
gpu_index = torch.distributed.get_rank()
gpu_index = 0
for ep_index in range(local_ep_size):
mpl_block[ep_index].inter_w.data = _h4h_w[
gpu_index * local_ep_size + ep_index].to(
torch.cuda.current_device())
mpl_block[ep_index].inter_b.data = _h4h_b[
gpu_index * local_ep_size + ep_index].to(
torch.cuda.current_device())
mpl_block[ep_index].output_w.data = _4hh_w[
gpu_index * local_ep_size + ep_index].to(
torch.cuda.current_device())
mpl_block[ep_index].output_b.data = _4hh_b[
gpu_index * local_ep_size + ep_index].to(
torch.cuda.current_device())
new_module.attn_nw.data = attn_nw.to(torch.cuda.current_device())
new_module.attn_nb.data = attn_nb.to(torch.cuda.current_device())
if moe_type == 'residual':
new_module.res_mlp.inter_w.data = _res_h4h_w.to(
torch.cuda.current_device())
new_module.res_mlp.inter_b.data = _res_h4h_b.to(
torch.cuda.current_device())
new_module.res_mlp.output_w.data = _res_4hh_w.to(
torch.cuda.current_device())
new_module.res_mlp.output_b.data = _res_4hh_b.to(
torch.cuda.current_device())
new_module.res_coef.data = _res_coef.to(torch.cuda.current_device())
else:
mpl_block.inter_w.data = mp_replace.copy(mpl_block.inter_w, _h4h_w)
mpl_block.inter_b.data = mp_replace.copy(mpl_block.inter_b, _h4h_b)
mpl_block.output_w.data = mp_replace.copy(mpl_block.output_w, _4hh_w)
mpl_block.output_b.data = mp_replace.copy(mpl_block.output_b, _4hh_b)
if attn_nw is None:
new_module.mlp.attn_nw = attn_nw
else:
new_module.mlp.attn_nw.data = attn_nw.to(torch.cuda.current_device())
if attn_nb is None:
new_module.mlp.attn_nb = attn_nb
else:
new_module.mlp.attn_nb.data = attn_nb.to(torch.cuda.current_device())
new_module.norm_w.data = input_nw.to(torch.cuda.current_device())
new_module.norm_b.data = input_nb.to(torch.cuda.current_device())
else:
transformer_config = deepspeed.DeepSpeedTransformerConfig(
batch_size=micro_batch_size,
hidden_size=config.hidden_size,
heads=config.num_attention_heads,
attn_dropout_ratio=config.attention_probs_dropout_prob,
hidden_dropout_ratio=config.hidden_dropout_prob,
num_hidden_layers=config.num_hidden_layers,
initializer_range=config.initializer_range,
layer_norm_eps=config.layer_norm_eps if hasattr(
config,
'layer_norm_eps') else 1e-12,
seed=seed,
fp16=fp16,
pre_layer_norm=(False if policy_cls is HFBertLayerPolicy else preln),
return_tuple=return_tuple,
local_rank=local_rank,
stochastic_mode=stochastic_mode,
normalize_invertible=True,
training=training)
new_module = deepspeed.DeepSpeedTransformerLayer(transformer_config)
new_module.attn_qkvw.data = qkvw
new_module.attn_qkvb.data = qkvb
new_module.attn_ow.data = dense_w
new_module.attn_ob.data = dense_b
new_module.attn_nw.data = attn_nw
new_module.attn_nb.data = attn_nb
new_module.norm_w.data = input_nw
new_module.norm_b.data = input_nb
new_module.inter_w.data = _h4h_w
new_module.inter_b.data = _h4h_b
new_module.output_w.data = _4hh_w
new_module.output_b.data = _4hh_b
return new_module
# 4. deal with data types -- needs refactor to use dtype instead of fp16
if fp16:
_container.convert_to_required_dtype(dtype=torch.half)
# 5. Set the quantization config
quantizer = GroupQuantizer(q_int8=quantize)
_container.set_quantization_config(quantize, quantizer)
# 6. create a DS Inference config object
_container.create_ds_model_config()
# 7. use the config and create the module
_container.create_module()
# 8. transpose the weights and bias if needed
_container.transpose()
# 9. deal with tensor parallelism.
_container.apply_tensor_parallelism(mp_replace)
# 10. copy the tensors from the model-specific container to the new module
_container.copy_data_to_new_module()
# 11. set global for generic checkpoint loading
global container_g
if container_g is None:
container_g = _container
return _container.module
def replace_wo_policy(module, all_reduce_linears):
mp_size = config.tensor_parallel.tp_size
mp_group = config.tensor_parallel.tp_group
def _replace(child, name, conv_linear_layer):
mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group)
weight_shape = child.weight.shape
if name in all_reduce_linears:
new_weight = torch.empty(
(child.weight.shape[0]
if conv_linear_layer else child.weight.shape[1] // mp_size,
child.weight.shape[1]
if conv_linear_layer else child.weight.shape[0]),
device=child.weight.device,
dtype=torch.half if fp16 else torch.float)
if not conv_linear_layer:
child.weight.data.view(-1).copy_(
child.weight.data.transpose(-1,
-2).contiguous().view(-1))
child.weight.data = child.weight.data.reshape(
child.weight.data.shape[-1],
child.weight.data.shape[-2])
data = mp_replace.copy(new_weight,
child.weight.data).to(torch.cuda.current_device())
new_weight = torch.empty((
weight_shape[1] if conv_linear_layer else weight_shape[0],
(weight_shape[0] if conv_linear_layer else weight_shape[1]) //
mp_size,
),
device=child.weight.device,
dtype=child.weight.dtype)
if conv_linear_layer:
child.weight.data = child.weight.data.transpose(-1, -2).contiguous()
data = mp_replace.copy(new_weight, child.weight.data)
new_bias = torch.empty((weight_shape[0]),
device=child.weight.device,
dtype=child.weight.dtype)
if child.bias is not None:
new_bias.data.copy_(child.bias.data)
return LinearAllreduce(data, child.bias if child.bias is None else \
child.bias.to(torch.cuda.current_device()), mp_group)
torch.nn.parameter.Parameter(new_bias.to(get_accelerator().current_device_name())), mp_group)
else:
new_weight = torch.empty(
(child.weight.shape[0] //
mp_size if conv_linear_layer else child.weight.shape[1],
child.weight.shape[1]
if conv_linear_layer else child.weight.shape[0] // mp_size),
device=child.weight.device,
dtype=torch.half if fp16 else torch.float)
if not conv_linear_layer:
child.weight.data.view(-1).copy_(
child.weight.data.transpose(-1,
-2).contiguous().view(-1))
child.weight.data = child.weight.data.reshape(
child.weight.data.shape[-1],
child.weight.data.shape[-2])
new_weight = torch.empty((
(weight_shape[1] if conv_linear_layer else weight_shape[0]) //
mp_size,
weight_shape[0] // mp_size if conv_linear_layer else weight_shape[1],
),
device=child.weight.device,
dtype=child.weight.dtype)
if conv_linear_layer:
child.weight.data = child.weight.data.transpose(-1, -2).contiguous()
data = mp_replace.copy(new_weight, child.weight.data)
new_bias = torch.empty((child.weight.shape[1] // mp_size),
new_bias = torch.empty((weight_shape[0] // mp_size),
device=child.weight.device,
dtype=torch.half if fp16 else torch.float)
dtype=child.weight.dtype)
bias_data = None if child.bias is None else mp_replace.copy(
new_bias,
child.bias.data).to(torch.cuda.current_device())
return LinearLayer(data.to(torch.cuda.current_device()), bias_data)
child.bias.data).to(get_accelerator().current_device_name())
return LinearLayer(weight=data.to(
get_accelerator().current_device_name()),
bias=bias_data)
def _slice_embedding(child, name, conv_linear_layer):
mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group)
......@@ -547,7 +454,9 @@ def replace_transformer_layer(orig_layer_impl,
child.weight.shape[1] // mp_size),
device=child.weight.device,
dtype=child.weight.dtype)
data = mp_replace.copy(new_weight, child.weight.data)
data = mp_replace.copy(new_weight,
child.weight.ds_tensor.data if hasattr(child.weight, 'ds_tensor') else \
child.weight.data)
new_embedding = nn.Embedding(child.weight.shape[0],
child.weight.shape[1] // mp_size)
new_embedding.weight.data.copy_(data)
......@@ -562,10 +471,14 @@ def replace_transformer_layer(orig_layer_impl,
child.num_heads = child.num_heads // mp_size
if hasattr(child, 'num_attention_heads'):
child.num_attention_heads = child.num_attention_heads // mp_size
if hasattr(child, 'num_attn_heads'):
child.num_attn_heads = child.num_attn_heads // mp_size
if hasattr(child, 'all_head_size'):
child.all_head_size = child.all_head_size // mp_size
if hasattr(child, 'embed_dim'):
child.embed_dim = child.embed_dim // mp_size
if hasattr(child, 'hidden_size'):
child.hidden_size = child.hidden_size // mp_size
conv_linear_layer = False
if linear_layer_setting is not None:
......@@ -600,32 +513,193 @@ def replace_transformer_layer(orig_layer_impl,
return _replace_module(module)
def replace_fn(child, _policy, layer_id=0):
training = False # todo: refactor this part to go in the config
if training:
# copy relevant state from child -> new module
new_module = replace_with_policy(child,
_policy,
triangular_masking,
preln=preln)
new_module = replace_with_policy(child, _policy, config.triangular_masking)
else:
# copy relevant state from child -> new module
if replace_with_kernel_inject:
if config.replace_with_kernel_inject:
new_module = replace_with_policy(child,
_policy,
triangular_masking,
config.triangular_masking,
inference=True,
preln=(_policy
is not HFBertLayerPolicy),
layer_id=layer_id)
else:
new_module = replace_wo_policy(child, _policy)
return new_module
return replace_module(model=model,
orig_class=orig_layer_impl,
replace_fn=replace_fn,
_replace_policy=policy)
replaced_module = replace_module(model=model,
orig_class=orig_layer_impl,
replace_fn=replace_fn,
_replace_policy=config.injection_policy_tuple)
quantizer = GroupQuantizer(q_int8=quantize)
world_size = dist.get_world_size() if dist.is_initialized() else 1
rank = dist.get_rank() if dist.is_initialized() else 0
if checkpoint_dict is not None:
assert container_g.ckpt_load_enabled, \
f"Meta Tensor checkpoint loading not supported in {container_g.__class__.__name__} container"
start_time = time.time()
checkpoint = checkpoint_dict['checkpoints']
ckpt_list = checkpoint["tp"] if type(checkpoint) is dict else checkpoint
ckpt_type = checkpoint_dict.get('parallelization', 'pp')
ckpt_mp_size = checkpoint_dict.get('tp_size', len(ckpt_list))
ckpt_mp_size = checkpoint_dict.get('mp_size', ckpt_mp_size)
base_dir1 = checkpoint_dict.get('base_dir', config.base_dir)
if ckpt_type == 'pp' and type(checkpoint) is list:
pbar = tqdm.tqdm(total=len(checkpoint),
desc=f"Loading {len(checkpoint)} checkpoint shards")
for i in range(len(checkpoint)):
sd = [
torch.load(os.path.join(base_dir1,
checkpoint[i]),
map_location='cpu')
]
load_model_with_checkpoint(replaced_module,
sd,
mp_replace,
ckpt_type,
ckpt_mp_size,
quantizer,
container=container_g)
pbar.update(1)
else:
import gc
num_checkpoints = len(ckpt_list) // ckpt_mp_size
tp_split_size = (world_size / ckpt_mp_size)
sd_offset = int(rank / tp_split_size)
sd_count = int((rank + max(1, tp_split_size)) / tp_split_size) - sd_offset
pbar = tqdm.tqdm(total=num_checkpoints,
desc=f"Loading {num_checkpoints} checkpoint shards")
for i in range(num_checkpoints):
pbar.update(1)
ckpt_index = i * ckpt_mp_size + sd_offset
ckpt_files = [
os.path.join(base_dir1,
ckpt_list[ckpt_index +
j]) if base_dir1 else ckpt_list[ckpt_index +
j]
for j in range(sd_count)
]
sds = [
torch.load(ckpt_file,
map_location='cpu') for ckpt_file in ckpt_files
]
load_model_with_checkpoint(replaced_module,
sds,
mp_replace,
ckpt_type,
ckpt_mp_size,
quantizer,
int(rank % tp_split_size),
container=container_g)
sds = [None for _ in sds]
gc.collect()
if "non_tp" in checkpoint:
pbar = tqdm.tqdm(
total=len(checkpoint["non_tp"]),
desc=f"Loading {len(checkpoint['non_tp'])} checkpoint shards")
for i in range(len(checkpoint["non_tp"])):
pbar.update(1)
ckpt_file = os.path.join(base_dir1,
checkpoint["non_tp"][i]
) if base_dir1 else checkpoint["non_tp"][i]
sds = [torch.load(ckpt_file, map_location='cpu')]
load_model_with_checkpoint(replaced_module,
sds,
mp_replace,
ckpt_type,
ckpt_mp_size,
quantizer,
int(rank % tp_split_size),
container=container_g)
sds = [None for _ in sds]
gc.collect()
print(f"checkpoint loading time at rank {rank}: {time.time()-start_time} sec")
if config.save_mp_checkpoint_path is not None:
from collections import OrderedDict
import json
num_partitions = 8
if checkpoint_dict is None:
ckpt_name = "ds_model"
try:
from transformers.models.bloom.modeling_bloom import BloomForCausalLM
if isinstance(model, BloomForCausalLM):
ckpt_name = "bloom"
except ImportError:
ckpt_name = "ds_model"
else:
ckpt_name = checkpoint_dict['type']
if dist.is_initialized():
dist.barrier()
transformer_name = get_transformer_name(replaced_module)
non_tp_ckpt_name = f'non-tp.pt'
ckpt_files = [non_tp_ckpt_name]
os.makedirs(config.save_mp_checkpoint_path, exist_ok=True)
if not dist.is_initialized() or dist.get_rank() == 0:
print("Saving tp-sharded checkpoints")
torch.save(
OrderedDict({
k: v
for k,
v in dict(replaced_module.state_dict()).items()
if transformer_name not in k
}),
f'{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}')
ckpt_config = json.dumps({
'type':
ckpt_name,
'base_dir':
f'{config.save_mp_checkpoint_path}',
'checkpoints': {
"non_tp":
ckpt_files,
"tp": [
f'tp_{r:0>2d}_{m:0>2d}.pt' for m in range(num_partitions)
for r in range(world_size)
]
},
'version':
1.0,
'parallelization':
'tp',
'tp_size':
world_size,
'dtype':
'int8' if quantize else ('float16' if fp16 else 'float32')
})
with open(f"{config.save_mp_checkpoint_path}/ds_inference_config.json",
"w") as cfg:
cfg.write(ckpt_config)
rep_sd = replaced_module.state_dict()
for n, p in replaced_module.named_parameters():
if hasattr(p, 'scale'):
rep_sd[n] = [p, p.scale]
keys = list(rep_sd.keys())
partition_size = (len(keys) // num_partitions + 1)
for m in range(num_partitions):
torch.save(
OrderedDict({
k: [rep_sd[k],
rep_sd[k].scale] if hasattr(rep_sd[k],
'scale') else rep_sd[k]
for k in keys[m * partition_size:(m + 1) * partition_size]
if transformer_name in k
}),
f'{config.save_mp_checkpoint_path}/tp_{rank:0>2d}_{m:0>2d}.pt')
return replaced_module
def revert_transformer_layer(orig_layer_impl, model, config, preln=False):
......@@ -751,4 +825,6 @@ def _replace_module(model, policies, layer_id=0):
else:
_, layer_id = _replace_module(child, policies, layer_id=layer_id)
# Add the reset_cache func to the model, so that it can be called in the beginning of text-generation.
model.reset_cache = transformer_inference.DeepSpeedTransformerInference.reset_cache
return model, layer_id
from abc import ABC
import torch
from torch.nn.parameter import Parameter
class DSPolicy(ABC):
    """Base class for DeepSpeed layer-injection policies.

    A policy knows how to pull the attention/MLP/LayerNorm parameters out of
    one specific client transformer-layer implementation so DeepSpeed can
    copy them into its fused kernels. Subclasses override the accessor
    methods below for their particular model family.
    """

    def __init__(self,
                 inference=True,
                 linear_layer=True,
                 scale_attention=True,
                 megatron_v2=False):
        # True when the policy is used for inference-time kernel injection.
        self.inference = inference
        # True for nn.Linear-style weights; False for Conv1D-style layers
        # (see HFGPT2LayerPolicy, which passes linear_layer=False).
        self.linear_layer = linear_layer
        # Whether the kernel should scale the attention scores.
        self.scale_attention = scale_attention
        # Megatron-v2 checkpoints use a different qkv parameter layout.
        self.is_megatron_v2 = megatron_v2

    def attention(self):
        """Returns attention qkv and dense parameters
        weight: (3*hidden, hidden) and (hidden, hidden)
        bias: (3*hidden) and (hidden)
        """
        raise NotImplementedError

    def get_hidden_heads(self):
        """return hidden_size and number of heads"""
        raise NotImplementedError

    def mlp(self):
        """Returns mlp intermediate and output
        weight: (intermediate, hidden) and (hidden, intermediate)
        bias: (intermediate) and (hidden)
        """
        raise NotImplementedError

    def layerNorm(self):
        """Returns LayerNorms used in transformer layer
        Post-Attention and pre/post layer norm
        gamma and beta with shape: (hidden)
        """
        raise NotImplementedError
class HFBertLayerPolicy(DSPolicy):
    """Injection policy for HuggingFace BERT/RoBERTa encoder layers.

    Maps the parameters of ``BertLayer``/``RobertaLayer`` onto the tensors
    expected by the DeepSpeed inference kernels.
    """
    _orig_layer_class = None

    def __init__(self, client_module, inference=False, preln=False):
        super().__init__(inference)
        self.client_module = client_module
        # Pre-LN variants expose the MLP input projection as `dense_act`.
        self.preln = preln
        if HFBertLayerPolicy._orig_layer_class is None:
            try:
                import transformers
                HFBertLayerPolicy._orig_layer_class = [
                    transformers.models.bert.modeling_bert.BertLayer,
                    transformers.models.roberta.modeling_roberta.RobertaLayer
                ]
            except (ImportError, AttributeError):
                # Narrowed from a bare `except:`: only transformers being
                # missing or too old to expose these classes is expected here;
                # anything else should surface. Matches the ImportError style
                # used by the Megatron/GPT-NeoX policies in this file.
                HFBertLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_attention_heads) of the client layer."""
        return self.client_module.attention.self.query.weight.shape[1], \
                self.client_module.attention.self.num_attention_heads

    def attention(self):
        """Fuse q/k/v into single qkv tensors and return the attention
        parameters: (is_linear, qkvw, qkvb, dense_w, dense_b, scale, mv2)."""
        qw = self.client_module.attention.self.query.weight
        qb = self.client_module.attention.self.query.bias
        kw = self.client_module.attention.self.key.weight
        kb = self.client_module.attention.self.key.bias
        vw = self.client_module.attention.self.value.weight
        vb = self.client_module.attention.self.value.bias

        # Concatenate along the output dimension: (3*hidden, hidden)/(3*hidden,)
        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)
        qkvb = Parameter(torch.cat((qb, kb, vb), dim=0), requires_grad=False)

        return self.linear_layer, \
               qkvw, \
               qkvb, \
               self.client_module.attention.output.dense.weight, \
               self.client_module.attention.output.dense.bias, \
               self.scale_attention, \
               self.is_megatron_v2

    def mlp(self):
        """Return (is_linear, inter_w, inter_b, output_w, output_b)."""
        if self.preln:
            intermediate_ff = self.client_module.intermediate.dense_act
        else:
            intermediate_ff = self.client_module.intermediate.dense

        return self.linear_layer, intermediate_ff.weight, intermediate_ff.bias, \
            self.client_module.output.dense.weight, \
            self.client_module.output.dense.bias

    def layerNorm(self):
        """Return (attn_nw, attn_nb, input_nw, input_nb) LayerNorm params;
        pre-LN and post-LN variants store them under different attributes."""
        if self.preln:
            attention_layernorm = self.client_module.PostAttentionLayerNorm
            transformer_layernorm = self.client_module.PreAttentionLayerNorm
        else:
            attention_layernorm = self.client_module.attention.output.LayerNorm
            transformer_layernorm = self.client_module.output.LayerNorm
        return attention_layernorm.weight, \
               attention_layernorm.bias, \
               transformer_layernorm.weight, \
               transformer_layernorm.bias
class HFGPTNEOLayerPolicy(DSPolicy):
    """Injection policy for HuggingFace GPT-Neo transformer blocks."""
    _orig_layer_class = None

    def __init__(self, client_module, inference=True):
        # GPT-Neo does not scale attention scores.
        super().__init__(inference, scale_attention=False)
        self.client_module = client_module
        # Resolve the original layer class once per process (the previous
        # code re-imported on every instantiation, unlike the other policies).
        if HFGPTNEOLayerPolicy._orig_layer_class is None:
            try:
                import transformers
                HFGPTNEOLayerPolicy._orig_layer_class = \
                    transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoBlock
            except (ImportError, AttributeError):
                # Narrowed from a bare `except:`: only a missing/old
                # transformers install is expected here.
                HFGPTNEOLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_heads) of the client layer."""
        return self.client_module.attn.attention.q_proj.weight.shape[1], \
                self.client_module.attn.attention.num_heads

    def attention(self):
        """Fuse q/k/v projection weights; GPT-Neo q/k/v have no bias, so the
        qkv bias slot is None."""
        qw = self.client_module.attn.attention.q_proj.weight
        kw = self.client_module.attn.attention.k_proj.weight
        vw = self.client_module.attn.attention.v_proj.weight

        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)

        return self.linear_layer, \
                qkvw, \
                None, \
                self.client_module.attn.attention.out_proj.weight, \
                self.client_module.attn.attention.out_proj.bias, \
                self.scale_attention, \
                self.is_megatron_v2

    def mlp(self):
        """Return (is_linear, inter_w, inter_b, output_w, output_b)."""
        return self.linear_layer, \
                self.client_module.mlp.c_fc.weight, \
                self.client_module.mlp.c_fc.bias, \
                self.client_module.mlp.c_proj.weight, \
                self.client_module.mlp.c_proj.bias

    def layerNorm(self):
        """Return (attn_nw, attn_nb, input_nw, input_nb): ln_2 precedes the
        MLP, ln_1 precedes attention."""
        return self.client_module.ln_2.weight, \
               self.client_module.ln_2.bias, \
               self.client_module.ln_1.weight, \
               self.client_module.ln_1.bias
class HFGPTJLayerPolicy(DSPolicy):
    """Injection policy for HuggingFace GPT-J transformer blocks."""
    _orig_layer_class = None

    def __init__(self, client_module, inference=True):
        super().__init__(inference, scale_attention=True)
        self.client_module = client_module
        # Resolve the original layer class once per process (the previous
        # code re-imported on every instantiation, unlike the other policies).
        if HFGPTJLayerPolicy._orig_layer_class is None:
            try:
                import transformers
                HFGPTJLayerPolicy._orig_layer_class = \
                    transformers.models.gptj.modeling_gptj.GPTJBlock
            except (ImportError, AttributeError):
                # Narrowed from a bare `except:`: only a missing/old
                # transformers install is expected here.
                HFGPTJLayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_attention_heads) of the client layer."""
        return self.client_module.attn.q_proj.weight.shape[1], \
                self.client_module.attn.num_attention_heads

    def attention(self):
        """Fuse q/k/v projection weights; GPT-J q/k/v and out_proj carry no
        biases, so both bias slots are None."""
        qw = self.client_module.attn.q_proj.weight
        kw = self.client_module.attn.k_proj.weight
        vw = self.client_module.attn.v_proj.weight

        qkvw = Parameter(torch.cat((qw, kw, vw), dim=0), requires_grad=False)

        return self.linear_layer, \
                qkvw, \
                None, \
                self.client_module.attn.out_proj.weight, \
                None, \
                self.scale_attention, \
                self.is_megatron_v2

    def mlp(self):
        """Return (is_linear, inter_w, inter_b, output_w, output_b)."""
        return self.linear_layer, \
                self.client_module.mlp.fc_in.weight, \
                self.client_module.mlp.fc_in.bias, \
                self.client_module.mlp.fc_out.weight, \
                self.client_module.mlp.fc_out.bias

    def layerNorm(self):
        """Return (attn_nw, attn_nb, input_nw, input_nb). GPT-J exposes only
        the single ln_1, so the post-attention norm slots are None."""
        return None, \
               None, \
               self.client_module.ln_1.weight, \
               self.client_module.ln_1.bias
class MegatronLayerPolicy(DSPolicy):
    """Injection policy for Megatron-LM ``ParallelTransformerLayer``."""
    _orig_layer_class = None
    # version 0 -> old megatron-lm (attention under `attention`);
    # newer megatron-lm exposes it as `self_attention` instead.
    version = 0
    moe_type = 'standard'

    def __init__(self, client_module, inference=True):
        super().__init__(inference)
        self.client_module = client_module
        # we use megatron version to differentiate between the old and new
        # megatron-lm source code
        if MegatronLayerPolicy._orig_layer_class is None:
            try:
                import megatron
                from megatron.model.transformer import ParallelTransformerLayer
                MegatronLayerPolicy._orig_layer_class = ParallelTransformerLayer
            except ImportError:
                MegatronLayerPolicy._orig_layer_class = None

    def _attention_module(self):
        # Select the attention submodule for the detected megatron version.
        if MegatronLayerPolicy.version == 0:
            return self.client_module.attention
        return self.client_module.self_attention

    def get_hidden_heads(self):
        """Return (hidden_size, num_attention_heads) of the client layer.

        BUGFIX: previously read `self.client_module.attention`
        unconditionally, which breaks on new megatron (version != 0).
        """
        attention = self._attention_module()
        return attention.query_key_value.weight.shape[1], \
                attention.num_attention_heads

    def attention(self):
        """Return (is_linear, qkvw, qkvb, dense_w, dense_b, scale, mv2).

        BUGFIX: the attention submodule was previously only selected inside
        an `if self.inference:` branch, so the training path hit an
        UnboundLocalError at the return statement.
        """
        attention = self._attention_module()
        return self.linear_layer, \
                attention.query_key_value.weight, \
                attention.query_key_value.bias, \
                attention.dense.weight, \
                attention.dense.bias, \
                self.scale_attention, \
                self.is_megatron_v2

    def mlp(self, moe_type='standard'):
        """Return the MLP parameters; for MoE layers, per-expert lists are
        returned (plus the shared/residual MLP and gate for non-standard MoE).
        """
        from deepspeed.moe.utils import has_moe_layers
        moe, _ = has_moe_layers(self.client_module)
        if moe:
            moe_experts = self.client_module.mlp.deepspeed_moe.experts.deepspeed_experts if moe_type == 'standard' else \
                            self.client_module.mlp.moe.deepspeed_moe.experts.deepspeed_experts
            num_experts = len(moe_experts)
            if moe_type == 'standard':
                return self.linear_layer, \
                    [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                    [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                    [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                    [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)]
            else:
                # Residual-MoE: also return the shared MLP and the gating
                # coefficient used to mix expert and shared outputs.
                return self.linear_layer, \
                    [moe_experts[i].dense_h_to_4h.weight for i in range(num_experts)], \
                    [moe_experts[i].dense_h_to_4h.bias for i in range(num_experts)], \
                    [moe_experts[i].dense_4h_to_h.weight for i in range(num_experts)], \
                    [moe_experts[i].dense_4h_to_h.bias for i in range(num_experts)], \
                    self.client_module.mlp.mlp.dense_h_to_4h.weight, \
                    self.client_module.mlp.mlp.dense_h_to_4h.bias, \
                    self.client_module.mlp.mlp.dense_4h_to_h.weight, \
                    self.client_module.mlp.mlp.dense_4h_to_h.bias, \
                    self.client_module.mlp.coefficient.weight
        else:
            return self.linear_layer, \
                self.client_module.mlp.dense_h_to_4h.weight, \
                self.client_module.mlp.dense_h_to_4h.bias, \
                self.client_module.mlp.dense_4h_to_h.weight, \
                self.client_module.mlp.dense_4h_to_h.bias

    def layerNorm(self):
        """Return (attn_nw, attn_nb, input_nw, input_nb)."""
        return self.client_module.post_attention_layernorm.weight, \
               self.client_module.post_attention_layernorm.bias, \
               self.client_module.input_layernorm.weight, \
               self.client_module.input_layernorm.bias
class HFGPT2LayerPolicy(DSPolicy):
    """Injection policy for HuggingFace GPT-2 transformer blocks."""
    _orig_layer_class = None

    def __init__(self, client_module, inference=True):
        # HuggingFace GPT2 uses convolutional layer instead of linear layer
        super().__init__(inference, linear_layer=False)
        self.client_module = client_module
        # Resolve the original layer class once per process (the previous
        # code re-imported on every instantiation, unlike the other policies).
        if HFGPT2LayerPolicy._orig_layer_class is None:
            try:
                import transformers
                HFGPT2LayerPolicy._orig_layer_class = \
                    transformers.models.gpt2.modeling_gpt2.GPT2Block
            except (ImportError, AttributeError):
                # Narrowed from a bare `except:`: only a missing/old
                # transformers install is expected here.
                HFGPT2LayerPolicy._orig_layer_class = None

    def get_hidden_heads(self):
        """Return (hidden_size, num_heads) of the client layer."""
        return self.client_module.attn.embed_dim, \
                self.client_module.attn.num_heads

    def attention(self):
        """Return (is_linear, qkvw, qkvb, dense_w, dense_b, scale, mv2);
        GPT-2's c_attn already holds the fused qkv projection."""
        return self.linear_layer, \
                self.client_module.attn.c_attn.weight, \
                self.client_module.attn.c_attn.bias, \
                self.client_module.attn.c_proj.weight, \
                self.client_module.attn.c_proj.bias, \
                self.scale_attention, \
                self.is_megatron_v2

    def mlp(self):
        """Return (is_linear, inter_w, inter_b, output_w, output_b)."""
        return self.linear_layer, \
                self.client_module.mlp.c_fc.weight, \
                self.client_module.mlp.c_fc.bias, \
                self.client_module.mlp.c_proj.weight, \
                self.client_module.mlp.c_proj.bias

    def layerNorm(self):
        """Return (attn_nw, attn_nb, input_nw, input_nb): ln_2 precedes the
        MLP, ln_1 precedes attention."""
        return self.client_module.ln_2.weight, \
               self.client_module.ln_2.bias, \
               self.client_module.ln_1.weight, \
               self.client_module.ln_1.bias
class GPTNEOXLayerPolicy(DSPolicy):
    """Injection policy for GPT-NeoX (megatron-based) transformer layers."""
    _orig_layer_class = None
    # version 0 -> attention submodule under `attention`;
    # otherwise it is exposed as `self_attention`.
    version = 0

    def __init__(self, client_module, inference=True, megatron_v2=True):
        super().__init__(inference, megatron_v2=megatron_v2)
        self.client_module = client_module
        if GPTNEOXLayerPolicy._orig_layer_class is None:
            try:
                import megatron
                from megatron.model.transformer import ParallelTransformerLayerPipe
                GPTNEOXLayerPolicy._orig_layer_class = ParallelTransformerLayerPipe
            except ImportError:
                GPTNEOXLayerPolicy._orig_layer_class = None

    def _attention_module(self):
        # Select the attention submodule for the detected megatron version.
        if GPTNEOXLayerPolicy.version == 0:
            return self.client_module.attention
        return self.client_module.self_attention

    def get_hidden_heads(self):
        """Return (hidden_size, num_attention_heads) of the client layer.

        BUGFIX: the original computed the version-aware attention module and
        then ignored it, always reading `self.client_module.attention`, which
        breaks when version != 0.
        """
        attention = self._attention_module()
        return attention.query_key_value.weight.shape[1], \
                attention.num_attention_heads

    def attention(self):
        """Return (is_linear, qkvw, qkvb, dense_w, dense_b, scale, mv2)."""
        attention = self._attention_module()
        return self.linear_layer, \
                attention.query_key_value.weight, \
                attention.query_key_value.bias, \
                attention.dense.weight, \
                attention.dense.bias, \
                self.scale_attention, \
                self.is_megatron_v2

    def mlp(self):
        """Return (is_linear, inter_w, inter_b, output_w, output_b)."""
        return self.linear_layer, \
                self.client_module.mlp.dense_h_to_4h.weight, \
                self.client_module.mlp.dense_h_to_4h.bias, \
                self.client_module.mlp.dense_4h_to_h.weight, \
                self.client_module.mlp.dense_4h_to_h.bias

    def layerNorm(self):
        """Return (attn_nw, attn_nb, input_nw, input_nb)."""
        return self.client_module.post_attention_layernorm.weight, \
               self.client_module.post_attention_layernorm.bias, \
               self.client_module.input_layernorm.weight, \
               self.client_module.input_layernorm.bias
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
from .containers import HFGPT2LayerPolicy
from .containers import HFBertLayerPolicy
from .containers import BLOOMLayerPolicy
from .containers import HFGPTJLayerPolicy
from .containers import HFGPTNEOLayerPolicy
from .containers import GPTNEOXLayerPolicy
from .containers import HFOPTLayerPolicy
from .containers import MegatronLayerPolicy
from .containers import HFDistilBertLayerPolicy
from .containers import HFCLIPLayerPolicy
from .containers import UNetPolicy
from .containers import VAEPolicy
# transformer-based policies
replace_policies = [
HFBertLayerPolicy,
HFGPTNEOLayerPolicy,
......@@ -376,4 +22,11 @@ replace_policies = [
HFGPTJLayerPolicy,
MegatronLayerPolicy,
HFGPT2LayerPolicy,
BLOOMLayerPolicy,
HFOPTLayerPolicy,
HFCLIPLayerPolicy,
HFDistilBertLayerPolicy
]
# non-transformer-based policies
generic_policies = [UNetPolicy, VAEPolicy]
'''Copyright The Microsoft DeepSpeed Team'''
from deepspeed.utils import log_dist
# helper function to map between DS policies and DS containers
def policy_to_ds_container(**kwargs):
    """Instantiate the DS container matching the policy in ``kwargs``.

    Expects a non-None ``policy`` keyword argument; all kwargs are forwarded
    to the container constructor. Returns the container instance, or None
    (after logging on rank 0) when the policy type has no registered
    container.
    """
    from .containers import HFGPT2LayerPolicy, DS_GPT2Container
    from .containers import HFBertLayerPolicy, DS_BERTContainer
    from .containers import BLOOMLayerPolicy, DS_BloomContainer
    from .containers import HFGPTJLayerPolicy, DS_GPTJContainer
    from .containers import HFGPTNEOLayerPolicy, DS_GPTNEOContainer
    from .containers import GPTNEOXLayerPolicy, DS_GPTNEOXContainer
    from .containers import HFOPTLayerPolicy, DS_OPTContainer
    from .containers import MegatronLayerPolicy, DS_MegatronGPTContainer
    from .containers import HFDistilBertLayerPolicy, DS_DistilBERTContainer

    # Registry of every supported policy class and its container class.
    policy_to_container = {
        HFGPT2LayerPolicy: DS_GPT2Container,
        HFBertLayerPolicy: DS_BERTContainer,
        BLOOMLayerPolicy: DS_BloomContainer,
        HFGPTJLayerPolicy: DS_GPTJContainer,
        HFGPTNEOLayerPolicy: DS_GPTNEOContainer,
        GPTNEOXLayerPolicy: DS_GPTNEOXContainer,
        HFOPTLayerPolicy: DS_OPTContainer,
        MegatronLayerPolicy: DS_MegatronGPTContainer,
        HFDistilBertLayerPolicy: DS_DistilBERTContainer,
    }

    policy = kwargs['policy']
    assert policy is not None, "Policy cannot be None"
    policy_type = type(policy)

    container_cls = policy_to_container.get(policy_type)
    if container_cls is None:
        log_dist(f"Policy type {policy_type} not supported", [0])
        return None
    return container_cls(**kwargs)
'''Copyright The Microsoft DeepSpeed Team'''
......@@ -2,20 +2,35 @@
Copyright 2020 The Microsoft DeepSpeed Team
'''
import torch.nn.init as init
import torch
import torch.distributed as dist
from deepspeed.utils import logger, log_dist
from deepspeed.utils import log_dist
import deepspeed.utils.groups as groups
from deepspeed.utils import groups
from .sharded_moe import MOELayer, TopKGate
from .experts import Experts
import copy
import typing
class MoE(torch.nn.Module):
"""Initialize an MoE layer.
Arguments:
hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.
expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).
num_experts (int, optional): default=1, the total number of experts per layer.
ep_size (int, optional): default=1, number of ranks in the expert parallel world or group.
k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
use_rts (bool, optional): default=True, whether to use Random Token Selection.
use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).
enable_expert_tensor_parallelism (bool, optional): default=False, whether to use tensor parallelism for experts
"""
def __init__(self,
hidden_size,
expert,
......@@ -29,37 +44,21 @@ class MoE(torch.nn.Module):
noisy_gate_policy: typing.Optional[str] = None,
drop_tokens: bool = True,
use_rts=True,
use_tutel: bool = False):
"""Initialize an MoE layer.
Arguments:
hidden_size (int): the hidden dimension of the model, importantly this is also the input and output dimension.
expert (torch.nn.Module): the torch module that defines the expert (e.g., MLP, torch.linear).
num_experts (int, optional): default=1, the total number of experts per layer.
ep_size (int, optional): default=1, number of ranks in the expert parallel world or group.
k (int, optional): default=1, top-k gating value, only supports k=1 or k=2.
capacity_factor (float, optional): default=1.0, the capacity of the expert at training time.
eval_capacity_factor (float, optional): default=1.0, the capacity of the expert at eval time.
min_capacity (int, optional): default=4, the minimum capacity per expert regardless of the capacity_factor.
use_residual (bool, optional): default=False, make this MoE layer a Residual MoE (https://arxiv.org/abs/2201.05596) layer.
noisy_gate_policy (str, optional): default=None, noisy gate policy, valid options are 'Jitter', 'RSample' or 'None'.
drop_tokens (bool, optional): default=True, whether to drop tokens - (setting to False is equivalent to infinite capacity).
use_rts (bool, optional): default=True, whether to use Random Token Selection.
use_tutel (bool, optional): default=False, whether to use Tutel optimizations (if installed).
"""
use_tutel: bool = False,
enable_expert_tensor_parallelism: bool = False):
super(MoE, self).__init__()
self.use_residual = use_residual
self.ep_size = min(
ep_size,
num_experts) # the ep size should be less than the number of experts
self.enable_expert_tensor_parallelism = enable_expert_tensor_parallelism
assert num_experts % ep_size == 0, f"Number of experts ({num_experts}) should be divisible by expert parallel size ({ep_size})"
self.ep_size = ep_size
self.expert_group_name = f"ep_size_{self.ep_size}"
self.num_experts = num_experts
self.num_local_experts = 1 if num_experts < ep_size else num_experts // ep_size
self.num_local_experts = num_experts // self.ep_size
log_dist(
f'Creating MoE layer with num_experts: {num_experts} | num_local_experts: {self.num_local_experts} | expert_parallel_size: {ep_size}',
f'Creating MoE layer with num_experts: {num_experts} | num_local_experts: {self.num_local_experts} | expert_parallel_size: {self.ep_size}',
[0])
assert noisy_gate_policy is None or noisy_gate_policy in ['None', 'Jitter', 'RSample'], \
......@@ -94,9 +93,12 @@ class MoE(torch.nn.Module):
print(
f"No existing process group found, creating a new group named: {self.expert_group_name}"
)
if groups.mpu is None:
if (groups.mpu is None) or (not self.enable_expert_tensor_parallelism):
# Condition 1 - no groups.mpu means no tensor parallelism
# Condition 2 - disabling expert tensor parallelism on purpose
groups._create_expert_and_data_parallel(self.ep_size)
else:
# expert tensor parallelism is enabled
groups._create_expert_data_and_model_parallel(self.ep_size,
mpu=groups.mpu)
# Set the group handle for the MOELayer (deepspeed_moe) object
......@@ -126,6 +128,6 @@ class MoE(torch.nn.Module):
if type(output_mlp) is tuple:
output_mlp = output_mlp[0] # Ignore the bias term for now
coef = self.coefficient(hidden_states)
coef = torch.nn.functional.softmax(coef, dim=1)
coef = torch.nn.functional.softmax(coef, dim=-1)
output = output * coef[..., 0:1] + output_mlp * coef[..., 1:]
return output, self.deepspeed_moe.l_aux, self.deepspeed_moe.exp_counts
'''
Copyright 2022 The Microsoft DeepSpeed Team
'''
# The file has been adapted from the following Megatron-LM file:
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/mpu/mappings.py
# Git commit hash: 9dc3c42a84aa656f583703cf8b6b4f79f712b796
# We retain the following copyright from the original files:
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
import deepspeed
def _gather_tokens(input_, dim=0):
    """All-gather ``input_`` from every tensor-parallel rank and concatenate
    the gathered shards along dimension ``dim``."""
    mpu = deepspeed.utils.groups.mpu

    input_ = input_.contiguous()
    world_size = mpu.get_tensor_model_parallel_world_size()
    my_rank = mpu.get_tensor_model_parallel_rank()

    # One receive buffer per rank; our own slot simply aliases the input.
    shards = [torch.empty_like(input_) for _ in range(world_size)]
    shards[my_rank] = input_
    deepspeed.comm.all_gather(shards,
                              input_,
                              group=mpu.get_tensor_model_parallel_group())

    # torch.cat already produces a contiguous tensor.
    return torch.cat(shards, dim=dim).contiguous()
def _drop_tokens(input_, dim=0):
    """Return this rank's equal-size slice of ``input_`` along ``dim``."""
    mpu = deepspeed.utils.groups.mpu
    num_parts = mpu.get_tensor_model_parallel_world_size()
    part_idx = mpu.get_tensor_model_parallel_rank()
    dim_size = input_.shape[dim]
    assert dim_size % num_parts == 0, f"input dimension {dim} ({dim_size}) is not divisible by tensor parallel world size ({num_parts})"
    part_len = dim_size // num_parts
    # Zero-copy view of our chunk.
    return torch.narrow(input_, dim, part_idx * part_len, part_len)
class _GatherTokens(torch.autograd.Function):
    """All gather tokens among the tensor parallel ranks"""

    @staticmethod
    def symbolic(graph, input_, dim):
        # ONNX-export tracing path: represent the op as a plain gather.
        return _gather_tokens(input_, dim)

    @staticmethod
    def forward(ctx, input_, dim):
        # Remember the concat dimension so backward can slice along it.
        ctx.dim = dim
        return _gather_tokens(input_, dim)

    @staticmethod
    def backward(ctx, grad_output):
        # The gradient of an all-gather is this rank's slice of the incoming
        # grad; `dim` is a non-tensor input, so its gradient slot is None.
        return _drop_tokens(grad_output, ctx.dim), None
class _DropTokens(torch.autograd.Function):
    "Divide tokens equally among the tensor parallel ranks"

    @staticmethod
    def symbolic(graph, input_, dim):
        # ONNX-export tracing path: represent the op as a plain slice.
        return _drop_tokens(input_, dim)

    @staticmethod
    def forward(ctx, input_, dim):
        # Remember the slicing dimension so backward can gather along it.
        ctx.dim = dim
        return _drop_tokens(input_, dim)

    @staticmethod
    def backward(ctx, input_):
        # The gradient of taking a slice is all-gathering the incoming grads;
        # `dim` is a non-tensor input, so its gradient slot is None.
        return _gather_tokens(input_, ctx.dim), None
def gather_tokens(input_, dim=0):
    """Autograd-aware all-gather of ``input_`` across tensor-parallel ranks;
    identity when no tensor parallelism is active."""
    mpu = deepspeed.utils.groups.mpu
    if mpu is not None and mpu.get_tensor_model_parallel_world_size() != 1:
        return _GatherTokens.apply(input_, dim)
    # no tensor parallelism for non-experts
    return input_
def drop_tokens(input_, dim=0):
    """Autograd-aware split of ``input_`` across tensor-parallel ranks;
    identity when no tensor parallelism is active."""
    mpu = deepspeed.utils.groups.mpu
    if mpu is not None and mpu.get_tensor_model_parallel_world_size() != 1:
        return _DropTokens.apply(input_, dim)
    # no tensor parallelism for non-experts
    return input_
......@@ -12,17 +12,16 @@ Copyright 2021 The Microsoft DeepSpeed Team
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.
from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer
from deepspeed.utils import logger, log_dist
from typing import Callable, Dict, TYPE_CHECKING, Any, Optional, Tuple, Union, cast
from deepspeed.utils.timer import SynchronizedWallClockTimer
from deepspeed.utils import logger
from typing import Callable, Dict, TYPE_CHECKING, Any, Optional, Tuple
import time
from time import perf_counter
import torch
from torch import Tensor
import torch.distributed as dist
from torch.nn import Module, ModuleList
from torch.nn import Module
import torch.nn.functional as F
from deepspeed.utils import groups
from .mappings import drop_tokens, gather_tokens
if TYPE_CHECKING:
Base = Module[Tensor]
......@@ -80,12 +79,20 @@ def gumbel_rsample(shape: Tuple, device: torch.device) -> Tensor:
return gumbel(shape)
from deepspeed import comm as dist
# einsum dimensions: (g)roup, (s)equence, (e)xpert, (m)odel, (c)apacity
# See https://arxiv.org/pdf/2006.16668.pdf for details.
# Based on https://github.com/pytorch/pytorch/pull/40762
class _AllToAll(torch.autograd.Function):
@staticmethod
def forward(ctx: Any,
group: dist.ProcessGroup,
input: Tensor) -> Tensor: # type: ignore
def forward(
ctx: Any,
# TODO: replace with DS process group
group: torch.distributed.ProcessGroup,
input: Tensor) -> Tensor: # type: ignore
ctx.group = group
input = input.contiguous()
output = torch.empty_like(input)
......@@ -206,7 +213,7 @@ def top1gating(logits: Tensor,
# if we don't want to drop any tokens
if not drop_tokens:
new_capacity = torch.max(exp_counts).to(logits.device)
dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.group.WORLD)
dist.all_reduce(new_capacity, op=dist.ReduceOp.MAX, group=dist.get_world_group())
capacity = new_capacity
# Compute l_aux
......@@ -424,7 +431,7 @@ class TopKGate(Module):
if self.wall_clock_breakdown:
self.timers('TopKGate').stop()
self.gate_time = self.timers('TopKGate').elapsed(reset=False) * 1000
self.gate_time = self.timers('TopKGate').elapsed(reset=False)
return gate_output
......@@ -466,13 +473,17 @@ class MOELayer(Base):
self.timers = SynchronizedWallClockTimer()
self.wall_clock_breakdown = False
self.use_tutel = use_tutel and TUTEL_INSTALLED
self.use_tutel = use_tutel and TUTEL_INSTALLED and gate.k == 1
if self.use_tutel:
logger.info('Using Tutel optimizations.')
elif use_tutel and not TUTEL_INSTALLED:
logger.warning("Tutel optimization requested but not installed. "
"Proceeding without Tutel.")
elif use_tutel and TUTEL_INSTALLED and gate.k != 1:
logger.warning(
"To enable Tutel optimization, use top-1 instead of top-2 gate. "
"Proceeding without Tutel.")
def _set_ep_group(self, ep_group):
self.ep_group = ep_group
......@@ -511,11 +522,20 @@ class MOELayer(Base):
if self.wall_clock_breakdown:
self.timers('falltoall').start()
if groups._get_expert_model_parallel_world_size() == 1:
# If the non-expert is tensor-parallel, it will create
# duplicate tokens on the tensor-parallel ranks.
# Since our experts are not tensor-parallel, these duplicates
# need to be dropped to ensure correctness.
# this also doubles up as a communication optimization as we are
# reducing the all-to-all communication volume.
dispatched_input = drop_tokens(dispatched_input, dim=1)
dispatched_input = _AllToAll.apply(self.ep_group, dispatched_input)
if self.wall_clock_breakdown:
self.timers('falltoall').stop()
self.time_falltoall = self.timers('falltoall').elapsed(reset=False) * 1000
self.time_falltoall = self.timers('falltoall').elapsed(reset=False)
# Re-shape after all-to-all: ecm -> gecm
dispatched_input = dispatched_input.reshape(self.ep_size,
......@@ -532,13 +552,19 @@ class MOELayer(Base):
if self.wall_clock_breakdown:
self.timers('salltoall').stop()
self.time_salltoall = self.timers('salltoall').elapsed(reset=False) * 1000
self.time_salltoall = self.timers('salltoall').elapsed(reset=False)
# Re-shape back: gecm -> ecm
expert_output = expert_output.reshape(self.ep_size * self.num_local_experts,
-1,
d_model)
if groups._get_expert_model_parallel_world_size() == 1:
# the dropped duplicate tokens need to be gathered on each
# tensor parallel rank again for the tensor-parallel
# non-expert of the next layer.
expert_output = gather_tokens(expert_output, dim=1)
if self.use_tutel:
combined_output = self._tutel_dispatcher.decode(expert_output.view(E * C, M))
else:
......@@ -550,6 +576,6 @@ class MOELayer(Base):
if self.wall_clock_breakdown:
self.timers('moe').stop()
self.time_moe = self.timers('moe').elapsed(reset=False) * 1000
self.time_moe = self.timers('moe').elapsed(reset=False)
return a
'''Copyright The Microsoft DeepSpeed Team'''
from typing import List, Tuple, Dict
import torch
import deepspeed.utils.groups as groups
from .layer import MoE
def has_moe_layers(m):
has_moe = False
num_experts = 0
for _, module in m.named_modules():
if isinstance(module, MoE):
has_moe = True
......@@ -59,8 +61,9 @@ def split_params_grads_into_shared_and_expert_params(
return shared_grads, expert_grads
def split_params_into_different_moe_groups_for_optimizer(
param_groups: Tuple[Dict]) -> Tuple[Dict]:
def split_params_into_different_moe_groups_for_optimizer(param_groups: Tuple[Dict],
max_group_size=178956971
) -> Tuple[Dict]:
"""Split parameters into different MoE groups for optimizer
Args:
......@@ -112,8 +115,32 @@ def split_params_into_different_moe_groups_for_optimizer(
param_group['params'] = new_params
# Flatten the moe groups
for k, v in group_moe.items():
for k1, v1 in v.items():
param_groups.append(v1)
if max_group_size is not None:
for k, v in group_moe.items():
for k1, v1 in v.items():
cur_group = []
all_groups = []
size_of_cur_group = 0
for param in v1['params']:
if size_of_cur_group + param.numel() <= max_group_size:
cur_group.append(param)
size_of_cur_group += param.numel()
else:
all_groups.append(cur_group)
cur_group = [param]
size_of_cur_group = param.numel()
if cur_group:
all_groups.append(cur_group)
for group in all_groups:
new_dict = {}
for key, val in v1.items():
if key != 'params':
new_dict[key] = val
new_dict['params'] = group
param_groups.append(new_dict)
else:
for k, v in group_moe.items():
for k1, v1 in v.items():
param_groups.append(v1)
return tuple(param_groups)
'''Copyright The Microsoft DeepSpeed Team'''
'''Copyright The Microsoft DeepSpeed Team'''
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
from pydantic import root_validator
from deepspeed.runtime.config_utils import DeepSpeedConfigModel
def get_monitor_config(param_dict):
    """Build a DeepSpeedMonitorConfig from the monitor sections of `param_dict`.

    Missing sections default to empty dicts, so each sub-config falls
    back to its own defaults (disabled).
    """
    monitor_sections = ("tensorboard", "wandb", "csv_monitor")
    monitor_dict = {name: param_dict.get(name, {}) for name in monitor_sections}
    return DeepSpeedMonitorConfig(**monitor_dict)
class TensorBoardConfig(DeepSpeedConfigModel):
    """Sets parameters for TensorBoard monitor."""

    # Populated from the "tensorboard" section of the DeepSpeed config
    # (see get_monitor_config) and consumed by TensorBoardMonitor.
    enabled: bool = False
    """ Whether logging to Tensorboard is enabled. Requires `tensorboard` package is installed. """

    output_path: str = ""
    """
    Path to where the Tensorboard logs will be written. If not provided, the
    output path is set under the training script’s launching path.
    """

    job_name: str = "DeepSpeedJobName"
    """ Name for the current job. This will become a new directory inside `output_path`. """
class WandbConfig(DeepSpeedConfigModel):
    """Sets parameters for WandB monitor."""

    # Populated from the "wandb" section of the DeepSpeed config and
    # consumed by WandbMonitor (group/team/project feed wandb.init).
    # NOTE(review): `group` and `team` default to None despite the plain
    # `str` annotation; this relies on pydantic v1 implicitly treating a
    # None default as Optional — consider `Optional[str]` explicitly.
    enabled: bool = False
    """ Whether logging to WandB is enabled. Requires `wandb` package is installed. """

    group: str = None
    """ Name for the WandB group. This can be used to group together runs. """

    team: str = None
    """ Name for the WandB team. """

    project: str = "deepspeed"
    """ Name for the WandB project. """
class CSVConfig(DeepSpeedConfigModel):
    """Sets parameters for CSV monitor."""

    # Populated from the "csv_monitor" section of the DeepSpeed config
    # and consumed by csvMonitor (enabled/output_path/job_name).
    enabled: bool = False
    """ Whether logging to local CSV files is enabled. """

    output_path: str = ""
    """
    Path to where the csv files will be written. If not provided, the output
    path is set under the training script’s launching path.
    """

    job_name: str = "DeepSpeedJobName"
    """ Name for the current job. This will become a new directory inside `output_path`. """
class DeepSpeedMonitorConfig(DeepSpeedConfigModel):
    """Sets parameters for various monitoring methods."""

    tensorboard: TensorBoardConfig = {}
    """ TensorBoard monitor, requires `tensorboard` package is installed. """

    wandb: WandbConfig = {}
    """ WandB monitor, requires `wandb` package is installed. """

    csv_monitor: CSVConfig = {}
    """ Local CSV output of monitoring data. """

    @root_validator
    def check_enabled(cls, values):
        # The aggregate `enabled` flag is on exactly when at least one
        # individual monitor backend is enabled.
        values["enabled"] = any(
            values.get(name).enabled
            for name in ("tensorboard", "wandb", "csv_monitor"))
        return values
'''Copyright The Microsoft DeepSpeed Team'''
from .monitor import Monitor
import os
import deepspeed.comm as dist
class csvMonitor(Monitor):
    """Writes monitoring events to per-metric CSV files (rank 0 only)."""

    def __init__(self, csv_config):
        super().__init__(csv_config)
        # Metric files that already received their header row.
        self.filenames = []
        self.enabled = csv_config.enabled
        self.output_path = csv_config.output_path
        self.job_name = csv_config.job_name
        self.log_dir = self.setup_log_dir()

    def setup_log_dir(self, base=os.path.join(os.path.expanduser("~"), "csv_monitor")):
        """Create (on rank 0) and return the directory to write CSV files to.

        Returns None when disabled or on non-zero ranks; write_events
        guards on the same condition so the None is never dereferenced.
        """
        if self.enabled and dist.get_rank() == 0:
            if self.output_path is not None:
                log_dir = os.path.join(self.output_path, self.job_name)
            # NOTE: This code path currently is never used since the default csv_monitor output_path is an empty string and not None. Saving it in case we want this functionality in the future.
            else:
                if "DLWS_JOB_ID" in os.environ:
                    infra_job_id = os.environ["DLWS_JOB_ID"]
                elif "DLTS_JOB_ID" in os.environ:
                    infra_job_id = os.environ["DLTS_JOB_ID"]
                else:
                    infra_job_id = "unknown-job-id"
                csv_monitor_dir_name = os.path.join(infra_job_id, "logs")
                log_dir = os.path.join(base, csv_monitor_dir_name, self.job_name)
            os.makedirs(log_dir, exist_ok=True)
            return log_dir

    def write_events(self, event_list):
        """Append each event to its own CSV file.

        Each event is a tensorboard-style tuple (log_name: str, value, step: int).
        A header row ['step', <metric>] is written the first time a file is used.
        """
        if self.enabled and dist.get_rank() == 0:
            import csv
            for event in event_list:
                log_name = event[0]
                value = event[1]
                step = event[2]
                # The engine formats log strings separated by '/'; the CSV
                # column header is the last path component (or the whole
                # name when there is no '/').
                header = log_name.split('/')[-1]
                # sanitize common naming conventions into filename
                filename = log_name.replace('/', '_').replace(' ', '_')
                fname = os.path.join(self.log_dir, filename + '.csv')
                # Open file and record event. Insert header if this is the
                # first time writing. newline='' is required by the csv
                # module to avoid blank rows on \r\n platforms.
                with open(fname, 'a+', newline='') as csv_monitor_file:
                    csv_monitor_writer = csv.writer(csv_monitor_file)
                    if filename not in self.filenames:
                        self.filenames.append(filename)
                        csv_monitor_writer.writerow(['step', header])
                    csv_monitor_writer.writerow([step, value])
'''Copyright The Microsoft DeepSpeed Team'''
"""
Support different forms of monitoring such as wandb and tensorboard
"""
from abc import ABC, abstractmethod
import deepspeed.comm as dist
class Monitor(ABC):
    """Abstract base class for monitoring backends.

    Subclasses store their configuration in __init__ and implement
    write_events to record a list of (name, value, step) events.
    """

    @abstractmethod
    def __init__(self, monitor_config):
        self.monitor_config = monitor_config

    @abstractmethod
    def write_events(self, event_list):
        """Record a list of (name, value, step) event tuples."""
        pass
from .wandb import WandbMonitor
from .tensorboard import TensorBoardMonitor
from .csv_monitor import csvMonitor
class MonitorMaster(Monitor):
    """Fans monitoring events out to every enabled backend (rank 0 only)."""

    def __init__(self, monitor_config):
        super().__init__(monitor_config)
        self.tb_monitor = None
        self.wandb_monitor = None
        self.csv_monitor = None
        self.enabled = monitor_config.enabled

        # Backends are only instantiated on the global rank-0 process.
        if dist.get_rank() == 0:
            if monitor_config.tensorboard.enabled:
                self.tb_monitor = TensorBoardMonitor(monitor_config.tensorboard)
            if monitor_config.wandb.enabled:
                self.wandb_monitor = WandbMonitor(monitor_config.wandb)
            if monitor_config.csv_monitor.enabled:
                self.csv_monitor = csvMonitor(monitor_config.csv_monitor)

    def write_events(self, event_list):
        """Forward `event_list` to each active backend; no-op off rank 0."""
        if dist.get_rank() != 0:
            return
        for backend in (self.tb_monitor, self.wandb_monitor, self.csv_monitor):
            if backend is not None:
                backend.write_events(event_list)
'''Copyright The Microsoft DeepSpeed Team'''
from .utils import check_tb_availability
from .monitor import Monitor
import os
import deepspeed.comm as dist
class TensorBoardMonitor(Monitor):
    """Logs monitoring events to TensorBoard summary files (rank 0 only)."""

    def __init__(self, tensorboard_config):
        super().__init__(tensorboard_config)
        check_tb_availability()
        self.summary_writer = None
        self.enabled = tensorboard_config.enabled
        self.output_path = tensorboard_config.output_path
        self.job_name = tensorboard_config.job_name

        if self.enabled and dist.get_rank() == 0:
            self.get_summary_writer()

    def get_summary_writer(self,
                           base=os.path.join(os.path.expanduser("~"),
                                             "tensorboard")):
        """Create (on rank 0) and cache the SummaryWriter; return it (or None)."""
        if self.enabled and dist.get_rank() == 0:
            from torch.utils.tensorboard import SummaryWriter
            if self.output_path is not None:
                log_dir = os.path.join(self.output_path, self.job_name)
            # NOTE: This code path currently is never used since the default output_path is an empty string and not None. Saving it in case we want this functionality in the future.
            else:
                if "DLWS_JOB_ID" in os.environ:
                    infra_job_id = os.environ["DLWS_JOB_ID"]
                elif "DLTS_JOB_ID" in os.environ:
                    infra_job_id = os.environ["DLTS_JOB_ID"]
                else:
                    infra_job_id = "unknown-job-id"
                summary_writer_dir_name = os.path.join(infra_job_id, "logs")
                # Fix: join job_name (matching csvMonitor.setup_log_dir).
                # The previous code joined self.output_path, which is None
                # on this branch and would make os.path.join raise TypeError.
                log_dir = os.path.join(base, summary_writer_dir_name, self.job_name)
            os.makedirs(log_dir, exist_ok=True)
            self.summary_writer = SummaryWriter(log_dir=log_dir)
        return self.summary_writer

    def write_events(self, event_list, flush=True):
        """Add each (tag, value, step) event as a scalar; optionally flush."""
        if self.enabled and self.summary_writer is not None and dist.get_rank() == 0:
            for event in event_list:
                self.summary_writer.add_scalar(*event)
            if flush:
                self.summary_writer.flush()

    def flush(self):
        """Flush pending summaries to disk (rank 0 only)."""
        if self.enabled and self.summary_writer is not None and dist.get_rank() == 0:
            self.summary_writer.flush()
'''Copyright The Microsoft DeepSpeed Team'''
def check_tb_availability():
    """Verify the `tensorboard` package is importable.

    torch.utils.tensorboard fails without it — see
    https://pytorch.org/docs/1.8.0/tensorboard.html. Prints an install
    hint and re-raises the ImportError when the package is missing.
    """
    import importlib
    try:
        importlib.import_module("tensorboard")
    except ImportError:
        print('If you want to use tensorboard logging, please `pip install tensorboard`')
        raise
def check_wandb_availability():
    """Verify the `wandb` package is importable.

    Prints an install hint and re-raises the ImportError when the
    package is missing.
    """
    import importlib
    try:
        importlib.import_module("wandb")
    except ImportError:
        print(
            'If you want to use wandb logging, please `pip install wandb` and follow the instructions at https://docs.wandb.ai/quickstart'
        )
        raise
'''Copyright The Microsoft DeepSpeed Team'''
from .utils import check_wandb_availability
from .monitor import Monitor
import deepspeed.comm as dist
class WandbMonitor(Monitor):
    """Logs monitoring events to Weights & Biases (rank 0 only)."""

    def __init__(self, wandb_config):
        super().__init__(wandb_config)
        check_wandb_availability()
        import wandb

        self.enabled = wandb_config.enabled
        self.group = wandb_config.group
        self.team = wandb_config.team
        self.project = wandb_config.project

        if self.enabled and dist.get_rank() == 0:
            # `entity` is wandb's name for a team/organization.
            wandb.init(project=self.project, group=self.group, entity=self.team)

    def log(self, data, step=None, commit=None, sync=None):
        """Forward a dict of metrics to wandb.log; no-op off rank 0 or when disabled."""
        if not (self.enabled and dist.get_rank() == 0):
            return None
        import wandb
        return wandb.log(data, step=step, commit=commit, sync=sync)

    def write_events(self, event_list):
        """Log each (label, value, step) event tuple individually."""
        if self.enabled and dist.get_rank() == 0:
            for event in event_list:
                self.log({event[0]: event[1]}, step=event[2])
'''Copyright The Microsoft DeepSpeed Team'''
'''Copyright The Microsoft DeepSpeed Team'''
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
from deepspeed.runtime.config_utils import get_scalar_param, DeepSpeedConfigObject
from deepspeed.nebula.constants import *
class DeepSpeedNebulaConfig(DeepSpeedConfigObject):
    """Parses the "nebula" section of the DeepSpeed config dict.

    Nebula persists checkpoints asynchronously; the recognized keys and
    their defaults live in deepspeed/nebula/constants.py.
    """

    def __init__(self, param_dict):
        super(DeepSpeedNebulaConfig, self).__init__()

        # Declare every attribute up front so the object's shape does not
        # depend on _initialize(). Fix: load_path was previously created
        # only inside _initialize(), unlike all of its siblings.
        self.enabled = None
        self.load_path = None
        self.persistent_storage_path = None
        self.persistent_time_interval = None
        self.num_of_version_in_retention = None
        self.enable_nebula_load = None

        # `in` on a dict already tests keys; no need for `.keys()`.
        nebula_dict = param_dict[NEBULA] if NEBULA in param_dict else {}

        self._initialize(nebula_dict)

    def _initialize(self, nebula_dict):
        """Populate attributes from `nebula_dict`, falling back to defaults."""
        self.enabled = get_scalar_param(nebula_dict,
                                        NEBULA_ENABLED,
                                        NEBULA_ENABLED_DEFAULT)
        self.load_path = get_scalar_param(nebula_dict,
                                          NEBULA_LOAD_PATH,
                                          NEBULA_LOAD_PATH_DEFAULT)
        self.enable_nebula_load = get_scalar_param(nebula_dict,
                                                   NEBULA_ENABLE_NEBULA_LOAD,
                                                   NEBULA_ENABLE_NEBULA_LOAD_DEFAULT)
        self.persistent_storage_path = get_scalar_param(
            nebula_dict,
            NEBULA_PERSISTENT_STORAGE_PATH,
            NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT)
        self.persistent_time_interval = get_scalar_param(
            nebula_dict,
            NEBULA_PERSISTENT_TIME_INTERVAL,
            NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT)
        self.num_of_version_in_retention = get_scalar_param(
            nebula_dict,
            NEBULA_NUM_OF_VERSION_IN_RETENTION,
            NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT)
'''Copyright The Microsoft DeepSpeed Team'''
"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""
#########################################
# nebula
#########################################
# Nebula. By default, this feature is not enabled.
# Users can configure in ds_config.json as below example:
NEBULA_FORMAT = '''
nebula should be enabled as:
"session_params": {
"nebula": {
"enabled": true,
"persistent_storage_path": "/foo/bar",
"persistent_time_interval": 100,
"num_of_version_in_retention": 2,
"enable_nebula_load": true
}
}
'''
NEBULA = "nebula"
NEBULA_ENABLED = "enabled"
NEBULA_ENABLED_DEFAULT = False
# There is a case where customer want to load the checkpoint saved
# by raw torch. Because nebula cannot load torch checkpoint directly
# as they have different folder structures to bring the gap for
# loading(the data are totaly same in bytes for torch and enbula s
# aving).
# In this case, we must disable nebula load to use raw torch load.
# Customer can just set NEBULA_ENABLE_NEBULA_LOAD to False. Then use
# original way of deepspeed to load, i.e. set the value of "--load".
NEBULA_ENABLE_NEBULA_LOAD = "enable_nebula_load"
NEBULA_ENABLE_NEBULA_LOAD_DEFAULT = True
# When you want to resume the previous checkpoint saved by nebula,
# you can set NEBULA_LOAD_PATH as the parent folder of checkpoint.
# If NEBULA_LOAD_PATH is None, the NEBULA_PERSISTENT_STORAGE_PATH
# will be the default path to load.
NEBULA_LOAD_PATH = "nebula_load_path"
NEBULA_LOAD_PATH_DEFAULT = None
# Nebula will save the checkpoint under NEBULA_LOAD_PATH in the
# asynchronous way.
NEBULA_PERSISTENT_STORAGE_PATH = "persistent_storage_path"
NEBULA_PERSISTENT_STORAGE_PATH_DEFAULT = None
# Time interval to trigger the nebula persistence.
NEBULA_PERSISTENT_TIME_INTERVAL = "persistent_time_interval"
NEBULA_PERSISTENT_TIME_INTERVAL_DEFAULT = 100
# Checkpoint number which will be kept in memory. Let us say,
# if the value is 2. Then we have checkpoints 1 and 2 are ready
# now. When it comes to checkpoint 3, the 1 will be removed if
# 1 has been persisted to disk.
NEBULA_NUM_OF_VERSION_IN_RETENTION = "num_of_version_in_retention"
NEBULA_NUM_OF_VERSION_IN_RETENTION_DEFAULT = 2
# Neubla envs
NEBULA_EXPORT_ENVS = [
'DLTS_JOB_ID',
'DLTS_NUM_WORKER',
'NEBULA_PERSISTENT_STORAGE_PATH',
'NEBULA_PERSISTENT_TIME_INTERVAL',
'AML_RUN_ID',
'AZUREML_RUN_TOKEN',
'AZUREML_WORKSPACE_SCOPE',
'AZUREML_EXPERIMENT_SCOPE',
'AZUREML_RUN_HISTORY_SERVICE_ENDPOINT',
'AZUREML_RUN_ID',
'NEBULA_MEMORY_BUFFER_SIZE',
'AZUREML_PARAMETER_ITPJOB_NAME',
'FC_TASKROLE_NAME',
'FC_TASK_INDEX',
'MASTER_HOST',
'LOCAL_HOST',
'AZUREML_BLOB_ACCOUNT_NAME',
'AZUREML_BLOB_ACCOUNT_KEY'
]
# ITP env files
DLTS_POD_ENV_PATH = '/dlts-runtime/env/pod.env'
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment