Unverified Commit f8a7148d authored by ver217's avatar ver217 Committed by GitHub
Browse files

[kernel] move all symlinks of kernel to `colossalai._C` (#1971)

parent 7e24b9b9
......@@ -38,7 +38,6 @@ jobs:
pip install -r requirements/requirements.txt
pip install -v -e .
cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
cp /__w/ColossalAI/ColossalAI/*.so /github/home/cuda_ext_cache/
pip install -r requirements/requirements-test.txt
- name: Unit Testing
run: |
......
......@@ -36,7 +36,6 @@ jobs:
pip install -r requirements/requirements.txt
pip install -v -e .
cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
cp /__w/ColossalAI/ColossalAI/*.so /github/home/cuda_ext_cache/
pip install -r requirements/requirements-test.txt
- name: Unit Testing
run: |
......
include *.txt README.md
recursive-include requirements *.txt
recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc
recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi
from . import (
cpu_optim,
fused_optim,
layer_norm,
moe,
multihead_attention,
scaled_masked_softmax,
scaled_upper_triang_masked_softmax,
)
from torch import Tensor
class CPUAdamOptimizer:
def __init__(self, lr: float, beta1: float, beta2: float, eps: float,
weight_decay: float, adamw_mode: float) -> None: ...
def step(self, step: int, lr: float, beta1: float, beta2: float, eps: float, weight_decay: float, bias_correction: bool,
param: Tensor, grad: Tensor, exp_avg: Tensor, exp_avg_sq: Tensor, loss_scale: float) -> None: ...
from typing import List
from torch import Tensor
def multi_tensor_scale(chunk_size: int, noop_flag: Tensor, tensor_lists: List[List[Tensor]], scale: float) -> None:
...
def multi_tensor_sgd(chunk_size: int, noop_flag: Tensor, tensor_lists: List[List[Tensor]], weight_decay: float,
momentum: float, dampening: float, lr: float, nesterov: bool, first_run: bool, weight_decay_after_momentum: bool, scale: float) -> None:
...
def multi_tensor_adam(chunk_size: int, noop_flag: Tensor, tensor_lists: List[List[Tensor]], lr: float, beta1: float, beta2: float, epsilon: float, step: int, mode: int, bias_correction: int, weight_decay: float) -> None:
...
def multi_tensor_lamb(chunk_size: int, noop_flag: Tensor, tensor_lists: List[List[Tensor]], lr: float, beta1: float, beta2: float, epsilon: float, step: int, bias_correction: int, weight_decay: float, grad_averaging: int, mode: int, global_grad_norm: Tensor, max_grad_norm: float, use_nvlamb_python: bool) -> None:
...
def multi_tensor_l2norm(chunk_size: int, noop_flag: Tensor, tensor_lists: List[List[Tensor]], per_tensor_python: bool) -> None:
...
from typing import List
from torch import Tensor
def forward_affine(input: Tensor, normalized_shape: List[int], gamma: Tensor, beta: Tensor, epsilon: float) -> List[Tensor]:
...
def backward_affine(dout: Tensor, mean: Tensor, invvar: Tensor, input: Tensor,
normalized_shape: List[int], gamma: Tensor, beta: Tensor, epsilon: float) -> List[Tensor]:
...
from torch import Tensor
def cumsum_sub_one(mask: Tensor) -> Tensor:
...
def dispatch_forward(s: int, ec: int, h: int, batch_tokens: Tensor, mask: Tensor, dest_idx: Tensor) -> Tensor:
...
def dispatch_backward(s: int, ec: int, h: int, expert_grad: Tensor, mask: Tensor, dest_idx: Tensor) -> Tensor:
...
def combine_forward(s: int, e: int, c: int, h: int, expert_tokens: Tensor, logits: Tensor, mask: Tensor, dest_idx: Tensor) -> Tensor:
...
def combine_backward(s: int, e: int, c: int, h: int, tokens_grad: Tensor, expert_tokens: Tensor, logits: Tensor, mask: Tensor, dest_idx: Tensor) -> Tensor:
...
from typing import List
from torch import Tensor
from torch.distributed import ProcessGroup
def multihead_attention_fw_fp32(layer_id: int, input: Tensor, input_mask: Tensor,
in_proj_weight: Tensor, in_proj_bias: Tensor,
out_proj_weight: Tensor, out_proj_bias: Tensor,
norm_weight: Tensor, norm_bias: Tensor,
training_mode: bool, prelayernorm: bool) -> List[Tensor]:
...
def multihead_attention_fw_fp16(layer_id: int, input: Tensor, input_mask: Tensor,
in_proj_weight: Tensor, in_proj_bias: Tensor,
out_proj_weight: Tensor, out_proj_bias: Tensor,
norm_weight: Tensor, norm_bias: Tensor,
training_mode: bool, prelayernorm: bool) -> List[Tensor]:
...
def multihead_attention_bw_fp32(layer_id: int, grad_dec_output: Tensor,
output: Tensor, input: Tensor,
input_mask: Tensor, in_proj_weight: Tensor,
in_proj_bias: Tensor, out_proj_weight: Tensor,
out_proj_bias: Tensor, norm_weight: Tensor,
norm_bias: Tensor) -> List[Tensor]:
...
def multihead_attention_bw_fp16(layer_id: int, grad_dec_output: Tensor,
output: Tensor, input: Tensor,
input_mask: Tensor, in_proj_weight: Tensor,
in_proj_bias: Tensor, out_proj_weight: Tensor,
out_proj_bias: Tensor, norm_weight: Tensor,
norm_bias: Tensor) -> List[Tensor]:
...
def create_multihead_attention_fp32(layer_id: int, max_batch_tokens: int,
max_seq_len: int, hidden_dim: int, num_heads: int,
attn_prob_dropout_ratio: float,
hidden_dropout_ratio: float,
pre_or_postLayerNorm: bool,
pg: ProcessGroup) -> int:
...
def create_multihead_attention_fp16(layer_id: int, max_batch_tokens: int,
max_seq_len: int, hidden_dim: int, num_heads: int,
attn_prob_dropout_ratio: float,
hidden_dropout_ratio: float,
pre_or_postLayerNorm: bool,
pg: ProcessGroup) -> int:
...
from torch import Tensor
def forward(input: Tensor, mask: Tensor, scale: float) -> Tensor:
...
def backward(output_grads: Tensor, softmax_results: Tensor, scale: float) -> Tensor:
...
def get_batch_per_block(query_seq_len: int, key_seq_len: int, batches: int, attn_heads: int) -> int:
...
from torch import Tensor
def forward(input: Tensor, scale: float) -> Tensor:
...
def backward(output_grads: Tensor, softmax_results: Tensor, scale: float) -> Tensor:
...
......@@ -5,7 +5,7 @@ import torch
import torch.distributed as dist
try:
import colossal_C
import colossalai._C.fused_optim
except:
print('Colossalai should be built with cuda extension to use the FP16 optimizer')
......@@ -35,7 +35,7 @@ def _multi_tensor_copy_this_to_that(this, that, overflow_buf=None):
if overflow_buf:
overflow_buf.fill_(0)
# Scaling with factor `1.0` is equivalent to copy.
multi_tensor_applier(colossal_C.multi_tensor_scale, overflow_buf, [this, that], 1.0)
multi_tensor_applier(colossalai._C.fused_optim.multi_tensor_scale, overflow_buf, [this, that], 1.0)
else:
for this_, that_ in zip(this, that):
that_.copy_(this_)
......
import click
import subprocess
import click
import torch
from torch.utils.cpp_extension import CUDA_HOME
......@@ -17,7 +18,7 @@ def check_installation():
def _check_cuda_extension_installed():
try:
import colossal_C
import colossalai._C.fused_optim
is_cuda_extension_installed = u'\u2713'
except ImportError:
is_cuda_extension_installed = 'x'
......
......@@ -3,14 +3,11 @@
with some changes. """
import numbers
import torch
from torch.nn.parameter import Parameter
from torch.cuda.amp import custom_bwd, custom_fwd
from torch.nn import init
from torch.cuda.amp import custom_fwd, custom_bwd
import importlib
global colossal_layer_norm_cuda
colossal_layer_norm_cuda = None
from torch.nn.parameter import Parameter
class FusedLayerNormAffineFunction(torch.autograd.Function):
......@@ -18,13 +15,17 @@ class FusedLayerNormAffineFunction(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, input, weight, bias, normalized_shape, eps):
try:
import colossalai._C.layer_norm
except ImportError:
raise RuntimeError('FusedLayerNormAffineFunction requires cuda extensions')
ctx.normalized_shape = normalized_shape
ctx.eps = eps
input_ = input.contiguous()
weight_ = weight.contiguous()
bias_ = bias.contiguous()
output, mean, invvar = colossal_layer_norm_cuda.forward_affine(input_, ctx.normalized_shape, weight_, bias_,
output, mean, invvar = colossalai._C.layer_norm.forward_affine(input_, ctx.normalized_shape, weight_, bias_,
ctx.eps)
ctx.save_for_backward(input_, weight_, bias_, mean, invvar)
......@@ -33,11 +34,15 @@ class FusedLayerNormAffineFunction(torch.autograd.Function):
@staticmethod
@custom_bwd
def backward(ctx, grad_output):
try:
import colossalai._C.layer_norm
except ImportError:
raise RuntimeError('FusedLayerNormAffineFunction requires cuda extensions')
input_, weight_, bias_, mean, invvar = ctx.saved_tensors
grad_input = grad_weight = grad_bias = None
grad_input, grad_weight, grad_bias \
= colossal_layer_norm_cuda.backward_affine(
= colossalai._C.layer_norm.backward_affine(
grad_output.contiguous(), mean, invvar,
input_, ctx.normalized_shape,
weight_, bias_, ctx.eps)
......@@ -50,13 +55,6 @@ class MixedFusedLayerNorm(torch.nn.Module):
def __init__(self, normalized_shape, eps=1e-5, device=None, dtype=None):
super(MixedFusedLayerNorm, self).__init__()
global colossal_layer_norm_cuda
if colossal_layer_norm_cuda is None:
try:
colossal_layer_norm_cuda = importlib.import_module("colossal_layer_norm_cuda")
except ImportError:
raise RuntimeError('MixedFusedLayerNorm requires cuda extensions')
if isinstance(normalized_shape, numbers.Integral):
normalized_shape = (normalized_shape,)
self.normalized_shape = torch.Size(normalized_shape)
......
import math
import importlib
from dataclasses import dataclass
import torch
......@@ -37,21 +36,21 @@ colossal_multihead_attention = None
@dataclass
class Config:
max_batch_tokens: int # max batch token numbers
max_seq_len: int # max sequence length
hidden_size: int # size of transformer hidden layers
nhead: int # number of heads in attention
attn_prob_dropout_ratio: float # attention score dropout ratio
hidden_dropout_ratio: float # dropout ration before residual
norm_first: bool # norm_first
fp16: bool # fp16 presion
max_batch_tokens: int # max batch token numbers
max_seq_len: int # max sequence length
hidden_size: int # size of transformer hidden layers
nhead: int # number of heads in attention
attn_prob_dropout_ratio: float # attention score dropout ratio
hidden_dropout_ratio: float # dropout ration before residual
norm_first: bool # norm_first
fp16: bool # fp16 presion
class MultiHeadAttention1DFunc(Function):
@staticmethod
def forward(ctx, input, input_mask, in_proj_weight, in_proj_bias, out_proj_weight,
out_proj_bias, norm_weight, norm_bias, config):
def forward(ctx, input, input_mask, in_proj_weight, in_proj_bias, out_proj_weight, out_proj_bias, norm_weight,
norm_bias, config):
cuda_module = colossal_multihead_attention
forward_func = (cuda_module.multihead_attention_fw_fp16
if config.fp16 else cuda_module.multihead_attention_fw_fp32)
......@@ -59,13 +58,12 @@ class MultiHeadAttention1DFunc(Function):
input = input.to(torch.half)
input_mask = input_mask.to(torch.half)
(output,) = forward_func(config.layer_id, input, input_mask, in_proj_weight, in_proj_bias,
out_proj_weight, out_proj_bias, norm_weight, norm_bias,
config.training, config.norm_first)
(output,) = forward_func(config.layer_id, input, input_mask, in_proj_weight, in_proj_bias, out_proj_weight,
out_proj_bias, norm_weight, norm_bias, config.training, config.norm_first)
if config.is_grad_enabled and config.training:
ctx.save_for_backward(output, input, input_mask, in_proj_weight, in_proj_bias,
out_proj_weight, out_proj_bias, norm_weight, norm_bias)
ctx.save_for_backward(output, input, input_mask, in_proj_weight, in_proj_bias, out_proj_weight,
out_proj_bias, norm_weight, norm_bias)
ctx.config = config
return output
......@@ -98,8 +96,8 @@ class MultiHeadAttention1DFunc(Function):
ctx.config.layer_id, grad_output, output, input, input_mask, in_proj_weight,
in_proj_bias, out_proj_weight, out_proj_bias, norm_weight, norm_bias)
return (grad_input, None, grad_in_proj_weight, grad_in_proj_bias, grad_out_proj_weight,
grad_out_proj_bias, grad_norm_weight, grad_norm_bias, None)
return (grad_input, None, grad_in_proj_weight, grad_in_proj_bias, grad_out_proj_weight, grad_out_proj_bias,
grad_norm_weight, grad_norm_bias, None)
class MultiHeadAttention(nn.Module):
......@@ -121,19 +119,11 @@ class MultiHeadAttention(nn.Module):
layer_id = 0
def __init__(self,
hidden_size,
nhead,
batch_size,
max_seq_len,
dropout=0.0,
norm_first=False,
fp16=True,
pg=None):
def __init__(self, hidden_size, nhead, batch_size, max_seq_len, dropout=0.0, norm_first=False, fp16=True, pg=None):
super(MultiHeadAttention, self).__init__()
self.config = Config(batch_size * max_seq_len, max_seq_len, hidden_size, nhead, dropout,
dropout, norm_first, fp16)
self.config = Config(batch_size * max_seq_len, max_seq_len, hidden_size, nhead, dropout, dropout, norm_first,
fp16)
check_config(self.config)
self.pg = pg
self.pg_size = 1
......@@ -146,7 +136,8 @@ class MultiHeadAttention(nn.Module):
global colossal_multihead_attention
if colossal_multihead_attention is None:
try:
colossal_multihead_attention = importlib.import_module("colossal_multihead_attention")
import colossalai._C.multihead_attention
colossal_multihead_attention = colossalai._C.multihead_attention
except ImportError:
raise RuntimeError('MultiHeadAttention requires cuda extensions')
......@@ -215,14 +206,13 @@ class MultiHeadAttention(nn.Module):
with torch.no_grad():
self.in_proj_weight.copy_(
attn_qkvw_global.view(3, hs, hs)[
:, int(hs * rank_in_pg / self.pg_size):
int(hs * (rank_in_pg + 1) / self.pg_size),
:])
attn_qkvw_global.view(3, hs, hs)[:,
int(hs * rank_in_pg / self.pg_size):int(hs * (rank_in_pg + 1) /
self.pg_size), :])
self.in_proj_bias.copy_(
attn_qkvb_global.view(3, hs)[
:, int(hs * rank_in_pg / self.pg_size):
int(hs * (rank_in_pg + 1) / self.pg_size)])
attn_qkvb_global.view(3, hs)[:,
int(hs * rank_in_pg / self.pg_size):int(hs * (rank_in_pg + 1) /
self.pg_size)])
attn_ow_global = torch.empty(hs, hs)
nn.init.xavier_uniform_(attn_ow_global, 1.0)
......@@ -230,9 +220,9 @@ class MultiHeadAttention(nn.Module):
torch.distributed.broadcast(attn_ow_global, src=0, group=self.pg)
attn_ow_global = attn_ow_global.cpu()
with torch.no_grad():
self.out_proj_weight.copy_(attn_ow_global[
:, int(hs * rank_in_pg / self.pg_size):
int(hs * (rank_in_pg + 1) / self.pg_size)])
self.out_proj_weight.copy_(attn_ow_global[:,
int(hs * rank_in_pg /
self.pg_size):int(hs * (rank_in_pg + 1) / self.pg_size)])
else:
attn_qkvw = self.in_proj_weight.view(-1, hs)
......@@ -243,10 +233,7 @@ class MultiHeadAttention(nn.Module):
nn.init.xavier_uniform_(self.out_proj_weight, 1.0)
def state_dict(self, destination=None, prefix="", keep_vars=False):
destination = torch.nn.Module.state_dict(self,
destination=destination,
prefix=prefix,
keep_vars=keep_vars)
destination = torch.nn.Module.state_dict(self, destination=destination, prefix=prefix, keep_vars=keep_vars)
return destination
def forward(self, hidden_states, encoder_padding_mask):
......@@ -257,8 +244,7 @@ class MultiHeadAttention(nn.Module):
bs, sl, dim = hidden_states.size()
if bs * sl > self.config.max_batch_tokens:
raise ValueError(
f"Batch token numbers {bs * sl} exceeds the limit {self.config.max_batch_tokens}.")
raise ValueError(f"Batch token numbers {bs * sl} exceeds the limit {self.config.max_batch_tokens}.")
if sl > self.config.max_seq_len:
raise ValueError(f"Sequence length {sl} exceeds the limit {self.config.max_seq_len}.")
if len(encoder_padding_mask.size()) == 1:
......@@ -266,9 +252,8 @@ class MultiHeadAttention(nn.Module):
else:
assert bs == encoder_padding_mask.size(0) and sl == encoder_padding_mask.size(1)
output = MultiHeadAttention1DFunc.apply(hidden_states, encoder_padding_mask,
self.in_proj_weight, self.in_proj_bias,
self.out_proj_weight, self.out_proj_bias,
output = MultiHeadAttention1DFunc.apply(hidden_states, encoder_padding_mask, self.in_proj_weight,
self.in_proj_bias, self.out_proj_weight, self.out_proj_bias,
self.norm_weight, self.norm_bias, self.config)
return output.to(self.precision)
"""This code from NVIDIA Megatron
with some changes. """
import enum
import torch
import torch.nn as nn
import enum
class AttnMaskType(enum.Enum):
......@@ -23,12 +24,12 @@ class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
@staticmethod
def forward(ctx, inputs, scale):
try:
import colossal_scaled_upper_triang_masked_softmax
import colossalai._C.scaled_upper_triang_masked_softmax
except ImportError:
raise RuntimeError('ScaledUpperTriangMaskedSoftmax requires cuda extensions')
scale_t = torch.tensor([scale])
softmax_results = colossal_scaled_upper_triang_masked_softmax.forward(inputs, scale_t[0])
softmax_results = colossalai._C.scaled_upper_triang_masked_softmax.forward(inputs, scale_t[0])
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
......@@ -36,12 +37,13 @@ class ScaledUpperTriangMaskedSoftmax(torch.autograd.Function):
@staticmethod
def backward(ctx, output_grads):
try:
import colossal_scaled_upper_triang_masked_softmax
import colossalai._C.scaled_upper_triang_masked_softmax
except ImportError:
raise RuntimeError('ScaledUpperTriangMaskedSoftmax requires cuda extensions')
softmax_results, scale_t = ctx.saved_tensors
input_grads = colossal_scaled_upper_triang_masked_softmax.backward(output_grads, softmax_results, scale_t[0])
input_grads = colossalai._C.scaled_upper_triang_masked_softmax.backward(output_grads, softmax_results,
scale_t[0])
return input_grads, None
......@@ -58,26 +60,26 @@ class ScaledMaskedSoftmax(torch.autograd.Function):
@staticmethod
def forward(ctx, inputs, mask, scale):
try:
import colossal_scaled_masked_softmax
import colossalai._C.scaled_masked_softmax
except ImportError:
raise RuntimeError('ScaledMaskedSoftmax requires cuda extensions')
scale_t = torch.tensor([scale])
softmax_results = colossal_scaled_masked_softmax.forward(inputs, mask, scale_t[0])
softmax_results = colossalai._C.scaled_masked_softmax.forward(inputs, mask, scale_t[0])
ctx.save_for_backward(softmax_results, scale_t)
return softmax_results
@staticmethod
def backward(ctx, output_grads):
try:
import colossal_scaled_masked_softmax
import colossalai._C.scaled_masked_softmax
except ImportError:
raise RuntimeError('ScaledMaskedSoftmax requires cuda extensions')
softmax_results, scale_t = ctx.saved_tensors
input_grads = colossal_scaled_masked_softmax.backward(output_grads, softmax_results, scale_t[0])
input_grads = colossalai._C.scaled_masked_softmax.backward(output_grads, softmax_results, scale_t[0])
return input_grads, None, None
......@@ -184,8 +186,8 @@ class FusedScaleMaskSoftmax(nn.Module):
@staticmethod
def get_batch_per_block(sq, sk, b, np):
try:
import colossal_scaled_masked_softmax
import colossalai._C.scaled_masked_softmax
except ImportError:
raise RuntimeError('ScaledMaskedSoftmax requires cuda extensions')
return colossal_scaled_masked_softmax.get_batch_per_block(sq, sk, b, np)
return colossalai._C.scaled_masked_softmax.get_batch_per_block(sq, sk, b, np)
import torch
import torch.distributed as dist
from torch import Tensor
from typing import Any, Tuple, Optional
from torch.distributed import ProcessGroup
COL_MOE_KERNEL_FLAG = False
try:
import colossal_moe_cuda
COL_MOE_KERNEL_FLAG = True
except ImportError:
print("If you want to activate cuda mode for MoE, please install with cuda_ext!")
class AllGather(torch.autograd.Function):
@staticmethod
def forward(ctx: Any, inputs: Tensor, group: Optional[ProcessGroup] = None) -> Tensor:
if ctx is not None:
ctx.comm_grp = group
comm_size = dist.get_world_size(group)
if comm_size == 1:
return inputs.unsqueeze(0)
buffer_shape = (comm_size,) + inputs.shape
outputs = torch.empty(buffer_shape, dtype=inputs.dtype, device=inputs.device)
buffer_list = list(torch.chunk(outputs, comm_size, dim=0))
dist.all_gather(buffer_list, inputs, group=group)
return outputs
@staticmethod
def backward(ctx: Any, grad_outputs: Tensor) -> Tuple[Tensor, None]:
return ReduceScatter.forward(None, grad_outputs, ctx.comm_grp), None
class ReduceScatter(torch.autograd.Function):
@staticmethod
def forward(ctx: Any, inputs: Tensor, group: Optional[ProcessGroup] = None) -> Tensor:
if ctx is not None:
ctx.comm_grp = group
comm_size = dist.get_world_size(group)
if comm_size == 1:
return inputs.squeeze(0)
if not inputs.is_contiguous():
inputs = inputs.contiguous()
output_shape = inputs.shape[1:]
outputs = torch.empty(output_shape, dtype=inputs.dtype, device=inputs.device)
buffer_list = list(torch.chunk(inputs, comm_size, dim=0))
dist.reduce_scatter(outputs, buffer_list, group=group)
return outputs
@staticmethod
def backward(ctx: Any, grad_outputs: Tensor) -> Tuple[Tensor, None]:
return AllGather.forward(None, grad_outputs, ctx.comm_grp), None
class AllToAll(torch.autograd.Function):
"""Dispatches input tensor [e, c, h] to all experts by all_to_all_single
operation in torch.distributed.
"""
@staticmethod
def forward(ctx: Any, inputs: Tensor, group: Optional[ProcessGroup] = None) -> Tensor:
if ctx is not None:
ctx.comm_grp = group
if not inputs.is_contiguous():
inputs = inputs.contiguous()
if dist.get_world_size(group) == 1:
return inputs
output = torch.empty_like(inputs)
dist.all_to_all_single(output, inputs, group=group)
return output
@staticmethod
def backward(ctx: Any, *grad_outputs: Tensor) -> Tuple[Tensor, None]:
return AllToAll.forward(None, *grad_outputs, ctx.comm_grp), None
class MoeDispatch(torch.autograd.Function):
@staticmethod
def forward(ctx, tokens, mask, dest_idx, ec):
s = tokens.size(0)
h = tokens.size(1)
expert_input = colossal_moe_cuda.dispatch_forward(s, ec, h, tokens, mask, dest_idx)
ctx.save_for_backward(mask, dest_idx)
ctx.s = s
ctx.h = h
ctx.ec = ec
return expert_input
@staticmethod
def backward(ctx, output_grad):
mask, dest_idx = ctx.saved_tensors
d_tokens = colossal_moe_cuda.dispatch_backward(ctx.s, ctx.ec, ctx.h, output_grad, mask, dest_idx)
return d_tokens, None, None, None
class MoeCombine(torch.autograd.Function):
@staticmethod
def forward(ctx, expert_tokens, logits, mask, dest_idx, ec):
assert logits.dtype == torch.float32
s = logits.size(0)
e = logits.size(1)
c = ec // e
h = expert_tokens.size(-1)
fp16_flag = (expert_tokens.dtype == torch.float16)
cb_input = expert_tokens.to(torch.float32) if fp16_flag else expert_tokens
ctokens = colossal_moe_cuda.combine_forward(s, e, c, h, cb_input, logits, mask, dest_idx)
output = ctokens.to(torch.float16) if fp16_flag else ctokens
ctx.save_for_backward(expert_tokens, logits, mask, dest_idx)
ctx.s = s
ctx.e = e
ctx.c = c
ctx.h = h
ctx.fp16_flag = fp16_flag
return output
@staticmethod
def backward(ctx, tokens_grad):
expert_tokens, logits, mask, dest_idx = ctx.saved_tensors
cb_grad = tokens_grad.to(torch.float32) if tokens_grad.dtype is torch.float16 \
else tokens_grad
cb_input = expert_tokens.to(torch.float32) if ctx.fp16_flag else expert_tokens
d_expert, d_logits = colossal_moe_cuda.combine_backward(ctx.s, ctx.e, ctx.c, ctx.h, cb_grad, cb_input, logits,
mask, dest_idx)
d_expert = d_expert.to(torch.float16) if ctx.fp16_flag else d_expert
return d_expert, d_logits, None, None, None
def moe_cumsum(inputs: Tensor):
dim0 = inputs.size(0)
flag = (dim0 <= 1024) or (dim0 <= 2048 and dim0 % 2 == 0) or (dim0 % 4 == 0)
if flag and COL_MOE_KERNEL_FLAG:
return colossal_moe_cuda.cumsum_sub_one(inputs)
else:
return torch.cumsum(inputs, dim=0) - 1
from typing import Any, Optional, Tuple
import torch
import torch.distributed as dist
from torch import Tensor
from torch.distributed import ProcessGroup
COL_MOE_KERNEL_FLAG = False
try:
import colossalai._C.moe
COL_MOE_KERNEL_FLAG = True
except ImportError:
print("If you want to activate cuda mode for MoE, please install with cuda_ext!")
class AllGather(torch.autograd.Function):
@staticmethod
def forward(ctx: Any, inputs: Tensor, group: Optional[ProcessGroup] = None) -> Tensor:
if ctx is not None:
ctx.comm_grp = group
comm_size = dist.get_world_size(group)
if comm_size == 1:
return inputs.unsqueeze(0)
buffer_shape = (comm_size,) + inputs.shape
outputs = torch.empty(buffer_shape, dtype=inputs.dtype, device=inputs.device)
buffer_list = list(torch.chunk(outputs, comm_size, dim=0))
dist.all_gather(buffer_list, inputs, group=group)
return outputs
@staticmethod
def backward(ctx: Any, grad_outputs: Tensor) -> Tuple[Tensor, None]:
return ReduceScatter.forward(None, grad_outputs, ctx.comm_grp), None
class ReduceScatter(torch.autograd.Function):
@staticmethod
def forward(ctx: Any, inputs: Tensor, group: Optional[ProcessGroup] = None) -> Tensor:
if ctx is not None:
ctx.comm_grp = group
comm_size = dist.get_world_size(group)
if comm_size == 1:
return inputs.squeeze(0)
if not inputs.is_contiguous():
inputs = inputs.contiguous()
output_shape = inputs.shape[1:]
outputs = torch.empty(output_shape, dtype=inputs.dtype, device=inputs.device)
buffer_list = list(torch.chunk(inputs, comm_size, dim=0))
dist.reduce_scatter(outputs, buffer_list, group=group)
return outputs
@staticmethod
def backward(ctx: Any, grad_outputs: Tensor) -> Tuple[Tensor, None]:
return AllGather.forward(None, grad_outputs, ctx.comm_grp), None
class AllToAll(torch.autograd.Function):
"""Dispatches input tensor [e, c, h] to all experts by all_to_all_single
operation in torch.distributed.
"""
@staticmethod
def forward(ctx: Any, inputs: Tensor, group: Optional[ProcessGroup] = None) -> Tensor:
if ctx is not None:
ctx.comm_grp = group
if not inputs.is_contiguous():
inputs = inputs.contiguous()
if dist.get_world_size(group) == 1:
return inputs
output = torch.empty_like(inputs)
dist.all_to_all_single(output, inputs, group=group)
return output
@staticmethod
def backward(ctx: Any, *grad_outputs: Tensor) -> Tuple[Tensor, None]:
return AllToAll.forward(None, *grad_outputs, ctx.comm_grp), None
class MoeDispatch(torch.autograd.Function):
@staticmethod
def forward(ctx, tokens, mask, dest_idx, ec):
s = tokens.size(0)
h = tokens.size(1)
expert_input = colossalai._C.moe.dispatch_forward(s, ec, h, tokens, mask, dest_idx)
ctx.save_for_backward(mask, dest_idx)
ctx.s = s
ctx.h = h
ctx.ec = ec
return expert_input
@staticmethod
def backward(ctx, output_grad):
mask, dest_idx = ctx.saved_tensors
d_tokens = colossalai._C.moe.dispatch_backward(ctx.s, ctx.ec, ctx.h, output_grad, mask, dest_idx)
return d_tokens, None, None, None
class MoeCombine(torch.autograd.Function):
@staticmethod
def forward(ctx, expert_tokens, logits, mask, dest_idx, ec):
assert logits.dtype == torch.float32
s = logits.size(0)
e = logits.size(1)
c = ec // e
h = expert_tokens.size(-1)
fp16_flag = (expert_tokens.dtype == torch.float16)
cb_input = expert_tokens.to(torch.float32) if fp16_flag else expert_tokens
ctokens = colossalai._C.moe.combine_forward(s, e, c, h, cb_input, logits, mask, dest_idx)
output = ctokens.to(torch.float16) if fp16_flag else ctokens
ctx.save_for_backward(expert_tokens, logits, mask, dest_idx)
ctx.s = s
ctx.e = e
ctx.c = c
ctx.h = h
ctx.fp16_flag = fp16_flag
return output
@staticmethod
def backward(ctx, tokens_grad):
expert_tokens, logits, mask, dest_idx = ctx.saved_tensors
cb_grad = tokens_grad.to(torch.float32) if tokens_grad.dtype is torch.float16 \
else tokens_grad
cb_input = expert_tokens.to(torch.float32) if ctx.fp16_flag else expert_tokens
d_expert, d_logits = colossalai._C.moe.combine_backward(ctx.s, ctx.e, ctx.c, ctx.h, cb_grad, cb_input, logits,
mask, dest_idx)
d_expert = d_expert.to(torch.float16) if ctx.fp16_flag else d_expert
return d_expert, d_logits, None, None, None
def moe_cumsum(inputs: Tensor):
dim0 = inputs.size(0)
flag = (dim0 <= 1024) or (dim0 <= 2048 and dim0 % 2 == 0) or (dim0 % 4 == 0)
if flag and COL_MOE_KERNEL_FLAG:
return colossalai._C.moe.cumsum_sub_one(inputs)
else:
return torch.cumsum(inputs, dim=0) - 1
import math
from typing import Optional
import torch
from colossalai.registry import OPTIMIZERS
from .nvme_optimizer import NVMeOptimizer
from typing import Optional
@OPTIMIZERS.register_module
......@@ -11,7 +13,7 @@ class CPUAdam(NVMeOptimizer):
"""Implements Adam algorithm.
Supports parameters updating on both GPU and CPU, depanding on the device of paramters.
But the parameters and gradients should on the same device:
But the parameters and gradients should on the same device:
* Parameters on CPU and gradients on CPU is allowed.
* Parameters on GPU and gradients on GPU is allowed.
* Parameters on GPU and gradients on CPU is **not** allowed.
......@@ -44,7 +46,7 @@ class CPUAdam(NVMeOptimizer):
(default: False) NOT SUPPORTED yet in CPUAdam!
adamw_mode (boolean, optional): Apply L2 regularization or weight decay
True for decoupled weight decay(also known as AdamW) (default: True)
simd_log (boolean, optional): whether to show if you are using SIMD to
simd_log (boolean, optional): whether to show if you are using SIMD to
accelerate. (default: False)
nvme_offload_fraction (float, optional): Fraction of optimizer states to be offloaded to NVMe. Defaults to 0.0.
nvme_offload_dir (Optional[str], optional): Directory to save NVMe offload files.
......@@ -75,10 +77,11 @@ class CPUAdam(NVMeOptimizer):
super(CPUAdam, self).__init__(model_params, default_args, nvme_offload_fraction, nvme_offload_dir)
self.adamw_mode = adamw_mode
try:
import cpu_adam
import colossalai._C.cpu_optim
except ImportError:
raise ImportError('Please install colossalai from source code to use CPUAdam')
self.cpu_adam_op = cpu_adam.CPUAdamOptimizer(lr, betas[0], betas[1], eps, weight_decay, adamw_mode)
self.cpu_adam_op = colossalai._C.cpu_optim.CPUAdamOptimizer(lr, betas[0], betas[1], eps, weight_decay,
adamw_mode)
def torch_adam_update(self,
data,
......
......@@ -20,7 +20,7 @@ class FusedAdam(torch.optim.Optimizer):
:class:`colossalai.nn.optimizer.FusedAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
or ``torch.optim.Adam`` with ``adamw_mode=False``
:class:`colossalai.nn.optimizer.FusedAdam` may be used with or without Amp.
:class:`colossalai.nn.optimizer.FusedAdam` may be used with or without Amp.
Adam was been proposed in `Adam: A Method for Stochastic Optimization`_.
......@@ -65,10 +65,11 @@ class FusedAdam(torch.optim.Optimizer):
self.adamw_mode = 1 if adamw_mode else 0
self.set_grad_none = set_grad_none
if multi_tensor_applier.available:
import colossal_C
import colossalai._C.fused_optim
# Skip buffer
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
self.multi_tensor_adam = colossal_C.multi_tensor_adam
self.multi_tensor_adam = colossalai._C.fused_optim.multi_tensor_adam
else:
raise RuntimeError('FusedAdam requires cuda extensions')
......
......@@ -76,13 +76,13 @@ class FusedLAMB(torch.optim.Optimizer):
max_grad_norm=max_grad_norm)
super(FusedLAMB, self).__init__(params, defaults)
if multi_tensor_applier.available:
import colossal_C
self.multi_tensor_l2norm = colossal_C.multi_tensor_l2norm
import colossalai._C.fused_optim
self.multi_tensor_l2norm = colossalai._C.fused_optim.multi_tensor_l2norm
# Skip buffer
self._dummy_overflow_buf = torch.tensor([0],
dtype=torch.int,
device=self.param_groups[0]["params"][0].device)
self.multi_tensor_lamb = colossal_C.multi_tensor_lamb
self.multi_tensor_lamb = colossalai._C.fused_optim.multi_tensor_lamb
else:
raise RuntimeError('FusedLAMB requires cuda extensions')
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment