Unverified Commit a148bd33 authored by RezaYazdaniAminabadi, committed by GitHub

Add configurable intermediate size to transformer kernels (#423)

parent a825f996
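For context, a minimal usage sketch (not part of the diff) of the new option: `intermediate_size` can now be passed to `DeepSpeedTransformerConfig`, and when omitted it falls back to `4 * hidden_size` as before. The shapes below are illustrative and the top-level import path is an assumption.

```python
# Illustrative sketch only; sizes and import path are assumptions, not part of this commit.
from deepspeed import DeepSpeedTransformerConfig

config = DeepSpeedTransformerConfig(
    batch_size=8,
    max_seq_length=128,
    hidden_size=768,
    intermediate_size=3072,   # new: feed-forward width, previously fixed at 4 * hidden_size
    heads=12,
    attn_dropout_ratio=0.1,
    hidden_dropout_ratio=0.1)
```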
@@ -69,7 +69,7 @@ public:
         if (!_workspace) {
             assert(_workspace == nullptr);
             cudaMalloc(&_workspace, size);
-        } else if (_workSpaceSize != size) {
+        } else if (_workSpaceSize < size) {
             cudaFree(_workspace);
             cudaMalloc(&_workspace, size);
         }
...
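Note on the hunk above: the workspace is now reallocated only when the requested size exceeds the current allocation, rather than on any size mismatch, so smaller requests reuse the existing buffer. A rough sketch of that grow-only policy, with plain Python standing in for cudaMalloc/cudaFree:

```python
# Sketch of the grow-only reuse policy; bytearray stands in for device memory.
_workspace = None
_workspace_size = 0

def gen_workspace(size):
    global _workspace, _workspace_size
    if _workspace is None or _workspace_size < size:
        _workspace = bytearray(size)   # reallocate only when more space is needed
        _workspace_size = size
    return _workspace
```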
@@ -20,13 +20,14 @@ template <typename T>
 size_t get_workspace_size(int maxBatchSize,
                           int seq_len,
                           int hidden_size,
+                          int intermediate_size,
                           int heads,
                           bool training,
                           bool gelu_checkpoint)
 {
     size_t workSpacesize = 4 * (size_t(maxBatchSize) * seq_len * hidden_size);
     if (training) {
-        workSpacesize += (std::max((4 * size_t(maxBatchSize) * seq_len * hidden_size),
+        workSpacesize += (std::max((size_t(maxBatchSize) * seq_len * intermediate_size),
                                    2 * (size_t(maxBatchSize) * heads * seq_len * seq_len)));
         if (gelu_checkpoint) workSpacesize += 2 * (size_t(maxBatchSize) * seq_len * hidden_size);
     }
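The training workspace now reserves room for the actual feed-forward activation (`batch * seq * intermediate_size`) instead of assuming a fixed `4 * hidden_size`. A worked example of the formula above, with assumed BERT-base-like shapes:

```python
# Element counts (not bytes), mirroring get_workspace_size above.
def workspace_elements(batch, seq, hidden, intermediate, heads,
                       training=True, gelu_checkpoint=False):
    size = 4 * batch * seq * hidden
    if training:
        size += max(batch * seq * intermediate,
                    2 * batch * heads * seq * seq)
        if gelu_checkpoint:
            size += 2 * batch * seq * hidden
    return size

# batch=8, seq=128, hidden=768, intermediate=3072, heads=12 (illustrative)
print(workspace_elements(8, 128, 768, 3072, 12))  # 6291456
```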
@@ -92,12 +93,12 @@ BertTransformerLayer<T>::BertTransformerLayer(int layer_id,
                                               false,
                                               !normalize_invertible)),
       _ff1(typename FeedForward<T>::Config(batch_size * seq_length,
-                                           4 * hidden_size,
+                                           _intermediate_size,
                                            hidden_size,
                                            gemm_algos[1])),
       _ff2(typename FeedForward<T>::Config(batch_size * seq_length,
                                            hidden_size,
-                                           4 * hidden_size,
+                                           _intermediate_size,
                                            gemm_algos[2])),
       _softmax(typename Softmax<T>::Config(batch_size, num_heads, seq_length)),
       _gelu(typename Gelu<T>::Config(_batch_size, _seq_length, _intermediate_size)),
@@ -143,8 +144,13 @@ BertTransformerLayer<T>::~BertTransformerLayer()
 template <typename T>
 void BertTransformerLayer<T>::Initialize()
 {
-    Context::Instance().GenWorkSpace(get_workspace_size<T>(
-        _batch_size, _seq_length, _hidden_size, _heads, _training, _gelu_checkpoint));
+    Context::Instance().GenWorkSpace(get_workspace_size<T>(_batch_size,
+                                                           _seq_length,
+                                                           _hidden_size,
+                                                           _intermediate_size,
+                                                           _heads,
+                                                           _training,
+                                                           _gelu_checkpoint));
     if (std::is_same<T, __half>::value) cublasSetMathMode(_cublasHandle, CUBLAS_TENSOR_OP_MATH);
 }
@@ -343,7 +349,8 @@ void BertTransformerLayer<T>::Backward(int bsz,
     T* buf_2 = buf_1 + small_buf_size;
     T* buf_3 = buf_2 + small_buf_size;
-    T* ff2_buf = buf_3 + (_gelu_checkpoint ? 3 : 1) * small_buf_size;
+    T* ff2_buf = (_gelu_checkpoint ? buf_2 + (bsz * _seq_length * _intermediate_size)
+                                   : buf_3 + small_buf_size);
     T* ctx_bufB_ptr_recomp = ff2_buf + (_seq_length * _seq_length * bsz * _heads);
     cudaStream_t streams[2] = {_stream, _stream};
...
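The backward pass carves its scratch buffers out of one workspace. With gelu checkpointing, the region after `buf_2` must hold an intermediate-sized activation, so `ff2_buf` now starts `bsz * _seq_length * _intermediate_size` elements past `buf_2` (which matches the old `buf_3 + 3 * small_buf_size` only when the intermediate size is `4 * hidden_size`). A sketch of the offsets implied by the pointer arithmetic above, in elements relative to `buf_1`; names mirror the kernel code, values are illustrative:

```python
# Offsets in elements, relative to buf_1, mirroring the pointer arithmetic above.
def ff2_offset(bsz, seq_len, hidden, intermediate, gelu_checkpoint):
    small_buf = bsz * seq_len * hidden
    buf_2 = small_buf              # buf_2 = buf_1 + small_buf_size
    buf_3 = buf_2 + small_buf      # buf_3 = buf_2 + small_buf_size
    if gelu_checkpoint:
        return buf_2 + bsz * seq_len * intermediate
    return buf_3 + small_buf
```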
@@ -18,6 +18,7 @@ class TransformerConfig():
                  batch_size,
                  max_seq_length,
                  hidden_size,
+                 intermediate_size,
                  heads,
                  attn_dropout_ratio,
                  hidden_dropout_ratio,
@@ -26,6 +27,7 @@ class TransformerConfig():
         self.layer_id = -1
         self.batch_size = batch_size
         self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
         self.max_seq_length = max_seq_length
         self.heads = heads
         self.attn_dropout_ratio = attn_dropout_ratio
@@ -44,6 +46,8 @@ class DeepSpeedTransformerConfig(TransformerConfig):
             hidden_size: The hidden size of the transformer layer
+            intermediate_size: The intermediate size of the feed-forward part of transformer layer
+
             heads: The number of heads in the self-attention of the transformer layer
             attn_dropout_ratio: The ratio of dropout for the attention's output
@@ -88,6 +92,7 @@ class DeepSpeedTransformerConfig(TransformerConfig):
                  batch_size=-1,
                  max_seq_length=-1,
                  hidden_size=-1,
+                 intermediate_size=-1,
                  heads=-1,
                  attn_dropout_ratio=-1,
                  hidden_dropout_ratio=-1,
@@ -103,9 +108,11 @@ class DeepSpeedTransformerConfig(TransformerConfig):
                  attn_dropout_checkpoint=False,
                  stochastic_mode=False):
         super(DeepSpeedTransformerConfig,
-              self).__init__(batch_size,
+              self).__init__(
+                  batch_size,
                   max_seq_length,
                   hidden_size,
+                  (intermediate_size if intermediate_size > 0 else 4 * hidden_size),
                   heads,
                   attn_dropout_ratio,
                   hidden_dropout_ratio,
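The fallback added above keeps the old behavior when `intermediate_size` is left at its default of -1. Continuing the earlier sketch, with hypothetical sizes:

```python
# Default (-1) falls back to 4 * hidden_size; an explicit value is used as-is.
cfg_default = DeepSpeedTransformerConfig(hidden_size=768, heads=12)
assert cfg_default.intermediate_size == 4 * 768      # 3072

cfg_custom = DeepSpeedTransformerConfig(hidden_size=768,
                                        intermediate_size=1536,
                                        heads=12)
assert cfg_custom.intermediate_size == 1536
```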
@@ -432,12 +439,12 @@ class DeepSpeedTransformerLayer(nn.Module):
         self.attn_nw = nn.Parameter(torch.Tensor(self.config.hidden_size))
         self.attn_nb = nn.Parameter(torch.Tensor(self.config.hidden_size))
         self.inter_w = nn.Parameter(
-            torch.Tensor(4 * self.config.hidden_size,
+            torch.Tensor(self.config.intermediate_size,
                          self.config.hidden_size))
-        self.inter_b = nn.Parameter(torch.Tensor(4 * self.config.hidden_size))
+        self.inter_b = nn.Parameter(torch.Tensor(self.config.intermediate_size))
         self.output_w = nn.Parameter(
             torch.Tensor(self.config.hidden_size,
-                         4 * self.config.hidden_size))
+                         self.config.intermediate_size))
         self.output_b = nn.Parameter(torch.Tensor(self.config.hidden_size))
         self.norm_w = nn.Parameter(torch.Tensor(self.config.hidden_size))
         self.norm_b = nn.Parameter(torch.Tensor(self.config.hidden_size))
@@ -485,7 +492,7 @@ class DeepSpeedTransformerLayer(nn.Module):
                     self.config.batch_size,
                     self.config.hidden_size,
                     self.config.heads,
-                    4 * self.config.hidden_size,
+                    self.config.intermediate_size,
                     self.config.max_seq_length,
                     self.config.attn_dropout_ratio,
                     self.config.hidden_dropout_ratio,
...
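After the parameter changes above, the intermediate projection weight has shape [intermediate_size, hidden_size] and the output projection weight [hidden_size, intermediate_size], matching PyTorch's [out_features, in_features] convention for linear weights. A quick shape check with assumed sizes:

```python
import torch
import torch.nn.functional as F

hidden, intermediate = 768, 3072          # illustrative sizes
x = torch.randn(2, 128, hidden)           # [batch, seq, hidden]
inter_w = torch.randn(intermediate, hidden)
output_w = torch.randn(hidden, intermediate)

h = F.linear(x, inter_w)                  # -> [2, 128, intermediate]
y = F.linear(h, output_w)                 # -> [2, 128, hidden]
assert y.shape == x.shape
```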
@@ -146,7 +146,7 @@ def create_models(ds_config):
         hidden_size=ds_config.hidden_size,
         num_hidden_layers=ds_config.num_hidden_layers,
         num_attention_heads=ds_config.heads,
-        intermediate_size=4 * ds_config.hidden_size,
+        intermediate_size=ds_config.intermediate_size,
         hidden_act="gelu",
         hidden_dropout_prob=ds_config.hidden_dropout_ratio,
         attention_probs_dropout_prob=ds_config.attn_dropout_ratio,
@@ -166,12 +166,12 @@ def create_models(ds_config):
     weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
     weights[4].data.fill_(1.0)
     weights.append(
-        nn.Parameter(torch.Tensor(4 * ds_config.hidden_size,
+        nn.Parameter(torch.Tensor(ds_config.intermediate_size,
                                   ds_config.hidden_size)))
     weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range)
     weights.append(
         nn.Parameter(torch.Tensor(ds_config.hidden_size,
-                                  4 * ds_config.hidden_size)))
+                                  ds_config.intermediate_size)))
     weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range)
     weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
     weights[7].data.fill_(1.0)
@@ -181,7 +181,7 @@ def create_models(ds_config):
     for i in range(4):
         biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
         biases[i + 1].data.zero_()
-    biases.append(nn.Parameter(torch.Tensor(4 * ds_config.hidden_size)))
+    biases.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size)))
     biases[5].data.zero_()
     biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
     biases[6].data.zero_()
@@ -278,6 +278,7 @@ def test_backward(batch_size,
     ds_config.layer_id = None
     ds_config.batch_size = batch_size
     ds_config.hidden_size = hidden_size
+    ds_config.intermediate_size = hidden_size
     ds_config.max_seq_length = seq_len
     ds_config.heads = heads
     ds_config.attn_dropout_ratio = 0.0
@@ -314,6 +315,7 @@ def test_backward(batch_size,
     # ds_config.layer_id = None
     # ds_config.batch_size = batch_size
     # ds_config.hidden_size = hidden_size
+    # ds_config.intermediate_size = 4 * hidden_size
     # ds_config.max_seq_length = seq_len
     # ds_config.heads = heads
     # ds_config.attn_dropout_ratio = 0.0
...
@@ -113,7 +113,7 @@ def create_models(ds_config):
         num_hidden_layers=ds_config.num_hidden_layers,
         num_attention_heads=ds_config.heads,
         batch_size=ds_config.batch_size,
-        intermediate_size=4 * ds_config.hidden_size,
+        intermediate_size=ds_config.intermediate_size,
         hidden_act="gelu",
         hidden_dropout_prob=ds_config.hidden_dropout_ratio,
         attention_probs_dropout_prob=ds_config.attn_dropout_ratio,
@@ -134,12 +134,12 @@ def create_models(ds_config):
     weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
     weights[4].data.fill_(1.0)
     weights.append(
-        nn.Parameter(torch.Tensor(4 * ds_config.hidden_size,
+        nn.Parameter(torch.Tensor(ds_config.intermediate_size,
                                   ds_config.hidden_size)))
     weights[5].data.normal_(mean=0.0, std=ds_config.initializer_range)
     weights.append(
         nn.Parameter(torch.Tensor(ds_config.hidden_size,
-                                  4 * ds_config.hidden_size)))
+                                  ds_config.intermediate_size)))
     weights[6].data.normal_(mean=0.0, std=ds_config.initializer_range)
     weights.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
     weights[7].data.fill_(1.0)
@@ -149,7 +149,7 @@ def create_models(ds_config):
     for i in range(4):
         biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
         biases[i + 1].data.zero_()
-    biases.append(nn.Parameter(torch.Tensor(4 * ds_config.hidden_size)))
+    biases.append(nn.Parameter(torch.Tensor(ds_config.intermediate_size)))
     biases[5].data.zero_()
     biases.append(nn.Parameter(torch.Tensor(ds_config.hidden_size)))
     biases[6].data.zero_()
@@ -246,6 +246,7 @@ def test_forward(batch_size,
     ds_config.layer_id = None
     ds_config.batch_size = batch_size
     ds_config.hidden_size = hidden_size
+    ds_config.intermediate_size = 4 * hidden_size
     ds_config.max_seq_length = seq_len
     ds_config.heads = heads
     ds_config.attn_dropout_ratio = 0.0
@@ -282,6 +283,7 @@ def test_forward_with_small_bsz(batch_size,
     ds_config.layer_id = None
     ds_config.batch_size = batch_size
     ds_config.hidden_size = hidden_size
+    ds_config.intermediate_size = 4 * hidden_size
     ds_config.max_seq_length = seq_len
     ds_config.heads = heads
     ds_config.attn_dropout_ratio = 0.0
@@ -316,6 +318,7 @@ def test_forward_stochastic(batch_size,
     ds_config.layer_id = None
     ds_config.batch_size = batch_size
     ds_config.hidden_size = hidden_size
+    ds_config.intermediate_size = 4 * hidden_size
     ds_config.max_seq_length = seq_len
     ds_config.heads = heads
     ds_config.attn_dropout_ratio = 0.0
...