"torchvision/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "76144badd130504abb519175a52c2f87c424bcba"
Unverified commit e2dfcadf, authored by Reza Yazdani, committed by GitHub

Fix the bias-add and add the layer-norm-eps parameter (#791)

* fix the bias-add precision and indexing, and add layer-norm-eps as a configurable parameter for the transformer layer

* add ACC_HALF config

* use defined() to check whether __ACC_HALF__ is defined
parent 48065c06
@@ -42,6 +42,7 @@ public:
     int seq_length,
     float attn_dropout_ratio,
     float hidden_output_dropout_ratio,
+    float layer_norm_eps,
     bool pre_or_postLayerNorm,
     const std::vector<std::array<int, 3>>& gemm_algos,
     bool attn_dropout_checkpoint,
......
@@ -18,11 +18,16 @@ public:
     float epsilon;
     bool training;
     bool useMean;
-    Config(uint32_t batch, uint32_t seq, uint32_t h, bool training, bool useMean = true)
+    Config(uint32_t batch,
+           uint32_t seq,
+           uint32_t h,
+           float epsilon = 1e-12,
+           bool training = true,
+           bool useMean = true)
         : batchSize(batch),
           seqLength(seq),
           hiddenDim(h),
-          epsilon(1e-12),
+          epsilon(epsilon),
          training(training),
          useMean(useMean)
     {
......
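The only behavioral change in this Config is that epsilon is now taken from the constructor instead of being hard-coded to 1e-12. Below is a minimal usage sketch; the batch/sequence/hidden values and the 1e-5 epsilon are placeholders for illustration, not taken from the commit.

// Hypothetical construction with a model-specific epsilon; 1e-5f is just an
// example of a non-BERT value that the old hard-coded 1e-12 could not express.
Normalize_Layer<float>::Config ln_cfg(/*batch*/ 8,
                                      /*seq*/ 128,
                                      /*h*/ 768,
                                      /*epsilon*/ 1e-5f,
                                      /*training*/ true,
                                      /*useMean*/ true);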
@@ -493,7 +493,7 @@ __global__ void dropout_kernel(const int N,
     m[3] = (uint8_t)(rand.w > ratio);
     float4 x_data = Xdata_cast[j];
-    float4 b_data = bias_cast[tid];
+    float4 b_data = bias_cast[j % (dim / unroll_factor)];
     x_data.x += b_data.x;
     x_data.y += b_data.y;
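In the vectorized path each float4 covers unroll_factor (= 4) consecutive channels, so a row of dim channels occupies dim / 4 float4 slots; the bias slot for a flattened float4 index j is therefore j % (dim / 4). The old tid-based index is only correct when the launch geometry happens to line up with the row width. The following is a self-contained sketch of just that indexing (dropout and RNG omitted, not the actual DeepSpeed kernel):

// Sketch: vectorized bias add where the bias column is derived from the
// flattened element index j rather than from the thread index.
__global__ void biased_add_vec4(float* x, const float* bias, int total_vec4, int dim)
{
    const int unroll_factor = 4;
    float4* x_cast = reinterpret_cast<float4*>(x);
    const float4* bias_cast = reinterpret_cast<const float4*>(bias);

    for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < total_vec4;
         j += blockDim.x * gridDim.x) {
        float4 x_data = x_cast[j];
        float4 b_data = bias_cast[j % (dim / unroll_factor)];  // column, not tid
        x_data.x += b_data.x;
        x_data.y += b_data.y;
        x_data.z += b_data.z;
        x_data.w += b_data.w;
        x_cast[j] = x_data;
    }
}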
@@ -515,7 +515,7 @@ __global__ void dropout_kernel(const int N,
     float* rand_data = &(rand.x);
     int k = 0;
     for (int i = high_index; i < N; i++) {
-        float x_data = Xdata[i] + bias[threadIdx.x % dim];
+        float x_data = Xdata[i] + bias[i % dim];
         uint8_t m = (uint8_t)(rand_data[k++] > ratio);
         Xdata[i] = x_data * scale * m;
         mask[i] = m;
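In this leftover loop the channel of element i is i % dim regardless of which thread handles it. A small worked example (numbers chosen only to illustrate the bug):

// dim = 1024, blockDim.x = 128, leftover element i = 4000:
//   correct:  bias[i % dim]           -> bias[4000 % 1024] = bias[928]
//   old code: bias[threadIdx.x % dim] -> bias[0..127], i.e. the wrong channel
float x_data = Xdata[i] + bias[i % dim];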
@@ -553,7 +553,7 @@ __global__ void dropout_kernel(const int N,
     __half2* bias_h = reinterpret_cast<__half2*>(&bias_f);
     data_f = Xdata_cast[j];
-    bias_f = bias_cast[tid];
+    bias_f = bias_cast[j % (dim / unroll_factor)];
     float2 data_h_0 = __half22float2(data_h[0]);
     float2 data_h_1 = __half22float2(data_h[1]);
@@ -595,7 +595,7 @@ __global__ void dropout_kernel(const int N,
     float* rand_data = &(rand.x);
     int k = 0;
     for (int i = high_index; i < N; i++) {
-        float x_data = (float)Xdata[i] + (float)bias[threadIdx.x % dim];
+        float x_data = (float)Xdata[i] + (float)bias[i % dim];
         uint8_t m = (uint8_t)(rand_data[k++] > ratio);
         Xdata[i] = __float2half(x_data * scale * m);
         mask[i] = m;
@@ -678,7 +678,7 @@ __global__ void dropout_kernel(const int N,
     m[3] = (uint8_t)(rand.w > ratio);
     float4 out_data;
-    float4 b_data = bias_cast[tid];
+    float4 b_data = bias_cast[j % (dim / unroll_factor)];
     float4 res_data = residual_cast[j];
     float4 inp_data = input_cast[j];
@@ -707,7 +707,7 @@ __global__ void dropout_kernel(const int N,
     float* rand_data = &(rand.x);
     int k = 0;
     for (int i = high_index; i < N; i++) {
-        float x_data = input[i] + bias[threadIdx.x % dim];
+        float x_data = input[i] + bias[i % dim];
         uint8_t m = (uint8_t)(rand_data[k++] > ratio);
         x_data = x_data * scale * m;
         x_data += residual[i];
@@ -758,7 +758,7 @@ __global__ void dropout_kernel(const int N,
     float2 input_f;
     __half2* input_h = reinterpret_cast<__half2*>(&input_f);
-    bias_f = bias_cast[tid];
+    bias_f = bias_cast[j % (dim / unroll_factor)];
     residual_f = residual_cast[j];
     input_f = input_cast[j];
@@ -813,7 +813,7 @@ __global__ void dropout_kernel(const int N,
     float* rand_data = &(rand.x);
     int k = 0;
     for (int i = high_index; i < N; i++) {
-        float x_data = (float)input[i] + (float)bias[threadIdx.x % dim];
+        float x_data = (float)input[i] + (float)bias[i % dim];
         uint8_t m = (uint8_t)(rand_data[k++] > ratio);
         x_data = x_data * scale * m;
         x_data += (float)residual[i];
......
@@ -52,6 +52,7 @@ BertTransformerLayer<T>::BertTransformerLayer(int layer_id,
     int seq_length,
     float attn_prob_dropout_ratio,
     float hidden_output_dropout_ratio,
+    float layer_norm_eps,
     bool pre_or_postLayerNorm,
     const std::vector<std::array<int, 3>>& gemm_algos,
     bool attn_dropout_checkpoint,
@@ -83,11 +84,13 @@ BertTransformerLayer<T>::BertTransformerLayer(int layer_id,
     _attn_layer_norm(typename Normalize_Layer<T>::Config(batch_size,
                                                          seq_length,
                                                          hidden_size,
+                                                         layer_norm_eps,
                                                          true,
                                                          !normalize_invertible)),
     _layer_norm(typename Normalize_Layer<T>::Config(batch_size,
                                                     seq_length,
                                                     hidden_size,
+                                                    layer_norm_eps,
                                                     true,
                                                     !normalize_invertible)),
     _ff1(typename FeedForward<T>::Config(batch_size * seq_length,
@@ -583,6 +586,7 @@ int create_transformer_layer(int layer_id,
     int intermediate_size,
     float attn_dropout_ratio,
     float hidden_dropout_ratio,
+    float layer_norm_eps,
     int seed,
     bool pre_or_postLayerNorm,
     bool test_gemm,
@@ -603,6 +607,7 @@ int create_transformer_layer(int layer_id,
     init_seq_length,
     attn_dropout_ratio,
     hidden_dropout_ratio,
+    layer_norm_eps,
     pre_or_postLayerNorm,
     Context::Instance().GetGemmAlgos(),
     attn_dropout_checkpoint,
......
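For context on what the new parameter controls: layer normalization divides by sqrt(variance + epsilon), so epsilon only guards against a near-zero variance. BERT-style models default to 1e-12, while other models use values such as 1e-5, which is why it now has to be configurable end to end. A plain reference implementation is given below as a sketch for illustration only; it is not the fused DeepSpeed kernel.

#include <math.h>

// Reference (non-fused) layer norm; shows where layer_norm_eps enters.
void layer_norm_ref(const float* x, const float* gamma, const float* beta,
                    float* y, int rows, int dim, float epsilon)
{
    for (int r = 0; r < rows; r++) {
        const float* xr = x + r * dim;
        float mean = 0.f, var = 0.f;
        for (int i = 0; i < dim; i++) mean += xr[i];
        mean /= dim;
        for (int i = 0; i < dim; i++) var += (xr[i] - mean) * (xr[i] - mean);
        var /= dim;
        float inv_std = 1.f / sqrtf(var + epsilon);  // epsilon guards var ~ 0
        for (int i = 0; i < dim; i++)
            y[r * dim + i] = (xr[i] - mean) * inv_std * gamma[i] + beta[i];
    }
}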
@@ -260,11 +260,23 @@ __global__ void bias_add_transform_0213<__half>(__half* output,
     bias_arr = bias_vec[d3];
     vals_arr = vals_vec[d3];
+#if defined(__ACC_HALF__)
     output_half[0] = vals_half[0] + bias_half[0];
     output_half[1] = vals_half[1] + bias_half[1];
     output_half[2] = vals_half[2] + bias_half[2];
     output_half[3] = vals_half[3] + bias_half[3];
+#else
+    float2 bias_arr_f[4];
+    float2 vals_arr_f[4];
+#pragma unroll
+    for (int l = 0; l < 4; l++) {
+        bias_arr_f[l] = __half22float2(bias_half[l]);
+        vals_arr_f[l] = __half22float2(vals_half[l]);
+        vals_arr_f[l].x += bias_arr_f[l].x;
+        vals_arr_f[l].y += bias_arr_f[l].y;
+        output_half[l] = __float22half2_rn(vals_arr_f[l]);
+    }
+#endif
     output_vec[d3] = output_arr;
 #endif
......
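The new #else branch keeps the data in __half2 registers but performs the addition in fp32 and rounds back once, which avoids accumulating half-precision error in the bias add when __ACC_HALF__ is not defined. The same pattern in isolation looks like the helper below (a sketch, not code taken verbatim from the kernel):

#include <cuda_fp16.h>

// Add two __half2 values with fp32 intermediate math, as in the #else branch:
// widen each lane pair to float2, add, and round back to half2.
__device__ __forceinline__ __half2 add_half2_fp32(__half2 val, __half2 bias)
{
    float2 v = __half22float2(val);
    float2 b = __half22float2(bias);
    v.x += b.x;
    v.y += b.y;
    return __float22half2_rn(v);
}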
@@ -38,6 +38,7 @@ def replace_transformer_layer(orig_layer_impl,
     hidden_dropout_ratio=bert_config.hidden_dropout_prob,
     num_hidden_layers=bert_config.num_hidden_layers,
     initializer_range=bert_config.initializer_range,
+    layer_norm_eps=bert_config.layer_norm_eps,
     seed=seed,
     fp16=fp16,
     pre_layer_norm=preln,
......
@@ -101,6 +101,7 @@ class DeepSpeedTransformerConfig(TransformerConfig):
     hidden_dropout_ratio=-1,
     num_hidden_layers=-1,
     initializer_range=-1,
+    layer_norm_eps=1e-12,
     local_rank=-1,
     seed=-1,
     fp16=False,
@@ -130,6 +131,7 @@ class DeepSpeedTransformerConfig(TransformerConfig):
     self.gelu_checkpoint = gelu_checkpoint  # True: if higher batch size is required
     self.adjust_init_range = adjust_init_range
     self.test_gemm = False
+    self.layer_norm_eps = layer_norm_eps
     self.training = training
     self.is_grad_enabled = True
     self.attn_dropout_checkpoint = attn_dropout_checkpoint
@@ -553,6 +555,7 @@ class DeepSpeedTransformerLayer(nn.Module):
     self.config.intermediate_size,
     self.config.attn_dropout_ratio,
     self.config.hidden_dropout_ratio,
+    self.config.layer_norm_eps,
     self.config.seed,
     self.config.pre_layer_norm,
     self.config.test_gemm,
......