OpenDAS / deepspeed · Commits

Commit eadbbe09, authored Apr 25, 2021 by 401qingkong

    push rocm deepspeed v0.3.13

parent ab5534fc
Changes: 155
Showing 20 changed files with 2705 additions and 0 deletions (+2705, -0; page 1 of 8)
deepspeed/ops/csrc/includes/cpu_adam.h                +163  -0
deepspeed/ops/csrc/includes/cublas_wrappers.h         +69   -0
deepspeed/ops/csrc/includes/custom_cuda_layers.h      +257  -0
deepspeed/ops/csrc/includes/dropout.h                 +76   -0
deepspeed/ops/csrc/includes/ds_transformer_cuda.h     +184  -0
deepspeed/ops/csrc/includes/feed_forward.h            +93   -0
deepspeed/ops/csrc/includes/gelu.h                    +36   -0
deepspeed/ops/csrc/includes/gemm_test.h               +293  -0
deepspeed/ops/csrc/includes/general_kernels.h         +47   -0
deepspeed/ops/csrc/includes/hip/StopWatch.h           +98   -0
deepspeed/ops/csrc/includes/hip/Timer.h               +47   -0
deepspeed/ops/csrc/includes/hip/context.h             +171  -0
deepspeed/ops/csrc/includes/hip/cpu_adam.h            +163  -0
deepspeed/ops/csrc/includes/hip/cublas_wrappers.h     +69   -0
deepspeed/ops/csrc/includes/hip/custom_hip_layers.h   +257  -0
deepspeed/ops/csrc/includes/hip/dropout.h             +76   -0
deepspeed/ops/csrc/includes/hip/ds_transformer_hip.h  +184  -0
deepspeed/ops/csrc/includes/hip/feed_forward.h        +93   -0
deepspeed/ops/csrc/includes/hip/gelu.h                +36   -0
deepspeed/ops/csrc/includes/hip/gemm_test.h           +293  -0
deepspeed/ops/csrc/includes/cpu_adam.h
new file mode 100755

#pragma once

#include <cpuid.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <x86intrin.h>
#include <cassert>
#include "context.h"
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"

#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = callstr;                                                      \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }

#define TILE (1024 * 1024 * 1024)

#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_WIDTH 16
#else
#if defined(__AVX256__)
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_WIDTH 8
#endif
#endif

class Adam_Optimizer {
public:
    Adam_Optimizer(float alpha = 1e-3,
                   float betta1 = 0.9,
                   float betta2 = 0.999,
                   float eps = 1e-8,
                   float weight_decay = 0,
                   bool adamw_mode = true)
        : _alpha(alpha),
          _betta1(betta1),
          _betta2(betta2),
          _eps(eps),
          _weight_decay(weight_decay),
          _betta1_t(1.0),
          _betta2_t(1.0),
          _step(0),
          _buf_index(false),
          _adamw_mode(adamw_mode)
    {
        cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
        cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adam_Optimizer()
    {
        cudaFreeHost(_doubled_buffer[0]);
        cudaFreeHost(_doubled_buffer[1]);
    }
    void Step(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
              size_t param_size, __half* dev_param = nullptr);
    void Step_4(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sa,
                size_t param_size, __half* dev_param = nullptr);
    void Step_8(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
                size_t _param_size, __half* dev_params = nullptr);
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
    }
    inline void IncrementStep(size_t step, float beta1, float beta2)
    {
        if (beta1 != _betta1 || beta2 != _betta2) {
            _step = step;
            _betta1 = beta1;
            _betta2 = beta2;
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
        } else {
            _step++;
            if (_step != step) {
                _betta1_t = std::pow(_betta1, step);
                _betta2_t = std::pow(_betta2, step);
                _step = step;
            } else {
                _betta1_t *= _betta1;
                _betta2_t *= _betta2;
            }
        }
    }
    inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;

        _bias_correction1 = 1.0f;
        _bias_correction2 = 1.0f;
        if (bias_correction == 1) {
            _bias_correction1 = 1 - _betta1_t;
            _bias_correction2 = 1 / sqrt(1 - _betta2_t);
        }
    }

private:
#if defined(__AVX512__) or defined(__AVX256__)
    union AVX_Data {
#if defined(__AVX512__)
        __m512 data;
#else
        __m256 data;
#endif
        // float data_f[16];
    };
#endif

    float _alpha;
    float _betta1;
    float _betta2;
    float _eps;
    float _weight_decay;

    float _betta1_t;
    float _betta2_t;
    size_t _step;

    float _bias_correction1;
    float _bias_correction2;

    float* _doubled_buffer[2];
    bool _buf_index;
    bool _adamw_mode;

    cudaStream_t _streams[2];
};
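A reading aid, not part of the commit: _bias_correction1 = 1 - beta1^t and _bias_correction2 = 1 / sqrt(1 - beta2^t) are the standard Adam bias-correction terms, and IncrementStep keeps the running powers _betta1_t/_betta2_t consistent with the step count even when steps arrive out of sequence. A minimal host-side driver sketch, assuming flat FP32 buffers param/grad/exp_avg/exp_avg_sq of length n (all names below are illustrative):

    // Hypothetical driver for one Adam/AdamW update over a flat parameter buffer.
    Adam_Optimizer opt(/*alpha=*/1e-3f, /*betta1=*/0.9f, /*betta2=*/0.999f,
                       /*eps=*/1e-8f, /*weight_decay=*/0.0f, /*adamw_mode=*/true);
    opt.IncrementStep(/*step=*/1, 0.9f, 0.999f);    // advance _betta1_t / _betta2_t
    opt.update_state(1e-3f, 1e-8f, 0.0f, /*bias_correction=*/true);
    opt.Step(param, grad, exp_avg, exp_avg_sq, n);  // in-place update; dev_param omitted
    opt.SynchronizeStreams();                       // wait on both internal streams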
deepspeed/ops/csrc/includes/cublas_wrappers.h
new file mode 100644

#pragma once

#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <stdio.h>

int cublas_gemm_ex(cublasHandle_t handle,
                   cublasOperation_t transa,
                   cublasOperation_t transb,
                   int m, int n, int k,
                   const float* alpha, const float* beta,
                   const float* A, const float* B, float* C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_gemm_ex(cublasHandle_t handle,
                   cublasOperation_t transa,
                   cublasOperation_t transb,
                   int m, int n, int k,
                   const float* alpha, const float* beta,
                   const __half* A, const __half* B, __half* C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);

int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int m, int n, int k,
                                const float* alpha, const float* beta,
                                const float* A, const float* B, float* C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int m, int n, int k,
                                const float* alpha, const float* beta,
                                const __half* A, const __half* B, __half* C,
                                cublasOperation_t op_A, cublasOperation_t op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
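Only the declarations appear in this header; the definitions live in a .cu file that is not on this page, presumably forwarding to cublasGemmEx. A sketch of a typical FP32 call, with illustrative shapes and cuBLAS's column-major convention:

    // Illustrative: C (m x n) = op(A) * op(B); algo falls back to CUBLAS_GEMM_DEFAULT.
    float alpha = 1.0f, beta = 0.0f;
    cublas_gemm_ex(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, &beta, A, B, C);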
deepspeed/ops/csrc/includes/custom_cuda_layers.h
new file mode 100755

#pragma once

#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>

#include <cooperative_groups.h>
#include <curand_kernel.h>

#include "context.h"
#include "cublas_wrappers.h"

#define MAX_THREADS 1024
#define THREADS 256

#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32

// Maximum sequence-length support based on the number of threads (2048) allowed in each block and
// this MAX is 8K For higher sequence length we need to use higher Max, like for 64K : 32
#define MAX_THREAD_ITERATIONS 8  // Maximum 8K
#define MAX_WARP_NUM 32

#define MAX_REGISTERS 256

// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input, const T* bias, T* output,
                      int intermediate_size, int batch_size, cudaStream_t stream);

template <typename T>
void launch_gelu(const T* input, T* output,
                 int intermediate_size, int batch_size, cudaStream_t stream);

template <typename T>
void launch_d_gelu(T* d_output, const T* input, const T* bias,
                   int intermediate_size, int batch_size, cudaStream_t stream);

// Custom fused bias add with layer normalization
template <typename T>
void launch_bias_residual_layer_norm(T* vals, const T* residual,
                                     const T* gamma, const T* beta,
                                     float epsilon, int batch_size, int hidden_dim,
                                     cudaStream_t stream, bool preLayerNorm, bool training,
                                     T* vars, T* means);

template <typename T>
void launch_bias_residual_layer_norm(T* vals, const T* residual,
                                     const T* gamma, const T* beta,
                                     float epsilon, int batch_size, int hidden_dim,
                                     cudaStream_t stream, bool preLayerNorm, bool training,
                                     T* vars);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1, const T* out_grad2,
                                         const T* X_data, const T* vars, const T* means,
                                         const T* gamma,
                                         T* gamma_grad, T* betta_grad, T* inp_grad,
                                         int batch_size, int hidden_dim,
                                         cudaStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1, const T* out_grad2,
                                         const T* vals_hat, const T* vars, const T* gamma,
                                         T* gamma_grad, T* betta_grad, T* inp_grad,
                                         int batch_size, int hidden_dim,
                                         cudaStream_t stream[2],
                                         bool invertible = false, const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward(const T* out_grad,
                               const T* X_data, const T* vars, const T* means, const T* gamma,
                               T* gamma_grad, T* betta_grad, T* inp_grad,
                               int batch_size, int hidden_dim,
                               cudaStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward(const T* out_grad,
                               const T* vals_hat, const T* vars, const T* gamma,
                               T* gamma_grad, T* betta_grad, T* inp_grad,
                               int batch_size, int hidden_dim,
                               cudaStream_t stream[2],
                               bool invertible = false, const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad, const T* vals,
                                           const T* out_grad_trans, const T* vals_trans,
                                           const T* means, const T* vars, const T* gamma,
                                           T* gamma_grad, T* betta_grad, T* inp_grad,
                                           int batch_size, int hidden_dim,
                                           cudaStream_t stream[2]);

template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, cudaStream_t stream);

template <typename T>
void launch_attn_softmax_backward(T* out_grad, const T* soft_inp,
                                  int batch_size, int heads, int seq_length,
                                  cudaStream_t stream);

template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad, const T* soft_inp,
                                     int batch_size, int heads, int seq_length,
                                     cudaStream_t stream);

// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals, const T* attn_mask,
                         int batch_size, int heads, int sequence_length,
                         cudaStream_t stream);

template <typename T>
void launch_transform_0213(T* output, const T* vals,
                           int batch_size, int seq_length, int hidden_dim, int heads,
                           cudaStream_t stream);

// Custom bias add
template <typename T>
void launch_bias_add_transform_0213(T* outputs, const T* vals, const T* bias,
                                    int batch_size, int seq_length, int hidden_dim, int heads,
                                    cudaStream_t stream, int trans_count);

// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out, const T* in,
                             int batch_size, int heads, int seq_length, int hidden_dim,
                             cudaStream_t stream, int trans_count);

template <typename T>
void launch_dropout(T* vals, const T* bias, uint8_t* mask,
                    int batch, int dim, float ratio, cudaStream_t stream);

template <typename T>
void launch_dropout(T* vals_out, const T* vals, uint8_t* mask,
                    int total_count, int dim, float ratio, cudaStream_t stream,
                    bool bwd = false);

template <typename T>
void launch_dropout(T* out, const T* vals, const T* residual, const T* bias, uint8_t* mask,
                    int batch, int dim, float ratio, cudaStream_t stream);

template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask,
                         int total_count, float ratio, cudaStream_t stream);

template <typename T>
void launch_dropout_grad(T* vals_out, const T* vals, uint8_t* mask,
                         int total_count, float ratio, cudaStream_t stream);

template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp, T* out,
                                       int rows, int cols, cudaStream_t stream);

void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream);
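A short sketch, not part of the commit, of invoking the fused bias-GeLU launcher; it assumes device buffers d_input, d_bias, d_output are already allocated with batch_size * intermediate_size elements, and reuses the Context singleton that this header includes:

    // Illustrative: fused bias + GeLU over a [batch_size, intermediate_size] activation.
    cudaStream_t stream = Context::Instance().GetCurrentStream();
    launch_bias_gelu<float>(d_input, d_bias, d_output, intermediate_size, batch_size, stream);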
deepspeed/ops/csrc/includes/dropout.h
new file mode 100644

#pragma once

#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>

template <typename T>
class Dropout {
public:
    struct Config {
        float ratio;
        uint32_t dim;
        bool training;

        Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}

        float RATIO() const { return training ? ratio : 0.0; }
        inline void SetDim(uint32_t d) { dim = d; }
    };

    Dropout(const Config& config) : _config(config), _mask(nullptr) {}

    virtual ~Dropout() {}

    void Forward(int bsz, T* out, const T* vals, cudaStream_t stream, bool bwd = false)
    {
        launch_dropout<T>(
            out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
    }

    void ForwardWithBias(int bsz, T* vals, const T* bias, cudaStream_t stream)
    {
        launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void ForwardWithBias(int bsz, T* out, const T* vals,
                         const T* residual, const T* bias, cudaStream_t stream)
    {
        launch_dropout<T>(
            out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals, cudaStream_t stream)
    {
        launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals_out, const T* d_vals, cudaStream_t stream)
    {
        launch_dropout_grad<T>(
            d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    bool HasDropout() const { return _config.RATIO() > 0.0; }

    void SetTrainingMode(bool training) { _config.training = training; }

    void SetMask(uint8_t* mask)
    {
        if (!mask) { throw std::runtime_error("Dropout mask is null."); }
        _mask = mask;
    }

    Config GetConfig() const { return _config; }

    inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }

private:
    uint8_t* _mask;
    Config _config;
};
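A usage sketch, not part of the commit: RATIO() returns 0 outside training, so the same object can serve eval mode, and the externally supplied mask buffer is written in Forward and reused in Backward. Buffer names below are illustrative:

    // Illustrative: dropout over a [bsz, hidden_dim] activation, then its backward pass.
    Dropout<float> dropout(Dropout<float>::Config(0.1f, hidden_dim));
    dropout.SetMask(d_mask);                    // uint8_t* device buffer, bsz * hidden_dim bytes
    dropout.Forward(bsz, d_out, d_in, stream);  // records which elements were kept
    dropout.Backward(bsz, d_grad, stream);      // reapplies the recorded mask in place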
deepspeed/ops/csrc/includes/ds_transformer_cuda.h
new file mode 100755

#pragma once

#include <cuda_runtime_api.h>
#include <curand.h>
#include <memory>
#include <vector>
#include "cublas_v2.h"
#include "cuda.h"
#include "dropout.h"
#include "feed_forward.h"
#include "gelu.h"
#include "general_kernels.h"
#include "normalize_layer.h"
#include "softmax.h"
#include "strided_batch_gemm.h"

struct BertGemmAlgos {
    int m_gemm_qkv_algo;
    int m_gemm_inter_algo;
    int m_gemm_output_algo;
    int m_gemm_batch1_algo;
    int m_gemm_batch2_algo;

    BertGemmAlgos()
        : m_gemm_qkv_algo(-1),
          m_gemm_inter_algo(-1),
          m_gemm_output_algo(-1),
          m_gemm_batch1_algo(-1),
          m_gemm_batch2_algo(-1)
    {
    }
};

template <typename T>
class BertTransformerLayer {
public:
    BertTransformerLayer(int layer_id, int batch_size, int hidden_size, int num_heads,
                         int intermediate_size, int seq_length,
                         float attn_dropout_ratio, float hidden_output_dropout_ratio,
                         float layer_norm_eps, bool pre_or_postLayerNorm,
                         const std::vector<std::array<int, 3>>& gemm_algos,
                         bool attn_dropout_checkpoint, bool normalize_invertible,
                         bool gelu_checkpoint, bool stochastic_mode);

    virtual ~BertTransformerLayer();

    void Forward(int bsz,
                 const T* input_ptr, const T* input_mask_ptr,
                 const T* attn_qkvw_ptr, const T* attn_qkvb_ptr,
                 const T* attn_ow_ptr, const T* attn_ob_ptr,
                 const T* attn_nw_ptr, const T* attn_nb_ptr,
                 const T* inter_w_ptr, const T* inter_b_ptr,
                 const T* output_w_ptr, const T* output_b_ptr,
                 const T* norm_w_ptr, const T* norm_b_ptr,
                 T* out_ptr, T* inp_norm_ptr,
                 T* q_tf_ptr, T* k_tf_ptr, T* v_tf_ptr,
                 T* softmax_output_ptr, T* ctx_bufB_ptr, T* attn_o_inp_ptr,
                 T* add_res_ptr, T* ff1_inp_ptr, T* gelu_inp_ptr, T* ff2_inp_ptr);

    void Backward(int bsz,
                  const T* grad_output_ptr,
                  const T* input_ptr, const T* output_ptr, const T* inp_norm_ptr,
                  const T* q_tf_ptr, const T* k_tf_ptr, const T* v_tf_ptr,
                  const T* softmax_output_ptr, const T* ctx_bufB_ptr, const T* attn_o_inp_ptr,
                  const T* add_res_ptr, const T* ff1_inp_ptr, const T* gelu_inp_ptr,
                  const T* ff2_inp_ptr, const T* input_mask_ptr,
                  const T* attn_qkvw_ptr, const T* attn_ow_ptr,
                  const T* attn_nw_ptr, const T* attn_nb_ptr,
                  const T* inter_w_ptr, const T* inter_b_ptr, const T* output_w_ptr,
                  const T* norm_w_ptr, const T* norm_b_ptr,
                  T* grad_input_ptr,
                  T* grad_attn_qkvw_ptr, T* grad_attn_qkvb_ptr,
                  T* grad_attn_ow_ptr, T* grad_attn_ob_ptr,
                  T* grad_attn_nw_ptr, T* grad_attn_nb_ptr,
                  T* grad_inter_w_ptr, T* grad_inter_b_ptr,
                  T* grad_output_w_ptr, T* grad_output_b_ptr,
                  T* grad_norm_w_ptr, T* grad_norm_b_ptr);

    void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
                                uint8_t* attn_output_dropout_mask_ptr,
                                uint8_t* layer_output_dropout_mask_ptr,
                                T* layer_norm_var, T* layer_norm_mean,
                                T* attn_layer_norm_var, T* attn_layer_norm_mean);

    inline int GetBatchSize() const { return _batch_size; }
    inline int GetNumHeads() const { return _heads; }
    inline int GetSeqLength() const { return _seq_length; }
    inline int GetIntermediateSize() const { return _intermediate_size; }

    void SetSeqLength(int seq_len);
    inline int GetHiddenSize() const { return _hidden_size; }
    void SetTrainingMode(bool training);
    inline bool IsTrainingMode() const { return _training; }
    inline bool GeluCheckpoint() const { return _gelu_checkpoint; }

private:
    void Initialize();
    size_t getWorkspaceSize(int maxBatchSize) const;

    // Params
    int _layer_id;
    int _batch_size;
    int _hidden_size;
    int _heads;
    int _size_per_head;
    int _intermediate_size;
    int _seq_length;

    bool _pre_or_postLayerNorm;

    cublasHandle_t _cublasHandle;
    cudaStream_t _stream;

    // layers
    FeedForward<T> _qkv_linear;
    FeedForward<T> _attn_out_linear;
    Normalize_Layer<T> _attn_layer_norm;
    Normalize_Layer<T> _layer_norm;
    Normalize_Layer<T>* _last_normalize;
    FeedForward<T> _ff1, _ff2;
    Softmax<T> _softmax;
    Gelu<T> _gelu;
    Dropout<T> _attn_prob_dropout;
    Dropout<T> _attn_output_dropout;
    Dropout<T> _layer_output_dropout;
    StridedBatchGemm<T> _attn_scores;
    StridedBatchGemm<T> _attn_context;

    bool _training;

    // Memory saving flags
    bool _attn_dropout_checkpoint;
    bool _normalize_invertible;
    bool _gelu_checkpoint;

    // High Performance flags
    bool _stochastic_mode;
};
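A construction sketch, not part of the commit; it assumes the CUDA-side Context exposes the same GetGemmAlgos() accessor that the hip/context.h later on this page does, and all flag values are illustrative:

    // Illustrative: one FP16 transformer layer with pre-LayerNorm and no checkpointing.
    BertTransformerLayer<__half> layer(/*layer_id=*/0, batch, hidden, heads,
                                       /*intermediate_size=*/4 * hidden, seq_len,
                                       /*attn_dropout_ratio=*/0.1f,
                                       /*hidden_output_dropout_ratio=*/0.1f,
                                       /*layer_norm_eps=*/1e-12f,
                                       /*pre_or_postLayerNorm=*/true,
                                       Context::Instance().GetGemmAlgos(),  // assumed accessor
                                       /*attn_dropout_checkpoint=*/false,
                                       /*normalize_invertible=*/false,
                                       /*gelu_checkpoint=*/false,
                                       /*stochastic_mode=*/false);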
deepspeed/ops/csrc/includes/feed_forward.h
new file mode 100644

#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__

#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"

template <typename T>
class FeedForward {
public:
    struct Config {
        int batchSize, outputSize;
        int inputSize;
        std::array<int, 3> gemm_algos;
        Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
            : batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
        {
        }
    };

    FeedForward(Config config) : config_(config) {}

    ~FeedForward() {}

    void Forward(int bsz, const T* input_ptr, const T* weights, T* out,
                 cublasHandle_t& _cublasHandle)
    {
        float alpha = T(1.);
        float beta = T(0.);

        cublas_gemm_ex(_cublasHandle,
                       CUBLAS_OP_T, CUBLAS_OP_N,
                       config_.outputSize, bsz, config_.inputSize,
                       &alpha, &beta,
                       weights, input_ptr, out,
                       cublasGemmAlgo_t(config_.gemm_algos[0]));
    }
    void Backward(int bsz,
                  const T* out_grad, const T* input_ptr, const T* weights,
                  T* weights_grad, T* bias_grad,
                  cublasHandle_t& _cublasHandle, cudaStream_t& stream,
                  T* inp_grad_out = nullptr, T* out_grad_trans_out = nullptr)
    {
        float alpha = (T)1.0, beta = (T)0.0;
        cublas_gemm_ex(_cublasHandle,
                       CUBLAS_OP_N, CUBLAS_OP_T,
                       config_.inputSize, config_.outputSize, bsz,
                       &alpha, &beta,
                       input_ptr, out_grad, weights_grad,
                       cublasGemmAlgo_t(config_.gemm_algos[1]));

        cublas_gemm_ex(_cublasHandle,
                       CUBLAS_OP_N, CUBLAS_OP_N,
                       config_.inputSize, bsz, config_.outputSize,
                       &alpha, &beta,
                       weights, out_grad, inp_grad_out,
                       cublasGemmAlgo_t(config_.gemm_algos[2]));

        launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
    }

private:
    Config config_;
};

#endif
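A note on the dimension convention, inferred from the Forward call as a reading aid: cuBLAS is column-major, so passing CUBLAS_OP_T with m = outputSize, n = bsz, k = inputSize over weights stored row-major as [outputSize][inputSize] yields the row-major product out = input * W^T. A CPU reference of the same computation, with illustrative local names mirroring the Config fields:

    // Illustrative CPU reference of what Forward computes (for shape checking only).
    for (int b = 0; b < bsz; ++b)
        for (int o = 0; o < outputSize; ++o) {
            float acc = 0.f;
            for (int i = 0; i < inputSize; ++i)
                acc += input[b * inputSize + i] * weights[o * inputSize + i];
            out[b * outputSize + o] = acc;  // matches column-major C(o, b)
        }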
deepspeed/ops/csrc/includes/gelu.h
new file mode 100644

#pragma once

#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"

template <typename T>
class Gelu {
public:
    struct Config {
        uint32_t intermediate_size;
        Config(uint32_t inter_size) : intermediate_size(inter_size) {}
    };

    Gelu(const Config& config) : _config(config) {}

    virtual ~Gelu() {}

    void ForwardWithBiasAdd(int bsz, const T* input_buf, const T* bias, T* output,
                            cudaStream_t stream)
    {
        launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
    }

    void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, cudaStream_t stream)
    {
        launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
    }

private:
    Config _config;
};
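A minimal usage sketch, not part of the commit, with illustrative buffer names; the class is a thin wrapper that forwards to the launchers declared in custom_cuda_layers.h:

    // Illustrative: fused bias + GeLU on the FF1 output of one batch.
    Gelu<float> gelu(Gelu<float>::Config(intermediate_size));
    gelu.ForwardWithBiasAdd(bsz, d_ff1_out, d_inter_bias, d_gelu_out, stream);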
deepspeed/ops/csrc/includes/gemm_test.h
new file mode 100644

#pragma once

#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <memory>
#include "StopWatch.h"
#include "cublas_wrappers.h"

template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
    if (result) {
        std::cout << (std::string("CUDA runtime error: ") + +file + ":" +
                      std::to_string(line) + " \n");
    }
}

#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)

template <typename T>
class GemmTest {
public:
    GemmTest(int m, int n, int k, cublasOperation_t ta, cublasOperation_t tb, cublasHandle_t h)
        : M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K));
        check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N));
        check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N));
    }

    ~GemmTest()
    {
        check_cuda_error(cudaFree(A));
        check_cuda_error(cudaFree(B));
        check_cuda_error(cudaFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           CUBLAS_OP_T, CUBLAS_OP_N,
                           N, M, K,
                           &alpha, &beta,
                           B, A, C,
                           static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           CUBLAS_OP_N, CUBLAS_OP_T,
                           K, N, M,
                           &alpha, &beta,
                           A, C, B,
                           static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           CUBLAS_OP_N, CUBLAS_OP_N,
                           K, M, N,
                           &alpha, &beta,
                           B, C, A,
                           static_cast<cublasGemmAlgo_t>(algo));
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            cudaDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            cudaDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int M, N, K;
    cublasHandle_t handle;
    cublasOperation_t transa, transb;
    T *A, *B, *C;
};

template <typename T>
class StridedGemmTest {
public:
    StridedGemmTest(int b, int m, int n, int k,
                    cublasOperation_t ta, cublasOperation_t tb, cublasHandle_t h)
        : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K * bsz));
        check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N * bsz));
        check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N * bsz));
    }

    ~StridedGemmTest()
    {
        check_cuda_error(cudaFree(A));
        check_cuda_error(cudaFree(B));
        check_cuda_error(cudaFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            int stride_a = M * K;
            int stride_b = N * K;
            int stride_c = M * N;

            cublas_strided_batched_gemm(handle,
                                        M, N, K,
                                        &alpha, &beta,
                                        A, B, C,
                                        transa, transb,
                                        stride_a, stride_b, stride_c,
                                        bsz,
                                        static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            int mb = (transa == CUBLAS_OP_T ? K : M);
            int kb = (transa == CUBLAS_OP_T ? M : K);

            int stride_a = mb * N;
            int stride_b = N * kb;
            int stride_c = M * K;

            // B needs to be transposed.
            cublasOperation_t op_b = (transb == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);

            // Calculate d_A.
            cublas_strided_batched_gemm(handle,
                                        mb, kb, N,
                                        &alpha, &beta,
                                        (transa == CUBLAS_OP_T ? B : C),
                                        (transa == CUBLAS_OP_T ? C : B),
                                        A,
                                        CUBLAS_OP_N, op_b,
                                        stride_a, stride_b, stride_c,
                                        bsz,
                                        static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            // A needs to be transposed.
            cublasOperation_t op_a = (transa == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);

            int stride_a = M * K;
            int stride_b = M * N;
            int stride_c = N * K;

            // Calculate d_B.
            cublas_strided_batched_gemm(handle,
                                        K, N, M,
                                        &alpha, &beta,
                                        A, C, B,
                                        op_a, CUBLAS_OP_N,
                                        stride_a, stride_b, stride_c,
                                        bsz,
                                        static_cast<cublasGemmAlgo_t>(algo));
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            cudaDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            cudaDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int bsz, M, N, K;
    cublasHandle_t handle;
    cublasOperation_t transa, transb;
    T *A, *B, *C;
};
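A usage sketch, not part of the commit: Run sweeps every tensor-op algorithm id between CUBLAS_GEMM_DEFAULT_TENSOR_OP and CUBLAS_GEMM_ALGO15_TENSOR_OP, timing each over the given loop count after a warm-up, and TestAlgo returns the winners for the forward and two backward GEMMs. Shape names below are illustrative:

    // Illustrative: benchmark the QKV projection shape and pick the best algo triple.
    GemmTest<__half> test(batch * seq_len, hidden, hidden, CUBLAS_OP_T, CUBLAS_OP_N, handle);
    std::array<int, 3> best = test.TestAlgo(/*loops=*/100);  // {fw, bw1, bw2} algo ids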
deepspeed/ops/csrc/includes/general_kernels.h
new file mode 100644

#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>

#include <cooperative_groups.h>
#include <curand_kernel.h>

#include "context.h"
#include "cublas_wrappers.h"

#define THREADS 256
#define TILE_DIM 32

#define minus_infinity -1 * std::numeric_limits<float>::infinity()

#define FINAL_MASK 0xffffffff

template <typename T>
void launch_fused_add2(T* out, const T* inp1, const T* inp2,
                       int batch_size, int seq_length, int hidden_size,
                       cudaStream_t& stream);

template <typename T>
void launch_fused_add4(T* out, const T* inp1, const T* inp2, const T* inp3, const T* inp4,
                       int batch_size, int seq_length, int hidden_size,
                       cudaStream_t& stream);

template <typename T>
void launch_fused_add3(T* out, const T* inp1, const T* inp2, const T* inp3,
                       int batch_size, int seq_length, int hidden_size,
                       cudaStream_t& stream);
deepspeed/ops/csrc/includes/hip/StopWatch.h
new file mode 100644

#pragma once

#ifdef _WIN32
#include <windows.h>
#else
#include <time.h>
#endif

#ifdef _WIN32

class Stopwatch {
private:
    double m_total_time;
    LARGE_INTEGER m_start_time;

public:
    Stopwatch() { m_total_time = 0.0; }

    ~Stopwatch() {}

    void Reset() { m_total_time = 0.0; }

    void Start() { QueryPerformanceCounter(&m_start_time); }

    void Restart()
    {
        m_total_time = 0.0;
        QueryPerformanceCounter(&m_start_time);
    }

    void Stop()
    {
        LARGE_INTEGER frequency;
        LARGE_INTEGER stop_time;
        QueryPerformanceFrequency(&frequency);
        QueryPerformanceCounter(&stop_time);
        m_total_time += ((double)(stop_time.QuadPart - m_start_time.QuadPart) /
                         (double)frequency.QuadPart);
    }

    double GetTimeInSeconds() { return m_total_time; }
};

#else

class Stopwatch {
private:
    double m_total_time;
    struct timespec m_start_time;
    bool m_is_started;

public:
    Stopwatch()
    {
        m_total_time = 0.0;
        m_is_started = false;
    }

    ~Stopwatch() {}

    void Reset() { m_total_time = 0.0; }

    void Start()
    {
        clock_gettime(CLOCK_MONOTONIC, &m_start_time);
        m_is_started = true;
    }

    void Restart()
    {
        m_total_time = 0.0;
        clock_gettime(CLOCK_MONOTONIC, &m_start_time);
        m_is_started = true;
    }

    void Stop()
    {
        if (m_is_started) {
            m_is_started = false;

            struct timespec end_time;
            clock_gettime(CLOCK_MONOTONIC, &end_time);

            m_total_time += (double)(end_time.tv_sec - m_start_time.tv_sec) +
                            (double)(end_time.tv_nsec - m_start_time.tv_nsec) / 1e9;
        }
    }

    double GetTimeInSeconds()
    {
        if (m_is_started) {
            Stop();
            Start();
        }
        return m_total_time;
    }
};

#endif
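A usage sketch, not part of the commit: in the POSIX variant, calling GetTimeInSeconds() on a running timer stops it (accumulating the elapsed interval) and immediately restarts it, so totals keep accumulating across reads. The workload below is hypothetical:

    // Illustrative: time a host-side region and print accumulated seconds.
    Stopwatch sw;
    sw.Start();
    do_work();  // hypothetical workload
    sw.Stop();
    printf("%.6f s\n", sw.GetTimeInSeconds());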
deepspeed/ops/csrc/includes/hip/Timer.h
new file mode 100644

#ifndef __TIMER_H__
#define __TIMER_H__

#include <hip/hip_runtime.h>
#include <chrono>
#include "hip/hip_runtime.h"

class GPUTimer {
    hipEvent_t start, stop;

public:
    GPUTimer()
    {
        hipEventCreate(&start);
        hipEventCreate(&stop);
    }
    ~GPUTimer()
    {
        hipEventDestroy(start);
        hipEventDestroy(stop);
    }
    inline void Record() { hipEventRecord(start); }
    inline void Elapsed(float& time_elapsed)
    {
        hipEventRecord(stop);
        hipEventSynchronize(stop);
        hipEventElapsedTime(&time_elapsed, start, stop);
    }
};

class CPUTimer {
    std::chrono::high_resolution_clock::time_point start;

public:
    CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}
    inline void Reset() { start = std::chrono::high_resolution_clock::now(); }
    inline float Elapsed()
    {
        auto temp = start;
        start = std::chrono::high_resolution_clock::now();
        return (float)(std::chrono::duration_cast<std::chrono::microseconds>(start - temp)
                           .count() /
                       1e3);
    }
};

#endif
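A usage sketch, not part of the commit: Record() stamps the start event on the null stream, and Elapsed() stamps the stop event, synchronizes on it, and returns milliseconds. The timed operation below is just a stand-in:

    // Illustrative: measure device wall time with the HIP event pair.
    GPUTimer t;
    t.Record();                  // start event
    hipMemset(d_buf, 0, bytes);  // stand-in for the kernel being timed
    float ms = 0.f;
    t.Elapsed(ms);               // stop event + sync; elapsed in milliseconds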
deepspeed/ops/csrc/includes/hip/context.h
new file mode 100644

#pragma once

#include <ATen/hip/HIPContext.h>
#include <hip/hip_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand.h"

#include "gemm_test.h"

#define WARP_SIZE 32

#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        hipError_t error_code = callstr;                                                       \
        if (error_code != hipSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }

#define CUDA_1D_KERNEL_LOOP(i, n) \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)

#define CUDA_2D_KERNEL_LOOP(i, n, j, m)                                                          \
    for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
        for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)

#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144

inline int DS_GET_BLOCKS(const int N)
{
    return (std::max)(
        (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
        // Use at least 1 block, since CUDA does not allow empty block
        1);
}

class Context {
public:
    Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
    {
        hiprandCreateGenerator(&_gen, HIPRAND_RNG_PSEUDO_DEFAULT);
        hiprandSetPseudoRandomGeneratorSeed(_gen, 123);
        if (rocblas_create_handle(&_cublasHandle) != rocblas_status_success) {
            auto message = std::string("Fail to create cublas handle.");
            std::cerr << message << std::endl;
            throw std::runtime_error(message);
        }
    }

    virtual ~Context()
    {
        rocblas_destroy_handle(_cublasHandle);
        hipFree(_workspace);
    }

    static Context& Instance()
    {
        static Context _ctx;
        return _ctx;
    }

    void SetWorkSpace(void* workspace)
    {
        if (!workspace) { throw std::runtime_error("Workspace is null."); }
        _workspace = workspace;
    }

    void* GetWorkSpace() { return _workspace; }

    hiprandGenerator_t& GetRandGenerator() { return _gen; }

    hipStream_t GetCurrentStream()
    {
        // get current pytorch stream.
        hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
        return stream;
    }

    hipStream_t GetNewStream() { return at::hip::getStreamFromPoolMasqueradingAsCUDA(); }

    rocblas_handle GetCublasHandle() { return _cublasHandle; }

    std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
    {
        uint64_t offset = _curr_offset;
        _curr_offset += offset_inc;
        return std::pair<uint64_t, uint64_t>(_seed, offset);
    }

    void SetSeed(uint64_t new_seed) { _seed = new_seed; }

    void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num, int size_per_head)
    {
        // avoid rerun.
        if (_gemm_algos.size() > 0) return;

        if (test_gemm) {
            rocblas_handle handle = GetCublasHandle();

            std::unique_ptr<GemmTest<__half>> test_qkv_fw(
                new GemmTest<__half>(batch_size * seq_len,      // M
                                     head_num * size_per_head,  // N
                                     head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_inter(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     4 * head_num * size_per_head,  // N
                                     head_num * size_per_head,      // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_output(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     head_num * size_per_head,      // N
                                     4 * head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_scores(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            seq_len,                // M
                                            seq_len,                // N
                                            size_per_head,          // K
                                            rocblas_operation_transpose,
                                            rocblas_operation_none,
                                            handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_context(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            size_per_head,          // M
                                            seq_len,                // N
                                            seq_len,                // K
                                            rocblas_operation_none,
                                            rocblas_operation_none,
                                            handle));

            _gemm_algos.push_back(test_qkv_fw->TestAlgo(100));
            _gemm_algos.push_back(test_inter->TestAlgo(100));
            _gemm_algos.push_back(test_output->TestAlgo(100));
            _gemm_algos.push_back(test_attn_scores->TestAlgo(100));
            _gemm_algos.push_back(test_attn_context->TestAlgo(100));
        } else {
            // Use default algo.
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
        }
    }

    const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }

private:
    hiprandGenerator_t _gen;
    rocblas_handle _cublasHandle;
    void* _workspace;
    uint64_t _seed;
    uint64_t _curr_offset;
    std::vector<std::array<int, 3>> _gemm_algos;
};
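A usage sketch, not part of the commit: the singleton hands out the current PyTorch HIP stream and a (seed, offset) pair; IncrementOffset returns the offset as it was before the increment, so stochastic kernels can advance their RNG state without overlap:

    // Illustrative: fetch the active stream and reserve 16 RNG offsets.
    hipStream_t stream = Context::Instance().GetCurrentStream();
    auto seed_and_offset = Context::Instance().IncrementOffset(/*offset_inc=*/16);
    uint64_t seed = seed_and_offset.first;     // fixed per-context seed (42 by default)
    uint64_t offset = seed_and_offset.second;  // running counter before this increment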
deepspeed/ops/csrc/includes/hip/cpu_adam.h
new file mode 100644

#pragma once

#include <cpuid.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <x86intrin.h>
#include <cassert>
#include "context.h"
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand.h"

#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        hipError_t error_code = callstr;                                                       \
        if (error_code != hipSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }

#define TILE (1024 * 1024 * 1024)

#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_WIDTH 16
#else
#if defined(__AVX256__)
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_WIDTH 8
#endif
#endif

class Adam_Optimizer {
public:
    Adam_Optimizer(float alpha = 1e-3,
                   float betta1 = 0.9,
                   float betta2 = 0.999,
                   float eps = 1e-8,
                   float weight_decay = 0,
                   bool adamw_mode = true)
        : _alpha(alpha),
          _betta1(betta1),
          _betta2(betta2),
          _eps(eps),
          _weight_decay(weight_decay),
          _betta1_t(1.0),
          _betta2_t(1.0),
          _step(0),
          _buf_index(false),
          _adamw_mode(adamw_mode)
    {
        hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
        hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adam_Optimizer()
    {
        hipHostFree(_doubled_buffer[0]);
        hipHostFree(_doubled_buffer[1]);
    }
    void Step(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
              size_t param_size, __half* dev_param = nullptr);
    void Step_4(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sa,
                size_t param_size, __half* dev_param = nullptr);
    void Step_8(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
                size_t _param_size, __half* dev_params = nullptr);
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
    }
    inline void IncrementStep(size_t step, float beta1, float beta2)
    {
        if (beta1 != _betta1 || beta2 != _betta2) {
            _step = step;
            _betta1 = beta1;
            _betta2 = beta2;
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
        } else {
            _step++;
            if (_step != step) {
                _betta1_t = std::pow(_betta1, step);
                _betta2_t = std::pow(_betta2, step);
                _step = step;
            } else {
                _betta1_t *= _betta1;
                _betta2_t *= _betta2;
            }
        }
    }
    inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;

        _bias_correction1 = 1.0f;
        _bias_correction2 = 1.0f;
        if (bias_correction == 1) {
            _bias_correction1 = 1 - _betta1_t;
            _bias_correction2 = 1 / sqrt(1 - _betta2_t);
        }
    }

private:
#if defined(__AVX512__) or defined(__AVX256__)
    union AVX_Data {
#if defined(__AVX512__)
        __m512 data;
#else
        __m256 data;
#endif
        // float data_f[16];
    };
#endif

    float _alpha;
    float _betta1;
    float _betta2;
    float _eps;
    float _weight_decay;

    float _betta1_t;
    float _betta2_t;
    size_t _step;

    float _bias_correction1;
    float _bias_correction2;

    float* _doubled_buffer[2];
    bool _buf_index;
    bool _adamw_mode;

    hipStream_t _streams[2];
};
deepspeed/ops/csrc/includes/hip/cublas_wrappers.h
new file mode 100644

#pragma once

#include <assert.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#include <mma.h>
#include <stdio.h>

int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa,
                   rocblas_operation transb,
                   int m, int n, int k,
                   const float* alpha, const float* beta,
                   const float* A, const float* B, float* C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa,
                   rocblas_operation transb,
                   int m, int n, int k,
                   const float* alpha, const float* beta,
                   const __half* A, const __half* B, __half* C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);

int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m, int n, int k,
                                const float* alpha, const float* beta,
                                const float* A, const float* B, float* C,
                                rocblas_operation op_A, rocblas_operation op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m, int n, int k,
                                const float* alpha, const float* beta,
                                const __half* A, const __half* B, __half* C,
                                rocblas_operation op_A, rocblas_operation op_B,
                                int stride_A, int stride_B, int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
deepspeed/ops/csrc/includes/hip/custom_hip_layers.h
new file mode 100644

#pragma once

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>

#include <cooperative_groups.h>
#include <hiprand_kernel.h>

#include "context.h"
#include "cublas_wrappers.h"

#define MAX_THREADS 1024
#define THREADS 256

#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32

// Maximum sequence-length support based on the number of threads (2048) allowed in each block and
// this MAX is 8K For higher sequence length we need to use higher Max, like for 64K : 32
#define MAX_THREAD_ITERATIONS 8  // Maximum 8K
#define MAX_WARP_NUM 32

#define MAX_REGISTERS 256

// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input, const T* bias, T* output,
                      int intermediate_size, int batch_size, hipStream_t stream);

template <typename T>
void launch_gelu(const T* input, T* output,
                 int intermediate_size, int batch_size, hipStream_t stream);

template <typename T>
void launch_d_gelu(T* d_output, const T* input, const T* bias,
                   int intermediate_size, int batch_size, hipStream_t stream);

// Custom fused bias add with layer normalization
template <typename T>
void launch_bias_residual_layer_norm(T* vals, const T* residual,
                                     const T* gamma, const T* beta,
                                     float epsilon, int batch_size, int hidden_dim,
                                     hipStream_t stream, bool preLayerNorm, bool training,
                                     T* vars, T* means);

template <typename T>
void launch_bias_residual_layer_norm(T* vals, const T* residual,
                                     const T* gamma, const T* beta,
                                     float epsilon, int batch_size, int hidden_dim,
                                     hipStream_t stream, bool preLayerNorm, bool training,
                                     T* vars);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1, const T* out_grad2,
                                         const T* X_data, const T* vars, const T* means,
                                         const T* gamma,
                                         T* gamma_grad, T* betta_grad, T* inp_grad,
                                         int batch_size, int hidden_dim,
                                         hipStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1, const T* out_grad2,
                                         const T* vals_hat, const T* vars, const T* gamma,
                                         T* gamma_grad, T* betta_grad, T* inp_grad,
                                         int batch_size, int hidden_dim,
                                         hipStream_t stream[2],
                                         bool invertible = false, const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward(const T* out_grad,
                               const T* X_data, const T* vars, const T* means, const T* gamma,
                               T* gamma_grad, T* betta_grad, T* inp_grad,
                               int batch_size, int hidden_dim,
                               hipStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward(const T* out_grad,
                               const T* vals_hat, const T* vars, const T* gamma,
                               T* gamma_grad, T* betta_grad, T* inp_grad,
                               int batch_size, int hidden_dim,
                               hipStream_t stream[2],
                               bool invertible = false, const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad, const T* vals,
                                           const T* out_grad_trans, const T* vals_trans,
                                           const T* means, const T* vars, const T* gamma,
                                           T* gamma_grad, T* betta_grad, T* inp_grad,
                                           int batch_size, int hidden_dim,
                                           hipStream_t stream[2]);

template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, hipStream_t stream);

template <typename T>
void launch_attn_softmax_backward(T* out_grad, const T* soft_inp,
                                  int batch_size, int heads, int seq_length,
                                  hipStream_t stream);

template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad, const T* soft_inp,
                                     int batch_size, int heads, int seq_length,
                                     hipStream_t stream);

// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals, const T* attn_mask,
                         int batch_size, int heads, int sequence_length,
                         hipStream_t stream);

template <typename T>
void launch_transform_0213(T* output, const T* vals,
                           int batch_size, int seq_length, int hidden_dim, int heads,
                           hipStream_t stream);

// Custom bias add
template <typename T>
void launch_bias_add_transform_0213(T* outputs, const T* vals, const T* bias,
                                    int batch_size, int seq_length, int hidden_dim, int heads,
                                    hipStream_t stream, int trans_count);

// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out, const T* in,
                             int batch_size, int heads, int seq_length, int hidden_dim,
                             hipStream_t stream, int trans_count);

template <typename T>
void launch_dropout(T* vals, const T* bias, uint8_t* mask,
                    int batch, int dim, float ratio, hipStream_t stream);

template <typename T>
void launch_dropout(T* vals_out, const T* vals, uint8_t* mask,
                    int total_count, int dim, float ratio, hipStream_t stream,
                    bool bwd = false);

template <typename T>
void launch_dropout(T* out, const T* vals, const T* residual, const T* bias, uint8_t* mask,
                    int batch, int dim, float ratio, hipStream_t stream);

template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask,
                         int total_count, float ratio, hipStream_t stream);

template <typename T>
void launch_dropout_grad(T* vals_out, const T* vals, uint8_t* mask,
                         int total_count, float ratio, hipStream_t stream);

template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp, T* out,
                                       int rows, int cols, hipStream_t stream);

void launch_param_update(const float* input, __half* output, int size, hipStream_t stream);
deepspeed/ops/csrc/includes/hip/dropout.h
new file mode 100644

#pragma once

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>

template <typename T>
class Dropout {
public:
    struct Config {
        float ratio;
        uint32_t dim;
        bool training;

        Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}

        float RATIO() const { return training ? ratio : 0.0; }
        inline void SetDim(uint32_t d) { dim = d; }
    };

    Dropout(const Config& config) : _config(config), _mask(nullptr) {}

    virtual ~Dropout() {}

    void Forward(int bsz, T* out, const T* vals, hipStream_t stream, bool bwd = false)
    {
        launch_dropout<T>(
            out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
    }

    void ForwardWithBias(int bsz, T* vals, const T* bias, hipStream_t stream)
    {
        launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void ForwardWithBias(int bsz, T* out, const T* vals,
                         const T* residual, const T* bias, hipStream_t stream)
    {
        launch_dropout<T>(
            out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals, hipStream_t stream)
    {
        launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals_out, const T* d_vals, hipStream_t stream)
    {
        launch_dropout_grad<T>(
            d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    bool HasDropout() const { return _config.RATIO() > 0.0; }

    void SetTrainingMode(bool training) { _config.training = training; }

    void SetMask(uint8_t* mask)
    {
        if (!mask) { throw std::runtime_error("Dropout mask is null."); }
        _mask = mask;
    }

    Config GetConfig() const { return _config; }

    inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }

private:
    uint8_t* _mask;
    Config _config;
};
deepspeed/ops/csrc/includes/hip/ds_transformer_hip.h
new file mode 100644

#pragma once

#include <hip/hip_runtime_api.h>
#include <hiprand.h>
#include <memory>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "dropout.h"
#include "feed_forward.h"
#include "gelu.h"
#include "general_kernels.h"
#include "normalize_layer.h"
#include "softmax.h"
#include "strided_batch_gemm.h"

struct BertGemmAlgos {
    int m_gemm_qkv_algo;
    int m_gemm_inter_algo;
    int m_gemm_output_algo;
    int m_gemm_batch1_algo;
    int m_gemm_batch2_algo;

    BertGemmAlgos()
        : m_gemm_qkv_algo(-1),
          m_gemm_inter_algo(-1),
          m_gemm_output_algo(-1),
          m_gemm_batch1_algo(-1),
          m_gemm_batch2_algo(-1)
    {
    }
};

template <typename T>
class BertTransformerLayer {
public:
    BertTransformerLayer(int layer_id, int batch_size, int hidden_size, int num_heads,
                         int intermediate_size, int seq_length,
                         float attn_dropout_ratio, float hidden_output_dropout_ratio,
                         float layer_norm_eps, bool pre_or_postLayerNorm,
                         const std::vector<std::array<int, 3>>& gemm_algos,
                         bool attn_dropout_checkpoint, bool normalize_invertible,
                         bool gelu_checkpoint, bool stochastic_mode);

    virtual ~BertTransformerLayer();

    void Forward(int bsz,
                 const T* input_ptr, const T* input_mask_ptr,
                 const T* attn_qkvw_ptr, const T* attn_qkvb_ptr,
                 const T* attn_ow_ptr, const T* attn_ob_ptr,
                 const T* attn_nw_ptr, const T* attn_nb_ptr,
                 const T* inter_w_ptr, const T* inter_b_ptr,
                 const T* output_w_ptr, const T* output_b_ptr,
                 const T* norm_w_ptr, const T* norm_b_ptr,
                 T* out_ptr, T* inp_norm_ptr,
                 T* q_tf_ptr, T* k_tf_ptr, T* v_tf_ptr,
                 T* softmax_output_ptr, T* ctx_bufB_ptr, T* attn_o_inp_ptr,
                 T* add_res_ptr, T* ff1_inp_ptr, T* gelu_inp_ptr, T* ff2_inp_ptr);

    void Backward(int bsz,
                  const T* grad_output_ptr,
                  const T* input_ptr, const T* output_ptr, const T* inp_norm_ptr,
                  const T* q_tf_ptr, const T* k_tf_ptr, const T* v_tf_ptr,
                  const T* softmax_output_ptr, const T* ctx_bufB_ptr, const T* attn_o_inp_ptr,
                  const T* add_res_ptr, const T* ff1_inp_ptr, const T* gelu_inp_ptr,
                  const T* ff2_inp_ptr, const T* input_mask_ptr,
                  const T* attn_qkvw_ptr, const T* attn_ow_ptr,
                  const T* attn_nw_ptr, const T* attn_nb_ptr,
                  const T* inter_w_ptr, const T* inter_b_ptr, const T* output_w_ptr,
                  const T* norm_w_ptr, const T* norm_b_ptr,
                  T* grad_input_ptr,
                  T* grad_attn_qkvw_ptr, T* grad_attn_qkvb_ptr,
                  T* grad_attn_ow_ptr, T* grad_attn_ob_ptr,
                  T* grad_attn_nw_ptr, T* grad_attn_nb_ptr,
                  T* grad_inter_w_ptr, T* grad_inter_b_ptr,
                  T* grad_output_w_ptr, T* grad_output_b_ptr,
                  T* grad_norm_w_ptr, T* grad_norm_b_ptr);

    void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
                                uint8_t* attn_output_dropout_mask_ptr,
                                uint8_t* layer_output_dropout_mask_ptr,
                                T* layer_norm_var, T* layer_norm_mean,
                                T* attn_layer_norm_var, T* attn_layer_norm_mean);

    inline int GetBatchSize() const { return _batch_size; }
    inline int GetNumHeads() const { return _heads; }
    inline int GetSeqLength() const { return _seq_length; }
    inline int GetIntermediateSize() const { return _intermediate_size; }

    void SetSeqLength(int seq_len);
    inline int GetHiddenSize() const { return _hidden_size; }
    void SetTrainingMode(bool training);
    inline bool IsTrainingMode() const { return _training; }
    inline bool GeluCheckpoint() const { return _gelu_checkpoint; }

private:
    void Initialize();
    size_t getWorkspaceSize(int maxBatchSize) const;

    // Params
    int _layer_id;
    int _batch_size;
    int _hidden_size;
    int _heads;
    int _size_per_head;
    int _intermediate_size;
    int _seq_length;

    bool _pre_or_postLayerNorm;

    rocblas_handle _cublasHandle;
    hipStream_t _stream;

    // layers
    FeedForward<T> _qkv_linear;
    FeedForward<T> _attn_out_linear;
    Normalize_Layer<T> _attn_layer_norm;
    Normalize_Layer<T> _layer_norm;
    Normalize_Layer<T>* _last_normalize;
    FeedForward<T> _ff1, _ff2;
    Softmax<T> _softmax;
    Gelu<T> _gelu;
    Dropout<T> _attn_prob_dropout;
    Dropout<T> _attn_output_dropout;
    Dropout<T> _layer_output_dropout;
    StridedBatchGemm<T> _attn_scores;
    StridedBatchGemm<T> _attn_context;

    bool _training;

    // Memory saving flags
    bool _attn_dropout_checkpoint;
    bool _normalize_invertible;
    bool _gelu_checkpoint;

    // High Performance flags
    bool _stochastic_mode;
};
deepspeed/ops/csrc/includes/hip/feed_forward.h
new file mode 100644

#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"

template <typename T>
class FeedForward {
public:
    struct Config {
        int batchSize, outputSize;
        int inputSize;
        std::array<int, 3> gemm_algos;
        Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
            : batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
        {
        }
    };

    FeedForward(Config config) : config_(config) {}

    ~FeedForward() {}

    void Forward(int bsz, const T* input_ptr, const T* weights, T* out,
                 rocblas_handle& _cublasHandle)
    {
        float alpha = T(1.);
        float beta = T(0.);

        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_transpose, rocblas_operation_none,
                       config_.outputSize, bsz, config_.inputSize,
                       &alpha, &beta,
                       weights, input_ptr, out,
                       cublasGemmAlgo_t(config_.gemm_algos[0]));
    }
    void Backward(int bsz,
                  const T* out_grad, const T* input_ptr, const T* weights,
                  T* weights_grad, T* bias_grad,
                  rocblas_handle& _cublasHandle, hipStream_t& stream,
                  T* inp_grad_out = nullptr, T* out_grad_trans_out = nullptr)
    {
        float alpha = (T)1.0, beta = (T)0.0;
        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_none, rocblas_operation_transpose,
                       config_.inputSize, config_.outputSize, bsz,
                       &alpha, &beta,
                       input_ptr, out_grad, weights_grad,
                       cublasGemmAlgo_t(config_.gemm_algos[1]));

        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_none, rocblas_operation_none,
                       config_.inputSize, bsz, config_.outputSize,
                       &alpha, &beta,
                       weights, out_grad, inp_grad_out,
                       cublasGemmAlgo_t(config_.gemm_algos[2]));

        launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
    }

private:
    Config config_;
};

#endif
deepspeed/ops/csrc/includes/hip/gelu.h
new file mode 100644

#pragma once

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"

template <typename T>
class Gelu {
public:
    struct Config {
        uint32_t intermediate_size;
        Config(uint32_t inter_size) : intermediate_size(inter_size) {}
    };

    Gelu(const Config& config) : _config(config) {}

    virtual ~Gelu() {}

    void ForwardWithBiasAdd(int bsz, const T* input_buf, const T* bias, T* output,
                            hipStream_t stream)
    {
        launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
    }

    void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, hipStream_t stream)
    {
        launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
    }

private:
    Config _config;
};
deepspeed/ops/csrc/includes/hip/gemm_test.h
new file mode 100644

#pragma once

#include <hip/hip_fp16.h>
#include <cuda_profiler_api.h>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <memory>
#include "StopWatch.h"
#include "cublas_wrappers.h"

template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
    if (result) {
        std::cout << (std::string("CUDA runtime error: ") + +file + ":" +
                      std::to_string(line) + " \n");
    }
}

#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)

template <typename T>
class GemmTest {
public:
    GemmTest(int m, int n, int k, rocblas_operation ta, rocblas_operation tb, rocblas_handle h)
        : M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K));
        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N));
        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N));
    }

    ~GemmTest()
    {
        check_cuda_error(hipFree(A));
        check_cuda_error(hipFree(B));
        check_cuda_error(hipFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_transpose, rocblas_operation_none,
                           N, M, K,
                           &alpha, &beta,
                           B, A, C,
                           static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_none, rocblas_operation_transpose,
                           K, N, M,
                           &alpha, &beta,
                           A, C, B,
                           static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_none, rocblas_operation_none,
                           K, M, N,
                           &alpha, &beta,
                           B, C, A,
                           static_cast<cublasGemmAlgo_t>(algo));
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            hipDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            hipDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int M, N, K;
    rocblas_handle handle;
    rocblas_operation transa, transb;
    T *A, *B, *C;
};

template <typename T>
class StridedGemmTest {
public:
    StridedGemmTest(int b, int m, int n, int k,
                    rocblas_operation ta, rocblas_operation tb, rocblas_handle h)
        : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K * bsz));
        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N * bsz));
        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N * bsz));
    }

    ~StridedGemmTest()
    {
        check_cuda_error(hipFree(A));
        check_cuda_error(hipFree(B));
        check_cuda_error(hipFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            int stride_a = M * K;
            int stride_b = N * K;
            int stride_c = M * N;

            cublas_strided_batched_gemm(handle,
                                        M, N, K,
                                        &alpha, &beta,
                                        A, B, C,
                                        transa, transb,
                                        stride_a, stride_b, stride_c,
                                        bsz,
                                        static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            int mb = (transa == rocblas_operation_transpose ? K : M);
            int kb = (transa == rocblas_operation_transpose ? M : K);

            int stride_a = mb * N;
            int stride_b = N * kb;
            int stride_c = M * K;

            // B needs to be transposed.
            rocblas_operation op_b = (transb == rocblas_operation_transpose
                                          ? rocblas_operation_none
                                          : rocblas_operation_transpose);

            // Calculate d_A.
            cublas_strided_batched_gemm(handle,
                                        mb, kb, N,
                                        &alpha, &beta,
                                        (transa == rocblas_operation_transpose ? B : C),
                                        (transa == rocblas_operation_transpose ? C : B),
                                        A,
                                        rocblas_operation_none, op_b,
                                        stride_a, stride_b, stride_c,
                                        bsz,
                                        static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            // A needs to be transposed.
            rocblas_operation op_a = (transa == rocblas_operation_transpose
                                          ? rocblas_operation_none
                                          : rocblas_operation_transpose);

            int stride_a = M * K;
            int stride_b = M * N;
            int stride_c = N * K;

            // Calculate d_B.
            cublas_strided_batched_gemm(handle,
                                        K, N, M,
                                        &alpha, &beta,
                                        A, C, B,
                                        op_a, rocblas_operation_none,
                                        stride_a, stride_b, stride_c,
                                        bsz,
                                        static_cast<cublasGemmAlgo_t>(algo));
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            hipDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            hipDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int bsz, M, N, K;
    rocblas_handle handle;
    rocblas_operation transa, transb;
    T *A, *B, *C;
};