OpenDAS / deepspeed · Commit eadbbe09
authored Apr 25, 2021 by 401qingkong
push rocm deepspeed v0.3.13
parent ab5534fc
Changes: showing 20 changed files with 2705 additions and 0 deletions (+2705 −0), of 155 files changed in this commit.
deepspeed/ops/csrc/includes/cpu_adam.h +163 −0
deepspeed/ops/csrc/includes/cublas_wrappers.h +69 −0
deepspeed/ops/csrc/includes/custom_cuda_layers.h +257 −0
deepspeed/ops/csrc/includes/dropout.h +76 −0
deepspeed/ops/csrc/includes/ds_transformer_cuda.h +184 −0
deepspeed/ops/csrc/includes/feed_forward.h +93 −0
deepspeed/ops/csrc/includes/gelu.h +36 −0
deepspeed/ops/csrc/includes/gemm_test.h +293 −0
deepspeed/ops/csrc/includes/general_kernels.h +47 −0
deepspeed/ops/csrc/includes/hip/StopWatch.h +98 −0
deepspeed/ops/csrc/includes/hip/Timer.h +47 −0
deepspeed/ops/csrc/includes/hip/context.h +171 −0
deepspeed/ops/csrc/includes/hip/cpu_adam.h +163 −0
deepspeed/ops/csrc/includes/hip/cublas_wrappers.h +69 −0
deepspeed/ops/csrc/includes/hip/custom_hip_layers.h +257 −0
deepspeed/ops/csrc/includes/hip/dropout.h +76 −0
deepspeed/ops/csrc/includes/hip/ds_transformer_hip.h +184 −0
deepspeed/ops/csrc/includes/hip/feed_forward.h +93 −0
deepspeed/ops/csrc/includes/hip/gelu.h +36 −0
deepspeed/ops/csrc/includes/hip/gemm_test.h +293 −0
deepspeed/ops/csrc/includes/cpu_adam.h  0 → 100755
#pragma once
#include <cpuid.h>
#include <cuda_fp16.h>
#include <cuda_runtime_api.h>
#include <stdio.h>
#include <x86intrin.h>
#include <cassert>
#include "context.h"
#include "cublas_v2.h"
#include "cuda.h"
#include "curand.h"
#define CUDA_CHECK(callstr)                                                                    \
    {                                                                                          \
        cudaError_t error_code = callstr;                                                      \
        if (error_code != cudaSuccess) {                                                       \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    }
#define TILE (1024 * 1024 * 1024)
#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_WIDTH 16
#else
#if defined(__AVX256__)
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_WIDTH 8
#endif
#endif
class Adam_Optimizer {
public:
    Adam_Optimizer(float alpha = 1e-3,
                   float betta1 = 0.9,
                   float betta2 = 0.999,
                   float eps = 1e-8,
                   float weight_decay = 0,
                   bool adamw_mode = true)
        : _alpha(alpha),
          _betta1(betta1),
          _betta2(betta2),
          _eps(eps),
          _weight_decay(weight_decay),
          _betta1_t(1.0),
          _betta2_t(1.0),
          _step(0),
          _buf_index(false),
          _adamw_mode(adamw_mode)
    {
        cudaMallocHost((void**)_doubled_buffer, TILE * sizeof(float));
        cudaMallocHost((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adam_Optimizer()
    {
        cudaFreeHost(_doubled_buffer[0]);
        cudaFreeHost(_doubled_buffer[1]);
    }
    void Step(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
              size_t param_size, __half* dev_param = nullptr);
    void Step_4(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sa,
                size_t param_size, __half* dev_param = nullptr);
    void Step_8(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
                size_t _param_size, __half* dev_params = nullptr);
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) cudaStreamSynchronize(_streams[i]);
    }
    inline void IncrementStep(size_t step, float beta1, float beta2)
    {
        if (beta1 != _betta1 || beta2 != _betta2) {
            _step = step;
            _betta1 = beta1;
            _betta2 = beta2;
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
        } else {
            _step++;
            if (_step != step) {
                _betta1_t = std::pow(_betta1, step);
                _betta2_t = std::pow(_betta2, step);
                _step = step;
            } else {
                _betta1_t *= _betta1;
                _betta2_t *= _betta2;
            }
        }
    }
    inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;

        _bias_correction1 = 1.0f;
        _bias_correction2 = 1.0f;
        if (bias_correction == 1) {
            _bias_correction1 = 1 - _betta1_t;
            _bias_correction2 = 1 / sqrt(1 - _betta2_t);
        }
    }

private:
#if defined(__AVX512__) or defined(__AVX256__)
    union AVX_Data {
#if defined(__AVX512__)
        __m512 data;
#else
        __m256 data;
#endif
        // float data_f[16];
    };
#endif

    float _alpha;
    float _betta1;
    float _betta2;
    float _eps;
    float _weight_decay;

    float _betta1_t;
    float _betta2_t;
    size_t _step;

    float _bias_correction1;
    float _bias_correction2;

    float* _doubled_buffer[2];
    bool _buf_index;
    bool _adamw_mode;

    cudaStream_t _streams[2];
};
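A minimal driver sketch for the optimizer above, assuming a DeepSpeed-style host training loop; the buffer names (params, grads, exp_avg, exp_avg_sq, dev_params) and the num_steps/n sizes are hypothetical placeholders, not part of this commit:

// Hypothetical host-side usage of Adam_Optimizer (illustration only).
Adam_Optimizer opt(/*alpha=*/1e-3f, /*betta1=*/0.9f, /*betta2=*/0.999f,
                   /*eps=*/1e-8f, /*weight_decay=*/0.01f, /*adamw_mode=*/true);
for (size_t step = 1; step <= num_steps; ++step) {
    opt.IncrementStep(step, 0.9f, 0.999f);  // keeps the betta1^t / betta2^t powers in sync
    opt.update_state(/*lr=*/1e-3f, /*epsilon=*/1e-8f, /*weight_decay=*/0.01f,
                     /*bias_correction=*/true);
    // Step_8 is the widest SIMD path; dev_params optionally receives an
    // asynchronous __half copy of the updated parameters on _streams.
    opt.Step_8(params, grads, exp_avg, exp_avg_sq, n, dev_params);
    opt.SynchronizeStreams();  // wait for the async device copies to finish
}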
deepspeed/ops/csrc/includes/cublas_wrappers.h  0 → 100644
#pragma once
#include <assert.h>
#include <cublas_v2.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <mma.h>
#include <stdio.h>
int cublas_gemm_ex(cublasHandle_t handle,
                   cublasOperation_t transa,
                   cublasOperation_t transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const float* A,
                   const float* B,
                   float* C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_gemm_ex(cublasHandle_t handle,
                   cublasOperation_t transa,
                   cublasOperation_t transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const __half* A,
                   const __half* B,
                   __half* C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);

int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const float* A,
                                const float* B,
                                float* C,
                                cublasOperation_t op_A,
                                cublasOperation_t op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_strided_batched_gemm(cublasHandle_t handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const __half* A,
                                const __half* B,
                                __half* C,
                                cublasOperation_t op_A,
                                cublasOperation_t op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
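cuBLAS uses column-major semantics, so callers in this commit express row-major products by swapping operands; a small hypothetical example (not part of this commit) computing a row-major C[M][N] = A[M][K] · B[K][N] via Cᵀ = Bᵀ·Aᵀ:

// Row-major GEMM through the column-major wrapper: pass (n, m, k) and swap A/B.
float alpha = 1.0f, beta = 0.0f;
cublas_gemm_ex(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, &beta,
               B, A, C, CUBLAS_GEMM_DEFAULT);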
deepspeed/ops/csrc/includes/custom_cuda_layers.h  0 → 100755
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cooperative_groups.h>
#include <curand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
#define MAX_THREADS 1024
#define THREADS 256
#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32
// Maximum sequence-length support based on the number of threads (2048) allowed in each block and
// this MAX is 8K For higher sequence length we need to use higher Max, like for 64K : 32
#define MAX_THREAD_ITERATIONS 8 // Maximum 8K
#define MAX_WARP_NUM 32
#define MAX_REGISTERS 256
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input, const T* bias, T* output,
                      int intermediate_size, int batch_size, cudaStream_t stream);

template <typename T>
void launch_gelu(const T* input, T* output,
                 int intermediate_size, int batch_size, cudaStream_t stream);

template <typename T>
void launch_d_gelu(T* d_output, const T* input, const T* bias,
                   int intermediate_size, int batch_size, cudaStream_t stream);

// Custom fused bias add with layer normalization
template <typename T>
void launch_bias_residual_layer_norm(T* vals, const T* residual, const T* gamma, const T* beta,
                                     float epsilon, int batch_size, int hidden_dim,
                                     cudaStream_t stream, bool preLayerNorm, bool training,
                                     T* vars, T* means);

template <typename T>
void launch_bias_residual_layer_norm(T* vals, const T* residual, const T* gamma, const T* beta,
                                     float epsilon, int batch_size, int hidden_dim,
                                     cudaStream_t stream, bool preLayerNorm, bool training,
                                     T* vars);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1, const T* out_grad2, const T* X_data,
                                         const T* vars, const T* means, const T* gamma,
                                         T* gamma_grad, T* betta_grad, T* inp_grad,
                                         int batch_size, int hidden_dim, cudaStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1, const T* out_grad2,
                                         const T* vals_hat, const T* vars, const T* gamma,
                                         T* gamma_grad, T* betta_grad, T* inp_grad,
                                         int batch_size, int hidden_dim, cudaStream_t stream[2],
                                         bool invertible = false, const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward(const T* out_grad, const T* X_data, const T* vars, const T* means,
                               const T* gamma, T* gamma_grad, T* betta_grad, T* inp_grad,
                               int batch_size, int hidden_dim, cudaStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward(const T* out_grad, const T* vals_hat, const T* vars,
                               const T* gamma, T* gamma_grad, T* betta_grad, T* inp_grad,
                               int batch_size, int hidden_dim, cudaStream_t stream[2],
                               bool invertible = false, const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad, const T* vals,
                                           const T* out_grad_trans, const T* vals_trans,
                                           const T* means, const T* vars, const T* gamma,
                                           T* gamma_grad, T* betta_grad, T* inp_grad,
                                           int batch_size, int hidden_dim,
                                           cudaStream_t stream[2]);

template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, cudaStream_t stream);

template <typename T>
void launch_attn_softmax_backward(T* out_grad, const T* soft_inp, int batch_size, int heads,
                                  int seq_length, cudaStream_t stream);

template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad, const T* soft_inp, int batch_size, int heads,
                                     int seq_length, cudaStream_t stream);

// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals, const T* attn_mask, int batch_size, int heads,
                         int sequence_length, cudaStream_t stream);

template <typename T>
void launch_transform_0213(T* output, const T* vals, int batch_size, int seq_length,
                           int hidden_dim, int heads, cudaStream_t stream);

// Custom bias add
template <typename T>
void launch_bias_add_transform_0213(T* outputs, const T* vals, const T* bias, int batch_size,
                                    int seq_length, int hidden_dim, int heads,
                                    cudaStream_t stream, int trans_count);

// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out, const T* in, int batch_size, int heads, int seq_length,
                             int hidden_dim, cudaStream_t stream, int trans_count);

template <typename T>
void launch_dropout(T* vals, const T* bias, uint8_t* mask, int batch, int dim, float ratio,
                    cudaStream_t stream);

template <typename T>
void launch_dropout(T* vals_out, const T* vals, uint8_t* mask, int total_count, int dim,
                    float ratio, cudaStream_t stream, bool bwd = false);

template <typename T>
void launch_dropout(T* out, const T* vals, const T* residual, const T* bias, uint8_t* mask,
                    int batch, int dim, float ratio, cudaStream_t stream);

template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio,
                         cudaStream_t stream);

template <typename T>
void launch_dropout_grad(T* vals_out, const T* vals, uint8_t* mask, int total_count,
                         float ratio, cudaStream_t stream);

template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp, T* out, int rows, int cols,
                                       cudaStream_t stream);

void launch_param_update(const float* input, __half* output, int size, cudaStream_t stream);
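A sketch of a typical call site for these launchers, assuming device buffers are already allocated and a CUDA-side Context with the same interface as the HIP one later in this commit; the d_input/d_bias/d_output names are hypothetical:

// Hypothetical: fused bias + GeLU over a [batch_size x intermediate_size] buffer.
cudaStream_t stream = Context::Instance().GetCurrentStream();
launch_bias_gelu<__half>(d_input, d_bias, d_output, intermediate_size, batch_size, stream);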
deepspeed/ops/csrc/includes/dropout.h  0 → 100644
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
template <typename T>
class Dropout {
public:
    struct Config {
        float ratio;
        uint32_t dim;
        bool training;

        Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}

        float RATIO() const { return training ? ratio : 0.0; }
        inline void SetDim(uint32_t d) { dim = d; }
    };

    Dropout(const Config& config) : _config(config), _mask(nullptr) {}

    virtual ~Dropout() {}

    void Forward(int bsz, T* out, const T* vals, cudaStream_t stream, bool bwd = false)
    {
        launch_dropout<T>(
            out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
    }

    void ForwardWithBias(int bsz, T* vals, const T* bias, cudaStream_t stream)
    {
        launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void ForwardWithBias(int bsz, T* out, const T* vals, const T* residual, const T* bias,
                         cudaStream_t stream)
    {
        launch_dropout<T>(
            out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals, cudaStream_t stream)
    {
        launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals_out, const T* d_vals, cudaStream_t stream)
    {
        launch_dropout_grad<T>(
            d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    bool HasDropout() const { return _config.RATIO() > 0.0; }

    void SetTrainingMode(bool training) { _config.training = training; }

    void SetMask(uint8_t* mask)
    {
        if (!mask) { throw std::runtime_error("Dropout mask is null."); }
        _mask = mask;
    }

    Config GetConfig() const { return _config; }

    inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }

private:
    uint8_t* _mask;
    Config _config;
};
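A hypothetical usage sketch (not part of this commit), assuming d_in/d_out/d_grad and the uint8 mask buffer are preallocated on the device:

Dropout<float>::Config cfg(/*ratio=*/0.1f, /*dim=*/hidden_size);
Dropout<float> drop(cfg);
drop.SetMask(d_mask);          // bsz * hidden_size bytes of uint8 mask storage
drop.SetTrainingMode(true);    // RATIO() returns 0.0 outside training
drop.Forward(bsz, d_out, d_in, stream);
// The backward pass reuses the mask recorded during the forward pass:
drop.Backward(bsz, d_grad, stream);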
deepspeed/ops/csrc/includes/ds_transformer_cuda.h  0 → 100755
#pragma once
#include <cuda_runtime_api.h>
#include <curand.h>
#include <memory>
#include <vector>
#include "cublas_v2.h"
#include "cuda.h"
#include "dropout.h"
#include "feed_forward.h"
#include "gelu.h"
#include "general_kernels.h"
#include "normalize_layer.h"
#include "softmax.h"
#include "strided_batch_gemm.h"
struct BertGemmAlgos {
    int m_gemm_qkv_algo;
    int m_gemm_inter_algo;
    int m_gemm_output_algo;
    int m_gemm_batch1_algo;
    int m_gemm_batch2_algo;

    BertGemmAlgos()
        : m_gemm_qkv_algo(-1),
          m_gemm_inter_algo(-1),
          m_gemm_output_algo(-1),
          m_gemm_batch1_algo(-1),
          m_gemm_batch2_algo(-1)
    {
    }
};

template <typename T>
class BertTransformerLayer {
public:
    BertTransformerLayer(int layer_id, int batch_size, int hidden_size, int num_heads,
                         int intermediate_size, int seq_length, float attn_dropout_ratio,
                         float hidden_output_dropout_ratio, float layer_norm_eps,
                         bool pre_or_postLayerNorm,
                         const std::vector<std::array<int, 3>>& gemm_algos,
                         bool attn_dropout_checkpoint, bool normalize_invertible,
                         bool gelu_checkpoint, bool stochastic_mode);

    virtual ~BertTransformerLayer();

    void Forward(int bsz, const T* input_ptr, const T* input_mask_ptr,
                 const T* attn_qkvw_ptr, const T* attn_qkvb_ptr,
                 const T* attn_ow_ptr, const T* attn_ob_ptr,
                 const T* attn_nw_ptr, const T* attn_nb_ptr,
                 const T* inter_w_ptr, const T* inter_b_ptr,
                 const T* output_w_ptr, const T* output_b_ptr,
                 const T* norm_w_ptr, const T* norm_b_ptr,
                 T* out_ptr, T* inp_norm_ptr,
                 T* q_tf_ptr, T* k_tf_ptr, T* v_tf_ptr,
                 T* softmax_output_ptr, T* ctx_bufB_ptr, T* attn_o_inp_ptr,
                 T* add_res_ptr, T* ff1_inp_ptr, T* gelu_inp_ptr, T* ff2_inp_ptr);

    void Backward(int bsz, const T* grad_output_ptr, const T* input_ptr, const T* output_ptr,
                  const T* inp_norm_ptr,
                  const T* q_tf_ptr, const T* k_tf_ptr, const T* v_tf_ptr,
                  const T* softmax_output_ptr, const T* ctx_bufB_ptr, const T* attn_o_inp_ptr,
                  const T* add_res_ptr, const T* ff1_inp_ptr, const T* gelu_inp_ptr,
                  const T* ff2_inp_ptr, const T* input_mask_ptr,
                  const T* attn_qkvw_ptr, const T* attn_ow_ptr,
                  const T* attn_nw_ptr, const T* attn_nb_ptr,
                  const T* inter_w_ptr, const T* inter_b_ptr,
                  const T* output_w_ptr, const T* norm_w_ptr, const T* norm_b_ptr,
                  T* grad_input_ptr,
                  T* grad_attn_qkvw_ptr, T* grad_attn_qkvb_ptr,
                  T* grad_attn_ow_ptr, T* grad_attn_ob_ptr,
                  T* grad_attn_nw_ptr, T* grad_attn_nb_ptr,
                  T* grad_inter_w_ptr, T* grad_inter_b_ptr,
                  T* grad_output_w_ptr, T* grad_output_b_ptr,
                  T* grad_norm_w_ptr, T* grad_norm_b_ptr);

    void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
                                uint8_t* attn_output_dropout_mask_ptr,
                                uint8_t* layer_output_dropout_mask_ptr,
                                T* layer_norm_var, T* layer_norm_mean,
                                T* attn_layer_norm_var, T* attn_layer_norm_mean);

    inline int GetBatchSize() const { return _batch_size; }
    inline int GetNumHeads() const { return _heads; }
    inline int GetSeqLength() const { return _seq_length; }
    inline int GetIntermediateSize() const { return _intermediate_size; }

    void SetSeqLength(int seq_len);
    inline int GetHiddenSize() const { return _hidden_size; }
    void SetTrainingMode(bool training);
    inline bool IsTrainingMode() const { return _training; }
    inline bool GeluCheckpoint() const { return _gelu_checkpoint; }

private:
    void Initialize();
    size_t getWorkspaceSize(int maxBatchSize) const;

    // Params
    int _layer_id;
    int _batch_size;
    int _hidden_size;
    int _heads;
    int _size_per_head;
    int _intermediate_size;
    int _seq_length;

    bool _pre_or_postLayerNorm;

    cublasHandle_t _cublasHandle;
    cudaStream_t _stream;

    // layers
    FeedForward<T> _qkv_linear;
    FeedForward<T> _attn_out_linear;
    Normalize_Layer<T> _attn_layer_norm;
    Normalize_Layer<T> _layer_norm;
    Normalize_Layer<T>* _last_normalize;
    FeedForward<T> _ff1, _ff2;
    Softmax<T> _softmax;
    Gelu<T> _gelu;
    Dropout<T> _attn_prob_dropout;
    Dropout<T> _attn_output_dropout;
    Dropout<T> _layer_output_dropout;
    StridedBatchGemm<T> _attn_scores;
    StridedBatchGemm<T> _attn_context;

    bool _training;

    // Memory saving flags
    bool _attn_dropout_checkpoint;
    bool _normalize_invertible;
    bool _gelu_checkpoint;

    // High-performance flags
    bool _stochastic_mode;
};
deepspeed/ops/csrc/includes/feed_forward.h  0 → 100644
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
template <typename T>
class FeedForward {
public:
    struct Config {
        int batchSize, outputSize;
        int inputSize;
        std::array<int, 3> gemm_algos;
        Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
            : batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
        {
        }
    };

    FeedForward(Config config) : config_(config) {}

    ~FeedForward() {}

    void Forward(int bsz, const T* input_ptr, const T* weights, T* out,
                 cublasHandle_t& _cublasHandle)
    {
        float alpha = T(1.);
        float beta = T(0.);

        cublas_gemm_ex(_cublasHandle, CUBLAS_OP_T, CUBLAS_OP_N,
                       config_.outputSize, bsz, config_.inputSize,
                       &alpha, &beta, weights, input_ptr, out,
                       cublasGemmAlgo_t(config_.gemm_algos[0]));
    }

    void Backward(int bsz, const T* out_grad, const T* input_ptr, const T* weights,
                  T* weights_grad, T* bias_grad, cublasHandle_t& _cublasHandle,
                  cudaStream_t& stream, T* inp_grad_out = nullptr,
                  T* out_grad_trans_out = nullptr)
    {
        float alpha = (T)1.0, beta = (T)0.0;
        cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_T,
                       config_.inputSize, config_.outputSize, bsz,
                       &alpha, &beta, input_ptr, out_grad, weights_grad,
                       cublasGemmAlgo_t(config_.gemm_algos[1]));

        cublas_gemm_ex(_cublasHandle, CUBLAS_OP_N, CUBLAS_OP_N,
                       config_.inputSize, bsz, config_.outputSize,
                       &alpha, &beta, weights, out_grad, inp_grad_out,
                       cublasGemmAlgo_t(config_.gemm_algos[2]));

        launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize,
                                             stream);
    }

private:
    Config config_;
};
#endif
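A hypothetical sketch of the wrapper in use (not part of this commit); weights are stored row-major [outputSize x inputSize], and {99, 99, 99} is the default tensor-op algorithm triple that Context::TestGemmFP16 later in this commit falls back to:

FeedForward<float>::Config cfg(bsz, output_size, input_size, {99, 99, 99});
FeedForward<float> linear(cfg);
// out[bsz][outputSize] = input[bsz][inputSize] * weights^T
linear.Forward(bsz, d_input, d_weights, d_output, cublas_handle);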
deepspeed/ops/csrc/includes/gelu.h  0 → 100644
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
template <typename T>
class Gelu {
public:
    struct Config {
        uint32_t intermediate_size;
        Config(uint32_t inter_size) : intermediate_size(inter_size) {}
    };

    Gelu(const Config& config) : _config(config) {}

    virtual ~Gelu() {}

    void ForwardWithBiasAdd(int bsz, const T* input_buf, const T* bias, T* output,
                            cudaStream_t stream)
    {
        launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
    }

    void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, cudaStream_t stream)
    {
        launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
    }

private:
    Config _config;
};
deepspeed/ops/csrc/includes/gemm_test.h  0 → 100644
#pragma once
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <memory>
#include "StopWatch.h"
#include "cublas_wrappers.h"
template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
    if (result) {
        std::cout << (std::string("CUDA runtime error: ") + +file + ":" +
                      std::to_string(line) + " \n");
    }
}
#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
class GemmTest {
public:
    GemmTest(int m, int n, int k, cublasOperation_t ta, cublasOperation_t tb, cublasHandle_t h)
        : M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K));
        check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N));
        check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N));
    }

    ~GemmTest()
    {
        check_cuda_error(cudaFree(A));
        check_cuda_error(cudaFree(B));
        check_cuda_error(cudaFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle, CUBLAS_OP_T, CUBLAS_OP_N, N, M, K, &alpha, &beta,
                           B, A, C, static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle, CUBLAS_OP_N, CUBLAS_OP_T, K, N, M, &alpha, &beta,
                           A, C, B, static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle, CUBLAS_OP_N, CUBLAS_OP_N, K, M, N, &alpha, &beta,
                           B, C, A, static_cast<cublasGemmAlgo_t>(algo));
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            cudaDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            cudaDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int M, N, K;
    cublasHandle_t handle;
    cublasOperation_t transa, transb;
    T *A, *B, *C;
};

template <typename T>
class StridedGemmTest {
public:
    StridedGemmTest(int b, int m, int n, int k, cublasOperation_t ta, cublasOperation_t tb,
                    cublasHandle_t h)
        : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K * bsz));
        check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N * bsz));
        check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N * bsz));
    }

    ~StridedGemmTest()
    {
        check_cuda_error(cudaFree(A));
        check_cuda_error(cudaFree(B));
        check_cuda_error(cudaFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            int stride_a = M * K;
            int stride_b = N * K;
            int stride_c = M * N;

            cublas_strided_batched_gemm(handle, M, N, K, &alpha, &beta, A, B, C,
                                        transa, transb, stride_a, stride_b, stride_c,
                                        bsz, static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            int mb = (transa == CUBLAS_OP_T ? K : M);
            int kb = (transa == CUBLAS_OP_T ? M : K);

            int stride_a = mb * N;
            int stride_b = N * kb;
            int stride_c = M * K;

            // B needs to be transposed.
            cublasOperation_t op_b = (transb == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);

            // Calculate d_A.
            cublas_strided_batched_gemm(handle, mb, kb, N, &alpha, &beta,
                                        (transa == CUBLAS_OP_T ? B : C),
                                        (transa == CUBLAS_OP_T ? C : B),
                                        A, CUBLAS_OP_N, op_b,
                                        stride_a, stride_b, stride_c, bsz,
                                        static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            // A needs to be transposed.
            cublasOperation_t op_a = (transa == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);

            int stride_a = M * K;
            int stride_b = M * N;
            int stride_c = N * K;

            // Calculate d_B.
            cublas_strided_batched_gemm(handle, K, N, M, &alpha, &beta, A, C, B,
                                        op_a, CUBLAS_OP_N, stride_a, stride_b, stride_c,
                                        bsz, static_cast<cublasGemmAlgo_t>(algo));
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            cudaDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            cudaDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int bsz, M, N, K;
    cublasHandle_t handle;
    cublasOperation_t transa, transb;
    T *A, *B, *C;
};
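A hypothetical tuning call (not part of this commit), mirroring how Context::TestGemmFP16 later in this commit benchmarks each GEMM shape for 100 loops:

GemmTest<__half> tuner(batch_size * seq_len,      // M
                       head_num * size_per_head,  // N
                       head_num * size_per_head,  // K
                       CUBLAS_OP_T, CUBLAS_OP_N, handle);
std::array<int, 3> best = tuner.TestAlgo(/*loops=*/100);  // {fwd, bwd1, bwd2} algo ids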
deepspeed/ops/csrc/includes/general_kernels.h  0 → 100644
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cooperative_groups.h>
#include <curand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
#define THREADS 256
#define TILE_DIM 32
#define minus_infinity -1 * std::numeric_limits<float>::infinity()
#define FINAL_MASK 0xffffffff
template <typename T>
void launch_fused_add2(T* out, const T* inp1, const T* inp2, int batch_size, int seq_length,
                       int hidden_size, cudaStream_t& stream);

template <typename T>
void launch_fused_add4(T* out, const T* inp1, const T* inp2, const T* inp3, const T* inp4,
                       int batch_size, int seq_length, int hidden_size, cudaStream_t& stream);

template <typename T>
void launch_fused_add3(T* out, const T* inp1, const T* inp2, const T* inp3, int batch_size,
                       int seq_length, int hidden_size, cudaStream_t& stream);
deepspeed/ops/csrc/includes/hip/StopWatch.h  0 → 100644
#pragma once
#ifdef _WIN32
#include <windows.h>
#else
#include <time.h>
#endif
#ifdef _WIN32
class Stopwatch {
private:
    double m_total_time;
    LARGE_INTEGER m_start_time;

public:
    Stopwatch() { m_total_time = 0.0; }
    ~Stopwatch() {}

    void Reset() { m_total_time = 0.0; }

    void Start() { QueryPerformanceCounter(&m_start_time); }

    void Restart()
    {
        m_total_time = 0.0;
        QueryPerformanceCounter(&m_start_time);
    }

    void Stop()
    {
        LARGE_INTEGER frequency;
        LARGE_INTEGER stop_time;
        QueryPerformanceFrequency(&frequency);
        QueryPerformanceCounter(&stop_time);
        m_total_time += ((double)(stop_time.QuadPart - m_start_time.QuadPart) /
                         (double)frequency.QuadPart);
    }

    double GetTimeInSeconds() { return m_total_time; }
};
#else
class Stopwatch {
private:
    double m_total_time;
    struct timespec m_start_time;
    bool m_is_started;

public:
    Stopwatch()
    {
        m_total_time = 0.0;
        m_is_started = false;
    }
    ~Stopwatch() {}

    void Reset() { m_total_time = 0.0; }

    void Start()
    {
        clock_gettime(CLOCK_MONOTONIC, &m_start_time);
        m_is_started = true;
    }

    void Restart()
    {
        m_total_time = 0.0;
        clock_gettime(CLOCK_MONOTONIC, &m_start_time);
        m_is_started = true;
    }

    void Stop()
    {
        if (m_is_started) {
            m_is_started = false;
            struct timespec end_time;
            clock_gettime(CLOCK_MONOTONIC, &end_time);
            m_total_time += (double)(end_time.tv_sec - m_start_time.tv_sec) +
                            (double)(end_time.tv_nsec - m_start_time.tv_nsec) / 1e9;
        }
    }

    double GetTimeInSeconds()
    {
        if (m_is_started) {
            Stop();
            Start();
        }
        return m_total_time;
    }
};
#endif
deepspeed/ops/csrc/includes/hip/Timer.h  0 → 100644
#ifndef __TIMER_H__
#define __TIMER_H__
#include <hip/hip_runtime.h>
#include <chrono>
#include "hip/hip_runtime.h"
class GPUTimer {
    hipEvent_t start, stop;

public:
    GPUTimer()
    {
        hipEventCreate(&start);
        hipEventCreate(&stop);
    }
    ~GPUTimer()
    {
        hipEventDestroy(start);
        hipEventDestroy(stop);
    }
    inline void Record() { hipEventRecord(start); }
    inline void Elapsed(float& time_elapsed)
    {
        hipEventRecord(stop);
        hipEventSynchronize(stop);
        hipEventElapsedTime(&time_elapsed, start, stop);
    }
};

class CPUTimer {
    std::chrono::high_resolution_clock::time_point start;

public:
    CPUTimer() : start(std::chrono::high_resolution_clock::now()) {}
    inline void Reset() { start = std::chrono::high_resolution_clock::now(); }
    inline float Elapsed()
    {
        auto temp = start;
        start = std::chrono::high_resolution_clock::now();
        return (float)(std::chrono::duration_cast<std::chrono::microseconds>(start - temp)
                           .count() /
                       1e3);
    }
};
#endif
deepspeed/ops/csrc/includes/hip/context.h  0 → 100644
#pragma once
#include <ATen/hip/HIPContext.h>
#include <hip/hip_runtime_api.h>
#include <cassert>
#include <iostream>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand.h"
#include "gemm_test.h"
#define WARP_SIZE 32
#define CUDA_CHECK(callstr) \
{ \
hipError_t error_code = callstr; \
if (error_code != hipSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x)
#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); i += blockDim.x * gridDim.x) \
for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); j += blockDim.y * gridDim.y)
#define DS_CUDA_NUM_THREADS 512
#define DS_MAXIMUM_NUM_BLOCKS 262144
inline int DS_GET_BLOCKS(const int N)
{
    return (std::max)(
        (std::min)((N + DS_CUDA_NUM_THREADS - 1) / DS_CUDA_NUM_THREADS, DS_MAXIMUM_NUM_BLOCKS),
        // Use at least 1 block, since CUDA does not allow empty block
        1);
}

class Context {
public:
    Context() : _workspace(nullptr), _seed(42), _curr_offset(0)
    {
        hiprandCreateGenerator(&_gen, HIPRAND_RNG_PSEUDO_DEFAULT);
        hiprandSetPseudoRandomGeneratorSeed(_gen, 123);
        if (rocblas_create_handle(&_cublasHandle) != rocblas_status_success) {
            auto message = std::string("Fail to create cublas handle.");
            std::cerr << message << std::endl;
            throw std::runtime_error(message);
        }
    }

    virtual ~Context()
    {
        rocblas_destroy_handle(_cublasHandle);
        hipFree(_workspace);
    }

    static Context& Instance()
    {
        static Context _ctx;
        return _ctx;
    }

    void SetWorkSpace(void* workspace)
    {
        if (!workspace) { throw std::runtime_error("Workspace is null."); }
        _workspace = workspace;
    }

    void* GetWorkSpace() { return _workspace; }

    hiprandGenerator_t& GetRandGenerator() { return _gen; }

    hipStream_t GetCurrentStream()
    {
        // get current pytorch stream.
        hipStream_t stream = at::hip::getCurrentHIPStreamMasqueradingAsCUDA();
        return stream;
    }

    hipStream_t GetNewStream() { return at::hip::getStreamFromPoolMasqueradingAsCUDA(); }

    rocblas_handle GetCublasHandle() { return _cublasHandle; }

    std::pair<uint64_t, uint64_t> IncrementOffset(uint64_t offset_inc)
    {
        uint64_t offset = _curr_offset;
        _curr_offset += offset_inc;
        return std::pair<uint64_t, uint64_t>(_seed, offset);
    }

    void SetSeed(uint64_t new_seed) { _seed = new_seed; }

    void TestGemmFP16(bool test_gemm, int batch_size, int seq_len, int head_num,
                      int size_per_head)
    {
        // avoid rerun.
        if (_gemm_algos.size() > 0) return;

        if (test_gemm) {
            rocblas_handle handle = GetCublasHandle();

            std::unique_ptr<GemmTest<__half>> test_qkv_fw(
                new GemmTest<__half>(batch_size * seq_len,      // M
                                     head_num * size_per_head,  // N
                                     head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_inter(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     4 * head_num * size_per_head,  // N
                                     head_num * size_per_head,      // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<GemmTest<__half>> test_output(
                new GemmTest<__half>(batch_size * seq_len,          // M
                                     head_num * size_per_head,      // N
                                     4 * head_num * size_per_head,  // K
                                     rocblas_operation_transpose,
                                     rocblas_operation_none,
                                     handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_scores(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            seq_len,                // M
                                            seq_len,                // N
                                            size_per_head,          // K
                                            rocblas_operation_transpose,
                                            rocblas_operation_none,
                                            handle));

            std::unique_ptr<StridedGemmTest<__half>> test_attn_context(
                new StridedGemmTest<__half>(batch_size * head_num,  // batch
                                            size_per_head,          // M
                                            seq_len,                // N
                                            seq_len,                // K
                                            rocblas_operation_none,
                                            rocblas_operation_none,
                                            handle));

            _gemm_algos.push_back(test_qkv_fw->TestAlgo(100));
            _gemm_algos.push_back(test_inter->TestAlgo(100));
            _gemm_algos.push_back(test_output->TestAlgo(100));
            _gemm_algos.push_back(test_attn_scores->TestAlgo(100));
            _gemm_algos.push_back(test_attn_context->TestAlgo(100));
        } else {
            // Use default algo.
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
            _gemm_algos.push_back(std::array<int, 3>({99, 99, 99}));
        }
    }

    const std::vector<std::array<int, 3>>& GetGemmAlgos() const { return _gemm_algos; }

private:
    hiprandGenerator_t _gen;
    rocblas_handle _cublasHandle;
    void* _workspace;
    uint64_t _seed;
    uint64_t _curr_offset;
    std::vector<std::array<int, 3>> _gemm_algos;
};
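A short usage sketch for the singleton (illustration only; batch/seq_len/heads/head_dim are hypothetical placeholders):

rocblas_handle blas = Context::Instance().GetCublasHandle();
hipStream_t stream = Context::Instance().GetCurrentStream();  // PyTorch's HIP stream
Context::Instance().TestGemmFP16(/*test_gemm=*/false, batch, seq_len, heads, head_dim);
const auto& algos = Context::Instance().GetGemmAlgos();  // five {fw, bw1, bw2} triples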
deepspeed/ops/csrc/includes/hip/cpu_adam.h  0 → 100644
#pragma once
#include <cpuid.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime_api.h>
#include <stdio.h>
#include <x86intrin.h>
#include <cassert>
#include "context.h"
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "hiprand.h"
#define CUDA_CHECK(callstr) \
{ \
hipError_t error_code = callstr; \
if (error_code != hipSuccess) { \
std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
assert(0); \
} \
}
#define TILE (1024 * 1024 * 1024)
#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_WIDTH 16
#else
#if defined(__AVX256__)
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_WIDTH 8
#endif
#endif
class Adam_Optimizer {
public:
    Adam_Optimizer(float alpha = 1e-3,
                   float betta1 = 0.9,
                   float betta2 = 0.999,
                   float eps = 1e-8,
                   float weight_decay = 0,
                   bool adamw_mode = true)
        : _alpha(alpha),
          _betta1(betta1),
          _betta2(betta2),
          _eps(eps),
          _weight_decay(weight_decay),
          _betta1_t(1.0),
          _betta2_t(1.0),
          _step(0),
          _buf_index(false),
          _adamw_mode(adamw_mode)
    {
        hipHostMalloc((void**)_doubled_buffer, TILE * sizeof(float));
        hipHostMalloc((void**)(_doubled_buffer + 1), TILE * sizeof(float));

        _streams[0] = Context::Instance().GetCurrentStream();
        _streams[1] = Context::Instance().GetNewStream();
    }
    ~Adam_Optimizer()
    {
        hipHostFree(_doubled_buffer[0]);
        hipHostFree(_doubled_buffer[1]);
    }
    void Step(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
              size_t param_size, __half* dev_param = nullptr);
    void Step_4(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sa,
                size_t param_size, __half* dev_param = nullptr);
    void Step_8(float* _params, float* grads, float* _exp_avg, float* _exp_avg_sq,
                size_t _param_size, __half* dev_params = nullptr);
    inline void SynchronizeStreams()
    {
        for (int i = 0; i < 2; i++) hipStreamSynchronize(_streams[i]);
    }
    inline void IncrementStep(size_t step, float beta1, float beta2)
    {
        if (beta1 != _betta1 || beta2 != _betta2) {
            _step = step;
            _betta1 = beta1;
            _betta2 = beta2;
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
        } else {
            _step++;
            if (_step != step) {
                _betta1_t = std::pow(_betta1, step);
                _betta2_t = std::pow(_betta2, step);
                _step = step;
            } else {
                _betta1_t *= _betta1;
                _betta2_t *= _betta2;
            }
        }
    }
    inline void update_state(float lr, float epsilon, float weight_decay, bool bias_correction)
    {
        _alpha = lr;
        _eps = epsilon;
        _weight_decay = weight_decay;

        _bias_correction1 = 1.0f;
        _bias_correction2 = 1.0f;
        if (bias_correction == 1) {
            _bias_correction1 = 1 - _betta1_t;
            _bias_correction2 = 1 / sqrt(1 - _betta2_t);
        }
    }

private:
#if defined(__AVX512__) or defined(__AVX256__)
    union AVX_Data {
#if defined(__AVX512__)
        __m512 data;
#else
        __m256 data;
#endif
        // float data_f[16];
    };
#endif

    float _alpha;
    float _betta1;
    float _betta2;
    float _eps;
    float _weight_decay;

    float _betta1_t;
    float _betta2_t;
    size_t _step;

    float _bias_correction1;
    float _bias_correction2;

    float* _doubled_buffer[2];
    bool _buf_index;
    bool _adamw_mode;

    hipStream_t _streams[2];
};
deepspeed/ops/csrc/includes/hip/cublas_wrappers.h  0 → 100644
#pragma once
#include <assert.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#include <mma.h>
#include <stdio.h>
int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa,
                   rocblas_operation transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const float* A,
                   const float* B,
                   float* C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa,
                   rocblas_operation transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const __half* A,
                   const __half* B,
                   __half* C,
                   cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);

int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const float* A,
                                const float* B,
                                float* C,
                                rocblas_operation op_A,
                                rocblas_operation op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT);

int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const __half* A,
                                const __half* B,
                                __half* C,
                                rocblas_operation op_A,
                                rocblas_operation op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT_TENSOR_OP);
deepspeed/ops/csrc/includes/hip/custom_hip_layers.h  0 → 100644
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cooperative_groups.h>
#include <hiprand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
#define MAX_THREADS 1024
#define THREADS 256
#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32
// Maximum sequence-length support based on the number of threads (2048) allowed in each block and
// this MAX is 8K For higher sequence length we need to use higher Max, like for 64K : 32
#define MAX_THREAD_ITERATIONS 8 // Maximum 8K
#define MAX_WARP_NUM 32
#define MAX_REGISTERS 256
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input, const T* bias, T* output,
                      int intermediate_size, int batch_size, hipStream_t stream);

template <typename T>
void launch_gelu(const T* input, T* output,
                 int intermediate_size, int batch_size, hipStream_t stream);

template <typename T>
void launch_d_gelu(T* d_output, const T* input, const T* bias,
                   int intermediate_size, int batch_size, hipStream_t stream);

// Custom fused bias add with layer normalization
template <typename T>
void launch_bias_residual_layer_norm(T* vals, const T* residual, const T* gamma, const T* beta,
                                     float epsilon, int batch_size, int hidden_dim,
                                     hipStream_t stream, bool preLayerNorm, bool training,
                                     T* vars, T* means);

template <typename T>
void launch_bias_residual_layer_norm(T* vals, const T* residual, const T* gamma, const T* beta,
                                     float epsilon, int batch_size, int hidden_dim,
                                     hipStream_t stream, bool preLayerNorm, bool training,
                                     T* vars);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1, const T* out_grad2, const T* X_data,
                                         const T* vars, const T* means, const T* gamma,
                                         T* gamma_grad, T* betta_grad, T* inp_grad,
                                         int batch_size, int hidden_dim, hipStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1, const T* out_grad2,
                                         const T* vals_hat, const T* vars, const T* gamma,
                                         T* gamma_grad, T* betta_grad, T* inp_grad,
                                         int batch_size, int hidden_dim, hipStream_t stream[2],
                                         bool invertible = false, const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward(const T* out_grad, const T* X_data, const T* vars, const T* means,
                               const T* gamma, T* gamma_grad, T* betta_grad, T* inp_grad,
                               int batch_size, int hidden_dim, hipStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward(const T* out_grad, const T* vals_hat, const T* vars,
                               const T* gamma, T* gamma_grad, T* betta_grad, T* inp_grad,
                               int batch_size, int hidden_dim, hipStream_t stream[2],
                               bool invertible = false, const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad, const T* vals,
                                           const T* out_grad_trans, const T* vals_trans,
                                           const T* means, const T* vars, const T* gamma,
                                           T* gamma_grad, T* betta_grad, T* inp_grad,
                                           int batch_size, int hidden_dim,
                                           hipStream_t stream[2]);

template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, hipStream_t stream);

template <typename T>
void launch_attn_softmax_backward(T* out_grad, const T* soft_inp, int batch_size, int heads,
                                  int seq_length, hipStream_t stream);

template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad, const T* soft_inp, int batch_size, int heads,
                                     int seq_length, hipStream_t stream);

// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals, const T* attn_mask, int batch_size, int heads,
                         int sequence_length, hipStream_t stream);

template <typename T>
void launch_transform_0213(T* output, const T* vals, int batch_size, int seq_length,
                           int hidden_dim, int heads, hipStream_t stream);

// Custom bias add
template <typename T>
void launch_bias_add_transform_0213(T* outputs, const T* vals, const T* bias, int batch_size,
                                    int seq_length, int hidden_dim, int heads,
                                    hipStream_t stream, int trans_count);

// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out, const T* in, int batch_size, int heads, int seq_length,
                             int hidden_dim, hipStream_t stream, int trans_count);

template <typename T>
void launch_dropout(T* vals, const T* bias, uint8_t* mask, int batch, int dim, float ratio,
                    hipStream_t stream);

template <typename T>
void launch_dropout(T* vals_out, const T* vals, uint8_t* mask, int total_count, int dim,
                    float ratio, hipStream_t stream, bool bwd = false);

template <typename T>
void launch_dropout(T* out, const T* vals, const T* residual, const T* bias, uint8_t* mask,
                    int batch, int dim, float ratio, hipStream_t stream);

template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio,
                         hipStream_t stream);

template <typename T>
void launch_dropout_grad(T* vals_out, const T* vals, uint8_t* mask, int total_count,
                         float ratio, hipStream_t stream);

template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp, T* out, int rows, int cols,
                                       hipStream_t stream);

void launch_param_update(const float* input, __half* output, int size, hipStream_t stream);
deepspeed/ops/csrc/includes/hip/dropout.h  0 → 100644
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
template <typename T>
class Dropout {
public:
    struct Config {
        float ratio;
        uint32_t dim;
        bool training;

        Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}

        float RATIO() const { return training ? ratio : 0.0; }
        inline void SetDim(uint32_t d) { dim = d; }
    };

    Dropout(const Config& config) : _config(config), _mask(nullptr) {}

    virtual ~Dropout() {}

    void Forward(int bsz, T* out, const T* vals, hipStream_t stream, bool bwd = false)
    {
        launch_dropout<T>(
            out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
    }

    void ForwardWithBias(int bsz, T* vals, const T* bias, hipStream_t stream)
    {
        launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void ForwardWithBias(int bsz, T* out, const T* vals, const T* residual, const T* bias,
                         hipStream_t stream)
    {
        launch_dropout<T>(
            out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals, hipStream_t stream)
    {
        launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    void Backward(int bsz, T* d_vals_out, const T* d_vals, hipStream_t stream)
    {
        launch_dropout_grad<T>(
            d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    bool HasDropout() const { return _config.RATIO() > 0.0; }

    void SetTrainingMode(bool training) { _config.training = training; }

    void SetMask(uint8_t* mask)
    {
        if (!mask) { throw std::runtime_error("Dropout mask is null."); }
        _mask = mask;
    }

    Config GetConfig() const { return _config; }

    inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }

private:
    uint8_t* _mask;
    Config _config;
};
deepspeed/ops/csrc/includes/hip/ds_transformer_hip.h  0 → 100644
#pragma once
#include <hip/hip_runtime_api.h>
#include <hiprand.h>
#include <memory>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "dropout.h"
#include "feed_forward.h"
#include "gelu.h"
#include "general_kernels.h"
#include "normalize_layer.h"
#include "softmax.h"
#include "strided_batch_gemm.h"
struct BertGemmAlgos {
    int m_gemm_qkv_algo;
    int m_gemm_inter_algo;
    int m_gemm_output_algo;
    int m_gemm_batch1_algo;
    int m_gemm_batch2_algo;

    BertGemmAlgos()
        : m_gemm_qkv_algo(-1),
          m_gemm_inter_algo(-1),
          m_gemm_output_algo(-1),
          m_gemm_batch1_algo(-1),
          m_gemm_batch2_algo(-1)
    {
    }
};

template <typename T>
class BertTransformerLayer {
public:
    BertTransformerLayer(int layer_id, int batch_size, int hidden_size, int num_heads,
                         int intermediate_size, int seq_length, float attn_dropout_ratio,
                         float hidden_output_dropout_ratio, float layer_norm_eps,
                         bool pre_or_postLayerNorm,
                         const std::vector<std::array<int, 3>>& gemm_algos,
                         bool attn_dropout_checkpoint, bool normalize_invertible,
                         bool gelu_checkpoint, bool stochastic_mode);

    virtual ~BertTransformerLayer();

    void Forward(int bsz, const T* input_ptr, const T* input_mask_ptr,
                 const T* attn_qkvw_ptr, const T* attn_qkvb_ptr,
                 const T* attn_ow_ptr, const T* attn_ob_ptr,
                 const T* attn_nw_ptr, const T* attn_nb_ptr,
                 const T* inter_w_ptr, const T* inter_b_ptr,
                 const T* output_w_ptr, const T* output_b_ptr,
                 const T* norm_w_ptr, const T* norm_b_ptr,
                 T* out_ptr, T* inp_norm_ptr,
                 T* q_tf_ptr, T* k_tf_ptr, T* v_tf_ptr,
                 T* softmax_output_ptr, T* ctx_bufB_ptr, T* attn_o_inp_ptr,
                 T* add_res_ptr, T* ff1_inp_ptr, T* gelu_inp_ptr, T* ff2_inp_ptr);

    void Backward(int bsz, const T* grad_output_ptr, const T* input_ptr, const T* output_ptr,
                  const T* inp_norm_ptr,
                  const T* q_tf_ptr, const T* k_tf_ptr, const T* v_tf_ptr,
                  const T* softmax_output_ptr, const T* ctx_bufB_ptr, const T* attn_o_inp_ptr,
                  const T* add_res_ptr, const T* ff1_inp_ptr, const T* gelu_inp_ptr,
                  const T* ff2_inp_ptr, const T* input_mask_ptr,
                  const T* attn_qkvw_ptr, const T* attn_ow_ptr,
                  const T* attn_nw_ptr, const T* attn_nb_ptr,
                  const T* inter_w_ptr, const T* inter_b_ptr,
                  const T* output_w_ptr, const T* norm_w_ptr, const T* norm_b_ptr,
                  T* grad_input_ptr,
                  T* grad_attn_qkvw_ptr, T* grad_attn_qkvb_ptr,
                  T* grad_attn_ow_ptr, T* grad_attn_ob_ptr,
                  T* grad_attn_nw_ptr, T* grad_attn_nb_ptr,
                  T* grad_inter_w_ptr, T* grad_inter_b_ptr,
                  T* grad_output_w_ptr, T* grad_output_b_ptr,
                  T* grad_norm_w_ptr, T* grad_norm_b_ptr);

    void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
                                uint8_t* attn_output_dropout_mask_ptr,
                                uint8_t* layer_output_dropout_mask_ptr,
                                T* layer_norm_var, T* layer_norm_mean,
                                T* attn_layer_norm_var, T* attn_layer_norm_mean);

    inline int GetBatchSize() const { return _batch_size; }
    inline int GetNumHeads() const { return _heads; }
    inline int GetSeqLength() const { return _seq_length; }
    inline int GetIntermediateSize() const { return _intermediate_size; }

    void SetSeqLength(int seq_len);
    inline int GetHiddenSize() const { return _hidden_size; }
    void SetTrainingMode(bool training);
    inline bool IsTrainingMode() const { return _training; }
    inline bool GeluCheckpoint() const { return _gelu_checkpoint; }

private:
    void Initialize();
    size_t getWorkspaceSize(int maxBatchSize) const;

    // Params
    int _layer_id;
    int _batch_size;
    int _hidden_size;
    int _heads;
    int _size_per_head;
    int _intermediate_size;
    int _seq_length;

    bool _pre_or_postLayerNorm;

    rocblas_handle _cublasHandle;
    hipStream_t _stream;

    // layers
    FeedForward<T> _qkv_linear;
    FeedForward<T> _attn_out_linear;
    Normalize_Layer<T> _attn_layer_norm;
    Normalize_Layer<T> _layer_norm;
    Normalize_Layer<T>* _last_normalize;
    FeedForward<T> _ff1, _ff2;
    Softmax<T> _softmax;
    Gelu<T> _gelu;
    Dropout<T> _attn_prob_dropout;
    Dropout<T> _attn_output_dropout;
    Dropout<T> _layer_output_dropout;
    StridedBatchGemm<T> _attn_scores;
    StridedBatchGemm<T> _attn_context;

    bool _training;

    // Memory saving flags
    bool _attn_dropout_checkpoint;
    bool _normalize_invertible;
    bool _gelu_checkpoint;

    // High-performance flags
    bool _stochastic_mode;
};
deepspeed/ops/csrc/includes/hip/feed_forward.h  0 → 100644
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
template <typename T>
class FeedForward {
public:
    struct Config {
        int batchSize, outputSize;
        int inputSize;
        std::array<int, 3> gemm_algos;
        Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
            : batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
        {
        }
    };

    FeedForward(Config config) : config_(config) {}

    ~FeedForward() {}

    void Forward(int bsz, const T* input_ptr, const T* weights, T* out,
                 rocblas_handle& _cublasHandle)
    {
        float alpha = T(1.);
        float beta = T(0.);

        cublas_gemm_ex(_cublasHandle, rocblas_operation_transpose, rocblas_operation_none,
                       config_.outputSize, bsz, config_.inputSize,
                       &alpha, &beta, weights, input_ptr, out,
                       cublasGemmAlgo_t(config_.gemm_algos[0]));
    }

    void Backward(int bsz, const T* out_grad, const T* input_ptr, const T* weights,
                  T* weights_grad, T* bias_grad, rocblas_handle& _cublasHandle,
                  hipStream_t& stream, T* inp_grad_out = nullptr,
                  T* out_grad_trans_out = nullptr)
    {
        float alpha = (T)1.0, beta = (T)0.0;
        cublas_gemm_ex(_cublasHandle, rocblas_operation_none, rocblas_operation_transpose,
                       config_.inputSize, config_.outputSize, bsz,
                       &alpha, &beta, input_ptr, out_grad, weights_grad,
                       cublasGemmAlgo_t(config_.gemm_algos[1]));

        cublas_gemm_ex(_cublasHandle, rocblas_operation_none, rocblas_operation_none,
                       config_.inputSize, bsz, config_.outputSize,
                       &alpha, &beta, weights, out_grad, inp_grad_out,
                       cublasGemmAlgo_t(config_.gemm_algos[2]));

        launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize,
                                             stream);
    }

private:
    Config config_;
};
#endif
deepspeed/ops/csrc/includes/hip/gelu.h  0 → 100644
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
template <typename T>
class Gelu {
public:
    struct Config {
        uint32_t intermediate_size;
        Config(uint32_t inter_size) : intermediate_size(inter_size) {}
    };

    Gelu(const Config& config) : _config(config) {}

    virtual ~Gelu() {}

    void ForwardWithBiasAdd(int bsz, const T* input_buf, const T* bias, T* output,
                            hipStream_t stream)
    {
        launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
    }

    void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, hipStream_t stream)
    {
        launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
    }

private:
    Config _config;
};
deepspeed/ops/csrc/includes/hip/gemm_test.h  0 → 100644
#pragma once
#include <hip/hip_fp16.h>
#include <cuda_profiler_api.h>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <memory>
#include "StopWatch.h"
#include "cublas_wrappers.h"
template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
    if (result) {
        std::cout << (std::string("CUDA runtime error: ") + +file + ":" +
                      std::to_string(line) + " \n");
    }
}
#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
class GemmTest {
public:
    GemmTest(int m, int n, int k, rocblas_operation ta, rocblas_operation tb, rocblas_handle h)
        : M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K));
        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N));
        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N));
    }

    ~GemmTest()
    {
        check_cuda_error(hipFree(A));
        check_cuda_error(hipFree(B));
        check_cuda_error(hipFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle, rocblas_operation_transpose, rocblas_operation_none,
                           N, M, K, &alpha, &beta, B, A, C,
                           static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle, rocblas_operation_none, rocblas_operation_transpose,
                           K, N, M, &alpha, &beta, A, C, B,
                           static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle, rocblas_operation_none, rocblas_operation_none,
                           K, M, N, &alpha, &beta, B, C, A,
                           static_cast<cublasGemmAlgo_t>(algo));
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            hipDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            hipDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int M, N, K;
    rocblas_handle handle;
    rocblas_operation transa, transb;
    T *A, *B, *C;
};

template <typename T>
class StridedGemmTest {
public:
    StridedGemmTest(int b, int m, int n, int k, rocblas_operation ta, rocblas_operation tb,
                    rocblas_handle h)
        : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K * bsz));
        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N * bsz));
        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N * bsz));
    }

    ~StridedGemmTest()
    {
        check_cuda_error(hipFree(A));
        check_cuda_error(hipFree(B));
        check_cuda_error(hipFree(C));
    }

    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            int stride_a = M * K;
            int stride_b = N * K;
            int stride_c = M * N;

            cublas_strided_batched_gemm(handle, M, N, K, &alpha, &beta, A, B, C,
                                        transa, transb, stride_a, stride_b, stride_c,
                                        bsz, static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            int mb = (transa == rocblas_operation_transpose ? K : M);
            int kb = (transa == rocblas_operation_transpose ? M : K);

            int stride_a = mb * N;
            int stride_b = N * kb;
            int stride_c = M * K;

            // B needs to be transposed.
            rocblas_operation op_b = (transb == rocblas_operation_transpose
                                          ? rocblas_operation_none
                                          : rocblas_operation_transpose);

            // Calculate d_A.
            cublas_strided_batched_gemm(handle, mb, kb, N, &alpha, &beta,
                                        (transa == rocblas_operation_transpose ? B : C),
                                        (transa == rocblas_operation_transpose ? C : B),
                                        A, rocblas_operation_none, op_b,
                                        stride_a, stride_b, stride_c, bsz,
                                        static_cast<cublasGemmAlgo_t>(algo));
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            // A needs to be transposed.
            rocblas_operation op_a = (transa == rocblas_operation_transpose
                                          ? rocblas_operation_none
                                          : rocblas_operation_transpose);

            int stride_a = M * K;
            int stride_b = M * N;
            int stride_c = N * K;

            // Calculate d_B.
            cublas_strided_batched_gemm(handle, K, N, M, &alpha, &beta, A, C, B,
                                        op_a, rocblas_operation_none,
                                        stride_a, stride_b, stride_c, bsz,
                                        static_cast<cublasGemmAlgo_t>(algo));
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);

            hipDeviceSynchronize();
            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);

            hipDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;

            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);

        return fast_algo;
    }

private:
    int bsz, M, N, K;
    rocblas_handle handle;
    rocblas_operation transa, transb;
    T *A, *B, *C;
};