Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
deepspeed
Commits
67ea635f
Commit
67ea635f
authored
Mar 30, 2023
by
aiss
Browse files
push dsv0.8.2 version
parent
1b2721ad
Pipeline
#201
failed with stages
in 0 seconds
Changes
341
Pipelines
2
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
0 additions
and
2550 deletions
+0
-2550
deepspeed/ops/csrc/includes/custom_hip_layers.h
deepspeed/ops/csrc/includes/custom_hip_layers.h
+0
-304
deepspeed/ops/csrc/includes/dropout.h
deepspeed/ops/csrc/includes/dropout.h
+0
-76
deepspeed/ops/csrc/includes/dropout_hip.h
deepspeed/ops/csrc/includes/dropout_hip.h
+0
-77
deepspeed/ops/csrc/includes/ds_transformer_cuda.h
deepspeed/ops/csrc/includes/ds_transformer_cuda.h
+0
-184
deepspeed/ops/csrc/includes/ds_transformer_hip.h
deepspeed/ops/csrc/includes/ds_transformer_hip.h
+0
-185
deepspeed/ops/csrc/includes/feed_forward.h
deepspeed/ops/csrc/includes/feed_forward.h
+0
-105
deepspeed/ops/csrc/includes/feed_forward_hip.h
deepspeed/ops/csrc/includes/feed_forward_hip.h
+0
-106
deepspeed/ops/csrc/includes/gelu.h
deepspeed/ops/csrc/includes/gelu.h
+0
-36
deepspeed/ops/csrc/includes/gelu_hip.h
deepspeed/ops/csrc/includes/gelu_hip.h
+0
-37
deepspeed/ops/csrc/includes/gemm_test.h
deepspeed/ops/csrc/includes/gemm_test.h
+0
-327
deepspeed/ops/csrc/includes/gemm_test_hip.h
deepspeed/ops/csrc/includes/gemm_test_hip.h
+0
-328
deepspeed/ops/csrc/includes/general_kernels.h
deepspeed/ops/csrc/includes/general_kernels.h
+0
-51
deepspeed/ops/csrc/includes/general_kernels_hip.h
deepspeed/ops/csrc/includes/general_kernels_hip.h
+0
-52
deepspeed/ops/csrc/includes/normalize_layer.h
deepspeed/ops/csrc/includes/normalize_layer.h
+0
-202
deepspeed/ops/csrc/includes/normalize_layer_hip.h
deepspeed/ops/csrc/includes/normalize_layer_hip.h
+0
-203
deepspeed/ops/csrc/includes/quantizer.h
deepspeed/ops/csrc/includes/quantizer.h
+0
-9
deepspeed/ops/csrc/includes/quantizer_hip.h
deepspeed/ops/csrc/includes/quantizer_hip.h
+0
-10
deepspeed/ops/csrc/includes/simd.h
deepspeed/ops/csrc/includes/simd.h
+0
-137
deepspeed/ops/csrc/includes/softmax.h
deepspeed/ops/csrc/includes/softmax.h
+0
-60
deepspeed/ops/csrc/includes/softmax_hip.h
deepspeed/ops/csrc/includes/softmax_hip.h
+0
-61
No files found.
Too many changes to show.
To preserve performance only
341 of 341+
files are displayed.
Plain diff
Email patch
deepspeed/ops/csrc/includes/custom_hip_layers.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#pragma once

#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>

#include <cassert>   // CUDA_CHECK uses assert()
#include <iostream>  // CUDA_CHECK uses std::cerr

#ifdef __HIP_PLATFORM_HCC__
// Half precision is always available on the HIP/ROCm platform.
// FIX: the macro previously expanded to "= 1" (stray '='), which would break
// any "#if HALF_PRECISION_AVAILABLE" test; "#ifdef" tests are unaffected.
#define HALF_PRECISION_AVAILABLE 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
// Half precision requires compute capability 7.0 (Volta) or newer on CUDA.
#define HALF_PRECISION_AVAILABLE 1
#endif
#include <cooperative_groups.h>
#endif
#include <hiprand/hiprand_kernel.h>

#include "context_hip.h"
#include "cublas_wrappers_hip.h"

// Abort on any HIP runtime error, reporting the failing file/line.
// Wrapped in do { } while (0) so the macro behaves as a single statement
// (safe inside an unbraced if/else).
#define CUDA_CHECK(callstr)                                                                    \
    do {                                                                                       \
        hipError_t error_code = callstr;                                                       \
        if (error_code != hipSuccess) {                                                        \
            std::cerr << "CUDA error " << error_code << " at " << __FILE__ << ":" << __LINE__; \
            assert(0);                                                                         \
        }                                                                                      \
    } while (0)

#define MAX_THREADS 1024
#define THREADS 256

#define MAX_THREAD_STRIDE 32
#define TILE_DIM 32

// Maximum sequence-length support based on the number of threads (2048) allowed in each block and
// this MAX is 8K For higher sequence length we need to use higher Max, like for 64K : 32
#define MAX_THREAD_ITERATIONS 8  // Maximum 8K
#define MAX_WARP_NUM 32
#define MAX_REGISTERS 256

#define MAX_REG 256

#define WARP_SIZE_BITS 5
// Quantization kernel launchers (kernels defined in the corresponding .cu/.hip
// translation units). Each takes `total_count` elements of `vals`, split into
// `group_num` groups, quantized to `num_bits` bits, launched async on `stream`.
// The single non-const buffer suggests in-place operation — confirm in kernels.
// NOTE(review): presumably `sr_` = stochastic rounding and `_asym` = asymmetric
// quantization — verify against the kernel implementations.

template <typename T>
void launch_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);

template <typename T>
void launch_sr_quantize_kernel(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);

template <typename T>
void launch_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);

template <typename T>
void launch_sr_quantize_kernel_asym(T* vals, int total_count, int group_num, int num_bits, hipStream_t stream);
// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(const T* input,
                      const T* bias,
                      T* output,
                      int intermediate_size,
                      int batch_size,
                      hipStream_t stream);

// Plain gelu activation (no bias fusion).
template <typename T>
void launch_gelu(const T* input,
                 T* output,
                 int intermediate_size,
                 int batch_size,
                 hipStream_t stream);

// Gelu backward; `d_output` is non-const, so the gradient is presumably
// updated in place — confirm against the kernel implementation.
template <typename T>
void launch_d_gelu(T* d_output,
                   const T* input,
                   const T* bias,
                   int intermediate_size,
                   int batch_size,
                   hipStream_t stream);
// Custom fused bias add with layer normalization
// The first overload also writes the per-row means (`means`) in addition to
// variances (`vars`); the second overload omits `means`.
template <typename T>
void launch_bias_residual_layer_norm(T* vals,
                                     const T* residual,
                                     const T* gamma,
                                     const T* beta,
                                     float epsilon,
                                     int batch_size,
                                     int hidden_dim,
                                     hipStream_t stream,
                                     bool preLayerNorm,
                                     bool training,
                                     T* vars,
                                     T* means);

template <typename T>
void launch_bias_residual_layer_norm(T* vals,
                                     const T* residual,
                                     const T* gamma,
                                     const T* beta,
                                     float epsilon,
                                     int batch_size,
                                     int hidden_dim,
                                     hipStream_t stream,
                                     bool preLayerNorm,
                                     bool training,
                                     T* vars);
// Layer-norm backward launchers. Each takes two streams (`stream[2]`),
// presumably so the gamma/betta reductions and the input gradient can be
// launched concurrently — confirm in the kernel code.
// `_fused_add` variants take two upstream gradients (`out_grad1`, `out_grad2`).
// Overloads taking `vals_hat` (instead of `X_data` + `means`) support the
// "invertible" path: when `invertible` is true, `betta` must be supplied.

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
                                         const T* out_grad2,
                                         const T* X_data,
                                         const T* vars,
                                         const T* means,
                                         const T* gamma,
                                         T* gamma_grad,
                                         T* betta_grad,
                                         T* inp_grad,
                                         int batch_size,
                                         int hidden_dim,
                                         hipStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward_fused_add(const T* out_grad1,
                                         const T* out_grad2,
                                         const T* vals_hat,
                                         const T* vars,
                                         const T* gamma,
                                         T* gamma_grad,
                                         T* betta_grad,
                                         T* inp_grad,
                                         int batch_size,
                                         int hidden_dim,
                                         hipStream_t stream[2],
                                         bool invertible = false,
                                         const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward(const T* out_grad,
                               const T* X_data,
                               const T* vars,
                               const T* means,
                               const T* gamma,
                               T* gamma_grad,
                               T* betta_grad,
                               T* inp_grad,
                               int batch_size,
                               int hidden_dim,
                               hipStream_t stream[2]);

template <typename T>
void launch_layerNorm_backward(const T* out_grad,
                               const T* vals_hat,
                               const T* vars,
                               const T* gamma,
                               T* gamma_grad,
                               T* betta_grad,
                               T* inp_grad,
                               int batch_size,
                               int hidden_dim,
                               hipStream_t stream[2],
                               bool invertible = false,
                               const T* betta = nullptr);

template <typename T>
void launch_layerNorm_backward_nreversible(const T* out_grad,
                                           const T* vals,
                                           const T* out_grad_trans,
                                           const T* vals_trans,
                                           const T* means,
                                           const T* vars,
                                           const T* gamma,
                                           T* gamma_grad,
                                           T* betta_grad,
                                           T* inp_grad,
                                           int batch_size,
                                           int hidden_dim,
                                           hipStream_t stream[2]);
// Generic rows x cols matrix transpose.
template <typename T>
void Transpose(const T* inp_mat, T* out_mat, int rows, int cols, hipStream_t stream);

// Attention-softmax backward; `out_grad` is non-const and is presumably
// overwritten with the result — confirm against the kernel.
template <typename T>
void launch_attn_softmax_backward(T* out_grad,
                                  const T* soft_inp,
                                  int batch_size,
                                  int heads,
                                  int seq_length,
                                  hipStream_t stream);

template <typename T>
void launch_attn_softmax_backward_v2(T* out_grad,
                                     const T* soft_inp,
                                     int batch_size,
                                     int heads,
                                     int seq_length,
                                     hipStream_t stream);

// Custom softmax with scaling and attention mask addition
template <typename T>
void launch_attn_softmax(T* vals,
                         const T* attn_mask,
                         int batch_size,
                         int heads,
                         int sequence_length,
                         hipStream_t stream);

// Dimension permutation [0, 1, 2, 3] -> [0, 2, 1, 3] of `vals` into `output`.
template <typename T>
void launch_transform_0213(T* output,
                           const T* vals,
                           int batch_size,
                           int seq_length,
                           int hidden_dim,
                           int heads,
                           hipStream_t stream);

// Custom bias add
template <typename T>
void launch_bias_add_transform_0213(T* outputs,
                                    const T* vals,
                                    const T* bias,
                                    int batch_size,
                                    int seq_length,
                                    int hidden_dim,
                                    int heads,
                                    hipStream_t stream,
                                    int trans_count);

// 4D transform [0, 1, 2, 3] -> [0, 2, 1, 3]
template <typename T>
void launch_transform4d_0213(T* out,
                             const T* in,
                             int batch_size,
                             int heads,
                             int seq_length,
                             int hidden_dim,
                             hipStream_t stream,
                             int trans_count);
// Dropout forward over `vals`, fusing a bias add; dropout decisions are
// recorded in `mask` (one byte per element).
template <typename T>
void launch_dropout(T* vals,
                    const T* bias,
                    uint8_t* mask,
                    int batch,
                    int dim,
                    float ratio,
                    hipStream_t stream);

// Out-of-place dropout; `bwd` selects the backward (mask-apply) path.
template <typename T>
void launch_dropout(T* vals_out,
                    const T* vals,
                    uint8_t* mask,
                    int total_count,
                    int dim,
                    float ratio,
                    hipStream_t stream,
                    bool bwd = false);

// Dropout fused with residual and bias adds.
template <typename T>
void launch_dropout(T* out,
                    const T* vals,
                    const T* residual,
                    const T* bias,
                    uint8_t* mask,
                    int batch,
                    int dim,
                    float ratio,
                    hipStream_t stream);

// In-place dropout backward: re-applies `mask` to `vals`.
template <typename T>
void launch_dropout_grad(T* vals, uint8_t* mask, int total_count, float ratio, hipStream_t stream);

// Out-of-place dropout backward.
template <typename T>
void launch_dropout_grad(T* vals_out,
                         const T* vals,
                         uint8_t* mask,
                         int total_count,
                         float ratio,
                         hipStream_t stream);

// Transposes `inp` (rows x cols) while reducing into a bias gradient `out`.
template <typename T>
void launch_fuse_transpose_bias_kernel(const T* inp,
                                       T* out,
                                       int rows,
                                       int cols,
                                       hipStream_t stream);

// fp32 -> fp16 parameter copy used by the mixed-precision optimizer path.
void launch_param_update(const float* input, __half* output, int size, hipStream_t stream);
void launch_param_update_half(const float* input, __half* output, int size, hipStream_t stream);
deepspeed/ops/csrc/includes/dropout.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
// Host-side wrapper around the CUDA dropout kernels. Holds the dropout
// configuration plus a caller-owned mask buffer (set via SetMask).
template <typename T>
class Dropout {
public:
    struct Config {
        float ratio;    // requested drop probability
        uint32_t dim;   // trailing (per-row) dimension of the input
        bool training;  // when false, RATIO() reports 0 and dropout is a no-op

        Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}

        // Effective drop ratio: zero outside of training mode.
        float RATIO() const { return training ? ratio : 0.0; }

        inline void SetDim(uint32_t d) { dim = d; }
    };

    Dropout(const Config& config) : _config(config), _mask(nullptr) {}

    virtual ~Dropout() {}

    // Out-of-place dropout over bsz * dim elements; `bwd` selects the
    // backward (mask re-apply) kernel path.
    void Forward(int bsz, T* out, const T* vals, cudaStream_t stream, bool bwd = false)
    {
        launch_dropout<T>(
            out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
    }

    // In-place dropout fused with a bias add.
    void ForwardWithBias(int bsz, T* vals, const T* bias, cudaStream_t stream)
    {
        launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    // Out-of-place dropout fused with residual and bias adds.
    void ForwardWithBias(int bsz,
                         T* out,
                         const T* vals,
                         const T* residual,
                         const T* bias,
                         cudaStream_t stream)
    {
        launch_dropout<T>(
            out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    // In-place backward: applies the stored mask to d_vals.
    void Backward(int bsz, T* d_vals, cudaStream_t stream)
    {
        launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    // Out-of-place backward.
    void Backward(int bsz, T* d_vals_out, const T* d_vals, cudaStream_t stream)
    {
        launch_dropout_grad<T>(
            d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    bool HasDropout() const { return _config.RATIO() > 0.0; }

    void SetTrainingMode(bool training) { _config.training = training; }

    // Install the (externally allocated) mask buffer; rejects null.
    void SetMask(uint8_t* mask)
    {
        if (!mask) { throw std::runtime_error("Dropout mask is null."); }
        _mask = mask;
    }

    Config GetConfig() const { return _config; }

    inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }

private:
    uint8_t* _mask;
    Config _config;
};
deepspeed/ops/csrc/includes/dropout_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
// Hipified twin of dropout.h: identical logic with hipStream_t in place of
// cudaStream_t. Holds the dropout configuration plus a caller-owned mask.
template <typename T>
class Dropout {
public:
    struct Config {
        float ratio;    // requested drop probability
        uint32_t dim;   // trailing (per-row) dimension of the input
        bool training;  // when false, RATIO() reports 0 and dropout is a no-op

        Config(float r, uint32_t d) : ratio(r), dim(d), training(true) {}

        // Effective drop ratio: zero outside of training mode.
        float RATIO() const { return training ? ratio : 0.0; }

        inline void SetDim(uint32_t d) { dim = d; }
    };

    Dropout(const Config& config) : _config(config), _mask(nullptr) {}

    virtual ~Dropout() {}

    // Out-of-place dropout over bsz * dim elements; `bwd` selects the
    // backward (mask re-apply) kernel path.
    void Forward(int bsz, T* out, const T* vals, hipStream_t stream, bool bwd = false)
    {
        launch_dropout<T>(
            out, vals, _mask, bsz * _config.dim, _config.dim, _config.RATIO(), stream, bwd);
    }

    // In-place dropout fused with a bias add.
    void ForwardWithBias(int bsz, T* vals, const T* bias, hipStream_t stream)
    {
        launch_dropout<T>(vals, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    // Out-of-place dropout fused with residual and bias adds.
    void ForwardWithBias(int bsz,
                         T* out,
                         const T* vals,
                         const T* residual,
                         const T* bias,
                         hipStream_t stream)
    {
        launch_dropout<T>(
            out, vals, residual, bias, _mask, bsz, _config.dim, _config.RATIO(), stream);
    }

    // In-place backward: applies the stored mask to d_vals.
    void Backward(int bsz, T* d_vals, hipStream_t stream)
    {
        launch_dropout_grad<T>(d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    // Out-of-place backward.
    void Backward(int bsz, T* d_vals_out, const T* d_vals, hipStream_t stream)
    {
        launch_dropout_grad<T>(
            d_vals_out, d_vals, _mask, bsz * _config.dim, _config.RATIO(), stream);
    }

    bool HasDropout() const { return _config.RATIO() > 0.0; }

    void SetTrainingMode(bool training) { _config.training = training; }

    // Install the (externally allocated) mask buffer; rejects null.
    void SetMask(uint8_t* mask)
    {
        if (!mask) { throw std::runtime_error("Dropout mask is null."); }
        _mask = mask;
    }

    Config GetConfig() const { return _config; }

    inline void SetDimension(uint32_t dim) { _config.SetDim(dim); }

private:
    uint8_t* _mask;
    Config _config;
};
deepspeed/ops/csrc/includes/ds_transformer_cuda.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#include <cuda_runtime_api.h>
#include <curand.h>
#include <memory>
#include <vector>
#include "cublas_v2.h"
#include "cuda.h"
#include "dropout.h"
#include "feed_forward.h"
#include "gelu.h"
#include "general_kernels.h"
#include "normalize_layer.h"
#include "softmax.h"
#include "strided_batch_gemm.h"
// Per-GEMM algorithm selections for one transformer layer.
// -1 means "no explicit algorithm chosen" (fall back to the library default).
struct BertGemmAlgos {
    int m_gemm_qkv_algo;     // QKV projection GEMM
    int m_gemm_inter_algo;   // intermediate (FF1) GEMM
    int m_gemm_output_algo;  // output (FF2) GEMM
    int m_gemm_batch1_algo;  // first batched GEMM (attention scores)
    int m_gemm_batch2_algo;  // second batched GEMM (attention context)

    BertGemmAlgos()
        : m_gemm_qkv_algo(-1),
          m_gemm_inter_algo(-1),
          m_gemm_output_algo(-1),
          m_gemm_batch1_algo(-1),
          m_gemm_batch2_algo(-1)
    {
    }
};
// One fused BERT transformer layer (CUDA build): owns the sub-layers
// (linear projections, layer norms, softmax, gelu, dropouts, batched GEMMs)
// and launches forward/backward over caller-provided device buffers.
template <typename T>
class BertTransformerLayer {
public:
    BertTransformerLayer(unsigned layer_id,
                         unsigned batch_size,
                         unsigned hidden_size,
                         unsigned num_heads,
                         unsigned intermediate_size,
                         unsigned seq_length,
                         float attn_dropout_ratio,
                         float hidden_output_dropout_ratio,
                         float layer_norm_eps,
                         bool pre_or_postLayerNorm,
                         const std::vector<std::array<int, 3>>& gemm_algos,
                         bool attn_dropout_checkpoint,
                         bool normalize_invertible,
                         bool gelu_checkpoint,
                         bool stochastic_mode);

    virtual ~BertTransformerLayer();

    // Forward pass. All pointers are device buffers owned by the caller;
    // the *_ptr outputs after out_ptr are intermediate activations saved
    // for the backward pass.
    void Forward(unsigned bsz,
                 const T* input_ptr,
                 const T* input_mask_ptr,
                 const T* attn_qkvw_ptr,
                 const T* attn_qkvb_ptr,
                 const T* attn_ow_ptr,
                 const T* attn_ob_ptr,
                 const T* attn_nw_ptr,
                 const T* attn_nb_ptr,
                 const T* inter_w_ptr,
                 const T* inter_b_ptr,
                 const T* output_w_ptr,
                 const T* output_b_ptr,
                 const T* norm_w_ptr,
                 const T* norm_b_ptr,
                 T* out_ptr,
                 T* inp_norm_ptr,
                 T* q_tf_ptr,
                 T* k_tf_ptr,
                 T* v_tf_ptr,
                 T* softmax_output_ptr,
                 T* ctx_bufB_ptr,
                 T* attn_o_inp_ptr,
                 T* add_res_ptr,
                 T* ff1_inp_ptr,
                 T* gelu_inp_ptr,
                 T* ff2_inp_ptr);

    // Backward pass: consumes the activations saved by Forward and fills
    // the grad_* buffers.
    void Backward(unsigned bsz,
                  const T* grad_output_ptr,
                  const T* input_ptr,
                  const T* output_ptr,
                  const T* inp_norm_ptr,
                  const T* q_tf_ptr,
                  const T* k_tf_ptr,
                  const T* v_tf_ptr,
                  const T* softmax_output_ptr,
                  const T* ctx_bufB_ptr,
                  const T* attn_o_inp_ptr,
                  const T* add_res_ptr,
                  const T* ff1_inp_ptr,
                  const T* gelu_inp_ptr,
                  const T* ff2_inp_ptr,
                  const T* input_mask_ptr,
                  const T* attn_qkvw_ptr,
                  const T* attn_ow_ptr,
                  const T* attn_nw_ptr,
                  const T* attn_nb_ptr,
                  const T* inter_w_ptr,
                  const T* inter_b_ptr,
                  const T* output_w_ptr,
                  const T* norm_w_ptr,
                  const T* norm_b_ptr,
                  T* grad_input_ptr,
                  T* grad_attn_qkvw_ptr,
                  T* grad_attn_qkvb_ptr,
                  T* grad_attn_ow_ptr,
                  T* grad_attn_ob_ptr,
                  T* grad_attn_nw_ptr,
                  T* grad_attn_nb_ptr,
                  T* grad_inter_w_ptr,
                  T* grad_inter_b_ptr,
                  T* grad_output_w_ptr,
                  T* grad_output_b_ptr,
                  T* grad_norm_w_ptr,
                  T* grad_norm_b_ptr);

    // Installs the caller-allocated dropout masks and layer-norm statistics
    // buffers used across Forward/Backward.
    void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
                                uint8_t* attn_output_dropout_mask_ptr,
                                uint8_t* layer_output_dropout_mask_ptr,
                                T* layer_norm_var,
                                T* layer_norm_mean,
                                T* attn_layer_norm_var,
                                T* attn_layer_norm_mean);

    inline unsigned GetBatchSize() const { return _batch_size; }
    inline unsigned GetNumHeads() const { return _heads; }
    inline unsigned GetSeqLength() const { return _seq_length; }
    inline unsigned GetIntermediateSize() const { return _intermediate_size; }

    void SetSeqLength(unsigned seq_len);
    inline unsigned GetHiddenSize() const { return _hidden_size; }
    void SetTrainingMode(bool training);
    inline bool IsTrainingMode() const { return _training; }
    inline bool GeluCheckpoint() const { return _gelu_checkpoint; }

private:
    void Initialize();
    size_t getWorkspaceSize(int maxBatchSize) const;

    // Params
    unsigned _layer_id;
    unsigned _batch_size;
    unsigned _hidden_size;
    unsigned _heads;
    unsigned _size_per_head;
    unsigned _intermediate_size;
    unsigned _seq_length;

    bool _pre_or_postLayerNorm;

    cublasHandle_t _cublasHandle;
    cudaStream_t _stream;

    // layers
    FeedForward<T> _qkv_linear;
    FeedForward<T> _attn_out_linear;
    Normalize_Layer<T> _attn_layer_norm;
    Normalize_Layer<T> _layer_norm;
    Normalize_Layer<T>* _last_normalize;  // non-owning
    FeedForward<T> _ff1, _ff2;
    Softmax<T> _softmax;
    Gelu<T> _gelu;
    Dropout<T> _attn_prob_dropout;
    Dropout<T> _attn_output_dropout;
    Dropout<T> _layer_output_dropout;
    StridedBatchGemm<T> _attn_scores;
    StridedBatchGemm<T> _attn_context;

    bool _training;

    // Memory saving flags
    bool _attn_dropout_checkpoint;
    bool _normalize_invertible;
    bool _gelu_checkpoint;

    // High Performance flags
    bool _stochastic_mode;
};
deepspeed/ops/csrc/includes/ds_transformer_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime_api.h>
#include <hiprand/hiprand.h>
#include <memory>
#include <vector>
#include "rocblas.h"
#include "hip/hip_runtime.h"
#include "dropout_hip.h"
#include "feed_forward_hip.h"
#include "gelu_hip.h"
#include "general_kernels_hip.h"
#include "normalize_layer_hip.h"
#include "softmax_hip.h"
#include "strided_batch_gemm_hip.h"
// Hipified twin of the CUDA BertGemmAlgos: per-GEMM algorithm selections for
// one transformer layer. -1 means "use the library default algorithm".
struct BertGemmAlgos {
    int m_gemm_qkv_algo;     // QKV projection GEMM
    int m_gemm_inter_algo;   // intermediate (FF1) GEMM
    int m_gemm_output_algo;  // output (FF2) GEMM
    int m_gemm_batch1_algo;  // first batched GEMM (attention scores)
    int m_gemm_batch2_algo;  // second batched GEMM (attention context)

    BertGemmAlgos()
        : m_gemm_qkv_algo(-1),
          m_gemm_inter_algo(-1),
          m_gemm_output_algo(-1),
          m_gemm_batch1_algo(-1),
          m_gemm_batch2_algo(-1)
    {
    }
};
// One fused BERT transformer layer (HIP/ROCm build). Identical interface to
// the CUDA version in ds_transformer_cuda.h, with rocblas_handle/hipStream_t
// substituted for their cuBLAS/CUDA counterparts.
template <typename T>
class BertTransformerLayer {
public:
    BertTransformerLayer(unsigned layer_id,
                         unsigned batch_size,
                         unsigned hidden_size,
                         unsigned num_heads,
                         unsigned intermediate_size,
                         unsigned seq_length,
                         float attn_dropout_ratio,
                         float hidden_output_dropout_ratio,
                         float layer_norm_eps,
                         bool pre_or_postLayerNorm,
                         const std::vector<std::array<int, 3>>& gemm_algos,
                         bool attn_dropout_checkpoint,
                         bool normalize_invertible,
                         bool gelu_checkpoint,
                         bool stochastic_mode);

    virtual ~BertTransformerLayer();

    // Forward pass. All pointers are device buffers owned by the caller;
    // the *_ptr outputs after out_ptr are intermediate activations saved
    // for the backward pass.
    void Forward(unsigned bsz,
                 const T* input_ptr,
                 const T* input_mask_ptr,
                 const T* attn_qkvw_ptr,
                 const T* attn_qkvb_ptr,
                 const T* attn_ow_ptr,
                 const T* attn_ob_ptr,
                 const T* attn_nw_ptr,
                 const T* attn_nb_ptr,
                 const T* inter_w_ptr,
                 const T* inter_b_ptr,
                 const T* output_w_ptr,
                 const T* output_b_ptr,
                 const T* norm_w_ptr,
                 const T* norm_b_ptr,
                 T* out_ptr,
                 T* inp_norm_ptr,
                 T* q_tf_ptr,
                 T* k_tf_ptr,
                 T* v_tf_ptr,
                 T* softmax_output_ptr,
                 T* ctx_bufB_ptr,
                 T* attn_o_inp_ptr,
                 T* add_res_ptr,
                 T* ff1_inp_ptr,
                 T* gelu_inp_ptr,
                 T* ff2_inp_ptr);

    // Backward pass: consumes the activations saved by Forward and fills
    // the grad_* buffers.
    void Backward(unsigned bsz,
                  const T* grad_output_ptr,
                  const T* input_ptr,
                  const T* output_ptr,
                  const T* inp_norm_ptr,
                  const T* q_tf_ptr,
                  const T* k_tf_ptr,
                  const T* v_tf_ptr,
                  const T* softmax_output_ptr,
                  const T* ctx_bufB_ptr,
                  const T* attn_o_inp_ptr,
                  const T* add_res_ptr,
                  const T* ff1_inp_ptr,
                  const T* gelu_inp_ptr,
                  const T* ff2_inp_ptr,
                  const T* input_mask_ptr,
                  const T* attn_qkvw_ptr,
                  const T* attn_ow_ptr,
                  const T* attn_nw_ptr,
                  const T* attn_nb_ptr,
                  const T* inter_w_ptr,
                  const T* inter_b_ptr,
                  const T* output_w_ptr,
                  const T* norm_w_ptr,
                  const T* norm_b_ptr,
                  T* grad_input_ptr,
                  T* grad_attn_qkvw_ptr,
                  T* grad_attn_qkvb_ptr,
                  T* grad_attn_ow_ptr,
                  T* grad_attn_ob_ptr,
                  T* grad_attn_nw_ptr,
                  T* grad_attn_nb_ptr,
                  T* grad_inter_w_ptr,
                  T* grad_inter_b_ptr,
                  T* grad_output_w_ptr,
                  T* grad_output_b_ptr,
                  T* grad_norm_w_ptr,
                  T* grad_norm_b_ptr);

    // Installs the caller-allocated dropout masks and layer-norm statistics
    // buffers used across Forward/Backward.
    void SetIntermediateBuffers(uint8_t* attn_prob_dropout_mask_ptr,
                                uint8_t* attn_output_dropout_mask_ptr,
                                uint8_t* layer_output_dropout_mask_ptr,
                                T* layer_norm_var,
                                T* layer_norm_mean,
                                T* attn_layer_norm_var,
                                T* attn_layer_norm_mean);

    inline unsigned GetBatchSize() const { return _batch_size; }
    inline unsigned GetNumHeads() const { return _heads; }
    inline unsigned GetSeqLength() const { return _seq_length; }
    inline unsigned GetIntermediateSize() const { return _intermediate_size; }

    void SetSeqLength(unsigned seq_len);
    inline unsigned GetHiddenSize() const { return _hidden_size; }
    void SetTrainingMode(bool training);
    inline bool IsTrainingMode() const { return _training; }
    inline bool GeluCheckpoint() const { return _gelu_checkpoint; }

private:
    void Initialize();
    size_t getWorkspaceSize(int maxBatchSize) const;

    // Params
    unsigned _layer_id;
    unsigned _batch_size;
    unsigned _hidden_size;
    unsigned _heads;
    unsigned _size_per_head;
    unsigned _intermediate_size;
    unsigned _seq_length;

    bool _pre_or_postLayerNorm;

    rocblas_handle _cublasHandle;  // name kept from the hipified CUDA source
    hipStream_t _stream;

    // layers
    FeedForward<T> _qkv_linear;
    FeedForward<T> _attn_out_linear;
    Normalize_Layer<T> _attn_layer_norm;
    Normalize_Layer<T> _layer_norm;
    Normalize_Layer<T>* _last_normalize;  // non-owning
    FeedForward<T> _ff1, _ff2;
    Softmax<T> _softmax;
    Gelu<T> _gelu;
    Dropout<T> _attn_prob_dropout;
    Dropout<T> _attn_output_dropout;
    Dropout<T> _layer_output_dropout;
    StridedBatchGemm<T> _attn_scores;
    StridedBatchGemm<T> _attn_context;

    bool _training;

    // Memory saving flags
    bool _attn_dropout_checkpoint;
    bool _normalize_invertible;
    bool _gelu_checkpoint;

    // High Performance flags
    bool _stochastic_mode;
};
deepspeed/ops/csrc/includes/feed_forward.h
deleted
100644 → 0
View file @
1b2721ad
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
// Dense (fully connected) layer implemented with cuBLAS GEMMs.
// Holds only sizes and per-GEMM algorithm ids; all buffers are caller-owned.
template <typename T>
class FeedForward {
public:
    struct Config {
        int batchSize, outputSize;
        int inputSize;
        std::array<int, 3> gemm_algos;  // [forward, weight-grad, input-grad]
        Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
            : batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
        {
        }
    };

    FeedForward(Config config) : config_(config) {}

    ~FeedForward() {}

    // out = weights^T * input (no bias; bias add is fused elsewhere).
    void Forward(int bsz, const T* input_ptr, const T* weights, T* out, cublasHandle_t& _cublasHandle)
    {
        float alpha = T(1.);
        float beta = T(0.);

        cublas_gemm_ex(_cublasHandle,
                       CUBLAS_OP_T,
                       CUBLAS_OP_N,
                       config_.outputSize,
                       bsz,
                       config_.inputSize,
                       &alpha,
                       &beta,
                       weights,
                       input_ptr,
                       out,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[0]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
    }

    // Computes weight, input and bias gradients. The bias gradient is a
    // column reduction of out_grad done by the fused transpose/bias kernel.
    void Backward(int bsz,
                  const T* out_grad,
                  const T* input_ptr,
                  const T* weights,
                  T* weights_grad,
                  T* bias_grad,
                  cublasHandle_t& _cublasHandle,
                  cudaStream_t& stream,
                  T* inp_grad_out = nullptr,
                  T* out_grad_trans_out = nullptr)
    {
        float alpha = (T)1.0, beta = (T)0.0;

        // weights_grad = input * out_grad^T
        cublas_gemm_ex(_cublasHandle,
                       CUBLAS_OP_N,
                       CUBLAS_OP_T,
                       config_.inputSize,
                       config_.outputSize,
                       bsz,
                       &alpha,
                       &beta,
                       input_ptr,
                       out_grad,
                       weights_grad,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[1]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif

        // inp_grad_out = weights * out_grad
        cublas_gemm_ex(_cublasHandle,
                       CUBLAS_OP_N,
                       CUBLAS_OP_N,
                       config_.inputSize,
                       bsz,
                       config_.outputSize,
                       &alpha,
                       &beta,
                       weights,
                       out_grad,
                       inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[2]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif

        launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
    }

private:
    Config config_;
};
#endif
deepspeed/ops/csrc/includes/feed_forward_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#ifndef __FEEDFORWARD_H__
#define __FEEDFORWARD_H__
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"
// Hipified twin of feed_forward.h: same dense layer driven by rocBLAS GEMMs.
// Holds only sizes and per-GEMM algorithm ids; all buffers are caller-owned.
template <typename T>
class FeedForward {
public:
    struct Config {
        int batchSize, outputSize;
        int inputSize;
        std::array<int, 3> gemm_algos;  // [forward, weight-grad, input-grad]
        Config(int batch, int outputs, int inputs, const std::array<int, 3>& algos)
            : batchSize(batch), outputSize(outputs), inputSize(inputs), gemm_algos(algos)
        {
        }
    };

    FeedForward(Config config) : config_(config) {}

    ~FeedForward() {}

    // out = weights^T * input (no bias; bias add is fused elsewhere).
    void Forward(int bsz, const T* input_ptr, const T* weights, T* out, rocblas_handle& _cublasHandle)
    {
        float alpha = T(1.);
        float beta = T(0.);

        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_transpose,
                       rocblas_operation_none,
                       config_.outputSize,
                       bsz,
                       config_.inputSize,
                       &alpha,
                       &beta,
                       weights,
                       input_ptr,
                       out,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[0]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[0]));
#endif
    }

    // Computes weight, input and bias gradients. The bias gradient is a
    // column reduction of out_grad done by the fused transpose/bias kernel.
    void Backward(int bsz,
                  const T* out_grad,
                  const T* input_ptr,
                  const T* weights,
                  T* weights_grad,
                  T* bias_grad,
                  rocblas_handle& _cublasHandle,
                  hipStream_t& stream,
                  T* inp_grad_out = nullptr,
                  T* out_grad_trans_out = nullptr)
    {
        float alpha = (T)1.0, beta = (T)0.0;

        // weights_grad = input * out_grad^T
        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_none,
                       rocblas_operation_transpose,
                       config_.inputSize,
                       config_.outputSize,
                       bsz,
                       &alpha,
                       &beta,
                       input_ptr,
                       out_grad,
                       weights_grad,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[1]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[1]));
#endif

        // inp_grad_out = weights * out_grad
        cublas_gemm_ex(_cublasHandle,
                       rocblas_operation_none,
                       rocblas_operation_none,
                       config_.inputSize,
                       bsz,
                       config_.outputSize,
                       &alpha,
                       &beta,
                       weights,
                       out_grad,
                       inp_grad_out,
#ifdef __HIP_PLATFORM_HCC__
                       rocblas_gemm_algo(config_.gemm_algos[2]));
#else
                       cublasGemmAlgo_t(config_.gemm_algos[2]));
#endif

        launch_fuse_transpose_bias_kernel<T>(out_grad, bias_grad, bsz, config_.outputSize, stream);
    }

private:
    Config config_;
};
#endif
deepspeed/ops/csrc/includes/gelu.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
// Thin host wrapper around the CUDA gelu kernels; stores only the
// intermediate (feed-forward hidden) size.
template <typename T>
class Gelu {
public:
    struct Config {
        uint32_t intermediate_size;
        Config(uint32_t inter_size) : intermediate_size(inter_size) {}
    };

    Gelu(const Config& config) : _config(config) {}

    virtual ~Gelu() {}

    // output = gelu(input_buf + bias), launched on `stream`.
    void ForwardWithBiasAdd(int bsz, const T* input_buf, const T* bias, T* output, cudaStream_t stream)
    {
        launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
    }

    // Gelu backward; d_output holds the upstream gradient on entry.
    void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, cudaStream_t stream)
    {
        launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
    }

private:
    Config _config;
};
deepspeed/ops/csrc/includes/gelu_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"
// Hipified twin of gelu.h: host wrapper around the HIP gelu kernels;
// stores only the intermediate (feed-forward hidden) size.
template <typename T>
class Gelu {
public:
    struct Config {
        uint32_t intermediate_size;
        Config(uint32_t inter_size) : intermediate_size(inter_size) {}
    };

    Gelu(const Config& config) : _config(config) {}

    virtual ~Gelu() {}

    // output = gelu(input_buf + bias), launched on `stream`.
    void ForwardWithBiasAdd(int bsz, const T* input_buf, const T* bias, T* output, hipStream_t stream)
    {
        launch_bias_gelu<T>(input_buf, bias, output, _config.intermediate_size, bsz, stream);
    }

    // Gelu backward; d_output holds the upstream gradient on entry.
    void Backward(int bsz, T* d_output, const T* input_buf, const T* bias, hipStream_t stream)
    {
        launch_d_gelu<T>(d_output, input_buf, bias, _config.intermediate_size, bsz, stream);
    }

private:
    Config _config;
};
deepspeed/ops/csrc/includes/gemm_test.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#include <cuda_fp16.h>
#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif
#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <limits>
#include <memory>
#include "StopWatch.h"
#include "cublas_wrappers.h"
// Report a failed CUDA/HIP runtime call. `result` is the error code returned
// by the call whose stringified text is `func` (supplied by check_cuda_error);
// any non-zero value is treated as failure. Reports only — does not abort.
//
// FIX: the original message had a stray "+ +" (unary plus on the `file`
// pointer) and never used the `func` parameter, so logs could not identify
// which API call failed.
template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
    if (result) {
        // Include the stringified call so the log pinpoints the failing API.
        std::cout << (std::string("CUDA runtime error: ") + func + " at " + file + ":" +
                      std::to_string(line) + "\n");
    }
}

#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
// Benchmarks every GEMM algorithm for the three GEMM shapes used by a linear
// layer (forward, weight-grad, input-grad) and reports the fastest id of each.
// Device buffers A (MxK), B (KxN), C (MxN) are allocated with garbage
// contents — only timing matters here, not numerics.
template <typename T>
class GemmTest {
public:
    GemmTest(int m, int n, int k, cublasOperation_t ta, cublasOperation_t tb, cublasHandle_t h)
        : M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K));
        check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N));
        check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N));
    }

    ~GemmTest()
    {
        check_cuda_error(cudaFree(A));
        check_cuda_error(cudaFree(B));
        check_cuda_error(cudaFree(C));
    }

    // Times forward, backward-1 and backward-2 GEMMs; returns the fastest
    // algorithm id for each in that order.
    std::array<int, 3> TestAlgo(int loops)
    {
        float alpha = (T)1.0f;
        float beta = (T)0.0f;

        // Forward: C = B^T * A
        int algo_fw = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           CUBLAS_OP_T,
                           CUBLAS_OP_N,
                           N,
                           M,
                           K,
                           &alpha,
                           &beta,
                           B,
                           A,
                           C,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        // Backward 1: B = A * C^T
        int algo_bw1 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           CUBLAS_OP_N,
                           CUBLAS_OP_T,
                           K,
                           N,
                           M,
                           &alpha,
                           &beta,
                           A,
                           C,
                           B,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        // Backward 2: A = B * C
        int algo_bw2 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           CUBLAS_OP_N,
                           CUBLAS_OP_N,
                           K,
                           M,
                           N,
                           &alpha,
                           &beta,
                           B,
                           C,
                           A,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    // Runs `f(algo)` for every candidate algorithm (5 warm-up iterations,
    // then `loops` timed iterations) and returns the fastest algorithm id.
    // On ROCm only the single standard algorithm is probed.
    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);
            cudaDeviceSynchronize();

            Stopwatch timer;
            timer.Restart();

            for (int i = 0; i < loops; ++i) f(algo);
            cudaDeviceSynchronize();

            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
            printf("algo-%d: %.3fms\n", algo, avg_latency);

            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }

        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
        return fast_algo;
    }

private:
    int M, N, K;
    cublasHandle_t handle;
    cublasOperation_t transa, transb;
    T *A, *B, *C;
};
// Benchmarks cuBLAS strided-batched GEMM algorithms for the three GEMMs of a
// transformer layer (forward, d_A backward, d_B backward) on uninitialized
// device scratch buffers (only timing matters, not numerics), and reports the
// fastest algorithm id for each.
template <typename T>
class StridedGemmTest {
public:
    // b: batch count; m/n/k: GEMM dimensions; ta/tb: operations applied to A/B
    // in the forward GEMM; h: cuBLAS handle (owned by the caller).
    StridedGemmTest(int b,
                    int m,
                    int n,
                    int k,
                    cublasOperation_t ta,
                    cublasOperation_t tb,
                    cublasHandle_t h)
        : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        // Device scratch sized for the whole batch; never initialized on purpose.
        check_cuda_error(cudaMalloc((void**)&A, sizeof(T) * M * K * bsz));
        check_cuda_error(cudaMalloc((void**)&B, sizeof(T) * K * N * bsz));
        check_cuda_error(cudaMalloc((void**)&C, sizeof(T) * M * N * bsz));
    }

    ~StridedGemmTest()
    {
        check_cuda_error(cudaFree(A));
        check_cuda_error(cudaFree(B));
        check_cuda_error(cudaFree(C));
    }

    // Times every candidate algorithm for the forward GEMM and both backward
    // GEMMs. Returns {fastest_fw, fastest_bw1 (d_A), fastest_bw2 (d_B)}.
    std::array<int, 3> TestAlgo(int loops)
    {
        // FIX: was `float alpha = (T)1.0f;` — a pointless float->T->float round
        // trip that silently loses precision for reduced-precision T. The BLAS
        // wrapper takes float scalars, so plain float literals are correct.
        float alpha = 1.0f;
        float beta = 0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            int stride_a = M * K;
            int stride_b = N * K;
            int stride_c = M * N;
            cublas_strided_batched_gemm(handle,
                                        M,
                                        N,
                                        K,
                                        &alpha,
                                        &beta,
                                        A,
                                        B,
                                        C,
                                        transa,
                                        transb,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            // Dimensions of A's gradient depend on whether A was transposed.
            int mb = (transa == CUBLAS_OP_T ? K : M);
            int kb = (transa == CUBLAS_OP_T ? M : K);
            int stride_a = mb * N;
            int stride_b = N * kb;
            int stride_c = M * K;
            // B need to transpose.
            cublasOperation_t op_b = (transb == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
            // Calculate d_A.
            cublas_strided_batched_gemm(handle,
                                        mb,
                                        kb,
                                        N,
                                        &alpha,
                                        &beta,
                                        (transa == CUBLAS_OP_T ? B : C),
                                        (transa == CUBLAS_OP_T ? C : B),
                                        A,
                                        CUBLAS_OP_N,
                                        op_b,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            // A need to transpose.
            cublasOperation_t op_a = (transa == CUBLAS_OP_T ? CUBLAS_OP_N : CUBLAS_OP_T);
            int stride_a = M * K;
            int stride_b = M * N;
            int stride_c = N * K;
            // Calculate d_B.
            cublas_strided_batched_gemm(handle,
                                        K,
                                        N,
                                        M,
                                        &alpha,
                                        &beta,
                                        A,
                                        C,
                                        B,
                                        op_a,
                                        CUBLAS_OP_N,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    // Sweeps candidate algorithm ids, timing `loops` calls of `f` per id after a
    // short warm-up; returns the id with the lowest average latency.
    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            // Exclude one-time costs from the measurement.
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);
            cudaDeviceSynchronize();

            Stopwatch timer;
            timer.Restart();
            for (int i = 0; i < loops; ++i) f(algo);
            cudaDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
            printf("algo-%d: %.3fms\n", algo, avg_latency);
            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }
        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
        return fast_algo;
    }

private:
    int bsz, M, N, K;
    cublasHandle_t handle;
    cublasOperation_t transa, transb;
    T *A, *B, *C;  // device scratch buffers, owned by this object
};
deepspeed/ops/csrc/includes/gemm_test_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#pragma once

#include <hip/hip_fp16.h>

#ifndef __HIP_PLATFORM_HCC__
#include <cuda_profiler_api.h>
#endif

#include <array>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <limits>
#include <memory>
#include <string>

#include "StopWatch.h"
#include "cublas_wrappers_hip.h"
// Prints a diagnostic when a runtime API call returns a non-zero (error) code.
// `func` is the stringified call site supplied by the check_cuda_error macro,
// `file`/`line` locate the call. Errors are reported, not thrown.
template <typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
    if (result) {
        // FIX: the original concatenation contained a stray `+ +` (a unary plus
        // applied to the char pointer) and dropped `func`, so the message never
        // said which call failed. Include the failing expression in the output.
        std::cout << (std::string("CUDA runtime error: ") + func + " at " + file + ":" +
                      std::to_string(line) + "\n");
    }
}
#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
// Benchmarks rocBLAS GEMM algorithms for the three GEMMs of a linear layer
// (forward, input-gradient, weight-gradient) on uninitialized device scratch
// buffers (only timing matters), and reports the fastest algorithm per pass.
template <typename T>
class GemmTest {
public:
    // m/n/k: GEMM dimensions; ta/tb: operations for the forward GEMM;
    // h: rocBLAS handle (owned by the caller).
    GemmTest(int m, int n, int k, rocblas_operation ta, rocblas_operation tb, rocblas_handle h)
        : M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        // Device scratch buffers; contents intentionally left uninitialized.
        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K));
        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N));
        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N));
    }

    ~GemmTest()
    {
        check_cuda_error(hipFree(A));
        check_cuda_error(hipFree(B));
        check_cuda_error(hipFree(C));
    }

    // Times every candidate algorithm for the forward and the two backward
    // GEMMs. Returns {fastest_fw, fastest_bw1, fastest_bw2}.
    std::array<int, 3> TestAlgo(int loops)
    {
        // FIX: was `float alpha = (T)1.0f;` — a pointless float->T->float round
        // trip that loses precision for reduced-precision T; the BLAS wrapper
        // takes plain float scalars.
        float alpha = 1.0f;
        float beta = 0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_transpose,
                           rocblas_operation_none,
                           N,
                           M,
                           K,
                           &alpha,
                           &beta,
                           B,
                           A,
                           C,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_none,
                           rocblas_operation_transpose,
                           K,
                           N,
                           M,
                           &alpha,
                           &beta,
                           A,
                           C,
                           B,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            cublas_gemm_ex(handle,
                           rocblas_operation_none,
                           rocblas_operation_none,
                           K,
                           M,
                           N,
                           &alpha,
                           &beta,
                           B,
                           C,
                           A,
#ifdef __HIP_PLATFORM_HCC__
                           static_cast<rocblas_gemm_algo>(algo));
#else
                           static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    // Sweeps candidate algorithm ids, timing `loops` calls of `f` per id after
    // a short warm-up; returns the id with the lowest average latency. On ROCm
    // only rocblas_gemm_algo_standard exists, so one iteration runs.
    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            // Exclude one-time costs from the measurement.
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);
            hipDeviceSynchronize();

            Stopwatch timer;
            timer.Restart();
            for (int i = 0; i < loops; ++i) f(algo);
            hipDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
            printf("algo-%d: %.3fms\n", algo, avg_latency);
            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }
        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
        return fast_algo;
    }

private:
    int M, N, K;
    rocblas_handle handle;
    rocblas_operation transa, transb;
    T *A, *B, *C;  // device scratch buffers, owned by this object
};
// ROCm/hipified variant: benchmarks strided-batched GEMM algorithms for the
// three GEMMs of a transformer layer (forward, d_A backward, d_B backward) on
// uninitialized device scratch and reports the fastest algorithm id for each.
template <typename T>
class StridedGemmTest {
public:
    // b: batch count; m/n/k: GEMM dimensions; ta/tb: operations applied to A/B
    // in the forward GEMM; h: rocBLAS handle (owned by the caller).
    StridedGemmTest(int b,
                    int m,
                    int n,
                    int k,
                    rocblas_operation ta,
                    rocblas_operation tb,
                    rocblas_handle h)
        : bsz(b), M(m), N(n), K(k), transa(ta), transb(tb), handle(h)
    {
        // Device scratch sized for the whole batch; never initialized on purpose.
        check_cuda_error(hipMalloc((void**)&A, sizeof(T) * M * K * bsz));
        check_cuda_error(hipMalloc((void**)&B, sizeof(T) * K * N * bsz));
        check_cuda_error(hipMalloc((void**)&C, sizeof(T) * M * N * bsz));
    }

    ~StridedGemmTest()
    {
        check_cuda_error(hipFree(A));
        check_cuda_error(hipFree(B));
        check_cuda_error(hipFree(C));
    }

    // Times every candidate algorithm for the forward GEMM and both backward
    // GEMMs. Returns {fastest_fw, fastest_bw1 (d_A), fastest_bw2 (d_B)}.
    std::array<int, 3> TestAlgo(int loops)
    {
        // FIX: was `float alpha = (T)1.0f;` — a pointless float->T->float round
        // trip that loses precision for reduced-precision T; the BLAS wrapper
        // takes plain float scalars.
        float alpha = 1.0f;
        float beta = 0.0f;

        int algo_fw = Run(loops, [=](int algo) {
            int stride_a = M * K;
            int stride_b = N * K;
            int stride_c = M * N;
            cublas_strided_batched_gemm(handle,
                                        M,
                                        N,
                                        K,
                                        &alpha,
                                        &beta,
                                        A,
                                        B,
                                        C,
                                        transa,
                                        transb,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw1 = Run(loops, [=](int algo) {
            // Dimensions of A's gradient depend on whether A was transposed.
            int mb = (transa == rocblas_operation_transpose ? K : M);
            int kb = (transa == rocblas_operation_transpose ? M : K);
            int stride_a = mb * N;
            int stride_b = N * kb;
            int stride_c = M * K;
            // B need to transpose.
            rocblas_operation op_b = (transb == rocblas_operation_transpose
                                          ? rocblas_operation_none
                                          : rocblas_operation_transpose);
            // Calculate d_A.
            cublas_strided_batched_gemm(handle,
                                        mb,
                                        kb,
                                        N,
                                        &alpha,
                                        &beta,
                                        (transa == rocblas_operation_transpose ? B : C),
                                        (transa == rocblas_operation_transpose ? C : B),
                                        A,
                                        rocblas_operation_none,
                                        op_b,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        int algo_bw2 = Run(loops, [=](int algo) {
            // A need to transpose.
            rocblas_operation op_a = (transa == rocblas_operation_transpose
                                          ? rocblas_operation_none
                                          : rocblas_operation_transpose);
            int stride_a = M * K;
            int stride_b = M * N;
            int stride_c = N * K;
            // Calculate d_B.
            cublas_strided_batched_gemm(handle,
                                        K,
                                        N,
                                        M,
                                        &alpha,
                                        &beta,
                                        A,
                                        C,
                                        B,
                                        op_a,
                                        rocblas_operation_none,
                                        stride_a,
                                        stride_b,
                                        stride_c,
                                        bsz,
#ifdef __HIP_PLATFORM_HCC__
                                        static_cast<rocblas_gemm_algo>(algo));
#else
                                        static_cast<cublasGemmAlgo_t>(algo));
#endif
        });

        return std::array<int, 3>({algo_fw, algo_bw1, algo_bw2});
    }

    // Sweeps candidate algorithm ids, timing `loops` calls of `f` per id after
    // a short warm-up; returns the id with the lowest average latency.
    template <typename Func>
    int Run(int loops, Func f)
    {
        float fast_latency = (std::numeric_limits<float>::max)();
        int fast_algo = 0;

#ifdef __HIP_PLATFORM_HCC__
        for (int algo = (int)rocblas_gemm_algo_standard; algo <= (int)rocblas_gemm_algo_standard;
#else
        for (int algo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
             algo <= (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
#endif
             algo++) {
            // Exclude one-time costs from the measurement.
            int warm_up = 5;
            for (int i = 0; i < warm_up; ++i) f(algo);
            hipDeviceSynchronize();

            Stopwatch timer;
            timer.Restart();
            for (int i = 0; i < loops; ++i) f(algo);
            hipDeviceSynchronize();
            timer.Stop();

            float avg_latency = (float)timer.GetTimeInSeconds() * 1000 / loops;
            printf("algo-%d: %.3fms\n", algo, avg_latency);
            if (avg_latency < fast_latency) {
                fast_latency = avg_latency;
                fast_algo = algo;
            }
        }
        printf("fast_algo %d: %.3f ms\n", fast_algo, fast_latency);
        return fast_algo;
    }

private:
    int bsz, M, N, K;
    rocblas_handle handle;
    rocblas_operation transa, transb;
    T *A, *B, *C;  // device scratch buffers, owned by this object
};
deepspeed/ops/csrc/includes/general_kernels.h
deleted
100644 → 0
View file @
1b2721ad
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <curand_kernel.h>
#include "context.h"
#include "cublas_wrappers.h"
#define THREADS 256
#define TILE_DIM 32

// FIX: parenthesized so the expansion is a single expression. The previous
// definition `-1 * std::numeric_limits<float>::infinity()` mis-associates in
// contexts such as `a / minus_infinity`, which expanded to `(a / -1) * inf`.
#define minus_infinity (-1 * std::numeric_limits<float>::infinity())

#define FINAL_MASK 0xffffffff

// Elementwise out = inp1 + inp2 over a [batch_size, seq_length, hidden_size]
// tensor, launched asynchronously on `stream`.
template <typename T>
void launch_fused_add2(T* out,
                       const T* inp1,
                       const T* inp2,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       cudaStream_t& stream);

// Elementwise sum of four inputs into `out`, same shape convention as above.
template <typename T>
void launch_fused_add4(T* out,
                       const T* inp1,
                       const T* inp2,
                       const T* inp3,
                       const T* inp4,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       cudaStream_t& stream);

// Elementwise sum of three inputs into `out`, same shape convention as above.
template <typename T>
void launch_fused_add3(T* out,
                       const T* inp1,
                       const T* inp2,
                       const T* inp3,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       cudaStream_t& stream);
deepspeed/ops/csrc/includes/general_kernels_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef __HIP_PLATFORM_HCC__
#include <hip/hip_cooperative_groups.h>
#else
#include <cooperative_groups.h>
#endif
#include <hiprand/hiprand_kernel.h>
#include "context_hip.h"
#include "cublas_wrappers_hip.h"
#define THREADS 256
#define TILE_DIM 32

// FIX: parenthesized so the expansion is a single expression. The previous
// definition `-1 * std::numeric_limits<float>::infinity()` mis-associates in
// contexts such as `a / minus_infinity`, which expanded to `(a / -1) * inf`.
#define minus_infinity (-1 * std::numeric_limits<float>::infinity())

#define FINAL_MASK 0xffffffff

// Elementwise out = inp1 + inp2 over a [batch_size, seq_length, hidden_size]
// tensor, launched asynchronously on `stream`.
template <typename T>
void launch_fused_add2(T* out,
                       const T* inp1,
                       const T* inp2,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       hipStream_t& stream);

// Elementwise sum of four inputs into `out`, same shape convention as above.
template <typename T>
void launch_fused_add4(T* out,
                       const T* inp1,
                       const T* inp2,
                       const T* inp3,
                       const T* inp4,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       hipStream_t& stream);

// Elementwise sum of three inputs into `out`, same shape convention as above.
template <typename T>
void launch_fused_add3(T* out,
                       const T* inp1,
                       const T* inp2,
                       const T* inp3,
                       int batch_size,
                       int seq_length,
                       int hidden_size,
                       hipStream_t& stream);
deepspeed/ops/csrc/includes/normalize_layer.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <fstream>
#include "custom_cuda_layers.h"
using
namespace
std
;
// Host-side wrapper around the fused LayerNorm CUDA kernels (forward with
// residual add, and several backward variants). Holds non-owning pointers to
// externally allocated statistics buffers (variance / mean).
template <typename T>
class Normalize_Layer {
public:
    struct Config {
        uint32_t batchSize;
        uint32_t seqLength;
        uint32_t hiddenDim;
        float epsilon;   // numerical-stability term for the variance
        bool training;   // when true, forward saves statistics for backward
        bool useMean;    // selects the mean-saving kernel path vs. var-only path
        Config(uint32_t batch,
               uint32_t seq,
               uint32_t h,
               float epsilon = 1e-12,
               bool training = true,
               bool useMean = true)
            : batchSize(batch),
              seqLength(seq),
              hiddenDim(h),
              epsilon(epsilon),
              training(training),
              useMean(useMean)
        {
        }
    };

    // Statistics pointers start null; callers must SetVar / SetMean before
    // running backward in training.
    Normalize_Layer(Config config)
        : config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr)
    {
    }

    ~Normalize_Layer() {}

    // Forward pass that also records the per-row mean (in `means`) so backward
    // can recompute the normalized values; used with the useMean path.
    void ForwardCheckpoint(int bsz,  // batch * seq
                           T* vals,
                           const T* residual,
                           const T* gamma,
                           const T* betta,
                           cudaStream_t& stream,
                           bool preLayerNorm = false)
    {
        launch_bias_residual_layer_norm(vals,
                                        residual,
                                        gamma,
                                        betta,
                                        config_.epsilon,
                                        bsz,
                                        config_.hiddenDim,
                                        stream,
                                        preLayerNorm,
                                        config_.training,
                                        vars,
                                        means);
    }

    // Forward pass that records only the variance (no mean).
    void Forward(int bsz,
                 T* vals,
                 const T* residual,
                 const T* gamma,
                 const T* betta,
                 cudaStream_t& stream,
                 bool preLayerNorm = false)
    {
        launch_bias_residual_layer_norm(vals,
                                        residual,
                                        gamma,
                                        betta,
                                        config_.epsilon,
                                        bsz,
                                        config_.hiddenDim,
                                        stream,
                                        preLayerNorm,
                                        config_.training,
                                        vars);
    }

    // Backward using the layer's *input* (norm_in) plus saved mean/variance.
    // Uses two streams: gamma/betta gradients and input gradient in parallel.
    void Backward(int bsz,
                  const T* out_grad,
                  const T* gamma,
                  T* gamma_grad,
                  T* betta_grad,
                  cudaStream_t stream[2],
                  T* inp_grad_out,
                  const T* norm_in = nullptr)
    {
        launch_layerNorm_backward(out_grad,
                                  norm_in,
                                  vars,
                                  means,
                                  gamma,
                                  gamma_grad,
                                  betta_grad,
                                  inp_grad_out,
                                  bsz,
                                  config_.hiddenDim,
                                  stream);
    }

    // Backward using the layer's *output* (norm_out); `betta` is needed to
    // invert the affine transform when recovering normalized values.
    void Backward(int bsz,
                  const T* out_grad,
                  const T* gamma,
                  const T* betta,
                  T* gamma_grad,
                  T* betta_grad,
                  cudaStream_t stream[2],
                  T* inp_grad_out,
                  const T* norm_out)
    {
        launch_layerNorm_backward(out_grad,
                                  norm_out,
                                  vars,
                                  gamma,
                                  gamma_grad,
                                  betta_grad,
                                  inp_grad_out,
                                  bsz,
                                  config_.hiddenDim,
                                  stream,
                                  !config_.useMean,
                                  betta);
    }

    // As Backward (norm_in variant) but fuses the add of a second incoming
    // gradient (e.g. from a residual branch) into the input-gradient kernel.
    void BackwardFusedAdd(int bsz,
                          const T* out_grad1,
                          const T* out_grad2,
                          const T* gamma,
                          T* gamma_grad,
                          T* betta_grad,
                          cudaStream_t stream[2],
                          T* inp_grad_out,
                          const T* norm_in = nullptr)
    {
        launch_layerNorm_backward_fused_add(out_grad1,
                                            out_grad2,
                                            norm_in,
                                            vars,
                                            means,
                                            gamma,
                                            gamma_grad,
                                            betta_grad,
                                            inp_grad_out,
                                            bsz,
                                            config_.hiddenDim,
                                            stream);
    }

    // As Backward (norm_out variant) with the fused second-gradient add.
    void BackwardFusedAdd(int bsz,
                          const T* out_grad1,
                          const T* out_grad2,
                          const T* gamma,
                          const T* betta,
                          T* gamma_grad,
                          T* betta_grad,
                          cudaStream_t stream[2],
                          T* inp_grad_out,
                          const T* norm_out)
    {
        launch_layerNorm_backward_fused_add(out_grad1,
                                            out_grad2,
                                            norm_out,
                                            vars,
                                            gamma,
                                            gamma_grad,
                                            betta_grad,
                                            inp_grad_out,
                                            bsz,
                                            config_.hiddenDim,
                                            stream,
                                            !config_.useMean,
                                            betta);
    }

    inline bool UseMean() const { return config_.useMean; }

    // Points the layer at an externally owned variance buffer; never null.
    inline void SetVar(T* variance)
    {
        if (!variance) { throw std::runtime_error("Normalize variance is null."); }
        vars = variance;
    }

    // Points the layer at an externally owned mean buffer; never null.
    inline void SetMean(T* mean)
    {
        if (!mean) { throw std::runtime_error("Normalize mean is null."); }
        means = mean;
    }

private:
    Config config_;
    T* vars;      // non-owning: per-row variance saved by forward
    T* means;     // non-owning: per-row mean saved by ForwardCheckpoint
    T* vals_hat;  // unused in the visible code; kept for layout compatibility
};
deepspeed/ops/csrc/includes/normalize_layer_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <fstream>
#include "custom_hip_layers.h"
using
namespace
std
;
// ROCm/hipified variant of the fused LayerNorm host-side wrapper (forward with
// residual add, and several backward variants). Holds non-owning pointers to
// externally allocated statistics buffers (variance / mean).
template <typename T>
class Normalize_Layer {
public:
    struct Config {
        uint32_t batchSize;
        uint32_t seqLength;
        uint32_t hiddenDim;
        float epsilon;   // numerical-stability term for the variance
        bool training;   // when true, forward saves statistics for backward
        bool useMean;    // selects the mean-saving kernel path vs. var-only path
        Config(uint32_t batch,
               uint32_t seq,
               uint32_t h,
               float epsilon = 1e-12,
               bool training = true,
               bool useMean = true)
            : batchSize(batch),
              seqLength(seq),
              hiddenDim(h),
              epsilon(epsilon),
              training(training),
              useMean(useMean)
        {
        }
    };

    // Statistics pointers start null; callers must SetVar / SetMean before
    // running backward in training.
    Normalize_Layer(Config config)
        : config_(config), vars(nullptr), means(nullptr), vals_hat(nullptr)
    {
    }

    ~Normalize_Layer() {}

    // Forward pass that also records the per-row mean (in `means`) so backward
    // can recompute the normalized values; used with the useMean path.
    void ForwardCheckpoint(int bsz,  // batch * seq
                           T* vals,
                           const T* residual,
                           const T* gamma,
                           const T* betta,
                           hipStream_t& stream,
                           bool preLayerNorm = false)
    {
        launch_bias_residual_layer_norm(vals,
                                        residual,
                                        gamma,
                                        betta,
                                        config_.epsilon,
                                        bsz,
                                        config_.hiddenDim,
                                        stream,
                                        preLayerNorm,
                                        config_.training,
                                        vars,
                                        means);
    }

    // Forward pass that records only the variance (no mean).
    void Forward(int bsz,
                 T* vals,
                 const T* residual,
                 const T* gamma,
                 const T* betta,
                 hipStream_t& stream,
                 bool preLayerNorm = false)
    {
        launch_bias_residual_layer_norm(vals,
                                        residual,
                                        gamma,
                                        betta,
                                        config_.epsilon,
                                        bsz,
                                        config_.hiddenDim,
                                        stream,
                                        preLayerNorm,
                                        config_.training,
                                        vars);
    }

    // Backward using the layer's *input* (norm_in) plus saved mean/variance.
    // Uses two streams: gamma/betta gradients and input gradient in parallel.
    void Backward(int bsz,
                  const T* out_grad,
                  const T* gamma,
                  T* gamma_grad,
                  T* betta_grad,
                  hipStream_t stream[2],
                  T* inp_grad_out,
                  const T* norm_in = nullptr)
    {
        launch_layerNorm_backward(out_grad,
                                  norm_in,
                                  vars,
                                  means,
                                  gamma,
                                  gamma_grad,
                                  betta_grad,
                                  inp_grad_out,
                                  bsz,
                                  config_.hiddenDim,
                                  stream);
    }

    // Backward using the layer's *output* (norm_out); `betta` is needed to
    // invert the affine transform when recovering normalized values.
    void Backward(int bsz,
                  const T* out_grad,
                  const T* gamma,
                  const T* betta,
                  T* gamma_grad,
                  T* betta_grad,
                  hipStream_t stream[2],
                  T* inp_grad_out,
                  const T* norm_out)
    {
        launch_layerNorm_backward(out_grad,
                                  norm_out,
                                  vars,
                                  gamma,
                                  gamma_grad,
                                  betta_grad,
                                  inp_grad_out,
                                  bsz,
                                  config_.hiddenDim,
                                  stream,
                                  !config_.useMean,
                                  betta);
    }

    // As Backward (norm_in variant) but fuses the add of a second incoming
    // gradient (e.g. from a residual branch) into the input-gradient kernel.
    void BackwardFusedAdd(int bsz,
                          const T* out_grad1,
                          const T* out_grad2,
                          const T* gamma,
                          T* gamma_grad,
                          T* betta_grad,
                          hipStream_t stream[2],
                          T* inp_grad_out,
                          const T* norm_in = nullptr)
    {
        launch_layerNorm_backward_fused_add(out_grad1,
                                            out_grad2,
                                            norm_in,
                                            vars,
                                            means,
                                            gamma,
                                            gamma_grad,
                                            betta_grad,
                                            inp_grad_out,
                                            bsz,
                                            config_.hiddenDim,
                                            stream);
    }

    // As Backward (norm_out variant) with the fused second-gradient add.
    void BackwardFusedAdd(int bsz,
                          const T* out_grad1,
                          const T* out_grad2,
                          const T* gamma,
                          const T* betta,
                          T* gamma_grad,
                          T* betta_grad,
                          hipStream_t stream[2],
                          T* inp_grad_out,
                          const T* norm_out)
    {
        launch_layerNorm_backward_fused_add(out_grad1,
                                            out_grad2,
                                            norm_out,
                                            vars,
                                            gamma,
                                            gamma_grad,
                                            betta_grad,
                                            inp_grad_out,
                                            bsz,
                                            config_.hiddenDim,
                                            stream,
                                            !config_.useMean,
                                            betta);
    }

    inline bool UseMean() const { return config_.useMean; }

    // Points the layer at an externally owned variance buffer; never null.
    inline void SetVar(T* variance)
    {
        if (!variance) { throw std::runtime_error("Normalize variance is null."); }
        vars = variance;
    }

    // Points the layer at an externally owned mean buffer; never null.
    inline void SetMean(T* mean)
    {
        if (!mean) { throw std::runtime_error("Normalize mean is null."); }
        means = mean;
    }

private:
    Config config_;
    T* vars;      // non-owning: per-row variance saved by forward
    T* means;     // non-owning: per-row mean saved by ForwardCheckpoint
    T* vals_hat;  // unused in the visible code; kept for layout compatibility
};
deepspeed/ops/csrc/includes/quantizer.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#include <cooperative_groups.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <iostream>
deepspeed/ops/csrc/includes/quantizer_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <cooperative_groups.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <iostream>
deepspeed/ops/csrc/includes/simd.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#if (__x86_64__ || __i386__)
#include <cpuid.h>
#include <x86intrin.h>
#endif
#define TILE (128 * 1024 * 1024)
#if defined(__AVX512__) or defined(__AVX256__)
#define ROUND_DOWN(size, step) ((size) & ~((step)-1))
#if defined(__AVX512__)
#define SIMD_STORE(a, d) _mm512_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm512_loadu_ps(x)
#define SIMD_SET(x) _mm512_set1_ps(x)
#define SIMD_ADD(x, y) _mm512_add_ps(x, y)
#define SIMD_MUL(x, y) _mm512_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm512_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm512_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm512_div_ps(x, y)
#define SIMD_WIDTH 16
#define SIMD_LOAD2(x, h) \
((h) ? _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)x)) : _mm512_loadu_ps(x))
#define SIMD_STORE2(x, d, h) \
((h) ? _mm256_store_ps(x, _mm256_castsi256_ps(_mm512_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \
: _mm512_storeu_ps(x, d))
#define INTV __m256i
#elif defined(__AVX256__)
#define SIMD_STORE(a, d) _mm256_storeu_ps(a, d)
#define SIMD_LOAD(x) _mm256_loadu_ps(x)
#define SIMD_SET(x) _mm256_set1_ps(x)
#define SIMD_ADD(x, y) _mm256_add_ps(x, y)
#define SIMD_MUL(x, y) _mm256_mul_ps(x, y)
#define SIMD_FMA(x, y, c) _mm256_fmadd_ps(x, y, c)
#define SIMD_SQRT(x) _mm256_sqrt_ps(x)
#define SIMD_DIV(x, y) _mm256_div_ps(x, y)
#define SIMD_WIDTH 8
#define SIMD_LOAD2(x, h) \
((h) ? _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)x)) : _mm256_loadu_ps(x))
#define SIMD_STORE2(x, d, h) \
((h) ? _mm_store_ps(x, _mm_castsi128_ps(_mm256_cvtps_ph(d, _MM_FROUND_TO_NEAREST_INT))) \
: _mm256_storeu_ps(x, d))
#define INTV __m128i
#endif
union
AVX_Data
{
#if defined(__AVX512__)
__m512
data
;
#elif defined(__AVX256__)
__m256
data
;
#endif
// float data_f[16];
};
template
<
int
span
>
inline
void
simd_store
(
float
*
dst
,
AVX_Data
*
src
,
bool
half_precision
)
{
#pragma unroll
for
(
size_t
i
=
0
;
i
<
span
;
++
i
)
{
SIMD_STORE2
(
dst
+
SIMD_WIDTH
*
i
,
src
[
i
].
data
,
half_precision
);
}
}
// Reads `span` consecutive SIMD_WIDTH-float chunks from `src` into `dst`;
// widens fp16 source data to fp32 when half_precision is set.
template <int span>
inline void simd_load(AVX_Data* dst, float* src, bool half_precision)
{
#pragma unroll
    for (size_t chunk = 0; chunk < span; ++chunk) {
        dst[chunk].data = SIMD_LOAD2(src + SIMD_WIDTH * chunk, half_precision);
    }
}
// Fused multiply-add with a broadcast multiplier:
// dst[i] = src_m_l[i] * src_m_r + src_a[i].
template <int span>
inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data* src_a)
{
#pragma unroll
    for (size_t chunk = 0; chunk < span; ++chunk) {
        dst[chunk].data = SIMD_FMA(src_m_l[chunk].data, src_m_r.data, src_a[chunk].data);
    }
}
// Fused multiply-add with broadcast multiplier and broadcast addend:
// dst[i] = src_m_l[i] * src_m_r + src_a.
template <int span>
inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data src_m_r, AVX_Data src_a)
{
#pragma unroll
    for (size_t chunk = 0; chunk < span; ++chunk) {
        dst[chunk].data = SIMD_FMA(src_m_l[chunk].data, src_m_r.data, src_a.data);
    }
}
// Elementwise fused multiply-add: dst[i] = src_m_l[i] * src_m_r[i] + src_a[i].
template <int span>
inline void simd_fma(AVX_Data* dst, AVX_Data* src_m_l, AVX_Data* src_m_r, AVX_Data* src_a)
{
#pragma unroll
    for (size_t chunk = 0; chunk < span; ++chunk) {
        dst[chunk].data = SIMD_FMA(src_m_l[chunk].data, src_m_r[chunk].data, src_a[chunk].data);
    }
}
// Elementwise square root over `span` SIMD registers: dst[i] = sqrt(src[i]).
template <int span>
inline void simd_sqrt(AVX_Data* dst, AVX_Data* src)
{
#pragma unroll
    for (size_t chunk = 0; chunk < span; ++chunk) {
        dst[chunk].data = SIMD_SQRT(src[chunk].data);
    }
}
// Elementwise add with a broadcast right operand: dst[i] = src_a_l[i] + src_a_r.
template <int span>
inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r)
{
#pragma unroll
    for (size_t chunk = 0; chunk < span; ++chunk) {
        dst[chunk].data = SIMD_ADD(src_a_l[chunk].data, src_a_r.data);
    }
}
// Elementwise add: dst[i] = src_a_l[i] + src_a_r[i].
template <int span>
inline void simd_add(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r)
{
#pragma unroll
    for (size_t chunk = 0; chunk < span; ++chunk) {
        dst[chunk].data = SIMD_ADD(src_a_l[chunk].data, src_a_r[chunk].data);
    }
}
// Elementwise multiply with a broadcast right operand: dst[i] = src_a_l[i] * src_a_r.
template <int span>
inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data src_a_r)
{
#pragma unroll
    for (size_t chunk = 0; chunk < span; ++chunk) {
        dst[chunk].data = SIMD_MUL(src_a_l[chunk].data, src_a_r.data);
    }
}
// Elementwise multiply: dst[i] = src_a_l[i] * src_a_r[i].
template <int span>
inline void simd_mul(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r)
{
#pragma unroll
    for (size_t chunk = 0; chunk < span; ++chunk) {
        dst[chunk].data = SIMD_MUL(src_a_l[chunk].data, src_a_r[chunk].data);
    }
}
// Elementwise divide: dst[i] = src_a_l[i] / src_a_r[i].
template <int span>
inline void simd_div(AVX_Data* dst, AVX_Data* src_a_l, AVX_Data* src_a_r)
{
#pragma unroll
    for (size_t chunk = 0; chunk < span; ++chunk) {
        dst[chunk].data = SIMD_DIV(src_a_l[chunk].data, src_a_r[chunk].data);
    }
}
#endif
deepspeed/ops/csrc/includes/softmax.h
deleted
100644 → 0
View file @
1b2721ad
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include "custom_cuda_layers.h"
#include <fstream>
using
namespace
std
;
// Host-side wrapper around the attention-softmax CUDA kernels (forward with
// additive attention mask, and backward). Holds only configuration.
template <typename T>
class Softmax {
public:
    struct Config {
        size_t batchSize;
        size_t heads;
        size_t seq_length;
        size_t prob_depth;
        float temperature;  // fixed to 1.0 by the constructor below
        bool mem_alloc;
        Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false)
            : batchSize(batch),
              heads(h),
              seq_length(seq),
              prob_depth(prob_size),
              temperature(1.0),
              mem_alloc(mem_alloc)
        {
        }
    };

    Softmax(Config config) : config_(config) {}

    ~Softmax() {}

    // In-place masked softmax over attention scores `vals`
    // (bsz x heads x seq_length rows), launched asynchronously on `stream`.
    void Forward(int bsz, T* vals, const T* attn_mask, cudaStream_t& stream)
    {
        launch_attn_softmax<T>(vals, attn_mask, bsz, config_.heads, config_.seq_length, stream);
    }

    // In-place softmax backward: transforms `out_grad` using the saved
    // forward output `soft_out`.
    void Backward(int bsz, T* out_grad, const T* soft_out, cudaStream_t stream)
    {
        launch_attn_softmax_backward_v2<T>(
            out_grad, soft_out, bsz, config_.heads, config_.seq_length, stream);
    }

    inline size_t GetProbDepth() const { return config_.prob_depth; }

    inline size_t GetBatchSize() const { return config_.batchSize; }

    inline size_t GetNumHeads() const { return config_.heads; }

    inline size_t GetSeqLength() const { return config_.seq_length; }

    // Allows reusing one instance across inputs of different sequence lengths.
    inline void SetSeqLength(size_t seq_len) { config_.seq_length = seq_len; }

private:
    Config config_;
};
deepspeed/ops/csrc/includes/softmax_hip.h
deleted
100644 → 0
View file @
1b2721ad
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <stdio.h>
#include "custom_hip_layers.h"
#include <fstream>
using
namespace
std
;
// ROCm/hipified variant of the attention-softmax host-side wrapper (forward
// with additive attention mask, and backward). Holds only configuration.
template <typename T>
class Softmax {
public:
    struct Config {
        size_t batchSize;
        size_t heads;
        size_t seq_length;
        size_t prob_depth;
        float temperature;  // fixed to 1.0 by the constructor below
        bool mem_alloc;
        Config(size_t batch, size_t h, size_t seq, int prob_size = 0, bool mem_alloc = false)
            : batchSize(batch),
              heads(h),
              seq_length(seq),
              prob_depth(prob_size),
              temperature(1.0),
              mem_alloc(mem_alloc)
        {
        }
    };

    Softmax(Config config) : config_(config) {}

    ~Softmax() {}

    // In-place masked softmax over attention scores `vals`
    // (bsz x heads x seq_length rows), launched asynchronously on `stream`.
    void Forward(int bsz, T* vals, const T* attn_mask, hipStream_t& stream)
    {
        launch_attn_softmax<T>(vals, attn_mask, bsz, config_.heads, config_.seq_length, stream);
    }

    // In-place softmax backward: transforms `out_grad` using the saved
    // forward output `soft_out`.
    void Backward(int bsz, T* out_grad, const T* soft_out, hipStream_t stream)
    {
        launch_attn_softmax_backward_v2<T>(
            out_grad, soft_out, bsz, config_.heads, config_.seq_length, stream);
    }

    inline size_t GetProbDepth() const { return config_.prob_depth; }

    inline size_t GetBatchSize() const { return config_.batchSize; }

    inline size_t GetNumHeads() const { return config_.heads; }

    inline size_t GetSeqLength() const { return config_.seq_length; }

    // Allows reusing one instance across inputs of different sequence lengths.
    inline void SetSeqLength(size_t seq_len) { config_.seq_length = seq_len; }

private:
    Config config_;
};
Prev
1
…
12
13
14
15
16
17
18
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment