OpenDAS / deepspeed · Commits

Commit 7d1a83a9, authored May 25, 2022 by aiss

    push Deepspeed 0.6.3 rocm version

Parent: ab5534fc
Changes: 162 files in the commit; 2 changed files are shown below, with 332 additions and 0 deletions (+332, -0).
csrc/transformer_bak/inference/includes/cublas_wrappers_hip.h    +208  -0
csrc/transformer_bak/inference/includes/custom_cuda_layers.h     +124  -0
csrc/transformer_bak/inference/includes/cublas_wrappers_hip.h (new file, mode 100644)
// !!! This is a file automatically generated by hipify!!!
#pragma once
#include <assert.h>
#include <rocblas.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>
#include <mma.h>
#include <stdio.h>
int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa,
                   rocblas_operation transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const float* A,
                   const float* B,
                   float* C,
                   cublasGemmAlgo_t algo)
{
    rocblas_status status = rocblas_gemmex(handle,
                                           transa,
                                           transb,
                                           m,
                                           n,
                                           k,
                                           (const void*)alpha,
                                           (const void*)A,
                                           hipR32F,
                                           (transa == rocblas_operation_none) ? m : k,
                                           (const void*)B,
                                           hipR32F,
                                           (transb == rocblas_operation_none) ? k : n,
                                           (const void*)beta,
                                           C,
                                           hipR32F,
                                           m,
                                           hipR32F,
                                           algo);

    if (status != rocblas_status_success) {
        fprintf(stderr,
                "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d)\n",
                m,
                n,
                k,
                (int)status);
        return EXIT_FAILURE;
    }
    return 0;
}
int cublas_gemm_ex(rocblas_handle handle,
                   rocblas_operation transa,
                   rocblas_operation transb,
                   int m,
                   int n,
                   int k,
                   const float* alpha,
                   const float* beta,
                   const __half* A,
                   const __half* B,
                   __half* C,
                   cublasGemmAlgo_t algo)
{
    rocblas_status status = rocblas_gemmex(handle,
                                           transa,
                                           transb,
                                           m,
                                           n,
                                           k,
                                           (const void*)alpha,
                                           (const void*)A,
                                           hipR16F,
                                           (transa == rocblas_operation_none) ? m : k,
                                           (const void*)B,
                                           hipR16F,
                                           (transb == rocblas_operation_none) ? k : n,
                                           (const void*)beta,
                                           (void*)C,
                                           hipR16F,
                                           m,
                                           hipR32F,
                                           algo);

    if (status != rocblas_status_success) {
        fprintf(stderr,
                "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d)\n",
                m,
                n,
                k,
                (int)status);
        return EXIT_FAILURE;
    }
    return 0;
}
int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const float* A,
                                const float* B,
                                float* C,
                                rocblas_operation op_A,
                                rocblas_operation op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo)
{
    rocblas_status status = cublasGemmStridedBatchedEx(handle,
                                                       op_A,
                                                       op_B,
                                                       m,
                                                       n,
                                                       k,
                                                       alpha,
                                                       A,
                                                       hipR32F,
                                                       (op_A == rocblas_operation_none) ? m : k,
                                                       stride_A,
                                                       B,
                                                       hipR32F,
                                                       (op_B == rocblas_operation_none) ? k : n,
                                                       stride_B,
                                                       beta,
                                                       C,
                                                       hipR32F,
                                                       m,
                                                       stride_C,
                                                       batch,
                                                       hipR32F,
                                                       algo);

    if (status != rocblas_status_success) {
        fprintf(stderr,
                "!!!! kernel execution error. (batch: %d, m: %d, n: %d, k: %d, error: %d)\n",
                batch,
                m,
                n,
                k,
                (int)status);
        return EXIT_FAILURE;
    }
    return 0;
}
int cublas_strided_batched_gemm(rocblas_handle handle,
                                int m,
                                int n,
                                int k,
                                const float* alpha,
                                const float* beta,
                                const __half* A,
                                const __half* B,
                                __half* C,
                                rocblas_operation op_A,
                                rocblas_operation op_B,
                                int stride_A,
                                int stride_B,
                                int stride_C,
                                int batch,
                                cublasGemmAlgo_t algo)
{
    rocblas_status status = cublasGemmStridedBatchedEx(handle,
                                                       op_A,
                                                       op_B,
                                                       m,
                                                       n,
                                                       k,
                                                       alpha,
                                                       A,
                                                       hipR16F,
                                                       (op_A == rocblas_operation_none) ? m : k,
                                                       stride_A,
                                                       B,
                                                       hipR16F,
                                                       (op_B == rocblas_operation_none) ? k : n,
                                                       stride_B,
                                                       beta,
                                                       C,
                                                       hipR16F,
                                                       m,
                                                       stride_C,
                                                       batch,
                                                       hipR32F,
                                                       algo);

    if (status != rocblas_status_success) {
        fprintf(stderr,
                "!!!! kernel execution error. (m: %d, n: %d, k: %d, error: %d)\n",
                m,
                n,
                k,
                (int)status);
        return EXIT_FAILURE;
    }
    return 0;
}
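For orientation, the sketch below shows one way the FP32 wrapper above could be driven from host code. It is not part of this commit: the helper name run_fp32_gemm, the include path, and the cast used for the algo argument are illustrative, and it presumes the project's ROCm build environment in which rocblas_gemmex, hipR32F and cublasGemmAlgo_t resolve.

// Hypothetical call site for the FP32 cublas_gemm_ex wrapper above (not part of the commit).
// A, B and C are assumed to be column-major device buffers of size m*k, k*n and m*n,
// allocated with hipMalloc elsewhere; `handle` is a rocblas_handle created by the caller.
#include "cublas_wrappers_hip.h"

int run_fp32_gemm(rocblas_handle handle,
                  const float* A,
                  const float* B,
                  float* C,
                  int m,
                  int n,
                  int k)
{
    const float alpha = 1.0f;
    const float beta = 0.0f;
    // No transposition on either operand; the wrapper derives the leading
    // dimensions from m/k/n based on these flags.
    return cublas_gemm_ex(handle,
                          rocblas_operation_none,
                          rocblas_operation_none,
                          m,
                          n,
                          k,
                          &alpha,
                          &beta,
                          A,
                          B,
                          C,
                          (cublasGemmAlgo_t)0 /* placeholder algo selector */);
}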
csrc/transformer_bak/inference/includes/custom_cuda_layers.h (new file, mode 100644)
#pragma once
#ifdef __HIP_PLATFORM_HCC__
#define HALF_PRECISION_AVAILABLE = 1
#include <hip/hip_cooperative_groups.h>
#else
#if __CUDA_ARCH__ >= 700
#define HALF_PRECISION_AVAILABLE = 1
#endif
#include <cooperative_groups.h>
#endif
#include <cuda.h>
#include <cuda_fp16.h>
#include <stdio.h>
#include <stdlib.h>
#include <cassert>
#include <iostream>
#define MAX_WARP_NUM 32
#define WARP_SIZE 32
#define SMs 80
#define MAX_REGISTERS 256
template <typename T>
void launch_attn_softmax_v2(T* vals,
                            T* mask,
                            bool triangular,
                            bool recompute,
                            bool local_attention,
                            int window_size,
                            int batch_size,
                            int heads,
                            int num_seq,
                            int sequence_length,
                            float scale,
                            cudaStream_t stream);

// Fused bias add with gelu activation
template <typename T>
void launch_bias_gelu(T* input,
                      const T* bias,
                      int intermediate_size,
                      int batch_size,
                      cudaStream_t stream);

template <typename T>
void launch_bias_add(T* input, const T* bias, int hidden_size, int batch_size, cudaStream_t stream);

template <typename T>
void launch_bias_residual(T* input,
                          T* output,
                          T* attn,
                          T* bias,
                          T* attn_bias,
                          int batch,
                          int hidden_dim,
                          int mp_size,
                          cudaStream_t stream);

template <typename T>
void launch_layer_norm(T* out,
                       T* vals,
                       const T* gamma,
                       const T* beta,
                       float epsilon,
                       int batch_size,
                       int hidden_dim,
                       cudaStream_t stream);

template <typename T>
void launch_residual_layer_norm(T* norm,
                                T* res_add,
                                T* vals,
                                T* residual,
                                const T* bias,
                                const T* gamma,
                                const T* beta,
                                float epsilon,
                                int batch_size,
                                int hidden_dim,
                                bool preLN,
                                bool mlp_after_attn,
                                cudaStream_t stream);

template <typename T>
void launch_dequantize(T* output,
                       const int8_t* input,
                       const float* qscale,
                       unsigned output_size,
                       unsigned hidden_dim,
                       unsigned groups,
                       unsigned merge_count,
                       cudaStream_t stream);

template <typename T>
void launch_gptj_residual_add(T* input,
                              T* output,
                              T* attn,
                              T* bias,
                              T* attn_bias,
                              int batch,
                              int head_size,
                              int mp_size,
                              cudaStream_t stream);

template <typename T>
void launch_apply_rotary_pos_emb(T* mixed_query,
                                 T* key_layer,
                                 unsigned head_size,
                                 unsigned seq_len,
                                 unsigned rotary_dim,
                                 unsigned offset,
                                 unsigned num_heads,
                                 unsigned batch,
                                 bool rotate_half,
                                 bool rotate_every_two,
                                 cudaStream_t stream);

template <typename T>
void launch_moe_res_matmul(T* residual,
                           T* coef,
                           T* mlp_out,
                           int seq_len,
                           int hidden_dim,
                           cudaStream_t stream);
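As a usage illustration only: a minimal sketch of calling one of the launchers declared above, assuming the corresponding kernel is instantiated for __half somewhere in the project's device sources and that `input` and `bias` are device buffers. The wrapper name apply_bias_gelu and the buffer layout are assumptions, not part of this header.

// Hypothetical call site for the fused bias + GeLU launcher declared above (not part of the commit).
// `input` is assumed to hold batch_size rows of intermediate_size elements on the device,
// and `bias` a single row of intermediate_size elements.
#include "custom_cuda_layers.h"

void apply_bias_gelu(__half* input,
                     const __half* bias,
                     int intermediate_size,
                     int batch_size,
                     cudaStream_t stream)
{
    // Single fused launch: bias add followed by GeLU, applied in place on `input`.
    launch_bias_gelu<__half>(input, bias, intermediate_size, batch_size, stream);
}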