Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Lmdeploy
Commits
9484fd1c
"vscode:/vscode.git/clone" did not exist on "f594c8eb9162bfb6c74531831ceb4a99d6533fbe"
Commit
9484fd1c
authored
Dec 20, 2023
by
xiabo
Browse files
Adapt to 0.1.0
parent
477f2db8
Changes
56
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
2119 additions
and
2037 deletions
+2119
-2037
src/turbomind/utils/cublasMMWrapper.cc
src/turbomind/utils/cublasMMWrapper.cc
+192
-189
src/turbomind/utils/cublasMMWrapper.h
src/turbomind/utils/cublasMMWrapper.h
+14
-14
src/turbomind/utils/cuda_type_utils.cuh
src/turbomind/utils/cuda_type_utils.cuh
+22
-10
src/turbomind/utils/custom_ar_comm.cc
src/turbomind/utils/custom_ar_comm.cc
+7
-7
src/turbomind/utils/gemm.cc
src/turbomind/utils/gemm.cc
+96
-94
src/turbomind/utils/gemm.h
src/turbomind/utils/gemm.h
+4
-4
src/turbomind/utils/gemm_test/CMakeLists.txt
src/turbomind/utils/gemm_test/CMakeLists.txt
+45
-32
src/turbomind/utils/gemm_test/decoding_gemm_func.cc
src/turbomind/utils/gemm_test/decoding_gemm_func.cc
+48
-41
src/turbomind/utils/gemm_test/encoder_gemm_func.cc
src/turbomind/utils/gemm_test/encoder_gemm_func.cc
+40
-33
src/turbomind/utils/gemm_test/encoder_igemm_func.cc
src/turbomind/utils/gemm_test/encoder_igemm_func.cc
+711
-711
src/turbomind/utils/gemm_test/gemm_func.cc
src/turbomind/utils/gemm_test/gemm_func.cc
+708
-708
src/turbomind/utils/gemm_test/gpt_gemm_func.cc
src/turbomind/utils/gemm_test/gpt_gemm_func.cc
+71
-53
src/turbomind/utils/gemm_test/swin_gemm_func.cc
src/turbomind/utils/gemm_test/swin_gemm_func.cc
+40
-33
src/turbomind/utils/gemm_test/swin_igemm_func.cc
src/turbomind/utils/gemm_test/swin_igemm_func.cc
+17
-17
src/turbomind/utils/gemm_test/t5_gemm_func.cc
src/turbomind/utils/gemm_test/t5_gemm_func.cc
+64
-58
src/turbomind/utils/gemm_test/xlnet_gemm_func.cc
src/turbomind/utils/gemm_test/xlnet_gemm_func.cc
+40
-33
No files found.
src/turbomind/utils/cublasMMWrapper.cc
View file @
9484fd1c
...
...
@@ -185,124 +185,126 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
cublasLtMatmulAlgo_info
info
=
cublas_algo_map_
->
getAlgo
(
batch_count
,
m
,
n
,
k
,
getCublasDataType
(
Atype_
));
if
(
findAlgo
)
{
if
(
info
.
stages
!=
-
1
)
{
using_cublasLt
=
true
;
}
else
{
using_cublasLt
=
false
;
}
}
if
(
using_cublasLt
)
{
cublasLtMatmulDesc_t
operationDesc
=
NULL
;
cublasLtMatrixLayout_t
Adesc
=
NULL
,
Bdesc
=
NULL
,
Cdesc
=
NULL
;
cudaDataType_t
scaleType
;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t
computeType
;
#else
cudaDataType_t
computeType
;
#endif
if
(
is_fp16_computeType
)
{
#if (CUDART_VERSION >= 11000)
computeType
=
CUBLAS_COMPUTE_16F
;
#else
computeType
=
CUDA_R_16F
;
#endif
scaleType
=
CUDA_R_16F
;
}
else
{
#if (CUDART_VERSION >= 11000)
computeType
=
CUBLAS_COMPUTE_32F
;
#else
computeType
=
CUDA_R_32F
;
#endif
scaleType
=
CUDA_R_32F
;
}
// --------------------------------------
// Create descriptors for the original matrices
cublasLtMatrixLayoutCreate
(
&
Adesc
,
Atype_
,
transa
==
CUBLAS_OP_N
?
m
:
k
,
transa
==
CUBLAS_OP_N
?
k
:
m
,
lda
);
cublasLtMatrixLayoutCreate
(
&
Bdesc
,
Btype_
,
transb
==
CUBLAS_OP_N
?
k
:
n
,
transb
==
CUBLAS_OP_N
?
n
:
k
,
ldb
);
cublasLtMatrixLayoutCreate
(
&
Cdesc
,
Ctype_
,
m
,
n
,
ldc
);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate
(
&
operationDesc
,
computeType
,
scaleType
);
#else
cublasLtMatmulDescCreate
(
&
operationDesc
,
computeType
);
#endif
cublasLtMatmulDescSetAttribute
(
operationDesc
,
CUBLASLT_MATMUL_DESC_TRANSA
,
&
transa
,
sizeof
(
cublasOperation_t
));
cublasLtMatmulDescSetAttribute
(
operationDesc
,
CUBLASLT_MATMUL_DESC_TRANSB
,
&
transb
,
sizeof
(
cublasOperation_t
));
cublasLtMatmulAlgo_t
algo
;
void
*
workSpace
=
cublas_workspace_
;
int
workspaceSize
=
cublas_workspace_
==
NULL
?
0
:
CUBLAS_WORKSPACE_SIZE
;
if
(
findAlgo
)
{
if
(
info
.
workspaceSize
>
workspaceSize
)
{
findAlgo
=
0
;
}
else
{
cublasLtMatmulAlgoInit
(
cublaslt_handle_
,
computeType
,
scaleType
,
Atype_
,
Btype_
,
Ctype_
,
Ctype_
,
info
.
algoId
,
&
algo
);
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION
,
&
(
info
.
customOption
),
sizeof
(
info
.
customOption
));
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_TILE_ID
,
&
(
info
.
tile
),
sizeof
(
info
.
tile
));
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_SPLITK_NUM
,
&
(
info
.
splitK_val
),
sizeof
(
info
.
splitK_val
));
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
,
&
(
info
.
swizzle
),
sizeof
(
info
.
swizzle
));
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
info
.
reductionScheme
),
sizeof
(
info
.
reductionScheme
));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_STAGES_ID
,
&
(
info
.
stages
),
sizeof
(
info
.
stages
));
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID
,
&
(
info
.
inner_shapeId
),
sizeof
(
info
.
inner_shapeId
));
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID
,
&
(
info
.
cluster_shapeId
),
sizeof
(
info
.
cluster_shapeId
));
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID
,
&
(
info
.
mma_shapeId
),
sizeof
(
info
.
mma_shapeId
));
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID
,
&
(
info
.
cga_shapeId
),
sizeof
(
info
.
cga_shapeId
));
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE
,
&
(
info
.
sche_mode
),
sizeof
(
info
.
sche_mode
));
#endif
using_cublasLt
=
false
;
}
}
cublasLtMatmul
(
cublaslt_handle_
,
operationDesc
,
alpha
,
A
,
Adesc
,
B
,
Bdesc
,
beta
,
C
,
Cdesc
,
C
,
Cdesc
,
(
findAlgo
==
1
?
(
&
algo
)
:
NULL
),
workSpace
,
workspaceSize
,
stream_
);
cublasLtMatmulDescDestroy
(
operationDesc
);
cublasLtMatrixLayoutDestroy
(
Adesc
);
cublasLtMatrixLayoutDestroy
(
Bdesc
);
cublasLtMatrixLayoutDestroy
(
Cdesc
);
sync_check_cuda_error
();
}
else
{
// if (using_cublasLt) {
// if (0) {
// cublasLtMatmulDesc_t operationDesc = NULL;
// cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
// cudaDataType_t scaleType;
// #if (CUDART_VERSION >= 11000)
// cublasComputeType_t computeType;
// #else
// cudaDataType_t computeType;
// #endif
// if (is_fp16_computeType) {
// #if (CUDART_VERSION >= 11000)
// computeType = CUBLAS_COMPUTE_16F;
// #else
// computeType = CUDA_R_16F;
// #endif
// scaleType = CUDA_R_16F;
// }
// else {
// #if (CUDART_VERSION >= 11000)
// computeType = CUBLAS_COMPUTE_32F;
// #else
// computeType = CUDA_R_32F;
// #endif
// scaleType = CUDA_R_32F;
// }
// // --------------------------------------
// // Create descriptors for the original matrices
// cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
// cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
// cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc);
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
// #else
// cublasLtMatmulDescCreate(&operationDesc, computeType);
// #endif
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t));
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t));
// cublasLtMatmulAlgo_t algo;
// void* workSpace = cublas_workspace_;
// int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
// if (findAlgo) {
// if (info.workspaceSize > workspaceSize) {
// findAlgo = 0;
// }
// else {
// cublasLtMatmulAlgoInit(
// cublaslt_handle_, computeType, scaleType, Atype_, Btype_, Ctype_, Ctype_, info.algoId, &algo);
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
// cublasLtMatmulAlgoConfigSetAttribute(&algo,
// CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
// &(info.reductionScheme),
// sizeof(info.reductionScheme));
// // #if (CUDART_VERSION >= 11000)
// // cublasLtMatmulAlgoConfigSetAttribute(
// // &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
// // #endif
// #if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &(info.inner_shapeId), sizeof(info.inner_shapeId));
// cublasLtMatmulAlgoConfigSetAttribute(&algo,
// CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID,
// &(info.cluster_shapeId),
// sizeof(info.cluster_shapeId));
// #elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &(info.mma_shapeId), sizeof(info.mma_shapeId));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &(info.cga_shapeId), sizeof(info.cga_shapeId));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &(info.sche_mode), sizeof(info.sche_mode));
// #endif
// }
// }
// // cublasLtMatmul(cublaslt_handle_,
// // operationDesc,
// // alpha,
// // A,
// // Adesc,
// // B,
// // Bdesc,
// // beta,
// // C,
// // Cdesc,
// // C,
// // Cdesc,
// // (findAlgo == 1 ? (&algo) : NULL),
// // workSpace,
// // workspaceSize,
// // stream_);
// cublasLtMatmulDescDestroy(operationDesc);
// cublasLtMatrixLayoutDestroy(Adesc);
// cublasLtMatrixLayoutDestroy(Bdesc);
// cublasLtMatrixLayoutDestroy(Cdesc);
// sync_check_cuda_error();
// }
// else {
int
cublasAlgo
=
info
.
algoId
;
check_cuda_error
(
cublasGemmEx
(
cublas_handle_
,
transa
,
...
...
@@ -324,7 +326,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
computeType_
,
static_cast
<
cublasGemmAlgo_t
>
(
cublasAlgo
)));
sync_check_cuda_error
();
}
//
}
mu_
->
unlock
();
}
...
...
@@ -341,7 +343,7 @@ void cublasMMWrapper::setFP16GemmConfig()
Atype_
=
CUDA_R_16F
;
Btype_
=
CUDA_R_16F
;
Ctype_
=
CUDA_R_16F
;
computeType_
=
CUDA_R_
32
F
;
computeType_
=
CUDA_R_
16
F
;
}
#ifdef ENABLE_BF16
...
...
@@ -381,81 +383,81 @@ CublasDataType cublasMMWrapper::getCublasDataType(cudaDataType_t data_type)
return
FLOAT_DATATYPE
;
}
#if (CUDART_VERSION >= 11000)
// input, weight, output are row-major
// only works for cublas 11.x
void
cublasMMWrapper
::
Gemm
(
cublasOperation_t
transa
,
cublasOperation_t
transb
,
const
int
m
,
const
int
n
,
const
int
k
,
const
void
*
A
,
const
int
lda
,
const
void
*
B
,
const
int
ldb
,
const
void
*
bias
,
void
*
C
,
const
int
ldc
)
{
TM_LOG_DEBUG
(
__PRETTY_FUNCTION__
);
cudaDataType_t
Atype
,
Btype
,
Ctype
;
cublasComputeType_t
computeType
;
cudaDataType_t
scaleType
;
float
alpha_float
=
1.0
f
;
float
beta_float
=
0.0
f
;
half
alpha_half
=
half
(
1.0
f
);
half
beta_half
=
half
(
0.0
f
);
void
*
alpha
,
*
beta
;
// int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0;
if
(
Atype_
==
CUDA_R_32F
)
{
computeType
=
CUBLAS_COMPUTE_32F_FAST_TF32
;
Atype
=
CUDA_R_32F
;
Btype
=
CUDA_R_32F
;
Ctype
=
CUDA_R_32F
;
scaleType
=
CUDA_R_32F
;
alpha
=
&
alpha_float
;
beta
=
&
beta_float
;
}
else
if
(
Atype_
==
CUDA_R_16BF
)
{
computeType
=
CUBLAS_COMPUTE_32F_FAST_TF32
;
Atype
=
CUDA_R_16BF
;
Btype
=
CUDA_R_16BF
;
Ctype
=
CUDA_R_16BF
;
scaleType
=
CUDA_R_32F
;
alpha
=
&
alpha_float
;
beta
=
&
beta_float
;
}
else
{
computeType
=
CUBLAS_COMPUTE_16F
;
Atype
=
CUDA_R_16F
;
Btype
=
CUDA_R_16F
;
Ctype
=
CUDA_R_16F
;
scaleType
=
CUDA_R_16F
;
alpha
=
&
alpha_half
;
beta
=
&
beta_half
;
}
cublasLtMatmulDesc_t
operationDesc
=
NULL
;
cublasLtMatrixLayout_t
Adesc
=
NULL
,
Bdesc
=
NULL
,
Cdesc
=
NULL
;
cublasLtEpilogue_t
epi
=
CUBLASLT_EPILOGUE_BIAS
;
cublasLtMatrixLayoutCreate
(
&
Adesc
,
Atype
,
(
transa
==
CUBLAS_OP_N
)
?
m
:
k
,
(
transa
==
CUBLAS_OP_N
)
?
k
:
m
,
lda
);
cublasLtMatrixLayoutCreate
(
&
Bdesc
,
Btype
,
(
transb
==
CUBLAS_OP_N
)
?
k
:
n
,
(
transb
==
CUBLAS_OP_N
)
?
n
:
k
,
ldb
);
cublasLtMatrixLayoutCreate
(
&
Cdesc
,
Ctype
,
m
,
n
,
ldc
);
cublasLtMatmulDescCreate
(
&
operationDesc
,
computeType
,
scaleType
);
cublasLtMatmulDescSetAttribute
(
operationDesc
,
CUBLASLT_MATMUL_DESC_TRANSA
,
&
transa
,
sizeof
(
cublasOperation_t
));
cublasLtMatmulDescSetAttribute
(
operationDesc
,
CUBLASLT_MATMUL_DESC_TRANSB
,
&
transb
,
sizeof
(
cublasOperation_t
));
cublasLtMatmulDescSetAttribute
(
operationDesc
,
CUBLASLT_MATMUL_DESC_EPILOGUE
,
&
epi
,
sizeof
(
cublasLtEpilogue_t
));
cublasLtMatmulDescSetAttribute
(
operationDesc
,
CUBLASLT_MATMUL_DESC_BIAS_POINTER
,
&
bias
,
sizeof
(
const
void
*
));
check_cuda_error
(
cublasLtMatmul
(
cublaslt_handle_
,
operationDesc
,
alpha
,
A
,
Adesc
,
B
,
Bdesc
,
beta
,
C
,
Cdesc
,
C
,
Cdesc
,
NULL
,
NULL
,
0
,
stream_
));
cublasLtMatrixLayoutDestroy
(
Adesc
);
cublasLtMatrixLayoutDestroy
(
Bdesc
);
cublasLtMatrixLayoutDestroy
(
Cdesc
);
cublasLtMatmulDescDestroy
(
operationDesc
);
}
#endif
//
#if (CUDART_VERSION >= 11000)
//
// input, weight, output are row-major
//
// only works for cublas 11.x
//
void cublasMMWrapper::Gemm(cublasOperation_t transa,
//
cublasOperation_t transb,
//
const int m,
//
const int n,
//
const int k,
//
const void* A,
//
const int lda,
//
const void* B,
//
const int ldb,
//
const void* bias,
//
void* C,
//
const int ldc)
//
{
//
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
//
cudaDataType_t Atype, Btype, Ctype;
//
cublasComputeType_t computeType;
//
cudaDataType_t scaleType;
//
float alpha_float = 1.0f;
//
float beta_float = 0.0f;
//
half alpha_half = half(1.0f);
//
half beta_half = half(0.0f);
//
void * alpha, *beta;
//
// int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0;
//
if (Atype_ == CUDA_R_32F) {
//
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
//
Atype = CUDA_R_32F;
//
Btype = CUDA_R_32F;
//
Ctype = CUDA_R_32F;
//
scaleType = CUDA_R_32F;
//
alpha = &alpha_float;
//
beta = &beta_float;
//
}
//
else if (Atype_ == CUDA_R_16BF) {
//
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
//
Atype = CUDA_R_16BF;
//
Btype = CUDA_R_16BF;
//
Ctype = CUDA_R_16BF;
//
scaleType = CUDA_R_32F;
//
alpha = &alpha_float;
//
beta = &beta_float;
//
}
//
else {
//
computeType = CUBLAS_COMPUTE_16F;
//
Atype = CUDA_R_16F;
//
Btype = CUDA_R_16F;
//
Ctype = CUDA_R_16F;
//
scaleType = CUDA_R_16F;
//
alpha = &alpha_half;
//
beta = &beta_half;
//
}
//
cublasLtMatmulDesc_t operationDesc = NULL;
//
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
//
cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS;
//
cublasLtMatrixLayoutCreate(&Adesc, Atype, (transa == CUBLAS_OP_N) ? m : k, (transa == CUBLAS_OP_N) ? k : m, lda);
//
cublasLtMatrixLayoutCreate(&Bdesc, Btype, (transb == CUBLAS_OP_N) ? k : n, (transb == CUBLAS_OP_N) ? n : k, ldb);
//
cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldc);
//
cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
//
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t));
//
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t));
//
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(cublasLtEpilogue_t));
//
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(const void*));
//
//
check_cuda_error(cublasLtMatmul(
//
//
cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, Cdesc, NULL, NULL, 0, stream_));
//
cublasLtMatrixLayoutDestroy(Adesc);
//
cublasLtMatrixLayoutDestroy(Bdesc);
//
cublasLtMatrixLayoutDestroy(Cdesc);
//
cublasLtMatmulDescDestroy(operationDesc);
//
}
//
#endif
void
cublasMMWrapper
::
setStream
(
cudaStream_t
stream
)
{
stream_
=
stream
;
...
...
@@ -985,7 +987,8 @@ void cublasMMWrapper::_Int8Gemm(const int m,
* - 0: int8 * int8 -> int32 -> int8
* - 1: int8 * int8 -> int32 -> int32
*/
#if (CUBLAS_VERSION) <= 11601
// #if (CUBLAS_VERSION) <= 11601
#if 1
FT_CHECK_WITH_INFO
(
false
,
"CUBLAS version too low."
);
#else
...
...
src/turbomind/utils/cublasMMWrapper.h
View file @
9484fd1c
...
...
@@ -207,20 +207,20 @@ public:
CublasDataType
getCublasDataType
(
cudaDataType_t
data_type
);
#if (CUDART_VERSION >= 11000)
void
Gemm
(
cublasOperation_t
transa
,
cublasOperation_t
transb
,
const
int
m
,
const
int
n
,
const
int
k
,
const
void
*
A
,
const
int
lda
,
const
void
*
B
,
const
int
ldb
,
const
void
*
bias
,
void
*
C
,
const
int
ldc
);
#endif
//
#if (CUDART_VERSION >= 11000)
//
void Gemm(cublasOperation_t transa,
//
cublasOperation_t transb,
//
const int m,
//
const int n,
//
const int k,
//
const void* A,
//
const int lda,
//
const void* B,
//
const int ldb,
//
const void* bias,
//
void* C,
//
const int ldc);
//
#endif
void
stridedBatchedGemm
(
cublasOperation_t
transa
,
cublasOperation_t
transb
,
...
...
src/turbomind/utils/cuda_type_utils.cuh
View file @
9484fd1c
...
...
@@ -322,7 +322,7 @@ __device__ inline int8_t cuda_cast<int8_t, half>(half val)
int16_t
int16_in
;
};
fp16
=
val
;
asm
volatile
(
"cvt.rni.sat.s8.f16 %0, %1;"
:
"=h"
(
int16
)
:
"h"
(
int16_in
));
//
asm volatile("cvt.rni.sat.s8.f16 %0, %1;" : "=h"(int16) : "h"(int16_in));
return
int8
[
0
];
}
...
...
@@ -333,20 +333,31 @@ __device__ inline int16_t cuda_cast<int16_t, half2>(half2 val)
int8_t
int8
[
2
];
int16_t
int16
;
};
int8
[
0
]
=
cuda_cast
<
int8_t
>
(
val
.
x
);
int8
[
1
]
=
cuda_cast
<
int8_t
>
(
val
.
y
);
// int8[0] = cuda_cast<int8_t>(val.x);
// int8[1] = cuda_cast<int8_t>(val.y);
int8
[
0
]
=
cuda_cast
<
int8_t
>
((
val
.
data
[
0
]));
int8
[
1
]
=
cuda_cast
<
int8_t
>
((
val
.
data
[
1
]));
return
int16
;
}
template
<
>
__device__
inline
int8_t
cuda_cast
<
int8_t
,
float
>
(
float
val
)
{
union
{
int8_t
int8
[
2
];
int16_t
int16
;
};
asm
volatile
(
"cvt.rni.sat.s8.f32 %0, %1;"
:
"=h"
(
int16
)
:
"f"
(
val
));
return
int8
[
0
];
// union {
// int8_t int8[2];
// int16_t int16;
// };
// asm volatile("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val));
// return int8[0];
int8_t
dst
;
if
(
val
>=
128
){
dst
=
127
;
}
else
if
(
val
<
-
128
){
dst
=
-
128
;
}
else
{
dst
=
static_cast
<
int8_t
>
(
val
);
}
return
dst
;
}
template
<
>
...
...
@@ -528,7 +539,8 @@ __device__ inline To cuda_max(Ti val)
template
<
>
__device__
inline
half
cuda_max
(
half2
val
)
{
return
(
val
.
x
>
val
.
y
)
?
val
.
x
:
val
.
y
;
// return (val.x > val.y) ? val.x : val.y;
return
(
val
.
data
[
0
]
>
val
.
data
[
1
])
?
val
.
data
[
0
]
:
val
.
data
[
1
];
}
#ifdef ENABLE_BF16
template
<
>
...
...
src/turbomind/utils/custom_ar_comm.cc
View file @
9484fd1c
...
...
@@ -152,17 +152,17 @@ void initCustomAllReduceComm(std::vector<std::shared_ptr<AbstractCustomComm>>* c
return
;
}
#if defined(CUDART_VERSION) && CUDART_VERSION >= 11020
for
(
size_t
i
=
0
;
i
<
rank_size
;
i
++
)
{
custom_all_reduce_comms
->
push_back
(
std
::
make_shared
<
CustomAllReduceComm
<
T
>>
(
rank_size
,
i
));
}
custom_all_reduce_comms
->
at
(
0
)
->
allocateAndExchangePeerAccessPointer
(
custom_all_reduce_comms
);
#else
//
#if defined(CUDART_VERSION) && CUDART_VERSION >= 11020
//
for (size_t i = 0; i < rank_size; i++) {
//
custom_all_reduce_comms->push_back(std::make_shared<CustomAllReduceComm<T>>(rank_size, i));
//
}
//
custom_all_reduce_comms->at(0)->allocateAndExchangePeerAccessPointer(custom_all_reduce_comms);
//
#else
TM_LOG_WARNING
(
"Custom All Reduce is not supported before CUDA 11.2. Using NCCL as Comm."
);
for
(
size_t
i
=
0
;
i
<
rank_size
;
i
++
)
{
custom_all_reduce_comms
->
push_back
(
nullptr
);
}
#endif
//
#endif
}
// Template instantiation
...
...
src/turbomind/utils/gemm.cc
View file @
9484fd1c
...
...
@@ -26,7 +26,7 @@ Gemm::Gemm(IAllocator* allocator, cudaStream_t stream, std::string config_file)
stream_
=
stream
;
mutex_
=
new
std
::
mutex
();
// mutex per process
check_cuda_error
(
cublasCreate
(
&
cublas_handle_
));
check_cuda_error
(
cublasLtCreate
(
&
cublaslt_handle_
));
//
check_cuda_error(cublasLtCreate(&cublaslt_handle_));
check_cuda_error
(
cublasSetStream
(
cublas_handle_
,
stream
));
if
(
allocator_
!=
nullptr
)
{
...
...
@@ -41,7 +41,7 @@ Gemm::~Gemm()
allocator_
->
free
((
void
**
)(
&
workspace_
));
allocator_
=
nullptr
;
}
cublasLtDestroy
(
cublaslt_handle_
);
//
cublasLtDestroy(cublaslt_handle_);
cublasDestroy
(
cublas_handle_
);
delete
cublas_algo_map_
;
delete
mutex_
;
...
...
@@ -248,7 +248,8 @@ void Gemm::gemm(const GemmOp transa,
mutex_
->
lock
();
// Use cublas as default in FP32 and cublasLt as default in FP16
bool
is_fp16_compute_type
=
compute_type_
==
TYPE_FP16
;
bool
using_cublasLt
=
Atype
==
TYPE_FP16
;
// bool using_cublasLt = Atype == TYPE_FP16;
bool
using_cublasLt
=
(
Atype
==
TYPE_FP16
)
?
false
:
false
;
int
batch_count
=
1
;
half
h_alpha
=
(
half
)
alpha
;
...
...
@@ -267,82 +268,83 @@ void Gemm::gemm(const GemmOp transa,
using_cublasLt
=
(
info
.
stages
!=
-
1
);
}
if
(
using_cublasLt
)
{
const
size_t
a_rows
=
(
a_op
==
getCublasOperation
(
GEMM_OP_N
))
?
_m
:
k
;
const
size_t
a_cols
=
(
a_op
==
getCublasOperation
(
GEMM_OP_N
))
?
k
:
_m
;
const
size_t
b_rows
=
(
b_op
==
getCublasOperation
(
GEMM_OP_N
))
?
k
:
_n
;
const
size_t
b_cols
=
(
b_op
==
getCublasOperation
(
GEMM_OP_N
))
?
_n
:
k
;
cublasLtMatmulDesc_t
matmul_desc
=
NULL
;
cublasLtMatrixLayout_t
a_desc
=
NULL
,
b_desc
=
NULL
,
c_desc
=
NULL
;
cudaDataType_t
scale_type
=
getCublasDataType
(
compute_type_
);
auto
compute_type
=
getCublasComputeType
(
compute_type_
);
// --------------------------------------
// Create descriptors for the original matrices
cublasLtMatrixLayoutCreate
(
&
a_desc
,
a_type
,
a_rows
,
a_cols
,
_lda
);
cublasLtMatrixLayoutCreate
(
&
b_desc
,
b_type
,
b_rows
,
b_cols
,
_ldb
);
cublasLtMatrixLayoutCreate
(
&
c_desc
,
c_type
,
_m
,
_n
,
ldc
);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate
(
&
matmul_desc
,
compute_type
,
scale_type
);
#else
cublasLtMatmulDescCreate
(
&
matmul_desc
,
compute_type
);
#endif
cublasLtMatmulDescSetAttribute
(
matmul_desc
,
CUBLASLT_MATMUL_DESC_TRANSA
,
&
a_op
,
sizeof
(
cublasOperation_t
));
cublasLtMatmulDescSetAttribute
(
matmul_desc
,
CUBLASLT_MATMUL_DESC_TRANSB
,
&
b_op
,
sizeof
(
cublasOperation_t
));
cublasLtMatmulAlgo_t
algo
;
void
*
workspace
=
workspace_
;
int
workspace_size
=
workspace_
==
nullptr
?
0
:
CUBLAS_WORKSPACE_SIZE
;
if
(
findAlgo
)
{
if
(
info
.
workspaceSize
>
workspace_size
)
{
findAlgo
=
0
;
}
else
{
cublasLtMatmulAlgoInit
(
cublaslt_handle_
,
compute_type
,
scale_type
,
a_type
,
b_type
,
c_type
,
c_type
,
info
.
algoId
,
&
algo
);
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION
,
&
(
info
.
customOption
),
sizeof
(
info
.
customOption
));
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_TILE_ID
,
&
(
info
.
tile
),
sizeof
(
info
.
tile
));
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_SPLITK_NUM
,
&
(
info
.
splitK_val
),
sizeof
(
info
.
splitK_val
));
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
,
&
(
info
.
swizzle
),
sizeof
(
info
.
swizzle
));
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
,
&
(
info
.
reductionScheme
),
sizeof
(
int
));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute
(
&
algo
,
CUBLASLT_ALGO_CONFIG_STAGES_ID
,
&
(
info
.
stages
),
sizeof
(
info
.
stages
));
#endif
}
}
// if (using_cublasLt) {
// if(0) {
// const size_t a_rows = (a_op == getCublasOperation(GEMM_OP_N)) ? _m : k;
// const size_t a_cols = (a_op == getCublasOperation(GEMM_OP_N)) ? k : _m;
// const size_t b_rows = (b_op == getCublasOperation(GEMM_OP_N)) ? k : _n;
// const size_t b_cols = (b_op == getCublasOperation(GEMM_OP_N)) ? _n : k;
// cublasLtMatmulDesc_t matmul_desc = NULL;
// cublasLtMatrixLayout_t a_desc = NULL, b_desc = NULL, c_desc = NULL;
// cudaDataType_t scale_type = getCublasDataType(compute_type_);
// auto compute_type = getCublasComputeType(compute_type_);
// // --------------------------------------
// // Create descriptors for the original matrices
// cublasLtMatrixLayoutCreate(&a_desc, a_type, a_rows, a_cols, _lda);
// cublasLtMatrixLayoutCreate(&b_desc, b_type, b_rows, b_cols, _ldb);
// cublasLtMatrixLayoutCreate(&c_desc, c_type, _m, _n, ldc);
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_type);
// #else
// cublasLtMatmulDescCreate(&matmul_desc, compute_type);
// #endif
// cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &a_op, sizeof(cublasOperation_t));
// cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &b_op, sizeof(cublasOperation_t));
// cublasLtMatmulAlgo_t algo;
// void* workspace = workspace_;
// int workspace_size = workspace_ == nullptr ? 0 : CUBLAS_WORKSPACE_SIZE;
// if (findAlgo) {
// if (info.workspaceSize > workspace_size) {
// findAlgo = 0;
// }
// else {
// cublasLtMatmulAlgoInit(
// cublaslt_handle_, compute_type, scale_type, a_type, b_type, c_type, c_type, info.algoId, &algo);
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(int));
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
// #endif
// }
// }
cublasLtMatmul
(
cublaslt_handle_
,
matmul_desc
,
alpha_ptr
,
a_data_ptr
,
a_desc
,
b_data_ptr
,
b_desc
,
beta_ptr
,
C
,
c_desc
,
C
,
c_desc
,
(
findAlgo
==
1
?
(
&
algo
)
:
NULL
),
workspace
,
workspace_size
,
stream_
);
cublasLtMatmulDescDestroy
(
matmul_desc
);
cublasLtMatrixLayoutDestroy
(
a_desc
);
cublasLtMatrixLayoutDestroy
(
b_desc
);
cublasLtMatrixLayoutDestroy
(
c_desc
);
sync_check_cuda_error
();
}
else
{
//
cublasLtMatmul(cublaslt_handle_,
//
matmul_desc,
//
alpha_ptr,
//
a_data_ptr,
//
a_desc,
//
b_data_ptr,
//
b_desc,
//
beta_ptr,
//
C,
//
c_desc,
//
C,
//
c_desc,
//
(findAlgo == 1 ? (&algo) : NULL),
//
workspace,
//
workspace_size,
//
stream_);
//
cublasLtMatmulDescDestroy(matmul_desc);
//
cublasLtMatrixLayoutDestroy(a_desc);
//
cublasLtMatrixLayoutDestroy(b_desc);
//
cublasLtMatrixLayoutDestroy(c_desc);
//
sync_check_cuda_error();
//
}
//
else {
cudaDataType_t
compute_type
=
getCublasDataType
(
compute_type_
);
int
cublas_algo
=
info
.
algoId
;
check_cuda_error
(
cublasGemmEx
(
cublas_handle_
,
...
...
@@ -365,7 +367,7 @@ void Gemm::gemm(const GemmOp transa,
compute_type
,
static_cast
<
cublasGemmAlgo_t
>
(
cublas_algo
)));
sync_check_cuda_error
();
}
//
}
mutex_
->
unlock
();
}
...
...
@@ -1033,19 +1035,19 @@ cudaDataType_t getCublasDataType(DataType dtype)
}
}
#if (CUDART_VERSION >= 11000)
cublasComputeType_t
getCublasComputeType
(
DataType
ctype
)
{
switch
(
ctype
)
{
case
TYPE_FP16
:
return
CUBLAS_COMPUTE_16F
;
case
TYPE_FP32
:
return
CUBLAS_COMPUTE_32F
;
default:
throw
GemmNotSupportedException
(
"Not supported cublas compute type."
);
}
}
#else
//
#if (CUDART_VERSION >= 11000)
//
cublasComputeType_t getCublasComputeType(DataType ctype)
//
{
//
switch (ctype) {
//
case TYPE_FP16:
//
return CUBLAS_COMPUTE_16F;
//
case TYPE_FP32:
//
return CUBLAS_COMPUTE_32F;
//
default:
//
throw GemmNotSupportedException("Not supported cublas compute type.");
//
}
//
}
//
#else
cudaDataType_t
getCublasComputeType
(
DataType
ctype
)
{
switch
(
ctype
)
{
...
...
@@ -1057,7 +1059,7 @@ cudaDataType_t getCublasComputeType(DataType ctype)
throw
GemmNotSupportedException
(
"Not supported cublas compute type."
);
}
}
#endif
//
#endif
cublasOperation_t
getCublasOperation
(
GemmOp
op
)
{
...
...
src/turbomind/utils/gemm.h
View file @
9484fd1c
...
...
@@ -622,11 +622,11 @@ std::shared_ptr<Gemm>
createGemm
(
IAllocator
*
allocator
,
cudaStream_t
stream
,
bool
sparse
=
false
,
bool
quantized
=
false
);
cudaDataType_t
getCublasDataType
(
DataType
dtype
);
#if (CUDART_VERSION >= 11000)
cublasComputeType_t
getCublasComputeType
(
DataType
dtype
);
#else
//
#if (CUDART_VERSION >= 11000)
//
cublasComputeType_t getCublasComputeType(DataType dtype);
//
#else
cudaDataType_t
getCublasComputeType
(
DataType
dtype
);
#endif
//
#endif
cublasOperation_t
getCublasOperation
(
GemmOp
op
);
std
::
string
getGemmOpString
(
const
GemmOp
&
op
);
...
...
src/turbomind/utils/gemm_test/CMakeLists.txt
View file @
9484fd1c
...
...
@@ -13,7 +13,8 @@
# limitations under the License.
cmake_minimum_required
(
VERSION 3.8
)
find_package
(
CUDAToolkit REQUIRED
)
#find_package(CUDAToolkit REQUIRED)
find_package
(
CUDA REQUIRED
)
set
(
gemm_func_files
gemm_func.cc
...
...
@@ -51,59 +52,71 @@ set(swin_gemm_func_files
swin_gemm_func.cc
)
set
(
CMAKE_CXX_FLAGS
"
${
CMAKE_CXX_FLAGS
}
-fPIC"
)
set
(
CMAKE_CUDA_FLAGS
"
${
CMAKE_CUDA_FLAGS
}
-fPIC"
)
add_library
(
gemm_func STATIC
${
gemm_func_files
}
)
target_link_libraries
(
gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger
)
set_property
(
TARGET gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
set_property
(
TARGET gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#target_link_libraries(gemm_func PUBLIC cublas cublasLt cudart cuda_utils logger)
target_link_libraries
(
gemm_func PUBLIC cublas cudart cuda_utils logger
)
#set_property(TARGET gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
encoder_gemm_func STATIC
${
encoder_gemm_func_files
}
)
target_link_libraries
(
encoder_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger
)
#target_link_libraries(encoder_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
target_link_libraries
(
encoder_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger
)
if
(
SPARSITY_SUPPORT
)
target_link_libraries
(
encoder_gemm_func PUBLIC
CUDA::
cusparse -lcusparseLt
)
target_link_libraries
(
encoder_gemm_func PUBLIC cusparse -lcusparseLt
)
endif
()
set_property
(
TARGET encoder_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
set_property
(
TARGET encoder_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET encoder_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#
set_property(TARGET encoder_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
encoder_igemm_func STATIC
${
encoder_igemm_func_files
}
)
target_link_libraries
(
encoder_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger
)
#target_link_libraries(encoder_igemm_func PUBLIC cublas cublasLt cudart cuda_utils logger)
target_link_libraries
(
encoder_igemm_func PUBLIC cublas cudart cuda_utils logger
)
if
(
SPARSITY_SUPPORT
)
target_link_libraries
(
encoder_igemm_func PUBLIC
CUDA::
cusparse -lcusparseLt
)
target_link_libraries
(
encoder_igemm_func PUBLIC cusparse -lcusparseLt
)
endif
()
set_property
(
TARGET encoder_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
set_property
(
TARGET encoder_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET encoder_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#
set_property(TARGET encoder_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
decoding_gemm_func STATIC
${
decoding_gemm_func_files
}
)
target_link_libraries
(
decoding_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger
)
set_property
(
TARGET decoding_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
set_property
(
TARGET decoding_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#target_link_libraries(decoding_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
target_link_libraries
(
decoding_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger
)
#set_property(TARGET decoding_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET decoding_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
gpt_gemm_func STATIC
${
gpt_gemm_func_files
}
)
target_link_libraries
(
gpt_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger
)
#target_link_libraries(gpt_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
target_link_libraries
(
gpt_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger
)
if
(
SPARSITY_SUPPORT
)
target_link_libraries
(
gpt_gemm_func PUBLIC
CUDA::
cusparse -lcusparseLt
)
target_link_libraries
(
gpt_gemm_func PUBLIC cusparse -lcusparseLt
)
endif
()
set_property
(
TARGET gpt_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
set_property
(
TARGET gpt_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET gpt_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#
set_property(TARGET gpt_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
xlnet_gemm_func STATIC
${
xlnet_gemm_func_files
}
)
target_link_libraries
(
xlnet_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger
)
set_property
(
TARGET xlnet_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
set_property
(
TARGET xlnet_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#target_link_libraries(xlnet_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
target_link_libraries
(
xlnet_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger
)
#set_property(TARGET xlnet_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET xlnet_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
t5_gemm_func STATIC
${
t5_gemm_func_files
}
)
target_link_libraries
(
t5_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger
)
#target_link_libraries(t5_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
target_link_libraries
(
t5_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger
)
if
(
SPARSITY_SUPPORT
)
target_link_libraries
(
t5_gemm_func PUBLIC
CUDA::
cusparse -lcusparseLt
)
target_link_libraries
(
t5_gemm_func PUBLIC cusparse -lcusparseLt
)
endif
()
set_property
(
TARGET t5_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
set_property
(
TARGET t5_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#
set_property(TARGET t5_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#
set_property(TARGET t5_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
swin_igemm_func STATIC
${
swin_igemm_func_files
}
)
target_link_libraries
(
swin_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func encoder_igemm_func cuda_utils logger
)
set_property
(
TARGET swin_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
set_property
(
TARGET swin_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#target_link_libraries(swin_igemm_func PUBLIC cublas cublasLt cudart gemm_func encoder_igemm_func cuda_utils logger)
target_link_libraries
(
swin_igemm_func PUBLIC cublas cudart gemm_func encoder_igemm_func cuda_utils logger
)
#set_property(TARGET swin_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET swin_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library
(
swin_gemm_func STATIC
${
swin_gemm_func_files
}
)
target_link_libraries
(
swin_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger
)
set_property
(
TARGET swin_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON
)
set_property
(
TARGET swin_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON
)
#target_link_libraries(swin_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
target_link_libraries
(
swin_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger
)
#set_property(TARGET swin_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET swin_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
src/turbomind/utils/gemm_test/decoding_gemm_func.cc
View file @
9484fd1c
...
...
@@ -130,8 +130,8 @@ void generate_decoding_gemm_config(int batch_size,
cublasHandle_t
cublas_handle
;
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
cublasLtHandle_t
ltHandle
;
check_cuda_error
(
cublasLtCreate
(
&
ltHandle
));
//
cublasLtHandle_t ltHandle;
//
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t
AType
;
cudaDataType_t
BType
;
...
...
@@ -148,16 +148,19 @@ void generate_decoding_gemm_config(int batch_size,
CType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO23
;
// endAlgo = (int)CUBLAS_GEMM_ALGO23;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
else
if
(
std
::
is_same
<
T
,
half
>::
value
)
{
data_type
=
HALF_DATATYPE
;
AType
=
CUDA_R_16F
;
BType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
computeType
=
CUDA_R_16F
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
#ifdef ENABLE_BF16
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
...
...
@@ -166,11 +169,14 @@ void generate_decoding_gemm_config(int batch_size,
BType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
#endif
using
scaleT
=
typename
ScaleTypeConverter
<
T
>::
Type
;
// using scaleT = typename ScaleTypeConverter<T>::Type;
using
scaleT
=
typename
ScaleTypeConverter
<
T
,
true
>::
Type
;
scaleT
alpha
=
(
scaleT
)
1.0
f
;
scaleT
beta
=
(
scaleT
)
0.0
f
;
...
...
@@ -241,38 +247,39 @@ void generate_decoding_gemm_config(int batch_size,
const
int
ALGO_COMBINATIONS
=
5000
;
customMatmulPerf_t
perfResults
[
ALGO_COMBINATIONS
];
LtHgemmCustomFind
<
T
,
scaleT
>
(
ltHandle
,
batch_size
*
beam_width
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
&
alpha
,
d_B
,
d_A
,
&
beta
,
d_C
,
cublas_workspace
,
workSpaceSize
,
fd
,
perfResults
,
ALGO_COMBINATIONS
);
if
(
perfResults
[
0
].
time
<
exec_time
)
{
printPerfStructure
(
batch_size
*
beam_width
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
perfResults
[
0
],
fd
,
data_type
,
0
);
}
else
{
// LtHgemmCustomFind<T, scaleT>(ltHandle,
// batch_size * beam_width,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &alpha,
// d_B,
// d_A,
// &beta,
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS);
// if (perfResults[0].time < exec_time) {
// printPerfStructure(batch_size * beam_width,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// perfResults[0],
// fd,
// data_type,
// 0);
// }
// else {
{
fprintf
(
fd
,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
...
...
src/turbomind/utils/gemm_test/encoder_gemm_func.cc
View file @
9484fd1c
...
...
@@ -127,8 +127,8 @@ void generate_encoder_gemm_config(
cublasHandle_t
cublas_handle
;
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
cublasLtHandle_t
ltHandle
;
check_cuda_error
(
cublasLtCreate
(
&
ltHandle
));
//
cublasLtHandle_t ltHandle;
//
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t
AType
;
cudaDataType_t
BType
;
...
...
@@ -145,16 +145,19 @@ void generate_encoder_gemm_config(
CType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO23
;
// endAlgo = (int)CUBLAS_GEMM_ALGO23;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
else
if
(
std
::
is_same
<
T
,
half
>::
value
)
{
data_type
=
HALF_DATATYPE
;
AType
=
CUDA_R_16F
;
BType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
computeType
=
CUDA_R_16F
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
#ifdef ENABLE_BF16
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
...
...
@@ -163,11 +166,14 @@ void generate_encoder_gemm_config(
BType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
#endif
using
scaleT
=
typename
ScaleTypeConverter
<
T
,
false
>::
Type
;
// using scaleT = typename ScaleTypeConverter<T, false>::Type;
using
scaleT
=
typename
ScaleTypeConverter
<
T
,
true
>::
Type
;
scaleT
alpha
=
(
scaleT
)
1.0
f
;
scaleT
beta
=
(
scaleT
)
0.0
f
;
...
...
@@ -331,30 +337,31 @@ void generate_encoder_gemm_config(
// Let try a fixed number of combinations
const
int
ALGO_COMBINATIONS
=
5000
;
customMatmulPerf_t
perfResults
[
ALGO_COMBINATIONS
];
LtHgemmCustomFind
<
T
,
scaleT
>
(
ltHandle
,
batch_size
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
&
alpha
,
d_B
,
d_A
,
&
beta
,
d_C
,
cublas_workspace
,
workSpaceSize
,
fd
,
perfResults
,
ALGO_COMBINATIONS
);
if
(
perfResults
[
0
].
time
<
exec_time
)
{
printPerfStructure
(
batch_size
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
perfResults
[
0
],
fd
,
data_type
,
0
);
exec_time
=
perfResults
[
0
].
time
;
}
else
{
// LtHgemmCustomFind<T, scaleT>(ltHandle,
// batch_size,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &alpha,
// d_B,
// d_A,
// &beta,
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS);
// if (perfResults[0].time < exec_time) {
// printPerfStructure(
// batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
// exec_time = perfResults[0].time;
// }
// else {
{
fprintf
(
fd
,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
...
...
src/turbomind/utils/gemm_test/encoder_igemm_func.cc
View file @
9484fd1c
This diff is collapsed.
Click to expand it.
src/turbomind/utils/gemm_test/gemm_func.cc
View file @
9484fd1c
This diff is collapsed.
Click to expand it.
src/turbomind/utils/gemm_test/gpt_gemm_func.cc
View file @
9484fd1c
...
...
@@ -223,8 +223,8 @@ void generate_gpt_gemm_config(int batch_size,
cublasHandle_t
cublas_handle
;
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
cublasLtHandle_t
ltHandle
;
check_cuda_error
(
cublasLtCreate
(
&
ltHandle
));
//
cublasLtHandle_t ltHandle;
//
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t
AType
;
cudaDataType_t
BType
;
...
...
@@ -244,7 +244,8 @@ void generate_gpt_gemm_config(int batch_size,
DType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO23
;
// endAlgo = (int)CUBLAS_GEMM_ALGO23;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
else
if
(
std
::
is_same
<
T
,
half
>::
value
)
{
data_type
=
HALF_DATATYPE
;
...
...
@@ -252,9 +253,11 @@ void generate_gpt_gemm_config(int batch_size,
BType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
DType
=
CUDA_R_16F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
computeType
=
CUDA_R_16F
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
#ifdef ENABLE_BF16
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
...
...
@@ -264,8 +267,10 @@ void generate_gpt_gemm_config(int batch_size,
CType
=
CUDA_R_16BF
;
DType
=
CUDA_R_16BF
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
#endif
#ifdef ENABLE_FP8
...
...
@@ -293,12 +298,24 @@ void generate_gpt_gemm_config(int batch_size,
DType_FP8
[
9
]
=
CUDA_R_16BF
;
#endif
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
#endif
float
alpha
=
(
float
)
1.0
f
;
float
beta
=
(
float
)
0.0
f
;
// float alpha = (float)1.0f;
// float beta = (float)0.0f;
float
f_alpha
=
(
float
)
1.0
f
;
float
f_beta
=
(
float
)
0.0
f
;
half
h_alpha
=
(
half
)(
f_alpha
);
half
h_beta
=
(
half
)(
f_beta
);
int
is_fp16_computeType
=
computeType
==
CUDA_R_16F
?
1
:
0
;
const
void
*
alpha
=
is_fp16_computeType
?
reinterpret_cast
<
void
*>
(
&
h_alpha
)
:
reinterpret_cast
<
void
*>
(
&
f_alpha
);
const
void
*
beta
=
is_fp16_computeType
?
reinterpret_cast
<
void
*>
(
&
h_beta
)
:
reinterpret_cast
<
void
*>
(
&
f_beta
);
printf
(
"***Encoder Gemm Testing Begin***
\n
"
);
printf
(
"***Cublas Gemm Testing Begin***
\n
"
);
...
...
@@ -342,7 +359,7 @@ void generate_gpt_gemm_config(int batch_size,
max_input_len
,
max_input_len
,
size_per_head
,
&
alpha
,
&
f_
alpha
,
d_B
,
BType
,
size_per_head
,
...
...
@@ -351,13 +368,13 @@ void generate_gpt_gemm_config(int batch_size,
AType
,
size_per_head
,
max_input_len
*
size_per_head
,
&
beta
,
&
f_
beta
,
d_C
,
CUDA_R_32F
,
// CType,
max_input_len
,
max_input_len
*
max_input_len
,
batchCount
[
i
],
computeType
,
CUDA_R_32F
,
static_cast
<
cublasGemmAlgo_t
>
(
algo
));
}
else
if
(
i
==
2
)
{
...
...
@@ -456,44 +473,45 @@ void generate_gpt_gemm_config(int batch_size,
customMatmulPerf_t
perfResults
[
ALGO_COMBINATIONS
];
// for gpt, computeType & scaleType should be FP32
LtHgemmCustomFind
<
T
,
float
>
(
ltHandle
,
batch_size
*
beam_width
,
i
==
1
||
i
==
2
?
max_input_len
:
1
,
head_num
,
size_per_head
,
n
,
m
,
k
,
&
alpha
,
d_B
,
d_A
,
&
beta
,
d_C
,
cublas_workspace
,
workSpaceSize
,
fd
,
perfResults
,
ALGO_COMBINATIONS
,
DType_FP8
[
i
],
batchCount
[
i
],
strideA
[
i
],
strideB
[
i
],
strideD
[
i
]);
if
(
perfResults
[
0
].
time
<
exec_time
)
{
printPerfStructure
(
batch_size
*
beam_width
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
perfResults
[
0
],
fd
,
data_type
,
0
,
batchCount
[
i
]);
}
else
{
// LtHgemmCustomFind<T, float>(ltHandle,
// batch_size * beam_width,
// i == 1 || i == 2 ? max_input_len : 1,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &alpha,
// d_B,
// d_A,
// &beta,
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS,
// DType_FP8[i],
// batchCount[i],
// strideA[i],
// strideB[i],
// strideD[i]);
// if (perfResults[0].time < exec_time) {
// printPerfStructure(batch_size * beam_width,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// perfResults[0],
// fd,
// data_type,
// 0,
// batchCount[i]);
// }
// else {
{
fprintf
(
fd
,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
...
...
src/turbomind/utils/gemm_test/swin_gemm_func.cc
View file @
9484fd1c
...
...
@@ -133,8 +133,8 @@ void generate_swin_gemm_config(
cublasHandle_t
cublas_handle
;
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
cublasLtHandle_t
ltHandle
;
check_cuda_error
(
cublasLtCreate
(
&
ltHandle
));
//
cublasLtHandle_t ltHandle;
//
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t
AType
;
cudaDataType_t
BType
;
...
...
@@ -151,16 +151,19 @@ void generate_swin_gemm_config(
CType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO23
;
// endAlgo = (int)CUBLAS_GEMM_ALGO23;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
else
if
(
std
::
is_same
<
T
,
half
>::
value
)
{
data_type
=
HALF_DATATYPE
;
AType
=
CUDA_R_16F
;
BType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
computeType
=
CUDA_R_16F
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
#ifdef ENABLE_BF16
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
...
...
@@ -169,11 +172,14 @@ void generate_swin_gemm_config(
BType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
#endif
using
scaleT
=
typename
ScaleTypeConverter
<
T
,
false
>::
Type
;
// using scaleT = typename ScaleTypeConverter<T, false>::Type;
using
scaleT
=
typename
ScaleTypeConverter
<
T
,
true
>::
Type
;
scaleT
alpha
=
(
scaleT
)
1.0
f
;
scaleT
beta
=
(
scaleT
)
0.0
f
;
...
...
@@ -309,30 +315,31 @@ void generate_swin_gemm_config(
const
int
ALGO_COMBINATIONS
=
5000
;
customMatmulPerf_t
perfResults
[
ALGO_COMBINATIONS
];
LtHgemmCustomFind
<
T
,
scaleT
>
(
ltHandle
,
batch_size
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
&
alpha
,
d_B
,
d_A
,
&
beta
,
d_C
,
cublas_workspace
,
workSpaceSize
,
fd
,
perfResults
,
ALGO_COMBINATIONS
);
if
(
perfResults
[
0
].
time
<
exec_time
)
{
printPerfStructure
(
batch_size
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
perfResults
[
0
],
fd
,
data_type
,
0
);
exec_time
=
perfResults
[
0
].
time
;
}
else
{
// LtHgemmCustomFind<T, scaleT>(ltHandle,
// batch_size,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &alpha,
// d_B,
// d_A,
// &beta,
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS);
// if (perfResults[0].time < exec_time) {
// printPerfStructure(
// batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
// exec_time = perfResults[0].time;
// }
// else {
{
fprintf
(
fd
,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
...
...
src/turbomind/utils/gemm_test/swin_igemm_func.cc
View file @
9484fd1c
...
...
@@ -144,23 +144,23 @@ int igemm_config_INT8IO(int m, int n, int k, FILE* fout, void* buffer)
int8_t
*
d_B
=
d_A
+
m
*
k
;
// k * n, stored in column-major
int8_t
*
d_C
=
(
int8_t
*
)(
d_B
+
k
*
n
);
// m * n, stored in column-major
cublasLtHandle_t
ltHandle
;
cublasLtCreate
(
&
ltHandle
);
LtIgemmCustomFind
(
ltHandle
,
m
,
n
,
k
,
&
alpha
,
/* host pointer */
d_A
,
d_B
,
&
beta
,
/* host pointer */
d_C
,
NULL
,
0
,
fout
);
cublasLtDestroy
(
ltHandle
);
//
cublasLtHandle_t ltHandle;
//
cublasLtCreate(&ltHandle);
//
LtIgemmCustomFind(ltHandle,
//
m,
//
n,
//
k,
//
&alpha, /* host pointer */
//
d_A,
//
d_B,
//
&beta, /* host pointer */
//
d_C,
//
NULL,
//
0,
//
fout);
//
cublasLtDestroy(ltHandle);
return
0
;
}
...
...
src/turbomind/utils/gemm_test/t5_gemm_func.cc
View file @
9484fd1c
...
...
@@ -195,8 +195,8 @@ void generate_t5_gemm_config(int batch_size,
cublasHandle_t
cublas_handle
;
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
cublasLtHandle_t
ltHandle
;
check_cuda_error
(
cublasLtCreate
(
&
ltHandle
));
//
cublasLtHandle_t ltHandle;
//
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t
AType
;
cudaDataType_t
BType
;
...
...
@@ -213,16 +213,19 @@ void generate_t5_gemm_config(int batch_size,
CType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO23
;
// endAlgo = (int)CUBLAS_GEMM_ALGO23;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
else
if
(
std
::
is_same
<
T
,
half
>::
value
)
{
data_type
=
HALF_DATATYPE
;
AType
=
CUDA_R_16F
;
BType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
computeType
=
CUDA_R_16F
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
#ifdef ENABLE_BF16
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
...
...
@@ -231,8 +234,10 @@ void generate_t5_gemm_config(int batch_size,
BType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
#endif
float
f_alpha
=
(
float
)
1.0
f
;
...
...
@@ -442,60 +447,61 @@ void generate_t5_gemm_config(int batch_size,
scaleT
alpha_scale
=
(
scaleT
)
1.0
f
;
scaleT
beta_scale
=
(
scaleT
)
0.0
f
;
LtHgemmCustomFind
<
T
,
scaleT
>
(
ltHandle
,
m
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
&
(
alpha_scale
),
d_B
,
d_A
,
&
(
beta_scale
),
d_C
,
cublas_workspace
,
workSpaceSize
,
fd
,
perfResults
,
ALGO_COMBINATIONS
);
}
else
{
LtHgemmCustomFind
<
T
,
float
>
(
ltHandle
,
m
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
&
(
f_alpha
),
d_B
,
d_A
,
&
(
f_beta
),
d_C
,
cublas_workspace
,
workSpaceSize
,
fd
,
perfResults
,
ALGO_COMBINATIONS
);
}
if
(
perfResults
[
0
].
time
<
exec_time
)
{
printPerfStructure
(
batch_size
*
(
i
<=
5
||
i
==
1
?
1
:
beam_width
),
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
perfResults
[
0
],
fd
,
data_type
,
0
);
// LtHgemmCustomFind<T, scaleT>(ltHandle,
// m,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &(alpha_scale),
// d_B,
// d_A,
// &(beta_scale),
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS);
}
else
{
// LtHgemmCustomFind<T, float>(ltHandle,
// m,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &(f_alpha),
// d_B,
// d_A,
// &(f_beta),
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS);
}
// if (perfResults[0].time < exec_time) {
// printPerfStructure(batch_size * (i <= 5 || i == 1 ? 1 : beam_width),
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// perfResults[0],
// fd,
// data_type,
// 0);
// }
// else {
{
fprintf
(
fd
,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
...
...
src/turbomind/utils/gemm_test/xlnet_gemm_func.cc
View file @
9484fd1c
...
...
@@ -218,8 +218,8 @@ void generate_xlnet_gemm_config(int batch_size,
cublasHandle_t
cublas_handle
;
check_cuda_error
(
cublasCreate
(
&
cublas_handle
));
cublasLtHandle_t
ltHandle
;
check_cuda_error
(
cublasLtCreate
(
&
ltHandle
));
//
cublasLtHandle_t ltHandle;
//
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t
AType
;
cudaDataType_t
BType
;
...
...
@@ -236,16 +236,19 @@ void generate_xlnet_gemm_config(int batch_size,
CType
=
CUDA_R_32F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO23
;
// endAlgo = (int)CUBLAS_GEMM_ALGO23;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
else
if
(
std
::
is_same
<
T
,
half
>::
value
)
{
data_type
=
HALF_DATATYPE
;
AType
=
CUDA_R_16F
;
BType
=
CUDA_R_16F
;
CType
=
CUDA_R_16F
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
computeType
=
CUDA_R_16F
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
#ifdef ENABLE_BF16
else
if
(
std
::
is_same
<
T
,
__nv_bfloat16
>::
value
)
{
...
...
@@ -254,12 +257,15 @@ void generate_xlnet_gemm_config(int batch_size,
BType
=
CUDA_R_16BF
;
CType
=
CUDA_R_16BF
;
computeType
=
CUDA_R_32F
;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT_TENSOR_OP
;
endAlgo
=
(
int
)
CUBLAS_GEMM_ALGO15_TENSOR_OP
;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
endAlgo
=
(
int
)
CUBLAS_GEMM_DEFAULT
;
}
#endif
using
scaleT
=
typename
ScaleTypeConverter
<
T
,
false
>::
Type
;
// using scaleT = typename ScaleTypeConverter<T, false>::Type;
using
scaleT
=
typename
ScaleTypeConverter
<
T
,
true
>::
Type
;
scaleT
alpha
=
(
scaleT
)
1.0
f
;
scaleT
beta
=
(
scaleT
)
0.0
f
;
...
...
@@ -358,30 +364,31 @@ void generate_xlnet_gemm_config(int batch_size,
const
int
ALGO_COMBINATIONS
=
5000
;
customMatmulPerf_t
perfResults
[
ALGO_COMBINATIONS
];
LtHgemmCustomFind
<
T
,
scaleT
>
(
ltHandle
,
batch_size
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
&
alpha
,
d_B
,
d_A
,
&
beta
,
d_C
,
cublas_workspace
,
workSpaceSize
,
fd
,
perfResults
,
ALGO_COMBINATIONS
);
if
(
perfResults
[
0
].
time
<
exec_time
)
{
printPerfStructure
(
batch_size
,
seq_len
,
head_num
,
size_per_head
,
n
,
m
,
k
,
perfResults
[
0
],
fd
,
data_type
,
0
);
exec_time
=
perfResults
[
0
].
time
;
}
else
{
// LtHgemmCustomFind<T, scaleT>(ltHandle,
// batch_size,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &alpha,
// d_B,
// d_A,
// &beta,
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS);
// if (perfResults[0].time < exec_time) {
// printPerfStructure(
// batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
// exec_time = perfResults[0].time;
// }
// else {
{
fprintf
(
fd
,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
...
...
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment