Commit 9484fd1c authored by xiabo's avatar xiabo
Browse files

Adapt to 0.1.0

parent 477f2db8
......@@ -185,124 +185,126 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
cublasLtMatmulAlgo_info info = cublas_algo_map_->getAlgo(batch_count, m, n, k, getCublasDataType(Atype_));
if (findAlgo) {
if (info.stages != -1) {
using_cublasLt = true;
}
else {
using_cublasLt = false;
}
}
if (using_cublasLt) {
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
cudaDataType_t scaleType;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType;
#else
cudaDataType_t computeType;
#endif
if (is_fp16_computeType) {
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_16F;
#else
computeType = CUDA_R_16F;
#endif
scaleType = CUDA_R_16F;
}
else {
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_32F;
#else
computeType = CUDA_R_32F;
#endif
scaleType = CUDA_R_32F;
}
// --------------------------------------
// Create descriptors for the original matrices
cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
cublasLtMatmulDescCreate(&operationDesc, computeType);
#endif
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t));
cublasLtMatmulAlgo_t algo;
void* workSpace = cublas_workspace_;
int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
if (findAlgo) {
if (info.workspaceSize > workspaceSize) {
findAlgo = 0;
}
else {
cublasLtMatmulAlgoInit(
cublaslt_handle_, computeType, scaleType, Atype_, Btype_, Ctype_, Ctype_, info.algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
&(info.reductionScheme),
sizeof(info.reductionScheme));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &(info.inner_shapeId), sizeof(info.inner_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID,
&(info.cluster_shapeId),
sizeof(info.cluster_shapeId));
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &(info.mma_shapeId), sizeof(info.mma_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &(info.cga_shapeId), sizeof(info.cga_shapeId));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &(info.sche_mode), sizeof(info.sche_mode));
#endif
using_cublasLt = false;
}
}
cublasLtMatmul(cublaslt_handle_,
operationDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
C,
Cdesc,
(findAlgo == 1 ? (&algo) : NULL),
workSpace,
workspaceSize,
stream_);
cublasLtMatmulDescDestroy(operationDesc);
cublasLtMatrixLayoutDestroy(Adesc);
cublasLtMatrixLayoutDestroy(Bdesc);
cublasLtMatrixLayoutDestroy(Cdesc);
sync_check_cuda_error();
}
else {
// if (using_cublasLt) {
// if (0) {
// cublasLtMatmulDesc_t operationDesc = NULL;
// cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
// cudaDataType_t scaleType;
// #if (CUDART_VERSION >= 11000)
// cublasComputeType_t computeType;
// #else
// cudaDataType_t computeType;
// #endif
// if (is_fp16_computeType) {
// #if (CUDART_VERSION >= 11000)
// computeType = CUBLAS_COMPUTE_16F;
// #else
// computeType = CUDA_R_16F;
// #endif
// scaleType = CUDA_R_16F;
// }
// else {
// #if (CUDART_VERSION >= 11000)
// computeType = CUBLAS_COMPUTE_32F;
// #else
// computeType = CUDA_R_32F;
// #endif
// scaleType = CUDA_R_32F;
// }
// // --------------------------------------
// // Create descriptors for the original matrices
// cublasLtMatrixLayoutCreate(&Adesc, Atype_, transa == CUBLAS_OP_N ? m : k, transa == CUBLAS_OP_N ? k : m, lda);
// cublasLtMatrixLayoutCreate(&Bdesc, Btype_, transb == CUBLAS_OP_N ? k : n, transb == CUBLAS_OP_N ? n : k, ldb);
// cublasLtMatrixLayoutCreate(&Cdesc, Ctype_, m, n, ldc);
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
// #else
// cublasLtMatmulDescCreate(&operationDesc, computeType);
// #endif
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t));
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t));
// cublasLtMatmulAlgo_t algo;
// void* workSpace = cublas_workspace_;
// int workspaceSize = cublas_workspace_ == NULL ? 0 : CUBLAS_WORKSPACE_SIZE;
// if (findAlgo) {
// if (info.workspaceSize > workspaceSize) {
// findAlgo = 0;
// }
// else {
// cublasLtMatmulAlgoInit(
// cublaslt_handle_, computeType, scaleType, Atype_, Btype_, Ctype_, Ctype_, info.algoId, &algo);
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
// cublasLtMatmulAlgoConfigSetAttribute(&algo,
// CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
// &(info.reductionScheme),
// sizeof(info.reductionScheme));
// // #if (CUDART_VERSION >= 11000)
// // cublasLtMatmulAlgoConfigSetAttribute(
// // &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
// // #endif
// #if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &(info.inner_shapeId), sizeof(info.inner_shapeId));
// cublasLtMatmulAlgoConfigSetAttribute(&algo,
// CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID,
// &(info.cluster_shapeId),
// sizeof(info.cluster_shapeId));
// #elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &(info.mma_shapeId), sizeof(info.mma_shapeId));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &(info.cga_shapeId), sizeof(info.cga_shapeId));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &(info.sche_mode), sizeof(info.sche_mode));
// #endif
// }
// }
// // cublasLtMatmul(cublaslt_handle_,
// // operationDesc,
// // alpha,
// // A,
// // Adesc,
// // B,
// // Bdesc,
// // beta,
// // C,
// // Cdesc,
// // C,
// // Cdesc,
// // (findAlgo == 1 ? (&algo) : NULL),
// // workSpace,
// // workspaceSize,
// // stream_);
// cublasLtMatmulDescDestroy(operationDesc);
// cublasLtMatrixLayoutDestroy(Adesc);
// cublasLtMatrixLayoutDestroy(Bdesc);
// cublasLtMatrixLayoutDestroy(Cdesc);
// sync_check_cuda_error();
// }
// else {
int cublasAlgo = info.algoId;
check_cuda_error(cublasGemmEx(cublas_handle_,
transa,
......@@ -324,7 +326,7 @@ void cublasMMWrapper::Gemm(cublasOperation_t transa,
computeType_,
static_cast<cublasGemmAlgo_t>(cublasAlgo)));
sync_check_cuda_error();
}
// }
mu_->unlock();
}
......@@ -341,7 +343,7 @@ void cublasMMWrapper::setFP16GemmConfig()
Atype_ = CUDA_R_16F;
Btype_ = CUDA_R_16F;
Ctype_ = CUDA_R_16F;
computeType_ = CUDA_R_32F;
computeType_ = CUDA_R_16F;
}
#ifdef ENABLE_BF16
......@@ -381,81 +383,81 @@ CublasDataType cublasMMWrapper::getCublasDataType(cudaDataType_t data_type)
return FLOAT_DATATYPE;
}
#if (CUDART_VERSION >= 11000)
// input, weight, output are row-major
// only works for cublas 11.x
//
// Fused GEMM + bias: C = op(A) * op(B) + bias, using the cublasLt
// CUBLASLT_EPILOGUE_BIAS epilogue so the bias add needs no extra kernel.
// Compute/scale types are derived from the wrapper's configured Atype_:
//   FP32 / BF16 inputs -> TF32 tensor-core compute, float scalars
//   FP16 inputs        -> FP16 compute, half scalars
// Fix vs. previous version: every status-returning cublasLt call is now
// wrapped in check_cuda_error (previously only the final matmul was
// checked), so descriptor/attribute setup failures are no longer silent.
void cublasMMWrapper::Gemm(cublasOperation_t transa,
                           cublasOperation_t transb,
                           const int         m,
                           const int         n,
                           const int         k,
                           const void*       A,
                           const int         lda,
                           const void*       B,
                           const int         ldb,
                           const void*       bias,
                           void*             C,
                           const int         ldc)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    cudaDataType_t      Atype, Btype, Ctype;
    cublasComputeType_t computeType;
    cudaDataType_t      scaleType;

    // Host-side alpha/beta; their C type must match scaleType exactly.
    float alpha_float = 1.0f;
    float beta_float  = 0.0f;
    half  alpha_half  = half(1.0f);
    half  beta_half   = half(0.0f);
    void *alpha, *beta;

    if (Atype_ == CUDA_R_32F) {
        computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
        Atype       = CUDA_R_32F;
        Btype       = CUDA_R_32F;
        Ctype       = CUDA_R_32F;
        scaleType   = CUDA_R_32F;
        alpha       = &alpha_float;
        beta        = &beta_float;
    }
    else if (Atype_ == CUDA_R_16BF) {
        computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
        Atype       = CUDA_R_16BF;
        Btype       = CUDA_R_16BF;
        Ctype       = CUDA_R_16BF;
        scaleType   = CUDA_R_32F;
        alpha       = &alpha_float;
        beta        = &beta_float;
    }
    else {
        computeType = CUBLAS_COMPUTE_16F;
        Atype       = CUDA_R_16F;
        Btype       = CUDA_R_16F;
        Ctype       = CUDA_R_16F;
        scaleType   = CUDA_R_16F;
        alpha       = &alpha_half;
        beta        = &beta_half;
    }

    cublasLtMatmulDesc_t   operationDesc = NULL;
    cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
    cublasLtEpilogue_t     epi = CUBLASLT_EPILOGUE_BIAS;

    // Create layout descriptors; dims swap when the operand is transposed.
    check_cuda_error(cublasLtMatrixLayoutCreate(
        &Adesc, Atype, (transa == CUBLAS_OP_N) ? m : k, (transa == CUBLAS_OP_N) ? k : m, lda));
    check_cuda_error(cublasLtMatrixLayoutCreate(
        &Bdesc, Btype, (transb == CUBLAS_OP_N) ? k : n, (transb == CUBLAS_OP_N) ? n : k, ldb));
    check_cuda_error(cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldc));

    check_cuda_error(cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType));
    check_cuda_error(cublasLtMatmulDescSetAttribute(
        operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t)));
    check_cuda_error(cublasLtMatmulDescSetAttribute(
        operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t)));
    check_cuda_error(cublasLtMatmulDescSetAttribute(
        operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(cublasLtEpilogue_t)));
    check_cuda_error(cublasLtMatmulDescSetAttribute(
        operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(const void*)));

    // NULL algo lets cublasLt pick heuristically; no extra workspace is used.
    check_cuda_error(cublasLtMatmul(
        cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, Cdesc, NULL, NULL, 0, stream_));

    cublasLtMatrixLayoutDestroy(Adesc);
    cublasLtMatrixLayoutDestroy(Bdesc);
    cublasLtMatrixLayoutDestroy(Cdesc);
    cublasLtMatmulDescDestroy(operationDesc);
}
#endif
// #if (CUDART_VERSION >= 11000)
// // input, weight, output are row-major
// // only works for cublas 11.x
// void cublasMMWrapper::Gemm(cublasOperation_t transa,
// cublasOperation_t transb,
// const int m,
// const int n,
// const int k,
// const void* A,
// const int lda,
// const void* B,
// const int ldb,
// const void* bias,
// void* C,
// const int ldc)
// {
// TM_LOG_DEBUG(__PRETTY_FUNCTION__);
// cudaDataType_t Atype, Btype, Ctype;
// cublasComputeType_t computeType;
// cudaDataType_t scaleType;
// float alpha_float = 1.0f;
// float beta_float = 0.0f;
// half alpha_half = half(1.0f);
// half beta_half = half(0.0f);
// void * alpha, *beta;
// // int is_fp16_computeType = computeType_ == CUDA_R_16F ? 1 : 0;
// if (Atype_ == CUDA_R_32F) {
// computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
// Atype = CUDA_R_32F;
// Btype = CUDA_R_32F;
// Ctype = CUDA_R_32F;
// scaleType = CUDA_R_32F;
// alpha = &alpha_float;
// beta = &beta_float;
// }
// else if (Atype_ == CUDA_R_16BF) {
// computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
// Atype = CUDA_R_16BF;
// Btype = CUDA_R_16BF;
// Ctype = CUDA_R_16BF;
// scaleType = CUDA_R_32F;
// alpha = &alpha_float;
// beta = &beta_float;
// }
// else {
// computeType = CUBLAS_COMPUTE_16F;
// Atype = CUDA_R_16F;
// Btype = CUDA_R_16F;
// Ctype = CUDA_R_16F;
// scaleType = CUDA_R_16F;
// alpha = &alpha_half;
// beta = &beta_half;
// }
// cublasLtMatmulDesc_t operationDesc = NULL;
// cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
// cublasLtEpilogue_t epi = CUBLASLT_EPILOGUE_BIAS;
// cublasLtMatrixLayoutCreate(&Adesc, Atype, (transa == CUBLAS_OP_N) ? m : k, (transa == CUBLAS_OP_N) ? k : m, lda);
// cublasLtMatrixLayoutCreate(&Bdesc, Btype, (transb == CUBLAS_OP_N) ? k : n, (transb == CUBLAS_OP_N) ? n : k, ldb);
// cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldc);
// cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &transa, sizeof(cublasOperation_t));
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &transb, sizeof(cublasOperation_t));
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epi, sizeof(cublasLtEpilogue_t));
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(const void*));
// // check_cuda_error(cublasLtMatmul(
// // cublaslt_handle_, operationDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, C, Cdesc, NULL, NULL, 0, stream_));
// cublasLtMatrixLayoutDestroy(Adesc);
// cublasLtMatrixLayoutDestroy(Bdesc);
// cublasLtMatrixLayoutDestroy(Cdesc);
// cublasLtMatmulDescDestroy(operationDesc);
// }
// #endif
void cublasMMWrapper::setStream(cudaStream_t stream)
{
stream_ = stream;
......@@ -985,7 +987,8 @@ void cublasMMWrapper::_Int8Gemm(const int m,
* - 0: int8 * int8 -> int32 -> int8
* - 1: int8 * int8 -> int32 -> int32
*/
#if (CUBLAS_VERSION) <= 11601
// #if (CUBLAS_VERSION) <= 11601
#if 1
FT_CHECK_WITH_INFO(false, "CUBLAS version too low.");
#else
......
......@@ -207,20 +207,20 @@ public:
CublasDataType getCublasDataType(cudaDataType_t data_type);
#if (CUDART_VERSION >= 11000)
void Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
const void* bias,
void* C,
const int ldc);
#endif
// #if (CUDART_VERSION >= 11000)
// void Gemm(cublasOperation_t transa,
// cublasOperation_t transb,
// const int m,
// const int n,
// const int k,
// const void* A,
// const int lda,
// const void* B,
// const int ldb,
// const void* bias,
// void* C,
// const int ldc);
// #endif
void stridedBatchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
......
......@@ -322,7 +322,7 @@ __device__ inline int8_t cuda_cast<int8_t, half>(half val)
int16_t int16_in;
};
fp16 = val;
asm volatile("cvt.rni.sat.s8.f16 %0, %1;" : "=h"(int16) : "h"(int16_in));
// asm volatile("cvt.rni.sat.s8.f16 %0, %1;" : "=h"(int16) : "h"(int16_in));
return int8[0];
}
......@@ -333,20 +333,31 @@ __device__ inline int16_t cuda_cast<int16_t, half2>(half2 val)
int8_t int8[2];
int16_t int16;
};
int8[0] = cuda_cast<int8_t>(val.x);
int8[1] = cuda_cast<int8_t>(val.y);
// int8[0] = cuda_cast<int8_t>(val.x);
// int8[1] = cuda_cast<int8_t>(val.y);
int8[0] = cuda_cast<int8_t>((val.data[0]));
int8[1] = cuda_cast<int8_t>((val.data[1]));
return int16;
}
template<>
__device__ inline int8_t cuda_cast<int8_t, float>(float val)
{
    // Portable replacement for the removed PTX path
    //     asm("cvt.rni.sat.s8.f32 %0, %1;")
    // which rounds to nearest (ties to even) and saturates to [-128, 127].
    //
    // The previous branchy rewrite truncated toward zero
    // (static_cast<int8_t>(2.7f) == 2, whereas the PTX yields 3), which
    // silently changes int8 quantization results. Round first, then clamp,
    // to keep the original instruction's semantics.
    float r = rintf(val);                        // round to nearest even
    r       = fminf(fmaxf(r, -128.0f), 127.0f);  // saturate to int8 range
    return static_cast<int8_t>(r);
}
template<>
......@@ -528,7 +539,8 @@ __device__ inline To cuda_max(Ti val)
template<>
__device__ inline half cuda_max(half2 val)
{
    // Horizontal max over the two lanes of a half2. Lanes are accessed via
    // .data[] (this platform's half2 layout) instead of the CUDA .x/.y
    // members; the result is unchanged: the larger of the two halves.
    const half lane0 = val.data[0];
    const half lane1 = val.data[1];
    return (lane0 > lane1) ? lane0 : lane1;
}
#ifdef ENABLE_BF16
template<>
......
......@@ -152,17 +152,17 @@ void initCustomAllReduceComm(std::vector<std::shared_ptr<AbstractCustomComm>>* c
return;
}
#if defined(CUDART_VERSION) && CUDART_VERSION >= 11020
for (size_t i = 0; i < rank_size; i++) {
custom_all_reduce_comms->push_back(std::make_shared<CustomAllReduceComm<T>>(rank_size, i));
}
custom_all_reduce_comms->at(0)->allocateAndExchangePeerAccessPointer(custom_all_reduce_comms);
#else
// #if defined(CUDART_VERSION) && CUDART_VERSION >= 11020
// for (size_t i = 0; i < rank_size; i++) {
// custom_all_reduce_comms->push_back(std::make_shared<CustomAllReduceComm<T>>(rank_size, i));
// }
// custom_all_reduce_comms->at(0)->allocateAndExchangePeerAccessPointer(custom_all_reduce_comms);
// #else
TM_LOG_WARNING("Custom All Reduce is not supported before CUDA 11.2. Using NCCL as Comm.");
for (size_t i = 0; i < rank_size; i++) {
custom_all_reduce_comms->push_back(nullptr);
}
#endif
// #endif
}
// Template instantiation
......
......@@ -26,7 +26,7 @@ Gemm::Gemm(IAllocator* allocator, cudaStream_t stream, std::string config_file)
stream_ = stream;
mutex_ = new std::mutex(); // mutex per process
check_cuda_error(cublasCreate(&cublas_handle_));
check_cuda_error(cublasLtCreate(&cublaslt_handle_));
// check_cuda_error(cublasLtCreate(&cublaslt_handle_));
check_cuda_error(cublasSetStream(cublas_handle_, stream));
if (allocator_ != nullptr) {
......@@ -41,7 +41,7 @@ Gemm::~Gemm()
allocator_->free((void**)(&workspace_));
allocator_ = nullptr;
}
cublasLtDestroy(cublaslt_handle_);
// cublasLtDestroy(cublaslt_handle_);
cublasDestroy(cublas_handle_);
delete cublas_algo_map_;
delete mutex_;
......@@ -248,7 +248,8 @@ void Gemm::gemm(const GemmOp transa,
mutex_->lock();
// Use cublas as default in FP32 and cublasLt as default in FP16
bool is_fp16_compute_type = compute_type_ == TYPE_FP16;
bool using_cublasLt = Atype == TYPE_FP16;
// bool using_cublasLt = Atype == TYPE_FP16;
bool using_cublasLt = (Atype == TYPE_FP16) ? false : false;
int batch_count = 1;
half h_alpha = (half)alpha;
......@@ -267,82 +268,83 @@ void Gemm::gemm(const GemmOp transa,
using_cublasLt = (info.stages != -1);
}
if (using_cublasLt) {
const size_t a_rows = (a_op == getCublasOperation(GEMM_OP_N)) ? _m : k;
const size_t a_cols = (a_op == getCublasOperation(GEMM_OP_N)) ? k : _m;
const size_t b_rows = (b_op == getCublasOperation(GEMM_OP_N)) ? k : _n;
const size_t b_cols = (b_op == getCublasOperation(GEMM_OP_N)) ? _n : k;
cublasLtMatmulDesc_t matmul_desc = NULL;
cublasLtMatrixLayout_t a_desc = NULL, b_desc = NULL, c_desc = NULL;
cudaDataType_t scale_type = getCublasDataType(compute_type_);
auto compute_type = getCublasComputeType(compute_type_);
// --------------------------------------
// Create descriptors for the original matrices
cublasLtMatrixLayoutCreate(&a_desc, a_type, a_rows, a_cols, _lda);
cublasLtMatrixLayoutCreate(&b_desc, b_type, b_rows, b_cols, _ldb);
cublasLtMatrixLayoutCreate(&c_desc, c_type, _m, _n, ldc);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_type);
#else
cublasLtMatmulDescCreate(&matmul_desc, compute_type);
#endif
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &a_op, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &b_op, sizeof(cublasOperation_t));
cublasLtMatmulAlgo_t algo;
void* workspace = workspace_;
int workspace_size = workspace_ == nullptr ? 0 : CUBLAS_WORKSPACE_SIZE;
if (findAlgo) {
if (info.workspaceSize > workspace_size) {
findAlgo = 0;
}
else {
cublasLtMatmulAlgoInit(
cublaslt_handle_, compute_type, scale_type, a_type, b_type, c_type, c_type, info.algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
}
}
// if (using_cublasLt) {
// if(0) {
// const size_t a_rows = (a_op == getCublasOperation(GEMM_OP_N)) ? _m : k;
// const size_t a_cols = (a_op == getCublasOperation(GEMM_OP_N)) ? k : _m;
// const size_t b_rows = (b_op == getCublasOperation(GEMM_OP_N)) ? k : _n;
// const size_t b_cols = (b_op == getCublasOperation(GEMM_OP_N)) ? _n : k;
// cublasLtMatmulDesc_t matmul_desc = NULL;
// cublasLtMatrixLayout_t a_desc = NULL, b_desc = NULL, c_desc = NULL;
// cudaDataType_t scale_type = getCublasDataType(compute_type_);
// auto compute_type = getCublasComputeType(compute_type_);
// // --------------------------------------
// // Create descriptors for the original matrices
// cublasLtMatrixLayoutCreate(&a_desc, a_type, a_rows, a_cols, _lda);
// cublasLtMatrixLayoutCreate(&b_desc, b_type, b_rows, b_cols, _ldb);
// cublasLtMatrixLayoutCreate(&c_desc, c_type, _m, _n, ldc);
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_type);
// #else
// cublasLtMatmulDescCreate(&matmul_desc, compute_type);
// #endif
// cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &a_op, sizeof(cublasOperation_t));
// cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &b_op, sizeof(cublasOperation_t));
// cublasLtMatmulAlgo_t algo;
// void* workspace = workspace_;
// int workspace_size = workspace_ == nullptr ? 0 : CUBLAS_WORKSPACE_SIZE;
// if (findAlgo) {
// if (info.workspaceSize > workspace_size) {
// findAlgo = 0;
// }
// else {
// cublasLtMatmulAlgoInit(
// cublaslt_handle_, compute_type, scale_type, a_type, b_type, c_type, c_type, info.algoId, &algo);
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(int));
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
// #endif
// }
// }
cublasLtMatmul(cublaslt_handle_,
matmul_desc,
alpha_ptr,
a_data_ptr,
a_desc,
b_data_ptr,
b_desc,
beta_ptr,
C,
c_desc,
C,
c_desc,
(findAlgo == 1 ? (&algo) : NULL),
workspace,
workspace_size,
stream_);
cublasLtMatmulDescDestroy(matmul_desc);
cublasLtMatrixLayoutDestroy(a_desc);
cublasLtMatrixLayoutDestroy(b_desc);
cublasLtMatrixLayoutDestroy(c_desc);
sync_check_cuda_error();
}
else {
// cublasLtMatmul(cublaslt_handle_,
// matmul_desc,
// alpha_ptr,
// a_data_ptr,
// a_desc,
// b_data_ptr,
// b_desc,
// beta_ptr,
// C,
// c_desc,
// C,
// c_desc,
// (findAlgo == 1 ? (&algo) : NULL),
// workspace,
// workspace_size,
// stream_);
// cublasLtMatmulDescDestroy(matmul_desc);
// cublasLtMatrixLayoutDestroy(a_desc);
// cublasLtMatrixLayoutDestroy(b_desc);
// cublasLtMatrixLayoutDestroy(c_desc);
// sync_check_cuda_error();
// }
// else {
cudaDataType_t compute_type = getCublasDataType(compute_type_);
int cublas_algo = info.algoId;
check_cuda_error(cublasGemmEx(cublas_handle_,
......@@ -365,7 +367,7 @@ void Gemm::gemm(const GemmOp transa,
compute_type,
static_cast<cublasGemmAlgo_t>(cublas_algo)));
sync_check_cuda_error();
}
// }
mutex_->unlock();
}
......@@ -1033,19 +1035,19 @@ cudaDataType_t getCublasDataType(DataType dtype)
}
}
#if (CUDART_VERSION >= 11000)
cublasComputeType_t getCublasComputeType(DataType ctype)
{
switch (ctype) {
case TYPE_FP16:
return CUBLAS_COMPUTE_16F;
case TYPE_FP32:
return CUBLAS_COMPUTE_32F;
default:
throw GemmNotSupportedException("Not supported cublas compute type.");
}
}
#else
// #if (CUDART_VERSION >= 11000)
// cublasComputeType_t getCublasComputeType(DataType ctype)
// {
// switch (ctype) {
// case TYPE_FP16:
// return CUBLAS_COMPUTE_16F;
// case TYPE_FP32:
// return CUBLAS_COMPUTE_32F;
// default:
// throw GemmNotSupportedException("Not supported cublas compute type.");
// }
// }
// #else
cudaDataType_t getCublasComputeType(DataType ctype)
{
switch (ctype) {
......@@ -1057,7 +1059,7 @@ cudaDataType_t getCublasComputeType(DataType ctype)
throw GemmNotSupportedException("Not supported cublas compute type.");
}
}
#endif
// #endif
cublasOperation_t getCublasOperation(GemmOp op)
{
......
......@@ -622,11 +622,11 @@ std::shared_ptr<Gemm>
createGemm(IAllocator* allocator, cudaStream_t stream, bool sparse = false, bool quantized = false);
cudaDataType_t getCublasDataType(DataType dtype);
#if (CUDART_VERSION >= 11000)
cublasComputeType_t getCublasComputeType(DataType dtype);
#else
// #if (CUDART_VERSION >= 11000)
// cublasComputeType_t getCublasComputeType(DataType dtype);
// #else
cudaDataType_t getCublasComputeType(DataType dtype);
#endif
// #endif
cublasOperation_t getCublasOperation(GemmOp op);
std::string getGemmOpString(const GemmOp& op);
......
......@@ -13,7 +13,8 @@
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
find_package(CUDAToolkit REQUIRED)
#find_package(CUDAToolkit REQUIRED)
find_package(CUDA REQUIRED)
set(gemm_func_files
gemm_func.cc
......@@ -51,59 +52,71 @@ set(swin_gemm_func_files
swin_gemm_func.cc
)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -fPIC")
add_library(gemm_func STATIC ${gemm_func_files})
target_link_libraries(gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger)
set_property(TARGET gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#target_link_libraries(gemm_func PUBLIC cublas cublasLt cudart cuda_utils logger)
target_link_libraries(gemm_func PUBLIC cublas cudart cuda_utils logger)
#set_property(TARGET gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(encoder_gemm_func STATIC ${encoder_gemm_func_files})
target_link_libraries(encoder_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
#target_link_libraries(encoder_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
target_link_libraries(encoder_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(encoder_gemm_func PUBLIC CUDA::cusparse -lcusparseLt)
target_link_libraries(encoder_gemm_func PUBLIC cusparse -lcusparseLt)
endif()
set_property(TARGET encoder_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET encoder_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET encoder_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET encoder_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(encoder_igemm_func STATIC ${encoder_igemm_func_files})
target_link_libraries(encoder_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart cuda_utils logger)
#target_link_libraries(encoder_igemm_func PUBLIC cublas cublasLt cudart cuda_utils logger)
target_link_libraries(encoder_igemm_func PUBLIC cublas cudart cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(encoder_igemm_func PUBLIC CUDA::cusparse -lcusparseLt)
target_link_libraries(encoder_igemm_func PUBLIC cusparse -lcusparseLt)
endif()
set_property(TARGET encoder_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET encoder_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET encoder_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET encoder_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(decoding_gemm_func STATIC ${decoding_gemm_func_files})
target_link_libraries(decoding_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
set_property(TARGET decoding_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoding_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#target_link_libraries(decoding_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
target_link_libraries(decoding_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger)
#set_property(TARGET decoding_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET decoding_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(gpt_gemm_func STATIC ${gpt_gemm_func_files})
target_link_libraries(gpt_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
#target_link_libraries(gpt_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
target_link_libraries(gpt_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(gpt_gemm_func PUBLIC CUDA::cusparse -lcusparseLt)
target_link_libraries(gpt_gemm_func PUBLIC cusparse -lcusparseLt)
endif()
set_property(TARGET gpt_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gpt_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET gpt_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET gpt_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
# Static helper library that searches GEMM configs for the XLNet model.
add_library(xlnet_gemm_func STATIC ${xlnet_gemm_func_files})
target_link_libraries(xlnet_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
set_property(TARGET xlnet_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET xlnet_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#target_link_libraries(xlnet_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
# NOTE(review): the bare library names below duplicate the imported CUDA:: targets
# linked above (minus cublasLt) — confirm whether both link forms are really needed.
target_link_libraries(xlnet_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger)
#set_property(TARGET xlnet_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET xlnet_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
# Static helper library that searches GEMM configs for the T5 model.
add_library(t5_gemm_func STATIC ${t5_gemm_func_files})
target_link_libraries(t5_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
#target_link_libraries(t5_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
# NOTE(review): duplicates the CUDA:: link line above with bare names — verify intent.
target_link_libraries(t5_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
# Sparse GEMM support pulls in cuSPARSE and the cusparseLt extension library.
target_link_libraries(t5_gemm_func PUBLIC CUDA::cusparse -lcusparseLt)
target_link_libraries(t5_gemm_func PUBLIC cusparse -lcusparseLt)
endif()
set_property(TARGET t5_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET t5_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#set_property(TARGET t5_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET t5_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
# Static helper library for the INT8 (IGEMM) config search of the Swin model.
add_library(swin_igemm_func STATIC ${swin_igemm_func_files})
target_link_libraries(swin_igemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func encoder_igemm_func cuda_utils logger)
set_property(TARGET swin_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET swin_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#target_link_libraries(swin_igemm_func PUBLIC cublas cublasLt cudart gemm_func encoder_igemm_func cuda_utils logger)
# NOTE(review): duplicates the CUDA:: link line above with bare names — verify intent.
target_link_libraries(swin_igemm_func PUBLIC cublas cudart gemm_func encoder_igemm_func cuda_utils logger)
#set_property(TARGET swin_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET swin_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
# Static helper library for the FP GEMM config search of the Swin model.
add_library(swin_gemm_func STATIC ${swin_gemm_func_files})
target_link_libraries(swin_gemm_func PUBLIC CUDA::cublas CUDA::cublasLt CUDA::cudart gemm_func cuda_utils logger)
set_property(TARGET swin_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET swin_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
#target_link_libraries(swin_gemm_func PUBLIC cublas cublasLt cudart gemm_func cuda_utils logger)
# NOTE(review): duplicates the CUDA:: link line above with bare names — verify intent.
target_link_libraries(swin_gemm_func PUBLIC cublas cudart gemm_func cuda_utils logger)
#set_property(TARGET swin_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
#set_property(TARGET swin_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
......@@ -130,8 +130,8 @@ void generate_decoding_gemm_config(int batch_size,
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
cublasLtHandle_t ltHandle;
check_cuda_error(cublasLtCreate(&ltHandle));
// cublasLtHandle_t ltHandle;
// check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t AType;
cudaDataType_t BType;
......@@ -148,16 +148,19 @@ void generate_decoding_gemm_config(int batch_size,
CType = CUDA_R_32F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_ALGO23;
// endAlgo = (int)CUBLAS_GEMM_ALGO23;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
AType = CUDA_R_16F;
BType = CUDA_R_16F;
CType = CUDA_R_16F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
computeType = CUDA_R_16F;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
......@@ -166,11 +169,14 @@ void generate_decoding_gemm_config(int batch_size,
BType = CUDA_R_16BF;
CType = CUDA_R_16BF;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
#endif
using scaleT = typename ScaleTypeConverter<T>::Type;
// using scaleT = typename ScaleTypeConverter<T>::Type;
using scaleT = typename ScaleTypeConverter<T, true>::Type;
scaleT alpha = (scaleT)1.0f;
scaleT beta = (scaleT)0.0f;
......@@ -241,38 +247,39 @@ void generate_decoding_gemm_config(int batch_size,
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size * beam_width,
seq_len,
head_num,
size_per_head,
n,
m,
k,
&alpha,
d_B,
d_A,
&beta,
d_C,
cublas_workspace,
workSpaceSize,
fd,
perfResults,
ALGO_COMBINATIONS);
if (perfResults[0].time < exec_time) {
printPerfStructure(batch_size * beam_width,
seq_len,
head_num,
size_per_head,
n,
m,
k,
perfResults[0],
fd,
data_type,
0);
}
else {
// LtHgemmCustomFind<T, scaleT>(ltHandle,
// batch_size * beam_width,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &alpha,
// d_B,
// d_A,
// &beta,
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS);
// if (perfResults[0].time < exec_time) {
// printPerfStructure(batch_size * beam_width,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// perfResults[0],
// fd,
// data_type,
// 0);
// }
// else {
{
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
......
......@@ -127,8 +127,8 @@ void generate_encoder_gemm_config(
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
cublasLtHandle_t ltHandle;
check_cuda_error(cublasLtCreate(&ltHandle));
// cublasLtHandle_t ltHandle;
// check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t AType;
cudaDataType_t BType;
......@@ -145,16 +145,19 @@ void generate_encoder_gemm_config(
CType = CUDA_R_32F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_ALGO23;
// endAlgo = (int)CUBLAS_GEMM_ALGO23;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
AType = CUDA_R_16F;
BType = CUDA_R_16F;
CType = CUDA_R_16F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
computeType = CUDA_R_16F;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
......@@ -163,11 +166,14 @@ void generate_encoder_gemm_config(
BType = CUDA_R_16BF;
CType = CUDA_R_16BF;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
#endif
using scaleT = typename ScaleTypeConverter<T, false>::Type;
// using scaleT = typename ScaleTypeConverter<T, false>::Type;
using scaleT = typename ScaleTypeConverter<T, true>::Type;
scaleT alpha = (scaleT)1.0f;
scaleT beta = (scaleT)0.0f;
......@@ -331,30 +337,31 @@ void generate_encoder_gemm_config(
// Let try a fixed number of combinations
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size,
seq_len,
head_num,
size_per_head,
n,
m,
k,
&alpha,
d_B,
d_A,
&beta,
d_C,
cublas_workspace,
workSpaceSize,
fd,
perfResults,
ALGO_COMBINATIONS);
if (perfResults[0].time < exec_time) {
printPerfStructure(
batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
exec_time = perfResults[0].time;
}
else {
// LtHgemmCustomFind<T, scaleT>(ltHandle,
// batch_size,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &alpha,
// d_B,
// d_A,
// &beta,
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS);
// if (perfResults[0].time < exec_time) {
// printPerfStructure(
// batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
// exec_time = perfResults[0].time;
// }
// else {
{
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
......
......@@ -82,11 +82,11 @@ int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE
matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
#else
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
// #else
stages = 0;
#endif
// #endif
printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d "
"time %f workspace=%d mathMode=%d waves=%f\n",
......@@ -148,11 +148,11 @@ int printBatchPerfStructure(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
#else
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
// #else
stages = 0;
#endif
// #endif
printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d "
"time %f workspace=%d mathMode=%d waves=%f\n",
......@@ -234,22 +234,22 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
cudaDeviceSynchronize();
auto start = std::chrono::high_resolution_clock::now();
for (int loop = 0; loop < repeats; loop++) {
oneRunStatus = cublasLtMatmul(ltHandle,
operationDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
D,
Ddesc,
&algo,
workSpace,
workSpaceSizeInBytes,
stream);
// oneRunStatus = cublasLtMatmul(ltHandle,
// operationDesc,
// alpha,
// A,
// Adesc,
// B,
// Bdesc,
// beta,
// C,
// Cdesc,
// D,
// Ddesc,
// &algo,
// workSpace,
// workSpaceSizeInBytes,
// stream);
}
cudaDeviceSynchronize();
auto end = std::chrono::high_resolution_clock::now();
......@@ -279,693 +279,693 @@ static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, //
// Sample wrapper running through multiple algo and config attributes combination for INT8 gemm using cublasLt low-level
// API
// Brute-force search over cublasLt INT8 GEMM (IGEMM) algorithm configurations.
//
// For the problem C[m,n] = alpha * A[m,k] * B[n,k]^T + beta * C (A in COL32 order,
// B in COL4_4R2_8C or COL32_2R_4R4 order depending on compute capability), this
// enumerates every algo ID returned by cublasLtMatmulAlgoGetIds and, for each,
// every supported combination of tile, stages (CUDA >= 11), custom option,
// CTA swizzling, split-K value and reduction scheme. Each candidate is timed by
// customMatmulRun (kernelRepeats back-to-back launches) and the results, sorted
// by run time, are printed and written to `fout` via printPerfStructure.
//
// Template parameters: <T, scaleT> must be either <int32_t, int> (INT32 output,
// int32 scaling) or <int8_t, float> (INT8 output, float scaling); any other pair
// aborts the process via exit(-1).
//
// alpha/beta are HOST pointers. A/B/C are device pointers in the transformed
// (COL32-family) layouts. Returns 0 on success, 1 if any cublasLt setup call failed.
template<typename T, typename scaleT>
int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
                      int m,
                      int n,
                      int k,
                      const scaleT* alpha, /* host pointer */
                      const int8_t* A,
                      const int8_t* B,
                      const scaleT* beta, /* host pointer */
                      T* C,
                      void* workSpace,
                      size_t workSpaceSize,
                      FILE* fout)
{
    cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
    cublasLtMatmulDesc_t operationDesc = NULL;
    cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
    cudaStream_t stream = 0;
    // SplitK value that we are going to try when SplitK is supported for a given algo
    const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
    // Let try a fixed number of combinations
    // NOTE(review): function-body #define leaks to the rest of the translation unit
    // and is redefined (identically) by LtBatchIgemmCustomFind below.
#define ALGO_COMBINATIONS 50000
    int AlgoCombinations = ALGO_COMBINATIONS;
    int AlgoCount = 0;
    int kernelRepeats = 100;  // number of time the CUDA kernels will be run back to back
    // NOTE(review): 50000 customMatmulPerf_t entries live on the stack here — this is
    // a large frame; confirm it stays within the host stack limit.
    customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
    int nbAlgoIds = 0;
#define ALGO_IDS 100
    int algoIdA[ALGO_IDS];
    // Map the <T, scaleT> template pair onto cublasLt data/scale types.
    cudaDataType_t Atype, Btype, Ctype, scaleType;
    Atype = CUDA_R_8I;
    Btype = CUDA_R_8I;
    if (std::is_same<T, int32_t>::value && std::is_same<scaleT, int>::value) {
        Ctype = CUDA_R_32I;
        scaleType = CUDA_R_32I;
    }
    else if (std::is_same<T, int8_t>::value && std::is_same<scaleT, float>::value) {
        Ctype = CUDA_R_8I;
        scaleType = CUDA_R_32F;
    }
    else {
        printf("[ERROR]<T,scaleT> of igemm is invalid\n");
        exit(-1);
    }
    // Accumulation is always 32-bit integer; the enum type differs across CUDA versions.
#if (CUDART_VERSION >= 11000)
    cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
    cudaDataType_t computeType = CUDA_R_32I;
#endif
    cublasOperation_t opTranspose = CUBLAS_OP_T;
    // SM80+ uses the COL32_2R_4R4 interleaved order for B; older parts use COL4_4R2_8C.
    bool use_ORDER_COL32_2R_4R4 = false;
#if (CUDART_VERSION >= 11000)
    int device{-1};
    cudaGetDevice(&device);
    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, device);
    if (props.major * 10 + props.minor >= 80) {
        use_ORDER_COL32_2R_4R4 = true;
    }
#endif
    cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
    cublasLtOrder_t order_matrixB;
#if (CUDART_VERSION >= 11000)
    if (use_ORDER_COL32_2R_4R4) {
        order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
    }
    else {
        order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
    }
#else
    order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
    // Leading dimensions required by the COL32-family layouts
    // (B's n is rounded up to the layout's interleave granularity).
    int ldaTransform = 32 * m;
    int ldbTransform;
    if (use_ORDER_COL32_2R_4R4) {
        ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
    }
    else {
        ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
    }
    int ldcTransform = 32 * m;
#if (CUDART_VERSION >= 11000)
    status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
    status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
#endif
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
    // Create matrix descriptors.
    status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, ldaTransform);
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    status = cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, n, k, ldbTransform);
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    status =
        cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB));
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldcTransform);
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    status = cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    // Request AlgoId available for IGEMM
    status = cublasLtMatmulAlgoGetIds(
        ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, ALGO_IDS, algoIdA, &nbAlgoIds);
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    // Loop over the Algo IDs
    for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) {
        cublasLtMatmulAlgo_t algo;
        size_t sizeWritten = 0;
        /* Initialize algo structure with given Algo ID */
        status =
            cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, algoIdA[idx], &algo);
        if (status != CUBLAS_STATUS_SUCCESS) {
            continue;
        }
        // Query the tiles enums supported by that algo
        // (first call with NULL buffer returns the byte count needed).
        cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten);
        int nbTiles = int(sizeWritten / sizeof(int));
        int* tileA = new int[nbTiles == 0 ? 1 : nbTiles];
        if (nbTiles == 0) {
            tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED;
            nbTiles = 1;
        }
#if (CUDART_VERSION >= 11000)
        // Same size-query-then-fetch dance for the supported stages counts (CUDA 11+).
        cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten);
        int nbStages = int(sizeWritten / sizeof(int));
        std::vector<int> stagesA(nbStages == 0 ? 1 : nbStages);
        if (nbStages == 0) {
            stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED;
            nbStages = 1;
        }
        else {
            cublasLtMatmulAlgoCapGetAttribute(
                &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten);
        }
#endif
        int splitkSupport, redMask, swizzlingMax, customOptionMax;
        // Retrieve Algo Capabilities attributes to be able to setup loop over the different combinations
        cublasLtMatmulAlgoCapGetAttribute(
            &algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten);
        cublasLtMatmulAlgoCapGetAttribute(
            &algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten);
        cublasLtMatmulAlgoCapGetAttribute(
            &algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten);
        cublasLtMatmulAlgoCapGetAttribute(
            &algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten);
        cublasLtMatmulAlgoCapGetAttribute(
            &algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten);
        /* Loop over the different tiles */
        for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) {
#if (CUDART_VERSION >= 11000)
            /* Loop over different stages count */
            for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) {
                cublasLtMatmulAlgoConfigSetAttribute(
                    &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx]));
#endif
                /* Loop over the different custom option if any */
                for (int customOption = 0; customOption <= customOptionMax; customOption++) {
                    cublasLtMatmulAlgoConfigSetAttribute(
                        &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption));
                    /* Loop over the CTAs swizzling support */
                    // NOTE(review): this loop variable shadows the function parameter `k`
                    // (the GEMM K dimension) for the remainder of this scope.
                    for (int k = 0; k <= swizzlingMax; k++) {
                        int splitK_trial = 0;
                        if (splitkSupport) {
                            splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]);
                        }
                        // Loop over the splitK value over a fixed sequence splitKSequenceA in addition to the case
                        // where splitK is not enabled
                        for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) {
                            /* Setup attribute of the algo to run */
                            cublasLtMatmulAlgoConfigSetAttribute(
                                &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx]));
                            int splitK_val = 0;
                            int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
                            cublasLtMatmulAlgoConfigSetAttribute(
                                &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val));
                            cublasLtMatmulAlgoConfigSetAttribute(
                                &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k));
                            cublasLtMatmulAlgoConfigSetAttribute(
                                &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int));
                            if (l > 0) {  // Split-K case
                                splitK_val = splitKSequenceA[l - 1];
                                cublasLtMatmulAlgoConfigSetAttribute(&algo,
                                                                     CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
                                                                     &splitKSequenceA[l - 1],
                                                                     sizeof(splitKSequenceA[l - 1]));
                                /* Going over all the reduction scheme */
                                for (redScheme = 1;
                                     redScheme <= (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations);
                                     redScheme = redScheme << 1) {
                                    if (redScheme & redMask) {
                                        cublasLtMatmulAlgoConfigSetAttribute(&algo,
                                                                             CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
                                                                             &redScheme,
                                                                             sizeof(redScheme));
                                        // Time this fully-specified configuration (C doubles as D: in-place output).
                                        status = customMatmulRun(ltHandle,
                                                                 operationDesc,
                                                                 alpha, /* host or device pointer */
                                                                 A,
                                                                 Adesc,
                                                                 B,
                                                                 Bdesc,
                                                                 beta, /* host or device pointer */
                                                                 C,
                                                                 Cdesc,
                                                                 C,
                                                                 Cdesc,
                                                                 algo,
                                                                 kernelRepeats,
                                                                 workSpace,
                                                                 workSpaceSize,
                                                                 perfResults[AlgoCount],
                                                                 stream);
                                        perfResults[AlgoCount].status = status;
                                        if (status == CUBLAS_STATUS_SUCCESS) {
                                            AlgoCount++;
                                        }
                                    }  // end if
                                }  // end for
                            }
                            else {  // Non-splitK case
                                /* if user preference is ok with workspace */
                                if (AlgoCount < AlgoCombinations) {
                                    status = customMatmulRun(ltHandle,
                                                             operationDesc,
                                                             alpha, /* host or device pointer */
                                                             A,
                                                             Adesc,
                                                             B,
                                                             Bdesc,
                                                             beta, /* host or device pointer */
                                                             C,
                                                             Cdesc,
                                                             C,
                                                             Cdesc,
                                                             algo,
                                                             kernelRepeats,
                                                             workSpace,
                                                             workSpaceSize,
                                                             perfResults[AlgoCount],
                                                             stream);
                                    perfResults[AlgoCount].status = status;
                                    if (status == CUBLAS_STATUS_SUCCESS) {
                                        AlgoCount++;
                                    }
                                }
                            }
                        }  // end l
                    }  // end k
                }  // end customOption
#if (CUDART_VERSION >= 11000)
            }  // end stagesIdx
#endif
        }  // end tileIdx
        delete[] tileA;
    }  // end idx
    // Sort the results per run duration
    std::sort(perfResults, perfResults + AlgoCount, time_compare);
    // Print timing and perf details
    for (int i = 0, hasPrint = 0; i < AlgoCount; i++) {
        printf("result %03d : ", i);
        hasPrint = printPerfStructure(m, n, k, perfResults[i], fout, hasPrint);
    }
CLEANUP:
    // Descriptors are no longer needed as all GPU work was already enqueued
    if (Cdesc) {
        cublasLtMatrixLayoutDestroy(Cdesc);
    }
    if (Bdesc) {
        cublasLtMatrixLayoutDestroy(Bdesc);
    }
    if (Adesc) {
        cublasLtMatrixLayoutDestroy(Adesc);
    }
    if (operationDesc) {
        cublasLtMatmulDescDestroy(operationDesc);
    }
    return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
// Explicit instantiation: INT8 inputs, INT32 output with int32 alpha/beta scaling.
template int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
                               int m,
                               int n,
                               int k,
                               const int* alpha, /* host pointer */
                               const int8_t* A,
                               const int8_t* B,
                               const int* beta, /* host pointer */
                               int32_t* C,
                               void* workSpace,
                               size_t workSpaceSize,
                               FILE* fout);
// Explicit instantiation: INT8 inputs, INT8 output with float alpha/beta scaling.
template int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
                               int m,
                               int n,
                               int k,
                               const float* alpha, /* host pointer */
                               const int8_t* A,
                               const int8_t* B,
                               const float* beta, /* host pointer */
                               int8_t* C,
                               void* workSpace,
                               size_t workSpaceSize,
                               FILE* fout);
// Strided-batched variant of LtIgemmCustomFind: brute-force search over cublasLt
// INT8 GEMM algorithm configurations for a batch of `batchCount` problems.
//
// Identical enumeration strategy to LtIgemmCustomFind (algo IDs x tiles x stages x
// custom options x swizzling x split-K x reduction schemes, each timed by
// customMatmulRun), with the A/B/C layouts additionally carrying
// CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT and CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET
// attributes. Sorted results are written to `fout` via printBatchPerfStructure.
//
// Template parameters: <T, scaleT> must be <int32_t, int> or <int8_t, float>;
// any other pair aborts the process via exit(-1).
// alpha/beta are HOST pointers; A/B/C are device pointers in transformed layouts.
// Returns 0 on success, 1 if any cublasLt setup call failed.
template<typename T, typename scaleT>
int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
                           int batchCount,
                           int m,
                           int n,
                           int k,
                           const scaleT* alpha, /* host pointer */
                           const int8_t* A,
                           const int8_t* B,
                           const scaleT* beta, /* host pointer */
                           T* C,
                           void* workSpace,
                           size_t workSpaceSize,
                           FILE* fout)
{
    cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
    cublasLtMatmulDesc_t operationDesc = NULL;
    cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
    cudaStream_t stream = 0;
    // SplitK value that we are going to try when SplitK is supported for a given algo
    const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
    // Let try a fixed number of combinations
    // NOTE(review): re-#defines ALGO_COMBINATIONS/ALGO_IDS already defined by
    // LtIgemmCustomFind above (same values, so no warning-free guarantee).
#define ALGO_COMBINATIONS 50000
    int AlgoCombinations = ALGO_COMBINATIONS;
    int AlgoCount = 0;
    int kernelRepeats = 100;  // number of time the CUDA kernels will be run back to back
    // NOTE(review): 50000-entry stack array — large frame; confirm stack limit.
    customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
    int nbAlgoIds = 0;
#define ALGO_IDS 100
    int algoIdA[ALGO_IDS];
    // Map the <T, scaleT> template pair onto cublasLt data/scale types.
    cudaDataType_t Atype, Btype, Ctype, scaleType;
    Atype = CUDA_R_8I;
    Btype = CUDA_R_8I;
    if (std::is_same<T, int32_t>::value && std::is_same<scaleT, int>::value) {
        Ctype = CUDA_R_32I;
        scaleType = CUDA_R_32I;
    }
    else if (std::is_same<T, int8_t>::value && std::is_same<scaleT, float>::value) {
        Ctype = CUDA_R_8I;
        scaleType = CUDA_R_32F;
    }
    else {
        printf("[ERROR]<T,scaleT> of igemm is invalid\n");
        exit(-1);
    }
    // Accumulation is always 32-bit integer; the enum type differs across CUDA versions.
#if (CUDART_VERSION >= 11000)
    cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
    cudaDataType_t computeType = CUDA_R_32I;
#endif
    cublasOperation_t opTranspose = CUBLAS_OP_T;
    // SM80+ uses the COL32_2R_4R4 interleaved order for B; older parts use COL4_4R2_8C.
    bool use_ORDER_COL32_2R_4R4 = false;
#if (CUDART_VERSION >= 11000)
    int device{-1};
    cudaGetDevice(&device);
    cudaDeviceProp props;
    cudaGetDeviceProperties(&props, device);
    if (props.major * 10 + props.minor >= 80) {
        use_ORDER_COL32_2R_4R4 = true;
    }
#endif
    cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
    cublasLtOrder_t order_matrixB;
#if (CUDART_VERSION >= 11000)
    if (use_ORDER_COL32_2R_4R4) {
        order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
    }
    else {
        order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
    }
#else
    order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
    // Leading dimensions required by the COL32-family layouts.
    int ldaTransform = 32 * m;
    int ldbTransform;
    if (use_ORDER_COL32_2R_4R4) {
        ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
    }
    else {
        ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
    }
    int ldcTransform = 32 * m;
    // Per-matrix batch strides (element counts between consecutive batch members).
    // NOTE(review): the products are computed in 32-bit int before widening to
    // int64_t — could overflow for very large m/n/k; confirm expected ranges.
    int64_t stridea, strideb, stridec;
    stridea = m * k;
    strideb = n * k;
    stridec = m * n;
#if (CUDART_VERSION >= 11000)
    status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
    status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
#endif
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
    // Create matrix descriptors.
    status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, ldaTransform);
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    status = cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    // Attach batch count and stride so a single matmul covers the whole batch.
    cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
    cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea));
    status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, n, k, ldbTransform);
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    status =
        cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB));
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
    cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb));
    status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldcTransform);
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    status = cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
    cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec));
    // Request AlgoId available for IGEMM
    status = cublasLtMatmulAlgoGetIds(
        ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, ALGO_IDS, algoIdA, &nbAlgoIds);
    if (status != CUBLAS_STATUS_SUCCESS) {
        goto CLEANUP;
    }
    // Loop over the Algo IDs
    for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) {
        cublasLtMatmulAlgo_t algo;
        size_t sizeWritten = 0;
        /* Initialize algo structure with given Algo ID */
        status =
            cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, algoIdA[idx], &algo);
        if (status != CUBLAS_STATUS_SUCCESS) {
            continue;
        }
        // Query the tiles enums supported by that algo
        // (first call with NULL buffer returns the byte count needed).
        cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten);
        int nbTiles = int(sizeWritten / sizeof(int));
        int* tileA = new int[nbTiles == 0 ? 1 : nbTiles];
        if (nbTiles == 0) {
            tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED;
            nbTiles = 1;
        }
#if (CUDART_VERSION >= 11000)
        // Same size-query-then-fetch dance for the supported stages counts (CUDA 11+).
        cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten);
        int nbStages = int(sizeWritten / sizeof(int));
        std::vector<int> stagesA(nbStages == 0 ? 1 : nbStages);
        if (nbStages == 0) {
            stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED;
            nbStages = 1;
        }
        else {
            cublasLtMatmulAlgoCapGetAttribute(
                &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten);
        }
#endif
        int splitkSupport, redMask, swizzlingMax, customOptionMax;
        // Retrieve Algo Capabilities attributes to be able to setup loop over the different combinations
        cublasLtMatmulAlgoCapGetAttribute(
            &algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten);
        cublasLtMatmulAlgoCapGetAttribute(
            &algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten);
        cublasLtMatmulAlgoCapGetAttribute(
            &algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten);
        cublasLtMatmulAlgoCapGetAttribute(
            &algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten);
        cublasLtMatmulAlgoCapGetAttribute(
            &algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten);
        /* Loop over the different tiles */
        for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) {
#if (CUDART_VERSION >= 11000)
            /* Loop over different stages count */
            for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) {
                cublasLtMatmulAlgoConfigSetAttribute(
                    &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx]));
#endif
                /* Loop over the different custom option if any */
                for (int customOption = 0; customOption <= customOptionMax; customOption++) {
                    cublasLtMatmulAlgoConfigSetAttribute(
                        &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption));
                    /* Loop over the CTAs swizzling support */
                    // NOTE(review): this loop variable shadows the function parameter `k`
                    // (the GEMM K dimension) for the remainder of this scope.
                    for (int k = 0; k <= swizzlingMax; k++) {
                        int splitK_trial = 0;
                        if (splitkSupport) {
                            splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]);
                        }
                        // Loop over the splitK value over a fixed sequence splitKSequenceA in addition to the case
                        // where splitK is not enabled
                        for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) {
                            /* Setup attribute of the algo to run */
                            cublasLtMatmulAlgoConfigSetAttribute(
                                &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx]));
                            int splitK_val = 0;
                            int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
                            cublasLtMatmulAlgoConfigSetAttribute(
                                &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val));
                            cublasLtMatmulAlgoConfigSetAttribute(
                                &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k));
                            cublasLtMatmulAlgoConfigSetAttribute(
                                &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int));
                            if (l > 0) {  // Split-K case
                                splitK_val = splitKSequenceA[l - 1];
                                cublasLtMatmulAlgoConfigSetAttribute(&algo,
                                                                     CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
                                                                     &splitKSequenceA[l - 1],
                                                                     sizeof(splitKSequenceA[l - 1]));
                                /* Going over all the reduction scheme */
                                for (redScheme = 1;
                                     redScheme <= (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations);
                                     redScheme = redScheme << 1) {
                                    if (redScheme & redMask) {
                                        cublasLtMatmulAlgoConfigSetAttribute(&algo,
                                                                             CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
                                                                             &redScheme,
                                                                             sizeof(redScheme));
                                        // Time this fully-specified configuration (C doubles as D: in-place output).
                                        status = customMatmulRun(ltHandle,
                                                                 operationDesc,
                                                                 alpha, /* host or device pointer */
                                                                 A,
                                                                 Adesc,
                                                                 B,
                                                                 Bdesc,
                                                                 beta, /* host or device pointer */
                                                                 C,
                                                                 Cdesc,
                                                                 C,
                                                                 Cdesc,
                                                                 algo,
                                                                 kernelRepeats,
                                                                 workSpace,
                                                                 workSpaceSize,
                                                                 perfResults[AlgoCount],
                                                                 stream);
                                        perfResults[AlgoCount].status = status;
                                        if (status == CUBLAS_STATUS_SUCCESS) {
                                            AlgoCount++;
                                        }
                                    }  // end if
                                }  // end for
                            }
                            else {  // Non-splitK case
                                /* if user preference is ok with workspace */
                                if (AlgoCount < AlgoCombinations) {
                                    status = customMatmulRun(ltHandle,
                                                             operationDesc,
                                                             alpha, /* host or device pointer */
                                                             A,
                                                             Adesc,
                                                             B,
                                                             Bdesc,
                                                             beta, /* host or device pointer */
                                                             C,
                                                             Cdesc,
                                                             C,
                                                             Cdesc,
                                                             algo,
                                                             kernelRepeats,
                                                             workSpace,
                                                             workSpaceSize,
                                                             perfResults[AlgoCount],
                                                             stream);
                                    perfResults[AlgoCount].status = status;
                                    if (status == CUBLAS_STATUS_SUCCESS) {
                                        AlgoCount++;
                                    }
                                }
                            }
                        }  // end l
                    }  // end k
                }  // end customOption
#if (CUDART_VERSION >= 11000)
            }  // end stagesIdx
#endif
        }  // end tileIdx
        delete[] tileA;
    }  // end idx
    // Sort the results per run duration
    std::sort(perfResults, perfResults + AlgoCount, time_compare);
    // Print timing and perf details
    for (int i = 0, hasPrint = 0; i < AlgoCount; i++) {
        printf("result %03d : ", i);
        hasPrint = printBatchPerfStructure(batchCount, m, n, k, perfResults[i], fout, hasPrint);
    }
CLEANUP:
    // Descriptors are no longer needed as all GPU work was already enqueued
    if (Cdesc) {
        cublasLtMatrixLayoutDestroy(Cdesc);
    }
    if (Bdesc) {
        cublasLtMatrixLayoutDestroy(Bdesc);
    }
    if (Adesc) {
        cublasLtMatrixLayoutDestroy(Adesc);
    }
    if (operationDesc) {
        cublasLtMatmulDescDestroy(operationDesc);
    }
    return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
// Explicit instantiation: batched INT8 inputs, INT32 output with int32 scaling.
template int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
                                    int batchCount,
                                    int m,
                                    int n,
                                    int k,
                                    const int* alpha, /* host pointer */
                                    const int8_t* A,
                                    const int8_t* B,
                                    const int* beta, /* host pointer */
                                    int32_t* C,
                                    void* workSpace,
                                    size_t workSpaceSize,
                                    FILE* fout);
// Explicit instantiation: batched INT8 inputs, INT8 output with float scaling.
template int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
                                    int batchCount,
                                    int m,
                                    int n,
                                    int k,
                                    const float* alpha, /* host pointer */
                                    const int8_t* A,
                                    const int8_t* B,
                                    const float* beta, /* host pointer */
                                    int8_t* C,
                                    void* workSpace,
                                    size_t workSpaceSize,
                                    FILE* fout);
// template<typename T, typename scaleT>
// int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
// int m,
// int n,
// int k,
// const scaleT* alpha, /* host pointer */
// const int8_t* A,
// const int8_t* B,
// const scaleT* beta, /* host pointer */
// T* C,
// void* workSpace,
// size_t workSpaceSize,
// FILE* fout)
// {
// cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
// cublasLtMatmulDesc_t operationDesc = NULL;
// cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
// cudaStream_t stream = 0;
// // SplitK value that we are going to try when SplitK is supported for a given algo
// const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
// // Let try a fixed number of combinations
// #define ALGO_COMBINATIONS 50000
// int AlgoCombinations = ALGO_COMBINATIONS;
// int AlgoCount = 0;
// int kernelRepeats = 100; // number of time the CUDA kernels will be run back to back
// customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
// int nbAlgoIds = 0;
// #define ALGO_IDS 100
// int algoIdA[ALGO_IDS];
// cudaDataType_t Atype, Btype, Ctype, scaleType;
// Atype = CUDA_R_8I;
// Btype = CUDA_R_8I;
// if (std::is_same<T, int32_t>::value && std::is_same<scaleT, int>::value) {
// Ctype = CUDA_R_32I;
// scaleType = CUDA_R_32I;
// }
// else if (std::is_same<T, int8_t>::value && std::is_same<scaleT, float>::value) {
// Ctype = CUDA_R_8I;
// scaleType = CUDA_R_32F;
// }
// else {
// printf("[ERROR]<T,scaleT> of igemm is invalid\n");
// exit(-1);
// }
// // #if (CUDART_VERSION >= 11000)
// // cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
// // #else
// cudaDataType_t computeType = CUDA_R_32I;
// // #endif
// cublasOperation_t opTranspose = CUBLAS_OP_T;
// bool use_ORDER_COL32_2R_4R4 = false;
// // #if (CUDART_VERSION >= 11000)
// // int device{-1};
// // cudaGetDevice(&device);
// // cudaDeviceProp props;
// // cudaGetDeviceProperties(&props, device);
// // if (props.major * 10 + props.minor >= 80) {
// // use_ORDER_COL32_2R_4R4 = true;
// // }
// // #endif
// cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
// cublasLtOrder_t order_matrixB;
// // #if (CUDART_VERSION >= 11000)
// // if (use_ORDER_COL32_2R_4R4) {
// // order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
// // }
// // else {
// // order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
// // }
// // #else
// order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
// // #endif
// int ldaTransform = 32 * m;
// int ldbTransform;
// if (use_ORDER_COL32_2R_4R4) {
// ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
// }
// else {
// ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
// }
// int ldcTransform = 32 * m;
// // #if (CUDART_VERSION >= 11000)
// // status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
// // #else
// status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
// // #endif
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
// // Create matrix descriptors.
// status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, ldaTransform);
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// status = cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, n, k, ldbTransform);
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// status =
// cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB));
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldcTransform);
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// status = cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// // Request AlgoId available for IGEMM
// status = cublasLtMatmulAlgoGetIds(
// ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, ALGO_IDS, algoIdA, &nbAlgoIds);
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// // Loop over the Algo IDs
// for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) {
// cublasLtMatmulAlgo_t algo;
// size_t sizeWritten = 0;
// /* Initialize algo structure with given Algo ID */
// status =
// cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, algoIdA[idx], &algo);
// if (status != CUBLAS_STATUS_SUCCESS) {
// continue;
// }
// // Query the tiles enums supported by that algo
// cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten);
// int nbTiles = int(sizeWritten / sizeof(int));
// int* tileA = new int[nbTiles == 0 ? 1 : nbTiles];
// if (nbTiles == 0) {
// tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED;
// nbTiles = 1;
// }
// // #if (CUDART_VERSION >= 11000)
// // cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten);
// // int nbStages = int(sizeWritten / sizeof(int));
// // std::vector<int> stagesA(nbStages == 0 ? 1 : nbStages);
// // if (nbStages == 0) {
// // stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED;
// // nbStages = 1;
// // }
// // else {
// // cublasLtMatmulAlgoCapGetAttribute(
// // &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten);
// // }
// // #endif
// int splitkSupport, redMask, swizzlingMax, customOptionMax;
// // Retrieve Algo Capabilities attributes to be able to setup loop over the different combinations
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten);
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten);
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten);
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten);
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten);
// /* Loop over the different tiles */
// for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) {
// // #if (CUDART_VERSION >= 11000)
// // /* Loop over different stages count */
// // for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) {
// // cublasLtMatmulAlgoConfigSetAttribute(
// // &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx]));
// // #endif
// /* Loop over the different custom option if any */
// for (int customOption = 0; customOption <= customOptionMax; customOption++) {
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption));
// /* Loop over the CTAs swizzling support */
// for (int k = 0; k <= swizzlingMax; k++) {
// int splitK_trial = 0;
// if (splitkSupport) {
// splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]);
// }
// // Loop over the splitK value over a fixed sequence splitKSequenceA in addition to the case
// // where splitK is not enabled
// for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) {
// /* Setup attribute of the algo to run */
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx]));
// int splitK_val = 0;
// int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int));
// if (l > 0) { // Split-K case
// splitK_val = splitKSequenceA[l - 1];
// cublasLtMatmulAlgoConfigSetAttribute(&algo,
// CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
// &splitKSequenceA[l - 1],
// sizeof(splitKSequenceA[l - 1]));
// /* Going over all the reduction scheme */
// for (redScheme = 1;
// redScheme <= (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations);
// redScheme = redScheme << 1) {
// if (redScheme & redMask) {
// cublasLtMatmulAlgoConfigSetAttribute(&algo,
// CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
// &redScheme,
// sizeof(redScheme));
// status = customMatmulRun(ltHandle,
// operationDesc,
// alpha, /* host or device pointer */
// A,
// Adesc,
// B,
// Bdesc,
// beta, /* host or device pointer */
// C,
// Cdesc,
// C,
// Cdesc,
// algo,
// kernelRepeats,
// workSpace,
// workSpaceSize,
// perfResults[AlgoCount],
// stream);
// perfResults[AlgoCount].status = status;
// if (status == CUBLAS_STATUS_SUCCESS) {
// AlgoCount++;
// }
// } // end if
// } // end for
// }
// else { // Non-splitK case
// /* if user preference is ok with workspace */
// if (AlgoCount < AlgoCombinations) {
// status = customMatmulRun(ltHandle,
// operationDesc,
// alpha, /* host or device pointer */
// A,
// Adesc,
// B,
// Bdesc,
// beta, /* host or device pointer */
// C,
// Cdesc,
// C,
// Cdesc,
// algo,
// kernelRepeats,
// workSpace,
// workSpaceSize,
// perfResults[AlgoCount],
// stream);
// perfResults[AlgoCount].status = status;
// if (status == CUBLAS_STATUS_SUCCESS) {
// AlgoCount++;
// }
// }
// }
// } // end l
// } // end k
// } // end customOption
// // #if (CUDART_VERSION >= 11000)
// // } // end stagesIdx
// // #endif
// } // end tileIdx
// delete[] tileA;
// } // end idx
// // Sort the results per run duration
// std::sort(perfResults, perfResults + AlgoCount, time_compare);
// // Print timing and perf details
// for (int i = 0, hasPrint = 0; i < AlgoCount; i++) {
// printf("result %03d : ", i);
// hasPrint = printPerfStructure(m, n, k, perfResults[i], fout, hasPrint);
// }
// CLEANUP:
// // Descriptors are no longer needed as all GPU work was already enqueued
// if (Cdesc) {
// cublasLtMatrixLayoutDestroy(Cdesc);
// }
// if (Bdesc) {
// cublasLtMatrixLayoutDestroy(Bdesc);
// }
// if (Adesc) {
// cublasLtMatrixLayoutDestroy(Adesc);
// }
// if (operationDesc) {
// cublasLtMatmulDescDestroy(operationDesc);
// }
// return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
// }
// template int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
// int m,
// int n,
// int k,
// const int* alpha, /* host pointer */
// const int8_t* A,
// const int8_t* B,
// const int* beta, /* host pointer */
// int32_t* C,
// void* workSpace,
// size_t workSpaceSize,
// FILE* fout);
// template int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
// int m,
// int n,
// int k,
// const float* alpha, /* host pointer */
// const int8_t* A,
// const int8_t* B,
// const float* beta, /* host pointer */
// int8_t* C,
// void* workSpace,
// size_t workSpaceSize,
// FILE* fout);
// template<typename T, typename scaleT>
// int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
// int batchCount,
// int m,
// int n,
// int k,
// const scaleT* alpha, /* host pointer */
// const int8_t* A,
// const int8_t* B,
// const scaleT* beta, /* host pointer */
// T* C,
// void* workSpace,
// size_t workSpaceSize,
// FILE* fout)
// {
// cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
// cublasLtMatmulDesc_t operationDesc = NULL;
// cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
// cudaStream_t stream = 0;
// // SplitK value that we are going to try when SplitK is supported for a given algo
// const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
// // Let's try a fixed number of combinations
// #define ALGO_COMBINATIONS 50000
// int AlgoCombinations = ALGO_COMBINATIONS;
// int AlgoCount = 0;
// int kernelRepeats = 100; // number of times the CUDA kernels will be run back to back
// customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
// int nbAlgoIds = 0;
// #define ALGO_IDS 100
// int algoIdA[ALGO_IDS];
// cudaDataType_t Atype, Btype, Ctype, scaleType;
// Atype = CUDA_R_8I;
// Btype = CUDA_R_8I;
// if (std::is_same<T, int32_t>::value && std::is_same<scaleT, int>::value) {
// Ctype = CUDA_R_32I;
// scaleType = CUDA_R_32I;
// }
// else if (std::is_same<T, int8_t>::value && std::is_same<scaleT, float>::value) {
// Ctype = CUDA_R_8I;
// scaleType = CUDA_R_32F;
// }
// else {
// printf("[ERROR]<T,scaleT> of igemm is invalid\n");
// exit(-1);
// }
// // #if (CUDART_VERSION >= 11000)
// // cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
// // #else
// cudaDataType_t computeType = CUDA_R_32I;
// // #endif
// cublasOperation_t opTranspose = CUBLAS_OP_T;
// bool use_ORDER_COL32_2R_4R4 = false;
// // #if (CUDART_VERSION >= 11000)
// // int device{-1};
// // cudaGetDevice(&device);
// // cudaDeviceProp props;
// // cudaGetDeviceProperties(&props, device);
// // if (props.major * 10 + props.minor >= 80) {
// // use_ORDER_COL32_2R_4R4 = true;
// // }
// // #endif
// cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
// cublasLtOrder_t order_matrixB;
// // #if (CUDART_VERSION >= 11000)
// // if (use_ORDER_COL32_2R_4R4) {
// // order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
// // }
// // else {
// // order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
// // }
// // #else
// order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
// // #endif
// int ldaTransform = 32 * m;
// int ldbTransform;
// if (use_ORDER_COL32_2R_4R4) {
// ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
// }
// else {
// ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
// }
// int ldcTransform = 32 * m;
// int64_t stridea, strideb, stridec;
// stridea = m * k;
// strideb = n * k;
// stridec = m * n;
// // #if (CUDART_VERSION >= 11000)
// // status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
// // #else
// status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
// // #endif
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
// // Create matrix descriptors.
// status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, ldaTransform);
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// status = cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
// cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea));
// status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, n, k, ldbTransform);
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// status =
// cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB));
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
// cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb));
// status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldcTransform);
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// status = cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
// cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec));
// // Request AlgoId available for IGEMM
// status = cublasLtMatmulAlgoGetIds(
// ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, ALGO_IDS, algoIdA, &nbAlgoIds);
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// // Loop over the Algo IDs
// for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) {
// cublasLtMatmulAlgo_t algo;
// size_t sizeWritten = 0;
// /* Initialize algo structure with given Algo ID */
// status =
// cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, algoIdA[idx], &algo);
// if (status != CUBLAS_STATUS_SUCCESS) {
// continue;
// }
// // Query the tiles enums supported by that algo
// cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten);
// int nbTiles = int(sizeWritten / sizeof(int));
// int* tileA = new int[nbTiles == 0 ? 1 : nbTiles];
// if (nbTiles == 0) {
// tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED;
// nbTiles = 1;
// }
// // #if (CUDART_VERSION >= 11000)
// // cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten);
// // int nbStages = int(sizeWritten / sizeof(int));
// // std::vector<int> stagesA(nbStages == 0 ? 1 : nbStages);
// // if (nbStages == 0) {
// // stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED;
// // nbStages = 1;
// // }
// // else {
// // cublasLtMatmulAlgoCapGetAttribute(
// // &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten);
// // }
// // #endif
// int splitkSupport, redMask, swizzlingMax, customOptionMax;
// // Retrieve Algo Capabilities attributes to be able to setup loop over the different combinations
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten);
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten);
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten);
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten);
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten);
// /* Loop over the different tiles */
// for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) {
// // #if (CUDART_VERSION >= 11000)
// // /* Loop over different stages count */
// // for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) {
// // cublasLtMatmulAlgoConfigSetAttribute(
// // &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx]));
// // #endif
// /* Loop over the different custom option if any */
// for (int customOption = 0; customOption <= customOptionMax; customOption++) {
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption));
// /* Loop over the CTAs swizzling support */
// for (int k = 0; k <= swizzlingMax; k++) {
// int splitK_trial = 0;
// if (splitkSupport) {
// splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]);
// }
// // Loop over the splitK value over a fixed sequence splitKSequenceA in addition to the case
// // where splitK is not enabled
// for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) {
// /* Setup attribute of the algo to run */
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx]));
// int splitK_val = 0;
// int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int));
// if (l > 0) { // Split-K case
// splitK_val = splitKSequenceA[l - 1];
// cublasLtMatmulAlgoConfigSetAttribute(&algo,
// CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
// &splitKSequenceA[l - 1],
// sizeof(splitKSequenceA[l - 1]));
// /* Going over all the reduction scheme */
// for (redScheme = 1;
// redScheme <= (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations);
// redScheme = redScheme << 1) {
// if (redScheme & redMask) {
// cublasLtMatmulAlgoConfigSetAttribute(&algo,
// CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
// &redScheme,
// sizeof(redScheme));
// status = customMatmulRun(ltHandle,
// operationDesc,
// alpha, /* host or device pointer */
// A,
// Adesc,
// B,
// Bdesc,
// beta, /* host or device pointer */
// C,
// Cdesc,
// C,
// Cdesc,
// algo,
// kernelRepeats,
// workSpace,
// workSpaceSize,
// perfResults[AlgoCount],
// stream);
// perfResults[AlgoCount].status = status;
// if (status == CUBLAS_STATUS_SUCCESS) {
// AlgoCount++;
// }
// } // end if
// } // end for
// }
// else { // Non-splitK case
// /* if user preference is ok with workspace */
// if (AlgoCount < AlgoCombinations) {
// status = customMatmulRun(ltHandle,
// operationDesc,
// alpha, /* host or device pointer */
// A,
// Adesc,
// B,
// Bdesc,
// beta, /* host or device pointer */
// C,
// Cdesc,
// C,
// Cdesc,
// algo,
// kernelRepeats,
// workSpace,
// workSpaceSize,
// perfResults[AlgoCount],
// stream);
// perfResults[AlgoCount].status = status;
// if (status == CUBLAS_STATUS_SUCCESS) {
// AlgoCount++;
// }
// }
// }
// } // end l
// } // end k
// } // end customOption
// // #if (CUDART_VERSION >= 11000)
// // } // end stagesIdx
// // #endif
// } // end tileIdx
// delete[] tileA;
// } // end idx
// // Sort the results per run duration
// std::sort(perfResults, perfResults + AlgoCount, time_compare);
// // Print timing and perf details
// for (int i = 0, hasPrint = 0; i < AlgoCount; i++) {
// printf("result %03d : ", i);
// hasPrint = printBatchPerfStructure(batchCount, m, n, k, perfResults[i], fout, hasPrint);
// }
// CLEANUP:
// // Descriptors are no longer needed as all GPU work was already enqueued
// if (Cdesc) {
// cublasLtMatrixLayoutDestroy(Cdesc);
// }
// if (Bdesc) {
// cublasLtMatrixLayoutDestroy(Bdesc);
// }
// if (Adesc) {
// cublasLtMatrixLayoutDestroy(Adesc);
// }
// if (operationDesc) {
// cublasLtMatmulDescDestroy(operationDesc);
// }
// return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
// }
// template int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
// int batchCount,
// int m,
// int n,
// int k,
// const int* alpha, /* host pointer */
// const int8_t* A,
// const int8_t* B,
// const int* beta, /* host pointer */
// int32_t* C,
// void* workSpace,
// size_t workSpaceSize,
// FILE* fout);
// template int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
// int batchCount,
// int m,
// int n,
// int k,
// const float* alpha, /* host pointer */
// const int8_t* A,
// const int8_t* B,
// const float* beta, /* host pointer */
// int8_t* C,
// void* workSpace,
// size_t workSpaceSize,
// FILE* fout);
// initialize matrix in column-major
void matInit(int rows, int cols, int8_t* p, int ld)
......
......@@ -52,11 +52,11 @@ int printPerfStructure(int batch_size,
matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
#else
// #if (CUDART_VERSION >= 11000)
// cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
// #else
stages = 0;
#endif
// #endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
uint16_t inner_shapeId, cluster_shapeId;
cublasLtMatmulAlgoConfigGetAttribute(
......@@ -74,9 +74,9 @@ int printPerfStructure(int batch_size,
#endif
printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d "
#if (CUDART_VERSION >= 11000)
"stages=%d "
#endif
// #if (CUDART_VERSION >= 11000)
// "stages=%d "
// #endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"inner_shapeId=%d cluster_shapeId=%d"
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
......@@ -91,9 +91,9 @@ int printPerfStructure(int batch_size,
reductionScheme,
swizzle,
customOption,
#if (CUDART_VERSION >= 11000)
stages,
#endif
// #if (CUDART_VERSION >= 11000)
// stages,
// #endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
inner_shapeId,
cluster_shapeId,
......@@ -154,704 +154,704 @@ static inline bool time_compare(const customMatmulPerf_t& perf_a, const customMa
return ((perf_a.status == CUBLAS_STATUS_SUCCESS) && (perf_a.time < perf_b.time));
}
// Benchmark one cuBLASLt algorithm configuration for the given matmul problem.
//
// The candidate `algo` is first validated against the problem descriptors via
// cublasLtMatmulAlgoCheck. If it is usable and its required workspace fits in
// `workSpaceSizeInBytes`, cublasLtMatmul is launched `kernelRepeats` times
// back to back on `stream`, bracketed by the caller-provided CUDA events, and
// the average per-run time, required workspace size, and waves count are
// written into `perfResults`.
//
// Returns:
//   CUBLAS_STATUS_SUCCESS        - the algo ran and was timed successfully
//   CUBLAS_STATUS_NOT_SUPPORTED  - the algo needs more workspace than provided
//   CUBLAS_STATUS_INTERNAL_ERROR - a CUDA event record/sync/elapsed call failed
//   any other status             - propagated from the algo check or from a
//                                  failed cublasLtMatmul launch
//
// NOTE(review): all `kernelRepeats` launches are timed as one interval and the
// average is taken; per-run variance is not captured.
static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, // to get the capabilities (requires a GPU)
                                      cublasLtMatmulDesc_t operationDesc,
                                      const void* alpha, /* host or device pointer */
                                      const void* A,
                                      cublasLtMatrixLayout_t Adesc,
                                      const void* B,
                                      cublasLtMatrixLayout_t Bdesc,
                                      const void* beta, /* host or device pointer */
                                      const void* C,
                                      cublasLtMatrixLayout_t Cdesc,
                                      void* D,
                                      cublasLtMatrixLayout_t Ddesc,
                                      const cublasLtMatmulAlgo_t& algo,
                                      int kernelRepeats,
                                      void* workSpace,
                                      size_t workSpaceSizeInBytes,
                                      customMatmulPerf_t& perfResults,
                                      cudaStream_t stream,
                                      cudaEvent_t& startEvent,
                                      cudaEvent_t& stopEvent)
{
    // Workspace/waves requirements reported by the algo check below.
    cublasLtMatmulHeuristicResult_t heurResult;
    /* Number of back-to-back launches used for timing */
    int repeats = kernelRepeats;
    // Ask cuBLASLt whether this algo configuration can run this problem at all.
    cublasStatus_t algoStatus =
        cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult);
    if (algoStatus == CUBLAS_STATUS_SUCCESS) {
        if (heurResult.workspaceSize <= workSpaceSizeInBytes) {
            cudaError_t err, err1, err2, err3;
            // Time the whole batch of launches between the two events.
            err = cudaEventRecord(startEvent, stream);
            for (int loop = 0; loop < repeats; loop++) {
                cublasStatus_t oneRunStatus = cublasLtMatmul(ltHandle,
                                                             operationDesc,
                                                             alpha,
                                                             A,
                                                             Adesc,
                                                             B,
                                                             Bdesc,
                                                             beta,
                                                             C,
                                                             Cdesc,
                                                             D,
                                                             Ddesc,
                                                             &algo,
                                                             workSpace,
                                                             workSpaceSizeInBytes,
                                                             stream);
                if (oneRunStatus != CUBLAS_STATUS_SUCCESS) {
                    // Abort timing on the first failed launch and report it.
                    algoStatus = oneRunStatus;
                    break;
                }
            }
            err1 = cudaEventRecord(stopEvent, stream);
            // Blocking wait so the elapsed-time query below is valid.
            err2 = cudaEventSynchronize(stopEvent);
            float time;
            err3 = cudaEventElapsedTime(&time, startEvent, stopEvent);
            // Any event-API failure invalidates the measurement.
            if ((err != cudaSuccess) || (err1 != cudaSuccess) || (err2 != cudaSuccess) || (err3 != cudaSuccess)) {
                algoStatus = CUBLAS_STATUS_INTERNAL_ERROR;
            }
            // For the moment only add successful findings
            if (algoStatus == CUBLAS_STATUS_SUCCESS) {
                perfResults.algo           = algo;
                perfResults.time           = time / repeats; // average time per launch (ms)
                perfResults.workspaceSize  = heurResult.workspaceSize;
                perfResults.wavesCount     = heurResult.wavesCount;
            }
        }
        else {
            // printf("not enough workspace! %ld\n", heurResult.workspaceSize);
            algoStatus = CUBLAS_STATUS_NOT_SUPPORTED;  // Not enough workspace
        }
    }
    return algoStatus;
}
template<typename T, typename scaleT>
int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const scaleT* alpha, /* host pointer */
const T* A,
const T* B,
const scaleT* beta, /* host pointer */
T* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8,
int batchCount,
int64_t strideA,
int64_t strideB,
int64_t strideD)
{
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
cudaEvent_t startEvent;
cudaEvent_t stopEvent;
CublasDataType data_type;
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL;
cudaStream_t stream = 0;
// SplitK value that we are going to try when SplitK is supported for a
// given algo
const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
// Let try a fixed number of combinations
int AlgoCount = 0;
int AlgoCountRestrict = 0; // workspace == 0
const int maxNumTraversal = 50; // max number of traversal
std::vector<cublasLtMatmulAlgo_t> algos(AlgoCombinations); // 0 <= workspace <= 32MB
std::vector<cublasLtMatmulAlgo_t> algosRestrict(AlgoCombinations); // workspace == 0
const int kernelRepeats = 100; // number of time the CUDA kernels will be run back to back
int nbAlgoIds = 0; // Number of algorithms actually returned by
// cublasLtMatmulAlgoGetIds function.
#define ALGO_IDS 100 // Number of algorithms requested.
int algoIdA[ALGO_IDS]; // Array containing the algorithm IDs returned by
// cublasLtMatmulAlgoGetIds function.
cudaDataType_t Atype, Btype, Ctype, scaleType, Dtype;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType;
#else
cudaDataType_t computeType;
#endif
if (std::is_same<T, float>::value) {
data_type = FLOAT_DATATYPE;
Atype = CUDA_R_32F, Btype = CUDA_R_32F, Ctype = CUDA_R_32F, Dtype = CUDA_R_32F;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
Atype = CUDA_R_16F, Btype = CUDA_R_16F, Ctype = CUDA_R_16F, Dtype = CUDA_R_16F;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
data_type = BFLOAT16_DATATYPE;
Atype = CUDA_R_16BF, Btype = CUDA_R_16BF, Ctype = CUDA_R_16BF, Dtype = CUDA_R_16BF;
}
#endif
#ifdef ENABLE_FP8
else if (std::is_same<T, __nv_fp8_e4m3>::value) {
data_type = FP8_DATATYPE;
Atype = CUDA_R_8F_E4M3, Btype = CUDA_R_8F_E4M3, Ctype = CUDA_R_16BF;
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
Dtype = CUDA_R_16BF;
#else
Dtype = dtype_fp8;
#endif
}
#endif
if (sizeof(scaleT) == sizeof(float)) {
scaleType = CUDA_R_32F;
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_32F;
#else
computeType = CUDA_R_32F;
#endif
}
else {
scaleType = CUDA_R_16F;
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_16F;
#else
computeType = CUDA_R_16F;
#endif
}
const cublasOperation_t tA = data_type == FP8_DATATYPE ? CUBLAS_OP_T : CUBLAS_OP_N;
// Create operation descriptor; see cublasLtMatmulDescAttributes_t for
// details about defaults; here we just need to set the transforms for A and
// B
#if (CUDART_VERSION >= 11000)
status = cublasLtMatmulDescCreate(&operationDesc, computeType,
scaleType); // creates a matrix multiply descriptor
#else
status = cublasLtMatmulDescCreate(&operationDesc, computeType);
#endif
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
#ifdef ENABLE_FP8
if (data_type == FP8_DATATYPE) {
const int8_t fastAccuMode = 1; // enable fast imprecise accum
status = cublasLtMatmulDescSetAttribute(
operationDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode)));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
}
#endif
// Create matrix descriptors. We are good with the details here so no need
// to set any extra attributes
if (data_type == FP8_DATATYPE) {
status = cublasLtMatrixLayoutCreate(&Adesc, Atype, k, m, k);
}
else {
status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, m);
}
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, k, n, k);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, m);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutCreate(&Ddesc, Dtype, m, n, m);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
// Create CUDA event to time the execution time of each algo
if (cudaEventCreate(&startEvent, cudaEventBlockingSync) != cudaSuccess) {
goto CLEANUP;
}
if (cudaEventCreate(&stopEvent, cudaEventBlockingSync) != cudaSuccess) {
goto CLEANUP;
}
// Request the 100 first AlgoId available
status = cublasLtMatmulAlgoGetIds(
ltHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, ALGO_IDS, algoIdA, &nbAlgoIds);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
if (nbAlgoIds > ALGO_IDS) {
printf(
"Warning: the algo id count is not large enough to guarantee the best algo %d, %d\n", nbAlgoIds, ALGO_IDS);
}
// Loop over the Algo IDs
// This loop doesn't work for fp8 gemm
for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) {
cublasLtMatmulAlgo_t algo;
size_t sizeWritten = 0;
/* Initialize algo structure with given Algp ID */
status =
cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, algoIdA[idx], &algo);
if (status != CUBLAS_STATUS_SUCCESS) {
continue;
}
// Query the tiles enums supported by that algo
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten);
int nbTiles = int(sizeWritten / sizeof(int));
int* tileA = new int[nbTiles == 0 ? 1 : nbTiles];
if (nbTiles == 0) {
tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED;
nbTiles = 1;
}
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten);
int nbStages = int(sizeWritten / sizeof(int));
std::vector<int> stagesA(nbStages == 0 ? 1 : nbStages);
if (nbStages == 0) {
stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED;
nbStages = 1;
}
else {
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten);
}
#endif
int splitkSupport, redMask, swizzlingMax, customOptionMax;
// Retrieve Algo Capabilities attributes to be able to setup loop over
// the different combinations
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten);
/* Loop over the different tiles */
for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) {
#if (CUDART_VERSION >= 11000)
/* Loop over different stages count */
for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx]));
#endif
/* Loop over the different custom option if any */
for (int customOption = 0; customOption <= customOptionMax; customOption++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption));
/* Loop over the CTAs swizzling support */
for (int k = 0; k <= swizzlingMax; k++) {
int splitK_trial = 0;
if (splitkSupport) {
splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]);
}
// Loop over the splitK value over a fixed sequence
// splitKSequenceA in addition to the case where splitK
// is not enabled
for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) {
/* Setup attribute of the algo to run */
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx]));
int splitK_val = 0;
int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int));
if (l > 0) { // Split-K case
splitK_val = splitKSequenceA[l - 1];
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
&splitKSequenceA[l - 1],
sizeof(splitKSequenceA[l - 1]));
/* Going over all the reduction scheme */
for (redScheme = 1;
redScheme < (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations);
redScheme = redScheme << 1) {
if (redScheme & redMask) {
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
&redScheme,
sizeof(redScheme));
cublasLtMatmulHeuristicResult_t heurResult;
cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck(
ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult);
if (heurResult.workspaceSize > workSpaceSize) {
// printf("not enough workspace!
// %ld\n",
// heurResult.workspaceSize);
algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace
}
else if (heurResult.workspaceSize == 0) {
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
algosRestrict[AlgoCountRestrict++] = algo;
}
}
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
algos[AlgoCount++] = algo;
}
} // end if
} // end for
}
else { // Non-splitK case
/* if user preference is ok with workspace */
if (AlgoCount < AlgoCombinations) {
cublasLtMatmulHeuristicResult_t heurResult;
cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck(
ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult);
if (heurResult.workspaceSize > workSpaceSize) {
// printf("not enough workspace! %ld\n",
// heurResult.workspaceSize);
algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not
// enough
// workspace
}
else if (heurResult.workspaceSize == 0) {
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
algosRestrict[AlgoCountRestrict++] = algo;
}
}
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
algos[AlgoCount++] = algo;
}
}
}
} // end l
} // end k
} // end customOption
#if (CUDART_VERSION >= 11000)
} // end stagesIdx
#endif
} // end tileIdx
delete[] tileA;
} // end idx
printf("AlgoCount: %d\n", AlgoCount);
if (data_type == FP8_DATATYPE) {
assert(AlgoCount == 0);
}
if (AlgoCount < maxNumTraversal && data_type != FP8_DATATYPE) {
// 0 <= workspacesize <= 32MB
for (int i = 0; i < AlgoCount; i++) {
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Cdesc,
algos[i],
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[i],
stream,
startEvent,
stopEvent);
perfResults[i].status = status;
// if (status == CUBLAS_STATUS_SUCCESS) AlgoCount++;
}
}
else {
// Heuristic + workspacesize==0
AlgoCount = 0;
nbAlgoIds = 0;
cublasLtMatmulPreference_t pref;
cublasLtMatmulPreferenceCreate(&pref);
uint64_t maxWorkSpaceSize = workSpaceSize; //(32MB)
cublasLtMatmulPreferenceSetAttribute(
pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &maxWorkSpaceSize, sizeof(maxWorkSpaceSize));
cublasLtMatmulHeuristicResult_t heuristicResultsArray[maxNumTraversal];
cublasLtMatmulAlgoGetHeuristic(ltHandle,
operationDesc,
Adesc,
Bdesc,
Cdesc,
Ddesc,
pref,
maxNumTraversal,
heuristicResultsArray,
&nbAlgoIds);
cublasLtMatmulPreferenceDestroy(pref);
printf("return %d and run heuristic algo\n", nbAlgoIds);
for (int i = 0; i < nbAlgoIds; i++) {
if (heuristicResultsArray[i].state == CUBLAS_STATUS_SUCCESS) {
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Ddesc,
heuristicResultsArray[i].algo,
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[AlgoCount],
stream,
startEvent,
stopEvent);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
}
}
// workspacesize==0
printf("workspacesize==0, run %d algos\n", AlgoCountRestrict);
for (int i = 0; i < AlgoCountRestrict && i < (maxNumTraversal - nbAlgoIds); i++) {
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Ddesc,
algosRestrict[i],
kernelRepeats,
NULL,
0,
perfResults[AlgoCount],
stream,
startEvent,
stopEvent);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
}
}
// Sort the results per run duration
std::sort(perfResults, perfResults + AlgoCount, time_compare);
// Print timing and perf details
for (int i = 0, hasPrint = 1; i < AlgoCount; i++) {
printf("result %03d : ", i);
hasPrint = printPerfStructure(batch_size,
seq_len,
head_num,
size_per_head,
m,
n,
k,
perfResults[i],
fout,
data_type,
hasPrint,
batchCount);
}
CLEANUP:
// Descriptors are no longer needed as all GPU work was already enqueued
if (Cdesc) {
cublasLtMatrixLayoutDestroy(Cdesc);
}
if (Bdesc) {
cublasLtMatrixLayoutDestroy(Bdesc);
}
if (Adesc) {
cublasLtMatrixLayoutDestroy(Adesc);
}
if (operationDesc) {
cublasLtMatmulDescDestroy(operationDesc);
}
if (startEvent) {
cudaEventDestroy(startEvent);
}
if (stopEvent) {
cudaEventDestroy(stopEvent);
}
return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
// ---------------------------------------------------------------------------
// Explicit instantiations of the LtHgemmCustomFind<T, scaleT> template so the
// required specializations are emitted in this translation unit and can be
// linked from callers elsewhere in the project.
// ---------------------------------------------------------------------------

// float data, float alpha/beta.
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
                               int batch_size,
                               int seq_len,
                               int head_num,
                               int size_per_head,
                               int m,
                               int n,
                               int k,
                               const float* alpha, /* host pointer */
                               const float* A,
                               const float* B,
                               const float* beta, /* host pointer */
                               float* C,
                               void* workSpace,
                               size_t workSpaceSize,
                               FILE* fout,
                               customMatmulPerf_t perfResults[],
                               int AlgoCombinations,
                               cudaDataType_t dtype_fp8,
                               int batchCount,
                               int64_t strideA,
                               int64_t strideB,
                               int64_t strideD);
// half data, half alpha/beta.
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
                               int batch_size,
                               int seq_len,
                               int head_num,
                               int size_per_head,
                               int m,
                               int n,
                               int k,
                               const half* alpha, /* host pointer */
                               const half* A,
                               const half* B,
                               const half* beta, /* host pointer */
                               half* C,
                               void* workSpace,
                               size_t workSpaceSize,
                               FILE* fout,
                               customMatmulPerf_t perfResults[],
                               int AlgoCombinations,
                               cudaDataType_t dtype_fp8,
                               int batchCount,
                               int64_t strideA,
                               int64_t strideB,
                               int64_t strideD);
#ifdef ENABLE_BF16
// __nv_bfloat16 data, float alpha/beta.
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
                               int batch_size,
                               int seq_len,
                               int head_num,
                               int size_per_head,
                               int m,
                               int n,
                               int k,
                               const float* alpha, /* host pointer */
                               const __nv_bfloat16* A,
                               const __nv_bfloat16* B,
                               const float* beta, /* host pointer */
                               __nv_bfloat16* C,
                               void* workSpace,
                               size_t workSpaceSize,
                               FILE* fout,
                               customMatmulPerf_t perfResults[],
                               int AlgoCombinations,
                               cudaDataType_t dtype_fp8,
                               int batchCount,
                               int64_t strideA,
                               int64_t strideB,
                               int64_t strideD);
#endif
#ifdef ENABLE_FP8
// __nv_fp8_e4m3 data, float alpha/beta.
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
                               int batch_size,
                               int seq_len,
                               int head_num,
                               int size_per_head,
                               int m,
                               int n,
                               int k,
                               const float* alpha, /* host pointer */
                               const __nv_fp8_e4m3* A,
                               const __nv_fp8_e4m3* B,
                               const float* beta, /* host pointer */
                               __nv_fp8_e4m3* C,
                               void* workSpace,
                               size_t workSpaceSize,
                               FILE* fout,
                               customMatmulPerf_t perfResults[],
                               int AlgoCombinations,
                               cudaDataType_t dtype_fp8,
                               int batchCount,
                               int64_t strideA,
                               int64_t strideB,
                               int64_t strideD);
#endif
// half data, float alpha/beta (mixed-precision scale type).
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
                               int batch_size,
                               int seq_len,
                               int head_num,
                               int size_per_head,
                               int m,
                               int n,
                               int k,
                               const float* alpha, /* host pointer */
                               const half* A,
                               const half* B,
                               const float* beta, /* host pointer */
                               half* C,
                               void* workSpace,
                               size_t workSpaceSize,
                               FILE* fout,
                               customMatmulPerf_t perfResults[],
                               int AlgoCombinations,
                               cudaDataType_t dtype_fp8,
                               int batchCount,
                               int64_t strideA,
                               int64_t strideB,
                               int64_t strideD);
// static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, // to get the capabilities (required a GPU)
// cublasLtMatmulDesc_t operationDesc,
// const void* alpha, /* host or device pointer */
// const void* A,
// cublasLtMatrixLayout_t Adesc,
// const void* B,
// cublasLtMatrixLayout_t Bdesc,
// const void* beta, /* host or device pointer */
// const void* C,
// cublasLtMatrixLayout_t Cdesc,
// void* D,
// cublasLtMatrixLayout_t Ddesc,
// const cublasLtMatmulAlgo_t& algo,
// int kernelRepeats,
// void* workSpace,
// size_t workSpaceSizeInBytes,
// customMatmulPerf_t& perfResults,
// cudaStream_t stream,
// cudaEvent_t& startEvent,
// cudaEvent_t& stopEvent)
// {
// cublasLtMatmulHeuristicResult_t heurResult;
// /* Looping over the Algo */
// int repeats = kernelRepeats;
// cublasStatus_t algoStatus =
// cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult);
// if (algoStatus == CUBLAS_STATUS_SUCCESS) {
// if (heurResult.workspaceSize <= workSpaceSizeInBytes) {
// cudaError_t err, err1, err2, err3;
// err = cudaEventRecord(startEvent, stream);
// for (int loop = 0; loop < repeats; loop++) {
// cublasStatus_t oneRunStatus = cublasLtMatmul(ltHandle,
// operationDesc,
// alpha,
// A,
// Adesc,
// B,
// Bdesc,
// beta,
// C,
// Cdesc,
// D,
// Ddesc,
// &algo,
// workSpace,
// workSpaceSizeInBytes,
// stream);
// if (oneRunStatus != CUBLAS_STATUS_SUCCESS) {
// algoStatus = oneRunStatus;
// break;
// }
// }
// err1 = cudaEventRecord(stopEvent, stream);
// err2 = cudaEventSynchronize(stopEvent);
// float time;
// err3 = cudaEventElapsedTime(&time, startEvent, stopEvent);
// if ((err != cudaSuccess) || (err1 != cudaSuccess) || (err2 != cudaSuccess) || (err3 != cudaSuccess)) {
// algoStatus = CUBLAS_STATUS_INTERNAL_ERROR;
// }
// // For the moment only add successful findings
// if (algoStatus == CUBLAS_STATUS_SUCCESS) {
// perfResults.algo = algo;
// perfResults.time = time / repeats;
// perfResults.workspaceSize = heurResult.workspaceSize;
// perfResults.wavesCount = heurResult.wavesCount;
// }
// }
// else {
// // printf("not enough workspace! %ld\n", heurResult.workspaceSize);
// algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace
// }
// }
// return algoStatus;
// }
// template<typename T, typename scaleT>
// int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
// int batch_size,
// int seq_len,
// int head_num,
// int size_per_head,
// int m,
// int n,
// int k,
// const scaleT* alpha, /* host pointer */
// const T* A,
// const T* B,
// const scaleT* beta, /* host pointer */
// T* C,
// void* workSpace,
// size_t workSpaceSize,
// FILE* fout,
// customMatmulPerf_t perfResults[],
// int AlgoCombinations,
// cudaDataType_t dtype_fp8,
// int batchCount,
// int64_t strideA,
// int64_t strideB,
// int64_t strideD)
// {
// cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
// cudaEvent_t startEvent;
// cudaEvent_t stopEvent;
// CublasDataType data_type;
// cublasLtMatmulDesc_t operationDesc = NULL;
// cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL;
// cudaStream_t stream = 0;
// // SplitK value that we are going to try when SplitK is supported for a
// // given algo
// const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
// // Let try a fixed number of combinations
// int AlgoCount = 0;
// int AlgoCountRestrict = 0; // workspace == 0
// const int maxNumTraversal = 50; // max number of traversal
// std::vector<cublasLtMatmulAlgo_t> algos(AlgoCombinations); // 0 <= workspace <= 32MB
// std::vector<cublasLtMatmulAlgo_t> algosRestrict(AlgoCombinations); // workspace == 0
// const int kernelRepeats = 100; // number of time the CUDA kernels will be run back to back
// int nbAlgoIds = 0; // Number of algorithms actually returned by
// // cublasLtMatmulAlgoGetIds function.
// #define ALGO_IDS 100 // Number of algorithms requested.
// int algoIdA[ALGO_IDS]; // Array containing the algorithm IDs returned by
// // cublasLtMatmulAlgoGetIds function.
// cudaDataType_t Atype, Btype, Ctype, scaleType, Dtype;
// // #if (CUDART_VERSION >= 11000)
// // cublasComputeType_t computeType;
// // #else
// cudaDataType_t computeType;
// // #endif
// if (std::is_same<T, float>::value) {
// data_type = FLOAT_DATATYPE;
// Atype = CUDA_R_32F, Btype = CUDA_R_32F, Ctype = CUDA_R_32F, Dtype = CUDA_R_32F;
// }
// else if (std::is_same<T, half>::value) {
// data_type = HALF_DATATYPE;
// Atype = CUDA_R_16F, Btype = CUDA_R_16F, Ctype = CUDA_R_16F, Dtype = CUDA_R_16F;
// }
// #ifdef ENABLE_BF16
// else if (std::is_same<T, __nv_bfloat16>::value) {
// data_type = BFLOAT16_DATATYPE;
// Atype = CUDA_R_16BF, Btype = CUDA_R_16BF, Ctype = CUDA_R_16BF, Dtype = CUDA_R_16BF;
// }
// #endif
// #ifdef ENABLE_FP8
// else if (std::is_same<T, __nv_fp8_e4m3>::value) {
// data_type = FP8_DATATYPE;
// Atype = CUDA_R_8F_E4M3, Btype = CUDA_R_8F_E4M3, Ctype = CUDA_R_16BF;
// #ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
// Dtype = CUDA_R_16BF;
// #else
// Dtype = dtype_fp8;
// #endif
// }
// #endif
// if (sizeof(scaleT) == sizeof(float)) {
// scaleType = CUDA_R_32F;
// // #if (CUDART_VERSION >= 11000)
// // computeType = CUBLAS_COMPUTE_32F;
// // #else
// computeType = CUDA_R_32F;
// // #endif
// }
// else {
// scaleType = CUDA_R_16F;
// // #if (CUDART_VERSION >= 11000)
// // computeType = CUBLAS_COMPUTE_16F;
// // #else
// computeType = CUDA_R_16F;
// // #endif
// }
// const cublasOperation_t tA = data_type == FP8_DATATYPE ? CUBLAS_OP_T : CUBLAS_OP_N;
// // Create operation descriptor; see cublasLtMatmulDescAttributes_t for
// // details about defaults; here we just need to set the transforms for A and
// // B
// // #if (CUDART_VERSION >= 11000)
// // status = cublasLtMatmulDescCreate(&operationDesc, computeType,
// // scaleType); // creates a matrix multiply descriptor
// // #else
// status = cublasLtMatmulDescCreate(&operationDesc, computeType);
// // #endif
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// status = cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA));
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// #ifdef ENABLE_FP8
// if (data_type == FP8_DATATYPE) {
// const int8_t fastAccuMode = 1; // enable fast imprecise accum
// status = cublasLtMatmulDescSetAttribute(
// operationDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode)));
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// }
// #endif
// // Create matrix descriptors. We are good with the details here so no need
// // to set any extra attributes
// if (data_type == FP8_DATATYPE) {
// status = cublasLtMatrixLayoutCreate(&Adesc, Atype, k, m, k);
// }
// else {
// status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, m);
// }
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, k, n, k);
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, m);
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// status = cublasLtMatrixLayoutCreate(&Ddesc, Dtype, m, n, m);
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// if (batchCount > 1) {
// check_cuda_error(cublasLtMatrixLayoutSetAttribute(
// Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
// check_cuda_error(cublasLtMatrixLayoutSetAttribute(
// Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
// check_cuda_error(cublasLtMatrixLayoutSetAttribute(
// Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
// check_cuda_error(cublasLtMatrixLayoutSetAttribute(
// Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
// check_cuda_error(cublasLtMatrixLayoutSetAttribute(
// Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA)));
// check_cuda_error(cublasLtMatrixLayoutSetAttribute(
// Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB)));
// check_cuda_error(cublasLtMatrixLayoutSetAttribute(
// Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
// check_cuda_error(cublasLtMatrixLayoutSetAttribute(
// Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
// }
// // Create CUDA event to time the execution time of each algo
// if (cudaEventCreate(&startEvent, cudaEventBlockingSync) != cudaSuccess) {
// goto CLEANUP;
// }
// if (cudaEventCreate(&stopEvent, cudaEventBlockingSync) != cudaSuccess) {
// goto CLEANUP;
// }
// // Request the 100 first AlgoId available
// status = cublasLtMatmulAlgoGetIds(
// ltHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, ALGO_IDS, algoIdA, &nbAlgoIds);
// if (status != CUBLAS_STATUS_SUCCESS) {
// goto CLEANUP;
// }
// if (nbAlgoIds > ALGO_IDS) {
// printf(
// "Warning: the algo id count is not large enough to guarantee the best algo %d, %d\n", nbAlgoIds, ALGO_IDS);
// }
// // Loop over the Algo IDs
// // This loop doesn't work for fp8 gemm
// for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) {
// cublasLtMatmulAlgo_t algo;
// size_t sizeWritten = 0;
// /* Initialize algo structure with given Algo ID */
// status =
// cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, algoIdA[idx], &algo);
// if (status != CUBLAS_STATUS_SUCCESS) {
// continue;
// }
// // Query the tiles enums supported by that algo
// cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten);
// int nbTiles = int(sizeWritten / sizeof(int));
// int* tileA = new int[nbTiles == 0 ? 1 : nbTiles];
// if (nbTiles == 0) {
// tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED;
// nbTiles = 1;
// }
// // #if (CUDART_VERSION >= 11000)
// // cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten);
// // int nbStages = int(sizeWritten / sizeof(int));
// // std::vector<int> stagesA(nbStages == 0 ? 1 : nbStages);
// // if (nbStages == 0) {
// // stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED;
// // nbStages = 1;
// // }
// // else {
// // cublasLtMatmulAlgoCapGetAttribute(
// // &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten);
// // }
// // #endif
// int splitkSupport, redMask, swizzlingMax, customOptionMax;
// // Retrieve Algo Capabilities attributes to be able to setup loop over
// // the different combinations
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten);
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten);
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten);
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten);
// cublasLtMatmulAlgoCapGetAttribute(
// &algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten);
// /* Loop over the different tiles */
// for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) {
// // #if (CUDART_VERSION >= 11000)
// // /* Loop over different stages count */
// // for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) {
// // cublasLtMatmulAlgoConfigSetAttribute(
// // &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx]));
// // #endif
// /* Loop over the different custom option if any */
// for (int customOption = 0; customOption <= customOptionMax; customOption++) {
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption));
// /* Loop over the CTAs swizzling support */
// for (int k = 0; k <= swizzlingMax; k++) {
// int splitK_trial = 0;
// if (splitkSupport) {
// splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]);
// }
// // Loop over the splitK value over a fixed sequence
// // splitKSequenceA in addition to the case where splitK
// // is not enabled
// for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) {
// /* Setup attribute of the algo to run */
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx]));
// int splitK_val = 0;
// int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k));
// cublasLtMatmulAlgoConfigSetAttribute(
// &algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int));
// if (l > 0) { // Split-K case
// splitK_val = splitKSequenceA[l - 1];
// cublasLtMatmulAlgoConfigSetAttribute(&algo,
// CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
// &splitKSequenceA[l - 1],
// sizeof(splitKSequenceA[l - 1]));
// /* Going over all the reduction scheme */
// for (redScheme = 1;
// redScheme < (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations);
// redScheme = redScheme << 1) {
// if (redScheme & redMask) {
// cublasLtMatmulAlgoConfigSetAttribute(&algo,
// CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
// &redScheme,
// sizeof(redScheme));
// cublasLtMatmulHeuristicResult_t heurResult;
// cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck(
// ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult);
// if (heurResult.workspaceSize > workSpaceSize) {
// // printf("not enough workspace!
// // %ld\n",
// // heurResult.workspaceSize);
// algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace
// }
// else if (heurResult.workspaceSize == 0) {
// if (algoStatus == CUBLAS_STATUS_SUCCESS) {
// algosRestrict[AlgoCountRestrict++] = algo;
// }
// }
// if (algoStatus == CUBLAS_STATUS_SUCCESS) {
// algos[AlgoCount++] = algo;
// }
// } // end if
// } // end for
// }
// else { // Non-splitK case
// /* if user preference is ok with workspace */
// if (AlgoCount < AlgoCombinations) {
// cublasLtMatmulHeuristicResult_t heurResult;
// cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck(
// ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult);
// if (heurResult.workspaceSize > workSpaceSize) {
// // printf("not enough workspace! %ld\n",
// // heurResult.workspaceSize);
// algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not
// // enough
// // workspace
// }
// else if (heurResult.workspaceSize == 0) {
// if (algoStatus == CUBLAS_STATUS_SUCCESS) {
// algosRestrict[AlgoCountRestrict++] = algo;
// }
// }
// if (algoStatus == CUBLAS_STATUS_SUCCESS) {
// algos[AlgoCount++] = algo;
// }
// }
// }
// } // end l
// } // end k
// } // end customOption
// // #if (CUDART_VERSION >= 11000)
// } // end stagesIdx
// // #endif
// } // end tileIdx
// delete[] tileA;
// } // end idx
// printf("AlgoCount: %d\n", AlgoCount);
// if (data_type == FP8_DATATYPE) {
// assert(AlgoCount == 0);
// }
// if (AlgoCount < maxNumTraversal && data_type != FP8_DATATYPE) {
// // 0 <= workspacesize <= 32MB
// for (int i = 0; i < AlgoCount; i++) {
// status = customMatmulRun(ltHandle,
// operationDesc,
// alpha, /* host or device pointer */
// A,
// Adesc,
// B,
// Bdesc,
// beta, /* host or device pointer */
// C,
// Cdesc,
// C,
// Cdesc,
// algos[i],
// kernelRepeats,
// workSpace,
// workSpaceSize,
// perfResults[i],
// stream,
// startEvent,
// stopEvent);
// perfResults[i].status = status;
// // if (status == CUBLAS_STATUS_SUCCESS) AlgoCount++;
// }
// }
// else {
// // Heuristic + workspacesize==0
// AlgoCount = 0;
// nbAlgoIds = 0;
// cublasLtMatmulPreference_t pref;
// cublasLtMatmulPreferenceCreate(&pref);
// uint64_t maxWorkSpaceSize = workSpaceSize; //(32MB)
// cublasLtMatmulPreferenceSetAttribute(
// pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &maxWorkSpaceSize, sizeof(maxWorkSpaceSize));
// cublasLtMatmulHeuristicResult_t heuristicResultsArray[maxNumTraversal];
// cublasLtMatmulAlgoGetHeuristic(ltHandle,
// operationDesc,
// Adesc,
// Bdesc,
// Cdesc,
// Ddesc,
// pref,
// maxNumTraversal,
// heuristicResultsArray,
// &nbAlgoIds);
// cublasLtMatmulPreferenceDestroy(pref);
// printf("return %d and run heuristic algo\n", nbAlgoIds);
// for (int i = 0; i < nbAlgoIds; i++) {
// if (heuristicResultsArray[i].state == CUBLAS_STATUS_SUCCESS) {
// status = customMatmulRun(ltHandle,
// operationDesc,
// alpha, /* host or device pointer */
// A,
// Adesc,
// B,
// Bdesc,
// beta, /* host or device pointer */
// C,
// Cdesc,
// C,
// Ddesc,
// heuristicResultsArray[i].algo,
// kernelRepeats,
// workSpace,
// workSpaceSize,
// perfResults[AlgoCount],
// stream,
// startEvent,
// stopEvent);
// perfResults[AlgoCount].status = status;
// if (status == CUBLAS_STATUS_SUCCESS) {
// AlgoCount++;
// }
// }
// }
// // workspacesize==0
// printf("workspacesize==0, run %d algos\n", AlgoCountRestrict);
// for (int i = 0; i < AlgoCountRestrict && i < (maxNumTraversal - nbAlgoIds); i++) {
// status = customMatmulRun(ltHandle,
// operationDesc,
// alpha, /* host or device pointer */
// A,
// Adesc,
// B,
// Bdesc,
// beta, /* host or device pointer */
// C,
// Cdesc,
// C,
// Ddesc,
// algosRestrict[i],
// kernelRepeats,
// NULL,
// 0,
// perfResults[AlgoCount],
// stream,
// startEvent,
// stopEvent);
// perfResults[AlgoCount].status = status;
// if (status == CUBLAS_STATUS_SUCCESS) {
// AlgoCount++;
// }
// }
// }
// // Sort the results per run duration
// std::sort(perfResults, perfResults + AlgoCount, time_compare);
// // Print timing and perf details
// for (int i = 0, hasPrint = 1; i < AlgoCount; i++) {
// printf("result %03d : ", i);
// hasPrint = printPerfStructure(batch_size,
// seq_len,
// head_num,
// size_per_head,
// m,
// n,
// k,
// perfResults[i],
// fout,
// data_type,
// hasPrint,
// batchCount);
// }
// CLEANUP:
// // Descriptors are no longer needed as all GPU work was already enqueued
// if (Cdesc) {
// cublasLtMatrixLayoutDestroy(Cdesc);
// }
// if (Bdesc) {
// cublasLtMatrixLayoutDestroy(Bdesc);
// }
// if (Adesc) {
// cublasLtMatrixLayoutDestroy(Adesc);
// }
// if (operationDesc) {
// cublasLtMatmulDescDestroy(operationDesc);
// }
// if (startEvent) {
// cudaEventDestroy(startEvent);
// }
// if (stopEvent) {
// cudaEventDestroy(stopEvent);
// }
// return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
// }
// template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
// int batch_size,
// int seq_len,
// int head_num,
// int size_per_head,
// int m,
// int n,
// int k,
// const float* alpha, /* host pointer */
// const float* A,
// const float* B,
// const float* beta, /* host pointer */
// float* C,
// void* workSpace,
// size_t workSpaceSize,
// FILE* fout,
// customMatmulPerf_t perfResults[],
// int AlgoCombinations,
// cudaDataType_t dtype_fp8,
// int batchCount,
// int64_t strideA,
// int64_t strideB,
// int64_t strideD);
// template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
// int batch_size,
// int seq_len,
// int head_num,
// int size_per_head,
// int m,
// int n,
// int k,
// const half* alpha, /* host pointer */
// const half* A,
// const half* B,
// const half* beta, /* host pointer */
// half* C,
// void* workSpace,
// size_t workSpaceSize,
// FILE* fout,
// customMatmulPerf_t perfResults[],
// int AlgoCombinations,
// cudaDataType_t dtype_fp8,
// int batchCount,
// int64_t strideA,
// int64_t strideB,
// int64_t strideD);
// #ifdef ENABLE_BF16
// template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
// int batch_size,
// int seq_len,
// int head_num,
// int size_per_head,
// int m,
// int n,
// int k,
// const float* alpha, /* host pointer */
// const __nv_bfloat16* A,
// const __nv_bfloat16* B,
// const float* beta, /* host pointer */
// __nv_bfloat16* C,
// void* workSpace,
// size_t workSpaceSize,
// FILE* fout,
// customMatmulPerf_t perfResults[],
// int AlgoCombinations,
// cudaDataType_t dtype_fp8,
// int batchCount,
// int64_t strideA,
// int64_t strideB,
// int64_t strideD);
// #endif
// #ifdef ENABLE_FP8
// template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
// int batch_size,
// int seq_len,
// int head_num,
// int size_per_head,
// int m,
// int n,
// int k,
// const float* alpha, /* host pointer */
// const __nv_fp8_e4m3* A,
// const __nv_fp8_e4m3* B,
// const float* beta, /* host pointer */
// __nv_fp8_e4m3* C,
// void* workSpace,
// size_t workSpaceSize,
// FILE* fout,
// customMatmulPerf_t perfResults[],
// int AlgoCombinations,
// cudaDataType_t dtype_fp8,
// int batchCount,
// int64_t strideA,
// int64_t strideB,
// int64_t strideD);
// #endif
// template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
// int batch_size,
// int seq_len,
// int head_num,
// int size_per_head,
// int m,
// int n,
// int k,
// const float* alpha, /* host pointer */
// const half* A,
// const half* B,
// const float* beta, /* host pointer */
// half* C,
// void* workSpace,
// size_t workSpaceSize,
// FILE* fout,
// customMatmulPerf_t perfResults[],
// int AlgoCombinations,
// cudaDataType_t dtype_fp8,
// int batchCount,
// int64_t strideA,
// int64_t strideB,
// int64_t strideD);
size_t calGemmTestBufSizeInByte(int batch_size,
int seq_len,
......
......@@ -223,8 +223,8 @@ void generate_gpt_gemm_config(int batch_size,
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
cublasLtHandle_t ltHandle;
check_cuda_error(cublasLtCreate(&ltHandle));
// cublasLtHandle_t ltHandle;
// check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t AType;
cudaDataType_t BType;
......@@ -244,7 +244,8 @@ void generate_gpt_gemm_config(int batch_size,
DType = CUDA_R_32F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_ALGO23;
// endAlgo = (int)CUBLAS_GEMM_ALGO23;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
......@@ -252,9 +253,11 @@ void generate_gpt_gemm_config(int batch_size,
BType = CUDA_R_16F;
CType = CUDA_R_16F;
DType = CUDA_R_16F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
computeType = CUDA_R_16F;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
......@@ -264,8 +267,10 @@ void generate_gpt_gemm_config(int batch_size,
CType = CUDA_R_16BF;
DType = CUDA_R_16BF;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
#endif
#ifdef ENABLE_FP8
......@@ -293,12 +298,24 @@ void generate_gpt_gemm_config(int batch_size,
DType_FP8[9] = CUDA_R_16BF;
#endif
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
#endif
float alpha = (float)1.0f;
float beta = (float)0.0f;
// float alpha = (float)1.0f;
// float beta = (float)0.0f;
float f_alpha = (float)1.0f;
float f_beta = (float)0.0f;
half h_alpha = (half)(f_alpha);
half h_beta = (half)(f_beta);
int is_fp16_computeType = computeType == CUDA_R_16F ? 1 : 0;
const void* alpha = is_fp16_computeType ? reinterpret_cast<void*>(&h_alpha) : reinterpret_cast<void*>(&f_alpha);
const void* beta = is_fp16_computeType ? reinterpret_cast<void*>(&h_beta) : reinterpret_cast<void*>(&f_beta);
printf("***Encoder Gemm Testing Begin***\n");
printf("***Cublas Gemm Testing Begin***\n");
......@@ -342,7 +359,7 @@ void generate_gpt_gemm_config(int batch_size,
max_input_len,
max_input_len,
size_per_head,
&alpha,
&f_alpha,
d_B,
BType,
size_per_head,
......@@ -351,13 +368,13 @@ void generate_gpt_gemm_config(int batch_size,
AType,
size_per_head,
max_input_len * size_per_head,
&beta,
&f_beta,
d_C,
CUDA_R_32F, // CType,
max_input_len,
max_input_len * max_input_len,
batchCount[i],
computeType,
CUDA_R_32F,
static_cast<cublasGemmAlgo_t>(algo));
}
else if (i == 2) {
......@@ -456,44 +473,45 @@ void generate_gpt_gemm_config(int batch_size,
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
// for gpt, computeType & scaleType should be FP32
LtHgemmCustomFind<T, float>(ltHandle,
batch_size * beam_width,
i == 1 || i == 2 ? max_input_len : 1,
head_num,
size_per_head,
n,
m,
k,
&alpha,
d_B,
d_A,
&beta,
d_C,
cublas_workspace,
workSpaceSize,
fd,
perfResults,
ALGO_COMBINATIONS,
DType_FP8[i],
batchCount[i],
strideA[i],
strideB[i],
strideD[i]);
if (perfResults[0].time < exec_time) {
printPerfStructure(batch_size * beam_width,
seq_len,
head_num,
size_per_head,
n,
m,
k,
perfResults[0],
fd,
data_type,
0,
batchCount[i]);
}
else {
// LtHgemmCustomFind<T, float>(ltHandle,
// batch_size * beam_width,
// i == 1 || i == 2 ? max_input_len : 1,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &alpha,
// d_B,
// d_A,
// &beta,
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS,
// DType_FP8[i],
// batchCount[i],
// strideA[i],
// strideB[i],
// strideD[i]);
// if (perfResults[0].time < exec_time) {
// printPerfStructure(batch_size * beam_width,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// perfResults[0],
// fd,
// data_type,
// 0,
// batchCount[i]);
// }
// else {
{
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
......
......@@ -133,8 +133,8 @@ void generate_swin_gemm_config(
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
cublasLtHandle_t ltHandle;
check_cuda_error(cublasLtCreate(&ltHandle));
// cublasLtHandle_t ltHandle;
// check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t AType;
cudaDataType_t BType;
......@@ -151,16 +151,19 @@ void generate_swin_gemm_config(
CType = CUDA_R_32F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_ALGO23;
// endAlgo = (int)CUBLAS_GEMM_ALGO23;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
AType = CUDA_R_16F;
BType = CUDA_R_16F;
CType = CUDA_R_16F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
computeType = CUDA_R_16F;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
......@@ -169,11 +172,14 @@ void generate_swin_gemm_config(
BType = CUDA_R_16BF;
CType = CUDA_R_16BF;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
#endif
using scaleT = typename ScaleTypeConverter<T, false>::Type;
// using scaleT = typename ScaleTypeConverter<T, false>::Type;
using scaleT = typename ScaleTypeConverter<T, true>::Type;
scaleT alpha = (scaleT)1.0f;
scaleT beta = (scaleT)0.0f;
......@@ -309,30 +315,31 @@ void generate_swin_gemm_config(
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size,
seq_len,
head_num,
size_per_head,
n,
m,
k,
&alpha,
d_B,
d_A,
&beta,
d_C,
cublas_workspace,
workSpaceSize,
fd,
perfResults,
ALGO_COMBINATIONS);
if (perfResults[0].time < exec_time) {
printPerfStructure(
batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
exec_time = perfResults[0].time;
}
else {
// LtHgemmCustomFind<T, scaleT>(ltHandle,
// batch_size,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &alpha,
// d_B,
// d_A,
// &beta,
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS);
// if (perfResults[0].time < exec_time) {
// printPerfStructure(
// batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
// exec_time = perfResults[0].time;
// }
// else {
{
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
......
......@@ -144,23 +144,23 @@ int igemm_config_INT8IO(int m, int n, int k, FILE* fout, void* buffer)
int8_t* d_B = d_A + m * k; // k * n, stored in column-major
int8_t* d_C = (int8_t*)(d_B + k * n); // m * n, stored in column-major
cublasLtHandle_t ltHandle;
cublasLtCreate(&ltHandle);
LtIgemmCustomFind(ltHandle,
m,
n,
k,
&alpha, /* host pointer */
d_A,
d_B,
&beta, /* host pointer */
d_C,
NULL,
0,
fout);
cublasLtDestroy(ltHandle);
// cublasLtHandle_t ltHandle;
// cublasLtCreate(&ltHandle);
// LtIgemmCustomFind(ltHandle,
// m,
// n,
// k,
// &alpha, /* host pointer */
// d_A,
// d_B,
// &beta, /* host pointer */
// d_C,
// NULL,
// 0,
// fout);
// cublasLtDestroy(ltHandle);
return 0;
}
......
......@@ -195,8 +195,8 @@ void generate_t5_gemm_config(int batch_size,
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
cublasLtHandle_t ltHandle;
check_cuda_error(cublasLtCreate(&ltHandle));
// cublasLtHandle_t ltHandle;
// check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t AType;
cudaDataType_t BType;
......@@ -213,16 +213,19 @@ void generate_t5_gemm_config(int batch_size,
CType = CUDA_R_32F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_ALGO23;
// endAlgo = (int)CUBLAS_GEMM_ALGO23;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
AType = CUDA_R_16F;
BType = CUDA_R_16F;
CType = CUDA_R_16F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
computeType = CUDA_R_16F;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
......@@ -231,8 +234,10 @@ void generate_t5_gemm_config(int batch_size,
BType = CUDA_R_16BF;
CType = CUDA_R_16BF;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
#endif
float f_alpha = (float)1.0f;
......@@ -442,60 +447,61 @@ void generate_t5_gemm_config(int batch_size,
scaleT alpha_scale = (scaleT)1.0f;
scaleT beta_scale = (scaleT)0.0f;
LtHgemmCustomFind<T, scaleT>(ltHandle,
m,
seq_len,
head_num,
size_per_head,
n,
m,
k,
&(alpha_scale),
d_B,
d_A,
&(beta_scale),
d_C,
cublas_workspace,
workSpaceSize,
fd,
perfResults,
ALGO_COMBINATIONS);
}
else {
LtHgemmCustomFind<T, float>(ltHandle,
m,
seq_len,
head_num,
size_per_head,
n,
m,
k,
&(f_alpha),
d_B,
d_A,
&(f_beta),
d_C,
cublas_workspace,
workSpaceSize,
fd,
perfResults,
ALGO_COMBINATIONS);
}
if (perfResults[0].time < exec_time) {
printPerfStructure(batch_size * (i <= 5 || i == 1 ? 1 : beam_width),
seq_len,
head_num,
size_per_head,
n,
m,
k,
perfResults[0],
fd,
data_type,
0);
// LtHgemmCustomFind<T, scaleT>(ltHandle,
// m,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &(alpha_scale),
// d_B,
// d_A,
// &(beta_scale),
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS);
}
else {
// LtHgemmCustomFind<T, float>(ltHandle,
// m,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &(f_alpha),
// d_B,
// d_A,
// &(f_beta),
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS);
}
// if (perfResults[0].time < exec_time) {
// printPerfStructure(batch_size * (i <= 5 || i == 1 ? 1 : beam_width),
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// perfResults[0],
// fd,
// data_type,
// 0);
// }
// else {
{
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
......
......@@ -218,8 +218,8 @@ void generate_xlnet_gemm_config(int batch_size,
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
cublasLtHandle_t ltHandle;
check_cuda_error(cublasLtCreate(&ltHandle));
// cublasLtHandle_t ltHandle;
// check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t AType;
cudaDataType_t BType;
......@@ -236,16 +236,19 @@ void generate_xlnet_gemm_config(int batch_size,
CType = CUDA_R_32F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_ALGO23;
// endAlgo = (int)CUBLAS_GEMM_ALGO23;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
AType = CUDA_R_16F;
BType = CUDA_R_16F;
CType = CUDA_R_16F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
computeType = CUDA_R_16F;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
......@@ -254,12 +257,15 @@ void generate_xlnet_gemm_config(int batch_size,
BType = CUDA_R_16BF;
CType = CUDA_R_16BF;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
// startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
// endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_DEFAULT;
}
#endif
using scaleT = typename ScaleTypeConverter<T, false>::Type;
// using scaleT = typename ScaleTypeConverter<T, false>::Type;
using scaleT = typename ScaleTypeConverter<T, true>::Type;
scaleT alpha = (scaleT)1.0f;
scaleT beta = (scaleT)0.0f;
......@@ -358,30 +364,31 @@ void generate_xlnet_gemm_config(int batch_size,
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size,
seq_len,
head_num,
size_per_head,
n,
m,
k,
&alpha,
d_B,
d_A,
&beta,
d_C,
cublas_workspace,
workSpaceSize,
fd,
perfResults,
ALGO_COMBINATIONS);
if (perfResults[0].time < exec_time) {
printPerfStructure(
batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
exec_time = perfResults[0].time;
}
else {
// LtHgemmCustomFind<T, scaleT>(ltHandle,
// batch_size,
// seq_len,
// head_num,
// size_per_head,
// n,
// m,
// k,
// &alpha,
// d_B,
// d_A,
// &beta,
// d_C,
// cublas_workspace,
// workSpaceSize,
// fd,
// perfResults,
// ALGO_COMBINATIONS);
// if (perfResults[0].time < exec_time) {
// printPerfStructure(
// batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
// exec_time = perfResults[0].time;
// }
// else {
{
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment