Unverified commit 9efcac38, authored by Li Zhang, committed via GitHub

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
parent 720fc533
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "encoder_igemm_func.h" // TODO(bhsueh) Remove this include
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cuda_fp16.h>  // `half` is used unconditionally in this header
#ifdef ENABLE_BF16
#include <cuda_bf16.h>
#endif
#ifdef ENABLE_FP8
#include <cuda_fp8.h>
#endif
#include <cuda_profiler_api.h>
#include <map>
#include <sys/time.h>
#include <unistd.h>
#include <vector>
namespace fastertransformer {
// Scale Type Converter
// is_fp16_compute_type is only valid when T = half
template<typename T, bool is_fp16_compute_type = false>
struct ScaleTypeConverter {
using Type = float;
};
template<>
struct ScaleTypeConverter<half, true> {
using Type = half;
};
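// e.g. ScaleTypeConverter<half, true>::Type is half; every other combination
// (float, bf16, or half with fp32 compute) maps to float.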
template<typename T, typename scaleT>
int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const scaleT* alpha, /* host pointer */
const T* A,
const T* B,
const scaleT* beta, /* host pointer */
T* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8 = CUDA_R_32F,
int batchCount = 1,
int64_t strideA = 0,
int64_t strideB = 0,
int64_t strideD = 0);
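// Benchmarks cublasLt algorithm/config combinations for the given GEMM shape and
// fills `perfResults` with the timed results (fastest first); callers read the
// winner from perfResults[0] and may log it to `fout`.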
size_t calGemmTestBufSizeInByte(int batch_size,
int seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int int8_mode,
CublasDataType data_type);
size_t calGemmTestBufSizeInByteXlnet(
int batch_size, int seq_len, int head_num, int size_per_head, int inter_size, int hidden_units, int is_fp16);
int printPerfStructure(int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const customMatmulPerf_t& perf,
FILE* fout,
CublasDataType data_type,
int hasPrint,
int batch_count = 1);
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/gemm_test/gpt_gemm_func.h"
namespace fastertransformer {
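// 2:4 structured-sparse GEMM via cusparseLt is only attempted when m, n and k
// are all multiples of 8.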
bool isSparseGemmAvailable(size_t m, size_t n, size_t k)
{
return m % 8 == 0 && n % 8 == 0 && k % 8 == 0;
}
template<typename T>
void generate_gpt_gemm_config(int batch_size,
int beam_width,
int max_input_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int tensor_para_size,
void* buffer_in,
bool isAppend)
{
FT_CHECK(head_num % tensor_para_size == 0);
void* cublas_workspace;
void* buffer;
int workSpaceSize;
bool workspace_flag = std::is_same<T, half>::value;
#ifdef ENABLE_FP8
workspace_flag = workspace_flag || std::is_same<T, __nv_fp8_e4m3>::value;
#endif
#ifdef ENABLE_BF16
workspace_flag = workspace_flag || std::is_same<T, __nv_bfloat16>::value;
#endif
if (workspace_flag) {
// cublas_workspace_ should be the start pointer of cudaMalloc()
// to ensure 16B alignment
cublas_workspace = buffer_in;
buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE);
workSpaceSize = CUBLAS_WORKSPACE_SIZE;
}
else {
cublas_workspace = nullptr;
buffer = buffer_in;
workSpaceSize = 0;
}
struct cudaDeviceProp prop;
check_cuda_error(cudaGetDeviceProperties(&prop, 0));
printf("Device %s\n", prop.name);
// check config
FILE* fd;
int line_count = 0;
if (!isAppend) {
fd = fopen(GEMM_CONFIG, "w+");
}
else {
fd = fopen(GEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fd) != NULL) {
config.push_back(std::string(line));
}
line_count = config.size();
// if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included
// {
// int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM);
// fclose(fd);
// fd = fopen(GEMM_CONFIG, "w+");
// fprintf(fd, "%s", config[0].c_str());
// for (uint i = startIdx; i < config.size(); i++) {
// fprintf(fd, "%s", config[i].c_str());
// }
// line_count = config.size() - (GEMM_NUM + 3);
// }
}
const int hidden_units = head_num * size_per_head;
const int local_head_num = head_num / tensor_para_size;
const int local_hidden_units = local_head_num * size_per_head;
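// round max_input_len up to a multiple of 16 for the batched attention gemms below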
const int max_input_len_padded = (max_input_len + 15) / 16 * 16;
const int gemm_num = 11;
int M[gemm_num];
int N[gemm_num];
int K[gemm_num];
int batchCount[gemm_num];
int64_t strideA[gemm_num];
int64_t strideB[gemm_num];
int64_t strideD[gemm_num];
char mess[gemm_num][256];
float exec_times[gemm_num];
// gemm 0
M[0] = batch_size * beam_width * max_input_len;
K[0] = hidden_units;
N[0] = 3 * local_hidden_units;
batchCount[0] = 1;
strideA[0] = 0;
strideB[0] = 0;
strideD[0] = 0;
strcpy(mess[0], "context from_tensor * weightQKV");
// gemm 1
M[1] = max_input_len_padded;
K[1] = size_per_head;
N[1] = max_input_len_padded;
batchCount[1] = batch_size * beam_width * local_head_num;
strideA[1] = max_input_len_padded * size_per_head;
strideB[1] = max_input_len_padded * size_per_head;
strideD[1] = max_input_len_padded * max_input_len_padded;
strcpy(mess[1], "context batch gemm Q*K^T");
// gemm 2
M[2] = max_input_len_padded;
K[2] = max_input_len_padded;
N[2] = size_per_head;
batchCount[2] = batch_size * beam_width * local_head_num;
strideA[2] = max_input_len_padded * size_per_head;
strideB[2] = max_input_len_padded * max_input_len_padded;
strideD[2] = max_input_len_padded * size_per_head;
strcpy(mess[2], "context batch gemm QK*V^T");
// gemm 3
M[3] = batch_size * beam_width * max_input_len;
K[3] = local_hidden_units;
N[3] = hidden_units;
batchCount[3] = 1;
strideA[3] = 0;
strideB[3] = 0;
strideD[3] = 0;
strcpy(mess[3], "context attr * output_kernel");
// gemm 4
M[4] = batch_size * beam_width * max_input_len;
K[4] = hidden_units;
N[4] = inter_size / tensor_para_size;
batchCount[4] = 1;
strideA[4] = 0;
strideB[4] = 0;
strideD[4] = 0;
strcpy(mess[4], "context ffn gemm 1");
// gemm 5
M[5] = batch_size * beam_width * max_input_len;
K[5] = inter_size / tensor_para_size;
N[5] = hidden_units;
batchCount[5] = 1;
strideA[5] = 0;
strideB[5] = 0;
strideD[5] = 0;
strcpy(mess[5], "context ffn gemm 2");
// gemm 6
M[6] = batch_size * beam_width;
K[6] = hidden_units;
N[6] = 3 * local_hidden_units;
batchCount[6] = 1;
strideA[6] = 0;
strideB[6] = 0;
strideD[6] = 0;
strcpy(mess[6], "from_tensor * weightQKV");
// gemm 7
M[7] = batch_size * beam_width;
K[7] = local_hidden_units;
N[7] = hidden_units;
batchCount[7] = 1;
strideA[7] = 0;
strideB[7] = 0;
strideD[7] = 0;
strcpy(mess[7], "attr * output_kernel");
// gemm 8
M[8] = batch_size * beam_width;
K[8] = hidden_units;
N[8] = inter_size / tensor_para_size;
batchCount[8] = 1;
strideA[8] = 0;
strideB[8] = 0;
strideD[8] = 0;
strcpy(mess[8], "ffn gemm 1");
// gemm 9
M[9] = batch_size * beam_width;
K[9] = inter_size / tensor_para_size;
N[9] = hidden_units;
batchCount[9] = 1;
strideA[9] = 0;
strideB[9] = 0;
strideD[9] = 0;
strcpy(mess[9], "ffn gemm 2");
// gemm 10
M[10] = batch_size * beam_width;
K[10] = hidden_units;
N[10] = ceil(vocab_size / 8.) * 8 / tensor_para_size;
batchCount[10] = 1;
strideA[10] = 0;
strideB[10] = 0;
strideD[10] = 0;
strcpy(mess[10], "logits gemm");
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
cublasLtHandle_t ltHandle;
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t AType;
cudaDataType_t BType;
cudaDataType_t CType;
cudaDataType_t DType;
cudaDataType_t DType_FP8[gemm_num] = {};  // zero-init (CUDA_R_32F); only overwritten in the FP8 branch below
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
data_type = FLOAT_DATATYPE;
AType = CUDA_R_32F;
BType = CUDA_R_32F;
CType = CUDA_R_32F;
DType = CUDA_R_32F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_ALGO23;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
AType = CUDA_R_16F;
BType = CUDA_R_16F;
CType = CUDA_R_16F;
DType = CUDA_R_16F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
data_type = BFLOAT16_DATATYPE;
AType = CUDA_R_16BF;
BType = CUDA_R_16BF;
CType = CUDA_R_16BF;
DType = CUDA_R_16BF;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
}
#endif
#ifdef ENABLE_FP8
else if (std::is_same<T, __nv_fp8_e4m3>::value) {
data_type = FP8_DATATYPE;
AType = CUDA_R_8F_E4M3;
BType = CUDA_R_8F_E4M3;
CType = CUDA_R_16BF;
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
DType = CUDA_R_16BF;
#else
DType_FP8[0] = CUDA_R_8F_E4M3;
DType_FP8[1] = CUDA_R_16BF;
DType_FP8[2] = CUDA_R_8F_E4M3;
DType_FP8[3] = CUDA_R_16BF;
DType_FP8[4] = CUDA_R_16BF;
DType_FP8[5] = CUDA_R_16BF;
#ifdef FP8_MHA
DType_FP8[6] = CUDA_R_8F_E4M3;
#else
DType_FP8[6] = CUDA_R_16BF;
#endif
DType_FP8[7] = CUDA_R_16BF;
DType_FP8[8] = CUDA_R_16BF;
DType_FP8[9] = CUDA_R_16BF;
#endif
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
}
#endif
float alpha = 1.0f;
float beta = 0.0f;
printf("***Encoder Gemm Testing Begin***\n");
printf("***Cublas Gemm Testing Begin***\n");
if (line_count == 0) {
fprintf(fd,
"batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, "
"customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"inner_shapeId, cluster_shapeId, "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"mma_shapeId, cga_shapeId, schedule_mode, "
#endif
"exec_time\n");
}
for (int i = 0; i < gemm_num; ++i) {
if (i <= 5) {
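// context (prefill) gemms 0-5 are skipped; only the decoding gemms 6-10 are profiled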
continue;
}
int seq_len = i <= 5 ? max_input_len : 1;
int m = M[i], n = N[i], k = K[i];
printf("\n-----------------------------\n");
printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]);
T* d_A = (T*)buffer;
T* d_B = d_A + m * k * batchCount[i];
T* d_C = d_B + k * n * batchCount[i];
float exec_time = 99999.0f;
int fast_algo = 0;
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
for (int ite = 0; ite < ites; ++ite) {
if (i == 1) {
status = cublasGemmStridedBatchedEx(cublas_handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
max_input_len,
max_input_len,
size_per_head,
&alpha,
d_B,
BType,
size_per_head,
max_input_len * size_per_head,
d_A,
AType,
size_per_head,
max_input_len * size_per_head,
&beta,
d_C,
CUDA_R_32F, // CType,
max_input_len,
max_input_len * max_input_len,
batchCount[i],
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
else if (i == 2) {
status = cublasGemmStridedBatchedEx(cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
size_per_head,
max_input_len,
max_input_len,
&alpha,
d_B,
BType,
size_per_head,
max_input_len * size_per_head,
d_A,
AType,
max_input_len,
max_input_len * max_input_len,
&beta,
d_C,
CType,
size_per_head,
max_input_len * size_per_head,
batchCount[i],
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
else if (i == 10) {
status = cublasGemmEx(cublas_handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
n,
m,
k,
&alpha,
d_B,
BType,
k,
d_A,
AType,
k,
&beta,
d_C,
CType,
n,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
else {
status = cublasGemmEx(cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
n,
m,
k,
&alpha,
d_B,
BType,
n,
d_A,
AType,
k,
&beta,
d_C,
CType,
n,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
if (status != CUBLAS_STATUS_SUCCESS) {
break;
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
fast_algo = algo;
}
}
sync_check_cuda_error();
}
printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time);
// for fp16 and bf16, also profile cublasLt, skipping the batched gemms (1, 2) and the logits gemm (10);
// for fp8, profile cublasLt for every gemm
if ((data_type != FLOAT_DATATYPE && i != 1 && i != 2 && i != 10) || data_type == FP8_DATATYPE) {
printf("***cublasLt Gemm Testing Beign***\n");
// Let try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
// for gpt, computeType & scaleType should be FP32
LtHgemmCustomFind<T, float>(ltHandle,
batch_size * beam_width,
i == 1 || i == 2 ? max_input_len : 1,
head_num,
size_per_head,
n,
m,
k,
&alpha,
d_B,
d_A,
&beta,
d_C,
cublas_workspace,
workSpaceSize,
fd,
perfResults,
ALGO_COMBINATIONS,
DType_FP8[i],
batchCount[i],
strideA[i],
strideB[i],
strideD[i]);
if (perfResults[0].time < exec_time) {
printPerfStructure(batch_size * beam_width,
seq_len,
head_num,
size_per_head,
n,
m,
k,
perfResults[0],
fd,
data_type,
0,
batchCount[i]);
// keep the faster cublasLt time so the sparse-gemm comparison below uses the
// dense baseline that was actually written to GEMM_CONFIG (the swin variant does the same)
exec_time = perfResults[0].time;
}
else {
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"-1 -1 "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"-1 -1 -1 "
#endif
"%f\n",
batch_size * beam_width,
seq_len,
head_num,
size_per_head,
data_type,
batchCount[i],
n,
m,
k,
fast_algo,
exec_time);
}
printf("***cublasLt Gemm Testing End***\n");
}
else {
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"-1 -1 "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"-1 -1 -1 "
#endif
"%f\n",
batch_size * beam_width,
seq_len,
head_num,
size_per_head,
data_type,
batchCount[i],
n,
m,
k,
fast_algo,
exec_time);
}
sync_check_cuda_error();
exec_times[i] = exec_time;
}
printf("***cublas Gemm Testing End***\n\n");
fclose(fd);
// release the handles created for the dense profiling pass
cublasDestroy(cublas_handle);
cublasLtDestroy(ltHandle);
#ifdef SPARSITY_ENABLED
bool do_sparse_test = false;
if (prop.major == 8 && (prop.minor == 0 || prop.minor == 6) && sizeof(T) == sizeof(half)) {
do_sparse_test = true;
}
if (do_sparse_test) {
printf("***cusparseLt Gemm Testing Begin***\n");
// Only first 8 cases can be sparse
// - QKV kernel, Projection, FC1, FC2 in context or decoding.
const int spgemm_num = 8;
if (!isAppend) {
fd = fopen(SPGEMM_CONFIG, "w+");
}
else {
fd = fopen(SPGEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fd) != NULL) {
config.push_back(std::string(line));
}
line_count = config.size();
// gemm_num configs (cublas/cublasLt), first row is not included
if (config.size() >= (MAX_CONFIG_NUM * spgemm_num + 1)) {
int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * spgemm_num);
fclose(fd);
fd = fopen(SPGEMM_CONFIG, "w+");
fprintf(fd, "%s", config[0].c_str());
for (uint i = startIdx; i < config.size(); i++) {
fprintf(fd, "%s", config[i].c_str());
}
line_count = config.size() - (spgemm_num + 3);
}
}
if (line_count == 0) {
// header line
fprintf(fd,
"batch_size, seq_len, head_num, size_per_head dataType "
"### batchCount, m, n, k, algoId, exec_time\n");
}
cusparseLtHandle_t handle;
CHECK_CUSPARSE(cusparseLtInit(&handle));
cusparseOrder_t order = CUSPARSE_ORDER_COL;
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
// let's make this optional
cusparseComputeType compute_type = CUSPARSE_COMPUTE_16F;
unsigned alignment = 16;
cudaStream_t stream = 0;
float alpha2 = 1.0f;
float beta2 = 0.0f;
for (int i = 0; i < gemm_num; ++i) {
// skip qk or attn or logit gemms.
if (i == 1 || i == 2 || i == 10) {
continue;
}
// seq_len is always 1 except context gemms.
int seq_len = i <= 5 ? max_input_len : 1;
// to stay compatible with the spgemm wrapper, A is the weight matrix,
// so m and n are swapped
// A: m x k, B: k x n, C: m x n
int m = N[i], n = M[i], k = K[i];
printf("\n-----------------------------\n");
printf("GEMM test %d: [M: %d, K: %d, N: %d]\n", i, m, k, n);
if (n % 8 != 0) {
n = div_up(n, 8) * 8; // pad n to be multiple of 8 as FT does.
}
T* d_A = (T*)buffer;
T* d_B = d_A + m * k * batchCount[i];
T* d_C = d_B + k * n * batchCount[i];
T* dA_compressed;
{
cusparseLtMatDescriptor_t matA;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(
cusparseLtSpMMAPrune2(&handle, &matA, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
size_t compressed_size;
CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &matA, &compressed_size))
check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size));
CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &matA, true, opA, d_A, dA_compressed, stream))
}
float exec_time = 99999.0f;
int fast_algo = 0;
if (isSparseGemmAvailable(m, n, k)) {
for (int alg = 0; alg < 4; ++alg) {
cudaDeviceSynchronize();
cusparseLtMatDescriptor_t matA, matB, matC;
void* d_workspace = nullptr;
int num_streams = 1;
cudaStream_t streams[1] = {stream};
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matB, k, n, k, alignment, CUDA_R_16F, order))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matC, m, n, m, alignment, CUDA_R_16F, order))
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
for (int ite = 0; ite < ites; ++ite) {
// initializing the matmul descriptors is expensive and they could be cached elsewhere,
// but caching the MatmulPlan across iterations causes errors
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulAlgSelection_t alg_sel;
cusparseLtMatmulPlan_t plan;
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(
&handle, &matmul, opA, opB, &matA, &matB, &matC, &matC, compute_type))
CHECK_CUSPARSE(
cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
&handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg)))
size_t workspace_size;
CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&handle, &alg_sel, &workspace_size))
CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel, workspace_size))
CHECK_CUSPARSE(cusparseLtMatmul(&handle,
&plan,
&alpha2,
dA_compressed,
d_B,
&beta2,
d_C,
d_C,
d_workspace,
streams,
num_streams))
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
if (diffTime(start, end) < exec_time) {
exec_time = diffTime(start, end);
fast_algo = alg;
}
}
}
exec_time /= ites;
if (exec_time >= exec_times[i]) {
fast_algo = -1;
}
printf("fast_algo %d\n", fast_algo);
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d %f\n",
batch_size * beam_width,
seq_len,
head_num,
size_per_head,
data_type,
batchCount[i],
m,
n,
k,
fast_algo,
exec_time);
cudaFree(dA_compressed);
}
CHECK_CUSPARSE(cusparseLtDestroy(&handle))
fclose(fd);
printf("***cusparseLt Gemm Testing End***\n");
}
#endif
printf("***GPT Gemm Testing End***\n");
return;
}
template void generate_gpt_gemm_config<float>(int batch_size,
int beam_width,
int max_input_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int tensor_para_size,
void* buffer_in,
bool isAppend);
template void generate_gpt_gemm_config<half>(int batch_size,
int beam_width,
int max_input_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int tensor_para_size,
void* buffer_in,
bool isAppend);
#ifdef ENABLE_BF16
template void generate_gpt_gemm_config<__nv_bfloat16>(int batch_size,
int beam_width,
int max_input_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int tensor_para_size,
void* buffer_in,
bool isAppend);
#endif
#ifdef ENABLE_FP8
template void generate_gpt_gemm_config<__nv_fp8_e4m3>(int batch_size,
int beam_width,
int max_input_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int tensor_para_size,
void* buffer_in,
bool isAppend);
#endif
size_t calGptGemmTestBufSizeInByte(int batch_size,
int beam_width,
int max_input_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int tensor_para_size,
CublasDataType data_type)
{
size_t buf_size_in_byte = 0;
const size_t hidden_units = head_num * size_per_head;
const size_t local_head_num = head_num / tensor_para_size;
const size_t local_hidden_units = local_head_num * size_per_head;
// int wordSize = (data_type == FLOAT_DATATYPE ? sizeof(float) : sizeof(half));
// Because we always use float for some buffer, set the wordSize to float directly.
int wordSize = sizeof(float);
size_t m = batch_size * beam_width * max_input_len;
std::vector<size_t> buff_size;
// for context qkv gemm
buff_size.push_back(m * hidden_units + hidden_units * 3 * local_hidden_units + m * 3 * local_hidden_units);
// for context batch gemm
buff_size.push_back(m * local_hidden_units + m * local_hidden_units
+ batch_size * beam_width * head_num * max_input_len * max_input_len);
// for context ffn gemm
buff_size.push_back(m * inter_size / tensor_para_size + hidden_units * inter_size / tensor_para_size
+ m * hidden_units);
// for vocab
buff_size.push_back(m * hidden_units + hidden_units * ceil(vocab_size / 8.) * 8 / tensor_para_size
+ m * ceil(vocab_size / 8.) * 8 / tensor_para_size);
for (auto t : buff_size) {
buf_size_in_byte = buf_size_in_byte > t ? buf_size_in_byte : t;
}
buf_size_in_byte *= wordSize;
buf_size_in_byte += ((data_type == HALF_DATATYPE || data_type == BFLOAT16_DATATYPE || data_type == FP8_DATATYPE) ?
CUBLAS_WORKSPACE_SIZE :
0);
return buf_size_in_byte;
}
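// Minimal usage sketch (hypothetical driver values, not part of this file): size
// the profiling buffer with calGptGemmTestBufSizeInByte(), then let
// generate_gpt_gemm_config() write the tuned algorithms to GEMM_CONFIG.
//
//   size_t buf_size = calGptGemmTestBufSizeInByte(8, 1, 128, 32, 128, 16384, 50257, 1, HALF_DATATYPE);
//   void* buffer = nullptr;
//   check_cuda_error(cudaMalloc(&buffer, buf_size));
//   generate_gpt_gemm_config<half>(8, 1, 128, 32, 128, 16384, 50257, 1, buffer, false);
//   check_cuda_error(cudaFree(buffer));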
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/gemm_test/gemm_func.h"
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cuda_fp16.h>
#ifdef ENABLE_FP8
#include <cuda_fp8.h>
#endif
#include <cuda_profiler_api.h>
#include <map>
#include <sys/time.h>
#include <unistd.h>
#include <vector>
namespace fastertransformer {
template<typename T>
void generate_gpt_gemm_config(int batch_size,
int beam_width,
int seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int tensor_para_size,
void* buffer_in,
bool isAppend);
size_t calGptGemmTestBufSizeInByte(int batch_size,
int beam_width,
int max_input_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int tensor_para_size,
CublasDataType data_type);
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/gemm_test/swin_gemm_func.h"
namespace fastertransformer {
template<typename T>
void generate_swin_gemm_config(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer_in, bool isAppend)
{
void* cublas_workspace;
void* buffer;
int workSpaceSize;
#ifdef ENABLE_BF16
if (std::is_same<T, half>::value || std::is_same<T, __nv_bfloat16>::value) {
#else
if (std::is_same<T, half>::value) {
#endif // ENABLE_BF16
// cublas_workspace_ should be the start pointer of cudaMalloc()
// to ensure 16B alignment
cublas_workspace = buffer_in;
buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE);
workSpaceSize = CUBLAS_WORKSPACE_SIZE;
}
else {
cublas_workspace = nullptr;
buffer = buffer_in;
workSpaceSize = 0;
}
struct cudaDeviceProp prop;
check_cuda_error(cudaGetDeviceProperties(&prop, 0));
printf("Device %s\n", prop.name);
// check config
FILE* fd;
int line_count = 0;
if (!isAppend) {
fd = fopen(GEMM_CONFIG, "w+");
fprintf(
fd,
"batch_size seq_len head_num size_per_head dataType ### batchCount n m k algoId customOption tile splitK_val swizzle reductionScheme workspaceSize stages exec_time\n");
}
else {
fd = fopen(GEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fd) != NULL) {
config.push_back(std::string(line));
}
line_count = config.size();
if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included
{
int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM);
fclose(fd);
fd = fopen(GEMM_CONFIG, "w+");
fprintf(fd, "%s", config[0].c_str());
for (uint i = startIdx; i < config.size(); i++) {
fprintf(fd, "%s", config[i].c_str());
}
line_count = config.size() - (GEMM_NUM + 3);
}
}
const int gemm_num = 7;
const int NUM_OF_BASIC_LAYERS = 4;
int M[gemm_num];
int N[gemm_num];
int K[gemm_num];
int batchCount[gemm_num] = {1, 1, 1, 1, 1, 1, 1};
char mess[gemm_num][256];
float exec_times[gemm_num];
printf("***Encoder Gemm Testing Begin***\n");
printf("***Cublas Gemm Testing Begin***\n");
for (int basic_layer = 0; basic_layer < NUM_OF_BASIC_LAYERS; basic_layer++) {
// gemm1
M[0] = batch_size * seq_len;
K[0] = head_num * size_per_head;
N[0] = 3 * K[0];
strcpy(mess[0], "from_tensor * weightQ/K/V");
// gemm2
M[1] = M[0];
K[1] = K[0];
N[1] = K[0];
strcpy(mess[1], "attr * output_kernel");
// gemm3
M[2] = M[0];
K[2] = K[0];
N[2] = 4 * K[0];
strcpy(mess[2], "attr_output * inter_kernel");
// gemm4
M[3] = M[0];
K[3] = 4 * K[0];
N[3] = K[0];
strcpy(mess[3], "inter_matmul * output_kernel");
M[4] = M[0] / 4;
K[4] = 4 * K[0];
N[4] = 2 * K[0];
strcpy(mess[4], "patchMerge gemm");
M[5] = seq_len;
N[5] = seq_len;
K[5] = size_per_head;
batchCount[5] = batch_size * head_num;
strcpy(mess[5], "attention batched Gemm1");
M[6] = seq_len;
N[6] = size_per_head;
K[6] = seq_len;
batchCount[6] = batch_size * head_num;
strcpy(mess[6], "attention batched Gemm2");
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
cublasLtHandle_t ltHandle;
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t AType;
cudaDataType_t BType;
cudaDataType_t CType;
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
data_type = FLOAT_DATATYPE;
AType = CUDA_R_32F;
BType = CUDA_R_32F;
CType = CUDA_R_32F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_ALGO23;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
AType = CUDA_R_16F;
BType = CUDA_R_16F;
CType = CUDA_R_16F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
data_type = BFLOAT16_DATATYPE;
AType = CUDA_R_16BF;
BType = CUDA_R_16BF;
CType = CUDA_R_16BF;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
}
#endif
using scaleT = typename ScaleTypeConverter<T, false>::Type;
scaleT alpha = (scaleT)1.0f;
scaleT beta = (scaleT)0.0f;
for (int i = 0; i < gemm_num; ++i) {
// if(i != 0 && i != 5) continue;
int m = M[i], n = N[i], k = K[i];
printf("\n-----------------------------\n");
printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]);
T* d_A = (T*)buffer;
T* d_B = d_A + m * k * batchCount[i];
T* d_C = d_B + k * n * batchCount[i];
// array of pointer for batchedGemm
T* harray[12];
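// slots 0-2: A pointers, 4-6: B pointers, 8-10: C pointers; slots 3, 7 and 11
// stay unused so dAarray/dBarray/dCarray can simply be darray, darray + 4, darray + 8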
harray[0] = (T*)buffer;
harray[1] = (T*)((char*)buffer + sizeof(T) * m * k);
harray[2] = (T*)((char*)buffer + 2 * sizeof(T) * m * k);
harray[4] = (T*)((char*)buffer + 3 * sizeof(T) * m * k);
harray[5] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + sizeof(T) * k * n);
harray[6] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 2 * sizeof(T) * k * n);
harray[8] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n);
harray[9] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + sizeof(T) * m * n);
harray[10] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + 2 * sizeof(T) * m * n);
T** darray = 0;
check_cuda_error(cudaMalloc((void**)&darray, sizeof(T*) * 12));
cudaMemcpy((void*)darray, (void*)harray, sizeof(T*) * 12, cudaMemcpyHostToDevice);
T** dAarray = darray;
T** dBarray = darray + 4;
T** dCarray = darray + 8;
float exec_time = 99999.0f;
int fast_algo = 0;
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
for (int ite = 0; ite < ites; ++ite) {
if (i < 5) {
status = cublasGemmEx(cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
n,
m,
k,
&alpha,
d_B,
BType,
n,
d_A,
AType,
k,
&beta,
d_C,
CType,
n,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
else if (i == 5) {
status = cublasGemmStridedBatchedEx(cublas_handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
seq_len,
seq_len,
size_per_head,
&alpha,
d_B,
BType,
size_per_head,
seq_len * size_per_head,
d_A,
AType,
size_per_head,
seq_len * size_per_head,
&beta,
d_C,
CType,
seq_len,
seq_len * seq_len,
batch_size * head_num,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
else if (i == 6) {
status = cublasGemmStridedBatchedEx(cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
size_per_head,
seq_len,
seq_len,
&alpha,
d_B,
BType,
size_per_head,
seq_len * size_per_head,
d_A,
AType,
seq_len,
seq_len * seq_len,
&beta,
d_C,
CType,
size_per_head,
seq_len * size_per_head,
batch_size * head_num,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
if (status != CUBLAS_STATUS_SUCCESS) {
break;
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
fast_algo = algo;
}
}
}
printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time);
// for fp16 and bf16, we compare cublasLt
if (i < 5 && data_type != FLOAT_DATATYPE) {
printf("***cublasLt Gemm Testing Begin***\n");
// try a fixed number of combinations
const int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size,
seq_len,
head_num,
size_per_head,
n,
m,
k,
&alpha,
d_B,
d_A,
&beta,
d_C,
cublas_workspace,
workSpaceSize,
fd,
perfResults,
ALGO_COMBINATIONS);
if (perfResults[0].time < exec_time) {
printPerfStructure(
batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
exec_time = perfResults[0].time;
}
else {
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"-1 -1 "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"-1 -1 -1 "
#endif
"%f\n",
batch_size,
seq_len,
head_num,
size_per_head,
data_type,
batchCount[i],
n,
m,
k,
fast_algo,
exec_time);
}
printf("***cublasLt Gemm Testing End***\n");
}
else {
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"-1 -1 "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"-1 -1 -1 "
#endif
"%f\n",
batch_size,
seq_len,
head_num,
size_per_head,
data_type,
batchCount[i],
n,
m,
k,
fast_algo,
exec_time);
}
exec_times[i] = exec_time;
cudaFree(darray);
}
if (basic_layer != NUM_OF_BASIC_LAYERS - 1) {
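// next Swin stage: patch merging shrinks the token grid by 4x and doubles the head count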
batch_size = batch_size / 4;
head_num = head_num * 2;
}
// release the per-stage handles; they are recreated at the top of each iteration
cublasDestroy(cublas_handle);
cublasLtDestroy(ltHandle);
}
printf("***cublas Gemm Testing End***\n\n");
fclose(fd);
printf("***Encoder Gemm Testing End***\n");
return;
}
template void generate_swin_gemm_config<float>(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend);
template void generate_swin_gemm_config<half>(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend);
#ifdef ENABLE_BF16
template void generate_swin_gemm_config<__nv_bfloat16>(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend);
#endif
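// Minimal usage sketch (hypothetical values; buffer sizing is the caller's job):
// profile an fp16 Swin model whose first stage has 64 windows of 49 tokens and
// 4 heads of width 32, writing a fresh GEMM_CONFIG:
//
//   generate_swin_gemm_config<half>(64, 49, 4, 32, buffer, false);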
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/gemm_test/gemm_func.h"
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#include <sys/time.h>
#include <unistd.h>
#include <vector>
namespace fastertransformer {
template<typename T>
void generate_swin_gemm_config(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true);
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "swin_igemm_func.h"
namespace fastertransformer {
static const char* showStatus(cublasStatus_t error)
{
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "<unknown>";
}
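// comparator used to sort candidate results: successful runs first, then by ascending time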
static inline bool time_compare(const customMatmulPerf_t& perf_a, const customMatmulPerf_t& perf_b)
{
return ((perf_a.status == CUBLAS_STATUS_SUCCESS) && (perf_a.time < perf_b.time));
}
static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, // to get the capabilities (required a GPU)
cublasLtMatmulDesc_t operationDesc,
const void* alpha, /* host or device pointer */
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta, /* host or device pointer */
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
const cublasLtMatmulAlgo_t& algo,
int kernelRepeats,
void* workSpace,
size_t workSpaceSizeInBytes,
customMatmulPerf_t& perfResults,
cudaStream_t stream)
{
cublasLtMatmulHeuristicResult_t heurResult;
/* Looping over the Algo */
int repeats = kernelRepeats;
cublasStatus_t algoStatus =
cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult);
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
if (heurResult.workspaceSize <= workSpaceSizeInBytes) {
struct timeval start, end;
cublasStatus_t oneRunStatus;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
for (int loop = 0; loop < repeats; loop++) {
oneRunStatus = cublasLtMatmul(ltHandle,
operationDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
D,
Ddesc,
&algo,
workSpace,
workSpaceSizeInBytes,
stream);
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
if (oneRunStatus != CUBLAS_STATUS_SUCCESS) {
algoStatus = oneRunStatus;
}
float time = diffTime(start, end);
// For the moment only add successful findings
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
perfResults.algo = algo;
perfResults.time = time / repeats;
perfResults.workspaceSize = heurResult.workspaceSize;
perfResults.wavesCount = heurResult.wavesCount;
}
}
else {
// printf("not enough workspace! %ld\n", heurResult.workspaceSize);
algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace
}
}
else {
// printf("check fail!\n");
}
return algoStatus;
}
int igemm_config_INT8IO(int m, int n, int k, FILE* fout, void* buffer)
{
printf("batchCount %d m %d n %d k %d\n", 1, m, n, k);
float alpha = 1.0f;
float beta = 0.0f;
int8_t* d_A = (int8_t*)buffer; // m * k, stored in column-major
int8_t* d_B = d_A + m * k; // k * n, stored in column-major
int8_t* d_C = (int8_t*)(d_B + k * n); // m * n, stored in column-major
cublasLtHandle_t ltHandle;
cublasLtCreate(&ltHandle);
LtIgemmCustomFind(ltHandle,
m,
n,
k,
&alpha, /* host pointer */
d_A,
d_B,
&beta, /* host pointer */
d_C,
NULL,
0,
fout);
cublasLtDestroy(ltHandle);
return 0;
}
int generate_swin_igemm_config(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend)
{
// ensure the program is running on SM >= 7.5
struct cudaDeviceProp prop;
check_cuda_error(cudaGetDeviceProperties(&prop, 0));
if (!(prop.major >= 8 || (prop.major >= 7 && prop.minor >= 5))) {
printf("[ERROR] INT8 mode > 0 is only supported on device with sm >= 7.5\n ");
exit(-1);
}
printf("Device %s\n", prop.name);
// check config
FILE* fout;
if (!isAppend) {
fout = fopen(IGEMM_CONFIG, "w+");
fprintf(
fout,
"batch_size seq_len head_num size_per_head dataType ### batchCount m n k algoId customOption tile splitK_val swizzle reductionScheme workspaceSize stages exec_time\n");
}
else {
fout = fopen(IGEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fout) != NULL) {
config.push_back(std::string(line));
}
if (config.size() >= MAX_CONFIG_NUM * GEMM_NUM) {
int startIdx = config.size() - (MAX_CONFIG_NUM - 1) * GEMM_NUM;
fclose(fout);
fout = fopen(IGEMM_CONFIG, "w+");
for (int i = startIdx; i < (int)config.size(); i++) {
fprintf(fout, "%s", config[i].c_str());
}
}
}
int m = batch_size * seq_len;
int n = head_num * size_per_head;
int k = n;
int batchCount;
const int NUM_OF_BASIC_LAYERS = 4;
printf("***Swin IGemm Testing Begin***\n");
for (int basic_layer = 0; basic_layer < NUM_OF_BASIC_LAYERS; basic_layer++) {
printf("\n-----------------------------\n");
batchCount = 1;
m = batch_size * seq_len;
k = head_num * size_per_head;
n = 3 * head_num * size_per_head;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
igemm_config_INT8IO(m, n, k, fout, buffer);
}
printf("\n-----------------------------\n");
m = batch_size * seq_len;
n = head_num * size_per_head;
k = head_num * size_per_head;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
igemm_config_INT8IO(m, n, k, fout, buffer);
}
printf("\n-----------------------------\n");
m = batch_size * seq_len;
n = 4 * head_num * size_per_head;
k = head_num * size_per_head;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
igemm_config_INT8IO(m, n, k, fout, buffer);
}
printf("\n-----------------------------\n");
m = batch_size * seq_len;
n = head_num * size_per_head;
k = 4 * head_num * size_per_head;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
igemm_config_INT8IO(m, n, k, fout, buffer);
}
if (basic_layer != NUM_OF_BASIC_LAYERS - 1) {
printf("\n-----------------------------\n");
batch_size = batch_size / 4;
head_num = head_num * 2;
m = batch_size * seq_len;
n = head_num * size_per_head;
k = 2 * head_num * size_per_head;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
igemm_config_INT8IO(m, n, k, fout, buffer);
}
}
printf("\n-----------------------------\n");
}
fclose(fout);
printf("\n-----------------------------\n");
printf("***Swin IGemm Testing End***\n");
return 0;
}
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/gemm_test/encoder_igemm_func.h"
#include <algorithm>
#include <cublasLt.h>
#include <cuda_runtime.h>
#include <map>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#include <vector>
namespace fastertransformer {
/* CAUTION : must match cublasLtMatmulTile_t */
// const char* const matmulTileName[] = {
// "UNDEF", "8x8", "8x16", "16x8", "8x32", "16x16", "32x8", "8x64", "16x32",
// "32x16", "64x8", "32x32", "32x64", "64x32", "32x128", "64x64", "128x32", "64x128",
// "128x64", "64x256", "128x128", "256x64", "64x512", "128x256", "256x128", "512x64",
// };
int generate_swin_igemm_config(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true);
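// Usage mirrors generate_swin_gemm_config above but profiles INT8-in/INT8-out gemms
// and writes IGEMM_CONFIG, e.g. (hypothetical values):
//   generate_swin_igemm_config(64, 49, 4, 32, buffer, false);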
} // namespace fastertransformer