Unverified commit 9efcac38, authored by Li Zhang and committed by GitHub

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
parent 720fc533
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cuda_utils.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <map>
#include <mutex>
#include <string>
#pragma once
namespace fastertransformer {
class cublasMMWrapper {
protected:
cublasHandle_t cublas_handle_;
cublasLtHandle_t cublaslt_handle_;
#ifdef SPARSITY_ENABLED
cusparseLtHandle_t cusparselt_handle_;
std::map<std::string, cusparseLtMatDescriptor_t> sp_mat_A_desc_map_;
std::map<std::string, cusparseLtMatDescriptor_t> sp_mat_B_desc_map_;
std::map<std::string, cusparseLtMatDescriptor_t> sp_mat_C_desc_map_;
#endif
cudaDataType_t Atype_;
cudaDataType_t Btype_;
cudaDataType_t Ctype_;
cudaDataType_t computeType_;
cudaStream_t stream_;
cublasAlgoMap* cublas_algo_map_;
std::mutex* mu_;
IAllocator* allocator_ = nullptr;
void* cublas_workspace_ = nullptr;
friend class cublasINT8MMWrapper;
void _Int8Gemm(const int m,
const int n,
const int k,
const int8_t* A,
const int lda,
const int8_t* B,
const int ldb,
void* C,
const int ldc,
const void* alpha,
const int mode,
const bool per_column_scaling);
public:
cublasMMWrapper(cublasHandle_t cublas_handle_,
cublasLtHandle_t cublaslt_handle_,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
IAllocator* allocator);
#ifdef SPARSITY_ENABLED
cublasMMWrapper(cublasHandle_t cublas_handle_,
cublasLtHandle_t cublaslt_handle_,
cusparseLtHandle_t cusparselt_handle,
cudaStream_t stream,
cublasAlgoMap* map,
std::mutex* mu,
IAllocator* allocator);
#endif
~cublasMMWrapper();
cublasMMWrapper(const cublasMMWrapper& wrapper);
virtual void cublasVersionCheck()
{
return;
};
cublasStatus_t cublasLtMatmulWrapper(cublasLtHandle_t lightHandle,
cublasLtMatmulDesc_t computeDesc,
const void* alpha,
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta,
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
const cublasLtMatmulAlgo_t* algo,
void* workspace,
size_t workspaceSizeInBytes,
cudaStream_t stream);
std::pair<bool, cublasLtMatmulAlgo_t> findBestAlgo(cublasLtHandle_t lightHandle,
cublasLtMatmulDesc_t computeDesc,
const void* alpha,
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta,
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
cudaStream_t stream);
using MatrixLayout = std::tuple<cudaDataType_t, cublasLtOrder_t, uint64_t, uint64_t>;
using cache_idx_t = std::tuple<cublasLtMatmulDesc_t, std::array<MatrixLayout, 4>>;
std::map<cache_idx_t, cublasLtMatmulAlgo_t> algo_cache;
MatrixLayout createMatrixLayout(cublasLtMatrixLayout_t Mdesc);
void Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* alpha,
const void* A,
cudaDataType_t Atype,
int lda,
const void* B,
cudaDataType_t Btype,
int ldb,
const void* beta,
void* C,
cudaDataType_t Ctype,
int ldc,
cudaDataType_t computeType,
cublasGemmAlgo_t algo);
void Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
void* C,
const int ldc);
void Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
void* C,
const int ldc,
float f_alpha,
float f_beta);
void Int8Gemm(const int m,
const int n,
const int k,
const int8_t* A,
const int lda,
const int8_t* B,
const int ldb,
int8_t* C,
const int ldc,
const float* alpha,
const bool per_column_scaling = false);
void Int8Gemm(const int m,
const int n,
const int k,
const int8_t* A,
const int lda,
const int8_t* B,
const int ldb,
int32_t* C,
const int ldc);
void setFP32GemmConfig();
void setFP16GemmConfig();
#ifdef ENABLE_BF16
void setBF16GemmConfig();
#endif
void setStream(cudaStream_t stream);
void setGemmConfig(cudaDataType_t aType, cudaDataType_t bType, cudaDataType_t cType, cudaDataType_t computeType);
CublasDataType getCublasDataType(cudaDataType_t data_type);
#if (CUDART_VERSION >= 11000)
void Gemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const void* B,
const int ldb,
const void* bias,
void* C,
const int ldc);
#endif
void stridedBatchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const int lda,
const int64_t strideA,
const void* B,
const int ldb,
const int64_t strideB,
void* C,
const int ldc,
const int64_t strideC,
const int batchCount,
const float f_alpha = 1.0f,
const float f_beta = 0.0f);
void stridedBatchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const float f_alpha,
const void* A,
cudaDataType_t AType,
const int lda,
const int64_t strideA,
const void* B,
cudaDataType_t BType,
const int ldb,
const int64_t strideB,
const float f_beta,
void* C,
cudaDataType_t CType,
const int ldc,
const int64_t strideC,
const int batch_count,
cudaDataType_t computeType);
void batchedGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* const* A,
const int lda,
const void* const* B,
const int ldb,
void* const* C,
const int ldc,
const int batch_count);
bool isFuseBatchGemm(const int batch_count, const int m, const int k, const int n);
#ifdef SPARSITY_ENABLED
void SpGemm(cublasOperation_t transa,
cublasOperation_t transb,
const int m,
const int n,
const int k,
const void* A,
const void* B,
void* C);
size_t getSparseMatrixSize(int m, int k);
void compressMatrix(const void* input, void* output, const int m, const int k);
bool isUseSparse(const int batch_count, const int m, const int n, const int k);
#endif
};
} // namespace fastertransformer
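// Illustrative usage sketch (not part of the original header): one way a caller might
// drive the wrapper for a plain FP32 GEMM, assuming the wrapper forwards to cublasGemmEx
// with cuBLAS column-major semantics and that the no-alpha/beta overload uses alpha = 1,
// beta = 0. The handle, stream, algo map, mutex and allocator are assumed to be created
// elsewhere; every name below is a placeholder.
inline void example_fp32_gemm(cublasHandle_t                    cublas_handle,
                              cublasLtHandle_t                  cublaslt_handle,
                              cudaStream_t                      stream,
                              fastertransformer::cublasAlgoMap* algo_map,
                              std::mutex*                       mutex,
                              fastertransformer::IAllocator*    allocator,
                              const float*                      A,  // m x k, leading dimension m
                              const float*                      B,  // k x n, leading dimension k
                              float*                            C,  // m x n, leading dimension m
                              int                               m,
                              int                               n,
                              int                               k)
{
    fastertransformer::cublasMMWrapper wrapper(cublas_handle, cublaslt_handle, stream, algo_map, mutex, allocator);
    wrapper.setFP32GemmConfig();  // A/B/C and compute types set to CUDA_R_32F
    wrapper.Gemm(CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, A, m, B, k, C, m);  // C = A * B
}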
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include <cuda_fp16.h>
namespace fastertransformer {
#ifdef ENABLE_BF16
inline __device__ float2 bf1622float2(const __nv_bfloat162 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float2 f_val;
f_val.x = __low2float(val);
f_val.y = __high2float(val);
return f_val;
#else
return __bfloat1622float2(val);
#endif
}
inline __device__ int16_t bf1622int16(__nv_bfloat162 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float2 f_val;
f_val.x = max(min(__low2float(val), 127.f), -128.f);
f_val.y = max(min(__high2float(val), 127.f), -128.f);
union { int8_t int8[2]; int16_t int16; };
int8[0] = static_cast<int8_t>(static_cast<short>(f_val.x));
int8[1] = static_cast<int8_t>(static_cast<short>(f_val.y));
return int16;
#else
val = __hmin2(val, make_bfloat162(127., 127.));
val = __hmax2(val, make_bfloat162(-128., -128.));
union { int8_t int8[2]; int16_t int16; };
int8[0] = static_cast<int8_t>(static_cast<short>(val.x));
int8[1] = static_cast<int8_t>(static_cast<short>(val.y));
return int16;
#endif
}
inline __device__ __nv_bfloat162 float22bf162(const float2 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __floats2bfloat162_rn(val.x, val.y);
#else
return __float22bfloat162_rn(val);
#endif
}
inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
__nv_bfloat162 val2;
val2.x = val;
val2.y = val;
return val2;
#else
return __bfloat162bfloat162(val);
#endif
}
inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh, fyl, fyh;
fxl = __low2float(x);
fxh = __high2float(x);
fyl = __low2float(y);
fyh = __high2float(y);
return __floats2bfloat162_rn(fxl + fyl, fxh + fyh);
#else
return __hadd2(x, y);
#endif
}
inline __device__ __nv_bfloat16 bf16hadd(const __nv_bfloat16 x, const __nv_bfloat16 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16( __bfloat162float(x) + __bfloat162float(y) );
#else
return __hadd(x, y);
#endif
}
inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh, fyl, fyh;
fxl = __low2float(x);
fxh = __high2float(x);
fyl = __low2float(y);
fyh = __high2float(y);
return __floats2bfloat162_rn(fxl - fyl, fxh - fyh);
#else
return __hsub2(x, y);
#endif
}
inline __device__ __nv_bfloat16 bf16hsub(const __nv_bfloat16 x, const __nv_bfloat16 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16( __bfloat162float(x) - __bfloat162float(y) );
#else
return __hsub(x, y);
#endif
}
inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh, fyl, fyh;
fxl = __low2float(x);
fxh = __high2float(x);
fyl = __low2float(y);
fyh = __high2float(y);
return __floats2bfloat162_rn(fxl * fyl, fxh * fyh);
#else
return __hmul2(x, y);
#endif
}
inline __device__ __nv_bfloat16 bf16hmul(const __nv_bfloat16 x, const __nv_bfloat16 y) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) );
#else
return __hmul(x, y);
#endif
}
inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bfloat162 y, const __nv_bfloat162 z) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh, fyl, fyh, fzl, fzh;
fxl = __low2float(x);
fxh = __high2float(x);
fyl = __low2float(y);
fyh = __high2float(y);
fzl = __low2float(z);
fzh = __high2float(z);
return __floats2bfloat162_rn(fxl * fyl + fzl, fxh * fyh + fzh);
#else
return __hfma2(x, y, z);
#endif
}
inline __device__ __nv_bfloat16 bf16hfma(const __nv_bfloat16 x, const __nv_bfloat16 y, const __nv_bfloat16 z) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) + __bfloat162float(z));
#else
return __hfma(x, y, z);
#endif
}
inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fxl, fxh;
fxl = __low2float(x);
fxh = __high2float(x);
return __floats2bfloat162_rn(expf(fxl), expf(fxh));
#else
return h2exp(x);
#endif
}
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hmul2(x, y); };
inline __device__ __nv_bfloat162 operator+(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hadd2(x, y); };
inline __device__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y)
{
__nv_bfloat162 t; t.x = x; t.y = y; return t;
}
#endif
inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c));
#else
return a + b + c;
#endif
}
inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c) + __bfloat162float(d));
#else
return (__nv_bfloat16)((float)a + (float)b + (float)c + (float)d);
#endif
}
inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fal, fah, fbl, fbh, fcl, fch;
fal = __low2float(a);
fah = __high2float(a);
fbl = __low2float(b);
fbh = __high2float(b);
fcl = __low2float(c);
fch = __high2float(c);
return __floats2bfloat162_rn(fal + fbl + fcl, fah + fbh + fch);
#else
return a + b + c;
#endif
}
inline __device__ __nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b) * __bfloat162float(c));
#else
return a * b * c;
#endif
}
inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fal, fah, fbl, fbh, fcl, fch;
fal = __low2float(a);
fah = __high2float(a);
fbl = __low2float(b);
fbh = __high2float(b);
fcl = __low2float(c);
fch = __high2float(c);
return __floats2bfloat162_rn(fal * fbl * fcl, fah * fbh * fch);
#else
return a * b * c;
#endif
}
inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
float fal, fah, fbl, fbh, fcl, fch, fdl, fdh;
fal = __low2float(a);
fah = __high2float(a);
fbl = __low2float(b);
fbh = __high2float(b);
fcl = __low2float(c);
fch = __high2float(c);
fdl = __low2float(d);
fdh = __high2float(d);
return __floats2bfloat162_rn(fal * fbl * fcl + fdl, fah * fbh * fch + fdh);
#else
return a * b * c + d;
#endif
}
#endif // ENABLE_BF16
} // namespace fastertransformer
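// Illustrative sketch (not part of the original header): a minimal element-wise kernel
// built on the fallbacks above, so the same source compiles with native bf16 intrinsics
// on sm_80+ and with the float round-trip path on older architectures. The kernel and
// parameter names are placeholders; n2 is the number of __nv_bfloat162 elements.
#ifdef ENABLE_BF16
__global__ void example_bf16_axpy(__nv_bfloat162*       out,
                                  const __nv_bfloat162* x,
                                  const __nv_bfloat162* y,
                                  const __nv_bfloat162  alpha,
                                  int                   n2)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n2) {
        out[i] = fastertransformer::bf16hfma2(alpha, x[i], y[i]);  // out = alpha * x + y
    }
}
#endif  // ENABLE_BF16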
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef ENABLE_BF16
#include <cuda_bf16.h>
#endif
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "cuda_fp8_utils.h"
namespace fastertransformer {
#ifdef ENABLE_FP8
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
__global__ void quantizeMatrix(T_OUT* output, float const* input_scale, T_IN const* input, uint32_t size, uint32_t n)
{
for (uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; i < size; i += blockDim.x * gridDim.x) {
if (quantize_mode == QUANTIZE_MODE::PER_CHANNEL) {
output[i] = T_OUT((float)(input[i]) * __ldg(input_scale + (i % n)));
}
else {
output[i] = T_OUT((float)(input[i]) * __ldg(input_scale));
}
}
}
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrix(
T_OUT* output, float const* input_scale, T_IN const* input, uint32_t size, uint32_t n, cudaStream_t stream)
{
dim3 grid(32);
dim3 block(256);
quantizeMatrix<T_OUT, T_IN, quantize_mode><<<grid, block, 0, stream>>>(output, input_scale, input, size, n);
}
#define defineinvokeQuantizeMatrix(type_out, type_in, mode) \
template void invokeQuantizeMatrix<type_out, type_in, mode>(type_out * output, \
float const* input_scale, \
type_in const* input, \
uint32_t size, \
uint32_t n, \
cudaStream_t stream);
defineinvokeQuantizeMatrix(__nv_fp8_e4m3, float, QUANTIZE_MODE::PER_CHANNEL);
defineinvokeQuantizeMatrix(__nv_fp8_e4m3, float, QUANTIZE_MODE::PER_TENSOR);
defineinvokeQuantizeMatrix(__nv_fp8_e4m3, half, QUANTIZE_MODE::PER_CHANNEL);
defineinvokeQuantizeMatrix(__nv_fp8_e4m3, half, QUANTIZE_MODE::PER_TENSOR);
defineinvokeQuantizeMatrix(half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_CHANNEL);
defineinvokeQuantizeMatrix(half, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR);
defineinvokeQuantizeMatrix(float, __nv_fp8_e4m3, QUANTIZE_MODE::PER_CHANNEL);
defineinvokeQuantizeMatrix(float, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR);
#ifdef ENABLE_BF16
defineinvokeQuantizeMatrix(__nv_fp8_e4m3, __nv_bfloat16, QUANTIZE_MODE::PER_CHANNEL);
defineinvokeQuantizeMatrix(__nv_fp8_e4m3, __nv_bfloat16, QUANTIZE_MODE::PER_TENSOR);
defineinvokeQuantizeMatrix(__nv_bfloat16, __nv_fp8_e4m3, QUANTIZE_MODE::PER_CHANNEL);
defineinvokeQuantizeMatrix(__nv_bfloat16, __nv_fp8_e4m3, QUANTIZE_MODE::PER_TENSOR);
#endif
template<typename T_OUT, typename T_IN, typename T_FAKE>
__global__ void fakeQuantize(T_OUT* dst, const T_IN* src, const int size)
{
for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < size; tid += blockDim.x * gridDim.x) {
T_FAKE tmp = (T_FAKE)((float)src[tid]);
dst[tid] = (T_OUT)((float)tmp);
}
}
template<typename T_OUT, typename T_IN, typename T_FAKE>
void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int size, cudaStream_t stream)
{
fakeQuantize<T_OUT, T_IN, T_FAKE><<<256, 256, 0, stream>>>(dst, src, size);
}
template void
invokeFakeQuantize<float, float, __nv_fp8_e4m3>(float* dst, const float* src, const int size, cudaStream_t stream);
template void
invokeFakeQuantize<half, half, __nv_fp8_e4m3>(half* dst, const half* src, const int size, cudaStream_t stream);
template void invokeFakeQuantize<__nv_bfloat16, __nv_bfloat16, __nv_fp8_e4m3>(__nv_bfloat16* dst,
const __nv_bfloat16* src,
const int size,
cudaStream_t stream);
template<typename T_W>
__global__ void computeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n)
{
float max = -10000.f;
for (int i = 0; i < k; i++) {
float val = fabs((float)weights[i * n + blockIdx.x * blockDim.x + threadIdx.x]);
max = max > val ? max : val;
if (threadIdx.x == 0 && blockIdx.x == 0 && i % 100 == 0) {
printf("max: %f, val: %f \n", max, val);
}
}
// quant_ptr[blockIdx.x * blockDim.x + threadIdx.x] = 1.0f;
// quant_ptr[blockIdx.x * blockDim.x + threadIdx.x] = FP8_E4M3_MAX / max;
quant_ptr[blockIdx.x * blockDim.x + threadIdx.x] = std::max(max / FP8_E4M3_MAX, 1.0f / 32.f);
}
template<typename T_W>
void invokeComputeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n, cudaStream_t stream)
{
dim3 block(256);
dim3 grid;
grid.x = (n + 255) / 256;
computeFP8QuantizeScale<T_W><<<grid, block, 0, stream>>>(quant_ptr, weights, k, n);
}
#ifdef ENABLE_BF16
template void invokeComputeFP8QuantizeScale(
float* quant_ptr, const __nv_bfloat16* weights, const int k, const int n, cudaStream_t stream);
#endif
template void
invokeComputeFP8QuantizeScale(float* quant_ptr, const float* weights, const int k, const int n, cudaStream_t stream);
#endif // ENABLE_FP8
} // namespace fastertransformer
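// Illustrative sketch (not part of the original file): host-side wrapper that quantizes
// an FP16 buffer to FP8 E4M3 with a single per-tensor scale, relying on the
// <__nv_fp8_e4m3, half, PER_TENSOR> instantiation above. All pointers are assumed to be
// device memory and the names are placeholders.
#ifdef ENABLE_FP8
inline void example_quantize_fp16_to_fp8(__nv_fp8_e4m3* d_out,
                                         const half*    d_in,
                                         const float*   d_scale,  // one float on the device
                                         uint32_t       size,
                                         cudaStream_t   stream)
{
    fastertransformer::invokeQuantizeMatrix<__nv_fp8_e4m3, half, fastertransformer::QUANTIZE_MODE::PER_TENSOR>(
        d_out, d_scale, d_in, size, /* n = */ size, stream);
}
#endif  // ENABLE_FP8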
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#ifdef ENABLE_FP8
#include <cuda_fp8.h>
#include <cuda_runtime.h>
#include <stdint.h>
// #define FP8_MHA
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 900
#define FUSE_GEMM_ACT
#endif
#define FP8_GEMM_OUTPUT_QUANT_DISABLE
#ifdef FUSE_GEMM_ACT
#define USE_QGMMA
#endif
namespace fastertransformer {
const float FP8_E4M3_MAX = 480.0f;
enum QUANTIZE_MODE {
PER_CHANNEL,
PER_TENSOR,
PER_CHANNEL_WEIGHT_PER_TENSOR_ACT
};
// Packed Data Type
typedef struct __CUDA_ALIGN__(32) {
float array[8];
} float8;
typedef struct __CUDA_ALIGN__(16) {
half array[8];
} half8;
#ifdef ENABLE_BF16
typedef struct __CUDA_ALIGN__(4) {
__nv_bfloat16 array[2];
} __nv_bfloat16_2;
typedef struct __CUDA_ALIGN__(8) {
__nv_bfloat162 x, y;
} __nv_bfloat162_2_xy;
typedef struct __CUDA_ALIGN__(8) {
__nv_bfloat16 array[4];
} __nv_bfloat164;
typedef struct __CUDA_ALIGN__(8) {
__nv_bfloat162 array[2];
} __nv_bfloat162_2;
typedef struct __CUDA_ALIGN__(16) {
__nv_bfloat16 array[8];
} __nv_bfloat168;
typedef struct __CUDA_ALIGN__(16) {
__nv_bfloat162 array[4];
} __nv_bfloat162_4;
typedef struct __CUDA_ALIGN__(32) {
__nv_bfloat16 array[16];
} __nv_bfloat1616;
#endif
#ifdef ENABLE_FP8
typedef struct __CUDA_ALIGN__(2) {
__nv_fp8_e4m3 array[2];
} __nv_fp8_2_e4m3;
typedef struct __CUDA_ALIGN__(4) {
__nv_fp8_e4m3 array[4];
} __nv_fp8_4_e4m3;
typedef struct __CUDA_ALIGN__(4) {
__nv_fp8x2_e4m3 array[2];
} __nv_fp8x2_x2_e4m3;
typedef struct __CUDA_ALIGN__(8) {
__nv_fp8_e4m3 array[8];
} __nv_fp8_8_e4m3;
typedef struct __CUDA_ALIGN__(8) {
__nv_fp8x2_e4m3 array[4];
} __nv_fp8x2_x4_e4m3;
typedef struct __CUDA_ALIGN__(16) {
__nv_fp8_e4m3 array[16];
} __nv_fp8x16_e4m3;
#endif
// only BF16 and FP8
template<typename T, int PACK_SIZE>
struct PackType {
using type = float;
};
#ifdef ENABLE_BF16
template<>
struct PackType<__nv_bfloat16, 2> {
using type = __nv_bfloat16_2;
};
template<>
struct PackType<__nv_bfloat16, 4> {
using type = __nv_bfloat164;
};
template<>
struct PackType<__nv_bfloat16, 8> {
using type = __nv_bfloat168;
};
#endif
#ifdef ENABLE_FP8
template<>
struct PackType<__nv_fp8_e4m3, 2> {
using type = __nv_fp8_2_e4m3;
};
template<>
struct PackType<__nv_fp8_e4m3, 4> {
using type = __nv_fp8_4_e4m3;
};
template<>
struct PackType<__nv_fp8_e4m3, 8> {
using type = __nv_fp8_8_e4m3;
};
#endif
__inline__ __device__ void fp8x4_e4m3_to_bfloat2(__nv_bfloat162* out1, __nv_bfloat162* out2, const __nv_fp8x4_e4m3* in)
{
const char4 tmp_val = reinterpret_cast<const char4*>(in)[0];
*out1 = __nv_bfloat162((float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.x)[0],
(float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.y)[0]);
*out2 = __nv_bfloat162((float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.z)[0],
(float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.w)[0]);
}
__inline__ __device__ __nv_bfloat162 fp8x2_e4m3_to_bfloat2(const __nv_fp8x2_e4m3* in)
{
const char2 tmp_val = reinterpret_cast<const char2*>(in)[0];
__nv_bfloat162 out = __nv_bfloat162((float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.x)[0],
(float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.y)[0]);
return out;
}
__inline__ __device__ void fp8x4_e4m3_to_half2(half2* out1, half2* out2, const __nv_fp8x4_e4m3* in)
{
const char4 tmp_val = reinterpret_cast<const char4*>(in)[0];
*out1 = half2((float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.x)[0],
(float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.y)[0]);
*out2 = half2((float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.z)[0],
(float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.w)[0]);
}
__inline__ __device__ half2 fp8x2_e4m3_to_half2(const __nv_fp8x2_e4m3* in)
{
const char2 tmp_val = reinterpret_cast<const char2*>(in)[0];
half2 out = half2((float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.x)[0],
(float)reinterpret_cast<const __nv_fp8_e4m3*>(&tmp_val.y)[0]);
return out;
}
template<typename T_OUT, typename T_IN, QUANTIZE_MODE quantize_mode>
void invokeQuantizeMatrix(
T_OUT* output, float const* input_qua_amax_ptr, T_IN const* input, uint32_t size, uint32_t n, cudaStream_t stream);
template<typename T_OUT, typename T_IN, typename T_FAKE>
void invokeFakeQuantize(T_OUT* dst, const T_IN* src, const int size, cudaStream_t stream);
template<typename T_W>
void invokeComputeFP8QuantizeScale(float* quant_ptr, const T_W* weights, const int k, const int n, cudaStream_t stream);
} // namespace fastertransformer
#endif // ENABLE_FP8
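// Illustrative sketch (not part of the original header): unpacking two FP8 E4M3 values
// into a half2 inside a kernel via the helper above. Names are placeholders; n2 is the
// number of __nv_fp8x2_e4m3 elements.
#ifdef ENABLE_FP8
__global__ void example_fp8x2_to_half2(half2* out, const __nv_fp8x2_e4m3* in, int n2)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n2) {
        out[i] = fastertransformer::fp8x2_e4m3_to_half2(&in[i]);
    }
}
#endif  // ENABLE_FP8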
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda.h>
#include <cuda_fp16.h>
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_bf16_fallbacks.cuh"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
namespace fastertransformer {
template<typename T>
inline __device__ T ldg(const T* val) {
return __ldg(val);
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 ldg(const __nv_bfloat162* val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return val[0];
#else
return __ldg(val);
#endif
}
template<>
inline __device__ __nv_bfloat16 ldg(const __nv_bfloat16* val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
return val[0];
#else
return __ldg(val);
#endif
}
#endif // ENABLE_BF16
// Get type2 from type or vice versa (applied to half and bfloat16)
template<typename T>
struct TypeConverter {using Type = half2;}; // keep for generality
template<>
struct TypeConverter<half2> {using Type = half;};
template<>
struct TypeConverter<half> {using Type = half2;};
#if ENABLE_BF16
template<>
struct TypeConverter<__nv_bfloat162> {using Type = __nv_bfloat16;};
template<>
struct TypeConverter<__nv_bfloat16> {using Type = __nv_bfloat162;};
#endif // ENABLE_BF16
// Math operations (bfloat16 falls back to fp32 where native instructions are unavailable)
template<typename T>
inline __device__ T hadd2(T a, T b) {
return __hadd2(a, b);
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 hadd2(__nv_bfloat162 a, __nv_bfloat162 b) {
return bf16hadd2(a, b);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T add(T a, T b) {
return a + b;
}
template<>
inline __device__ half2 add(half2 a, half2 b) {
return __hadd2(a, b);
}
template<>
inline __device__ half add(half a, half b) {
return __hadd(a, b);
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b) {
return bf16hadd2(a, b);
}
template<>
inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) {
return bf16hadd(a, b);
}
inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, float b) {
return bf16hadd(a, __float2bfloat16(b));
}
#endif // ENABLE_BF16
// three-operand addition
template<typename T>
inline __device__ T add(T a, T b, T c) {
return a + b + c;
}
#if ENABLE_BF16
inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
return bf16hadd(a, b, c);
}
inline __device__ __nv_bfloat162 add(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
return bf16hadd2(a, b, c);
}
#endif // ENABLE_BF16
// applies to all 4 values addition
template<typename T>
inline __device__ T add(T a, T b, T c, T d) {
return (T)((float)a + (float)b + (float)c + (float)d);
}
#if ENABLE_BF16
inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d) {
return bf16hadd(a, b, c, d);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T hsub2(T a, T b) {
return __hsub2(a, b);
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 hsub2(__nv_bfloat162 a, __nv_bfloat162 b) {
return bf16hsub2(a, b);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T hmul2(T a, T b) {
return __hmul2(a, b);
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 hmul2(__nv_bfloat162 a, __nv_bfloat162 b) {
return bf16hmul2(a, b);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T hmul2(T a, T b, T c) {
return a * b * c;
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
return bf16hmul2(a, b, c);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T mul(T a, T b, T c) {
return a * b * c;
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
return bf16hmul(a, b, c);
}
inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
return bf16hmul2(a, b, c);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T fma(T a, T b, T c, T d) {
return a * b * c + d;
}
#if ENABLE_BF16
inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d) {
return bf16hfma2(a, b, c, d);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T fma(T a, T b, T c) {
return a * b + c;
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
return bf16hfma2(a, b, c);
}
template<>
inline __device__ __nv_bfloat16 fma(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
return bf16hfma(a, b, c);
}
#endif // ENABLE_BF16
template<typename T>
inline __device__ T hexp2(T a) {
return h2exp(a);
}
#if ENABLE_BF16
template<>
inline __device__ __nv_bfloat162 hexp2(__nv_bfloat162 a) {
return bf16exp2(a);
}
#endif // ENABLE_BF16
template<typename T_OUT, typename T_IN> __device__ inline T_OUT cuda_cast(T_IN val) { return val; }
template<> __device__ inline float2 cuda_cast<float2, int2>(int2 val) { return make_float2(val.x, val.y); }
template<> __device__ inline float2 cuda_cast<float2, float>(float val) { return make_float2(val, val); }
template<> __device__ inline float2 cuda_cast<float2, half2>(half2 val) { return __half22float2(val); }
template<> __device__ inline half2 cuda_cast<half2, float2>(float2 val) { return __float22half2_rn(val); }
template<> __device__ inline half2 cuda_cast<half2, float>(float val) { return __float2half2_rn(val); }
template<> __device__ inline half2 cuda_cast<half2, half>(half val) { return __half2half2(val); }
template<> __device__ inline int8_t cuda_cast<int8_t, half>(half val) {
union { int8_t int8[2]; int16_t int16; };
union { half fp16; int16_t int16_in; };
fp16 = val;
asm volatile ("cvt.rni.sat.s8.f16 %0, %1;" : "=h"(int16) : "h"(int16_in));
return int8[0];
}
template<> __device__ inline int16_t cuda_cast<int16_t, half2>(half2 val) {
union { int8_t int8[2]; int16_t int16; };
int8[0] = cuda_cast<int8_t>(val.x);
int8[1] = cuda_cast<int8_t>(val.y);
return int16;
}
template<> __device__ inline int8_t cuda_cast<int8_t, float>(float val) {
union { int8_t int8[2]; int16_t int16; };
asm volatile ("cvt.rni.sat.s8.f32 %0, %1;" : "=h"(int16) : "f"(val));
return int8[0];
}
template<> __device__ inline int16_t cuda_cast<int16_t, float2>(float2 val) {
union { int8_t int8[2]; int16_t int16; };
int8[0] = cuda_cast<int8_t>(val.x);
int8[1] = cuda_cast<int8_t>(val.y);
return int16;
}
template<> __device__ inline half2 cuda_cast<half2, int16_t>(int16_t val) {
union { int8_t int8[2]; int16_t int16; };
int16 = val;
return make_half2(int8[0], int8[1]);
}
template<> __device__ inline float2 cuda_cast<float2, int16_t>(int16_t val) {
union { int8_t int8[2]; int16_t int16; };
int16 = val;
return make_float2(int8[0], int8[1]);
}
#ifdef ENABLE_BF16
template<> __device__ inline __nv_bfloat16 cuda_cast(int32_t val) { return static_cast<float>(val); }
template<> __device__ inline __nv_bfloat16 cuda_cast(int8_t val) { return static_cast<float>(val); }
template<> __device__ inline int8_t cuda_cast(__nv_bfloat16 val) { return static_cast<float>(val); }
template<>
__device__ inline float cuda_cast<float, __nv_bfloat16>(__nv_bfloat16 val) { return __bfloat162float(val); }
template<> __device__ inline float2 cuda_cast<float2, __nv_bfloat162>(__nv_bfloat162 val) { return bf1622float2(val); }
template<> __device__ inline half cuda_cast<half, __nv_bfloat16>(__nv_bfloat16 val) { return __float2half(__bfloat162float(val)); }
template<> __device__ inline int16_t cuda_cast<int16_t, __nv_bfloat162>(__nv_bfloat162 val) { return bf1622int16(val); }
template<> __device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, float>(float val) { return __float2bfloat16(val); }
template<> __device__ inline __nv_bfloat16 cuda_cast<__nv_bfloat16, half>(half val) { return __float2bfloat16(__half2float(val)); }
template<> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, __nv_bfloat16>(__nv_bfloat16 val) { return bf162bf162(val); }
template<> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float>(float val) { return __float2bfloat162_rn(val); }
template<> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, float2>(float2 val) { return float22bf162(val); }
template<> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, int16_t>(int16_t val) {
union { int8_t int8[2]; int16_t int16; };
int16 = val;
__nv_bfloat162 res;
res.x = cuda_cast<__nv_bfloat16>(int8[0]);
res.y = cuda_cast<__nv_bfloat16>(int8[1]);
return res;
}
template<> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, half2>(half2 val) { return float22bf162(__half22float2(val)); }
#endif // ENABLE_BF16
template<typename T> __device__ inline T cuda_abs(T val);
template<> __device__ inline float cuda_abs(float val) { return fabs(val); }
template<> __device__ inline half cuda_abs(half val) { return __habs(val); }
template<> __device__ inline half2 cuda_abs(half2 val) { return __habs2(val); }
#ifdef ENABLE_BF16
#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)
template<> __device__ inline __nv_bfloat16 cuda_abs(__nv_bfloat16 val) { return __habs(val); }
template<> __device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) { return __habs2(val); }
#else
template<> __device__ inline __nv_bfloat16 cuda_abs(__nv_bfloat16 val) { return fabs(val); }
template<> __device__ inline __nv_bfloat162 cuda_abs(__nv_bfloat162 val) { return make_bfloat162(fabs(val.x), fabs(val.y)); }
#endif
#endif // ENABLE_BF16
// Unary maximum: compute the max of a vector type
template<typename To, typename Ti> __device__ inline To cuda_max(Ti val)
{
return cuda_cast<To>(val);
};
template<> __device__ inline half cuda_max(half2 val) { return (val.x > val.y) ? val.x : val.y; }
#ifdef ENABLE_BF16
template<> __device__ inline __nv_bfloat16 cuda_max(__nv_bfloat162 val) { return (val.x > val.y) ? val.x : val.y; }
#endif
// Binary maximum: compute the max of two scalar types
template<typename T> __device__ inline T cuda_max(T val1, T val2) { return (val1 > val2) ? val1 : val2; }
#ifdef ENABLE_FP8
template<> __device__ inline float2 cuda_cast<float2, __nv_fp8x2_e4m3>(__nv_fp8x2_e4m3 val) { return bf1622float2(fp8x2_e4m3_to_bfloat2(&val)); }
template<> __device__ inline __nv_fp8x2_e4m3 cuda_cast<__nv_fp8x2_e4m3, float2>(float2 val) { return __nv_fp8x2_e4m3(bf1622float2(float22bf162(val))); }
template<> __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, half>(half val) { return __nv_fp8_e4m3(val); }
template<> __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, __nv_bfloat16>(__nv_bfloat16 val) { return __nv_fp8_e4m3(val); }
template<> __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, float>(float val) { return __nv_fp8_e4m3(val); }
template<> __device__ inline float cuda_cast<float, __nv_fp8_e4m3>(__nv_fp8_e4m3 val) { return (float)val; }
template<> __device__ inline __nv_bfloat162 cuda_cast<__nv_bfloat162, __nv_fp8x2_e4m3>(__nv_fp8x2_e4m3 val) { return fp8x2_e4m3_to_bfloat2(&val); }
template<> __device__ inline int8_t cuda_cast<int8_t, __nv_fp8_e4m3>(__nv_fp8_e4m3 val)
{
// no impl
return 0;
}
template<> __device__ inline __nv_fp8_e4m3 cuda_cast<__nv_fp8_e4m3, int8_t>(int8_t val)
{
return cuda_cast<__nv_fp8_e4m3>(cuda_cast<__nv_bfloat16>(cuda_cast<float>(val)));
}
#endif // ENABLE_FP8
}  // namespace fastertransformer
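// Illustrative sketch (not part of the original header): a tiny kernel that uses the
// generic helpers above (cuda_abs and cuda_cast) so one kernel body covers float, half
// and, when enabled, __nv_bfloat16. The kernel name is a placeholder.
template<typename T>
__global__ void example_abs_to_float(float* out, const T* in, int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
        out[i] = fastertransformer::cuda_cast<float>(fastertransformer::cuda_abs(in[i]));
    }
}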
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/cuda_fp8_utils.h"
namespace fastertransformer {
/* **************************** debug tools ********************************* */
template<typename T>
void print_to_file(const T* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode)
{
cudaDeviceSynchronize();
check_cuda_error(cudaGetLastError());
printf("[INFO] file: %s with size %d.\n", file, size);
std::ofstream outFile(file, open_mode);
if (outFile) {
T* tmp = new T[size];
check_cuda_error(cudaMemcpyAsync(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost, stream));
for (int i = 0; i < size; ++i) {
float val = (float)(tmp[i]);
outFile << val << std::endl;
}
delete[] tmp;
}
else {
throw std::runtime_error(std::string("[FT][ERROR] Cannot open file: ") + file + "\n");
}
cudaDeviceSynchronize();
check_cuda_error(cudaGetLastError());
}
template void
print_to_file(const float* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode);
template void
print_to_file(const half* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode);
#ifdef ENABLE_BF16
template void print_to_file(
const __nv_bfloat16* result, const int size, const char* file, cudaStream_t stream, std::ios::openmode open_mode);
#endif
template<typename T>
void print_abs_mean(const T* buf, uint size, cudaStream_t stream, std::string name)
{
if (buf == nullptr) {
FT_LOG_WARNING("It is an nullptr, skip!");
return;
}
cudaDeviceSynchronize();
check_cuda_error(cudaGetLastError());
T* h_tmp = new T[size];
cudaMemcpyAsync(h_tmp, buf, sizeof(T) * size, cudaMemcpyDeviceToHost, stream);
cudaDeviceSynchronize();
check_cuda_error(cudaGetLastError());
double sum = 0.0f;
uint64_t zero_count = 0;
float max_val = -1e10;
bool find_inf = false;
for (uint i = 0; i < size; i++) {
if (std::isinf((float)(h_tmp[i]))) {
find_inf = true;
continue;
}
sum += abs((double)h_tmp[i]);
if ((float)h_tmp[i] == 0.0f) {
zero_count++;
}
max_val = max_val > abs(float(h_tmp[i])) ? max_val : abs(float(h_tmp[i]));
}
printf("[INFO][FT] %20s size: %u, abs mean: %f, abs sum: %f, abs max: %f, find inf: %s",
name.c_str(),
size,
sum / size,
sum,
max_val,
find_inf ? "true" : "false");
std::cout << std::endl;
delete[] h_tmp;
cudaDeviceSynchronize();
check_cuda_error(cudaGetLastError());
}
template void print_abs_mean(const float* buf, uint size, cudaStream_t stream, std::string name);
template void print_abs_mean(const half* buf, uint size, cudaStream_t stream, std::string name);
#ifdef ENABLE_BF16
template void print_abs_mean(const __nv_bfloat16* buf, uint size, cudaStream_t stream, std::string name);
#endif
template void print_abs_mean(const int* buf, uint size, cudaStream_t stream, std::string name);
template void print_abs_mean(const uint* buf, uint size, cudaStream_t stream, std::string name);
template void print_abs_mean(const int8_t* buf, uint size, cudaStream_t stream, std::string name);
#ifdef ENABLE_FP8
template void print_abs_mean(const __nv_fp8_e4m3* buf, uint size, cudaStream_t stream, std::string name);
#endif
template<typename T>
void print_to_screen(const T* result, const int size)
{
if (result == nullptr) {
FT_LOG_WARNING("It is an nullptr, skip! \n");
return;
}
T* tmp = reinterpret_cast<T*>(malloc(sizeof(T) * size));
check_cuda_error(cudaMemcpy(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost));
for (int i = 0; i < size; ++i) {
printf("%d, %f\n", i, static_cast<float>(tmp[i]));
}
free(tmp);
}
template void print_to_screen(const float* result, const int size);
template void print_to_screen(const half* result, const int size);
#ifdef ENABLE_BF16
template void print_to_screen(const __nv_bfloat16* result, const int size);
#endif
template void print_to_screen(const int* result, const int size);
template void print_to_screen(const uint* result, const int size);
template void print_to_screen(const bool* result, const int size);
#ifdef ENABLE_FP8
template void print_to_screen(const __nv_fp8_e4m3* result, const int size);
#endif
template<typename T>
void printMatrix(T* ptr, int m, int k, int stride, bool is_device_ptr)
{
T* tmp;
if (is_device_ptr) {
// k < stride ; stride = col-dimension.
tmp = reinterpret_cast<T*>(malloc(m * stride * sizeof(T)));
check_cuda_error(cudaMemcpy(tmp, ptr, sizeof(T) * m * stride, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
}
else {
tmp = ptr;
}
for (int ii = -1; ii < m; ++ii) {
if (ii >= 0) {
printf("%02d ", ii);
}
else {
printf(" ");
}
for (int jj = 0; jj < k; jj += 1) {
if (ii >= 0) {
printf("%7.3f ", (float)tmp[ii * stride + jj]);
}
else {
printf("%7d ", jj);
}
}
printf("\n");
}
if (is_device_ptr) {
free(tmp);
}
}
template void printMatrix(float* ptr, int m, int k, int stride, bool is_device_ptr);
template void printMatrix(half* ptr, int m, int k, int stride, bool is_device_ptr);
#ifdef ENABLE_BF16
template void printMatrix(__nv_bfloat16* ptr, int m, int k, int stride, bool is_device_ptr);
#endif
void printMatrix(unsigned long long* ptr, int m, int k, int stride, bool is_device_ptr)
{
typedef unsigned long long T;
T* tmp;
if (is_device_ptr) {
// k < stride ; stride = col-dimension.
tmp = reinterpret_cast<T*>(malloc(m * stride * sizeof(T)));
check_cuda_error(cudaMemcpy(tmp, ptr, sizeof(T) * m * stride, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
}
else {
tmp = ptr;
}
for (int ii = -1; ii < m; ++ii) {
if (ii >= 0) {
printf("%02d ", ii);
}
else {
printf(" ");
}
for (int jj = 0; jj < k; jj += 1) {
if (ii >= 0) {
printf("%4llu ", tmp[ii * stride + jj]);
}
else {
printf("%4d ", jj);
}
}
printf("\n");
}
if (is_device_ptr) {
free(tmp);
}
}
void printMatrix(int* ptr, int m, int k, int stride, bool is_device_ptr)
{
typedef int T;
T* tmp;
if (is_device_ptr) {
// k < stride ; stride = col-dimension.
tmp = reinterpret_cast<T*>(malloc(m * stride * sizeof(T)));
check_cuda_error(cudaMemcpy(tmp, ptr, sizeof(T) * m * stride, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
}
else {
tmp = ptr;
}
for (int ii = -1; ii < m; ++ii) {
if (ii >= 0) {
printf("%02d ", ii);
}
else {
printf(" ");
}
for (int jj = 0; jj < k; jj += 1) {
if (ii >= 0) {
printf("%4d ", tmp[ii * stride + jj]);
}
else {
printf("%4d ", jj);
}
}
printf("\n");
}
if (is_device_ptr) {
free(tmp);
}
}
void printMatrix(size_t* ptr, int m, int k, int stride, bool is_device_ptr)
{
typedef size_t T;
T* tmp;
if (is_device_ptr) {
// k < stride ; stride = col-dimension.
tmp = reinterpret_cast<T*>(malloc(m * stride * sizeof(T)));
check_cuda_error(cudaMemcpy(tmp, ptr, sizeof(T) * m * stride, cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
}
else {
tmp = ptr;
}
for (int ii = -1; ii < m; ++ii) {
if (ii >= 0) {
printf("%02d ", ii);
}
else {
printf(" ");
}
for (int jj = 0; jj < k; jj += 1) {
if (ii >= 0) {
printf("%4ld ", tmp[ii * stride + jj]);
}
else {
printf("%4d ", jj);
}
}
printf("\n");
}
if (is_device_ptr) {
free(tmp);
}
}
template<typename T>
void check_max_val(const T* result, const int size)
{
T* tmp = new T[size];
cudaMemcpy(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost);
float max_val = -100000;
for (int i = 0; i < size; i++) {
float val = static_cast<float>(tmp[i]);
if (val > max_val) {
max_val = val;
}
}
delete[] tmp;
printf("[INFO][CUDA] addr %p max val: %f \n", result, max_val);
}
template void check_max_val(const float* result, const int size);
template void check_max_val(const half* result, const int size);
#ifdef ENABLE_BF16
template void check_max_val(const __nv_bfloat16* result, const int size);
#endif
template<typename T>
void check_abs_mean_val(const T* result, const int size)
{
T* tmp = new T[size];
cudaMemcpy(tmp, result, sizeof(T) * size, cudaMemcpyDeviceToHost);
float sum = 0.0f;
for (int i = 0; i < size; i++) {
sum += abs(static_cast<float>(tmp[i]));
}
delete[] tmp;
printf("[INFO][CUDA] addr %p abs mean val: %f \n", result, sum / size);
}
template void check_abs_mean_val(const float* result, const int size);
template void check_abs_mean_val(const half* result, const int size);
#ifdef ENABLE_BF16
template void check_abs_mean_val(const __nv_bfloat16* result, const int size);
#endif
/* ***************************** common utils ****************************** */
cudaError_t getSetDevice(int i_device, int* o_device)
{
int current_dev_id = 0;
cudaError_t err = cudaSuccess;
if (o_device != NULL) {
err = cudaGetDevice(&current_dev_id);
if (err != cudaSuccess) {
return err;
}
if (current_dev_id == i_device) {
*o_device = i_device;
}
else {
err = cudaSetDevice(i_device);
if (err != cudaSuccess) {
return err;
}
*o_device = current_dev_id;
}
}
else {
err = cudaSetDevice(i_device);
if (err != cudaSuccess) {
return err;
}
}
return cudaSuccess;
}
FtCudaDataType getModelFileType(std::string ini_file, std::string section_name)
{
FtCudaDataType model_file_type;
INIReader reader = INIReader(ini_file);
if (reader.ParseError() < 0) {
FT_LOG_WARNING("Can't load %s. Use FP32 as default", ini_file.c_str());
model_file_type = FtCudaDataType::FP32;
}
else {
std::string weight_data_type_str = std::string(reader.Get(section_name, "weight_data_type"));
if (weight_data_type_str.find("fp32") != std::string::npos) {
model_file_type = FtCudaDataType::FP32;
}
else if (weight_data_type_str.find("fp16") != std::string::npos) {
model_file_type = FtCudaDataType::FP16;
}
else if (weight_data_type_str.find("bf16") != std::string::npos) {
model_file_type = FtCudaDataType::BF16;
}
else {
FT_LOG_WARNING("Invalid type %s. Use FP32 as default", weight_data_type_str.c_str());
model_file_type = FtCudaDataType::FP32;
}
}
return model_file_type;
}
/* ************************** end of common utils ************************** */
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "3rdparty/INIReader.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/logger.h"
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#ifdef SPARSITY_ENABLED
#include <cusparseLt.h>
#endif
namespace fastertransformer {
#define MAX_CONFIG_NUM 20
#define COL32_ 32
// workspace for cublas gemm : 32MB
#define CUBLAS_WORKSPACE_SIZE 33554432
typedef struct __align__(4)
{
half x, y, z, w;
}
half4;
/* **************************** type definition ***************************** */
enum CublasDataType {
FLOAT_DATATYPE = 0,
HALF_DATATYPE = 1,
BFLOAT16_DATATYPE = 2,
INT8_DATATYPE = 3,
FP8_DATATYPE = 4
};
enum FtCudaDataType {
FP32 = 0,
FP16 = 1,
BF16 = 2,
INT8 = 3,
FP8 = 4
};
enum class OperationType {
FP32,
FP16,
BF16,
INT8,
FP8
};
/* **************************** debug tools ********************************* */
static const char* _cudaGetErrorEnum(cudaError_t error)
{
return cudaGetErrorString(error);
}
static const char* _cudaGetErrorEnum(cublasStatus_t error)
{
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "<unknown>";
}
template<typename T>
void check(T result, char const* const func, const char* const file, int const line)
{
if (result) {
throw std::runtime_error(std::string("[FT][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " "
+ file + ":" + std::to_string(line) + " \n");
}
}
#define check_cuda_error(val) check((val), #val, __FILE__, __LINE__)
#define check_cuda_error_2(val, file, line) check((val), #val, file, line)
inline void syncAndCheck(const char* const file, int const line)
{
// When FT_DEBUG_LEVEL=DEBUG, must check error
static char* level_name = std::getenv("FT_DEBUG_LEVEL");
if (level_name != nullptr) {
static std::string level = std::string(level_name);
if (level == "DEBUG") {
cudaDeviceSynchronize();
cudaError_t result = cudaGetLastError();
if (result) {
throw std::runtime_error(std::string("[FT][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result))
+ " " + file + ":" + std::to_string(line) + " \n");
}
FT_LOG_DEBUG(fmtstr("run syncAndCheck at %s:%d", file, line));
}
}
#ifndef NDEBUG
cudaDeviceSynchronize();
cudaError_t result = cudaGetLastError();
if (result) {
throw std::runtime_error(std::string("[FT][ERROR] CUDA runtime error: ") + (_cudaGetErrorEnum(result)) + " "
+ file + ":" + std::to_string(line) + " \n");
}
#endif
}
#define sync_check_cuda_error() syncAndCheck(__FILE__, __LINE__)
#define checkCUDNN(expression) \
{ \
cudnnStatus_t status = (expression); \
if (status != CUDNN_STATUS_SUCCESS) { \
std::cerr << "Error on file " << __FILE__ << " line " << __LINE__ << ": " << cudnnGetErrorString(status) \
<< std::endl; \
std::exit(EXIT_FAILURE); \
} \
}
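// Illustrative sketch (not part of the original header): the intended usage pattern of the
// macros above. check_cuda_error throws std::runtime_error on any non-success status, and
// sync_check_cuda_error adds a device synchronization when FT_DEBUG_LEVEL=DEBUG or in
// non-NDEBUG builds. The function and parameter names are placeholders.
inline void exampleCheckedCopy(void* dst, const void* src, size_t bytes, cudaStream_t stream)
{
    check_cuda_error(cudaMemcpyAsync(dst, src, bytes, cudaMemcpyDeviceToDevice, stream));
    sync_check_cuda_error();
}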
template<typename T>
void print_to_file(const T* result,
const int size,
const char* file,
cudaStream_t stream = 0,
std::ios::openmode open_mode = std::ios::out);
template<typename T>
void print_abs_mean(const T* buf, uint size, cudaStream_t stream, std::string name = "");
template<typename T>
void print_to_screen(const T* result, const int size);
template<typename T>
void printMatrix(T* ptr, int m, int k, int stride, bool is_device_ptr);
void printMatrix(unsigned long long* ptr, int m, int k, int stride, bool is_device_ptr);
void printMatrix(int* ptr, int m, int k, int stride, bool is_device_ptr);
void printMatrix(size_t* ptr, int m, int k, int stride, bool is_device_ptr);
template<typename T>
void check_max_val(const T* result, const int size);
template<typename T>
void check_abs_mean_val(const T* result, const int size);
#define PRINT_FUNC_NAME_() \
do { \
std::cout << "[FT][CALL] " << __FUNCTION__ << " " << std::endl; \
} while (0)
[[noreturn]] inline void throwRuntimeError(const char* const file, int const line, std::string const& info = "")
{
throw std::runtime_error(std::string("[FT][ERROR] ") + info + " Assertion fail: " + file + ":"
+ std::to_string(line) + " \n");
}
inline void myAssert(bool result, const char* const file, int const line, std::string const& info = "")
{
if (!result) {
throwRuntimeError(file, line, info);
}
}
#define FT_CHECK(val) myAssert(val, __FILE__, __LINE__)
#define FT_CHECK_WITH_INFO(val, info) \
do { \
bool is_valid_val = (val); \
if (!is_valid_val) { \
fastertransformer::myAssert(is_valid_val, __FILE__, __LINE__, (info)); \
} \
} while (0)
#define FT_THROW(info) throwRuntimeError(__FILE__, __LINE__, info)
#ifdef SPARSITY_ENABLED
#define CHECK_CUSPARSE(func) \
{ \
cusparseStatus_t status = (func); \
if (status != CUSPARSE_STATUS_SUCCESS) { \
throw std::runtime_error(std::string("[FT][ERROR] CUSPARSE API failed at line ") \
+ std::to_string(__LINE__) + " in file " + __FILE__ + ": " \
+ cusparseGetErrorString(status) + " " + std::to_string(status)); \
} \
}
#endif
/*************Time Handling**************/
class CudaTimer {
private:
cudaEvent_t event_start_;
cudaEvent_t event_stop_;
cudaStream_t stream_;
public:
explicit CudaTimer(cudaStream_t stream = 0)
{
stream_ = stream;
}
void start()
{
check_cuda_error(cudaEventCreate(&event_start_));
check_cuda_error(cudaEventCreate(&event_stop_));
check_cuda_error(cudaEventRecord(event_start_, stream_));
}
float stop()
{
float time;
check_cuda_error(cudaEventRecord(event_stop_, stream_));
check_cuda_error(cudaEventSynchronize(event_stop_));
check_cuda_error(cudaEventElapsedTime(&time, event_start_, event_stop_));
check_cuda_error(cudaEventDestroy(event_start_));
check_cuda_error(cudaEventDestroy(event_stop_));
return time;
}
~CudaTimer() {}
};
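// Illustrative sketch (not part of the original header): timing asynchronous work on a
// stream with the CudaTimer above. `Launch` stands for any callable that enqueues work
// (a kernel launch, a cublas call, ...) on the same stream; both names are placeholders.
template<typename Launch>
inline float exampleTimeOnStream(cudaStream_t stream, Launch&& launch)
{
    CudaTimer timer(stream);
    timer.start();
    launch();
    return timer.stop();  // elapsed milliseconds between the start and stop events
}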
static double diffTime(timeval start, timeval end)
{
return (end.tv_sec - start.tv_sec) * 1000 + (end.tv_usec - start.tv_usec) * 0.001;
}
/* ***************************** common utils ****************************** */
inline void print_mem_usage(std::string time = "after allocation")
{
size_t free_bytes, total_bytes;
check_cuda_error(cudaMemGetInfo(&free_bytes, &total_bytes));
float free = static_cast<float>(free_bytes) / 1024.0 / 1024.0 / 1024.0;
float total = static_cast<float>(total_bytes) / 1024.0 / 1024.0 / 1024.0;
float used = total - free;
printf("%-20s: free: %5.2f GB, total: %5.2f GB, used: %5.2f GB\n", time.c_str(), free, total, used);
}
inline int getSMVersion()
{
int device{-1};
check_cuda_error(cudaGetDevice(&device));
int sm_major = 0;
int sm_minor = 0;
check_cuda_error(cudaDeviceGetAttribute(&sm_major, cudaDevAttrComputeCapabilityMajor, device));
check_cuda_error(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));
return sm_major * 10 + sm_minor;
}
inline int getMaxSharedMemoryPerBlock()
{
int device{-1};
check_cuda_error(cudaGetDevice(&device));
int max_shared_memory_size = 0;
check_cuda_error(cudaDeviceGetAttribute(&max_shared_memory_size, cudaDevAttrMaxSharedMemoryPerBlock, device));
return max_shared_memory_size;
}
inline std::string getDeviceName()
{
int device{-1};
check_cuda_error(cudaGetDevice(&device));
cudaDeviceProp props;
check_cuda_error(cudaGetDeviceProperties(&props, device));
return std::string(props.name);
}
inline int div_up(int a, int n)
{
return (a + n - 1) / n;
}
cudaError_t getSetDevice(int i_device, int* o_device = NULL);
inline int getDevice()
{
int current_dev_id = 0;
check_cuda_error(cudaGetDevice(&current_dev_id));
return current_dev_id;
}
inline int getDeviceCount()
{
int count = 0;
check_cuda_error(cudaGetDeviceCount(&count));
return count;
}
template<typename T>
CublasDataType getCublasDataType()
{
if (std::is_same<T, half>::value) {
return HALF_DATATYPE;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
return BFLOAT16_DATATYPE;
}
#endif
else if (std::is_same<T, float>::value) {
return FLOAT_DATATYPE;
}
else {
FT_CHECK(false);
return FLOAT_DATATYPE;
}
}
template<typename T>
cudaDataType_t getCudaDataType()
{
if (std::is_same<T, half>::value) {
return CUDA_R_16F;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
return CUDA_R_16BF;
}
#endif
else if (std::is_same<T, float>::value) {
return CUDA_R_32F;
}
else {
FT_CHECK(false);
return CUDA_R_32F;
}
}
template<CublasDataType T>
struct getTypeFromCudaDataType {
using Type = float;
};
template<>
struct getTypeFromCudaDataType<HALF_DATATYPE> {
using Type = half;
};
#ifdef ENABLE_BF16
template<>
struct getTypeFromCudaDataType<BFLOAT16_DATATYPE> {
using Type = __nv_bfloat16;
};
#endif
FtCudaDataType getModelFileType(std::string ini_file, std::string section_name);
// clang-format off
template<typename T> struct packed_type;
template <> struct packed_type<float> { using type = float; }; // we don't need to pack float by default
template <> struct packed_type<half> { using type = half2; };
#ifdef ENABLE_BF16
template<>
struct packed_type<__nv_bfloat16> {
using type = __nv_bfloat162;
};
#endif
template<typename T> struct num_elems;
template <> struct num_elems<float> { static constexpr int value = 1; };
template <> struct num_elems<float2> { static constexpr int value = 2; };
template <> struct num_elems<float4> { static constexpr int value = 4; };
template <> struct num_elems<half> { static constexpr int value = 1; };
template <> struct num_elems<half2> { static constexpr int value = 2; };
#ifdef ENABLE_BF16
template <> struct num_elems<__nv_bfloat16> { static constexpr int value = 1; };
template <> struct num_elems<__nv_bfloat162> { static constexpr int value = 2; };
#endif
template<typename T, int num> struct packed_as;
template<typename T> struct packed_as<T, 1> { using type = T; };
template<> struct packed_as<half, 2> { using type = half2; };
template<> struct packed_as<float, 2> { using type = float2; };
template<> struct packed_as<int8_t, 2> { using type = int16_t; };
template<> struct packed_as<int32_t, 2> { using type = int2; };
template<> struct packed_as<half2, 1> { using type = half; };
#ifdef ENABLE_BF16
template<> struct packed_as<__nv_bfloat16, 2> { using type = __nv_bfloat162; };
template<> struct packed_as<__nv_bfloat162, 1> { using type = __nv_bfloat16; };
#endif
inline __device__ float2 operator*(float2 a, float2 b) { return make_float2(a.x * b.x, a.y * b.y); }
inline __device__ float2 operator*(float2 a, float b) { return make_float2(a.x * b, a.y * b); }
// clang-format on
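// Illustrative compile-time checks (not part of the original source) showing how the
// packing traits above compose; assumes <type_traits> is included.
//
//   static_assert(std::is_same<packed_type<half>::type, half2>::value, "half packs to half2");
//   static_assert(std::is_same<packed_as<float, 2>::type, float2>::value, "two floats pack to float2");
//   static_assert(num_elems<float4>::value == 4, "float4 holds 4 elements");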
template<typename T1, typename T2>
void compareTwoTensor(
const T1* pred, const T2* ref, const int size, const int print_size = 0, const std::string filename = "")
{
T1* h_pred = new T1[size];
T2* h_ref = new T2[size];
check_cuda_error(cudaMemcpy(h_pred, pred, size * sizeof(T1), cudaMemcpyDeviceToHost));
check_cuda_error(cudaMemcpy(h_ref, ref, size * sizeof(T2), cudaMemcpyDeviceToHost));
FILE* fd = nullptr;
if (filename != "") {
fd = fopen(filename.c_str(), "w");
fprintf(fd, "| %10s | %10s | %10s | %10s | \n", "pred", "ref", "abs_diff", "rel_diff(%)");
}
if (print_size > 0) {
FT_LOG_INFO(" id | pred | ref |abs diff | rel diff (%) |");
}
float mean_abs_diff = 0.0f;
float mean_rel_diff = 0.0f;
int count = 0;
for (int i = 0; i < size; i++) {
if (i < print_size) {
FT_LOG_INFO("%4d | % 6.4f | % 6.4f | % 6.4f | % 7.4f |",
i,
(float)h_pred[i],
(float)h_ref[i],
abs((float)h_pred[i] - (float)h_ref[i]),
abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f);
}
if ((float)h_pred[i] == 0) {
continue;
}
count += 1;
mean_abs_diff += abs((float)h_pred[i] - (float)h_ref[i]);
mean_rel_diff += abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f;
if (fd != nullptr) {
fprintf(fd,
"| %10.5f | %10.5f | %10.5f | %11.5f |\n",
(float)h_pred[i],
(float)h_ref[i],
abs((float)h_pred[i] - (float)h_ref[i]),
abs((float)h_pred[i] - (float)h_ref[i]) / (abs((float)h_ref[i]) + 1e-6f) * 100.f);
}
}
mean_abs_diff = mean_abs_diff / (float)count;
mean_rel_diff = mean_rel_diff / (float)count;
FT_LOG_INFO("mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff);
if (fd != nullptr) {
fprintf(fd, "mean_abs_diff: % 6.4f, mean_rel_diff: % 6.4f (%%)", mean_abs_diff, mean_rel_diff);
fclose(fd);
}
delete[] h_pred;
delete[] h_ref;
}
/* ************************** end of common utils ************************** */
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "custom_ar_comm.h"
namespace fastertransformer {
template<typename T>
CustomAllReduceComm<T>::CustomAllReduceComm(size_t rank_size, size_t rank): rank_size_(rank_size), rank_(rank)
{
param_.barrier_flag = 0;
// NOTE: assume All Reduce happens within the node (DGX A100)
param_.rank = rank_;
param_.local_rank = rank_;
param_.node_id = 0;
}
template<typename T>
CustomAllReduceComm<T>::~CustomAllReduceComm()
{
cudaPointerAttributes comm_buffer_attributes, barrier_attributes;
check_cuda_error(cudaPointerGetAttributes(&comm_buffer_attributes, param_.peer_comm_buffer_ptrs[rank_]));
check_cuda_error(cudaPointerGetAttributes(&barrier_attributes, param_.peer_barrier_ptrs[rank_]));
if (comm_buffer_attributes.type == 2) {
check_cuda_error(cudaFree(param_.peer_comm_buffer_ptrs[rank_]));
}
if (barrier_attributes.type == 2) {
check_cuda_error(cudaFree(param_.peer_barrier_ptrs[rank_]));
}
}
template<typename T>
void CustomAllReduceComm<T>::customAllReduce(size_t elts, cudaStream_t stream)
{
param_.elts_total = elts;
param_.barrier_flag = FLAG(param_.barrier_flag + 1);
invokeOneOrTwoShotAllReduceKernel<T>(param_, stream);
// swap back
output_tensor_->at(0).data = (const void*)tmp_tensor_data_;
}
template<typename T>
void CustomAllReduceComm<T>::allocateAndExchangePeerAccessPointer(
std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms)
{
assert(custom_all_reduce_comms->size() == rank_size_);
assert(rank_ == 0);
// Enable Peer to Peer Access
enableP2P(rank_size_);
for (size_t i = 0; i < rank_size_; i++) {
check_cuda_error(cudaSetDevice(i));
check_cuda_error(cudaMalloc(&(param_.peer_comm_buffer_ptrs[i]), CUSTOM_AR_SIZE_THRESHOLD));
check_cuda_error(
cudaMalloc(&(param_.peer_barrier_ptrs[i]), rank_size_ * (MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t)));
check_cuda_error(
cudaMemset(param_.peer_barrier_ptrs[i], 0, rank_size_ * (MAX_ALL_REDUCE_BLOCKS + 1) * sizeof(uint32_t)));
T* current_peer_comm_buffer_ptr = param_.peer_comm_buffer_ptrs[i];
uint32_t* current_peer_barrier_ptr = param_.peer_barrier_ptrs[i];
        // Assume the current comm (rank_ == 0) allocates device memory for all ranks.
for (size_t j = 1; j < rank_size_; j++) {
static_cast<CustomAllReduceComm<T>*>(custom_all_reduce_comms->at(j).get())
->param_.peer_comm_buffer_ptrs[i] = current_peer_comm_buffer_ptr;
static_cast<CustomAllReduceComm<T>*>(custom_all_reduce_comms->at(j).get())->param_.peer_barrier_ptrs[i] =
current_peer_barrier_ptr;
}
}
// Set default local_output_buffer_ptr to local peer_comm_buffer_ptrs
for (size_t i = 0; i < rank_size_; i++) {
static_cast<CustomAllReduceComm<T>*>(custom_all_reduce_comms->at(i).get())->param_.local_output_buffer_ptr =
static_cast<CustomAllReduceComm<T>*>(custom_all_reduce_comms->at(i).get())->param_.peer_comm_buffer_ptrs[i];
}
}
template<typename T>
void CustomAllReduceComm<T>::enableP2P(int ngpus)
{
int peer_access_available = 0;
for (int i = 0; i < ngpus; i++) {
cudaSetDevice(i);
for (int j = 0; j < ngpus; j++) {
if (i == j) {
continue;
}
cudaDeviceCanAccessPeer(&peer_access_available, i, j);
// Custom AR Kernels need DGX A100 NVSWITCH connections
assert(peer_access_available);
cudaDeviceEnablePeerAccess(j, 0);
}
}
}
template<typename T>
bool CustomAllReduceComm<T>::swapInternalBuffer(std::vector<Tensor>* tensor_buffer, size_t elts)
{
    // Check whether the all-reduce element count meets the requirements of the custom kernels.
    // If it does, swap the local comm buffer pointer with the output tensor data pointer
    // to avoid an additional memory copy.
if (rank_size_ > 1 && elts * sizeof(T) <= CUSTOM_AR_SIZE_THRESHOLD) {
tmp_tensor_data_ = (T*)(tensor_buffer->at(0).data);
output_tensor_ = tensor_buffer;
tensor_buffer->at(0).data = param_.peer_comm_buffer_ptrs[rank_];
param_.local_output_buffer_ptr = tmp_tensor_data_;
return true;
}
return false;
}
template<typename T>
void initCustomAllReduceComm(std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms,
int enable_custom_all_reduce,
size_t rank_size)
{
if (enable_custom_all_reduce == 0) {
// don't use custom all reduce kernels, fall back to NCCL
for (size_t i = 0; i < rank_size; i++) {
custom_all_reduce_comms->push_back(nullptr);
}
return;
}
if (rank_size != RANKS_PER_NODE) {
#ifdef BUILD_MULTI_GPU
if (rank_size > 1) {
FT_LOG_WARNING("Custom All Reduce only supports 8 Ranks currently. Using NCCL as Comm.");
}
#else
FT_CHECK_WITH_INFO(rank_size == 1,
fmtstr("Custom All Reduce only supports 8 Ranks currently, got rank_size %ld. FT needs "
"the NCCL library to communicate among devices but has built without NCCL. "
"Please use the flag -DBUILD_MULTI_GPU=ON when compiling.",
rank_size));
#endif
for (size_t i = 0; i < rank_size; i++) {
custom_all_reduce_comms->push_back(nullptr);
}
return;
}
#if defined(CUDART_VERSION) && CUDART_VERSION >= 11020
for (size_t i = 0; i < rank_size; i++) {
custom_all_reduce_comms->push_back(std::make_shared<CustomAllReduceComm<T>>(rank_size, i));
}
custom_all_reduce_comms->at(0)->allocateAndExchangePeerAccessPointer(custom_all_reduce_comms);
#else
FT_LOG_WARNING("Custom All Reduce is not supported before CUDA 11.2. Using NCCL as Comm.");
for (size_t i = 0; i < rank_size; i++) {
custom_all_reduce_comms->push_back(nullptr);
}
#endif
}
// Template instantiation
template class CustomAllReduceComm<uint16_t>;
#ifdef ENABLE_BF16
template class CustomAllReduceComm<__nv_bfloat16>;
#endif
template class CustomAllReduceComm<uint32_t>;
template void
initCustomAllReduceComm<uint16_t>(std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms,
int enable_custom_all_reduce,
size_t rank_size);
#ifdef ENABLE_BF16
template void
initCustomAllReduceComm<__nv_bfloat16>(std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms,
int enable_custom_all_reduce,
size_t rank_size);
#endif
template void
initCustomAllReduceComm<uint32_t>(std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms,
int enable_custom_all_reduce,
size_t rank_size);
} // namespace fastertransformer
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <memory>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "src/fastertransformer/kernels/custom_ar_kernels.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/logger.h"
namespace fastertransformer {
class AbstractCustomComm {
public:
AbstractCustomComm() = default;
virtual ~AbstractCustomComm() = default;
virtual void customAllReduce(size_t elts, cudaStream_t stream) = 0;
virtual void enableP2P(int ngpus) = 0;
virtual bool swapInternalBuffer(std::vector<Tensor>* tensor_buffer, size_t elts) = 0;
virtual void
allocateAndExchangePeerAccessPointer(std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms) = 0;
};
template<typename T>
class CustomAllReduceComm: public AbstractCustomComm {
public:
CustomAllReduceComm(size_t rank_size, size_t rank);
~CustomAllReduceComm();
    void customAllReduce(size_t elts, cudaStream_t stream) override;
void allocateAndExchangePeerAccessPointer(
std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms) override;
bool swapInternalBuffer(std::vector<Tensor>* tensor_buffer, size_t elts) override;
void enableP2P(int ngpus) override;
private:
AllReduceParams<T> param_;
std::vector<Tensor>* output_tensor_;
T* tmp_tensor_data_;
size_t rank_size_;
size_t rank_;
};
template<typename T>
void initCustomAllReduceComm(std::vector<std::shared_ptr<AbstractCustomComm>>* custom_all_reduce_comms,
int enable_custom_all_reduce,
size_t rank_size);
template<typename T>
struct CustomARCommTypeConverter {
using Type = uint32_t;
};
template<>
struct CustomARCommTypeConverter<half> {
using Type = uint16_t;
};
#ifdef ENABLE_BF16
template<>
struct CustomARCommTypeConverter<__nv_bfloat16> {
using Type = __nv_bfloat16;
};
#endif
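// Illustrative setup sketch (not part of the original source): CustomARCommTypeConverter
// maps the model compute type to the communication element type used by
// CustomAllReduceComm (half -> uint16_t, other types -> uint32_t by default).
// `tp_size`, `rank`, `tensors`, `elts` and `stream` are assumed to exist elsewhere.
//
//   using CommT = CustomARCommTypeConverter<half>::Type;  // uint16_t
//   std::vector<std::shared_ptr<AbstractCustomComm>> comms;
//   initCustomAllReduceComm<CommT>(&comms, /*enable_custom_all_reduce=*/1, tp_size);
//   // Per rank: swap the output buffer in, run the kernel; buffers are swapped back internally.
//   if (comms[rank] != nullptr && comms[rank]->swapInternalBuffer(&tensors, elts)) {
//       comms[rank]->customAllReduce(elts, stream);
//   }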
} // namespace fastertransformer
\ No newline at end of file
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/gemm.h"
namespace fastertransformer {
/* ***************************** GEMM Impl ******************************** */
Gemm::Gemm(IAllocator* allocator, cudaStream_t stream, std::string config_file)
{
allocator_ = allocator;
stream_ = stream;
mutex_ = new std::mutex(); // mutex per process
check_cuda_error(cublasCreate(&cublas_handle_));
check_cuda_error(cublasLtCreate(&cublaslt_handle_));
check_cuda_error(cublasSetStream(cublas_handle_, stream));
if (allocator_ != nullptr) {
workspace_ = allocator_->reMalloc(workspace_, WORKSPACE_SIZE);
}
loadGemmConfig(config_file);
}
Gemm::~Gemm()
{
if (allocator_ != nullptr) {
allocator_->free((void**)(&workspace_));
allocator_ = nullptr;
}
cublasLtDestroy(cublaslt_handle_);
cublasDestroy(cublas_handle_);
delete cublas_algo_map_;
delete mutex_;
}
std::string Gemm::toString()
{
const char* a_type_str = a_type_ == TYPE_FP16 ? "FP16" : "FP32";
const char* b_type_str = b_type_ == TYPE_FP16 ? "FP16" : "FP32";
const char* c_type_str = c_type_ == TYPE_FP16 ? "FP16" : "FP32";
const char* compute_type_str = compute_type_ == TYPE_FP16 ? "FP16" : "FP32";
return fmtstr(
"Gemm[a_type=%s, b_type=%s, c_type=%s, compute_type=%s]", a_type_str, b_type_str, c_type_str, compute_type_str);
}
void Gemm::setAllocator(IAllocator* allocator)
{
if (allocator_ != nullptr && workspace_ != nullptr) {
allocator_->free((void**)(&workspace_));
}
allocator_ = allocator;
if (allocator_ != nullptr) {
workspace_ = allocator_->reMalloc(workspace_, WORKSPACE_SIZE);
}
}
void Gemm::setCudaStream(cudaStream_t& stream)
{
stream_ = stream;
cublasSetStream(cublas_handle_, stream);
}
void Gemm::setComputeType(DataType compute_type)
{
checkDataTypeValidity(compute_type);
compute_type_ = compute_type;
}
void Gemm::setTypes(DataType a_type, DataType b_type, DataType c_type, DataType compute_type)
{
checkDataTypeValidity(a_type);
checkDataTypeValidity(b_type);
checkDataTypeValidity(c_type);
a_type_ = a_type;
b_type_ = b_type;
c_type_ = c_type;
setComputeType(compute_type);
}
template<typename T>
void Gemm::setDefaultTypes()
{
if (std::is_same<T, float>::value) {
setTypes(TYPE_FP32, TYPE_FP32, TYPE_FP32, TYPE_FP32);
}
else if (std::is_same<T, half>::value) {
setTypes(TYPE_FP16, TYPE_FP16, TYPE_FP16, TYPE_FP16);
}
else {
throw GemmNotSupportedException("Gemm supports float or half type.");
}
}
void Gemm::loadGemmConfig(std::string config_file)
{
if (cublas_algo_map_ != nullptr) {
delete cublas_algo_map_; // unload the previous cublas map.
}
cublas_algo_map_ = new cublasAlgoMap(config_file);
}
void Gemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<float>& weight,
void* output,
const float alpha,
const float beta)
{
gemm(transa,
transb,
m,
n,
k,
input,
a_type_,
(transa == GEMM_OP_N) ? k : m,
(const void*)weight.kernel,
b_type_,
(transb == GEMM_OP_N) ? n : k,
output,
c_type_,
n,
alpha,
beta);
}
void Gemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<half>& weight,
void* output,
const float alpha,
const float beta)
{
gemm(transa,
transb,
m,
n,
k,
input,
a_type_,
(transa == GEMM_OP_N) ? k : m,
(const void*)weight.kernel,
b_type_,
(transb == GEMM_OP_N) ? n : k,
output,
c_type_,
n,
alpha,
beta);
}
void Gemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const void* B,
void* C,
const float alpha,
const float beta)
{
size_t lda = (transa == GEMM_OP_N) ? k : m;
size_t ldb = (transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
gemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, alpha, beta);
}
void Gemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const size_t lda,
const void* B,
const size_t ldb,
void* C,
const size_t ldc,
const float alpha,
const float beta)
{
gemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, alpha, beta);
}
void Gemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const DataType Atype,
const size_t lda,
const void* B,
const DataType Btype,
const size_t ldb,
void* C,
const DataType Ctype,
const size_t ldc,
const float alpha,
const float beta)
{
FT_LOG_TRACE("Gemm::gemm [m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", m, n, k, lda, ldb, ldc);
// Implementation copied from cublasMMWrapper::Gemm
    // Switch A and B since both cublas and cublasLt assume a column-major layout,
    // while A and B are given in a row-major layout.
const void* a_data_ptr = B;
const void* b_data_ptr = A;
cublasOperation_t a_op = getCublasOperation(transb);
cublasOperation_t b_op = getCublasOperation(transa);
cudaDataType_t a_type = getCublasDataType(Btype);
cudaDataType_t b_type = getCublasDataType(Atype);
cudaDataType_t c_type = getCublasDataType(Ctype);
// swap m and n
const size_t _m = n;
const size_t _n = m;
// swap lda and ldb;
const size_t _lda = ldb;
const size_t _ldb = lda;
mutex_->lock();
// Use cublas as default in FP32 and cublasLt as default in FP16
bool is_fp16_compute_type = compute_type_ == TYPE_FP16;
bool using_cublasLt = Atype == TYPE_FP16;
int batch_count = 1;
half h_alpha = (half)alpha;
half h_beta = (half)beta;
const void* alpha_ptr =
is_fp16_compute_type ? reinterpret_cast<const void*>(&h_alpha) : reinterpret_cast<const void*>(&alpha);
const void* beta_ptr =
is_fp16_compute_type ? reinterpret_cast<const void*>(&h_beta) : reinterpret_cast<const void*>(&beta);
// TODO: unify CUBLAS_DATA_TYPE and DataType.
int findAlgo =
cublas_algo_map_->isExist(batch_count, _m, _n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE);
cublasLtMatmulAlgo_info info =
cublas_algo_map_->getAlgo(batch_count, _m, _n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE);
if (findAlgo) {
using_cublasLt = (info.stages != -1);
}
if (using_cublasLt) {
const size_t a_rows = (a_op == getCublasOperation(GEMM_OP_N)) ? _m : k;
const size_t a_cols = (a_op == getCublasOperation(GEMM_OP_N)) ? k : _m;
const size_t b_rows = (b_op == getCublasOperation(GEMM_OP_N)) ? k : _n;
const size_t b_cols = (b_op == getCublasOperation(GEMM_OP_N)) ? _n : k;
cublasLtMatmulDesc_t matmul_desc = NULL;
cublasLtMatrixLayout_t a_desc = NULL, b_desc = NULL, c_desc = NULL;
cudaDataType_t scale_type = getCublasDataType(compute_type_);
auto compute_type = getCublasComputeType(compute_type_);
// --------------------------------------
// Create descriptors for the original matrices
cublasLtMatrixLayoutCreate(&a_desc, a_type, a_rows, a_cols, _lda);
cublasLtMatrixLayoutCreate(&b_desc, b_type, b_rows, b_cols, _ldb);
cublasLtMatrixLayoutCreate(&c_desc, c_type, _m, _n, ldc);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulDescCreate(&matmul_desc, compute_type, scale_type);
#else
cublasLtMatmulDescCreate(&matmul_desc, compute_type);
#endif
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSA, &a_op, sizeof(cublasOperation_t));
cublasLtMatmulDescSetAttribute(matmul_desc, CUBLASLT_MATMUL_DESC_TRANSB, &b_op, sizeof(cublasOperation_t));
cublasLtMatmulAlgo_t algo;
void* workspace = workspace_;
int workspace_size = workspace_ == nullptr ? 0 : CUBLAS_WORKSPACE_SIZE;
if (findAlgo) {
if (info.workspaceSize > workspace_size) {
findAlgo = 0;
}
else {
cublasLtMatmulAlgoInit(
cublaslt_handle_, compute_type, scale_type, a_type, b_type, c_type, c_type, info.algoId, &algo);
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &(info.customOption), sizeof(info.customOption));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(info.tile), sizeof(info.tile));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &(info.splitK_val), sizeof(info.splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(info.swizzle), sizeof(info.swizzle));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &(info.reductionScheme), sizeof(int));
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(info.stages), sizeof(info.stages));
#endif
}
}
cublasLtMatmul(cublaslt_handle_,
matmul_desc,
alpha_ptr,
a_data_ptr,
a_desc,
b_data_ptr,
b_desc,
beta_ptr,
C,
c_desc,
C,
c_desc,
(findAlgo == 1 ? (&algo) : NULL),
workspace,
workspace_size,
stream_);
cublasLtMatmulDescDestroy(matmul_desc);
cublasLtMatrixLayoutDestroy(a_desc);
cublasLtMatrixLayoutDestroy(b_desc);
cublasLtMatrixLayoutDestroy(c_desc);
sync_check_cuda_error();
}
else {
cudaDataType_t compute_type = getCublasDataType(compute_type_);
int cublas_algo = info.algoId;
check_cuda_error(cublasGemmEx(cublas_handle_,
a_op,
b_op,
_m,
_n,
k,
alpha_ptr,
a_data_ptr,
a_type,
_lda,
b_data_ptr,
b_type,
_ldb,
beta_ptr,
C,
c_type,
ldc,
compute_type,
static_cast<cublasGemmAlgo_t>(cublas_algo)));
sync_check_cuda_error();
}
mutex_->unlock();
}
void Gemm::batchedGemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* const* A,
const void* const* B,
void* const* C,
const size_t batch_size,
const float alpha,
const float beta)
{
size_t lda = (transa == GEMM_OP_N) ? k : m;
size_t ldb = (transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
batchedGemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, batch_size, alpha, beta);
}
void Gemm::batchedGemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* const* A,
const size_t lda,
const void* const* B,
const size_t ldb,
void* const* C,
const size_t ldc,
const size_t batch_size,
const float alpha,
const float beta)
{
batchedGemm(transa, transb, m, n, k, A, a_type_, lda, B, b_type_, ldb, C, c_type_, ldc, batch_size, alpha, beta);
}
void Gemm::batchedGemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* const* A,
const DataType Atype,
const size_t lda,
const void* const* B,
const DataType Btype,
const size_t ldb,
void* const* C,
const DataType Ctype,
const size_t ldc,
const size_t batch_size,
const float alpha,
const float beta)
{
FT_LOG_TRACE(
"Gemm::batchedGemm [b=%ld m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", batch_size, m, n, k, lda, ldb, ldc);
// Switch A and B.
const void* const* a_data_ptr = B;
const void* const* b_data_ptr = A;
cublasOperation_t a_op = getCublasOperation(transb);
cublasOperation_t b_op = getCublasOperation(transa);
cudaDataType_t a_type = getCublasDataType(Btype);
cudaDataType_t b_type = getCublasDataType(Atype);
cudaDataType_t c_type = getCublasDataType(Ctype);
// swap m and n, lda and ldb
const size_t _m = n;
const size_t _n = m;
const size_t _lda = ldb;
const size_t _ldb = lda;
half h_alpha = (half)alpha;
half h_beta = (half)beta;
mutex_->lock();
bool is_fp16_compute_type = compute_type_ == TYPE_FP16;
const void* alpha_ptr =
is_fp16_compute_type ? reinterpret_cast<const void*>(&h_alpha) : reinterpret_cast<const void*>(&alpha);
const void* beta_ptr =
is_fp16_compute_type ? reinterpret_cast<const void*>(&h_beta) : reinterpret_cast<const void*>(&beta);
cublasLtMatmulAlgo_info info =
cublas_algo_map_->getAlgo(batch_size, m, n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE);
check_cuda_error(cublasGemmBatchedEx(cublas_handle_,
a_op,
b_op,
_m,
_n,
k,
alpha_ptr,
a_data_ptr,
a_type,
_lda,
b_data_ptr,
b_type,
_ldb,
beta_ptr,
C,
c_type,
ldc,
batch_size,
getCublasComputeType(compute_type_),
static_cast<cublasGemmAlgo_t>(info.algoId)));
mutex_->unlock();
}
void Gemm::stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const void* B,
void* C,
const size_t batch_size,
const float alpha,
const float beta)
{
size_t lda = (transa == GEMM_OP_N) ? k : m;
size_t ldb = (transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
int64_t stridea = m * k;
int64_t strideb = k * n;
int64_t stridec = m * n;
stridedBatchedGemm(transa,
transb,
m,
n,
k,
A,
a_type_,
lda,
stridea,
B,
b_type_,
ldb,
strideb,
C,
c_type_,
ldc,
stridec,
batch_size,
compute_type_,
alpha,
beta);
}
void Gemm::stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const int64_t strideA,
const void* B,
const int64_t strideB,
void* C,
const int64_t strideC,
const size_t batch_size,
const float alpha,
const float beta)
{
size_t lda = (transa == GEMM_OP_N) ? k : m;
size_t ldb = (transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
stridedBatchedGemm(transa,
transb,
m,
n,
k,
A,
a_type_,
lda,
strideA,
B,
b_type_,
ldb,
strideB,
C,
c_type_,
ldc,
strideC,
batch_size,
compute_type_,
alpha,
beta);
}
void Gemm::stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const size_t lda,
const int64_t strideA,
const void* B,
const size_t ldb,
const int64_t strideB,
void* C,
const size_t ldc,
const int64_t strideC,
const size_t batch_size,
const float alpha,
const float beta)
{
stridedBatchedGemm(transa,
transb,
m,
n,
k,
A,
a_type_,
lda,
strideA,
B,
b_type_,
ldb,
strideB,
C,
c_type_,
ldc,
strideC,
batch_size,
compute_type_,
alpha,
beta);
}
void Gemm::stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
DataType Atype,
const size_t lda,
const int64_t strideA,
const void* B,
DataType Btype,
const size_t ldb,
const int64_t strideB,
void* C,
DataType Ctype,
const size_t ldc,
const int64_t strideC,
const size_t batch_size,
DataType compute_type,
const float alpha,
const float beta)
{
FT_LOG_TRACE("Gemm::stridedBatchedGemm [b=%ld, m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]",
batch_size,
m,
n,
k,
lda,
ldb,
ldc);
// Switch A and B.
const void* a_data_ptr = B;
const void* b_data_ptr = A;
cublasOperation_t a_op = getCublasOperation(transb);
cublasOperation_t b_op = getCublasOperation(transa);
cudaDataType_t a_type = getCublasDataType(Btype);
cudaDataType_t b_type = getCublasDataType(Atype);
cudaDataType_t c_type = getCublasDataType(Ctype);
// swap m and n, lda and ldb, stride A and B
const size_t _m = n;
const size_t _n = m;
const size_t _lda = ldb;
const size_t _ldb = lda;
const int64_t _stridea = strideB;
const int64_t _strideb = strideA;
half h_alpha = (half)alpha;
half h_beta = (half)beta;
mutex_->lock();
bool is_fp16_compute_type = compute_type_ == TYPE_FP16;
const void* alpha_ptr =
is_fp16_compute_type ? reinterpret_cast<const void*>(&h_alpha) : reinterpret_cast<const void*>(&alpha);
const void* beta_ptr =
is_fp16_compute_type ? reinterpret_cast<const void*>(&h_beta) : reinterpret_cast<const void*>(&beta);
cublasLtMatmulAlgo_info info =
cublas_algo_map_->getAlgo(batch_size, m, n, k, (a_type == CUDA_R_16F) ? HALF_DATATYPE : FLOAT_DATATYPE);
check_cuda_error(cublasGemmStridedBatchedEx(cublas_handle_,
a_op,
b_op,
_m,
_n,
k,
alpha_ptr,
a_data_ptr,
a_type,
_lda,
_stridea,
b_data_ptr,
b_type,
_ldb,
_strideb,
beta_ptr,
C,
c_type,
ldc,
strideC,
batch_size,
getCublasComputeType(compute_type),
static_cast<cublasGemmAlgo_t>(info.algoId)));
mutex_->unlock();
}
void Gemm::checkDataTypeValidity(const DataType& type)
{
if (type != TYPE_FP32 && type != TYPE_FP16) {
throw GemmNotSupportedException("Gemm supports TYPE_FP16 or TYPE_FP32");
}
}
/* ************************* End of GEMM Impl **************************** */
// void Int8Gemm::gemm(Tensor& C,
// const GemmOp transa,
// const GemmOp transb,
// const Tensor& A,
// const Tensor& B,
// const float alpha,
// const float beta)
// {
// }
/* ************************* SpGEMM Impl *********************************** */
#ifdef SPARSITY_ENABLED
SpGemm::SpGemm(IAllocator* allocator, cudaStream_t stream, std::string config_file, std::string spconfig_file):
Gemm(allocator, stream, config_file)
{
CHECK_CUSPARSE(cusparseLtInit(&cusparselt_handle_));
// TODO(jaedeokk):
    //   Let's make cublasAlgoMap load the gemm/spgemm configs separately,
// allowing us to inherit Gemm's constructor.
// cublas_algo_map_.loadSpGemmConfig(spconfig_file); // enable this line later.
a_type_ = TYPE_FP16;
b_type_ = TYPE_FP16;
c_type_ = TYPE_FP16;
compute_type_ = TYPE_FP16;
}
SpGemm::~SpGemm()
{
cusparseLtDestroy(&cusparselt_handle_);
    // Destroy the cached matrix descriptors.
for (auto& kv : a_desc_map_) { // kv = (mark, a_desc)
cusparseLtMatDescriptorDestroy(&a_desc_map_[kv.first]);
}
for (auto& kv : b_desc_map_) { // kv = (mark, b_desc)
cusparseLtMatDescriptorDestroy(&b_desc_map_[kv.first]);
}
for (auto& kv : c_desc_map_) { // kv = (mark, c_desc)
cusparseLtMatDescriptorDestroy(&c_desc_map_[kv.first]);
}
}
std::string SpGemm::toString()
{
const char* a_type_str = a_type_ == TYPE_FP16 ? "FP16" : "FP32";
const char* b_type_str = b_type_ == TYPE_FP16 ? "FP16" : "FP32";
const char* c_type_str = c_type_ == TYPE_FP16 ? "FP16" : "FP32";
const char* compute_type_str = compute_type_ == TYPE_FP16 ? "FP16" : "FP32";
return fmtstr("SpGemm[a_type=%s, b_type=%s, c_type=%s, compute_type=%s]",
a_type_str,
b_type_str,
c_type_str,
compute_type_str);
}
void SpGemm::loadGemmConfig(std::string config_file, std::string spconfig_file)
{
if (cublas_algo_map_ != nullptr) {
delete cublas_algo_map_; // unload algo map.
}
cublas_algo_map_ = new cublasAlgoMap(config_file, spconfig_file);
}
void SpGemm::checkDataTypeValidity(const DataType& type)
{
if (type != TYPE_FP16) {
throw GemmNotSupportedException("Sparse GEMM only supports FP16 data type now.");
}
}
bool SpGemm::useBaseGemm(size_t batch_size, size_t m, size_t n, size_t k)
{
return !cublas_algo_map_->isUseSparse(batch_size, m, n, k);
}
// Temporary gemm helper method that uses template parameter T.
template<typename T>
void SpGemm::weightGemmHelper(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<T>& weight,
void* output,
const float alpha,
const float beta)
{
size_t lda = (transa == GEMM_OP_N) ? k : m;
size_t ldb = (transb == GEMM_OP_N) ? n : k;
size_t ldc = n;
if (useBaseGemm(1, m, n, k) || weight.sp_kernel == nullptr) {
Gemm::gemm(transa,
transb,
m,
n,
k,
input,
a_type_,
lda,
(const void*)weight.kernel,
b_type_,
ldb,
output,
c_type_,
ldc,
alpha,
beta);
}
else {
gemm(transa,
transb,
m,
n,
k,
input,
a_type_,
lda,
(const void*)weight.sp_kernel,
b_type_,
ldb,
output,
c_type_,
ldc,
alpha,
beta);
}
}
void SpGemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<float>& weight,
void* output,
const float alpha,
const float beta)
{
weightGemmHelper<float>(transa, transb, m, n, k, input, weight, output, alpha, beta);
}
void SpGemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<half>& weight,
void* output,
const float alpha,
const float beta)
{
weightGemmHelper<half>(transa, transb, m, n, k, input, weight, output, alpha, beta);
}
void SpGemm::gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const DataType Atype,
const size_t lda,
const void* B,
const DataType Btype,
const size_t ldb,
void* C,
const DataType Ctype,
const size_t ldc,
const float alpha,
const float beta)
{
FT_LOG_TRACE("SpGemm::gemm [m=%ld, n=%ld, k=%ld, lda=%ld, ldb=%ld, ldc=%ld]", m, n, k, lda, ldb, ldc);
checkDataTypeValidity(Atype);
checkDataTypeValidity(Btype);
checkDataTypeValidity(Ctype);
checkDataTypeValidity(compute_type_);
if (useBaseGemm(1, m, n, k)) {
// Compute by the base GEMM.
Gemm::gemm(transa, transb, m, n, k, A, Atype, lda, B, Btype, ldb, C, Ctype, ldc, alpha, beta);
return;
}
// Switch A/B due to column major layout in computation.
    // The typical use case of the Gemm family is to compute Y = X * W, where X is an
    // input tensor and W is a kernel weight. Compression takes a long time, so only
    // the kernel weight (which is fixed at inference time) can be sparse. Using B as
    // the sparse operand is unfortunately not stable (e.g. caching matrix descriptors
    // does not work correctly). Thus, SpGemm uses a column-major layout in computation,
    // evaluating C^T = B^T * A^T, so that the kernel weight "B" comes first.
const void* a_data = B;
const void* b_data = A;
cusparseOrder_t order = CUSPARSE_ORDER_COL;
cusparseOperation_t opA = getCusparseOperation(transb);
cusparseOperation_t opB = getCusparseOperation(transa);
cudaDataType_t a_type = getCublasDataType(Btype);
cudaDataType_t b_type = getCublasDataType(Atype);
cudaDataType_t c_type = getCublasDataType(Ctype);
const size_t _m = n;
const size_t _n = m;
const size_t _lda = ldb;
const size_t _ldb = lda;
const size_t a_rows = (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) ? _m : k;
const size_t a_cols = (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) ? k : _m;
const size_t b_rows = (opB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? k : _n;
const size_t b_cols = (opB == CUSPARSE_OPERATION_NON_TRANSPOSE) ? _n : k;
const size_t c_rows = _m;
const size_t c_cols = _n;
const unsigned alignment = 16;
cusparseComputeType compute_type = getCusparseComputeType(compute_type_);
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulAlgSelection_t alg_sel;
cusparseLtMatmulPlan_t plan;
char mark[256];
sprintf(mark, "%d_%ld_%ld_%ld_%s_%s", 1, m, n, k, getGemmOpString(transb).c_str(), getGemmOpString(transa).c_str());
if (a_desc_map_.find(mark) != a_desc_map_.end()) {
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_,
&matmul,
opA,
opB,
&a_desc_map_[mark],
&b_desc_map_[mark],
&c_desc_map_[mark],
&c_desc_map_[mark],
compute_type));
}
else {
// initializing MatDesc takes a lot of time
cusparseLtMatDescriptor_t a_desc, b_desc, c_desc;
a_desc_map_[mark] = a_desc;
b_desc_map_[mark] = b_desc;
c_desc_map_[mark] = c_desc;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
&a_desc_map_[mark],
a_rows,
a_cols,
_lda,
alignment,
a_type,
order,
CUSPARSELT_SPARSITY_50_PERCENT));
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(
&cusparselt_handle_, &b_desc_map_[mark], b_rows, b_cols, _ldb, alignment, b_type, order));
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(
&cusparselt_handle_, &c_desc_map_[mark], c_rows, c_cols, ldc, alignment, c_type, order));
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(&cusparselt_handle_,
&matmul,
opA,
opB,
&a_desc_map_[mark],
&b_desc_map_[mark],
&c_desc_map_[mark],
&c_desc_map_[mark],
compute_type));
}
mutex_->lock();
CHECK_CUSPARSE(
cusparseLtMatmulAlgSelectionInit(&cusparselt_handle_, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT));
int alg = cublas_algo_map_->getSpAlgo(1, a_rows, b_cols, a_cols);
CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
&cusparselt_handle_, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg)));
size_t workspace_size;
CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&cusparselt_handle_, &alg_sel, &workspace_size));
CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&cusparselt_handle_, &plan, &matmul, &alg_sel, workspace_size));
void* d_workspace = nullptr; // Can we use the workspace of the class?
int num_streams = 1;
cudaStream_t streams[1] = {stream_};
CHECK_CUSPARSE(cusparseLtMatmul(
&cusparselt_handle_, &plan, &alpha, a_data, b_data, &beta, C, C, d_workspace, streams, num_streams))
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
mutex_->unlock();
sync_check_cuda_error();
}
#endif
/* ************************* End of SpGEMM Impl ************************** */
/* ***************************** GEMM utils ****************************** */
std::shared_ptr<Gemm> createGemm(IAllocator* allocator, cudaStream_t stream, bool sparse, bool quantized)
{
FT_LOG_TRACE(
"Create Gemm instance [sparse=%s, quantized=%s]", sparse ? "true" : "false", quantized ? "true" : "false");
std::shared_ptr<Gemm> gemm;
if (!sparse) {
if (!quantized) {
gemm = std::make_shared<Gemm>(allocator, stream);
}
else {
throw GemmNotSupportedException("Int8 Gemm is not supported yet");
}
}
else {
#ifdef SPARSITY_ENABLED
if (sparse && !quantized) {
gemm = std::make_shared<SpGemm>(allocator, stream);
}
else {
throw GemmNotSupportedException("Int8 Sparse Gemm is not supported yet");
}
#else
        throw GemmNotSupportedException("Sparsity support is not enabled. To enable sparsity, "
                                        "please provide the `-DSPARSITY_SUPPORT` flag when compiling.");
#endif
}
return gemm;
}
cudaDataType_t getCublasDataType(DataType dtype)
{
switch (dtype) {
case TYPE_FP16:
return CUDA_R_16F;
case TYPE_FP32:
return CUDA_R_32F;
default:
throw GemmNotSupportedException("Not supported data type.");
}
}
#if (CUDART_VERSION >= 11000)
cublasComputeType_t getCublasComputeType(DataType ctype)
{
switch (ctype) {
case TYPE_FP16:
return CUBLAS_COMPUTE_16F;
case TYPE_FP32:
return CUBLAS_COMPUTE_32F;
default:
throw GemmNotSupportedException("Not supported cublas compute type.");
}
}
#else
cudaDataType_t getCublasComputeType(DataType ctype)
{
switch (ctype) {
case TYPE_FP16:
return CUDA_R_16F;
case TYPE_FP32:
return CUDA_R_32F;
default:
throw GemmNotSupportedException("Not supported cublas compute type.");
}
}
#endif
cublasOperation_t getCublasOperation(GemmOp op)
{
switch (op) {
case GEMM_OP_N:
return CUBLAS_OP_N;
case GEMM_OP_T:
return CUBLAS_OP_T;
default:
throw GemmNotSupportedException("Unknown GemmOp provided.");
}
}
std::string getGemmOpString(const GemmOp& op)
{
switch (op) {
case GEMM_OP_T:
return "T";
case GEMM_OP_N:
return "N";
}
throw GemmNotSupportedException("Unknown GemmOp provided.");
}
#ifdef SPARSITY_ENABLED
cusparseOperation_t getCusparseOperation(GemmOp op)
{
switch (op) {
case GEMM_OP_N:
return CUSPARSE_OPERATION_NON_TRANSPOSE;
case GEMM_OP_T:
return CUSPARSE_OPERATION_TRANSPOSE;
default:
throw GemmNotSupportedException("Unknown GemmOp provided.");
}
}
cusparseComputeType getCusparseComputeType(DataType ctype)
{
if (ctype != TYPE_FP16) {
throw GemmNotSupportedException("Sparse GEMM supports TYPE_FP16 compute type only.");
}
return CUSPARSE_COMPUTE_16F;
}
void pruneMatrixB(void* data, const cudaStream_t& stream, const size_t k, const size_t n, const GemmOp trans)
{
FT_LOG_TRACE("Prune matrix B [k=%ld, n=%ld, op=%s]", k, n, getGemmOpString(trans).c_str());
// Due to A/B switching, the matrix B will be used as a matrix A.
const cusparseOrder_t order = CUSPARSE_ORDER_COL;
const size_t rows = (trans == GEMM_OP_N) ? n : k;
const size_t cols = (trans == GEMM_OP_N) ? k : n;
const size_t ld = rows;
const unsigned alignment = 16;
const cusparseLtPruneAlg_t prune_alg = CUSPARSELT_PRUNE_SPMMA_STRIP;
const cusparseOperation_t op = getCusparseOperation(trans);
const cudaDataType_t dtype = CUDA_R_16F; // fixed under cusparselt == 0.2.0.
// 0: B is sparse, 1: A is sparse
// B matrix will be used as A matrix at the SpGemm::gemm.
const int is_sparse_a = 1;
// TODO: Let the resource manager handle GPU-related resources later.
cusparseLtHandle_t handle;
CHECK_CUSPARSE(cusparseLtInit(&handle));
cusparseLtMatDescriptor_t mat_desc;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&handle, &mat_desc, rows, cols, ld, alignment, dtype, order, CUSPARSELT_SPARSITY_50_PERCENT));
CHECK_CUSPARSE(cusparseLtSpMMAPrune2(&handle, &mat_desc, is_sparse_a, op, data, data, prune_alg, stream));
CHECK_CUSPARSE(cusparseLtMatDescriptorDestroy(&mat_desc));
CHECK_CUSPARSE(cusparseLtDestroy(&handle));
}
size_t compressMatrixB(void** output,
IAllocator& allocator,
const cudaStream_t& stream,
const void* input,
const size_t k,
const size_t n,
const GemmOp trans)
{
FT_LOG_TRACE("compressMatrix [k=%ld, n=%ld, dtype=FP16]", k, n);
// swap A/B due to column/row major layout mismatch.
cusparseOrder_t order = CUSPARSE_ORDER_COL;
const size_t rows = (trans == GEMM_OP_N) ? n : k;
const size_t cols = (trans == GEMM_OP_N) ? k : n;
const size_t ld = rows;
cudaDataType_t dtype = CUDA_R_16F; // fixed under cusparselt == 0.2.0.
cusparseLtSparsity_t sparsity = CUSPARSELT_SPARSITY_50_PERCENT;
cusparseOperation_t op = getCusparseOperation(trans);
cusparseLtMatDescriptor_t mat_desc;
const unsigned alignment = 16;
const int is_sparse_a = 1; // 0: B is sparse, 1: A is sparse
cusparseLtHandle_t handle;
CHECK_CUSPARSE(cusparseLtInit(&handle));
CHECK_CUSPARSE(
cusparseLtStructuredDescriptorInit(&handle, &mat_desc, rows, cols, ld, alignment, dtype, order, sparsity))
size_t compressed_size = 0;
CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_desc, &compressed_size));
if (compressed_size == 0) {
        throw GemmInvalidException("Failed to compute a valid compressed_size (got 0). This error may be "
                                   "caused by an input matrix that is too small.");
}
*output = allocator.malloc(compressed_size, false);
CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_desc, is_sparse_a, op, input, *output, stream))
CHECK_CUSPARSE(cusparseLtMatDescriptorDestroy(&mat_desc));
CHECK_CUSPARSE(cusparseLtDestroy(&handle));
return compressed_size;
}
#endif
/* ************************* End of GEMM utils **************************** */
} // end of namespace fastertransformer
/*
* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cublasLt.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <iostream>
#include <map>
#include <mutex>
#include <stdexcept>
#include <string>
// TODO: Need to remove the dependency of the layer module.
// e.g. refactor Weight class to some base module.
#include "src/fastertransformer/layers/DenseWeight.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/fastertransformer/utils/memory_utils.h"
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif
// cuBLAS default workspace size: 32 MB. TODO: make this a Gemm property.
#define WORKSPACE_SIZE 33554432
namespace fastertransformer {
// A wrapper of cublas or cusparse matrix operator.
// - GEMM_OP_N = CUBLAS_OP_N or CUSPARSE_OP_N
// - GEMM_OP_T = CUBLAS_OP_T or CUSPARSE_OP_T
enum GemmOp {
GEMM_OP_N,
GEMM_OP_T
};
// A base class of the GEMM family.
// In the current version, Gemm serves as both a base class and an interface.
class Gemm {
public:
Gemm() = delete; // Disable a default constructor
/**
* A Gemm class.
*
* NOTE:
     *   A, B, and C are assumed to have a row-major layout, while the backend CUDA
     *   libraries assume a column-major layout. The Gemm family already handles this
     *   discrepancy internally, so call it directly, without tricks such as swapping
     *   the inputs A and B to align the matrix layout.
*
* Restriction: Supported in/out data or compute types: TYPE_FP16, TYPE_FP32.
*
* TODO:
* Unify resource allocation/release from a singleton GPU resource managers.
* Thus, allocator, stream can be replaced by a resource handler later.
* E.g. Gemm(std::shared_ptr<ResourceManager> resource_manager), and
* stream_ = resource_manager.getCudaStream();
* buffer = resource_manager.malloc(...);
*
* @param allocator Resource allocator.
* @param stream A CUDA stream.
* @param config_file A file path of a GEMM configuration.
*/
Gemm(IAllocator* allocator, cudaStream_t stream, std::string config_file = GEMM_CONFIG);
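    // Illustrative construction sketch (not part of the original source). Assumes an
    // IAllocator* `allocator` and a cudaStream_t `stream` created elsewhere:
    //
    //   Gemm gemm(allocator, stream);   // loads GEMM_CONFIG by default
    //   gemm.setDefaultTypes<half>();   // FP16 data and compute types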
Gemm(Gemm const& other) = delete;
virtual ~Gemm();
virtual std::string toString();
/**
* @brief Set GEMM compute type.
*
* @param compute_type The data type of accumulation type inside GEMM computation.
* (Choices: TYPE_FP16, TYPE_FP32)
*
* @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32.
* @throw std::runtime_error if any exception inside CUDA.
*/
void setComputeType(DataType compute_type);
/**
* @brief Set matrix data types and compute precision.
*
* Supported data or compute types: TYPE_FP16, TYPE_FP32
*
* @param a_type The data type of a matrix A.
* @param b_type The data type of a matrix B.
* @param c_type The data type of a matrix C.
* @param compute_type The data type of accumulation type inside GEMM computation.
*
* @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32.
* @throw std::runtime_error if any exception inside CUDA.
*/
void setTypes(DataType a_type, DataType b_type, DataType c_type, DataType compute_type);
/**
* @brief Set matrix data and compute types by default values.
*
* Default configs:
* - T=float : data type=TYPE_FP32, compute type=TYPE_FP32
     * - T=half  : data type=TYPE_FP16, compute type=TYPE_FP16
*/
template<typename T>
void setDefaultTypes();
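    // Example (illustrative, not part of the original source):
    //   gemm.setDefaultTypes<float>();  // TYPE_FP32 data and compute types, as listed above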
void loadGemmConfig(std::string config_file);
void setAllocator(IAllocator* allocator);
void setCudaStream(cudaStream_t& stream);
    // The APIs below show how the interface would change if it cooperated with Tensor.
    // To enable them, we need to update the Tensor class. For instance, `data` needs
    // to be of type (void*) rather than (const void*) so that it can be passed as the
    // output C of gemm.
// virtual void gemm(Tensor& C,
// const GemmOp transa,
// const GemmOp transb,
// const Tensor& A,
// const Tensor& B,
// const float alpha = 1.0f,
// const float beta = 0.0f);
//
// virtual void batchedMatmul(std::vector<Tensor> Carray,
// const GemmOp transa,
// const GemmOp transb,
// const std::vector<Tensor> Aarray,
// const std::vector<Tensor> Barray,
// const float alpha = 1.0f,
// const float beta = 0.0f);
//
// virtual void stridedBatchedGemm(Tensor& C,
// const GemmOp transa,
// const GemmOp transb,
// const Tensor& A,
// const Tensor& B,
// const float alpha = 1.0f,
// const float beta = 0.0f);
// TODO:
    //   This function cooperates with a Weight object to simplify Gemm calls
    //   inside layers, computing the formula
    //      output(C) = input(A) * weight_kernel(B)
    //   where weight_kernel can change depending on the Gemm implementation.
    //   DenseWeight is a template struct, which prevents overriding the method.
    //   We temporarily add an interface here for the two cases float/half, but to
    //   finalize this function we need a weight-class interface that is not a
    //   template class.
virtual void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<float>& weight,
void* output,
const float alpha = 1.0f,
const float beta = 0.0f);
virtual void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<half>& weight,
void* output,
const float alpha = 1.0f,
const float beta = 0.0f);
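    // Illustrative call sketch (not part of the original source): `d_in` is an assumed
    // device pointer to a row-major (m x k) input and `w` a DenseWeight<half> whose
    // kernel has shape (k x n); the result is written to `d_out` (m x n):
    //
    //   gemm.gemm(GEMM_OP_N, GEMM_OP_N, m, n, k, d_in, w, d_out);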
virtual void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const void* B,
void* C,
const float alpha = 1.0f,
const float beta = 0.0f);
virtual void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const size_t lda,
const void* B,
const size_t ldb,
void* C,
const size_t ldc,
const float alpha = 1.0f,
const float beta = 0.0f);
/**
* @brief Compute the matrix multiplication `C = \alpha * op(A) * op(B) + \beta * C`.
*
* @param transa A transpose operation of a matrix A (GEMM_OP_N or GEMM_OP_T).
* @param transb A transpose operation of a matrix B (GEMM_OP_N or GEMM_OP_T).
* @param m A number of rows of a matrix op(A) and C.
* @param n A number of columns of a matrix op(B) or C.
* @param k A number of columns of op(A) and rows of op(B).
* @param A A device pointer of a matrix A of dimension (m x lda).
* @param Atype A data type of A (TYPE_FP16 or TYPE_FP32)
* @param lda A leading dimension of the matrix A.
* @param B A device pointer of a matrix B of dimension (k x ldb).
* @param Btype A data type of B (TYPE_FP16 or TYPE_FP32)
* @param ldb A leading dimension of the matrix B.
* @param C (Output) A device pointer of a matrix C of dimension (m x ldc).
* @param Ctype A data type of C (TYPE_FP16 or TYPE_FP32)
* @param ldc A leading dimension of the matrix C.
* @param alpha A scale factor for A*B (default: 1.0f).
* @param beta A scale factor for C (default: 0.0f).
*
* @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32.
* @throw std::runtime_error if any exception inside CUDA.
*/
virtual void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const DataType Atype,
const size_t lda,
const void* B,
const DataType Btype,
const size_t ldb,
void* C,
const DataType Ctype,
const size_t ldc,
const float alpha = 1.0f,
const float beta = 0.0f);
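    // Illustrative call sketch (not part of the original source): row-major
    // C[m x n] = A[m x k] * B[k x n] in FP16, with lda = k, ldb = n, ldc = n for
    // non-transposed inputs. `d_A`, `d_B`, `d_C` are assumed device pointers:
    //
    //   gemm.gemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
    //             d_A, TYPE_FP16, k,
    //             d_B, TYPE_FP16, n,
    //             d_C, TYPE_FP16, n);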
virtual void batchedGemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* const* A,
const void* const* B,
void* const* C,
const size_t batch_size,
const float alpha = 1.0f,
const float beta = 0.0f);
virtual void batchedGemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* const* A,
const size_t lda,
const void* const* B,
const size_t ldb,
void* const* C,
const size_t ldc,
const size_t batch_size,
const float alpha = 1.0f,
const float beta = 0.0f);
/**
* @brief Compute the matrix multiplication of batch of matrices As and Bs
*
* For input batch A[i]/B[i] and output batch C[i], i = 0, ..., batch_size - 1,
* `C[i] = \alpha * op(A[i]) * op(B[i]) + \beta * C[i]`.
*
* @param transa A transpose operation of a matrix A (GEMM_OP_N or GEMM_OP_T).
* @param transb A transpose operation of a matrix B (GEMM_OP_N or GEMM_OP_T).
* @param m A number of rows of a matrix op(A) and C.
* @param n A number of columns of a matrix op(B) or C.
* @param k A number of columns of op(A) and rows of op(B).
* @param A An array of device pointers of batch of input A matrices.
* @param Atype A data type of A (TYPE_FP16 or TYPE_FP32)
* @param lda A leading dimension of the matrix A.
* @param B An array of device pointers of batch of input B matrices.
* @param Btype A data type of B (TYPE_FP16 or TYPE_FP32)
* @param ldb A leading dimension of the matrix B.
* @param C (Output) An array of device pointers of batch of output C matrices.
* @param Ctype A data type of C (TYPE_FP16 or TYPE_FP32)
* @param ldc A leading dimension of the matrix C.
* @param alpha A scale factor for A*B (default: 1.0f).
* @param beta A scale factor for C (default: 0.0f).
*
* @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32.
* @throw std::runtime_error if any exception inside CUDA.
*/
virtual void batchedGemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* const* A,
const DataType Atype,
const size_t lda,
const void* const* B,
const DataType Btype,
const size_t ldb,
void* const* C,
const DataType Ctype,
const size_t ldc,
const size_t batch_size,
const float alpha = 1.0f,
const float beta = 0.0f);
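    // Illustrative call sketch (not part of the original source): `d_Aarray`, `d_Barray`
    // and `d_Carray` are assumed arrays of `batch_size` pointers to row-major device
    // matrices, laid out as required by the cuBLAS batched GEMM backend:
    //
    //   gemm.batchedGemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
    //                    d_Aarray, TYPE_FP16, k,
    //                    d_Barray, TYPE_FP16, n,
    //                    d_Carray, TYPE_FP16, n,
    //                    batch_size);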
virtual void stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const void* B,
void* C,
const size_t batch_size,
const float alpha = 1.0f,
const float beta = 0.0f);
virtual void stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const int64_t strideA,
const void* B,
const int64_t strideB,
void* C,
const int64_t strideC,
const size_t batch_size,
const float alpha = 1.0f,
const float beta = 0.0f);
virtual void stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const size_t lda,
const int64_t strideA,
const void* B,
const size_t ldb,
const int64_t strideB,
void* C,
const size_t ldc,
const int64_t strideC,
const size_t batch_size,
const float alpha = 1.0f,
const float beta = 0.0f);
/**
* @brief Compute the strided matrix multiplication of batch of matrices As and Bs
*
* For input batch A[i]/B[i] and output batch C[i], i = 0, ..., batch_size - 1,
* `C[i] = \alpha * op(A[i]) * op(B[i]) + \beta * C[i]`.
*
* @param transa A transpose operation of a matrix A (GEMM_OP_N or GEMM_OP_T).
* @param transb A transpose operation of a matrix B (GEMM_OP_N or GEMM_OP_T).
* @param m A number of rows of a matrix op(A) and C.
* @param n A number of columns of a matrix op(B) or C.
* @param k A number of columns of op(A) and rows of op(B).
     * @param A A device pointer to the batch of input A matrices, stored contiguously and separated by strideA elements.
* @param Atype A data type of A (TYPE_FP16 or TYPE_FP32)
* @param lda A leading dimension of the matrix A.
* @param strideA An offset in number of elements between matrix A[i] and A[i+1].
     * @param B A device pointer to the batch of input B matrices, stored contiguously and separated by strideB elements.
* @param Btype A data type of B (TYPE_FP16 or TYPE_FP32)
* @param ldb A leading dimension of the matrix B.
* @param strideB An offset in number of elements between matrix B[i] and B[i+1].
     * @param C (Output) A device pointer to the batch of output C matrices, stored contiguously and separated by strideC elements.
* @param Ctype A data type of C (TYPE_FP16 or TYPE_FP32)
* @param ldc A leading dimension of the matrix C.
* @param strideC An offset in number of elements between matrix C[i] and C[i+1].
* @param compute_type An accumulation type of GEMM.
* @param alpha A scale factor for A*B (default: 1.0f).
* @param beta A scale factor for C (default: 0.0f).
*
* @throw GemmNotSupportedException if a type is not TYPE_FP16 or TYPE_FP32.
* @throw std::runtime_error if any exception inside CUDA.
*/
virtual void stridedBatchedGemm(GemmOp transa,
GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
DataType Atype,
const size_t lda,
const int64_t strideA,
const void* B,
DataType Btype,
const size_t ldb,
const int64_t strideB,
void* C,
DataType Ctype,
const size_t ldc,
const int64_t strideC,
const size_t batch_size,
DataType compute_type,
const float alpha = 1.0f,
const float beta = 0.0f);
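    // Illustrative call sketch (not part of the original source): `batch_size` row-major
    // GEMMs over contiguously packed matrices, so strideA = m*k, strideB = k*n and
    // strideC = m*n (matching the convenience overload above). `d_A`, `d_B`, `d_C` are
    // assumed device pointers to the packed batches:
    //
    //   gemm.stridedBatchedGemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
    //                           d_A, TYPE_FP16, k, (int64_t)m * k,
    //                           d_B, TYPE_FP16, n, (int64_t)k * n,
    //                           d_C, TYPE_FP16, n, (int64_t)m * n,
    //                           batch_size, TYPE_FP16);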
protected:
IAllocator* allocator_ = nullptr;
cudaStream_t stream_;
std::mutex* mutex_ = nullptr;
cublasAlgoMap* cublas_algo_map_ = nullptr;
cublasHandle_t cublas_handle_;
cublasLtHandle_t cublaslt_handle_;
void* workspace_ = nullptr;
// use FP32 as default
DataType a_type_ = TYPE_FP32;
DataType b_type_ = TYPE_FP32;
DataType c_type_ = TYPE_FP32;
DataType compute_type_ = TYPE_FP32;
// Check if data and inputs are valid in the Gemm class.
virtual void checkDataTypeValidity(const DataType& type);
};
// class Int8Gemm : public Gemm {
// protected:
// bool use_ORDER_COL32_2R_4R4_; // what is this?
// };
#ifdef SPARSITY_ENABLED
/**
* A Sparse Gemm class.
*
* NOTE:
 *   A, B, and C are assumed to have a row-major layout.
 *   There are two restrictions:
 *    - Only the case where the matrix B is sparse is supported.
 *    - Only TYPE_FP16 is supported for in/out data and compute types.
*/
class SpGemm: public Gemm {
protected:
cusparseLtHandle_t cusparselt_handle_;
std::map<std::string, cusparseLtMatDescriptor_t> a_desc_map_;
std::map<std::string, cusparseLtMatDescriptor_t> b_desc_map_;
std::map<std::string, cusparseLtMatDescriptor_t> c_desc_map_;
bool useBaseGemm(size_t batch_size, size_t m, size_t n, size_t k);
public:
using Gemm::setComputeType;
using Gemm::setTypes;
using Gemm::setDefaultTypes;
using Gemm::setAllocator;
using Gemm::setCudaStream;
using Gemm::gemm;
using Gemm::batchedGemm;
using Gemm::stridedBatchedGemm;
/**
* @param allocator Resource allocator.
* @param stream A CUDA stream.
* @param config_file A file path of a GEMM configuration.
*/
    // TODO: Let's unify the algo map loading.
SpGemm(IAllocator* allocator,
cudaStream_t stream,
std::string config_file = GEMM_CONFIG,
std::string spconfig_file = SPGEMM_CONFIG);
~SpGemm();
std::string toString() override;
void loadGemmConfig(std::string config_file, std::string spconfig_file);
// Template method cannot be overridden.
void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<float>& weight,
void* output,
const float alpha = 1.0f,
const float beta = 0.0f) override;
void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<half>& weight,
void* output,
const float alpha = 1.0f,
const float beta = 0.0f) override;
void gemm(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* A,
const DataType Atype,
const size_t lda,
const void* B,
const DataType Btype,
const size_t ldb,
void* C,
const DataType Ctype,
const size_t ldc,
const float alpha = 1.0f,
const float beta = 0.0f) override;
private:
void checkDataTypeValidity(const DataType& type) override;
    // Temporary gemm helper method that uses the template type T.
template<typename T>
void weightGemmHelper(const GemmOp transa,
const GemmOp transb,
const size_t m,
const size_t n,
const size_t k,
const void* input,
const DenseWeight<T>& weight,
void* output,
const float alpha,
const float beta);
};
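// A minimal sketch of using SpGemm (the allocator, stream and device buffers are assumptions
// for illustration; the weight matrix B must already satisfy 2:4 sparsity, see
// pruneMatrixB()/compressMatrixB() below):
//
//   SpGemm sp_gemm(&allocator, stream);
//   sp_gemm.gemm(GEMM_OP_N, GEMM_OP_N, m, n, k,
//                d_input,  TYPE_FP16, k,
//                d_weight, TYPE_FP16, n,
//                d_output, TYPE_FP16, n);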
// class Int8SpGemm : public Int8Gemm, public SpGemm {
// };
#endif
/* ***************************** GEMM Exceptions ******************************* */
class GemmInvalidShapeException: public std::exception {
private:
std::string msg_ = "Invalid matrix shapes.";
public:
explicit GemmInvalidShapeException() = default;
template<typename... Args>
explicit GemmInvalidShapeException(const std::string format, const Args&... args): msg_(fmtstr(format, args...))
{
}
const char* what() const throw()
{
return msg_.c_str();
}
};
class GemmNotSupportedException: public std::exception {
private:
std::string msg_ = "Not supported exception.";
public:
explicit GemmNotSupportedException() = default;
template<typename... Args>
explicit GemmNotSupportedException(const std::string format, const Args&... args): msg_(fmtstr(format, args...))
{
}
const char* what() const throw()
{
return msg_.c_str();
}
};
class GemmInvalidException: public std::exception {
private:
std::string msg_ = "Invalid use of gemm.";
public:
explicit GemmInvalidException() = default;
template<typename... Args>
explicit GemmInvalidException(const std::string format, const Args&... args): msg_(fmtstr(format, args...))
{
}
const char* what() const throw()
{
return msg_.c_str();
}
};
/* ************************ End of GEMM Exceptions ************************ */
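// Illustrative use of the exception classes above (a sketch, not code from this project):
// each constructor forwards a printf-style format string and arguments to fmtstr().
//
//   if (Atype != TYPE_FP16 && Atype != TYPE_FP32) {
//       throw GemmNotSupportedException("Unsupported data type of A: %d", int(Atype));
//   }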
/* ***************************** GEMM utils ******************************* */
/**
* @brief Create method for the Gemm family.
*
* @param allocator Resource allocator.
* @param stream A CUDA stream.
 * @param sparse Whether to use sparse GEMM.
 * @param quantized Whether to use int8 quantized GEMM.
 * @return A shared pointer to a Gemm instance.
*/
std::shared_ptr<Gemm>
createGemm(IAllocator* allocator, cudaStream_t stream, bool sparse = false, bool quantized = false);
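// A minimal sketch of the factory above (the allocator type and stream setup are assumptions
// used only for illustration):
//
//   Allocator<AllocatorType::CUDA> allocator(/*device_id=*/0);
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   std::shared_ptr<Gemm> gemm = createGemm(&allocator, stream, /*sparse=*/false, /*quantized=*/false);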
cudaDataType_t getCublasDataType(DataType dtype);
#if (CUDART_VERSION >= 11000)
cublasComputeType_t getCublasComputeType(DataType dtype);
#else
cudaDataType_t getCublasComputeType(DataType dtype);
#endif
cublasOperation_t getCublasOperation(GemmOp op);
std::string getGemmOpString(const GemmOp& op);
#ifdef SPARSITY_ENABLED
cusparseOperation_t getCusparseOperation(GemmOp op);
cusparseComputeType getCusparseComputeType(DataType dtype);
/**
* @brief Prune a weight matrix (in-place).
*
* SpGemm supports a case when the sparse matrix is B in C=A*B.
*
 * @param data A device pointer to the matrix data to prune.
* @param stream A cuda stream object.
* @param k A number of rows of op(B).
* @param n A number of columns of op(B).
* @param trans A transpose operation that will be applied to the matrix
* (default: GEMM_OP_N).
*/
void pruneMatrixB(
void* data, const cudaStream_t& stream, const size_t k, const size_t n, const GemmOp trans = GEMM_OP_N);
/**
* @brief Compress the B matrix in a specific sparsity format.
*
 * @param output A pointer that receives the address of the device buffer allocated for the compressed matrix.
 * @param allocator A resource allocator.
* @param stream A cuda stream object.
* @param input An input matrix to compress.
* @param k A number of rows of op(B).
* @param n A number of columns of op(B).
* @param trans A transpose operation that will be applied to the matrix (default: GEMM_OP_N).
*
* @return A size of the allocated device buffer of the compressed matrix.
*
 * @throw GemmInvalidException if the input matrix does not have 2:4 sparsity,
 *   or if a correct buffer size for the compressed matrix cannot be computed.
 * @throw std::runtime_error if any CUDA error occurs.
*/
size_t compressMatrixB(void** output,
IAllocator& allocator,
const cudaStream_t& stream,
const void* input,
const size_t k,
const size_t n,
const GemmOp trans = GEMM_OP_N);
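// A sketch of the prune-then-compress flow for a sparse weight B of shape k x n (buffer names
// are hypothetical and error handling is omitted):
//
//   pruneMatrixB(d_weight, stream, k, n);  // enforce 2:4 sparsity in place
//   void*  d_compressed     = nullptr;
//   size_t compressed_bytes = compressMatrixB(&d_compressed, allocator, stream, d_weight, k, n);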
#endif
/* ************************* End of GEMM utils **************************** */
} // end of namespace fastertransformer
# Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
set(gemm_func_files
gemm_func.cc
)
set(encoder_gemm_func_files
encoder_gemm_func.cc
)
set(encoder_igemm_func_files
encoder_igemm_func.cc
)
set(decoding_gemm_func_files
decoding_gemm_func.cc
)
set(gpt_gemm_func_files
gpt_gemm_func.cc
)
set(xlnet_gemm_func_files
xlnet_gemm_func.cc
)
set(t5_gemm_func_files
t5_gemm_func.cc
)
set(swin_igemm_func_files
swin_igemm_func.cc
)
set(swin_gemm_func_files
swin_gemm_func.cc
)
add_library(gemm_func STATIC ${gemm_func_files})
target_link_libraries(gemm_func PUBLIC -lcublas -lcublasLt -lcudart cuda_utils logger)
set_property(TARGET gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(encoder_gemm_func STATIC ${encoder_gemm_func_files})
target_link_libraries(encoder_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(encoder_gemm_func PUBLIC -lcusparse -lcusparseLt)
endif()
set_property(TARGET encoder_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET encoder_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(encoder_igemm_func STATIC ${encoder_igemm_func_files})
target_link_libraries(encoder_igemm_func PUBLIC -lcublas -lcublasLt -lcudart cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(encoder_igemm_func PUBLIC -lcusparse -lcusparseLt)
endif()
set_property(TARGET encoder_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET encoder_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(decoding_gemm_func STATIC ${decoding_gemm_func_files})
target_link_libraries(decoding_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
set_property(TARGET decoding_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET decoding_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(gpt_gemm_func STATIC ${gpt_gemm_func_files})
target_link_libraries(gpt_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(gpt_gemm_func PUBLIC -lcusparse -lcusparseLt)
endif()
set_property(TARGET gpt_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET gpt_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(xlnet_gemm_func STATIC ${xlnet_gemm_func_files})
target_link_libraries(xlnet_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
set_property(TARGET xlnet_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET xlnet_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(t5_gemm_func STATIC ${t5_gemm_func_files})
target_link_libraries(t5_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
if (SPARSITY_SUPPORT)
target_link_libraries(t5_gemm_func PUBLIC -lcusparse -lcusparseLt)
endif()
set_property(TARGET t5_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET t5_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(swin_igemm_func STATIC ${swin_igemm_func_files})
target_link_libraries(swin_igemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func encoder_igemm_func cuda_utils logger)
set_property(TARGET swin_igemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET swin_igemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
add_library(swin_gemm_func STATIC ${swin_gemm_func_files})
target_link_libraries(swin_gemm_func PUBLIC -lcublas -lcublasLt -lcudart gemm_func cuda_utils logger)
set_property(TARGET swin_gemm_func PROPERTY POSITION_INDEPENDENT_CODE ON)
set_property(TARGET swin_gemm_func PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/gemm_test/decoding_gemm_func.h"
namespace fastertransformer {
template<typename T>
void generate_decoding_gemm_config(int batch_size,
int beam_width,
int max_mem_seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int mem_hidden_units,
void* buffer_in,
bool isAppend)
{
void* cublas_workspace;
void* buffer;
int workSpaceSize;
#ifdef ENABLE_BF16
if (std::is_same<T, half>::value || std::is_same<T, __nv_bfloat16>::value) {
#else
if (std::is_same<T, half>::value) {
#endif // ENABLE_BF16
// cublas_workspace_ should be the start pointer of cudaMalloc()
    // to ensure 16B alignment
cublas_workspace = buffer_in;
buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE);
workSpaceSize = CUBLAS_WORKSPACE_SIZE;
}
else {
cublas_workspace = nullptr;
buffer = buffer_in;
workSpaceSize = 0;
}
struct cudaDeviceProp prop;
check_cuda_error(cudaGetDeviceProperties(&prop, 0));
printf("Device %s\n", prop.name);
// check config
FILE* fd;
int line_count = 0;
if (!isAppend) {
fd = fopen(GEMM_CONFIG, "w+");
}
else {
fd = fopen(GEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fd) != NULL) {
config.push_back(std::string(line));
}
line_count = config.size();
if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included
{
int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM);
fclose(fd);
fd = fopen(GEMM_CONFIG, "w+");
fprintf(fd, "%s", config[0].c_str());
for (uint i = startIdx; i < config.size(); i++) {
fprintf(fd, "%s", config[i].c_str());
}
line_count = config.size() - (GEMM_NUM + 3);
}
}
const int hidden_units = head_num * size_per_head;
const int gemm_num = 6;
int M[gemm_num];
int N[gemm_num];
int K[gemm_num];
int batchCount[gemm_num] = {1, 1, 1, 1, 1, 1};
char mess[gemm_num][256];
// gemm 0
M[0] = batch_size * beam_width;
K[0] = hidden_units;
N[0] = K[0] * 3;
strcpy(mess[0], "from_tensor * weightQKV");
// gemm 1
M[1] = batch_size * beam_width;
K[1] = hidden_units;
N[1] = K[1];
strcpy(mess[1], "attr * output_kernel");
// gemm2
M[2] = batch_size * beam_width * max_mem_seq_len;
K[2] = mem_hidden_units;
N[2] = hidden_units;
strcpy(mess[2], "mem_tensor * weightK/V in cross attention");
// gemm 3
M[3] = batch_size * beam_width;
K[3] = hidden_units;
N[3] = inter_size;
strcpy(mess[3], "ffn gemm1 ");
// gemm 4
M[4] = batch_size * beam_width;
K[4] = inter_size;
N[4] = hidden_units;
strcpy(mess[4], "ffn gemm2");
// gemm5
M[5] = batch_size * beam_width;
K[5] = hidden_units;
N[5] = ceil(vocab_size / 8.) * 8;
strcpy(mess[5], "decoder_output * embedding_kernel -> embedding_output");
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
cublasLtHandle_t ltHandle;
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t AType;
cudaDataType_t BType;
cudaDataType_t CType;
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
data_type = FLOAT_DATATYPE;
AType = CUDA_R_32F;
BType = CUDA_R_32F;
CType = CUDA_R_32F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_ALGO23;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
AType = CUDA_R_16F;
BType = CUDA_R_16F;
CType = CUDA_R_16F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
data_type = BFLOAT16_DATATYPE;
AType = CUDA_R_16BF;
BType = CUDA_R_16BF;
CType = CUDA_R_16BF;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
}
#endif
using scaleT = typename ScaleTypeConverter<T>::Type;
scaleT alpha = (scaleT)1.0f;
scaleT beta = (scaleT)0.0f;
printf("***Encoder Gemm Testing Begin***\n");
printf("***Cublas Gemm Testing Begin***\n");
if (line_count == 0) {
fprintf(fd,
"batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, "
"customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time\n");
}
for (int i = 0; i < gemm_num; ++i) {
int m = M[i], n = N[i], k = K[i];
printf("\n-----------------------------\n");
printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]);
T* d_A = (T*)buffer;
T* d_B = d_A + m * k * batchCount[i];
T* d_C = d_B + k * n * batchCount[i];
float exec_time = 99999.0f;
int fast_algo = 0;
int seq_len = i == 2 ? max_mem_seq_len : 1;
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
for (int ite = 0; ite < ites; ++ite) {
status = cublasGemmEx(cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
n,
m,
k,
&alpha,
d_B,
BType,
n,
d_A,
AType,
k,
&beta,
d_C,
CType,
n,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
if (status != CUBLAS_STATUS_SUCCESS) {
break;
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
fast_algo = algo;
}
}
}
printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time);
        // for fp16 and bf16, we also benchmark cublasLt
if (data_type != FLOAT_DATATYPE) {
printf("***cublasLt Gemm Testing Begin***\n");
            // Let's try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size * beam_width,
seq_len,
head_num,
size_per_head,
n,
m,
k,
&alpha,
d_B,
d_A,
&beta,
d_C,
cublas_workspace,
workSpaceSize,
fd,
perfResults,
ALGO_COMBINATIONS);
if (perfResults[0].time < exec_time) {
printPerfStructure(batch_size * beam_width,
seq_len,
head_num,
size_per_head,
n,
m,
k,
perfResults[0],
fd,
data_type,
0);
}
else {
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"-1 -1 "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"-1 -1 -1 "
#endif
"%f\n",
batch_size * beam_width,
seq_len,
head_num,
size_per_head,
data_type,
batchCount[i],
n,
m,
k,
fast_algo,
exec_time);
}
printf("***cublasLt Gemm Testing End***\n");
}
else {
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"-1 -1 "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"-1 -1 -1 "
#endif
"%f\n",
batch_size * beam_width,
seq_len,
head_num,
size_per_head,
data_type,
batchCount[i],
n,
m,
k,
fast_algo,
exec_time);
}
}
printf("***cublas Gemm Testing End***\n\n");
fclose(fd);
printf("***Decoding Gemm Testing End***\n");
return;
}
template void generate_decoding_gemm_config<float>(int batch_size,
int beam_width,
int seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int mem_hidden_units,
void* buffer_in,
bool isAppend);
template void generate_decoding_gemm_config<half>(int batch_size,
int beam_width,
int seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int mem_hidden_units,
void* buffer_in,
bool isAppend);
#ifdef ENABLE_BF16
template void generate_decoding_gemm_config<__nv_bfloat16>(int batch_size,
int beam_width,
int seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int mem_hidden_units,
void* buffer_in,
bool isAppend);
#endif
size_t calDecodingGemmTestBufSizeInByte(int batch_size,
int beam_width,
int max_mem_seq_len,
int head_num,
int size_per_head,
int inter_size,
int memory_hidden_units,
int vocab_size,
CublasDataType data_type)
{
size_t buf_size_in_byte = 0;
const size_t tensor_para_size = 1;
const size_t hidden_units = head_num * size_per_head;
const size_t local_head_num = head_num / tensor_para_size;
const size_t local_hidden_units = local_head_num * size_per_head;
// int wordSize = (data_type == FLOAT_DATATYPE ? sizeof(float) : sizeof(half));
    // Because some buffers always use float, set wordSize to sizeof(float) directly.
int wordSize = sizeof(float);
size_t m = batch_size * beam_width;
std::vector<size_t> buff_size;
// for qkv gemm
buff_size.push_back(m * hidden_units + hidden_units * 3 * local_hidden_units + m * 3 * local_hidden_units);
// for attention output gemm
buff_size.push_back(m * hidden_units + hidden_units * local_hidden_units + m * local_hidden_units);
// for memory_tensor gemm
buff_size.push_back(m * max_mem_seq_len * memory_hidden_units + memory_hidden_units * local_hidden_units
+ m * max_mem_seq_len * local_hidden_units);
// for context ffn gemm
buff_size.push_back(m * inter_size / tensor_para_size + hidden_units * inter_size / tensor_para_size
+ m * hidden_units);
// for vocab
buff_size.push_back(m * hidden_units + hidden_units * ceil(vocab_size / 8.) * 8 / tensor_para_size
+ m * ceil(vocab_size / 8.) * 8 / tensor_para_size);
for (auto t : buff_size) {
buf_size_in_byte = buf_size_in_byte > t ? buf_size_in_byte : t;
}
buf_size_in_byte *= wordSize;
buf_size_in_byte += ((data_type == HALF_DATATYPE || data_type == BFLOAT16_DATATYPE) ? CUBLAS_WORKSPACE_SIZE : 0);
return buf_size_in_byte;
}
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/gemm_test/gemm_func.h"
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#include <sys/time.h>
#include <unistd.h>
#include <vector>
namespace fastertransformer {
template<typename T>
void generate_decoding_gemm_config(int batch_size,
int beam_width,
int seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int mem_hidden_units,
void* buffer_in,
bool isAppend);
size_t calDecodingGemmTestBufSizeInByte(int batch_size,
int beam_width,
int max_mem_seq_len,
int head_num,
int size_per_head,
int inter_size,
int memory_hidden_units,
int vocab_size,
CublasDataType data_type);
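// A sketch of how these two helpers are typically combined (the device buffer name is
// hypothetical):
//
//   size_t buf_size = calDecodingGemmTestBufSizeInByte(batch_size, beam_width, max_mem_seq_len,
//                                                      head_num, size_per_head, inter_size,
//                                                      mem_hidden_units, vocab_size, HALF_DATATYPE);
//   void* d_buffer = nullptr;
//   cudaMalloc(&d_buffer, buf_size);
//   generate_decoding_gemm_config<half>(batch_size, beam_width, max_mem_seq_len, head_num,
//                                       size_per_head, inter_size, vocab_size, mem_hidden_units,
//                                       d_buffer, /*isAppend=*/false);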
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "src/fastertransformer/utils/gemm_test/encoder_gemm_func.h"
namespace fastertransformer {
template<typename T>
void generate_encoder_gemm_config(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer_in, bool isAppend, int tensor_para_size)
{
void* cublas_workspace;
void* buffer;
int workSpaceSize;
#ifdef ENABLE_BF16
if (std::is_same<T, half>::value || std::is_same<T, __nv_bfloat16>::value) {
#else
if (std::is_same<T, half>::value) {
#endif // ENABLE_BF16
// cublas_workspace_ should be the start pointer of cudaMalloc()
    // to ensure 16B alignment
cublas_workspace = buffer_in;
buffer = (void*)((char*)cublas_workspace + CUBLAS_WORKSPACE_SIZE);
workSpaceSize = CUBLAS_WORKSPACE_SIZE;
}
else {
cublas_workspace = nullptr;
buffer = buffer_in;
workSpaceSize = 0;
}
struct cudaDeviceProp prop;
check_cuda_error(cudaGetDeviceProperties(&prop, 0));
printf("Device %s\n", prop.name);
// check config
FILE* fd;
int line_count = 0;
if (!isAppend) {
fd = fopen(GEMM_CONFIG, "w+");
}
else {
fd = fopen(GEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fd) != NULL) {
config.push_back(std::string(line));
}
line_count = config.size();
if (config.size() >= (MAX_CONFIG_NUM * GEMM_NUM + 1)) // 6 cublas/cublasLt, first row is not included
{
int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * GEMM_NUM);
fclose(fd);
fd = fopen(GEMM_CONFIG, "w+");
fprintf(fd, "%s", config[0].c_str());
for (uint i = startIdx; i < config.size(); i++) {
fprintf(fd, "%s", config[i].c_str());
}
line_count = config.size() - (GEMM_NUM + 3);
}
}
const int gemm_num = 7;
int M[gemm_num];
int N[gemm_num];
int K[gemm_num];
int batchCount[gemm_num] = {1, 1, 1, 1, 1, 1, 1};
char mess[gemm_num][256];
float exec_times[gemm_num];
// gemm1
M[0] = batch_size * seq_len;
K[0] = head_num * size_per_head;
N[0] = (head_num / tensor_para_size) * size_per_head;
strcpy(mess[0], "from_tensor * weightQ/K/V");
// gemm2
M[1] = M[0];
K[1] = head_num * size_per_head;
N[1] = 4 * head_num * size_per_head / tensor_para_size;
strcpy(mess[1], "attr_output * inter_kernel");
// gemm3
M[2] = M[0];
K[2] = 4 * head_num * size_per_head / tensor_para_size;
N[2] = head_num * size_per_head;
strcpy(mess[2], "inter_matmul * output_kernel");
M[3] = seq_len;
N[3] = seq_len;
K[3] = size_per_head;
batchCount[3] = batch_size * (head_num / tensor_para_size);
strcpy(mess[3], "attention batched Gemm1");
M[4] = seq_len;
N[4] = size_per_head;
K[4] = seq_len;
batchCount[4] = batch_size * (head_num / tensor_para_size);
strcpy(mess[4], "attention batched Gemm2");
M[5] = batch_size * seq_len;
N[5] = (head_num / tensor_para_size) * size_per_head;
K[5] = head_num * size_per_head;
batchCount[5] = 3;
strcpy(mess[5], "from_tensor * weight_QKV in BatchGemm");
M[6] = batch_size * seq_len;
K[6] = (head_num / tensor_para_size) * size_per_head;
N[6] = head_num * size_per_head;
strcpy(mess[6], "attr * output_kernel");
cublasHandle_t cublas_handle;
check_cuda_error(cublasCreate(&cublas_handle));
cublasLtHandle_t ltHandle;
check_cuda_error(cublasLtCreate(&ltHandle));
cudaDataType_t AType;
cudaDataType_t BType;
cudaDataType_t CType;
cudaDataType_t computeType;
int startAlgo, endAlgo;
const int ites = 100;
struct timeval start, end;
CublasDataType data_type;
if (std::is_same<T, float>::value) {
data_type = FLOAT_DATATYPE;
AType = CUDA_R_32F;
BType = CUDA_R_32F;
CType = CUDA_R_32F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT;
endAlgo = (int)CUBLAS_GEMM_ALGO23;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
AType = CUDA_R_16F;
BType = CUDA_R_16F;
CType = CUDA_R_16F;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
data_type = BFLOAT16_DATATYPE;
AType = CUDA_R_16BF;
BType = CUDA_R_16BF;
CType = CUDA_R_16BF;
computeType = CUDA_R_32F;
startAlgo = (int)CUBLAS_GEMM_DEFAULT_TENSOR_OP;
endAlgo = (int)CUBLAS_GEMM_ALGO15_TENSOR_OP;
}
#endif
using scaleT = typename ScaleTypeConverter<T, false>::Type;
scaleT alpha = (scaleT)1.0f;
scaleT beta = (scaleT)0.0f;
printf("***Encoder Gemm Testing Begin***\n");
printf("***Cublas Gemm Testing Begin***\n");
if (line_count == 0) {
fprintf(fd,
"batch_size, seq_len, head_num, size_per_head dataType ### batchCount, n, m, k, algoId, "
"customOption, tile, numSplitsK, swizzle, reductionScheme, workspaceSize, stages, exec_time\n");
}
for (int i = 0; i < gemm_num; ++i) {
// if(i != 0 && i != 5) continue;
int m = M[i], n = N[i], k = K[i];
printf("\n-----------------------------\n");
printf("GEMM test %d: [M: %d, K: %d, N: %d] %s\n", i, m, k, n, mess[i]);
T* d_A = (T*)buffer;
T* d_B = d_A + m * k * batchCount[i];
T* d_C = d_B + k * n * batchCount[i];
// array of pointer for batchedGemm
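        // darray layout: entries 0-3 hold A pointers, 4-7 hold B pointers, 8-11 hold C pointers;
        // only the first three entries of each group are filled, which is enough for the
        // batch-of-3 QKV projection benchmarked as gemm 5 below.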
T* harray[12];
harray[0] = (T*)buffer;
harray[1] = (T*)((char*)buffer + sizeof(T) * m * k);
harray[2] = (T*)((char*)buffer + 2 * sizeof(T) * m * k);
harray[4] = (T*)((char*)buffer + 3 * sizeof(T) * m * k);
harray[5] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + sizeof(T) * k * n);
harray[6] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 2 * sizeof(T) * k * n);
harray[8] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n);
harray[9] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + sizeof(T) * m * n);
harray[10] = (T*)((char*)buffer + 3 * sizeof(T) * m * k + 3 * sizeof(T) * k * n + 2 * sizeof(T) * m * n);
T** darray = 0;
check_cuda_error(cudaMalloc((void**)&darray, sizeof(T*) * 12));
cudaMemcpy((void*)darray, (void*)harray, sizeof(T*) * 12, cudaMemcpyHostToDevice);
T** dAarray = darray;
T** dBarray = darray + 4;
T** dCarray = darray + 8;
float exec_time = 99999.0f;
int fast_algo = 0;
for (int algo = startAlgo; algo <= endAlgo; algo++) {
cublasStatus_t status;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
for (int ite = 0; ite < ites; ++ite) {
if (i < 3) {
status = cublasGemmEx(cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
n,
m,
k,
&alpha,
d_B,
BType,
n,
d_A,
AType,
k,
&beta,
d_C,
CType,
n,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
else if (i == 3) {
status = cublasGemmStridedBatchedEx(cublas_handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
seq_len,
seq_len,
size_per_head,
&alpha,
d_B,
BType,
size_per_head,
seq_len * size_per_head,
d_A,
AType,
size_per_head,
seq_len * size_per_head,
&beta,
d_C,
CType,
seq_len,
seq_len * seq_len,
batch_size * head_num,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
else if (i == 4) {
status = cublasGemmStridedBatchedEx(cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
size_per_head,
seq_len,
seq_len,
&alpha,
d_B,
BType,
size_per_head,
seq_len * size_per_head,
d_A,
AType,
seq_len,
seq_len * seq_len,
&beta,
d_C,
CType,
size_per_head,
seq_len * size_per_head,
batch_size * head_num,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
else if (i == 5) {
status = cublasGemmBatchedEx(cublas_handle,
CUBLAS_OP_N,
CUBLAS_OP_N,
n,
m,
k,
&alpha,
(const void* const*)dBarray,
BType,
n,
(const void* const*)dAarray,
AType,
k,
&beta,
(void* const*)dCarray,
CType,
n,
3,
computeType,
static_cast<cublasGemmAlgo_t>(algo));
}
if (status != CUBLAS_STATUS_SUCCESS) {
break;
}
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
if (status == CUBLAS_STATUS_SUCCESS) {
printf("algo_%d costs %.3fms \n", algo, diffTime(start, end) / ites);
if (diffTime(start, end) / ites < exec_time) {
exec_time = diffTime(start, end) / ites;
fast_algo = algo;
}
}
}
printf("fast_algo %d costs %.3f ms\n", fast_algo, exec_time);
        // for fp16 and bf16, we also benchmark cublasLt
if (i < 3 && data_type != FLOAT_DATATYPE) {
printf("***cublasLt Gemm Testing Begin***\n");
            // Let's try a fixed number of combinations
int ALGO_COMBINATIONS = 5000;
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
LtHgemmCustomFind<T, scaleT>(ltHandle,
batch_size,
seq_len,
head_num,
size_per_head,
n,
m,
k,
&alpha,
d_B,
d_A,
&beta,
d_C,
cublas_workspace,
workSpaceSize,
fd,
perfResults,
ALGO_COMBINATIONS);
if (perfResults[0].time < exec_time) {
printPerfStructure(
batch_size, seq_len, head_num, size_per_head, n, m, k, perfResults[0], fd, data_type, 0);
exec_time = perfResults[0].time;
}
else {
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"-1 -1 "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"-1 -1 -1 "
#endif
"%f\n",
batch_size,
seq_len,
head_num,
size_per_head,
data_type,
batchCount[i],
n,
m,
k,
fast_algo,
exec_time);
}
printf("***cublasLt Gemm Testing End***\n");
}
else {
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d -1 -1 -1 -1 -1 -1 -1 "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"-1 -1 "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"-1 -1 -1 "
#endif
"%f\n",
batch_size,
seq_len,
head_num,
size_per_head,
data_type,
batchCount[i],
n,
m,
k,
fast_algo,
exec_time);
}
exec_times[i] = exec_time;
cudaFree(darray);
}
printf("***cublas Gemm Testing End***\n\n");
fclose(fd);
printf("***Encoder Gemm Testing End***\n");
#ifdef SPARSITY_ENABLED
bool do_sparse_test = false;
if (prop.major == 8 && (prop.minor == 0 || prop.minor == 6)) {
do_sparse_test = true;
}
if (do_sparse_test && sizeof(T) == sizeof(half)) {
printf("***cusparseLt Gemm Testing Begin***\n");
        // only the first 3 cases can be sparse
const int spgemm_num = 3;
if (!isAppend) {
fd = fopen(SPGEMM_CONFIG, "w+");
}
else {
fd = fopen(SPGEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fd) != NULL) {
config.push_back(std::string(line));
}
line_count = config.size();
if (config.size() >= (MAX_CONFIG_NUM * spgemm_num + 1)) // 6 cublas/cublasLt, first row is not included
{
int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * spgemm_num);
fclose(fd);
fd = fopen(SPGEMM_CONFIG, "w+");
fprintf(fd, "%s", config[0].c_str());
for (uint i = startIdx; i < config.size(); i++) {
fprintf(fd, "%s", config[i].c_str());
}
line_count = config.size() - (spgemm_num + 3);
}
}
if (line_count == 0) {
fprintf(
fd,
"batch_size, seq_len, head_num, size_per_head dataType ### batchCount, m, n, k, algoId, exec_time\n");
}
cusparseLtHandle_t handle;
CHECK_CUSPARSE(cusparseLtInit(&handle));
cusparseOrder_t order = CUSPARSE_ORDER_COL;
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseComputeType compute_type = CUSPARSE_COMPUTE_16F;
unsigned alignment = 16;
cudaStream_t stream = 0;
float alpha2 = 1.0f;
float beta2 = 0.0f;
for (int i = 0; i < spgemm_num; ++i) {
// to be compatible with spgemm wrapper, we let A be the weight matrix
// so m and n are swapped
// A: mxk B: kxn C:mxn
int m = N[i], n = M[i], k = K[i];
printf("\n-----------------------------\n");
printf("GEMM test %d: [M: %d, K: %d, N: %d]\n", i, m, k, n);
T* d_A = (T*)buffer;
T* d_B = d_A + m * k * batchCount[i];
T* d_C = d_B + k * n * batchCount[i];
T* dA_compressed;
{
cusparseLtMatDescriptor_t matA;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(
cusparseLtSpMMAPrune2(&handle, &matA, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
size_t compressed_size;
CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &matA, &compressed_size))
check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size));
CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &matA, true, opA, d_A, dA_compressed, stream))
}
float exec_time = 99999.0f;
int fast_algo = 0;
for (int alg = 0; alg < 4; ++alg) {
cudaDeviceSynchronize();
cusparseLtMatDescriptor_t matA, matB, matC;
void* d_workspace = nullptr;
int num_streams = 1;
cudaStream_t streams[1] = {stream};
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matB, k, n, k, alignment, CUDA_R_16F, order))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matC, m, n, m, alignment, CUDA_R_16F, order))
gettimeofday(&start, NULL);
for (int ite = 0; ite < ites; ++ite) {
                // Initializing the matrix descriptors takes a lot of time,
                // and these descriptors could be cached elsewhere,
                // whereas caching the MatmulPlan elsewhere causes errors.
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulAlgSelection_t alg_sel;
cusparseLtMatmulPlan_t plan;
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(
&handle, &matmul, opA, opB, &matA, &matB, &matC, &matC, compute_type))
CHECK_CUSPARSE(
cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
&handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg)))
size_t workspace_size;
CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&handle, &alg_sel, &workspace_size))
CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel, workspace_size))
CHECK_CUSPARSE(cusparseLtMatmul(&handle,
&plan,
&alpha2,
dA_compressed,
d_B,
&beta2,
d_C,
d_C,
d_workspace,
streams,
num_streams))
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
if (diffTime(start, end) < exec_time) {
exec_time = diffTime(start, end);
fast_algo = alg;
}
}
exec_time /= ites;
if (exec_time >= exec_times[i]) {
fast_algo = -1;
}
printf("fast_algo %d\n", fast_algo);
fprintf(fd,
"%d %d %d %d %d ### %d %d %d %d %d %f\n",
batch_size,
seq_len,
head_num,
size_per_head,
HALF_DATATYPE,
batchCount[i],
m,
n,
k,
fast_algo,
exec_time);
cudaFree(dA_compressed);
}
CHECK_CUSPARSE(cusparseLtDestroy(&handle))
fclose(fd);
printf("***cusparseLt Gemm Testing End***\n");
}
#endif
return;
}
template void generate_encoder_gemm_config<float>(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend, int tensor_para_size);
template void generate_encoder_gemm_config<half>(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend, int tensor_para_size);
#ifdef ENABLE_BF16
template void generate_encoder_gemm_config<__nv_bfloat16>(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend, int tensor_para_size);
#endif
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/gemm_test/gemm_func.h"
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <cuda_fp16.h>
#include <cuda_profiler_api.h>
#include <map>
#include <sys/time.h>
#include <unistd.h>
#include <vector>
namespace fastertransformer {
template<typename T>
void generate_encoder_gemm_config(int batch_size,
int seq_len,
int head_num,
int size_per_head,
void* buffer,
bool isAppend = true,
int tensor_para_size = 1);
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "encoder_igemm_func.h"
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif
namespace fastertransformer {
int batch_size_;
int seq_len_;
int head_num_;
int size_per_head_;
static const char* showStatus(cublasStatus_t error)
{
switch (error) {
case CUBLAS_STATUS_SUCCESS:
return "CUBLAS_STATUS_SUCCESS";
case CUBLAS_STATUS_NOT_INITIALIZED:
return "CUBLAS_STATUS_NOT_INITIALIZED";
case CUBLAS_STATUS_ALLOC_FAILED:
return "CUBLAS_STATUS_ALLOC_FAILED";
case CUBLAS_STATUS_INVALID_VALUE:
return "CUBLAS_STATUS_INVALID_VALUE";
case CUBLAS_STATUS_ARCH_MISMATCH:
return "CUBLAS_STATUS_ARCH_MISMATCH";
case CUBLAS_STATUS_MAPPING_ERROR:
return "CUBLAS_STATUS_MAPPING_ERROR";
case CUBLAS_STATUS_EXECUTION_FAILED:
return "CUBLAS_STATUS_EXECUTION_FAILED";
case CUBLAS_STATUS_INTERNAL_ERROR:
return "CUBLAS_STATUS_INTERNAL_ERROR";
case CUBLAS_STATUS_NOT_SUPPORTED:
return "CUBLAS_STATUS_NOT_SUPPORTED";
case CUBLAS_STATUS_LICENSE_ERROR:
return "CUBLAS_STATUS_LICENSE_ERROR";
}
return "<unknown>";
}
// Utility function to print customMatmulPerf_t structure
int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint)
{
int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme, stages;
const cublasLtMatmulAlgo_t* matmulAlgo = &perf.algo;
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
#else
stages = 0;
#endif
printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d "
"time %f workspace=%d mathMode=%d waves=%f\n",
algoId,
tile,
matmulTileName[tile],
numSplitsK,
reductionScheme,
swizzle,
customOption,
stages,
perf.status,
perf.time,
(int)perf.workspaceSize,
(int)perf.mathMode,
perf.wavesCount);
    // choose the fastest algo that does not need workspace
if ((int)perf.workspaceSize == 0 && hasPrint == 0) {
fprintf(fout,
"%d %d %d %d %d ### 1 %d %d %d %d %d %d %d %d %d %d %d %f\n",
batch_size_,
seq_len_,
head_num_,
size_per_head_,
INT8_DATATYPE,
m,
n,
k,
algoId,
customOption,
tile,
numSplitsK,
swizzle,
reductionScheme,
(int)perf.workspaceSize,
stages,
perf.time);
return 1;
}
else {
return hasPrint;
}
}
int printBatchPerfStructure(
int batchCount, int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint)
{
int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme, stages;
const cublasLtMatmulAlgo_t* matmulAlgo = &perf.algo;
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
#else
stages = 0;
#endif
printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d stages=%d} status %d "
"time %f workspace=%d mathMode=%d waves=%f\n",
algoId,
tile,
matmulTileName[tile],
numSplitsK,
reductionScheme,
swizzle,
customOption,
stages,
perf.status,
perf.time,
(int)perf.workspaceSize,
(int)perf.mathMode,
perf.wavesCount);
    // choose the fastest algo that does not need workspace
if ((int)perf.workspaceSize == 0 && hasPrint == 0) {
fprintf(fout,
"%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d %f\n",
batch_size_,
seq_len_,
head_num_,
size_per_head_,
INT8_DATATYPE,
batchCount,
m,
n,
k,
algoId,
customOption,
tile,
numSplitsK,
swizzle,
reductionScheme,
(int)perf.workspaceSize,
stages,
perf.time);
return 1;
}
else {
return hasPrint;
}
}
static inline bool time_compare(const customMatmulPerf_t& perf_a, const customMatmulPerf_t& perf_b)
{
return ((perf_a.status == CUBLAS_STATUS_SUCCESS) && (perf_a.time < perf_b.time));
}
static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, // to get the capabilities (required a GPU)
cublasLtMatmulDesc_t operationDesc,
const void* alpha, /* host or device pointer */
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta, /* host or device pointer */
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
const cublasLtMatmulAlgo_t& algo,
int kernelRepeats,
void* workSpace,
size_t workSpaceSizeInBytes,
customMatmulPerf_t& perfResults,
cudaStream_t stream)
{
cublasLtMatmulHeuristicResult_t heurResult;
/* Looping over the Algo */
int repeats = kernelRepeats;
cublasStatus_t algoStatus =
cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult);
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
if (heurResult.workspaceSize <= workSpaceSizeInBytes) {
struct timeval start, end;
cublasStatus_t oneRunStatus;
cudaDeviceSynchronize();
gettimeofday(&start, NULL);
for (int loop = 0; loop < repeats; loop++) {
oneRunStatus = cublasLtMatmul(ltHandle,
operationDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
D,
Ddesc,
&algo,
workSpace,
workSpaceSizeInBytes,
stream);
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
if (oneRunStatus != CUBLAS_STATUS_SUCCESS) {
algoStatus = oneRunStatus;
}
float time = diffTime(start, end);
// For the moment only add successful findings
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
perfResults.algo = algo;
perfResults.time = time / repeats;
perfResults.workspaceSize = heurResult.workspaceSize;
perfResults.wavesCount = heurResult.wavesCount;
}
}
else {
// printf("not enough workspace! %ld\n", heurResult.workspaceSize);
algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace
}
}
else {
// printf("check fail!\n");
}
return algoStatus;
}
// Sample wrapper running through multiple algo and config attribute combinations for INT8 gemm using the cublasLt
// low-level API
template<typename T, typename scaleT>
int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
int m,
int n,
int k,
const scaleT* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const scaleT* beta, /* host pointer */
T* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout)
{
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
cudaStream_t stream = 0;
    // SplitK values that we are going to try when SplitK is supported for a given algo
const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
    // Let's try a fixed number of combinations
#define ALGO_COMBINATIONS 50000
int AlgoCombinations = ALGO_COMBINATIONS;
int AlgoCount = 0;
    int kernelRepeats = 100;  // number of times the CUDA kernels will be run back to back
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
int nbAlgoIds = 0;
#define ALGO_IDS 100
int algoIdA[ALGO_IDS];
cudaDataType_t Atype, Btype, Ctype, scaleType;
Atype = CUDA_R_8I;
Btype = CUDA_R_8I;
if (std::is_same<T, int32_t>::value && std::is_same<scaleT, int>::value) {
Ctype = CUDA_R_32I;
scaleType = CUDA_R_32I;
}
else if (std::is_same<T, int8_t>::value && std::is_same<scaleT, float>::value) {
Ctype = CUDA_R_8I;
scaleType = CUDA_R_32F;
}
else {
printf("[ERROR]<T,scaleT> of igemm is invalid\n");
exit(-1);
}
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
cudaDataType_t computeType = CUDA_R_32I;
#endif
cublasOperation_t opTranspose = CUBLAS_OP_T;
bool use_ORDER_COL32_2R_4R4 = false;
#if (CUDART_VERSION >= 11000)
int device{-1};
cudaGetDevice(&device);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
if (props.major * 10 + props.minor >= 80) {
use_ORDER_COL32_2R_4R4 = true;
}
#endif
cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t order_matrixB;
#if (CUDART_VERSION >= 11000)
if (use_ORDER_COL32_2R_4R4) {
order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
}
else {
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
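    // Leading dimensions of the transformed layouts: COL32 packs 32 columns per interleaved
    // "super-column", so A and C use ld = 32 * m; matrix B uses COL32_2R_4R4 (Ampere+) or
    // COL4_4R2_8C, which round n up to a multiple of 32 or 8 before the same 32x packing.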
int ldaTransform = 32 * m;
int ldbTransform;
if (use_ORDER_COL32_2R_4R4) {
ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
}
else {
ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
}
int ldcTransform = 32 * m;
#if (CUDART_VERSION >= 11000)
status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
#endif
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
// Create matrix descriptors.
status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, ldaTransform);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, n, k, ldbTransform);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status =
cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldcTransform);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
// Request AlgoId available for IGEMM
status = cublasLtMatmulAlgoGetIds(
ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, ALGO_IDS, algoIdA, &nbAlgoIds);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
// Loop over the Algo IDs
for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) {
cublasLtMatmulAlgo_t algo;
size_t sizeWritten = 0;
        /* Initialize algo structure with given Algo ID */
status =
cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, algoIdA[idx], &algo);
if (status != CUBLAS_STATUS_SUCCESS) {
continue;
}
// Query the tiles enums supported by that algo
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten);
int nbTiles = int(sizeWritten / sizeof(int));
int* tileA = new int[nbTiles == 0 ? 1 : nbTiles];
if (nbTiles == 0) {
tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED;
nbTiles = 1;
}
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten);
int nbStages = int(sizeWritten / sizeof(int));
std::vector<int> stagesA(nbStages == 0 ? 1 : nbStages);
if (nbStages == 0) {
stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED;
nbStages = 1;
}
else {
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten);
}
#endif
int splitkSupport, redMask, swizzlingMax, customOptionMax;
        // Retrieve Algo Capabilities attributes to be able to set up the loop over the different combinations
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten);
/* Loop over the different tiles */
for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) {
#if (CUDART_VERSION >= 11000)
/* Loop over different stages count */
for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx]));
#endif
/* Loop over the different custom option if any */
for (int customOption = 0; customOption <= customOptionMax; customOption++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption));
/* Loop over the CTAs swizzling support */
for (int k = 0; k <= swizzlingMax; k++) {
int splitK_trial = 0;
if (splitkSupport) {
splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]);
}
                    // Loop over the splitK values in the fixed sequence splitKSequenceA, in addition to the case
                    // where splitK is not enabled
for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) {
/* Setup attribute of the algo to run */
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx]));
int splitK_val = 0;
int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int));
if (l > 0) { // Split-K case
splitK_val = splitKSequenceA[l - 1];
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
&splitKSequenceA[l - 1],
sizeof(splitKSequenceA[l - 1]));
/* Going over all the reduction scheme */
for (redScheme = 1;
redScheme <= (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations);
redScheme = redScheme << 1) {
if (redScheme & redMask) {
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
&redScheme,
sizeof(redScheme));
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Cdesc,
algo,
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[AlgoCount],
stream);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
} // end if
} // end for
}
else { // Non-splitK case
/* if user preference is ok with workspace */
if (AlgoCount < AlgoCombinations) {
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Cdesc,
algo,
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[AlgoCount],
stream);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
}
}
} // end l
} // end k
} // end customOption
#if (CUDART_VERSION >= 11000)
} // end stagesIdx
#endif
} // end tileIdx
delete[] tileA;
} // end idx
// Sort the results per run duration
std::sort(perfResults, perfResults + AlgoCount, time_compare);
// Print timing and perf details
for (int i = 0, hasPrint = 0; i < AlgoCount; i++) {
printf("result %03d : ", i);
hasPrint = printPerfStructure(m, n, k, perfResults[i], fout, hasPrint);
}
CLEANUP:
// Descriptors are no longer needed as all GPU work was already enqueued
if (Cdesc) {
cublasLtMatrixLayoutDestroy(Cdesc);
}
if (Bdesc) {
cublasLtMatrixLayoutDestroy(Bdesc);
}
if (Adesc) {
cublasLtMatrixLayoutDestroy(Adesc);
}
if (operationDesc) {
cublasLtMatmulDescDestroy(operationDesc);
}
return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
template int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
int m,
int n,
int k,
const int* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const int* beta, /* host pointer */
int32_t* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout);
template int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const float* beta, /* host pointer */
int8_t* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout);
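// A minimal sketch of calling the INT8 search above (pointer names, the workspace and the
// output file are hypothetical):
//
//   int alpha = 1, beta = 0;
//   LtIgemmCustomFind<int32_t, int>(ltHandle, m, n, k, &alpha, d_A_int8, d_B_int8, &beta,
//                                   d_C_int32, cublas_workspace, CUBLAS_WORKSPACE_SIZE, fd);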
template<typename T, typename scaleT>
int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
int batchCount,
int m,
int n,
int k,
const scaleT* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const scaleT* beta, /* host pointer */
T* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout)
{
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
cudaStream_t stream = 0;
    // SplitK values that we are going to try when SplitK is supported for a given algo
const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
    // Let's try a fixed number of combinations
#define ALGO_COMBINATIONS 50000
int AlgoCombinations = ALGO_COMBINATIONS;
int AlgoCount = 0;
    int kernelRepeats = 100;  // number of times the CUDA kernels will be run back to back
customMatmulPerf_t perfResults[ALGO_COMBINATIONS];
int nbAlgoIds = 0;
#define ALGO_IDS 100
int algoIdA[ALGO_IDS];
cudaDataType_t Atype, Btype, Ctype, scaleType;
Atype = CUDA_R_8I;
Btype = CUDA_R_8I;
if (std::is_same<T, int32_t>::value && std::is_same<scaleT, int>::value) {
Ctype = CUDA_R_32I;
scaleType = CUDA_R_32I;
}
else if (std::is_same<T, int8_t>::value && std::is_same<scaleT, float>::value) {
Ctype = CUDA_R_8I;
scaleType = CUDA_R_32F;
}
else {
printf("[ERROR]<T,scaleT> of igemm is invalid\n");
exit(-1);
}
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType = CUBLAS_COMPUTE_32I;
#else
cudaDataType_t computeType = CUDA_R_32I;
#endif
cublasOperation_t opTranspose = CUBLAS_OP_T;
bool use_ORDER_COL32_2R_4R4 = false;
#if (CUDART_VERSION >= 11000)
int device{-1};
cudaGetDevice(&device);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, device);
if (props.major * 10 + props.minor >= 80) {
use_ORDER_COL32_2R_4R4 = true;
}
#endif
cublasLtOrder_t order_COL32 = CUBLASLT_ORDER_COL32;
cublasLtOrder_t order_matrixB;
#if (CUDART_VERSION >= 11000)
if (use_ORDER_COL32_2R_4R4) {
order_matrixB = CUBLASLT_ORDER_COL32_2R_4R4;
}
else {
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
}
#else
order_matrixB = CUBLASLT_ORDER_COL4_4R2_8C;
#endif
int ldaTransform = 32 * m;
int ldbTransform;
if (use_ORDER_COL32_2R_4R4) {
ldbTransform = 32 * ((n + 32 - 1) / 32) * 32;
}
else {
ldbTransform = 32 * ((n + 8 - 1) / 8) * 8;
}
int ldcTransform = 32 * m;
int64_t stridea, strideb, stridec;
stridea = m * k;
strideb = n * k;
stridec = m * n;
#if (CUDART_VERSION >= 11000)
status = cublasLtMatmulDescCreate(&operationDesc, computeType, scaleType);
#else
status = cublasLtMatmulDescCreate(&operationDesc, scaleType);
#endif
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opTranspose, sizeof(cublasOperation_t));
// Create matrix descriptors.
status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, ldaTransform);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridea, sizeof(stridea));
status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, n, k, ldbTransform);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status =
cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_matrixB, sizeof(order_matrixB));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideb, sizeof(strideb));
status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, ldcTransform);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &order_COL32, sizeof(order_COL32));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount));
cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stridec, sizeof(stridec));
// Request AlgoId available for IGEMM
status = cublasLtMatmulAlgoGetIds(
ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, ALGO_IDS, algoIdA, &nbAlgoIds);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
// Loop over the Algo IDs
for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) {
cublasLtMatmulAlgo_t algo;
size_t sizeWritten = 0;
/* Initialize algo structure with given Algo ID */
status =
cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Ctype, algoIdA[idx], &algo);
if (status != CUBLAS_STATUS_SUCCESS) {
continue;
}
// Query the tiles enums supported by that algo
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten);
int nbTiles = int(sizeWritten / sizeof(int));
int* tileA = new int[nbTiles == 0 ? 1 : nbTiles];
if (nbTiles == 0) {
tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED;
nbTiles = 1;
}
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten);
int nbStages = int(sizeWritten / sizeof(int));
std::vector<int> stagesA(nbStages == 0 ? 1 : nbStages);
if (nbStages == 0) {
stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED;
nbStages = 1;
}
else {
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten);
}
#endif
int splitkSupport, redMask, swizzlingMax, customOptionMax;
// Retrieve Algo Capabilities attributes to be able to setup loop over the different combinations
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten);
/* Loop over the different tiles */
for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) {
#if (CUDART_VERSION >= 11000)
/* Loop over different stages count */
for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx]));
#endif
/* Loop over the different custom option if any */
for (int customOption = 0; customOption <= customOptionMax; customOption++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption));
/* Loop over the CTAs swizzling support */
for (int k = 0; k <= swizzlingMax; k++) {
int splitK_trial = 0;
if (splitkSupport) {
splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]);
}
// Loop over the splitK value over a fixed sequence splitKSequenceA in addition to the case
// where splitK is not enabled
for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) {
/* Setup attribute of the algo to run */
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx]));
int splitK_val = 0;
int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int));
if (l > 0) { // Split-K case
splitK_val = splitKSequenceA[l - 1];
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
&splitKSequenceA[l - 1],
sizeof(splitKSequenceA[l - 1]));
/* Going over all the reduction scheme */
for (redScheme = 1;
redScheme <= (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations);
redScheme = redScheme << 1) {
if (redScheme & redMask) {
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
&redScheme,
sizeof(redScheme));
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Cdesc,
algo,
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[AlgoCount],
stream);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
} // end if
} // end for
}
else { // Non-splitK case
/* if user preference is ok with workspace */
if (AlgoCount < AlgoCombinations) {
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Cdesc,
algo,
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[AlgoCount],
stream);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
}
}
} // end l
} // end k
} // end customOption
#if (CUDART_VERSION >= 11000)
} // end stagesIdx
#endif
} // end tileIdx
delete[] tileA;
} // end idx
// Sort the results per run duration
std::sort(perfResults, perfResults + AlgoCount, time_compare);
// Print timing and perf details
for (int i = 0, hasPrint = 0; i < AlgoCount; i++) {
printf("result %03d : ", i);
hasPrint = printBatchPerfStructure(batchCount, m, n, k, perfResults[i], fout, hasPrint);
}
CLEANUP:
// Descriptors are no longer needed as all GPU work was already enqueued
if (Cdesc) {
cublasLtMatrixLayoutDestroy(Cdesc);
}
if (Bdesc) {
cublasLtMatrixLayoutDestroy(Bdesc);
}
if (Adesc) {
cublasLtMatrixLayoutDestroy(Adesc);
}
if (operationDesc) {
cublasLtMatmulDescDestroy(operationDesc);
}
return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
template int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
int batchCount,
int m,
int n,
int k,
const int* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const int* beta, /* host pointer */
int32_t* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout);
template int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
int batchCount,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const float* beta, /* host pointer */
int8_t* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout);
// initialize matrix in column-major
void matInit(int rows, int cols, int8_t* p, int ld)
{
srand(time(NULL));
for (int c = 0; c < cols; c++) {
for (int r = 0; r < rows; r++) {
int index = r + c * ld;
p[index] = rand() % 255 - 127;
}
}
}
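// matInit fills a column-major int8 matrix (leading dimension ld >= rows) with
// pseudo-random values in [-127, 127]. A minimal usage sketch (hypothetical host
// buffer names; not part of the original code):
//   std::vector<int8_t> h_A(lda * cols);
//   matInit(rows, cols, h_A.data(), lda);
//   check_cuda_error(cudaMemcpy(d_A, h_A.data(), h_A.size(), cudaMemcpyHostToDevice));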
int batch_igemm_config(int batchCount, int m, int n, int k, FILE* fout, void* buffer)
{
printf("batchCount %d m %d n %d k %d\n", batchCount, m, n, k);
int alpha = 1;
int beta = 0;
int8_t* d_A = (int8_t*)buffer; // m * k, stored in column-major
int8_t* d_B = d_A + batchCount * m * k; // k * n, stored in column-major
int32_t* d_C = (int32_t*)(d_B + batchCount * k * n); // m * n, stored in column-major
cublasLtHandle_t ltHandle;
cublasLtCreate(&ltHandle);
LtBatchIgemmCustomFind(ltHandle,
batchCount,
m,
n,
k,
&alpha, /* host pointer */
d_A,
d_B,
&beta, /* host pointer */
d_C,
NULL,
0,
fout);
// free memory
cublasLtDestroy(ltHandle);
return 0;
}
int igemm_config(int m, int n, int k, FILE* fout, void* buffer)
{
printf("batchCount %d m %d n %d k %d\n", 1, m, n, k);
int alpha = 1;
int beta = 0;
int8_t* d_A = (int8_t*)buffer; // m * k, stored in column-major
int8_t* d_B = d_A + m * k; // k * n, stored in column-major
int32_t* d_C = (int32_t*)(d_B + k * n); // m * n, stored in column-major
cublasLtHandle_t ltHandle;
cublasLtCreate(&ltHandle);
LtIgemmCustomFind(ltHandle,
m,
n,
k,
&alpha, /* host pointer */
d_A,
d_B,
&beta, /* host pointer */
d_C,
NULL,
0,
fout);
cublasLtDestroy(ltHandle);
return 0;
}
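// generate_encoder_igemm_config() below benchmarks six GEMM shapes with the helpers
// above; they appear to correspond to the encoder layer's QKV projection (batchCount 3),
// Q*K^T and softmax(QK)*V (batched by batch_size * head_num), the attention output
// projection, and the two FFN GEMMs (n = 4 * hidden and k = 4 * hidden).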
int generate_encoder_igemm_config(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend)
{
// ensure program running on SM >= 7.5
struct cudaDeviceProp prop;
check_cuda_error(cudaGetDeviceProperties(&prop, 0));
if (!(prop.major >= 8 || (prop.major >= 7 && prop.minor >= 5))) {
printf("[ERROR] INT8 mode > 0 is only supported on device with sm >= 7.5\n ");
exit(-1);
}
printf("Device %s\n", prop.name);
// check config
FILE* fout;
if (!isAppend) {
fout = fopen(IGEMM_CONFIG, "w+");
fprintf(
fout,
"batch_size seq_len head_num size_per_head dataType ### batchCount m n k algoId customOption tile splitK_val swizzle reductionScheme workspaceSize stages exec_time\n");
}
else {
fout = fopen(IGEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fout) != NULL) {
config.push_back(std::string(line));
}
if (config.size() >= MAX_CONFIG_NUM * GEMM_NUM) {
int startIdx = config.size() - (MAX_CONFIG_NUM - 1) * GEMM_NUM;
fclose(fout);
fout = fopen(IGEMM_CONFIG, "w+");
for (int i = startIdx; i < (int)config.size(); i++) {
fprintf(fout, "%s", config[i].c_str());
}
}
}
batch_size_ = batch_size;
seq_len_ = seq_len;
head_num_ = head_num;
size_per_head_ = size_per_head;
int m = batch_size * seq_len;
int n = head_num * size_per_head;
int k = n;
int batchCount;
printf("***Encoder IGemm Testing Begin***\n");
printf("\n-----------------------------\n");
batchCount = 3;
m = batch_size * seq_len;
k = head_num * size_per_head;
n = k;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
batch_igemm_config(batchCount, m, n, k, fout, buffer);
}
printf("\n-----------------------------\n");
m = seq_len;
n = seq_len;
k = size_per_head;
batchCount = batch_size * head_num;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
batch_igemm_config(batchCount, m, n, k, fout, buffer);
}
printf("\n-----------------------------\n");
m = seq_len;
n = size_per_head;
k = seq_len;
batchCount = batch_size * head_num;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
batch_igemm_config(batchCount, m, n, k, fout, buffer);
}
printf("\n-----------------------------\n");
m = batch_size * seq_len;
n = head_num * size_per_head;
k = head_num * size_per_head;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
igemm_config(m, n, k, fout, buffer);
}
printf("\n-----------------------------\n");
n = 4 * n;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
igemm_config(m, n, k, fout, buffer);
}
printf("\n-----------------------------\n");
n = k;
k = 4 * n;
if (n % 32 != 0 || k % 32 != 0) {
printf("[WARNING] For INT8 gemm test, n, k should be multiples of 32 (n = %d, k = %d)\n", n, k);
}
else {
igemm_config(m, n, k, fout, buffer);
}
fclose(fout);
printf("\n-----------------------------\n");
printf("***Encoder IGemm Testing End***\n");
#ifdef SPARSITY_ENABLED
bool do_sparse_test = false;
if (prop.major == 8 && (prop.minor == 0 || prop.minor == 6)) {
do_sparse_test = true;
}
if (do_sparse_test) {
printf("***cusparseLt Gemm Testing Begin***\n");
const int spgemm_num = 3;
FILE* fd;
int line_count = 0;
const int ites = 100;
struct timeval start, end;
if (!isAppend) {
fd = fopen(SPIGEMM_CONFIG, "w+");
}
else {
fd = fopen(SPIGEMM_CONFIG, "a+");
std::vector<std::string> config;
char line[1024];
while (fgets(line, 1024, fd) != NULL) {
config.push_back(std::string(line));
}
line_count = config.size();
if (config.size() >= (MAX_CONFIG_NUM * spgemm_num + 1))  // +1 for the header row, which is not a config entry
{
int startIdx = config.size() - ((MAX_CONFIG_NUM - 1) * spgemm_num);
fclose(fd);
fd = fopen(SPIGEMM_CONFIG, "w+");
fprintf(fd, "%s", config[0].c_str());
for (uint i = startIdx; i < config.size(); i++) {
fprintf(fd, "%s", config[i].c_str());
}
line_count = config.size() - (spgemm_num + 3);
}
}
if (line_count == 0) {
fprintf(
fd,
"batch_size, seq_len, head_num, size_per_head dataType ### batchCount, m, n, k, algoId, exec_time\n");
}
int M[spgemm_num];
int N[spgemm_num];
int K[spgemm_num];
// gemm1
M[0] = batch_size * seq_len;
K[0] = head_num * size_per_head;
N[0] = K[0];
// gemm2
M[1] = M[0];
K[1] = K[0];
N[1] = 4 * N[0];
// gemm3
M[2] = M[0];
K[2] = 4 * K[0];
N[2] = N[0];
cusparseLtHandle_t handle;
CHECK_CUSPARSE(cusparseLtInit(&handle));
cusparseOrder_t col_order = CUSPARSE_ORDER_COL;
cusparseOrder_t row_order = CUSPARSE_ORDER_ROW;
cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseOperation_t opB = CUSPARSE_OPERATION_NON_TRANSPOSE;
cusparseComputeType compute_type = CUSPARSE_COMPUTE_32I;
unsigned alignment = 16;
cudaStream_t stream = 0;
float alpha2 = 1.0f;
float beta2 = 0.0f;
for (int i = 0; i < spgemm_num; ++i) {
// to be compatible with spgemm wrapper, we let A be the weight matrix
// so m and n are swapped
// A: mxk B: kxn C:mxn
int m = N[i], n = M[i], k = K[i];
printf("\n-----------------------------\n");
printf("GEMM test %d: [M: %d, K: %d, N: %d]\n", i, m, k, n);
int8_t* d_A = (int8_t*)buffer;
int8_t* d_B = d_A + m * k;
int8_t* d_C = d_B + k * n;
int8_t* dA_compressed;
{
cusparseLtMatDescriptor_t matA;
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&handle, &matA, m, k, k, alignment, CUDA_R_8I, row_order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(
cusparseLtSpMMAPrune2(&handle, &matA, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
size_t compressed_size;
CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &matA, &compressed_size))
check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size));
CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &matA, true, opA, d_A, dA_compressed, stream))
}
cudaDeviceSynchronize();
cudaError_t result = cudaGetLastError();
if (result) {
throw std::runtime_error(std::string("[FT][ERROR] CUDA runtime error: ") + cudaGetErrorString(result));
}
float exec_time = 99999.0f;
int fast_algo = 0;
for (int alg = 0; alg < 4; ++alg) {
cudaDeviceSynchronize();
cusparseLtMatDescriptor_t matA, matB, matC;
void* d_workspace = nullptr;
int num_streams = 1;
cudaStream_t streams[1] = {stream};
CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
&handle, &matA, m, k, k, alignment, CUDA_R_8I, row_order, CUSPARSELT_SPARSITY_50_PERCENT))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matB, k, n, k, alignment, CUDA_R_8I, col_order))
CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matC, m, n, m, alignment, CUDA_R_8I, col_order))
gettimeofday(&start, NULL);
for (int ite = 0; ite < ites; ++ite) {
// initializing the matrix descriptors is expensive, so they could be cached elsewhere,
// whereas caching the matmul plan elsewhere causes errors
cusparseLtMatmulDescriptor_t matmul;
cusparseLtMatmulAlgSelection_t alg_sel;
cusparseLtMatmulPlan_t plan;
CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(
&handle, &matmul, opA, opB, &matA, &matB, &matC, &matC, compute_type))
CHECK_CUSPARSE(
cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
&handle, &alg_sel, CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg)))
size_t workspace_size;
CHECK_CUSPARSE(cusparseLtMatmulGetWorkspace(&handle, &alg_sel, &workspace_size))
CHECK_CUSPARSE(cusparseLtMatmulPlanInit(&handle, &plan, &matmul, &alg_sel, workspace_size))
CHECK_CUSPARSE(cusparseLtMatmul(&handle,
&plan,
&alpha2,
dA_compressed,
d_B,
&beta2,
d_C,
d_C,
d_workspace,
streams,
num_streams))
CHECK_CUSPARSE(cusparseLtMatmulPlanDestroy(&plan))
}
cudaDeviceSynchronize();
gettimeofday(&end, NULL);
printf("algo_%d costs %.3fms \n", alg, diffTime(start, end) / ites);
if (diffTime(start, end) < exec_time) {
exec_time = diffTime(start, end);
fast_algo = alg;
}
}
exec_time /= ites;
printf("fast_algo %d\n", fast_algo);
fprintf(fd,
"%d %d %d %d %d ### 1 %d %d %d %d %f\n",
batch_size,
seq_len,
head_num,
size_per_head,
HALF_DATATYPE,
m,
n,
k,
fast_algo,
exec_time);
cudaFree(dA_compressed);
}
CHECK_CUSPARSE(cusparseLtDestroy(&handle))
fclose(fd);
printf("***cusparseLt Gemm Testing End***\n");
}
#endif
return 0;
}
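// A minimal driver sketch (hypothetical, not part of this file): the device buffer is
// assumed to be sized with calGemmTestBufSizeInByte() (defined later in this check-in)
// using the INT8 path (int8_mode > 0), on which the data_type argument is unused:
//   void*  d_buf = nullptr;
//   size_t bytes = calGemmTestBufSizeInByte(batch_size, seq_len, head_num, size_per_head,
//                                           4 * head_num * size_per_head, /*vocab_size=*/0,
//                                           /*int8_mode=*/1, HALF_DATATYPE /*unused here*/);
//   check_cuda_error(cudaMalloc(&d_buf, bytes));
//   generate_encoder_igemm_config(batch_size, seq_len, head_num, size_per_head, d_buf, false);
//   check_cuda_error(cudaFree(d_buf));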
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <algorithm>
#include <cublasLt.h>
#include <cuda_runtime.h>
#include <map>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#include <vector>
namespace fastertransformer {
/* CAUTION : must match cublasLtMatmulTile_t */
const char* const matmulTileName[] = {"UNDEF", "8x8", "8x16", "16x8", "8x32", "16x16", "32x8",
"8x64", "16x32", "32x16", "64x8", "32x32", "32x64", "64x32",
"32x128", "64x64", "128x32", "64x128", "128x64", "64x256", "128x128",
"256x64", "64x512", "128x256", "256x128", "512x64", "64x96", "96*64",
"96x128", "128x160", "160x128", "192x128", "128x192", "128x96", "END"};
int generate_encoder_igemm_config(
int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true);
int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint);
int printBatchPerfStructure(
int batchCount, int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint);
template<typename T, typename scaleT>
int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
int m,
int n,
int k,
const scaleT* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const scaleT* beta, /* host pointer */
T* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout);
template<typename T, typename scaleT>
int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
int batchCount,
int m,
int n,
int k,
const scaleT* alpha, /* host pointer */
const int8_t* A,
const int8_t* B,
const scaleT* beta, /* host pointer */
T* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout);
void matInit(int rows, int cols, int8_t* p, int ld);
} // namespace fastertransformer
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "encoder_gemm_func.h"
#include <assert.h>
#include <sys/types.h>
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif
namespace fastertransformer {
// Utility function to print customMatmulPerf_t structure
int printPerfStructure(int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const customMatmulPerf_t& perf,
FILE* fout,
CublasDataType data_type,
int hasPrint,
int batch_count)
{
int algoId, tile, swizzle, customOption, numSplitsK, reductionScheme, stages;
const cublasLtMatmulAlgo_t* matmulAlgo = &perf.algo;
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_ID, &algoId, sizeof(algoId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tile, sizeof(tile), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &numSplitsK, sizeof(numSplitsK), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &reductionScheme, sizeof(reductionScheme), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &swizzle, sizeof(swizzle), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption), NULL);
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoConfigGetAttribute(matmulAlgo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stages, sizeof(stages), NULL);
#else
stages = 0;
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
uint16_t inner_shapeId, cluster_shapeId;
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID, &inner_shapeId, sizeof(inner_shapeId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID, &cluster_shapeId, sizeof(cluster_shapeId), NULL);
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
uint16_t mma_shapeId, cga_shapeId, sche_mode;
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_MMA_SHAPE_ID, &mma_shapeId, sizeof(mma_shapeId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_CGA_SHAPE_ID, &cga_shapeId, sizeof(cga_shapeId), NULL);
cublasLtMatmulAlgoConfigGetAttribute(
matmulAlgo, CUBLASLT_ALGO_CONFIG_SCHEDULING_MODE, &sche_mode, sizeof(sche_mode), NULL);
#endif
printf("algo={ Id=%d, tileIdx=%d (%s) splitK=%d reduc=%d swizzle=%d custom=%d "
#if (CUDART_VERSION >= 11000)
"stages=%d "
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"inner_shapeId=%d cluster_shapeId=%d"
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"mma_shapeId=%d cga_shapeId=%d schedule_mode=%d"
#endif
"} status %d "
"time %fms workspace=%d mathMode=%d waves=%f\n",
algoId,
tile,
matmulTileName[tile],
numSplitsK,
reductionScheme,
swizzle,
customOption,
#if (CUDART_VERSION >= 11000)
stages,
#endif
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
inner_shapeId,
cluster_shapeId,
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
mma_shapeId,
cga_shapeId,
sche_mode,
#endif
perf.status,
perf.time,
(int)perf.workspaceSize,
(int)perf.mathMode,
perf.wavesCount);
if (hasPrint == 0) {
fprintf(fout,
"%d %d %d %d %d ### %d %d %d %d %d %d %d %d %d %d %d %d "
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
"%d %d "
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
"%d %d %d "
#endif
"%f\n",
batch_size,
seq_len,
head_num,
size_per_head,
data_type,
batch_count,
m,
n,
k,
algoId,
customOption,
tile,
numSplitsK,
swizzle,
reductionScheme,
(int)perf.workspaceSize,
stages,
#if (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH >= 3)
inner_shapeId,
cluster_shapeId,
#elif (CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 11 && CUBLAS_VER_PATCH < 3)
mma_shapeId,
cga_shapeId,
sche_mode,
#endif
perf.time);
return 1;
}
else {
return hasPrint;
}
}
static inline bool time_compare(const customMatmulPerf_t& perf_a, const customMatmulPerf_t& perf_b)
{
return ((perf_a.status == CUBLAS_STATUS_SUCCESS) && (perf_a.time < perf_b.time));
}
static cublasStatus_t customMatmulRun(cublasLtHandle_t ltHandle, // to get the capabilities (required a GPU)
cublasLtMatmulDesc_t operationDesc,
const void* alpha, /* host or device pointer */
const void* A,
cublasLtMatrixLayout_t Adesc,
const void* B,
cublasLtMatrixLayout_t Bdesc,
const void* beta, /* host or device pointer */
const void* C,
cublasLtMatrixLayout_t Cdesc,
void* D,
cublasLtMatrixLayout_t Ddesc,
const cublasLtMatmulAlgo_t& algo,
int kernelRepeats,
void* workSpace,
size_t workSpaceSizeInBytes,
customMatmulPerf_t& perfResults,
cudaStream_t stream,
cudaEvent_t& startEvent,
cudaEvent_t& stopEvent)
{
cublasLtMatmulHeuristicResult_t heurResult;
/* Looping over the Algo */
int repeats = kernelRepeats;
cublasStatus_t algoStatus =
cublasLtMatmulAlgoCheck(ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, &algo, &heurResult);
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
if (heurResult.workspaceSize <= workSpaceSizeInBytes) {
cudaError_t err, err1, err2, err3;
err = cudaEventRecord(startEvent, stream);
for (int loop = 0; loop < repeats; loop++) {
cublasStatus_t oneRunStatus = cublasLtMatmul(ltHandle,
operationDesc,
alpha,
A,
Adesc,
B,
Bdesc,
beta,
C,
Cdesc,
D,
Ddesc,
&algo,
workSpace,
workSpaceSizeInBytes,
stream);
if (oneRunStatus != CUBLAS_STATUS_SUCCESS) {
algoStatus = oneRunStatus;
break;
}
}
err1 = cudaEventRecord(stopEvent, stream);
err2 = cudaEventSynchronize(stopEvent);
float time;
err3 = cudaEventElapsedTime(&time, startEvent, stopEvent);
if ((err != cudaSuccess) || (err1 != cudaSuccess) || (err2 != cudaSuccess) || (err3 != cudaSuccess)) {
algoStatus = CUBLAS_STATUS_INTERNAL_ERROR;
}
// For the moment only add successful findings
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
perfResults.algo = algo;
perfResults.time = time / repeats;
perfResults.workspaceSize = heurResult.workspaceSize;
perfResults.wavesCount = heurResult.wavesCount;
}
}
else {
// printf("not enough workspace! %ld\n", heurResult.workspaceSize);
algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace
}
}
return algoStatus;
}
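// customMatmulRun() first validates the candidate configuration with
// cublasLtMatmulAlgoCheck(), rejects it when the required workspace exceeds the caller's
// budget, then launches cublasLtMatmul() kernelRepeats times between two CUDA events;
// the reported time is the elapsed event time divided by the repeat count, and only
// successful runs populate perfResults.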
template<typename T, typename scaleT>
int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const scaleT* alpha, /* host pointer */
const T* A,
const T* B,
const scaleT* beta, /* host pointer */
T* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8,
int batchCount,
int64_t strideA,
int64_t strideB,
int64_t strideD)
{
cublasStatus_t status = CUBLAS_STATUS_SUCCESS;
cudaEvent_t startEvent;
cudaEvent_t stopEvent;
CublasDataType data_type;
cublasLtMatmulDesc_t operationDesc = NULL;
cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL, Ddesc = NULL;
cudaStream_t stream = 0;
// SplitK value that we are going to try when SplitK is supported for a
// given algo
const int splitKSequenceA[] = {2, 3, 4, 5, 6, 8, 12, 16, 32};
// Let's try a fixed number of combinations
int AlgoCount = 0;
int AlgoCountRestrict = 0; // workspace == 0
int maxNumTraversal = 50; // max number of traversal
cublasLtMatmulAlgo_t algos[AlgoCombinations]; // 0 <= workspace <= 32MB
cublasLtMatmulAlgo_t algosRestrict[AlgoCombinations]; // workspace == 0
int kernelRepeats = 100; // number of times the CUDA kernels will be run back to back
int nbAlgoIds = 0; // Number of algorithms actually returned by
// cublasLtMatmulAlgoGetIds function.
#define ALGO_IDS 100 // Number of algorithms requested.
int algoIdA[ALGO_IDS]; // Array containing the algorithm IDs returned by
// cublasLtMatmulAlgoGetIds function.
cudaDataType_t Atype, Btype, Ctype, scaleType, Dtype;
#if (CUDART_VERSION >= 11000)
cublasComputeType_t computeType;
#else
cudaDataType_t computeType;
#endif
if (std::is_same<T, float>::value) {
data_type = FLOAT_DATATYPE;
Atype = CUDA_R_32F, Btype = CUDA_R_32F, Ctype = CUDA_R_32F, Dtype = CUDA_R_32F;
}
else if (std::is_same<T, half>::value) {
data_type = HALF_DATATYPE;
Atype = CUDA_R_16F, Btype = CUDA_R_16F, Ctype = CUDA_R_16F, Dtype = CUDA_R_16F;
}
#ifdef ENABLE_BF16
else if (std::is_same<T, __nv_bfloat16>::value) {
data_type = BFLOAT16_DATATYPE;
Atype = CUDA_R_16BF, Btype = CUDA_R_16BF, Ctype = CUDA_R_16BF, Dtype = CUDA_R_16BF;
}
#endif
#ifdef ENABLE_FP8
else if (std::is_same<T, __nv_fp8_e4m3>::value) {
data_type = FP8_DATATYPE;
Atype = CUDA_R_8F_E4M3, Btype = CUDA_R_8F_E4M3, Ctype = CUDA_R_16BF;
#ifdef FP8_GEMM_OUTPUT_QUANT_DISABLE
Dtype = CUDA_R_16BF;
#else
Dtype = dtype_fp8;
#endif
}
#endif
if (sizeof(scaleT) == sizeof(float)) {
scaleType = CUDA_R_32F;
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_32F;
#else
computeType = CUDA_R_32F;
#endif
}
else {
scaleType = CUDA_R_16F;
#if (CUDART_VERSION >= 11000)
computeType = CUBLAS_COMPUTE_16F;
#else
computeType = CUDA_R_16F;
#endif
}
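// Summary of the type mapping above:
//   T = float          -> A/B/C/D = CUDA_R_32F
//   T = half           -> A/B/C/D = CUDA_R_16F
//   T = __nv_bfloat16  -> A/B/C/D = CUDA_R_16BF                                  (ENABLE_BF16)
//   T = __nv_fp8_e4m3  -> A/B = CUDA_R_8F_E4M3, C = CUDA_R_16BF, D = dtype_fp8
//                         (or CUDA_R_16BF when FP8_GEMM_OUTPUT_QUANT_DISABLE)    (ENABLE_FP8)
//   scaleT = float -> CUDA_R_32F scale / CUBLAS_COMPUTE_32F; otherwise the 16F variants.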
const cublasOperation_t tA = data_type == FP8_DATATYPE ? CUBLAS_OP_T : CUBLAS_OP_N;
// Create operation descriptor; see cublasLtMatmulDescAttributes_t for
// details about defaults; here we just need to set the transforms for A and
// B
#if (CUDART_VERSION >= 11000)
status = cublasLtMatmulDescCreate(&operationDesc, computeType,
scaleType); // creates a matrix multiply descriptor
#else
status = cublasLtMatmulDescCreate(&operationDesc, computeType);
#endif
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &tA, sizeof(tA));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
#ifdef ENABLE_FP8
if (data_type == FP8_DATATYPE) {
const int8_t fastAccuMode = 1; // enable fast imprecise accum
status = cublasLtMatmulDescSetAttribute(
operationDesc, CUBLASLT_MATMUL_DESC_FAST_ACCUM, &fastAccuMode, sizeof(decltype(fastAccuMode)));
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
}
#endif
// Create matrix descriptors. We are good with the details here so no need
// to set any extra attributes
if (data_type == FP8_DATATYPE) {
status = cublasLtMatrixLayoutCreate(&Adesc, Atype, k, m, k);
}
else {
status = cublasLtMatrixLayoutCreate(&Adesc, Atype, m, k, m);
}
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutCreate(&Bdesc, Btype, k, n, k);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutCreate(&Cdesc, Ctype, m, n, m);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
status = cublasLtMatrixLayoutCreate(&Ddesc, Dtype, m, n, m);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
if (batchCount > 1) {
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batchCount, sizeof(batchCount)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Adesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Bdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Cdesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
check_cuda_error(cublasLtMatrixLayoutSetAttribute(
Ddesc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideD, sizeof(strideD)));
}
// Create CUDA event to time the execution time of each algo
if (cudaEventCreate(&startEvent, cudaEventBlockingSync) != cudaSuccess) {
goto CLEANUP;
}
if (cudaEventCreate(&stopEvent, cudaEventBlockingSync) != cudaSuccess) {
goto CLEANUP;
}
// Request the first ALGO_IDS (100) available algo IDs
status = cublasLtMatmulAlgoGetIds(
ltHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, ALGO_IDS, algoIdA, &nbAlgoIds);
if (status != CUBLAS_STATUS_SUCCESS) {
goto CLEANUP;
}
if (nbAlgoIds > ALGO_IDS) {
printf(
"Warning: the algo ID count is not large enough to guarantee finding the best algo (%d returned, %d requested)\n", nbAlgoIds, ALGO_IDS);
}
// Loop over the Algo IDs
// This loop doesn't work for fp8 gemm
for (int idx = 0; (idx < nbAlgoIds) && (AlgoCount < AlgoCombinations); idx++) {
cublasLtMatmulAlgo_t algo;
size_t sizeWritten = 0;
/* Initialize algo structure with given Algo ID */
status =
cublasLtMatmulAlgoInit(ltHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, algoIdA[idx], &algo);
if (status != CUBLAS_STATUS_SUCCESS) {
continue;
}
// Query the tiles enums supported by that algo
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_TILE_IDS, NULL, 0, &sizeWritten);
int nbTiles = int(sizeWritten / sizeof(int));
int* tileA = new int[nbTiles == 0 ? 1 : nbTiles];
if (nbTiles == 0) {
tileA[0] = CUBLASLT_MATMUL_TILE_UNDEFINED;
nbTiles = 1;
}
#if (CUDART_VERSION >= 11000)
cublasLtMatmulAlgoCapGetAttribute(&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, NULL, 0, &sizeWritten);
int nbStages = int(sizeWritten / sizeof(int));
std::vector<int> stagesA(nbStages == 0 ? 1 : nbStages);
if (nbStages == 0) {
stagesA[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED;
nbStages = 1;
}
else {
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_STAGES_IDS, stagesA.data(), sizeof(int) * nbStages, &sizeWritten);
}
#endif
int splitkSupport, redMask, swizzlingMax, customOptionMax;
// Retrieve Algo Capabilities attributes to be able to setup loop over
// the different combinations
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_TILE_IDS, tileA, sizeof(int) * nbTiles, &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, &splitkSupport, sizeof(splitkSupport), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, &redMask, sizeof(redMask), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, &swizzlingMax, sizeof(swizzlingMax), &sizeWritten);
cublasLtMatmulAlgoCapGetAttribute(
&algo, CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, &customOptionMax, sizeof(customOptionMax), &sizeWritten);
/* Loop over the different tiles */
for (int tileIdx = 0; tileIdx < nbTiles; tileIdx++) {
#if (CUDART_VERSION >= 11000)
/* Loop over different stages count */
for (int stagesIdx = 0; stagesIdx < nbStages; stagesIdx++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &stagesA[stagesIdx], sizeof(stagesA[stagesIdx]));
#endif
/* Loop over the different custom option if any */
for (int customOption = 0; customOption <= customOptionMax; customOption++) {
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, &customOption, sizeof(customOption));
/* Loop over the CTAs swizzling support */
for (int k = 0; k <= swizzlingMax; k++) {
int splitK_trial = 0;
if (splitkSupport) {
splitK_trial += sizeof(splitKSequenceA) / sizeof(splitKSequenceA[0]);
}
// Loop over the splitK value over a fixed sequence
// splitKSequenceA in addition to the case where splitK
// is not enabled
for (int l = 0; (l < (1 + splitK_trial)) && (AlgoCount < AlgoCombinations); l++) {
/* Setup attribute of the algo to run */
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &tileA[tileIdx], sizeof(tileA[tileIdx]));
int splitK_val = 0;
int redScheme = CUBLASLT_REDUCTION_SCHEME_NONE;
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_SPLITK_NUM, &splitK_val, sizeof(splitK_val));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &k, sizeof(k));
cublasLtMatmulAlgoConfigSetAttribute(
&algo, CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, &redScheme, sizeof(int));
if (l > 0) { // Split-K case
splitK_val = splitKSequenceA[l - 1];
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_SPLITK_NUM,
&splitKSequenceA[l - 1],
sizeof(splitKSequenceA[l - 1]));
/* Going over all the reduction scheme */
for (redScheme = 1;
redScheme < (int)CUBLASLT_REDUCTION_SCHEME_MASK && (AlgoCount < AlgoCombinations);
redScheme = redScheme << 1) {
if (redScheme & redMask) {
cublasLtMatmulAlgoConfigSetAttribute(&algo,
CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME,
&redScheme,
sizeof(redScheme));
cublasLtMatmulHeuristicResult_t heurResult;
cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck(
ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult);
if (heurResult.workspaceSize > workSpaceSize) {
// printf("not enough workspace!
// %ld\n",
// heurResult.workspaceSize);
algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace
}
else if (heurResult.workspaceSize == 0) {
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
algosRestrict[AlgoCountRestrict++] = algo;
}
}
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
algos[AlgoCount++] = algo;
}
} // end if
} // end for
}
else { // Non-splitK case
/* if user preference is ok with workspace */
if (AlgoCount < AlgoCombinations) {
cublasLtMatmulHeuristicResult_t heurResult;
cublasStatus_t algoStatus = cublasLtMatmulAlgoCheck(
ltHandle, operationDesc, Adesc, Bdesc, Cdesc, Cdesc, &algo, &heurResult);
if (heurResult.workspaceSize > workSpaceSize) {
// printf("not enough workspace! %ld\n",
// heurResult.workspaceSize);
algoStatus = CUBLAS_STATUS_NOT_SUPPORTED; // Not enough workspace
}
else if (heurResult.workspaceSize == 0) {
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
algosRestrict[AlgoCountRestrict++] = algo;
}
}
if (algoStatus == CUBLAS_STATUS_SUCCESS) {
algos[AlgoCount++] = algo;
}
}
}
} // end l
} // end k
} // end customOption
#if (CUDART_VERSION >= 11000)
} // end stagesIdx
#endif
} // end tileIdx
delete[] tileA;
} // end idx
printf("AlgoCount: %d\n", AlgoCount);
if (data_type == FP8_DATATYPE) {
assert(AlgoCount == 0);
}
if (AlgoCount < maxNumTraversal && data_type != FP8_DATATYPE) {
// 0 <= workspacesize <= 32MB
for (int i = 0; i < AlgoCount; i++) {
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Cdesc,
algos[i],
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[i],
stream,
startEvent,
stopEvent);
perfResults[i].status = status;
// if (status == CUBLAS_STATUS_SUCCESS) AlgoCount++;
}
}
else {
// Heuristic + workspacesize==0
AlgoCount = 0;
nbAlgoIds = 0;
cublasLtMatmulPreference_t pref;
cublasLtMatmulPreferenceCreate(&pref);
uint64_t maxWorkSpaceSize = workSpaceSize; //(32MB)
cublasLtMatmulPreferenceSetAttribute(
pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &maxWorkSpaceSize, sizeof(maxWorkSpaceSize));
cublasLtMatmulHeuristicResult_t heuristicResultsArray[maxNumTraversal];
cublasLtMatmulAlgoGetHeuristic(ltHandle,
operationDesc,
Adesc,
Bdesc,
Cdesc,
Ddesc,
pref,
maxNumTraversal,
heuristicResultsArray,
&nbAlgoIds);
cublasLtMatmulPreferenceDestroy(pref);
printf("return %d and run heuristic algo\n", nbAlgoIds);
for (int i = 0; i < nbAlgoIds; i++) {
if (heuristicResultsArray[i].state == CUBLAS_STATUS_SUCCESS) {
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Ddesc,
heuristicResultsArray[i].algo,
kernelRepeats,
workSpace,
workSpaceSize,
perfResults[AlgoCount],
stream,
startEvent,
stopEvent);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
}
}
// workspacesize==0
printf("workspacesize==0, run %d algos\n", AlgoCountRestrict);
for (int i = 0; i < AlgoCountRestrict && i < (maxNumTraversal - nbAlgoIds); i++) {
status = customMatmulRun(ltHandle,
operationDesc,
alpha, /* host or device pointer */
A,
Adesc,
B,
Bdesc,
beta, /* host or device pointer */
C,
Cdesc,
C,
Ddesc,
algosRestrict[i],
kernelRepeats,
NULL,
0,
perfResults[AlgoCount],
stream,
startEvent,
stopEvent);
perfResults[AlgoCount].status = status;
if (status == CUBLAS_STATUS_SUCCESS) {
AlgoCount++;
}
}
}
// Sort the results per run duration
std::sort(perfResults, perfResults + AlgoCount, time_compare);
// Print timing and perf details
for (int i = 0, hasPrint = 0; i < AlgoCount; i++) {
printf("result %03d : ", i);
hasPrint = printPerfStructure(batch_size,
seq_len,
head_num,
size_per_head,
m,
n,
k,
perfResults[i],
fout,
data_type,
hasPrint,
batchCount);
}
CLEANUP:
// Descriptors are no longer needed as all GPU work was already enqueued
if (Cdesc) {
cublasLtMatrixLayoutDestroy(Cdesc);
}
if (Bdesc) {
cublasLtMatrixLayoutDestroy(Bdesc);
}
if (Adesc) {
cublasLtMatrixLayoutDestroy(Adesc);
}
if (operationDesc) {
cublasLtMatmulDescDestroy(operationDesc);
}
if (startEvent) {
cudaEventDestroy(startEvent);
}
if (stopEvent) {
cudaEventDestroy(stopEvent);
}
return status == CUBLAS_STATUS_SUCCESS ? 0 : 1;
}
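// A minimal call sketch for the FP16-data / FP32-scale instantiation (hypothetical
// names; d_A, d_B, d_C and d_workspace are assumed to be device buffers allocated
// elsewhere, and dtype_fp8 is only consulted on the FP8 path):
//   customMatmulPerf_t perf[100];
//   float alpha = 1.0f, beta = 0.0f;  // scaleT = float -> CUBLAS_COMPUTE_32F
//   LtHgemmCustomFind<half, float>(ltHandle, batch_size, seq_len, head_num, size_per_head,
//                                  m, n, k, &alpha, d_A, d_B, &beta, d_C,
//                                  d_workspace, CUBLAS_WORKSPACE_SIZE, fout, perf,
//                                  /*AlgoCombinations=*/100, CUDA_R_16F,
//                                  /*batchCount=*/1, 0, 0, 0);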
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const float* A,
const float* B,
const float* beta, /* host pointer */
float* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8,
int batchCount,
int64_t strideA,
int64_t strideB,
int64_t strideD);
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const half* alpha, /* host pointer */
const half* A,
const half* B,
const half* beta, /* host pointer */
half* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8,
int batchCount,
int64_t strideA,
int64_t strideB,
int64_t strideD);
#ifdef ENABLE_BF16
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const __nv_bfloat16* A,
const __nv_bfloat16* B,
const float* beta, /* host pointer */
__nv_bfloat16* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8,
int batchCount,
int64_t strideA,
int64_t strideB,
int64_t strideD);
#endif
#ifdef ENABLE_FP8
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const __nv_fp8_e4m3* A,
const __nv_fp8_e4m3* B,
const float* beta, /* host pointer */
__nv_fp8_e4m3* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8,
int batchCount,
int64_t strideA,
int64_t strideB,
int64_t strideD);
#endif
template int LtHgemmCustomFind(cublasLtHandle_t ltHandle,
int batch_size,
int seq_len,
int head_num,
int size_per_head,
int m,
int n,
int k,
const float* alpha, /* host pointer */
const half* A,
const half* B,
const float* beta, /* host pointer */
half* C,
void* workSpace,
size_t workSpaceSize,
FILE* fout,
customMatmulPerf_t perfResults[],
int AlgoCombinations,
cudaDataType_t dtype_fp8,
int batchCount,
int64_t strideA,
int64_t strideB,
int64_t strideD);
size_t calGemmTestBufSizeInByte(int batch_size,
int seq_len,
int head_num,
int size_per_head,
int inter_size,
int vocab_size,
int int8_mode,
CublasDataType data_type)
{
size_t buf_size_in_byte;
if (int8_mode > 0) {
int m = batch_size * seq_len;
int n = head_num * size_per_head;
int k = n;
size_t size1 = 3 * (m * k * sizeof(int8_t) + k * n * sizeof(int8_t) + m * n * sizeof(int));
size_t size2 = batch_size * head_num
* (seq_len * size_per_head * sizeof(int8_t) + size_per_head * seq_len * sizeof(int8_t)
+ seq_len * seq_len * sizeof(int));
size_t size3 = batch_size * head_num
* (seq_len * seq_len * sizeof(int8_t) + seq_len * size_per_head * sizeof(int8_t)
+ seq_len * size_per_head * sizeof(int));
size_t size4 = m * k * sizeof(int8_t) + k * inter_size * sizeof(int8_t) + m * inter_size * sizeof(int);
size_t size5 = m * k * sizeof(int8_t) + k * vocab_size * sizeof(int8_t) + m * vocab_size * sizeof(int);
buf_size_in_byte = size1 > size2 ? size1 : size2;
buf_size_in_byte = buf_size_in_byte > size3 ? buf_size_in_byte : size3;
buf_size_in_byte = buf_size_in_byte > size4 ? buf_size_in_byte : size4;
buf_size_in_byte = buf_size_in_byte > size5 ? buf_size_in_byte : size5;
}
else {
size_t m = batch_size * seq_len;
size_t n = head_num * size_per_head;
size_t k = n;
// TODO need to add bfloat16 here
int wordSize = (data_type == FLOAT_DATATYPE ? sizeof(float) : sizeof(half));
size_t size1 = 3 * (m * k + k * n + m * n) * wordSize;
size_t size2 = (size_t)batch_size * (size_t)head_num
* ((size_t)seq_len * (size_t)seq_len + (size_t)seq_len * (size_t)size_per_head
+ (size_t)seq_len * (size_t)size_per_head)
* (size_t)wordSize;
size_t size3 = (m * k + k * inter_size + m * inter_size) * wordSize;
size_t size4 = (m * k + k * vocab_size + m * vocab_size) * wordSize;
buf_size_in_byte = size1 > size2 ? size1 : size2;
buf_size_in_byte = buf_size_in_byte > size3 ? buf_size_in_byte : size3;
buf_size_in_byte = buf_size_in_byte > size4 ? buf_size_in_byte : size4;
buf_size_in_byte +=
((data_type == HALF_DATATYPE || data_type == BFLOAT16_DATATYPE) ? CUBLAS_WORKSPACE_SIZE : 0);
}
return buf_size_in_byte;
}
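// calGemmTestBufSizeInByte() returns the maximum over the per-GEMM buffer requirements
// (A + B + C bytes for each shape exercised by the gemm tests), adding the extra
// CUBLAS_WORKSPACE_SIZE on the FP16/BF16 paths; the caller allocates one device buffer
// of this size and reuses it for every test.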
size_t calGemmTestBufSizeInByteXlnet(
int batch_size, int seq_len, int head_num, int size_per_head, int inter_size, int hidden_units, int is_fp16)
{
int M[10] = {0};
int N[10] = {0};
int K[10] = {0};
int batchCount[10] = {0};
// gemm1
M[0] = hidden_units;
N[0] = seq_len * batch_size;
K[0] = hidden_units;
batchCount[0] = 3;
// gemm2
M[1] = hidden_units;
N[1] = seq_len * 2;
K[1] = hidden_units;
batchCount[1] = 1;
// gemm3
M[2] = seq_len;
N[2] = seq_len;
K[2] = size_per_head;
batchCount[2] = batch_size * head_num;
// gemm4
M[3] = seq_len * 2;
N[3] = seq_len;
K[3] = size_per_head;
batchCount[3] = batch_size * head_num;
// gemm5
M[4] = 2;
N[4] = seq_len;
K[4] = size_per_head;
batchCount[4] = batch_size * head_num;
// gemm6
M[5] = head_num;
N[5] = seq_len;
K[5] = 2;
// gemm7
M[6] = size_per_head;
N[6] = seq_len;
K[6] = seq_len;
batchCount[6] = batch_size * head_num;
// gemm8
M[7] = hidden_units;
N[7] = seq_len;
K[7] = hidden_units;
batchCount[7] = batch_size;
// gemm9
M[8] = inter_size;
N[8] = seq_len;
K[8] = hidden_units;
batchCount[8] = batch_size;
// gemm10
M[9] = hidden_units;
N[9] = seq_len;
K[9] = inter_size;
batchCount[9] = batch_size;
size_t max_size = 0;
for (int i = 0; i < 10; ++i) {
size_t size = ((size_t)M[i] * N[i] + (size_t)M[i] * K[i] + (size_t)N[i] * K[i]) * (size_t)batchCount[i];
if (size > max_size) {
max_size = size;
}
}
int size_per_ele = 4;
if (is_fp16 == true) {
size_per_ele = 2;
}
return max_size * size_per_ele;
}
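// calGemmTestBufSizeInByteXlnet() applies the same idea to the ten XLNet GEMM shapes:
// the largest (A + B + C) element count across the batched shapes, times 2 bytes for
// FP16 or 4 bytes for FP32.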
} // namespace fastertransformer