Unverified commit fe46dac2, authored by AllentDan and committed by GitHub

Add lint action (#32)

* temp

* fix lint

* csrc->src

* remove clang-format

* skip .rst

* skip doc

* clang-format

* version

* version

* mat_B
parent e8ab4ba3
/*
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once
#include "stdio.h"
#include "stdlib.h"

// be consistent with FasterTransformer
int8_t float_to_int8_rn_host(float x)
{
    int8_t  res;
    int32_t tmp;
    if (x >= 0) {
        tmp = int(x + 0.5);
        tmp = tmp > 127 ? 127 : tmp;
        res = int8_t(tmp);
    }
    else {
        tmp = int(x - 0.5);
        tmp = tmp < -127 ? -127 : tmp;
        res = int8_t(tmp);
    }
    return res;
}
\ No newline at end of file
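Note: the helper above rounds a float to the nearest integer and saturates it to [-127, 127] on the host, mirroring FasterTransformer's device-side rounding. A minimal usage sketch follows; the function name, scale, and vectors are hypothetical and not part of this commit:

#include <vector>

// Hypothetical host-side per-tensor quantization built on float_to_int8_rn_host:
// q[i] = saturate(round(x[i] / scale)), using the same rounding rule as above.
std::vector<int8_t> quantize_to_int8_host(const std::vector<float>& x, float scale)
{
    std::vector<int8_t> q(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        q[i] = float_to_int8_rn_host(x[i] / scale);
    }
    return q;
}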
@@ -509,10 +509,10 @@ void cublasINT8MMWrapper::SpGemm(
        }
        else {
            // initializing MatDesc takes a lot of time
-           cusparseLtMatDescriptor_t matA, matB, matC;
-           sp_mat_A_desc_map_[mark] = matA;
-           sp_mat_B_desc_map_[mark] = matB;
-           sp_mat_C_desc_map_[mark] = matC;
+           cusparseLtMatDescriptor_t mat_A, mat_B, mat_C;
+           sp_mat_A_desc_map_[mark] = mat_A;
+           sp_mat_B_desc_map_[mark] = mat_B;
+           sp_mat_C_desc_map_[mark] = mat_C;
            CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
                                                              &sp_mat_A_desc_map_[mark],
                                                              num_A_rows,
...
@@ -695,10 +695,10 @@ void cublasMMWrapper::SpGemm(cublasOperation_t transa,
        }
        else {
            // initializing MatDesc takes a lot of time
-           cusparseLtMatDescriptor_t matA, matB, matC;
-           sp_mat_A_desc_map_[mark] = matA;
-           sp_mat_B_desc_map_[mark] = matB;
-           sp_mat_C_desc_map_[mark] = matC;
+           cusparseLtMatDescriptor_t mat_A, mat_B, mat_C;
+           sp_mat_A_desc_map_[mark] = mat_A;
+           sp_mat_B_desc_map_[mark] = mat_B;
+           sp_mat_C_desc_map_[mark] = mat_C;
            CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
                                                              &sp_mat_A_desc_map_[mark],
                                                              num_A_rows,
@@ -752,9 +752,9 @@ size_t cublasMMWrapper::getSparseMatrixSize(int m, int k)
    int num_A_cols = k;
    int lda = num_A_rows;

-   cusparseLtMatDescriptor_t matA;
+   cusparseLtMatDescriptor_t mat_A;
    CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
-                                                     &matA,
+                                                     &mat_A,
                                                      num_A_rows,
                                                      num_A_cols,
                                                      lda,
@@ -763,7 +763,7 @@ size_t cublasMMWrapper::getSparseMatrixSize(int m, int k)
                                                      order,
                                                      CUSPARSELT_SPARSITY_50_PERCENT));
    size_t compressed_size = 0;
-   CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&cusparselt_handle_, &matA, &compressed_size));
+   CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&cusparselt_handle_, &mat_A, &compressed_size));
    return compressed_size;
}
@@ -771,11 +771,11 @@ void cublasMMWrapper::compressMatrix(const void* input, void* output, const int
{
    cusparseOrder_t order = CUSPARSE_ORDER_COL;
    cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
-   cusparseLtMatDescriptor_t matA;
+   cusparseLtMatDescriptor_t mat_A;
    unsigned alignment = 16;
    CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-       &cusparselt_handle_, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
-   CHECK_CUSPARSE(cusparseLtSpMMACompress2(&cusparselt_handle_, &matA, true, opA, input, output, stream_))
+       &cusparselt_handle_, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+   CHECK_CUSPARSE(cusparseLtSpMMACompress2(&cusparselt_handle_, &mat_A, true, opA, input, output, stream_))
    sync_check_cuda_error();
}
...
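Note: the "initializing MatDesc takes a lot of time" comment is why the wrappers cache cuSPARSELt matrix descriptors in the sp_mat_*_desc_map_ maps keyed by mark instead of re-initializing them on every call. A rough sketch of that caching pattern follows; it assumes a valid cusparseLtHandle_t and reuses the CHECK_CUSPARSE macro and the same descriptor parameters as above, and the function and variable names are illustrative only:

#include <cusparseLt.h>
#include <string>
#include <unordered_map>

// Illustrative cache: initialize a structured (sparse A) descriptor once per "mark"
// and reuse it on later calls that hit the same shape.
cusparseLtMatDescriptor_t& get_sparse_A_desc(cusparseLtHandle_t& handle,
                                             std::unordered_map<std::string, cusparseLtMatDescriptor_t>& cache,
                                             const std::string& mark,
                                             int rows, int cols, int ld)
{
    auto it = cache.find(mark);
    if (it == cache.end()) {
        cusparseLtMatDescriptor_t desc;
        it = cache.emplace(mark, desc).first;
        // Same init call the wrapper uses; alignment/type/order follow the code above.
        CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&handle,
                                                          &it->second,
                                                          rows,
                                                          cols,
                                                          ld,
                                                          16,
                                                          CUDA_R_16F,
                                                          CUSPARSE_ORDER_COL,
                                                          CUSPARSELT_SPARSITY_50_PERCENT));
    }
    return it->second;
}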
@@ -22,10 +22,11 @@
namespace fastertransformer {
#ifdef ENABLE_BF16
-inline __device__ float2 bf1622float2(const __nv_bfloat162 val) {
+inline __device__ float2 bf1622float2(const __nv_bfloat162 val)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float2 f_val;
    f_val.x = __low2float(val);
    f_val.y = __high2float(val);
    return f_val;
#else
@@ -33,26 +34,34 @@ inline __device__ float2 bf1622float2(const __nv_bfloat162 val) {
#endif
}
-inline __device__ int16_t bf1622int16(__nv_bfloat162 val) {
+inline __device__ int16_t bf1622int16(__nv_bfloat162 val)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float2 f_val;
    f_val.x = max(min(__low2float(val), 127.f), -128.f);
    f_val.y = max(min(__high2float(val), 127.f), -128.f);
-   union { int8_t int8[2]; int16_t int16; };
+   union {
+       int8_t  int8[2];
+       int16_t int16;
+   };
    int8[0] = static_cast<int8_t>(static_cast<short>(f_val.x));
    int8[1] = static_cast<int8_t>(static_cast<short>(f_val.y));
    return int16;
#else
    val = __hmin2(val, make_bfloat162(127., 127.));
    val = __hmax2(val, make_bfloat162(-128., -128.));
-   union { int8_t int8[2]; int16_t int16; };
+   union {
+       int8_t  int8[2];
+       int16_t int16;
+   };
    int8[0] = static_cast<int8_t>(static_cast<short>(val.x));
    int8[1] = static_cast<int8_t>(static_cast<short>(val.y));
    return int16;
#endif
}
-inline __device__ __nv_bfloat162 float22bf162(const float2 val) {
+inline __device__ __nv_bfloat162 float22bf162(const float2 val)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __floats2bfloat162_rn(val.x, val.y);
#else
@@ -60,7 +69,8 @@ inline __device__ __nv_bfloat162 float22bf162(const float2 val) {
#endif
}
-inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) {
+inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    __nv_bfloat162 val2;
    val2.x = val;
@@ -71,7 +81,8 @@ inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) {
#endif
}
-inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
+inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bfloat162 y)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh, fyl, fyh;
    fxl = __low2float(x);
@@ -84,15 +95,17 @@ inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bf
#endif
}
-inline __device__ __nv_bfloat16 bf16hadd(const __nv_bfloat16 x, const __nv_bfloat16 y) {
+inline __device__ __nv_bfloat16 bf16hadd(const __nv_bfloat16 x, const __nv_bfloat16 y)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
-   return __float2bfloat16( __bfloat162float(x) + __bfloat162float(y) );
+   return __float2bfloat16(__bfloat162float(x) + __bfloat162float(y));
#else
    return __hadd(x, y);
#endif
}
-inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
+inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bfloat162 y)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh, fyl, fyh;
    fxl = __low2float(x);
@@ -105,15 +118,17 @@ inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bf
#endif
}
-inline __device__ __nv_bfloat16 bf16hsub(const __nv_bfloat16 x, const __nv_bfloat16 y) {
+inline __device__ __nv_bfloat16 bf16hsub(const __nv_bfloat16 x, const __nv_bfloat16 y)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
-   return __float2bfloat16( __bfloat162float(x) - __bfloat162float(y) );
+   return __float2bfloat16(__bfloat162float(x) - __bfloat162float(y));
#else
    return __hsub(x, y);
#endif
}
-inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
+inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bfloat162 y)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh, fyl, fyh;
    fxl = __low2float(x);
@@ -126,15 +141,17 @@ inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bf
#endif
}
-inline __device__ __nv_bfloat16 bf16hmul(const __nv_bfloat16 x, const __nv_bfloat16 y) {
+inline __device__ __nv_bfloat16 bf16hmul(const __nv_bfloat16 x, const __nv_bfloat16 y)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
-   return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) );
+   return __float2bfloat16(__bfloat162float(x) * __bfloat162float(y));
#else
    return __hmul(x, y);
#endif
}
-inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bfloat162 y, const __nv_bfloat162 z) {
+inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bfloat162 y, const __nv_bfloat162 z)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh, fyl, fyh, fzl, fzh;
    fxl = __low2float(x);
@@ -149,19 +166,22 @@ inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bf
#endif
}
-inline __device__ __nv_bfloat16 bf16hfma(const __nv_bfloat16 x, const __nv_bfloat16 y, const __nv_bfloat16 z) {
+inline __device__ __nv_bfloat16 bf16hfma(const __nv_bfloat16 x, const __nv_bfloat16 y, const __nv_bfloat16 z)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
-   return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) + __bfloat162float(z));
+   return __float2bfloat16(__bfloat162float(x) * __bfloat162float(y) + __bfloat162float(z));
#else
    return __hfma(x, y, z);
#endif
}
-inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x) {
+inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh;
    fxl = __low2float(x);
-   fxh = __high2float(x);;
+   fxh = __high2float(x);
+   ;
    return __floats2bfloat162_rn(expf(fxl), expf(fxh));
#else
    return h2exp(x);
@@ -169,17 +189,27 @@ inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x) {
}
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
-inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hmul2(x, y); };
-inline __device__ __nv_bfloat162 operator+(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hadd2(x, y); };
+inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y)
+{
+   return bf16hmul2(x, y);
+};
+inline __device__ __nv_bfloat162 operator+(const __nv_bfloat162 x, const __nv_bfloat162 y)
+{
+   return bf16hadd2(x, y);
+};
inline __device__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y)
{
-   __nv_bfloat162 t; t.x = x; t.y = y; return t;
+   __nv_bfloat162 t;
+   t.x = x;
+   t.y = y;
+   return t;
}
#endif
-inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
+inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c));
#else
@@ -187,7 +217,8 @@ inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_
#endif
}
-inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d) {
+inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c) + __bfloat162float(d));
#else
@@ -195,7 +226,8 @@ inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_
#endif
}
-inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fal, fah, fbl, fbh, fcl, fch;
    fal = __low2float(a);
@@ -210,7 +242,8 @@ inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, _
#endif
}
-inline __device__ __nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
+inline __device__ __nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b) * __bfloat162float(c));
#else
@@ -218,7 +251,8 @@ inline __device__ __nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_
#endif
}
-inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fal, fah, fbl, fbh, fcl, fch;
    fal = __low2float(a);
@@ -233,7 +267,8 @@ inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, _
#endif
}
-inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d) {
+inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fal, fah, fbl, fbh, fcl, fch, fdl, fdh;
    fal = __low2float(a);
@@ -250,6 +285,6 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, _
#endif
}
#endif  // ENABLE_BF16
}  // namespace fastertransformer
\ No newline at end of file
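Note: the header above provides pre-SM80 fallbacks for bf16 math by widening to float, while on __CUDA_ARCH__ >= 800 the native bf16 intrinsics are used. A small illustrative kernel using these helpers follows; the kernel and array names are made up, and it assumes this header and <cuda_bf16.h> are included:

// Illustrative element-wise add over packed bf16 pairs; bf16hadd2 resolves to the
// float-widening fallback below __CUDA_ARCH__ 800 and to the native intrinsic otherwise.
__global__ void add_bf16x2(const __nv_bfloat162* a, const __nv_bfloat162* b, __nv_bfloat162* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = fastertransformer::bf16hadd2(a[i], b[i]);
    }
}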
@@ -18,4 +18,4 @@
#ifdef ENABLE_BF16
#include <cuda_bf16.h>
#endif
\ No newline at end of file
@@ -121,4 +121,4 @@ template void
invokeComputeFP8QuantizeScale(float* quant_ptr, const float* weights, const int k, const int n, cudaStream_t stream);
#endif  // ENABLE_FP8
}  // namespace fastertransformer
\ No newline at end of file
@@ -84,4 +84,4 @@ struct CustomARCommTypeConverter<__nv_bfloat16> {
};
#endif
}  // namespace fastertransformer
\ No newline at end of file
@@ -462,29 +462,29 @@ void generate_encoder_gemm_config(
        T* d_C = d_B + k * n * batchCount[i];
        T* dA_compressed;
        {
-           cusparseLtMatDescriptor_t matA;
+           cusparseLtMatDescriptor_t mat_A;
            CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-               &handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+               &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
            CHECK_CUSPARSE(
-               cusparseLtSpMMAPrune2(&handle, &matA, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
+               cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
            size_t compressed_size;
-           CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &matA, &compressed_size))
+           CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size))
            check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size));
-           CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &matA, true, opA, d_A, dA_compressed, stream))
+           CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream))
        }
        float exec_time = 99999.0f;
        int fast_algo = 0;
        for (int alg = 0; alg < 4; ++alg) {
            cudaDeviceSynchronize();
-           cusparseLtMatDescriptor_t matA, matB, matC;
+           cusparseLtMatDescriptor_t mat_A, mat_B, mat_C;
            void* d_workspace = nullptr;
            int num_streams = 1;
            cudaStream_t streams[1] = {stream};
            CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-               &handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
-           CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matB, k, n, k, alignment, CUDA_R_16F, order))
-           CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matC, m, n, m, alignment, CUDA_R_16F, order))
+               &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+           CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order))
+           CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
            gettimeofday(&start, NULL);
            for (int ite = 0; ite < ites; ++ite) {
                // initializing MatDesc takes a lot of time
@@ -494,7 +494,7 @@ void generate_encoder_gemm_config(
                cusparseLtMatmulAlgSelection_t alg_sel;
                cusparseLtMatmulPlan_t plan;
                CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(
-                   &handle, &matmul, opA, opB, &matA, &matB, &matC, &matC, compute_type))
+                   &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type))
                CHECK_CUSPARSE(
                    cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
                CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
...
/*
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <algorithm>
#include <cublasLt.h>
#include <cuda_runtime.h>
#include <map>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#include <vector>

namespace fastertransformer {

/* CAUTION : must match cublasLtMatmulTile_t */
const char* const matmulTileName[] = {"UNDEF",  "8x8",     "8x16",    "16x8",    "8x32",    "16x16",  "32x8",
                                      "8x64",   "16x32",   "32x16",   "64x8",    "32x32",   "32x64",  "64x32",
                                      "32x128", "64x64",   "128x32",  "64x128",  "128x64",  "64x256", "128x128",
                                      "256x64", "64x512",  "128x256", "256x128", "512x64",  "64x96",  "96*64",
                                      "96x128", "128x160", "160x128", "192x128", "128x192", "128x96", "END"};

int generate_encoder_igemm_config(
    int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true);

int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint);

int printBatchPerfStructure(
    int batchCount, int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint);

template<typename T, typename scaleT>
int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
                      int              m,
                      int              n,
                      int              k,
                      const scaleT*    alpha, /* host pointer */
                      const int8_t*    A,
                      const int8_t*    B,
                      const scaleT*    beta, /* host pointer */
                      T*               C,
                      void*            workSpace,
                      size_t           workSpaceSize,
                      FILE*            fout);

template<typename T, typename scaleT>
int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
                           int              batchCount,
                           int              m,
                           int              n,
                           int              k,
                           const scaleT*    alpha, /* host pointer */
                           const int8_t*    A,
                           const int8_t*    B,
                           const scaleT*    beta, /* host pointer */
                           T*               C,
                           void*            workSpace,
                           size_t           workSpaceSize,
                           FILE*            fout);

void matInit(int rows, int cols, int8_t* p, int ld);

}  // namespace fastertransformer
@@ -617,15 +617,15 @@ void generate_gpt_gemm_config(int batch_size,
        T* d_C = d_B + k * n * batchCount[i];
        T* dA_compressed;
        {
-           cusparseLtMatDescriptor_t matA;
+           cusparseLtMatDescriptor_t mat_A;
            CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-               &handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+               &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
            CHECK_CUSPARSE(
-               cusparseLtSpMMAPrune2(&handle, &matA, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
+               cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
            size_t compressed_size;
-           CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &matA, &compressed_size))
+           CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size))
            check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size));
-           CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &matA, true, opA, d_A, dA_compressed, stream))
+           CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream))
        }
        float exec_time = 99999.0f;
@@ -633,14 +633,15 @@ void generate_gpt_gemm_config(int batch_size,
        if (isSparseGemmAvailable(m, n, k)) {
            for (int alg = 0; alg < 4; ++alg) {
                cudaDeviceSynchronize();
-               cusparseLtMatDescriptor_t matA, matB, matC;
+               cusparseLtMatDescriptor_t mat_A, mat_B, mat_C;
                void* d_workspace = nullptr;
                int num_streams = 1;
                cudaStream_t streams[1] = {stream};
                CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-                   &handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
-               CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matB, k, n, k, alignment, CUDA_R_16F, order))
-               CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matC, m, n, m, alignment, CUDA_R_16F, order))
+                   &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+               CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order))
+               CHECK_CUSPARSE(
+                   cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
                cudaDeviceSynchronize();
                gettimeofday(&start, NULL);
                for (int ite = 0; ite < ites; ++ite) {
@@ -651,7 +652,7 @@ void generate_gpt_gemm_config(int batch_size,
                    cusparseLtMatmulAlgSelection_t alg_sel;
                    cusparseLtMatmulPlan_t plan;
                    CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(
-                       &handle, &matmul, opA, opB, &matA, &matB, &matC, &matC, compute_type))
+                       &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type))
                    CHECK_CUSPARSE(
                        cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
                    CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
...
@@ -616,15 +616,15 @@ void generate_t5_gemm_config(int batch_size,
        T* d_C = d_B + k * n * batchCount[i];
        T* dA_compressed;
        {
-           cusparseLtMatDescriptor_t matA;
+           cusparseLtMatDescriptor_t mat_A;
            CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-               &handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+               &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
            CHECK_CUSPARSE(
-               cusparseLtSpMMAPrune2(&handle, &matA, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
+               cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
            size_t compressed_size;
-           CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &matA, &compressed_size))
+           CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size))
            check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size));
-           CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &matA, true, opA, d_A, dA_compressed, stream))
+           CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream))
        }
        float exec_time = 99999.0f;
@@ -632,14 +632,15 @@ void generate_t5_gemm_config(int batch_size,
        if (isSparseGemmAvailable(m, n, k)) {
            for (int alg = 0; alg < 4; ++alg) {
                cudaDeviceSynchronize();
-               cusparseLtMatDescriptor_t matA, matB, matC;
+               cusparseLtMatDescriptor_t mat_A, mat_B, mat_C;
                void* d_workspace = nullptr;
                int num_streams = 1;
                cudaStream_t streams[1] = {stream};
                CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-                   &handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
-               CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matB, k, n, k, alignment, CUDA_R_16F, order))
-               CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matC, m, n, m, alignment, CUDA_R_16F, order))
+                   &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+               CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order))
+               CHECK_CUSPARSE(
+                   cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
                cudaDeviceSynchronize();
                gettimeofday(&start, NULL);
                for (int ite = 0; ite < ites; ++ite) {
@@ -650,7 +651,7 @@ void generate_t5_gemm_config(int batch_size,
                    cusparseLtMatmulAlgSelection_t alg_sel;
                    cusparseLtMatmulPlan_t plan;
                    CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(
-                       &handle, &matmul, opA, opB, &matA, &matB, &matC, &matC, compute_type))
+                       &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type))
                    CHECK_CUSPARSE(
                        cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
                    CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
...
@@ -13,4 +13,4 @@ public:
    virtual void* getSharedObject() = 0;
};
}  // namespace fastertransformer
\ No newline at end of file
@@ -27,8 +27,7 @@ namespace fastertransformer {
class Logger {
public:
-   enum Level
-   {
+   enum Level {
        TRACE = 0,
        DEBUG = 10,
        INFO  = 20,
@@ -41,7 +40,7 @@ public:
        thread_local Logger instance;
        return instance;
    }
    Logger(Logger const&) = delete;
    void operator=(Logger const&) = delete;
    template<typename... Args>
...
@@ -26,4 +26,4 @@ if (TORCH_VERSION VERSION_GREATER_EQUAL "1.9.0")
    target_link_libraries(${LIB_NAME} "${TORCH_LIBRARIES}" fpA_intB_gemm logger)
else()
    message("TORCH_VERSION ${TORCH_VERSION} < 1.9.0, skipping compiling th_moe_ops.cc because QUInt4x2 is supported after torch 1.9.0")
endif()
\ No newline at end of file
@@ -369,4 +369,4 @@ TORCH_LIBRARY(gemm_dq_unit_ops, m)
    m.def("benchmark_against_cublas_fp", benchmark_against_cublas_fp);
    m.def("fused_gemm_dq_bias_act", fused_gemm_dq_bias_act);
}
}  // namespace torch_ext
\ No newline at end of file
This diff is collapsed.
@@ -21,4 +21,4 @@ add_definitions(-DTORCH_CUDA=1)
set(EXE_NAME "int8_gemm_test")
add_executable(${EXE_NAME} ${int8_test_files})
set_target_properties(${EXE_NAME} PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(${EXE_NAME} PUBLIC "${TORCH_LIBRARIES}" int8_gemm tensor logger)
\ No newline at end of file
@@ -38,9 +38,9 @@ namespace ft = fastertransformer;
template<typename T>
void int8_gemm_test(
    const int            m,
    const int            n,
    const int            k,
    const at::ScalarType output_data_type,
    const QuantMode      quant_mode,
    const int            iters)
...