Unverified commit fe46dac2, authored by AllentDan and committed by GitHub

Add lint action (#32)

* temp

* fix lint

* csrc->src

* remove clang-format

* skip .rst

* skip doc

* clang-format

* version

* version

* mat_B
parent e8ab4ba3
/*
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once
#include "stdio.h"
#include "stdlib.h"

// be consistent with FasterTransformer
int8_t float_to_int8_rn_host(float x)
{
    int8_t  res;
    int32_t tmp;
    if (x >= 0) {
        tmp = int(x + 0.5);
        tmp = tmp > 127 ? 127 : tmp;
        res = int8_t(tmp);
    }
    else {
        tmp = int(x - 0.5);
        tmp = tmp < -127 ? -127 : tmp;
        res = int8_t(tmp);
    }
    return res;
}
\ No newline at end of file
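Note: the helper above rounds a float to the nearest integer and saturates it to [-127, 127] on the host, mirroring FasterTransformer's device-side rounding. A minimal usage sketch follows; the function name, scale, and vectors are hypothetical and not part of this commit:

#include <vector>

// Hypothetical host-side per-tensor quantization built on float_to_int8_rn_host:
// q[i] = saturate(round(x[i] / scale)), using the same rounding rule as above.
std::vector<int8_t> quantize_to_int8_host(const std::vector<float>& x, float scale)
{
    std::vector<int8_t> q(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        q[i] = float_to_int8_rn_host(x[i] / scale);
    }
    return q;
}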
@@ -509,10 +509,10 @@ void cublasINT8MMWrapper::SpGemm(
        }
        else {
            // initializing MatDesc takes a lot of time
-           cusparseLtMatDescriptor_t matA, matB, matC;
-           sp_mat_A_desc_map_[mark] = matA;
-           sp_mat_B_desc_map_[mark] = matB;
-           sp_mat_C_desc_map_[mark] = matC;
+           cusparseLtMatDescriptor_t mat_A, mat_B, mat_C;
+           sp_mat_A_desc_map_[mark] = mat_A;
+           sp_mat_B_desc_map_[mark] = mat_B;
+           sp_mat_C_desc_map_[mark] = mat_C;
            CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
                                                              &sp_mat_A_desc_map_[mark],
                                                              num_A_rows,
...
@@ -695,10 +695,10 @@ void cublasMMWrapper::SpGemm(cublasOperation_t transa,
        }
        else {
            // initializing MatDesc takes a lot of time
-           cusparseLtMatDescriptor_t matA, matB, matC;
-           sp_mat_A_desc_map_[mark] = matA;
-           sp_mat_B_desc_map_[mark] = matB;
-           sp_mat_C_desc_map_[mark] = matC;
+           cusparseLtMatDescriptor_t mat_A, mat_B, mat_C;
+           sp_mat_A_desc_map_[mark] = mat_A;
+           sp_mat_B_desc_map_[mark] = mat_B;
+           sp_mat_C_desc_map_[mark] = mat_C;
            CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
                                                              &sp_mat_A_desc_map_[mark],
                                                              num_A_rows,
@@ -752,9 +752,9 @@ size_t cublasMMWrapper::getSparseMatrixSize(int m, int k)
    int num_A_cols = k;
    int lda = num_A_rows;

-   cusparseLtMatDescriptor_t matA;
+   cusparseLtMatDescriptor_t mat_A;
    CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&cusparselt_handle_,
-                                                     &matA,
+                                                     &mat_A,
                                                      num_A_rows,
                                                      num_A_cols,
                                                      lda,
@@ -763,7 +763,7 @@ size_t cublasMMWrapper::getSparseMatrixSize(int m, int k)
                                                      order,
                                                      CUSPARSELT_SPARSITY_50_PERCENT));
    size_t compressed_size = 0;
-   CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&cusparselt_handle_, &matA, &compressed_size));
+   CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&cusparselt_handle_, &mat_A, &compressed_size));
    return compressed_size;
}
@@ -771,11 +771,11 @@ void cublasMMWrapper::compressMatrix(const void* input, void* output, const int
{
    cusparseOrder_t order = CUSPARSE_ORDER_COL;
    cusparseOperation_t opA = CUSPARSE_OPERATION_NON_TRANSPOSE;
-   cusparseLtMatDescriptor_t matA;
+   cusparseLtMatDescriptor_t mat_A;
    unsigned alignment = 16;
    CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-       &cusparselt_handle_, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
-   CHECK_CUSPARSE(cusparseLtSpMMACompress2(&cusparselt_handle_, &matA, true, opA, input, output, stream_))
+       &cusparselt_handle_, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+   CHECK_CUSPARSE(cusparseLtSpMMACompress2(&cusparselt_handle_, &mat_A, true, opA, input, output, stream_))
    sync_check_cuda_error();
}
...
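Note: the "initializing MatDesc takes a lot of time" comment is why the wrappers cache cuSPARSELt matrix descriptors in the sp_mat_*_desc_map_ maps keyed by mark instead of re-initializing them on every call. A rough sketch of that caching pattern follows; it assumes a valid cusparseLtHandle_t and reuses the CHECK_CUSPARSE macro and the same descriptor parameters as above, and the function and variable names are illustrative only:

#include <cusparseLt.h>
#include <string>
#include <unordered_map>

// Illustrative cache: initialize a structured (sparse A) descriptor once per "mark"
// and reuse it on later calls that hit the same shape.
cusparseLtMatDescriptor_t& get_sparse_A_desc(cusparseLtHandle_t& handle,
                                             std::unordered_map<std::string, cusparseLtMatDescriptor_t>& cache,
                                             const std::string& mark,
                                             int rows, int cols, int ld)
{
    auto it = cache.find(mark);
    if (it == cache.end()) {
        cusparseLtMatDescriptor_t desc;
        it = cache.emplace(mark, desc).first;
        // Same init call the wrapper uses; alignment/type/order follow the code above.
        CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(&handle,
                                                          &it->second,
                                                          rows,
                                                          cols,
                                                          ld,
                                                          16,
                                                          CUDA_R_16F,
                                                          CUSPARSE_ORDER_COL,
                                                          CUSPARSELT_SPARSITY_50_PERCENT));
    }
    return it->second;
}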
@@ -22,10 +22,11 @@
namespace fastertransformer {
#ifdef ENABLE_BF16
-inline __device__ float2 bf1622float2(const __nv_bfloat162 val) {
+inline __device__ float2 bf1622float2(const __nv_bfloat162 val)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float2 f_val;
    f_val.x = __low2float(val);
    f_val.y = __high2float(val);
    return f_val;
#else
@@ -33,26 +34,34 @@ inline __device__ float2 bf1622float2(const __nv_bfloat162 val) {
#endif
}
-inline __device__ int16_t bf1622int16(__nv_bfloat162 val) {
+inline __device__ int16_t bf1622int16(__nv_bfloat162 val)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float2 f_val;
    f_val.x = max(min(__low2float(val), 127.f), -128.f);
    f_val.y = max(min(__high2float(val), 127.f), -128.f);
-   union { int8_t int8[2]; int16_t int16; };
+   union {
+       int8_t  int8[2];
+       int16_t int16;
+   };
    int8[0] = static_cast<int8_t>(static_cast<short>(f_val.x));
    int8[1] = static_cast<int8_t>(static_cast<short>(f_val.y));
    return int16;
#else
    val = __hmin2(val, make_bfloat162(127., 127.));
    val = __hmax2(val, make_bfloat162(-128., -128.));
-   union { int8_t int8[2]; int16_t int16; };
+   union {
+       int8_t  int8[2];
+       int16_t int16;
+   };
    int8[0] = static_cast<int8_t>(static_cast<short>(val.x));
    int8[1] = static_cast<int8_t>(static_cast<short>(val.y));
    return int16;
#endif
}
-inline __device__ __nv_bfloat162 float22bf162(const float2 val) {
+inline __device__ __nv_bfloat162 float22bf162(const float2 val)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __floats2bfloat162_rn(val.x, val.y);
#else
@@ -60,7 +69,8 @@ inline __device__ __nv_bfloat162 float22bf162(const float2 val) {
#endif
}
-inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) {
+inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    __nv_bfloat162 val2;
    val2.x = val;
@@ -71,7 +81,8 @@ inline __device__ __nv_bfloat162 bf162bf162(const __nv_bfloat16 val) {
#endif
}
-inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
+inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bfloat162 y)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh, fyl, fyh;
    fxl = __low2float(x);
@@ -84,15 +95,17 @@ inline __device__ __nv_bfloat162 bf16hadd2(const __nv_bfloat162 x, const __nv_bf
#endif
}
-inline __device__ __nv_bfloat16 bf16hadd(const __nv_bfloat16 x, const __nv_bfloat16 y) {
+inline __device__ __nv_bfloat16 bf16hadd(const __nv_bfloat16 x, const __nv_bfloat16 y)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
-   return __float2bfloat16( __bfloat162float(x) + __bfloat162float(y) );
+   return __float2bfloat16(__bfloat162float(x) + __bfloat162float(y));
#else
    return __hadd(x, y);
#endif
}
-inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
+inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bfloat162 y)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh, fyl, fyh;
    fxl = __low2float(x);
@@ -105,15 +118,17 @@ inline __device__ __nv_bfloat162 bf16hsub2(const __nv_bfloat162 x, const __nv_bf
#endif
}
-inline __device__ __nv_bfloat16 bf16hsub(const __nv_bfloat16 x, const __nv_bfloat16 y) {
+inline __device__ __nv_bfloat16 bf16hsub(const __nv_bfloat16 x, const __nv_bfloat16 y)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
-   return __float2bfloat16( __bfloat162float(x) - __bfloat162float(y) );
+   return __float2bfloat16(__bfloat162float(x) - __bfloat162float(y));
#else
    return __hsub(x, y);
#endif
}
-inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bfloat162 y) {
+inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bfloat162 y)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh, fyl, fyh;
    fxl = __low2float(x);
@@ -126,15 +141,17 @@ inline __device__ __nv_bfloat162 bf16hmul2(const __nv_bfloat162 x, const __nv_bf
#endif
}
-inline __device__ __nv_bfloat16 bf16hmul(const __nv_bfloat16 x, const __nv_bfloat16 y) {
+inline __device__ __nv_bfloat16 bf16hmul(const __nv_bfloat16 x, const __nv_bfloat16 y)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
-   return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) );
+   return __float2bfloat16(__bfloat162float(x) * __bfloat162float(y));
#else
    return __hmul(x, y);
#endif
}
-inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bfloat162 y, const __nv_bfloat162 z) {
+inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bfloat162 y, const __nv_bfloat162 z)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh, fyl, fyh, fzl, fzh;
    fxl = __low2float(x);
@@ -149,19 +166,22 @@ inline __device__ __nv_bfloat162 bf16hfma2(const __nv_bfloat162 x, const __nv_bf
#endif
}
-inline __device__ __nv_bfloat16 bf16hfma(const __nv_bfloat16 x, const __nv_bfloat16 y, const __nv_bfloat16 z) {
+inline __device__ __nv_bfloat16 bf16hfma(const __nv_bfloat16 x, const __nv_bfloat16 y, const __nv_bfloat16 z)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
-   return __float2bfloat16( __bfloat162float(x) * __bfloat162float(y) + __bfloat162float(z));
+   return __float2bfloat16(__bfloat162float(x) * __bfloat162float(y) + __bfloat162float(z));
#else
    return __hfma(x, y, z);
#endif
}
-inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x) {
+inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fxl, fxh;
    fxl = __low2float(x);
-   fxh = __high2float(x);;
+   fxh = __high2float(x);
+   ;
    return __floats2bfloat162_rn(expf(fxl), expf(fxh));
#else
    return h2exp(x);
@@ -169,17 +189,27 @@ inline __device__ __nv_bfloat162 bf16exp2(const __nv_bfloat162 x) {
}
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)
-inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hmul2(x, y); };
-inline __device__ __nv_bfloat162 operator+(const __nv_bfloat162 x, const __nv_bfloat162 y) { return bf16hadd2(x, y); };
+inline __device__ __nv_bfloat162 operator*(const __nv_bfloat162 x, const __nv_bfloat162 y)
+{
+   return bf16hmul2(x, y);
+};
+inline __device__ __nv_bfloat162 operator+(const __nv_bfloat162 x, const __nv_bfloat162 y)
+{
+   return bf16hadd2(x, y);
+};
inline __device__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y)
{
-   __nv_bfloat162 t; t.x = x; t.y = y; return t;
+   __nv_bfloat162 t;
+   t.x = x;
+   t.y = y;
+   return t;
}
#endif
-inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
+inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c));
#else
@@ -187,7 +217,8 @@ inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_
#endif
}
-inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d) {
+inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c, __nv_bfloat16 d)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __float2bfloat16(__bfloat162float(a) + __bfloat162float(b) + __bfloat162float(c) + __bfloat162float(d));
#else
@@ -195,7 +226,8 @@ inline __device__ __nv_bfloat16 bf16hadd(__nv_bfloat16 a, __nv_bfloat16 b, __nv_
#endif
}
-inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fal, fah, fbl, fbh, fcl, fch;
    fal = __low2float(a);
@@ -210,7 +242,8 @@ inline __device__ __nv_bfloat162 bf16hadd2(__nv_bfloat162 a, __nv_bfloat162 b, _
#endif
}
-inline __device__ __nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c) {
+inline __device__ __nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_bfloat16 c)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    return __float2bfloat16(__bfloat162float(a) * __bfloat162float(b) * __bfloat162float(c));
#else
@@ -218,7 +251,8 @@ inline __device__ __nv_bfloat16 bf16hmul(__nv_bfloat16 a, __nv_bfloat16 b, __nv_
#endif
}
-inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fal, fah, fbl, fbh, fcl, fch;
    fal = __low2float(a);
@@ -233,7 +267,8 @@ inline __device__ __nv_bfloat162 bf16hmul2(__nv_bfloat162 a, __nv_bfloat162 b, _
#endif
}
-inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d) {
+inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c, __nv_bfloat162 d)
+{
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
    float fal, fah, fbl, fbh, fcl, fch, fdl, fdh;
    fal = __low2float(a);
@@ -250,6 +285,6 @@ inline __device__ __nv_bfloat162 bf16hfma2(__nv_bfloat162 a, __nv_bfloat162 b, _
#endif
}
#endif  // ENABLE_BF16
}  // namespace fastertransformer
\ No newline at end of file
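Note: the header above provides pre-SM80 fallbacks for bf16 math by widening to float, while on __CUDA_ARCH__ >= 800 the native bf16 intrinsics are used. A small illustrative kernel using these helpers follows; the kernel and array names are made up, and it assumes this header and <cuda_bf16.h> are included:

// Illustrative element-wise add over packed bf16 pairs; bf16hadd2 resolves to the
// float-widening fallback below __CUDA_ARCH__ 800 and to the native intrinsic otherwise.
__global__ void add_bf16x2(const __nv_bfloat162* a, const __nv_bfloat162* b, __nv_bfloat162* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = fastertransformer::bf16hadd2(a[i], b[i]);
    }
}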
@@ -18,4 +18,4 @@
#ifdef ENABLE_BF16
#include <cuda_bf16.h>
#endif
\ No newline at end of file
@@ -121,4 +121,4 @@ template void
invokeComputeFP8QuantizeScale(float* quant_ptr, const float* weights, const int k, const int n, cudaStream_t stream);
#endif  // ENABLE_FP8
}  // namespace fastertransformer
\ No newline at end of file
@@ -84,4 +84,4 @@ struct CustomARCommTypeConverter<__nv_bfloat16> {
};
#endif
}  // namespace fastertransformer
\ No newline at end of file
@@ -462,29 +462,29 @@ void generate_encoder_gemm_config(
        T* d_C = d_B + k * n * batchCount[i];
        T* dA_compressed;
        {
-           cusparseLtMatDescriptor_t matA;
+           cusparseLtMatDescriptor_t mat_A;
            CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-               &handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+               &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
            CHECK_CUSPARSE(
-               cusparseLtSpMMAPrune2(&handle, &matA, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
+               cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
            size_t compressed_size;
-           CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &matA, &compressed_size))
+           CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size))
            check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size));
-           CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &matA, true, opA, d_A, dA_compressed, stream))
+           CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream))
        }
        float exec_time = 99999.0f;
        int fast_algo = 0;
        for (int alg = 0; alg < 4; ++alg) {
            cudaDeviceSynchronize();
-           cusparseLtMatDescriptor_t matA, matB, matC;
+           cusparseLtMatDescriptor_t mat_A, mat_B, mat_C;
            void* d_workspace = nullptr;
            int num_streams = 1;
            cudaStream_t streams[1] = {stream};
            CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-               &handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
-           CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matB, k, n, k, alignment, CUDA_R_16F, order))
-           CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matC, m, n, m, alignment, CUDA_R_16F, order))
+               &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+           CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order))
+           CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
            gettimeofday(&start, NULL);
            for (int ite = 0; ite < ites; ++ite) {
                // initializing MatDesc takes a lot of time
@@ -494,7 +494,7 @@ void generate_encoder_gemm_config(
                cusparseLtMatmulAlgSelection_t alg_sel;
                cusparseLtMatmulPlan_t plan;
                CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(
-                   &handle, &matmul, opA, opB, &matA, &matB, &matC, &matC, compute_type))
+                   &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type))
                CHECK_CUSPARSE(
                    cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
                CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
...
/*
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once
#include "src/fastertransformer/utils/cublasAlgoMap.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <algorithm>
#include <cublasLt.h>
#include <cuda_runtime.h>
#include <map>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#include <vector>

namespace fastertransformer {

/* CAUTION : must match cublasLtMatmulTile_t */
const char* const matmulTileName[] = {"UNDEF",  "8x8",     "8x16",    "16x8",    "8x32",    "16x16",  "32x8",
                                      "8x64",   "16x32",   "32x16",   "64x8",    "32x32",   "32x64",  "64x32",
                                      "32x128", "64x64",   "128x32",  "64x128",  "128x64",  "64x256", "128x128",
                                      "256x64", "64x512",  "128x256", "256x128", "512x64",  "64x96",  "96*64",
                                      "96x128", "128x160", "160x128", "192x128", "128x192", "128x96", "END"};

int generate_encoder_igemm_config(
    int batch_size, int seq_len, int head_num, int size_per_head, void* buffer, bool isAppend = true);

int printPerfStructure(int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint);

int printBatchPerfStructure(
    int batchCount, int m, int n, int k, const customMatmulPerf_t& perf, FILE* fout, int hasPrint);

template<typename T, typename scaleT>
int LtIgemmCustomFind(cublasLtHandle_t ltHandle,
                      int              m,
                      int              n,
                      int              k,
                      const scaleT*    alpha, /* host pointer */
                      const int8_t*    A,
                      const int8_t*    B,
                      const scaleT*    beta, /* host pointer */
                      T*               C,
                      void*            workSpace,
                      size_t           workSpaceSize,
                      FILE*            fout);

template<typename T, typename scaleT>
int LtBatchIgemmCustomFind(cublasLtHandle_t ltHandle,
                           int              batchCount,
                           int              m,
                           int              n,
                           int              k,
                           const scaleT*    alpha, /* host pointer */
                           const int8_t*    A,
                           const int8_t*    B,
                           const scaleT*    beta, /* host pointer */
                           T*               C,
                           void*            workSpace,
                           size_t           workSpaceSize,
                           FILE*            fout);

void matInit(int rows, int cols, int8_t* p, int ld);

}  // namespace fastertransformer
@@ -617,15 +617,15 @@ void generate_gpt_gemm_config(int batch_size,
        T* d_C = d_B + k * n * batchCount[i];
        T* dA_compressed;
        {
-           cusparseLtMatDescriptor_t matA;
+           cusparseLtMatDescriptor_t mat_A;
            CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-               &handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+               &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
            CHECK_CUSPARSE(
-               cusparseLtSpMMAPrune2(&handle, &matA, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
+               cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
            size_t compressed_size;
-           CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &matA, &compressed_size))
+           CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size))
            check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size));
-           CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &matA, true, opA, d_A, dA_compressed, stream))
+           CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream))
        }
        float exec_time = 99999.0f;
@@ -633,14 +633,15 @@ void generate_gpt_gemm_config(int batch_size,
        if (isSparseGemmAvailable(m, n, k)) {
            for (int alg = 0; alg < 4; ++alg) {
                cudaDeviceSynchronize();
-               cusparseLtMatDescriptor_t matA, matB, matC;
+               cusparseLtMatDescriptor_t mat_A, mat_B, mat_C;
                void* d_workspace = nullptr;
                int num_streams = 1;
                cudaStream_t streams[1] = {stream};
                CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-                   &handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
-               CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matB, k, n, k, alignment, CUDA_R_16F, order))
-               CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matC, m, n, m, alignment, CUDA_R_16F, order))
+                   &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+               CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order))
+               CHECK_CUSPARSE(
+                   cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
                cudaDeviceSynchronize();
                gettimeofday(&start, NULL);
                for (int ite = 0; ite < ites; ++ite) {
@@ -651,7 +652,7 @@ void generate_gpt_gemm_config(int batch_size,
                    cusparseLtMatmulAlgSelection_t alg_sel;
                    cusparseLtMatmulPlan_t plan;
                    CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(
-                       &handle, &matmul, opA, opB, &matA, &matB, &matC, &matC, compute_type))
+                       &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type))
                    CHECK_CUSPARSE(
                        cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
                    CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
...
@@ -616,15 +616,15 @@ void generate_t5_gemm_config(int batch_size,
        T* d_C = d_B + k * n * batchCount[i];
        T* dA_compressed;
        {
-           cusparseLtMatDescriptor_t matA;
+           cusparseLtMatDescriptor_t mat_A;
            CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-               &handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+               &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
            CHECK_CUSPARSE(
-               cusparseLtSpMMAPrune2(&handle, &matA, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
+               cusparseLtSpMMAPrune2(&handle, &mat_A, true, opA, d_A, d_A, CUSPARSELT_PRUNE_SPMMA_STRIP, stream))
            size_t compressed_size;
-           CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &matA, &compressed_size))
+           CHECK_CUSPARSE(cusparseLtSpMMACompressedSize2(&handle, &mat_A, &compressed_size))
            check_cuda_error(cudaMalloc((void**)&dA_compressed, compressed_size));
-           CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &matA, true, opA, d_A, dA_compressed, stream))
+           CHECK_CUSPARSE(cusparseLtSpMMACompress2(&handle, &mat_A, true, opA, d_A, dA_compressed, stream))
        }
        float exec_time = 99999.0f;
@@ -632,14 +632,15 @@ void generate_t5_gemm_config(int batch_size,
        if (isSparseGemmAvailable(m, n, k)) {
            for (int alg = 0; alg < 4; ++alg) {
                cudaDeviceSynchronize();
-               cusparseLtMatDescriptor_t matA, matB, matC;
+               cusparseLtMatDescriptor_t mat_A, mat_B, mat_C;
                void* d_workspace = nullptr;
                int num_streams = 1;
                cudaStream_t streams[1] = {stream};
                CHECK_CUSPARSE(cusparseLtStructuredDescriptorInit(
-                   &handle, &matA, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
-               CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matB, k, n, k, alignment, CUDA_R_16F, order))
-               CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &matC, m, n, m, alignment, CUDA_R_16F, order))
+                   &handle, &mat_A, m, k, m, alignment, CUDA_R_16F, order, CUSPARSELT_SPARSITY_50_PERCENT))
+               CHECK_CUSPARSE(cusparseLtDenseDescriptorInit(&handle, &mat_B, k, n, k, alignment, CUDA_R_16F, order))
+               CHECK_CUSPARSE(
+                   cusparseLtDenseDescriptorInit(&handle, &mat_C, m, n, m, alignment, CUDA_R_16F, order))
                cudaDeviceSynchronize();
                gettimeofday(&start, NULL);
                for (int ite = 0; ite < ites; ++ite) {
@@ -650,7 +651,7 @@ void generate_t5_gemm_config(int batch_size,
                    cusparseLtMatmulAlgSelection_t alg_sel;
                    cusparseLtMatmulPlan_t plan;
                    CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(
-                       &handle, &matmul, opA, opB, &matA, &matB, &matC, &matC, compute_type))
+                       &handle, &matmul, opA, opB, &mat_A, &mat_B, &mat_C, &mat_C, compute_type))
                    CHECK_CUSPARSE(
                        cusparseLtMatmulAlgSelectionInit(&handle, &alg_sel, &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
                    CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
...
@@ -13,4 +13,4 @@ public:
    virtual void* getSharedObject() = 0;
};
}  // namespace fastertransformer
\ No newline at end of file
@@ -27,8 +27,7 @@ namespace fastertransformer {
class Logger {
public:
-   enum Level
-   {
+   enum Level {
        TRACE = 0,
        DEBUG = 10,
        INFO  = 20,
@@ -41,7 +40,7 @@ public:
        thread_local Logger instance;
        return instance;
    }
    Logger(Logger const&) = delete;
    void operator=(Logger const&) = delete;
    template<typename... Args>
...
@@ -26,4 +26,4 @@ if (TORCH_VERSION VERSION_GREATER_EQUAL "1.9.0")
    target_link_libraries(${LIB_NAME} "${TORCH_LIBRARIES}" fpA_intB_gemm logger)
else()
    message("TORCH_VERSION ${TORCH_VERSION} < 1.9.0, skipping compiling th_moe_ops.cc because QUInt4x2 is supported after torch 1.9.0")
endif()
\ No newline at end of file
@@ -369,4 +369,4 @@ TORCH_LIBRARY(gemm_dq_unit_ops, m)
    m.def("benchmark_against_cublas_fp", benchmark_against_cublas_fp);
    m.def("fused_gemm_dq_bias_act", fused_gemm_dq_bias_act);
}
}  // namespace torch_ext
\ No newline at end of file
This diff is collapsed.
@@ -21,4 +21,4 @@ add_definitions(-DTORCH_CUDA=1)
set(EXE_NAME "int8_gemm_test")
add_executable(${EXE_NAME} ${int8_test_files})
set_target_properties(${EXE_NAME} PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(${EXE_NAME} PUBLIC "${TORCH_LIBRARIES}" int8_gemm tensor logger)
\ No newline at end of file
@@ -38,9 +38,9 @@ namespace ft = fastertransformer;
template<typename T>
void int8_gemm_test(
    const int            m,
    const int            n,
    const int            k,
    const at::ScalarType output_data_type,
    const QuantMode      quant_mode,
    const int            iters)
...