Unverified commit fe46dac2 authored by AllentDan, committed by GitHub

Add lint action (#32)

* temp

* fix lint

* csrc->src

* remove clang-format

* skip .rst

* skip doc

* clang-format

version

version

* mat_B
parent e8ab4ba3
@@ -25,8 +25,7 @@ struct Request {
     using Callback = std::function<void(std::unordered_map<std::string, Tensor>*)>;
     Callback stream_cb;
-    enum
-    {
+    enum {
         kInvalid = 1,
         kConflict = 2,
         kBusy = 3,
......
@@ -10,7 +10,7 @@
 #include <cutlass/half.h>
 #include <cutlass/platform/platform.h>
-// modifiy from:
+// modified from:
 // https://github.com/NVIDIA/cutlass/blob/main/examples/41_fused_multi_head_attention/kernel_forward.h
 namespace fastertransformer {
......
@@ -11,7 +11,8 @@ namespace cg = cooperative_groups;
 namespace fastertransformer {
 template<typename T>
-struct res_norm_ops_t {};
+struct res_norm_ops_t {
+};
 template<typename T>
 struct res_norm_t {
@@ -144,7 +145,7 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
 template<typename T>
 void invokeFusedAddBiasResidualRMSNorm(
-    T* residual, T* inout, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream)
+    T* residual, T* in_out, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream)
 {
     constexpr int PACK_DIM = sizeof(uint4) / sizeof(T);
     FT_CHECK(n_dims % PACK_DIM == 0);
@@ -154,7 +155,7 @@ void invokeFusedAddBiasResidualRMSNorm(
     n_threads = (n_threads + 31) / 32 * 32;  // round up to the nearest multiple of warp size
     fusedAddBiasResidualNorm<<<batch_size, n_threads, 0, stream>>>(
-        residual, inout, bias, scale, eps, batch_size, n_dims);
+        residual, in_out, bias, scale, eps, batch_size, n_dims);
 }
 template void
......
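As a side note, here is a minimal host-side sketch (not part of the diff) of the thread-count arithmetic the launcher above performs, assuming T = half and an illustrative n_dims of 4096:

// Each thread handles PACK_DIM = sizeof(uint4) / sizeof(T) elements (one 16-byte load),
// and the thread count is rounded up to a whole number of 32-thread warps.
#include <cstdio>

int main()
{
    const int n_dims   = 4096;           // assumed hidden dimension
    const int sizeof_T = 2;              // half precision
    const int pack_dim = 16 / sizeof_T;  // sizeof(uint4) / sizeof(T) = 8
    int n_threads      = n_dims / pack_dim;           // 512 threads
    n_threads          = (n_threads + 31) / 32 * 32;  // already a multiple of the warp size here
    std::printf("threads per block: %d\n", n_threads);
    return 0;
}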
@@ -6,6 +6,6 @@ namespace fastertransformer {
 template<typename T>
 void invokeFusedAddBiasResidualRMSNorm(
-    T* residual, T* inout, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream);
+    T* residual, T* in_out, const T* bias, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream);
 }  // namespace fastertransformer
 // Copyright (c) OpenMMLab. All rights reserved.
+#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
 #include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
 #include "src/fastertransformer/models/llama/llama_kernels.h"
 #include "src/fastertransformer/models/llama/llama_utils.h"
 #include "src/fastertransformer/utils/cuda_type_utils.cuh"
-#include "src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h"
 namespace fastertransformer {
@@ -293,7 +293,8 @@ inline __device__ float2 float2div(float a, float2 b)
     return c;
 }
-static inline __device__ half4 char4_scale_to_half4(char4 value, const float scale) {
+static inline __device__ half4 char4_scale_to_half4(char4 value, const float scale)
+{
     half4 dst;
     dst.x = __float2half(value.x * scale);
     dst.y = __float2half(value.y * scale);
@@ -302,16 +303,18 @@ static inline __device__ half4 char4_scale_to_half4(char4 value, const float sca
     return dst;
 }
-static inline __device__ uint32_t float4_to_char4(float x,
-                                                  float y,
-                                                  float z,
-                                                  float w) {
+static inline __device__ uint32_t float4_to_char4(float x, float y, float z, float w)
+{
     uint32_t dst;
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 720
-    uint32_t a; asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(a) : "f"(x));
-    uint32_t b; asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(b) : "f"(y));
-    uint32_t c; asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(c) : "f"(z));
-    uint32_t d; asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(d) : "f"(w));
+    uint32_t a;
+    asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(a) : "f"(x));
+    uint32_t b;
+    asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(b) : "f"(y));
+    uint32_t c;
+    asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(c) : "f"(z));
+    uint32_t d;
+    asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(d) : "f"(w));
     asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, 0;\n" : "=r"(dst) : "r"(d), "r"(c));
     asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, %0;\n" : "+r"(dst) : "r"(b), "r"(a));
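For readers unfamiliar with the PTX above, the following plain-CUDA sketch (illustrative only, not the kernel's actual code path) shows what the two cvt instruction kinds are meant to compute: round each float to the nearest integer, saturate to the signed 8-bit range, and pack the four bytes with x in the lowest byte:

// Reference version of the saturating float4 -> packed char4 conversion.
// Assumes little-endian byte order, matching how the result is later read as a char4.
#include <cstdint>

static inline __device__ uint32_t float4_to_char4_ref(float x, float y, float z, float w)
{
    auto sat = [](float v) -> uint32_t {
        int i = __float2int_rn(v);       // round to nearest even, like cvt.rni
        i     = max(-128, min(127, i));  // saturate to the int8 range, like .sat.s8
        return static_cast<uint32_t>(i) & 0xffu;
    };
    return sat(x) | (sat(y) << 8) | (sat(z) << 16) | (sat(w) << 24);
}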
@@ -380,7 +383,6 @@ __global__ void extend_value_cache_int8(int8_t** v_dst,
 }
 }
-
 template<typename T>
 void invokeExtendKVCache(T** k_dst,
                          T** v_dst,
@@ -404,18 +406,48 @@ void invokeExtendKVCache(T** k_dst,
     dim3 grid((max_q_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num);
     if (quant & QuantPolicy::kCacheKVInt8) {
-        extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(
-            reinterpret_cast<int8_t**>(k_dst), dst_offset, k_src, local_head_num, size_per_head, query_length, history_length, max_q_len, max_seq_len, kv_scale[0]);
-
-        extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(
-            reinterpret_cast<int8_t**>(v_dst), dst_offset, v_src, local_head_num, size_per_head, query_length, history_length, max_q_len, max_seq_len, kv_scale[1]);
-
-    } else {
-        extend_value_cache<<<grid, block_sz, 0, stream>>>(
-            k_dst, dst_offset, k_src, local_head_num, size_per_head, query_length, history_length, max_q_len, max_seq_len);
-
-        extend_value_cache<<<grid, block_sz, 0, stream>>>(
-            v_dst, dst_offset, v_src, local_head_num, size_per_head, query_length, history_length, max_q_len, max_seq_len);
+        extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(reinterpret_cast<int8_t**>(k_dst),
+                                                               dst_offset,
+                                                               k_src,
+                                                               local_head_num,
+                                                               size_per_head,
+                                                               query_length,
+                                                               history_length,
+                                                               max_q_len,
+                                                               max_seq_len,
+                                                               kv_scale[0]);
+
+        extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(reinterpret_cast<int8_t**>(v_dst),
+                                                               dst_offset,
+                                                               v_src,
+                                                               local_head_num,
+                                                               size_per_head,
+                                                               query_length,
+                                                               history_length,
+                                                               max_q_len,
+                                                               max_seq_len,
+                                                               kv_scale[1]);
+    }
+    else {
+        extend_value_cache<<<grid, block_sz, 0, stream>>>(k_dst,
+                                                          dst_offset,
+                                                          k_src,
+                                                          local_head_num,
+                                                          size_per_head,
+                                                          query_length,
+                                                          history_length,
+                                                          max_q_len,
+                                                          max_seq_len);
+        extend_value_cache<<<grid, block_sz, 0, stream>>>(v_dst,
+                                                          dst_offset,
+                                                          v_src,
+                                                          local_head_num,
+                                                          size_per_head,
+                                                          query_length,
+                                                          history_length,
+                                                          max_q_len,
+                                                          max_seq_len);
     }
 }
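A minimal sketch of the int8 KV-cache round trip implied by these kernels, assuming kv_scale[i] acts as a dequantization factor (int8 value times scale approximates the original, as char4_scale_to_half4 above suggests); the helper names below are illustrative:

#include <cmath>
#include <cstdint>
#include <cstdio>

static int8_t quantize_kv(float v, float scale)
{
    int i = static_cast<int>(std::nearbyint(v / scale));  // divide by scale, round to nearest
    if (i > 127) { i = 127; }                              // saturate to the int8 range
    if (i < -128) { i = -128; }
    return static_cast<int8_t>(i);
}

static float dequantize_kv(int8_t q, float scale)
{
    return q * scale;  // mirrors char4_scale_to_half4: quantized value * scale
}

int main()
{
    const float kv_scale = 0.05f;  // assumed per-tensor scale
    float  v = 1.234f;
    int8_t q = quantize_kv(v, kv_scale);
    std::printf("%f -> %d -> %f\n", v, q, dequantize_kv(q, kv_scale));  // 1.234 -> 25 -> 1.25
    return 0;
}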
@@ -492,7 +524,6 @@ __global__ void transpose_value_cache(T* v_dst, //
 }
 }
-
 template<typename T>
 __global__ void transpose_value_cache_int8(T* v_dst,  //
                                            const int8_t** v_src,
@@ -562,13 +593,27 @@ void invokeTransposeKVCache(T* key_cache_trans,
     dim3 grid((max_kv_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num);
     if (quant & QuantPolicy::kCacheKVInt8) {
-        transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(
-            key_cache_trans, reinterpret_cast<const int8_t**>(key_cache), src_offset, head_num, size_per_head, key_length, max_kv_len, max_seq_len, kv_scale[0]);
-
-        transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(
-            val_cache_trans, reinterpret_cast<const int8_t**>(val_cache), src_offset, head_num, size_per_head, key_length, max_kv_len, max_seq_len, kv_scale[1]);
-
-    } else {
+        transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(key_cache_trans,
+                                                                  reinterpret_cast<const int8_t**>(key_cache),
+                                                                  src_offset,
+                                                                  head_num,
+                                                                  size_per_head,
+                                                                  key_length,
+                                                                  max_kv_len,
+                                                                  max_seq_len,
+                                                                  kv_scale[0]);
+        transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(val_cache_trans,
+                                                                  reinterpret_cast<const int8_t**>(val_cache),
+                                                                  src_offset,
+                                                                  head_num,
+                                                                  size_per_head,
+                                                                  key_length,
+                                                                  max_kv_len,
+                                                                  max_seq_len,
+                                                                  kv_scale[1]);
+    }
+    else {
         transpose_value_cache<<<grid, block_sz, 0, stream>>>(
             key_cache_trans, key_cache, src_offset, head_num, size_per_head, key_length, max_kv_len, max_seq_len);
@@ -577,10 +622,34 @@ void invokeTransposeKVCache(T* key_cache_trans,
 }
 }
-template void invokeTransposeKVCache(
-    float*, float*, const float**, const float**, size_t, int, const int*, int, int, int, int, cudaStream_t stream, int, const float*);
-template void invokeTransposeKVCache(
-    half*, half*, const half**, const half**, size_t, int, const int*, int, int, int, int, cudaStream_t stream, int, const float*);
+template void invokeTransposeKVCache(float*,
+                                     float*,
+                                     const float**,
+                                     const float**,
+                                     size_t,
+                                     int,
+                                     const int*,
+                                     int,
+                                     int,
+                                     int,
+                                     int,
+                                     cudaStream_t stream,
+                                     int,
+                                     const float*);
+template void invokeTransposeKVCache(half*,
+                                     half*,
+                                     const half**,
+                                     const half**,
+                                     size_t,
+                                     int,
+                                     const int*,
+                                     int,
+                                     int,
+                                     int,
+                                     int,
+                                     cudaStream_t stream,
+                                     int,
+                                     const float*);
 __global__ void gatherOutput(int* output_ids,
                              const int* ids,
......
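For context, the `template void invokeTransposeKVCache(...)` lines above are explicit template instantiations, and C++ allows the parameter names to be omitted there. A small self-contained example of the same feature (the function below is illustrative, not from this repository):

// Explicit instantiation definitions compile concrete versions of a template
// into this translation unit so other .cc/.cu files can link against them.
template<typename T>
void scale_buffer(T* data, int n, T factor)
{
    for (int i = 0; i < n; ++i) {
        data[i] *= factor;
    }
}

template void scale_buffer(float*, int, float);    // parameter names are optional here
template void scale_buffer(double*, int, double);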
@@ -18,8 +18,7 @@ enum QuantPolicy {
     kCacheKVInt8 = 0x04,
 };
-enum CmpMode
-{
+enum CmpMode {
     kCmpNone,
     kCmpRead,
     kCmpWrite,
......
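QuantPolicy values such as kCacheKVInt8 = 0x04 are bit flags: they combine with bitwise OR and are tested with bitwise AND, which is why the KV-cache launchers above check `quant & QuantPolicy::kCacheKVInt8`. A minimal sketch (the kWeightInt8 flag here is an assumed example, not the repository's enum):

#include <cstdio>

enum QuantPolicyDemo {      // illustrative copy, not the real enum
    kNone        = 0x00,
    kWeightInt8  = 0x01,    // assumed example flag
    kCacheKVInt8 = 0x04,
};

int main()
{
    int quant = kWeightInt8 | kCacheKVInt8;  // enable two independent policies
    if (quant & kCacheKVInt8) {
        std::printf("int8 KV cache path selected\n");
    }
    return 0;
}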
@@ -25,7 +25,8 @@
 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// Modified from https://github.com/triton-inference-server/fastertransformer_backend/blob/main/src/libfastertransformer.cc
+// Modified from
+// https://github.com/triton-inference-server/fastertransformer_backend/blob/main/src/libfastertransformer.cc
 #include <stdint.h>
@@ -1399,7 +1400,7 @@ void ModelInstanceState::SetInputTensors(
     auto batch_input_name = batch_input.TargetNames()[0];
     // we only take care of the ragged input_ids
-    // Assume the first dimention (length) are different and others are the
+    // Assume the first dimension (length) are different and others are the
     // same BATCH_ITEM_SHAPE [num_requests (batches), num_dims (excluding
     // batch dimension)]
     if (batch_input_kind == BatchInput::Kind::BATCH_ITEM_SHAPE
@@ -1464,7 +1465,7 @@ void ModelInstanceState::SetInputTensors(
         param.batch_input_ptr + param.batch_intput_size,
         [&](int x) { return x != param.batch_input_ptr[0]; });
-    // calculate statics of elements
+    // calculate statistics of elements
     if (param.is_input_ragged) {
         param.max_elements_per_seq =
             *std::max_element(param.batch_input_ptr, param.batch_input_ptr + param.batch_intput_size);
......
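For context, a small sketch of the ragged-batch bookkeeping the code above performs: one length per request is inspected, the batch counts as ragged if any length differs from the first, and the maximum per-sequence element count is recorded. Names here are illustrative (`per_request_len` stands in for `param.batch_input_ptr`):

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> per_request_len = {7, 12, 9};  // assumed per-request input_ids lengths

    // Ragged if any request's length differs from the first one.
    bool is_ragged = std::any_of(per_request_len.begin(), per_request_len.end(),
                                 [&](int x) { return x != per_request_len[0]; });

    int max_elements_per_seq = 0;
    if (is_ragged) {
        max_elements_per_seq = *std::max_element(per_request_len.begin(), per_request_len.end());
    }
    std::printf("ragged=%d, max=%d\n", is_ragged, max_elements_per_seq);  // ragged=1, max=12
    return 0;
}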
-# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright
-# notice, this list of conditions and the following disclaimer in the
-# documentation and/or other materials provided with the distribution.
-# * Neither the name of NVIDIA CORPORATION nor the names of its
-# contributors may be used to endorse or promote products derived
-# from this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+#Copyright(c) 2021 - 2022, NVIDIA CORPORATION.All rights reserved.
+#
+#Redistribution and use in source and binary forms, with or without
+#modification, are permitted provided that the following conditions
+#are met:
+#* Redistributions of source code must retain the above copyright
+#notice, this list of conditions and the following disclaimer.
+#* Redistributions in binary form must reproduce the above copyright
+#notice, this list of conditions and the following disclaimer in the
+#documentation and / or other materials provided with the distribution.
+#* Neither the name of NVIDIA CORPORATION nor the names of its
+#contributors may be used to endorse or promote products derived
+#from this software without specific prior written permission.
+#
+#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+#EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+#IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+#PURPOSE ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+#EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO,
+#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+#OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 {
 global:
     TRITONBACKEND_*;
-local: *;
+local:
+    *;
 };
@@ -309,7 +309,8 @@ std::string LlamaTritonModel<T>::toString()
        << "\nuse_context_fmha: " << use_context_fmha_ << "\nstart_id: " << start_id_
        << "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_
        << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_
-       << "\nprefix_cache_len: " << prefix_cache_len_ << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << quant_policy_ << std::endl;
+       << "\nprefix_cache_len: " << prefix_cache_len_ << "\nmodel_dir: " << model_dir_
+       << "\nquant_policy: " << quant_policy_ << std::endl;
     return ss.str();
 }
......
@@ -15,7 +15,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
 #include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
 #include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
......
@@ -15,7 +15,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
 #pragma once
......
@@ -15,7 +15,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.cpp
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.cpp
 #include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
 #include "src/fastertransformer/utils/nccl_utils.h"
......
@@ -15,7 +15,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.hpp
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/transformer_triton_backend.hpp
 #pragma once
......
@@ -35,8 +35,7 @@
 namespace fastertransformer {
-typedef enum datatype_enum
-{
+typedef enum datatype_enum {
     TYPE_INVALID,
     TYPE_BOOL,
     TYPE_UINT8,
@@ -99,8 +98,7 @@ DataType getTensorType()
 }
 }
-typedef enum memorytype_enum
-{
+typedef enum memorytype_enum {
     MEMORY_CPU,
     MEMORY_CPU_PINNED,
     MEMORY_GPU
......