Add lint action (#32)

* temp
* fix lint
* csrc->src
* remove clang-format
* skip .rst
* skip doc
* clang-format version version
* mat_B

Add lint action (#32)
* temp
* fix lint
* csrc->src
* remove clang-format
* skip .rst
* skip doc
* clang-format version version
* mat_B
fe46dac2 · AllentDan · GitHub · e8ab4ba3 · fe46dac2 · fe46dac2
Unverified Commit fe46dac2 authored Jul 01, 2023 by AllentDan Committed by GitHub Jul 01, 2023
20 changed files
--- a/src/fastertransformer/kernels/decoding_kernels.cu
+++ b/src/fastertransformer/kernels/decoding_kernels.cu
@@ -98,19 +98,19 @@ template void invokeDecodingInitialize(bool*          finished,

 // PROMPT_SRC: 0 --> no prompts, 1 --> from loaded prompts, 2 --> from request prompts
 template<typename T>
-__global__ void embeddingLookupPosEncoding(T*             from_tensor,
-                                           const T*       embedding_table,
-                                           const T*       position_encoding,
-                                           const int*     all_ids,
-                                           const int*     padding_count,
-                                           const int*     input_lengths,
-                                           const int      local_token_num,
-                                           const int64_t  hidden_units,
-                                           const int      step,
-                                           const int      max_input_length,
-                                           const int      token_num,
-                                           const int      ite,
-                                           const T        scale)
+__global__ void embeddingLookupPosEncoding(T*            from_tensor,
+                                           const T*      embedding_table,
+                                           const T*      position_encoding,
+                                           const int*    all_ids,
+                                           const int*    padding_count,
+                                           const int*    input_lengths,
+                                           const int     local_token_num,
+                                           const int64_t hidden_units,
+                                           const int     step,
+                                           const int     max_input_length,
+                                           const int     token_num,
+                                           const int     ite,
+                                           const T       scale)
 {
    // 1. lookup from embedding table
    // 2. multiply scale

--- a/src/fastertransformer/kernels/gpt_kernels.cu
+++ b/src/fastertransformer/kernels/gpt_kernels.cu
@@ -242,18 +242,18 @@ __global__ void inputIdsEmbeddingLookupPosEncodingSoftPrompt(inputIdsEmbeddingLo
        // embedding lookup from word ids [batch, beam, length] (part of [batch, beam, max_input_length]), [vocab,
        // hidden] and [batch, max_prefix_soft_prompt_length, hidden] to generate embedding [batch, beam, length +
        // max_prefix_soft_prompt_length, hidden]
-        int       tmp_index = index;
-        const int hidden_id = tmp_index % param.hidden_units;
-        tmp_index           = (tmp_index - hidden_id) / param.hidden_units;
-        const int seq_id    = tmp_index % (param.max_prefix_soft_prompt_length + param.max_input_length);
-        tmp_index           = (tmp_index - seq_id) / (param.max_prefix_soft_prompt_length + param.max_input_length);
-        const int beam_id   = tmp_index % param.beam_width;
-        tmp_index           = (tmp_index - beam_id) / param.beam_width;
-        const int batch_id  = tmp_index % param.batch_size;
+        int       tmp_index    = index;
+        const int hidden_id    = tmp_index % param.hidden_units;
+        tmp_index              = (tmp_index - hidden_id) / param.hidden_units;
+        const int seq_id       = tmp_index % (param.max_prefix_soft_prompt_length + param.max_input_length);
+        tmp_index              = (tmp_index - seq_id) / (param.max_prefix_soft_prompt_length + param.max_input_length);
+        const int beam_id      = tmp_index % param.beam_width;
+        tmp_index              = (tmp_index - beam_id) / param.beam_width;
+        const int     batch_id = tmp_index % param.batch_size;
        const int64_t hidden_units = param.hidden_units;
-        T         embedding =
+        T             embedding =
            (seq_id < param.prefix_soft_prompt_lengths[batch_id]) ?
-                        (T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units
+                            (T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units
                                                      + seq_id * hidden_units + hidden_id] :
                            param.embedding_table[param.input_ids[batch_id * param.beam_width * param.max_input_length
                                                      + beam_id * param.max_input_length

--- a/src/fastertransformer/kernels/reduce_kernel_utils.cuh
+++ b/src/fastertransformer/kernels/reduce_kernel_utils.cuh
@@ -21,50 +21,46 @@
 #else
 #include <cooperative_groups.h>
 #endif
-#include <cuda_fp16.h>
 #include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include "src/fastertransformer/utils/cuda_type_utils.cuh"
+#include <cuda_fp16.h>
 #include <cuda_runtime.h>
 #include <curand_kernel.h>
 #include <float.h>
 #include <type_traits>
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"

 namespace cg = cooperative_groups;

 namespace fastertransformer {

-template <int VPT>
+template<int VPT>
 struct BytesToType;

-template <>
-struct BytesToType<2>
-{
+template<>
+struct BytesToType<2> {
    using type = uint16_t;
 };
-template <>
-struct BytesToType<4>
-{
+template<>
+struct BytesToType<4> {
    using type = uint32_t;
 };
-template <>
-struct BytesToType<8>
-{
+template<>
+struct BytesToType<8> {
    using type = uint64_t;
 };
-template <>
-struct BytesToType<16>
-{
+template<>
+struct BytesToType<16> {
    using type = float4;
 };

-template <int Bytes>
+template<int Bytes>
 __device__ inline void copy(const void* local, void* data)
 {
    using T = typename BytesToType<Bytes>::type;

-    const T* in = static_cast<const T*>(local);
-    T* out = static_cast<T*>(data);
-    *out = *in;
+    const T* in  = static_cast<const T*>(local);
+    T*       out = static_cast<T*>(data);
+    *out         = *in;
 }

 static const float HALF_FLT_MAX = 65504.F;
@@ -134,7 +130,6 @@ __inline__ __device__ T blockReduceMax(T val)
    return val;
 }

-
 /* Calculate the maximum of all elements in a block */
 template<typename T>
 __inline__ __device__ T blockAllReduceMax(T val)

--- a/src/fastertransformer/kernels/stop_criteria_kernels.cu
+++ b/src/fastertransformer/kernels/stop_criteria_kernels.cu
@@ -149,7 +149,7 @@ void invokeLengthCriterion(bool*           finished,
    h_pinned_finished_sum_[0] = -1;

    length_criterion<<<grid, block, 0, stream>>>(
-        finished, should_stop, h_pinned_finished_sum_, sequence_limit_length, batch_size, beam_width, step);    
+        finished, should_stop, h_pinned_finished_sum_, sequence_limit_length, batch_size, beam_width, step);
    while (((volatile int*)h_pinned_finished_sum_)[0] == -1) {};
    sync_check_cuda_error();


--- a/src/fastertransformer/kernels/unfused_attention_kernels.cu
+++ b/src/fastertransformer/kernels/unfused_attention_kernels.cu
@@ -1472,7 +1472,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T*
            k = *reinterpret_cast<Vec_t*>(k_smem + half_idx * smem_pitch + intra_half_idx);
        }
    }
-    if (!is_masked && !q_buf) {  // also skip modifing QKV if q/k/v_buf are present
+    if (!is_masked && !q_buf) {  // also skip modifying QKV if q/k/v_buf are present
        *reinterpret_cast<Vec_t*>(&QKV[src_q_idx]) = q;
        *reinterpret_cast<Vec_t*>(&QKV[src_k_idx]) = k;
        *reinterpret_cast<Vec_t*>(&QKV[src_v_idx]) = v;

--- a/src/fastertransformer/layers/CMakeLists.txt
+++ b/src/fastertransformer/layers/CMakeLists.txt
@@ -23,4 +23,4 @@ set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS  ON)
 target_link_libraries(DynamicDecodeLayer PUBLIC -lcudart
                        TopKSamplingLayer TopPSamplingLayer
                        OnlineBeamSearchLayer BeamSearchLayer ban_bad_words stop_criteria
-                        gpt_kernels tensor nvtx_utils)
\ No newline at end of file
+                        gpt_kernels tensor nvtx_utils)
--- a/src/fastertransformer/layers/FfnFP8Layer.cc
+++ b/src/fastertransformer/layers/FfnFP8Layer.cc
--- a/src/fastertransformer/layers/FfnFP8Weight.h
+++ b/src/fastertransformer/layers/FfnFP8Weight.h
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "FfnWeight.h"
-#include "src/fastertransformer/utils/ScaleList.h"
-namespace fastertransformer {
-
-template<typename T1, typename T2>
-struct FfnFP8Weight: FfnWeight<T1, T2> {
-    ScaleList* scale_list_ptr;
-    float*     identity_scale;
-    float*     identity_h_scale;
-};
-
-}  // namespace fastertransformer
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "FfnWeight.h"
+#include "src/fastertransformer/utils/ScaleList.h"
+namespace fastertransformer {
+
+template<typename T1, typename T2>
+struct FfnFP8Weight: FfnWeight<T1, T2> {
+    ScaleList* scale_list_ptr;
+    float*     identity_scale;
+    float*     identity_h_scale;
+};
+
+}  // namespace fastertransformer
--- a/src/fastertransformer/layers/FfnINT8Weight.h
+++ b/src/fastertransformer/layers/FfnINT8Weight.h
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "FfnWeight.h"
-#include "src/fastertransformer/utils/ScaleList.h"
-namespace fastertransformer {
-
-template<typename T>
-struct FfnINT8Weight: FfnWeight<T> {
-    ScaleList* scale_list_ptr;
-};
-
-}  // namespace fastertransformer
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "FfnWeight.h"
+#include "src/fastertransformer/utils/ScaleList.h"
+namespace fastertransformer {
+
+template<typename T>
+struct FfnINT8Weight: FfnWeight<T> {
+    ScaleList* scale_list_ptr;
+};
+
+}  // namespace fastertransformer
--- a/src/fastertransformer/layers/FfnLayerINT8.cc
+++ b/src/fastertransformer/layers/FfnLayerINT8.cc
--- a/src/fastertransformer/layers/FfnLayerINT8.h
+++ b/src/fastertransformer/layers/FfnLayerINT8.h
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "FfnINT8Weight.h"
-#include "src/fastertransformer/kernels/activation_int8_kernels.h"
-#include "src/fastertransformer/layers/BaseLayer.h"
-#include "src/fastertransformer/utils/ScaleList.h"
-#include "src/fastertransformer/utils/Tensor.h"
-#include "src/fastertransformer/utils/allocator.h"
-#include "src/fastertransformer/utils/cublasINT8MMWrapper.h"
-#include "src/fastertransformer/utils/memory_utils.h"
-#include <vector>
-
-namespace fastertransformer {
-
-template<typename T>
-class GeluFfnLayerINT8;
-
-template<typename T>
-class ReluFfnLayerINT8;
-
-template<typename T>
-class FfnLayerINT8: public BaseLayer {
-private:
-    // buffer handling
-    size_t max_token_num_ = 0;
-
-    // meta data
-    size_t head_num_;
-    size_t size_per_head_;
-
-    // calculated data
-    size_t hidden_units_;
-
-    void allocateBuffer() override;
-    void freeBuffer() override;
-    bool isValidTokenNum(size_t token_num);
-
-protected:
-    size_t inter_size_;
-    int    int8_mode_;
-    bool   sparse_;
-
-    int*         inter_int_buf_;
-    int8_t*      inter_buf_;
-    virtual void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) = 0;
-
-public:
-    FfnLayerINT8(size_t           max_batch_size,
-                 size_t           max_seq_len,
-                 size_t           head_num,
-                 size_t           size_per_head,
-                 size_t           inter_size,
-                 int              int8_mode,
-                 cudaStream_t     stream,
-                 cublasMMWrapper* cublas_wrapper,
-                 IAllocator*      allocator,
-                 bool             is_free_buffer_after_forward,
-                 bool             sparse = false);
-
-    FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer);
-
-    ~FfnLayerINT8();
-
-    void forward(std::vector<fastertransformer::Tensor>*       output_tensors,
-                 const std::vector<fastertransformer::Tensor>* input_tensors,
-                 const FfnWeight<T>*                           ffn_weights);
-
-    friend GeluFfnLayerINT8<T>;
-    friend ReluFfnLayerINT8<T>;
-};
-
-template<typename T>
-class GeluFfnLayerINT8: public FfnLayerINT8<T> {
-public:
-    GeluFfnLayerINT8(size_t           max_batch_size,
-                     size_t           max_seq_len,
-                     size_t           head_num,
-                     size_t           size_per_head,
-                     size_t           inter_size,
-                     int              int8_mode,
-                     cudaStream_t     stream,
-                     cublasMMWrapper* cublas_wrapper,
-                     IAllocator*      allocator,
-                     bool             is_free_buffer_after_forward,
-                     bool             sparse = false);
-
-    GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& ffn_layer);
-
-    ~GeluFfnLayerINT8() = default;
-
-private:
-    using FfnLayerINT8<T>::inter_int_buf_;
-    using FfnLayerINT8<T>::inter_buf_;
-    using FfnLayerINT8<T>::inter_size_;
-    using FfnLayerINT8<T>::stream_;
-    using FfnLayerINT8<T>::int8_mode_;
-    using FfnLayerINT8<T>::sparse_;
-    using FfnLayerINT8<T>::hidden_units_;
-    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
-};
-
-template<typename T>
-class ReluFfnLayerINT8: public FfnLayerINT8<T> {
-public:
-    ReluFfnLayerINT8(size_t           max_batch_size,
-                     size_t           max_seq_len,
-                     size_t           head_num,
-                     size_t           size_per_head,
-                     size_t           inter_size,
-                     int              int8_mode,
-                     cudaStream_t     stream,
-                     cublasMMWrapper* cublas_wrapper,
-                     IAllocator*      allocator,
-                     bool             is_free_buffer_after_forward);
-
-    ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& ffn_layer);
-
-    ~ReluFfnLayerINT8() = default;
-
-private:
-    using FfnLayerINT8<T>::inter_int_buf_;
-    using FfnLayerINT8<T>::inter_buf_;
-    using FfnLayerINT8<T>::inter_size_;
-    using FfnLayerINT8<T>::stream_;
-    using FfnLayerINT8<T>::int8_mode_;
-    using FfnLayerINT8<T>::hidden_units_;
-    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
-};
-
-}  // namespace fastertransformer
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "FfnINT8Weight.h"
+#include "src/fastertransformer/kernels/activation_int8_kernels.h"
+#include "src/fastertransformer/layers/BaseLayer.h"
+#include "src/fastertransformer/utils/ScaleList.h"
+#include "src/fastertransformer/utils/Tensor.h"
+#include "src/fastertransformer/utils/allocator.h"
+#include "src/fastertransformer/utils/cublasINT8MMWrapper.h"
+#include "src/fastertransformer/utils/memory_utils.h"
+#include <vector>
+
+namespace fastertransformer {
+
+template<typename T>
+class GeluFfnLayerINT8;
+
+template<typename T>
+class ReluFfnLayerINT8;
+
+template<typename T>
+class FfnLayerINT8: public BaseLayer {
+private:
+    // buffer handling
+    size_t max_token_num_ = 0;
+
+    // meta data
+    size_t head_num_;
+    size_t size_per_head_;
+
+    // calculated data
+    size_t hidden_units_;
+
+    void allocateBuffer() override;
+    void freeBuffer() override;
+    bool isValidTokenNum(size_t token_num);
+
+protected:
+    size_t inter_size_;
+    int    int8_mode_;
+    bool   sparse_;
+
+    int*         inter_int_buf_;
+    int8_t*      inter_buf_;
+    virtual void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) = 0;
+
+public:
+    FfnLayerINT8(size_t           max_batch_size,
+                 size_t           max_seq_len,
+                 size_t           head_num,
+                 size_t           size_per_head,
+                 size_t           inter_size,
+                 int              int8_mode,
+                 cudaStream_t     stream,
+                 cublasMMWrapper* cublas_wrapper,
+                 IAllocator*      allocator,
+                 bool             is_free_buffer_after_forward,
+                 bool             sparse = false);
+
+    FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer);
+
+    ~FfnLayerINT8();
+
+    void forward(std::vector<fastertransformer::Tensor>*       output_tensors,
+                 const std::vector<fastertransformer::Tensor>* input_tensors,
+                 const FfnWeight<T>*                           ffn_weights);
+
+    friend GeluFfnLayerINT8<T>;
+    friend ReluFfnLayerINT8<T>;
+};
+
+template<typename T>
+class GeluFfnLayerINT8: public FfnLayerINT8<T> {
+public:
+    GeluFfnLayerINT8(size_t           max_batch_size,
+                     size_t           max_seq_len,
+                     size_t           head_num,
+                     size_t           size_per_head,
+                     size_t           inter_size,
+                     int              int8_mode,
+                     cudaStream_t     stream,
+                     cublasMMWrapper* cublas_wrapper,
+                     IAllocator*      allocator,
+                     bool             is_free_buffer_after_forward,
+                     bool             sparse = false);
+
+    GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& ffn_layer);
+
+    ~GeluFfnLayerINT8() = default;
+
+private:
+    using FfnLayerINT8<T>::inter_int_buf_;
+    using FfnLayerINT8<T>::inter_buf_;
+    using FfnLayerINT8<T>::inter_size_;
+    using FfnLayerINT8<T>::stream_;
+    using FfnLayerINT8<T>::int8_mode_;
+    using FfnLayerINT8<T>::sparse_;
+    using FfnLayerINT8<T>::hidden_units_;
+    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
+};
+
+template<typename T>
+class ReluFfnLayerINT8: public FfnLayerINT8<T> {
+public:
+    ReluFfnLayerINT8(size_t           max_batch_size,
+                     size_t           max_seq_len,
+                     size_t           head_num,
+                     size_t           size_per_head,
+                     size_t           inter_size,
+                     int              int8_mode,
+                     cudaStream_t     stream,
+                     cublasMMWrapper* cublas_wrapper,
+                     IAllocator*      allocator,
+                     bool             is_free_buffer_after_forward);
+
+    ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& ffn_layer);
+
+    ~ReluFfnLayerINT8() = default;
+
+private:
+    using FfnLayerINT8<T>::inter_int_buf_;
+    using FfnLayerINT8<T>::inter_buf_;
+    using FfnLayerINT8<T>::inter_size_;
+    using FfnLayerINT8<T>::stream_;
+    using FfnLayerINT8<T>::int8_mode_;
+    using FfnLayerINT8<T>::hidden_units_;
+    void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
+};
+
+}  // namespace fastertransformer
--- a/src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h
+++ b/src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h
@@ -68,12 +68,13 @@ AttentionType getAttentionType(size_t     size_per_head,
        }
        // GPT and its variants
        else {
-           // FMHA_ENABLE only affects gpt-style models (causal-mask)
-            char * fused_qkv = std::getenv("FMHA_ENABLE");
+            // FMHA_ENABLE only affects gpt-style models (causal-mask)
+            char* fused_qkv = std::getenv("FMHA_ENABLE");
            if (fused_qkv != nullptr && std::string(fused_qkv) == "ON") {
                if ((sm == kSM_70 || sm == kSM_72 || sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_89)
                    && (size_per_head == 32 || size_per_head == 40 || size_per_head == 64 || size_per_head == 80
-                        || size_per_head == 128 || size_per_head == 144 || size_per_head == 160 || size_per_head == 256)) {
+                        || size_per_head == 128 || size_per_head == 144 || size_per_head == 160
+                        || size_per_head == 256)) {
                    return remove_padding ? AttentionType::FUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
                }
            }

--- a/src/fastertransformer/layers/attention_layers/CMakeLists.txt
+++ b/src/fastertransformer/layers/attention_layers/CMakeLists.txt
@@ -13,4 +13,3 @@
 # limitations under the License.

 cmake_minimum_required(VERSION 3.8)
-
--- a/src/fastertransformer/layers/attention_layers_fp8/AttentionFP8Weight.h
+++ b/src/fastertransformer/layers/attention_layers_fp8/AttentionFP8Weight.h
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
-#include "src/fastertransformer/utils/ScaleList.h"
-
-namespace fastertransformer {
-
-template<typename T1, typename T2>
-struct AttentionFP8Weight: public AttentionWeight<T1, T2> {
-    const float* qk_scale;
-    const float* qk_scale_inv;
-    float*       qk_h_scale;
-    float*       qk_h_scale_inv;
-    float*       identity_scale;
-    float*       identity_h_scale;
-};
-
-}  // namespace fastertransformer
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
+#include "src/fastertransformer/utils/ScaleList.h"
+
+namespace fastertransformer {
+
+template<typename T1, typename T2>
+struct AttentionFP8Weight: public AttentionWeight<T1, T2> {
+    const float* qk_scale;
+    const float* qk_scale_inv;
+    float*       qk_h_scale;
+    float*       qk_h_scale_inv;
+    float*       identity_scale;
+    float*       identity_h_scale;
+};
+
+}  // namespace fastertransformer
--- a/src/fastertransformer/layers/attention_layers_int8/AttentionINT8Weight.h
+++ b/src/fastertransformer/layers/attention_layers_int8/AttentionINT8Weight.h
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
-#include "src/fastertransformer/utils/ScaleList.h"
-
-namespace fastertransformer {
-
-template<typename T>
-struct AttentionINT8Weight: AttentionWeight<T> {
-    ScaleList* scale_list_ptr;
-};
-
-}  // namespace fastertransformer
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
+#include "src/fastertransformer/utils/ScaleList.h"
+
+namespace fastertransformer {
+
+template<typename T>
+struct AttentionINT8Weight: AttentionWeight<T> {
+    ScaleList* scale_list_ptr;
+};
+
+}  // namespace fastertransformer
--- a/src/fastertransformer/models/BaseWeight.h
+++ b/src/fastertransformer/models/BaseWeight.h
@@ -46,4 +46,4 @@ public:
    }
 };

-}  // namespace fastertransformer
\ No newline at end of file
+}  // namespace fastertransformer
--- a/src/fastertransformer/models/llama/Barrier.h
+++ b/src/fastertransformer/models/llama/Barrier.h
@@ -15,9 +15,9 @@ public:
        pthread_barrier_init(&barrier_, nullptr, count);
    }

-    Barrier(const Barrier&) = delete;
-    Barrier& operator=(const Barrier&) = delete;
-    Barrier(Barrier&&) noexcept        = delete;
+    Barrier(const Barrier&)                = delete;
+    Barrier& operator=(const Barrier&)     = delete;
+    Barrier(Barrier&&) noexcept            = delete;
    Barrier& operator=(Barrier&&) noexcept = delete;

    void wait()
@@ -34,4 +34,4 @@ private:
    pthread_barrier_t barrier_{};
 };

-}  // namespace fastertransformer
\ No newline at end of file
+}  // namespace fastertransformer
--- a/src/fastertransformer/models/llama/CMakeLists.txt
+++ b/src/fastertransformer/models/llama/CMakeLists.txt
@@ -4,7 +4,7 @@ cmake_minimum_required(VERSION 3.8)

 add_subdirectory(fused_multi_head_attention)

-add_library(Llama STATIC 
+add_library(Llama STATIC
        LlamaV2.cc
        LlamaBatch.cc
        LlamaCacheManager.cc

--- a/src/fastertransformer/models/llama/LlamaBatch.cc
+++ b/src/fastertransformer/models/llama/LlamaBatch.cc
@@ -19,11 +19,11 @@ template<typename T>
 void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_reqs,
                                   std::vector<std::shared_ptr<Request>>& infer_reqs)
 {
-    std::unordered_map<uint64_t, int> occurance;
+    std::unordered_map<uint64_t, int> occurrence;

-    auto count_occurance = [&occurance](const std::vector<std::shared_ptr<Request>>& rs) {
+    auto count_occurrence = [&occurrence](const std::vector<std::shared_ptr<Request>>& rs) {
        for (const auto& r : rs) {
-            ++occurance[r->id];
+            ++occurrence[r->id];
        }
    };

@@ -33,13 +33,13 @@ void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_r
        req.reset();
    };

-    auto handle_conflict_or_invalid = [this, &occurance, &invalidate](std::vector<std::shared_ptr<Request>>& rs,
-                                                                      const char*                            type) {
+    auto handle_conflict_or_invalid = [this, &occurrence, &invalidate](std::vector<std::shared_ptr<Request>>& rs,
+                                                                       const char*                            type) {
        for (auto& r : rs) {
            if (r) {
                int ec = 0;

-                if (occurance[r->id] != 1) {
+                if (occurrence[r->id] != 1) {
                    ec = Request::kConflict;
                }
                else if (r->start_flag && r->stop_flag) {
@@ -66,8 +66,8 @@ void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_r
        rs.resize(count);
    };

-    count_occurance(stop_reqs);
-    count_occurance(infer_reqs);
+    count_occurrence(stop_reqs);
+    count_occurrence(infer_reqs);

    if (!stop_reqs.empty()) {
        handle_conflict_or_invalid(stop_reqs, "stop");
@@ -129,7 +129,7 @@ void LlamaBatch<T>::handleStopRequests(const std::vector<std::shared_ptr<Request
            ec = 0;
            llama_->kv_cache_mgr_->erase(r->id);
        }
-        // clear output buffers (prevent leaking conversations) if request is successfull
+        // clear output buffers (prevent leaking conversations) if request is successful
        if (ec == 0) {
            auto& output_ids      = r->outputs[rank_].at("output_ids");
            auto& sequence_length = r->outputs[rank_].at("sequence_length");
@@ -407,7 +407,7 @@ void LlamaBatch<T>::initializeGeneration()
    check_cuda_error(
        cudaMemcpyAsync(sequence_lengths_, context_length_buf_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
    // `sequence_lengths_` will be increased by dynamic decode
-    // note that in decoder and in output "sequence length" has differnt semantic
+    // note that in decoder and in output "sequence length" has different semantic
    // - in decoder it means length of sequence that has kv cache already computed
    // - in output it means length of all tokens (the last generated token does not have k/v cache computed yet)
    invokePlusScalar(sequence_lengths_, -1, batch_size_, stream_);
@@ -1039,4 +1039,4 @@ void LlamaBatch<T>::finishRequest(int index, bool force_end)
 template class LlamaBatch<half>;
 template class LlamaBatch<float>;

-}  // namespace fastertransformer
\ No newline at end of file
+}  // namespace fastertransformer
--- a/src/fastertransformer/models/llama/LlamaBatch.h
+++ b/src/fastertransformer/models/llama/LlamaBatch.h
@@ -122,7 +122,7 @@ private:
    void* topk_curandstate_buf_{};
    void* topp_curandstate_buf_{};

-    // hard limits for persistant buffers
+    // hard limits for persistent buffers
    static constexpr int kMaxStopBadWordsLen = 32;

    using CachedSeq = LlamaCacheManager::Sequence;
@@ -150,4 +150,4 @@ private:
    IAllocator*      allocator_{};
 };

-}  // namespace fastertransformer
\ No newline at end of file
+}  // namespace fastertransformer