Unverified Commit fe46dac2 authored by AllentDan, committed by GitHub

Add lint action (#32)

* temp

* fix lint

* csrc->src

* remove clang-format

* skip .rst

* skip doc

* clang-format

* version

* version

* mat_B
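
Most of the hunks below are mechanical fixes applied by the new lint setup: trailing newlines added at end of file, long parameter lists re-aligned, `else` moved onto its own line, and enum braces joined to the declaration. The following is a minimal C++ illustration of the brace and `else` placement the formatter appears to enforce, inferred from the hunks below rather than from the project's actual .clang-format configuration; the names are illustrative only.

// Old style (rejected by the lint check): opening brace on its own line,
// `else` cuddled with the closing brace.
//
//     enum class WeightType : int
//     {
//         kFP32,
//         kFP16,
//     };
//
//     if (in.is_open()) {
//         ...
//     } else {
//         ...
//     }
//
// New style (what the hunks below converge on): brace joined to the
// declaration, `else` starting its own line.
enum class ExampleWeightType : int {  // illustrative name, not the project's enum
    kFP32,
    kFP16,
};

inline int exampleBranch(bool ok)  // illustrative helper
{
    if (ok) {
        return 1;
    }
    else {
        return 0;
    }
}
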
parent e8ab4ba3
......@@ -189,4 +189,4 @@ bool LlamaCacheManager::contains(uint64_t id) const noexcept
return it != device_cache_.end();
}
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -99,4 +99,4 @@ private:
std::vector<Sequence> device_cache_;
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -199,7 +199,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
quant_policy_,
weights->past_kv_scale.data());
sync_check_cuda_error();
if (use_fmha_) {
fusedMultiHeadAttention(k_cache_ptrs,
......@@ -226,8 +225,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
max_seq_len,
quant_policy_,
weights->past_kv_scale.data());
}
//////////////////////////////////////////////
......@@ -240,8 +237,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
......@@ -302,19 +297,19 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptr
}
template<typename T>
void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
const int* padding_offset,
const int* context_length,
int batch_size,
int num_token,
int max_q_len,
int max_k_len,
int max_seq_len,
int quant,
const float* kv_scale)
void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
const int* padding_offset,
const int* context_length,
int batch_size,
int num_token,
int max_q_len,
int max_k_len,
int max_seq_len,
int quant,
const float* kv_scale)
{
// key_cache [B, H, S[:t+s], D/x, x] -> [B, H, t+s, D]
// val_cache [B, H, S[:t+s], D/x, x] -> [B, H, t+s, D]
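
As a side note on the two layout comments above: they describe the packed KV-cache format, in which the head dimension D is stored in chunks of x elements. Below is a minimal sketch of the index arithmetic they imply, using illustrative names only; this is not the kernel invoked by unfusedMultiHeadAttention. S is the cache capacity and L = t+s is the number of valid timesteps.

#include <cstddef>

// Linear offset of element (b, h, s, d) in the packed cache laid out as
// [B, H, S, D/x, x], per the comments above.
inline size_t packedCacheIndex(size_t b, size_t h, size_t s, size_t d,
                               size_t H, size_t S, size_t D, size_t x)
{
    return (((b * H + h) * S + s) * (D / x) + d / x) * x + d % x;
}

// Linear offset of the same element after conversion to the dense
// [B, H, L, D] view consumed by the unfused attention path.
inline size_t denseCacheIndex(size_t b, size_t h, size_t s, size_t d,
                              size_t H, size_t L, size_t D)
{
    return ((b * H + h) * L + s) * D + d;
}
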
......@@ -408,4 +403,4 @@ void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_cac
template class LlamaContextAttentionLayer<float>;
template class LlamaContextAttentionLayer<half>;
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -15,8 +15,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
#pragma once
......@@ -100,7 +101,7 @@ private:
const bool neox_rotary_style_;
const bool use_fmha_;
const int quant_policy_;
const int quant_policy_;
NcclParam tensor_para_;
......@@ -123,4 +124,4 @@ private:
bool is_allocate_buffer_ = false;
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -283,4 +283,4 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
template class LlamaContextDecoder<float>;
template class LlamaContextDecoder<half>;
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -15,7 +15,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
#pragma once
......@@ -111,4 +112,4 @@ public:
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -16,8 +16,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
#include "src/fastertransformer/layers/BaseLayer.h"
// #include "src/fastertransformer/layers/FfnLayer.h"
......@@ -81,9 +81,8 @@ public:
IAllocator* allocator,
bool is_free_buffer_after_forward,
int quant_policy),
~LlamaDecoder() override;
~LlamaDecoder() override;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
......@@ -94,4 +93,4 @@ public:
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -161,12 +161,13 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
// load kv_cache quant scale
// if file not exist, get empty vector
std::string scale_path = dir_path + ".past_kv_scale." + rank_spec + ".weight";
std::ifstream in(scale_path, std::ios::in);
std::string scale_path = dir_path + ".past_kv_scale." + rank_spec + ".weight";
std::ifstream in(scale_path, std::ios::in);
if (in.is_open()) {
in.close();
self_attn_weights.past_kv_scale = loadArrayFromBin({2}, scale_path);
} else {
}
else {
self_attn_weights.past_kv_scale = {};
}
}
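
The hunk above also shows the pattern used for optional per-layer KV scales: probe for the ".past_kv_scale.<rank>.weight" file and fall back to an empty vector when it is absent. Below is a self-contained sketch of that pattern using standard-library I/O; loadOptionalScales is an illustrative stand-in for the repository's loadArrayFromBin helper, not its actual API.

#include <fstream>
#include <string>
#include <vector>

// Read `count` float32 values from `path` if the file exists; an empty
// vector signals "no kv-cache quantization scales for this layer".
std::vector<float> loadOptionalScales(const std::string& path, size_t count)
{
    std::ifstream in(path, std::ios::in | std::ios::binary);
    if (!in.is_open()) {
        return {};
    }
    std::vector<float> scales(count);
    in.read(reinterpret_cast<char*>(scales.data()),
            static_cast<std::streamsize>(count * sizeof(float)));
    return in ? scales : std::vector<float>{};
}

// Usage mirroring the hunk above (names taken from the diff):
//   self_attn_weights.past_kv_scale =
//       loadOptionalScales(dir_path + ".past_kv_scale." + rank_spec + ".weight", 2);
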
......@@ -35,7 +35,7 @@ public:
size_t tensor_para_size,
size_t tensor_para_rank);
~LlamaDecoderLayerWeight();
LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete;
LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete;
LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete;
void loadModel(std::string dir_path, FtCudaDataType model_file_type);
......@@ -58,4 +58,4 @@ private:
void mallocWeights();
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -256,17 +256,17 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
0, // max_input_length, not used w/o linear_bias_slopes
input_tensors->getPtr<int>("total_padding_tokens", nullptr),
step,
1.f, // q_scaling
0, // relative_attention_bias_stride
nullptr, // linear_bias_slopes
nullptr, // masked_tokens_data,
nullptr, // ia3_tasks
nullptr, // ia3_key_weights
nullptr, // ia3_value_weights
nullptr, // qkv_scale_out
nullptr, // attention_out_scale
quant_policy_, // int8_mode
weights->past_kv_scale.data(), // attention kv scale
1.f, // q_scaling
0, // relative_attention_bias_stride
nullptr, // linear_bias_slopes
nullptr, // masked_tokens_data,
nullptr, // ia3_tasks
nullptr, // ia3_key_weights
nullptr, // ia3_value_weights
nullptr, // qkv_scale_out
nullptr, // attention_out_scale
quant_policy_, // int8_mode
weights->past_kv_scale.data(), // attention kv scale
stream_);
sync_check_cuda_error();
......@@ -289,4 +289,4 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
template class LlamaDecoderSelfAttentionLayer<float>;
template class LlamaDecoderSelfAttentionLayer<half>;
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -15,7 +15,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
#pragma once
......@@ -96,4 +97,4 @@ private:
bool is_allocate_buffer_{};
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
#pragma once
......@@ -25,8 +25,7 @@
namespace fastertransformer {
enum class WeightType : int
{
enum class WeightType : int {
kFP32,
kFP16,
kFP8, // not supported yet
......@@ -66,7 +65,7 @@ template<typename T>
struct LlamaAttentionWeight {
LlamaDenseWeight<T> qkv;
LlamaDenseWeight<T> output;
std::vector<float> past_kv_scale;
std::vector<float> past_kv_scale;
};
template<typename T>
......@@ -76,4 +75,4 @@ struct LlamaFfnWeight {
LlamaDenseWeight<T> output;
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.h
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.h
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/kernels/activation_kernels.h"
......@@ -110,4 +110,4 @@ void LlamaFfnLayer<T>::forward(TensorMap* output_tensors,
template class LlamaFfnLayer<float>;
template class LlamaFfnLayer<half>;
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.cc
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.cc
#pragma once
......@@ -82,4 +82,4 @@ private:
bool is_allocate_buffer_{};
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -31,4 +31,4 @@ private:
void* ptr{};
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -58,4 +58,4 @@ private:
cudaStream_t stream_{};
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -89,4 +89,4 @@ struct NcclGuard {
std::unique_ptr<std::lock_guard<std::mutex>> global_nccl_lock_;
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
......@@ -17,7 +17,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
#include "src/fastertransformer/models/llama/LlamaV2.h"
#include "src/fastertransformer/kernels/decoding_kernels.h"
......@@ -97,7 +98,8 @@ LlamaV2<T>::LlamaV2(size_t head_num,
FT_LOG_ERROR("use_context_fmha not support int8");
assert(0);
}
} else {
}
else {
elem_bits = sizeof(T) * 8;
}
kv_cache_mgr_ = std::make_unique<LlamaCacheManager>(num_layer_,
......@@ -213,7 +215,7 @@ void LlamaV2<T>::contextDecode(T* deocder_output,
static_cast<T*>(nullptr),
pPromptTuningParam<T>{},
input_ids,
0, // only used for postion encoding
0, // only used for position encoding
token_num,
token_num,
1,
......@@ -592,4 +594,4 @@ void LlamaV2<T>::forward(std::unordered_map<std::string, Tensor>* outputs,
template class LlamaV2<half>;
template class LlamaV2<float>;
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -16,7 +16,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
#pragma once
......@@ -182,4 +183,4 @@ private:
std::thread internal_thread_;
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -121,4 +121,4 @@ void LlamaWeight<T>::loadModel(std::string dir_path)
template struct LlamaWeight<float>;
template struct LlamaWeight<half>;
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer