Unverified commit fe46dac2, authored by AllentDan, committed by GitHub

Add lint action (#32)

* temp

* fix lint

* csrc->src

* remove clang-format

* skip .rst

* skip doc

* clang-format

version

version

* mat_B
parent e8ab4ba3
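
The commit message above describes adding a lint action driven by clang-format and friends while skipping .rst and documentation files. The workflow file itself is not shown in this excerpt; the block below is only a hypothetical sketch of what such a GitHub Actions lint job could look like (file layout, action versions, and the use of pre-commit are assumptions, not taken from the commit):

```yaml
# Hypothetical sketch of a lint workflow; NOT the file added by this commit.
name: lint

on: [push, pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: '3.8'
      # Assumed: the repository drives its linters (flake8, clang-format, etc.) via pre-commit.
      - name: Install pre-commit
        run: pip install pre-commit
      - name: Run linters on all files
        run: pre-commit run --all-files
```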
@@ -199,7 +199,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
     quant_policy_,
     weights->past_kv_scale.data());
 sync_check_cuda_error();
 if (use_fmha_) {
     fusedMultiHeadAttention(k_cache_ptrs,
@@ -226,8 +225,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
     max_seq_len,
     quant_policy_,
     weights->past_kv_scale.data());
 }
 //////////////////////////////////////////////
@@ -240,8 +237,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
 sync_check_cuda_error();
 }
 if (is_free_buffer_after_forward_ == true) {
     freeBuffer();
 }
...
@@ -16,7 +16,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
 #pragma once
...
@@ -15,7 +15,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
 #pragma once
...
@@ -16,8 +16,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
 #include "src/fastertransformer/layers/BaseLayer.h"
 // #include "src/fastertransformer/layers/FfnLayer.h"
@@ -82,7 +82,6 @@ public:
     bool is_free_buffer_after_forward,
     int quant_policy),
 ~LlamaDecoder() override;
 virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
...
@@ -166,7 +166,8 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
 if (in.is_open()) {
     in.close();
     self_attn_weights.past_kv_scale = loadArrayFromBin({2}, scale_path);
-} else {
+}
+else {
     self_attn_weights.past_kv_scale = {};
 }
 }
...
@@ -15,7 +15,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
 #pragma once
...
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 // Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
 #pragma once
@@ -25,8 +25,7 @@
 namespace fastertransformer {
-enum class WeightType : int
-{
+enum class WeightType : int {
     kFP32,
     kFP16,
     kFP8,  // not supported yet
...
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 // Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.h
 #include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
 #include "src/fastertransformer/kernels/activation_kernels.h"
...
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 // Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.cc
 #pragma once
...
@@ -17,7 +17,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
 #include "src/fastertransformer/models/llama/LlamaV2.h"
 #include "src/fastertransformer/kernels/decoding_kernels.h"
@@ -97,7 +98,8 @@ LlamaV2<T>::LlamaV2(size_t head_num,
     FT_LOG_ERROR("use_context_fmha not support int8");
     assert(0);
 }
-} else {
+}
+else {
     elem_bits = sizeof(T) * 8;
 }
 kv_cache_mgr_ = std::make_unique<LlamaCacheManager>(num_layer_,
@@ -213,7 +215,7 @@ void LlamaV2<T>::contextDecode(T* deocder_output,
     static_cast<T*>(nullptr),
     pPromptTuningParam<T>{},
     input_ids,
-    0,  // only used for postion encoding
+    0,  // only used for position encoding
     token_num,
     token_num,
     1,
...
@@ -16,7 +16,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
 #pragma once
...
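
The recurring changes in the diff (long "Modified from" URL comments wrapped onto a second line, `else` moved onto its own line after the closing brace, and the enum's opening brace pulled onto the declaration line) are characteristic clang-format output with comment reflow, a column limit, and custom brace wrapping. The snippet below is only an illustrative .clang-format sketch that would produce similar formatting; it is not the configuration shipped with this commit, and the specific values are assumptions:

```yaml
# Illustrative .clang-format sketch (assumed values), not the project's actual config.
BasedOnStyle: Google
ColumnLimit: 120          # long URL comments get reflowed onto a second line
ReflowComments: true
BreakBeforeBraces: Custom
BraceWrapping:
  AfterEnum: false        # keeps 'enum class WeightType : int {' on one line
  BeforeElse: true        # puts '}' and 'else {' on separate lines
```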