Unverified Commit 5ea40abf authored by AllentDan, committed by GitHub

use format-11.1 (#38)

* format-11.1

* md-link-config
parent 9bbd39b7
{
  "ignorePatterns": [
    {
      "pattern": "^https://developer.nvidia.com/"
    },
    {
      "pattern": "^https://docs.openvino.ai/"
    },
    {
      "pattern": "^https://developer.android.com/"
    },
    {
      "pattern": "^https://developer.qualcomm.com/"
    },
    {
      "pattern": "^http://localhost"
    }
  ],
  "httpHeaders": [
    {
      "urls": ["https://github.com/", "https://guides.github.com/", "https://help.github.com/", "https://docs.github.com/"],
      "headers": {
        "Accept-Encoding": "zstd, br, gzip, deflate"
      }
    }
  ],
  "timeout": "20s",
  "retryOn429": true,
  "retryCount": 5,
  "fallbackRetryDelay": "30s",
  "aliveStatusCodes": [200, 206, 429]
}
@@ -18,11 +18,11 @@ jobs:
      - name: Linting
        run: pre-commit run --all-files
      - name: Format c/cuda codes with clang-format
-       uses: DoozyX/clang-format-lint-action@v0.14
+       uses: DoozyX/clang-format-lint-action@v0.13
        with:
          source: src
          extensions: h,c,cpp,hpp,cu,cuh
-         clangFormatVersion: 14
+         clangFormatVersion: 11
          style: file
      - name: Check markdown link
        uses: gaurav-nelson/github-action-markdown-link-check@v1
......
@@ -398,9 +398,8 @@ template void invokeBuildRelativeAttentionBias(__nv_bfloat16* relat
template<typename T_OUT, typename T_IN>
__global__ void getLastTokenDequantize(getLastTokenDequantizeParam<T_OUT, T_IN> param)
{
-    param.output[blockIdx.x * param.d_model + threadIdx.x] =
-        (T_OUT)((float)param.input[blockIdx.x * param.max_seq_len * param.d_model + threadIdx.x]
-                * __ldg(param.input_scale));
+    param.output[blockIdx.x * param.d_model + threadIdx.x] = (T_OUT)(
+        (float)param.input[blockIdx.x * param.max_seq_len * param.d_model + threadIdx.x] * __ldg(param.input_scale));
}

template<typename T_OUT, typename T_IN>
......
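For readers skimming the hunk above: the kernel maps one thread block per output row (blockIdx.x) and one thread per hidden dimension (threadIdx.x), and simply rescales a T_IN value into T_OUT. Below is a rough CPU-side sketch of that indexing; the helper name, the int8/float buffer types, and the flat std::vector layout are illustrative assumptions, not code from this commit.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// CPU reference for the indexing used by getLastTokenDequantize above:
// output[b * d_model + t] = (float)input[b * max_seq_len * d_model + t] * input_scale
// (int8_t in / float out are placeholder choices; the kernel is templated on T_IN / T_OUT).
std::vector<float> last_token_dequantize_ref(const std::vector<int8_t>& input,
                                             size_t batch_size,
                                             size_t max_seq_len,
                                             size_t d_model,
                                             float  input_scale)
{
    std::vector<float> output(batch_size * d_model);
    for (size_t b = 0; b < batch_size; ++b) {   // plays the role of blockIdx.x
        for (size_t t = 0; t < d_model; ++t) {  // plays the role of threadIdx.x
            output[b * d_model + t] =
                static_cast<float>(input[b * max_seq_len * d_model + t]) * input_scale;
        }
    }
    return output;
}
```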
@@ -24,7 +24,8 @@
namespace turbomind {

-enum class PositionEmbeddingType {
+enum class PositionEmbeddingType
+{
    relative,
    absolute,
};
......
@@ -23,7 +23,8 @@
namespace turbomind {

-enum class RepetitionPenaltyType {
+enum class RepetitionPenaltyType
+{
    Additive,        // the presence penalty
    Multiplicative,  // the repetition penalty
    None             // No repetition penalty.
......
@@ -82,7 +82,8 @@ void invokeAddBiasSoftMax(T* logits,
                          cudaStream_t stream);

namespace segmented_topp_impl {
-enum DType_t {
+enum DType_t
+{
    kFLOAT,
    kHALF,
    kINT8
@@ -95,14 +96,17 @@ template<typename Key_Data_Type_ = float,
struct Segmented_topk_kernel_params {
    typedef Key_Data_Type_ Key_Data_Type;
    typedef Value_Data_Type_ Value_Data_Type;
-    enum {
+    enum
+    {
        BLOCK_THREADS = BLOCK_THREADS_
    };
-    enum {
+    enum
+    {
        ITEMS_INCREMENT = 32
    };
    // enum { KEYS_PER_LDG = 2 * 4 / sizeof(Key_Data_Type_) };
-    enum {
+    enum
+    {
        KEYS_PER_LDG = KEYS_PER_LDG_
    };
};
......
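The anonymous `enum { NAME = value };` blocks touched above are the classic idiom for compile-time integer constants inside a template struct; clang-format 11 merely breaks the brace onto its own line. For comparison only, here is a hedged sketch of the same constants written with static constexpr members; the extra template parameters and their defaults are assumptions, since the hunk header truncates them.

```cpp
// Comparison only -- not what the repository uses. The same compile-time
// constants expressed with static constexpr members instead of anonymous enums.
// Value_Data_Type_, BLOCK_THREADS_ and KEYS_PER_LDG_ defaults are assumed here.
template<typename Key_Data_Type_ = float, typename Value_Data_Type_ = int, int BLOCK_THREADS_ = 256, int KEYS_PER_LDG_ = 1>
struct Segmented_topk_kernel_params_sketch {
    using Key_Data_Type   = Key_Data_Type_;
    using Value_Data_Type = Value_Data_Type_;

    static constexpr int BLOCK_THREADS   = BLOCK_THREADS_;
    static constexpr int ITEMS_INCREMENT = 32;
    static constexpr int KEYS_PER_LDG    = KEYS_PER_LDG_;
};
```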
@@ -40,7 +40,7 @@ public:
    virtual void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) = 0;
    virtual void forward(std::vector<turbomind::Tensor>* output_tensors,
                         const std::vector<turbomind::Tensor>* input_tensors) = 0;
    virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
                         const std::unordered_map<std::string, Tensor>* input_tensors) = 0;
    virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors) = 0;
......
@@ -23,7 +23,7 @@ namespace turbomind {
template<typename T>
void FfnLayer<T>::forward(std::vector<turbomind::Tensor>* output_tensors,
                          const std::vector<turbomind::Tensor>* input_tensors,
                          const FfnWeight<T>* ffn_weights)
{
    TensorMap input_tensor({{"ffn_input", input_tensors->at(0)}});
    TensorMap output_tensor({{"ffn_output", output_tensors->at(0)}});
......
@@ -124,7 +124,7 @@ public:
    virtual void forward(std::vector<turbomind::Tensor>* output_tensors,
                         const std::vector<turbomind::Tensor>* input_tensors,
                         const FfnWeight<T>* ffn_weights);
    virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnWeight<T>* ffn_weights);
};
......
@@ -22,7 +22,7 @@ namespace turbomind {
template<typename T>
void FfnLayerINT8<T>::forward(std::vector<turbomind::Tensor>* output_tensors,
                              const std::vector<turbomind::Tensor>* input_tensors,
                              const FfnWeight<T>* ffn_weights)
{
    // input_tensors: [input (token_num, hidden_dimension)]
    // output_tensors: [output (token_num, hidden_dimension)]
......
@@ -79,7 +79,7 @@ public:
    void forward(std::vector<turbomind::Tensor>* output_tensors,
                 const std::vector<turbomind::Tensor>* input_tensors,
                 const FfnWeight<T>* ffn_weights);

    friend GeluFfnLayerINT8<T>;
    friend ReluFfnLayerINT8<T>;
......
@@ -30,7 +30,8 @@
namespace turbomind {

-enum class AttentionType {
+enum class AttentionType
+{
    UNFUSED_MHA,
    UNFUSED_PADDED_MHA,
    FUSED_MHA,
......
@@ -15,9 +15,9 @@ public:
        pthread_barrier_init(&barrier_, nullptr, count);
    }

    Barrier(const Barrier&) = delete;
    Barrier& operator=(const Barrier&) = delete;
    Barrier(Barrier&&) noexcept = delete;
    Barrier& operator=(Barrier&&) noexcept = delete;

    void wait()
......
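The Barrier class in the hunk above wraps a POSIX barrier and deletes copy/move, so a fixed set of threads can rendezvous on wait(). The standalone sketch below shows the underlying pthread_barrier pattern with plain std::thread workers; it assumes Barrier::wait() forwards to pthread_barrier_wait, which the hunk itself does not show.

```cpp
#include <pthread.h>

#include <cstdio>
#include <thread>
#include <vector>

// Standalone illustration of the pthread-barrier rendezvous that the Barrier
// class above wraps: every worker blocks in pthread_barrier_wait() until all
// kThreads participants have arrived.
int main()
{
    constexpr int     kThreads = 4;
    pthread_barrier_t barrier;
    pthread_barrier_init(&barrier, nullptr, kThreads);  // same init call as in Barrier's constructor

    std::vector<std::thread> workers;
    for (int i = 0; i < kThreads; ++i) {
        workers.emplace_back([&barrier, i] {
            std::printf("thread %d reached the barrier\n", i);
            pthread_barrier_wait(&barrier);  // presumably what Barrier::wait() calls
            std::printf("thread %d passed the barrier\n", i);
        });
    }
    for (auto& w : workers) {
        w.join();
    }
    pthread_barrier_destroy(&barrier);
    return 0;
}
```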
@@ -35,7 +35,7 @@ public:
                            size_t tensor_para_size,
                            size_t tensor_para_rank);
    ~LlamaDecoderLayerWeight();
    LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete;
    LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete;

    void loadModel(std::string dir_path, FtCudaDataType model_file_type);
......
@@ -25,7 +25,8 @@
namespace turbomind {

-enum class WeightType : int {
+enum class WeightType : int
+{
    kFP32,
    kFP16,
    kFP8,  // not supported yet
......
@@ -40,7 +40,7 @@ struct LlamaWeight {
    ~LlamaWeight();

    LlamaWeight(const LlamaWeight& other) = delete;
    LlamaWeight& operator=(const LlamaWeight& other) = delete;

    void loadModel(std::string dir_path);
......
@@ -25,7 +25,8 @@ struct Request {
    using Callback = std::function<void(std::unordered_map<std::string, Tensor>*)>;
    Callback stream_cb;

-    enum {
+    enum
+    {
        kInvalid  = 1,
        kConflict = 2,
        kBusy     = 3,
......
@@ -9,7 +9,8 @@
namespace turbomind {

-enum QuantPolicy {
+enum QuantPolicy
+{
    kNone = 0x00,
    // reserve 0x01 and 0x02 for backward compatibility
    kReserve1 = 0x01,
@@ -18,7 +19,8 @@ enum QuantPolicy {
    kCacheKVInt8 = 0x04,
};

-enum CmpMode {
+enum CmpMode
+{
    kCmpNone,
    kCmpRead,
    kCmpWrite,
......
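QuantPolicy above is a plain bit-flag enum (kNone = 0x00, kCacheKVInt8 = 0x04, with 0x01/0x02 reserved), so callers can combine and test policies with bitwise operations. The snippet below is a hypothetical illustration of that pattern; it redeclares only the values visible in the hunk and is not taken from the repository.

```cpp
#include <cstdio>

// Hypothetical re-declaration of the visible QuantPolicy flags, just to show
// how bit-flag policies are typically combined and queried.
enum QuantPolicy
{
    kNone        = 0x00,
    kReserve1    = 0x01,
    kCacheKVInt8 = 0x04,
};

int main()
{
    int policy = kCacheKVInt8;  // e.g. a value parsed from a configuration file

    if (policy & kCacheKVInt8) {
        std::printf("int8 KV-cache quantization requested\n");
    }
    return 0;
}
```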
@@ -1159,8 +1159,7 @@ void streaming_callback(std::shared_ptr<std::unordered_map<std::string, Tensor>>
    for (auto& response : *responses) {
        if (response != nullptr) {
            LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("start to send streaming response")).c_str());
-            LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, 0, nullptr),
-                         "failed to send TurboMind backend response");
+            LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, 0, nullptr), "failed to send TurboMind backend response");
            LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("streaming response is sent")).c_str());
        }
        else {
@@ -1354,11 +1353,10 @@ ModelInstanceState::Execute(std::vector<TRITONBACKEND_Response*>*
        }
    }
    catch (std::exception& ex) {
-        SendErrorForResponses(
-            responses,
-            response_count,
-            TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL,
-                                  ("TurboMind execute failure: " + std::string(ex.what())).c_str()));
+        SendErrorForResponses(responses,
+                              response_count,
+                              TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL,
+                                                    ("TurboMind execute failure: " + std::string(ex.what())).c_str()));
    }

    auto output_tensors = output_tensors_list[0];
    return output_tensors;
......
-#Copyright(c) 2021 - 2022, NVIDIA CORPORATION.All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
-#Redistribution and use in source and binary forms, with or without
-#modification, are permitted provided that the following conditions
-#are met:
-#* Redistributions of source code must retain the above copyright
-#notice, this list of conditions and the following disclaimer.
-#* Redistributions in binary form must reproduce the above copyright
-#notice, this list of conditions and the following disclaimer in the
-#documentation and / or other materials provided with the distribution.
-#* Neither the name of NVIDIA CORPORATION nor the names of its
-#contributors may be used to endorse or promote products derived
-#from this software without specific prior written permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
#
-#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-#EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-#IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-#PURPOSE ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-#EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO,
-#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-#OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

{
  global:
    TRITONBACKEND_*;
-  local:
-    *;
+  local: *;
};