Unverified Commit 5ea40abf authored by AllentDan, committed by GitHub

use format-11.1 (#38)

* format-11.1

* md-link-config
parent 9bbd39b7
{
  "ignorePatterns": [
    {
      "pattern": "^https://developer.nvidia.com/"
    },
    {
      "pattern": "^https://docs.openvino.ai/"
    },
    {
      "pattern": "^https://developer.android.com/"
    },
    {
      "pattern": "^https://developer.qualcomm.com/"
    },
    {
      "pattern": "^http://localhost"
    }
  ],
  "httpHeaders": [
    {
      "urls": ["https://github.com/", "https://guides.github.com/", "https://help.github.com/", "https://docs.github.com/"],
      "headers": {
        "Accept-Encoding": "zstd, br, gzip, deflate"
      }
    }
  ],
  "timeout": "20s",
  "retryOn429": true,
  "retryCount": 5,
  "fallbackRetryDelay": "30s",
  "aliveStatusCodes": [200, 206, 429]
}
@@ -18,11 +18,11 @@ jobs:
      - name: Linting
        run: pre-commit run --all-files
      - name: Format c/cuda codes with clang-format
-       uses: DoozyX/clang-format-lint-action@v0.14
+       uses: DoozyX/clang-format-lint-action@v0.13
        with:
          source: src
          extensions: h,c,cpp,hpp,cu,cuh
-         clangFormatVersion: 14
+         clangFormatVersion: 11
          style: file
      - name: Check markdown link
        uses: gaurav-nelson/github-action-markdown-link-check@v1
......
@@ -398,9 +398,8 @@ template void invokeBuildRelativeAttentionBias(__nv_bfloat16* relat
template<typename T_OUT, typename T_IN>
__global__ void getLastTokenDequantize(getLastTokenDequantizeParam<T_OUT, T_IN> param)
{
-    param.output[blockIdx.x * param.d_model + threadIdx.x] =
-        (T_OUT)((float)param.input[blockIdx.x * param.max_seq_len * param.d_model + threadIdx.x]
-                * __ldg(param.input_scale));
+    param.output[blockIdx.x * param.d_model + threadIdx.x] = (T_OUT)(
+        (float)param.input[blockIdx.x * param.max_seq_len * param.d_model + threadIdx.x] * __ldg(param.input_scale));
}

template<typename T_OUT, typename T_IN>
......
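For readers skimming the hunk above: the kernel maps one thread block per output row (blockIdx.x) and one thread per hidden dimension (threadIdx.x), and simply rescales a T_IN value into T_OUT. Below is a rough CPU-side sketch of that indexing; the helper name, the int8/float buffer types, and the flat std::vector layout are illustrative assumptions, not code from this commit.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// CPU reference for the indexing used by getLastTokenDequantize above:
// output[b * d_model + t] = (float)input[b * max_seq_len * d_model + t] * input_scale
// (int8_t in / float out are placeholder choices; the kernel is templated on T_IN / T_OUT).
std::vector<float> last_token_dequantize_ref(const std::vector<int8_t>& input,
                                             size_t batch_size,
                                             size_t max_seq_len,
                                             size_t d_model,
                                             float  input_scale)
{
    std::vector<float> output(batch_size * d_model);
    for (size_t b = 0; b < batch_size; ++b) {   // plays the role of blockIdx.x
        for (size_t t = 0; t < d_model; ++t) {  // plays the role of threadIdx.x
            output[b * d_model + t] =
                static_cast<float>(input[b * max_seq_len * d_model + t]) * input_scale;
        }
    }
    return output;
}
```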
@@ -24,7 +24,8 @@
namespace turbomind {

-enum class PositionEmbeddingType {
+enum class PositionEmbeddingType
+{
    relative,
    absolute,
};
......
@@ -23,7 +23,8 @@
namespace turbomind {

-enum class RepetitionPenaltyType {
+enum class RepetitionPenaltyType
+{
    Additive,        // the presence penalty
    Multiplicative,  // the repetition penalty
    None             // No repetition penalty.
......
@@ -82,7 +82,8 @@ void invokeAddBiasSoftMax(T* logits,
                          cudaStream_t stream);

namespace segmented_topp_impl {
-enum DType_t {
+enum DType_t
+{
    kFLOAT,
    kHALF,
    kINT8
@@ -95,14 +96,17 @@ template<typename Key_Data_Type_ = float,
struct Segmented_topk_kernel_params {
    typedef Key_Data_Type_ Key_Data_Type;
    typedef Value_Data_Type_ Value_Data_Type;
-    enum {
+    enum
+    {
        BLOCK_THREADS = BLOCK_THREADS_
    };
-    enum {
+    enum
+    {
        ITEMS_INCREMENT = 32
    };
    // enum { KEYS_PER_LDG = 2 * 4 / sizeof(Key_Data_Type_) };
-    enum {
+    enum
+    {
        KEYS_PER_LDG = KEYS_PER_LDG_
    };
};
......
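The anonymous `enum { NAME = value };` blocks touched above are the classic idiom for compile-time integer constants inside a template struct; clang-format 11 merely breaks the brace onto its own line. For comparison only, here is a hedged sketch of the same constants written with static constexpr members; the extra template parameters and their defaults are assumptions, since the hunk header truncates them.

```cpp
// Comparison only -- not what the repository uses. The same compile-time
// constants expressed with static constexpr members instead of anonymous enums.
// Value_Data_Type_, BLOCK_THREADS_ and KEYS_PER_LDG_ defaults are assumed here.
template<typename Key_Data_Type_ = float, typename Value_Data_Type_ = int, int BLOCK_THREADS_ = 256, int KEYS_PER_LDG_ = 1>
struct Segmented_topk_kernel_params_sketch {
    using Key_Data_Type   = Key_Data_Type_;
    using Value_Data_Type = Value_Data_Type_;

    static constexpr int BLOCK_THREADS   = BLOCK_THREADS_;
    static constexpr int ITEMS_INCREMENT = 32;
    static constexpr int KEYS_PER_LDG    = KEYS_PER_LDG_;
};
```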
@@ -40,7 +40,7 @@ public:
    virtual void setup(const size_t batch_size, const size_t beam_width, TensorMap* runtime_args) = 0;
    virtual void forward(std::vector<turbomind::Tensor>* output_tensors,
                         const std::vector<turbomind::Tensor>* input_tensors) = 0;
    virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
                         const std::unordered_map<std::string, Tensor>* input_tensors) = 0;
    virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors) = 0;
......
@@ -23,7 +23,7 @@ namespace turbomind {
template<typename T>
void FfnLayer<T>::forward(std::vector<turbomind::Tensor>* output_tensors,
                          const std::vector<turbomind::Tensor>* input_tensors,
                          const FfnWeight<T>* ffn_weights)
{
    TensorMap input_tensor({{"ffn_input", input_tensors->at(0)}});
    TensorMap output_tensor({{"ffn_output", output_tensors->at(0)}});
......
@@ -124,7 +124,7 @@ public:
    virtual void forward(std::vector<turbomind::Tensor>* output_tensors,
                         const std::vector<turbomind::Tensor>* input_tensors,
                         const FfnWeight<T>* ffn_weights);
    virtual void forward(TensorMap* output_tensors, TensorMap* input_tensors, const FfnWeight<T>* ffn_weights);
};
......
@@ -22,7 +22,7 @@ namespace turbomind {
template<typename T>
void FfnLayerINT8<T>::forward(std::vector<turbomind::Tensor>* output_tensors,
                              const std::vector<turbomind::Tensor>* input_tensors,
                              const FfnWeight<T>* ffn_weights)
{
    // input_tensors: [input (token_num, hidden_dimension)]
    // output_tensors: [output (token_num, hidden_dimension)]
......
@@ -79,7 +79,7 @@ public:
    void forward(std::vector<turbomind::Tensor>* output_tensors,
                 const std::vector<turbomind::Tensor>* input_tensors,
                 const FfnWeight<T>* ffn_weights);

    friend GeluFfnLayerINT8<T>;
    friend ReluFfnLayerINT8<T>;
......
@@ -30,7 +30,8 @@
namespace turbomind {

-enum class AttentionType {
+enum class AttentionType
+{
    UNFUSED_MHA,
    UNFUSED_PADDED_MHA,
    FUSED_MHA,
......
@@ -15,9 +15,9 @@ public:
        pthread_barrier_init(&barrier_, nullptr, count);
    }

    Barrier(const Barrier&) = delete;
    Barrier& operator=(const Barrier&) = delete;
    Barrier(Barrier&&) noexcept = delete;
    Barrier& operator=(Barrier&&) noexcept = delete;

    void wait()
......
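The Barrier class in the hunk above wraps a POSIX barrier and deletes copy/move, so a fixed set of threads can rendezvous on wait(). The standalone sketch below shows the underlying pthread_barrier pattern with plain std::thread workers; it assumes Barrier::wait() forwards to pthread_barrier_wait, which the hunk itself does not show.

```cpp
#include <pthread.h>

#include <cstdio>
#include <thread>
#include <vector>

// Standalone illustration of the pthread-barrier rendezvous that the Barrier
// class above wraps: every worker blocks in pthread_barrier_wait() until all
// kThreads participants have arrived.
int main()
{
    constexpr int     kThreads = 4;
    pthread_barrier_t barrier;
    pthread_barrier_init(&barrier, nullptr, kThreads);  // same init call as in Barrier's constructor

    std::vector<std::thread> workers;
    for (int i = 0; i < kThreads; ++i) {
        workers.emplace_back([&barrier, i] {
            std::printf("thread %d reached the barrier\n", i);
            pthread_barrier_wait(&barrier);  // presumably what Barrier::wait() calls
            std::printf("thread %d passed the barrier\n", i);
        });
    }
    for (auto& w : workers) {
        w.join();
    }
    pthread_barrier_destroy(&barrier);
    return 0;
}
```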
@@ -35,7 +35,7 @@ public:
                            size_t tensor_para_size,
                            size_t tensor_para_rank);
    ~LlamaDecoderLayerWeight();
    LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete;
    LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete;

    void loadModel(std::string dir_path, FtCudaDataType model_file_type);
......
@@ -25,7 +25,8 @@
namespace turbomind {

-enum class WeightType : int {
+enum class WeightType : int
+{
    kFP32,
    kFP16,
    kFP8,  // not supported yet
......
@@ -40,7 +40,7 @@ struct LlamaWeight {
    ~LlamaWeight();

    LlamaWeight(const LlamaWeight& other) = delete;
    LlamaWeight& operator=(const LlamaWeight& other) = delete;

    void loadModel(std::string dir_path);
......
@@ -25,7 +25,8 @@ struct Request {
    using Callback = std::function<void(std::unordered_map<std::string, Tensor>*)>;
    Callback stream_cb;

-    enum {
+    enum
+    {
        kInvalid  = 1,
        kConflict = 2,
        kBusy     = 3,
......
@@ -9,7 +9,8 @@
namespace turbomind {

-enum QuantPolicy {
+enum QuantPolicy
+{
    kNone = 0x00,
    // reserve 0x01 and 0x02 for backward compatibility
    kReserve1 = 0x01,
@@ -18,7 +19,8 @@ enum QuantPolicy {
    kCacheKVInt8 = 0x04,
};

-enum CmpMode {
+enum CmpMode
+{
    kCmpNone,
    kCmpRead,
    kCmpWrite,
......
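QuantPolicy above is a plain bit-flag enum (kNone = 0x00, kCacheKVInt8 = 0x04, with 0x01/0x02 reserved), so callers can combine and test policies with bitwise operations. The snippet below is a hypothetical illustration of that pattern; it redeclares only the values visible in the hunk and is not taken from the repository.

```cpp
#include <cstdio>

// Hypothetical re-declaration of the visible QuantPolicy flags, just to show
// how bit-flag policies are typically combined and queried.
enum QuantPolicy
{
    kNone        = 0x00,
    kReserve1    = 0x01,
    kCacheKVInt8 = 0x04,
};

int main()
{
    int policy = kCacheKVInt8;  // e.g. a value parsed from a configuration file

    if (policy & kCacheKVInt8) {
        std::printf("int8 KV-cache quantization requested\n");
    }
    return 0;
}
```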
@@ -1159,8 +1159,7 @@ void streaming_callback(std::shared_ptr<std::unordered_map<std::string, Tensor>>
    for (auto& response : *responses) {
        if (response != nullptr) {
            LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("start to send streaming response")).c_str());
-            LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, 0, nullptr),
-                         "failed to send TurboMind backend response");
+            LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, 0, nullptr), "failed to send TurboMind backend response");
            LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("streaming response is sent")).c_str());
        }
        else {
@@ -1354,11 +1353,10 @@ ModelInstanceState::Execute(std::vector<TRITONBACKEND_Response*>*
        }
    }
    catch (std::exception& ex) {
-        SendErrorForResponses(
-            responses,
-            response_count,
-            TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL,
-                                  ("TurboMind execute failure: " + std::string(ex.what())).c_str()));
+        SendErrorForResponses(responses,
+                              response_count,
+                              TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL,
+                                                    ("TurboMind execute failure: " + std::string(ex.what())).c_str()));
    }

    auto output_tensors = output_tensors_list[0];
    return output_tensors;
......
-#Copyright(c) 2021 - 2022, NVIDIA CORPORATION.All rights reserved.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
-#Redistribution and use in source and binary forms, with or without
-#modification, are permitted provided that the following conditions
-#are met:
-#* Redistributions of source code must retain the above copyright
-#notice, this list of conditions and the following disclaimer.
-#* Redistributions in binary form must reproduce the above copyright
-#notice, this list of conditions and the following disclaimer in the
-#documentation and / or other materials provided with the distribution.
-#* Neither the name of NVIDIA CORPORATION nor the names of its
-#contributors may be used to endorse or promote products derived
-#from this software without specific prior written permission.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# * Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# * Neither the name of NVIDIA CORPORATION nor the names of its
+# contributors may be used to endorse or promote products derived
+# from this software without specific prior written permission.
#
-#THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
-#EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-#IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-#PURPOSE ARE DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR
-#CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-#EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, BUT NOT LIMITED TO,
-#PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-#PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
-#OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-#(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-#OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

{
  global:
    TRITONBACKEND_*;
-  local:
-    *;
+  local: *;
};