Unverified commit fe46dac2, authored by AllentDan, committed by GitHub

Add lint action (#32)

* temp

* fix lint

* csrc->src

* remove clang-format

* skip .rst

* skip doc

* clang-format

version

version

* mat_B
parent e8ab4ba3
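
The commit message above describes adding a lint action driven by clang-format and friends while skipping .rst and documentation files. The workflow file itself is not shown in this excerpt; the block below is only a hypothetical sketch of what such a GitHub Actions lint job could look like (file layout, action versions, and the use of pre-commit are assumptions, not taken from the commit):

```yaml
# Hypothetical sketch of a lint workflow; NOT the file added by this commit.
name: lint

on: [push, pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v2
        with:
          python-version: '3.8'
      # Assumed: the repository drives its linters (flake8, clang-format, etc.) via pre-commit.
      - name: Install pre-commit
        run: pip install pre-commit
      - name: Run linters on all files
        run: pre-commit run --all-files
```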
@@ -199,7 +199,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
     quant_policy_,
     weights->past_kv_scale.data());
 sync_check_cuda_error();
 if (use_fmha_) {
     fusedMultiHeadAttention(k_cache_ptrs,
@@ -226,8 +225,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
     max_seq_len,
     quant_policy_,
     weights->past_kv_scale.data());
 }
 //////////////////////////////////////////////
@@ -240,8 +237,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
 sync_check_cuda_error();
 }
 if (is_free_buffer_after_forward_ == true) {
     freeBuffer();
 }
...
@@ -16,7 +16,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
 #pragma once
...
@@ -15,7 +15,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
 #pragma once
...
@@ -16,8 +16,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
 #include "src/fastertransformer/layers/BaseLayer.h"
 // #include "src/fastertransformer/layers/FfnLayer.h"
@@ -82,7 +82,6 @@ public:
     bool is_free_buffer_after_forward,
     int quant_policy),
 ~LlamaDecoder() override;
 virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
...
@@ -166,7 +166,8 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
 if (in.is_open()) {
     in.close();
     self_attn_weights.past_kv_scale = loadArrayFromBin({2}, scale_path);
-} else {
+}
+else {
     self_attn_weights.past_kv_scale = {};
 }
 }
...
@@ -15,7 +15,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
 #pragma once
...
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 // Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
 #pragma once
@@ -25,8 +25,7 @@
 namespace fastertransformer {
-enum class WeightType : int
-{
+enum class WeightType : int {
     kFP32,
     kFP16,
     kFP8,  // not supported yet
...
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 // Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.h
 #include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
 #include "src/fastertransformer/kernels/activation_kernels.h"
...
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 // Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.cc
 #pragma once
...
@@ -17,7 +17,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
 #include "src/fastertransformer/models/llama/LlamaV2.h"
 #include "src/fastertransformer/kernels/decoding_kernels.h"
@@ -97,7 +98,8 @@ LlamaV2<T>::LlamaV2(size_t head_num,
     FT_LOG_ERROR("use_context_fmha not support int8");
     assert(0);
 }
-} else {
+}
+else {
     elem_bits = sizeof(T) * 8;
 }
 kv_cache_mgr_ = std::make_unique<LlamaCacheManager>(num_layer_,
@@ -213,7 +215,7 @@ void LlamaV2<T>::contextDecode(T* deocder_output,
     static_cast<T*>(nullptr),
     pPromptTuningParam<T>{},
     input_ids,
-    0,  // only used for postion encoding
+    0,  // only used for position encoding
     token_num,
     token_num,
     1,
...
@@ -16,7 +16,8 @@
  * limitations under the License.
  */
-// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
+// Modified from
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
 #pragma once
...
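
The recurring changes in the diff (long "Modified from" URL comments wrapped onto a second line, `else` moved onto its own line after the closing brace, and the enum's opening brace pulled onto the declaration line) are characteristic clang-format output with comment reflow, a column limit, and custom brace wrapping. The snippet below is only an illustrative .clang-format sketch that would produce similar formatting; it is not the configuration shipped with this commit, and the specific values are assumptions:

```yaml
# Illustrative .clang-format sketch (assumed values), not the project's actual config.
BasedOnStyle: Google
ColumnLimit: 120          # long URL comments get reflowed onto a second line
ReflowComments: true
BreakBeforeBraces: Custom
BraceWrapping:
  AfterEnum: false        # keeps 'enum class WeightType : int {' on one line
  BeforeElse: true        # puts '}' and 'else {' on separate lines
```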