Unverified commit fe46dac2 authored by AllentDan, committed by GitHub

Add lint action (#32)

* temp

* fix lint

* csrc->src

* remove clang-format

* skip .rst

* skip doc

* clang-format

* version

* version

* mat_B
parent e8ab4ba3
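
For orientation, here is a small illustrative C++ sketch (not taken from this repository; the function and names are made up) of the formatting conventions the new lint / clang-format setup appears to enforce, judging from the hunks below: long attribution comments are wrapped onto a second line, enum braces stay on the declaration line, and else moves onto its own line after the closing brace.

// Illustrative sketch only. It mirrors the style changes visible in the hunks below.

// Long attribution comments are wrapped onto a second line:
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h

// Enum braces stay on the declaration line:
enum class WeightType : int {
    kFP32,
    kFP16,
};

// "else" is placed on its own line after the closing brace:
void loadScale(bool is_open)  // hypothetical example function
{
    if (is_open) {
        // load scale values from file
    }
    else {
        // fall back to an empty default
    }
}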
@@ -199,7 +199,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
quant_policy_,
weights->past_kv_scale.data());
sync_check_cuda_error();
if (use_fmha_) {
fusedMultiHeadAttention(k_cache_ptrs,
@@ -226,8 +225,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
max_seq_len,
quant_policy_,
weights->past_kv_scale.data());
}
//////////////////////////////////////////////
@@ -240,8 +237,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
......
@@ -16,7 +16,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
#pragma once
......
@@ -15,7 +15,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
#pragma once
......
@@ -16,8 +16,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
#include "src/fastertransformer/layers/BaseLayer.h"
// #include "src/fastertransformer/layers/FfnLayer.h"
@@ -82,7 +82,6 @@ public:
bool is_free_buffer_after_forward,
int quant_policy),
~LlamaDecoder() override;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
......
@@ -166,7 +166,8 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
if (in.is_open()) {
in.close();
self_attn_weights.past_kv_scale = loadArrayFromBin({2}, scale_path);
} else {
}
else {
self_attn_weights.past_kv_scale = {};
}
}
......
@@ -15,7 +15,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
#pragma once
......
@@ -15,7 +15,7 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
#pragma once
@@ -25,8 +25,7 @@
namespace fastertransformer {
enum class WeightType : int
{
enum class WeightType : int {
kFP32,
kFP16,
kFP8, // not supported yet
......
@@ -15,7 +15,7 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.h
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.h
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/kernels/activation_kernels.h"
......
@@ -15,7 +15,7 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.cc
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.cc
#pragma once
......
@@ -17,7 +17,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
#include "src/fastertransformer/models/llama/LlamaV2.h"
#include "src/fastertransformer/kernels/decoding_kernels.h"
@@ -97,7 +98,8 @@ LlamaV2<T>::LlamaV2(size_t head_num,
FT_LOG_ERROR("use_context_fmha not support int8");
assert(0);
}
} else {
}
else {
elem_bits = sizeof(T) * 8;
}
kv_cache_mgr_ = std::make_unique<LlamaCacheManager>(num_layer_,
@@ -213,7 +215,7 @@ void LlamaV2<T>::contextDecode(T* deocder_output,
static_cast<T*>(nullptr),
pPromptTuningParam<T>{},
input_ids,
0, // only used for postion encoding
0, // only used for position encoding
token_num,
token_num,
1,
......
@@ -16,7 +16,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
#pragma once
......