Unverified Commit fe46dac2 authored by AllentDan, committed by GitHub

Add lint action (#32)

* temp

* fix lint

* csrc->src

* remove clang-format

* skip .rst

* skip doc

* clang-format

* version

* version

* mat_B
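
Most of the hunks below are mechanical fixes applied by the new lint setup: trailing newlines added at end of file, long parameter lists re-aligned, `else` moved onto its own line, and enum braces joined to the declaration. The following is a minimal C++ illustration of the brace and `else` placement the formatter appears to enforce, inferred from the hunks below rather than from the project's actual .clang-format configuration; the names are illustrative only.

// Old style (rejected by the lint check): opening brace on its own line,
// `else` cuddled with the closing brace.
//
//     enum class WeightType : int
//     {
//         kFP32,
//         kFP16,
//     };
//
//     if (in.is_open()) {
//         ...
//     } else {
//         ...
//     }
//
// New style (what the hunks below converge on): brace joined to the
// declaration, `else` starting its own line.
enum class ExampleWeightType : int {  // illustrative name, not the project's enum
    kFP32,
    kFP16,
};

inline int exampleBranch(bool ok)  // illustrative helper
{
    if (ok) {
        return 1;
    }
    else {
        return 0;
    }
}
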
parent e8ab4ba3
......@@ -189,4 +189,4 @@ bool LlamaCacheManager::contains(uint64_t id) const noexcept
return it != device_cache_.end();
}
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -99,4 +99,4 @@ private:
std::vector<Sequence> device_cache_;
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -199,7 +199,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
quant_policy_,
weights->past_kv_scale.data());
sync_check_cuda_error();
if (use_fmha_) {
fusedMultiHeadAttention(k_cache_ptrs,
......@@ -226,8 +225,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
max_seq_len,
quant_policy_,
weights->past_kv_scale.data());
}
//////////////////////////////////////////////
......@@ -240,8 +237,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
......@@ -302,19 +297,19 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptr
}
template<typename T>
void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
const int* padding_offset,
const int* context_length,
int batch_size,
int num_token,
int max_q_len,
int max_k_len,
int max_seq_len,
int quant,
const float* kv_scale)
void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
const int* padding_offset,
const int* context_length,
int batch_size,
int num_token,
int max_q_len,
int max_k_len,
int max_seq_len,
int quant,
const float* kv_scale)
{
// key_cache [B, H, S[:t+s], D/x, x] -> [B, H, t+s, D]
// val_cache [B, H, S[:t+s], D/x, x] -> [B, H, t+s, D]
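
As a side note on the two layout comments above: they describe the packed KV-cache format, in which the head dimension D is stored in chunks of x elements. Below is a minimal sketch of the index arithmetic they imply, using illustrative names only; this is not the kernel invoked by unfusedMultiHeadAttention. S is the cache capacity and L = t+s is the number of valid timesteps.

#include <cstddef>

// Linear offset of element (b, h, s, d) in the packed cache laid out as
// [B, H, S, D/x, x], per the comments above.
inline size_t packedCacheIndex(size_t b, size_t h, size_t s, size_t d,
                               size_t H, size_t S, size_t D, size_t x)
{
    return (((b * H + h) * S + s) * (D / x) + d / x) * x + d % x;
}

// Linear offset of the same element after conversion to the dense
// [B, H, L, D] view consumed by the unfused attention path.
inline size_t denseCacheIndex(size_t b, size_t h, size_t s, size_t d,
                              size_t H, size_t L, size_t D)
{
    return ((b * H + h) * L + s) * D + d;
}
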
......@@ -408,4 +403,4 @@ void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_cac
template class LlamaContextAttentionLayer<float>;
template class LlamaContextAttentionLayer<half>;
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -15,8 +15,9 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
#pragma once
......@@ -100,7 +101,7 @@ private:
const bool neox_rotary_style_;
const bool use_fmha_;
const int quant_policy_;
const int quant_policy_;
NcclParam tensor_para_;
......@@ -123,4 +124,4 @@ private:
bool is_allocate_buffer_ = false;
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -283,4 +283,4 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
template class LlamaContextDecoder<float>;
template class LlamaContextDecoder<half>;
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -15,7 +15,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
#pragma once
......@@ -111,4 +112,4 @@ public:
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -16,8 +16,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
#include "src/fastertransformer/layers/BaseLayer.h"
// #include "src/fastertransformer/layers/FfnLayer.h"
......@@ -81,9 +81,8 @@ public:
IAllocator* allocator,
bool is_free_buffer_after_forward,
int quant_policy),
~LlamaDecoder() override;
~LlamaDecoder() override;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
......@@ -94,4 +93,4 @@ public:
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -161,12 +161,13 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
// load kv_cache quant scale
// if file not exist, get empty vector
std::string scale_path = dir_path + ".past_kv_scale." + rank_spec + ".weight";
std::ifstream in(scale_path, std::ios::in);
std::string scale_path = dir_path + ".past_kv_scale." + rank_spec + ".weight";
std::ifstream in(scale_path, std::ios::in);
if (in.is_open()) {
in.close();
self_attn_weights.past_kv_scale = loadArrayFromBin({2}, scale_path);
} else {
}
else {
self_attn_weights.past_kv_scale = {};
}
}
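
The hunk above also shows the pattern used for optional per-layer KV scales: probe for the ".past_kv_scale.<rank>.weight" file and fall back to an empty vector when it is absent. Below is a self-contained sketch of that pattern using standard-library I/O; loadOptionalScales is an illustrative stand-in for the repository's loadArrayFromBin helper, not its actual API.

#include <fstream>
#include <string>
#include <vector>

// Read `count` float32 values from `path` if the file exists; an empty
// vector signals "no kv-cache quantization scales for this layer".
std::vector<float> loadOptionalScales(const std::string& path, size_t count)
{
    std::ifstream in(path, std::ios::in | std::ios::binary);
    if (!in.is_open()) {
        return {};
    }
    std::vector<float> scales(count);
    in.read(reinterpret_cast<char*>(scales.data()),
            static_cast<std::streamsize>(count * sizeof(float)));
    return in ? scales : std::vector<float>{};
}

// Usage mirroring the hunk above (names taken from the diff):
//   self_attn_weights.past_kv_scale =
//       loadOptionalScales(dir_path + ".past_kv_scale." + rank_spec + ".weight", 2);
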
......@@ -35,7 +35,7 @@ public:
size_t tensor_para_size,
size_t tensor_para_rank);
~LlamaDecoderLayerWeight();
LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete;
LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete;
LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete;
void loadModel(std::string dir_path, FtCudaDataType model_file_type);
......@@ -58,4 +58,4 @@ private:
void mallocWeights();
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -256,17 +256,17 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
0, // max_input_length, not used w/o linear_bias_slopes
input_tensors->getPtr<int>("total_padding_tokens", nullptr),
step,
1.f, // q_scaling
0, // relative_attention_bias_stride
nullptr, // linear_bias_slopes
nullptr, // masked_tokens_data,
nullptr, // ia3_tasks
nullptr, // ia3_key_weights
nullptr, // ia3_value_weights
nullptr, // qkv_scale_out
nullptr, // attention_out_scale
quant_policy_, // int8_mode
weights->past_kv_scale.data(), // attention kv scale
1.f, // q_scaling
0, // relative_attention_bias_stride
nullptr, // linear_bias_slopes
nullptr, // masked_tokens_data,
nullptr, // ia3_tasks
nullptr, // ia3_key_weights
nullptr, // ia3_value_weights
nullptr, // qkv_scale_out
nullptr, // attention_out_scale
quant_policy_, // int8_mode
weights->past_kv_scale.data(), // attention kv scale
stream_);
sync_check_cuda_error();
......@@ -289,4 +289,4 @@ void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* o
template class LlamaDecoderSelfAttentionLayer<float>;
template class LlamaDecoderSelfAttentionLayer<half>;
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -15,7 +15,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
#pragma once
......@@ -96,4 +97,4 @@ private:
bool is_allocate_buffer_{};
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
#pragma once
......@@ -25,8 +25,7 @@
namespace fastertransformer {
enum class WeightType : int
{
enum class WeightType : int {
kFP32,
kFP16,
kFP8, // not supported yet
......@@ -66,7 +65,7 @@ template<typename T>
struct LlamaAttentionWeight {
LlamaDenseWeight<T> qkv;
LlamaDenseWeight<T> output;
std::vector<float> past_kv_scale;
std::vector<float> past_kv_scale;
};
template<typename T>
......@@ -76,4 +75,4 @@ struct LlamaFfnWeight {
LlamaDenseWeight<T> output;
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.h
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.h
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/kernels/activation_kernels.h"
......@@ -110,4 +110,4 @@ void LlamaFfnLayer<T>::forward(TensorMap* output_tensors,
template class LlamaFfnLayer<float>;
template class LlamaFfnLayer<half>;
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.cc
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.cc
#pragma once
......@@ -82,4 +82,4 @@ private:
bool is_allocate_buffer_{};
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -31,4 +31,4 @@ private:
void* ptr{};
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -58,4 +58,4 @@ private:
cudaStream_t stream_{};
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -89,4 +89,4 @@ struct NcclGuard {
std::unique_ptr<std::lock_guard<std::mutex>> global_nccl_lock_;
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
......@@ -17,7 +17,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
#include "src/fastertransformer/models/llama/LlamaV2.h"
#include "src/fastertransformer/kernels/decoding_kernels.h"
......@@ -97,7 +98,8 @@ LlamaV2<T>::LlamaV2(size_t head_num,
FT_LOG_ERROR("use_context_fmha not support int8");
assert(0);
}
} else {
}
else {
elem_bits = sizeof(T) * 8;
}
kv_cache_mgr_ = std::make_unique<LlamaCacheManager>(num_layer_,
......@@ -213,7 +215,7 @@ void LlamaV2<T>::contextDecode(T* deocder_output,
static_cast<T*>(nullptr),
pPromptTuningParam<T>{},
input_ids,
0, // only used for postion encoding
0, // only used for position encoding
token_num,
token_num,
1,
......@@ -592,4 +594,4 @@ void LlamaV2<T>::forward(std::unordered_map<std::string, Tensor>* outputs,
template class LlamaV2<half>;
template class LlamaV2<float>;
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -16,7 +16,8 @@
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
#pragma once
......@@ -182,4 +183,4 @@ private:
std::thread internal_thread_;
};
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer
......@@ -121,4 +121,4 @@ void LlamaWeight<T>::loadModel(std::string dir_path)
template struct LlamaWeight<float>;
template struct LlamaWeight<half>;
} // namespace fastertransformer
\ No newline at end of file
} // namespace fastertransformer