"docs/source/ko/using-diffusers/inpaint.md" did not exist on "8124863d1f60aa86427b87f251c8a06e590d7fa9"
Unverified commit fe46dac2, authored by AllentDan, committed by GitHub

Add lint action (#32)

* temp

* fix lint

* csrc->src

* remove clang-format

* skip .rst

* skip doc

* clang-format

version

version

* mat_B
parent e8ab4ba3
@@ -21,43 +21,39 @@
 #else
 #include <cooperative_groups.h>
 #endif
-#include <cuda_fp16.h>
 #include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include "src/fastertransformer/utils/cuda_type_utils.cuh"
+#include <cuda_fp16.h>
 #include <cuda_runtime.h>
 #include <curand_kernel.h>
 #include <float.h>
 #include <type_traits>
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
 namespace cg = cooperative_groups;
 namespace fastertransformer {
-template <int VPT>
+template<int VPT>
 struct BytesToType;
-template <>
-struct BytesToType<2>
-{
+template<>
+struct BytesToType<2> {
     using type = uint16_t;
 };
-template <>
-struct BytesToType<4>
-{
+template<>
+struct BytesToType<4> {
     using type = uint32_t;
 };
-template <>
-struct BytesToType<8>
-{
+template<>
+struct BytesToType<8> {
     using type = uint64_t;
 };
-template <>
-struct BytesToType<16>
-{
+template<>
+struct BytesToType<16> {
     using type = float4;
 };
-template <int Bytes>
+template<int Bytes>
 __device__ inline void copy(const void* local, void* data)
 {
     using T = typename BytesToType<Bytes>::type;
@@ -134,7 +130,6 @@ __inline__ __device__ T blockReduceMax(T val)
     return val;
 }
-
 /* Calculate the maximum of all elements in a block */
 template<typename T>
 __inline__ __device__ T blockAllReduceMax(T val)
...
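Note: the BytesToType / copy changes above are formatting only, but the pair itself is the standard vectorized-copy idiom: map a byte count to a register-sized type, then move that many bytes with one load/store. A minimal CUDA sketch of how such a pair is typically used (kernel and helper names here are illustrative, not from this commit):

#include <cuda_runtime.h>

template<int Bytes>
struct BytesToTypeSketch;
template<>
struct BytesToTypeSketch<16> {
    using type = float4;  // 16 bytes viewed as one register-backed value
};

template<int Bytes>
__device__ inline void copySketch(const void* src, void* dst)
{
    using T = typename BytesToTypeSketch<Bytes>::type;
    *reinterpret_cast<T*>(dst) = *reinterpret_cast<const T*>(src);
}

// Each thread moves 16 bytes (4 floats) with a single vectorized load/store.
__global__ void copy16PerThread(const float* in, float* out)
{
    const int i = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
    copySketch<16>(in + i, out + i);
}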
@@ -1472,7 +1472,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T*
             k = *reinterpret_cast<Vec_t*>(k_smem + half_idx * smem_pitch + intra_half_idx);
         }
     }
-    if (!is_masked && !q_buf) {  // also skip modifing QKV if q/k/v_buf are present
+    if (!is_masked && !q_buf) {  // also skip modifying QKV if q/k/v_buf are present
         *reinterpret_cast<Vec_t*>(&QKV[src_q_idx]) = q;
         *reinterpret_cast<Vec_t*>(&QKV[src_k_idx]) = k;
         *reinterpret_cast<Vec_t*>(&QKV[src_v_idx]) = v;
...
@@ -69,11 +69,12 @@ AttentionType getAttentionType(size_t size_per_head,
     // GPT and its variants
     else {
         // FMHA_ENABLE only affects gpt-style models (causal-mask)
-        char * fused_qkv = std::getenv("FMHA_ENABLE");
+        char* fused_qkv = std::getenv("FMHA_ENABLE");
         if (fused_qkv != nullptr && std::string(fused_qkv) == "ON") {
             if ((sm == kSM_70 || sm == kSM_72 || sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_89)
                 && (size_per_head == 32 || size_per_head == 40 || size_per_head == 64 || size_per_head == 80
-                    || size_per_head == 128 || size_per_head == 144 || size_per_head == 160 || size_per_head == 256)) {
+                    || size_per_head == 128 || size_per_head == 144 || size_per_head == 160
+                    || size_per_head == 256)) {
                 return remove_padding ? AttentionType::FUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
             }
         }
...
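Note: the only behavior in the hunk above is the FMHA_ENABLE gate; the rest of the change is line wrapping. Reduced to the environment-variable test alone, it is roughly the following sketch (the helper name is not from the codebase):

#include <cstdlib>
#include <string>

// Fused MHA is an explicit opt-in: anything other than FMHA_ENABLE=ON leaves it off,
// and even then the SM architecture and head size must be on the supported list.
inline bool fmhaEnabledSketch()
{
    const char* v = std::getenv("FMHA_ENABLE");
    return v != nullptr && std::string(v) == "ON";
}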
@@ -13,4 +13,3 @@
 # limitations under the License.
 cmake_minimum_required(VERSION 3.8)
-
@@ -19,11 +19,11 @@ template<typename T>
 void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_reqs,
                                    std::vector<std::shared_ptr<Request>>& infer_reqs)
 {
-    std::unordered_map<uint64_t, int> occurance;
-    auto count_occurance = [&occurance](const std::vector<std::shared_ptr<Request>>& rs) {
+    std::unordered_map<uint64_t, int> occurrence;
+    auto count_occurrence = [&occurrence](const std::vector<std::shared_ptr<Request>>& rs) {
         for (const auto& r : rs) {
-            ++occurance[r->id];
+            ++occurrence[r->id];
         }
     };
@@ -33,13 +33,13 @@ void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_r
         req.reset();
     };
-    auto handle_conflict_or_invalid = [this, &occurance, &invalidate](std::vector<std::shared_ptr<Request>>& rs,
+    auto handle_conflict_or_invalid = [this, &occurrence, &invalidate](std::vector<std::shared_ptr<Request>>& rs,
                                                                       const char* type) {
         for (auto& r : rs) {
             if (r) {
                 int ec = 0;
-                if (occurance[r->id] != 1) {
+                if (occurrence[r->id] != 1) {
                     ec = Request::kConflict;
                 }
                 else if (r->start_flag && r->stop_flag) {
@@ -66,8 +66,8 @@ void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_r
         rs.resize(count);
     };
-    count_occurance(stop_reqs);
-    count_occurance(infer_reqs);
+    count_occurrence(stop_reqs);
+    count_occurrence(infer_reqs);
     if (!stop_reqs.empty()) {
         handle_conflict_or_invalid(stop_reqs, "stop");
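Note: the occurance -> occurrence change above is a spelling fix; the logic being renamed is a duplicate-ID scan over the stop and infer request lists. A self-contained sketch of that check (RequestSketch is a stand-in for the project's Request type):

#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>

struct RequestSketch {
    uint64_t id;
};

// Collect ids that occur more than once across both lists; in the code above such
// requests are rejected with Request::kConflict.
std::vector<uint64_t> findConflictingIds(const std::vector<std::shared_ptr<RequestSketch>>& stop_reqs,
                                         const std::vector<std::shared_ptr<RequestSketch>>& infer_reqs)
{
    std::unordered_map<uint64_t, int> occurrence;
    for (const auto* reqs : {&stop_reqs, &infer_reqs}) {
        for (const auto& r : *reqs) {
            ++occurrence[r->id];
        }
    }
    std::vector<uint64_t> conflicts;
    for (const auto& [id, count] : occurrence) {
        if (count > 1) {
            conflicts.push_back(id);
        }
    }
    return conflicts;
}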
@@ -129,7 +129,7 @@ void LlamaBatch<T>::handleStopRequests(const std::vector<std::shared_ptr<Request
             ec = 0;
             llama_->kv_cache_mgr_->erase(r->id);
         }
-        // clear output buffers (prevent leaking conversations) if request is successfull
+        // clear output buffers (prevent leaking conversations) if request is successful
         if (ec == 0) {
             auto& output_ids      = r->outputs[rank_].at("output_ids");
             auto& sequence_length = r->outputs[rank_].at("sequence_length");
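Note: the comment fixed above states the reason for the block: a finished stop request must not leave the previous conversation's tokens in output tensors that will be reused. Expressed as plain CUDA calls, the clearing amounts to something like this sketch (pointer and length names are illustrative):

#include <cstddef>
#include <cuda_runtime.h>

// Zero the per-request output buffers on the request's stream so nothing from an
// earlier conversation leaks into a reused slot.
inline void clearOutputsSketch(int* output_ids, size_t max_output_len, int* sequence_length, cudaStream_t stream)
{
    cudaMemsetAsync(output_ids, 0, sizeof(int) * max_output_len, stream);
    cudaMemsetAsync(sequence_length, 0, sizeof(int), stream);
}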
@@ -407,7 +407,7 @@ void LlamaBatch<T>::initializeGeneration()
     check_cuda_error(
         cudaMemcpyAsync(sequence_lengths_, context_length_buf_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
     // `sequence_lengths_` will be increased by dynamic decode
-    // note that in decoder and in output "sequence length" has differnt semantic
+    // note that in decoder and in output "sequence length" has different semantic
     // - in decoder it means length of sequence that has kv cache already computed
     // - in output it means length of all tokens (the last generated token does not have k/v cache computed yet)
     invokePlusScalar(sequence_lengths_, -1, batch_size_, stream_);
...
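Note: the comment corrected above explains the off-by-one being applied: the decoder counts only tokens whose k/v cache is already computed, while the output counts all tokens, so sequence_lengths_ is shifted down by one before decoding starts. An invokePlusScalar-style kernel doing that shift could look like this sketch (not the project's actual implementation):

#include <cuda_runtime.h>

__global__ void plusScalarSketch(int* data, int scalar, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        data[i] += scalar;
    }
}

// e.g. shift every entry of sequence_lengths by -1 on the given stream:
//   plusScalarSketch<<<(batch_size + 255) / 256, 256, 0, stream>>>(sequence_lengths, -1, batch_size);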
@@ -122,7 +122,7 @@ private:
     void* topk_curandstate_buf_{};
     void* topp_curandstate_buf_{};
-    // hard limits for persistant buffers
+    // hard limits for persistent buffers
     static constexpr int kMaxStopBadWordsLen = 32;
     using CachedSeq = LlamaCacheManager::Sequence;
...