Unverified commit fe46dac2 authored by AllentDan, committed by GitHub

Add lint action (#32)

* temp

* fix lint

* csrc->src

* remove clang-format

* skip .rst

* skip doc

* clang-format

version

version

* mat_B
parent e8ab4ba3
@@ -98,19 +98,19 @@ template void invokeDecodingInitialize(bool* finished,
// PROMPT_SRC: 0 --> no prompts, 1 --> from loaded prompts, 2 --> from request prompts
template<typename T>
-__global__ void embeddingLookupPosEncoding(T* from_tensor,
-const T* embedding_table,
-const T* position_encoding,
-const int* all_ids,
-const int* padding_count,
-const int* input_lengths,
-const int local_token_num,
-const int64_t hidden_units,
-const int step,
-const int max_input_length,
-const int token_num,
-const int ite,
-const T scale)
+__global__ void embeddingLookupPosEncoding(T* from_tensor,
+const T* embedding_table,
+const T* position_encoding,
+const int* all_ids,
+const int* padding_count,
+const int* input_lengths,
+const int local_token_num,
+const int64_t hidden_units,
+const int step,
+const int max_input_length,
+const int token_num,
+const int ite,
+const T scale)
{
// 1. lookup from embedding table
// 2. multiply scale
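As the comment above notes, the kernel looks the token id up in the embedding table, multiplies by the scale, and (given the position_encoding argument) adds a positional term. A minimal host-side reference of that per-element math, ignoring the padding/step bookkeeping the real kernel also handles; names and the flat token-id buffer are illustrative, not the kernel's launch API:

#include <cstdint>

// Illustrative reference only: what one output element of embeddingLookupPosEncoding
// amounts to, with a hypothetical flat token_ids buffer and a fixed position `pos`.
void embedding_lookup_pos_encoding_ref(float*       from_tensor,        // [token_num, hidden_units]
                                       const float* embedding_table,    // [vocab, hidden_units]
                                       const float* position_encoding,  // [max_seq_len, hidden_units]
                                       const int*   token_ids,          // [token_num]
                                       int token_num, int64_t hidden_units, int pos, float scale)
{
    for (int t = 0; t < token_num; ++t) {
        for (int64_t h = 0; h < hidden_units; ++h) {
            from_tensor[t * hidden_units + h] =
                embedding_table[token_ids[t] * hidden_units + h] * scale + position_encoding[pos * hidden_units + h];
        }
    }
}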
@@ -242,18 +242,18 @@ __global__ void inputIdsEmbeddingLookupPosEncodingSoftPrompt(inputIdsEmbeddingLo
// embedding lookup from word ids [batch, beam, length] (part of [batch, beam, max_input_length]), [vocab,
// hidden] and [batch, max_prefix_soft_prompt_length, hidden] to generate embedding [batch, beam, length +
// max_prefix_soft_prompt_length, hidden]
-int tmp_index = index;
-const int hidden_id = tmp_index % param.hidden_units;
-tmp_index = (tmp_index - hidden_id) / param.hidden_units;
-const int seq_id = tmp_index % (param.max_prefix_soft_prompt_length + param.max_input_length);
-tmp_index = (tmp_index - seq_id) / (param.max_prefix_soft_prompt_length + param.max_input_length);
-const int beam_id = tmp_index % param.beam_width;
-tmp_index = (tmp_index - beam_id) / param.beam_width;
-const int batch_id = tmp_index % param.batch_size;
+int tmp_index = index;
+const int hidden_id = tmp_index % param.hidden_units;
+tmp_index = (tmp_index - hidden_id) / param.hidden_units;
+const int seq_id = tmp_index % (param.max_prefix_soft_prompt_length + param.max_input_length);
+tmp_index = (tmp_index - seq_id) / (param.max_prefix_soft_prompt_length + param.max_input_length);
+const int beam_id = tmp_index % param.beam_width;
+tmp_index = (tmp_index - beam_id) / param.beam_width;
+const int batch_id = tmp_index % param.batch_size;
const int64_t hidden_units = param.hidden_units;
-T embedding =
+T embedding =
(seq_id < param.prefix_soft_prompt_lengths[batch_id]) ?
-(T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units
+(T)param.prefix_soft_prompt_embedding[batch_id * param.max_prefix_soft_prompt_length * hidden_units
+ seq_id * hidden_units + hidden_id] :
param.embedding_table[param.input_ids[batch_id * param.beam_width * param.max_input_length
+ beam_id * param.max_input_length
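The modulo/divide chain above recovers (batch, beam, seq, hidden) coordinates from a flat thread index over [batch, beam, max_prefix_soft_prompt_length + max_input_length, hidden]. The same decomposition as a standalone sketch (struct and function names are hypothetical):

// Peel a flat offset over [batch, beam, seq, hidden] back into its four
// coordinates, innermost dimension first, mirroring the kernel above.
struct Coords {
    int batch_id;
    int beam_id;
    int seq_id;
    int hidden_id;
};

inline Coords decompose(int index, int batch_size, int beam_width, int seq_len, int hidden_units)
{
    Coords c;
    c.hidden_id = index % hidden_units;
    index       = (index - c.hidden_id) / hidden_units;
    c.seq_id    = index % seq_len;  // seq_len == max_prefix_soft_prompt_length + max_input_length
    index       = (index - c.seq_id) / seq_len;
    c.beam_id   = index % beam_width;
    index       = (index - c.beam_id) / beam_width;
    c.batch_id  = index % batch_size;
    return c;
}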
@@ -21,50 +21,46 @@
#else
#include <cooperative_groups.h>
#endif
-#include <cuda_fp16.h>
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
+#include "src/fastertransformer/utils/cuda_type_utils.cuh"
+#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include <float.h>
#include <type_traits>
-#include "src/fastertransformer/utils/cuda_type_utils.cuh"
namespace cg = cooperative_groups;
namespace fastertransformer {
-template <int VPT>
+template<int VPT>
struct BytesToType;
-template <>
-struct BytesToType<2>
-{
+template<>
+struct BytesToType<2> {
using type = uint16_t;
};
-template <>
-struct BytesToType<4>
-{
+template<>
+struct BytesToType<4> {
using type = uint32_t;
};
-template <>
-struct BytesToType<8>
-{
+template<>
+struct BytesToType<8> {
using type = uint64_t;
};
-template <>
-struct BytesToType<16>
-{
+template<>
+struct BytesToType<16> {
using type = float4;
};
-template <int Bytes>
+template<int Bytes>
__device__ inline void copy(const void* local, void* data)
{
using T = typename BytesToType<Bytes>::type;
-const T* in = static_cast<const T*>(local);
-T* out = static_cast<T*>(data);
-*out = *in;
+const T* in = static_cast<const T*>(local);
+T* out = static_cast<T*>(data);
+*out = *in;
}
static const float HALF_FLT_MAX = 65504.F;
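BytesToType maps a byte count to a plain type of that exact width, so copy<Bytes> compiles down to a single typed load/store instead of a byte loop. A small device-side usage sketch under that assumption (the helper name is illustrative):

// Moving 16 bytes resolves to one float4 (128-bit) store,
// because BytesToType<16>::type is float4.
__device__ void copy_float4_example(const float4* src, float4* dst)
{
    copy<sizeof(float4)>(src, dst);  // sizeof(float4) == 16
}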
@@ -134,7 +130,6 @@ __inline__ __device__ T blockReduceMax(T val)
return val;
}
/* Calculate the maximum of all elements in a block */
template<typename T>
__inline__ __device__ T blockAllReduceMax(T val)
@@ -149,7 +149,7 @@ void invokeLengthCriterion(bool* finished,
h_pinned_finished_sum_[0] = -1;
length_criterion<<<grid, block, 0, stream>>>(
-finished, should_stop, h_pinned_finished_sum_, sequence_limit_length, batch_size, beam_width, step);
+finished, should_stop, h_pinned_finished_sum_, sequence_limit_length, batch_size, beam_width, step);
while (((volatile int*)h_pinned_finished_sum_)[0] == -1) {};
sync_check_cuda_error();
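invokeLengthCriterion hands the kernel a host-pinned counter preset to -1 and then spins on it through a volatile pointer, so the host learns the result without a full device synchronize. A condensed sketch of that handshake pattern (the kernel below is hypothetical, not length_criterion itself):

// Host/device handshake through pinned memory: the kernel overwrites the
// sentinel, the host spin-waits on a volatile view of the same pointer.
__global__ void write_flag(int* flag)
{
    if (threadIdx.x == 0 && blockIdx.x == 0) {
        *flag = 1;
    }
}

void pinned_handshake(cudaStream_t stream)
{
    int* h_flag = nullptr;
    cudaMallocHost(&h_flag, sizeof(int));        // pinned, device-accessible under UVA
    h_flag[0] = -1;                              // sentinel: "kernel has not written yet"
    write_flag<<<1, 32, 0, stream>>>(h_flag);
    while (((volatile int*)h_flag)[0] == -1) {}  // same spin-wait as the call above
    cudaFreeHost(h_flag);
}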
@@ -1472,7 +1472,7 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T*
k = *reinterpret_cast<Vec_t*>(k_smem + half_idx * smem_pitch + intra_half_idx);
}
}
-if (!is_masked && !q_buf) { // also skip modifing QKV if q/k/v_buf are present
+if (!is_masked && !q_buf) { // also skip modifying QKV if q/k/v_buf are present
*reinterpret_cast<Vec_t*>(&QKV[src_q_idx]) = q;
*reinterpret_cast<Vec_t*>(&QKV[src_k_idx]) = k;
*reinterpret_cast<Vec_t*>(&QKV[src_v_idx]) = v;
@@ -23,4 +23,4 @@ set_property(TARGET DynamicDecodeLayer PROPERTY CUDA_RESOLVE_DEVICE_SYMBOLS ON)
target_link_libraries(DynamicDecodeLayer PUBLIC -lcudart
TopKSamplingLayer TopPSamplingLayer
OnlineBeamSearchLayer BeamSearchLayer ban_bad_words stop_criteria
-gpt_kernels tensor nvtx_utils)
\ No newline at end of file
+gpt_kernels tensor nvtx_utils)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
template<typename T1, typename T2>
struct FfnFP8Weight: FfnWeight<T1, T2> {
ScaleList* scale_list_ptr;
float* identity_scale;
float* identity_h_scale;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
template<typename T>
struct FfnINT8Weight: FfnWeight<T> {
ScaleList* scale_list_ptr;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "FfnINT8Weight.h"
#include "src/fastertransformer/kernels/activation_int8_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
#include "src/fastertransformer/utils/ScaleList.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasINT8MMWrapper.h"
#include "src/fastertransformer/utils/memory_utils.h"
#include <vector>
namespace fastertransformer {
template<typename T>
class GeluFfnLayerINT8;
template<typename T>
class ReluFfnLayerINT8;
template<typename T>
class FfnLayerINT8: public BaseLayer {
private:
// buffer handling
size_t max_token_num_ = 0;
// meta data
size_t head_num_;
size_t size_per_head_;
// calculated data
size_t hidden_units_;
void allocateBuffer() override;
void freeBuffer() override;
bool isValidTokenNum(size_t token_num);
protected:
size_t inter_size_;
int int8_mode_;
bool sparse_;
int* inter_int_buf_;
int8_t* inter_buf_;
virtual void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) = 0;
public:
FfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
FfnLayerINT8(FfnLayerINT8<T> const& ffn_layer);
~FfnLayerINT8();
void forward(std::vector<fastertransformer::Tensor>* output_tensors,
const std::vector<fastertransformer::Tensor>* input_tensors,
const FfnWeight<T>* ffn_weights);
friend GeluFfnLayerINT8<T>;
friend ReluFfnLayerINT8<T>;
};
template<typename T>
class GeluFfnLayerINT8: public FfnLayerINT8<T> {
public:
GeluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool sparse = false);
GeluFfnLayerINT8(GeluFfnLayerINT8<T> const& ffn_layer);
~GeluFfnLayerINT8() = default;
private:
using FfnLayerINT8<T>::inter_int_buf_;
using FfnLayerINT8<T>::inter_buf_;
using FfnLayerINT8<T>::inter_size_;
using FfnLayerINT8<T>::stream_;
using FfnLayerINT8<T>::int8_mode_;
using FfnLayerINT8<T>::sparse_;
using FfnLayerINT8<T>::hidden_units_;
void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};
template<typename T>
class ReluFfnLayerINT8: public FfnLayerINT8<T> {
public:
ReluFfnLayerINT8(size_t max_batch_size,
size_t max_seq_len,
size_t head_num,
size_t size_per_head,
size_t inter_size,
int int8_mode,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
ReluFfnLayerINT8(ReluFfnLayerINT8<T> const& ffn_layer);
~ReluFfnLayerINT8() = default;
private:
using FfnLayerINT8<T>::inter_int_buf_;
using FfnLayerINT8<T>::inter_buf_;
using FfnLayerINT8<T>::inter_size_;
using FfnLayerINT8<T>::stream_;
using FfnLayerINT8<T>::int8_mode_;
using FfnLayerINT8<T>::hidden_units_;
void invokeAddBiasActivation(const int m, const T* bias, ScaleList* scale_list) override;
};
} // namespace fastertransformer
@@ -68,12 +68,13 @@ AttentionType getAttentionType(size_t size_per_head,
}
// GPT and its variants
else {
-// FMHA_ENABLE only affects gpt-style models (causal-mask)
-char * fused_qkv = std::getenv("FMHA_ENABLE");
+// FMHA_ENABLE only affects gpt-style models (causal-mask)
+char* fused_qkv = std::getenv("FMHA_ENABLE");
if (fused_qkv != nullptr && std::string(fused_qkv) == "ON") {
if ((sm == kSM_70 || sm == kSM_72 || sm == kSM_75 || sm == kSM_80 || sm == kSM_86 || sm == kSM_89)
&& (size_per_head == 32 || size_per_head == 40 || size_per_head == 64 || size_per_head == 80
-|| size_per_head == 128 || size_per_head == 144 || size_per_head == 160 || size_per_head == 256)) {
+|| size_per_head == 128 || size_per_head == 144 || size_per_head == 160
+|| size_per_head == 256)) {
return remove_padding ? AttentionType::FUSED_MHA : AttentionType::UNFUSED_PADDED_MHA;
}
}
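The branch above only considers the fused path when the FMHA_ENABLE environment variable is set to ON and both the SM version and head size are in the supported lists. The same gate reduced to a predicate (an illustrative helper, not part of the diff; the numeric SM values are assumed to match the kSM_* constants):

#include <cstddef>
#include <cstdlib>
#include <string>

// Condensed form of the FMHA gate shown above: env var, SM version and
// head size must all be acceptable before the fused kernel is considered.
bool allow_fused_mha(int sm, size_t size_per_head)
{
    const char* fused_qkv = std::getenv("FMHA_ENABLE");
    if (fused_qkv == nullptr || std::string(fused_qkv) != "ON") {
        return false;
    }
    const bool sm_ok = sm == 70 || sm == 72 || sm == 75 || sm == 80 || sm == 86 || sm == 89;
    const bool head_ok = size_per_head == 32 || size_per_head == 40 || size_per_head == 64 || size_per_head == 80
                         || size_per_head == 128 || size_per_head == 144 || size_per_head == 160
                         || size_per_head == 256;
    return sm_ok && head_ok;
}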
@@ -13,4 +13,3 @@
# limitations under the License.
cmake_minimum_required(VERSION 3.8)
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
template<typename T1, typename T2>
struct AttentionFP8Weight: public AttentionWeight<T1, T2> {
const float* qk_scale;
const float* qk_scale_inv;
float* qk_h_scale;
float* qk_h_scale_inv;
float* identity_scale;
float* identity_h_scale;
};
} // namespace fastertransformer
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/ScaleList.h"
namespace fastertransformer {
template<typename T>
struct AttentionINT8Weight: AttentionWeight<T> {
ScaleList* scale_list_ptr;
};
} // namespace fastertransformer
@@ -46,4 +46,4 @@ public:
}
};
-} // namespace fastertransformer
\ No newline at end of file
+} // namespace fastertransformer
@@ -15,9 +15,9 @@ public:
pthread_barrier_init(&barrier_, nullptr, count);
}
-Barrier(const Barrier&) = delete;
-Barrier& operator=(const Barrier&) = delete;
-Barrier(Barrier&&) noexcept = delete;
+Barrier(const Barrier&) = delete;
+Barrier& operator=(const Barrier&) = delete;
+Barrier(Barrier&&) noexcept = delete;
Barrier& operator=(Barrier&&) noexcept = delete;
void wait()
@@ -34,4 +34,4 @@ private:
pthread_barrier_t barrier_{};
};
-} // namespace fastertransformer
\ No newline at end of file
+} // namespace fastertransformer
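The Barrier touched by the hunks above wraps pthread_barrier_t, is neither copyable nor movable, and releases all callers of wait() once the configured count of threads has arrived. A usage sketch (constructor argument and header location are assumptions):

#include <thread>
#include <vector>
// #include "src/fastertransformer/utils/Barrier.h"  // assumed location of the class above

void barrier_example()
{
    constexpr int N = 4;
    fastertransformer::Barrier barrier(N);  // pthread_barrier_init(..., N) under the hood
    std::vector<std::thread> workers;
    for (int i = 0; i < N; ++i) {
        workers.emplace_back([&barrier] {
            // per-thread setup ...
            barrier.wait();  // nobody proceeds until all N threads reach this point
            // phase that requires every thread to be ready ...
        });
    }
    for (auto& w : workers) {
        w.join();
    }
}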
@@ -4,7 +4,7 @@ cmake_minimum_required(VERSION 3.8)
add_subdirectory(fused_multi_head_attention)
-add_library(Llama STATIC
+add_library(Llama STATIC
LlamaV2.cc
LlamaBatch.cc
LlamaCacheManager.cc
@@ -19,11 +19,11 @@ template<typename T>
void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_reqs,
std::vector<std::shared_ptr<Request>>& infer_reqs)
{
-std::unordered_map<uint64_t, int> occurance;
+std::unordered_map<uint64_t, int> occurrence;
-auto count_occurance = [&occurance](const std::vector<std::shared_ptr<Request>>& rs) {
+auto count_occurrence = [&occurrence](const std::vector<std::shared_ptr<Request>>& rs) {
for (const auto& r : rs) {
-++occurance[r->id];
+++occurrence[r->id];
}
};
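verifyRequests first tallies every request id across both the stop and infer lists; later hunks reject any id whose count is not exactly one as Request::kConflict. The counting step, condensed into a hypothetical standalone helper:

#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>

// Condensed version of the occurrence bookkeeping in LlamaBatch<T>::verifyRequests:
// ids seen more than once across both lists are later treated as conflicts.
template<typename Request>
std::unordered_map<uint64_t, int> count_ids(const std::vector<std::shared_ptr<Request>>& stop_reqs,
                                            const std::vector<std::shared_ptr<Request>>& infer_reqs)
{
    std::unordered_map<uint64_t, int> occurrence;
    for (const auto* rs : {&stop_reqs, &infer_reqs}) {
        for (const auto& r : *rs) {
            ++occurrence[r->id];
        }
    }
    return occurrence;
}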
@@ -33,13 +33,13 @@ void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_r
req.reset();
};
-auto handle_conflict_or_invalid = [this, &occurance, &invalidate](std::vector<std::shared_ptr<Request>>& rs,
-const char* type) {
+auto handle_conflict_or_invalid = [this, &occurrence, &invalidate](std::vector<std::shared_ptr<Request>>& rs,
+const char* type) {
for (auto& r : rs) {
if (r) {
int ec = 0;
-if (occurance[r->id] != 1) {
+if (occurrence[r->id] != 1) {
ec = Request::kConflict;
}
else if (r->start_flag && r->stop_flag) {
@@ -66,8 +66,8 @@ void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_r
rs.resize(count);
};
-count_occurance(stop_reqs);
-count_occurance(infer_reqs);
+count_occurrence(stop_reqs);
+count_occurrence(infer_reqs);
if (!stop_reqs.empty()) {
handle_conflict_or_invalid(stop_reqs, "stop");
@@ -129,7 +129,7 @@ void LlamaBatch<T>::handleStopRequests(const std::vector<std::shared_ptr<Request
ec = 0;
llama_->kv_cache_mgr_->erase(r->id);
}
-// clear output buffers (prevent leaking conversations) if request is successfull
+// clear output buffers (prevent leaking conversations) if request is successful
if (ec == 0) {
auto& output_ids = r->outputs[rank_].at("output_ids");
auto& sequence_length = r->outputs[rank_].at("sequence_length");
@@ -407,7 +407,7 @@ void LlamaBatch<T>::initializeGeneration()
check_cuda_error(
cudaMemcpyAsync(sequence_lengths_, context_length_buf_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
// `sequence_lengths_` will be increased by dynamic decode
-// note that in decoder and in output "sequence length" has differnt semantic
+// note that in decoder and in output "sequence length" has different semantic
// - in decoder it means length of sequence that has kv cache already computed
// - in output it means length of all tokens (the last generated token does not have k/v cache computed yet)
invokePlusScalar(sequence_lengths_, -1, batch_size_, stream_);
@@ -1039,4 +1039,4 @@ void LlamaBatch<T>::finishRequest(int index, bool force_end)
template class LlamaBatch<half>;
template class LlamaBatch<float>;
-} // namespace fastertransformer
\ No newline at end of file
+} // namespace fastertransformer
@@ -122,7 +122,7 @@ private:
void* topk_curandstate_buf_{};
void* topp_curandstate_buf_{};
-// hard limits for persistant buffers
+// hard limits for persistent buffers
static constexpr int kMaxStopBadWordsLen = 32;
using CachedSeq = LlamaCacheManager::Sequence;
@@ -150,4 +150,4 @@ private:
IAllocator* allocator_{};
};
-} // namespace fastertransformer
\ No newline at end of file
+} // namespace fastertransformer