Unverified Commit 7f943a26 authored by Li Zhang, committed by GitHub

Unify prefill & decode passes (#775)

* Unify prefill and decode passes

* dynamic split-fuse

* refactor

* correct input count calculation

* remove unused

* lint

* lint

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build
parent 2ba90822
......@@ -48,7 +48,9 @@ __global__ void getPaddingOffsetAndCuSeqLensKernel(size_t* h_valid_word_num,
if (calculate_cu_seqlens) {
cu_seqlens[batch_size] = total_seq_len;
}
h_valid_word_num[0] = (size_t)total_seq_len;
if (h_valid_word_num) {
h_valid_word_num[0] = (size_t)total_seq_len;
}
}
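For reference, the cumulative-length bookkeeping above is an exclusive prefix sum over the per-sequence lengths plus the total token count. A minimal host-side sketch of the same result (not the device kernel; names are illustrative) — for lengths {3, 5, 2} it yields cu_seqlens {0, 3, 8, 10} and a total of 10:

#include <vector>

std::vector<int> build_cu_seqlens(const std::vector<int>& lengths, size_t* total_out)
{
    std::vector<int> cu_seqlens(lengths.size() + 1, 0);
    int total = 0;
    for (size_t i = 0; i < lengths.size(); ++i) {
        cu_seqlens[i] = total;  // number of tokens before sequence i
        total += lengths[i];
    }
    cu_seqlens[lengths.size()] = total;  // mirrors cu_seqlens[batch_size] = total_seq_len above
    if (total_out) {
        *total_out = (size_t)total;  // mirrors the guarded h_valid_word_num[0] store above
    }
    return cu_seqlens;
}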
void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
......@@ -60,15 +62,19 @@ void invokeGetPaddingOffsetAndCuSeqLens(size_t* h_pinned_token_num,
const int max_seq_len,
cudaStream_t stream)
{
h_pinned_token_num[0] = 0;
if (h_pinned_token_num) {
h_pinned_token_num[0] = 0;
}
getPaddingOffsetAndCuSeqLensKernel<<<1, 1, 0, stream>>>(
h_pinned_token_num, tmp_mask_offset, cu_seqlens, sequence_lengths, batch_size, max_seq_len);
if (h_pinned_token_num) {
#ifdef _MSC_VER
cudaStreamSynchronize(stream);
cudaStreamSynchronize(stream);
#else
while (((volatile size_t*)h_pinned_token_num)[0] == 0) {};
while (((volatile size_t*)h_pinned_token_num)[0] == 0) {};
#endif
h_token_num[0] = h_pinned_token_num[0];
h_token_num[0] = h_pinned_token_num[0];
}
sync_check_cuda_error();
}
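The pinned-buffer handshake above (zero the slot, launch, then spin on the host until the kernel stores a non-zero count, with a plain stream sync as the MSVC fall-back) can be reproduced stand-alone. A minimal sketch, with a hypothetical sum_lengths_kernel standing in for the real kernel:

#include <cuda_runtime.h>

__global__ void sum_lengths_kernel(size_t* h_total, const int* lengths, int n)
{
    size_t total = 0;
    for (int i = 0; i < n; ++i) {
        total += lengths[i];
    }
    h_total[0] = total;  // stored directly into host-pinned memory
}

size_t get_total_length(const int* d_lengths, int n, cudaStream_t stream)
{
    static size_t* h_total = nullptr;
    if (!h_total) {
        cudaMallocHost(&h_total, sizeof(size_t));  // pinned, device-accessible
    }
    h_total[0] = 0;  // sentinel; the kernel result is assumed to be non-zero
    sum_lengths_kernel<<<1, 1, 0, stream>>>(h_total, d_lengths, n);
#ifdef _MSC_VER
    cudaStreamSynchronize(stream);  // the MSVC fall-back path taken above
#else
    while (((volatile size_t*)h_total)[0] == 0) {}  // spin until the store lands
#endif
    return h_total[0];
}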
......
......@@ -20,13 +20,11 @@ struct DecoderMultiHeadAttentionParams {
T* __restrict__ v_bias;
// sequence-level buffers
const int* __restrict__ per_sample_length;
const int* __restrict__ context_length;
const bool* __restrict__ finished;
const float* __restrict__ rope_theta;
// kv cache
void** __restrict__ per_sample_k_cache; // [H, S, D]
void** __restrict__ per_sample_v_cache; // [H, S, D]
size_t layer_offset;
/// cache layout M,[N,H,x,D]
......
......@@ -145,7 +145,7 @@ struct DecoderMultiHeadAttentionKernel {
kv_head_idx_ = head_idx_ / gqa_group_size;
is_gqa_leader_ = head_idx_ % gqa_group_size == 0;
timestep_ = params_.per_sample_length[batch_idx_];
timestep_ = params_.context_length[batch_idx_] - 1;
if (kSplitK && params.max_split_k > 1) {
const int slice_count = (timestep_ + kSliceLen - 1) / kSliceLen;
......@@ -815,7 +815,7 @@ struct DecoderMultiHeadAttentionKernel {
{
const int batch_idx = get_batch_idx();
const int head_idx = get_head_idx();
const int timestep = params.per_sample_length[batch_idx];
const int timestep = params.context_length[batch_idx] - 1;
const int max_split_k = params.max_split_k;
const int slice_count = get_slice_count(timestep);
const int slice_per_split = (slice_count + max_split_k - 1) / max_split_k;
......
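To make the ceiling divisions in the hunk above concrete, a tiny sketch with illustrative numbers (the real kSliceLen is defined elsewhere in the kernel):

#include <cstdio>

int main()
{
    const int kSliceLen   = 128;   // illustrative slice length
    const int timestep    = 1000;  // context_length - 1 for one sequence
    const int max_split_k = 4;

    const int slice_count     = (timestep + kSliceLen - 1) / kSliceLen;         // ceil(1000 / 128) = 8
    const int slice_per_split = (slice_count + max_split_k - 1) / max_split_k;  // ceil(8 / 4)      = 2

    std::printf("%d slices, %d slice(s) per split\n", slice_count, slice_per_split);
    return 0;
}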
......@@ -53,7 +53,7 @@ void TestBlocks(thrust::universal_vector<half>& linear, // linear data
std::mt19937 g(rd());
std::shuffle(idxs.begin(), idxs.end(), g);
for (int i = 0; i < idxs.size(); ++i) {
for (size_t i = 0; i < idxs.size(); ++i) {
ptrs[i] = blocks.data().get() + idxs[i] * head_num * block_size * head_dim;
}
......@@ -115,8 +115,8 @@ int main(int argc, char* argv[])
constexpr int KvHeadNum = 32;
constexpr int kBatchSize = 1;
// constexpr int kContextLen = 7306;
constexpr int kContextLen = 1024;
constexpr int kSequenceLen = kContextLen + 1;
constexpr int kSequenceLen = 1024;
constexpr int kContextLen = kSequenceLen + 1;
constexpr int kBlockSz = 128;
constexpr int kTestIter = 10;
constexpr int kMaxSplitK = 1;
......@@ -126,9 +126,10 @@ int main(int argc, char* argv[])
thrust::universal_vector<half> output(kBatchSize * kHeadNum * kHeadDim);
thrust::universal_vector<half> qkv(kBatchSize * (kHeadNum + KvHeadNum * 2) * kHeadDim);
thrust::universal_vector<bool> finished(kBatchSize);
thrust::universal_vector<half> k_cache(kBatchSize * kSequenceLen * KvHeadNum * kHeadDim);
thrust::universal_vector<half> v_cache(kBatchSize * kSequenceLen * KvHeadNum * kHeadDim);
thrust::universal_vector<int> sequence_lengths(kBatchSize);
thrust::universal_vector<half> k_cache(kBatchSize * kContextLen * KvHeadNum * kHeadDim);
thrust::universal_vector<half> v_cache(kBatchSize * kContextLen * KvHeadNum * kHeadDim);
thrust::universal_vector<int> context_length(kBatchSize);
thrust::universal_vector<int> sequence_length(kBatchSize);
thrust::universal_vector<void*> k_cache_ptrs(kBatchSize);
thrust::universal_vector<void*> v_cache_ptrs(kBatchSize);
......@@ -138,23 +139,23 @@ int main(int argc, char* argv[])
rng.GenerateNormal(qkv.data().get(), qkv.size(), 1.f, 0.f);
if (kContextLen) {
rng.GenerateNormal(k_cache.data().get(), kBatchSize * KvHeadNum * kSequenceLen * kHeadDim);
rng.GenerateNormal(v_cache.data().get(), kBatchSize * KvHeadNum * kSequenceLen * kHeadDim);
if (kSequenceLen) {
rng.GenerateNormal(k_cache.data().get(), kBatchSize * KvHeadNum * kContextLen * kHeadDim);
rng.GenerateNormal(v_cache.data().get(), kBatchSize * KvHeadNum * kContextLen * kHeadDim);
cudaMemset2DAsync(k_cache.data().get() + kContextLen * kHeadDim,
sizeof(half) * kSequenceLen * kHeadDim,
cudaMemset2DAsync(k_cache.data().get() + kSequenceLen * kHeadDim,
sizeof(half) * kContextLen * kHeadDim,
0,
sizeof(half) * kHeadDim,
kBatchSize * KvHeadNum);
if constexpr (0) {
for (int b = 0; b < kBatchSize; ++b) {
for (int h = 0; h < KvHeadNum; ++h) {
for (int s = 0; s < kSequenceLen; ++s) {
for (int s = 0; s < kContextLen; ++s) {
for (int d = 0; d < kHeadDim; ++d) {
std::cout << std::setw(7) << std::setprecision(4) << std::fixed
<< (float)k_cache[b * KvHeadNum * kSequenceLen * kHeadDim
+ h * kSequenceLen * kHeadDim + s * kHeadDim + d]
<< (float)k_cache[b * KvHeadNum * kContextLen * kHeadDim
+ h * kContextLen * kHeadDim + s * kHeadDim + d]
<< " ";
}
std::cout << "\n";
......@@ -166,8 +167,8 @@ int main(int argc, char* argv[])
std::exit(0);
}
cudaMemset2DAsync(v_cache.data().get() + kContextLen * kHeadDim,
sizeof(half) * kSequenceLen * kHeadDim,
cudaMemset2DAsync(v_cache.data().get() + kSequenceLen * kHeadDim,
sizeof(half) * kContextLen * kHeadDim,
0,
sizeof(half) * kHeadDim,
kBatchSize * KvHeadNum);
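The pitched memsets above clear exactly one timestep slot (the yet-to-be-written position kSequenceLen) in every (batch, head) row of the linear reference cache. A minimal sketch of the same trick with generic sizes (names are illustrative):

#include <cuda_fp16.h>
#include <cuda_runtime.h>

// View the cache as (batch * heads) rows of seq_len * head_dim halves and
// zero one head_dim-wide column (a single timestep) in every row.
void clear_timestep_slot(
    half* cache, int batch_x_heads, int seq_len, int head_dim, int timestep, cudaStream_t stream)
{
    cudaMemset2DAsync(cache + (size_t)timestep * head_dim,  // column start in row 0
                      sizeof(half) * seq_len * head_dim,    // pitch: one (batch, head) row
                      0,                                    // byte value to write
                      sizeof(half) * head_dim,              // width: one timestep slot
                      batch_x_heads,                        // height: number of rows
                      stream);
}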
......@@ -193,7 +194,8 @@ int main(int argc, char* argv[])
cudaDeviceSynchronize();
for (int i = 0; i < kBatchSize; ++i) {
sequence_lengths[i] = kContextLen;
sequence_length[i] = kSequenceLen;
context_length[i] = kContextLen;
k_cache_ptrs[i] = k_cache.data().get() + i * k_cache.size() / kBatchSize;
v_cache_ptrs[i] = v_cache.data().get() + i * v_cache.size() / kBatchSize;
k_cache_ref_ptrs[i] = k_cache_ref.data().get() + i * k_cache_ref.size() / kBatchSize;
......@@ -212,7 +214,7 @@ int main(int argc, char* argv[])
params.stride = (kHeadNum + 2 * KvHeadNum) * kHeadDim;
params.batch_size = kBatchSize;
params.max_seq_len = kContextLen + 1;
params.max_seq_len = kSequenceLen;
params.cu_block_cnts = cu_block_cnts.data().get();
printf("%d %d\n", (int)k_ptrs.size(), (int)v_ptrs.size());
......@@ -220,11 +222,9 @@ int main(int argc, char* argv[])
params.v_cache_block_ptrs = (void**)v_ptrs.data().get();
params.kv_cache_block_size = kBlockSz;
params.finished = finished.data().get();
params.per_sample_length = sequence_lengths.data().get();
params.per_sample_k_cache = k_cache_ref_ptrs.data().get();
params.per_sample_v_cache = v_cache_ref_ptrs.data().get();
params.layer_offset = 0;
params.finished = finished.data().get();
params.context_length = context_length.data().get();
params.layer_offset = 0;
params.num_heads = kHeadNum;
params.num_kv_heads = KvHeadNum;
......@@ -238,8 +238,16 @@ int main(int argc, char* argv[])
params.partial_M = partial_M.data().get();
params.partial_O = partial_O.data().get();
params.max_split_k = kMaxSplitK;
params.arch = 80;
for (int i = 0; i < kTestIter; ++i) {
mmha_ft_reference(params, cudaStream_t{});
mmha_ft_reference(params,
(half**)k_cache_ref_ptrs.data().get(),
(half**)v_cache_ref_ptrs.data().get(),
sequence_length.data().get(),
kContextLen,
cudaStream_t{});
}
cudaDeviceSynchronize();
......@@ -249,14 +257,7 @@ int main(int argc, char* argv[])
}
std::cout << "---------------------------------------------------\n";
params.out = output.data().get();
params.per_sample_k_cache = k_cache_ptrs.data().get();
params.per_sample_v_cache = v_cache_ptrs.data().get();
params.max_split_k = kMaxSplitK;
params.max_seq_len = kContextLen;
params.arch = 80;
params.out = output.data().get();
std::vector<thrust::universal_vector<half>> outputs;
......@@ -271,19 +272,14 @@ int main(int argc, char* argv[])
}
}
thrust::universal_vector<int> seq_lens(kBatchSize);
for (auto& x : seq_lens) {
x = kContextLen + 1;
}
if (1) {
ConvertBlocksToLinear((const half**)k_ptrs.data().get(),
k_cache.data().get(),
cu_block_cnts.data().get(),
seq_lens.data().get(),
context_length.data().get(),
0,
kBlockSz,
kSequenceLen,
kContextLen,
KvHeadNum,
kHeadDim,
kBatchSize,
......@@ -291,10 +287,10 @@ int main(int argc, char* argv[])
ConvertBlocksToLinear((const half**)v_ptrs.data().get(),
v_cache.data().get(),
cu_block_cnts.data().get(),
seq_lens.data().get(),
context_length.data().get(),
0,
kBlockSz,
kSequenceLen,
kContextLen,
KvHeadNum,
kHeadDim,
kBatchSize,
......@@ -316,15 +312,15 @@ int main(int argc, char* argv[])
// [H, S, D]
Compare(k_cache.data().get() + kContextLen * kHeadDim,
k_cache_ref.data().get() + kContextLen * kHeadDim,
kSequenceLen * kHeadDim,
Compare(k_cache.data().get() + kSequenceLen * kHeadDim,
k_cache_ref.data().get() + kSequenceLen * kHeadDim,
kContextLen * kHeadDim,
kHeadDim,
KvHeadNum);
Compare(v_cache.data().get() + kContextLen * kHeadDim,
v_cache_ref.data().get() + kContextLen * kHeadDim,
kSequenceLen * kHeadDim,
Compare(v_cache.data().get() + kSequenceLen * kHeadDim,
v_cache_ref.data().get() + kSequenceLen * kHeadDim,
kContextLen * kHeadDim,
kHeadDim,
KvHeadNum);
......
......@@ -182,7 +182,12 @@ struct SATypeConverter<half> {
};
template<typename T>
void mmha_ft_reference(const DecoderMultiHeadAttentionParams<T>& p, cudaStream_t st)
void mmha_ft_reference(const DecoderMultiHeadAttentionParams<T>& p,
T** per_sample_k_cache,
T** per_sample_v_cache,
const int* sequence_length,
int max_memory_len,
cudaStream_t st)
{
using DataType = typename SATypeConverter<T>::Type;
......@@ -204,18 +209,18 @@ void mmha_ft_reference(const DecoderMultiHeadAttentionParams<T>& p, cudaStream_t
params.stride = p.stride;
params.finished = (bool*)p.finished;
params.k_cache_per_sample = reinterpret_cast<DataType**>(p.per_sample_k_cache);
params.v_cache_per_sample = reinterpret_cast<DataType**>(p.per_sample_v_cache);
params.k_cache_per_sample = reinterpret_cast<DataType**>(per_sample_k_cache);
params.v_cache_per_sample = reinterpret_cast<DataType**>(per_sample_v_cache);
params.kv_cache_per_sample_offset = p.layer_offset;
params.batch_size = p.batch_size;
params.beam_width = 1;
params.memory_max_len = p.max_seq_len;
params.memory_max_len = max_memory_len;
params.prefix_prompt_lengths = 0;
params.max_prefix_prompt_length = 0;
params.length_per_sample = p.per_sample_length; // max_input_length + current output length
params.length_per_sample = sequence_length; // max_input_length + current output length
for (int i = 0; i < p.batch_size; ++i) {
params.timestep = std::max(p.per_sample_length[i], params.timestep);
params.timestep = std::max(sequence_length[i], params.timestep);
}
std::cout << "timestep = " << params.timestep << "\n";
......@@ -237,6 +242,11 @@ void mmha_ft_reference(const DecoderMultiHeadAttentionParams<T>& p, cudaStream_t
masked_multihead_attention(params, st);
}
template void mmha_ft_reference(const DecoderMultiHeadAttentionParams<half>& params, cudaStream_t st);
template void mmha_ft_reference(const DecoderMultiHeadAttentionParams<half>& params,
half** per_sample_k_cache,
half** per_sample_v_cache,
const int* sequence_length,
int max_memory_len,
cudaStream_t st);
} // namespace turbomind
......@@ -33,6 +33,11 @@ private:
};
template<typename T>
void mmha_ft_reference(const DecoderMultiHeadAttentionParams<T>& params, cudaStream_t st);
void mmha_ft_reference(const DecoderMultiHeadAttentionParams<T>& params,
T** per_sample_k_cache,
T** per_sample_v_cache,
const int* sequence_length,
int max_memory_len,
cudaStream_t st);
} // namespace turbomind
......@@ -34,10 +34,11 @@ public:
class Barrier {
public:
Barrier(unsigned count)
Barrier(unsigned count): count_(count)
{
TM_LOG_INFO("Barrier(%d)", (int)count);
pthread_barrier_init(&barrier_, nullptr, count);
if (count_ > 1) {
pthread_barrier_init(&barrier_, nullptr, count);
}
}
Barrier(const Barrier&) = delete;
......@@ -47,15 +48,20 @@ public:
void wait()
{
pthread_barrier_wait(&barrier_);
if (count_ > 1) {
pthread_barrier_wait(&barrier_);
}
}
~Barrier()
{
pthread_barrier_destroy(&barrier_);
if (count_ > 1) {
pthread_barrier_destroy(&barrier_);
}
}
private:
const int count_;
pthread_barrier_t barrier_{};
};
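With the count_ > 1 guard, a single-participant barrier never touches pthread_barrier_* at all. A minimal usage sketch (not from the repo; it assumes the Barrier class above is in scope):

#include <cstdio>
#include <thread>
#include <vector>

void run_workers(int n)
{
    Barrier barrier(n);  // n == 1 degenerates to a no-op barrier
    std::vector<std::thread> workers;
    for (int rank = 0; rank < n; ++rank) {
        workers.emplace_back([&barrier, rank] {
            std::printf("rank %d: step done\n", rank);
            barrier.wait();  // every rank must arrive before any proceeds
            std::printf("rank %d: past barrier\n", rank);
        });
    }
    for (auto& t : workers) {
        t.join();
    }
}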
......
......@@ -9,16 +9,13 @@ find_package(CUDAToolkit REQUIRED)
add_library(Llama STATIC
LlamaV2.cc
LlamaBatch.cc
LlamaCacheManager.cc
BlockManager.cc
SequenceManager.cc
LlamaContextDecoder.cc
LlamaContextAttentionLayer.cc
LlamaDecoderSelfAttentionLayer.cc
LlamaDecoder.cc
LlamaWeight.cc
LlamaDecoderLayerWeight.cc
LlamaFfnLayer.cc
unified_decoder.cc
unified_attention_layer.cc
llama_kernels.cu
llama_decoder_kernels.cu
llama_utils.cu)
......
......@@ -9,6 +9,7 @@
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/cuda_utils.h"
......@@ -28,7 +29,6 @@ struct BatchState {
float* h_rope_theta;
std::vector<int> seq_len_limit;
std::vector<int> is_swap_in;
std::vector<const Sequence*> sequences;
std::vector<std::shared_ptr<Request>> requests;
......@@ -42,6 +42,26 @@ struct BatchState {
template<typename T>
class LlamaV2;
struct GenerationState {
int max_init_ctx_len;
int step;
int sum_seq_len;
int max_seq_len;
int partial;
int partial_context_legnth;
std::vector<uint64_t> unique_ids;
int max_input_count1;
int max_input_count2;
std::deque<int> min_input_count;
int finished_count;
};
template<typename T>
class LlamaBatch {
public:
......@@ -58,35 +78,24 @@ public:
void ProcessInferRequests(const Requests& requests);
[[nodiscard]] bool Initialize();
void ContextDecode();
void AdjustMaxInputCount(GenerationState& g,
const std::vector<const Sequence*>& sequences,
const std::vector<int>& context_length);
struct GenerationState {
int max_init_ctx_len;
int step;
int sum_seq_len;
int max_seq_len;
};
void Initialize(GenerationState& g);
void InitializeSampling();
void InitializeSampling(const GenerationState& g);
GenerationState InitializeGeneration();
[[nodiscard]] bool Forward(GenerationState& g, int iter);
[[nodiscard]] bool Generate(GenerationState& g);
[[nodiscard]] auto Finish(GenerationState& g, int& finished_count) -> std::vector<Signal>;
[[nodiscard]] auto Finish(GenerationState& g) -> std::vector<Signal>;
[[nodiscard]] Signal Interrupt(int index, bool force_stop = false, bool force_end = false);
void
OutputContextLogits(T* context_decoder_output, const std::vector<int>& indices, const std::vector<int>& lengths);
explicit LlamaBatch(int max_batch_size,
int max_context_token_num,
int session_len,
std::unique_ptr<SequenceManager> sequence_manager,
LlamaV2<T>* llama);
explicit LlamaBatch(const EngineParams& params, int cache_block_seq_len, int quant_policy, LlamaV2<T>* model);
~LlamaBatch()
{
......@@ -177,7 +186,7 @@ private:
private:
const int max_batch_size_;
const int max_context_token_num_;
const int session_len_;
int session_len_;
const int rank_;
const bool debug_;
const int step_length_;
......@@ -201,6 +210,7 @@ private:
// lengths
int* input_length_buf_{}; // input + cache missed length
int* context_length_buf_{}; // history length + input_length
int* init_context_length_{};
// temp buffers used for block->linear kv-cache conversion
T* tmp_k_cache_buf_{};
T* tmp_v_cache_buf_{};
......@@ -228,13 +238,6 @@ private:
int* h_end_ids_buf_{};
int* d_end_ids_buf_{};
int** request_output_ids_ptrs_{};
int* request_output_ids_lens_{};
int** request_seqlen_ptrs_{};
int** h_request_output_ids_ptrs_{};
int* h_request_output_ids_lens_{};
int** h_request_seqlen_ptrs_{};
// pinned buffers
int* h_input_ids_buf_{};
int* h_input_length_buf_{};
......@@ -293,6 +296,10 @@ private:
bool output_stop_token_{false};
int* h_output_ids_{};
const int num_tokens_per_iter_;
const int extra_tokens_per_iter_;
const int max_prefill_iters_;
};
} // namespace turbomind
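The new num_tokens_per_iter / extra_tokens_per_iter / max_prefill_iters knobs bound how much prefill work a single forward pass may carry under dynamic split-fuse; the actual scheduling is presumably implemented in LlamaBatch.cc. The sketch below only illustrates one plausible way such a per-iteration budget could be derived and is an assumption, not the repo's policy:

#include <algorithm>

// Hypothetical policy, for illustration only: num_tokens_per_iter caps how many
// prompt tokens one pass prefills, while max_prefill_iters caps how many passes
// a single prompt may take, so the effective chunk grows for very long prompts.
int prefill_chunk_size(int prompt_len, int num_tokens_per_iter, int max_prefill_iters)
{
    // smallest chunk that still finishes the prompt within max_prefill_iters passes
    const int min_chunk = (prompt_len + max_prefill_iters - 1) / max_prefill_iters;
    return std::max(num_tokens_per_iter, min_chunk);
}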
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/models/llama/LlamaCacheManager.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
namespace turbomind {
LlamaCacheManager::~LlamaCacheManager()
{
for (auto& p : device_mem_) {
allocator_->free(&p, false);
}
}
void* LlamaCacheManager::allocate(bool is_preallocte)
{
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][allocate]");
}
void* mem_ptr{};
if (!device_free_.empty()) {
mem_ptr = device_free_.front();
device_free_.pop();
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
}
}
else if (entry_count_ < max_entry_count_) {
const auto alloc_count = std::min(chunk_size_, max_entry_count_ - entry_count_);
const size_t entry_byte_size = 2 * cache_byte_size_; // 2 for k,v
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][allocate] malloc %d", (int)alloc_count);
}
const auto chunk_ptr = allocator_->malloc(alloc_count * entry_byte_size, false);
FT_CHECK(chunk_ptr);
device_mem_.push_back(chunk_ptr);
entry_count_ += alloc_count;
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][allocate] count = %d", entry_count_);
}
for (int i = 0; i < alloc_count; ++i) {
device_free_.push((uint8_t*)chunk_ptr + entry_byte_size * i);
}
if (!is_preallocte) {
mem_ptr = device_free_.front();
device_free_.pop();
}
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
}
}
else {
mem_ptr = evict();
FT_CHECK_WITH_INFO(mem_ptr, "No enough cache entries.");
}
return mem_ptr;
}
auto LlamaCacheManager::create(uint64_t id, cudaStream_t stream) -> Sequence
{
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][create] %ld", (long)id);
}
for (const auto& e : device_cache_) {
if (e.id == id) {
if (rank_ == 0) {
TM_LOG_WARNING("[LlamaCacheManager][create] Removing conflicting id %ld", (long)id);
}
erase(id);
}
}
const auto mem_ptr = (uint8_t*)allocate(false);
check_cuda_error(cudaMemsetAsync(mem_ptr, 0, cache_byte_size_ * 2, stream));
device_cache_.push_back({
id,
max_seq_len_,
{},
0,
mem_ptr,
mem_ptr + cache_byte_size_,
{},
static_cast<uint64_t>(-1),
});
return device_cache_.back();
}
auto LlamaCacheManager::getEntryOrThrow(uint64_t id) -> std::vector<Sequence>::iterator
{
auto pred = [&](const Sequence& s) { return s.id == id; };
auto it = std::find_if(device_cache_.begin(), device_cache_.end(), pred);
if (it == device_cache_.end()) {
TM_LOG_ERROR("[LlamaCacheManager] %ld not found.\n", (long)id);
FT_CHECK(0);
}
return it;
}
auto LlamaCacheManager::fetch(uint64_t id, cudaStream_t stream) -> Sequence
{
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][fetch] %ld", (long)id);
}
auto entry = getEntryOrThrow(id);
if (entry->k_cache == nullptr) {
FT_CHECK(entry->cache_len == 0);
const auto mem_ptr = allocate(false);
check_cuda_error(cudaMemsetAsync(mem_ptr, 0, cache_byte_size_ * 2, stream));
entry->k_cache = mem_ptr;
entry->v_cache = (uint8_t*)entry->k_cache + cache_byte_size_;
}
entry->timestamp = static_cast<uint64_t>(-1);
return *entry;
}
void LlamaCacheManager::update(const Sequence& seq, cudaStream_t stream)
{
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][update] %ld", (long)seq.id);
}
auto entry = getEntryOrThrow(seq.id);
entry->timestamp = ++timestamp_;
entry->token_ids = seq.token_ids;
entry->cache_len = seq.cache_len;
FT_CHECK(seq.k_cache == entry->k_cache && seq.v_cache == entry->v_cache);
}
void LlamaCacheManager::erase(uint64_t id)
{
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][erase] %ld", (long)id);
}
auto entry = getEntryOrThrow(id);
if (entry->k_cache) {
device_free_.push(entry->k_cache);
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][erase] free = %d", (int)device_free_.size());
}
}
device_cache_.erase(entry);
}
void* LlamaCacheManager::evict()
{
FT_CHECK(!device_cache_.empty());
auto it = std::min_element(device_cache_.begin(), device_cache_.end(), [](const auto& a, const auto& b) {
return a.timestamp < b.timestamp;
});
if (it->timestamp == static_cast<uint64_t>(-1)) {
return nullptr;
}
if (rank_ == 0) {
TM_LOG_INFO("[LlamaCacheManager][evict] %ld", (long)it->id);
}
FT_CHECK(it->k_cache);
auto mem_ptr = it->k_cache;
it->k_cache = it->v_cache = nullptr;
it->cache_len = 0;
it->timestamp = static_cast<uint64_t>(-1);
return mem_ptr;
}
bool LlamaCacheManager::contains(uint64_t id) const noexcept
{
auto pred = [&](const Sequence& s) { return s.id == id; };
auto it = std::find_if(device_cache_.begin(), device_cache_.end(), pred);
return it != device_cache_.end();
}
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/logger.h"
#include <cstdint>
#include <cuda_runtime.h>
#include <queue>
#include <unordered_map>
#include <vector>
namespace turbomind {
// k-cache layout [L, H, D/x, S[s:], x]
// v-cache layout [L, H, S[s:], D/x, x]
class LlamaCacheManager {
public:
LlamaCacheManager(size_t layer_num,
size_t head_num,
size_t size_per_head,
size_t max_seq_len,
size_t elem_bits,
size_t max_entry_count,
size_t chunk_size,
int rank,
IAllocator* allocator):
layer_num_(layer_num),
head_num_(head_num),
size_per_head_(size_per_head),
max_seq_len_(max_seq_len),
elem_bits_(elem_bits),
cache_byte_size_(layer_num_ * head_num_ * max_seq_len_ * size_per_head_ * elem_bits_ / 8),
max_entry_count_(max_entry_count),
chunk_size_(chunk_size),
rank_(rank),
allocator_(allocator)
{
if (rank == 0) {
TM_LOG_INFO("[LlamaCacheManager] max_entry_count = %d", (int)max_entry_count_);
TM_LOG_INFO("[LlamaCacheManager] chunk_size = %d", (int)chunk_size_);
}
allocate(true);
}
~LlamaCacheManager();
struct Sequence {
// header
uint64_t id;
size_t max_seq_len;
// payloads
std::vector<int> token_ids; // all token ids
size_t cache_len; // cache_len == 0 -> cache miss
void* k_cache;
void* v_cache;
std::vector<uint8_t> random_state_; // states for RNGs
// for LRU policy
uint64_t timestamp;
};
Sequence create(uint64_t id, cudaStream_t stream);
Sequence fetch(uint64_t id, cudaStream_t stream);
void update(const Sequence& seq, cudaStream_t stream);
void erase(uint64_t id);
bool contains(uint64_t id) const noexcept;
private:
std::vector<Sequence>::iterator getEntryOrThrow(uint64_t id);
void* allocate(bool is_preallocte);
void* evict();
private:
const size_t layer_num_{};
const size_t head_num_{};
const size_t size_per_head_{};
const size_t max_seq_len_{};
const size_t elem_bits_{};
const size_t cache_byte_size_{};
const size_t max_entry_count_{};
const size_t chunk_size_{};
const int rank_{};
IAllocator* allocator_{};
std::queue<void*> device_free_;
std::vector<void*> device_mem_;
int entry_count_{};
uint64_t timestamp_{};
std::vector<Sequence> device_cache_;
};
} // namespace turbomind
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptContextDecoder.cc
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/debug_utils.h"
namespace turbomind {
template<typename T>
void LlamaContextDecoder<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void LlamaContextDecoder<T>::allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
attention_mask_ = (T*)allocator_->reMalloc(attention_mask_, sizeof(T) * batch_size * max_q_len * max_kv_len, false);
padding_offset_ = (int*)allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * max_q_len, false);
cu_seqlens_ = (int*)allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaContextDecoder<T>::freeBuffer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)&padding_offset_);
allocator_->free((void**)&cu_seqlens_);
allocator_->free((void**)&attention_mask_);
allocator_->free((void**)&h_pinned_token_num_ptr_, true);
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaContextDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int cache_block_seq_len,
int quant_policy)
{
h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true);
context_attention_layer_ = new LlamaContextAttentionLayer<T>(head_num_,
kv_head_num,
size_per_head_,
attn_params,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
use_fmha,
cache_block_seq_len,
quant_policy);
silu_ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
size_per_head_,
inter_size_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
}
template<typename T>
void LlamaContextDecoder<T>::forwardSelfAttn(const Session& sess,
T* attn_io,
std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
int layer,
bool is_final)
{
// TM_LOG_ERROR(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors{
{"input_query", Tensor{MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_io}},
{"attention_mask",
{MEMORY_GPU, data_type_, {sess.batch_size, 1, sess.max_query_len, sess.max_key_len}, attention_mask_}},
{"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &layer}},
{"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &is_final}},
{"padding_offset", {MEMORY_GPU, TYPE_INT32, {sess.token_num}, padding_offset_}},
{"cu_seqlens", {MEMORY_GPU, TYPE_INT32, {sess.batch_size + 1}, cu_seqlens_}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.input_length}},
{"context_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.context_length}},
{"cu_block_counts", input_tensors->at("cu_block_counts")},
{"rope_theta", input_tensors->at("rope_theta")},
{"max_seq_len", input_tensors->at("max_seq_len")}};
TensorMap self_attention_output_tensors{
{"hidden_features", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_io}},
{"key_cache", output_tensors->at("key_cache")},
{"value_cache", output_tensors->at("value_cache")},
{"tmp_k", output_tensors->at("tmp_k")},
{"tmp_v", output_tensors->at("tmp_v")}};
context_attention_layer_->forward(&self_attention_output_tensors, //
&self_attention_input_tensors,
&sess.weights->at(layer)->self_attn_weights);
}
template<typename T>
LlamaContextDecoder<T>::LlamaContextDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
hidden_units_(head_num * size_per_head),
num_layer_(num_layer),
rmsnorm_eps_(rmsnorm_eps),
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
initialize(attn_params, kv_head_num, use_fmha, cache_block_seq_len, quant_policy);
}
template<typename T>
LlamaContextDecoder<T>::~LlamaContextDecoder()
{
delete context_attention_layer_;
delete silu_ffn_layer_;
freeBuffer();
}
template<typename T>
void LlamaContextDecoder<T>::forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
FT_CHECK(false);
}
template<typename T>
void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
/**
* input tensors:
* \param decoder_input [num_token, hidden_units], float
* \param input_lengths [batch_size], int
* \param history_lengths [batch_size], int
* \param context_lengths [batch_size], int
* \param output_norm_weight [hidden_dims], float
* \param max_q_len [1], int on cpu
* \param max_kv_len [1], int on cpu
* \param max_seq_len [1], int on cpu
*
* output tensors:
* \param decoder_output [num_token, hidden_units],
* \param key_cache [num_layer, batch, local_head_num, size_per_head // x, max_seq_len, x]
* \param value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head]
* \param last_token_hidden_units [batch_size, hidden_units]
*/
Session sess{};
sess.token_num = input_tensors->at("decoder_input").shape[0];
sess.batch_size = input_tensors->at("input_lengths").shape[0];
sess.max_query_len = input_tensors->at("max_q_len").getVal<int>();
sess.max_key_len = input_tensors->at("max_kv_len").getVal<int>();
sess.weights = decoder_layer_weights;
sess.input_length = input_tensors->at("input_lengths").getPtr<int>();
sess.context_length = input_tensors->at("context_lengths").getPtr<int>();
T* decoder_input_output = input_tensors->at("decoder_input").getPtr<T>();
T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();
allocateBuffer(sess.batch_size, sess.token_num, sess.max_query_len, sess.max_key_len);
// dbg(padding_offset_);
FT_CHECK(padding_offset_);
size_t tmp_token_num{};
invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_,
&tmp_token_num, // updated token num
padding_offset_,
cu_seqlens_,
input_tensors->at("input_lengths").getPtr<int>(),
sess.batch_size,
sess.max_query_len,
stream_);
sync_check_cuda_error();
dbg(tmp_token_num, sess.token_num);
FT_CHECK(tmp_token_num == sess.token_num);
invokeCreateCausalMasks(attention_mask_,
sess.input_length,
sess.context_length,
sess.max_query_len,
sess.max_key_len,
sess.batch_size,
stream_);
sync_check_cuda_error();
// Compare(
// decoder_input_output, sess.token_num * hidden_units_, Concat("context_decoder_input", 0), kCmpRead, stream_);
/////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(decoder_output,
decoder_input_output,
decoder_layer_weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
sess.token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
for (size_t layer = 0; layer < num_layer_; ++layer) {
/////////////////////////////////////////////
/// self-attention
forwardSelfAttn(sess, decoder_output, output_tensors, input_tensors, layer, false);
invokeFusedAddBiasResidualRMSNorm(decoder_input_output,
decoder_output,
decoder_layer_weights->at(layer)->self_attn_weights.output.bias,
decoder_layer_weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
sess.token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
////////////////////////////////////////////
/// feed-forward network
TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, decoder_output}}};
TensorMap ffn_outputs{
{"ffn_output", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, decoder_output}}};
silu_ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &decoder_layer_weights->at(layer)->ffn_weights);
auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
invokeFusedAddBiasResidualRMSNorm(decoder_input_output, //
decoder_output,
decoder_layer_weights->at(layer)->ffn_weights.output.bias,
scale_weight,
rmsnorm_eps_,
sess.token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
}
template class LlamaContextDecoder<float>;
template class LlamaContextDecoder<half>;
} // namespace turbomind
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptContextDecoder.h
#pragma once
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class LlamaContextDecoder: public BaseLayer {
protected:
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);
void freeBuffer() override;
void initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int cache_block_seq_len,
int quant_policy);
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t hidden_units_;
float rmsnorm_eps_;
NcclParam tensor_para_;
T* attention_mask_{};
int* padding_offset_{};
int* cu_seqlens_{}; // cu for cumulative
size_t* h_pinned_token_num_ptr_{};
LlamaContextAttentionLayer<T>* context_attention_layer_{};
LlamaFfnLayer<T>* silu_ffn_layer_{};
const DataType data_type_;
struct Session {
size_t batch_size;
size_t token_num;
size_t max_query_len;
size_t max_key_len;
int* input_length{};
int* context_length{};
const std::vector<LlamaDecoderLayerWeight<T>*>* weights;
};
void forwardSelfAttn(const Session& sess,
T* attn_io,
std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
int layer,
bool is_final);
public:
LlamaContextDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy);
~LlamaContextDecoder() override;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
virtual void forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace turbomind
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022, SK Telecom Authored by A. Dialog
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.cc
#include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/models/llama/llama_utils.h"
namespace turbomind {
template<typename T>
LlamaDecoder<T>::LlamaDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int cache_block_seq_len,
int quant_policy):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
num_layer_(num_layer),
hidden_units_(head_num * size_per_head),
rmsnorm_eps_(rmsnorm_eps),
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize(attn_params, kv_head_num, cache_block_seq_len, quant_policy);
}
template<typename T>
LlamaDecoder<T>::~LlamaDecoder()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
delete self_attention_layer_;
delete silu_ffn_layer_;
}
template<typename T>
void LlamaDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
int cache_block_seq_len,
int quant_policy)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
self_attention_layer_ = new LlamaDecoderSelfAttentionLayer<T>(head_num_,
kv_head_num,
size_per_head_,
attn_params,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
cache_block_seq_len,
quant_policy);
silu_ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
size_per_head_,
inter_size_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
}
template<typename T>
void LlamaDecoder<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void LlamaDecoder<T>::allocateBuffer(size_t batch_size)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaDecoder<T>::freeBuffer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaDecoder<T>::forwardSelfAttn(const LlamaDecoder::Session& sess,
T* attn_io,
const std::unordered_map<std::string, Tensor>* input_tensors,
size_t layer)
{
NvtxScope scope("self_attn");
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors(*input_tensors);
self_attention_input_tensors.insert("input_query",
{MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, attn_io});
const int layer_id = layer;
self_attention_input_tensors.insert("layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id});
auto& k_cache = *sess.k_cache;
auto& v_cache = *sess.v_cache;
TensorMap self_attention_output_tensors{
{"attention_output", {MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, attn_io}},
{"key_cache", k_cache},
{"value_cache", v_cache},
};
self_attention_layer_->forward(&self_attention_output_tensors, //
&self_attention_input_tensors,
&sess.weights->at(layer)->self_attn_weights);
}
template<typename T>
void LlamaDecoder<T>::forwardFfn(const LlamaDecoder::Session& sess, T* ffn_io, size_t layer)
{
TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, ffn_io}}};
TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, ffn_io}}};
silu_ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &sess.weights->at(layer)->ffn_weights);
}
template<typename T>
void LlamaDecoder<T>::forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
FT_CHECK(false);
}
template<typename T>
void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
/**
* input_tensors:
* \param decoder_input [batch_size, hidden_dims]
* \param sequence_lengths [batch_size] int
* \param output_norm_weight [hidden_dims]
* \param step [1] on cpu
* \param ite [1] on cpu
* \param finished [batch_size] bool
* \param total_padding_tokens [batch_size], int
* \param max_seq_len [1] on cpu
* \param masked_tokens [batch_size, memory_len] bool (optional), NOT USED YET
*
* output_tensors:
* \param decoder_output [batch_size, hidden_dimension]
* \param key_cache [batch_size] uint64_t
* \param value_cache [batch_size] uint64_t
*/
// for the shape of key cache, refer to decoder_masked_multihead_attention_template.hpp
NvtxScope forward_scope("decoder_forward");
Session sess{};
sess.batch_size = input_tensors->at("decoder_input").shape[0];
sess.weights = decoder_layer_weights;
allocateBuffer(sess.batch_size);
sess.k_cache = &output_tensors->at("key_cache");
sess.v_cache = &output_tensors->at("value_cache");
T* decoder_input = input_tensors->at("decoder_input").getPtr<T>();
T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();
int step = input_tensors->at("step").getVal<int>();
// Compare(decoder_input, sess.batch_size * hidden_units_, Concat("decoder_input", 0, step), kCmpRead, stream_);
////////////////////////////////////////////
/// RMSNorm
{
NvtxScope rms_norm_scope("rms_norm_0");
invokeRootMeanSquareNorm(decoder_output,
decoder_input,
decoder_layer_weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
for (size_t layer = 0; layer < num_layer_; ++layer) {
NvtxScope layer_scope("decode_layer");
// output: self_attn_output_, k_cache, v_cache = self_attn(decoder_normed_input_)
forwardSelfAttn(sess, decoder_output, input_tensors, layer);
{
NvtxScope rms_norm_scope("rms_norm_1");
invokeFusedAddBiasResidualRMSNorm(decoder_input,
decoder_output,
decoder_layer_weights->at(layer)->self_attn_weights.output.bias,
decoder_layer_weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
// decoder_layer_output_ = ffn(decoder_normed_input_)
forwardFfn(sess, decoder_output, layer);
{
NvtxScope rms_norm_scope("rms_norm_2");
auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
invokeFusedAddBiasResidualRMSNorm(decoder_input, //
decoder_output,
decoder_layer_weights->at(layer)->ffn_weights.output.bias,
scale_weight,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
}
template class LlamaDecoder<half>;
template class LlamaDecoder<float>;
} // namespace turbomind
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022, SK Telecom Authored by A. Dialog
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/models/multi_gpu_gpt/ParallelGptDecoder.h
#include "src/turbomind/layers/BaseLayer.h"
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/custom_ar_comm.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class LlamaDecoder: public BaseLayer {
protected:
void allocateBuffer() override; // deprecated
void allocateBuffer(size_t batch_size);
void freeBuffer() override;
void
initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, int cache_block_seq_len, int quant_policy);
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t hidden_units_;
float rmsnorm_eps_;
NcclParam tensor_para_;
LlamaDecoderSelfAttentionLayer<T>* self_attention_layer_{};
LlamaFfnLayer<T>* silu_ffn_layer_{};
const DataType data_type_;
struct Session {
size_t batch_size;
Tensor* k_cache;
Tensor* v_cache;
const std::vector<LlamaDecoderLayerWeight<T>*>* weights;
};
void forwardSelfAttn(const Session& sess,
T* attn_io,
const std::unordered_map<std::string, Tensor>* input_tensors,
size_t layer);
void forwardFfn(const LlamaDecoder::Session& sess, T* ffn_io, size_t layer);
public:
LlamaDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int cache_block_seq_len,
int quant_policy);
~LlamaDecoder() override;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
virtual void forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace turbomind
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.cc
#include "src/turbomind/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/turbomind/kernels/decoder_masked_multihead_attention.h"
#include "src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
#include "src/turbomind/utils/nvtx_utils.h"
#include <string>
// #include <glog/logging.h>
namespace turbomind {
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::allocateBuffer(size_t batch_size)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
const size_t local_q_kv_head_num = local_head_num_ + 2 * local_kv_head_num_;
qkv_buf_ = reinterpret_cast<T*>(
allocator_->reMalloc(qkv_buf_, sizeof(T) * batch_size * local_q_kv_head_num * size_per_head_, false));
context_buf_ =
reinterpret_cast<T*>(allocator_->reMalloc(context_buf_, sizeof(T) * batch_size * local_hidden_units_, false));
workspace_ = (float*)allocator_->reMalloc(
workspace_, sizeof(float) * batch_size * local_head_num_ * kMaxSplitK * (size_per_head_ + 2), false);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
allocator_->free((void**)(&qkv_buf_));
allocator_->free((void**)(&context_buf_));
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* output_tensors,
const TensorMap* input_tensors,
const LlamaAttentionWeight<T>* weights)
{
/**
* input tensors:
* \param input_query [batch_size, hidden_units],
* \param sequence_lengths [batch_size]
* \param step [1] on cpu
* \param finished [batch_size]
* \param total_padding_tokens [batch_size]
* \param layer_id [1], int on cpu
* \param max_seq_len [1] on cpu
* \param masked_tokens [batch_size, memory_len], (optional), NOT USED YET
* \param cache_indirection [batch_size / beam_width, beam_width, memory_max_len] (optional)
*
* output tensors:
* \param attention_output [batch_size, hidden_units],
* \param key_cache [batch, local_head_num, memory_max_len, size_per_head]
* \param value_cache [batch, local_head_num, memory_max_len, size_per_head]
*/
const T* input_query_data = input_tensors->getPtr<T>("input_query");
const int* sequence_lengths_data = input_tensors->getPtr<int>("sequence_lengths");
const bool* finished_data = input_tensors->getPtr<bool>("finished");
const int sum_seq_len = input_tensors->getVal<int>("sum_seq_len");
const int max_seq_len = input_tensors->getVal<int>("max_seq_len");
T* hidden_features_data = output_tensors->getPtr<T>("attention_output");
T** key_cache_ptrs = output_tensors->getPtr<T*>("key_cache");
T** value_cache_ptrs = output_tensors->getPtr<T*>("value_cache");
int* cu_block_counts = input_tensors->at("cu_block_counts").getPtr<int>();
const int layer_id = input_tensors->getVal<int>("layer_id");
const int step = input_tensors->getVal<int>("step");
// const int step_1 = step - 1;
const int batch_size = input_tensors->at("input_query").shape[0];
const float* rope_theta = input_tensors->getPtr<const float>("rope_theta", nullptr);
allocateBuffer(batch_size);
// for (int i = 0; i < batch_size; ++i) {
// if (gSequenceIds(i) == 1) {
// Compare((T*)input_query_data + hidden_units_ * i,
// hidden_units_,
// Concat("query", gSequenceIds(i), seqlens[i], layer_id),
// compare_mode,
// stream_);
// }
// }
{
NvtxScope scope("qkv_gemm");
linear_.forward(qkv_buf_, input_query_data, batch_size, weights->qkv);
}
// if (layer_id == 0) {
// Compare(qkv_buf_, batch_size * 3 * hidden_units_, Concat("qkv_buf", step, layer_id), kCmpRead, stream_);
// }
const auto layer_offset = layer_id * local_kv_head_num_ * kv_cache_block_len_ * size_per_head_;
// const int memory_len = max_seq_len;
DecoderMultiHeadAttentionParams<T> params{};
params.out = context_buf_;
params.q = qkv_buf_;
params.k = params.q + local_head_num_ * size_per_head_;
params.v = params.k + local_kv_head_num_ * size_per_head_;
params.stride = (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_;
params.q_bias = weights->qkv.bias;
params.k_bias = params.q_bias + local_head_num_ * size_per_head_;
params.v_bias = params.k_bias + local_kv_head_num_ * size_per_head_;
params.batch_size = batch_size;
params.cu_block_cnts = cu_block_counts;
params.k_cache_block_ptrs = (void**)key_cache_ptrs;
params.v_cache_block_ptrs = (void**)value_cache_ptrs;
params.kv_cache_block_size = kv_cache_block_len_;
params.finished = finished_data;
params.per_sample_length = sequence_lengths_data;
params.rope_theta = rope_theta;
params.layer_offset = layer_offset;
params.num_heads = local_head_num_;
params.num_kv_heads = local_kv_head_num_;
params.size_per_head = size_per_head_;
params.inv_sqrt_dh = 1.f / std::sqrt((float)params.size_per_head);
params.rotary_embedding_dim = size_per_head_;
params.rotary_embedding_base = params_.rotary_embedding_base;
params.max_position_embeddings = params_.max_position_embeddings;
// params.use_dynamic_ntk = params_.use_dynamic_ntk;
params.use_logn_attn = params_.use_logn_attn;
params.partial_O = workspace_;
params.partial_M = params.partial_O + batch_size * local_head_num_ * kMaxSplitK * size_per_head_;
params.partial_L = params.partial_M + batch_size * local_head_num_ * kMaxSplitK;
// avg_batch_size = sum_seq_len / max_seq_len
// max_split_k = kMaxSplitK / avg_batch_size
// max_split_k' = min(max_split_k, max_seq_lens / kSliceLen)
const float avg_batch_size = max_seq_len ? (float)sum_seq_len / max_seq_len : 1;
FT_CHECK(avg_batch_size >= 1.f);
const int max_split_k = std::max(1, (int)std::ceil(kMaxSplitK / avg_batch_size));
// if (layer_id == 0) {
// TM_LOG_INFO("avg_batch_size = %.1f, max_split_k = %d", avg_batch_size, max_split_k);
// }
params.max_split_k = max_split_k;
params.max_seq_len = max_seq_len;
params.arch = arch_;
params.stream = stream_;
params.quant_policy = quant_policy_;
std::copy(weights->past_kv_scale.begin(), weights->past_kv_scale.end(), std::begin(params.kv_quant_params));
{
NvtxScope scope("decoder_multihead_attention");
DispatchDecoderMultiheadAttention<T>(params);
}
// for (int i = 0; i < batch_size; ++i) {
// if (gSequenceIds(i) == 1) {
// Compare((T*)context_buf_ + hidden_units_ * i,
// hidden_units_,
// Concat("context_buf", gSequenceIds(i), seqlens[i], layer_id),
// compare_mode,
// stream_);
// }
// }
// if (layer_id == 0) {
// Compare(context_buf_, batch_size * hidden_units_, Concat("context_buf", step, layer_id), kCmpRead, stream_);
// }
{
NvtxScope scope("o_gemm");
linear_.forward(hidden_features_data, context_buf_, batch_size, weights->output);
}
if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(
hidden_features_data, hidden_features_data, batch_size * hidden_units_, tensor_para_, stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
// LOG(WARNING);
}
template class LlamaDecoderSelfAttentionLayer<float>;
template class LlamaDecoderSelfAttentionLayer<half>;
} // namespace turbomind
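The avg_batch_size / max_split_k heuristic in LlamaDecoderSelfAttentionLayer<T>::forward above throttles split-K parallelism as the effective batch grows. A small numeric sketch (input values are illustrative; kMaxSplitK = 16 matches the constant declared in the layer's header below):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    const int kMaxSplitK  = 16;    // matches the header's constant
    const int sum_seq_len = 4096;  // illustrative: total context length across the batch
    const int max_seq_len = 1024;  // illustrative: longest context in the batch

    const float avg_batch_size = max_seq_len ? (float)sum_seq_len / max_seq_len : 1;        // 4.0
    const int   max_split_k    = std::max(1, (int)std::ceil(kMaxSplitK / avg_batch_size));  // 4

    std::printf("avg_batch_size = %.1f, max_split_k = %d\n", avg_batch_size, max_split_k);
    return 0;
}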
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/DecoderSelfAttentionLayer.h
#pragma once
#include "src/turbomind/models/llama/LlamaDenseWeight.h"
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class LlamaDecoderSelfAttentionLayer {
public:
void freeBuffer();
void allocateBuffer(size_t batch_size);
LlamaDecoderSelfAttentionLayer(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
const LlamaAttentionParams& attn_params,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int cache_block_seq_len,
int quant_policy):
head_num_(head_num),
kv_head_num_(kv_head_num),
size_per_head_(size_per_head),
hidden_units_(head_num * size_per_head),
local_head_num_(head_num / tensor_para.world_size_),
local_kv_head_num_(kv_head_num_ / tensor_para.world_size_),
local_hidden_units_(hidden_units_ / tensor_para.world_size_),
params_(attn_params),
tensor_para_(tensor_para),
stream_(stream),
linear_(cublas_wrapper, stream),
allocator_(allocator),
kv_cache_block_len_(cache_block_seq_len),
is_free_buffer_after_forward_(is_free_buffer_after_forward),
quant_policy_(quant_policy)
{
arch_ = getSMVersion();
}
~LlamaDecoderSelfAttentionLayer()
{
freeBuffer();
}
void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaAttentionWeight<T>* weights);
private:
const size_t head_num_;
const size_t kv_head_num_;
const size_t size_per_head_;
const size_t hidden_units_;
const size_t local_head_num_;
const size_t local_kv_head_num_;
const size_t local_hidden_units_;
const size_t kv_cache_block_len_;
const bool is_free_buffer_after_forward_;
const int quant_policy_;
const LlamaAttentionParams& params_;
NcclParam tensor_para_;
cudaStream_t stream_;
IAllocator* allocator_;
LlamaLinear<T> linear_;
T* qkv_buf_ = nullptr;
T* context_buf_ = nullptr;
static constexpr int kMaxSplitK = 16; // must be <= WARP_SIZE
float* workspace_ = nullptr;
bool is_allocate_buffer_{};
int arch_{};
};
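// Editor's illustrative sketch (not part of the original diff): the
// tensor-parallel partitioning performed in the constructor's initializer
// list above, written out as a tiny standalone helper. The names below are
// hypothetical and exist only for illustration. For example, head_num = 32,
// kv_head_num = 32, size_per_head = 128 and world_size = 2 give 16 local
// query heads, 16 local KV heads and 2048 local hidden units per rank.
struct TpSplitExample {
    size_t local_head_num;
    size_t local_kv_head_num;
    size_t local_hidden_units;
};
inline TpSplitExample tp_split_example(size_t head_num, size_t kv_head_num, size_t size_per_head, size_t world_size)
{
    const size_t hidden_units = head_num * size_per_head;  // 32 * 128 = 4096 in the example
    return TpSplitExample{head_num / world_size,       // query heads owned by this rank
                          kv_head_num / world_size,    // key/value heads owned by this rank
                          hidden_units / world_size};  // per-rank slice of the hidden dimension
}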
} // namespace turbomind
......@@ -31,6 +31,7 @@
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/models/llama/unified_decoder.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
......@@ -47,19 +48,14 @@ LlamaV2<T>::LlamaV2(size_t head_num,
size_t inter_size,
size_t num_layer,
size_t vocab_size,
const LlamaAttentionParams& attn_params,
float norm_eps,
int max_batch_size,
int max_context_token_num,
int session_len,
int step_length,
const LlamaAttentionParams& attn_params,
int start_id,
int end_id,
float cache_max_block_count,
int cache_block_seq_len,
int cache_chunk_size,
int quant_policy,
bool use_context_fmha,
const EngineParams& engine_params,
std::shared_ptr<SharedState> shared_state,
LlamaWeight<T>* weights,
NcclParam tensor_para,
......@@ -89,7 +85,6 @@ LlamaV2<T>::LlamaV2(size_t head_num,
is_free_buffer_after_forward_(is_free_buffer_after_forward),
cuda_device_prop_(cuda_device_prop),
debug_(isDebug()),
step_length_(step_length),
shared_state_(shared_state)
{
......@@ -99,38 +94,7 @@ LlamaV2<T>::LlamaV2(size_t head_num,
vocab_size_padded_ =
(vocab_size_padded_ + tensor_para_.world_size_ - 1) / tensor_para_.world_size_ * tensor_para_.world_size_;
size_t elem_bits = 0;
if (quant_policy & QuantPolicy::kCacheKVInt8) {
elem_bits = sizeof(int8_t) * 8;
}
else {
elem_bits = sizeof(T) * 8;
}
const size_t local_kv_head_num = kv_head_num / tensor_para.world_size_;
auto sequence_manager = std::make_unique<SequenceManager>(num_layer,
local_kv_head_num,
size_per_head_,
cache_block_seq_len,
cache_max_block_count,
cache_chunk_size,
elem_bits,
tensor_para_.rank_,
allocator);
const size_t max_session_len = sequence_manager->max_block_count() * cache_block_seq_len;
if (max_session_len < session_len) {
if (tensor_para.rank_ == 0) {
TM_LOG_WARNING("No enough blocks for `session_len` (%d), `session_len` truncated to %d.",
session_len,
max_session_len);
}
session_len = max_session_len;
}
batch_ = std::make_unique<LlamaBatch<T>>(
max_batch_size, max_context_token_num, session_len, std::move(sequence_manager), this);
batch_ = std::make_unique<LlamaBatch<T>>(engine_params, cache_block_seq_len, quant_policy, this);
initialize(attn_params, kv_head_num, use_context_fmha, cache_block_seq_len, quant_policy);
......@@ -141,9 +105,8 @@ LlamaV2<T>::LlamaV2(size_t head_num,
template<typename T>
LlamaV2<T>::~LlamaV2()
{
delete decoder_;
unified_decoder_.reset();
delete dynamic_decode_layer_;
delete context_decoder_;
}
template<typename T>
......@@ -155,36 +118,21 @@ void LlamaV2<T>::initialize(const LlamaAttentionParams& attn_params,
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
context_decoder_ = new LlamaContextDecoder<T>(head_num_,
kv_head_num,
size_per_head_,
inter_size_,
num_layer_,
attn_params,
rmsnorm_eps_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
use_context_fmha,
cache_block_seq_len,
quant_policy);
decoder_ = new LlamaDecoder<T>(head_num_,
kv_head_num,
size_per_head_,
inter_size_,
num_layer_,
attn_params,
rmsnorm_eps_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
cache_block_seq_len,
quant_policy);
unified_decoder_.reset(new UnifiedDecoder<T>(head_num_,
kv_head_num,
size_per_head_,
inter_size_,
num_layer_,
attn_params,
rmsnorm_eps_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
use_context_fmha,
cache_block_seq_len,
quant_policy));
dynamic_decode_layer_ = new DynamicDecodeLayer<float>(vocab_size_,
vocab_size_padded_,
......@@ -218,31 +166,32 @@ void LlamaV2<T>::embeddingLookup(T* embeddings, const int* token_ids_buf, int ba
}
template<typename T>
void LlamaV2<T>::contextDecode(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
void** tmp_k_ptrs,
void** tmp_v_ptrs,
T* context_decoder_input_buf,
T* context_decoder_output_buf,
const int* input_ids,
const int* input_length,
const int* context_length,
const int* cu_block_counts,
const float* rope_theta,
size_t token_num,
size_t max_input_len,
size_t max_context_len,
size_t session_len,
size_t batch_size)
void LlamaV2<T>::forwardUnified(T* out,
T* decoder_output,
T* decoder_input,
void** k_block_ptrs,
void** v_block_ptrs,
const int* input_ids,
const int* cu_block_cnts,
const float* rope_theta,
const bool* dc_finished,
const int* pf_input_length,
const int* pf_context_length,
T** pf_tmp_k_ptrs,
T** pf_tmp_v_ptrs,
size_t token_num,
int dc_batch_size,
int dc_step,
int dc_sum_seq_len,
int dc_max_seq_len,
int pf_batch_size,
int pf_max_input_len,
int pf_max_context_len,
int pf_session_len)
{
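// Editor's annotation (inferred from the dc_/pf_ prefixes and how the buffers
// are used below): dc_* parameters describe decoding requests, which extend
// their block-based KV cache one token per step, while pf_* parameters
// describe prefill (context) requests that process chunks of prompt tokens.
// Both kinds are handled by a single pass through the unified decoder; the
// batch dimension used below is dc_batch_size + pf_batch_size.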
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (tensor_para_.rank_ == 0) {
TM_LOG_INFO("context decoding start");
}
invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf,
invokeInputIdsEmbeddingLookupPosEncoding(decoder_input,
nullptr, // processed somewhere else
weights_->pre_decoder_embedding_table,
static_cast<T*>(nullptr),
......@@ -256,81 +205,32 @@ void LlamaV2<T>::contextDecode(T* decoder_output,
stream_);
sync_check_cuda_error();
const auto dtype = getTensorType<T>();
const auto bsz = batch_size;
const int max_q_len = max_input_len;
const int max_kv_len = max_context_len;
const int max_seq_len = session_len;
std::unordered_map<std::string, Tensor> decoder_input_tensors{
{"decoder_input", {MEMORY_GPU, dtype, {token_num, hidden_units_}, context_decoder_input_buf}},
{"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units_}, weights_->output_norm_weight}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, input_length}},
{"context_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, context_length}},
{"max_q_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_q_len}},
{"max_kv_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_kv_len}},
{"max_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_seq_len}},
{"rope_theta", {MEMORY_GPU, TYPE_FP32, {hidden_units_}, rope_theta}},
{"cu_block_counts", {MEMORY_GPU, TYPE_INT32, {batch_size}, cu_block_counts}}};
std::unordered_map<std::string, Tensor> decoder_output_tensors{
{"decoder_output", {MEMORY_GPU, dtype, {token_num, hidden_units_}, context_decoder_output_buf}},
{"key_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, k_cache_ptr}},
{"value_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, v_cache_ptr}},
{"tmp_k", {MEMORY_GPU, TYPE_UINT64, {bsz}, tmp_k_ptrs}},
{"tmp_v", {MEMORY_GPU, TYPE_UINT64, {bsz}, tmp_v_ptrs}},
{"last_token_hidden_units", {MEMORY_GPU, dtype, {bsz, hidden_units_}, decoder_output}}};
context_decoder_->forward(&decoder_output_tensors, &decoder_input_tensors, &weights_->decoder_layer_weights);
if (tensor_para_.rank_ == 0) {
TM_LOG_INFO("context decoding end");
}
}
template<typename T>
void LlamaV2<T>::decoderForward(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* decoder_input,
const int* sequence_length,
const bool* finished,
const int* cu_block_counts,
const float* rope_theta,
int step,
int ite,
int sum_seq_len,
int max_seq_len,
size_t batch_size)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
const auto dtype = getTensorType<T>();
// max_input_length is not used w/o linear_bias_slopes
// sequence_lengths_ will be incremented in dynamic decode
std::unordered_map<std::string, Tensor> decoder_input_tensors{
{"decoder_input", {MEMORY_GPU, dtype, {batch_size, hidden_units_}, decoder_input}},
{"sequence_lengths", {MEMORY_GPU, TYPE_INT32, {batch_size}, sequence_length}},
{"cu_block_counts", {MEMORY_GPU, TYPE_INT32, {batch_size}, cu_block_counts}},
{"sum_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &sum_seq_len}},
{"max_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_seq_len}},
{"finished", {MEMORY_GPU, TYPE_BOOL, {batch_size}, finished}},
{"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units_}, weights_->output_norm_weight}},
{"rope_theta", {MEMORY_GPU, TYPE_FP32, {batch_size}, rope_theta}},
{"step", {MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"ite", {MEMORY_CPU, TYPE_INT32, {1}, &ite}},
};
// LOG(ERROR) << key_cache_ << " " << value_cache_;
std::unordered_map<std::string, Tensor> decoder_output_tensors{
{"decoder_output", {MEMORY_GPU, dtype, {batch_size, hidden_units_}, decoder_output}},
{"key_cache", {MEMORY_GPU, TYPE_UINT64, {batch_size}, k_cache_ptr}},
{"value_cache", {MEMORY_GPU, TYPE_UINT64, {batch_size}, v_cache_ptr}},
};
decoder_->forward(&decoder_output_tensors, &decoder_input_tensors, &weights_->decoder_layer_weights);
const auto dtype = getTensorType<T>();
const size_t bsz = dc_batch_size + pf_batch_size;
TensorMap inputs{{"decoder_input", {MEMORY_GPU, dtype, {token_num, hidden_units_}, decoder_input}},
{"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units_}, weights_->output_norm_weight}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, pf_input_length}},
{"context_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, pf_context_length}},
{"dc_batch_size", {MEMORY_CPU, TYPE_INT32, {1}, &dc_batch_size}},
{"dc_sum_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &dc_sum_seq_len}},
{"dc_max_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &dc_max_seq_len}},
{"finished", {MEMORY_GPU, TYPE_BOOL, {bsz}, dc_finished}},
{"pf_batch_size", {MEMORY_CPU, TYPE_INT32, {1}, &pf_batch_size}},
{"pf_max_q_len", {MEMORY_CPU, TYPE_INT32, {1}, &pf_max_input_len}},
{"pf_max_k_len", {MEMORY_CPU, TYPE_INT32, {1}, &pf_max_context_len}},
{"session_len", {MEMORY_CPU, TYPE_INT32, {1}, &pf_session_len}},
{"rope_theta", {MEMORY_GPU, TYPE_FP32, {hidden_units_}, rope_theta}},
{"cu_block_counts", {MEMORY_GPU, TYPE_INT32, {bsz}, cu_block_cnts}}};
TensorMap outputs{{"decoder_output", {MEMORY_GPU, dtype, {token_num, hidden_units_}, decoder_output}},
{"key_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, k_block_ptrs}},
{"value_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, v_block_ptrs}},
{"tmp_k", {MEMORY_GPU, TYPE_UINT64, {bsz}, pf_tmp_k_ptrs}},
{"tmp_v", {MEMORY_GPU, TYPE_UINT64, {bsz}, pf_tmp_v_ptrs}},
{"last_token_hidden_units", {MEMORY_GPU, dtype, {bsz, hidden_units_}, out}}};
unified_decoder_->forward(&outputs, &inputs, &weights_->decoder_layer_weights);
}
template<typename T>
......
......@@ -24,12 +24,11 @@
#include "src/turbomind/layers/DynamicDecodeLayer.h"
#include "src/turbomind/models/llama/Barrier.h"
#include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/models/llama/LlamaWeight.h"
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/models/llama/unified_decoder.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/instance_comm.h"
......@@ -59,19 +58,14 @@ public:
size_t inter_size,
size_t num_layer,
size_t vocab_size,
const LlamaAttentionParams& attn_params,
float norm_eps,
int max_batch_size,
int max_context_token_num,
int session_len,
int step_length,
const LlamaAttentionParams& attn_params,
int start_id,
int end_id,
float cache_max_block_count,
int cache_block_seq_len,
int cache_chunk_size,
int quant_policy,
bool use_context_fmha,
const EngineParams& engine_params,
std::shared_ptr<SharedState> shared_state,
LlamaWeight<T>* weights,
NcclParam tensor_para,
......@@ -113,37 +107,28 @@ private:
void embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step);
void contextDecode(T* decoder_output,
uintptr_t* k_block_ptrs,
uintptr_t* v_block_ptrs,
void** k_tmp_ptrs,
void** v_tmp_ptrs,
T* context_decoder_input_buf,
T* context_decoder_output_buf,
const int* input_ids,
const int* input_length,
const int* context_length,
const int* cu_block_counts,
const float* rope_theta,
size_t token_num,
size_t max_input_len,
size_t max_context_len,
size_t session_len,
size_t batch_size);
void decoderForward(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
void forwardUnified(T* out,
T* decoder_output,
T* decoder_input,
const int* sequence_length,
const bool* finished,
const int* cu_block_counts,
void** k_block_ptrs,
void** v_block_ptrs,
const int* input_ids,
const int* cu_block_cnts,
const float* rope_theta,
int step,
int ite,
int sum_seq_len,
int max_seq_len,
size_t batch_size);
const bool* dc_finished,
const int* pf_input_length,
const int* pf_context_length,
T** pf_tmp_k_ptrs,
T** pf_tmp_v_ptrs,
size_t token_num,
int dc_batch_size,
int dc_step,
int dc_sum_seq_len,
int dc_max_seq_len,
int pf_batch_size,
int pf_max_input_len,
int pf_max_context_len,
int pf_session_len);
void postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size);
......@@ -195,12 +180,11 @@ private:
const bool debug_{false};
LlamaWeight<T>* weights_{};
LlamaDecoder<T>* decoder_{};
LlamaContextDecoder<T>* context_decoder_{};
DynamicDecodeLayer<float>* dynamic_decode_layer_{};
LlamaWeight<T>* weights_{};
std::unique_ptr<UnifiedDecoder<T>> unified_decoder_;
DynamicDecodeLayer<float>* dynamic_decode_layer_{};
const int step_length_;
std::shared_ptr<SharedState> shared_state_;
ffi_api_lock_ctrl_t ffi_lock_;
std::unique_ptr<LlamaBatch<T>> batch_;
......