Unverified Commit 7f943a26 authored by Li Zhang, committed by GitHub

Unify prefill & decode passes (#775)

* Unify prefill and decode passes

* dynamic split-fuse

* refactor

* correct input count calculation

* remove unused

* lint

* lint

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build

* fix msvc build
parent 2ba90822
......@@ -13,8 +13,8 @@
namespace turbomind {
struct Request {
uint64_t id;
uint64_t priority;
uint64_t id; // sequence id
uint64_t unique_id; // monotonically increasing
bool start_flag;
bool end_flag;
......
......@@ -36,7 +36,7 @@ SequenceManager::SequenceManager(size_t layer_num,
const Sequence* SequenceManager::Create(uint64_t id)
{
Sequence sequence{id, {}, {}, {}, {}, {}, {}, 0.f};
Sequence sequence{id};
auto it = sequences_.find(id);
if (it != sequences_.end()) {
......@@ -152,18 +152,23 @@ struct Schedule {
int last;
int input_count1;
int input_count2;
Sequences active;
std::vector<int> block_counts;
Sequences inactive;
Sequences victims;
Schedule(Snapshot snapshot, int size):
Schedule(Snapshot snapshot, int size, int _input_count1, int _input_count2):
free(snapshot.free),
cached(snapshot.cached),
last(size),
use_count_(std::move(snapshot.use_count)),
unlocked_(size),
it_(size)
it_(size),
input_count1(_input_count1),
input_count2(_input_count2)
{
}
......@@ -208,6 +213,7 @@ std::ostream& operator<<(std::ostream& os, const Schedule& s)
struct Transaction {
int index_;
int block_count_;
int input_count_;
int allocate_{};
int evict_{};
......@@ -218,44 +224,45 @@ struct Transaction {
const Sequences& sequences_;
Schedule& schedule_;
explicit Transaction(const Sequences& sequences, int index, int block_count, Schedule& sched):
sequences_(sequences), schedule_(sched), index_(index), block_count_(block_count)
explicit Transaction(const Sequences& sequences, int index, int block_count, int input_count, Schedule& sched):
sequences_(sequences), schedule_(sched), index_(index), block_count_(block_count), input_count_(input_count)
{
}
void Process()
{
int count = block_count_;
if (schedule_.input_count1 > 0) {
int count = block_count_;
int tmp = std::min(schedule_.free, count);
count -= tmp;
allocate_ += tmp;
int tmp = std::min(schedule_.free, count);
count -= tmp;
allocate_ += tmp;
tmp = std::min(schedule_.cached, count);
count -= tmp;
evict_ += tmp;
tmp = std::min(schedule_.cached, count);
count -= tmp;
evict_ += tmp;
for (int vidx = schedule_.last - 1; count && vidx > index_; --vidx) {
if (sequences_[vidx]->status == Sequence::kCached) {
continue;
}
victims_.push_back(sequences_[vidx]);
preempt_ += schedule_.Unlock(sequences_, vidx);
for (int vidx = schedule_.last - 1; count && vidx > index_; --vidx) {
if (sequences_[vidx]->status == Sequence::kCached) {
continue;
if (count <= preempt_) {
evict_ += count;
count -= count;
schedule_.last = vidx; // ! modifying `schedule_.last` is part of commit
break;
}
}
victims_.push_back(sequences_[vidx]);
preempt_ += schedule_.Unlock(sequences_, vidx);
if (count <= preempt_) {
evict_ += count;
count -= count;
schedule_.last = vidx; // ! modifying `schedule_.last` is part of commit
break;
if (count == 0) {
return Commit();
}
}
if (count == 0) {
Commit();
}
else {
schedule_.inactive.push_back(sequences_[index_]);
}
const_cast<Sequence*>(sequences_[index_])->input_length = 0;
schedule_.inactive.push_back(sequences_[index_]);
}
void Commit()
......@@ -276,6 +283,13 @@ struct Transaction {
// update active sequences
schedule_.active.push_back(sequences_[index_]);
schedule_.block_counts.push_back(block_count_);
if (input_count_ > schedule_.input_count2) {
input_count_ = schedule_.input_count1;
}
schedule_.input_count1 -= input_count_;
schedule_.input_count2 -= input_count_;
const_cast<Sequence*>(sequences_[index_])->input_length = input_count_;
}
};
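The token-budget bookkeeping introduced in Commit() is easier to follow in isolation. The sketch below restates the arithmetic from the hunk above; reading input_count1 as the base per-iteration token budget and input_count2 as the extended (base plus extra) budget is an assumption made for illustration, not something stated in the code.

// Restatement of the budget clamp in Commit() (illustrative only).
// `budget1`/`budget2` stand in for `input_count1`/`input_count2`.
inline int GrantInputCount(int requested, int& budget1, int& budget2)
{
    // A request exceeding the extended budget is truncated to what remains of
    // the base budget, deferring the rest of the prefill to a later iteration.
    const int granted = requested > budget2 ? budget1 : requested;
    budget1 -= granted;
    budget2 -= granted;
    return granted;
}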
......@@ -308,6 +322,25 @@ void SequenceManager::SortByPriority(Sequences& sequences,
context_lengths.swap(tmp_lengths);
}
// template<class P, class... Ts>
// void SortByPriority(const std::vector<P>& priorities, Ts&... ranges)
// {
// // sort according to priority
// std::vector<int> idxs(priorities.size());
// std::iota(idxs.begin(), idxs.end(), 0);
// std::sort(idxs.begin(), idxs.end(), [&](int i, int j) {
// return priorities[i] < priorities[j]; //
// });
// auto reorder = [&](auto& src) {
// auto dst = src;
// for (size_t i = 0; i < idxs.size(); ++i) {
// dst[i] = src[idxs[i]];
// }
// src.swap(dst);
// };
// (reorder(ranges), ...);
// }
std::vector<int> SequenceManager::CountRequiredBlocks(const Sequences& sequences,
const std::vector<int>& context_lengths,
int step_length)
......@@ -344,7 +377,8 @@ void SequenceManager::AssignAndActivate(const Sequences& sequenc
auto SequenceManager::Materialize(Sequences sequences,
std::vector<int> context_lengths,
const std::vector<uint64_t>& priorities,
int step_length) -> Outcome
int step_length,
AdjustInputCount adjust) -> Outcome
{
////////////////////////////////////////////////////////////////////////////////
/// Schedule the assignment of blocks to sequences
......@@ -354,18 +388,23 @@ auto SequenceManager::Materialize(Sequences sequences,
SortByPriority(sequences, context_lengths, priorities);
// SortByPriority(priorities, sequences, context_lengths);
// Verify and lock cached sequences so that their blocks are not evicted unnoticed;
// the blocks can still be preempted later
VerifyAndLockCached(sequences);
auto [input_count1, input_count2] = adjust(sequences, context_lengths);
std::vector<int> required = CountRequiredBlocks(sequences, context_lengths, step_length);
// dbg(required);
Schedule schedule(block_manager_->TakeSnapshot(), sequences.size());
Schedule schedule(block_manager_->TakeSnapshot(), sequences.size(), input_count1, input_count2);
// `schedule.last` is decreasing in the loop
for (int i = 0; i < schedule.last; ++i) {
Transaction{sequences, i, required[i], schedule}.Process();
const int input_length = context_lengths[i] - sequences[i]->cache_len;
Transaction{sequences, i, required[i], input_length, schedule}.Process();
}
// mark remaining sequences invalid
......
......@@ -3,6 +3,7 @@
#pragma once
#include "src/turbomind/models/llama/BlockManager.h"
#include <functional>
namespace turbomind {
......@@ -16,19 +17,23 @@ struct Sequence {
};
uint64_t id;
Status status;
Status status = kCached;
std::vector<const Block*> blocks;
std::vector<uint64_t> block_unique_ids;
int input_length = 0;
mutable std::vector<int> tokens; // updated by the user
mutable int cache_len;
mutable int cache_len = 0;
// additional data kept from round to round
mutable std::vector<std::byte> random_state; // updated by the user
mutable float rope_theta;
mutable float rope_theta = 0.f;
Sequence(uint64_t _id): id(_id) {}
friend std::ostream& operator<<(std::ostream& os, const Sequence& seq);
};
......@@ -74,10 +79,13 @@ public:
int swap_out;
};
using AdjustInputCount = std::function<std::pair<int, int>(const Sequences&, const std::vector<int>&)>;
[[nodiscard]] Outcome Materialize(Sequences sequences,
std::vector<int> context_lengths,
const std::vector<uint64_t>& priorities,
int step_length);
int step_length,
AdjustInputCount adjust);
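The added AdjustInputCount hook lets the engine decide how many new input tokens the scheduler may admit in this pass. A hypothetical callback is sketched below, assuming it is written inside namespace turbomind; the constants and the reading of the returned pair as (base budget, base budget + extra) are assumptions for illustration only.

#include <utility>
#include <vector>

// Hypothetical AdjustInputCount callback (illustrative values, not the
// engine's actual policy).
SequenceManager::AdjustInputCount adjust =
    [](const Sequences& seqs, const std::vector<int>& context_lengths) {
        constexpr int num_tokens_per_iter   = 8192;  // assumed config value
        constexpr int extra_tokens_per_iter = 512;   // assumed config value
        return std::make_pair(num_tokens_per_iter,
                              num_tokens_per_iter + extra_tokens_per_iter);
    };
// Passed as the new last argument of Materialize(), assuming `manager`,
// `sequences`, `context_lengths` and `priorities` exist in scope:
//   auto outcome = manager.Materialize(sequences, context_lengths, priorities,
//                                      /*step_length=*/1, adjust);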
void* OffsetKey(void* block_ptr)
{
......
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {
class BatchedCopy {
public:
template<class T, std::enable_if_t<alignof(T) <= alignof(uint32_t), int> = 0>
T* Add(const T* src, int size, T* dst)
{
src_.push_back((void*)src);
dst_.push_back((void*)dst);
size_.push_back(sizeof(T) * size);
return dst + size;
}
void Submit(cudaStream_t stream)
{
invokeBatchedCopy(src_.data(), dst_.data(), size_.data(), size_.size(), stream);
sync_check_cuda_error();
src_.clear();
dst_.clear();
size_.clear();
}
private:
std::vector<void*> src_;
std::vector<void*> dst_;
std::vector<int> size_;
};
} // namespace turbomind
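A usage sketch for BatchedCopy: record several small device-to-device copies on the host and flush them with a single kernel launch. The buffer pointers, element counts and the header path are assumptions for illustration.

#include <cuda_runtime.h>

#include "src/turbomind/models/llama/copy.h"  // assumed location of BatchedCopy

void CopySegments(const int* const* d_srcs, int* const* d_dsts, const int* lengths,
                  int n, cudaStream_t stream)
{
    turbomind::BatchedCopy batched_copy;
    for (int i = 0; i < n; ++i) {
        // Add() records one copy of `lengths[i]` elements and returns
        // `dst + size`, so destination regions can be packed back to back.
        batched_copy.Add(d_srcs[i], lengths[i], d_dsts[i]);
    }
    batched_copy.Submit(stream);  // one invokeBatchedCopy launch for all records
}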
......@@ -101,6 +101,8 @@ __device__ T blockReduceSum(const cg::thread_block& block, T value)
return cg::reduce(tile, value, cg::plus<float>{});
}
// r' = r + x
// x' = norm(r') * scales
template<typename T>
__global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
T* __restrict__ x_data,
......
......@@ -9,11 +9,13 @@
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/dispatch.h"
#include "src/turbomind/utils/logger.h"
#include <algorithm>
#include <cstdint>
#include <cub/block/block_reduce.cuh>
#include <type_traits>
#include <utility>
namespace turbomind {
......@@ -543,8 +545,10 @@ __global__ void gatherOutput(int* output_ids,
continue;
}
// skip padding for dst
const int dst_idx = src_idx < context_len ? src_idx : src_idx - (max_context_len - context_len);
output_ids[dst_idx] = ids[src_idx * batch_size + batch_id];
const int dst_idx = src_idx < context_len ? src_idx : src_idx - (max_context_len - context_len);
if (dst_idx < max_output_len) {
output_ids[dst_idx] = ids[src_idx * batch_size + batch_id];
}
}
}
......@@ -694,50 +698,31 @@ void invokeIndexedCopyImpl(void** h_src_ptr,
int count,
cudaStream_t st)
{
auto invoke = [&](auto max_count) {
constexpr int C = decltype(max_count)::value;
// maximum parameter size: sm<70: 4kB, sm>=70: 32kB
static_assert(sizeof(IndexedCopyParam<N, C>) <= 4096);
IndexedCopyParam<N, C> param{};
std::copy_n(h_src_ptr, N, param.src_ptr.data());
std::copy_n(h_dst_ptr, N, param.dst_ptr.data());
std::transform(h_elem_sz, h_elem_sz + N, param.stride.data(), [](int size) {
// Basic alignment check
FT_CHECK_WITH_INFO(size % sizeof(T) == 0, fmtstr("misalignment: %d %% %d", size, (int)sizeof(T)));
return size / sizeof(T);
dispatch( // dispatch for num of copy operations
std::integer_sequence<int, 4, 8, 16, 32, 64, 128, 256>{},
[&](auto C) { return count <= C; },
[&](auto C) {
// maximum parameter size: sm<70: 4kB, sm>=70: 32kB
static_assert(sizeof(IndexedCopyParam<N, C>) <= 4096);
IndexedCopyParam<N, C> param{};
std::copy_n(h_src_ptr, N, param.src_ptr.data());
std::copy_n(h_dst_ptr, N, param.dst_ptr.data());
std::transform(h_elem_sz, h_elem_sz + N, param.stride.data(), [](int size) {
// Basic alignment check
FT_CHECK_WITH_INFO(size % sizeof(T) == 0, fmtstr("misalignment: %d %% %d", size, (int)sizeof(T)));
return size / sizeof(T);
});
param.max_stride = *std::max_element(param.stride.begin(), param.stride.end());
auto copy_idx = [](const int* src, int offset, int n, auto dst) {
return src ? (void)std::copy_n(src + offset, n, dst) : std::iota(dst, dst + n, offset);
};
for (int c = 0; c < count; c += C) {
int batch_size = std::min(count - c, (int)C);
copy_idx(h_src_idx, c, batch_size, param.src_idx.data());
copy_idx(h_dst_idx, c, batch_size, param.dst_idx.data());
indexedCopy<T><<<batch_size, 128, 0, st>>>(param);
}
});
param.max_stride = *std::max_element(param.stride.begin(), param.stride.end());
auto copy_idx = [](const int* src, int offset, int n, auto dst) {
return src ? (void)std::copy_n(src + offset, n, dst) : std::iota(dst, dst + n, offset);
};
for (int c = 0; c < count; c += C) {
int batch_size = std::min(count - c, C);
copy_idx(h_src_idx, c, batch_size, param.src_idx.data());
copy_idx(h_dst_idx, c, batch_size, param.dst_idx.data());
indexedCopy<T><<<batch_size, 128, 0, st>>>(param);
}
};
if (count <= 4) {
invoke(std::integral_constant<int, 4>{});
}
if (count <= 8) {
invoke(std::integral_constant<int, 8>{});
}
else if (count <= 16) {
invoke(std::integral_constant<int, 16>{});
}
else if (count <= 32) {
invoke(std::integral_constant<int, 32>{});
}
else if (count <= 64) {
invoke(std::integral_constant<int, 64>{});
}
else if (count <= 128) {
invoke(std::integral_constant<int, 128>{});
}
else {
invoke(std::integral_constant<int, 256>{});
}
}
void invokeIndexedCopy(void** h_src_ptr,
......@@ -749,19 +734,14 @@ void invokeIndexedCopy(void** h_src_ptr,
int n_copys,
cudaStream_t st)
{
auto args = std::tuple{h_src_ptr, h_dst_ptr, h_elem_sz, h_src_idx, h_dst_idx, count, st};
switch (n_copys) {
case 1:
return std::apply(invokeIndexedCopyImpl<uint32_t, 1>, args);
case 2:
return std::apply(invokeIndexedCopyImpl<uint32_t, 2>, args);
case 3:
return std::apply(invokeIndexedCopyImpl<uint32_t, 3>, args);
case 4:
return std::apply(invokeIndexedCopyImpl<uint32_t, 4>, args);
default:
FT_CHECK(0);
}
auto success = dispatch(std::integer_sequence<int, 1, 2, 3, 4>{}, [&](auto N) {
if (N == n_copys) {
invokeIndexedCopyImpl<uint32_t, N>(h_src_ptr, h_dst_ptr, h_elem_sz, h_src_idx, h_dst_idx, count, st);
return true;
}
return false;
});
FT_CHECK(success);
}
__global__ void padLastTokenIds(int* token_ids, const int* context_length, int max_context_len, int batch_size)
......@@ -777,6 +757,96 @@ void invokePadLastTokenIds(
padLastTokenIds<<<1, 512, 0, stream>>>(token_ids, context_length, max_context_len, batch_size);
}
template<typename T>
__global__ void getFeatureOfLastToken(T* output, const T* input, const int* cu_seqlens, int dims)
{
int bi = blockIdx.x;
int ti = cu_seqlens[bi + 1] - 1;
for (int i = threadIdx.x; i < dims; i += blockDim.x) {
output[dims * bi + i] = input[dims * ti + i];
}
}
template<typename T>
void invokeGetFeatureOfLastToken(
T* output, const T* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream)
{
getFeatureOfLastToken<<<batch_size, 256, 0, stream>>>(output, input, cu_seqlens, dims);
}
template void invokeGetFeatureOfLastToken(half*, const half*, const int*, int, int, cudaStream_t);
template void invokeGetFeatureOfLastToken(float*, const float*, const int*, int, int, cudaStream_t);
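A minimal illustrative driver for the kernel above, assuming two sequences of length 3 and 4 and a hidden size of 8: with cu_seqlens = {0, 3, 7}, rows 2 and 6 of the input (the last token of each sequence) are gathered into a [2, dims] output. All sizes are made up for the example.

#include <cuda_runtime.h>
#include <vector>

#include "src/turbomind/models/llama/llama_kernels.h"

void ExampleGatherLastToken(cudaStream_t stream)
{
    const int dims = 8, num_token = 7, batch_size = 2;
    const std::vector<int> h_cu_seqlens = {0, 3, 7};

    float* d_in{};
    float* d_out{};
    int*   d_cu{};
    cudaMalloc(&d_in, sizeof(float) * num_token * dims);
    cudaMalloc(&d_out, sizeof(float) * batch_size * dims);
    cudaMalloc(&d_cu, sizeof(int) * (batch_size + 1));
    cudaMemcpyAsync(d_cu, h_cu_seqlens.data(), sizeof(int) * (batch_size + 1),
                    cudaMemcpyHostToDevice, stream);
    // ... fill d_in with [num_token, dims] hidden states ...

    turbomind::invokeGetFeatureOfLastToken(d_out, d_in, d_cu, dims, batch_size, stream);
    cudaStreamSynchronize(stream);

    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_cu);
}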
template<class T, int C>
struct BatchedCopyParam {
Array<T*, C> src_ptr;
Array<T*, C> dst_ptr;
Array<int, C> size;
int count;
};
template<int kThrPerCpy, class T, int C>
__global__ void batchedCopy(BatchedCopyParam<T, C> param)
{
const int ti = threadIdx.x + blockIdx.x * blockDim.x;
const int bi = ti / kThrPerCpy;
if (bi >= param.count) {
return;
}
const T* __restrict__ src = param.src_ptr[bi];
T* __restrict__ dst = param.dst_ptr[bi];
int size = param.size[bi];
for (int i = ti % kThrPerCpy; i < size; i += kThrPerCpy) {
dst[i] = src[i];
}
}
// MSVC does not like CUDA kernel launches inside nested lambdas
template<class P>
struct BatchedCopyLauncher {
int max_size;
int count;
const P* params;
cudaStream_t st;
template<int S>
void operator()(std::integral_constant<int, S>) const
{
constexpr int threads = 128;
constexpr int items_per_block = threads / S;
const int blocks = (count + items_per_block - 1) / items_per_block;
batchedCopy<S><<<blocks, threads, 0, st>>>(*params);
}
};
void invokeBatchedCopy(void** src_ptr, void** dst_ptr, int* size, int count, cudaStream_t st)
{
dispatch(
std::integer_sequence<int, 1, 8, 32, 128>{},
[&](auto C) { return count <= C; },
[&](auto C) {
using T = uint32_t;
BatchedCopyParam<T, C> params{};
// TODO: on CUDA 12.1 and sm_70+ this can be 32K
static_assert(sizeof(params) <= 4096);
for (int c = 0; c < count; c += C) {
const int bsz = std::min<int>(count - c, C);
params.count = bsz;
for (int i = 0; i < bsz; ++i) {
params.src_ptr[i] = (T*)src_ptr[c + i];
params.dst_ptr[i] = (T*)dst_ptr[c + i];
FT_CHECK(size[c + i] % sizeof(T) == 0);
params.size[i] = size[c + i] / sizeof(T);
}
const int max_size = *std::max_element(params.size.begin(), params.size.end());
dispatch(
std::integer_sequence<int, 1, 2, 4, 8, 16, 32, 64, 128>{},
[&](auto S) { return max_size <= S; },
BatchedCopyLauncher<BatchedCopyParam<T, C>>{max_size, count, &params, st});
}
});
}
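A worked example of the two-level dispatch above, with illustrative numbers: 20 copy jobs whose largest payload is 96 uint32_t words select capacity C = 32 (smallest constant with count <= C) and S = 128 threads per copy (smallest constant with max_size <= S), i.e. one copy per 128-thread block and 20 blocks in total.

// Compile-time check of the constants the two dispatches would pick for
// count = 20 and max_size = 96 (illustrative values only).
static_assert(!(20 <= 8) && 20 <= 32, "outer dispatch picks C = 32 from {1, 8, 32, 128}");
static_assert(!(96 <= 64) && 96 <= 128, "inner dispatch picks S = 128 from {1, 2, ..., 128}");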
#define VERSION_SWITCH(VERSION, CONST_NAME, ...) \
[&] { \
if (VERSION == 2) { \
......
......@@ -105,6 +105,8 @@ void invokeIndexedCopy(void** h_src_ptr,
int n_copys,
cudaStream_t st);
void invokeBatchedCopy(void** src_ptr, void** dst_ptr, int* size, int count, cudaStream_t st);
// ABCDe ABCDe e
// ABCDEFGHIJk ABCDEFGHIJk
// ABCDEFGHi -> ABCDEFGHi i
......@@ -113,6 +115,10 @@ void invokeIndexedCopy(void** h_src_ptr,
void invokePadLastTokenIds(
int* token_ids, const int* context_length, int max_context_len, int batch_size, cudaStream_t stream);
template<typename T>
void invokeGetFeatureOfLastToken(
T* output, const T* input, const int* cu_seqlens, int dims, int batch_size, cudaStream_t stream);
void invokeMyCopyInt(int* dst, const int* src, size_t count, cudaStream_t st);
template<typename T>
......
......@@ -13,4 +13,21 @@ struct LlamaAttentionParams {
bool use_logn_attn;
};
struct EngineParams {
// batch params
int max_batch_size;
int session_len;
int step_length;
// cache params
float cache_max_block_count;
int cache_chunk_size;
// chunking params
int max_context_token_num;
int num_tokens_per_iter;
int extra_tokens_per_iter;
int max_prefill_iters;
};
} // namespace turbomind
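The chunking fields added here drive the dynamic split-fuse behaviour configured in LlamaTritonModel below. The helper sketched next is one possible reading of those fields (a hedged interpretation, not the engine's actual code); the fallback to max_context_token_num mirrors the default applied in handleMissingParams.

// Hedged sketch: cap on new prefill tokens admitted in one forward pass.
inline int PrefillTokenBudget(const turbomind::EngineParams& p)
{
    // falls back to the overall context-token limit when no per-iter cap is set
    const int base = p.num_tokens_per_iter ? p.num_tokens_per_iter : p.max_context_token_num;
    // extra_tokens_per_iter lets a long prompt overshoot the base cap slightly
    return base + p.extra_tokens_per_iter;
}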
......@@ -19,8 +19,9 @@
// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/layers/attention_layers/GptContextAttentionLayer.cc
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/models/llama/unified_attention_layer.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/decoder_multihead_attention/decoder_multihead_attention.h"
#include "src/turbomind/kernels/decoder_multihead_attention/kv_cache.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/macro.h"
......@@ -35,10 +36,14 @@
namespace turbomind {
template<typename T>
void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
size_t num_token,
size_t max_q_len,
size_t max_k_len)
// void UnifiedAttentionLayer<T>::allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t
// max_k_len)
void UnifiedAttentionLayer<T>::allocateBuffer(size_t num_token,
size_t pf_batch_size,
size_t pf_max_q_len,
size_t pf_max_k_len,
size_t dc_batch_size,
size_t dc_max_split_k)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
......@@ -47,65 +52,73 @@ void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
// no padding
qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * num_token * local_q_kv_head_num * size_per_head_, false);
// padding is rebuilt for q/k/v_buf_2_
// [qH + 2kvH, B, S, D]
q_buf_2_ = (T*)allocator_->reMalloc(
q_buf_2_, sizeof(T) * local_q_kv_head_num * batch_size * max_q_len * size_per_head_, false);
k_buf_2_ = q_buf_2_ + local_head_num_ * batch_size * max_q_len * size_per_head_;
v_buf_2_ = k_buf_2_ + local_kv_head_num_ * batch_size * max_q_len * size_per_head_;
// qkv_buf_3_ padding is removed
qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * num_token * local_head_num_ * size_per_head_, false);
if (use_fmha_) {
FlashAttentionOp<T> flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
if (flash_attention.get_workspace_size() > 0) {
qk_buf_float_ = (float*)allocator_->reMalloc(qk_buf_float_, flash_attention.get_workspace_size(), false);
}
if (pf_batch_size) {
[&](size_t bsz, size_t max_q, size_t max_k) {
// padding is rebuilt for q/k/v_buf_2_
// [qH + 2kvH, B, S, D]
q_buf_2_ = (T*)allocator_->reMalloc(
q_buf_2_, sizeof(T) * local_q_kv_head_num * bsz * max_q * size_per_head_, false);
k_buf_2_ = q_buf_2_ + local_head_num_ * bsz * max_q * size_per_head_;
v_buf_2_ = k_buf_2_ + local_kv_head_num_ * bsz * max_q * size_per_head_;
if (use_fmha_) {
FlashAttentionOp<T> flash_attention(bsz, local_head_num_, max_k, max_q, size_per_head_);
if (flash_attention.get_workspace_size() > 0) {
qk_buf_float_ =
(float*)allocator_->reMalloc(qk_buf_float_, flash_attention.get_workspace_size(), false);
}
}
else {
// kv heads are repeated for unfused attention
k_cache_buf_ = (T*)allocator_->reMalloc(
k_cache_buf_, 2 * sizeof(T) * bsz * local_head_num_ * max_k * size_per_head_, false);
v_cache_buf_ = k_cache_buf_ + bsz * local_head_num_ * max_k * size_per_head_;
qk_buf_ = (T*)allocator_->reMalloc(qk_buf_, sizeof(T) * bsz * local_head_num_ * max_q * max_k, false);
// qkv_buf_2_ has padding
qkv_buf_2_ = (T*)allocator_->reMalloc(
qkv_buf_2_, sizeof(T) * bsz * max_q * local_head_num_ * size_per_head_, false);
}
}(pf_batch_size, pf_max_q_len, pf_max_k_len);
}
else {
// kv heads are repeated for unfused attention
k_cache_buf_ = (T*)allocator_->reMalloc(
k_cache_buf_, 2 * sizeof(T) * batch_size * local_head_num_ * max_k_len * size_per_head_, false);
v_cache_buf_ = k_cache_buf_ + batch_size * local_head_num_ * max_k_len * size_per_head_;
qk_buf_ =
(T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * local_head_num_ * max_q_len * max_k_len, false);
// qkv_buf_2_ has padding
qkv_buf_2_ = (T*)allocator_->reMalloc(
qkv_buf_2_, sizeof(T) * batch_size * max_q_len * local_head_num_ * size_per_head_, false);
if (dc_batch_size) {
dc_workspace_ = (float*)allocator_->reMalloc(dc_workspace_,
sizeof(float) * dc_batch_size * local_head_num_ * dc_max_split_k
* (size_per_head_ + 2),
false);
}
// qkv_buf_3_ padding is removed
qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * num_token * local_head_num_ * size_per_head_, false);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaContextAttentionLayer<T>::freeBuffer()
void UnifiedAttentionLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
allocator_->free((void**)(&qkv_buf_));
allocator_->free((void**)(&q_buf_2_));
if (use_fmha_) {
allocator_->free((void**)&qk_buf_float_);
}
else {
allocator_->free((void**)(&k_cache_buf_));
allocator_->free((void**)(&qk_buf_));
allocator_->free((void**)(&qkv_buf_2_));
}
allocator_->free((void**)(&qkv_buf_3_));
allocator_->free((void**)&qk_buf_float_);
allocator_->free((void**)(&k_cache_buf_));
allocator_->free((void**)(&qk_buf_));
allocator_->free((void**)(&qkv_buf_2_));
allocator_->free((void**)&dc_workspace_);
is_allocate_buffer_ = false;
}
}
template<typename T>
inline void LlamaContextAttentionLayer<T>::forward(TensorMap* output_tensors,
const TensorMap* input_tensors,
const LlamaAttentionWeight<T>* weights)
inline void UnifiedAttentionLayer<T>::forward(TensorMap* outputs, const TensorMap* inputs, const WeightType* weights)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
......@@ -131,60 +144,153 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
/////////////////////////////////////////////
/// parse inputs
const int batch_size = input_tensors->at("attention_mask").shape[0];
const int max_q_len = input_tensors->at("attention_mask").shape[2];
const int max_k_len = input_tensors->at("attention_mask").shape[3];
const int layer_id = input_tensors->getVal<int>("layer_id");
const int num_token = input_tensors->at("input_query").shape[0];
const int max_seq_len = input_tensors->at("max_seq_len").getVal<int>();
const int num_token = inputs->at("input_query").shape[0];
const int layer_id = inputs->getVal<int>("layer_id");
const int session_len = inputs->getVal<int>("session_len");
int pf_batch_size = 0;
int pf_max_q_len = 0;
int pf_max_k_len = 0;
T* attention_mask{};
if (inputs->isExist("attention_mask")) {
pf_batch_size = inputs->at("attention_mask").shape[0];
pf_max_q_len = inputs->at("attention_mask").shape[2];
pf_max_k_len = inputs->at("attention_mask").shape[3];
attention_mask = inputs->getPtr<T>("attention_mask");
}
T* attention_out = output_tensors->at("hidden_features").getPtr<T>();
T* attention_input = input_tensors->at("input_query").getPtr<T>();
T* attention_mask = input_tensors->at("attention_mask").getPtr<T>();
const int dc_batch_size = inputs->getVal<int>("dc_batch_size");
const int dc_sum_seq_len = inputs->getVal<int>("dc_sum_seq_len");
const int dc_max_seq_len = inputs->getVal<int>("dc_max_seq_len");
const auto input_length = input_tensors->at("input_lengths").getPtr<const int>();
const auto context_length = input_tensors->at("context_lengths").getPtr<const int>();
int* cu_seqlens = input_tensors->at("cu_seqlens").getPtr<int>();
int* cu_block_counts = input_tensors->at("cu_block_counts").getPtr<int>();
T* attention_input = inputs->getPtr<T>("input_query");
int* input_length = inputs->getPtr<int>("input_lengths");
int* context_length = inputs->getPtr<int>("context_lengths");
bool* is_finished = inputs->getPtr<bool>("finished");
int* cu_block_count = inputs->getPtr<int>("cu_block_counts");
int* cu_seqlens = inputs->getPtr<int>("cu_seqlens", nullptr);
int* padding_offset = inputs->getPtr<int>("padding_offset", nullptr);
float* rope_theta = inputs->getPtr<float>("rope_theta", nullptr);
const float* rope_theta = input_tensors->getPtr<const float>("rope_theta", nullptr);
auto k_cache_ptrs = outputs->getPtr<void*>("key_cache");
auto v_cache_ptrs = outputs->getPtr<void*>("value_cache");
auto tmp_k_ptrs = outputs->getPtr<T*>("tmp_k");
auto tmp_v_ptrs = outputs->getPtr<T*>("tmp_v");
const auto padding_offset = input_tensors->at("padding_offset").getPtr<int>();
auto Show = [&](const T* x, size_t n) {
std::vector<T> vec(n);
cudaMemcpyAsync(vec.data(), x, sizeof(T) * n, cudaMemcpyDefault, stream_);
cudaStreamSynchronize(stream_);
std::vector<float> float_vec(vec.begin(), vec.end());
dbg(float_vec);
};
T* attention_out = outputs->getPtr<T>("hidden_features");
/////////////////////////////////////////////
/// allocate buffers
allocateBuffer(batch_size, num_token, max_q_len, max_k_len);
allocateBuffer(num_token, //
pf_batch_size,
pf_max_q_len,
pf_max_k_len,
dc_batch_size,
kDecodeMaxSplits);
// [2, L, H, s, D]
const size_t layer_offset = layer_id * local_kv_head_num_ * kv_cache_block_len_ * size_per_head_;
//////////////////////////////////////////////
/// qkv gemm
// [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim]
linear_.forward(qkv_buf_, attention_input, num_token, weights->qkv);
if (pf_batch_size) {
const int offset = dc_batch_size;
const int pf_num_token = num_token - offset;
prefill(qkv_buf_3_ + offset * weights->output.input_dims,
qkv_buf_ + offset * weights->qkv.output_dims,
k_cache_ptrs,
v_cache_ptrs,
attention_mask,
cu_seqlens,
padding_offset,
tmp_k_ptrs + offset,
tmp_v_ptrs + offset,
input_length + offset,
context_length + offset,
cu_block_count + offset,
rope_theta + offset,
pf_batch_size,
pf_num_token,
layer_offset,
pf_max_q_len,
pf_max_k_len,
session_len,
weights);
}
if (dc_batch_size) {
decode(qkv_buf_3_,
qkv_buf_,
k_cache_ptrs,
v_cache_ptrs,
cu_block_count,
context_length,
is_finished,
rope_theta,
layer_offset,
dc_batch_size,
dc_sum_seq_len,
dc_max_seq_len,
kDecodeMaxSplits,
weights);
}
//////////////////////////////////////////////
/// output gemm <Bs,HD> -> <Bs,HD>
linear_.forward(attention_out, qkv_buf_3_, num_token, weights->output);
if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(attention_out, attention_out, num_token * hidden_units_, tensor_para_, stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
}
template<typename T>
void UnifiedAttentionLayer<T>::prefill(T* output,
const T* qkv,
void** k_cache_ptrs,
void** v_cache_ptrs,
const T* attention_mask,
const int* cu_seqlens,
const int* padding_offset,
T** tmp_k_ptrs,
T** tmp_v_ptrs,
const int* input_length,
const int* context_length,
const int* cu_block_count,
const float* rope_theta,
int pf_batch_size,
int pf_num_token,
size_t layer_offset,
int pf_max_q_len,
int pf_max_k_len,
int pf_session_len,
const WeightType* weights)
{
//////////////////////////////////////////////
/// transpose qkv & apply rotary embedding & rebuild padding
/// qkv [B, s, H + 2kvH, D] -> (q [B, H, s, D], k [B, kvH, s, D], v [B, kvH, s, D])
invokeAddFusedQKVBiasTranspose(q_buf_2_,
k_buf_2_,
v_buf_2_,
qkv_buf_,
(T*)qkv,
weights->qkv.bias,
padding_offset, // padding_offset,
context_length, // used for applying rotary embedding
input_length,
rope_theta,
batch_size,
max_q_len, // seq_len
num_token, // batch_size * seq_len
pf_batch_size,
pf_max_q_len, // seq_len
pf_num_token,
local_head_num_,
local_kv_head_num_,
size_per_head_,
......@@ -196,15 +302,6 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
stream_);
sync_check_cuda_error();
// [2, L, H, s, D]
const size_t layer_offset = layer_id * local_kv_head_num_ * kv_cache_block_len_ * size_per_head_;
auto k_cache_ptrs = output_tensors->getPtr<void*>("key_cache");
auto v_cache_ptrs = output_tensors->getPtr<void*>("value_cache");
auto tmp_k_ptrs = output_tensors->getPtr<T*>("tmp_k");
auto tmp_v_ptrs = output_tensors->getPtr<T*>("tmp_v");
//////////////////////////////////////////////////////////
/// insert the k/v computed from inputs into k/v cache
/// transpose kv -> kv cache
......@@ -215,13 +312,13 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
v_cache_ptrs,
k_buf_2_,
v_buf_2_,
cu_block_counts,
cu_block_count,
input_length,
context_length,
batch_size,
pf_batch_size,
kv_cache_block_len_,
layer_offset,
max_q_len,
pf_max_q_len,
size_per_head_,
local_kv_head_num_,
quant_policy_,
......@@ -231,94 +328,147 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
const int kv_cache_elem_bits = quant_policy_ & QuantPolicy::kCacheKVInt8 ? 8 : sizeof(T) * 8;
FT_CHECK(weights->past_kv_scale.size() == 4);
ConvertKvCacheBlocksToLinear2((const void**)k_cache_ptrs,
(const void**)v_cache_ptrs,
(T**)tmp_k_ptrs,
(T**)tmp_v_ptrs,
cu_block_counts,
cu_block_count,
context_length,
layer_offset,
kv_cache_block_len_,
max_seq_len,
pf_session_len,
local_kv_head_num_,
size_per_head_,
batch_size,
pf_batch_size,
quant_policy_,
weights->past_kv_scale.data(),
stream_);
sync_check_cuda_error();
// dbg(kv_cache_block_len_, max_seq_len, local_kv_head_num_, size_per_head_, batch_size);
// void *kk, *vv;
// cudaMemcpyAsync(&kk, tmp_k_ptrs, sizeof(void*), cudaMemcpyDefault, stream_);
// cudaMemcpyAsync(&vv, tmp_v_ptrs, sizeof(void*), cudaMemcpyDefault, stream_);
// cudaStreamSynchronize(stream_);
// Show((const T*)kk, local_kv_head_num_ * max_seq_len * size_per_head_);
// Show((const T*)vv, local_kv_head_num_ * max_seq_len * size_per_head_);
if (use_fmha_) {
fusedMultiHeadAttention(tmp_k_ptrs,
fusedMultiHeadAttention(output,
q_buf_2_,
tmp_k_ptrs,
tmp_v_ptrs,
0,
attention_mask,
cu_seqlens,
input_tensors->at("context_lengths").getPtr<int>(),
batch_size,
max_q_len,
max_k_len,
max_seq_len);
(T*)attention_mask,
(int*)cu_seqlens,
(int*)context_length,
pf_batch_size,
pf_max_q_len,
pf_max_k_len,
pf_session_len);
}
else {
unfusedMultiHeadAttention(tmp_k_ptrs,
unfusedMultiHeadAttention(output,
q_buf_2_,
tmp_k_ptrs,
tmp_v_ptrs,
0,
attention_mask,
padding_offset,
context_length,
batch_size,
num_token,
max_q_len,
max_k_len,
max_seq_len,
pf_batch_size,
pf_num_token,
pf_max_q_len,
pf_max_k_len,
pf_session_len,
quant_policy_,
weights->past_kv_scale.data());
}
}
// Compare(qkv_buf_3_, num_token * hidden_units_, Concat("qkv_buf_3", layer_id), kCmpRead, stream_);
template<typename T>
void UnifiedAttentionLayer<T>::decode(T* output,
const T* qkv,
void** k_cache_ptrs,
void** v_cache_ptrs,
const int* cu_block_count,
const int* context_length,
const bool* is_finished,
const float* rope_theta,
size_t layer_offset,
int batch_size,
int dc_sum_seq_len,
int dc_max_seq_len,
int max_split_k,
const WeightType* weights)
{
DecoderMultiHeadAttentionParams<T> params{};
// dbg(max_seq_len);
params.out = output;
params.q = (T*)qkv;
params.k = params.q + local_head_num_ * size_per_head_;
params.v = params.k + local_kv_head_num_ * size_per_head_;
params.stride = (local_head_num_ + 2 * local_kv_head_num_) * size_per_head_;
if (0) {
Show(qkv_buf_3_, num_token * hidden_units_);
}
params.q_bias = weights->qkv.bias;
params.k_bias = params.q_bias + local_head_num_ * size_per_head_;
params.v_bias = params.k_bias + local_kv_head_num_ * size_per_head_;
//////////////////////////////////////////////
/// output gemm <Bs,HD> -> <Bs,HD>
linear_.forward(attention_out, qkv_buf_3_, num_token, weights->output);
params.batch_size = batch_size;
params.cu_block_cnts = (int*)cu_block_count;
if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(attention_out, attention_out, num_token * hidden_units_, tensor_para_, stream_);
sync_check_cuda_error();
}
params.k_cache_block_ptrs = (void**)k_cache_ptrs;
params.v_cache_block_ptrs = (void**)v_cache_ptrs;
params.kv_cache_block_size = kv_cache_block_len_;
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
params.finished = is_finished;
params.context_length = context_length;
params.rope_theta = rope_theta;
params.layer_offset = layer_offset;
params.num_heads = local_head_num_;
params.num_kv_heads = local_kv_head_num_;
params.size_per_head = size_per_head_;
params.inv_sqrt_dh = 1.f / std::sqrt((float)params.size_per_head);
params.rotary_embedding_dim = size_per_head_;
params.rotary_embedding_base = params_.rotary_embedding_base;
params.max_position_embeddings = params_.max_position_embeddings;
// params.use_dynamic_ntk = params_.use_dynamic_ntk;
params.use_logn_attn = params_.use_logn_attn;
params.partial_O = dc_workspace_;
params.partial_M = params.partial_O + batch_size * local_head_num_ * max_split_k * size_per_head_;
params.partial_L = params.partial_M + batch_size * local_head_num_ * max_split_k;
const float avg_batch_size = dc_max_seq_len ? (float)dc_sum_seq_len / dc_max_seq_len : 1;
FT_CHECK(avg_batch_size >= 1.f);
max_split_k = std::max(1, (int)std::ceil(max_split_k / avg_batch_size));
params.max_split_k = max_split_k;
params.max_seq_len = dc_max_seq_len;
params.arch = arch_;
params.stream = stream_;
params.quant_policy = quant_policy_;
FT_CHECK(std::size(weights->past_kv_scale) == std::size(params.kv_quant_params));
std::copy(weights->past_kv_scale.begin(), weights->past_kv_scale.end(), std::begin(params.kv_quant_params));
{
NvtxScope scope("decoder_multihead_attention");
DispatchDecoderMultiheadAttention<T>(params);
}
sync_check_cuda_error();
}
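The split-k heuristic in decode() can be restated on its own; the example value below simply evaluates the formula from the code: many concurrent decode sequences already supply enough parallelism, so fewer K/V splits are used per head.

#include <algorithm>
#include <cmath>

// Standalone restatement of the split-k selection in decode() above.
inline int ComputeMaxSplitK(int max_split_k, int dc_sum_seq_len, int dc_max_seq_len)
{
    const float avg_batch_size = dc_max_seq_len ? (float)dc_sum_seq_len / dc_max_seq_len : 1.f;
    return std::max(1, (int)std::ceil(max_split_k / avg_batch_size));
}
// e.g. ComputeMaxSplitK(/*max_split_k=*/16, /*dc_sum_seq_len=*/4096, /*dc_max_seq_len=*/1024) == 4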
template<typename T>
void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int* context_lengths,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len)
void UnifiedAttentionLayer<T>::fusedMultiHeadAttention(T* output,
const T* query,
T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int* context_lengths,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len)
{
//////////////////////////////////////////////
// flash attention
......@@ -347,8 +497,8 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptr
};
size_t group_size = size_t(local_head_num_ / local_kv_head_num_);
AttentionOp flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
typename AttentionOp::Params attn_params{qkv_buf_3_,
q_buf_2_,
typename AttentionOp::Params attn_params{output,
(T*)query,
k_cache_buf_,
v_cache_buf_,
attention_mask,
......@@ -368,19 +518,21 @@ void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptr
}
template<typename T>
void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
const int* padding_offset,
const int* context_length,
int batch_size,
int num_token,
int max_q_len,
int max_k_len,
int max_seq_len,
int quant,
const float* kv_scale)
void UnifiedAttentionLayer<T>::unfusedMultiHeadAttention(T* output,
const T* query,
T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
const int* padding_offset,
const int* context_length,
int batch_size,
int num_token,
int max_q_len,
int max_k_len,
int max_seq_len,
int quant,
const float* kv_scale)
{
// key_cache [B, kvH, S[:t+s], D/x, x] -> [B, qH, t+s, D]
// val_cache [B, kvH, S[:t+s], D/x, x] -> [B, qH, t+s, D]
......@@ -414,7 +566,7 @@ void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_c
k_cache_buf_, // A
size_per_head_, // lda
max_k_len * size_per_head_, // strideA
q_buf_2_, // B
query, // B
size_per_head_, // ldb
max_q_len * size_per_head_, // strideB
qk_buf_, // C
......@@ -459,7 +611,7 @@ void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_c
//////////////////////////////////////////////
/// transpose <B,h,s,D> -> <B,s,h,D>
invokeTransposeAttentionOutRemovePadding(qkv_buf_2_,
qkv_buf_3_,
output,
num_token,
batch_size,
max_q_len,
......@@ -472,7 +624,7 @@ void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_c
sync_check_cuda_error();
}
template class LlamaContextAttentionLayer<float>;
template class LlamaContextAttentionLayer<half>;
template class UnifiedAttentionLayer<float>;
template class UnifiedAttentionLayer<half>;
} // namespace turbomind
......@@ -25,28 +25,37 @@
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class LlamaContextAttentionLayer {
class UnifiedAttentionLayer {
public:
using WeightType = LlamaAttentionWeight<T>;
static constexpr int kDecodeMaxSplits = 16;
void freeBuffer();
void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);
LlamaContextAttentionLayer(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
LlamaAttentionParams attn_params,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
void allocateBuffer(size_t num_token,
size_t pf_batch_size,
size_t pf_max_q_len,
size_t pf_max_k_len,
size_t dc_batch_size,
size_t dc_max_split_k);
UnifiedAttentionLayer(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
LlamaAttentionParams attn_params,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
head_num_(head_num),
size_per_head_(size_per_head),
hidden_units_(head_num * size_per_head),
......@@ -65,22 +74,63 @@ public:
quant_policy_(quant_policy)
{
FT_CHECK(head_num % kv_head_num == 0);
arch_ = getSMVersion();
}
void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaAttentionWeight<T>* weights);
void fusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int* context_lengths,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len);
void unfusedMultiHeadAttention(T** key_cache_ptrs,
void forward(TensorMap* outputs, const TensorMap* inputs, const LlamaAttentionWeight<T>* weights);
void prefill(T* output,
const T* qkv,
void** k_cache_ptrs,
void** v_cache_ptrs,
const T* attention_mask,
const int* cu_seqlens,
const int* padding_offset,
T** tmp_k_ptrs,
T** tmp_v_ptrs,
const int* input_length,
const int* context_length,
const int* cu_block_count,
const float* rope_theta,
int pf_batch_size,
int pf_num_token,
size_t layer_offset,
int pf_max_q_len,
int pf_max_k_len,
int pf_session_len,
const WeightType* weights);
void decode(T* output,
const T* qkv,
void** k_cache_ptrs,
void** v_cache_ptrs,
const int* cu_block_count,
const int* context_length,
const bool* is_finished,
const float* rope_theta,
size_t layer_offset,
int batch_size,
int dc_sum_seq_len,
int dc_max_seq_len,
int max_split_k,
const WeightType* weights);
void fusedMultiHeadAttention(T* output,
const T* query,
T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int* context_lengths,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len);
void unfusedMultiHeadAttention(T* output,
const T* query,
T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
......@@ -116,6 +166,8 @@ private:
cublasMMWrapper* cublas_wrapper_;
LlamaLinear<T> linear_;
int arch_{};
T* qkv_buf_{};
T* q_buf_2_{};
T* k_buf_2_{};
......@@ -126,6 +178,7 @@ private:
float* qk_buf_float_{};
T* qkv_buf_2_{};
T* qkv_buf_3_{};
float* dc_workspace_{};
bool is_allocate_buffer_ = false;
};
......
#include "src/turbomind/models/llama/unified_decoder.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/gpt_kernels.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/unified_attention_layer.h"
#include "src/turbomind/utils/cuda_utils.h"
namespace turbomind {
template<typename T>
void UnifiedDecoder<T>::allocateBuffer(size_t num_token, size_t pf_batch_size, size_t pf_max_q_len, size_t pf_max_k_len)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
if (pf_batch_size) {
attention_mask_ =
(T*)allocator_->reMalloc(attention_mask_, sizeof(T) * pf_batch_size * pf_max_q_len * pf_max_k_len, false);
padding_offset_ =
(int*)allocator_->reMalloc(padding_offset_, sizeof(int) * pf_batch_size * pf_max_q_len, false);
cu_seqlens_ = (int*)allocator_->reMalloc(cu_seqlens_, sizeof(int) * (pf_batch_size + 1), false);
}
}
template<typename T>
void UnifiedDecoder<T>::freeBuffer()
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
allocator_->free((void**)&padding_offset_);
allocator_->free((void**)&cu_seqlens_);
allocator_->free((void**)&attention_mask_);
allocator_->free((void**)&h_pinned_token_num_ptr_, true);
}
template<typename T>
void UnifiedDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int cache_block_seq_len,
int quant_policy)
{
h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true);
attn_layer_ = new UnifiedAttentionLayer<T>(head_num_,
kv_head_num,
size_per_head_,
attn_params,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
use_fmha,
cache_block_seq_len,
quant_policy);
ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
size_per_head_,
inter_size_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
}
template<typename T>
void UnifiedDecoder<T>::forwardSelfAttn(T* attn_io,
TensorMap* _outputs,
const TensorMap* _inputs,
size_t token_num,
size_t pf_batch_size,
size_t pf_max_q_len,
size_t pf_max_k_len,
size_t dc_batch_size,
int layer_id,
const LlamaAttentionWeight<T>* weight)
{
TensorMap inputs(*_inputs);
inputs.insert("input_query", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io});
inputs.insert("layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id});
if (pf_batch_size) {
inputs.insert("attention_mask",
{MEMORY_GPU, dtype_, {pf_batch_size, 1, pf_max_q_len, pf_max_k_len}, attention_mask_});
const size_t pf_token_num = token_num - dc_batch_size;
inputs.insert("padding_offset", {MEMORY_GPU, TYPE_INT32, {pf_token_num}, padding_offset_});
inputs.insert("cu_seqlens", {MEMORY_GPU, TYPE_INT32, {pf_batch_size + 1}, cu_seqlens_});
}
TensorMap outputs(*_outputs);
outputs.insert("hidden_features", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, attn_io});
attn_layer_->forward(&outputs, &inputs, weight);
}
template<typename T>
UnifiedDecoder<T>::~UnifiedDecoder()
{
delete attn_layer_;
delete ffn_layer_;
freeBuffer();
}
template<typename T>
void UnifiedDecoder<T>::forward(TensorMap* outputs, const TensorMap* inputs, const std::vector<WeightType*>* weights)
{
/**
* input tensors:
* \param decoder_input [num_token, hidden_units], float
* \param input_lengths [batch_size], int
* \param history_lengths [batch_size], int
* \param context_lengths [batch_size], int
* \param output_norm_weight [hidden_dims], float
* \param max_q_len [1], int on cpu
* \param max_kv_len [1], int on cpu
* \param max_seq_len [1], int on cpu
*
* output tensors:
* \param decoder_output [num_token, hidden_units],
* \param key_cache [num_layer, batch, local_head_num, size_per_head // x, max_seq_len, x]
* \param value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head]
* \param last_token_hidden_units [batch_size, hidden_units]
*/
// Session sess{};
const size_t token_num = inputs->at("decoder_input").shape[0];
const int pf_max_q_len = inputs->getVal<int>("pf_max_q_len");
const int pf_max_k_len = inputs->getVal<int>("pf_max_k_len");
const int pf_batch_size = inputs->getVal<int>("pf_batch_size");
const int dc_batch_size = inputs->getVal<int>("dc_batch_size");
const int* input_length = inputs->getPtr<int>("input_lengths");
const int* context_length = inputs->getPtr<int>("context_lengths");
T* decoder_input_output = inputs->getPtr<T>("decoder_input");
T* decoder_output = outputs->getPtr<T>("decoder_output");
T* last_token_hidden_units = outputs->getPtr<T>("last_token_hidden_units");
allocateBuffer(token_num, pf_batch_size, pf_max_q_len, pf_max_k_len);
const int pf_offset = dc_batch_size;
if (pf_batch_size) {
FT_CHECK(padding_offset_);
size_t tmp_token_num{};
// `cu_seqlens` is the exclusive prefix sum of `input_lengths`
invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_,
&tmp_token_num, // updated token num
padding_offset_,
cu_seqlens_,
input_length + pf_offset,
pf_batch_size,
pf_max_q_len,
stream_);
sync_check_cuda_error();
FT_CHECK(tmp_token_num == token_num - dc_batch_size);
invokeCreateCausalMasks(attention_mask_,
input_length + pf_offset,
context_length + pf_offset,
pf_max_q_len,
pf_max_k_len,
pf_batch_size,
stream_);
sync_check_cuda_error();
}
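For reference, the cu_seqlens layout produced above can be reproduced on the host: an exclusive prefix sum over the prefill input lengths with the total token count appended (the buffer holds pf_batch_size + 1 entries, as the allocation suggests). For example, input_lengths = {3, 4, 2} gives cu_seqlens = {0, 3, 7, 9}.

#include <numeric>
#include <vector>

// Host-side illustration of the cu_seqlens layout (sketch only).
inline std::vector<int> MakeCuSeqLens(const std::vector<int>& input_lengths)
{
    std::vector<int> cu(input_lengths.size() + 1, 0);
    std::partial_sum(input_lengths.begin(), input_lengths.end(), cu.begin() + 1);
    return cu;
}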
/////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(decoder_output,
decoder_input_output,
weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
for (size_t layer = 0; layer < num_layer_; ++layer) {
/////////////////////////////////////////////
/// self-attention
forwardSelfAttn(decoder_output,
outputs,
inputs,
token_num,
pf_batch_size,
pf_max_q_len,
pf_max_k_len,
dc_batch_size,
layer,
&weights->at(layer)->self_attn_weights);
invokeFusedAddBiasResidualRMSNorm(decoder_input_output,
decoder_output,
weights->at(layer)->self_attn_weights.output.bias,
weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
////////////////////////////////////////////
/// feed-forward network
TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}};
TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, dtype_, {token_num, hidden_units_}, decoder_output}}};
ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &weights->at(layer)->ffn_weights);
const bool is_last_layer = layer == num_layer_ - 1;
auto scale_weight = !is_last_layer ? weights->at(layer + 1)->self_attn_norm_weights :
inputs->at("output_norm_weight").getPtr<T>();
invokeFusedAddBiasResidualRMSNorm(decoder_input_output,
decoder_output,
weights->at(layer)->ffn_weights.output.bias,
scale_weight,
rmsnorm_eps_,
token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
}
if (dc_batch_size) {
check_cuda_error(cudaMemcpyAsync(last_token_hidden_units,
decoder_output,
sizeof(T) * dc_batch_size * hidden_units_,
cudaMemcpyDefault,
stream_));
}
if (pf_batch_size) {
invokeGetFeatureOfLastToken(last_token_hidden_units + pf_offset * hidden_units_, //
decoder_output + pf_offset * hidden_units_,
cu_seqlens_,
hidden_units_,
pf_batch_size,
stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
}
template class UnifiedDecoder<float>;
template class UnifiedDecoder<half>;
} // namespace turbomind
#pragma once
#include "src/turbomind/models/llama/LlamaDecoderLayerWeight.h"
#include "src/turbomind/models/llama/LlamaFfnLayer.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/models/llama/unified_attention_layer.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
template<typename T>
class UnifiedDecoder {
protected:
void allocateBuffer(size_t num_token, size_t pfill_batch_size, size_t pfill_max_q_len, size_t pfill_max_k_len);
void freeBuffer();
void initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int cache_block_seq_len,
int quant_policy);
cudaStream_t stream_;
cublasMMWrapper* cublas_wrapper_;
IAllocator* allocator_;
bool is_free_buffer_after_forward_{};
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t hidden_units_;
float rmsnorm_eps_;
NcclParam tensor_para_;
T* attention_mask_{};
int* padding_offset_{};
int* cu_seqlens_{}; // cu for cumulative
size_t* h_pinned_token_num_ptr_{};
UnifiedAttentionLayer<T>* attn_layer_{};
LlamaFfnLayer<T>* ffn_layer_{};
const DataType dtype_;
using WeightType = LlamaDecoderLayerWeight<T>;
void forwardSelfAttn(T* attn_io,
TensorMap* _outputs,
const TensorMap* _inputs,
size_t token_num,
size_t pf_batch_size,
size_t pf_max_q_len,
size_t pf_max_k_len,
size_t dc_batch_size,
int layer_id,
const LlamaAttentionWeight<T>* weight);
public:
UnifiedDecoder(size_t head_num,
size_t kv_head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
const LlamaAttentionParams& attn_params,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
stream_(stream),
cublas_wrapper_(cublas_wrapper),
allocator_(allocator),
is_free_buffer_after_forward_(is_free_buffer_after_forward),
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
hidden_units_(head_num * size_per_head),
num_layer_(num_layer),
rmsnorm_eps_(rmsnorm_eps),
tensor_para_(tensor_para),
dtype_(getTensorType<T>())
{
initialize(attn_params, kv_head_num, use_fmha, cache_block_seq_len, quant_policy);
}
~UnifiedDecoder();
void forward(TensorMap* outputs, const TensorMap* inputs, const std::vector<WeightType*>* weights);
};
} // namespace turbomind
......@@ -64,36 +64,42 @@ void LlamaTritonModel<T>::handleMissingParams()
TM_LOG_WARNING("[LlamaTritonModel] `kv_head_num` is not set, default to `head_num` (%d).", (int)kv_head_num_);
}
if (!max_batch_size_) {
max_batch_size_ = 64;
TM_LOG_WARNING("[LlamaTritonModel] `max_batch_size` is not set, default to %d.", (int)max_batch_size_);
if (!attn_params_.max_position_embeddings) {
attn_params_.max_position_embeddings = 2048;
TM_LOG_WARNING("[LlamaTritonModel] `max_position_embeddings` is not set, default to %d.",
(int)attn_params_.max_position_embeddings);
}
if (!session_len_) {
session_len_ = 2160;
TM_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)session_len_);
if (!engine_params_.max_batch_size) {
engine_params_.max_batch_size = 64;
TM_LOG_WARNING("[LlamaTritonModel] `max_batch_size` is not set, default to %d.",
(int)engine_params_.max_batch_size);
}
if (!attn_params_.max_position_embeddings) {
attn_params_.max_position_embeddings = session_len_;
TM_LOG_WARNING("[LlamaTritonModel] `max_position_embeddings` is not set, default to `session_len` (%d).",
(int)attn_params_.max_position_embeddings);
if (!engine_params_.session_len) {
engine_params_.session_len = attn_params_.max_position_embeddings;
TM_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)engine_params_.session_len);
}
if (!max_context_token_num_) {
max_context_token_num_ = (int)std::sqrt(max_batch_size_);
if (!engine_params_.max_context_token_num) {
engine_params_.max_context_token_num = engine_params_.session_len;
TM_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` is not set, default to %d.",
(int)max_context_token_num_);
(int)engine_params_.max_context_token_num);
}
if (!step_length_) {
step_length_ = 1;
TM_LOG_WARNING("[LlamaTritonModel] `step_length` is not set, default to %d.", (int)step_length_);
if (engine_params_.max_context_token_num <= engine_params_.max_batch_size) {
engine_params_.max_context_token_num *= engine_params_.session_len;
TM_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` = %d.", (int)engine_params_.max_context_token_num);
}
if (!cache_max_block_count_) {
cache_max_block_count_ = .95f;
TM_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %f.", cache_max_block_count_);
if (!engine_params_.step_length) {
engine_params_.step_length = 1;
}
if (!engine_params_.cache_max_block_count) {
engine_params_.cache_max_block_count = .95f;
TM_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %f.",
engine_params_.cache_max_block_count);
}
if (!cache_block_seq_len_) {
......@@ -101,9 +107,16 @@ void LlamaTritonModel<T>::handleMissingParams()
TM_LOG_WARNING("[LlamaTritonModel] `cache_block_seq_len` is not set, default to %d.", cache_block_seq_len_);
}
if (!cache_chunk_size_) {
cache_chunk_size_ = cache_max_block_count_;
TM_LOG_WARNING("[LlamaTritonModel] `cache_chunk_size` is not set, default to %d.", (int)cache_chunk_size_);
if (!engine_params_.cache_chunk_size) {
engine_params_.cache_chunk_size = engine_params_.cache_max_block_count;
TM_LOG_WARNING("[LlamaTritonModel] `cache_chunk_size` is not set, default to %d.",
(int)engine_params_.cache_chunk_size);
}
if (!engine_params_.num_tokens_per_iter) {
engine_params_.num_tokens_per_iter = engine_params_.max_context_token_num;
TM_LOG_WARNING("[LlamaTritonModel] `num_tokens_per_iter` is not set, default to `max_context_token_num` (%d).",
(int)engine_params_.num_tokens_per_iter);
}
}
......@@ -142,24 +155,18 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
}
}
model_name_ = reader.Get("llama", "model_name");
head_num_ = reader.GetInteger("llama", "head_num");
kv_head_num_ = reader.GetInteger("llama", "kv_head_num", 0);
size_per_head_ = reader.GetInteger("llama", "size_per_head");
inter_size_ = reader.GetInteger("llama", "inter_size");
num_layer_ = reader.GetInteger("llama", "num_layer");
vocab_size_ = reader.GetInteger("llama", "vocab_size");
norm_eps_ = reader.GetFloat("llama", "norm_eps");
start_id_ = reader.GetInteger("llama", "start_id");
end_id_ = reader.GetInteger("llama", "end_id");
max_batch_size_ = reader.GetInteger("llama", "max_batch_size", 0);
max_context_token_num_ = reader.GetInteger("llama", "max_context_token_num", 0);
session_len_ = reader.GetInteger("llama", "session_len", 0);
step_length_ = reader.GetInteger("llama", "step_length", 0);
cache_max_block_count_ = reader.GetFloat("llama", "cache_max_entry_count", 0);
cache_block_seq_len_ = reader.GetInteger("llama", "cache_block_seq_len", 0);
cache_chunk_size_ = reader.GetInteger("llama", "cache_chunk_size", 0);
use_context_fmha_ = reader.GetInteger("llama", "use_context_fmha", 1);
model_name_ = reader.Get("llama", "model_name");
head_num_ = reader.GetInteger("llama", "head_num");
kv_head_num_ = reader.GetInteger("llama", "kv_head_num", 0);
size_per_head_ = reader.GetInteger("llama", "size_per_head");
inter_size_ = reader.GetInteger("llama", "inter_size");
num_layer_ = reader.GetInteger("llama", "num_layer");
vocab_size_ = reader.GetInteger("llama", "vocab_size");
norm_eps_ = reader.GetFloat("llama", "norm_eps");
start_id_ = reader.GetInteger("llama", "start_id");
end_id_ = reader.GetInteger("llama", "end_id");
use_context_fmha_ = reader.GetInteger("llama", "use_context_fmha", 1);
cache_block_seq_len_ = reader.GetInteger("llama", "cache_block_seq_len", 0);
attn_bias_ = reader.GetInteger("llama", "attn_bias", 0);
quant_policy_ = reader.GetInteger("llama", "quant_policy", 0);
......@@ -173,11 +180,19 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
// attn_params_.use_dynamic_ntk = reader.GetInteger("llama", "use_dynamic_ntk", 0);
attn_params_.use_logn_attn = reader.GetInteger("llama", "use_logn_attn", 0);
handleMissingParams();
engine_params_.max_batch_size = reader.GetInteger("llama", "max_batch_size", 0);
engine_params_.max_context_token_num = reader.GetInteger("llama", "max_context_token_num", 0);
engine_params_.session_len = reader.GetInteger("llama", "session_len", 0);
engine_params_.step_length = reader.GetInteger("llama", "step_length", 0);
if (max_context_token_num_ <= max_batch_size_) {
max_context_token_num_ *= session_len_;
}
engine_params_.cache_max_block_count = reader.GetFloat("llama", "cache_max_entry_count", 0);
engine_params_.cache_chunk_size = reader.GetInteger("llama", "cache_chunk_size", 0);
engine_params_.num_tokens_per_iter = reader.GetInteger("llama", "num_tokens_per_iter", 0);
engine_params_.extra_tokens_per_iter = reader.GetInteger("llama", "extra_tokens_per_iter", 0);
engine_params_.max_prefill_iters = reader.GetInteger("llama", "max_prefill_iters", 1);
handleMissingParams();
shared_state_ = std::make_shared<typename ft::LlamaV2<T>::SharedState>();
shared_state_->barrier = std::make_shared<ft::Barrier>(tensor_para_size);
......@@ -258,19 +273,14 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
inter_size_,
num_layer_,
vocab_size_,
attn_params_,
norm_eps_,
max_batch_size_,
max_context_token_num_,
session_len_,
step_length_,
attn_params_,
start_id_,
end_id_,
cache_max_block_count_,
cache_block_seq_len_,
cache_chunk_size_,
quant_policy_,
use_context_fmha_,
engine_params_,
shared_state_,
shared_weights_[device_id].get(),
tensor_para,
......@@ -288,7 +298,7 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
std::move(cuda_device_prop_ptr),
shared_weights_[device_id],
std::move(llama),
session_len_});
engine_params_.session_len});
}
template<typename T>
......@@ -367,10 +377,11 @@ std::string LlamaTritonModel<T>::toString()
ss << "Model: "
<< "\nhead_num: " << head_num_ << "\nkv_head_num: " << kv_head_num_ << "\nsize_per_head: " << size_per_head_
<< "\ninter_size: " << inter_size_ << "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_
<< "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << max_batch_size_
<< "\nmax_context_token_num: " << max_context_token_num_ << "\nsession_len: " << session_len_
<< "\nstep_length: " << step_length_ << "\ncache_max_entry_count: " << cache_max_block_count_
<< "\ncache_block_seq_len: " << cache_block_seq_len_ << "\ncache_chunk_size: " << cache_chunk_size_
<< "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << engine_params_.max_batch_size
<< "\nmax_context_token_num: " << engine_params_.max_context_token_num
<< "\nsession_len: " << engine_params_.session_len << "\nstep_length: " << engine_params_.step_length
<< "\ncache_max_entry_count: " << engine_params_.cache_max_block_count
<< "\ncache_block_seq_len: " << cache_block_seq_len_ << "\ncache_chunk_size: " << engine_params_.cache_chunk_size
<< "\nuse_context_fmha: " << use_context_fmha_ << "\nstart_id: " << start_id_
<< "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_
<< "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_
......
......@@ -89,16 +89,11 @@ private:
size_t num_layer_;
size_t vocab_size_;
turbomind::LlamaAttentionParams attn_params_;
turbomind::EngineParams engine_params_;
float norm_eps_;
int max_batch_size_;
int max_context_token_num_;
int session_len_;
int step_length_;
int start_id_;
int end_id_;
float cache_max_block_count_;
int cache_block_seq_len_;
int cache_chunk_size_;
int use_context_fmha_;
size_t tensor_para_size_;
size_t pipeline_para_size_;
......
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include <utility>
namespace turbomind {
namespace detail {
template<int X>
inline constexpr std::integral_constant<int, X> _Int{};
template<class F, class P, class G, int... Xs, std::size_t... Is>
bool dispatch_impl(F&& f, P&& p, G g, std::integer_sequence<int, Xs...>, std::index_sequence<Is...>)
{
constexpr int N = sizeof...(Xs);
return (((((P &&) p)(_Int<Xs>) || (g && Is == N - 1)) && (((F &&) f)(_Int<Xs>), 1)) || ...);
}
} // namespace detail
template<class F, class P, int... Is, class G = std::true_type>
bool dispatch(std::integer_sequence<int, Is...> seq, P&& p, F&& f, G g = {})
{
return detail::dispatch_impl((F &&) f, (P &&) p, g, seq, std::make_index_sequence<sizeof...(Is)>{});
}
template<class F, int... Is, class G = std::true_type>
bool dispatch(std::integer_sequence<int, Is...> seq, F&& f)
{
return (((F &&) f)(detail::_Int<Is>) || ...);
}
} // namespace turbomind
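A usage sketch for the dispatch utility: select the smallest compile-time constant that covers a runtime value, the same pattern the copy kernels above rely on. The kernel named in the comment is a placeholder.

#include <utility>

#include "src/turbomind/utils/dispatch.h"

void LaunchForCount(int count)
{
    turbomind::dispatch(
        std::integer_sequence<int, 4, 8, 16, 32>{},
        [&](auto C) { return count <= C; },  // predicate: first constant that fits
        [&](auto C) {
            constexpr int kMax = decltype(C)::value;  // usable as a template argument
            (void)kMax;                               // e.g. SomeKernel<kMax><<<...>>>(...)
        });
    // With the default fallback argument (std::true_type), the last constant is
    // used when no predicate matches. The two-argument overload instead invokes
    // the functor on each constant until it returns true.
}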