Unverified Commit ab1767cf authored by Li Zhang, committed by GitHub

TurboMind 2 (#590)

* refresh decoder attention kernel

* block-level kv cache

* `BlockManager` & `SequenceManager`

* update

* update

* update

* update

* rename

* GQA support

* fix context length

* GQA dispatch

* kv8

* tune

* async stream cb

* nvtx

* config parsing

* debug

* optimize output cost

* split-k decoding

* minor

* truncate `session_len` by available blocks

* minor

* license

* fix

* dispatch `cp.async`

* fix linking

* fix

* fix deadlock

* guard input length

* correct start offset

* fix prefill chunking

* fix `cache_block_seq_len` param passing

* fix `block_size` fmtstr

* fix output tokens

* fix batch resizing

* fix masking of finished sequences

* add debug util

* free unused block early

* add ntk scaling and logn scaling

* cmake flags

* fix typo

* w4a16 for sm75

* fix msvc build

* fix msvc build

* fix block verification

* fix msvc build

* use `std::shuffle`

* fix lint

* fix lint

* fix lint

* clear incoming buffer

* clear finished requests

* fix batch initialization

* fix typo

* fix typo

* fix comparison
parent 06125966
......@@ -28,6 +28,8 @@
#include "src/turbomind/models/llama/LlamaDecoder.h"
#include "src/turbomind/models/llama/LlamaWeight.h"
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include "src/turbomind/utils/instance_comm.h"
......@@ -46,9 +48,7 @@ public:
std::vector<std::shared_ptr<Request>> stop_requests;
RequestQueue request_queue;
std::shared_ptr<Barrier> barrier;
// rank 0 sets flag to true if there are no more tasks in the request_queue
bool should_stop = false;
bool abort;
};
~LlamaV2();
......@@ -67,7 +67,8 @@ public:
int step_length,
int start_id,
int end_id,
int cache_max_entry_count,
float cache_max_block_count,
int cache_block_seq_len,
int cache_chunk_size,
int quant_policy,
bool use_context_fmha,
......@@ -104,39 +105,45 @@ public:
private:
friend class Batch;
void internalThreadEntry(int device_id);
void
initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, bool use_context_fmha, int quant_policy);
void initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_context_fmha,
int cache_block_seq_len,
int quant_policy);
void embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step);
void contextDecode(T* deocder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* context_decoder_input_buf,
T* context_decoder_output_buf,
const int* input_ids,
const int* input_length,
const int* history_length,
const int* context_length,
size_t token_num,
size_t max_input_len,
size_t max_context_len,
size_t session_len,
size_t batch_size);
void decoderForward(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* decoder_input,
const int* sequence_length,
const int* total_padding_count,
bool* finished,
int step,
int ite,
size_t session_len,
size_t batch_size);
void contextDecode(T* deocder_output,
uintptr_t* k_block_ptrs,
uintptr_t* v_block_ptrs,
void** k_tmp_ptrs,
void** v_tmp_ptrs,
T* context_decoder_input_buf,
T* context_decoder_output_buf,
const int* input_ids,
const int* input_length,
const int* context_length,
const int* cu_block_counts,
const float* rope_theta,
size_t token_num,
size_t max_input_len,
size_t max_context_len,
size_t session_len,
size_t batch_size);
void decoderForward(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* decoder_input,
const int* sequence_length,
const bool* finished,
const int* cu_block_counts,
const float* rope_theta,
int step,
int ite,
int sum_seq_len,
int max_seq_len,
size_t batch_size);
void postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size);
......@@ -156,7 +163,15 @@ private:
size_t token_ids_len,
size_t batch_size);
void start();
curandState_t* GetTopKState(int index)
{
return dynamic_decode_layer_->topk_curandstate_buf() + index;
}
curandState_t* GetTopPState(int index)
{
return dynamic_decode_layer_->topp_curandstate_buf() + index;
}
private:
friend class LlamaBatch<T>;
......@@ -169,6 +184,8 @@ private:
size_t vocab_size_padded_;
float rmsnorm_eps_ = 1e-6f;
const LlamaAttentionParams attn_params_;
static constexpr bool neox_rotary_style_ = false;
const int start_id_;
......@@ -176,6 +193,7 @@ private:
const size_t hidden_units_;
const size_t local_head_num_;
const size_t local_kv_head_num_;
NcclParam tensor_para_;
cudaStream_t stream_;
......@@ -186,20 +204,15 @@ private:
const bool debug_{false};
std::unique_ptr<LlamaCacheManager> kv_cache_mgr_;
LlamaWeight<T>* weights_{};
LlamaDecoder<T>* decoder_{};
LlamaContextDecoder<T>* context_decoder_{};
DynamicDecodeLayer<float>* dynamic_decode_layer_{};
const int step_length_;
LlamaBatch<T> batch_;
std::shared_ptr<SharedState> shared_state_;
std::thread internal_thread_;
ffi_api_lock_ctrl_t ffi_lock_ = nullptr;
const int step_length_;
std::shared_ptr<SharedState> shared_state_;
ffi_api_lock_ctrl_t ffi_lock_;
std::unique_ptr<LlamaBatch<T>> batch_;
};
} // namespace turbomind
......@@ -14,9 +14,11 @@ namespace turbomind {
struct Request {
uint64_t id;
bool start_flag;
bool end_flag;
bool stop_flag;
uint64_t priority;
bool start_flag;
bool end_flag;
bool stop_flag;
// per rank inputs/outputs
std::vector<TensorMap> inputs;
......@@ -31,7 +33,8 @@ struct Request {
kConflict = 2,
kBusy = 3,
kInactive = 4,
kFail = 5
kFail = 5,
kTooLong = 6
};
std::promise<int> signal;
};
......@@ -66,11 +69,16 @@ public:
void dequeue(std::vector<std::shared_ptr<Request>>& stop_requests,
std::vector<std::shared_ptr<Request>>& infer_requests,
unsigned max_infer_count,
bool blocking)
bool blocking,
bool& abort)
{
std::unique_lock<std::mutex> lock(mutex_);
if (blocking) {
cv_.wait(lock, [this] { return !(stop_queue_.empty() && infer_queue_.empty() && closed_ == false); });
cv_.wait(lock, [this] { return !(stop_queue_.empty() && infer_queue_.empty()) || closed_; });
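// the queue was closed while waiting; report abort to the caller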
if (closed_) {
abort = true;
return;
}
}
stop_requests.clear();
......@@ -88,8 +96,10 @@ public:
void close()
{
std::lock_guard<std::mutex> lock(mutex_);
closed_ = true;
{
std::lock_guard<std::mutex> lock(mutex_);
closed_ = true;
}
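// notify after releasing the lock so woken threads can acquire it immediately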
cv_.notify_all();
}
......@@ -98,7 +108,7 @@ private:
std::queue<std::shared_ptr<Request>> infer_queue_;
std::mutex mutex_;
std::condition_variable cv_;
bool closed_ = false;
bool closed_{false};
};
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/BlockManager.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/debug_utils.h"
#include "src/turbomind/utils/logger.h"
#include <cstddef>
#include <cstdlib>
#include <ctime>
#include <numeric>
#include <stdexcept>
namespace turbomind {
SequenceManager::SequenceManager(size_t layer_num,
size_t head_num,
size_t head_dim,
size_t block_seq_len,
double block_count,
int chunk_size,
size_t elem_bits,
int rank,
IAllocator* allocator):
block_seq_len_(block_seq_len)
{
constexpr int kBitsPerByte = 8;
// [2, L, H, block_seq_len, D]
size_t block_size = 2UL * layer_num * head_num * block_seq_len * head_dim * elem_bits / kBitsPerByte;
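// e.g. with hypothetical shapes layer_num=32, head_num=32, block_seq_len=128, head_dim=128
// and 16-bit elements: 2 * 32 * 32 * 128 * 128 * 16 / 8 = 64 MiB per block (K and V halves combined)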
block_manager_ = std::make_unique<BlockManager>(block_size, block_count, chunk_size, allocator);
val_offset_ = block_size / 2;
}
const Sequence* SequenceManager::Create(uint64_t id)
{
Sequence sequence{id, {}, {}, {}, {}, {}, {}, 0.f};
auto it = sequences_.find(id);
if (it != sequences_.end()) {
if (rank_ == 0) {
TM_LOG_WARNING("[SequenceManager][Create] Removing conflicting ID %ld", (long)id);
}
auto& seq = it->second;
if (seq.status != Sequence::kCached) {
unlocked_.insert(unlocked_.end(), seq.blocks.begin(), seq.blocks.end());
}
seq = std::move(sequence);
}
else {
it = sequences_.emplace_hint(it, id, std::move(sequence));
}
return &it->second;
}
const Sequence* SequenceManager::Get(uint64_t id)
{
if (auto it = sequences_.find(id); it != sequences_.end()) {
auto& sequence = it->second;
return &it->second;
}
return nullptr;
}
bool SequenceManager::Contains(uint64_t id)
{
return sequences_.find(id) != sequences_.end();
}
bool SequenceManager::Erase(uint64_t id)
{
if (auto it = sequences_.find(id); it != sequences_.end()) {
auto& seq = it->second;
if (seq.status != Sequence::kCached) {
unlocked_.insert(unlocked_.end(), seq.blocks.begin(), seq.blocks.end());
freed_.insert(freed_.end(), seq.blocks.begin(), seq.blocks.end());
}
else {
for (int i = 0; i < seq.blocks.size(); ++i) {
// filter invalidated blocks
if (seq.blocks[i]->unique_id == seq.block_unique_ids[i]) {
freed_.push_back(seq.blocks[i]);
}
}
}
sequences_.erase(it);
}
else {
throw std::out_of_range(std::to_string(id));
}
return false;
}
void SequenceManager::VerifyAndLockCached(const Sequences& sequences)
{
std::vector<const Block*> blocks;
for (const auto& p : sequences) {
auto& seq = const_cast<Sequence&>(*p);
if (seq.status != Sequence::kCached) {
continue;
}
FT_CHECK(seq.blocks.size() == seq.block_unique_ids.size());
if (need_verify_) {
for (int i = 0; i < seq.blocks.size(); ++i) {
if (seq.blocks[i]->unique_id != seq.block_unique_ids[i]) {
seq.blocks.resize(i);
seq.block_unique_ids.resize(i);
break;
}
}
}
blocks.insert(blocks.end(), seq.blocks.begin(), seq.blocks.end());
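// clamp the cached length to what the (possibly truncated) block list can hold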
seq.cache_len = std::min<int>(seq.cache_len, seq.blocks.size() * block_seq_len_);
seq.status = Sequence::kLocked;
}
block_manager_->Lock(blocks);
need_verify_ = false;
}
void SequenceManager::CommitUnlockAndFree()
{
if (!unlocked_.empty()) {
block_manager_->Unlock(unlocked_);
unlocked_.clear();
}
if (!freed_.empty()) {
block_manager_->Free(freed_);
freed_.clear();
}
}
void SequenceManager::UpdateAndSetUnlock(const Sequence& sequence)
{
FT_CHECK(sequence.status != Sequence::kCached);
auto& seq = const_cast<Sequence&>(sequence);
block_manager_->Touch(seq.blocks);
unlocked_.insert(unlocked_.end(), seq.blocks.begin(), seq.blocks.end());
seq.status = Sequence::kCached;
}
namespace {
struct Schedule {
int free;
int cached;
int allocate{};
int evict{};
int preempt{};
int last;
Sequences active;
std::vector<int> block_counts;
Sequences inactive;
Sequences victims;
Schedule(Snapshot snapshot, int size):
free(snapshot.free),
cached(snapshot.cached),
last(size),
use_count_(std::move(snapshot.use_count)),
unlocked_(size),
it_(size)
{
}
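// Walk victim candidates backwards from `it_` down to `vidx`, decrementing block use
// counts and recording, per sequence, how many of its blocks become unreferenced;
// returns that count for sequence `vidx`.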
int Unlock(const Sequences& seqs, int vidx)
{
while (vidx < it_) {
const auto& blocks = seqs[--it_]->blocks;
int count = 0;
for (const auto& p : blocks) {
count += static_cast<int>(--use_count_[p->id] == 0);
}
unlocked_[it_] = count;
}
return unlocked_[vidx];
}
private:
std::vector<int> use_count_;
std::vector<int> unlocked_;
int it_;
};
template<typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
os << "[";
for (int i = 0; i < v.size(); ++i) {
os << (i ? "," : "") << v[i];
}
os << "]";
return os;
}
std::ostream& operator<<(std::ostream& os, const Schedule& s)
{
os << "free=" << s.free << ", cached=" << s.cached << ", allocate=" << s.allocate << ", evict=" << s.evict
<< ", preempt=" << s.preempt << ", active=" << s.active << ", victims=" << s.victims
<< ", block_counts=" << s.block_counts << ", inactive=" << s.inactive;
return os;
}
struct Transaction {
int index_;
int block_count_;
int allocate_{};
int evict_{};
int preempt_{};
Sequences victims_;
const Sequences& sequences_;
Schedule& schedule_;
explicit Transaction(const Sequences& sequences, int index, int block_count, Schedule& sched):
sequences_(sequences), schedule_(sched), index_(index), block_count_(block_count)
{
}
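// Try to satisfy `block_count_` for sequence `index_`: first take free blocks, then
// evictable cached blocks, then preempt sequences further down the priority-sorted
// list; commit the transaction if the demand is met, otherwise mark the sequence inactive.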
void Process()
{
int count = block_count_;
int tmp = std::min(schedule_.free, count);
count -= tmp;
allocate_ += tmp;
tmp = std::min(schedule_.cached, count);
count -= tmp;
evict_ += tmp;
for (int vidx = schedule_.last - 1; count && vidx > index_; --vidx) {
if (sequences_[vidx]->status == Sequence::kCached) {
continue;
}
victims_.push_back(sequences_[vidx]);
preempt_ += schedule_.Unlock(sequences_, vidx);
if (count <= preempt_) {
evict_ += count;
count -= count;
schedule_.last = vidx; // ! modifying `schedule_.last` is part of commit
break;
}
}
if (count == 0) {
Commit();
}
else {
schedule_.inactive.push_back(sequences_[index_]);
}
}
void Commit()
{
// update available resources
schedule_.free -= allocate_;
FT_CHECK(schedule_.free >= 0);
schedule_.cached += preempt_;
schedule_.cached -= evict_;
FT_CHECK(schedule_.cached >= 0);
// update scheduled operations
schedule_.allocate += allocate_;
schedule_.evict += evict_;
schedule_.preempt += preempt_;
schedule_.victims.insert(schedule_.victims.end(), victims_.begin(), victims_.end());
// update active sequences
schedule_.active.push_back(sequences_[index_]);
schedule_.block_counts.push_back(block_count_);
}
};
std::ostream& operator<<(std::ostream& os, const Transaction& trans)
{
os << "index=" << trans.index_ << ", block_count=" << trans.block_count_ << ", allocate=" << trans.allocate_
<< ", evict=" << trans.evict_ << ", preempt=" << trans.preempt_ << ", victims=" << trans.victims_;
return os;
}
} // namespace
void SequenceManager::SortByPriority(Sequences& sequences,
std::vector<int>& context_lengths,
const std::vector<uint64_t>& priorities)
{
// sort according to priority
std::vector<int> idxs(sequences.size());
std::iota(idxs.begin(), idxs.end(), 0);
std::sort(idxs.begin(), idxs.end(), [&](int i, int j) {
return priorities[i] < priorities[j]; //
});
Sequences tmp_sequences(sequences.size());
std::vector<int> tmp_lengths(context_lengths.size());
for (int i = 0; i < sequences.size(); ++i) {
tmp_sequences[i] = sequences[idxs[i]];
tmp_lengths[i] = context_lengths[idxs[i]];
}
sequences.swap(tmp_sequences);
context_lengths.swap(tmp_lengths);
}
std::vector<int> SequenceManager::CountRequiredBlocks(const Sequences& sequences,
const std::vector<int>& context_lengths,
int step_length)
{
std::vector<int> required(sequences.size());
for (int i = 0; i < sequences.size(); ++i) {
int seq_len = context_lengths[i] + step_length;
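// blocks needed to cover seq_len, rounded up, minus blocks the sequence already owns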
int count = (seq_len + block_seq_len_ - 1) / block_seq_len_ - static_cast<int>(sequences[i]->blocks.size());
required[i] = std::max(0, count);
}
return required;
}
void SequenceManager::AssignAndActivate(const Sequences& sequences, //
const std::vector<int>& counts,
const std::vector<const Block*>& blocks)
{
FT_CHECK(sequences.size() == counts.size());
auto first = blocks.begin();
for (int i = 0; i < sequences.size(); ++i) {
auto& s = const_cast<Sequence&>(*sequences[i]);
auto count = counts[i];
// dbg(count);
auto last = first + count;
std::for_each(first, last, [&](const Block* b) {
s.blocks.push_back(b);
s.block_unique_ids.push_back(b->unique_id);
});
s.status = Sequence::kActive;
first = last;
}
}
auto SequenceManager::Materialize(Sequences sequences,
std::vector<int> context_lengths,
const std::vector<uint64_t>& priorities,
int step_length) -> Outcome
{
////////////////////////////////////////////////////////////////////////////////
/// Schedule the assignment of blocks to sequences
// process deferred unlock and free operations
CommitUnlockAndFree();
SortByPriority(sequences, context_lengths, priorities);
// Verify and lock cached sequences to avoid their blocks being evicted unnoticed
// the blocks can still be preempted later
VerifyAndLockCached(sequences);
std::vector<int> required = CountRequiredBlocks(sequences, context_lengths, step_length);
// dbg(required);
Schedule schedule(block_manager_->TakeSnapshot(), sequences.size());
// `schedule.last` is decreasing in the loop
for (int i = 0; i < schedule.last; ++i) {
Transaction{sequences, i, required[i], schedule}.Process();
}
// mark remaining sequences invalid
for (int i = schedule.last; i < sequences.size(); ++i) {
schedule.inactive.push_back(sequences[i]);
}
////////////////////////////////////////////////////////////////////////////////
/// Schedule is ready, time to execute it. (locked -> cached -> free -> locked)
// combine allocate and evict since evicted blocks are reused by allocation
schedule.allocate += schedule.evict;
if (schedule.allocate) {
dbg(*block_manager_);
}
Outcome outcome{};
outcome.allocation = schedule.allocate;
outcome.swap_in = std::count_if(schedule.active.begin(), schedule.active.end(), [](auto p) {
if (p->status != Sequence::kActive) {
dbg(*p);
}
return p->status != Sequence::kActive; //
});
outcome.swap_out = std::count_if(schedule.inactive.begin(), schedule.inactive.end(), [](auto p) {
if (p->status == Sequence::kActive) {
dbg(*p);
}
return p->status == Sequence::kActive; //
});
// release preempted blocks -> cached
if (!schedule.victims.empty()) {
for (const auto& p : schedule.victims) {
UpdateAndSetUnlock(*p);
}
CommitUnlockAndFree();
}
// evict cached blocks -> free
if (schedule.evict) {
block_manager_->Evict(schedule.evict);
need_verify_ = true;
}
// allocate & assign blocks
{
std::vector<const Block*> blocks;
if (schedule.allocate) {
blocks = block_manager_->Allocate(schedule.allocate);
}
AssignAndActivate(schedule.active, schedule.block_counts, blocks);
}
// active -> locked
for (const auto& p : schedule.inactive) {
if (p->status == Sequence::kActive) {
const_cast<Sequence*>(p)->status = Sequence::kLocked;
}
}
return outcome;
}
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/turbomind/models/llama/BlockManager.h"
namespace turbomind {
struct Sequence {
enum Status
{
kCached = 0,
kLocked,
kActive
};
uint64_t id;
Status status;
std::vector<const Block*> blocks;
std::vector<uint64_t> block_unique_ids;
mutable std::vector<int> tokens; // update by user
mutable int cache_len;
// additional data kept round-to-round
mutable std::vector<std::byte> random_state; // update by user
mutable float rope_theta;
friend std::ostream& operator<<(std::ostream& os, const Sequence& seq);
};
using Sequences = std::vector<const Sequence*>;
inline std::ostream& operator<<(std::ostream& os, const Sequence& seq)
{
os << "id=" << seq.id << ", status=" << seq.status << ", token_count=" << seq.tokens.size()
<< ", block_count=" << seq.blocks.size() << ", cache_len=" << seq.cache_len
<< ", random_state_size=" << seq.random_state.size();
return os;
}
class SequenceManager {
public:
explicit SequenceManager(size_t layer_num,
size_t head_num,
size_t head_dim,
size_t block_seq_len,
double block_count,
int chunk_size,
size_t elem_bits,
int rank,
IAllocator* allocator);
SequenceManager(const SequenceManager&) = delete;
SequenceManager(SequenceManager&&) noexcept = default;
const Sequence* Create(uint64_t id);
const Sequence* Get(uint64_t id);
bool Contains(uint64_t id);
bool Erase(uint64_t id);
void UpdateAndSetUnlock(const Sequence& seq);
struct Outcome {
int allocation;
int swap_in;
int swap_out;
};
Outcome Materialize(Sequences sequences,
std::vector<int> context_lengths,
const std::vector<uint64_t>& priorities,
int step_length);
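// Each block stores keys in its first half and values starting at `val_offset_`
// (half the block size, see the constructor).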
void* OffsetKey(void* block_ptr)
{
return block_ptr;
}
void* OffsetVal(void* block_ptr)
{
return (std::byte*)block_ptr + val_offset_;
}
int max_block_count() const noexcept
{
return block_manager_->max_block_count();
}
private:
void CommitUnlockAndFree();
void VerifyAndLockCached(const Sequences& sequences);
std::vector<int> CountRequiredBlocks(const Sequences& sequences, //
const std::vector<int>& context_lengths,
int step_length);
static void SortByPriority(Sequences& sequences, //
std::vector<int>& context_lengths,
const std::vector<uint64_t>& priorities);
static void AssignAndActivate(const Sequences& sequences, //
const std::vector<int>& block_counts,
const std::vector<const Block*>& blocks);
private:
int block_seq_len_;
int rank_;
size_t val_offset_{};
bool need_verify_{};
// Use `std::map` to avoid reference invalidation
std::map<uint64_t, Sequence> sequences_;
std::unique_ptr<BlockManager> block_manager_;
std::vector<const Block*> unlocked_;
std::vector<const Block*> freed_;
};
inline std::ostream& operator<<(std::ostream& os, const SequenceManager::Outcome& oc)
{
os << "allocation: " << oc.allocation << ", swap-in: " << oc.swap_in << ", swap-out: " << oc.swap_out;
return os;
}
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/turbomind/kernels/decoder_multihead_attention/array_ops.h"
#include "src/turbomind/kernels/gemm_s_f16/common.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
#include "src/turbomind/utils/logger.h"
#include <type_traits>
namespace turbomind {
......@@ -199,392 +203,248 @@ void invokeCreateCausalMasks(
template void invokeCreateCausalMasks(float* mask, const int*, const int*, int, int, int, cudaStream_t);
template void invokeCreateCausalMasks(half* mask, const int*, const int*, int, int, int, cudaStream_t);
template<typename T>
__global__ void extend_key_cache(T** k_dst,
const size_t dst_offset,
const T* k_src,
const int head_num,
const int size_per_head,
const int* query_length,
const int* history_length,
const int max_q_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
template<typename Ti, typename To>
struct ExtendKvCache {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
static constexpr int MaxElemSize = std::max(sizeof(Ti), sizeof(To));
static constexpr int X_ELEMS = 16 / MaxElemSize;
// x dim is now handled by uint4 type
const auto key_src = reinterpret_cast<const uint4*>(k_src);
const auto key_dst = reinterpret_cast<uint4*>(k_dst[batch_id] + dst_offset);
using Vi = Array<Ti, X_ELEMS>;
using Vo = Array<To, X_ELEMS>;
const auto seq_len = query_length[batch_id];
const auto t_offset = history_length[batch_id];
using Transform = ConvertKvCache<Ti, To>;
const int k_head_size_id = idx % size_per_head_div_x;
const int k_seq_len_id = idx / size_per_head_div_x;
struct Params {
To** k_dst_ptrs;
To** v_dst_ptrs;
const Ti* k_src;
const Ti* v_src;
const int* cu_block_counts;
const int* query_length;
const int* context_length;
int block_length;
size_t dst_layer_offset;
int max_q_len;
int head_num;
int head_dim;
Transform transform_k;
Transform transform_v;
};
if (k_seq_len_id < seq_len) {
// [B, H, s, D/x] -> [H, D/x, S[t:t+s]]
__device__ void operator()(const Params& params) const
{
const int batch_id = blockIdx.y;
const int64_t dst_idx = head_id * size_per_head_div_x * max_seq_len + // H
k_head_size_id * max_seq_len + // D/x
t_offset + k_seq_len_id; // s + offset
const int query_len = params.query_length[batch_id];
const int history_len = params.context_length[batch_id] - query_len;
const int cu_block_cnt = params.cu_block_counts[batch_id];
const int64_t src_idx = batch_id * head_num * size_per_head_div_x * max_q_len + // B
head_id * size_per_head_div_x * max_q_len + // H
k_seq_len_id * size_per_head_div_x + // s
k_head_size_id; // D/x
const int head_id = blockIdx.z;
key_dst[dst_idx] = key_src[src_idx];
}
}
const int size_per_head_div_x = params.head_dim / X_ELEMS;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
const int head_size_id = idx % size_per_head_div_x;
const int seq_len_id = idx / size_per_head_div_x;
template<typename T>
__global__ void extend_value_cache(T** v_dst,
const size_t dst_offset,
const T* v_src,
const int head_num,
const int size_per_head,
const int* query_length,
const int* history_length,
const int max_q_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int cache_block_index = (seq_len_id + history_len) / params.block_length;
const int cache_block_offset = (seq_len_id + history_len) % params.block_length;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
const auto k_val_src = params.k_src;
const auto v_val_src = params.v_src;
// x dim is now handled by uint4 type
const auto val_src = reinterpret_cast<const uint4*>(v_src);
const auto val_dst = reinterpret_cast<uint4*>(v_dst[batch_id] + dst_offset);
const auto k_val_dst = (params.k_dst_ptrs + cu_block_cnt)[cache_block_index] + params.dst_layer_offset;
const auto v_val_dst = (params.v_dst_ptrs + cu_block_cnt)[cache_block_index] + params.dst_layer_offset;
const auto seq_len = query_length[batch_id];
const auto t_offset = history_length[batch_id];
if (seq_len_id < query_len) {
// [B, H, s, D/x] -> [H, S[t:t+s], D/x]
const int64_t dst_idx = head_id * params.block_length * size_per_head_div_x + // H
cache_block_offset * size_per_head_div_x + // s + offset
head_size_id; // D/x
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
const int64_t src_idx = batch_id * params.head_num * params.max_q_len * size_per_head_div_x + // B
head_id * params.max_q_len * size_per_head_div_x + // H
seq_len_id * size_per_head_div_x + // s
head_size_id; // D/x
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] -> [H, S[t:t+s], D/x]
const int64_t dst_idx = head_id * size_per_head_div_x * max_seq_len + // H
(v_seq_len_id + t_offset) * size_per_head_div_x + // s + offset
v_head_size_id; // D/x
Vi k_vi;
Vi v_vi;
const int64_t src_idx = batch_id * head_num * size_per_head_div_x * max_q_len + // B
head_id * size_per_head_div_x * max_q_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
Ldg(k_vi, k_val_src + src_idx * X_ELEMS);
Ldg(v_vi, v_val_src + src_idx * X_ELEMS);
val_dst[dst_idx] = val_src[src_idx];
}
}
Vo k_vo = params.transform_k(k_vi);
Vo v_vo = params.transform_v(v_vi);
inline __device__ float2 float2div(float a, float2 b)
{
float2 c;
c.x = b.x / a;
c.y = b.y / a;
return c;
}
inline __device__ float2 float2sub(float zp, float2 val)
{
float2 ret;
ret.x = val.x - zp;
ret.y = val.y - zp;
return ret;
}
static inline __device__ half4 char4_scale_to_half4(char4 value, const float scale, const float zp)
{
half4 dst;
dst.x = __float2half(value.x * scale + zp);
dst.y = __float2half(value.y * scale + zp);
dst.z = __float2half(value.z * scale + zp);
dst.w = __float2half(value.w * scale + zp);
return dst;
}
Store(k_val_dst + dst_idx * X_ELEMS, k_vo);
Store(v_val_dst + dst_idx * X_ELEMS, v_vo);
}
}
};
static inline __device__ uint32_t float4_to_char4(float x, float y, float z, float w)
{
uint32_t dst;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 720
uint32_t a;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(a) : "f"(x));
uint32_t b;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(b) : "f"(y));
uint32_t c;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(c) : "f"(z));
uint32_t d;
asm volatile("cvt.rni.sat.s32.f32 %0, %1;\n" : "=r"(d) : "f"(w));
asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, 0;\n" : "=r"(dst) : "r"(d), "r"(c));
asm volatile("cvt.pack.sat.s8.s32.b32 %0, %1, %2, %0;\n" : "+r"(dst) : "r"(b), "r"(a));
#else
char4 tmp;
tmp.x = x;
tmp.y = y;
tmp.z = z;
tmp.w = w;
dst = reinterpret_cast<const uint32_t&>(tmp);
#endif
return dst;
}
namespace {
template<typename T>
__global__ void extend_value_cache_int8(int8_t** v_dst,
const size_t dst_offset,
const T* v_src,
const int head_num,
const int size_per_head,
const int* query_length,
const int* history_length,
const int max_q_len,
const int max_seq_len,
const float v_scale,
const float v_zp)
template<class Kernel, class Params>
__global__ void KernelWrapper(Params params)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto val_src = reinterpret_cast<const uint4*>(v_src);
const auto val_dst = reinterpret_cast<uint2*>(v_dst[batch_id] + dst_offset);
const auto seq_len = query_length[batch_id];
const auto t_offset = history_length[batch_id];
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] -> [H, S[t:t+s], D/x]
const int64_t dst_idx = head_id * size_per_head_div_x * max_seq_len + // H
(v_seq_len_id + t_offset) * size_per_head_div_x + // s + offset
v_head_size_id; // D/x
const int64_t src_idx = batch_id * head_num * size_per_head_div_x * max_q_len + // B
head_id * size_per_head_div_x * max_q_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
Kernel{}(params);
};
// scale to int8 and write
const auto value = val_src[src_idx];
auto to_ptr = reinterpret_cast<uint32_t*>(val_dst + dst_idx);
float2 float2_0 = float2div(v_scale, float2sub(v_zp, mmha::half2_to_float2(value.x)));
float2 float2_1 = float2div(v_scale, float2sub(v_zp, mmha::half2_to_float2(value.y)));
to_ptr[0] = float4_to_char4(float2_0.x, float2_0.y, float2_1.x, float2_1.y);
float2_0 = float2div(v_scale, float2sub(v_zp, mmha::half2_to_float2(value.z)));
float2_1 = float2div(v_scale, float2sub(v_zp, mmha::half2_to_float2(value.w)));
to_ptr[1] = float4_to_char4(float2_0.x, float2_0.y, float2_1.x, float2_1.y);
}
}
} // namespace
template<typename T>
void invokeExtendKVCache(T** k_dst,
T** v_dst,
size_t dst_offset,
void invokeExtendKVCache(void** k_dst_ptrs,
void** v_dst_ptrs,
const T* k_src,
const T* v_src,
int local_batch_size,
const int* cu_block_counts,
const int* query_length,
const int* context_length,
int batch_size,
int block_length,
size_t dst_layer_offset,
int max_q_len,
const int* history_length,
int max_seq_len,
int size_per_head,
int local_head_num,
cudaStream_t stream,
int head_dim,
int head_num,
int quant,
const float* kv_scale)
const float* kv_params,
cudaStream_t stream)
{
constexpr int block_sz = 128;
constexpr int x = (sizeof(T) == 4) ? 4 : 8;
dim3 grid((max_q_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num);
if (quant & QuantPolicy::kCacheKVInt8) {
extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(reinterpret_cast<int8_t**>(k_dst),
dst_offset,
k_src,
local_head_num,
size_per_head,
query_length,
history_length,
max_q_len,
max_seq_len,
kv_scale[0],
kv_scale[1]);
extend_value_cache_int8<<<grid, block_sz, 0, stream>>>(reinterpret_cast<int8_t**>(v_dst),
dst_offset,
v_src,
local_head_num,
size_per_head,
query_length,
history_length,
max_q_len,
max_seq_len,
kv_scale[2],
kv_scale[3]);
}
else {
extend_value_cache<<<grid, block_sz, 0, stream>>>(k_dst,
dst_offset,
k_src,
local_head_num,
size_per_head,
query_length,
history_length,
max_q_len,
max_seq_len);
extend_value_cache<<<grid, block_sz, 0, stream>>>(v_dst,
dst_offset,
v_src,
local_head_num,
size_per_head,
query_length,
history_length,
max_q_len,
max_seq_len);
}
}
template void invokeExtendKVCache(float**,
float**,
size_t,
const float*,
const float*,
int,
const int*,
int,
const int*,
int,
int,
int,
cudaStream_t stream,
int,
const float*);
template void invokeExtendKVCache(half**,
half**,
size_t,
const half*,
const half*,
int,
const int*,
int,
const int*,
int,
int,
int,
cudaStream_t stream,
int,
const float*);
template<typename T>
__global__ void transpose_value_cache(T* v_dst, //
const T** v_src,
const size_t src_offset,
const int head_num,
const int head_n_rep,
const int size_per_head,
const int* seq_length,
const int max_kv_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto val_src = reinterpret_cast<const uint4*>(v_src[batch_id] + src_offset);
const auto val_dst = reinterpret_cast<uint4*>(v_dst);
const auto seq_len = seq_length[batch_id];
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] <- [B, H, S[:s], D/x]
const int64_t src_idx = head_id / head_n_rep * size_per_head_div_x * max_seq_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
const int64_t dst_idx = batch_id * head_num * size_per_head_div_x * max_kv_len + // B
head_id * size_per_head_div_x * max_kv_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
val_dst[dst_idx] = val_src[src_idx];
}
auto fn = [&](auto value) {
using Tout = decltype(value);
using Kernel = ExtendKvCache<T, Tout>;
dim3 grid((max_q_len * head_dim / Kernel::X_ELEMS + block_sz - 1) / block_sz, batch_size, head_num);
typename Kernel::Params params{(Tout**)k_dst_ptrs,
(Tout**)v_dst_ptrs,
k_src,
v_src,
cu_block_counts,
query_length,
context_length,
block_length,
dst_layer_offset,
max_q_len,
head_num,
head_dim,
{kv_params[0], kv_params[1]},
{kv_params[2], kv_params[3]}};
KernelWrapper<Kernel><<<grid, block_sz, 0, stream>>>(params);
};
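// pick the destination element type: int8 when KV-int8 quantization is enabled, otherwise T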
(quant & QuantPolicy::kCacheKVInt8) ? fn(int8_t{}) : fn(T{});
}
template<typename T>
__global__ void transpose_value_cache_int8(T* v_dst, //
const int8_t** v_src,
const size_t src_offset,
const int head_num,
const int head_n_rep,
const int size_per_head,
const int* seq_length,
const int max_kv_len,
const int max_seq_len,
const float v_scale,
const float v_zp)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto val_src = reinterpret_cast<const uint2*>(v_src[batch_id] + src_offset);
const auto val_dst = reinterpret_cast<uint4*>(v_dst);
const auto seq_len = seq_length[batch_id];
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] <- [B, H, S[:s], D/x]
const int64_t src_idx = head_id / head_n_rep * size_per_head_div_x * max_seq_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
const int64_t dst_idx = batch_id * head_num * size_per_head_div_x * max_kv_len + // B
head_id * size_per_head_div_x * max_kv_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
// int8x8 -> fp16x8
const auto from_ptr = reinterpret_cast<const char4*>(val_src + src_idx);
auto to_ptr = reinterpret_cast<half4*>(val_dst + dst_idx);
to_ptr[0] = char4_scale_to_half4(from_ptr[0], v_scale, v_zp);
to_ptr[1] = char4_scale_to_half4(from_ptr[1], v_scale, v_zp);
template void invokeExtendKVCache(void** k_dst_ptrs,
void** v_dst_ptrs,
const float* k_src,
const float* v_src,
const int* cu_block_counts,
const int* query_length,
const int* history_length,
int batch_size,
int block_length,
size_t dst_layer_offset,
int max_q_len,
int head_dim,
int head_num,
int quant,
const float* kv_scale,
cudaStream_t stream);
template void invokeExtendKVCache(void** k_dst_ptrs,
void** v_dst_ptrs,
const half* k_src,
const half* v_src,
const int* cu_block_counts,
const int* query_length,
const int* history_length,
int batch_size,
int block_length,
size_t dst_layer_offset,
int max_q_len,
int head_dim,
int head_num,
int quant,
const float* kv_scale,
cudaStream_t stream);
template<typename Ti, typename To>
struct TransposeKvCache {
static constexpr int MaxElemSize = std::max(sizeof(Ti), sizeof(To));
static constexpr int X_ELEMS = 16 / MaxElemSize;
using Vi = Array<Ti, X_ELEMS>;
using Vo = Array<To, X_ELEMS>;
using Transform = ConvertKvCache<Ti, To>;
struct Params {
To* k_dst;
To* v_dst;
const Ti** k_src;
const Ti** v_src;
size_t src_offset;
int head_num;
int head_n_rep;
int size_per_head;
const int* seq_length;
int max_kv_len;
int max_seq_len;
Transform transform_k;
Transform transform_v;
// float k_scale;
// float k_zp;
// float v_scale;
// float v_zp;
};
__device__ void operator()(const Params& params) const
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
const int size_per_head_div_x = params.size_per_head / X_ELEMS;
const auto k_src = params.k_src[batch_id] + params.src_offset;
const auto v_src = params.v_src[batch_id] + params.src_offset;
const auto k_dst = params.k_dst;
const auto v_dst = params.v_dst;
const auto seq_len = params.seq_length[batch_id];
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] <- [B, H, S[:s], D/x]
const int64_t src_idx = head_id / params.head_n_rep * size_per_head_div_x * params.max_seq_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
const int64_t dst_idx = batch_id * params.head_num * size_per_head_div_x * params.max_kv_len + // B
head_id * size_per_head_div_x * params.max_kv_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
Vi k_vi;
Vi v_vi;
Ldg(k_vi, k_src + src_idx * X_ELEMS);
Ldg(v_vi, v_src + src_idx * X_ELEMS);
Vo k_vo = params.transform_k(k_vi);
Vo v_vo = params.transform_v(v_vi);
Store(k_dst + dst_idx * X_ELEMS, k_vo);
Store(v_dst + dst_idx * X_ELEMS, v_vo);
}
}
}
};
template<typename T>
void invokeTransposeKVCache(T* key_cache_trans,
......@@ -601,59 +461,34 @@ void invokeTransposeKVCache(T* key_cache_trans,
int head_n_rep,
cudaStream_t stream,
int quant,
const float* kv_scale)
const float* kv_params)
{
constexpr int block_sz = 128;
constexpr int x = (sizeof(T) == 4) ? 4 : 8;
dim3 grid((max_kv_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num);
if (quant & QuantPolicy::kCacheKVInt8) {
transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(key_cache_trans,
reinterpret_cast<const int8_t**>(key_cache),
src_offset,
head_num,
head_n_rep,
size_per_head,
key_length,
max_kv_len,
max_seq_len,
kv_scale[0],
kv_scale[1]);
transpose_value_cache_int8<<<grid, block_sz, 0, stream>>>(val_cache_trans,
reinterpret_cast<const int8_t**>(val_cache),
src_offset,
head_num,
head_n_rep,
size_per_head,
key_length,
max_kv_len,
max_seq_len,
kv_scale[2],
kv_scale[3]);
}
else {
transpose_value_cache<<<grid, block_sz, 0, stream>>>(key_cache_trans,
key_cache,
src_offset,
head_num,
head_n_rep,
size_per_head,
key_length,
max_kv_len,
max_seq_len);
transpose_value_cache<<<grid, block_sz, 0, stream>>>(val_cache_trans,
val_cache,
src_offset,
head_num,
head_n_rep,
size_per_head,
key_length,
max_kv_len,
max_seq_len);
}
auto fn = [&](auto value) {
using Tin = decltype(value);
using Kernel = TransposeKvCache<Tin, T>;
dim3 grid((max_kv_len * size_per_head / Kernel::X_ELEMS + block_sz - 1) / block_sz, batch_size, head_num);
typename Kernel::Params params{key_cache_trans,
val_cache_trans,
(const Tin**)key_cache,
(const Tin**)val_cache,
src_offset,
head_num,
head_n_rep,
size_per_head,
key_length,
max_kv_len,
max_seq_len,
{kv_params[0], kv_params[1]},
{kv_params[2], kv_params[3]}};
KernelWrapper<Kernel><<<grid, block_sz, 0, stream>>>(params);
};
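// pick the source element type: int8 when KV-int8 quantization is enabled, otherwise T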
(quant & QuantPolicy::kCacheKVInt8) ? fn(int8_t{}) : fn(T{});
}
template void invokeTransposeKVCache(float*,
......@@ -718,12 +553,59 @@ void invokeGatherOutput(int* output_ids,
int batch_size,
cudaStream_t stream)
{
int block_size = 512;
int block_size = 128;
int grid_size = batch_size;
gatherOutput<<<grid_size, block_size, 0, stream>>>(
output_ids, ids, context_length, max_context_len, max_gen_step, max_output_len, batch_size);
}
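// One thread block per batch slot: copy that sequence's tokens (up to the requested
// output length) into the per-request output buffer and write back the sequence length.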
__global__ void updateOutput(int** request_output_ids_ptrs,
int** request_seqlen_ptrs,
const int* output_ids,
const int* sequence_lengths,
const int* request_output_ids_lens,
int max_session_len,
bool token_generated)
{
const int batch_id = blockIdx.x;
auto request_output_ids = request_output_ids_ptrs[batch_id];
auto request_seqlen = request_seqlen_ptrs[batch_id];
output_ids += max_session_len * batch_id;
const int seqlen = sequence_lengths[batch_id] + (int)token_generated;
const int output_len = min(seqlen, request_output_ids_lens[batch_id]);
for (int i = threadIdx.x; i < output_len; i += blockDim.x) {
request_output_ids[i] = output_ids[i];
}
*request_seqlen = seqlen;
}
void invokeUpdateOutput(int** request_output_ids_ptrs,
int** request_seqlen_ptrs,
const int* output_ids,
const int* sequence_lengths,
const int* request_output_ids_lens,
int max_session_len,
bool token_generated,
int batch_size,
cudaStream_t stream)
{
constexpr int block_size = 128;
const int grid_size = batch_size;
updateOutput<<<grid_size, block_size, 0, stream>>>(request_output_ids_ptrs,
request_seqlen_ptrs,
output_ids,
sequence_lengths,
request_output_ids_lens,
max_session_len,
token_generated);
}
#define VERSION_SWITCH(VERSION, CONST_NAME, ...) \
[&] { \
if (VERSION == 2) { \
......
......@@ -34,21 +34,22 @@ void invokeCreateCausalMasks(
T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len, int batch_size, cudaStream_t stream);
template<typename T>
void invokeExtendKVCache(T** k_dst,
T** v_dst,
size_t layer_offset,
void invokeExtendKVCache(void** k_dst_ptrs,
void** v_dst_ptrs,
const T* k_src,
const T* v_src,
int batch_size,
const int* cu_block_counts,
const int* query_length,
const int* context_length,
int batch_size,
int block_length,
size_t dst_layer_offset,
int max_q_len,
const int* history_length,
int max_seq_len,
int size_per_head,
int local_head_num,
cudaStream_t stream,
int head_dim,
int head_num,
int quant,
const float* kv_scale);
const float* kv_scale,
cudaStream_t stream);
template<typename T>
void invokeTransposeKVCache(T* key_cache_trans,
......@@ -76,6 +77,16 @@ void invokeGatherOutput(int* output_ids,
int batch_size,
cudaStream_t stream);
void invokeUpdateOutput(int** request_output_ids_ptrs,
int** request_seqlen_ptrs,
const int* output_ids,
const int* sequence_lengths,
const int* request_output_ids_lens,
int max_session_len,
bool token_generated,
int batch_size,
cudaStream_t stream);
void invokeMyCopyInt(int* dst, const int* src, size_t count, cudaStream_t st);
template<typename T>
......
......@@ -5,11 +5,12 @@
namespace turbomind {
struct LlamaAttentionParams {
int rotray_embedding_dim;
int rotary_embedding_dim;
float rotary_embedding_base;
int max_position_embeddings;
bool use_dynamic_ntk;
bool use_logn_attn;
float rope_scaling_factor;
// bool use_dynamic_ntk;
bool use_logn_attn;
};
} // namespace turbomind
......@@ -157,4 +157,13 @@ bool isDebug()
return is_debug;
}
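// Thread-local table mapping a batch slot index to a sequence id; grows on demand
// and defaults to -1 for unused slots.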
int64_t& gSequenceIds(int batch_idx)
{
thread_local std::vector<int64_t> ids{};
if (batch_idx >= ids.size()) {
ids.resize(batch_idx + 1, -1);
}
return ids.at(batch_idx);
}
} // namespace turbomind
......@@ -2,6 +2,7 @@
#pragma once
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/nvtx_utils.h"
#include <cuda_runtime.h>
#include <sstream>
#include <string>
......@@ -66,4 +67,18 @@ size_t curandStateGetSize();
bool isDebug();
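// RAII helper: pushes an NVTX range on construction and pops it on destruction.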
struct NvtxScope {
explicit NvtxScope(const std::string& name)
{
PUSH_RANGE(name.c_str());
}
~NvtxScope()
{
POP_RANGE;
}
};
int64_t& gSequenceIds(int batch_idx);
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#include "BlockManager.h"
#include "SequenceManager.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/debug_utils.h"
#include <catch2/catch_test_macros.hpp>
#include <iterator>
using namespace turbomind;
std::ostream& operator<<(std::ostream& os, const Block* b)
{
os << "(" << b->id << "," << b->timestamp << ")";
return os;
}
TEST_CASE("BlockManager")
{
Allocator<AllocatorType::CUDA> allocator(0);
BlockManager m(1024, 32, 8, &allocator);
REQUIRE(m.max_block_count() == 32);
REQUIRE(m.free_count() == 32);
auto blocks1 = m.Allocate(10);
dbg(blocks1);
REQUIRE(blocks1.size() == 10);
REQUIRE(m.active_count() == blocks1.size());
REQUIRE(m.free_count() == 22);
auto blocks2 = m.Allocate(6);
REQUIRE(blocks2.size() == 6);
REQUIRE(m.active_count() == blocks1.size() + blocks2.size());
REQUIRE(m.free_count() == 16);
auto blocks3 = m.Allocate(16);
REQUIRE(blocks3.size() == 16);
REQUIRE(m.active_count() == 32);
REQUIRE(m.free_count() == 0);
std::copy(blocks3.begin(), blocks3.end(), std::back_inserter(blocks1));
std::copy(blocks2.begin(), blocks2.end(), std::back_inserter(blocks1));
m.Touch(blocks1);
REQUIRE(m.Unlock(blocks1) == 32);
REQUIRE(m.active_count() == 0);
REQUIRE(m.free_count() == 0);
REQUIRE(m.cached_count() == 32);
m.Evict(16);
REQUIRE(m.active_count() == 0);
REQUIRE(m.free_count() == 16);
REQUIRE(m.cached_count() == 16);
auto blocks4 = m.Allocate(14);
REQUIRE(m.active_count() == 14);
REQUIRE(m.free_count() == 2);
REQUIRE(m.cached_count() == 16);
}
TEST_CASE("SequenceManager basic test")
{
Allocator<AllocatorType::CUDA> allocator(0);
SequenceManager manager(32, 32, 128, 128, 20, 4, 16, 0, &allocator);
REQUIRE(manager.max_block_count() == 20);
REQUIRE(manager.Contains(1) == false);
auto s1 = manager.Create(1);
dbg(*s1);
REQUIRE(manager.Contains(1) == true);
manager.Erase(1);
REQUIRE(manager.Contains(1) == false);
s1 = manager.Create(1);
REQUIRE(manager.Contains(1) == true);
auto outcome = manager.Materialize({s1}, {128}, {100}, 1);
dbg(s1->blocks);
REQUIRE(s1->blocks.size() == 2);
auto s2 = manager.Create(2);
REQUIRE(manager.Contains(2));
outcome = manager.Materialize({s1, s2}, {128, 2559}, {2, 1}, 1);
dbg(outcome);
REQUIRE(outcome.allocation == 20);
REQUIRE(outcome.swap_in == 1);
REQUIRE(outcome.swap_out == 1);
auto s3 = manager.Create(3);
outcome = manager.Materialize({s1, s2, s3}, {127, 2559, 255}, {1, 100, 2}, 1);
dbg(outcome);
}
TEST_CASE("SequenceManager functional test")
{
Allocator<AllocatorType::CUDA> allocator(0);
SequenceManager manager(32, 32, 128, 128, 20, 4, 16, 0, &allocator);
auto seq = manager.Create(1);
for (int i = 0; i < 1024; ++i) {
auto outcome = manager.Materialize({seq}, {i}, {0}, 1);
if (outcome.allocation) {
dbg(i, outcome);
}
}
}
......@@ -65,7 +65,7 @@ void LlamaTritonModel<T>::handleMissingParams()
}
if (!max_batch_size_) {
max_batch_size_ = 32;
max_batch_size_ = 64;
TM_LOG_WARNING("[LlamaTritonModel] `max_batch_size` is not set, default to %d.", (int)max_batch_size_);
}
......@@ -74,6 +74,12 @@ void LlamaTritonModel<T>::handleMissingParams()
TM_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)session_len_);
}
if (!attn_params_.max_position_embeddings) {
attn_params_.max_position_embeddings = session_len_;
TM_LOG_WARNING("[LlamaTritonModel] `max_position_embeddings` is not set, default to `session_len` (%d).",
(int)attn_params_.max_position_embeddings);
}
if (!max_context_token_num_) {
max_context_token_num_ = (int)std::sqrt(max_batch_size_);
TM_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` is not set, default to %d.",
......@@ -85,14 +91,18 @@ void LlamaTritonModel<T>::handleMissingParams()
TM_LOG_WARNING("[LlamaTritonModel] `step_length` is not set, default to %d.", (int)step_length_);
}
if (!cache_max_entry_count_) {
cache_max_entry_count_ = 32;
TM_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %d.",
(int)cache_max_entry_count_);
if (!cache_max_block_count_) {
cache_max_block_count_ = .95f;
TM_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %f.", cache_max_block_count_);
}
if (!cache_block_seq_len_) {
cache_block_seq_len_ = 128;
TM_LOG_WARNING("[LlamaTritonModel] `cache_block_seq_len` is not set, default to %d.", cache_block_seq_len_);
}
if (!cache_chunk_size_) {
cache_chunk_size_ = cache_max_entry_count_;
cache_chunk_size_ = cache_max_block_count_;
TM_LOG_WARNING("[LlamaTritonModel] `cache_chunk_size` is not set, default to %d.", (int)cache_chunk_size_);
}
}
......@@ -129,17 +139,21 @@ LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
max_context_token_num_ = reader.GetInteger("llama", "max_context_token_num", 0);
session_len_ = reader.GetInteger("llama", "session_len", 0);
step_length_ = reader.GetInteger("llama", "step_length", 0);
cache_max_entry_count_ = reader.GetInteger("llama", "cache_max_entry_count", 0);
use_context_fmha_ = reader.GetInteger("llama", "use_context_fmha", 1);
cache_max_block_count_ = reader.GetFloat("llama", "cache_max_entry_count", 0);
cache_block_seq_len_ = reader.GetInteger("llama", "cache_block_seq_len", 0);
cache_chunk_size_ = reader.GetInteger("llama", "cache_chunk_size", 0);
attn_bias_ = reader.GetInteger("llama", "attn_bias", 0);
quant_policy_ = reader.GetInteger("llama", "quant_policy", 0);
group_size_ = reader.GetInteger("llama", "group_size", 0);
use_context_fmha_ = reader.GetInteger("llama", "use_context_fmha", 1);
attn_bias_ = reader.GetInteger("llama", "attn_bias", 0);
quant_policy_ = reader.GetInteger("llama", "quant_policy", 0);
group_size_ = reader.GetInteger("llama", "group_size", 0);
attn_params_.rotray_embedding_dim = reader.GetInteger("llama", "rotary_embedding");
// rotary embedding parameters
attn_params_.rotary_embedding_dim = reader.GetInteger("llama", "rotary_embedding");
attn_params_.rotary_embedding_base = reader.GetFloat("llama", "rope_theta", 10000.0f);
attn_params_.rope_scaling_factor = reader.GetFloat("llama", "rope_scaling_factor", 0.f);
attn_params_.max_position_embeddings = reader.GetInteger("llama", "max_position_embeddings", 0);
attn_params_.use_dynamic_ntk = reader.GetInteger("llama", "use_dynamic_ntk", 0);
// attn_params_.use_dynamic_ntk = reader.GetInteger("llama", "use_dynamic_ntk", 0);
attn_params_.use_logn_attn = reader.GetInteger("llama", "use_logn_attn", 0);
handleMissingParams();
......@@ -219,7 +233,7 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
ft::NcclParam pipeline_para = nccl_params.second[comms_rank];
ft::FT_CHECK(tensor_para.world_size_ == tensor_para_size_);
ft::FT_CHECK(pipeline_para.world_size_ = pipeline_para_size_);
ft::FT_CHECK(pipeline_para.world_size_ == pipeline_para_size_);
auto llama = std::make_unique<ft::LlamaV2<T>>(head_num_,
kv_head_num_,
......@@ -235,7 +249,8 @@ std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSh
step_length_,
start_id_,
end_id_,
cache_max_entry_count_,
cache_max_block_count_,
cache_block_seq_len_,
cache_chunk_size_,
quant_policy_,
use_context_fmha_,
......@@ -320,12 +335,13 @@ std::string LlamaTritonModel<T>::toString()
<< "\ninter_size: " << inter_size_ << "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_
<< "\nattn_bias: " << attn_bias_ << "\nmax_batch_size: " << max_batch_size_
<< "\nmax_context_token_num: " << max_context_token_num_ << "\nsession_len: " << session_len_
<< "\nstep_length: " << step_length_ << "\ncache_max_entry_count: " << cache_max_entry_count_
<< "\ncache_chunk_size: " << cache_chunk_size_ << "\nuse_context_fmha: " << use_context_fmha_
<< "\nstart_id: " << start_id_ << "\ntensor_para_size: " << tensor_para_size_
<< "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_
<< "\nmodel_name: " << model_name_ << "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << quant_policy_
<< "\ngroup_size: " << group_size_ << std::endl;
<< "\nstep_length: " << step_length_ << "\ncache_max_entry_count: " << cache_max_block_count_
<< "\ncache_block_seq_len: " << cache_block_seq_len_ << "\ncache_chunk_size: " << cache_chunk_size_
<< "\nuse_context_fmha: " << use_context_fmha_ << "\nstart_id: " << start_id_
<< "\ntensor_para_size: " << tensor_para_size_ << "\npipeline_para_size: " << pipeline_para_size_
<< "\nenable_custom_all_reduce: " << enable_custom_all_reduce_ << "\nmodel_name: " << model_name_
<< "\nmodel_dir: " << model_dir_ << "\nquant_policy: " << quant_policy_ << "\ngroup_size: " << group_size_
<< std::endl;
return ss.str();
}
......
......@@ -93,7 +93,8 @@ private:
int step_length_;
int start_id_;
int end_id_;
int cache_max_entry_count_;
float cache_max_block_count_;
int cache_block_seq_len_;
int cache_chunk_size_;
int use_context_fmha_;
size_t tensor_para_size_;
......
......@@ -272,6 +272,7 @@ struct AbstractTransformerModelInstance;
struct AbstractTransformerModelInstance {
virtual ~AbstractTransformerModelInstance() = default;
virtual std::shared_ptr<std::vector<triton::Tensor>>
forward(std::shared_ptr<std::vector<triton::Tensor>> input_tensors) = 0;
......
......@@ -131,7 +131,7 @@ void check(T result, char const* const func, const char* const file, int const l
inline void syncAndCheck(const char* const file, int const line)
{
// When FT_DEBUG_LEVEL=DEBUG, must check error
static char* level_name = std::getenv("FT_DEBUG_LEVEL");
static char* level_name = std::getenv("TM_DEBUG_LEVEL");
if (level_name != nullptr) {
static std::string level = std::string(level_name);
if (level == "DEBUG") {
......
#pragma once
#if __has_include("3rdparty/dbg.h")
#include "3rdparty/dbg.h"
#else
#define dbg(...)
#endif