Unverified commit ab1767cf authored by Li Zhang, committed by GitHub

TurboMind 2 (#590)

* refresh decoder attention kernel

* block-level kv cache

* `BlockManager` & `SequenceManager`

* update

* update

* update

* update

* rename

* GQA support

* fix context length

* GQA dispatch

* kv8

* tune

* async stream cb

* nvtx

* config parsing

* debug

* optimize output cost

* split-k decoding

* minor

* truncate `session_len` by available blocks

* minor

* license

* fix

* dispatch `cp.async`

* fix linking

* fix

* fix deadlock

* guard input length

* correct start offset

* fix prefill chunking

* fix `cache_block_seq_len` param passing

* fix `block_size` fmtstr

* fix output tokens

* fix batch resizing

* fix masking of finished sequences

* add debug util

* free unused block early

* add ntk scaling and logn scaling

* cmake flags

* fix typo

* w4a16 for sm75

* fix msvc build

* fix msvc build

* fix block verification

* fix msvc build

* use `std::shuffle`

* fix lint

* fix lint

* fix lint

* clear incoming buffer

* clear finished requests

* fix batch initialization

* fix typo

* fix typo

* fix comparison
parent 06125966
......@@ -9,6 +9,23 @@
namespace turbomind {
__inline__ __device__ void
mma_m16n8k8_row_col(Array<float, 4>& d, const Array<half, 4>& a, const Array<half, 2>& b, Array<float, 4>& c)
{
#if TURBOMIND_ARCH_SM75
uint32_t const* A = reinterpret_cast<uint32_t const*>(&a);
uint32_t const* B = reinterpret_cast<uint32_t const*>(&b);
float const* C = reinterpret_cast<float const*>(&c);
float* D = reinterpret_cast<float*>(&d);
asm("mma.sync.aligned.m16n8k8.row.col.f32.f16.f16.f32 {%0,%1,%2,%3}, "
"{%4,%5}, {%6}, {%7,%8,%9,%10};\n"
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
: "r"(A[0]), "r"(A[1]), "r"(B[0]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
#else
assert(TURBOMIND_ARCH_SM75);
#endif
}
__inline__ __device__ void
mma_m16n8k16_row_col(Array<float, 4>& d, const Array<half, 8>& a, const Array<half, 4>& b, Array<float, 4>& c)
{
......@@ -22,7 +39,10 @@ mma_m16n8k16_row_col(Array<float, 4>& d, const Array<half, 8>& a, const Array<ha
: "=f"(D[0]), "=f"(D[1]), "=f"(D[2]), "=f"(D[3])
: "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]), "r"(B[0]), "r"(B[1]), "f"(C[0]), "f"(C[1]), "f"(C[2]), "f"(C[3]));
#else
assert(TURBOMIND_ARCH_SM80);
const Array<half, 4>* _a = (const Array<half, 4>*)&a;
const Array<half, 2>* _b = (const Array<half, 2>*)&b;
mma_m16n8k8_row_col(d, _a[0], _b[0], c);
mma_m16n8k8_row_col(d, _a[1], _b[1], d);
#endif
}
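The SM75 branch above decomposes the m16n8k16 MMA along K into two m16n8k8 MMAs, feeding the result of the first back in as the accumulator of the second. A host-side scalar sketch of the same identity (plain C++, not part of this commit; the matrix sizes are just the MMA tile shape):

// Scalar reference: a 16x8 (M x N) output with K = 16 equals two accumulated
// K = 8 products, which is what the two chained mma_m16n8k8_row_col calls compute.
#include <cmath>
#include <cstdio>

int main()
{
    const int M = 16, N = 8, K = 16;
    float A[M][K], B[K][N], C[M][N] = {}, D[M][N] = {};
    for (int i = 0; i < M; ++i)
        for (int k = 0; k < K; ++k)
            A[i][k] = 0.01f * (i + k);
    for (int k = 0; k < K; ++k)
        for (int j = 0; j < N; ++j)
            B[k][j] = 0.02f * (k - j);
    // one K = 16 product
    for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j)
            for (int k = 0; k < K; ++k)
                C[i][j] += A[i][k] * B[k][j];
    // two K = 8 products; the second accumulates into the first
    for (int half = 0; half < 2; ++half)
        for (int i = 0; i < M; ++i)
            for (int j = 0; j < N; ++j)
                for (int k = half * 8; k < half * 8 + 8; ++k)
                    D[i][j] += A[i][k] * B[k][j];
    float max_diff = 0.f;
    for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j)
            max_diff = std::fmax(max_diff, std::fabs(C[i][j] - D[i][j]));
    std::printf("max diff = %g\n", max_diff);  // 0 up to float rounding
}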
......
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
#include "src/turbomind/kernels/decoder_masked_multihead_attention_utils.h"
#include "src/turbomind/kernels/decoder_multihead_attention/array_ops.h"
#include "src/turbomind/kernels/reduce_kernel_utils.cuh"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/utils/cuda_type_utils.cuh"
......@@ -854,19 +854,20 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
T* v_buf,
T* QKV,
const T* __restrict qkv_bias,
const int* padding_offset,
const int* history_length,
const int* input_length,
int batch_size,
int seq_len,
int head_num,
int kv_head_num,
int size_per_head,
int rotary_embedding_dim,
float rotary_embedding_base,
int max_position_embeddings,
bool use_dynamic_ntk,
bool use_logn_attn)
const int* padding_offset,
const int* context_length,
const int* input_length,
const float* rope_theta,
int batch_size,
int seq_len,
int head_num,
int kv_head_num,
int size_per_head,
int rotary_embedding_dim,
float rotary_embedding_base,
int max_position_embeddings,
bool use_dynamic_ntk,
bool use_logn_attn)
{
// This kernel adds bias to QKV, which has shape [batch_size, seq_len, 3, head_num, size_per_head], then
// splits QKV into three buffers q, k, v and transposes them to [batch_size, head_num, seq_len, size_per_head].
......@@ -907,12 +908,18 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
Vec_t q, k, v;
Vec_t q_bias, k_bias, v_bias;
using Vec = Array<T, vec_size>;
static_assert(sizeof(Vec_t) == sizeof(Vec));
using namespace ops;
// load Q and apply bias
if (!is_masked) {
q = *reinterpret_cast<const Vec_t*>(&QKV[src_q_idx]);
if (qkv_bias) {
q_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx]);
q = mmha::add(q, q_bias);
q_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx]);
(Vec&)q = (Vec&)q + (Vec&)q_bias;
}
}
......@@ -921,35 +928,32 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
k = *reinterpret_cast<const Vec_t*>(&QKV[src_k_idx]);
v = *reinterpret_cast<const Vec_t*>(&QKV[src_v_idx]);
if (qkv_bias) {
k_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx + k_offset]);
v_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx + v_offset]);
k = mmha::add(k, k_bias);
v = mmha::add(v, v_bias);
k_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx + k_offset]);
v_bias = *reinterpret_cast<const Vec_t*>(&qkv_bias[hidden_idx + v_offset]);
(Vec&)k = (Vec&)k + (Vec&)k_bias;
(Vec&)v = (Vec&)v + (Vec&)v_bias;
}
}
const int history_len = history_length[batch_idx];
const int context_len = history_len + input_length[batch_idx];
const int context_len = context_length[batch_idx];
const int history_len = context_len - input_length[batch_idx];
const int timestep = history_len + seq_idx;
if (use_dynamic_ntk) {
rotary_embedding_base = mmha::rotary_embedding_get_base(
context_len, max_position_embeddings, rotary_embedding_dim, rotary_embedding_base);
if (rope_theta) {
rotary_embedding_base = rope_theta[batch_idx];
}
// TODO: unused computation on k if GQA is used
mmha::apply_rotary_embedding(q, k, tidx, rotary_embedding_dim, rotary_embedding_base, timestep);
RotaryEmbedding<vec_size> rotary_emb(rotary_embedding_base, rotary_embedding_dim, timestep, {tidx * vec_size, 0});
rotary_emb.apply((Array<T, vec_size>&)q);
if (head_idx < kv_head_num) {
rotary_emb.apply((Array<T, vec_size>&)k);
}
if (use_logn_attn) {
// +1 to convert to context length at the timestep
float logn_scaling = mmha::logn_attn_get_scaling(timestep + 1, max_position_embeddings);
if constexpr (std::is_same_v<T, float>) {
q = mmha::mul<Vec_t, float, Vec_t>(logn_scaling, q);
}
else if constexpr (std::is_same_v<T, half>) {
half tmp = __float2half(logn_scaling);
q = mmha::mul<Vec_t, uint16_t, Vec_t>((uint16_t&)tmp, q);
}
LogNScaling logn_scaling(timestep + 1, max_position_embeddings);
logn_scaling.apply((Array<T, vec_size>&)q);
}
if (!is_masked && !q_buf) { // also skip modifying QKV if q/k/v_buf are present
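`LogNScaling` replaces the former `mmha::logn_attn_get_scaling` + per-type `mmha::mul` path, but its definition is not visible in this view. The scalar sketch below shows the usual log-n attention scaling it presumably implements (scale q by log_m(n) once the absolute position n exceeds the trained context m); the struct is an illustrative assumption, not the committed class:

// Hypothetical scalar sketch of log-n attention scaling; the real LogNScaling
// lives elsewhere in the commit and may differ.
#include <cmath>

struct LogNScalingSketch {
    float scale_;
    LogNScalingSketch(int position, int max_position_embeddings)
    {
        // scale = log_m(n) = ln(n) / ln(m), applied only beyond the trained length
        scale_ = position > max_position_embeddings ?
                     std::log(float(position)) / std::log(float(max_position_embeddings)) :
                     1.f;
    }
    void apply(float* q, int n) const
    {
        for (int i = 0; i < n; ++i) {
            q[i] *= scale_;
        }
    }
};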
......@@ -982,8 +986,9 @@ __global__ void add_fusedQKV_bias_transpose_kernel(T* q_buf,
QKV, \
qkv_bias, \
padding_offset, \
history_length, \
context_length, \
input_length, \
rope_theta, \
batch_size, \
seq_len, \
head_num, \
......@@ -1002,8 +1007,9 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
T* QKV,
const T* qkv_bias,
const int* padding_offset,
const int* history_length,
const int* context_length,
const int* input_length,
const float* rope_theta,
const int batch_size,
const int seq_len,
const int token_num,
......@@ -1034,6 +1040,7 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
const int* padding_offset, \
const int* history_length, \
const int* input_length, \
const float* rope_theta, \
const int batch_size, \
const int seq_len, \
const int token_num, \
......
......@@ -70,8 +70,9 @@ void invokeAddFusedQKVBiasTranspose(T* q_buf,
T* QKV,
const T* qkv_bias,
const int* padding_offset,
const int* history_length,
const int* context_length,
const int* input_length,
const float* rope_theta,
const int batch_size,
const int seq_len,
const int token_num,
......
......@@ -2,6 +2,7 @@
#pragma once
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
#ifndef _MSC_VER
#include <pthread.h>
......
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/turbomind/models/llama/BlockManager.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/debug_utils.h"
#include "src/turbomind/utils/logger.h"
#include <algorithm>
#include <iterator>
#include <stdexcept>
namespace turbomind {
BlockManager::BlockManager(size_t block_size, double block_count, int chunk_size, IAllocator* allocator):
block_size_(block_size), allocator_(allocator)
{
if (block_count < 1.) {
max_block_count_ = GetBlockCount(block_size, block_count);
}
else {
max_block_count_ = block_count;
}
if (chunk_size == 0) {
chunk_size_ = static_cast<int>(std::sqrt(max_block_count_));
}
else if (chunk_size < 0) {
chunk_size_ = max_block_count_;
}
else {
chunk_size_ = chunk_size;
}
TM_LOG_INFO("[BlockManager] block_size = %lu MB", (unsigned long)block_size_ >> 20);
TM_LOG_INFO("[BlockManager] max_block_count = %d", max_block_count_);
TM_LOG_INFO("[BlockManager] chunk_size = %d", chunk_size_);
blocks_.reserve(max_block_count_);
active_ids_.reserve(max_block_count_);
cached_ids_.reserve(max_block_count_);
free_ids_.reserve(max_block_count_);
// pre-allocate first chunk
Malloc();
dbg(free_ids_);
}
BlockManager::~BlockManager()
{
for (auto& chunk : chunks_) {
allocator_->free(&chunk);
}
}
bool BlockManager::Malloc()
{
auto chunk_size = std::min<int>(chunk_size_, max_block_count_ - blocks_.size());
if (!chunk_size) {
return false;
}
auto ptr = (std::byte*)allocator_->malloc(block_size_ * chunk_size);
if (!ptr) {
return false;
}
chunks_.push_back(ptr);
for (int i = 0; i < chunk_size; ++i, ptr += block_size_) {
auto& block = blocks_.emplace_back();
block.use_count = 0;
block.ref_count = 0;
block.id = (int)blocks_.size() - 1;
block.timestamp = 0;
block.data = ptr;
free_ids_.push_back(block.id);
}
return true;
}
size_t BlockManager::GetBlockCount(size_t block_size, double ratio)
{
size_t free{};
size_t total{};
check_cuda_error(cudaMemGetInfo(&free, &total));
return static_cast<size_t>(total * ratio) / block_size;
}
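Note that `GetBlockCount` budgets against total device memory rather than the currently free amount. A quick standalone check of the same arithmetic with made-up figures (80 GB total, ratio 0.5, 128 MB blocks):

#include <cstdio>

int main()
{
    const size_t total      = 80ull << 30;   // 80 GB (made-up)
    const double ratio      = 0.5;
    const size_t block_size = 128ull << 20;  // 128 MB
    std::printf("%zu blocks\n", static_cast<size_t>(total * ratio) / block_size);  // prints 320
}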
void BlockManager::Move(std::vector<int>& src, const std::vector<int>& delta, std::vector<int>& dst)
{
std::vector<int> src1(src.size() - delta.size());
std::set_difference(src.begin(), src.end(), delta.begin(), delta.end(), src1.begin());
src.swap(src1);
std::vector<int> dst1(dst.size() + delta.size());
std::set_union(dst.begin(), dst.end(), delta.begin(), delta.end(), dst1.begin());
dst.swap(dst1);
}
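`Move` relies on all three id vectors staying sorted: `std::set_difference` drops `delta` from `src` and `std::set_union` merges it into `dst`. A standalone illustration of the same two calls with made-up ids:

#include <algorithm>
#include <cassert>
#include <vector>

int main()
{
    std::vector<int> src{0, 2, 5, 7};  // e.g. free_ids_
    std::vector<int> dst{1, 3};        // e.g. active_ids_
    std::vector<int> delta{2, 7};      // ids being moved; must be sorted and come from src

    std::vector<int> src1(src.size() - delta.size());
    std::set_difference(src.begin(), src.end(), delta.begin(), delta.end(), src1.begin());
    src.swap(src1);                    // src -> {0, 5}

    std::vector<int> dst1(dst.size() + delta.size());
    std::set_union(dst.begin(), dst.end(), delta.begin(), delta.end(), dst1.begin());
    dst.swap(dst1);                    // dst -> {1, 2, 3, 7}

    assert((src == std::vector<int>{0, 5}));
    assert((dst == std::vector<int>{1, 2, 3, 7}));
}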
std::vector<const Block*> BlockManager::Allocate(int count)
{
while (free_ids_.size() < count) {
if (!Malloc()) {
throw std::runtime_error("out of memory");
}
}
std::vector<const Block*> ret;
std::vector<int> idxs(count);
for (int i = 0; i < count; ++i) {
int idx = free_ids_[i];
idxs[i] = idx;
auto& block = blocks_[idx];
FT_CHECK(is_free(block));
block.ref_count = 1;
block.use_count = 1;
block.unique_id = unique_id_++;
ret.push_back(&block);
}
Move(free_ids_, idxs, active_ids_);
dbg(free_ids_, active_ids_);
return ret;
}
void BlockManager::Evict(int count)
{
std::vector<int> idxs(cached_ids_);
// get first `count` cached ids according to timestamp
std::nth_element(idxs.begin(), idxs.begin() + count, idxs.end(), [&](int i, int j) {
return blocks_[i].timestamp < blocks_[j].timestamp;
});
idxs.resize(count);
// sort the retrieved ids
std::sort(idxs.begin(), idxs.end());
// set as free
for (const auto& idx : idxs) {
auto& b = blocks_[idx];
FT_CHECK(is_cached(b));
b.ref_count = 0;
b.unique_id = 0;
b.timestamp = 0;
}
Move(cached_ids_, idxs, free_ids_);
dbg(cached_ids_, free_ids_);
}
int BlockManager::Free(const std::vector<const Block*>& bs)
{
std::vector<int> idxs;
for (const auto& p : bs) {
auto& b = blocks_[p->id];
FT_CHECK(is_cached(b));
if (--b.ref_count == 0) {
b.unique_id = 0;
b.timestamp = 0;
idxs.push_back(b.id);
}
}
std::sort(idxs.begin(), idxs.end());
Move(cached_ids_, idxs, free_ids_);
dbg(cached_ids_, free_ids_);
return idxs.size();
}
int BlockManager::Unlock(const std::vector<const Block*>& bs)
{
std::vector<int> idxs;
for (const auto& p : bs) {
auto& block = blocks_[p->id];
FT_CHECK(is_active(block));
if (--block.use_count == 0) {
idxs.push_back(block.id);
}
}
std::sort(idxs.begin(), idxs.end());
Move(active_ids_, idxs, cached_ids_);
dbg(active_ids_, cached_ids_);
return idxs.size();
}
int BlockManager::Lock(const std::vector<const Block*>& bs)
{
std::vector<int> idxs;
for (const auto& p : bs) {
auto& block = blocks_[p->id];
FT_CHECK(is_cached(block));
if (++block.use_count == 1) {
idxs.push_back(p->id);
}
}
std::sort(idxs.begin(), idxs.end());
Move(cached_ids_, idxs, active_ids_);
// dbg(cached_ids_, active_ids_);
return idxs.size();
}
void BlockManager::Touch(const std::vector<const Block*>& bs)
{
std::for_each(bs.crbegin(), bs.crend(), [this](const Block* p) {
FT_CHECK(is_active(*p));
const_cast<Block*>(p)->timestamp = timestamp_++;
});
}
Snapshot BlockManager::TakeSnapshot()
{
std::vector<int> use_count(blocks_.size());
for (const auto& idx : active_ids_) {
use_count[idx] = blocks_[idx].use_count;
}
return {active_count(), cached_count(), free_count(), std::move(use_count)};
}
std::ostream& operator<<(std::ostream& os, const BlockManager& manager)
{
os << "block_size: " << manager.block_size_ << ", ";
os << "max_block_count: " << manager.max_block_count_ << ", ";
os << "chunk_size: " << manager.chunk_size_ << ", ";
os << "chunks: " << manager.chunks_.size() << ", ";
os << "active_ids: " << manager.active_ids_.size() << ", ";
os << "cached_ids: " << manager.cached_ids_.size() << ", ";
os << "free_ids: " << manager.free_ids_.size() << ", ";
os << "blocks: " << manager.blocks_.size() << ", ";
os << "unique_id: " << manager.unique_id_ << ", ";
os << "timestamp: " << manager.timestamp_ << ", ";
os << "allocator: " << manager.allocator_;
return os;
}
std::ostream& operator<<(std::ostream& os, const Block& block)
{
os << "id=" << block.id << ", use_count=" << block.use_count << ", unique_id=" << block.unique_id
<< ", timestamp=" << block.timestamp << ", data=" << block.data;
return os;
}
} // namespace turbomind
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
#include <algorithm>
#include <cstdint>
#include <cuda_runtime.h>
#include <iterator>
#include <numeric>
#include <queue>
#include <unordered_map>
#include <vector>
namespace turbomind {
// [L, H, S, D]
// [L, S/x, H, x, D]
struct Block {
int id; // fixed linear id in the pool
int ref_count; // all sequences referencing the block
int use_count; // active sequences using the block
uint64_t unique_id; // unique for every block allocation
uint64_t timestamp;
void* data;
friend std::ostream& operator<<(std::ostream& os, const Block& block);
};
inline bool is_active(const Block& block)
{
return block.ref_count > 0 && block.use_count > 0;
}
inline bool is_cached(const Block& block)
{
return block.ref_count > 0 && block.use_count == 0;
}
inline bool is_free(const Block& block)
{
return block.ref_count == 0 && block.use_count == 0 && block.timestamp == 0;
}
struct Snapshot {
int active;
int cached;
int free;
std::vector<int> use_count;
};
class BlockManager {
public:
explicit BlockManager(size_t block_size, double block_count, int chunk_size, IAllocator* allocator);
~BlockManager();
// free -> active (use_count = 1, ref_count = 1)
[[nodiscard]] std::vector<const Block*> Allocate(int count);
// cached -> active (use_count += 1)
[[maybe_unused]] int Lock(const std::vector<const Block*>& bs);
// active -> cached (use_count -= 1)
[[maybe_unused]] int Unlock(const std::vector<const Block*>& bs);
// cached -> free (ref_count = 0)
void Evict(int count);
// cached -> free (ref_count -= 1)
[[maybe_unused]] int Free(const std::vector<const Block*>& bs);
// increase timestamps in reverse order
void Touch(const std::vector<const Block*>& bs);
Snapshot TakeSnapshot();
int max_block_count() const noexcept
{
return max_block_count_;
}
int active_count() const noexcept
{
return active_ids_.size();
}
int cached_count() const noexcept
{
return cached_ids_.size();
}
int free_count() const noexcept
{
return (max_block_count_ - blocks_.size()) + free_ids_.size();
}
friend std::ostream& operator<<(std::ostream& os, const BlockManager&);
private:
static size_t GetBlockCount(size_t block_size, double ratio);
// move indices between sets
static void Move(std::vector<int>& src, const std::vector<int>& delta, std::vector<int>& dst);
// allocate a chunk of blocks
bool Malloc();
private:
size_t block_size_;
int max_block_count_{};
int chunk_size_{};
IAllocator* allocator_;
std::vector<void*> chunks_;
std::vector<int> active_ids_;
std::vector<int> cached_ids_;
std::vector<int> free_ids_;
std::vector<Block> blocks_; // < 100k
// uint64_t unique_id_{1UL << 63};
uint64_t unique_id_{1};
uint64_t timestamp_{1};
};
} // namespace turbomind
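Together, `is_free`/`is_active`/`is_cached` and the methods above describe the block lifecycle: free -> `Allocate` -> active -> `Unlock` -> cached -> `Lock` -> active, with cached blocks leaving the pool through `Free` (last reference dropped) or `Evict` (oldest timestamp first). A minimal usage sketch against this header (the `IAllocator` is assumed to be supplied by the caller; this is not code from the commit):

// Hypothetical walkthrough of the block lifecycle; the IAllocator is assumed
// to come from the caller (e.g. the model's CUDA allocator).
#include "src/turbomind/models/llama/BlockManager.h"

void block_manager_walkthrough(turbomind::IAllocator* allocator)
{
    using namespace turbomind;
    // 128 MB blocks, use 80% of total device memory, default chunking
    BlockManager manager(128 << 20, 0.8, 0, allocator);

    // free -> active: a sequence acquires two blocks (use_count = ref_count = 1)
    std::vector<const Block*> blocks = manager.Allocate(2);
    manager.Touch(blocks);  // record recency for later eviction ordering

    // active -> cached: the sequence is swapped out but keeps its reference
    manager.Unlock(blocks);

    // cached -> active: the sequence is swapped back in
    manager.Lock(blocks);

    // active -> cached -> free: drop the reference for good
    manager.Unlock(blocks);
    manager.Free(blocks);

    // alternatively, cached blocks can be reclaimed oldest-first:
    // manager.Evict(2);
}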
......@@ -10,6 +10,8 @@ add_library(Llama STATIC
LlamaV2.cc
LlamaBatch.cc
LlamaCacheManager.cc
BlockManager.cc
SequenceManager.cc
LlamaContextDecoder.cc
LlamaContextAttentionLayer.cc
LlamaDecoderSelfAttentionLayer.cc
......@@ -28,6 +30,7 @@ target_link_libraries(Llama PUBLIC CUDA::cudart
DynamicDecodeLayer
activation_kernels
decoder_masked_multihead_attention
decoder_multihead_attention
bert_preprocess_kernels
decoding_kernels
unfused_attention_kernels
......@@ -48,4 +51,11 @@ endif()
add_executable(llama_gemm llama_gemm.cc)
target_link_libraries(llama_gemm PUBLIC CUDA::cudart gpt_gemm_func memory_utils cuda_utils logger)
install(TARGETS llama_gemm DESTINATION ${CMAKE_SOURCE_DIR}/lmdeploy/bin)
find_package(Catch2 3 QUIET)
if (Catch2_FOUND)
add_executable(test_cache_manager test_cache_manager.cc)
target_link_libraries(test_cache_manager PRIVATE Llama Catch2::Catch2WithMain)
endif ()
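`test_cache_manager.cc` itself is not shown in this commit view; for reference, a Catch2 v3 test linked against `Catch2::Catch2WithMain` has roughly this shape (the test body is an illustrative stand-in, not the commit's actual test):

// Hypothetical Catch2 v3 test shape; main() is provided by Catch2::Catch2WithMain.
#include <catch2/catch_test_macros.hpp>

static int block_count(size_t total_bytes, size_t block_size)
{
    return static_cast<int>(total_bytes / block_size);
}

TEST_CASE("block count is derived from memory budget", "[cache_manager]")
{
    REQUIRE(block_count(1 << 30, 128 << 20) == 8);
    REQUIRE(block_count(100 << 20, 128 << 20) == 0);
}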
......@@ -2,66 +2,139 @@
#pragma once
#include "src/turbomind/models/llama/LlamaCacheManager.h"
// #include "src/turbomind/models/llama/LlamaCacheManager.h"
#include "src/turbomind/models/llama/Barrier.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cublasMMWrapper.h"
#include <condition_variable>
#include <mutex>
namespace turbomind {
struct BatchState {
int* h_context_length;
bool* h_finished;
void* top_k_curand_state;
void* top_p_curand_state;
int* output_ids; // output ids in [B, S]
float* h_rope_theta;
std::vector<int> seq_len_limit;
std::vector<int> is_swap_in;
std::vector<const Sequence*> sequences;
std::vector<std::shared_ptr<Request>> requests;
// |<-- existing -->|<-- swap-in -->|
// |<----------- active ----------->|<-- inactive -->|
int active_size;
int size;
};
template<typename T>
class LlamaV2;
template<typename T>
class LlamaBatch {
public:
int size() const noexcept
{
return batch_size_;
};
void AllocateBuffer(size_t batch_size, size_t session_len);
void AllocatePersistantBuffer(size_t max_batch_size);
void FreeBuffer();
int maxSize() const noexcept
{
return max_batch_size_;
}
using Requests = std::vector<std::shared_ptr<Request>>;
using Signal = std::function<void()>;
int finishedCount() const noexcept
{
return finished_count_;
}
void RejectInvalidRequests(Requests& stop_reqs, Requests& infer_reqs);
[[nodiscard]] auto ProcessStopRequests(const Requests& requests) -> std::vector<Signal>;
void ProcessInferRequests(const Requests& requests);
void verifyRequests(std::vector<std::shared_ptr<Request>>& stop_reqs,
std::vector<std::shared_ptr<Request>>& infer_reqs);
void handleStopRequests(const std::vector<std::shared_ptr<Request>>& requests);
[[nodiscard]] bool Initialize();
void allocateBuffer(size_t batch_size, size_t session_len);
void allocatePersistantBuffer(size_t max_batch_size);
void freeBuffer();
void ContextDecode();
void initializeSampling(int infer_request_count);
struct GenerationState {
int max_init_ctx_len;
int step;
int sum_seq_len;
int max_seq_len;
};
void initialize(const std::vector<std::shared_ptr<Request>>& infer_requests);
void contextDecode();
void InitializeSampling();
GenerationState InitializeGeneration();
void initializeGeneration();
bool generate();
[[nodiscard]] bool Generate(GenerationState& g);
void finish();
void finishRequest(int index, bool force_end);
[[nodiscard]] auto Finish(GenerationState& g) -> std::vector<Signal>;
void synchronize();
void CompleteRequest(int index, bool is_stop_request, bool is_force_end);
void setOutputTensors(int max_gen_step);
void SetOutputTensors(const GenerationState& g);
void
outputContextLogits(T* context_decoder_output, const std::vector<int>& indices, const std::vector<int>& lengths);
OutputContextLogits(T* context_decoder_output, const std::vector<int>& indices, const std::vector<int>& lengths);
explicit LlamaBatch(int max_batch_size, int max_context_token_num, int session_len, LlamaV2<T>* llama);
explicit LlamaBatch(int max_batch_size,
int max_context_token_num,
int session_len,
std::unique_ptr<SequenceManager> sequence_manager,
LlamaV2<T>* llama);
~LlamaBatch()
{
freeBuffer();
TM_LOG_ERROR("~LlamaBatch()");
model_->shared_state_->request_queue.close();
internal_thread_.join();
if (output_thread_.joinable()) {
{
std::lock_guard lock{output_mutex_};
output_stop_token_ = true;
}
output_cv_.notify_one();
output_thread_.join();
}
FreeBuffer();
}
void Start();
private:
void InternalThreadEntry(int device_id);
void OutputThreadEntry();
void UpdateSequenceStates(BatchState& state, int index);
void CopyState(const std::pair<BatchState*, int> _src, const std::pair<BatchState*, int>& _dst);
void SaveRandomState(BatchState& state, int idx);
void LoadRandomState(BatchState& state, int idx);
void BarrierSignalRequests(Barrier& barrier, const std::vector<Signal>& signals);
// analogous to `std::copy_n`
template<typename U>
U* Copy(const U* src, size_t count, U* dst)
{
check_cuda_error(cudaMemcpyAsync(dst, src, sizeof(U) * count, cudaMemcpyDefault, stream_));
return dst += count;
}
template<typename U>
U* Clear(U* data, size_t count)
{
check_cuda_error(cudaMemsetAsync(data, 0, sizeof(U) * count, stream_));
return data += count;
}
private:
......@@ -70,52 +143,67 @@ private:
const int session_len_;
const int rank_;
const bool debug_;
const int step_length_;
LlamaV2<T>* const llama_;
// active requests
std::vector<std::shared_ptr<Request>> requests_;
T* context_decoder_input_buf_{}; // CTXDEC
T* context_decoder_output_buf_{}; // CTXDEC
int* context_decoder_ids_buf_{};
T* decoder_input_buf_{}; // CTXDEC, GENERATE
T* decoder_output_buf_{}; // CTXDEC, GENERATE
LlamaV2<T>* const model_;
int* input_ids_buf_{}; // input token ids + cache missed token ids, CTXDEC
int* input_length_buf_{}; // input + cache missed length, CTXDEC, GENERATE
int* history_length_buf_{}; // history length, CTXDEC
int* context_length_buf_{}; // history length + input_length, CTXDEC, GENERATE
std::unique_ptr<SequenceManager> sequence_manager_;
int* total_padding_count_{}; // GENERATE
int* sequence_lengths_{}; // current sequence length
///////////////////////////////////////////////////////////////////
// k/v cache block buffers
int* cu_block_counts_{};
uintptr_t* k_block_ptrs_{};
uintptr_t* v_block_ptrs_{};
uint64_t* k_cache_ptr_buf_{};
uint64_t* v_cache_ptr_buf_{};
////////////////////////////////////////////////////////////////////
// context decoding temp buffers
T* context_decoder_input_buf_{};
T* context_decoder_output_buf_{};
int* context_decoder_ids_buf_{};
int* input_ids_buf_{};
// lengths
int* input_length_buf_{}; // input + cache missed length
int* context_length_buf_{}; // history length + input_length
// temp buffers used for block->linear kv-cache conversion
T* tmp_k_cache_buf_{};
T* tmp_v_cache_buf_{};
void** tmp_k_ptrs_{};
void** tmp_v_ptrs_{};
void** h_tmp_k_ptrs_{};
void** h_tmp_v_ptrs_{};
T* decoder_input_buf_{};
T* decoder_output_buf_{};
int* sequence_lengths_{}; // current sequence length
int* init_ctx_lens_{};
float* logits_buf_{}; // combined logits
float* local_logits_buf_{}; // tensor parallel local logits
float* context_logits_buf_{};
float* local_context_logits_buf_{};
float* rope_theta_{};
// used by dynamic decoder
int* token_ids_buf_{}; // all token IDs in [S, B], indexed using `step`
int* output_ids_buf_{}; // output ids in [B, S]
int* token_ids_buf_{}; // all token IDs in [S, B], indexed using `step`
int* end_ids_buf_{};
bool* finished_buf_{};
uint32_t* seq_limit_len_{};
int** request_output_ids_ptrs_{};
int* request_output_ids_lens_{};
int** request_seqlen_ptrs_{};
int** h_request_output_ids_ptrs_{};
int* h_request_output_ids_lens_{};
int** h_request_seqlen_ptrs_{};
// pinned buffers
int* h_input_ids_buf_{};
int* h_input_length_buf_{};
int* h_history_length_buf_{};
int* h_context_length_buf_{};
int* h_sequence_lengths_{};
bool* h_finished_buf_{};
uintptr_t* h_k_cache_ptr_buf_{};
uintptr_t* h_v_cache_ptr_buf_{};
uint32_t* h_seq_limit_len_{};
int* h_cu_block_counts_{};
uintptr_t* h_k_block_ptrs_{};
uintptr_t* h_v_block_ptrs_{};
int* stop_words_buf_{}; // [batch_size, 2, kMaxStopWordsLen]
int* bad_words_buf_{};
......@@ -125,24 +213,19 @@ private:
float* h_repetition_penalty_{};
uint64_t* h_random_seed_{};
void* topk_curandstate_buf_{};
void* topp_curandstate_buf_{};
std::array<BatchState, 3> states_{};
// hard limits for persistent buffers
static constexpr int kMaxStopBadWordsLen = 32;
BatchState* state_{};
BatchState* back_{};
BatchState* incoming_{};
using CachedSeq = LlamaCacheManager::Sequence;
uint64_t request_count_{0};
std::vector<CachedSeq> cached_seq_;
std::vector<int> request_seq_len_limit_;
// hard limits for persistent buffers
static constexpr int kMaxStopBadWordsLen = 32;
const DataType data_type_{};
int batch_size_{};
int max_context_len_{};
int step_{};
int finished_count_{};
bool is_allocate_persistant_buffer_ = false;
bool is_allocate_buffer_ = false;
......@@ -154,6 +237,15 @@ private:
cudaStream_t stream_{};
cublasMMWrapper* cublas_wrapper_{};
IAllocator* allocator_{};
std::thread internal_thread_;
// async stream callback utils
std::thread output_thread_;
std::mutex output_mutex_;
std::condition_variable output_cv_;
Requests output_reqs_;
bool output_stop_token_{false};
};
} // namespace turbomind
......@@ -21,6 +21,7 @@
#include "src/turbomind/models/llama/LlamaContextAttentionLayer.h"
#include "src/turbomind/kernels/bert_preprocess_kernels.h"
#include "src/turbomind/kernels/decoder_multihead_attention/kv_cache.h"
#include "src/turbomind/kernels/unfused_attention_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
......@@ -28,6 +29,7 @@
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/debug_utils.h"
#include "src/turbomind/utils/logger.h"
namespace turbomind {
......@@ -116,6 +118,7 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
* \param history_lengths [batch_size], int
* \param context_lengths [batch_size], int
* \param cu_seqlens [batch_size+1], int
* \param cu_block_counts [batch_size+1], int
* \param max_seq_len [1], int on cpu
* \param is_final_layer [1], bool on cpu
* \param layer_id [1], int on cpu
......@@ -141,13 +144,23 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
T* attention_input = input_tensors->at("input_query").getPtr<T>();
T* attention_mask = input_tensors->at("attention_mask").getPtr<T>();
const auto input_length = input_tensors->at("input_lengths").getPtr<const int>();
const auto history_length = input_tensors->at("history_lengths").getPtr<const int>();
const auto context_length = input_tensors->at("context_lengths").getPtr<const int>();
int* cu_seqlens = input_tensors->at("cu_seqlens").getPtr<int>();
const auto input_length = input_tensors->at("input_lengths").getPtr<const int>();
const auto context_length = input_tensors->at("context_lengths").getPtr<const int>();
int* cu_seqlens = input_tensors->at("cu_seqlens").getPtr<int>();
int* cu_block_counts = input_tensors->at("cu_block_counts").getPtr<int>();
const float* rope_theta = input_tensors->getPtr<const float>("rope_theta", nullptr);
const auto padding_offset = input_tensors->at("padding_offset").getPtr<int>();
auto Show = [&](const T* x, size_t n) {
std::vector<T> vec(n);
cudaMemcpyAsync(vec.data(), x, sizeof(T) * n, cudaMemcpyDefault, stream_);
cudaStreamSynchronize(stream_);
std::vector<float> float_vec(vec.begin(), vec.end());
dbg(float_vec);
};
/////////////////////////////////////////////
/// allocate buffers
allocateBuffer(batch_size, num_token, max_q_len, max_k_len);
......@@ -166,26 +179,32 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
qkv_buf_,
weights->qkv.bias,
padding_offset, // padding_offset,
history_length, // used for applying rotary embedding
context_length, // used for applying rotary embedding
input_length,
rope_theta,
batch_size,
max_q_len, // seq_len
num_token, // batch_size * seq_len
local_head_num_,
local_kv_head_num_,
size_per_head_,
params_.rotray_embedding_dim,
params_.rotary_embedding_dim,
params_.rotary_embedding_base,
params_.max_position_embeddings,
params_.use_dynamic_ntk,
false, // params_.use_dynamic_ntk,
params_.use_logn_attn,
stream_);
sync_check_cuda_error();
const size_t layer_offset = layer_id * local_kv_head_num_ * max_seq_len * size_per_head_;
// [2, L, H, s, D]
const size_t layer_offset = layer_id * local_kv_head_num_ * kv_cache_block_len_ * size_per_head_;
auto k_cache_ptrs = output_tensors->getPtr<void*>("key_cache");
auto v_cache_ptrs = output_tensors->getPtr<void*>("value_cache");
auto tmp_k_ptrs = output_tensors->getPtr<T*>("tmp_k");
auto tmp_v_ptrs = output_tensors->getPtr<T*>("tmp_v");
auto k_cache_ptrs = output_tensors->getPtr<T*>("key_cache");
auto v_cache_ptrs = output_tensors->getPtr<T*>("value_cache");
//////////////////////////////////////////////////////////
/// insert the k/v computed from inputs into k/v cache
/// transpose kv -> kv cache
......@@ -194,25 +213,53 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
// v_buf_2 [B, kvH, s, D] -> val_cache [B, kvH, S[t:t+s], D/x, x]
invokeExtendKVCache(k_cache_ptrs,
v_cache_ptrs,
layer_offset,
k_buf_2_,
v_buf_2_,
batch_size,
cu_block_counts,
input_length,
context_length,
batch_size,
kv_cache_block_len_,
layer_offset,
max_q_len,
history_length,
max_seq_len,
size_per_head_,
local_kv_head_num_,
stream_,
quant_policy_,
weights->past_kv_scale.data());
weights->past_kv_scale.data(),
stream_);
sync_check_cuda_error();
const int kv_cache_elem_bits = quant_policy_ & QuantPolicy::kCacheKVInt8 ? 8 : sizeof(T) * 8;
ConvertKvCacheBlocksToLinear2((const void**)k_cache_ptrs,
(const void**)v_cache_ptrs,
(T**)tmp_k_ptrs,
(T**)tmp_v_ptrs,
cu_block_counts,
context_length,
layer_offset,
kv_cache_block_len_,
max_seq_len,
local_kv_head_num_,
size_per_head_,
batch_size,
quant_policy_,
weights->past_kv_scale.data(),
stream_);
sync_check_cuda_error();
// dbg(kv_cache_block_len_, max_seq_len, local_kv_head_num_, size_per_head_, batch_size);
// void *kk, *vv;
// cudaMemcpyAsync(&kk, tmp_k_ptrs, sizeof(void*), cudaMemcpyDefault, stream_);
// cudaMemcpyAsync(&vv, tmp_v_ptrs, sizeof(void*), cudaMemcpyDefault, stream_);
// cudaStreamSynchronize(stream_);
// Show((const T*)kk, local_kv_head_num_ * max_seq_len * size_per_head_);
// Show((const T*)vv, local_kv_head_num_ * max_seq_len * size_per_head_);
if (use_fmha_) {
fusedMultiHeadAttention(k_cache_ptrs,
v_cache_ptrs,
layer_offset,
fusedMultiHeadAttention(tmp_k_ptrs,
tmp_v_ptrs,
0,
attention_mask,
cu_seqlens,
input_tensors->at("context_lengths").getPtr<int>(),
......@@ -222,9 +269,9 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
max_seq_len);
}
else {
unfusedMultiHeadAttention(k_cache_ptrs,
v_cache_ptrs,
layer_offset,
unfusedMultiHeadAttention(tmp_k_ptrs,
tmp_v_ptrs,
0,
attention_mask,
padding_offset,
context_length,
......@@ -237,6 +284,14 @@ inline void LlamaContextAttentionLayer<T>::forward(TensorMap*
weights->past_kv_scale.data());
}
// Compare(qkv_buf_3_, num_token * hidden_units_, Concat("qkv_buf_3", layer_id), kCmpRead, stream_);
// dbg(max_seq_len);
if (0) {
Show(qkv_buf_3_, num_token * hidden_units_);
}
//////////////////////////////////////////////
/// output gemm <Bs,HD> -> <Bs,HD>
linear_.forward(attention_out, qkv_buf_3_, num_token, weights->output);
......@@ -342,7 +397,7 @@ void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_c
local_head_num_,
head_n_rep_,
stream_,
quant,
0, // dequant handled in block->linear conversion
kv_scale);
sync_check_cuda_error();
......
......@@ -45,6 +45,7 @@ public:
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
head_num_(head_num),
size_per_head_(size_per_head),
......@@ -58,6 +59,7 @@ public:
cublas_wrapper_(cublas_wrapper),
linear_(cublas_wrapper, stream),
allocator_(allocator),
kv_cache_block_len_(cache_block_seq_len),
is_free_buffer_after_forward_(is_free_buffer_after_forward),
use_fmha_(use_fmha),
quant_policy_(quant_policy)
......@@ -99,6 +101,7 @@ private:
const size_t local_kv_head_num_;
const size_t local_head_num_;
const size_t head_n_rep_;
const size_t kv_cache_block_len_;
const bool is_free_buffer_after_forward_;
const LlamaAttentionParams params_;
......
......@@ -25,7 +25,9 @@
#include "src/turbomind/models/llama/LlamaContextDecoder.h"
#include "src/turbomind/models/llama/llama_decoder_kernels.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/debug_utils.h"
namespace turbomind {
......@@ -64,6 +66,7 @@ template<typename T>
void LlamaContextDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int cache_block_seq_len,
int quant_policy)
{
h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true);
......@@ -78,6 +81,7 @@ void LlamaContextDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
allocator_,
is_free_buffer_after_forward_,
use_fmha,
cache_block_seq_len,
quant_policy);
silu_ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
......@@ -93,6 +97,7 @@ void LlamaContextDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
template<typename T>
void LlamaContextDecoder<T>::forwardSelfAttn(const Session& sess,
T* attn_io,
std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
int layer,
bool is_final)
......@@ -107,18 +112,17 @@ void LlamaContextDecoder<T>::forwardSelfAttn(const Session&
{"padding_offset", {MEMORY_GPU, TYPE_INT32, {sess.token_num}, padding_offset_}},
{"cu_seqlens", {MEMORY_GPU, TYPE_INT32, {sess.batch_size + 1}, cu_seqlens_}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.input_length}},
{"history_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.history_length}},
{"context_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.context_length}},
{"cu_block_counts", input_tensors->at("cu_block_counts")},
{"rope_theta", input_tensors->at("rope_theta")},
{"max_seq_len", input_tensors->at("max_seq_len")}};
auto& k_cache = *sess.k_cache;
auto& v_cache = *sess.v_cache;
TensorMap self_attention_output_tensors{
{"hidden_features", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_io}},
{"key_cache", k_cache},
{"value_cache", v_cache},
};
{"key_cache", output_tensors->at("key_cache")},
{"value_cache", output_tensors->at("value_cache")},
{"tmp_k", output_tensors->at("tmp_k")},
{"tmp_v", output_tensors->at("tmp_v")}};
context_attention_layer_->forward(&self_attention_output_tensors, //
&self_attention_input_tensors,
......@@ -139,6 +143,7 @@ LlamaContextDecoder<T>::LlamaContextDecoder(size_t head_num
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
......@@ -150,7 +155,7 @@ LlamaContextDecoder<T>::LlamaContextDecoder(size_t head_num
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
initialize(attn_params, kv_head_num, use_fmha, quant_policy);
initialize(attn_params, kv_head_num, use_fmha, cache_block_seq_len, quant_policy);
}
template<typename T>
......@@ -201,17 +206,16 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
sess.weights = decoder_layer_weights;
sess.input_length = input_tensors->at("input_lengths").getPtr<int>();
sess.history_length = input_tensors->at("history_lengths").getPtr<int>();
sess.context_length = input_tensors->at("context_lengths").getPtr<int>();
T* decoder_input_output = input_tensors->at("decoder_input").getPtr<T>();
T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();
sess.k_cache = &output_tensors->at("key_cache");
sess.v_cache = &output_tensors->at("value_cache");
allocateBuffer(sess.batch_size, sess.token_num, sess.max_query_len, sess.max_key_len);
// dbg(padding_offset_);
FT_CHECK(padding_offset_);
size_t tmp_token_num{};
invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_,
&tmp_token_num, // updated token num
......@@ -222,6 +226,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
sess.max_query_len,
stream_);
sync_check_cuda_error();
dbg(tmp_token_num, sess.token_num);
FT_CHECK(tmp_token_num == sess.token_num);
invokeCreateCausalMasks(attention_mask_,
......@@ -233,6 +238,9 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
stream_);
sync_check_cuda_error();
// Compare(
// decoder_input_output, sess.token_num * hidden_units_, Concat("context_decoder_input", 0), kCmpRead, stream_);
/////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(decoder_output,
......@@ -247,7 +255,7 @@ void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>*
for (size_t layer = 0; layer < num_layer_; ++layer) {
/////////////////////////////////////////////
/// self-attention
forwardSelfAttn(sess, decoder_output, input_tensors, layer, false);
forwardSelfAttn(sess, decoder_output, output_tensors, input_tensors, layer, false);
invokeFusedAddBiasResidualRMSNorm(decoder_input_output,
decoder_output,
......
......@@ -40,7 +40,11 @@ protected:
void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);
void freeBuffer() override;
void initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, bool use_fmha, int quant_policy);
void initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
bool use_fmha,
int cache_block_seq_len,
int quant_policy);
size_t head_num_;
size_t size_per_head_;
......@@ -63,21 +67,19 @@ protected:
const DataType data_type_;
struct Session {
size_t batch_size;
size_t token_num;
size_t max_query_len;
size_t max_key_len;
Tensor* k_cache;
Tensor* v_cache;
int* input_length{};
int* history_length{};
int* context_length{};
size_t batch_size;
size_t token_num;
size_t max_query_len;
size_t max_key_len;
int* input_length{};
int* context_length{};
const std::vector<LlamaDecoderLayerWeight<T>*>* weights;
};
void forwardSelfAttn(const Session& sess,
T* attn_io,
std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
int layer,
bool is_final);
......@@ -96,6 +98,7 @@ public:
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha,
int cache_block_seq_len,
int quant_policy);
~LlamaContextDecoder() override;
......
......@@ -41,6 +41,7 @@ LlamaDecoder<T>::LlamaDecoder(size_t head_num,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int cache_block_seq_len,
int quant_policy):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
......@@ -53,7 +54,7 @@ LlamaDecoder<T>::LlamaDecoder(size_t head_num,
data_type_(getTensorType<T>())
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize(attn_params, kv_head_num, quant_policy);
initialize(attn_params, kv_head_num, cache_block_seq_len, quant_policy);
}
template<typename T>
......@@ -65,7 +66,10 @@ LlamaDecoder<T>::~LlamaDecoder()
}
template<typename T>
void LlamaDecoder<T>::initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, int quant_policy)
void LlamaDecoder<T>::initialize(const LlamaAttentionParams& attn_params,
size_t kv_head_num,
int cache_block_seq_len,
int quant_policy)
{
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
......@@ -78,6 +82,7 @@ void LlamaDecoder<T>::initialize(const LlamaAttentionParams& attn_params, size_t
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
cache_block_seq_len,
quant_policy);
silu_ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
......@@ -118,6 +123,7 @@ void LlamaDecoder<T>::forwardSelfAttn(const LlamaDecoder::Session&
const std::unordered_map<std::string, Tensor>* input_tensors,
size_t layer)
{
NvtxScope scope("self_attn");
TM_LOG_DEBUG(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors(*input_tensors);
self_attention_input_tensors.insert("input_query",
......@@ -180,60 +186,73 @@ void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* ou
// for the shape of key cache, refer to decoder_masked_multihead_attention_template.hpp
NvtxScope forward_scope("decoder_forward");
Session sess{};
sess.batch_size = input_tensors->at("decoder_input").shape[0];
sess.weights = decoder_layer_weights;
allocateBuffer(sess.batch_size);
sess.ite = input_tensors->at("ite").getVal<const int>();
sess.k_cache = &output_tensors->at("key_cache");
sess.v_cache = &output_tensors->at("value_cache");
sess.max_memory_len = input_tensors->at("max_seq_len").getVal<int>();
T* decoder_input = input_tensors->at("decoder_input").getPtr<T>();
T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();
int step = input_tensors->at("step").getVal<int>();
// Compare(decoder_input, sess.batch_size * hidden_units_, Concat("decoder_input", 0, step), kCmpRead, stream_);
////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(decoder_output,
decoder_input,
decoder_layer_weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
{
NvtxScope rms_norm_scope("rms_norm_0");
invokeRootMeanSquareNorm(decoder_output,
decoder_input,
decoder_layer_weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
for (size_t layer = 0; layer < num_layer_; ++layer) {
NvtxScope layer_scope("decode_layer");
// output: self_attn_output_, k_cache, v_cache = self_attn(decoder_normed_input_)
forwardSelfAttn(sess, decoder_output, input_tensors, layer);
invokeFusedAddBiasResidualRMSNorm(decoder_input,
decoder_output,
decoder_layer_weights->at(layer)->self_attn_weights.output.bias,
decoder_layer_weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
{
NvtxScope rms_norm_scope("rms_norm_1");
invokeFusedAddBiasResidualRMSNorm(decoder_input,
decoder_output,
decoder_layer_weights->at(layer)->self_attn_weights.output.bias,
decoder_layer_weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
// decoder_layer_output_ = ffn(decoder_normed_input_)
forwardFfn(sess, decoder_output, layer);
auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
invokeFusedAddBiasResidualRMSNorm(decoder_input, //
decoder_output,
decoder_layer_weights->at(layer)->ffn_weights.output.bias,
scale_weight,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
{
NvtxScope rms_norm_scope("rms_norm_2");
auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
invokeFusedAddBiasResidualRMSNorm(decoder_input, //
decoder_output,
decoder_layer_weights->at(layer)->ffn_weights.output.bias,
scale_weight,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
}
if (is_free_buffer_after_forward_) {
......
......@@ -35,7 +35,8 @@ protected:
void allocateBuffer() override; // deprecated
void allocateBuffer(size_t batch_size);
void freeBuffer() override;
void initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, int quant_policy);
void
initialize(const LlamaAttentionParams& attn_params, size_t kv_head_num, int cache_block_seq_len, int quant_policy);
size_t head_num_;
size_t size_per_head_;
......@@ -53,8 +54,6 @@ protected:
struct Session {
size_t batch_size;
int ite;
size_t max_memory_len;
Tensor* k_cache;
Tensor* v_cache;
const std::vector<LlamaDecoderLayerWeight<T>*>* weights;
......@@ -80,6 +79,7 @@ public:
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int cache_block_seq_len,
int quant_policy);
~LlamaDecoder() override;
......
......@@ -302,7 +302,7 @@ void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType
self_attn_weights.past_kv_scale = loadArrayFromBin({4}, scale_path);
}
else {
self_attn_weights.past_kv_scale = {};
self_attn_weights.past_kv_scale = {1.f, 0.f, 1.f, 0.f};
}
}
......
......@@ -24,6 +24,7 @@
#include "src/turbomind/models/llama/LlamaLinear.h"
#include "src/turbomind/models/llama/llama_params.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/nccl_utils.h"
namespace turbomind {
......@@ -32,7 +33,7 @@ template<typename T>
class LlamaDecoderSelfAttentionLayer {
public:
void freeBuffer();
void allocateBuffer(size_t batch_size, int key_len, int max_memory_len);
void allocateBuffer(size_t batch_size);
LlamaDecoderSelfAttentionLayer(size_t head_num,
size_t kv_head_num,
......@@ -43,6 +44,7 @@ public:
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
int cache_block_seq_len,
int quant_policy):
head_num_(head_num),
kv_head_num_(kv_head_num),
......@@ -56,9 +58,11 @@ public:
stream_(stream),
linear_(cublas_wrapper, stream),
allocator_(allocator),
kv_cache_block_len_(cache_block_seq_len),
is_free_buffer_after_forward_(is_free_buffer_after_forward),
quant_policy_(quant_policy)
{
arch_ = getSMVersion();
}
~LlamaDecoderSelfAttentionLayer()
......@@ -76,6 +80,7 @@ private:
const size_t local_head_num_;
const size_t local_kv_head_num_;
const size_t local_hidden_units_;
const size_t kv_cache_block_len_;
const bool is_free_buffer_after_forward_;
const int quant_policy_;
......@@ -90,7 +95,11 @@ private:
T* qkv_buf_ = nullptr;
T* context_buf_ = nullptr;
static constexpr int kMaxSplitK = 16; // must be <= WARP_SIZE
float* workspace_ = nullptr;
bool is_allocate_buffer_{};
int arch_{};
};
} // namespace turbomind