Unverified commit 9efcac38 authored by Li Zhang, committed by GitHub

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/models/llama/LlamaBatch.h"
#include "src/fastertransformer/kernels/decoding_kernels.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/LlamaV2.h"
#include "src/fastertransformer/models/llama/Request.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/logger.h"
#include <cstdint>
#include <iomanip>
#include <sstream>
#include <unordered_map>
namespace fastertransformer {
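// Validates incoming stop/infer requests before they are scheduled:
//  - ids that occur more than once across the two lists are rejected with Request::kConflict,
//  - requests carrying both start_flag and stop_flag, or continuation requests whose sequence
//    is not present in the kv cache, are rejected with Request::kInvalid,
//  - stop requests for sequences that are not active are rejected with Request::kInactive,
//  - infer requests whose id is already in the active batch are rejected with Request::kBusy.
// Rejected requests are signalled with the error code and dropped from the lists.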
template<typename T>
void LlamaBatch<T>::verifyRequests(std::vector<std::shared_ptr<Request>>& stop_reqs,
std::vector<std::shared_ptr<Request>>& infer_reqs)
{
std::unordered_map<uint64_t, int> occurance;
auto count_occurance = [&occurance](const std::vector<std::shared_ptr<Request>>& rs) {
for (const auto& r : rs) {
++occurance[r->id];
}
};
auto invalidate = [](const char* type, std::shared_ptr<Request>& req, int ec) {
FT_LOG_WARNING("[verifyRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec);
req->signal.set_value(ec);
req.reset();
};
auto handle_conflict_or_invalid = [this, &occurance, &invalidate](std::vector<std::shared_ptr<Request>>& rs,
const char* type) {
for (auto& r : rs) {
if (r) {
int ec = 0;
if (occurance[r->id] != 1) {
ec = Request::kConflict;
}
else if (r->start_flag && r->stop_flag) {
ec = Request::kInvalid;
}
else if (!r->start_flag && !llama_->kv_cache_mgr_->contains(r->id)) {
ec = Request::kInvalid;
}
if (ec) {
invalidate(type, r, ec);
}
}
}
};
auto drop_invalid = [](std::vector<std::shared_ptr<Request>>& rs) {
int count = 0;
for (int i = 0; i < rs.size(); ++i) {
if (rs[i]) {
rs[count++] = std::move(rs[i]);
}
}
rs.resize(count);
};
count_occurance(stop_reqs);
count_occurance(infer_reqs);
if (!stop_reqs.empty()) {
handle_conflict_or_invalid(stop_reqs, "stop");
// invalidate stop-only requests for inactive sequences
for (auto& r : stop_reqs) {
if (r && r->end_flag == false) {
int ec = Request::kInactive;
for (int i = 0; i < batch_size_; ++i) {
if (requests_[i] && requests_[i]->id == r->id) {
ec = 0;
break;
}
}
if (ec) {
invalidate("stop", r, ec);
}
}
}
drop_invalid(stop_reqs);
}
if (!infer_reqs.empty()) {
handle_conflict_or_invalid(infer_reqs, "infer");
// invalidate requests for busy sequences
for (auto& r : infer_reqs) {
if (r) {
for (int i = 0; i < batch_size_; ++i) {
if (requests_[i] && requests_[i]->id == r->id) {
invalidate("infer", r, Request::kBusy);
break;
}
}
}
}
drop_invalid(infer_reqs);
}
}
template<typename T>
void LlamaBatch<T>::handleStopRequests(const std::vector<std::shared_ptr<Request>>& requests)
{
for (const auto& r : requests) {
int ec = Request::kFail;
// find matching active sequence
for (int i = 0; i < batch_size_; ++i) {
// stop & optionally erase active sequence
if (requests_[i] && requests_[i]->id == r->id) {
ec = 0;
finishRequest(i, r->end_flag);
break;
}
}
// mismatch, try erase inactive sequence
if (ec && r->end_flag) {
ec = 0;
llama_->kv_cache_mgr_->erase(r->id);
}
// clear output buffers (prevent leaking conversations) if the request succeeded
if (ec == 0) {
auto& output_ids = r->outputs[rank_].at("output_ids");
auto& sequence_length = r->outputs[rank_].at("sequence_length");
check_cuda_error(
cudaMemsetAsync(output_ids.getPtr<int>(), 0, sizeof(int) * output_ids.shape.at(2), stream_));
check_cuda_error(cudaMemsetAsync(sequence_length.getPtr<int>(), 0, sizeof(int), stream_));
check_cuda_error(cudaStreamSynchronize(stream_));
}
if (rank_ == 0) {
r->signal.set_value(ec);
}
}
}
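// (Re)allocates the per-iteration device buffers for the current decoding batch.
// `token_ids_buf_` is sized 2 * session_len because `step_` may range up to 2 * session_len
// (see the note in initializeGeneration); the `context_decoder_*` buffers are sized by
// `max_context_token_num_` rather than by the batch size.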
template<typename T>
void LlamaBatch<T>::allocateBuffer(size_t batch_size, size_t session_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
const size_t batchxbeam = batch_size;
const size_t hidden_units = llama_->hidden_units_;
const size_t vocab_size = llama_->vocab_size_;
context_decoder_input_buf_ =
(T*)allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * max_context_token_num_ * hidden_units, false);
context_decoder_ids_buf_ =
(int*)allocator_->reMalloc(context_decoder_ids_buf_, sizeof(int) * max_context_token_num_, false);
decoder_input_buf_ = (T*)allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units, false);
decoder_output_buf_ = (T*)allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units, false);
input_ids_buf_ = (int*)allocator_->reMalloc(input_ids_buf_, sizeof(int) * batchxbeam * session_len, true);
input_length_buf_ = (int*)allocator_->reMalloc(input_length_buf_, sizeof(int) * batchxbeam);
history_length_buf_ = (int*)allocator_->reMalloc(history_length_buf_, sizeof(int) * batchxbeam);
context_length_buf_ = (int*)allocator_->reMalloc(context_length_buf_, sizeof(int) * batchxbeam);
total_padding_count_ = (int*)allocator_->reMalloc(total_padding_count_, sizeof(int) * batchxbeam, false);
sequence_lengths_ = (int*)allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false);
k_cache_ptr_buf_ = (uint64_t*)allocator_->reMalloc(k_cache_ptr_buf_, sizeof(uint64_t) * batchxbeam);
v_cache_ptr_buf_ = (uint64_t*)allocator_->reMalloc(v_cache_ptr_buf_, sizeof(uint64_t) * batchxbeam);
logits_buf_ = (float*)allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size, false);
local_logits_buf_ = (float*)allocator_->reMalloc(local_logits_buf_, sizeof(float) * batchxbeam * vocab_size, false);
token_ids_buf_ = (int*)allocator_->reMalloc(token_ids_buf_, sizeof(int) * batchxbeam * session_len * 2, true);
end_ids_buf_ = (int*)allocator_->reMalloc(end_ids_buf_, sizeof(int) * batch_size, false);
finished_buf_ = (bool*)allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false);
seq_limit_len_ = (uint32_t*)allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaBatch<T>::allocatePersistantBuffer(size_t max_batch_size)
{
output_ids_buf_ = (int*)allocator_->reMalloc(output_ids_buf_, sizeof(int) * max_batch_size * session_len_, true);
stop_words_buf_ =
(int*)allocator_->reMalloc(stop_words_buf_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true);
bad_words_buf_ =
(int*)allocator_->reMalloc(bad_words_buf_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true);
h_runtime_top_k_ = (int*)allocator_->reMalloc(h_runtime_top_k_, sizeof(int) * max_batch_size, true, true);
h_runtime_top_p_ = (float*)allocator_->reMalloc(h_runtime_top_p_, sizeof(float) * max_batch_size, true, true);
h_temperature_ = (float*)allocator_->reMalloc(h_temperature_, sizeof(float) * max_batch_size, true, true);
h_repetition_penalty_ =
(float*)allocator_->reMalloc(h_repetition_penalty_, sizeof(float) * max_batch_size, true, true);
h_random_seed_ = (uint64_t*)allocator_->reMalloc(h_random_seed_, sizeof(uint64_t) * max_batch_size, true, true);
sampling_params_ = {{"stop_words_list", stop_words_buf_},
{"bad_words_list", bad_words_buf_},
{"runtime_top_k", h_runtime_top_k_},
{"runtime_top_p", h_runtime_top_p_},
{"temperature", h_temperature_},
{"repetition_penalty", h_repetition_penalty_},
{"random_seed", h_random_seed_}};
topk_curandstate_buf_ = allocator_->reMalloc(topk_curandstate_buf_, sizeof(curandState_t) * max_batch_size, true);
topp_curandstate_buf_ = allocator_->reMalloc(topp_curandstate_buf_, sizeof(curandState_t) * max_batch_size, true);
{
NcclGuard barrier(llama_->tensor_para_, stream_, true);
h_input_ids_buf_ =
(int*)allocator_->reMalloc(h_input_ids_buf_, sizeof(int) * max_batch_size * session_len_, false, true);
h_input_length_buf_ =
(int*)allocator_->reMalloc(h_input_length_buf_, sizeof(int) * max_batch_size, false, true);
h_history_length_buf_ =
(int*)allocator_->reMalloc(h_history_length_buf_, sizeof(int) * max_batch_size, false, true);
h_context_length_buf_ =
(int*)allocator_->reMalloc(h_context_length_buf_, sizeof(int) * max_batch_size, false, true);
h_sequence_lengths_ =
(int*)allocator_->reMalloc(h_sequence_lengths_, sizeof(int) * max_batch_size, false, true);
h_k_cache_ptr_buf_ =
(uintptr_t*)allocator_->reMalloc(h_k_cache_ptr_buf_, sizeof(uintptr_t) * max_batch_size, true, true);
h_v_cache_ptr_buf_ =
(uintptr_t*)allocator_->reMalloc(h_v_cache_ptr_buf_, sizeof(uintptr_t) * max_batch_size, true, true);
h_finished_buf_ = (bool*)allocator_->reMalloc(h_finished_buf_, sizeof(bool) * max_batch_size, false, true);
h_seq_limit_len_ =
(uint32_t*)allocator_->reMalloc(h_seq_limit_len_, sizeof(uint32_t) * max_batch_size, false, true);
}
is_allocate_persistant_buffer_ = true;
}
template<typename T>
void LlamaBatch<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)&context_decoder_input_buf_);
allocator_->free((void**)&context_decoder_ids_buf_);
allocator_->free((void**)&decoder_input_buf_);
allocator_->free((void**)&decoder_output_buf_);
allocator_->free((void**)&input_ids_buf_);
allocator_->free((void**)&input_length_buf_);
allocator_->free((void**)&history_length_buf_);
allocator_->free((void**)&context_length_buf_);
allocator_->free((void**)&total_padding_count_);
allocator_->free((void**)&sequence_lengths_);
allocator_->free((void**)&k_cache_ptr_buf_);
allocator_->free((void**)&v_cache_ptr_buf_);
allocator_->free((void**)&logits_buf_);
allocator_->free((void**)&local_logits_buf_);
allocator_->free((void**)&token_ids_buf_);
allocator_->free((void**)&end_ids_buf_);
allocator_->free((void**)&finished_buf_);
allocator_->free((void**)&seq_limit_len_);
is_allocate_buffer_ = false;
}
if (is_allocate_persistant_buffer_) {
allocator_->free((void**)&h_input_ids_buf_, true);
allocator_->free((void**)&h_input_length_buf_, true);
allocator_->free((void**)&h_history_length_buf_, true);
allocator_->free((void**)&h_context_length_buf_, true);
allocator_->free((void**)&h_sequence_lengths_, true);
allocator_->free((void**)&h_k_cache_ptr_buf_, true);
allocator_->free((void**)&h_v_cache_ptr_buf_, true);
allocator_->free((void**)&h_seq_limit_len_, true);
allocator_->free((void**)&h_finished_buf_, true);
allocator_->free((void**)&output_ids_buf_);
is_allocate_persistant_buffer_ = false;
}
}
template<typename T>
LlamaBatch<T>::LlamaBatch(int max_batch_size, int max_context_token_num, int session_len, LlamaV2<T>* llama):
max_batch_size_(max_batch_size),
max_context_token_num_(max_context_token_num),
session_len_(session_len),
rank_(llama->tensor_para_.rank_),
debug_(llama->debug_),
llama_(llama),
data_type_(getTensorType<T>())
{
stream_ = llama_->stream_;
allocator_ = llama_->allocator_;
cublas_wrapper_ = llama_->cublas_wrapper_;
requests_.resize(max_batch_size);
request_seq_len_limit_.resize(max_batch_size);
cached_seq_.resize(max_batch_size);
allocatePersistantBuffer(max_batch_size);
}
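// Gathers per-request sampling parameters (top_k, top_p, temperature, ...) into batched
// tensors: for each known parameter that at least one request provides, the batched buffer
// is zero-filled, the per-slot values are copied in, and the result is handed to the dynamic
// decode layer via `setup`. Random states of ongoing requests (and of new requests without a
// "random_seed") are then restored from the per-slot curand state buffers.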
template<typename T>
void LlamaBatch<T>::initializeSampling(int infer_request_count)
{
TensorMap inputs;
for (const auto& param : sampling_params_) {
const Tensor* ptr{};
for (int i = 0; i < batch_size_; ++i) {
if (requests_[i]->inputs[rank_].isExist(param.first)) {
ptr = &requests_[i]->inputs[rank_].at(param.first);
break;
}
}
if (ptr) {
const auto& ref = *ptr;
auto shape = ref.shape;
FT_CHECK(shape[0] == 1);
shape[0] = batch_size_;
const int size_in_bytes = ref.sizeBytes();
check_cuda_error(cudaMemsetAsync(param.second, 0, size_in_bytes * batch_size_, stream_));
for (int i = 0; i < batch_size_; ++i) {
if (requests_[i]->inputs[rank_].isExist(param.first)) {
auto& src = requests_[i]->inputs[rank_].at(param.first);
FT_CHECK(ref.shape == src.shape);
check_cuda_error(cudaMemcpyAsync((uint8_t*)param.second + size_in_bytes * i,
src.getPtr<void>(),
size_in_bytes,
cudaMemcpyDefault,
stream_));
}
}
inputs.insert({param.first, {ref.where, ref.type, shape, param.second}});
if (debug_ && rank_ == 0) {
FT_LOG_INFO("[initializeSampling] %s", format({param.first, inputs.at(param.first)}).c_str());
}
}
}
inputs_ = std::move(inputs);
llama_->dynamic_decode_layer_->setup(batch_size_, 1, &inputs_);
for (int i = 0; i < batch_size_; ++i) {
// restore saved random states for ongoing requests, and for new requests without a "random_seed"
if (i < batch_size_ - infer_request_count || !requests_[i]->inputs[rank_].isExist("random_seed")) {
check_cuda_error(cudaMemcpyAsync(llama_->dynamic_decode_layer_->topk_curandstate_buf() + i,
(curandState_t*)topk_curandstate_buf_ + i,
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
check_cuda_error(cudaMemcpyAsync(llama_->dynamic_decode_layer_->topp_curandstate_buf() + i,
(curandState_t*)topp_curandstate_buf_ + i,
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
}
}
handleOptArg(&inputs_, "end_id", end_ids_buf_, llama_->end_id_, batch_size_);
cudaStreamSynchronize(0);
}
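// Prepares device state for the generation loop: transposes output ids into the step-major
// `token_ids_buf_`, aligns the last context token of every sequence to index
// `max_context_len_ - 1`, initializes sequence lengths, padding counts and finish flags,
// and sets `step_ = max_context_len_`.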
template<typename T>
void LlamaBatch<T>::initializeGeneration()
{
max_context_len_ = *std::max_element(h_context_length_buf_, h_context_length_buf_ + batch_size_);
check_cuda_error(cudaMemsetAsync(token_ids_buf_, 0, sizeof(int) * batch_size_ * session_len_ * 2, stream_));
invokeTransposeAxis01(token_ids_buf_, output_ids_buf_, batch_size_, session_len_, 1, stream_);
sync_check_cuda_error();
// token_ids_buf_[s, b]
// ABCDe ABCDe e
// ABCDEFGHIJk ABCDEFGHIJk
// ABCDEFGHi -> ABCDEFGHi i
// ABCDEFGh ABCDEFGh h
// ABCd ABCd d
for (int i = 0; i < batch_size_; ++i) {
auto token_ids = token_ids_buf_ + i;
auto p_src = h_context_length_buf_[i] - 1;
auto p_dst = max_context_len_ - 1;
if (p_src != p_dst) { // dst and src of `cudaMemcpyAsync` must not overlap
check_cuda_error(cudaMemcpyAsync(token_ids + p_dst * batch_size_,
token_ids + p_src * batch_size_,
sizeof(int),
cudaMemcpyDefault,
stream_));
}
}
check_cuda_error(cudaMemcpyAsync(
context_length_buf_, h_context_length_buf_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(cudaMemcpyAsync(
k_cache_ptr_buf_, h_k_cache_ptr_buf_, sizeof(uintptr_t) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(cudaMemcpyAsync(
v_cache_ptr_buf_, h_v_cache_ptr_buf_, sizeof(uintptr_t) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(
cudaMemcpyAsync(sequence_lengths_, context_length_buf_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
// `sequence_lengths_` will be increased by dynamic decode
// note that "sequence length" has different semantics in the decoder and in the output
// - in decoder it means length of sequence that has kv cache already computed
// - in output it means length of all tokens (the last generated token does not have k/v cache computed yet)
invokePlusScalar(sequence_lengths_, -1, batch_size_, stream_);
sync_check_cuda_error();
// total_padding_count_
// decoding starts at max_context_len
check_cuda_error(cudaMemsetAsync(total_padding_count_, 0, sizeof(int) * batch_size_, stream_));
invokeUpdatePaddingCount(total_padding_count_, //
context_length_buf_,
max_context_len_,
batch_size_,
1,
stream_);
sync_check_cuda_error();
// seq_limit_len_, will be compared to `step` instead of `sequence_length`, so padding len should be accounted for
for (int i = 0; i < batch_size_; ++i) {
h_seq_limit_len_[i] = request_seq_len_limit_[i] + (max_context_len_ - h_context_length_buf_[i]);
// mask finished sequences
h_finished_buf_[i] = max_context_len_ >= h_seq_limit_len_[i];
}
check_cuda_error(
cudaMemcpyAsync(seq_limit_len_, h_seq_limit_len_, sizeof(uint32_t) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(
cudaMemcpyAsync(finished_buf_, h_finished_buf_, sizeof(bool) * batch_size_, cudaMemcpyDefault, stream_));
// ! range of step_ [1, 2 * session_len]
// consider a sequence with context_len == session_len and another sequence with context_len == 1 and
// request_output_len == session_len - 1 => step_ will loop in [session_len, 2 * session_len)
step_ = max_context_len_;
if (rank_ == 0) {
FT_LOG_INFO("[initGen] batch_size = %d", (int)batch_size_);
FT_LOG_INFO("[initGen] max_context_len = %d", (int)max_context_len_);
FT_LOG_INFO("[initGen] slot sequence_id context_len seq_limit_len finished");
for (int i = 0; i < batch_size_; ++i) {
FT_LOG_INFO("[initGen] %4d %11ld %11d %13d %8d",
i,
(long)cached_seq_[i].id,
h_context_length_buf_[i],
(int)h_seq_limit_len_[i],
(int)h_finished_buf_[i]);
}
}
}
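// Runs a single generation step for the whole batch: embedding lookup of the tokens at
// `step_ - 1`, decoder forward, logits projection and dynamic decode (sampling + stop
// criteria). Returns false when dynamic decode signals that generation should stop.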
template<typename T>
bool LlamaBatch<T>::generate()
{
constexpr int kLogInterval = 10;
if (rank_ == 0 && (step_ - 1) % kLogInterval == 0) {
FT_LOG_INFO("------------------------- step = %d -------------------------", step_ - 1);
}
const bool is_first_step = step_ == max_context_len_;
std::vector<int> prev;
if (debug_ && rank_ == 0 && is_first_step) {
prev.resize(batch_size_);
cudaMemcpyAsync(prev.data(),
token_ids_buf_ + (step_ - 1) * batch_size_,
sizeof(int) * batch_size_,
cudaMemcpyDefault,
stream_);
}
// embeddingLookup(step_ - 1);
llama_->embeddingLookup(decoder_input_buf_, //
token_ids_buf_,
batch_size_,
step_ - 1);
llama_->decoderForward(decoder_output_buf_,
k_cache_ptr_buf_,
v_cache_ptr_buf_,
decoder_input_buf_,
sequence_lengths_,
total_padding_count_,
finished_buf_,
step_,
0,
session_len_,
batch_size_);
llama_->postDecodeEmbedding(logits_buf_, //
local_logits_buf_,
decoder_output_buf_,
batch_size_);
// stop-words & bad-words require the matched tokens to be contiguous, so item size > 1 is
// not supported yet.
bool should_stop{};
llama_->dynamicDecode(token_ids_buf_,
finished_buf_,
sequence_lengths_,
&should_stop,
&inputs_,
&outputs_,
logits_buf_,
seq_limit_len_,
context_length_buf_,
end_ids_buf_,
step_,
0,
max_context_len_,
session_len_ * 2,
batch_size_);
if (debug_ && rank_ == 0) {
std::vector<int> curr(batch_size_);
cudaMemcpyAsync(
curr.data(), token_ids_buf_ + step_ * batch_size_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_);
cudaStreamSynchronize(stream_);
if (is_first_step) {
std::stringstream sprev;
for (int k = 0; k < prev.size(); ++k) {
sprev << std::setw(6) << prev[k];
}
FT_LOG_INFO("[ lookup ] step = %d, [%s]", step_ - 1, sprev.str().c_str());
}
std::stringstream scurr;
for (int k = 0; k < curr.size(); ++k) {
scurr << std::setw(6) << curr[k];
}
FT_LOG_INFO("[generate] step = %d, [%s]", step_ - 1, scurr.str().c_str());
}
////////////////////////////////////////////////
/// ! increase the step counter
++step_;
return !should_stop;
}
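// Admits new inference requests into the batch: creates or fetches the kv-cache sequence for
// each request, sorts the newcomers by cache-adjusted input length so that slots needing
// context decoding end up contiguous at the tail, then fills the host-side length/pointer
// arrays and copies them to the device.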
template<typename T>
void LlamaBatch<T>::initialize(const std::vector<std::shared_ptr<Request>>& infer_requests)
{
FT_CHECK(batch_size_ + infer_requests.size() <= max_batch_size_);
const int infer_request_count = infer_requests.size();
allocateBuffer(batch_size_ + infer_request_count, session_len_);
// handle infer requests
std::vector<int> tmp_input_length(infer_request_count);
std::vector<CachedSeq> tmp_cached_seq;
tmp_cached_seq.reserve(infer_request_count);
int tmp_max_input_length = 0;
for (int i = 0; i < infer_request_count; ++i) {
auto& r = *infer_requests[i];
LlamaCacheManager::Sequence seq{};
if (r.start_flag) {
seq = llama_->kv_cache_mgr_->create(r.id, stream_);
}
else {
seq = llama_->kv_cache_mgr_->fetch(r.id, stream_);
}
const int step = r.inputs[rank_].getVal<int>("step", -1);
if (step >= 0) {
if (step <= seq.token_ids.size()) {
seq.token_ids.resize(step);
seq.cache_len = std::min(seq.cache_len, (size_t)step);
}
else if (rank_ == 0) {
FT_LOG_WARNING("[initialize] Skipping invalid step (%d) setting for ID %ld", step, (long)seq.id);
}
}
// input length with missing cache accounted for
int actual_input_len = r.inputs[rank_].getVal<int>("input_lengths") + (seq.token_ids.size() - seq.cache_len);
// insert `start_id` for empty sequences
if (seq.token_ids.empty() && actual_input_len == 0) {
seq.token_ids.push_back(llama_->start_id_);
seq.cache_len = 0;
actual_input_len = seq.token_ids.size() - seq.cache_len;
}
tmp_input_length[i] = actual_input_len;
tmp_max_input_length = std::max((int)tmp_max_input_length, actual_input_len);
tmp_cached_seq.push_back(std::move(seq));
}
FT_CHECK(tmp_max_input_length > 0);
const int max_input_length = tmp_max_input_length;
// arrange requests in ascending order w.r.t. actual input lengths, so that requests that need
// context decoding are grouped together
{
std::vector<int> idxs(tmp_input_length.size());
std::iota(idxs.begin(), idxs.end(), 0);
std::sort(idxs.begin(), idxs.end(), [&](int i, int j) { return tmp_input_length[i] < tmp_input_length[j]; });
for (int i = 0; i < idxs.size(); ++i) {
requests_[batch_size_ + i] = infer_requests[idxs[i]];
cached_seq_[batch_size_ + i] = tmp_cached_seq[idxs[i]];
}
}
const int count = batch_size_ + infer_requests.size();
std::vector<int> tmp_input_len(count);
for (int i = batch_size_; i < count; ++i) {
const auto& seq = cached_seq_[i];
h_input_length_buf_[i] = requests_[i]->inputs[rank_].getVal<int>("input_lengths");
tmp_input_len[i] = h_input_length_buf_[i];
// prepare output ids
// <--------> max_context_len
// aaaAAAA
// bbbbBBBBBB
// ccCCC
auto output_ids_ptr = output_ids_buf_ + i * session_len_;
// clear the persistent buffer to prevent leaking previous conversation
check_cuda_error(cudaMemsetAsync(output_ids_ptr, 0, sizeof(int) * session_len_, stream_));
if (!seq.token_ids.empty()) {
check_cuda_error(cudaMemcpyAsync(output_ids_ptr, //
seq.token_ids.data(),
sizeof(int) * seq.token_ids.size(),
cudaMemcpyDefault,
stream_));
output_ids_ptr += seq.token_ids.size();
}
if (h_input_length_buf_[i]) {
auto input_ids_ptr = requests_[i]->inputs[rank_].getPtr<int>("input_ids");
check_cuda_error(cudaMemcpyAsync(output_ids_ptr, //
input_ids_ptr,
sizeof(int) * h_input_length_buf_[i],
cudaMemcpyDefault,
stream_));
}
if (!requests_[i]->start_flag && !seq.random_state_.empty()) {
check_cuda_error(cudaMemcpyAsync((curandState_t*)topk_curandstate_buf_ + i,
seq.random_state_.data(),
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
check_cuda_error(cudaMemcpyAsync((curandState_t*)topp_curandstate_buf_ + i,
seq.random_state_.data() + sizeof(curandState_t),
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
}
}
for (int i = batch_size_; i < count; ++i) {
const auto& seq = cached_seq_[i];
const int missed = (int)seq.token_ids.size() - seq.cache_len;
auto input_ids_buf = input_ids_buf_ + i * session_len_;
FT_CHECK(missed >= 0);
if (missed > 0) {
check_cuda_error(cudaMemcpyAsync(input_ids_buf, //
seq.token_ids.data() + seq.cache_len,
sizeof(int) * missed,
cudaMemcpyDefault,
stream_));
input_ids_buf += missed;
}
auto& input_ids = requests_[i]->inputs[rank_].at("input_ids");
check_cuda_error(cudaMemcpyAsync(input_ids_buf, //
input_ids.getPtr<int>(),
sizeof(int) * h_input_length_buf_[i],
cudaMemcpyDefault,
stream_));
h_input_length_buf_[i] += missed;
h_history_length_buf_[i] = seq.cache_len;
h_context_length_buf_[i] = h_input_length_buf_[i] + h_history_length_buf_[i];
const int request_output_len = requests_[i]->inputs[rank_].getVal<int>("request_output_len");
request_seq_len_limit_[i] = h_context_length_buf_[i] + request_output_len;
// `length_criterion` sets finish flag when step >= seq_limit_len, however when step == seq_limit_len
// the actual sequence length is seq_limit_len + 1, hence seq_limit_len must be truncated to session_len - 1
if (request_seq_len_limit_[i] >= session_len_) {
request_seq_len_limit_[i] = session_len_ - 1;
if (rank_ == 0) {
const int trunc_output_len = request_seq_len_limit_[i] - h_context_length_buf_[i];
FT_LOG_WARNING(
"[initialize] [%ld] total sequence length (%d + %d) exceeds session_len (%d), request_output_len is truncated to %d",
(long)seq.id,
h_context_length_buf_[i],
request_output_len,
(int)session_len_,
trunc_output_len);
}
}
h_k_cache_ptr_buf_[i] = (uint64_t)seq.k_cache;
h_v_cache_ptr_buf_[i] = (uint64_t)seq.v_cache;
}
const int max_context_len = *std::max_element(h_context_length_buf_ + batch_size_, h_context_length_buf_ + count);
batch_size_ = count;
max_context_len_ = max_context_len;
step_ = max_context_len;
check_cuda_error(
cudaMemcpyAsync(input_length_buf_, h_input_length_buf_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(cudaMemcpyAsync(
history_length_buf_, h_history_length_buf_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(cudaMemcpyAsync(
context_length_buf_, h_context_length_buf_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(cudaMemcpyAsync(
k_cache_ptr_buf_, h_k_cache_ptr_buf_, sizeof(uintptr_t) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(cudaMemcpyAsync(
v_cache_ptr_buf_, h_v_cache_ptr_buf_, sizeof(uintptr_t) * batch_size_, cudaMemcpyDefault, stream_));
if (llama_->tensor_para_.rank_ == 0) {
FT_LOG_INFO("[init] infer_request_count = %d", (int)infer_request_count);
FT_LOG_INFO("[init] batch_size = %d", (int)batch_size_);
FT_LOG_INFO("[init] session_len = %d", (int)session_len_);
FT_LOG_INFO("[init] max_input_length = %d", (int)max_input_length);
FT_LOG_INFO("[init] max_context_len = %d", (int)max_context_len);
FT_LOG_INFO(
"[init] slot sequence_id history_len input_len context_len tmp_input_len token_ids.size cache_len");
for (int i = batch_size_ - infer_request_count; i < batch_size_; ++i) {
FT_LOG_INFO("[init] %4d %11ld %11d %9d %11d %13d %14d %9d",
i,
(long)cached_seq_[i].id,
h_history_length_buf_[i],
h_input_length_buf_[i],
h_context_length_buf_[i],
tmp_input_len[i],
(int)cached_seq_[i].token_ids.size(),
(int)cached_seq_[i].cache_len);
}
}
}
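// Prefills the kv cache for all slots whose (cache-missed) input length exceeds 1. Inputs are
// concatenated without padding into `context_decoder_ids_buf_` and processed in chunks sized
// so that the accumulated token count stays within `max_context_token_num_`.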
template<typename T>
void LlamaBatch<T>::contextDecode()
{
int base = -1;
for (int i = 0; i < batch_size_; ++i) {
if (h_input_length_buf_[i] > 1) {
base = i;
break;
}
}
if (base >= 0) {
check_cuda_error(cudaStreamSynchronize(stream_));
const auto tick = std::chrono::high_resolution_clock::now();
const int context_decode_count = batch_size_ - base;
if (rank_ == 0) {
FT_LOG_INFO("[decodeContext] base = %d, count = %d", base, context_decode_count);
}
invokePlusScalar(input_length_buf_ + base, -1, context_decode_count, stream_);
invokePlusScalar(context_length_buf_ + base, -1, context_decode_count, stream_);
auto get_input_len = [this](int index) { return h_input_length_buf_[index] - 1; };
auto get_context_len = [this](int index) { return h_context_length_buf_[index] - 1; };
auto token_num = get_input_len(base);
auto max_input_len = get_input_len(base);
auto max_context_len = get_context_len(base);
auto offset = base;
for (int i = offset + 1; i <= batch_size_; ++i) {
if (i == batch_size_ || token_num + h_context_length_buf_[i] > max_context_token_num_) {
const int context_decode_batch_size = i - offset;
if (rank_ == 0) {
FT_LOG_INFO(
"[decodeContext] offset = %d, batch_size = %d, token_num = %d, max_input_len = %d, max_context_len = %d",
offset,
context_decode_batch_size,
token_num,
max_input_len,
max_context_len);
}
// construct context_decoder_ids w/o padding
// aaaa____
// bb______ -> aaaabbcccccccc
// cccccccc
auto context_decoder_ids = context_decoder_ids_buf_;
for (int j = offset; j < i; ++j) {
check_cuda_error(cudaMemcpyAsync(context_decoder_ids,
input_ids_buf_ + j * session_len_,
sizeof(int) * get_input_len(j),
cudaMemcpyDefault,
stream_));
context_decoder_ids += get_input_len(j);
}
llama_->contextDecode(nullptr,
k_cache_ptr_buf_ + offset,
v_cache_ptr_buf_ + offset,
context_decoder_input_buf_,
nullptr,
context_decoder_ids_buf_,
input_length_buf_ + offset,
history_length_buf_ + offset,
context_length_buf_ + offset,
token_num,
max_input_len,
max_context_len,
session_len_,
context_decode_batch_size);
if (i < batch_size_) {
token_num = get_input_len(i);
max_input_len = get_input_len(i);
max_context_len = get_context_len(i);
offset = i;
}
}
else {
token_num += get_input_len(i);
max_input_len = std::max(max_input_len, get_input_len(i));
max_context_len = std::max(max_context_len, get_context_len(i));
}
}
invokePlusScalar(context_length_buf_ + base, 1, context_decode_count, stream_);
invokePlusScalar(input_length_buf_ + base, 1, context_decode_count, stream_);
for (int i = offset; i < batch_size_; ++i) {
h_input_length_buf_[i] = 0;
}
check_cuda_error(cudaStreamSynchronize(stream_));
const auto tock = std::chrono::high_resolution_clock::now();
if (rank_ == 0) {
FT_LOG_INFO("[decodeContext] %.2f ms", std::chrono::duration<float, std::milli>(tock - tick).count());
}
}
else if (rank_ == 0) {
FT_LOG_INFO("[decodeContext] Context decoding is not needed.");
}
}
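// Copies finished flags and sequence lengths back to the host, publishes the output tensors,
// fires the per-request stream callbacks on rank 0 and finalizes every slot whose finished
// flag is set.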
template<typename T>
void LlamaBatch<T>::finish()
{
// copy to host the info needed by `synchronize()`
check_cuda_error(
cudaMemcpyAsync(h_finished_buf_, finished_buf_, sizeof(bool) * batch_size_, cudaMemcpyDefault, stream_));
check_cuda_error(
cudaMemcpyAsync(h_sequence_lengths_, sequence_lengths_, sizeof(int) * batch_size_, cudaMemcpyDefault, stream_));
setOutputTensors(step_);
check_cuda_error(cudaStreamSynchronize(stream_));
for (int i = 0; i < batch_size_; ++i) {
FT_CHECK(requests_[i] != nullptr);
if (requests_[i]->stream_cb && rank_ == 0) {
requests_[i]->stream_cb(&requests_[i]->outputs[rank_].get());
}
}
if (debug_ && rank_ == 0) {
std::stringstream ss;
for (int i = 0; i < batch_size_; ++i) {
ss << (i ? ", " : "") << "(" << h_sequence_lengths_[i] << "," << h_finished_buf_[i] << ")";
}
FT_LOG_INFO("[finish] [%s]", ss.str().c_str());
}
for (int i = 0; i < batch_size_; ++i) {
if (h_finished_buf_[i]) {
finishRequest(i, false);
++finished_count_;
}
}
}
template<typename T>
void LlamaBatch<T>::synchronize()
{
// compact
int idx = 0;
for (int i = 0; i < batch_size_; ++i) {
if (requests_[i]) {
h_input_length_buf_[idx] = 0;
h_history_length_buf_[idx] = 0;
h_context_length_buf_[idx] = h_sequence_lengths_[i] + 1;
h_sequence_lengths_[idx] = h_context_length_buf_[idx];
check_cuda_error(cudaMemcpyAsync((curandState_t*)topk_curandstate_buf_ + idx,
llama_->dynamic_decode_layer_->topk_curandstate_buf() + i,
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
check_cuda_error(cudaMemcpyAsync((curandState_t*)topp_curandstate_buf_ + idx,
llama_->dynamic_decode_layer_->topp_curandstate_buf() + i,
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
if (i != idx) {
h_finished_buf_[idx] = h_finished_buf_[i];
request_seq_len_limit_[idx] = request_seq_len_limit_[i];
h_k_cache_ptr_buf_[idx] = h_k_cache_ptr_buf_[i];
h_v_cache_ptr_buf_[idx] = h_v_cache_ptr_buf_[i];
requests_[idx] = std::move(requests_[i]);
cached_seq_[idx] = std::move(cached_seq_[i]);
check_cuda_error(cudaMemcpyAsync(output_ids_buf_ + idx * session_len_,
output_ids_buf_ + i * session_len_,
sizeof(int) * h_context_length_buf_[idx],
cudaMemcpyDefault,
stream_));
}
++idx;
}
}
batch_size_ = idx;
if (rank_ == 0) {
FT_LOG_INFO("[synchronize] batch_size = %d", (int)batch_size_);
}
finished_count_ = 0;
}
template<typename T>
void LlamaBatch<T>::setOutputTensors(int max_gen_step)
{
// [s,b] -> [b,s] and skip padding in [context_len, max_context_len)
invokeGatherOutput(output_ids_buf_,
token_ids_buf_,
context_length_buf_,
max_context_len_,
max_gen_step,
session_len_,
batch_size_,
stream_);
sync_check_cuda_error();
/// TODO: fuse the loop into a single kernel
for (int i = 0; i < batch_size_; ++i) {
if (requests_[i]) {
auto& output_ids = requests_[i]->outputs[rank_].at("output_ids");
auto& sequence_length = requests_[i]->outputs[rank_].at("sequence_length");
check_cuda_error(cudaMemcpyAsync(output_ids.getPtr<int>(),
output_ids_buf_ + i * session_len_,
sizeof(int) * output_ids.shape.at(2),
cudaMemcpyDefault,
stream_));
check_cuda_error(cudaMemcpyAsync(
sequence_length.getPtr<int>(), sequence_lengths_ + i, sizeof(int), cudaMemcpyDefault, stream_));
if (max_gen_step > max_context_len_) { // +1 for newly generated token
invokePlusScalar(sequence_length.getPtr<int>(), 1, 1, stream_);
}
}
}
}
template<typename T>
void LlamaBatch<T>::finishRequest(int index, bool force_end)
{
if (rank_ == 0) {
FT_LOG_INFO("[finishRequest] slot = %d, id = %lu", index, (long)requests_[index]->id);
}
if (debug_ && rank_ == 0) {
std::vector<int> tokens(h_sequence_lengths_[index] + 1);
cudaMemcpyAsync(tokens.data(),
output_ids_buf_ + index * session_len_,
sizeof(int) * tokens.size(),
cudaMemcpyDefault,
stream_);
cudaStreamSynchronize(stream_);
std::stringstream ss;
for (const auto& t : tokens) {
ss << " " << t;
}
FT_LOG_INFO("[finishRequest] slot %d, tokens [%s]", index, ss.str().c_str());
}
auto& output_ids_tensor = requests_[index]->outputs[rank_].at("output_ids");
const auto output_ids_data = output_ids_tensor.getPtr<int>();
if (requests_[index]->end_flag || force_end) {
llama_->kv_cache_mgr_->erase(requests_[index]->id);
}
else {
// the last generated token has not been processed by the decoder, so it has no k/v cache yet
const int n_steps = step_ - max_context_len_;
const int cache_len = h_sequence_lengths_[index];
const int output_len = n_steps > 0 ? cache_len + 1 : cache_len;
auto& seq = cached_seq_[index];
seq.cache_len = cache_len;
// update token IDs
seq.token_ids.resize(output_len);
check_cuda_error(cudaMemcpyAsync(
seq.token_ids.data(), output_ids_data, sizeof(int) * output_len, cudaMemcpyDefault, stream_));
// update random states
seq.random_state_.resize(sizeof(curandState_t) * 2);
check_cuda_error(cudaMemcpyAsync(seq.random_state_.data(),
llama_->dynamic_decode_layer_->topk_curandstate_buf() + index,
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
check_cuda_error(cudaMemcpyAsync(seq.random_state_.data() + sizeof(curandState_t),
llama_->dynamic_decode_layer_->topp_curandstate_buf() + index,
sizeof(curandState_t),
cudaMemcpyDefault,
stream_));
check_cuda_error(cudaStreamSynchronize(stream_));
llama_->kv_cache_mgr_->update(cached_seq_[index], stream_);
}
if (rank_ == 0) {
requests_[index]->signal.set_value(0);
}
requests_[index] = nullptr;
}
template class LlamaBatch<half>;
template class LlamaBatch<float>;
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/models/llama/LlamaCacheManager.h"
#include "src/fastertransformer/models/llama/Request.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
namespace fastertransformer {
template<typename T>
class LlamaV2;
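// Batch scheduler for LlamaV2. A rough, illustrative sketch of how a driver loop might call
// into it (the actual caller lives in LlamaV2 and is not part of this file; `batch`,
// `stop_reqs` and `infer_reqs` below are hypothetical names):
//
//   batch.verifyRequests(stop_reqs, infer_reqs);  // reject conflicting / invalid requests
//   batch.handleStopRequests(stop_reqs);          // stop or erase the targeted sequences
//   batch.initialize(infer_reqs);                 // admit new requests into the batch
//   batch.contextDecode();                        // prefill kv cache for new inputs
//   batch.initializeSampling(infer_reqs.size());
//   batch.initializeGeneration();
//   while (batch.generate()) {}                   // one token per iteration
//   batch.finish();                               // publish outputs, finalize finished slots
//   batch.synchronize();                          // compact the remaining active slots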
template<typename T>
class LlamaBatch {
public:
int size() const noexcept
{
return batch_size_;
};
int maxSize() const noexcept
{
return max_batch_size_;
}
int finishedCount() const noexcept
{
return finished_count_;
}
void verifyRequests(std::vector<std::shared_ptr<Request>>& stop_reqs,
std::vector<std::shared_ptr<Request>>& infer_reqs);
void handleStopRequests(const std::vector<std::shared_ptr<Request>>& requests);
void allocateBuffer(size_t batch_size, size_t session_len);
void allocatePersistantBuffer(size_t max_batch_size);
void freeBuffer();
void initializeSampling(int infer_request_count);
void initialize(const std::vector<std::shared_ptr<Request>>& infer_requests);
void contextDecode();
void initializeGeneration();
bool generate();
void finish();
void finishRequest(int index, bool force_end);
void synchronize();
void setOutputTensors(int max_gen_step);
explicit LlamaBatch(int max_batch_size, int max_context_token_num, int session_len, LlamaV2<T>* llama);
~LlamaBatch()
{
freeBuffer();
}
private:
const int max_batch_size_;
const int max_context_token_num_;
const int session_len_;
const int rank_;
const bool debug_;
LlamaV2<T>* const llama_;
// active requests
std::vector<std::shared_ptr<Request>> requests_;
T* context_decoder_input_buf_{}; // CTXDEC
// T* context_decoder_output_buf_{}; // CTXDEC
int* context_decoder_ids_buf_{};
T* decoder_input_buf_{}; // CTXDEC, GENERATE
T* decoder_output_buf_{}; // CTXDEC, GENERATE
int* input_ids_buf_{}; // input token ids + cache missed token ids, CTXDEC
int* input_length_buf_{}; // input + cache missed length, CTXDEC, GENERATE
int* history_length_buf_{}; // history length, CTXDEC
int* context_length_buf_{}; // history length + input_length, CTXDEC, GENERATE
int* total_padding_count_{}; // GENERATE
int* sequence_lengths_{}; // current sequence length
uint64_t* k_cache_ptr_buf_{};
uint64_t* v_cache_ptr_buf_{};
float* logits_buf_{}; // combined logits
float* local_logits_buf_{}; // tensor parallel local logits
// used by dynamic decoder
int* token_ids_buf_{}; // all token IDs in [S, B], indexed using `step`
int* output_ids_buf_{}; // output ids in [B, S]
int* end_ids_buf_{};
bool* finished_buf_{};
uint32_t* seq_limit_len_{};
// pinned buffers
int* h_input_ids_buf_{};
int* h_input_length_buf_{};
int* h_history_length_buf_{};
int* h_context_length_buf_{};
int* h_sequence_lengths_{};
bool* h_finished_buf_{};
uintptr_t* h_k_cache_ptr_buf_{};
uintptr_t* h_v_cache_ptr_buf_{};
uint32_t* h_seq_limit_len_{};
int* stop_words_buf_{}; // [batch_size, 2, kMaxStopWordsLen]
int* bad_words_buf_{};
int* h_runtime_top_k_{};
float* h_runtime_top_p_{};
float* h_temperature_{};
float* h_repetition_penalty_{};
uint64_t* h_random_seed_{};
void* topk_curandstate_buf_{};
void* topp_curandstate_buf_{};
// hard limits for persistent buffers
static constexpr int kMaxStopBadWordsLen = 32;
using CachedSeq = LlamaCacheManager::Sequence;
std::vector<CachedSeq> cached_seq_;
std::vector<int> request_seq_len_limit_;
const DataType data_type_{};
int batch_size_{};
int max_context_len_{};
int step_{};
int finished_count_{};
bool is_allocate_persistant_buffer_ = false;
bool is_allocate_buffer_ = false;
TensorMap inputs_;
TensorMap outputs_;
std::unordered_map<std::string, void*> sampling_params_;
cudaStream_t stream_{};
cublasMMWrapper* cublas_wrapper_{};
IAllocator* allocator_{};
};
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/models/llama/LlamaCacheManager.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/logger.h"
namespace fastertransformer {
LlamaCacheManager::~LlamaCacheManager()
{
for (auto& p : device_mem_) {
allocator_->free(&p, false);
}
}
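// Returns one cache entry (k and v blocks back to back). Reuses a free entry if available,
// otherwise grows the pool by `chunk_size_` entries (bounded by `max_entry_count_`), and as a
// last resort evicts the least recently used inactive sequence. When pre-allocating, a newly
// created chunk is only added to the free list and no entry is handed out.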
void* LlamaCacheManager::allocate(bool is_preallocte)
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate]");
}
void* mem_ptr{};
if (!device_free_.empty()) {
mem_ptr = device_free_.front();
device_free_.pop();
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
}
}
else if (entry_count_ < max_entry_count_) {
const auto alloc_count = std::min(chunk_size_, max_entry_count_ - entry_count_);
const size_t entry_byte_size = 2 * cache_byte_size_; // 2 for k,v
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] malloc %d", (int)alloc_count);
}
const auto chunk_ptr = allocator_->malloc(alloc_count * entry_byte_size, false);
FT_CHECK(chunk_ptr);
device_mem_.push_back(chunk_ptr);
entry_count_ += alloc_count;
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] count = %d", entry_count_);
}
for (int i = 0; i < alloc_count; ++i) {
device_free_.push((uint8_t*)chunk_ptr + entry_byte_size * i);
}
if (!is_preallocte) {
mem_ptr = device_free_.front();
device_free_.pop();
}
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][allocate] free = %d", (int)device_free_.size());
}
}
else {
mem_ptr = evict();
FT_CHECK_WITH_INFO(mem_ptr, "Not enough cache entries.");
}
return mem_ptr;
}
auto LlamaCacheManager::create(uint64_t id, cudaStream_t stream) -> Sequence
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][create] %ld", (long)id);
}
for (const auto& e : device_cache_) {
if (e.id == id) {
if (rank_ == 0) {
FT_LOG_WARNING("[LlamaCacheManager][create] Removing conflicting id %ld", (long)id);
}
erase(id);
}
}
const auto mem_ptr = (uint8_t*)allocate(false);
check_cuda_error(cudaMemsetAsync(mem_ptr, 0, cache_byte_size_ * 2, stream));
device_cache_.push_back({
id,
max_seq_len_,
{},
0,
mem_ptr,
mem_ptr + cache_byte_size_,
{},
static_cast<uint64_t>(-1),
});
return device_cache_.back();
}
auto LlamaCacheManager::getEntryOrThrow(uint64_t id) -> std::vector<Sequence>::iterator
{
auto pred = [&](const Sequence& s) { return s.id == id; };
auto it = std::find_if(device_cache_.begin(), device_cache_.end(), pred);
if (it == device_cache_.end()) {
FT_LOG_ERROR("[LlamaCacheManager] %ld not found.\n", (long)id);
FT_CHECK(0);
}
return it;
}
auto LlamaCacheManager::fetch(uint64_t id, cudaStream_t stream) -> Sequence
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][fetch] %ld", (long)id);
}
auto entry = getEntryOrThrow(id);
if (entry->k_cache == nullptr) {
FT_CHECK(entry->cache_len == 0);
const auto mem_ptr = allocate(false);
check_cuda_error(cudaMemsetAsync(mem_ptr, 0, cache_byte_size_ * 2, stream));
entry->k_cache = mem_ptr;
entry->v_cache = (uint8_t*)entry->k_cache + cache_byte_size_;
}
entry->timestamp = static_cast<uint64_t>(-1);
return *entry;
}
void LlamaCacheManager::update(const Sequence& seq, cudaStream_t stream)
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][update] %ld", (long)seq.id);
}
auto entry = getEntryOrThrow(seq.id);
entry->timestamp = ++timestamp_;
entry->token_ids = seq.token_ids;
entry->cache_len = seq.cache_len;
FT_CHECK(seq.k_cache == entry->k_cache && seq.v_cache == entry->v_cache);
}
void LlamaCacheManager::erase(uint64_t id)
{
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][erase] %ld", (long)id);
}
auto entry = getEntryOrThrow(id);
if (entry->k_cache) {
device_free_.push(entry->k_cache);
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][erase] free = %d", (int)device_free_.size());
}
}
device_cache_.erase(entry);
}
void* LlamaCacheManager::evict()
{
FT_CHECK(!device_cache_.empty());
auto it = std::min_element(device_cache_.begin(), device_cache_.end(), [](const auto& a, const auto& b) {
return a.timestamp < b.timestamp;
});
if (it->timestamp == static_cast<uint64_t>(-1)) {
return nullptr;
}
if (rank_ == 0) {
FT_LOG_INFO("[LlamaCacheManager][evict] %ld", (long)it->id);
}
FT_CHECK(it->k_cache);
auto mem_ptr = it->k_cache;
it->k_cache = it->v_cache = nullptr;
it->cache_len = 0;
it->timestamp = static_cast<uint64_t>(-1);
return mem_ptr;
}
bool LlamaCacheManager::contains(uint64_t id) const noexcept
{
auto pred = [&](const Sequence& s) { return s.id == id; };
auto it = std::find_if(device_cache_.begin(), device_cache_.end(), pred);
return it != device_cache_.end();
}
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/logger.h"
#include <cstdint>
#include <cuda_runtime.h>
#include <queue>
#include <unordered_map>
#include <vector>
namespace fastertransformer {
// k-cache layout [L, H, D/x, S[s:], x]
// v-cache layout [L, H, S[s:], D/x, x]
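// Device-side kv-cache pool with LRU eviction. Each entry holds the k and v cache of one
// sequence across all layers; cache_byte_size_ below is
//   layer_num * head_num * max_seq_len * size_per_head * elem_bits / 8   per direction.
// As a rough, hypothetical example: 32 layers, 32 heads, head dim 128, 16-bit elements and
// max_seq_len 2048 give 512 MiB for k plus 512 MiB for v, i.e. 1 GiB per cached sequence.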
class LlamaCacheManager {
public:
LlamaCacheManager(size_t layer_num,
size_t head_num,
size_t size_per_head,
size_t max_seq_len,
size_t elem_bits,
size_t max_entry_count,
size_t chunk_size,
int rank,
IAllocator* allocator):
layer_num_(layer_num),
head_num_(head_num),
size_per_head_(size_per_head),
max_seq_len_(max_seq_len),
elem_bits_(elem_bits),
cache_byte_size_(layer_num_ * head_num_ * max_seq_len_ * size_per_head_ * elem_bits_ / 8),
max_entry_count_(max_entry_count),
chunk_size_(chunk_size),
rank_(rank),
allocator_(allocator)
{
if (rank == 0) {
FT_LOG_INFO("[LlamaCacheManager] max_entry_count = %d", (int)max_entry_count_);
FT_LOG_INFO("[LlamaCacheManager] chunk_size = %d", (int)chunk_size_);
}
allocate(true);
}
~LlamaCacheManager();
struct Sequence {
// header
uint64_t id;
size_t max_seq_len;
// payloads
std::vector<int> token_ids; // all token ids
size_t cache_len; // cache_len == 0 -> cache miss
void* k_cache;
void* v_cache;
std::vector<uint8_t> random_state_; // states for RNGs
// for LRU policy
uint64_t timestamp;
};
Sequence create(uint64_t id, cudaStream_t stream);
Sequence fetch(uint64_t id, cudaStream_t stream);
void update(const Sequence& seq, cudaStream_t stream);
void erase(uint64_t id);
bool contains(uint64_t id) const noexcept;
private:
std::vector<Sequence>::iterator getEntryOrThrow(uint64_t id);
void* allocate(bool is_preallocte);
void* evict();
private:
const size_t layer_num_{};
const size_t head_num_{};
const size_t size_per_head_{};
const size_t max_seq_len_{};
const size_t elem_bits_{};
const size_t cache_byte_size_{};
const size_t max_entry_count_{};
const size_t chunk_size_{};
const int rank_{};
IAllocator* allocator_{};
std::queue<void*> device_free_;
std::vector<void*> device_mem_;
int entry_count_{};
uint64_t timestamp_{};
std::vector<Sequence> device_cache_;
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.cc
#include "src/fastertransformer/models/llama/LlamaContextAttentionLayer.h"
#include "src/fastertransformer/kernels/bert_preprocess_kernels.h"
#include "src/fastertransformer/kernels/unfused_attention_kernels.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
template<typename T>
void LlamaContextAttentionLayer<T>::allocateBuffer(size_t batch_size,
size_t num_token,
size_t max_q_len,
size_t max_k_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
// no padding
qkv_buf_ = (T*)allocator_->reMalloc(qkv_buf_, sizeof(T) * num_token * 3 * local_hidden_units_, true);
// padding is rebuilt for q/k/v_buf_2_
q_buf_2_ = (T*)allocator_->reMalloc(q_buf_2_, sizeof(T) * 3 * batch_size * max_q_len * local_hidden_units_, true);
k_buf_2_ = q_buf_2_ + batch_size * max_q_len * local_hidden_units_;
v_buf_2_ = k_buf_2_ + batch_size * max_q_len * local_hidden_units_;
if (use_fmha_) {
FlashAttentionOp<T> flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
if (flash_attention.get_workspace_size() > 0) {
qk_buf_float_ = (float*)allocator_->reMalloc(qk_buf_float_, flash_attention.get_workspace_size(), true);
}
}
else {
k_cache_buf_ = (T*)allocator_->reMalloc(
k_cache_buf_, 2 * sizeof(T) * batch_size * local_head_num_ * max_k_len * size_per_head_, true);
v_cache_buf_ = k_cache_buf_ + batch_size * local_head_num_ * max_k_len * size_per_head_;
qk_buf_ =
(T*)allocator_->reMalloc(qk_buf_, sizeof(T) * batch_size * local_head_num_ * max_q_len * max_k_len, true);
// qkv_buf_2_ has padding
qkv_buf_2_ =
(T*)allocator_->reMalloc(qkv_buf_2_, sizeof(T) * batch_size * max_q_len * local_hidden_units_, true);
}
// qkv_buf_3_ padding is removed
qkv_buf_3_ = (T*)allocator_->reMalloc(qkv_buf_3_, sizeof(T) * num_token * local_hidden_units_, true);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaContextAttentionLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
allocator_->free((void**)(&qkv_buf_));
allocator_->free((void**)(&q_buf_2_));
if (use_fmha_) {
allocator_->free((void**)&qk_buf_float_);
}
else {
allocator_->free((void**)(&k_cache_buf_));
allocator_->free((void**)(&qk_buf_));
allocator_->free((void**)(&qkv_buf_2_));
}
allocator_->free((void**)(&qkv_buf_3_));
is_allocate_buffer_ = false;
}
}
template<typename T>
inline void LlamaContextAttentionLayer<T>::forward(TensorMap* output_tensors,
const TensorMap* input_tensors,
const LlamaAttentionWeight<T>* weights)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
/**
* input_tensors:
* \param input_query [token_num, hidden_dim]
* \param attention_mask [batch_size, 1, max_q_len, max_kv_len]
* \param padding_offset [token_num], int
* \param input_lengths [batch_size], int
* \param history_lengths [batch_size], int
* \param context_lengths [batch_size], int
* \param cu_seqlens [batch_size+1], int
* \param max_seq_len [1], int on cpu
* \param is_final_layer [1], bool on cpu
* \param layer_id [1], int on cpu
*
* output_tensors:
* \param hidden_features [token_num, hidden_dim]
* \param key_cache [batch_size], uint64
* \param value_cache [batch_size], uint64
*/
/////////////////////////////////////////////
/// parse inputs
const int batch_size = input_tensors->at("attention_mask").shape[0];
const int max_q_len = input_tensors->at("attention_mask").shape[2];
const int max_k_len = input_tensors->at("attention_mask").shape[3];
const int layer_id = input_tensors->getVal<int>("layer_id");
const int num_token = input_tensors->at("input_query").shape[0];
const int max_seq_len = input_tensors->at("max_seq_len").getVal<int>();
T* attention_out = output_tensors->at("hidden_features").getPtr<T>();
T* attention_input = input_tensors->at("input_query").getPtr<T>();
T* attention_mask = input_tensors->at("attention_mask").getPtr<T>();
const auto input_length = input_tensors->at("input_lengths").getPtr<const int>();
const auto history_length = input_tensors->at("history_lengths").getPtr<const int>();
const auto context_length = input_tensors->at("context_lengths").getPtr<const int>();
int* cu_seqlens = input_tensors->at("cu_seqlens").getPtr<int>();
const auto padding_offset = input_tensors->at("padding_offset").getPtr<int>();
/////////////////////////////////////////////
/// allocate buffers
allocateBuffer(batch_size, num_token, max_q_len, max_k_len);
//////////////////////////////////////////////
/// qkv gemm
// [token_num, hidden_dim] -> [token_num, 3, local_hidden_dim]
linear_.forward(qkv_buf_, attention_input, num_token, weights->qkv);
//////////////////////////////////////////////
/// transpose qkv & apply rotary embedding & rebuild padding
/// qkv [B, s, 3, H, D] -> (q [B, H, s, D], k [B, H, s, D], v [B, H, s, D])
invokeAddFusedQKVBiasTranspose(q_buf_2_,
k_buf_2_,
v_buf_2_,
PrefixPromptBatchWeightsParam<T>{},
qkv_buf_,
(const T*)nullptr, // qkv_bias
padding_offset, // padding_offset,
history_length, // used for applying rotary embedding
batch_size,
max_q_len, // seq_len
num_token, // batch_size * seq_len
local_head_num_,
size_per_head_,
rotary_embedding_dim_,
neox_rotary_style_,
nullptr, // query_weight.scale_out
0, // int8 mode
stream_);
sync_check_cuda_error();
const size_t layer_offset = layer_id * local_head_num_ * max_seq_len * size_per_head_;
auto k_cache_ptrs = output_tensors->getPtr<T*>("key_cache");
auto v_cache_ptrs = output_tensors->getPtr<T*>("value_cache");
//////////////////////////////////////////////////////////
/// insert the k/v computed from inputs into k/v cache
/// transpose kv -> kv cache
// put k/v_buf from shape [B, H, s, D] to
// k_buf_2 [B, H, s, D] -> key_cache [B, H, S[t:t+s], D/x, x]
// v_buf_2 [B, H, s, D] -> val_cache [B, H, S[t:t+s], D/x, x]
invokeExtendKVCache(k_cache_ptrs,
v_cache_ptrs,
layer_offset,
k_buf_2_,
v_buf_2_,
batch_size,
input_length,
max_q_len,
history_length,
max_seq_len,
size_per_head_,
local_head_num_,
stream_);
sync_check_cuda_error();
if (use_fmha_) {
fusedMultiHeadAttention(k_cache_ptrs,
v_cache_ptrs,
layer_offset,
attention_mask,
cu_seqlens,
batch_size,
max_q_len,
max_k_len,
max_seq_len);
}
else {
unfusedMultiHeadAttention(k_cache_ptrs,
v_cache_ptrs,
layer_offset,
attention_mask,
padding_offset,
context_length,
batch_size,
num_token,
max_q_len,
max_k_len,
max_seq_len);
}
//////////////////////////////////////////////
/// output gemm <Bs,HD> -> <Bs,HD>
linear_.forward(attention_out, qkv_buf_3_, num_token, weights->output);
if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(attention_out, attention_out, num_token * hidden_units_, tensor_para_, stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_ == true) {
freeBuffer();
}
sync_check_cuda_error();
}
template<typename T>
void LlamaContextAttentionLayer<T>::fusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len)
{
//////////////////////////////////////////////
// flash attention
using AttentionOp = FlashAttentionOp<T>;
using Layout = typename AttentionOp::AttentionLayout;
Layout layout_q{.stride_batch = int(local_head_num_ * max_q_len * size_per_head_),
.stride_seq = int(size_per_head_),
.stride_head = int(max_q_len * size_per_head_)};
Layout layout_k{.stride_batch = int(local_head_num_ * max_seq_len * size_per_head_),
.stride_seq = int(size_per_head_),
.stride_head = int(max_seq_len * size_per_head_),
.batch_seqs_offset = int(cache_layer_offset),
.batch_seqs = key_cache_ptrs};
Layout layout_v{.stride_batch = int(local_head_num_ * max_seq_len * size_per_head_),
.stride_seq = int(size_per_head_),
.stride_head = int(max_seq_len * size_per_head_),
.batch_seqs_offset = int(cache_layer_offset),
.batch_seqs = val_cache_ptrs};
Layout layout_o{
.stride_batch = int(local_head_num_ * max_q_len * size_per_head_),
.stride_seq = int(local_head_num_ * size_per_head_),
.stride_head = int(size_per_head_),
.use_seqlens = true,
};
AttentionOp flash_attention(batch_size, local_head_num_, max_k_len, max_q_len, size_per_head_);
typename AttentionOp::Params attn_params{.attn_out = qkv_buf_3_,
.query = q_buf_2_,
.key = k_cache_buf_,
.val = v_cache_buf_,
.mask = attention_mask,
.out_accum = qk_buf_float_,
.cu_seqlens_q = cu_seqlens,
.cu_seqlens_k = nullptr,
.layout_q = layout_q,
.layout_k = layout_k,
.layout_v = layout_v,
.layout_o = layout_o};
//
flash_attention(attn_params, stream_);
}
template<typename T>
void LlamaContextAttentionLayer<T>::unfusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
const int* padding_offset,
const int* context_length,
int batch_size,
int num_token,
int max_q_len,
int max_k_len,
int max_seq_len)
{
// key_cache [B, H, S[:t+s], D/x, x] -> [B, H, t+s, D]
// val_cache [B, H, S[:t+s], D/x, x] -> [B, H, t+s, D]
invokeTransposeKVCache(k_cache_buf_,
v_cache_buf_,
(const T**)key_cache_ptrs,
(const T**)val_cache_ptrs,
cache_layer_offset,
batch_size,
context_length, // history_len + input_len = context_len
max_k_len,
max_seq_len,
size_per_head_,
local_head_num_,
stream_);
sync_check_cuda_error();
const T qk_scale = static_cast<T>(1.f / sqrtf(size_per_head_ * 1.f));
//////////////////////////////////////////////
/// Q*K batch gemm
/// -> [B, H, s, t + s]
cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_T,
CUBLAS_OP_N,
max_k_len, // m
max_q_len, // n
size_per_head_, // k
k_cache_buf_, // A
size_per_head_, // lda
max_k_len * size_per_head_, // strideA
q_buf_2_, // B
size_per_head_, // ldb
max_q_len * size_per_head_, // strideB
qk_buf_, // C
max_k_len, // ldc
max_q_len * max_k_len, // strideC
batch_size * local_head_num_); // batchCount
//////////////////////////////////////////////
/// ! masked softmax (kernel asserts k_length <= 4096)
MaskedSoftmaxParam<T, T> param{};
param.attention_score = qk_buf_;
param.qk = qk_buf_;
param.attention_mask = attention_mask;
param.batch_size = batch_size;
param.q_length = max_q_len;
param.k_length = max_k_len;
param.num_heads = local_head_num_;
param.qk_scale = qk_scale;
param.linear_bias_slopes = nullptr;
invokeMaskedSoftmax(param, stream_);
sync_check_cuda_error();
//////////////////////////////////////////////
/// softmax(QK)*V batch gemm
// -> [B, H, S, D]
cublas_wrapper_->stridedBatchedGemm(CUBLAS_OP_N,
CUBLAS_OP_N,
size_per_head_, // m
max_q_len, // n
max_k_len, // k
v_cache_buf_, // A
size_per_head_, // lda
max_k_len * size_per_head_, // strideA,
qk_buf_, // B
max_k_len, // ldb
max_k_len * max_q_len, // strideB
qkv_buf_2_, // C
size_per_head_, // ldc,
max_q_len * size_per_head_, // strideC
batch_size * local_head_num_); // batchCount
//////////////////////////////////////////////
/// transpose <B,h,s,D> -> <B,s,h,D>
invokeTransposeAttentionOutRemovePadding(qkv_buf_2_,
qkv_buf_3_,
num_token,
batch_size,
max_q_len,
local_head_num_,
size_per_head_,
padding_offset,
nullptr,
0,
stream_);
sync_check_cuda_error();
}
template class LlamaContextAttentionLayer<float>;
template class LlamaContextAttentionLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/GptContextAttentionLayer.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/fastertransformer/models/llama/LlamaLinear.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/nccl_utils.h"
namespace fastertransformer {
template<typename T>
class LlamaContextAttentionLayer {
public:
void freeBuffer();
void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);
LlamaContextAttentionLayer(size_t head_num,
size_t size_per_head,
size_t rotary_embedding_dim,
bool neox_rotary_style,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha):
head_num_(head_num),
size_per_head_(size_per_head),
hidden_units_(head_num * size_per_head),
local_head_num_(head_num / tensor_para.world_size_),
local_hidden_units_(hidden_units_ / tensor_para.world_size_),
rotary_embedding_dim_(rotary_embedding_dim),
neox_rotary_style_(neox_rotary_style),
tensor_para_(tensor_para),
stream_(stream),
cublas_wrapper_(cublas_wrapper),
linear_(cublas_wrapper, stream),
allocator_(allocator),
is_free_buffer_after_forward_(is_free_buffer_after_forward),
use_fmha_(use_fmha)
{
}
void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaAttentionWeight<T>* weights);
void fusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
T* attention_mask,
int* cu_seqlens,
int batch_size,
int max_q_len,
int max_k_len,
int max_seq_len);
void unfusedMultiHeadAttention(T** key_cache_ptrs,
T** val_cache_ptrs,
size_t cache_layer_offset,
const T* attention_mask,
const int* padding_offset,
const int* context_length,
int batch_size,
int num_token,
int max_q_len,
int max_k_len,
int max_seq_len);
private:
const size_t head_num_;
const size_t size_per_head_;
const size_t hidden_units_;
const size_t local_head_num_;
const size_t local_hidden_units_;
const size_t rotary_embedding_dim_;
const bool is_free_buffer_after_forward_;
const bool neox_rotary_style_;
const bool use_fmha_;
NcclParam tensor_para_;
cudaStream_t stream_;
IAllocator* allocator_;
cublasMMWrapper* cublas_wrapper_;
LlamaLinear<T> linear_;
T* qkv_buf_{};
T* q_buf_2_{};
T* k_buf_2_{};
T* v_buf_2_{};
T* k_cache_buf_{};
T* v_cache_buf_{};
T* qk_buf_{};
float* qk_buf_float_{};
T* qkv_buf_2_{};
T* qkv_buf_3_{};
bool is_allocate_buffer_ = false;
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.cc
#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
#include "src/fastertransformer/kernels/bert_preprocess_kernels.h"
#include "src/fastertransformer/kernels/gpt_kernels.h"
#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/utils/Tensor.h"
namespace fastertransformer {
template<typename T>
void LlamaContextDecoder<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void LlamaContextDecoder<T>::allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
attn_ffn_io_ = (T*)allocator_->reMalloc(attn_ffn_io_, sizeof(T) * num_token * hidden_units_, false);
attention_mask_ = (T*)allocator_->reMalloc(attention_mask_, sizeof(T) * batch_size * max_q_len * max_kv_len, false);
padding_offset_ = (int*)allocator_->reMalloc(padding_offset_, sizeof(int) * batch_size * max_q_len, false);
cu_seqlens_ = (int*)allocator_->reMalloc(cu_seqlens_, sizeof(int) * (batch_size + 1), false);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaContextDecoder<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
allocator_->free((void**)&attn_ffn_io_);
allocator_->free((void**)&padding_offset_);
allocator_->free((void**)&cu_seqlens_);
allocator_->free((void**)&attention_mask_);
allocator_->free((void**)&h_pinned_token_num_ptr_, true);
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaContextDecoder<T>::initialize(bool use_fmha)
{
h_pinned_token_num_ptr_ = (size_t*)allocator_->reMalloc(h_pinned_token_num_ptr_, sizeof(size_t), true, true);
context_attention_layer_ = new LlamaContextAttentionLayer<T>(head_num_,
size_per_head_,
rotary_embedding_dim_,
false, // neox_rotary_style
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
use_fmha);
silu_ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
size_per_head_,
inter_size_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
}
template<typename T>
void LlamaContextDecoder<T>::forwardSelfAttn(const Session& sess,
const std::unordered_map<std::string, Tensor>* input_tensors,
int layer,
bool is_final)
{
// FT_LOG_ERROR(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors{
{"input_query", Tensor{MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_ffn_io_}},
{"attention_mask",
{MEMORY_GPU, data_type_, {sess.batch_size, 1, sess.max_query_len, sess.max_key_len}, attention_mask_}},
{"layer_id", Tensor{MEMORY_CPU, TYPE_INT32, {1}, &layer}},
{"is_final_layer", Tensor{MEMORY_CPU, TYPE_BOOL, {1}, &is_final}},
{"padding_offset", {MEMORY_GPU, TYPE_INT32, {sess.token_num}, padding_offset_}},
{"cu_seqlens", {MEMORY_GPU, TYPE_INT32, {sess.batch_size + 1}, cu_seqlens_}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.input_length}},
{"history_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.history_length}},
{"context_lengths", {MEMORY_GPU, TYPE_INT32, {sess.batch_size}, sess.context_length}},
{"max_seq_len", input_tensors->at("max_seq_len")}};
auto& k_cache = *sess.k_cache;
auto& v_cache = *sess.v_cache;
TensorMap self_attention_output_tensors{
{"hidden_features", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_ffn_io_}},
{"key_cache", k_cache},
{"value_cache", v_cache},
};
context_attention_layer_->forward(&self_attention_output_tensors, //
&self_attention_input_tensors,
&sess.weights->at(layer)->self_attn_weights);
}
template<typename T>
LlamaContextDecoder<T>::LlamaContextDecoder(size_t head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t rotary_embedding_dim,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
hidden_units_(head_num * size_per_head),
num_layer_(num_layer),
rotary_embedding_dim_(rotary_embedding_dim),
rmsnorm_eps_(rmsnorm_eps),
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
initialize(use_fmha);
}
template<typename T>
LlamaContextDecoder<T>::~LlamaContextDecoder()
{
delete context_attention_layer_;
delete silu_ffn_layer_;
freeBuffer();
}
template<typename T>
void LlamaContextDecoder<T>::forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
FT_CHECK(false);
}
template<typename T>
void LlamaContextDecoder<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
/**
* input tensors:
* \param decoder_input [num_token, hidden_units], float
* \param input_lengths [batch_size], int
* \param history_lengths [batch_size], int
     * \param context_lengths [batch_size], int
* \param output_norm_weight [hidden_dims], float
* \param max_q_len [1], int on cpu
* \param max_kv_len [1], int on cpu
* \param max_seq_len [1], int on cpu
*
* output tensors:
* \param decoder_output [batch_size, seq_len, hidden_units],
* \param key_cache [num_layer, batch, local_head_num, size_per_head // x, max_seq_len, x]
* \param value_cache [num_layer, batch, local_head_num, max_seq_len, size_per_head]
* \param last_token_hidden_units [batch_size, hidden_units]
*/
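    // Hypothetical caller sketch (names and pointers are placeholders; shapes and placements
    // follow the doc block above and the lookups below):
    //   std::unordered_map<std::string, Tensor> inputs{
    //       {"decoder_input",      {MEMORY_GPU, dtype, {token_num, hidden_units}, d_input}},
    //       {"input_lengths",      {MEMORY_GPU, TYPE_INT32, {batch_size}, d_input_len}},
    //       {"history_lengths",    {MEMORY_GPU, TYPE_INT32, {batch_size}, d_history_len}},
    //       {"context_lengths",    {MEMORY_GPU, TYPE_INT32, {batch_size}, d_context_len}},
    //       {"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units}, d_norm_weight}},
    //       {"max_q_len",          {MEMORY_CPU, TYPE_INT32, {1}, &max_q_len}},
    //       {"max_kv_len",         {MEMORY_CPU, TYPE_INT32, {1}, &max_kv_len}},
    //       {"max_seq_len",        {MEMORY_CPU, TYPE_INT32, {1}, &max_seq_len}}};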
Session sess{};
sess.token_num = input_tensors->at("decoder_input").shape[0];
sess.batch_size = input_tensors->at("input_lengths").shape[0];
sess.max_query_len = input_tensors->at("max_q_len").getVal<int>();
sess.max_key_len = input_tensors->at("max_kv_len").getVal<int>();
sess.weights = decoder_layer_weights;
sess.input_length = input_tensors->at("input_lengths").getPtr<int>();
sess.history_length = input_tensors->at("history_lengths").getPtr<int>();
sess.context_length = input_tensors->at("context_lengths").getPtr<int>();
T* decoder_input_output = input_tensors->at("decoder_input").getPtr<T>();
// T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();
sess.k_cache = &output_tensors->at("key_cache");
sess.v_cache = &output_tensors->at("value_cache");
allocateBuffer(sess.batch_size, sess.token_num, sess.max_query_len, sess.max_key_len);
size_t tmp_token_num{};
invokeGetPaddingOffsetAndCuSeqLens(h_pinned_token_num_ptr_,
&tmp_token_num, // updated token num
padding_offset_,
cu_seqlens_,
input_tensors->at("input_lengths").getPtr<int>(),
sess.batch_size,
sess.max_query_len,
stream_);
sync_check_cuda_error();
FT_CHECK(tmp_token_num == sess.token_num);
invokeCreateCausalMasks(attention_mask_,
sess.input_length,
sess.context_length,
sess.max_query_len,
sess.max_key_len,
sess.batch_size,
stream_);
sync_check_cuda_error();
/////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(attn_ffn_io_,
decoder_input_output,
decoder_layer_weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
sess.token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
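    // RMSNorm reference (LLaMA-style, for clarity): for each token vector x of width hidden_units_,
    //   y[i] = w[i] * x[i] / sqrt( mean_j(x[j]^2) + rmsnorm_eps_ )
    // attn_ffn_io_ now holds the normalized hidden states fed to the first self-attention layer.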
for (size_t layer = 0; layer < num_layer_; ++layer) {
/////////////////////////////////////////////
/// self-attention
forwardSelfAttn(sess, input_tensors, layer, false);
invokeFusedAddResidualRMSNorm(decoder_input_output,
attn_ffn_io_,
decoder_layer_weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
sess.token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
////////////////////////////////////////////
/// feed-forward network
TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_ffn_io_}}};
TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, data_type_, {sess.token_num, hidden_units_}, attn_ffn_io_}}};
silu_ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &decoder_layer_weights->at(layer)->ffn_weights);
auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
invokeFusedAddResidualRMSNorm(decoder_input_output, //
attn_ffn_io_,
scale_weight,
rmsnorm_eps_,
sess.token_num,
hidden_units_,
stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
}
template class LlamaContextDecoder<float>;
template class LlamaContextDecoder<half>;
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptContextDecoder.h
#pragma once
// #include "src/fastertransformer/kernels/add_residual_kernels.h"
// #include "src/fastertransformer/kernels/layernorm_kernels.h"
#include "src/fastertransformer/layers/BaseLayer.h"
// #include "src/fastertransformer/layers/FfnLayer.h"
// #include "src/fastertransformer/layers/attention_layers/BaseAttentionLayer.h"
#include "src/fastertransformer/models/llama/LlamaContextAttentionLayer.h"
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
namespace fastertransformer {
template<typename T>
class LlamaContextDecoder: public BaseLayer {
protected:
void allocateBuffer() override;
void allocateBuffer(size_t batch_size, size_t num_token, size_t max_q_len, size_t max_kv_len);
void freeBuffer() override;
void initialize(bool use_fmha);
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t rotary_embedding_dim_;
size_t hidden_units_;
float rmsnorm_eps_;
NcclParam tensor_para_;
T* attn_ffn_io_{};
T* attention_mask_{};
int* padding_offset_{};
int* cu_seqlens_{}; // cu for cumulative
size_t* h_pinned_token_num_ptr_{};
LlamaContextAttentionLayer<T>* context_attention_layer_{};
LlamaFfnLayer<T>* silu_ffn_layer_{};
const DataType data_type_;
struct Session {
size_t batch_size;
size_t token_num;
size_t max_query_len;
size_t max_key_len;
Tensor* k_cache;
Tensor* v_cache;
int* input_length{};
int* history_length{};
int* context_length{};
const std::vector<LlamaDecoderLayerWeight<T>*>* weights;
};
void forwardSelfAttn(const Session& sess,
const std::unordered_map<std::string, Tensor>* input_tensors,
int layer,
bool is_final);
public:
LlamaContextDecoder(size_t head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t rotary_embedding_dim,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
bool use_fmha);
~LlamaContextDecoder() override;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
virtual void forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022, SK Telecom Authored by A. Dialog
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.cc
#include "src/fastertransformer/models/llama/LlamaDecoder.h"
#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
namespace fastertransformer {
template<typename T>
LlamaDecoder<T>::LlamaDecoder(size_t head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t rotary_embedding_dim,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
BaseLayer(stream, cublas_wrapper, allocator, is_free_buffer_after_forward),
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
num_layer_(num_layer),
rotary_embedding_dim_(rotary_embedding_dim),
hidden_units_(head_num * size_per_head),
rmsnorm_eps_(rmsnorm_eps),
tensor_para_(tensor_para),
data_type_(getTensorType<T>())
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
initialize();
}
template<typename T>
LlamaDecoder<T>::~LlamaDecoder()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
delete self_attention_layer_;
delete silu_ffn_layer_;
}
template<typename T>
void LlamaDecoder<T>::initialize()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
self_attention_layer_ = new LlamaDecoderSelfAttentionLayer<T>(head_num_,
size_per_head_,
rotary_embedding_dim_,
false, // neox_rotary_style
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
silu_ffn_layer_ = new LlamaFfnLayer<T>(head_num_,
size_per_head_,
inter_size_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
}
template<typename T>
void LlamaDecoder<T>::allocateBuffer()
{
FT_CHECK(false);
}
template<typename T>
void LlamaDecoder<T>::allocateBuffer(size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaDecoder<T>::freeBuffer()
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (is_allocate_buffer_) {
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaDecoder<T>::forwardSelfAttn(const LlamaDecoder::Session& sess,
T* attn_io,
const std::unordered_map<std::string, Tensor>* input_tensors,
size_t layer)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
TensorMap self_attention_input_tensors(*input_tensors);
self_attention_input_tensors.insert("input_query",
{MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, attn_io});
const int layer_id = layer;
self_attention_input_tensors.insert("layer_id", {MEMORY_CPU, TYPE_INT32, {1}, &layer_id});
auto& k_cache = *sess.k_cache;
auto& v_cache = *sess.v_cache;
TensorMap self_attention_output_tensors{
{"attention_output", {MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, attn_io}},
{"key_cache", k_cache},
{"value_cache", v_cache},
};
self_attention_layer_->forward(&self_attention_output_tensors, //
&self_attention_input_tensors,
&sess.weights->at(layer)->self_attn_weights);
}
template<typename T>
void LlamaDecoder<T>::forwardFfn(const LlamaDecoder::Session& sess, T* ffn_io, size_t layer)
{
TensorMap ffn_inputs{{"ffn_input", {MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, ffn_io}}};
TensorMap ffn_outputs{{"ffn_output", {MEMORY_GPU, data_type_, {sess.batch_size, hidden_units_}, ffn_io}}};
silu_ffn_layer_->forward(&ffn_outputs, &ffn_inputs, &sess.weights->at(layer)->ffn_weights);
}
template<typename T>
void LlamaDecoder<T>::forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
FT_CHECK(false);
}
template<typename T>
void LlamaDecoder<T>::forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
/**
* input_tensors:
* \param decoder_input [batch_size, hidden_dims]
* \param sequence_lengths [batch_size] int
* \param output_norm_weight [hidden_dims]
* \param step [1] on cpu
* \param ite [1] on cpu
* \param finished [batch_size] bool
* \param total_padding_tokens [batch_size], int
* \param max_seq_len [1] on cpu
* \param masked_tokens [batch_size, memory_len] bool (optional), NOT USED YET
*
* output_tensors:
* \param decoder_output [batch_size, hidden_dimension]
* \param key_cache [batch_size] uint64_t
* \param value_cache [batch_size] uint64_t
*/
// for the shape of key cache, refer to decoder_masked_multihead_attention_template.hpp
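    // Per-layer structure of the loop below (pre-norm residual blocks; the fused kernels add the
    // residual and produce the next normalized input in one pass, with x = decoder_input and
    // h = decoder_output):
    //   h = RMSNorm(x) * w_attn_norm
    //   h = SelfAttn(h);  x += h;  h = RMSNorm(x) * w_ffn_norm
    //   h = FFN(h);       x += h;  h = RMSNorm(x) * w_next
    // where w_next is the next layer's attention norm weight, or output_norm_weight after the last layer.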
Session sess{};
sess.batch_size = input_tensors->at("decoder_input").shape[0];
sess.weights = decoder_layer_weights;
allocateBuffer(sess.batch_size);
sess.ite = input_tensors->at("ite").getVal<const int>();
sess.k_cache = &output_tensors->at("key_cache");
sess.v_cache = &output_tensors->at("value_cache");
sess.max_memory_len = input_tensors->at("max_seq_len").getVal<int>();
T* decoder_input = input_tensors->at("decoder_input").getPtr<T>();
T* decoder_output = output_tensors->at("decoder_output").getPtr<T>();
////////////////////////////////////////////
/// RMSNorm
invokeRootMeanSquareNorm(decoder_output,
decoder_input,
decoder_layer_weights->at(0)->self_attn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
for (size_t layer = 0; layer < num_layer_; ++layer) {
// output: self_attn_output_, k_cache, v_cache = self_attn(decoder_normed_input_)
forwardSelfAttn(sess, decoder_output, input_tensors, layer);
invokeFusedAddResidualRMSNorm(decoder_input,
decoder_output,
decoder_layer_weights->at(layer)->ffn_norm_weights,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
// decoder_layer_output_ = ffn(decoder_normed_input_)
forwardFfn(sess, decoder_output, layer);
auto scale_weight = layer < num_layer_ - 1 ? decoder_layer_weights->at(layer + 1)->self_attn_norm_weights :
input_tensors->at("output_norm_weight").getPtr<T>();
invokeFusedAddResidualRMSNorm(decoder_input, //
decoder_output,
scale_weight,
rmsnorm_eps_,
sess.batch_size,
hidden_units_,
stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
}
template class LlamaDecoder<half>;
template class LlamaDecoder<float>;
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022, SK Telecom Authored by A. Dialog
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoder.h
#include "src/fastertransformer/layers/BaseLayer.h"
// #include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
namespace fastertransformer {
template<typename T>
class LlamaDecoder: public BaseLayer {
protected:
void allocateBuffer() override; // deprecated
void allocateBuffer(size_t batch_size);
void freeBuffer() override;
void initialize();
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t rotary_embedding_dim_;
size_t hidden_units_;
float rmsnorm_eps_;
NcclParam tensor_para_;
LlamaDecoderSelfAttentionLayer<T>* self_attention_layer_{};
LlamaFfnLayer<T>* silu_ffn_layer_{};
const DataType data_type_;
struct Session {
size_t batch_size;
int ite;
size_t max_memory_len;
Tensor* k_cache;
Tensor* v_cache;
const std::vector<LlamaDecoderLayerWeight<T>*>* weights;
};
void forwardSelfAttn(const Session& sess,
T* attn_io,
const std::unordered_map<std::string, Tensor>* input_tensors,
size_t layer);
void forwardFfn(const LlamaDecoder::Session& sess, T* ffn_io, size_t layer);
public:
LlamaDecoder(size_t head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t rotary_embedding_dim,
float rmsnorm_eps,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward);
~LlamaDecoder() override;
virtual void forward(std::unordered_map<std::string, Tensor>* output_tensors,
const std::unordered_map<std::string, Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
virtual void forward(std::vector<Tensor>* output_tensors,
const std::vector<Tensor>* input_tensors,
const std::vector<LlamaDecoderLayerWeight<T>*>* decoder_layer_weights);
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.cc
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/utils/logger.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
template<typename T>
LlamaDecoderLayerWeight<T>::LlamaDecoderLayerWeight(
size_t hidden_units, size_t inter_size, WeightType weight_type, size_t tensor_para_size, size_t tensor_para_rank):
hidden_units_(hidden_units),
inter_size_(inter_size),
weight_type_(weight_type),
tensor_para_size_(tensor_para_size),
tensor_para_rank_(tensor_para_rank)
{
self_attn_weights.qkv.input_dims = hidden_units_;
self_attn_weights.qkv.output_dims = 3 * hidden_units_ / tensor_para_size_;
self_attn_weights.qkv.type = weight_type;
self_attn_weights.output.input_dims = hidden_units_ / tensor_para_size_;
self_attn_weights.output.output_dims = hidden_units_;
self_attn_weights.output.type = weight_type;
ffn_weights.gating.input_dims = hidden_units_;
ffn_weights.gating.output_dims = inter_size_ / tensor_para_size_;
ffn_weights.gating.type = weight_type;
ffn_weights.intermediate.input_dims = hidden_units_;
ffn_weights.intermediate.output_dims = inter_size_ / tensor_para_size_;
ffn_weights.intermediate.type = weight_type;
ffn_weights.output.input_dims = inter_size_ / tensor_para_size_;
ffn_weights.output.output_dims = hidden_units_;
ffn_weights.output.type = weight_type;
mallocWeights();
}
template<typename T>
void freeWeights(LlamaDenseWeight<T>& weights)
{
cudaFree(weights.kernel);
cudaFree(weights.bias);
cudaFree(weights.scales);
cudaFree(weights.zeros);
weights.kernel = nullptr;
weights.bias = nullptr;
weights.scales = nullptr;
weights.zeros = nullptr;
}
template<typename T>
void mallocWeights(LlamaDenseWeight<T>& weights, bool bias)
{
if (bias) {
deviceMalloc((T**)&weights.bias, weights.output_dims);
}
const size_t bit_size = getBitSize(weights.type);
if (bit_size >= 16) { // fp16, fp32
deviceMalloc((T**)&weights.kernel, weights.input_dims * weights.output_dims);
}
else { // int8, int4
const int factor = sizeof(float) * 8 / bit_size;
FT_CHECK(weights.input_dims % factor == 0);
deviceMalloc((float**)&weights.kernel, weights.input_dims / factor * weights.output_dims);
deviceMalloc((T**)&weights.scales, weights.output_dims);
deviceMalloc((T**)&weights.zeros, weights.output_dims);
}
}
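// Worked example of the packing above (illustrative): for WeightType::kINT4, bit_size = 4 and
// factor = 32 / 4 = 8, so a [input_dims, output_dims] kernel occupies
// input_dims / 8 * output_dims 32-bit words (allocated as float here), alongside
// output_dims per-output-channel scales and zero-points.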
template<typename T>
void loadWeights(LlamaDenseWeight<T>& w, std::string prefix, int rank, FtCudaDataType model_file_type)
{
prefix += "." + std::to_string(rank);
const auto type = model_file_type;
if (w.bias) {
loadWeightFromBin((T*)w.bias, {w.output_dims}, prefix + ".bias", type);
}
const size_t bit_size = getBitSize(w.type);
if (bit_size >= 16) { // fp16, fp32
loadWeightFromBin((T*)w.kernel, {w.input_dims, w.output_dims}, prefix + ".weight", type);
}
else { // int8, int4
const int factor = sizeof(float) * 8 / bit_size;
FT_CHECK(w.input_dims % factor == 0);
const auto f32_type = FtCudaDataType::FP32;
loadWeightFromBin((float*)w.kernel, {w.input_dims / factor, w.output_dims}, prefix + ".qweight", f32_type);
loadWeightFromBin((T*)w.scales, {w.output_dims}, prefix + ".scales", type);
loadWeightFromBin((T*)w.zeros, {w.output_dims}, prefix + ".zeros", type);
}
}
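// Hypothetical example of the resulting file names (prefix and rank are illustrative): for
// prefix "layers.0.attention.w_qkv" and rank 1, an fp16/fp32 checkpoint is read from
//   layers.0.attention.w_qkv.1.weight
// while a quantized checkpoint is read from the corresponding .qweight, .scales and .zeros files.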
template<typename T>
void LlamaDecoderLayerWeight<T>::mallocWeights()
{
deviceMalloc((T**)&self_attn_norm_weights, hidden_units_);
deviceMalloc((T**)&ffn_norm_weights, hidden_units_);
fastertransformer::mallocWeights(self_attn_weights.qkv, false);
fastertransformer::mallocWeights(self_attn_weights.output, false);
fastertransformer::mallocWeights(ffn_weights.gating, false);
fastertransformer::mallocWeights(ffn_weights.intermediate, false);
fastertransformer::mallocWeights(ffn_weights.output, false);
}
template<typename T>
LlamaDecoderLayerWeight<T>::~LlamaDecoderLayerWeight()
{
cudaFree((void*)self_attn_norm_weights);
cudaFree((void*)ffn_norm_weights);
freeWeights(self_attn_weights.qkv);
freeWeights(self_attn_weights.output);
freeWeights(ffn_weights.gating);
freeWeights(ffn_weights.intermediate);
freeWeights(ffn_weights.output);
}
template<typename T>
void LlamaDecoderLayerWeight<T>::loadModel(std::string dir_path, FtCudaDataType model_file_type)
{
const auto type = model_file_type;
loadWeightFromBin(
(T*)self_attn_norm_weights, {hidden_units_}, dir_path + ".attention_norm.weight", model_file_type);
loadWeightFromBin((T*)ffn_norm_weights, {hidden_units_}, dir_path + ".ffn_norm.weight", model_file_type);
loadWeights(self_attn_weights.qkv, dir_path + ".attention.w_qkv", tensor_para_rank_, type);
loadWeights(self_attn_weights.output, dir_path + ".attention.wo", tensor_para_rank_, type);
loadWeights(ffn_weights.gating, dir_path + ".feed_forward.w1", tensor_para_rank_, type);
loadWeights(ffn_weights.intermediate, dir_path + ".feed_forward.w3", tensor_para_rank_, type);
loadWeights(ffn_weights.output, dir_path + ".feed_forward.w2", tensor_para_rank_, type);
}
template struct LlamaDecoderLayerWeight<float>;
template struct LlamaDecoderLayerWeight<half>;
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptDecoderLayerWeight.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
namespace fastertransformer {
template<typename T>
struct LlamaDecoderLayerWeight {
public:
LlamaDecoderLayerWeight() = delete;
LlamaDecoderLayerWeight(
size_t hidden_units, size_t inter_size, WeightType weight_type, size_t tensor_para_size, size_t tensor_para_rank);
~LlamaDecoderLayerWeight();
LlamaDecoderLayerWeight(const LlamaDecoderLayerWeight& other) = delete;
LlamaDecoderLayerWeight& operator=(const LlamaDecoderLayerWeight& other) = delete;
void loadModel(std::string dir_path, FtCudaDataType model_file_type);
T* self_attn_norm_weights{};
T* ffn_norm_weights{};
LlamaAttentionWeight<T> self_attn_weights{};
LlamaFfnWeight<T> ffn_weights{};
private:
size_t hidden_units_;
size_t inter_size_;
WeightType weight_type_;
size_t bit_size_;
size_t tensor_para_size_;
size_t tensor_para_rank_;
bool is_maintain_buffer_ = false;
void mallocWeights();
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.cc
#include "src/fastertransformer/models/llama/LlamaDecoderSelfAttentionLayer.h"
#include "src/fastertransformer/kernels/decoder_masked_multihead_attention.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
#include <string>
// #include <glog/logging.h>
namespace fastertransformer {
template<typename T>
struct SATypeConverter {
using Type = T;
};
template<>
struct SATypeConverter<half> {
using Type = uint16_t;
};
template<typename T>
static inline void fusedQKV_masked_attention_dispatch(const T* qkv_buf,
const T* qkv_bias,
const T* relative_attention_bias,
T* key_cache,
T* value_cache,
T** k_cache_per_sample,
T** v_cache_per_sample,
size_t kv_cache_per_sample_offset,
const int* cache_indir,
T* context_buf,
const bool* finished,
const int* sequence_lengths,
const int max_batch_size,
const int inference_batch_size,
const int beam_width,
const int head_num,
const int size_per_head,
const int rotary_embedding_dim,
const int memory_max_len,
const int* prefix_prompt_lengths,
const int max_prefix_prompt_length,
const int max_input_len,
const int* total_padding_tokens,
const int step,
const float q_scaling,
const int relative_attention_bias_stride,
const T* linear_bias_slopes,
const bool* masked_tokens,
const int* ia3_tasks,
const T* ia3_key_weights,
const T* ia3_value_weights,
const float* qkv_scale_out,
const float* attention_out_scale,
const int int8_mode,
cudaStream_t stream)
{
using DataType = typename SATypeConverter<T>::Type;
// Prepare the parameters.
Masked_multihead_attention_params<DataType> params;
memset(&params, 0, sizeof(params));
int hidden_units = head_num * size_per_head;
if (qkv_bias != nullptr) {
params.q_bias = reinterpret_cast<const DataType*>(qkv_bias);
params.k_bias = reinterpret_cast<const DataType*>(qkv_bias) + hidden_units;
params.v_bias = reinterpret_cast<const DataType*>(qkv_bias) + 2 * hidden_units;
}
else {
params.q_bias = nullptr;
params.k_bias = nullptr;
params.v_bias = nullptr;
}
// Set the output buffer.
params.out = reinterpret_cast<DataType*>(context_buf);
// Set the input buffers.
params.q = reinterpret_cast<const DataType*>(qkv_buf);
if (int8_mode != 2) {
params.k = reinterpret_cast<const DataType*>(qkv_buf) + hidden_units;
params.v = reinterpret_cast<const DataType*>(qkv_buf) + 2 * hidden_units;
}
else {
params.k = reinterpret_cast<const DataType*>(reinterpret_cast<const int8_t*>(qkv_buf) + hidden_units);
params.v = reinterpret_cast<const DataType*>(reinterpret_cast<const int8_t*>(qkv_buf) + 2 * hidden_units);
}
params.stride = 3 * hidden_units;
params.finished = const_cast<bool*>(finished);
params.k_cache = reinterpret_cast<DataType*>(key_cache);
params.v_cache = reinterpret_cast<DataType*>(value_cache);
params.k_cache_per_sample = reinterpret_cast<DataType**>(k_cache_per_sample);
params.v_cache_per_sample = reinterpret_cast<DataType**>(v_cache_per_sample);
params.kv_cache_per_sample_offset = kv_cache_per_sample_offset;
params.k_cache_interleaved = false;
params.cache_indir = cache_indir;
params.batch_size = inference_batch_size;
params.beam_width = beam_width;
params.memory_max_len = memory_max_len;
params.prefix_prompt_lengths = prefix_prompt_lengths;
params.max_prefix_prompt_length = max_prefix_prompt_length;
params.length_per_sample = sequence_lengths; // max_input_length + current output length
// timestep adding max_prefix_prompt_length for shared memory size calculation and rotary embedding computation
params.timestep = step + max_prefix_prompt_length - 1;
params.num_heads = head_num;
params.hidden_size_per_head = size_per_head;
params.rotary_embedding_dim = rotary_embedding_dim;
// Note: keep norm factor (sqrt(K_dim)) when adopting megatron T5 structure (may adjust)
params.inv_sqrt_dh = 1.F / (sqrtf((float)params.hidden_size_per_head) * q_scaling);
params.total_padding_tokens = total_padding_tokens;
if (relative_attention_bias != nullptr) {
params.relative_attention_bias = reinterpret_cast<const DataType*>(relative_attention_bias);
}
params.relative_attention_bias_stride = relative_attention_bias_stride;
params.masked_tokens = masked_tokens;
// The slope of linear position bias per head, e.g., ALiBi.
if (linear_bias_slopes != nullptr) {
params.linear_bias_slopes = reinterpret_cast<const DataType*>(linear_bias_slopes);
}
params.max_input_length = max_input_len;
params.ia3_tasks = ia3_tasks;
params.ia3_key_weights = reinterpret_cast<const DataType*>(ia3_key_weights);
params.ia3_value_weights = reinterpret_cast<const DataType*>(ia3_value_weights);
params.int8_mode = int8_mode;
if (int8_mode == 2) {
params.qkv_scale_out = qkv_scale_out;
params.attention_out_scale = attention_out_scale;
}
PUSH_RANGE("scaled dot-product fusion");
masked_multihead_attention(params, stream);
POP_RANGE;
}
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::allocateBuffer(size_t batch_size, int key_len, int max_memory_len)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
qkv_buf_ =
reinterpret_cast<T*>(allocator_->reMalloc(qkv_buf_, sizeof(T) * batch_size * 3 * local_hidden_units_, false));
context_buf_ =
reinterpret_cast<T*>(allocator_->reMalloc(context_buf_, sizeof(T) * batch_size * local_hidden_units_, false));
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
allocator_->free((void**)(&qkv_buf_));
allocator_->free((void**)(&context_buf_));
// allocator_->free((void**)(&k_cache_buf_));
// allocator_->free((void**)(&v_cache_buf_));
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaDecoderSelfAttentionLayer<T>::forward(TensorMap* output_tensors,
const TensorMap* input_tensors,
const LlamaAttentionWeight<T>* weights)
{
/**
* input tensors:
* \param input_query [batch_size, hidden_units],
* \param sequence_lengths [batch_size]
* \param step [1] on cpu
* \param finished [batch_size]
* \param total_padding_tokens [batch_size]
* \param layer_id [1], int on cpu
* \param max_seq_len [1] on cpu
* \param masked_tokens [batch_size, memory_len], (optional), NOT USED YET
* \param cache_indirection [batch_size / beam_width, beam_width, memory_max_len] (optional)
*
* output tensors:
* \param attention_output [batch_size, hidden_units],
* \param key_cache [batch, local_head_num, size_per_head / x, memory_max_len, x]
* \param value_cache [batch, local_head_num, memory_max_len, size_per_head]
*/
const T* input_query_data = input_tensors->getPtr<T>("input_query");
const int* sequence_lengths_data = input_tensors->getPtr<int>("sequence_lengths");
const int* total_padding_len = input_tensors->getPtr<int>("total_padding_tokens");
const bool* finished_data = input_tensors->getPtr<bool>("finished", nullptr);
const bool* masked_tokens_data = input_tensors->getPtr<bool>("masked_tokens", nullptr);
const int* cache_indir = input_tensors->getPtr<int>("cache_indirection", nullptr);
T* hidden_features_data = output_tensors->getPtr<T>("attention_output");
T** key_cache_ptrs = output_tensors->getPtr<T*>("key_cache");
T** value_cache_ptrs = output_tensors->getPtr<T*>("value_cache");
const int layer_id = input_tensors->getVal<int>("layer_id");
const int max_seq_len = input_tensors->getVal<int>("max_seq_len");
const int step = input_tensors->getVal<int>("step");
const int step_1 = step - 1;
const int batch_size = input_tensors->at("input_query").shape[0];
const int beam_width = cache_indir != nullptr ? input_tensors->at("cache_indirection").shape[1] : 1;
allocateBuffer(batch_size, step, max_seq_len);
PUSH_RANGE("qkv_gemm");
linear_.forward(qkv_buf_, input_query_data, batch_size, weights->qkv);
POP_RANGE;
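    // QKV layout assumed by the dispatch below (implied by params.stride and the k/v offsets in
    // fusedQKV_masked_attention_dispatch): each token's row of qkv_buf_ is [q | k | v], each
    // segment local_hidden_units_ wide, i.e. a stride of 3 * local_hidden_units_ per token;
    // the attention result for each sequence lands in context_buf_ (local_hidden_units_ wide).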
const auto kv_cache_layer_offset = layer_id * local_head_num_ * max_seq_len * size_per_head_;
const int memory_len = max_seq_len;
fusedQKV_masked_attention_dispatch<T>(
qkv_buf_,
nullptr, // query_weight.bias,
nullptr, // relative_attention_bias,
nullptr,
nullptr,
key_cache_ptrs,
value_cache_ptrs,
kv_cache_layer_offset,
cache_indir,
context_buf_,
finished_data,
sequence_lengths_data, // NOTE: current seq len including padding (fixed after meeting the finished id)
batch_size,
batch_size,
beam_width,
local_head_num_,
size_per_head_,
rotary_embedding_dim_,
memory_len,
nullptr, // prefix_prompt_lengths
0, // max_prefix_prompt_length
0, // max_input_length, not used w/o linear_bias_slopes
input_tensors->getPtr<int>("total_padding_tokens", nullptr),
step,
1.f, // q_scaling
0, // relative_attention_bias_stride
nullptr, // linear_bias_slopes
nullptr, // masked_tokens_data,
nullptr, // ia3_tasks
nullptr, // ia3_key_weights
nullptr, // ia3_value_weights
nullptr, // qkv_scale_out
nullptr, // attention_out_scale
0, // int8_mode
stream_);
sync_check_cuda_error();
linear_.forward(hidden_features_data, context_buf_, batch_size, weights->output);
if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(
hidden_features_data, hidden_features_data, batch_size * hidden_units_, tensor_para_, stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
// LOG(WARNING);
}
template class LlamaDecoderSelfAttentionLayer<float>;
template class LlamaDecoderSelfAttentionLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/attention_layers/DecoderSelfAttentionLayer.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/fastertransformer/models/llama/LlamaLinear.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/nccl_utils.h"
namespace fastertransformer {
template<typename T>
class LlamaDecoderSelfAttentionLayer {
public:
void freeBuffer();
void allocateBuffer(size_t batch_size, int key_len, int max_memory_len);
LlamaDecoderSelfAttentionLayer(size_t head_num,
size_t size_per_head,
size_t rotary_embedding_dim,
bool neox_rotary_style,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
head_num_(head_num),
size_per_head_(size_per_head),
hidden_units_(head_num * size_per_head),
local_head_num_(head_num / tensor_para.world_size_),
local_hidden_units_(hidden_units_ / tensor_para.world_size_),
rotary_embedding_dim_(rotary_embedding_dim),
neox_rotary_style_(neox_rotary_style),
tensor_para_(tensor_para),
stream_(stream),
linear_(cublas_wrapper, stream),
allocator_(allocator),
is_free_buffer_after_forward_(is_free_buffer_after_forward)
{
}
~LlamaDecoderSelfAttentionLayer()
{
freeBuffer();
}
void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaAttentionWeight<T>* weights);
private:
const size_t head_num_;
const size_t size_per_head_;
const size_t hidden_units_;
const size_t local_head_num_;
const size_t local_hidden_units_;
const size_t rotary_embedding_dim_;
const bool is_free_buffer_after_forward_;
const bool neox_rotary_style_;
NcclParam tensor_para_;
cudaStream_t stream_;
IAllocator* allocator_;
LlamaLinear<T> linear_;
T* qkv_buf_ = nullptr;
T* context_buf_ = nullptr;
// T* weight_buf_ = nullptr;
// T* k_cache_buf_{};
// T* v_cache_buf_{};
// T* tmp_k_cache_buf_{};
// T* tmp_v_cache_buf_{};
// T* tmp_cache_buf_{};
bool is_allocate_buffer_{};
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/DenseWeight.h
#pragma once
#include "src/fastertransformer/layers/FfnWeight.h"
#include "src/fastertransformer/layers/attention_layers/AttentionWeight.h"
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
enum class WeightType : int
{
kFP32,
kFP16,
kFP8, // not supported yet
kINT8,
kINT4
};
inline size_t getBitSize(WeightType type)
{
    switch (type) {
        case WeightType::kFP32:
            return 32;
        case WeightType::kFP16:
            return 16;
        case WeightType::kFP8:
            return 8;
        case WeightType::kINT8:
            return 8;
        case WeightType::kINT4:
            return 4;
    }
    return 0;  // unreachable for valid WeightType values; avoids falling off a non-void function
}
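// Usage sketch (illustrative): the weight loaders and allocators branch on this value, e.g.
//   if (getBitSize(w.type) >= 16) { /* dense T kernel */ } else { /* packed quantized kernel */ }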
template<typename T>
struct LlamaDenseWeight {
    // default-initialize so freeWeights()/loadWeights() never see indeterminate pointers
    size_t     input_dims  = 0;
    size_t     output_dims = 0;
    void*      kernel      = nullptr;
    WeightType type        = WeightType::kFP32;
    T*         bias        = nullptr;
    T*         scales      = nullptr;
    T*         zeros       = nullptr;
};
template<typename T>
struct LlamaAttentionWeight {
LlamaDenseWeight<T> qkv;
LlamaDenseWeight<T> output;
};
template<typename T>
struct LlamaFfnWeight {
LlamaDenseWeight<T> gating;
LlamaDenseWeight<T> intermediate;
LlamaDenseWeight<T> output;
};
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.h
#include "src/fastertransformer/models/llama/LlamaFfnLayer.h"
#include "src/fastertransformer/kernels/activation_kernels.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/utils/nvtx_utils.h"
// #include <glog/logging.h>
namespace fastertransformer {
template<typename T>
void LlamaFfnLayer<T>::allocateBuffer(size_t token_num)
{
inter_buf_ = (T*)allocator_->reMalloc(inter_buf_, sizeof(T) * token_num * inter_size_, false);
gating_buf_ = (T*)allocator_->reMalloc(gating_buf_, sizeof(T) * token_num * inter_size_, false);
is_allocate_buffer_ = true;
}
template<typename T>
void LlamaFfnLayer<T>::freeBuffer()
{
if (is_allocate_buffer_) {
allocator_->free((void**)&inter_buf_);
allocator_->free((void**)&gating_buf_);
is_allocate_buffer_ = false;
}
}
template<typename T>
void LlamaFfnLayer<T>::activation(int num_token)
{
invokeGenericActivation<SiluActivation>(gating_buf_,
(const T*)nullptr, // bias
inter_buf_,
(const T*)nullptr, // gated_bias
nullptr, // ia3_tasks
(const T*)nullptr, // ia3_weights
num_token, // m
inter_size_, // n
0, // int8_mode
nullptr, // activation_in
nullptr, // activation_out
nullptr, // padding_offset
0, // seq_len
stream_);
sync_check_cuda_error();
}
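// Gated-SiLU combination computed above (as implied by how gating_buf_ feeds the output
// projection in forward()):
//   gating_buf_[i] = silu(gating_buf_[i]) * inter_buf_[i],   silu(x) = x * sigmoid(x)
// so the whole FFN is  W_out( silu(W_gating * x) * (W_inter * x) )  with an elementwise product.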
template<typename T>
void LlamaFfnLayer<T>::forward(TensorMap* output_tensors,
const TensorMap* input_tensors,
const LlamaFfnWeight<T>* weights)
{
/**
* input_tensors:
* \param ffn_input [token_num, hidden_dimension]
*
* output_tensors:
* \param ffn_output [token_num, hidden_dimension]
*/
const size_t num_token = input_tensors->at("ffn_input").shape[0];
// LOG(WARNING);
allocateBuffer(num_token);
const T* ffn_input_data = input_tensors->at("ffn_input").getPtr<T>();
T* ffn_output_data = output_tensors->at("ffn_output").getPtr<T>();
PUSH_RANGE("ffn");
// TODO: fuse the two GEMMs with activation
linear_.forward(gating_buf_, ffn_input_data, num_token, weights->gating);
linear_.forward(inter_buf_, ffn_input_data, num_token, weights->intermediate);
activation(num_token);
linear_.forward(ffn_output_data, gating_buf_, num_token, weights->output);
POP_RANGE;
if (tensor_para_.world_size_ > 1) {
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllReduceSum(ffn_output_data, ffn_output_data, num_token * hidden_units_, tensor_para_, stream_);
sync_check_cuda_error();
}
if (is_free_buffer_after_forward_) {
freeBuffer();
}
// LOG(WARNING);
}
template class LlamaFfnLayer<float>;
template class LlamaFfnLayer<half>;
} // namespace fastertransformer
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/layers/FfnLayer.cc
#pragma once
// #include "src/fastertransformer/layers/FfnLayer.h"
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/models/llama/LlamaLinear.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include <functional>
namespace fastertransformer {
template<typename T>
class LlamaFfnLayer {
public:
LlamaFfnLayer(size_t head_num,
size_t size_per_head,
size_t inter_size,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward):
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size / tensor_para.world_size_),
hidden_units_(head_num * size_per_head),
stream_(stream),
linear_(cublas_wrapper, stream),
allocator_(allocator),
tensor_para_(tensor_para),
is_free_buffer_after_forward_(is_free_buffer_after_forward)
{
}
~LlamaFfnLayer()
{
freeBuffer();
}
void forward(TensorMap* output_tensors, const TensorMap* input_tensors, const LlamaFfnWeight<T>* weights);
private:
void allocateBuffer(size_t token_num);
void freeBuffer();
void activation(int num_token);
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t hidden_units_;
cudaStream_t stream_;
LlamaLinear<T> linear_;
IAllocator* allocator_;
bool is_free_buffer_after_forward_;
T* gating_buf_{};
T* inter_buf_{};
NcclParam tensor_para_;
bool is_allocate_buffer_{};
};
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/models/llama/Barrier.h"
#include "src/fastertransformer/utils/instance_comm.h"
namespace fastertransformer {
class LlamaInstanceComm: public AbstractInstanceComm {
public:
LlamaInstanceComm(int count): barrier_(count) {}
void barrier() override
{
barrier_.wait();
}
void setSharedObject(void* p) override
{
ptr = p;
}
void* getSharedObject() override
{
return ptr;
}
private:
Barrier barrier_;
void* ptr{};
};
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/models/llama/LlamaDenseWeight.h"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
namespace fastertransformer {
template<typename T>
class LlamaLinear {
public:
LlamaLinear(cublasMMWrapper* cublas_wrapper, cudaStream_t stream): cublas_wrapper_(cublas_wrapper), stream_(stream)
{
}
void forward(T* output_data, const T* input_data, int batch_size, const LlamaDenseWeight<T>& weight)
{
switch (weight.type) {
case WeightType::kFP16:
case WeightType::kFP32:
forwardFp(output_data, input_data, batch_size, weight);
break;
case WeightType::kINT4:
forwardInt4(output_data, input_data, batch_size, weight);
break;
default:
FT_CHECK(0);
}
}
private:
void forwardFp(T* output_data, const T* input_data, int batch_size, const LlamaDenseWeight<T>& weight)
{
cublas_wrapper_->Gemm(CUBLAS_OP_N,
CUBLAS_OP_N,
weight.output_dims,
batch_size,
weight.input_dims,
(const T*)weight.kernel,
weight.output_dims,
input_data,
weight.input_dims,
output_data,
weight.output_dims);
sync_check_cuda_error();
}
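    // Row-major view of the column-major cuBLAS call above (illustrative):
    //   output_data[b][o] = sum_i input_data[b][i] * kernel[i][o]
    // with kernel stored as [input_dims, output_dims], i.e. y = x * W with no transpose.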
void forwardInt4(T* output_data, const T* input_data, int batch_size, const LlamaDenseWeight<T>& weight)
{
FT_CHECK_WITH_INFO(0, "Not implemented");
}
private:
cublasMMWrapper* cublas_wrapper_;
cudaStream_t stream_{};
};
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/utils/nccl_utils.h"
#include <array>
#include <atomic>
#include <condition_variable>
#include <cuda_runtime.h>
#include <mutex>
namespace fastertransformer {
struct NcclGuard {
static constexpr int kMaxGroupCount = 32;
static std::mutex& globalNcclMutex()
{
static std::mutex inst;
return inst;
}
struct GroupState {
std::mutex mutex;
std::condition_variable cv;
int ref_count;
};
static GroupState& groupState(int group_id)
{
static std::array<GroupState, kMaxGroupCount> array{};
FT_CHECK(group_id < kMaxGroupCount);
return array[group_id];
}
NcclGuard(NcclParam tensor_para, cudaStream_t stream, bool barrier = false):
tensor_para_(tensor_para), stream_(stream), barrier_(barrier)
{
if (is_active()) {
auto& group = groupState(tensor_para_.group_id_);
if (tensor_para_.rank_ == 0) {
/// TODO: use std::optional after switching to C++17
global_nccl_lock_ = std::make_unique<std::lock_guard<std::mutex>>(globalNcclMutex());
{
std::lock_guard<std::mutex> lock(group.mutex);
group.ref_count = tensor_para_.world_size_;
}
group.cv.notify_all();
}
else {
std::unique_lock<std::mutex> lock(group.mutex);
group.cv.wait(lock, [&] { return group.ref_count > 0; });
}
}
}
~NcclGuard()
{
if (is_active()) {
ftNcclStreamSynchronize(tensor_para_, NcclParam{}, stream_);
auto& group = groupState(tensor_para_.group_id_);
int value = -1;
{
std::lock_guard<std::mutex> lock(group.mutex);
value = --group.ref_count;
}
if (value == 0) {
group.cv.notify_all();
}
else if (barrier_ || tensor_para_.rank_ == 0) {
std::unique_lock<std::mutex> lock(group.mutex);
group.cv.wait(lock, [&] { return group.ref_count == 0; });
}
// rank 0 unlocks global NCCL mutex automatically
}
}
bool is_active()
{
return barrier_ || (ftNcclGroupCount() > 1 && tensor_para_.world_size_ > 1);
}
NcclParam tensor_para_;
cudaStream_t stream_;
bool barrier_;
std::unique_ptr<std::lock_guard<std::mutex>> global_nccl_lock_;
};
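// Typical usage (see LlamaDecoderSelfAttentionLayer / LlamaFfnLayer in this check-in):
//   {
//       NcclGuard nccl_guard(tensor_para_, stream_);
//       ftNcclAllReduceSum(buf, buf, count, tensor_para_, stream_);
//   }  // destructor syncs the stream; rank 0 releases the global NCCL lock on scope exit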
} // namespace fastertransformer