Commit 9efcac38 (parent 720fc533) authored by Li Zhang

check-in fastertransformer (#7)

* add ft code

* gitignore

* fix lint

* revert fmha
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
* Copyright (c) 2022, SK Telecom Authored by A. Dialog
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.cc
#include "src/fastertransformer/models/llama/LlamaV2.h"
#include "src/fastertransformer/kernels/decoding_kernels.h"
#include "src/fastertransformer/kernels/gpt_kernels.h"
#include "src/fastertransformer/models/llama/LlamaBatch.h"
#include "src/fastertransformer/models/llama/LlamaNcclGuard.h"
#include "src/fastertransformer/models/llama/LlamaWeight.h"
#include "src/fastertransformer/models/llama/Request.h"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <functional>
#include <memory>
#include <numeric>
#include <sstream>
#include <stdexcept>
namespace fastertransformer {
template<typename T>
LlamaV2<T>::LlamaV2(size_t head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t vocab_size,
size_t rotary_embedding_dim,
float norm_eps,
int max_batch_size,
int max_context_token_num,
int session_len,
int step_length,
int start_id,
int end_id,
int cache_max_entry_count,
int cache_chunk_size,
bool use_context_fmha,
std::shared_ptr<SharedState> shared_state,
LlamaWeight<T>* weights,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop):
head_num_(head_num),
size_per_head_(size_per_head),
inter_size_(inter_size),
num_layer_(num_layer),
vocab_size_(vocab_size),
rotary_embedding_dim_(rotary_embedding_dim),
rmsnorm_eps_(norm_eps),
start_id_(start_id),
end_id_(end_id),
hidden_units_(head_num * size_per_head),
local_head_num_(head_num / tensor_para.world_size_),
weights_(weights),
tensor_para_(tensor_para),
stream_(stream),
cublas_wrapper_(cublas_wrapper),
allocator_(allocator),
is_free_buffer_after_forward_(is_free_buffer_after_forward),
cuda_device_prop_(cuda_device_prop),
debug_(isDebug()),
step_length_(step_length),
batch_(max_batch_size, max_context_token_num, session_len, this),
shared_state_(shared_state)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
FT_CHECK(vocab_size_ % tensor_para_.world_size_ == 0);
FT_LOG_INFO("NCCL group_id = %d", tensor_para_.group_id_);
kv_cache_mgr_ = std::make_unique<LlamaCacheManager>(num_layer_,
local_head_num_,
size_per_head_,
session_len,
sizeof(T) * 8,
cache_max_entry_count,
cache_chunk_size,
tensor_para.rank_,
allocator);
initialize(use_context_fmha);
start();
}
template<typename T>
LlamaV2<T>::~LlamaV2()
{
internal_thread_.join();
delete decoder_;
delete dynamic_decode_layer_;
delete context_decoder_;
}
template<typename T>
void LlamaV2<T>::initialize(bool use_context_fmha)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
context_decoder_ = new LlamaContextDecoder<T>(head_num_,
size_per_head_,
inter_size_,
num_layer_,
rotary_embedding_dim_,
rmsnorm_eps_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
use_context_fmha);
decoder_ = new LlamaDecoder<T>(head_num_,
size_per_head_,
inter_size_,
num_layer_,
rotary_embedding_dim_,
rmsnorm_eps_,
tensor_para_,
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_);
dynamic_decode_layer_ = new DynamicDecodeLayer<float>(vocab_size_,
vocab_size_, // vocab_size_padded,
0, // end_id, deprecated
stream_,
cublas_wrapper_,
allocator_,
is_free_buffer_after_forward_,
cuda_device_prop_);
}
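// Looks up the embeddings of the tokens at position `step` in `token_ids_buf` for every
// sequence in the batch (one token per sequence); used only during incremental decoding.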
template<typename T>
void LlamaV2<T>::embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
// ! This kernel can't be used in context decoding
invokeEmbeddingLookupPosEncodingPadCount(embeddings,
weights_->pre_decoder_embedding_table,
static_cast<T*>(nullptr), // position encoding
token_ids_buf,
static_cast<int*>(nullptr), // padding count, not used w/o pos-code
batch_size,
hidden_units_,
static_cast<T>(1.), // scale
step, // step, used to index into output_ids_buf_
batch_size, // token_num
0, // ite
stream_);
sync_check_cuda_error();
}
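// Prompt ("context") phase: embeds all input tokens, runs the context decoder over the
// whole prompt to fill the per-sequence K/V caches, and writes the hidden state of each
// sequence's last token to `decoder_output` for the first sampling step.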
template<typename T>
void LlamaV2<T>::contextDecode(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* context_decoder_input_buf,
T* context_decoder_output_buf,
const int* input_ids,
const int* input_length,
const int* history_length,
const int* context_length,
size_t token_num,
size_t max_input_len,
size_t max_context_len,
size_t session_len,
size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
if (tensor_para_.rank_ == 0) {
FT_LOG_INFO("context decoding start");
}
invokeInputIdsEmbeddingLookupPosEncoding(context_decoder_input_buf,
nullptr, // processed somewhere else
weights_->pre_decoder_embedding_table,
static_cast<T*>(nullptr),
pPromptTuningParam<T>{},
input_ids,
0, // only used for position encoding
token_num,
token_num,
1,
hidden_units_,
stream_);
sync_check_cuda_error();
const auto dtype = getTensorType<T>();
const auto bsz = batch_size;
const int max_q_len = max_input_len;
const int max_kv_len = max_context_len;
const int max_seq_len = session_len;
std::unordered_map<std::string, Tensor> decoder_input_tensors{
{"decoder_input", {MEMORY_GPU, dtype, {token_num, hidden_units_}, context_decoder_input_buf}},
{"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units_}, weights_->output_norm_weight}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, input_length}},
{"history_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, history_length}},
{"context_lengths", {MEMORY_GPU, TYPE_INT32, {bsz}, context_length}},
{"max_q_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_q_len}},
{"max_kv_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_kv_len}},
{"max_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_seq_len}},
};
std::unordered_map<std::string, Tensor> decoder_output_tensors{
{"decoder_output", {MEMORY_GPU, dtype, {bsz, max_input_len, hidden_units_}, context_decoder_output_buf}},
{"key_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, k_cache_ptr}},
{"value_cache", {MEMORY_GPU, TYPE_UINT64, {bsz}, v_cache_ptr}},
{"last_token_hidden_units", {MEMORY_GPU, dtype, {bsz, hidden_units_}, deocder_output}}};
context_decoder_->forward(&decoder_output_tensors, &decoder_input_tensors, &weights_->decoder_layer_weights);
if (tensor_para_.rank_ == 0) {
FT_LOG_INFO("context decoding end");
}
}
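// Generation phase: runs a single decoder step for the whole batch, reading and
// appending to the K/V caches addressed by `k_cache_ptr` / `v_cache_ptr`.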
template<typename T>
void LlamaV2<T>::decoderForward(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* decoder_input,
const int* sequence_length,
const int* total_padding_count,
bool* finished,
int step,
int ite,
size_t session_len,
size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
const int max_seq_len = session_len;
const auto dtype = getTensorType<T>();
// max_input_length is not used w/o linear_bias_slopes
// sequence_lengths_ will be incremented in dynamic decode
std::unordered_map<std::string, Tensor> decoder_input_tensors{
{"decoder_input", {MEMORY_GPU, dtype, {batch_size, hidden_units_}, decoder_input}},
{"sequence_lengths", {MEMORY_GPU, TYPE_INT32, {batch_size}, sequence_length}},
{"total_padding_tokens", {MEMORY_GPU, TYPE_INT32, {batch_size}, total_padding_count}},
{"max_seq_len", {MEMORY_CPU, TYPE_INT32, {1}, &max_seq_len}},
{"finished", {MEMORY_GPU, TYPE_BOOL, {batch_size}, finished}},
{"output_norm_weight", {MEMORY_GPU, dtype, {hidden_units_}, weights_->output_norm_weight}},
{"step", {MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"ite", {MEMORY_CPU, TYPE_INT32, {1}, &ite}},
};
// LOG(ERROR) << key_cache_ << " " << value_cache_;
std::unordered_map<std::string, Tensor> decoder_output_tensors{
{"decoder_output", {MEMORY_GPU, dtype, {batch_size, hidden_units_}, decoder_output}},
{"key_cache", {MEMORY_GPU, TYPE_UINT64, {batch_size}, k_cache_ptr}},
{"value_cache", {MEMORY_GPU, TYPE_UINT64, {batch_size}, v_cache_ptr}},
};
decoder_->forward(&decoder_output_tensors, &decoder_input_tensors, &weights_->decoder_layer_weights);
}
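// Projects decoder hidden states onto the vocabulary to produce fp32 logits. With
// tensor parallelism, each rank computes its vocabulary shard, the shards are
// all-gathered, and the result is transposed into [batch, vocab] order.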
template<typename T>
void LlamaV2<T>::postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
cudaDataType_t data_type = getCudaDataType<T>();
float alpha = 1.f;
float beta = 0.f;
if (tensor_para_.world_size_ == 1) {
cublas_wrapper_->Gemm(CUBLAS_OP_T,
CUBLAS_OP_N,
vocab_size_, // n
batch_size,
hidden_units_, // k
&alpha,
weights_->post_decoder_embedding_kernel,
data_type,
hidden_units_, // k
decoder_output,
data_type,
hidden_units_, // k
&beta,
logits,
CUDA_R_32F,
vocab_size_, // n
CUDA_R_32F,
cublasGemmAlgo_t(-1));
}
else {
FT_CHECK(vocab_size_ % tensor_para_.world_size_ == 0);
const size_t local_vocab_size = vocab_size_ / tensor_para_.world_size_;
cublas_wrapper_->Gemm(CUBLAS_OP_T,
CUBLAS_OP_N,
local_vocab_size, // n
batch_size,
hidden_units_, // k
&alpha,
weights_->post_decoder_embedding_kernel
+ tensor_para_.rank_ * local_vocab_size * hidden_units_,
data_type,
hidden_units_, // k
decoder_output,
data_type,
hidden_units_, // k
&beta,
local_logits + tensor_para_.rank_ * batch_size * local_vocab_size,
CUDA_R_32F,
local_vocab_size, // n
CUDA_R_32F,
cublasGemmAlgo_t(-1));
{
NcclGuard nccl_guard(tensor_para_, stream_);
ftNcclAllGather(local_logits, // send_buf
local_logits, // recv_buf
batch_size * local_vocab_size, // data_size
tensor_para_.rank_,
tensor_para_,
stream_);
}
invokeTransposeAxis01(logits, local_logits, tensor_para_.world_size_, batch_size, local_vocab_size, stream_);
sync_check_cuda_error();
}
}
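// Runs sampling and stopping criteria for one step via DynamicDecodeLayer; optional
// sampling parameters (top-k/top-p, temperature, ...) are forwarded when present.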
template<typename T>
void LlamaV2<T>::dynamicDecode(int* token_ids,
bool* finished,
int* sequence_length,
bool* should_stop,
TensorMap* inputs,
TensorMap* outputs,
const float* logits,
const uint32_t* seq_limit_len,
const int* context_length,
const int* end_ids,
int step,
int ite,
size_t max_context_len,
size_t token_ids_len,
size_t batch_size)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
int local_batch_size = (int)batch_size;
std::unordered_map<std::string, Tensor> dynamic_decode_input_tensors{
{"logits", {MEMORY_GPU, TYPE_FP32, {batch_size, (size_t)1, vocab_size_}, logits}},
{"step", {MEMORY_CPU, TYPE_INT32, {1}, &step}},
{"max_input_length", {MEMORY_CPU, TYPE_INT32, {1}, &max_context_len}},
{"sequence_limit_length", {MEMORY_GPU, TYPE_UINT32, {batch_size}, seq_limit_len}},
{"input_lengths", {MEMORY_GPU, TYPE_INT32, {batch_size, 1}, context_length}},
{"ite", {MEMORY_CPU, TYPE_UINT32, {1}, &ite}},
{"end_id", {MEMORY_GPU, TYPE_INT32, {batch_size}, end_ids}},
{"local_batch_size", {MEMORY_CPU, TYPE_INT32, {1}, &local_batch_size}},
};
const std::vector<std::string> optional_inputs{"stop_words_list",
"bad_words_list",
"runtime_top_k",
"runtime_top_p",
"temperature",
"repetition_penalty",
"random_seed"};
for (const auto& key : optional_inputs) {
if (inputs->isExist(key)) {
dynamic_decode_input_tensors.insert({key, inputs->at(key)});
}
}
std::unordered_map<std::string, Tensor> dynamic_decode_output_tensors{
{"output_ids", {MEMORY_GPU, TYPE_INT32, {token_ids_len, batch_size, 1U}, token_ids}},
{"finished", {MEMORY_GPU, TYPE_BOOL, {batch_size}, finished}},
{"sequence_length", {MEMORY_GPU, TYPE_INT32, {batch_size}, sequence_length}},
{"should_stop", {MEMORY_CPU, TYPE_BOOL, {1}, should_stop}}};
const std::vector<std::string> optional_outputs{"cum_log_probs", "output_log_probs"};
for (const auto& key : optional_outputs) {
if (outputs->isExist(key)) {
dynamic_decode_output_tensors.insert({key, outputs->at(key)});
}
}
dynamic_decode_layer_->forward(&dynamic_decode_output_tensors, &dynamic_decode_input_tensors);
}
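// Main loop of the decoupled batching thread: rank 0 dequeues new stop/infer requests,
// all ranks synchronize on the shared barrier, newly added requests go through context
// decoding, then up to `step_length_` generation steps are executed before the loop
// checks the queue again.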
template<typename T>
void LlamaV2<T>::internalThreadEntry(int device_id)
{
FT_LOG_INFO("[internalThreadEntry] %d", (int)tensor_para_.rank_);
check_cuda_error(cudaSetDevice(device_id));
auto& request_queue = shared_state_->request_queue;
auto& infer_requests = shared_state_->infer_requests;
auto& stop_requests = shared_state_->stop_requests;
while (1) {
if (tensor_para_.rank_ == 0) {
const int free_slot_count = batch_.maxSize() - batch_.size() + batch_.finishedCount();
const bool is_empty = free_slot_count == batch_.maxSize();
request_queue.dequeue(stop_requests, infer_requests, free_slot_count, is_empty);
batch_.verifyRequests(stop_requests, infer_requests);
}
// wait while rank-0 is dequeueing
shared_state_->barrier->wait();
bool modified = false;
if (!(batch_.finishedCount() == 0 && stop_requests.empty() && infer_requests.empty())) {
batch_.handleStopRequests(stop_requests);
batch_.synchronize();
modified = true;
}
const int infer_request_count = infer_requests.size();
if (!infer_requests.empty()) {
batch_.initialize(infer_requests); // reinitialize when new requests come, possible buffer allocation
batch_.contextDecode();
modified = true;
}
// wait while shared stop/infer_requests is being used
shared_state_->barrier->wait();
if (batch_.size()) {
if (modified) {
batch_.initializeGeneration();
batch_.initializeSampling(infer_request_count);
}
for (int i = 0; i < step_length_; ++i) {
if (!batch_.generate()) {
break;
}
}
batch_.finish();
}
}
FT_CHECK(0);
}
template<typename T>
void LlamaV2<T>::start()
{
int device_id = -1;
check_cuda_error(cudaGetDevice(&device_id));
internal_thread_ = std::thread(&LlamaV2<T>::internalThreadEntry, this, device_id);
}
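// Returns a view of the `index`-th slice along dim 0; tensors whose first dimension is 1
// are shared across the batch and returned unchanged.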
static inline Tensor slice(const Tensor& tensor, int index)
{
auto shape = tensor.shape;
if (shape.at(0) == 1) {
return tensor;
}
shape[0] = 1;
const auto offset = std::accumulate(shape.begin(), shape.end(), (size_t)index, std::multiplies<>{});
return tensor.slice(shape, offset);
}
// ! implicit conversion from `unordered_map` to `TensorMap` drops 0-sized tensors
static inline TensorMap slice(const std::unordered_map<std::string, Tensor>& src, int index)
{
TensorMap dst;
for (const auto& kv : src) {
dst.insert({kv.first, slice(kv.second, index)});
}
return dst;
}
template<typename T>
void LlamaV2<T>::forward(std::unordered_map<std::string, Tensor>* outputs,
const std::unordered_map<std::string, Tensor>* inputs,
Control control)
{
if (debug_) {
if (tensor_para_.rank_ == 0) {
for (const auto& kv : *inputs) {
FT_LOG_INFO("[forward][rank=%d] INPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
}
for (const auto& kv : *outputs) {
FT_LOG_INFO("[forward][rank=%d] OUTPUT: %s", (int)tensor_para_.rank_, format(kv).c_str());
}
}
}
const int batch_size = outputs->at("output_ids").shape[0];
const auto rank = tensor_para_.rank_;
std::vector<std::shared_ptr<Request>> requests(batch_size);
// rank-0 allocates all requests for the batch
if (rank == 0) {
for (int i = 0; i < batch_size; ++i) {
requests[i] = std::make_shared<Request>();
requests[i]->inputs.resize(tensor_para_.world_size_);
requests[i]->outputs.resize(tensor_para_.world_size_);
}
control.comm->setSharedObject(&requests);
}
control.comm->barrier();
if (rank != 0) {
requests = *(std::vector<std::shared_ptr<Request>>*)control.comm->getSharedObject();
}
for (int i = 0; i < batch_size; ++i) {
auto& r = requests[i];
r->inputs[rank] = slice(*inputs, i);
r->outputs[rank] = slice(*outputs, i);
if (rank == 0) {
r->id = r->inputs[rank].getVal<uint64_t>("CORRID", i);
r->start_flag = r->inputs[rank].getVal<int>("START", 1);
r->end_flag = r->inputs[rank].getVal<int>("END", 1);
r->stop_flag = r->inputs[rank].getVal<int>("STOP", 0);
r->stream_cb = control.callback;
}
}
control.comm->barrier();
// rank-0 now takes ownership of `requests`
// rank-0 submits the tasks and waits for them to finish
std::vector<int> error_codes;
bool has_error = 0;
if (rank == 0) {
FT_LOG_INFO("[forward] Enqueue requests");
auto futures = shared_state_->request_queue.enqueue(std::move(requests));
FT_LOG_INFO("[forward] Wait for requests to complete ...");
for (auto& f : futures) {
auto ec = f.get();
error_codes.push_back(ec);
if (ec) {
has_error = true;
}
}
}
// prevent request tensors from being freed before the batch completes
control.comm->barrier();
if (rank == 0 && has_error) {
std::stringstream ss;
for (size_t i = 0; i < error_codes.size(); ++i) {
ss << (i ? " " : "") << error_codes[i];
}
throw std::runtime_error(ss.str());
}
}
template class LlamaV2<half>;
template class LlamaV2<float>;
} // namespace fastertransformer
\ No newline at end of file
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2021, NAVER Corp. Authored by CLOVA.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGpt.h
#pragma once
#include "src/fastertransformer/layers/DynamicDecodeLayer.h"
#include "src/fastertransformer/models/llama/Barrier.h"
#include "src/fastertransformer/models/llama/LlamaBatch.h"
#include "src/fastertransformer/models/llama/LlamaContextDecoder.h"
#include "src/fastertransformer/models/llama/LlamaDecoder.h"
#include "src/fastertransformer/models/llama/LlamaWeight.h"
#include "src/fastertransformer/models/llama/Request.h"
#include "src/fastertransformer/utils/allocator.h"
#include "src/fastertransformer/utils/cublasMMWrapper.h"
#include "src/fastertransformer/utils/instance_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include <memory>
#include <thread>
#include <unordered_map>
namespace fastertransformer {
template<typename T>
class LlamaV2 {
public:
struct SharedState {
std::vector<std::shared_ptr<Request>> infer_requests;
std::vector<std::shared_ptr<Request>> stop_requests;
RequestQueue request_queue;
std::shared_ptr<Barrier> barrier;
};
~LlamaV2();
LlamaV2(size_t head_num,
size_t size_per_head,
size_t inter_size,
size_t num_layer,
size_t vocab_size,
size_t rotary_embedding_dim,
float norm_eps,
int max_batch_size,
int max_context_token_num,
int session_len,
int step_length,
int start_id,
int end_id,
int cache_max_entry_count,
int cache_chunk_size,
bool use_context_fmha,
std::shared_ptr<SharedState> shared_state,
LlamaWeight<T>* weights,
NcclParam tensor_para,
cudaStream_t stream,
cublasMMWrapper* cublas_wrapper,
IAllocator* allocator,
bool is_free_buffer_after_forward,
cudaDeviceProp* cuda_device_prop);
struct Control {
AbstractInstanceComm* comm;
Request::Callback callback;
};
void forward(std::unordered_map<std::string, Tensor>* outputs,
const std::unordered_map<std::string, Tensor>* inputs,
Control control);
void stop(const std::vector<uint64_t>& seq_ids);
private:
friend class Batch;
void internalThreadEntry(int device_id);
void initialize(bool use_context_fmha);
void embeddingLookup(T* embeddings, const int* token_ids_buf, int batch_size, int step);
void contextDecode(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* context_decoder_input_buf,
T* context_decoder_output_buf,
const int* input_ids,
const int* input_length,
const int* history_length,
const int* context_length,
size_t token_num,
size_t max_input_len,
size_t max_context_len,
size_t session_len,
size_t batch_size);
void decoderForward(T* decoder_output,
uintptr_t* k_cache_ptr,
uintptr_t* v_cache_ptr,
T* decoder_input,
const int* sequence_length,
const int* total_padding_count,
bool* finished,
int step,
int ite,
size_t session_len,
size_t batch_size);
void postDecodeEmbedding(float* logits, float* local_logits, const T* decoder_output, int batch_size);
void dynamicDecode(int* token_ids,
bool* finished,
int* sequence_length,
bool* should_stop,
TensorMap* inputs,
TensorMap* outputs,
const float* logits,
const uint32_t* seq_limit_len,
const int* context_length,
const int* end_ids,
int step,
int ite,
size_t max_context_len,
size_t token_ids_len,
size_t batch_size);
void start();
private:
friend class LlamaBatch<T>;
const size_t head_num_;
const size_t size_per_head_;
const size_t inter_size_;
const size_t num_layer_;
const size_t vocab_size_;
const size_t rotary_embedding_dim_;
float rmsnorm_eps_ = 1e-6f;
static constexpr bool neox_rotary_style_ = false;
const int start_id_;
const int end_id_;
const size_t hidden_units_;
const size_t local_head_num_;
NcclParam tensor_para_;
cudaStream_t stream_;
cublasMMWrapper* cublas_wrapper_;
IAllocator* allocator_;
bool is_free_buffer_after_forward_;
cudaDeviceProp* cuda_device_prop_;
const bool debug_{false};
std::unique_ptr<LlamaCacheManager> kv_cache_mgr_;
LlamaWeight<T>* weights_{};
LlamaDecoder<T>* decoder_{};
LlamaContextDecoder<T>* context_decoder_{};
DynamicDecodeLayer<float>* dynamic_decode_layer_{};
const int step_length_;
LlamaBatch<T> batch_;
std::shared_ptr<SharedState> shared_state_;
std::thread internal_thread_;
};
} // namespace fastertransformer
\ No newline at end of file
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.cc
#include "src/fastertransformer/models/llama/LlamaWeight.h"
namespace fastertransformer {
template<typename T>
LlamaWeight<T>::LlamaWeight(size_t hidden_units,
size_t inter_size,
size_t vocab_size,
size_t num_layer,
WeightType weight_type,
size_t tensor_para_size,
size_t tensor_para_rank,
int prefix_cache_len):
hidden_units_(hidden_units),
inter_size_(inter_size),
vocab_size_(vocab_size),
num_layer_(num_layer),
weight_type_(weight_type),
tensor_para_size_(tensor_para_size),
tensor_para_rank_(tensor_para_rank),
prefix_cache_len_(prefix_cache_len)
{
decoder_layer_weights.reserve(num_layer_);
for (unsigned l = 0; l < num_layer_; ++l) {
decoder_layer_weights.push_back(new LlamaDecoderLayerWeight<T>(
hidden_units_, inter_size_, weight_type_, tensor_para_size_, tensor_para_rank_));
}
mallocWeights();
}
template<typename T>
LlamaWeight<T>::~LlamaWeight()
{
cudaFree((void*)pre_decoder_embedding_table);
cudaFree((void*)output_norm_weight);
cudaFree((void*)post_decoder_embedding_kernel);
if (prefix_cache_key) {
cudaFree((void*)prefix_cache_key);
cudaFree((void*)prefix_cache_token);
}
pre_decoder_embedding_table = nullptr;
post_decoder_embedding_kernel = nullptr;
prefix_cache_token = nullptr;
prefix_cache_key = nullptr;
prefix_cache_value = nullptr;
}
template<typename T>
void LlamaWeight<T>::mallocWeights()
{
deviceMalloc((T**)&pre_decoder_embedding_table, vocab_size_ * hidden_units_);
deviceMalloc((T**)&output_norm_weight, hidden_units_);
deviceMalloc((T**)&post_decoder_embedding_kernel, hidden_units_ * vocab_size_);
if (prefix_cache_len_) {
size_t cache_size = num_layer_ * prefix_cache_len_ * hidden_units_ / tensor_para_size_;
deviceMalloc((T**)&prefix_cache_key, cache_size * 2);
prefix_cache_value = prefix_cache_key + cache_size;
deviceMalloc((int**)&prefix_cache_token, prefix_cache_len_);
}
}
template<typename T>
void LlamaWeight<T>::loadModel(std::string dir_path)
{
FtCudaDataType model_file_type = FtCudaDataType::FP16;
dir_path += '/';
loadWeightFromBin((T*)pre_decoder_embedding_table,
{vocab_size_ * hidden_units_},
dir_path + "tok_embeddings.weight",
model_file_type);
loadWeightFromBin((T*)output_norm_weight, {hidden_units_}, dir_path + "norm.weight", model_file_type);
loadWeightFromBin(
(T*)post_decoder_embedding_kernel, {hidden_units_ * vocab_size_}, dir_path + "output.weight", model_file_type);
if (prefix_cache_len_) {
loadWeightFromBin((float*)prefix_cache_token, {prefix_cache_len_}, dir_path + "prefix_cache.token");
loadWeightFromBin((T*)prefix_cache_key,
{num_layer_ * prefix_cache_len_, hidden_units_ / tensor_para_size_},
dir_path + "prefix_cache." + std::to_string(tensor_para_rank_) + ".key",
model_file_type);
loadWeightFromBin((T*)prefix_cache_value,
{num_layer_ * prefix_cache_len_, hidden_units_ / tensor_para_size_},
dir_path + "prefix_cache." + std::to_string(tensor_para_rank_) + ".value",
model_file_type);
}
for (unsigned layer = 0; layer < num_layer_; ++layer) {
decoder_layer_weights[layer]->loadModel(dir_path + "layers." + std::to_string(layer), model_file_type);
}
}
template struct LlamaWeight<float>;
template struct LlamaWeight<half>;
} // namespace fastertransformer
\ No newline at end of file
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2019-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/models/multi_gpu_gpt/ParallelGptWeight.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaDecoderLayerWeight.h"
#include "src/fastertransformer/utils/memory_utils.h"
namespace fastertransformer {
template<typename T>
struct LlamaWeight {
LlamaWeight() = default;
LlamaWeight(size_t hidden_units,
size_t inter_size,
size_t vocab_size,
size_t num_layer,
WeightType weight_type,
size_t tensor_para_size,
size_t tensor_para_rank,
int prefix_cache_len);
~LlamaWeight();
LlamaWeight(const LlamaWeight& other) = delete;
LlamaWeight& operator=(const LlamaWeight& other) = delete;
void loadModel(std::string dir_path);
std::vector<LlamaDecoderLayerWeight<T>*> decoder_layer_weights;
const T* pre_decoder_embedding_table{};
const T* output_norm_weight{};
const T* post_decoder_embedding_kernel{};
size_t prefix_cache_len_;
int* prefix_cache_token{};
T* prefix_cache_key{};
T* prefix_cache_value{};
private:
void mallocWeights();
size_t hidden_units_;
size_t inter_size_;
size_t vocab_size_;
size_t num_layer_;
WeightType weight_type_;
size_t tensor_para_size_;
size_t tensor_para_rank_;
};
} // namespace fastertransformer
\ No newline at end of file
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/utils/Tensor.h"
#include <condition_variable>
#include <cstdint>
#include <functional>
#include <future>
#include <limits>
#include <memory>
#include <mutex>
#include <queue>
#include <unordered_map>
#include <vector>
namespace fastertransformer {
struct Request {
uint64_t id;
bool start_flag;
bool end_flag;
bool stop_flag;
// per rank inputs/outputs
std::vector<TensorMap> inputs;
std::vector<TensorMap> outputs;
using Callback = std::function<void(std::unordered_map<std::string, Tensor>*)>;
Callback stream_cb;
enum
{
kInvalid = 1,
kConflict = 2,
kBusy = 3,
kInactive = 4,
kFail = 5
};
std::promise<int> signal;
};
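// Thread-safe queue connecting LlamaV2<T>::forward() (producer) with the internal
// batching thread (consumer). Stop and inference requests are kept separately; each
// request carries a promise whose future reports the per-request error code.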
class RequestQueue {
public:
std::vector<std::future<int>> enqueue(std::vector<std::shared_ptr<Request>> requests)
{
std::vector<std::future<int>> futures;
futures.reserve(requests.size());
{
std::lock_guard<std::mutex> lock(mutex_);
for (auto& r : requests) {
futures.push_back(r->signal.get_future());
if (r->stop_flag) {
stop_queue_.push(std::move(r));
}
else {
infer_queue_.push(std::move(r));
}
}
}
cv_.notify_one();
return futures;
}
void dequeue(std::vector<std::shared_ptr<Request>>& stop_requests,
std::vector<std::shared_ptr<Request>>& infer_requests,
unsigned max_infer_count,
bool blocking)
{
std::unique_lock<std::mutex> lock(mutex_);
if (blocking) {
cv_.wait(lock, [this] { return !(stop_queue_.empty() && infer_queue_.empty()); });
}
stop_requests.clear();
while (!stop_queue_.empty()) {
stop_requests.push_back(std::move(stop_queue_.front()));
stop_queue_.pop();
}
infer_requests.clear();
while (!infer_queue_.empty() && infer_requests.size() < max_infer_count) {
infer_requests.push_back(std::move(infer_queue_.front()));
infer_queue_.pop();
}
}
private:
std::queue<std::shared_ptr<Request>> stop_queue_;
std::queue<std::shared_ptr<Request>> infer_queue_;
std::mutex mutex_;
std::condition_variable cv_;
};
} // namespace fastertransformer
\ No newline at end of file
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/models/llama/llama_decoder_kernels.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cuda_fp16.h>
namespace cg = cooperative_groups;
namespace fastertransformer {
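// Vectorized helpers for the fused residual-add + RMSNorm kernel below: elements are
// processed as uint4 packs (8 halves or 4 floats) while accumulating the sum of squares
// needed for normalization.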
template<typename T>
struct res_norm_ops_t {};
template<typename T>
struct res_norm_t {
res_norm_ops_t<T> f;
__device__ uint4 addvec(const uint4& a, const uint4& b, float& accum) const
{
uint4 c;
c.x = f.cast(f.add(f.cast(a.x), f.cast(b.x), accum));
c.y = f.cast(f.add(f.cast(a.y), f.cast(b.y), accum));
c.z = f.cast(f.add(f.cast(a.z), f.cast(b.z), accum));
c.w = f.cast(f.add(f.cast(a.w), f.cast(b.w), accum));
return c;
}
__device__ uint4 normvec(const uint4& u, const uint4& s, float factor) const
{
uint4 v;
v.x = f.cast(f.norm(f.cast(u.x), f.cast(s.x), factor));
v.y = f.cast(f.norm(f.cast(u.y), f.cast(s.y), factor));
v.z = f.cast(f.norm(f.cast(u.z), f.cast(s.z), factor));
v.w = f.cast(f.norm(f.cast(u.w), f.cast(s.w), factor));
return v;
}
};
template<>
struct res_norm_ops_t<half> {
__device__ float2 cast(const uint& x) const
{
return __half22float2(reinterpret_cast<const half2&>(x));
}
__device__ uint cast(const float2& x) const
{
auto y = __float22half2_rn(x);
return reinterpret_cast<uint&>(y);
}
__device__ float2 add(const float2& a, const float2& b, float& accum) const
{
float2 c{a.x + b.x, a.y + b.y};
accum += c.x * c.x + c.y * c.y;
return c;
}
__device__ float2 norm(const float2& a, const float2& s, float factor) const
{
return {a.x * s.x * factor, a.y * s.y * factor};
}
};
template<>
struct res_norm_ops_t<float> {
__device__ float cast(const uint& x) const
{
return reinterpret_cast<const float&>(x);
}
__device__ uint cast(const float& x) const
{
return reinterpret_cast<const uint&>(x);
}
__device__ float add(const float& a, const float& b, float& accum) const
{
float c = a + b;
accum += c * c;
return c;
}
__device__ float norm(const float& a, const float& s, float factor) const
{
return a * s * factor;
}
};
template<typename T>
__device__ T blockReduceSum(const cg::thread_block& block, T value)
{
__shared__ float partial[32];
auto tile = cg::tiled_partition<32>(block);
value = cg::reduce(tile, value, cg::plus<float>{});
if (tile.thread_rank() == 0) {
partial[tile.meta_group_rank()] = value;
}
block.sync();
value = tile.thread_rank() < tile.meta_group_size() ? partial[tile.thread_rank()] : T{};
return cg::reduce(tile, value, cg::plus<float>{});
}
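// One thread block per token: adds the residual into `r_data` and writes the
// RMS-normalized result (scaled by `scale`) into `x_data`, using 16-byte (uint4)
// vectorized accesses.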
template<typename T>
__global__ void fusedAddResidualNorm(
T* __restrict__ r_data, T* __restrict__ x_data, const T* __restrict__ scale, float eps, int batch_size, int n_dims)
{
auto block = cg::this_thread_block();
auto grid = cg::this_grid();
constexpr int PACK_DIM = sizeof(uint4) / sizeof(T);
const auto b = grid.block_rank();
uint4* __restrict__ r_ptr = reinterpret_cast<uint4*>(r_data + b * n_dims);
uint4* __restrict__ x_ptr = reinterpret_cast<uint4*>(x_data + b * n_dims);
res_norm_t<T> ops;
float thread_sum{};
for (auto i = block.thread_rank(); i < n_dims / PACK_DIM; i += block.num_threads()) {
auto r = r_ptr[i];
auto x = x_ptr[i];
r = ops.addvec(r, x, thread_sum);
r_ptr[i] = r;
}
auto total_sum = blockReduceSum(block, thread_sum);
float s_inv_mean = rsqrt(total_sum / n_dims + eps);
const uint4* __restrict__ s_ptr = reinterpret_cast<const uint4*>(scale);
for (uint i = block.thread_rank(); i < n_dims / PACK_DIM; i += block.num_threads()) {
auto r = r_ptr[i];
auto s = s_ptr[i];
auto o = ops.normvec(r, s, s_inv_mean);
x_ptr[i] = o;
}
}
template<typename T>
void invokeFusedAddResidualRMSNorm(
T* residual, T* inout, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream)
{
constexpr int PACK_DIM = sizeof(uint4) / sizeof(T);
FT_CHECK(n_dims % PACK_DIM == 0);
const int n_pack = n_dims / PACK_DIM;
const int n_iter = ((n_pack + 1023) / 1024); // iterations when block size == 1024
int n_threads = (n_pack + n_iter - 1) / n_iter; // adjust block size to avoid tail effect
n_threads = (n_threads + 31) / 32 * 32; // round up to the nearest multiple of warp size
fusedAddResidualNorm<<<batch_size, n_threads, 0, stream>>>(residual, inout, scale, eps, batch_size, n_dims);
}
template void invokeFusedAddResidualRMSNorm(float*, float*, const float*, float, int, int, cudaStream_t);
template void invokeFusedAddResidualRMSNorm(half*, half*, const half*, float, int, int, cudaStream_t);
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include <cuda_runtime.h>
namespace fastertransformer {
template<typename T>
void invokeFusedAddResidualRMSNorm(
T* residual, T* inout, const T* scale, float eps, int batch_size, int n_dims, cudaStream_t stream);
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/models/llama/llama_kernels.h"
#include "src/fastertransformer/utils/cuda_type_utils.cuh"
namespace fastertransformer {
// fp16, bf16
// n is divided by 2 for this impl
template<typename T>
__global__ void rootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n)
{
using T2 = typename TypeConverter<T>::Type;
__shared__ float s_inv_mean;
float mean = 0.f;
T2* out_ptr = (T2*)out;
const T2* input_ptr = (const T2*)input;
const T2* scale_ptr = (const T2*)scale;
for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) {
float2 tmp2 = cuda_cast<float2>(input_ptr[blockIdx.x * n + idx]);
mean += tmp2.x * tmp2.x;
mean += tmp2.y * tmp2.y;
}
mean = blockReduceSum<float>(mean);
if (threadIdx.x == 0) {
s_inv_mean = rsqrt(.5f * mean / (float)n + eps);
}
__syncthreads();
for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) {
float2 tmp2 = cuda_cast<float2>(input_ptr[blockIdx.x * n + idx]);
float2 sca2 = cuda_cast<float2>(scale_ptr[idx]);
tmp2.x = tmp2.x * s_inv_mean * sca2.x;
tmp2.y = tmp2.y * s_inv_mean * sca2.y;
out_ptr[blockIdx.x * n + idx] = cuda_cast<T2>(tmp2);
}
}
template<>
__global__ void rootMeanSquareNorm(float* out, const float* input, const float* scale, float eps, int m, int n)
{
__shared__ float s_inv_mean;
float mean = 0.f;
for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) {
float tmp = input[blockIdx.x * n + idx];
mean += tmp * tmp;
}
mean = blockReduceSum<float>(mean);
if (threadIdx.x == 0) {
s_inv_mean = rsqrt(mean / static_cast<float>(n) + eps);
}
__syncthreads();
for (uint idx = threadIdx.x; idx < n; idx += blockDim.x) {
float tmp = input[blockIdx.x * n + idx];
out[blockIdx.x * n + idx] = tmp * s_inv_mean * scale[idx];
}
}
template<typename T>
void invokeRootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n, cudaStream_t stream)
{
if (sizeof(T) == 2) {
FT_CHECK(n % 2 == 0);
n /= 2;
}
dim3 grid(m);
dim3 block(std::min(n, 1024));
rootMeanSquareNorm<<<grid, block, 0, stream>>>(out, input, scale, eps, m, n);
}
template void invokeRootMeanSquareNorm(float*, const float*, const float*, float, int, int, cudaStream_t);
template void invokeRootMeanSquareNorm(half*, const half*, const half*, float, int, int, cudaStream_t);
// #ifdef ENABLE_BF16
// template void invokeRootMeanSquareNorm(__nv_bfloat16*, const __nv_bfloat16*, float, int, int, cudaStream_t);
// #endif
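// saturate_cast clamps out-of-range values when narrowing; the half specialization
// clamps floats to +/-64512 to avoid overflow.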
template<typename T, typename T0>
__device__ T saturate_cast(T0 x)
{
return x;
}
template<>
__device__ half saturate_cast<half, float>(float x)
{
return (x > 64512.f || x < -64512.f) ? (x > 0.f ? 64512.f : -64512.f) : x;
}
template<typename T>
__global__ void addResidual(T* out, const T* in, size_t n)
{
auto idx = threadIdx.x + (size_t)blockIdx.x * blockDim.x;
if (idx < n) {
out[idx] = static_cast<T>(static_cast<float>(out[idx]) + static_cast<float>(in[idx]));
}
}
template<typename T>
void invokeAddResidual(T* out, const T* in, int m, int n, cudaStream_t stream)
{
auto total = static_cast<size_t>(m) * n;
dim3 block(std::min(total, 1024UL));
dim3 grid((total + block.x - 1) / block.x);
addResidual<<<grid, block, 0, stream>>>(out, in, total);
}
template void invokeAddResidual(float*, const float*, int, int, cudaStream_t);
template void invokeAddResidual(half*, const half*, int, int, cudaStream_t);
// ids [seq_len, batch_size]
// input_ids [batch_size, max_input_len]
__global__ void
fixInputIds(int* ids, const int* input_ids, const int* input_lengths, int batch_size, int seq_len, int max_input_len)
{
int seq_id = threadIdx.x;
int batch_id = blockIdx.x;
for (; seq_id < input_lengths[batch_id]; seq_id += blockDim.x) {
ids[seq_id * batch_size + batch_id] = input_ids[batch_id * max_input_len + seq_id];
}
}
void invokeFixInputIds(int* ids,
const int* input_ids,
const int* input_lengths,
int batch_size,
int seq_len,
int max_input_len,
cudaStream_t st)
{
dim3 block(std::min(1024, max_input_len));
dim3 grid(batch_size);
fixInputIds<<<grid, block, 0, st>>>(ids, input_ids, input_lengths, batch_size, seq_len, max_input_len);
}
template<typename T>
__global__ void sliceCausalMask(T* mask, int seq_len, int key_len, int step)
{
mask += (size_t)blockIdx.x * seq_len * key_len;
for (int i = threadIdx.x; i < seq_len * key_len; i += blockDim.x) {
int row = i / key_len;
int col = i % key_len;
if (col <= row + step) {
mask[i] = static_cast<T>(1.f);
}
else {
mask[i] = static_cast<T>(0.f);
}
}
}
// [step: step+Q, :] of the K*K causal mask
template<typename T>
void invokeSliceCausalMask(T* mask, int seq_len, int key_len, int step, int batch_size, cudaStream_t stream)
{
FT_CHECK(step == key_len - seq_len);
sliceCausalMask<<<batch_size, 256, 0, stream>>>(mask, seq_len, key_len, step);
}
template void invokeSliceCausalMask(half*, int, int, int, int, cudaStream_t);
template void invokeSliceCausalMask(float*, int, int, int, int, cudaStream_t);
// mask [bsz, max_q_len, max_k_len]
template<typename T>
__global__ void createCausalMasks(T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len)
{
const auto q_len = q_lens[blockIdx.x];
const auto k_len = k_lens[blockIdx.x];
mask += blockIdx.x * max_q_len * max_k_len;
for (int i = threadIdx.x; i < max_q_len * max_k_len; i += blockDim.x) {
const int q = i / max_k_len; // [0, max_q_len)
const int k = i % max_k_len; // [0, max_k_len)
bool is_valid = q < q_len && k < k_len && k <= q + (k_len - q_len);
mask[i] = static_cast<T>(is_valid);
}
}
template<typename T>
void invokeCreateCausalMasks(
T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len, int batch_size, cudaStream_t stream)
{
createCausalMasks<<<batch_size, 512, 0, stream>>>(mask, q_lens, k_lens, max_q_len, max_k_len);
}
template void invokeCreateCausalMasks(float* mask, const int*, const int*, int, int, int, cudaStream_t);
template void invokeCreateCausalMasks(half* mask, const int*, const int*, int, int, int, cudaStream_t);
template<typename T>
__global__ void extend_key_cache(T** k_dst,
const size_t dst_offset,
const T* k_src,
const int head_num,
const int size_per_head,
const int* query_length,
const int* history_length,
const int max_q_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto key_src = reinterpret_cast<const uint4*>(k_src);
const auto key_dst = reinterpret_cast<uint4*>(k_dst[batch_id] + dst_offset);
const auto seq_len = query_length[batch_id];
const auto t_offset = history_length[batch_id];
const int k_head_size_id = idx % size_per_head_div_x;
const int k_seq_len_id = idx / size_per_head_div_x;
if (k_seq_len_id < seq_len) {
// [B, H, s, D/x] -> [H, D/x, S[t:t+s]]
const int64_t dst_idx = head_id * size_per_head_div_x * max_seq_len + // H
k_head_size_id * max_seq_len + // D/x
t_offset + k_seq_len_id; // s + offset
const int64_t src_idx = batch_id * head_num * size_per_head_div_x * max_q_len + // B
head_id * size_per_head_div_x * max_q_len + // H
k_seq_len_id * size_per_head_div_x + // s
k_head_size_id; // D/x
key_dst[dst_idx] = key_src[src_idx];
}
}
template<typename T>
__global__ void extend_value_cache(T** v_dst,
const size_t dst_offset,
const T* v_src,
const int head_num,
const int size_per_head,
const int* query_length,
const int* history_length,
const int max_q_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto val_src = reinterpret_cast<const uint4*>(v_src);
const auto val_dst = reinterpret_cast<uint4*>(v_dst[batch_id] + dst_offset);
const auto seq_len = query_length[batch_id];
const auto t_offset = history_length[batch_id];
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] -> [H, S[t:t+s], D/x]
const int64_t dst_idx = head_id * size_per_head_div_x * max_seq_len + // H
(v_seq_len_id + t_offset) * size_per_head_div_x + // s + offset
v_head_size_id; // D/x
const int64_t src_idx = batch_id * head_num * size_per_head_div_x * max_q_len + // B
head_id * size_per_head_div_x * max_q_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
val_dst[dst_idx] = val_src[src_idx];
}
}
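// Note: both K and V are copied with extend_value_cache, i.e. the key cache is stored in
// the same [H, S, D/x] layout as the value cache; extend_key_cache (which writes the
// [H, D/x, S] layout) is not used by this launcher.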
template<typename T>
void invokeExtendKVCache(T** k_dst,
T** v_dst,
size_t dst_offset,
const T* k_src,
const T* v_src,
int local_batch_size,
const int* query_length,
int max_q_len,
const int* history_length,
int max_seq_len,
int size_per_head,
int local_head_num,
cudaStream_t stream)
{
constexpr int block_sz = 128;
constexpr int x = (sizeof(T) == 4) ? 4 : 8;
dim3 grid((max_q_len * size_per_head / x + block_sz - 1) / block_sz, local_batch_size, local_head_num);
extend_value_cache<<<grid, block_sz, 0, stream>>>(
k_dst, dst_offset, k_src, local_head_num, size_per_head, query_length, history_length, max_q_len, max_seq_len);
extend_value_cache<<<grid, block_sz, 0, stream>>>(
v_dst, dst_offset, v_src, local_head_num, size_per_head, query_length, history_length, max_q_len, max_seq_len);
}
template void invokeExtendKVCache(float**,
float**,
size_t,
const float*,
const float*,
int,
const int*,
int,
const int*,
int,
int,
int,
cudaStream_t stream);
template void invokeExtendKVCache(half**,
half**,
size_t,
const half*,
const half*,
int,
const int*,
int,
const int*,
int,
int,
int,
cudaStream_t stream);
template<typename T>
__global__ void transpose_key_cache(T* k_dst,
const T** k_src,
const size_t src_offset,
const int head_num,
const int size_per_head,
const int* seq_length,
const int max_kv_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto key_src = reinterpret_cast<const uint4*>(k_src[batch_id] + src_offset);
const auto key_dst = reinterpret_cast<uint4*>(k_dst);
const auto seq_len = seq_length[batch_id];
const int k_head_size_id = idx % size_per_head_div_x;
const int k_seq_len_id = idx / size_per_head_div_x;
if (k_seq_len_id < seq_len) {
// [B, H, s, D/x] <- [B, H, D/x, S[:s]]
const int64_t src_idx = head_id * size_per_head_div_x * max_seq_len + // H
k_head_size_id * max_seq_len + // D/x
k_seq_len_id; // s
const int64_t dst_idx = batch_id * head_num * size_per_head_div_x * max_kv_len + // B
head_id * size_per_head_div_x * max_kv_len + // H
k_seq_len_id * size_per_head_div_x + // s
k_head_size_id; // D/x
key_dst[dst_idx] = key_src[src_idx];
}
}
template<typename T>
__global__ void transpose_value_cache(T* v_dst, //
const T** v_src,
const size_t src_offset,
const int head_num,
const int size_per_head,
const int* seq_length,
const int max_kv_len,
const int max_seq_len)
{
const int batch_id = blockIdx.y;
const int head_id = blockIdx.z;
constexpr int X_ELEMS = (sizeof(T) == 4) ? 4 : 8;
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
int size_per_head_div_x = size_per_head / X_ELEMS;
// x dim is now handled by uint4 type
const auto val_src = reinterpret_cast<const uint4*>(v_src[batch_id] + src_offset);
const auto val_dst = reinterpret_cast<uint4*>(v_dst);
const auto seq_len = seq_length[batch_id];
const int v_head_size_id = idx % size_per_head_div_x;
const int v_seq_len_id = idx / size_per_head_div_x;
if (v_seq_len_id < seq_len) {
// [B, H, s, D/x] <- [B, H, S[:s], D/x]
const int64_t src_idx = head_id * size_per_head_div_x * max_seq_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
const int64_t dst_idx = batch_id * head_num * size_per_head_div_x * max_kv_len + // B
head_id * size_per_head_div_x * max_kv_len + // H
v_seq_len_id * size_per_head_div_x + // s
v_head_size_id; // D/x
val_dst[dst_idx] = val_src[src_idx];
}
}
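// Both caches are read back with transpose_value_cache, matching the [H, S, D/x]
// storage layout used by invokeExtendKVCache above.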
template<typename T>
void invokeTransposeKVCache(T* key_cache_trans,
T* val_cache_trans,
const T** key_cache,
const T** val_cache,
size_t src_offset,
int batch_size,
const int* key_length,
int max_kv_len,
int max_seq_len,
int size_per_head,
int head_num,
cudaStream_t stream)
{
constexpr int block_sz = 128;
constexpr int x = (sizeof(T) == 4) ? 4 : 8;
dim3 grid((max_kv_len * size_per_head / x + block_sz - 1) / block_sz, batch_size, head_num);
transpose_value_cache<<<grid, block_sz, 0, stream>>>(
key_cache_trans, key_cache, src_offset, head_num, size_per_head, key_length, max_kv_len, max_seq_len);
transpose_value_cache<<<grid, block_sz, 0, stream>>>(
val_cache_trans, val_cache, src_offset, head_num, size_per_head, key_length, max_kv_len, max_seq_len);
}
template void invokeTransposeKVCache(
float*, float*, const float**, const float**, size_t, int, const int*, int, int, int, int, cudaStream_t stream);
template void invokeTransposeKVCache(
half*, half*, const half**, const half**, size_t, int, const int*, int, int, int, int, cudaStream_t stream);
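// Copies tokens from the step-major `ids` buffer ([session_len, batch]) into per-sequence
// rows of `output_ids` ([batch, max_output_len]), dropping the padding between each
// sequence's prompt and its generated tokens.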
__global__ void gatherOutput(int* output_ids,
const int* ids,
const int* context_length,
int max_context_len,
int max_gen_step,
int max_output_len,
int batch_size)
{
const int batch_id = blockIdx.x;
const int context_len = context_length[batch_id];
output_ids += batch_id * max_output_len;
for (int src_idx = threadIdx.x; src_idx < max_gen_step; src_idx += blockDim.x) {
// skip padding for src
if (context_len <= src_idx && src_idx < max_context_len) {
continue;
}
// skip padding for dst
const int dst_idx = src_idx < context_len ? src_idx : src_idx - (max_context_len - context_len);
output_ids[dst_idx] = ids[src_idx * batch_size + batch_id];
}
}
void invokeGatherOutput(int* output_ids,
const int* ids,
const int* context_length,
int max_context_len,
int max_gen_step,
int max_output_len,
int batch_size,
cudaStream_t stream)
{
int block_size = 512;
int grid_size = batch_size;
gatherOutput<<<grid_size, block_size, 0, stream>>>(
output_ids, ids, context_length, max_context_len, max_gen_step, max_output_len, batch_size);
}
} // namespace fastertransformer
\ No newline at end of file
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/kernels/gpt_kernels.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <assert.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <numeric>
namespace fastertransformer {
template<typename T>
void invokeRootMeanSquareNorm(T* out, const T* input, const T* scale, float eps, int m, int n, cudaStream_t stream);
template<typename T>
void invokeAddResidual(T* out, const T* in, int m, int n, cudaStream_t stream);
void invokeFixInputIds(int* ids,
const int* input_ids,
const int* input_lengths,
int batch_size,
int seq_len,
int max_input_len,
cudaStream_t st);
template<typename T>
void invokeSliceCausalMask(T* mask, int seq_len, int key_len, int step, int batch_size, cudaStream_t stream);
template<typename T>
void invokeCreateCausalMasks(
T* mask, const int* q_lens, const int* k_lens, int max_q_len, int max_k_len, int batch_size, cudaStream_t stream);
template<typename T>
void invokeExtendKVCache(T** k_dst,
T** v_dst,
size_t layer_offset,
const T* k_src,
const T* v_src,
int batch_size,
const int* query_length,
int max_q_len,
const int* history_length,
int max_seq_len,
int size_per_head,
int local_head_num,
cudaStream_t stream);
template<typename T>
void invokeTransposeKVCache(T* key_cache_trans,
T* val_cache_trans,
const T** key_cache,
const T** val_cache,
size_t layer_offset,
int batch_size,
const int* key_length,
int max_kv_len,
int max_seq_len,
int size_per_head,
int head_num,
cudaStream_t stream);
void invokeGatherOutput(int* output_ids,
const int* ids,
const int* context_length,
int max_context_len,
int max_gen_step,
int max_output_len,
int batch_size,
cudaStream_t stream);
void invokeMyCopyInt(int* dst, const int* src, size_t count, cudaStream_t st);
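// Pimpl wrapper around the flash-attention kernels; AttentionLayout describes the
// batch/sequence/head strides (or optional per-batch base pointers) of Q, K, V and the
// output.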
template<typename T>
class FlashAttentionOp {
public:
struct AttentionLayout {
int stride_batch;
int stride_seq;
int stride_head;
bool use_seqlens = false;
int batch_seqs_offset = 0;
T** batch_seqs = nullptr;
};
struct Params {
T* attn_out;
T* query;
T* key;
T* val;
T* mask;
float* out_accum = nullptr;
int* cu_seqlens_q = nullptr;
int* cu_seqlens_k = nullptr;
AttentionLayout layout_q;
AttentionLayout layout_k;
AttentionLayout layout_v;
AttentionLayout layout_o;
};
public:
FlashAttentionOp(int batch_size, int head_num, int key_len, int seq_len, int size_per_head);
~FlashAttentionOp();
int get_workspace_size() const;
void operator()(Params& params, cudaStream_t st) const;
private:
class impl;
std::unique_ptr<impl> pimpl;
};
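// Debug helper: copies `size` elements of `x` to the host and prints either the whole
// buffer or the first and last 8 values, followed by the sum of absolute values.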
template<typename T>
inline void dump(const T* x, int size, cudaStream_t st, const char* msg, bool full = false)
{
std::vector<T> h_x(size);
cudaMemcpyAsync(h_x.data(), x, sizeof(T) * size, cudaMemcpyDefault, st);
cudaStreamSynchronize(st);
fprintf(stderr, "\n%s:\n", msg);
std::vector<float> h_y(h_x.begin(), h_x.end());
float asum = 0.f;
for (const auto& x : h_y) {
asum += std::fabs(x);
}
if (full) {
for (int i = 0; i < size; ++i) {
printf("%d %.8f\n", i, h_y[i]);
}
}
else {
for (int i = 0; i < 8; ++i) {
fprintf(stderr, "%.8f\n", h_y[i]);
}
for (int i = size - 8; i < size; ++i) {
fprintf(stderr, "%.8f\n", h_y[i]);
}
}
fprintf(stderr, "\nasum = %f\n", asum);
// getchar();
}
template<typename T>
struct TempBuffer {
TempBuffer(size_t size)
{
deviceMalloc(&data, size, false);
}
T* data;
};
template<typename T>
inline T*
transpose_key_cache(T* key_cache, size_t head_num, size_t size_per_head_by_x, size_t mem_len, size_t x, cudaStream_t st)
{
static TempBuffer<T> buf(8192 * 8192);
// from: H Dx, S, x
// to : S, H Dx, x
invokeTransposeAxis01(buf.data, key_cache, head_num * size_per_head_by_x, mem_len, x, st);
return buf.data;
}
template<typename T>
inline T* transpose_value_cache(T* value_cache, size_t head_num, size_t mem_len, size_t size_per_head, cudaStream_t st)
{
static TempBuffer<T> buf(8192 * 8192);
invokeTransposeAxis01(buf.data, value_cache, head_num, mem_len, size_per_head, st);
return buf.data;
}
inline void dump_sequence_len(int* d_seq_len, int step, int tp_rank, cudaStream_t st)
{
int h_seq_len = -1;
cudaMemcpyAsync(&h_seq_len, d_seq_len, sizeof(int), cudaMemcpyDefault, st);
cudaStreamSynchronize(st);
FT_LOG_ERROR("--------> rank = %d, step = %d, seq_len = %d <--------", tp_rank, step, h_seq_len);
}
} // namespace fastertransformer
\ No newline at end of file
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/kernels/reduce_kernel_utils.cuh"
#include "src/fastertransformer/models/llama/llama_utils.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_fp16.h>
#include <curand_kernel.h>
#include <fstream>
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <vector>
namespace fastertransformer {
CmpMode compare_mode = kCmpNone;
template<typename T>
struct abs_diff_t {
using type = T;
};
template<>
struct abs_diff_t<half> {
using type = float;
};
template<typename T>
struct abs_diff: public thrust::unary_function<thrust::tuple<T, T>, typename abs_diff_t<T>::type> {
__host__ __device__ float operator()(thrust::tuple<T, T> x) const
{
using R = typename abs_diff_t<T>::type;
auto r = R(thrust::get<0>(x)) - R(thrust::get<1>(x));
return r < R(0) ? -r : r;
}
};
template<typename T>
void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream)
{
std::vector<T> h_data(size);
cudaMemcpyAsync(h_data.data(), ptr, sizeof(T) * size, cudaMemcpyDefault, stream);
check_cuda_error(cudaStreamSynchronize(stream));
size_t nan_cnt = 0;
for (const auto& x : h_data) {
nan_cnt += std::isnan(static_cast<float>(x));
}
if (nan_cnt) {
std::cerr << key << ": NaN count " << nan_cnt << "\n";
}
}
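// Reads a reference tensor from tmp/<key>.cmp and prints the sum (and mean) of absolute
// differences against the device buffer `ptr`.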
template<typename T>
void CmpRead(T* ptr, size_t size, std::string key, cudaStream_t stream)
{
// wait for b
check_cuda_error(cudaStreamSynchronize(stream));
// read a from file
thrust::host_vector<T> h_a(size);
{
const auto filename = "tmp/" + key + ".cmp";
std::ifstream ifs(filename, std::ios::binary);
if (!ifs.is_open()) {
std::cerr << key << ": failed to open " + filename << "\n";
return;
}
ifs.seekg(0, ifs.end);
const auto actual_size_in_bytes = ifs.tellg();
ifs.seekg(0, ifs.beg);
const auto expect_size_in_bytes = sizeof(T) * size;
if (actual_size_in_bytes != expect_size_in_bytes) {
std::cerr << key << ": file size in bytes mismatch, expect " << expect_size_in_bytes << ", got "
<< actual_size_in_bytes << "\n";
return;
}
ifs.read((char*)h_a.data(), sizeof(T) * h_a.size());
}
// copy a to device
thrust::device_vector<T> a = h_a;
// create abs(a - b) iterator
thrust::device_ptr<T> dev_ptr(ptr);
auto zip_iter = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), dev_ptr));
auto transform_iter = thrust::make_transform_iterator(zip_iter, abs_diff<T>{});
// sum(abs(a - b))
auto asum = thrust::reduce(thrust::device, transform_iter, transform_iter + size);
std::cerr << key << ": " << asum << " " << asum / size << "\n";
}
template<typename T>
void CmpWrite(T* ptr, size_t size, std::string key, cudaStream_t stream)
{
std::vector<T> a(size);
// copy a to host
check_cuda_error(cudaMemcpyAsync(a.data(), ptr, sizeof(T) * size, cudaMemcpyDefault, stream));
check_cuda_error(cudaStreamSynchronize(stream));
// write to file
{
std::ofstream ofs("tmp/" + key + ".cmp", std::ios::binary);
ofs.write((char*)a.data(), sizeof(T) * a.size());
}
}
template<typename T>
void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream)
{
// std::cerr << "Comparing " << key << "\n";
if (mode == kCmpRead) {
CmpRead(ptr, size, key, stream);
}
else if (mode == kCmpWrite) {
CmpWrite(ptr, size, key, stream);
}
else {
// kCmpNone
}
}
template void Compare(int* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
template void Compare(float* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
template void Compare(half* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
template void CheckNan(const float* ptr, size_t size, std::string key, cudaStream_t stream);
template void CheckNan(const half* ptr, size_t size, std::string key, cudaStream_t stream);
std::string format(const std::pair<std::string, Tensor>& p)
{
std::stringstream ss;
ss << p.first << " [";
bool first = true;
for (const auto& x : p.second.shape) {
ss << (first ? "" : ", ") << x;
first = false;
}
ss << "]";
return ss.str();
}
size_t curandStateGetSize()
{
return sizeof(curandState_t);
}
bool isDebug()
{
static const bool is_debug = [] {
const auto level = std::getenv("FT_DEBUG_LEVEL");
if (level && level == std::string("DEBUG")) {
return true;
}
return false;
}();
return is_debug;
}
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include "src/fastertransformer/utils/Tensor.h"
#include <cuda_runtime.h>
#include <sstream>
#include <string>
#include <vector>
namespace fastertransformer {
enum CmpMode
{
kCmpNone,
kCmpRead,
kCmpWrite,
};
extern CmpMode compare_mode;
template<typename T>
void Compare(T* ptr, size_t size, std::string key, CmpMode mode, cudaStream_t stream);
template<typename T>
void CheckNan(const T* ptr, size_t size, std::string key, cudaStream_t stream);
namespace detail {
template<typename T>
std::string to_string(T x)
{
return std::to_string(x);
}
inline std::string to_string(std::string x)
{
return x;
}
} // namespace detail
template<typename... Args>
std::string Concat(std::string key, Args&&... args)
{
std::vector<std::string> args_str{detail::to_string((Args &&) args)...};
for (const auto& s : args_str) {
key.append("_");
key.append(s);
}
return key;
}
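// Example (illustrative): Concat stringifies each argument and joins it to the
// key with '_', so Concat("decoder", 3, 7) yields "decoder_3_7" and
// Concat("block", std::string("ffn")) yields "block_ffn".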
std::string format(const std::pair<std::string, Tensor>& p);
size_t curandStateGetSize();
bool isDebug();
} // namespace fastertransformer
// Copyright (c) OpenMMLab. All rights reserved.
#include "src/fastertransformer/models/llama/prefix_cache.h"
// <L,H,D/X,s,X> -> <L,H,D/X,S[:s],X>
template<typename T>
__global__ void insertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, size_t S)
{
for (int i = threadIdx.x; i < L * H * Dx * s * X; i += blockDim.x) {
int i0 = i / X;
int x = i % X;
int i1 = i0 / s;
int t = i0 % s;
size_t j = (i1 * S + t) * X + x;
key_cache[j] = src[i];
}
}
template<typename T>
void invokeInsertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st)
{
insertKeyCache<<<1, 512, 0, st>>>(key_cache, src, L, H, Dx, s, X, S);
}
template void
invokeInsertKeyCache(float* key_cache, const float* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);
template void
invokeInsertKeyCache(half* key_cache, const half* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);
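// Worked example (illustrative, not in the original): with L=1, H=1, D/X=2, X=4,
// s=3 and cache capacity S=8, the source element at (dx=1, t=2, x=3) has flat
// index i = (1*3 + 2)*4 + 3 = 23; the kernel recovers x=3, t=2, i1=1 and writes
// it to j = (1*8 + 2)*4 + 3 = 43, i.e. the same (dx, t, x) slot in the S-padded
// cache. Slots with t >= s in the destination are left untouched.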
// <L,H,s,D> -> <L,H,S[:s],D>
template<typename T>
__global__ void insertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, size_t S)
{
for (int i = threadIdx.x; i < L * H * s * D; i += blockDim.x) {
int i0 = i / D;
int d = i % D;
int i1 = i0 / s;
int t = i0 % s;
size_t j = (i1 * S + t) * D + d;
value_cache[j] = src[i];
}
}
template<typename T>
void invokeInsertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, int S, cudaStream_t st)
{
insertValueCache<<<1, 512, 0, st>>>(value_cache, src, L, H, s, D, S);
}
template void
invokeInsertValueCache(float* value_cache, const float* src, int L, int H, int s, int D, int S, cudaStream_t st);
template void
invokeInsertValueCache(half* value_cache, const half* src, int L, int H, int s, int D, int S, cudaStream_t st);
// Copyright (c) OpenMMLab. All rights reserved.
#pragma once
#include <cuda_fp16.h>
template<typename T>
void invokeInsertKeyCache(T* key_cache, const T* src, int L, int H, int Dx, int s, int X, int S, cudaStream_t st);
template<typename T>
void invokeInsertValueCache(T* value_cache, const T* src, int L, int H, int s, int D, int S, cudaStream_t st);
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmake_minimum_required (VERSION 3.18)
project(tritonfastertransformerbackend LANGUAGES C CXX)
#
# Options
#
option(TRITON_ENABLE_GPU "Enable GPU support in backend" ON)
option(TRITON_ENABLE_STATS "Include statistics collections in backend" ON)
set(TRITON_PYTORCH_INCLUDE_PATHS "" CACHE PATH "Paths to Torch includes")
set(TRITON_PYTORCH_LIB_PATHS "" CACHE PATH "Paths to Torch libraries")
set(TRITON_BACKEND_REPO_TAG "r22.12" CACHE STRING "Tag for triton-inference-server/backend repo")
set(TRITON_CORE_REPO_TAG "r22.12" CACHE STRING "Tag for triton-inference-server/core repo")
set(TRITON_COMMON_REPO_TAG "r22.12" CACHE STRING "Tag for triton-inference-server/common repo")
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()
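#
# Illustrative configure invocation (not part of the original file; the install
# prefix is a placeholder):
#
#   cmake -DCMAKE_BUILD_TYPE=Release \
#         -DTRITON_ENABLE_GPU=ON \
#         -DTRITON_COMMON_REPO_TAG=r22.12 \
#         -DTRITON_CORE_REPO_TAG=r22.12 \
#         -DTRITON_BACKEND_REPO_TAG=r22.12 \
#         -DCMAKE_INSTALL_PREFIX=/opt/tritonserver ..
#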
set(USE_TRITONSERVER_DATATYPE "ON")
message("-- Enable USE_TRITONSERVER_DATATYPE")
#
# Dependencies
#
# FetchContent's composability isn't very good. We must include the
# transitive closure of all repos so that we can override the tag.
#
include(FetchContent)
FetchContent_Declare(
repo-common
GIT_REPOSITORY https://github.com/triton-inference-server/common.git
GIT_TAG ${TRITON_COMMON_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
repo-core
GIT_REPOSITORY https://github.com/triton-inference-server/core.git
GIT_TAG ${TRITON_CORE_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_Declare(
repo-backend
GIT_REPOSITORY https://github.com/triton-inference-server/backend.git
GIT_TAG ${TRITON_BACKEND_REPO_TAG}
GIT_SHALLOW ON
)
FetchContent_MakeAvailable(repo-common repo-core repo-backend)
#
# CUDA
#
if(${TRITON_ENABLE_GPU})
find_package(CUDAToolkit REQUIRED)
endif() # TRITON_ENABLE_GPU
#
# Shared library implementing the Triton Backend API
#
configure_file(libtriton_fastertransformer.ldscript libtriton_fastertransformer.ldscript COPYONLY)
add_library(
triton-fastertransformer-backend SHARED
libfastertransformer.cc
)
add_library(
TritonFasterTransformerBackend::triton-fastertransformer-backend ALIAS triton-fastertransformer-backend
)
find_package(CUDAToolkit REQUIRED)
find_package(CUDA 10.1 REQUIRED)
if (${CUDA_VERSION} GREATER_EQUAL 11.0)
message(STATUS "Add DCUDA11_MODE")
add_definitions("-DCUDA11_MODE")
endif()
set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
target_compile_definitions(triton-fastertransformer-backend
PUBLIC
USE_TRITONSERVER_DATATYPE
BUILD_MULTI_GPU)
target_include_directories(
triton-fastertransformer-backend
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
${TRITON_PYTORCH_INCLUDE_PATHS}
${Python3_INCLUDE_DIRS}
${repo-ft_SOURCE_DIR}
${repo-ft_SOURCE_DIR}/3rdparty/cutlass/include
${repo-core_SOURCE_DIR}/include
)
target_link_directories(
triton-fastertransformer-backend
PRIVATE
${CUDA_PATH}/lib64
)
target_compile_features(triton-fastertransformer-backend PRIVATE cxx_std_14)
target_compile_options(
triton-fastertransformer-backend PRIVATE
$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-Wall -Wextra -Wno-unused-parameter -Wno-type-limits >#-Werror>
)
if(${TRITON_ENABLE_GPU})
target_compile_definitions(
triton-fastertransformer-backend
PRIVATE TRITON_ENABLE_GPU=1
)
endif() # TRITON_ENABLE_GPU
set_target_properties(
triton-fastertransformer-backend
PROPERTIES
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_fastertransformer
SKIP_BUILD_RPATH TRUE
BUILD_WITH_INSTALL_RPATH TRUE
INSTALL_RPATH_USE_LINK_PATH FALSE
INSTALL_RPATH "$\{ORIGIN\}"
LINK_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libtriton_fastertransformer.ldscript
LINK_FLAGS "-Wl,--no-as-needed,--version-script libtriton_fastertransformer.ldscript"
)
# Need to turn off unused-but-set-variable due to Torchvision
# Need to turn off unknown-pragmas due to ATen OpenMP
set_target_properties(
triton-fastertransformer-backend
PROPERTIES COMPILE_FLAGS
"-Wno-unknown-pragmas -Wno-unused-but-set-variable"
)
set(TRITON_PYTORCH_LDFLAGS "")
FOREACH(p ${TRITON_PYTORCH_LIB_PATHS})
set(TRITON_PYTORCH_LDFLAGS ${TRITON_PYTORCH_LDFLAGS} "-L${p}")
ENDFOREACH(p)
target_link_libraries(
triton-fastertransformer-backend
PRIVATE
triton-core-serverapi # from repo-core
triton-core-backendapi # from repo-core
triton-core-serverstub # from repo-core
triton-backend-utils # from repo-backend
transformer-shared # from repo-ft
${TRITON_PYTORCH_LDFLAGS}
-lcublas
-lcublasLt
-lcudart
-lcurand
)
if (BUILD_MULTI_GPU)
target_compile_definitions(
triton-fastertransformer-backend
PUBLIC
BUILD_MULTI_GPU
)
target_include_directories(
triton-fastertransformer-backend
PRIVATE
${MPI_INCLUDE_PATH}
)
target_link_directories(
triton-fastertransformer-backend
PRIVATE
${MPI_Libraries}
/usr/local/mpi/lib
)
target_link_libraries(
triton-fastertransformer-backend
PRIVATE
${NCCL_LIBRARIES}
${MPI_LIBRARIES}
)
endif()
if(${TRITON_ENABLE_GPU})
target_link_libraries(
triton-fastertransformer-backend
PRIVATE
CUDA::cudart
)
endif() # TRITON_ENABLE_GPU
#
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonFasterTransformerBackend)
install(
TARGETS
triton-fastertransformer-backend
EXPORT
triton-fastertransformer-backend-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
)
install(
EXPORT
triton-fastertransformer-backend-targets
FILE
TritonFasterTransformerBackendTargets.cmake
NAMESPACE
TritonFasterTransformerBackend::
DESTINATION
${INSTALL_CONFIGDIR}
)
include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_SOURCE_DIR}/cmake/TritonFasterTransformerBackendConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendConfig.cmake
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendConfig.cmake
DESTINATION ${INSTALL_CONFIGDIR}
)
#
# Export from build tree
#
export(
EXPORT triton-fastertransformer-backend-targets
FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendTargets.cmake
NAMESPACE TritonFasterTransformerBackend::
)
export(PACKAGE TritonFasterTransformerBackend)
# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
add_library(TransformerTritonBackend SHARED transformer_triton_backend.cpp)
target_link_libraries(TransformerTritonBackend PRIVATE nccl_utils mpi_utils)
add_subdirectory(llama)
// Copyright (c) OpenMMLab. All rights reserved.
// Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// * Neither the name of NVIDIA CORPORATION nor the names of its
// contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// Modified from https://github.com/triton-inference-server/fastertransformer_backend/blob/main/src/libfastertransformer.cc
#include <stdint.h>
#include <exception>
#include <string>
#include <thread>
#include <vector>
#pragma GCC diagnostic push
//#pragma GCC diagnostic ignored "-Wsign-compare"
#pragma GCC diagnostic ignored "-Wcast-function-type"
#pragma warning(push, 0)
#pragma warning(pop)
#pragma GCC diagnostic pop
// must include triton libraries first
#include "triton/backend/backend_common.h"
#include "triton/backend/backend_input_collector.h"
#include "triton/backend/backend_memory.h"
#include "triton/backend/backend_model.h"
#include "triton/backend/backend_model_instance.h"
#include "triton/backend/backend_output_responder.h"
#include "triton/core/tritonbackend.h"
// FT's libraries depend on Triton's libraries
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_bf16_wrapper.h"
#include "src/fastertransformer/utils/instance_comm.h"
#include "src/fastertransformer/utils/mpi_utils.h"
#include "src/fastertransformer/utils/nccl_utils.h"
std::exception_ptr ptr[8];
namespace ft = fastertransformer;
namespace triton {
namespace backend {
namespace fastertransformer_backend {
#define RESPOND_ALL_AND_RETURN_IF_ERROR(RESPONSES, RESPONSES_COUNT, X) \
do { \
TRITONSERVER_Error* raarie_err__ = (X); \
if (raarie_err__ != nullptr) { \
SendErrorForResponses(RESPONSES, RESPONSES_COUNT, raarie_err__); \
return; \
} \
} while (false)
// Cuda Error handling
TRITONSERVER_Error*
ConvertCUDAStatusToTritonError(cudaError_t cuda_error, TRITONSERVER_Error_Code code, const char* msg)
{
if (cuda_error != cudaSuccess) {
return TRITONSERVER_ErrorNew(code, cudaGetErrorString(cuda_error));
}
return nullptr; // success
}
////////////////////////////////////////////////////////////////////////////////////////////////////
// Ragged Batching
struct RaggedBatchingParams {
bool is_input_ragged = false;
int32_t max_seq_length = 0;
int32_t max_elements_per_seq = 0;
const int32_t* batch_input_ptr = nullptr;
size_t batch_intput_size = 0;
size_t total_input_elements = 0;
};
using RaggedBatchingParam_Map = std::unordered_map<std::string, RaggedBatchingParams>;
////////////////////////////////////////////////////////////////////////////////////////////////////
//
// ModelState
//
// State associated with a model that is using this backend. An object
// of this class is created and associated with each
// TRITONBACKEND_Model.
//
class ModelState: public BackendModel {
public:
static TRITONSERVER_Error* Create(TRITONBACKEND_Model* triton_model, ModelState** state);
virtual ~ModelState() = default;
TRITONSERVER_Error* LoadModel(const std::string& artifact_name,
const int32_t node_id,
const int32_t device_id,
const int32_t device_id_start,
const int32_t stream_id,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>& nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comms,
std::string* model_path,
std::unique_ptr<AbstractTransformerModelInstance>* ft_model_instance);
int GetGpuSize()
{
return gpu_size;
};
int GetWorldSize()
{
return world_size;
};
int GetParallelSize()
{
return tp_pp_size;
};
int GetInstanceId()
{
return current_model_instance_id++;
};
int GetInstanceGroupCount()
{
return instance_group_count;
};
bool SequenceBatchingEnabled()
{
return sequence_batching_enabled;
};
bool DynamicBatchingEnabled()
{
return dynamic_batching_enabled;
};
std::shared_ptr<AbstractTransformerModel> GetFtModel()
{
return ft_model;
};
private:
ModelState(TRITONBACKEND_Model* triton_model);
TRITONSERVER_Error* AutoCompleteConfig();
std::string GetParameter(const char* parameter);
int current_model_instance_id = 0;
bool sequence_batching_enabled = false;
bool dynamic_batching_enabled = false;
int instance_group_count = 1;
std::shared_ptr<AbstractTransformerModel> ft_model;
int node_id, gpu_size, world_size, tp_pp_size;
std::vector<cudaStream_t> streams_;
std::shared_ptr<AbstractTransformerModel> ModelFactory(common::TritonJson::Value& param,
const std::string& model_filename);
};
TRITONSERVER_Error* ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
{
try {
*state = new ModelState(triton_model);
}
catch (const BackendModelException& ex) {
RETURN_ERROR_IF_TRUE(ex.err_ == nullptr,
TRITONSERVER_ERROR_INTERNAL,
std::string("unexpected nullptr in BackendModelException"));
RETURN_IF_ERROR(ex.err_);
}
// Auto-complete the configuration if requested, or if the model type is T5-Encoder or bert
bool auto_complete_config = false;
RETURN_IF_ERROR(TRITONBACKEND_ModelAutoCompleteConfig(triton_model, &auto_complete_config));
auto_complete_config |=
(*state)->GetParameter("model_type") == "T5-Encoder" || (*state)->GetParameter("model_type") == "bert";
if (auto_complete_config) {
RETURN_IF_ERROR((*state)->AutoCompleteConfig());
triton::common::TritonJson::WriteBuffer json_buffer;
(*state)->ModelConfig().Write(&json_buffer);
TRITONSERVER_Message* message;
RETURN_IF_ERROR(TRITONSERVER_MessageNewFromSerializedJson(&message, json_buffer.Base(), json_buffer.Size()));
RETURN_IF_ERROR(TRITONBACKEND_ModelSetConfig(triton_model, 1 /* config_version */, message));
}
return nullptr; // success
}
std::string param_get(common::TritonJson::Value& param, const char* field, const std::string& fallback = "")
{
common::TritonJson::Value key;
std::string value = fallback;
param.MemberAsObject(field, &key);
key.MemberAsString("string_value", &value);
return value;
}
int param_get_int(common::TritonJson::Value& param, const char* field, int fallback = 0)
{
int ret = fallback;
try {
ret = std::stoi(param_get(param, field));
}
catch (std::invalid_argument& ia) {
LOG_MESSAGE(TRITONSERVER_LOG_ERROR,
(std::string("Invalid configuration argument '") + field + "': " + ia.what()).c_str());
}
return ret;
}
float param_get_float(common::TritonJson::Value& param, const char* field, float fallback = 0.0)
{
float ret = fallback;
try {
ret = std::stof(param_get(param, field));
}
catch (std::invalid_argument& ia) {
LOG_MESSAGE(TRITONSERVER_LOG_ERROR,
(std::string("Invalid configuration argument '") + field + "': " + ia.what()).c_str());
}
return ret;
}
bool param_get_bool(common::TritonJson::Value& param, const char* field, bool fallback = false)
{
return static_cast<bool>(param_get_int(param, field, static_cast<int>(fallback)));
}
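// Illustrative config.pbtxt snippet that these helpers read (the values are
// placeholders). Each entry under "parameters" carries its value in the
// "string_value" field, which param_get/param_get_int/param_get_float/
// param_get_bool then parse:
//
//   parameters { key: "tensor_para_size" value: { string_value: "2" } }
//   parameters { key: "pipeline_para_size" value: { string_value: "1" } }
//   parameters { key: "data_type" value: { string_value: "fp16" } }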
std::shared_ptr<AbstractTransformerModel> ModelState::ModelFactory(common::TritonJson::Value& param,
const std::string& model_filename)
{
std::shared_ptr<AbstractTransformerModel> ft_model;
const std::string model_dir = param_get(
param, "model_checkpoint_path", JoinPath({RepositoryPath(), std::to_string(Version()), model_filename}));
const std::string model_type = param_get(param, "model_type", "GPT");
const std::string data_type = param_get(param, "data_type");
const int tp = param_get_int(param, "tensor_para_size");
const int pp = param_get_int(param, "pipeline_para_size");
const int custom_ar = param_get_int(param, "enable_custom_all_reduce");
const std::string dt_message = std::string("Invalid configuration argument 'data_type': ") + data_type;
if (model_type == "Llama") {
if (data_type == "fp16") {
ft_model = std::make_shared<LlamaTritonModel<half>>(tp, pp, custom_ar, model_dir);
}
else {
ft_model = std::make_shared<LlamaTritonModel<float>>(tp, pp, custom_ar, model_dir);
}
}
else {
THROW_IF_BACKEND_MODEL_ERROR(
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, ("Unknown model \"" + model_type + "\"").c_str()));
}
return ft_model;
}
ModelState::ModelState(TRITONBACKEND_Model* triton_model): BackendModel(triton_model, true)
{
node_id = ft::mpi::getCommWorldRank();
int num_nodes = ft::mpi::getCommWorldSize();
triton::common::TritonJson::WriteBuffer buffer;
ModelConfig().PrettyWrite(&buffer);
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("model configuration:\n") + buffer.Contents()).c_str());
common::TritonJson::Value param;
model_config_.MemberAsObject("parameters", &param);
// instance groups
triton::common::TritonJson::Value instance_group, instance_obj, instance_group_count_val, instance_group_kind;
if (!ModelConfig().Find("instance_group", &instance_group) || instance_group.ArraySize() > 1) {
THROW_IF_BACKEND_MODEL_ERROR(
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, "Only supports one instance group !"));
}
instance_group.IndexAsObject(0, &instance_obj);
instance_obj.Find("count", &instance_group_count_val);
instance_obj.Find("kind", &instance_group_kind);
std::string instance_group_kind_str;
int64_t instance_group_count_int64 = 1;
instance_group_kind.AsString(&instance_group_kind_str);
instance_group_count_val.AsInt(&instance_group_count_int64);
instance_group_count = (int)instance_group_count_int64;
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
("Instance group type: " + instance_group_kind_str + " count: " + std::to_string(instance_group_count_int64))
.c_str());
if (instance_group_kind_str != "KIND_CPU") {
THROW_IF_BACKEND_MODEL_ERROR(
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED, "Instance Group: only KIND_CPU supports!"));
}
// instance group validation
bool multi_node_enabled = num_nodes > 1;
tp_pp_size = param_get_int(param, "tensor_para_size") * param_get_int(param, "pipeline_para_size");
gpu_size = ft::getDeviceCount();
world_size = gpu_size * num_nodes;
int model_instance_size = num_nodes > 1 ? gpu_size : tp_pp_size;
bool multi_model_instance_valid = (multi_node_enabled && tp_pp_size == world_size && instance_group_count == 1)
|| (!multi_node_enabled && gpu_size % tp_pp_size == 0
&& model_instance_size * instance_group_count >= gpu_size);
printf("num_nodes=%d\n", num_nodes);
printf("tp_pp_size=%d\n", tp_pp_size);
printf("gpu_size=%d\n", gpu_size);
printf("world_size=%d\n", world_size);
printf("model_instance_size=%d\n", model_instance_size);
if (!multi_model_instance_valid) {
THROW_IF_BACKEND_MODEL_ERROR(
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED,
"1. Number of visible GPUs must be evenly divisble by TP * PP \n"
"2. Number of visible GPUs must be <= instance count * TP * PP \n"
"3. Multi-Node Inference only support one model instance \n"));
}
int64_t max_batch_size = 0;
model_config_.MemberAsInt("max_batch_size", &max_batch_size);
// sequence batching
triton::common::TritonJson::Value sequence_batching;
sequence_batching_enabled = ModelConfig().Find("sequence_batching", &sequence_batching);
std::string sequence_batching_log = sequence_batching_enabled ? "enabled" : "disabled";
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("Sequence Batching: ") + sequence_batching_log).c_str());
// if (sequence_batching_enabled && max_batch_size != 1) {
// THROW_IF_BACKEND_MODEL_ERROR(TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED,
// "Sequence Batching for interactive text generation: only supports max
// batch size = 1 currently !"));
// }
// dynamic batching
triton::common::TritonJson::Value dynamic_batching;
dynamic_batching_enabled = ModelConfig().Find("dynamic_batching", &dynamic_batching);
std::string dynamic_batching_log = dynamic_batching_enabled ? "enabled" : "disabled";
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("Dynamic Batching: ") + dynamic_batching_log).c_str());
if (dynamic_batching_enabled && sequence_batching_enabled) {
THROW_IF_BACKEND_MODEL_ERROR(TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_UNSUPPORTED,
"Sequence Batching cannot work with dynamic "
"batching at the same time !"));
}
std::string model_filename;
model_config_.MemberAsString("default_model_filename", &model_filename);
if (model_filename == "") {
model_filename = std::to_string(param_get_int(param, "tensor_para_size")) + "-gpu";
}
ft_model = ModelFactory(param, model_filename);
std::cout << ft_model->toString();
int total_weight_gpu_size = (instance_group_count * model_instance_size) >= gpu_size ?
gpu_size :
(instance_group_count * model_instance_size);
streams_.resize(instance_group_count * model_instance_size);
/* create shared weights
assume 8 GPUs, 8 model instances and tensor para size 2:
the model instances are distributed to GPUs [0, 1], [2, 3], [4, 5], [6, 7],
[0, 1], [2, 3], [4, 5], [6, 7];
the two model instances placed on GPUs [0, 1] share the same weights
*/
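/* Worked example (illustrative): with node_id = 0, gpu_size = 8 and
tp_pp_size = 2, the loop below computes rank = 0 * 8 + gid % 2, so GPUs
0, 2, 4, 6 create the rank-0 weight shard and GPUs 1, 3, 5, 7 the rank-1
shard; instances mapped onto the same GPUs reuse those shards. */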
std::vector<std::thread> threads;
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("Before Loading Weights:")).c_str());
ft::print_mem_usage();
for (int gid = 0; gid < total_weight_gpu_size; gid++) {
int rank = node_id * gpu_size + gid % tp_pp_size;
threads.push_back(std::thread(&AbstractTransformerModel::createSharedWeights, ft_model, gid, rank));
}
for (auto& t : threads) {
t.join();
}
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("After Loading Weights:")).c_str());
ft::print_mem_usage();
}
TRITONSERVER_Error*
ModelState::LoadModel(const std::string& artifact_name,
const int32_t node_id,
const int32_t device_id,
const int32_t device_id_start,
const int32_t stream_id,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>& nccl_params_instance,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comms,
std::string* model_path,
std::unique_ptr<AbstractTransformerModelInstance>* ft_model_instance)
{
LOG_IF_ERROR(ConvertCUDAStatusToTritonError(
cudaSetDevice(device_id), TRITONSERVER_ERROR_INTERNAL, "Failed to set cuda device"),
"Failed to set cuda device");
std::string cc_model_filename = artifact_name;
if (cc_model_filename.empty()) {
cc_model_filename = "gpt3-model";
}
if (!node_id && !device_id) {
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("Before Loading Model:")).c_str());
}
ft::print_mem_usage();
LOG_IF_ERROR(ConvertCUDAStatusToTritonError(cudaStreamCreate(&streams_[stream_id]),
TRITONSERVER_ERROR_INTERNAL,
"Failed to create the stream"),
"Failed to create the stream");
const int rank = node_id * GetGpuSize() + device_id - device_id_start;
auto model_instance = ft_model->createModelInstance(
device_id, rank, streams_[stream_id], nccl_params_instance, custom_all_reduce_comms);
ft_model_instance->reset(model_instance.release());
if (!node_id && !device_id) {
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("After Loading Model:")).c_str());
}
ft::print_mem_usage();
return nullptr; // success
}
TRITONSERVER_Error* ModelState::AutoCompleteConfig()
{
if (GetParameter("model_type") == "T5-Encoder") {
const std::string data_type = GetParameter("data_type");
auto& config = ModelConfig();
common::TritonJson::Value outputs, output, dtype_object;
std::string name;
config.MemberAsArray("output", &outputs);
std::unordered_map<std::string, std::string> return_type_map{
{"fp16", "TYPE_FP16"}, {"fp32", "TYPE_FP32"}, {"bf16", "TYPE_BF16"}};
std::set<std::string> outputs_to_modify = {"output_hidden_state", "output_attentions"};
for (size_t idx = 0; idx < outputs.ArraySize(); idx++) {
outputs.IndexAsObject(idx, &output);
output.MemberAsString("name", &name);
if (outputs_to_modify.find(name) == outputs_to_modify.end()) {
continue;
}
output.Find("data_type", &dtype_object);
dtype_object.SetString(return_type_map[data_type]);
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
("Automatically setting return data_type for \"" + name + "\" to \""
+ return_type_map[data_type] + "\"")
.c_str());
}
}
else if (GetParameter("model_type") == "bert") {
const std::string data_type = GetParameter("data_type");
auto& config = ModelConfig();
common::TritonJson::Value inputs, input, dtype_object;
common::TritonJson::Value outputs, output;
std::string name;
config.MemberAsArray("input", &inputs);
config.MemberAsArray("output", &outputs);
std::unordered_map<std::string, std::string> return_type_map{
{"fp16", "TYPE_FP16"}, {"fp32", "TYPE_FP32"}, {"bf16", "TYPE_BF16"}};
for (size_t idx = 0; idx < inputs.ArraySize(); idx++) {
inputs.IndexAsObject(idx, &input);
input.MemberAsString("name", &name);
if (name != "input_hidden_state") {
continue;
}
input.Find("data_type", &dtype_object);
dtype_object.SetString(return_type_map[data_type]);
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
("Automatically setting return data_type for "
"\"input_hidden_state\" to \""
+ return_type_map[data_type] + "\"")
.c_str());
}
for (size_t idx = 0; idx < outputs.ArraySize(); idx++) {
outputs.IndexAsObject(idx, &output);
output.MemberAsString("name", &name);
if (name != "output_hidden_state") {
continue;
}
output.Find("data_type", &dtype_object);
dtype_object.SetString(return_type_map[data_type]);
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
("Automatically setting return data_type for "
"\"output_hidden_state\" to \""
+ return_type_map[data_type] + "\"")
.c_str());
}
}
else {
// Auto-complete configuration is not supported since fastertransformer does
// not store/capture sufficient model metadata, so just log a warning instead.
LOG_MESSAGE(TRITONSERVER_LOG_WARN,
(std::string("skipping model configuration auto-complete for '") + Name()
+ "': not supported for fastertransformer backend")
.c_str());
}
return nullptr; // success
}
std::string ModelState::GetParameter(const char* parameter)
{
auto& config = ModelConfig();
common::TritonJson::Value parameters, model_type_obj;
std::string model_type;
config.MemberAsObject("parameters", &parameters);
parameters.MemberAsObject(parameter, &model_type_obj);
model_type_obj.MemberAsString("string_value", &model_type);
return model_type;
}
struct stream_callback_ctx_t {
size_t total_batch_size;
TRITONBACKEND_Request** requests;
uint32_t request_count;
std::vector<TRITONBACKEND_Response*>* responses;
std::vector<TRITONBACKEND_ResponseFactory*>* factories;
BackendModelInstance* model;
};
void generate_response_placeholders(std::vector<TRITONBACKEND_Response*>* responses,
std::vector<TRITONBACKEND_ResponseFactory*>* factories)
{
TRITONSERVER_Error* err = nullptr;
for (auto factory : *factories) {
TRITONBACKEND_Response* response;
err = TRITONBACKEND_ResponseNewFromFactory(&response, factory);
if (err) {
LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response from factory");
TRITONSERVER_ErrorDelete(err);
}
responses->push_back(response);
}
}
//
// ModelInstanceState
//
// State associated with a model instance. An object of this class is
// created and associated with each TRITONBACKEND_ModelInstance.
//
class ModelInstanceState: public BackendModelInstance {
public:
static TRITONSERVER_Error*
Create(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance, ModelInstanceState** state);
virtual ~ModelInstanceState();
// Get the state of the model that corresponds to this instance.
ModelState* StateForModel() const
{
return model_state_;
}
// Execute...
void ProcessRequests(TRITONBACKEND_Request** requests, const uint32_t request_count);
std::shared_ptr<std::unordered_map<std::string, Tensor>>
Execute(std::vector<TRITONBACKEND_Response*>* responses,
stream_callback_ctx_t* context,
const uint32_t response_count,
std::shared_ptr<std::unordered_map<std::string, Tensor>> input_tensors);
void ReadOutputTensors(size_t total_batch_size,
std::shared_ptr<std::unordered_map<std::string, Tensor>> output_tensors,
TRITONBACKEND_Request** requests,
const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses);
int GetModelInstanceCount()
{
return model_instance_count_;
};
int GetModelInstanceId()
{
return model_instance_id_;
};
private:
ModelInstanceState(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance);
TRITONSERVER_Error* ValidateInputs();
TRITONSERVER_Error* ValidateOutputs();
void SetInputTensors(size_t total_batch_size,
TRITONBACKEND_Request** requests,
const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses,
BackendInputCollector* collector,
std::vector<const char*>* input_names,
std::shared_ptr<std::unordered_map<std::string, Tensor>>* input_tensors,
std::vector<BackendMemory*>* input_memories,
bool* cuda_copy);
void BroadcastInputTensors(std::shared_ptr<std::unordered_map<std::string, Tensor>>* input_tensors);
ModelState* model_state_;
// model instance id
int model_instance_count_ = 1;
int model_instance_id_ = 0;
int model_instance_gpu_size_ = 1;
int model_instance_device_id_start_ = 0;
// output tensor stream
cudaStream_t output_stream_;
// tensor parallel + pipeline parallel
int gpu_size_ = 1;
int world_size_ = 1;
int tp_pp_size_ = 1;
// Should we use the streaming API?
bool is_decoupled_ = false;
// The full path to the FT model file.
std::string model_path_;
std::vector<std::unique_ptr<AbstractTransformerModelInstance>> ft_model_instance_;
std::unique_ptr<ft::AbstractInstanceComm> instance_comm_;
// inter-node broadcast buffer
std::vector<char*> bcast_buffers;
// Map from configuration name for an input to the index of
// that input in the model.
std::unordered_map<std::string, int> input_index_map_;
// Map from configuration name for an output to the index of
// that output in the model.
std::unordered_map<std::string, TRITONSERVER_DataType> output_dtype_map_;
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params_;
// custom all reduce comms
std::vector<std::shared_ptr<ft::AbstractCustomComm>> custom_all_reduce_comms_;
};
TRITONSERVER_Error* ModelInstanceState::Create(ModelState* model_state,
TRITONBACKEND_ModelInstance* triton_model_instance,
ModelInstanceState** state)
{
try {
*state = new ModelInstanceState(model_state, triton_model_instance);
}
catch (const BackendModelInstanceException& ex) {
RETURN_ERROR_IF_TRUE(ex.err_ == nullptr,
TRITONSERVER_ERROR_INTERNAL,
std::string("unexpected nullptr in BackendModelInstanceException"));
RETURN_IF_ERROR(ex.err_);
}
return nullptr; // success
}
int ThreadLoadModel(ModelState* model_state,
const std::string& artifact_name,
const int32_t node_id,
const int32_t device_id,
const int32_t device_id_start,
const int32_t stream_id,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comms,
std::string* model_path,
std::unique_ptr<AbstractTransformerModelInstance>* ft_model_instance)
{
THROW_IF_BACKEND_INSTANCE_ERROR(model_state->LoadModel(artifact_name,
node_id,
device_id,
device_id_start,
stream_id,
nccl_params,
custom_all_reduce_comms,
model_path,
ft_model_instance));
return 0;
}
ModelInstanceState::ModelInstanceState(ModelState* model_state, TRITONBACKEND_ModelInstance* triton_model_instance):
BackendModelInstance(model_state, triton_model_instance), model_state_(model_state)
{
int node_id = ft::mpi::getCommWorldRank();
int num_nodes = ft::mpi::getCommWorldSize();
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("Model name ") + ArtifactFilename()).c_str());
triton::common::TritonJson::Value transaction_policy;
is_decoupled_ = false;
model_state_->ModelConfig().MemberAsObject("model_transaction_policy", &transaction_policy);
transaction_policy.MemberAsBool("decoupled", &is_decoupled_);
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Use ") + (is_decoupled_ ? "DECOUPLED (streaming)" : "COUPLED (classic)") + " API.").c_str());
THROW_IF_BACKEND_INSTANCE_ERROR(ValidateInputs());
THROW_IF_BACKEND_INSTANCE_ERROR(ValidateOutputs());
// NOTE: model instance params
model_instance_id_ = model_state->GetInstanceId();
model_instance_count_ = model_state->GetInstanceGroupCount();
tp_pp_size_ = model_state->GetParallelSize();
gpu_size_ = model_state->GetGpuSize();
world_size_ = model_state->GetWorldSize();
model_instance_gpu_size_ = num_nodes > 1 ? gpu_size_ : tp_pp_size_;
ft_model_instance_.resize(model_instance_gpu_size_);
std::vector<std::thread> threads;
std::shared_ptr<AbstractTransformerModel> shared_ft_model = model_state->GetFtModel();
// NOTE: CPU_KIND only, the backend fully controls how to distribute models to
// GPUs
model_instance_device_id_start_ = (model_instance_id_ * model_instance_gpu_size_) % gpu_size_;
// create output tensor stream
LOG_IF_ERROR(ConvertCUDAStatusToTritonError(cudaSetDevice(model_instance_device_id_start_),
TRITONSERVER_ERROR_INTERNAL,
"Failed to set cuda device"),
"Failed to set cuda device");
LOG_IF_ERROR(ConvertCUDAStatusToTritonError(
cudaStreamCreate(&output_stream_), TRITONSERVER_ERROR_INTERNAL, "Failed to create the stream"),
"Failed to create the stream");
// create nccl params
nccl_params_ = shared_ft_model->createNcclParams(node_id, model_instance_device_id_start_, num_nodes > 1);
shared_ft_model->createCustomComms(&custom_all_reduce_comms_, world_size_);
std::string model_instance_gpu_ids = "[ ";
for (int gid = model_instance_device_id_start_; gid < model_instance_device_id_start_ + model_instance_gpu_size_;
gid++) {
model_instance_gpu_ids += (std::to_string(gid) + " ");
threads.push_back(std::thread(ThreadLoadModel,
model_state,
ArtifactFilename(),
node_id,
gid,
model_instance_device_id_start_,
model_instance_id_ * model_instance_gpu_size_ + gid,
nccl_params_,
custom_all_reduce_comms_[gid - model_instance_device_id_start_],
&model_path_,
&ft_model_instance_[gid - model_instance_device_id_start_]));
}
model_instance_gpu_ids += "]";
for (auto& t : threads) {
t.join();
}
instance_comm_ = shared_ft_model->createInstanceComm(tp_pp_size_);
LOG_MESSAGE(TRITONSERVER_LOG_INFO,
(std::string("Model instance is created on GPU ") + model_instance_gpu_ids).c_str());
}
ModelInstanceState::~ModelInstanceState()
{
#ifdef TRITON_ENABLE_GPU
#endif // TRITON_ENABLE_GPU
for (auto bcast_buffer : bcast_buffers) {
free(bcast_buffer);
}
}
TRITONSERVER_Error* ModelInstanceState::ValidateInputs()
{
triton::common::TritonJson::Value ios, bios;
// input
std::string name, data_type;
triton::common::TritonJson::Value jshape;
// batch input
std::string kind, target_name, source_input;
triton::common::TritonJson::Value target_name_array, source_input_array;
model_state_->ModelConfig().MemberAsArray("input", &ios);
model_state_->ModelConfig().MemberAsArray("batch_input", &bios);
std::vector<std::string> valid_batch_input;
// batch input
for (size_t size = 0; size < bios.ArraySize(); size++) {
triton::common::TritonJson::Value batch_input;
bios.IndexAsObject(size, &batch_input);
batch_input.MemberAsString("kind", &kind);
batch_input.MemberAsArray("target_name", &target_name_array);
batch_input.MemberAsString("data_type", &data_type);
batch_input.MemberAsArray("source_input", &source_input_array);
target_name_array.IndexAsString(0, &target_name);
source_input_array.IndexAsString(0, &source_input);
LOG_MESSAGE(TRITONSERVER_LOG_INFO,
(std::string("Get batch input kind: " + kind + ", target_name: " + target_name
+ ", data_type: " + data_type + ", source_input: " + source_input)
.c_str()));
if (kind == "BATCH_ITEM_SHAPE" && data_type == "TYPE_INT32" && source_input + "_item_shape" == target_name) {
valid_batch_input.emplace_back(std::move(source_input));
}
}
// input
for (size_t size = 0; size < ios.ArraySize(); size++) {
triton::common::TritonJson::Value input;
ios.IndexAsObject(size, &input);
input.MemberAsString("name", &name);
input.MemberAsString("data_type", &data_type);
input.MemberAsArray("dims", &jshape);
triton::common::TritonJson::Value allow_ragged_batch_json;
bool allow_ragged_batch = false;
if (input.Find("allow_ragged_batch", &allow_ragged_batch_json)) {
RETURN_IF_ERROR(allow_ragged_batch_json.AsBool(&allow_ragged_batch));
}
if (allow_ragged_batch
&& std::find(valid_batch_input.begin(), valid_batch_input.end(), name) == valid_batch_input.end()) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INVALID_ARG,
std::string("Ragged Batch [ " + name + " ] needs the corresponding batch_input item shape !").c_str());
}
std::vector<int64_t> shape;
for (size_t size = 0; size < jshape.ArraySize(); size++) {
int64_t value = 0;
jshape.IndexAsInt(size, &value);
shape.push_back(value);
}
std::string str_shape = "[";
for (uint i = 0; i < shape.size(); i++) {
str_shape = str_shape + std::to_string(shape[i]);
if (i != shape.size() - 1) {
str_shape = str_shape + ", ";
}
else {
str_shape = str_shape + "]";
}
}
std::string allow_ragged_batch_str = allow_ragged_batch ? "true" : "false";
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("Get input name: " + name + ", type: " + data_type + ", shape: " + str_shape
+ ", allow_ragged_batch: " + allow_ragged_batch_str)
.c_str()));
}
return nullptr; // success
}
TRITONSERVER_Error* ModelInstanceState::ValidateOutputs()
{
triton::common::TritonJson::Value ios;
RETURN_IF_ERROR(model_state_->ModelConfig().MemberAsArray("output", &ios));
std::string name, data_type;
triton::common::TritonJson::Value jshape;
model_state_->ModelConfig().MemberAsArray("output", &ios);
for (size_t size = 0; size < ios.ArraySize(); size++) {
triton::common::TritonJson::Value input;
ios.IndexAsObject(size, &input);
input.MemberAsString("name", &name);
input.MemberAsString("data_type", &data_type);
input.MemberAsArray("dims", &jshape);
std::vector<int64_t> shape;
for (size_t size = 0; size < jshape.ArraySize(); size++) {
int64_t value = 0;
jshape.IndexAsInt(size, &value);
shape.push_back(value);
}
std::string str_shape = "[";
for (uint i = 0; i < shape.size(); i++) {
str_shape = str_shape + std::to_string(shape[i]);
if (i != shape.size() - 1) {
str_shape = str_shape + ", ";
}
else {
str_shape = str_shape + "]";
}
}
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Get output name: " + name + ", type: " + data_type + ", shape: " + str_shape).c_str()));
}
return nullptr; // success
}
void ModelInstanceState::ProcessRequests(TRITONBACKEND_Request** requests, const uint32_t request_count)
{
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("TRITONBACKEND_ModelExecute: Running ") + Name() + " with " + std::to_string(request_count)
+ " requests")
.c_str());
uint64_t exec_start_ns = 0;
SET_TIMESTAMP(exec_start_ns);
const int max_batch_size = model_state_->MaxBatchSize();
// For each request collect the total batch size for this inference
// execution. The batch-size, number of inputs, and size of each
// input has already been checked, so we don't need to do that here.
size_t total_batch_size = 0;
// bool sequence_batching_enabled = model_state_->SequenceBatchingEnabled();
// size_t real_batch_dim = (int) sequence_batching_enabled;
constexpr size_t real_batch_dim = 0;
// only one batch slot per model instance when sequence_batching enabled
for (size_t i = 0; i < request_count; i++) {
// If we get a nullptr request then something is badly wrong. Fail
// and release all requests.
if (requests[i] == nullptr) {
RequestsRespondWithError(
requests,
request_count,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
std::string("null request given to FasterTransformer backend for '" + Name() + "'").c_str()));
return;
}
if (max_batch_size > 0) {
// Retrieve the batch size from one of the inputs, if the model
// supports batching, the first dimension size is batch size
int index = 0;
while (true) {
TRITONBACKEND_Input* input;
TRITONSERVER_Error* err_0 = TRITONBACKEND_RequestInputByIndex(requests[i], index, &input);
if (err_0 == nullptr) {
const char* input_name;
const int64_t* shape;
TRITONSERVER_Error* err_1 =
TRITONBACKEND_InputProperties(input, &input_name, nullptr, &shape, nullptr, nullptr, nullptr);
std::string input_name_str = std::string(input_name);
if (err_1 == nullptr) {
if (input_name_str != "START" && input_name_str != "END" && input_name_str != "READY") {
total_batch_size += shape[real_batch_dim];
break;
}
index++;
}
else {
RequestsRespondWithError(requests, request_count, err_1);
return;
}
}
else {
RequestsRespondWithError(requests, request_count, err_0);
return;
}
}
}
else {
total_batch_size += 1;
}
}
// If there are no valid payloads then no need to run the inference.
if (total_batch_size == 0) {
return;
}
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("get total batch_size = ") + std::to_string(total_batch_size)).c_str());
// Make sure the maximum batch size is not exceeded. The
// total_batch_size must be 1 for models that don't support batching
// (i.e. max_batch_size == 0). If max_batch_size is exceeded then
// scheduler has done something badly wrong so fail and release all
// requests.
if ((total_batch_size != 1) && (total_batch_size > (size_t)max_batch_size)) {
RequestsRespondWithError(
requests,
request_count,
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL,
std::string("batch size " + std::to_string(total_batch_size) + " for '" + Name()
+ "', max allowed is " + std::to_string(max_batch_size))
.c_str()));
return;
}
// At this point we are committed to running inference with all
// 'requests'. Create a response for each request. During input
// processing if there is an error with any request that error will
// be sent immediately with the corresponding response (and the
// response unique_ptr will then be nullptr). The request object
// itself will not be released until after all inferencing is done
// (below) as we may need to access the request object when
// determining how to process outputs (for example, even if we don't
// need the outputs for a request that has an error, we do need to
// know the size of those outputs associated with the request so we
// can skip them in the output tensors).
//
// When operating in the decoupled mode, responses should be created
// from factories. Here, we instantiate a factory for each request and
// generate the first response. At each new result from the model the
// generated response is filled, sent, and another response is created
// from the factory. The last response is sent just like in the
// non-decoupled mode.
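// Sketch of the decoupled flow implemented below (for orientation only):
// a TRITONBACKEND_ResponseFactory is created per request; streaming_callback()
// fills one response per intermediate result, sends it with flags = 0, and then
// creates the next placeholder from the factory; the final response is sent
// after Execute() with TRITONSERVER_RESPONSE_COMPLETE_FINAL, just like in the
// non-decoupled path.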
std::vector<TRITONBACKEND_Response*> responses;
responses.reserve(request_count);
std::vector<TRITONBACKEND_ResponseFactory*> factories;
for (size_t i = 0; i < request_count; i++) {
if (is_decoupled_) {
TRITONBACKEND_ResponseFactory* factory;
auto err = TRITONBACKEND_ResponseFactoryNew(&factory, requests[i]);
if (err == nullptr) {
factories.emplace_back(factory);
}
else {
factories.emplace_back(nullptr);
LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response factory");
TRITONSERVER_ErrorDelete(err);
}
}
else {
TRITONBACKEND_Response* response;
auto err = TRITONBACKEND_ResponseNew(&response, requests[i]);
if (err == nullptr) {
responses.emplace_back(response);
}
else {
responses.emplace_back(nullptr);
LOG_MESSAGE(TRITONSERVER_LOG_ERROR, "Fail to create response");
TRITONSERVER_ErrorDelete(err);
}
}
}
std::vector<const char*> input_names;
std::shared_ptr<std::unordered_map<std::string, Tensor>> input_tensors =
std::make_shared<std::unordered_map<std::string, Tensor>>();
std::vector<BackendMemory*> input_memories;
bool cuda_copy = false;
if (is_decoupled_) {
generate_response_placeholders(&responses, &factories);
}
BackendInputCollector collector(requests,
request_count,
&responses,
model_state_->TritonMemoryManager(),
model_state_->EnablePinnedInput(),
CudaStream());
SetInputTensors(total_batch_size,
requests,
request_count,
&responses,
&collector,
&input_names,
&input_tensors,
&input_memories,
&cuda_copy);
// Wait for any in-flight input tensor copies to complete.
#ifdef TRITON_ENABLE_GPU
if (cuda_copy) {
cudaStreamSynchronize(CudaStream());
}
#endif
uint64_t compute_start_ns = 0;
SET_TIMESTAMP(compute_start_ns);
stream_callback_ctx_t context = {total_batch_size, requests, request_count, &responses, &factories, this};
auto output_tensors = Execute(&responses, &context, request_count, input_tensors);
uint64_t compute_end_ns = 0;
SET_TIMESTAMP(compute_end_ns);
// Free BackendMemory used for inputs
for (BackendMemory* mem : input_memories) {
delete mem;
}
input_memories.clear();
ReadOutputTensors(total_batch_size, output_tensors, requests, request_count, &responses);
uint64_t exec_end_ns = 0;
SET_TIMESTAMP(exec_end_ns);
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("get response size = ") + std::to_string(responses.size())).c_str());
// Send all the responses that haven't already been sent because of
// an earlier error. Note that the responses are not set to nullptr
// here as we need that indication below to determine if the request
// was successful or not.
for (auto& response : responses) {
if (response != nullptr) {
LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr),
"failed to send FasterTransformer backend response");
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("response is sent")).c_str());
}
else {
LOG_MESSAGE(TRITONSERVER_LOG_WARN, (std::string("response is nullptr")).c_str());
}
}
// Report statistics for each request.
for (uint32_t r = 0; r < request_count; ++r) {
auto& request = requests[r];
LOG_IF_ERROR(TRITONBACKEND_ModelInstanceReportStatistics(TritonModelInstance(),
request,
(responses[r] != nullptr) /* success */,
exec_start_ns,
compute_start_ns,
compute_end_ns,
exec_end_ns),
"failed reporting request statistics");
LOG_IF_ERROR(TRITONBACKEND_RequestRelease(request, TRITONSERVER_REQUEST_RELEASE_ALL),
"failed releasing request");
}
// Report the entire batch statistics.
LOG_IF_ERROR(
TRITONBACKEND_ModelInstanceReportBatchStatistics(
TritonModelInstance(), total_batch_size, exec_start_ns, compute_start_ns, compute_end_ns, exec_end_ns),
"failed reporting batch request statistics");
}
void streaming_callback(std::shared_ptr<std::unordered_map<std::string, Tensor>> output_tensors, void* ctx)
{
stream_callback_ctx_t* context = reinterpret_cast<stream_callback_ctx_t*>(ctx);
ModelInstanceState* model = reinterpret_cast<ModelInstanceState*>(context->model);
std::vector<TRITONBACKEND_Response*>* responses = context->responses;
model->ReadOutputTensors(
context->total_batch_size, output_tensors, context->requests, context->request_count, responses);
for (auto& response : *responses) {
if (response != nullptr) {
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("start to send streaming response")).c_str());
LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, 0, nullptr),
"failed to send FasterTransformer backend response");
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("streaming response is sent")).c_str());
}
else {
LOG_MESSAGE(TRITONSERVER_LOG_WARN, (std::string("streaming response is nullptr")).c_str());
}
}
responses->clear();
generate_response_placeholders(responses, context->factories);
}
int ThreadForward(std::unique_ptr<AbstractTransformerModelInstance>* ft_model_instance,
std::shared_ptr<std::unordered_map<std::string, Tensor>>* input_tensors,
std::shared_ptr<std::unordered_map<std::string, Tensor>>* output_tensors,
ft::AbstractInstanceComm* instance_comm,
std::exception_ptr* exception_ptr,
const int device_id,
const int use_stream_cb,
stream_callback_ctx_t* context)
{
LOG_IF_ERROR(ConvertCUDAStatusToTritonError(
cudaSetDevice(device_id), TRITONSERVER_ERROR_INTERNAL, "Failed to set cuda device"),
"Failed to set cuda device");
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("Start to forward")).c_str());
if (use_stream_cb) {
(*ft_model_instance)->registerCallback(streaming_callback, (void*)context);
}
*output_tensors = (*ft_model_instance)->forward(*input_tensors, instance_comm);
if (use_stream_cb) {
(*ft_model_instance)->unRegisterCallback();
}
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("Stop to forward")).c_str());
if ((*output_tensors)->count("error_message")) {
*exception_ptr = *((std::exception_ptr*)((*output_tensors)->at("error_message").data));
}
return 0;
}
void triton_check_inputs(std::shared_ptr<std::unordered_map<std::string, Tensor>> output_tensors, const char* filename)
{
auto& output = output_tensors->at("output_ids");
auto shape = output.shape;
assert(shape.size() == 3);
assert(output.type == TYPE_UINT32);
auto batch_size = shape[0];
auto length = shape[2];
std::string fName = filename;
auto file = std::ofstream(fName, std::ios::out);
if (file.is_open()) {
for (size_t i = 0; i < batch_size; i++) {
for (size_t j = 0; j < length; j++) {
file << ((uint32_t*)output.data)[i * length + j] << " ";
}
file << std::endl;
}
}
}
void ModelInstanceState::BroadcastInputTensors(std::shared_ptr<std::unordered_map<std::string, Tensor>>* input_tensors)
{
int node_id = ft::mpi::getCommWorldRank();
uint32_t input_count = node_id ? 0 : (*input_tensors)->size();
ft::mpi::bcast(&input_count, 1, ft::mpi::MPI_TYPE_UINT32_T, 0, ft::mpi::COMM_WORLD);
if (input_count > bcast_buffers.size()) {
bcast_buffers.resize(input_count);
}
if (node_id) {
for (uint input_index = 0; input_index < input_count; input_index++) {
std::vector<size_t> batchn_shape;
int64_t shape_size = 0;
int64_t buffer_size = 1;
ft::mpi::bcast(&shape_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
for (int s_id = 0; s_id < shape_size; s_id++) {
int64_t val;
ft::mpi::bcast(&val, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
batchn_shape.push_back(val);
buffer_size *= val;
}
int64_t data_type_size = 1;
ft::mpi::bcast(&data_type_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
buffer_size *= data_type_size;
bcast_buffers[input_index] = (char*)realloc(bcast_buffers[input_index], buffer_size);
char* input_buffer = bcast_buffers[input_index];
ft::mpi::bcast(input_buffer, buffer_size, ft::mpi::MPI_TYPE_BYTE, 0, ft::mpi::COMM_WORLD);
int64_t name_size = 0;
ft::mpi::bcast(&name_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
char char_name[1024] = {0};
ft::mpi::bcast(char_name, name_size, ft::mpi::MPI_TYPE_CHAR, 0, ft::mpi::COMM_WORLD);
uint32_t data_type_num = 0;
ft::mpi::bcast(&data_type_num, 1, ft::mpi::MPI_TYPE_UINT32_T, 0, ft::mpi::COMM_WORLD);
TRITONSERVER_DataType triton_data_type = TRITONSERVER_DataType(data_type_num);
(*input_tensors)
->insert({std::string(char_name),
Tensor{TRITONSERVER_MEMORY_CPU, triton_data_type, batchn_shape, input_buffer}});
}
}
else {
int input_index = 0;
for (auto it = (*input_tensors)->begin(); it != (*input_tensors)->end(); ++it) {
std::vector<size_t> batchn_shape = it->second.shape;
int64_t shape_size = batchn_shape.size();
int64_t buffer_size = 1;
ft::mpi::bcast(&shape_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
for (int s_id = 0; s_id < shape_size; s_id++) {
int64_t val = batchn_shape[s_id];
ft::mpi::bcast(&val, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
buffer_size *= val;
}
ft::Tensor tmp{
ft::MEMORY_CPU, ft::TYPE_BYTES, {1}, nullptr}; // TODO change the getDataTypeByteNum function to static
int64_t data_type_size = tmp.getTypeSize(triton::Tensor::convertTritonTypeToFt(it->second.type));
ft::mpi::bcast(&data_type_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
buffer_size *= data_type_size;
ft::mpi::bcast(
const_cast<void*>(it->second.data), buffer_size, ft::mpi::MPI_TYPE_BYTE, 0, ft::mpi::COMM_WORLD);
std::string name = it->first;
int64_t name_size = name.size();
ft::mpi::bcast(&name_size, 1, ft::mpi::MPI_TYPE_INT64_T, 0, ft::mpi::COMM_WORLD);
bcast_buffers[input_index] = (char*)realloc(bcast_buffers[input_index], name_size);
char* char_name = bcast_buffers[input_index];
int64_t length = (int64_t)name.copy(char_name, name_size);
ft::FT_CHECK(length == name_size);
ft::mpi::bcast(char_name, name_size, ft::mpi::MPI_TYPE_CHAR, 0, ft::mpi::COMM_WORLD);
uint32_t data_type_num = (uint32_t)(it->second.type);
ft::mpi::bcast(&data_type_num, 1, ft::mpi::MPI_TYPE_UINT32_T, 0, ft::mpi::COMM_WORLD);
input_index++;
}
}
}
std::shared_ptr<std::unordered_map<std::string, Tensor>>
ModelInstanceState::Execute(std::vector<TRITONBACKEND_Response*>* responses,
stream_callback_ctx_t* context,
const uint32_t response_count,
std::shared_ptr<std::unordered_map<std::string, Tensor>> input_tensors)
{
int node_id = ft::mpi::getCommWorldRank();
if (node_id == 0) {
// Debug: input array
// triton_check_inputs(input_tensors, "triton_in");
}
if (node_id) {
input_tensors = std::make_shared<std::unordered_map<std::string, Tensor>>();
}
ft::mpi::barrier();
BroadcastInputTensors(&input_tensors);
std::vector<std::thread> threads;
std::shared_ptr<std::unordered_map<std::string, Tensor>> output_tensors_list[model_instance_gpu_size_];
std::exception_ptr exception_ptr[model_instance_gpu_size_];
for (int gid = model_instance_device_id_start_; gid < model_instance_device_id_start_ + model_instance_gpu_size_;
gid++) {
int instance_local_id = gid - model_instance_device_id_start_;
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("before ThreadForward " + std::to_string(gid))).c_str());
threads.push_back(std::thread(ThreadForward,
&ft_model_instance_[instance_local_id],
&input_tensors,
&output_tensors_list[instance_local_id],
instance_comm_.get(),
&exception_ptr[instance_local_id],
gid,
is_decoupled_ && gid == model_instance_device_id_start_,
context));
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("after ThreadForward " + std::to_string(gid))).c_str());
}
for (auto& t : threads) {
t.join();
}
try {
for (int gid = model_instance_device_id_start_;
gid < model_instance_device_id_start_ + model_instance_gpu_size_;
gid++) {
int instance_local_id = gid - model_instance_device_id_start_;
if (exception_ptr[instance_local_id]) {
std::rethrow_exception(exception_ptr[instance_local_id]);
}
}
}
catch (std::exception& ex) {
SendErrorForResponses(
responses,
response_count,
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL,
("FasterTransformer execute failure: " + std::string(ex.what())).c_str()));
}
auto output_tensors = output_tensors_list[0];
return output_tensors;
}
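// Collects the input tensors of all requests into contiguous host buffers and
// records them in `input_tensors` for the FT model. When dynamic or sequence
// batching is enabled and a BATCH_ITEM_SHAPE batch input reports ragged
// sequence lengths, the affected input (e.g. input_ids) is first copied into a
// zero-padded [total_batch_size, max_elements_per_seq] buffer.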
void ModelInstanceState::SetInputTensors(
size_t total_batch_size,
TRITONBACKEND_Request** requests,
const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses,
BackendInputCollector* collector,
std::vector<const char*>* input_names,
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>* input_tensors,
std::vector<BackendMemory*>* input_memories,
bool* cuda_copy)
{
const int max_batch_size = model_state_->MaxBatchSize();
// bool sequence_batching_enabled = model_state_->SequenceBatchingEnabled();
bool dynamic_batching_enabled = model_state_->DynamicBatchingEnabled() || model_state_->SequenceBatchingEnabled();
// All requests must have equally-sized input tensors so use any
// request as the representative for the input tensors.
uint32_t input_count;
RESPOND_ALL_AND_RETURN_IF_ERROR(
responses, request_count, TRITONBACKEND_RequestInputCount(requests[0], &input_count));
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("get input count = ") + std::to_string(input_count)).c_str());
// Process batch input if any
RaggedBatchingParam_Map batch_input_param_map;
if (dynamic_batching_enabled) {
// Handle batch inputs for ragged batching
for (const auto& batch_input : model_state_->BatchInputs()) {
std::vector<int64_t> shape;
collector->BatchInputShape(batch_input, &shape);
auto batch_input_kind = batch_input.BatchInputKind();
auto batch_input_name = batch_input.TargetNames()[0];
// We only handle the ragged input_ids here.
// Assume the first dimension (sequence length) differs across requests while
// the remaining dimensions are identical. BATCH_ITEM_SHAPE has shape
// [num_requests (batches), num_dims (excluding the batch dimension)].
if (batch_input_kind == BatchInput::Kind::BATCH_ITEM_SHAPE
&& (batch_input_name == "input_ids_item_shape"
|| batch_input_name == "request_prompt_embedding_item_shape")) {
RaggedBatchingParams param{};
size_t num_feature_dimensions = (size_t)shape[1];
const char* dst_buffer = nullptr;
size_t dst_buffer_byte_size;
TRITONSERVER_MemoryType dst_memory_type;
int64_t dst_memory_type_id;
// Batch inputs are always created on CPU
RESPOND_ALL_AND_SET_NULL_IF_ERROR((*responses),
responses->size(),
collector->ProcessBatchInput(batch_input,
nullptr,
0,
{{TRITONSERVER_MEMORY_CPU, 0}},
&dst_buffer,
&dst_buffer_byte_size,
&dst_memory_type,
&dst_memory_type_id));
param.batch_input_ptr = reinterpret_cast<const int32_t*>(dst_buffer);
// concat all feature dimensions
param.batch_intput_size = (dst_buffer_byte_size / sizeof(int32_t)) / num_feature_dimensions;
if (num_feature_dimensions > 1) {
BackendMemory* batch_item_shape_memory;
RESPOND_ALL_AND_RETURN_IF_ERROR(responses,
request_count,
BackendMemory::Create(model_state_->TritonMemoryManager(),
{BackendMemory::AllocationType::CPU},
0,
dst_buffer_byte_size / num_feature_dimensions,
&batch_item_shape_memory));
int32_t* batch_item_shape_memory_ptr =
reinterpret_cast<int32_t*>(batch_item_shape_memory->MemoryPtr());
for (size_t idx = 0; idx < param.batch_intput_size; idx++) {
int32_t concat_dimensions = 1;
for (size_t dim_idx = 0; dim_idx < num_feature_dimensions; dim_idx++) {
concat_dimensions *= param.batch_input_ptr[idx * num_feature_dimensions + dim_idx];
// dim0 is seq length dimension
if (dim_idx == 0) {
param.max_seq_length =
std::max(param.max_seq_length, param.batch_input_ptr[idx * num_feature_dimensions]);
}
}
batch_item_shape_memory_ptr[idx] = concat_dimensions;
}
param.batch_input_ptr = reinterpret_cast<const int32_t*>(batch_item_shape_memory_ptr);
}
else {
param.max_seq_length =
*std::max_element(param.batch_input_ptr, param.batch_input_ptr + param.batch_intput_size);
}
// check if padding is needed
param.is_input_ragged = std::any_of(param.batch_input_ptr,
param.batch_input_ptr + param.batch_intput_size,
[&](int x) { return x != param.batch_input_ptr[0]; });
// calculate statistics of the elements
if (param.is_input_ragged) {
param.max_elements_per_seq =
*std::max_element(param.batch_input_ptr, param.batch_input_ptr + param.batch_intput_size);
param.total_input_elements =
std::accumulate(param.batch_input_ptr, param.batch_input_ptr + param.batch_intput_size, 0);
batch_input_param_map.insert({batch_input_name, param});
// verbose logging for debugging
if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
std::string value_str = "[ ";
for (size_t i = 0; i < param.batch_intput_size; i++) {
value_str += std::to_string(param.batch_input_ptr[i]) + " ";
}
value_str += "]";
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("collect batch input name: ") + batch_input_name + "\n size: "
+ std::to_string(dst_buffer_byte_size) + " bytes\n value: " + value_str
+ "\n max sequence length: " + std::to_string(param.max_seq_length)
+ "\n max elements per sequence: " + std::to_string(param.max_elements_per_seq))
.c_str());
}
}
}
}
}
// Process user-defined inputs
for (uint32_t input_idx = 0; input_idx < input_count; input_idx++) {
TRITONBACKEND_Input* input;
RESPOND_ALL_AND_RETURN_IF_ERROR(
responses, request_count, TRITONBACKEND_RequestInputByIndex(requests[0], input_idx, &input));
const char* input_name;
TRITONSERVER_DataType input_datatype;
const int64_t* input_shape;
uint32_t input_dims_count;
RESPOND_ALL_AND_RETURN_IF_ERROR(
responses,
request_count,
TRITONBACKEND_InputProperties(
input, &input_name, &input_datatype, &input_shape, &input_dims_count, nullptr, nullptr));
input_names->emplace_back(input_name);
std::string input_name_str = std::string(input_name);
// Pad input ids from different requests
if (batch_input_param_map.find(input_name_str + "_item_shape") != batch_input_param_map.end()
&& batch_input_param_map[input_name_str + "_item_shape"].is_input_ragged) {
RaggedBatchingParams param = batch_input_param_map[input_name_str + "_item_shape"];
const int64_t total_batch_size_int64 = (int64_t)total_batch_size;
const int64_t max_elements_per_seq_int64 = (int64_t)param.max_elements_per_seq;
const size_t padded_input_ids_buffer_size =
GetByteSize(input_datatype, std::vector<int64_t>{total_batch_size_int64, max_elements_per_seq_int64});
// Always host memory
BackendMemory* padded_input_memory;
BackendMemory* request_input_memory;
RESPOND_ALL_AND_RETURN_IF_ERROR(responses,
request_count,
BackendMemory::Create(model_state_->TritonMemoryManager(),
{BackendMemory::AllocationType::CPU},
0,
padded_input_ids_buffer_size,
&padded_input_memory));
RESPOND_ALL_AND_RETURN_IF_ERROR(responses,
request_count,
BackendMemory::Create(model_state_->TritonMemoryManager(),
{BackendMemory::AllocationType::CPU},
0,
padded_input_ids_buffer_size,
&request_input_memory));
memset(padded_input_memory->MemoryPtr(), 0, padded_input_ids_buffer_size);
collector->ProcessTensor(
input_name,
request_input_memory->MemoryPtr(),
GetByteSize(input_datatype, std::vector<int64_t>{(int64_t)param.total_input_elements}),
request_input_memory->MemoryType(),
request_input_memory->MemoryTypeId());
int64_t accumulated_elements_offset = 0;
char* padded_input_ids_ptr = padded_input_memory->MemoryPtr();
char* base_input_ids = request_input_memory->MemoryPtr();
// copy each request buffer to padded buffer
for (int64_t single_batch_idx = 0; single_batch_idx < total_batch_size_int64; single_batch_idx++) {
int32_t sequence_elements = param.batch_input_ptr[single_batch_idx];
std::memcpy(padded_input_ids_ptr
+ GetByteSize(input_datatype,
std::vector<int64_t>{single_batch_idx, max_elements_per_seq_int64}),
base_input_ids
+ GetByteSize(input_datatype, std::vector<int64_t>{accumulated_elements_offset}),
GetByteSize(input_datatype, std::vector<int64_t>{sequence_elements}));
accumulated_elements_offset += sequence_elements;
}
// modify batch dimension shape, and sequence length dimension shape after
// padding
std::vector<size_t> batchn_shape(input_shape, input_shape + input_dims_count);
if (max_batch_size != 0) {
batchn_shape[0] = total_batch_size;
batchn_shape[1] = (size_t)param.max_seq_length;
// assume all non-seq-length dimensions have the same shape
if (input_dims_count > 2) {
batchn_shape[2] = (size_t)(param.max_elements_per_seq / param.max_seq_length);
}
}
(*input_tensors)
->insert({std::string(input_name),
triton::Tensor{TRITONSERVER_MEMORY_CPU, input_datatype, batchn_shape, padded_input_ids_ptr}});
continue;
}
// bool start_end_ready_flag = (input_name_str == "START" || input_name_str
// == "END"
// || input_name_str == "READY");
// int shape_dims_start = (int) (sequence_batching_enabled &&
// !start_end_ready_flag);
// The shape for the entire input batch, [total_batch_size, ...]
std::vector<int64_t> batchn_shape(input_shape, input_shape + input_dims_count);
if (max_batch_size != 0) {
batchn_shape[0] = total_batch_size;
}
std::vector<size_t> batchn_shape_2(input_shape, input_shape + input_dims_count);
if (max_batch_size != 0) {
batchn_shape_2[0] = total_batch_size;
}
// std::vector<int64_t> batchn_shape(
// input_shape + shape_dims_start, input_shape + input_dims_count);
// if (max_batch_size != 0 && !start_end_ready_flag) {
// batchn_shape[0] = total_batch_size;
// }
// std::vector<size_t> batchn_shape_2(
// input_shape + shape_dims_start, input_shape + input_dims_count);
// if (max_batch_size != 0 && !start_end_ready_flag) {
// batchn_shape_2[0] = total_batch_size;
// }
// The input must be in contiguous CPU/GPU memory.
const int64_t batchn_byte_size = GetByteSize(input_datatype, batchn_shape);
// Always host memory
BackendMemory* input_memory;
RESPOND_ALL_AND_RETURN_IF_ERROR(responses,
request_count,
BackendMemory::Create(model_state_->TritonMemoryManager(),
{BackendMemory::AllocationType::CPU},
0,
batchn_byte_size,
&input_memory));
input_memories->push_back(input_memory);
TRITONSERVER_MemoryType memory_type = input_memory->MemoryType();
int64_t memory_type_id = input_memory->MemoryTypeId();
char* input_buffer = input_memory->MemoryPtr();
collector->ProcessTensor(input_name, input_buffer, batchn_byte_size, memory_type, memory_type_id);
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("collect name: ") + input_name + " size: " + std::to_string(batchn_byte_size) + " bytes")
.c_str());
(*input_tensors)
->insert({std::string(input_name),
triton::Tensor{TRITONSERVER_MEMORY_CPU, input_datatype, batchn_shape_2, input_buffer}});
}
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("the data is in ") + (*cuda_copy ? std::string("GPU") : std::string("CPU"))).c_str());
// Finalize...
*cuda_copy |= collector->Finalize();
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("the data is in ") + (*cuda_copy ? std::string("GPU") : std::string("CPU"))).c_str());
}
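// Writes the FT output tensors back into the Triton responses through a
// BackendOutputResponder. The output buffers live on the first GPU of this
// instance, so the CUDA output stream is synchronized if any copy was
// scheduled on it.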
void ModelInstanceState::ReadOutputTensors(size_t total_batch_size,
std::shared_ptr<std::unordered_map<std::string, Tensor>> output_tensors,
TRITONBACKEND_Request** requests,
const uint32_t request_count,
std::vector<TRITONBACKEND_Response*>* responses)
{
BackendOutputResponder responder(requests,
request_count,
responses,
model_state_->MaxBatchSize(),
model_state_->TritonMemoryManager(),
model_state_->EnablePinnedInput(),
output_stream_);
bool cuda_copy = false;
// bool sequence_batching_enabled = model_state_->SequenceBatchingEnabled();
std::vector<std::vector<char>> string_buffers;
int idx = 0;
for (auto it = output_tensors->begin(); it != output_tensors->end(); ++it) {
LOG_MESSAGE(
TRITONSERVER_LOG_VERBOSE,
(std::string("Get output_tensors ") + std::to_string(idx) + std::string(": ") + std::string(it->first))
.c_str());
idx++;
auto& output = it->second;
// Verify output datatype matches datatype from model config
TRITONSERVER_DataType output_dtype = output.type;
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string(" output_type: ") + TRITONSERVER_DataTypeString(output_dtype)).c_str());
const char* output_buffer = static_cast<const char*>(output.data);
// Set output shape
// std::vector<int64_t> batchn_shape = sequence_batching_enabled ?
// std::vector<int64_t>{1} :
// std::vector<int64_t>{};
std::vector<int64_t> batchn_shape;
if (TRITONSERVER_LogIsEnabled(TRITONSERVER_LOG_VERBOSE)) {
// std::string batch_shape_str = sequence_batching_enabled ? " output
// shape: [1, " :
// " output shape: [";
std::string batch_shape_str = " output shape: [";
for (uint i = 0; i < output.shape.size(); i++) {
batchn_shape.push_back(output.shape[i]);
batch_shape_str = batch_shape_str + std::to_string(output.shape[i]);
if (i != output.shape.size() - 1) {
batch_shape_str = batch_shape_str + ", ";
}
else {
batch_shape_str = batch_shape_str + "]";
}
}
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, batch_shape_str.c_str());
}
else {
batchn_shape.insert(batchn_shape.end(), output.shape.begin(), output.shape.end());
}
responder.ProcessTensor(it->first,
output_dtype,
batchn_shape,
output_buffer,
TRITONSERVER_MEMORY_GPU,
model_instance_device_id_start_);
}
// Finalize and wait for any pending buffer copies.
cuda_copy |= responder.Finalize();
#ifdef TRITON_ENABLE_GPU
if (cuda_copy) {
cudaStreamSynchronize(output_stream_);
}
#endif // TRITON_ENABLE_GPU
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("PERFORMED GPU copy: ") + (cuda_copy ? std::string("YES") : std::string("NO"))).c_str());
}
/////////////
extern "C" {
TRITONSERVER_Error* TRITONBACKEND_Initialize(TRITONBACKEND_Backend* backend)
{
int provided;
ft::mpi::initThread(nullptr, nullptr, ft::mpi::THREAD_MULTIPLE, &provided);
const char* cname;
RETURN_IF_ERROR(TRITONBACKEND_BackendName(backend, &cname));
std::string name(cname);
LOG_MESSAGE(TRITONSERVER_LOG_INFO, (std::string("TRITONBACKEND_Initialize: ") + name).c_str());
// Check the backend API version that Triton supports vs. what this
// backend was compiled against.
uint32_t api_version_major, api_version_minor;
RETURN_IF_ERROR(TRITONBACKEND_ApiVersion(&api_version_major, &api_version_minor));
LOG_MESSAGE(TRITONSERVER_LOG_INFO,
(std::string("Triton TRITONBACKEND API version: ") + std::to_string(api_version_major) + "."
+ std::to_string(api_version_minor))
.c_str());
LOG_MESSAGE(TRITONSERVER_LOG_INFO,
(std::string("'") + name
+ "' TRITONBACKEND API version: " + std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "."
+ std::to_string(TRITONBACKEND_API_VERSION_MINOR))
.c_str());
if ((api_version_major != TRITONBACKEND_API_VERSION_MAJOR)
|| (api_version_minor < TRITONBACKEND_API_VERSION_MINOR)) {
return TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_UNSUPPORTED,
(std::string("Triton TRITONBACKEND API version: ") + std::to_string(api_version_major) + "."
+ std::to_string(api_version_minor) + " does not support '" + name + "' TRITONBACKEND API version: "
+ std::to_string(TRITONBACKEND_API_VERSION_MAJOR) + "." + std::to_string(TRITONBACKEND_API_VERSION_MINOR))
.c_str());
}
return nullptr; // success
}
TRITONSERVER_Error* TRITONBACKEND_ModelInitialize(TRITONBACKEND_Model* model)
{
const char* cname;
RETURN_IF_ERROR(TRITONBACKEND_ModelName(model, &cname));
std::string name(cname);
uint64_t version;
RETURN_IF_ERROR(TRITONBACKEND_ModelVersion(model, &version));
LOG_MESSAGE(
TRITONSERVER_LOG_INFO,
(std::string("TRITONBACKEND_ModelInitialize: ") + name + " (version " + std::to_string(version) + ")").c_str());
// Create a ModelState object and associate it with the
// TRITONBACKEND_Model.
ModelState* model_state;
RETURN_IF_ERROR(ModelState::Create(model, &model_state));
RETURN_IF_ERROR(TRITONBACKEND_ModelSetState(model, reinterpret_cast<void*>(model_state)));
return nullptr; // success
}
TRITONSERVER_Error* TRITONBACKEND_ModelFinalize(TRITONBACKEND_Model* model)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vstate);
LOG_MESSAGE(TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: delete model state");
delete model_state;
LOG_MESSAGE(TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelFinalize: MPI Finalize");
ft::mpi::finalize();
return nullptr; // success
}
TRITONSERVER_Error* TRITONBACKEND_ModelInstanceInitialize(TRITONBACKEND_ModelInstance* instance)
{
int node_id = ft::mpi::getCommWorldRank();
const char* cname;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceName(instance, &cname));
std::string name(cname);
// Get the model state associated with this instance's model.
TRITONBACKEND_Model* model;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceModel(instance, &model));
void* vmodelstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelState(model, &vmodelstate));
ModelState* model_state = reinterpret_cast<ModelState*>(vmodelstate);
// Create a ModelInstanceState object and associate it with the
// TRITONBACKEND_ModelInstance.
ModelInstanceState* instance_state;
RETURN_IF_ERROR(ModelInstanceState::Create(model_state, instance, &instance_state));
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceSetState(instance, reinterpret_cast<void*>(instance_state)));
int model_instance_id = instance_state->GetModelInstanceId();
int model_instance_count = instance_state->GetModelInstanceCount();
LOG_MESSAGE(TRITONSERVER_LOG_INFO,
(std::string("TRITONBACKEND_ModelInstanceInitialize: ") + name + " (count "
+ std::to_string(model_instance_count) + ")" + " (instance_id " + std::to_string(model_instance_id)
+ ")")
.c_str());
if (node_id) {
while (true) {
instance_state->Execute(
nullptr, nullptr, 0, std::shared_ptr<std::unordered_map<std::string, Tensor>>(nullptr));
}
}
return nullptr; // success
}
TRITONSERVER_Error* TRITONBACKEND_ModelInstanceFinalize(TRITONBACKEND_ModelInstance* instance)
{
void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
ModelInstanceState* instance_state = reinterpret_cast<ModelInstanceState*>(vstate);
LOG_MESSAGE(TRITONSERVER_LOG_INFO, "TRITONBACKEND_ModelInstanceFinalize: delete instance state");
delete instance_state;
return nullptr; // success
}
TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute(TRITONBACKEND_ModelInstance* instance,
TRITONBACKEND_Request** requests,
const uint32_t request_count)
{
// Triton will not call this function simultaneously for the same
// 'instance'. But since this backend could be used by multiple
// instances from multiple models the implementation needs to handle
// multiple calls to this function at the same time (with different
// 'instance' objects). Suggested practice for this is to use only
// function-local and model-instance-specific state (obtained from
// 'instance'), which is what we do here.
ModelInstanceState* instance_state;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, reinterpret_cast<void**>(&instance_state)));
ModelState* model_state = instance_state->StateForModel();
// This backend specifies BLOCKING execution policy. That means that
// we should not return from this function until execution is
// complete. Triton will automatically release 'instance' on return
// from this function so that it is again available to be used for
// another call to TRITONBACKEND_ModelInstanceExecute.
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE,
(std::string("model ") + model_state->Name() + ", instance " + instance_state->Name() + ", executing "
+ std::to_string(request_count) + " requests")
.c_str());
// At this point we accept ownership of 'requests', which means that
// even if something goes wrong we must still return success from
// this function. If something does go wrong in processing a
// particular request then we send an error response just for the
// specific request.
instance_state->ProcessRequests(requests, request_count);
return nullptr; // success
}
} // extern "C"
} // namespace fastertransformer_backend
} // namespace backend
} // namespace triton
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
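# Linker version script for the backend shared library: only the
# TRITONBACKEND_* entry points are exported; all other symbols stay local.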
{
global:
TRITONBACKEND_*;
local: *;
};
# Copyright (c) OpenMMLab. All rights reserved.
# Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/CMakeLists.txt
cmake_minimum_required(VERSION 3.8)
set(llama_triton_backend_files
LlamaTritonModel.cc
LlamaTritonModelInstance.cc
)
add_library(LlamaTritonBackend STATIC ${llama_triton_backend_files})
set_property(TARGET LlamaTritonBackend PROPERTY POSITION_INDEPENDENT_CODE ON)
target_link_libraries(LlamaTritonBackend PRIVATE TransformerTritonBackend Llama tensor memory_utils -lcublasLt)
target_compile_features(LlamaTritonBackend PRIVATE cxx_std_14)
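# (Illustrative, not part of this build) a consumer target would link against
# the static library roughly like this:
#   add_executable(llama_triton_example example.cc)
#   target_link_libraries(llama_triton_example PRIVATE LlamaTritonBackend)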
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.cc
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModel.h"
#include "3rdparty/INIReader.h"
#include "src/fastertransformer/models/llama/LlamaInstanceComm.h"
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/allocator.h"
#include <mutex>
namespace ft = fastertransformer;
std::shared_ptr<AbstractTransformerModel> AbstractTransformerModel::createLlamaModel(std::string inifile)
{
INIReader reader = INIReader(inifile);
if (reader.ParseError() < 0) {
std::cout << "[ERROR] Can't load '" << inifile << "'\n";
return nullptr;
}
const std::string data_type = reader.Get("ft_instance_hyperparameter", "data_type");
int tensor_para_size = reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size");
std::string model_dir = reader.Get("ft_instance_hyperparameter", "model_dir");
if (data_type == "half" || data_type == "fp16") {
return std::make_shared<LlamaTritonModel<half>>(
reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"),
reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"),
reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0),
model_dir);
}
else {
return std::make_shared<LlamaTritonModel<float>>(
reader.GetInteger("ft_instance_hyperparameter", "tensor_para_size"),
reader.GetInteger("ft_instance_hyperparameter", "pipeline_para_size"),
reader.GetInteger("ft_instance_hyperparameter", "enable_custom_all_reduce", 0),
model_dir);
}
}
template<typename T>
void LlamaTritonModel<T>::handleMissingParams()
{
if (!max_batch_size_) {
max_batch_size_ = 32;
FT_LOG_WARNING("[LlamaTritonModel] `max_batch_size` is not set, default to %d.", (int)max_batch_size_);
}
if (!session_len_) {
session_len_ = 2160;
FT_LOG_WARNING("[LlamaTritonModel] `session_len` is not set, default to %d.", (int)session_len_);
}
if (!max_context_token_num_) {
max_context_token_num_ = (int)std::sqrt(max_batch_size_);
FT_LOG_WARNING("[LlamaTritonModel] `max_context_token_num` is not set, default to %d.",
(int)max_context_token_num_);
}
if (!step_length_) {
step_length_ = 1;
FT_LOG_WARNING("[LlamaTritonModel] `step_length` is not set, default to %d.", (int)step_length_);
}
if (!cache_max_entry_count_) {
cache_max_entry_count_ = 32;
FT_LOG_WARNING("[LlamaTritonModel] `cache_max_entry_count` is not set, default to %d.",
(int)cache_max_entry_count_);
}
if (!cache_chunk_size_) {
cache_chunk_size_ = cache_max_entry_count_;
FT_LOG_WARNING("[LlamaTritonModel] `cache_chunk_size` is not set, default to %d.", (int)cache_chunk_size_);
}
}
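// The constructor below reads `<model_dir>/config.ini`, while createLlamaModel()
// above reads the `ft_instance_hyperparameter` section of the ini file passed to
// it. A minimal sketch of such a config, with key names taken from the code and
// values that are purely illustrative:
//
//   [ft_instance_hyperparameter]
//   data_type                = fp16
//   tensor_para_size         = 1
//   pipeline_para_size       = 1
//   enable_custom_all_reduce = 0
//   model_dir                = /workspace/models/llama
//
//   [llama]
//   model_name       = llama
//   head_num         = 32
//   size_per_head    = 128
//   inter_size       = 11008
//   num_layer        = 32
//   vocab_size       = 32000
//   rotary_embedding = 128
//   norm_eps         = 1e-6
//   start_id         = 1
//   end_id           = 2
//   weight_type      = fp16
//   # optional; handleMissingParams() supplies defaults for unset values
//   max_batch_size        = 32
//   session_len           = 2048
//   max_context_token_num = 0
//   step_length           = 1
//   cache_max_entry_count = 32
//   cache_chunk_size      = 0
//   use_context_fmha      = 1
//   prefix_cache_len      = 0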
template<typename T>
LlamaTritonModel<T>::LlamaTritonModel(size_t tensor_para_size,
size_t pipeline_para_size,
int enable_custom_all_reduce,
std::string model_dir):
tensor_para_size_(tensor_para_size),
pipeline_para_size_(pipeline_para_size),
shared_weights_(std::vector<std::shared_ptr<ft::LlamaWeight<T>>>(ft::getDeviceCount())),
enable_custom_all_reduce_(enable_custom_all_reduce)
{
model_dir_ = model_dir;
const std::string inifile{model_dir + "/config.ini"};
INIReader reader = INIReader(inifile);
if (reader.ParseError() < 0) {
std::cout << "[ERROR] Can't load '" << inifile << "'\n";
ft::FT_CHECK(false);
}
model_name_ = reader.Get("llama", "model_name");
head_num_ = reader.GetInteger("llama", "head_num");
size_per_head_ = reader.GetInteger("llama", "size_per_head");
inter_size_ = reader.GetInteger("llama", "inter_size");
num_layer_ = reader.GetInteger("llama", "num_layer");
vocab_size_ = reader.GetInteger("llama", "vocab_size");
rotary_embedding_dim_ = reader.GetInteger("llama", "rotary_embedding");
norm_eps_ = reader.GetFloat("llama", "norm_eps");
start_id_ = reader.GetInteger("llama", "start_id");
end_id_ = reader.GetInteger("llama", "end_id");
max_batch_size_ = reader.GetInteger("llama", "max_batch_size", 0);
max_context_token_num_ = reader.GetInteger("llama", "max_context_token_num", 0);
session_len_ = reader.GetInteger("llama", "session_len", 0);
step_length_ = reader.GetInteger("llama", "step_length", 0);
cache_max_entry_count_ = reader.GetInteger("llama", "cache_max_entry_count", 0);
use_context_fmha_ = reader.GetInteger("llama", "use_context_fmha", 1);
cache_chunk_size_ = reader.GetInteger("llama", "cache_chunk_size", 0);
prefix_cache_len_ = reader.GetInteger("llama", "prefix_cache_len", 0);
handleMissingParams();
if (max_context_token_num_ <= max_batch_size_) {
max_context_token_num_ *= session_len_;
}
shared_state_ = std::make_shared<typename ft::LlamaV2<T>::SharedState>();
shared_state_->barrier = std::make_shared<ft::Barrier>(tensor_para_size);
const auto device_count = ft::getDeviceCount();
shared_instances_.resize(device_count);
shared_mutexes_.resize(device_count);
const std::string weight_type_str = reader.Get("llama", "weight_type");
if (weight_type_str == "fp16") {
weight_type_ = ft::WeightType::kFP16;
}
else if (weight_type_str == "fp32") {
weight_type_ = ft::WeightType::kFP32;
}
else if (weight_type_str == "int8") {
weight_type_ = ft::WeightType::kINT8;
}
else if (weight_type_str == "int4") {
weight_type_ = ft::WeightType::kINT4;
}
else {
std::cout << "[ERROR] Unsupported weight type: '" << weight_type_str << "'\n";
ft::FT_CHECK(0);
}
}
template<typename T>
std::unique_ptr<LlamaTritonSharedModelInstance<T>> LlamaTritonModel<T>::createSharedModelInstance(
int device_id,
int rank,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm)
{
ft::check_cuda_error(cudaSetDevice(device_id));
const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_);
std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator(
new ft::Allocator<ft::AllocatorType::CUDA>(device_id));
/// TODO: this stream handle is leaked
cudaStream_t stream{};
ft::check_cuda_error(cudaStreamCreate(&stream));
allocator->setStream(stream);
cublasHandle_t cublas_handle;
cublasLtHandle_t cublaslt_handle;
cublasCreate(&cublas_handle);
cublasLtCreate(&cublaslt_handle);
cublasSetStream(cublas_handle, stream);
std::unique_ptr<ft::cublasAlgoMap> cublas_algo_map(new ft::cublasAlgoMap("gemm_config.in"));
std::unique_ptr<std::mutex> cublas_wrapper_mutex(new std::mutex());
std::unique_ptr<ft::cublasMMWrapper> cublas_wrapper(new ft::cublasMMWrapper(
cublas_handle, cublaslt_handle, stream, cublas_algo_map.get(), cublas_wrapper_mutex.get(), allocator.get()));
std::unique_ptr<cudaDeviceProp> cuda_device_prop_ptr(new cudaDeviceProp);
ft::check_cuda_error(cudaGetDeviceProperties(cuda_device_prop_ptr.get(), device_id));
if (std::is_same<T, half>::value) {
cublas_wrapper->setGemmConfig(CUDA_R_16F, CUDA_R_16F, CUDA_R_16F, CUDA_R_32F);
}
else if (std::is_same<T, float>::value) {
cublas_wrapper->setFP32GemmConfig();
}
ft::NcclParam tensor_para = nccl_params.first[comms_rank];
ft::NcclParam pipeline_para = nccl_params.second[comms_rank];
ft::FT_CHECK(tensor_para.world_size_ == tensor_para_size_);
ft::FT_CHECK(pipeline_para.world_size_ == pipeline_para_size_);
auto llama = std::make_unique<ft::LlamaV2<T>>(head_num_,
size_per_head_,
inter_size_,
num_layer_,
vocab_size_,
rotary_embedding_dim_,
norm_eps_,
max_batch_size_,
max_context_token_num_,
session_len_,
step_length_,
start_id_,
end_id_,
cache_max_entry_count_,
cache_chunk_size_,
use_context_fmha_,
shared_state_,
shared_weights_[device_id].get(),
tensor_para,
stream,
cublas_wrapper.get(),
allocator.get(),
false, // is_free_buffer_after_forward,
cuda_device_prop_ptr.get());
return std::make_unique<LlamaTritonSharedModelInstance<T>>(
LlamaTritonSharedModelInstance<T>{std::move(llama),
shared_weights_[device_id],
std::move(allocator),
std::move(cublas_algo_map),
std::move(cublas_wrapper_mutex),
std::move(cublas_wrapper),
std::move(cuda_device_prop_ptr),
session_len_});
}
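// Creates a Triton-facing model instance bound to `device_id`. The heavyweight
// engine state (LlamaV2, weights, cuBLAS handles) is shared per device: it is
// cached in shared_instances_ as a weak_ptr and rebuilt by
// createSharedModelInstance() only after every previous strong reference has
// been released.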
template<typename T>
std::unique_ptr<AbstractTransformerModelInstance>
LlamaTritonModel<T>::createModelInstance(int device_id,
int rank,
cudaStream_t stream,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm)
{
ft::check_cuda_error(cudaSetDevice(device_id));
// const int comms_rank = device_id % (tensor_para_size_ * pipeline_para_size_);
std::shared_ptr<LlamaTritonSharedModelInstance<T>> instance;
{
std::lock_guard<std::mutex> lock(shared_mutexes_[device_id]);
instance = shared_instances_[device_id].lock();
if (!instance) {
instance = createSharedModelInstance(device_id, rank, nccl_params, custom_all_reduce_comm);
shared_instances_[device_id] = instance;
}
}
std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator(
new ft::Allocator<ft::AllocatorType::CUDA>(device_id));
allocator->setStream(stream);
return std::make_unique<LlamaTritonModelInstance<T>>(instance, std::move(allocator));
}
template<typename T>
void LlamaTritonModel<T>::createSharedWeights(int device_id, int rank)
{
ft::check_cuda_error(cudaSetDevice(device_id));
const int tensor_para_rank = rank % tensor_para_size_;
const int pipeline_para_rank = rank / tensor_para_size_;
ft::FT_CHECK(pipeline_para_size_ == 1 && pipeline_para_rank == 0);
shared_weights_[device_id] = std::make_shared<ft::LlamaWeight<T>>(head_num_ * size_per_head_,
inter_size_,
vocab_size_,
num_layer_,
weight_type_,
tensor_para_size_,
tensor_para_rank,
prefix_cache_len_);
shared_weights_[device_id]->loadModel(model_dir_);
return;
}
template<typename T>
std::string LlamaTritonModel<T>::toString()
{
std::stringstream ss;
ss << "Model: "
<< "\nhead_num: " << head_num_ << "\nsize_per_head: " << size_per_head_ << "\ninter_size: " << inter_size_
<< "\nnum_layer: " << num_layer_ << "\nvocab_size: " << vocab_size_ << "\nmax_batch_size: " << max_batch_size_
<< "\nmax_context_token_num: " << max_context_token_num_ << "\nsession_len: " << session_len_
<< "\nstep_length: " << step_length_ << "\ncache_max_entry_count: " << cache_max_entry_count_
<< "\ncache_chunk_size: " << cache_chunk_size_ << "\nuse_context_fmha: " << use_context_fmha_
<< "\nstart_id: " << start_id_ << "\ntensor_para_size: " << tensor_para_size_
<< "\npipeline_para_size: " << pipeline_para_size_ << "\nenable_custom_all_reduce: " << enable_custom_all_reduce_
<< "\nmodel_name: " << model_name_ << "\nprefix_cache_len: " << prefix_cache_len_
<< "\nmodel_dir: " << model_dir_ << std::endl;
return ss.str();
}
template<typename T>
void LlamaTritonModel<T>::createCustomComms(
std::vector<std::shared_ptr<ft::AbstractCustomComm>>* custom_all_reduce_comms, int world_size)
{
using commDataType = typename ft::CustomARCommTypeConverter<T>::Type;
ft::initCustomAllReduceComm<commDataType>(custom_all_reduce_comms, enable_custom_all_reduce_, world_size);
}
template<typename T>
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>
LlamaTritonModel<T>::createNcclParams(const int node_id, const int device_id_start, const bool multi_node)
{
const auto device_count = ft::getDeviceCount();
bool need_nccl_params = false;
// create nccl group when there are non-occupied devices
for (int i = 0; i < device_count; ++i) {
std::lock_guard<std::mutex> lock(shared_mutexes_[i]);
if (shared_instances_[i].expired()) {
need_nccl_params = true;
break;
}
}
if (need_nccl_params) {
return AbstractTransformerModel::createNcclParams(node_id, device_id_start, multi_node);
}
else {
FT_LOG_INFO("Skipping NCCL param creation.");
const int tensor_para_size = getTensorParaSize();
const int pipeline_para_size = getPipelineParaSize();
const int local_comm_size = multi_node ? device_count : tensor_para_size * pipeline_para_size;
std::vector<ft::NcclParam> tensor_para_params(local_comm_size);
std::vector<ft::NcclParam> pipeline_para_params(local_comm_size);
return {std::move(tensor_para_params), std::move(pipeline_para_params)};
}
}
template<typename T>
std::unique_ptr<ft::AbstractInstanceComm> LlamaTritonModel<T>::createInstanceComm(int size)
{
return std::make_unique<ft::LlamaInstanceComm>(size);
}
template<typename T>
int LlamaTritonModel<T>::getTensorParaSize()
{
return tensor_para_size_;
}
template<typename T>
int LlamaTritonModel<T>::getPipelineParaSize()
{
return pipeline_para_size_;
}
template struct LlamaTritonModel<float>;
template struct LlamaTritonModel<half>;
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
#pragma once
#include "src/fastertransformer/models/llama/LlamaV2.h"
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/utils/cuda_utils.h"
#include "src/fastertransformer/utils/custom_ar_comm.h"
#include "src/fastertransformer/utils/nccl_utils.h"
#include <cuda_fp16.h>
#include <mutex>
namespace ft = fastertransformer;
template<typename T>
struct LlamaTritonSharedModelInstance;
template<typename T>
struct LlamaTritonModel: public AbstractTransformerModel {
LlamaTritonModel(size_t tensor_para_size,
size_t pipeline_para_size,
int enable_custom_all_reduce,
std::string model_dir);
~LlamaTritonModel() = default;
std::unique_ptr<AbstractTransformerModelInstance>
createModelInstance(int deviceId,
int rank,
cudaStream_t stream,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr) override;
void createSharedWeights(int deviceId, int rank) override;
void createCustomComms(std::vector<std::shared_ptr<ft::AbstractCustomComm>>* custom_all_reduce_comms,
int world_size) override;
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>>
createNcclParams(const int node_id, const int device_id_start, const bool multi_node) override;
std::unique_ptr<ft::AbstractInstanceComm> createInstanceComm(int size) override;
void handleMissingParams();
std::string toString() override;
int getTensorParaSize() override;
int getPipelineParaSize() override;
private:
std::unique_ptr<LlamaTritonSharedModelInstance<T>>
createSharedModelInstance(int deviceId,
int rank,
std::pair<std::vector<ft::NcclParam>, std::vector<ft::NcclParam>> nccl_params,
std::shared_ptr<ft::AbstractCustomComm> custom_all_reduce_comm = nullptr);
size_t head_num_;
size_t size_per_head_;
size_t inter_size_;
size_t num_layer_;
size_t vocab_size_;
size_t rotary_embedding_dim_;
float norm_eps_;
int max_batch_size_;
int max_context_token_num_;
int session_len_;
int step_length_;
int start_id_;
int end_id_;
int cache_max_entry_count_;
int cache_chunk_size_;
int use_context_fmha_;
size_t tensor_para_size_;
size_t pipeline_para_size_;
ft::WeightType weight_type_;
size_t prefix_cache_len_{};
// shared weights for each device
std::vector<std::shared_ptr<ft::LlamaWeight<T>>> shared_weights_;
std::shared_ptr<typename ft::LlamaV2<T>::SharedState> shared_state_;
// weak_ptr is used so that the instances get released when all strong references are gone
std::vector<std::weak_ptr<LlamaTritonSharedModelInstance<T>>> shared_instances_;
std::deque<std::mutex> shared_mutexes_; // is locking really needed?
// // residual type
// bool use_gptj_residual_ = true;
// // number of tasks (for prefix-prompt, p/prompt-tuning)
// size_t num_tasks_ = 0;
// int prompt_learning_start_id_ = 0;
// ft::PromptLearningType prompt_learning_type_ = ft::PromptLearningType::no_prompt;
// std::map<std::string, std::pair<int, int>> prompt_learning_table_pair_ = {};
bool is_fp16_;
int enable_custom_all_reduce_ = 0;
std::string model_name_;
std::string model_dir_;
};
/*
* Copyright (c) OpenMMLab. All rights reserved.
* Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Modified from https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h
#include "src/fastertransformer/triton_backend/llama/LlamaTritonModelInstance.h"
#include "src/fastertransformer/triton_backend/transformer_triton_backend.hpp"
#include "src/fastertransformer/triton_backend/triton_utils.hpp"
#include "src/fastertransformer/utils/Tensor.h"
#include "src/fastertransformer/utils/cuda_utils.h"
#include <algorithm>
#include <functional>
#include <numeric>
#include <sstream>
#include <unordered_map>
#include <vector>
namespace ft = fastertransformer;
template<typename T>
void triton_stream_callback(std::unordered_map<std::string, ft::Tensor>* output_tensors, void* ctx)
{
LlamaTritonModelInstance<T>* model = reinterpret_cast<LlamaTritonModelInstance<T>*>(ctx);
auto result = LlamaTritonModelInstance<T>::convert_outputs(*output_tensors);
model->stream_cb_(result, model->stream_ctx_);
}
template<typename T>
LlamaTritonModelInstance<T>::LlamaTritonModelInstance(
std::shared_ptr<LlamaTritonSharedModelInstance<T>> instance,
std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator):
instance_(std::move(instance)), allocator_(std::move(allocator))
{
}
template<typename T>
std::unordered_map<std::string, ft::Tensor> LlamaTritonModelInstance<T>::convert_inputs(
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
move_tensor_H2D(input_tensors->at("input_ids"), d_input_ids_, &allocator_);
move_tensor_H2D(input_tensors->at("input_lengths"), d_input_lengths_, &allocator_);
const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
const size_t input_data_len = input_tensors->at("input_ids").shape[1];
// freed in forward()
h_total_output_lengths_ = reinterpret_cast<uint32_t*>(malloc(request_batch_size * sizeof(uint32_t)));
std::unordered_map<std::string, ft::Tensor> ft_input_tensors = std::unordered_map<std::string, ft::Tensor>{
{"input_ids", as_GPU_tensor(input_tensors->at("input_ids"), d_input_ids_)},
// {"input_lengths", as_GPU_tensor(input_tensors->at("input_lengths"), d_input_lengths_)},
};
if (input_tensors->find("bad_words_list") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("bad_words_list"), d_input_bad_words_, &allocator_);
ft_input_tensors.insert(
{"bad_words_list", as_GPU_tensor(input_tensors->at("bad_words_list"), d_input_bad_words_)});
}
if (input_tensors->find("stop_words_list") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("stop_words_list"), d_input_stop_words_, &allocator_);
ft_input_tensors.insert(
{"stop_words_list", as_GPU_tensor(input_tensors->at("stop_words_list"), d_input_stop_words_)});
}
if (input_tensors->count("request_prompt_embedding") && input_tensors->count("request_prompt_lengths")
&& input_tensors->count("request_prompt_type")) {
move_tensor_H2D(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_, &allocator_);
ft_input_tensors.insert(
{"request_prompt_lengths",
as_GPU_tensor(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_)});
move_tensor_H2D(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_, &allocator_);
ft_input_tensors.insert(
{"request_prompt_embedding",
as_GPU_tensor(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_)});
}
if (input_tensors->find("top_p_decay") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("top_p_decay"), d_top_p_decay_, &allocator_);
ft_input_tensors.insert({"top_p_decay", as_GPU_tensor(input_tensors->at("top_p_decay"), d_top_p_decay_)});
}
if (input_tensors->find("top_p_min") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("top_p_min"), d_top_p_min_, &allocator_);
ft_input_tensors.insert({"top_p_min", as_GPU_tensor(input_tensors->at("top_p_min"), d_top_p_min_)});
}
if (input_tensors->find("top_p_reset_ids") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("top_p_reset_ids"), d_top_p_reset_ids_, &allocator_);
ft_input_tensors.insert(
{"top_p_reset_ids", as_GPU_tensor(input_tensors->at("top_p_reset_ids"), d_top_p_reset_ids_)});
}
for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) {
if (t->first.find("input_ids") == std::string::npos // && t->first.find("input_lengths") == std::string::npos
&& t->first.find("output_seq_len") == std::string::npos
&& t->first.find("prefix_soft_prompt_embedding") == std::string::npos
&& t->first.find("prefix_soft_prompt_lengths") == std::string::npos) {
if (ft_input_tensors.count(t->first) == 0) {
ft_input_tensors.insert({t->first, t->second.convertTritonTensorToFt()});
}
}
}
return ft_input_tensors;
}
template<typename T>
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
LlamaTritonModelInstance<T>::convert_outputs(const std::unordered_map<std::string, ft::Tensor>& output_tensors)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
std::unordered_map<std::string, triton::Tensor>* outputs_mapping =
new std::unordered_map<std::string, triton::Tensor>();
for (auto it = output_tensors.begin(); it != output_tensors.end(); it++) {
outputs_mapping->insert({it->first, triton::Tensor::convertFtTensorToTriton(it->second)});
}
return std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>(outputs_mapping);
}
template<typename T>
std::shared_ptr<std::vector<triton::Tensor>>
LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::vector<triton::Tensor>> input_tensors)
{
ft::FT_CHECK(false);
return nullptr;
}
template<typename T>
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
{
ft::FT_CHECK(false);
return nullptr;
}
template<typename T>
std::string format_vector(const std::vector<T>& vec)
{
std::stringstream ss;
ss << "[";
bool first = true;
for (const auto& x : vec) {
ss << (first ? "" : ", ") << x;
first = false;
}
ss << "]";
return ss.str();
}
template<typename T>
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors,
ft::AbstractInstanceComm* instance_comm)
{
FT_LOG_DEBUG(__PRETTY_FUNCTION__);
// for (const auto& kv : *input_tensors) {
// FT_LOG_INFO("%s: %s", kv.first.c_str(), format_vector(kv.second.shape).c_str());
// }
FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape.size() == 2,
"input_tensors->at(\"input_ids\").shape.size() == 2");
FT_CHECK_WITH_INFO(input_tensors->at("input_lengths").shape.size() == 1,
"input_tensors->at(\"input_lengths\").shape.size() == 1");
const uint32_t request_batch_size = input_tensors->at("input_ids").shape[0];
const uint32_t max_request_output_len = (size_t)*std::max_element(
(int*)input_tensors->at("request_output_len").data,
(int*)input_tensors->at("request_output_len").data + input_tensors->at("request_output_len").shape[0]);
// const uint32_t total_output_len = max_request_output_len + input_tensors->at("input_ids").shape[1];
const uint32_t beam_width =
input_tensors->count("beam_width") ? (size_t)(*(uint*)input_tensors->at("beam_width").data) : 1;
FT_CHECK_WITH_INFO(beam_width == 1, "Beam search is not implemented");
std::unordered_map<std::string, ft::Tensor> ft_input_tensors = convert_inputs(input_tensors);
allocateBuffer(request_batch_size, beam_width, instance_->session_len);
std::unordered_map<std::string, ft::Tensor> output_tensors = std::unordered_map<std::string, ft::Tensor>{
{"output_ids",
ft::Tensor{ft::MEMORY_GPU,
ft::TYPE_UINT32,
std::vector<size_t>{request_batch_size, beam_width, (size_t)instance_->session_len},
d_output_ids_}},
{"sequence_length",
ft::Tensor{ft::MEMORY_GPU,
ft::TYPE_UINT32,
std::vector<size_t>{request_batch_size, beam_width},
d_sequence_lengths_}}};
if (input_tensors->count("is_return_log_probs") && *((bool*)input_tensors->at("is_return_log_probs").data)) {
output_tensors.insert({"output_log_probs",
ft::Tensor{ft::MEMORY_GPU,
ft::TYPE_FP32,
std::vector<size_t>{request_batch_size, beam_width, max_request_output_len},
d_output_log_probs_}});
output_tensors.insert({"cum_log_probs",
ft::Tensor{ft::MEMORY_GPU,
ft::TYPE_FP32,
std::vector<size_t>{request_batch_size, beam_width},
d_cum_log_probs_}});
}
try {
ft::Request::Callback callback;
if (stream_cb_) {
callback = [this](std::unordered_map<std::string, ft::Tensor>* outputs) {
triton_stream_callback<T>(outputs, this);
};
}
ft::check_cuda_error(cudaStreamSynchronize(allocator_->returnStream()));
instance_->llm->forward(&output_tensors, &ft_input_tensors, {instance_comm, callback});
// ! stream synced by the model before returning
}
catch (...) {
h_exception_ = std::current_exception();
output_tensors.insert({"error_message", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_BYTES, {1}, &h_exception_}});
}
if (h_total_output_lengths_ != nullptr) {
free(h_total_output_lengths_);
h_total_output_lengths_ = nullptr;
}
return convert_outputs(output_tensors);
}
template<typename T>
LlamaTritonModelInstance<T>::~LlamaTritonModelInstance()
{
freeBuffer();
}
template<typename T>
void LlamaTritonModelInstance<T>::allocateBuffer(const size_t request_batch_size,
const size_t beam_width,
const size_t session_len)
{
d_output_ids_ =
(int*)(allocator_->reMalloc(d_output_ids_, sizeof(int) * request_batch_size * beam_width * session_len, false));
d_sequence_lengths_ =
(int*)(allocator_->reMalloc(d_sequence_lengths_, sizeof(int) * request_batch_size * beam_width, false));
d_output_log_probs_ = (float*)(allocator_->reMalloc(
d_output_log_probs_, sizeof(float) * request_batch_size * beam_width * session_len, false));
d_cum_log_probs_ =
(float*)(allocator_->reMalloc(d_cum_log_probs_, sizeof(float) * request_batch_size * beam_width, false));
}
template<typename T>
void LlamaTritonModelInstance<T>::freeBuffer()
{
allocator_->free((void**)(&d_output_ids_));
allocator_->free((void**)(&d_sequence_lengths_));
allocator_->free((void**)(&d_output_log_probs_));
allocator_->free((void**)(&d_cum_log_probs_));
}
template struct LlamaTritonModelInstance<float>;
template struct LlamaTritonModelInstance<half>;
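// --- Illustrative usage sketch (editorial, not part of the code base) --------
// A minimal single-node, single-GPU driver for the classes above. It assumes a
// model directory containing config.ini and converted weights, that
// AbstractTransformerModel declares createLlamaModel(), and that the base
// instance interface exposes the forward(input_tensors, instance_comm) overload
// used by the Triton backend's ThreadForward. Tensor construction and error
// handling are elided.
//
//   auto model = AbstractTransformerModel::createLlamaModel("/models/llama/config.ini");
//   auto nccl_params = model->createNcclParams(/*node_id=*/0, /*device_id_start=*/0, /*multi_node=*/false);
//   model->createSharedWeights(/*deviceId=*/0, /*rank=*/0);
//
//   cudaStream_t stream{};
//   ft::check_cuda_error(cudaStreamCreate(&stream));
//   auto instance      = model->createModelInstance(/*deviceId=*/0, /*rank=*/0, stream, nccl_params, nullptr);
//   auto instance_comm = model->createInstanceComm(/*size=*/1);
//
//   auto inputs = std::make_shared<std::unordered_map<std::string, triton::Tensor>>();
//   // ... insert "input_ids" [batch, seq_len], "input_lengths" [batch] and
//   //     "request_output_len" [batch] host tensors here ...
//   auto outputs = instance->forward(inputs, instance_comm.get());
// ------------------------------------------------------------------------------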