Unverified Commit 911c0a85 authored by Li Zhang, committed by GitHub

Optimize for throughput (#701)



* tmp

* update

* update

* optimize for throughput

* update

* fix eos

* clean up

* fix serving

* fix indexed copy

* minor

* minor

---------
Co-authored-by: lvhan028 <lvhan_028@163.com>
parent 65d735ba
@@ -87,6 +87,32 @@ void invokeUpdateOutput(int** request_output_ids_ptrs,
int batch_size,
cudaStream_t stream);
// [aaa, bbbb, cc, ddd] -> [aaabbbbccddd]
void invokeCompactOutputIds(int* cu_output_ids,
const int* output_ids,
const int* sequence_lengths,
int max_session_len,
bool token_generated,
int batch_size,
cudaStream_t stream);
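The comment above describes the compaction: each request owns a fixed `max_session_len`-wide row in `output_ids`, and the kernel packs only the valid prefixes back to back into `cu_output_ids`. A minimal host-side sketch of that semantics, assuming `token_generated` extends the copied length by one (the real routine is a CUDA kernel; this is illustrative only):

```cpp
#include <cstddef>
#include <vector>

// Host-side reference for the compaction semantics (not the actual kernel).
std::vector<int> compact_output_ids_ref(const int* output_ids,        // [batch_size, max_session_len]
                                        const int* sequence_lengths,  // [batch_size]
                                        int        max_session_len,
                                        bool       token_generated,
                                        int        batch_size)
{
    std::vector<int> compacted;
    for (int b = 0; b < batch_size; ++b) {
        // Copy only the valid prefix of each fixed-stride row, dropping the padding.
        const int  len = sequence_lengths[b] + (token_generated ? 1 : 0);  // assumption
        const int* row = output_ids + static_cast<std::size_t>(b) * max_session_len;
        compacted.insert(compacted.end(), row, row + len);
    }
    return compacted;  // [aaa, bbbb, cc, ddd] -> [aaabbbbccddd]
}
```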
void invokeIndexedCopy(void** h_src_ptr,
void** h_dst_ptr,
const int* h_elem_sz,
const int* h_src_idx,
const int* h_dst_idx,
int count,
int n_copys,
cudaStream_t st);
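One plausible reading of this signature is a batched gather/scatter: `count` source/destination buffer pairs, each copied at `n_copys` (src_idx, dst_idx) element positions with a per-buffer element size. The roles of `count` and `n_copys` are assumptions; a host-side sketch under that reading:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Host-side sketch of an indexed batched copy (parameter roles are assumed, not confirmed).
void indexed_copy_ref(void* const* src_ptr,  // [count] source base pointers
                      void* const* dst_ptr,  // [count] destination base pointers
                      const int*   elem_sz,  // [count] element size in bytes per buffer
                      const int*   src_idx,  // [n_copys] source element indices
                      const int*   dst_idx,  // [n_copys] destination element indices
                      int          count,
                      int          n_copys)
{
    for (int k = 0; k < count; ++k) {
        const auto* src = static_cast<const std::uint8_t*>(src_ptr[k]);
        auto*       dst = static_cast<std::uint8_t*>(dst_ptr[k]);
        for (int i = 0; i < n_copys; ++i) {
            std::memcpy(dst + static_cast<std::size_t>(dst_idx[i]) * elem_sz[k],
                        src + static_cast<std::size_t>(src_idx[i]) * elem_sz[k],
                        elem_sz[k]);
        }
    }
}
```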
// ABCDe ABCDe e
// ABCDEFGHIJk ABCDEFGHIJk
// ABCDEFGHi -> ABCDEFGHi i
// ABCDEFGh ABCDEFGh h
// ABCd ABCd d
void invokePadLastTokenIds(
int* token_ids, const int* context_length, int max_context_len, int batch_size, cudaStream_t stream);
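Per the diagram, each row of the [batch_size, max_context_len] token buffer gets its last valid token replicated into the final column, so the most recent token of every sequence sits at a uniform offset. A host-side sketch of that reading (the interpretation is an assumption):

```cpp
#include <cstddef>

// Host-side reference: copy each sequence's last token to column max_context_len - 1.
void pad_last_token_ids_ref(int*       token_ids,       // [batch_size, max_context_len]
                            const int* context_length,  // [batch_size]
                            int        max_context_len,
                            int        batch_size)
{
    for (int b = 0; b < batch_size; ++b) {
        int* row = token_ids + static_cast<std::size_t>(b) * max_context_len;
        row[max_context_len - 1] = row[context_length[b] - 1];
    }
}
```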
void invokeMyCopyInt(int* dst, const int* src, size_t count, cudaStream_t st);
template<typename T>
......
@@ -1715,7 +1715,7 @@ void ModelInstanceState::ReadOutputTensors(size_t
output_dtype,
batchn_shape,
output_buffer,
TRITONSERVER_MEMORY_GPU,
TRITONSERVER_MEMORY_CPU,
model_instance_device_id_start_);
}
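Note: the reported memory type flips from TRITONSERVER_MEMORY_GPU to TRITONSERVER_MEMORY_CPU here. This matches the hunks below, where output_ids and sequence_length move from device allocations (allocator_->reMalloc / allocator_->free) to plain host allocations (std::realloc / std::free) and are wrapped as ft::MEMORY_CPU tensors, so the backend now hands Triton host-resident output buffers.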
......
@@ -64,59 +64,11 @@ std::unordered_map<std::string, ft::Tensor> LlamaTritonModelInstance<T>::convert
h_total_output_lengths_ =
(uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t));
std::unordered_map<std::string, ft::Tensor> ft_input_tensors = std::unordered_map<std::string, ft::Tensor>{
{"input_ids", as_GPU_tensor(input_tensors->at("input_ids"), d_input_ids_)},
// {"input_lengths", as_GPU_tensor(input_tensors->at("input_lengths"), d_input_lengths_)},
};
if (input_tensors->find("bad_words_list") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("bad_words_list"), d_input_bad_words_, &allocator_);
ft_input_tensors.insert(
{"bad_words_list", as_GPU_tensor(input_tensors->at("bad_words_list"), d_input_bad_words_)});
}
if (input_tensors->find("stop_words_list") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("stop_words_list"), d_input_stop_words_, &allocator_);
ft_input_tensors.insert(
{"stop_words_list", as_GPU_tensor(input_tensors->at("stop_words_list"), d_input_stop_words_)});
}
if (input_tensors->count("request_prompt_embedding") && input_tensors->count("request_prompt_lengths")
&& input_tensors->count("request_prompt_type")) {
move_tensor_H2D(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_, &allocator_);
ft_input_tensors.insert(
{"request_prompt_lengths",
as_GPU_tensor(input_tensors->at("request_prompt_lengths"), d_request_prompt_lengths_)});
move_tensor_H2D(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_, &allocator_);
ft_input_tensors.insert(
{"request_prompt_embedding",
as_GPU_tensor(input_tensors->at("request_prompt_embedding"), d_request_prompt_embedding_)});
}
if (input_tensors->find("top_p_decay") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("top_p_decay"), d_top_p_decay_, &allocator_);
ft_input_tensors.insert({"top_p_decay", as_GPU_tensor(input_tensors->at("top_p_decay"), d_top_p_decay_)});
}
if (input_tensors->find("top_p_min") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("top_p_min"), d_top_p_min_, &allocator_);
ft_input_tensors.insert({"top_p_min", as_GPU_tensor(input_tensors->at("top_p_min"), d_top_p_min_)});
}
if (input_tensors->find("top_p_reset_ids") != input_tensors->end()) {
move_tensor_H2D(input_tensors->at("top_p_reset_ids"), d_top_p_reset_ids_, &allocator_);
ft_input_tensors.insert(
{"top_p_reset_ids", as_GPU_tensor(input_tensors->at("top_p_reset_ids"), d_top_p_reset_ids_)});
}
std::unordered_map<std::string, ft::Tensor> ft_input_tensors{};
for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) {
if (t->first.find("input_ids") == std::string::npos // && t->first.find("input_lengths") == std::string::npos
&& t->first.find("output_seq_len") == std::string::npos
&& t->first.find("prefix_soft_prompt_embedding") == std::string::npos
&& t->first.find("prefix_soft_prompt_lengths") == std::string::npos) {
if (ft_input_tensors.count(t->first) == 0) {
ft_input_tensors.insert({t->first, t->second.convertTritonTensorToFt()});
}
if (ft_input_tensors.count(t->first) == 0) {
ft_input_tensors.insert({t->first, t->second.convertTritonTensorToFt()});
}
}
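Without add/remove markers this hunk is hard to read: the per-tensor H2D staging above (input_ids, bad_words_list, stop_words_list, request_prompt_*, top_p_*) is the code being removed, and the generic loop is its replacement. Reconstructed for readability, the added version appears to be simply:

```cpp
std::unordered_map<std::string, ft::Tensor> ft_input_tensors{};
for (auto t = input_tensors->begin(); t != input_tensors->end(); ++t) {
    // Keep every incoming tensor as-is; no per-name filtering or eager H2D staging.
    if (ft_input_tensors.count(t->first) == 0) {
        ft_input_tensors.insert({t->first, t->second.convertTritonTensorToFt()});
    }
}
```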
@@ -204,12 +156,12 @@ LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::str
std::unordered_map<std::string, ft::Tensor> output_tensors = std::unordered_map<std::string, ft::Tensor>{
{"output_ids",
ft::Tensor{ft::MEMORY_GPU,
ft::Tensor{ft::MEMORY_CPU,
ft::TYPE_UINT32,
std::vector<size_t>{request_batch_size, beam_width, (size_t)instance_->session_len},
d_output_ids_}},
{"sequence_length",
ft::Tensor{ft::MEMORY_GPU,
ft::Tensor{ft::MEMORY_CPU,
ft::TYPE_UINT32,
std::vector<size_t>{request_batch_size, beam_width},
d_sequence_lengths_}}};
@@ -267,10 +219,9 @@ void LlamaTritonModelInstance<T>::allocateBuffer(const size_t request_batch_size
const size_t session_len,
const bool is_return_logits)
{
d_output_ids_ =
(int*)(allocator_->reMalloc(d_output_ids_, sizeof(int) * request_batch_size * beam_width * session_len, false));
d_sequence_lengths_ =
(int*)(allocator_->reMalloc(d_sequence_lengths_, sizeof(int) * request_batch_size * beam_width, false));
d_output_ids_ = (int*)std::realloc(d_output_ids_, sizeof(int) * request_batch_size * beam_width * session_len);
d_sequence_lengths_ = (int*)std::realloc(d_sequence_lengths_, sizeof(int) * request_batch_size * beam_width);
d_output_log_probs_ = (float*)(allocator_->reMalloc(
d_output_log_probs_, sizeof(float) * request_batch_size * beam_width * session_len, false));
d_cum_log_probs_ =
@@ -284,8 +235,8 @@ void LlamaTritonModelInstance<T>::allocateBuffer(const size_t request_batch_size
template<typename T>
void LlamaTritonModelInstance<T>::freeBuffer()
{
allocator_->free((void**)(&d_output_ids_));
allocator_->free((void**)(&d_sequence_lengths_));
std::free(d_output_ids_);
std::free(d_sequence_lengths_);
allocator_->free((void**)(&d_output_log_probs_));
allocator_->free((void**)(&d_cum_log_probs_));
std::free(h_total_output_lengths_);
......
@@ -52,5 +52,6 @@ ft::Tensor as_GPU_tensor(const triton::Tensor& tensor, T* d_ptr)
inline ft::Tensor as_CPU_tensor(const triton::Tensor& tensor)
{
ft::FT_CHECK(tensor.where == triton::MEMORY_CPU);
return ft::Tensor{ft::MEMORY_CPU, triton::Tensor::convertTritonTypeToFt(tensor.type), tensor.shape, tensor.data};
}
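A hypothetical use of the new helper (the tensor name and surrounding map are illustrative, not from this diff): view a host-resident triton::Tensor from the request as an ft::Tensor without copying; the FT_CHECK guards against accidentally passing a GPU tensor.

```cpp
#include <string>
#include <unordered_map>

// Hypothetical caller: wrap a CPU-resident request tensor as an ft::Tensor view.
ft::Tensor get_input_lengths_view(const std::unordered_map<std::string, triton::Tensor>& inputs)
{
    const triton::Tensor& t = inputs.at("input_lengths");  // assumed to arrive in host memory
    return as_CPU_tensor(t);                               // zero-copy view; checks MEMORY_CPU
}
```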