// LlamaTritonModelInstance.cc
/*
 * Copyright (c) OpenMMLab. All rights reserved.
 * Copyright (c) 2021-2023, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Modified from
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/turbomind/triton_backend/multi_gpu_gpt/ParallelGptTritonModel.h

#include "src/turbomind/triton_backend/llama/LlamaTritonModelInstance.h"

#include "src/turbomind/macro.h"
#include "src/turbomind/triton_backend/transformer_triton_backend.hpp"
#include "src/turbomind/triton_backend/triton_utils.hpp"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"

#include <algorithm>
#include <functional>
#include <memory>
#include <numeric>
#include <sstream>
#include <unordered_map>
#include <vector>

namespace ft = turbomind;
Li Zhang's avatar
Li Zhang committed
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56

template<typename T>
void triton_stream_callback(std::unordered_map<std::string, ft::Tensor>* output_tensors, void* ctx)
{
    LlamaTritonModelInstance<T>* model  = reinterpret_cast<LlamaTritonModelInstance<T>*>(ctx);
    auto                         result = LlamaTritonModelInstance<T>::convert_outputs(*output_tensors);

    model->stream_cb_(result, model->stream_ctx_);
}

// Takes shared ownership of the per-device engine instance and sole ownership
// of the CUDA allocator used for this instance's scratch/output buffers.
template<typename T>
LlamaTritonModelInstance<T>::LlamaTritonModelInstance(
    std::shared_ptr<LlamaTritonSharedModelInstance<T>>      instance,
    std::unique_ptr<ft::Allocator<ft::AllocatorType::CUDA>> allocator):
    instance_(std::move(instance)), allocator_(std::move(allocator))
{
}

/// Converts the incoming Triton tensor map into the engine's tensor map and
/// stages "input_ids" / "input_lengths" on the device.
///
/// @param input_tensors  request inputs keyed by name.
/// @return the same tensors, wrapped as ft::Tensor (one entry per unique key).
template<typename T>
std::unordered_map<std::string, ft::Tensor> LlamaTritonModelInstance<T>::convert_inputs(
    std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);

    // Copy the token ids and their lengths into device-side buffers.
    move_tensor_H2D(input_tensors->at("input_ids"), d_input_ids_, &allocator_);
    move_tensor_H2D(input_tensors->at("input_lengths"), d_input_lengths_, &allocator_);

    const size_t request_batch_size = input_tensors->at("input_ids").shape[0];
    const size_t input_data_len     = input_tensors->at("input_ids").shape[1];  // NOTE(review): unused below

    // Grow the per-request output-length scratch buffer to the current batch.
    h_total_output_lengths_ =
        (uint32_t*)std::realloc((void*)h_total_output_lengths_, request_batch_size * sizeof(uint32_t));

    std::unordered_map<std::string, ft::Tensor> ft_input_tensors{};

    for (auto& [name, tensor] : *input_tensors) {
        // Convert each tensor at most once; keep the first occurrence.
        if (ft_input_tensors.count(name) == 0) {
            ft_input_tensors.insert({name, tensor.convertTritonTensorToFt()});
        }
    }

    return ft_input_tensors;
}

/// Converts the engine's output tensor map back into Triton tensors.
///
/// @param output_tensors  engine outputs keyed by name.
/// @return a shared map with one converted entry per output tensor.
template<typename T>
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
LlamaTritonModelInstance<T>::convert_outputs(const std::unordered_map<std::string, ft::Tensor>& output_tensors)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);

    // make_shared instead of raw `new` + shared_ptr(ptr): exception-safe and
    // a single allocation.
    auto outputs_mapping = std::make_shared<std::unordered_map<std::string, triton::Tensor>>();

    for (const auto& [name, tensor] : output_tensors) {
        outputs_mapping->emplace(name, triton::Tensor::convertFtTensorToTriton(tensor));
    }

    return outputs_mapping;
}

// Unsupported overload: the vector-based forward is not implemented by this
// backend; calling it always fails the FT_CHECK.
template<typename T>
std::shared_ptr<std::vector<triton::Tensor>>
LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::vector<triton::Tensor>> input_tensors)
{
    ft::FT_CHECK(false);  // not implemented — use the map + instance_comm overload
    return nullptr;
}

// Unsupported overload: the map-only forward (without an instance comm) is not
// implemented; calling it always fails the FT_CHECK.
template<typename T>
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors)
{
    ft::FT_CHECK(false);  // not implemented — use the map + instance_comm overload
    return nullptr;
}

/// Renders a vector as "[a, b, c]" (empty vector -> "[]"), for debug logging.
template<typename T>
std::string format_vector(const std::vector<T>& vec)
{
    std::stringstream ss;
    ss << "[";
    const char* sep = "";  // empty before the first element, ", " afterwards
    for (const auto& item : vec) {
        ss << sep << item;
        sep = ", ";
    }
    ss << "]";
    return ss.str();
}

/// Runs one generation request through the shared LLaMA engine.
///
/// Required inputs: "input_ids" (rank-2), "input_lengths" (rank-1),
/// "request_output_len". Optional: "beam_width" (must be 1), "is_return_logits",
/// "is_return_log_probs". Returns the engine outputs converted to Triton
/// tensors; on failure the map instead carries an "error_message" tensor that
/// wraps the captured exception.
template<typename T>
std::shared_ptr<std::unordered_map<std::string, triton::Tensor>>
LlamaTritonModelInstance<T>::forward(std::shared_ptr<std::unordered_map<std::string, triton::Tensor>> input_tensors,
                                     ft::AbstractInstanceComm*                                        instance_comm)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    // for (const auto& kv : *input_tensors) {
    //     TM_LOG_INFO("%s: %s", kv.first.c_str(), format_vector(kv.second.shape).c_str());
    // }

    FT_CHECK_WITH_INFO(input_tensors->at("input_ids").shape.size() == 2,
                       "input_tensors->at(\"input_ids\").shape.size() == 2");
    FT_CHECK_WITH_INFO(input_tensors->at("input_lengths").shape.size() == 1,
                       "input_tensors->at(\"input_lengths\").shape.size() == 1");

    const uint32_t request_batch_size     = input_tensors->at("input_ids").shape[0];
    // Largest requested output length across the batch (entries read as int).
    const uint32_t max_request_output_len = (size_t)*std::max_element(
        (int*)input_tensors->at("request_output_len").data,
        (int*)input_tensors->at("request_output_len").data + input_tensors->at("request_output_len").shape[0]);
    // const uint32_t total_output_len = max_request_output_len + input_tensors->at("input_ids").shape[1];
    const uint32_t beam_width =
        input_tensors->count("beam_width") ? (size_t)(*(uint*)input_tensors->at("beam_width").data) : 1;
    FT_CHECK_WITH_INFO(beam_width == 1, "Beam search is not implemented");

    std::unordered_map<std::string, ft::Tensor> ft_input_tensors = convert_inputs(input_tensors);

    const size_t max_input_len = input_tensors->at("input_ids").shape[1];
    const bool   is_return_logits =
        input_tensors->count("is_return_logits") && *(bool*)input_tensors->at("is_return_logits").data;

    const size_t vocab_size = instance_->llm->vocab_size();

    // (Re)size the output buffers for this batch; session_len bounds output_ids.
    allocateBuffer(request_batch_size, max_input_len, beam_width, instance_->session_len, is_return_logits);

    // output_ids / sequence_length are host buffers (allocateBuffer obtains
    // them via std::realloc); the log-prob and logits buffers are
    // allocator-managed device memory.
    std::unordered_map<std::string, ft::Tensor> output_tensors = std::unordered_map<std::string, ft::Tensor>{
        {"output_ids",
         ft::Tensor{ft::MEMORY_CPU,
                    ft::TYPE_UINT32,
                    std::vector<size_t>{request_batch_size, beam_width, (size_t)instance_->session_len},
                    d_output_ids_}},
        {"sequence_length",
         ft::Tensor{ft::MEMORY_CPU,
                    ft::TYPE_UINT32,
                    std::vector<size_t>{request_batch_size, beam_width},
                    d_sequence_lengths_}}};

    if (input_tensors->count("is_return_log_probs") && *((bool*)input_tensors->at("is_return_log_probs").data)) {
        output_tensors.insert({"output_log_probs",
                               ft::Tensor{ft::MEMORY_GPU,
                                          ft::TYPE_FP32,
                                          std::vector<size_t>{request_batch_size, beam_width, max_request_output_len},
                                          d_output_log_probs_}});
        output_tensors.insert({"cum_log_probs",
                               ft::Tensor{ft::MEMORY_GPU,
                                          ft::TYPE_FP32,
                                          std::vector<size_t>{request_batch_size, beam_width},
                                          d_cum_log_probs_}});
    }

    if (is_return_logits) {
        output_tensors.insert(
            {"logits",
             {ft::MEMORY_GPU, ft::TYPE_FP32, {request_batch_size, max_input_len, vocab_size}, d_output_logits_}});
    }

    try {
        ft::Request::Callback callback;

        if (stream_cb_) {
            // Streaming: forward partial outputs to the registered callback via
            // the C-style trampoline; `this` is passed as the opaque context.
            callback = [this](std::unordered_map<std::string, ft::Tensor>* outputs) {
                triton_stream_callback<T>(outputs, this);
            };
        }

        // Drain the allocator's stream before running the engine — presumably
        // so the H2D input copies from convert_inputs are complete (NOTE(review):
        // confirm move_tensor_H2D issues its copies on this stream).
        ft::check_cuda_error(cudaStreamSynchronize(allocator_->returnStream()));
        instance_->llm->forward(&output_tensors, &ft_input_tensors, {instance_comm, callback});
        // ! stream synced by the model before returning
    }
    catch (...) {
        // Don't propagate: store the exception and surface it to the caller as
        // an "error_message" tensor pointing at h_exception_.
        h_exception_ = std::current_exception();
        output_tensors.insert({"error_message", ft::Tensor{ft::MEMORY_CPU, ft::TYPE_BYTES, {1}, &h_exception_}});
    }

    return convert_outputs(output_tensors);
}

// Releases all host and allocator-managed output buffers.
template<typename T>
LlamaTritonModelInstance<T>::~LlamaTritonModelInstance()
{
    freeBuffer();
}

/// (Re)allocates the per-request output buffers for the given batch geometry.
///
/// @param request_batch_size  number of requests in the batch.
/// @param max_input_len       longest input sequence (used for the logits buffer).
/// @param beam_width          beams per request (currently always 1 upstream).
/// @param session_len         maximum tokens per sequence slot.
/// @param is_return_logits    whether a logits buffer is needed.
template<typename T>
void LlamaTritonModelInstance<T>::allocateBuffer(const size_t request_batch_size,
                                                 const size_t max_input_len,
                                                 const size_t beam_width,
                                                 const size_t session_len,
                                                 const bool   is_return_logits)
{
    // One slot per (request, beam) pair.
    const size_t slot_count = request_batch_size * beam_width;

    // Host-side buffers (paired with std::free in freeBuffer).
    d_output_ids_       = (int*)std::realloc(d_output_ids_, sizeof(int) * slot_count * session_len);
    d_sequence_lengths_ = (int*)std::realloc(d_sequence_lengths_, sizeof(int) * slot_count);

    // Allocator-managed buffers (released via allocator_->free in freeBuffer).
    d_output_log_probs_ =
        (float*)(allocator_->reMalloc(d_output_log_probs_, sizeof(float) * slot_count * session_len, false));
    d_cum_log_probs_ = (float*)(allocator_->reMalloc(d_cum_log_probs_, sizeof(float) * slot_count, false));

    if (is_return_logits) {
        d_output_logits_ = (float*)allocator_->reMalloc(
            d_output_logits_, sizeof(float) * request_batch_size * max_input_len * instance_->llm->vocab_size(), false);
    }
}

/// Releases every buffer owned by this instance; each pointer is freed with
/// the mechanism that allocated it.
template<typename T>
void LlamaTritonModelInstance<T>::freeBuffer()
{
    // Buffers obtained with std::realloc are released with std::free.
    std::free(d_output_ids_);
    std::free(d_sequence_lengths_);
    std::free(h_total_output_lengths_);

    // Allocator-managed buffers go back through the allocator.
    allocator_->free((void**)(&d_output_log_probs_));
    allocator_->free((void**)(&d_cum_log_probs_));
}

// Explicit instantiations for the element types supported by the backend.
template struct LlamaTritonModelInstance<float>;
template struct LlamaTritonModelInstance<half>;