"vscode:/vscode.git/clone" did not exist on "11eea69e70aaca8385658a2346e9327b6dcfa20a"
LlamaBatch.cc 59 KB
Newer Older
Li Zhang's avatar
Li Zhang committed
1
2
// Copyright (c) OpenMMLab. All rights reserved.

lvhan028's avatar
lvhan028 committed
3
4
#include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/kernels/decoding_kernels.h"
Li Zhang's avatar
Li Zhang committed
5
#include "src/turbomind/kernels/sampling_topk_kernels.h"
Chen Xin's avatar
Chen Xin committed
6
#include "src/turbomind/macro.h"
lvhan028's avatar
lvhan028 committed
7
8
9
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/LlamaV2.h"
#include "src/turbomind/models/llama/Request.h"
Li Zhang's avatar
Li Zhang committed
10
11
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/llama_kernels.h"
lvhan028's avatar
lvhan028 committed
12
13
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
Li Zhang's avatar
Li Zhang committed
14
15
16
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/debug_utils.h"
#include "src/turbomind/utils/gemm_test/gemm_func.h"
lvhan028's avatar
lvhan028 committed
17
#include "src/turbomind/utils/logger.h"
Li Zhang's avatar
Li Zhang committed
18
19
#include <algorithm>
#include <cmath>
Li Zhang's avatar
Li Zhang committed
20
#include <cstddef>
Li Zhang's avatar
Li Zhang committed
21
22
#include <cstdint>
#include <iomanip>
Li Zhang's avatar
Li Zhang committed
23
#include <iterator>
Li Zhang's avatar
Li Zhang committed
24
25
#include <mutex>
#include <numeric>
Li Zhang's avatar
Li Zhang committed
26
27
#include <sstream>
#include <unordered_map>
Li Zhang's avatar
Li Zhang committed
28
#include <utility>
Li Zhang's avatar
Li Zhang committed
29

lvhan028's avatar
lvhan028 committed
30
namespace turbomind {
Li Zhang's avatar
Li Zhang committed
31

Li Zhang's avatar
Li Zhang committed
32
33
34
35
36
37
38
void ClearState(BatchState& s)
{
    std::fill_n(s.requests.begin(), s.size, nullptr);
    std::fill_n(s.sequences.begin(), s.size, nullptr);
    s.size = s.active_size = 0;
}

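// Validates pending stop/infer requests before scheduling. A request is rejected when its id occurs
// more than once in this round (kConflict), when its flags are contradictory or it refers to a
// missing sequence (kInvalid), or when it cannot fit into `session_len_` (kTooLong); rejected slots
// are signaled with the error code and compacted away by `drop_invalid`.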
template<typename T>
void LlamaBatch<T>::RejectInvalidRequests(Requests& stop_reqs, Requests& infer_reqs)
{
    std::unordered_map<uint64_t, int> occurrence;

    auto count_occurrence = [&occurrence](const Requests& rs) {
        for (const auto& r : rs) {
            ++occurrence[r->id];
        }
    };

    auto reject = [](const char* type, std::shared_ptr<Request>& req, int ec) {
        TM_LOG_WARNING(
            "[RejectInvalidRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec);
        req->signal.set_value(ec);
        req.reset();
    };

    auto handle_conflict_or_invalid = [this, &occurrence, &reject](Requests& rs, const char* type) {
        for (auto& r : rs) {
            if (r) {
                int ec = 0;

                const int  input_length = r->inputs[rank_].getVal<int>("input_lengths", 0);
                const auto get_offset   = [&](int token_count) {
                    return std::max(0, std::min(token_count, r->inputs[rank_].getVal<int>("step", token_count)));
                };

                if (occurrence[r->id] != 1) {
                    ec = Request::kConflict;
                }
                else if (r->start_flag && r->stop_flag) {
                    ec = Request::kInvalid;
                }
                else if (input_length > session_len_) {
                    ec = Request::kTooLong;
                }
                else if (!r->start_flag) {
                    if (auto seq = sequence_manager_->Get(r->id); seq == nullptr) {
                        ec = Request::kInvalid;
                    }
                    else if (get_offset(seq->tokens.size()) + input_length > session_len_) {
                        ec = Request::kTooLong;
                    }
                }

                if (ec) {
                    reject(type, r, ec);
                }
            }
        }
    };

    auto drop_invalid = [](Requests& rs) {
        int count = 0;
        for (int i = 0; i < rs.size(); ++i) {
            if (rs[i]) {
                rs[count++] = std::move(rs[i]);
            }
        }
        rs.resize(count);
    };

    count_occurrence(stop_reqs);
    count_occurrence(infer_reqs);

    if (!stop_reqs.empty()) {
        handle_conflict_or_invalid(stop_reqs, "stop");

        // invalidate stop-only requests for inactive sequences
        for (auto& r : stop_reqs) {
            if (r && r->end_flag == false) {
                int ec = Request::kInactive;
                for (int i = 0; i < state_->size; ++i) {
                    if (state_->requests[i] && state_->requests[i]->id == r->id) {
                        ec = 0;
                        break;
                    }
                }
                if (ec) {
                    reject("stop", r, ec);
                }
            }
        }

        drop_invalid(stop_reqs);
    }

    if (!infer_reqs.empty()) {
        handle_conflict_or_invalid(infer_reqs, "infer");

        // invalidate requests for busy sequences
        for (auto& r : infer_reqs) {
            if (r) {
                for (int i = 0; i < state_->size; ++i) {
                    if (state_->requests[i] && state_->requests[i]->id == r->id) {
                        reject("infer", r, Request::kBusy);
                        break;
                    }
                }
            }
        }

        drop_invalid(infer_reqs);
    }
}

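// Handles stop requests: interrupts the matching active sequence when one exists; otherwise, if the
// request also has `end_flag` set, erases the inactive sequence from the sequence manager. Every
// request is answered through its `signal` promise (rank 0 only).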
template<typename T>
auto LlamaBatch<T>::ProcessStopRequests(const Requests& requests) -> std::vector<Signal>
{
    NvtxScope           scope("stop_request");
    std::vector<Signal> signals;
    int                 count = 0;
    for (const auto& r : requests) {
        int ec = Request::kFail;
        // find matching active sequence
        for (int i = 0; i < state_->size; ++i) {
            // stop & optionally erase active sequence
            if (state_->requests[i] && state_->requests[i]->id == r->id) {
                ec = 0;
                signals.push_back(Interrupt(i, true, r->end_flag));
                ++count;
                break;
            }
        }
        // no match; try to erase an inactive sequence (in this case there is no active request to interrupt)
        if (ec && r->end_flag) {
            if (sequence_manager_->Erase(r->id)) {
                ec = 0;
            }
        }
        signals.push_back([=] {
            if (rank_ == 0) {
                r->signal.set_value(ec);
            }
        });
    }
    if (count) {
        check_cuda_error(cudaStreamSynchronize(stream_));
    }
    return signals;
}

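// Admits new inference requests into `incoming_`: binds or creates the sequence, rebuilds the token
// buffer from history plus new input, clamps the generation limit to `session_len_ - 1`, derives the
// per-sequence RoPE base, and stages random seeds / curand states for the sampler.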
template<typename T>
void LlamaBatch<T>::ProcessInferRequests(const Requests& requests)
{
    NvtxScope scope("infer_request");
    auto&     state = *incoming_;

    FT_CHECK(state.size == 0);
    FT_CHECK(state.active_size == 0);

    std::vector<int> existing_idx;

    int idx = 0;
    for (const auto& r : requests) {
        FT_CHECK(!state.requests[idx]);
        if (rank_ == 0) {
            TM_LOG_WARNING("[ProcessInferRequests] Request for %ld received.", (long)r->id);
        }
        state.requests[idx] = r;

        // get sequence for the request
        state.sequences[idx] = r->start_flag ? sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id);
        FT_CHECK(state.sequences[idx]);
        auto& seq = *state.sequences[idx];

        if (int step = r->inputs[rank_].getVal<int>("step", -1); step >= 0) {
            if (step <= seq.tokens.size()) {
                seq.tokens.resize(step);
                seq.cache_len = std::min(seq.cache_len, step);
            }
            else if (rank_ == 0) {
                TM_LOG_WARNING(
                    "[ProcessInferRequests] Skipping invalid step (%d) setting for ID %ld", step, (long)seq.id);
            }
        }

        const int  input_length = r->inputs[rank_].getVal<int>("input_lengths");
        const int* input_ids    = r->inputs[rank_].getPtr<int>("input_ids");

        // `output_ids` contains all token ids of the sequences
        const auto output_ids_base = state.output_ids + session_len_ * idx;
        auto       output_ids      = output_ids_base;

        // copy history tokens
        if (!seq.tokens.empty()) {
            output_ids = Copy(seq.tokens.data(), seq.tokens.size(), output_ids);
        }

        // copy input tokens
        if (input_length) {
            output_ids = Copy(input_ids, input_length, output_ids);
        }

        // total context length (history + input)
        state.h_context_length[idx] = output_ids - output_ids_base;
        state.h_finished[idx]       = false;
        const int request_output_len = state.requests[idx]->inputs[rank_].getVal<int>("request_output_len");
        state.seq_len_limit[idx]     = state.h_context_length[idx] + request_output_len;
        // `length_criterion` sets finish flag when step >= seq_limit_len, however when step == seq_limit_len
        // the actual sequence length is seq_limit_len + 1, hence seq_limit_len must be truncated to session_len - 1
        if (state.seq_len_limit[idx] >= session_len_) {
            state.seq_len_limit[idx] = session_len_ - 1;
            if (rank_ == 0) {
                const int trunc_output_len = state.seq_len_limit[idx] - state.h_context_length[idx];
                TM_LOG_WARNING(
                    "[ProcessInferRequests] [%ld] total sequence length (%d + %d) exceeds `session_len` (%d), `request_output_len` is truncated to %d",
                    (long)seq.id,
                    state.h_context_length[idx],
                    request_output_len,
                    (int)session_len_,
                    trunc_output_len);
            }
        }

        // compute rope scaling factor
        if (r->start_flag) {
            seq.rope_theta      = model_->attn_params_.rotary_embedding_base;
            auto scaling_factor = 1.f;
            if (r->inputs[rank_].isExist("rope_scaling_factor")) {  // runtime scaling factor
                scaling_factor = r->inputs[rank_].getVal<float>("rope_scaling_factor");
            }
            else if (model_->attn_params_.rope_scaling_factor >= 1.f) {  // infer by `seq_len_limit`
                scaling_factor   = model_->attn_params_.rope_scaling_factor;
                auto max_seq_len = state.seq_len_limit[idx];
                auto max_pos_emb = model_->attn_params_.max_position_embeddings;
                if (max_seq_len > max_pos_emb) {
                    scaling_factor = scaling_factor * max_seq_len / max_pos_emb - (scaling_factor - 1);
                    // scaling_factor = std::max(exp2f(ceilf(log2f((float)max_seq_len / max_pos_emb) + 1.f))
                    // - 1.f, 1.f);
                }
            }
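            // The rotary base is scaled as rope_theta *= s^(d/(d-2)) for scaling factor s and rotary
            // dimension d (commonly known as NTK-aware scaling); e.g. for s = 2 and d = 128 the base
            // grows by 2^(128/126), roughly a factor of 2.02.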
            if (scaling_factor != 1.f) {
                float rope_dim = model_->attn_params_.rotary_embedding_dim;
                seq.rope_theta *= powf(scaling_factor, rope_dim / (rope_dim - 2.f));
                TM_LOG_INFO("[ProcessInferRequests] %ld rope_scaling_factor: %f, rope_theta = %f",
                            (long)seq.id,
                            scaling_factor,
                            seq.rope_theta);
            }
        }
        state.h_rope_theta[idx] = seq.rope_theta;
        if (r->start_flag) {
            // prepare to initialize random state for new sequence
            h_random_seed_[idx] = r->inputs[rank_].getVal<unsigned long long>("random_seed", 0);
        }
        else {
            // Recover device states if not a new sequence
            h_curand_state_[existing_idx.size()] = *(curandState_t*)seq.random_state.data();
            existing_idx.push_back(idx);
        }

        // ! SHARED STATE IS MODIFIED, BARRIER SYNCHRONIZATION REQUIRED
        // assign priority based on arrival time
        if (rank_ == 0) {
            r->priority = request_count_++;
        }

        // increment pointer
        idx++;
    }
    state.size = idx;

    // when there are new sequences
    if (state.size != existing_idx.size()) {
        // copy random seeds to device
        Copy(h_random_seed_, state.size, d_random_seed_);
        // initialize random states
        invokeCurandBatchInitialize(state.curand_state, state.size, d_random_seed_, stream_);
        sync_check_cuda_error();
    }

    if (!existing_idx.empty()) {
        // copy existing curand states to device
        Copy(h_curand_state_, existing_idx.size(), d_curand_state_);
        // insert the states to their correct positions in the batch
        IndexedCopy({}, existing_idx, std::tuple{d_curand_state_, state.curand_state, 1});
    }
}

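// One scheduling round: collects live sequences from the current and incoming states, lets the
// SequenceManager materialize/evict KV blocks by priority, then rebuilds the active batch with
// running sequences first and swap-ins (sorted by missing context length) last. The return value
// tells the caller whether generation state must be re-initialized.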
template<typename T>
bool LlamaBatch<T>::Initialize()
{
    NvtxScope                                scope("initialize");
    std::vector<const Sequence*>             sequences;
    std::vector<Sequence::Status>            status;
    std::vector<uint64_t>                    priorities;
    std::vector<int>                         context_lengths;
    std::vector<std::pair<BatchState*, int>> coords;

    // count the holes introduced by finished requests from the previous iteration or stop requests from the
    // current iteration
    int holes{};
    int active_holes{};
    for (int i = 0; i < state_->size; ++i) {
        if (!state_->requests[i]) {
            ++holes;
            if (i < state_->active_size) {
                ++active_holes;
            }
        }
    }

    // dbg(holes, active_holes);

    auto process = [&](BatchState* state) {
        for (int i = 0; i < state->size; ++i) {
            if (auto& r = state->requests[i]) {
                sequences.push_back(state->sequences[i]);
                status.push_back(state->sequences[i]->status);
                priorities.push_back(r->priority);
                context_lengths.push_back(state->h_context_length[i]);
                coords.emplace_back(state, i);
                // clear swap-in flags
                state->is_swap_in[i] = 0;
            }
        }
    };

    process(state_);
    process(incoming_);

    auto outcome = sequence_manager_->Materialize(sequences, context_lengths, priorities, step_length_);

    if (outcome.allocation || outcome.swap_in || outcome.swap_out) {
        dbg(outcome);
    }

    bool exchange = outcome.swap_in + outcome.swap_out > 0;

    std::vector<int> idxs(sequences.size());
    std::iota(idxs.begin(), idxs.end(), 0);

    if (exchange || holes || incoming_->size) {
        // put active ones first
        auto active_end = std::stable_partition(idxs.begin(), idxs.end(), [&](int idx) {
            return sequences[idx]->status == Sequence::kActive;  // present status
        });

        // even all free blocks together are not enough to hold a single sequence
        if (!sequences.empty()) {
            FT_CHECK_WITH_INFO(active_end != idxs.begin(), "Not enough blocks.");
        }

        // move swap-ins to the back
        auto swapin_beg = std::stable_partition(idxs.begin(), active_end, [&](int idx) {
            return status[idx] == Sequence::kActive;  // past status
        });

        // sort swap-ins according to missing length
        if (swapin_beg != active_end) {
            std::vector<int> missing_len(sequences.size());
            for (int i = 0; i < sequences.size(); ++i) {
                missing_len[i] = context_lengths[i] - sequences[i]->cache_len;
            }
            std::stable_sort(swapin_beg, active_end, [&](int i, int j) { return missing_len[i] < missing_len[j]; });
        }

        // Copy sequence states to back buffer
        FT_CHECK(back_->size == 0 && back_->active_size == 0);
        std::vector<std::tuple<BatchState*, BatchState*, int, int>> cpys;
        for (const auto& i : idxs) {
            auto& s = *sequences[i];
            if (exchange) {
                const auto& [state, idx] = coords[i];
                // mark swap-ins
                if (status[i] != Sequence::kActive && s.status == Sequence::kActive) {
                    state->is_swap_in[idx] = 1;
                }
            }
            if (s.status == Sequence::kActive) {
                ++back_->active_size;
            }
            cpys.emplace_back(coords[i].first, back_, coords[i].second, back_->size++);
        }
        CopyState(cpys);
        // Swap the buffers
        std::swap(state_, back_);

        ClearState(*back_);
        ClearState(*incoming_);
    }

    FT_CHECK(state_->size <= max_batch_size_);

    /// Update block ptrs when there were
    //  1. swap-in or swap-out
    //  2. holes in the active buffer
    //  3. new allocations (for existing active sequences)
    if (exchange || active_holes || outcome.allocation) {
        // Prepare intermediate buffers
        h_cu_block_counts_[0] = 0;

        auto k_ptrs = h_k_block_ptrs_;
        auto v_ptrs = h_v_block_ptrs_;

        const int batch_size = state_->active_size;

        for (int i = 0; i < batch_size; ++i) {
            const auto& seq = *state_->sequences[i];

            // cumulative num of blocks
            h_cu_block_counts_[i + 1] = h_cu_block_counts_[i] + seq.blocks.size();

            FT_CHECK_WITH_INFO(h_cu_block_counts_[i + 1] <= sequence_manager_->max_block_count(),
                               std::to_string(h_cu_block_counts_[i + 1]));

            k_ptrs = std::transform(seq.blocks.cbegin(), seq.blocks.cend(), k_ptrs, [&](auto p) {
                return reinterpret_cast<uintptr_t>(sequence_manager_->OffsetKey(p->data));
            });
            v_ptrs = std::transform(seq.blocks.cbegin(), seq.blocks.cend(), v_ptrs, [&](auto p) {
                return reinterpret_cast<uintptr_t>(sequence_manager_->OffsetVal(p->data));
            });
        }

        static_assert(sizeof(uintptr_t) == sizeof(void*));

        Copy(h_cu_block_counts_, batch_size + 1, cu_block_counts_);
        Copy(h_k_block_ptrs_, h_cu_block_counts_[batch_size], k_block_ptrs_);
        Copy(h_v_block_ptrs_, h_cu_block_counts_[batch_size], v_block_ptrs_);
    }

    /// Layout of the buffers has changed, generation & sampling need to be re-initialized for correctness when there
    /// were
    //  1. swap-in or swap-out
    //  2. holes in the active buffer
    return exchange || active_holes;
}

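// Scatters per-slot state between BatchState buffers: the copies are sorted and grouped by
// (source, destination) pair so that each group becomes one batched `IndexedCopy`, then the
// host-side metadata is moved entry by entry.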
template<typename T>
void LlamaBatch<T>::CopyState(const std::vector<std::tuple<BatchState*, BatchState*, int, int>>& desc)
{
    if (desc.empty()) {
        return;
    }

    std::vector<int> idxs(desc.size());
    std::iota(idxs.begin(), idxs.end(), 0);
    std::sort(idxs.begin(), idxs.end(), [&](int i, int j) { return desc[i] < desc[j]; });
    auto get_signature = [&](int i) -> std::pair<BatchState*, BatchState*> {
        return std::make_pair(std::get<0>(desc[idxs[i]]), std::get<1>(desc[idxs[i]]));
    };
    std::vector<int> offsets;
    auto             current = get_signature(0);
    offsets.push_back(0);
    for (int i = 0; i < idxs.size(); ++i) {
        if (auto signature = get_signature(i); signature != current) {
            current = signature;
            offsets.push_back(i);
        }
    }
    offsets.push_back(idxs.size());
    for (int bi = 1; bi < offsets.size(); ++bi) {
        int beg = offsets[bi - 1];
        int end = offsets[bi];
        if (beg == end) {
            continue;
        }
        auto [s, d] = get_signature(beg);

        std::vector<int> s_idx;
        std::vector<int> d_idx;
        for (int i = beg; i < end; ++i) {
            s_idx.push_back(std::get<2>(desc[idxs[i]]));
            d_idx.push_back(std::get<3>(desc[idxs[i]]));
        }

        IndexedCopy(s_idx,
                    d_idx,
                    std::tuple{s->output_ids, d->output_ids, session_len_},
                    std::tuple{s->curand_state, d->curand_state, 1});
    }

    for (const auto& [s, d, si, di] : desc) {
        d->h_context_length[di] = s->h_context_length[si];
        d->h_finished[di]       = s->h_finished[si];
        d->h_rope_theta[di]     = s->h_rope_theta[si];
        d->seq_len_limit[di]    = s->seq_len_limit[si];
        d->sequences[di]        = s->sequences[si];
        d->is_swap_in[di]       = s->is_swap_in[si];
        d->requests[di]         = s->requests[si];
    }
}

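// Per-iteration device buffers sized by `batch_size` and `session_len`; grown on demand via
// `reMalloc` and released in `FreeBuffer`.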
template<typename T>
void LlamaBatch<T>::AllocateBuffer(size_t batch_size, size_t session_len)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    const size_t batchxbeam = batch_size;

    const size_t hidden_units      = model_->hidden_units_;
    const size_t vocab_size        = model_->vocab_size_padded_;
    const size_t head_dim          = model_->size_per_head_;
    const size_t local_kv_head_num = model_->local_kv_head_num_;
    // +1 padding; BlockIterator does not use a predicate
    const size_t max_block_count = sequence_manager_->max_block_count() + 1;

    context_decoder_input_buf_ =
        (T*)allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * max_context_token_num_ * hidden_units, false);
    context_decoder_output_buf_ =
        (T*)allocator_->reMalloc(context_decoder_output_buf_, sizeof(T) * max_context_token_num_ * hidden_units, false);
    context_decoder_ids_buf_ =
        (int*)allocator_->reMalloc(context_decoder_ids_buf_, sizeof(int) * max_context_token_num_, false);

    tmp_k_cache_buf_ = (T*)allocator_->reMalloc(
        tmp_k_cache_buf_, sizeof(T) * max_context_token_num_ * local_kv_head_num * head_dim, false);
    tmp_v_cache_buf_ = (T*)allocator_->reMalloc(
        tmp_v_cache_buf_, sizeof(T) * max_context_token_num_ * local_kv_head_num * head_dim, false);

    tmp_k_ptrs_ = (void**)allocator_->reMalloc(tmp_k_ptrs_, sizeof(void*) * batch_size, false);
    tmp_v_ptrs_ = (void**)allocator_->reMalloc(tmp_v_ptrs_, sizeof(void*) * batch_size, false);

    decoder_input_buf_  = (T*)allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units, false);
    decoder_output_buf_ = (T*)allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units, false);

    input_ids_buf_      = (int*)allocator_->reMalloc(input_ids_buf_, sizeof(int) * batchxbeam * session_len, true);
    input_length_buf_   = (int*)allocator_->reMalloc(input_length_buf_, sizeof(int) * batchxbeam);
    context_length_buf_ = (int*)allocator_->reMalloc(context_length_buf_, sizeof(int) * batchxbeam);

    sequence_lengths_ = (int*)allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false);
    cu_block_counts_ = (int*)allocator_->reMalloc(cu_block_counts_, sizeof(int) * (batch_size + 1));
    k_block_ptrs_    = (uintptr_t*)allocator_->reMalloc(k_block_ptrs_, sizeof(uintptr_t) * max_block_count);
    v_block_ptrs_    = (uintptr_t*)allocator_->reMalloc(v_block_ptrs_, sizeof(uintptr_t) * max_block_count);

    logits_buf_       = (float*)allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size, false);
    local_logits_buf_ = (float*)allocator_->reMalloc(local_logits_buf_, sizeof(float) * batchxbeam * vocab_size, false);

    token_ids_buf_ = (int*)allocator_->reMalloc(token_ids_buf_, sizeof(int) * batchxbeam * session_len * 2, true);

    finished_buf_  = (bool*)allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false);
    seq_limit_len_ = (uint32_t*)allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false);

    request_output_ids_ptrs_ = (int**)allocator_->reMalloc(request_output_ids_ptrs_, sizeof(int*) * batch_size, true);
    request_output_ids_lens_ = (int*)allocator_->reMalloc(request_output_ids_lens_, sizeof(int) * batch_size, true);
    request_seqlen_ptrs_     = (int**)allocator_->reMalloc(request_seqlen_ptrs_, sizeof(int*) * batch_size, true);

    rope_theta_ = (float*)allocator_->reMalloc(rope_theta_, sizeof(float) * batch_size, false);

    is_allocate_buffer_ = true;
}

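// Long-lived buffers that persist across batches: sampling parameter staging areas, random seeds
// and curand states, per-state token/output buffers, and host-side mirrors used for H2D copies.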
template<typename T>
void LlamaBatch<T>::AllocatePersistantBuffer(size_t max_batch_size)
{
    d_stop_words_ = (int*)allocator_->reMalloc(d_stop_words_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true);
    d_bad_words_  = (int*)allocator_->reMalloc(d_bad_words_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true);
    h_stop_words_ =
        (int*)allocator_->reMalloc(h_stop_words_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true, true);
    h_bad_words_ =
        (int*)allocator_->reMalloc(h_bad_words_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true, true);

    h_runtime_top_k_ = (int*)allocator_->reMalloc(h_runtime_top_k_, sizeof(int) * max_batch_size, true, true);
    h_runtime_top_p_ = (float*)allocator_->reMalloc(h_runtime_top_p_, sizeof(float) * max_batch_size, true, true);
    h_temperature_   = (float*)allocator_->reMalloc(h_temperature_, sizeof(float) * max_batch_size, true, true);
    h_repetition_penalty_ =
        (float*)allocator_->reMalloc(h_repetition_penalty_, sizeof(float) * max_batch_size, true, true);

    h_random_seed_ = (unsigned long long*)allocator_->reMalloc(
        h_random_seed_, sizeof(unsigned long long) * max_batch_size, true, true);
    d_random_seed_ = (unsigned long long*)allocator_->reMalloc(
        d_random_seed_, sizeof(unsigned long long) * max_batch_size, true, false);

    h_curand_state_ =
        (curandState_t*)allocator_->reMalloc(h_curand_state_, sizeof(curandState_t) * max_batch_size, true, true);
    d_curand_state_ =
        (curandState_t*)allocator_->reMalloc(d_curand_state_, sizeof(curandState_t) * max_batch_size, true, false);

    d_end_ids_buf_ = (int*)allocator_->reMalloc(d_end_ids_buf_, sizeof(int) * max_batch_size, false);
    h_end_ids_buf_ = (int*)allocator_->reMalloc(h_end_ids_buf_, sizeof(int) * max_batch_size, false, true);

    sampling_params_ = {
        {"stop_words_list", (std::byte*)h_stop_words_, (std::byte*)d_stop_words_},
        {"bad_words_list", (std::byte*)h_bad_words_, (std::byte*)d_bad_words_},
        {"runtime_top_k", (std::byte*)h_runtime_top_k_, nullptr},
        {"runtime_top_p", (std::byte*)h_runtime_top_p_, nullptr},
        {"temperature", (std::byte*)h_temperature_, nullptr},
        {"repetition_penalty", (std::byte*)h_repetition_penalty_, nullptr},
    };
    for (auto& s : states_) {
        s.output_ids = (int*)allocator_->reMalloc(s.output_ids, sizeof(int) * max_batch_size * session_len_, true);
        s.curand_state =
            (curandState_t*)allocator_->reMalloc(s.curand_state, sizeof(curandState_t) * max_batch_size, true);
    }

    const size_t max_block_count = sequence_manager_->max_block_count();

    {
        NcclGuard barrier(model_->tensor_para_, stream_, true);
        h_input_ids_buf_ =
            (int*)allocator_->reMalloc(h_input_ids_buf_, sizeof(int) * max_batch_size * session_len_, false, true);
        h_input_length_buf_ =
            (int*)allocator_->reMalloc(h_input_length_buf_, sizeof(int) * max_batch_size, false, true);

        h_tmp_k_ptrs_ = (void**)allocator_->reMalloc(h_tmp_k_ptrs_, sizeof(void*) * max_batch_size, false, true);
        h_tmp_v_ptrs_ = (void**)allocator_->reMalloc(h_tmp_v_ptrs_, sizeof(void*) * max_batch_size, false, true);

        h_cu_block_counts_ =
            (int*)allocator_->reMalloc(h_cu_block_counts_, sizeof(int) * (max_batch_size + 1), false, true);
        h_k_block_ptrs_ =
            (uintptr_t*)allocator_->reMalloc(h_k_block_ptrs_, sizeof(uintptr_t) * max_block_count, false, true);
        h_v_block_ptrs_ =
            (uintptr_t*)allocator_->reMalloc(h_v_block_ptrs_, sizeof(uintptr_t) * max_block_count, false, true);

        for (auto& s : states_) {
            s.h_context_length =
                (int*)allocator_->reMalloc(s.h_context_length, sizeof(int) * max_batch_size, false, true);
            s.h_finished   = (bool*)allocator_->reMalloc(s.h_finished, sizeof(bool) * max_batch_size * 2, false, true);
            s.h_rope_theta = (float*)allocator_->reMalloc(s.h_rope_theta, sizeof(float) * max_batch_size, false, true);
        }

        h_seq_limit_len_ =
            (uint32_t*)allocator_->reMalloc(h_seq_limit_len_, sizeof(uint32_t) * max_batch_size, false, true);

        h_request_output_ids_ptrs_ =
            (int**)allocator_->reMalloc(h_request_output_ids_ptrs_, sizeof(int*) * max_batch_size, true, true);
        h_request_output_ids_lens_ =
            (int*)allocator_->reMalloc(h_request_output_ids_lens_, sizeof(int) * max_batch_size, true, true);
        h_request_seqlen_ptrs_ =
            (int**)allocator_->reMalloc(h_request_seqlen_ptrs_, sizeof(int*) * max_batch_size, true, true);

        h_output_ids_ =
            (int*)allocator_->reMalloc(h_output_ids_, sizeof(int) * max_batch_size * session_len_, false, true);
    }

    is_allocate_persistant_buffer_ = true;
}

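// Releases both buffer classes; the boolean argument to `free` marks host-side allocations.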
template<typename T>
void LlamaBatch<T>::FreeBuffer()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    if (is_allocate_buffer_) {
        allocator_->free((void**)&context_decoder_input_buf_);
        allocator_->free((void**)&context_decoder_output_buf_);
        allocator_->free((void**)&context_decoder_ids_buf_);

        allocator_->free((void**)&tmp_k_cache_buf_);
        allocator_->free((void**)&tmp_v_cache_buf_);
        allocator_->free((void**)&tmp_k_ptrs_);
        allocator_->free((void**)&tmp_v_ptrs_);

        allocator_->free((void**)&decoder_input_buf_);
        allocator_->free((void**)&decoder_output_buf_);

        allocator_->free((void**)&input_ids_buf_);
        allocator_->free((void**)&input_length_buf_);
        allocator_->free((void**)&context_length_buf_);

        allocator_->free((void**)&sequence_lengths_);

        allocator_->free((void**)&cu_block_counts_);
        allocator_->free((void**)&k_block_ptrs_);
        allocator_->free((void**)&v_block_ptrs_);

        allocator_->free((void**)&logits_buf_);
        allocator_->free((void**)&local_logits_buf_);

        if (local_context_logits_buf_) {
            allocator_->free((void**)&local_context_logits_buf_);
        }
        if (context_logits_buf_) {
            allocator_->free((void**)&context_logits_buf_);
        }

        allocator_->free((void**)&token_ids_buf_);

        allocator_->free((void**)&d_end_ids_buf_);
        allocator_->free((void**)&h_end_ids_buf_, true);

        allocator_->free((void**)&finished_buf_);
        allocator_->free((void**)&seq_limit_len_);

        allocator_->free((void**)&request_output_ids_ptrs_);
        allocator_->free((void**)&request_output_ids_lens_);
        allocator_->free((void**)&request_seqlen_ptrs_);

        allocator_->free((void**)&rope_theta_);

        is_allocate_buffer_ = false;
    }

    if (is_allocate_persistant_buffer_) {

        allocator_->free((void**)&d_stop_words_);
        allocator_->free((void**)&h_stop_words_, true);
        allocator_->free((void**)&d_bad_words_);
        allocator_->free((void**)&h_bad_words_, true);
        allocator_->free((void**)&d_random_seed_);
        allocator_->free((void**)&h_random_seed_, true);
        allocator_->free((void**)&d_curand_state_);
        allocator_->free((void**)&h_curand_state_, true);

        for (auto& s : states_) {
            allocator_->free((void**)&s.h_context_length, true);
            allocator_->free((void**)&s.h_finished, true);
            allocator_->free((void**)&s.h_rope_theta, true);
            allocator_->free((void**)&s.output_ids);
            allocator_->free((void**)&s.curand_state);
        }
        allocator_->free((void**)&h_tmp_k_ptrs_, true);
        allocator_->free((void**)&h_tmp_v_ptrs_, true);
        allocator_->free((void**)&h_cu_block_counts_, true);
        allocator_->free((void**)&h_k_block_ptrs_, true);
        allocator_->free((void**)&h_v_block_ptrs_, true);
        allocator_->free((void**)&h_input_ids_buf_, true);
        allocator_->free((void**)&h_input_length_buf_, true);
        allocator_->free((void**)&h_seq_limit_len_, true);

        allocator_->free((void**)&h_request_output_ids_ptrs_, true);
        allocator_->free((void**)&h_request_output_ids_lens_, true);
        allocator_->free((void**)&h_request_seqlen_ptrs_, true);
        allocator_->free((void**)&h_output_ids_, true);

        is_allocate_persistant_buffer_ = false;
    }
}

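// The batch rotates three BatchState buffers: `state_` (current), `back_` (target of the swap in
// `Initialize`) and `incoming_` (filled by `ProcessInferRequests`).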
template<typename T>
LlamaBatch<T>::LlamaBatch(int                              max_batch_size,
                          int                              max_context_token_num,
                          int                              session_len,
                          std::unique_ptr<SequenceManager> sequence_manager,
                          LlamaV2<T>*                      model):
    max_batch_size_(max_batch_size),
    max_context_token_num_(max_context_token_num),
    session_len_(session_len),
    rank_(model->tensor_para_.rank_),
    debug_(model->debug_),
    step_length_(model->step_length_),
    sequence_manager_(std::move(sequence_manager)),
    model_(model),
    data_type_(getTensorType<T>())
{
    stream_         = model_->stream_;
    allocator_      = model_->allocator_;
    cublas_wrapper_ = model_->cublas_wrapper_;

    for (auto& s : states_) {
        s.requests.resize(max_batch_size);
        s.sequences.resize(max_batch_size);
        s.seq_len_limit.resize(max_batch_size);
        s.is_swap_in.resize(max_batch_size);
    }
    state_    = &states_[0];
    back_     = &states_[1];
    incoming_ = &states_[2];
    AllocateBuffer(max_batch_size, session_len_);
    AllocatePersistantBuffer(max_batch_size);
}

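// Builds the sampling TensorMap: for each supported sampling parameter, one request that provides it
// serves as the shape/type exemplar, and the per-request values are gathered into a batched host
// buffer (mirrored to the device when needed) before configuring the dynamic decode layer.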
template<typename T>
void LlamaBatch<T>::InitializeSampling()
{
    NvtxScope _("InitSampling");
    const int batch_size = state_->active_size;
    TensorMap inputs;
    for (const auto& [name, h_ptr, d_ptr] : sampling_params_) {
        // find an exemplar that matches the param name
        const Tensor* ptr{};
        for (int i = 0; i < batch_size; ++i) {
            if (state_->requests[i]->inputs[rank_].isExist(name)) {
                ptr = &state_->requests[i]->inputs[rank_].at(name);
                break;
            }
        }
        // fill the batch of the param
        if (ptr) {
            const auto& ref   = *ptr;
            auto        shape = ref.shape;
            FT_CHECK(shape[0] == 1);
            shape[0]                = batch_size;
            const int size_in_bytes = ref.sizeBytes();
            memset(h_ptr, 0, size_in_bytes * batch_size);
            for (int i = 0; i < batch_size; ++i) {
                if (state_->requests[i]->inputs[rank_].isExist(name)) {
                    Tensor& src = state_->requests[i]->inputs[rank_].at(name);
                    FT_CHECK(ref.shape == src.shape);
                    std::copy_n(src.getPtr<std::byte>(), size_in_bytes, h_ptr + size_in_bytes * i);
                }
            }
            if (d_ptr) {
                Copy(h_ptr, batch_size * size_in_bytes, d_ptr);
            }
            inputs.insert({name, {d_ptr ? MEMORY_GPU : MEMORY_CPU, ref.type, shape, d_ptr ? d_ptr : h_ptr}});
            if (debug_ && rank_ == 0) {
                TM_LOG_INFO("[initializeSampling] %s", format({name, inputs.at(name)}).c_str());
            }
        }
    }

    // init for eos
    std::fill_n(h_end_ids_buf_, batch_size, model_->end_id_);
    Copy(h_end_ids_buf_, batch_size, d_end_ids_buf_);
    inputs.insert({"end_id", {MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, d_end_ids_buf_}});

    inputs_ = std::move(inputs);

    model_->dynamic_decode_layer_->setup(batch_size, 1, &inputs_);
}

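// Prepares the generation loop: transposes `output_ids` into the step-major `token_ids_buf_`, pads
// the last token of shorter sequences up to `max_context_len`, and uploads per-slot sequence limits,
// finish flags, rope bases and output pointers to the device.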
template<typename T>
auto LlamaBatch<T>::InitializeGeneration() -> GenerationState
{
    NvtxScope _("InitGen");
    const int batch_size      = state_->active_size;
    const int max_context_len = *std::max_element(state_->h_context_length, state_->h_context_length + batch_size);
    Copy(state_->h_context_length, batch_size, context_length_buf_);  // also referenced in `SetOutputTensors`
    Copy(context_length_buf_, batch_size, sequence_lengths_);
    // `sequence_lengths_` will be increased by dynamic decode
    // note that "sequence length" has different semantics in the decoder and in the output
    // - in the decoder it is the length of the sequence whose k/v cache has already been computed
    // - in the output it is the length of all tokens (the last generated token does not have k/v cache computed yet)
    invokePlusScalar(sequence_lengths_, -1, batch_size, stream_);
    sync_check_cuda_error();

    Clear(token_ids_buf_, batch_size * session_len_);
    invokeTransposeAxis01(token_ids_buf_, state_->output_ids, batch_size, session_len_, 1, stream_);
    sync_check_cuda_error();

    // token_ids_buf_[s, b]
    // ABCDe            ABCDe     e
    // ABCDEFGHIJk      ABCDEFGHIJk
    // ABCDEFGHi    ->  ABCDEFGHi i
    // ABCDEFGh         ABCDEFGh  h
    // ABCd             ABCd      d
    invokePadLastTokenIds(token_ids_buf_, context_length_buf_, max_context_len, batch_size, stream_);
    sync_check_cuda_error();

    // used for dispatching split-k decoding kernels
    const int sum_seq_len =
        std::accumulate(state_->h_context_length, state_->h_context_length + batch_size, -batch_size);
    const int max_seq_len = *std::max_element(state_->h_context_length, state_->h_context_length + batch_size) - 1;

    // `seq_limit_len_` will be compared to `step` instead of `sequence_length`, so the padding length must be
    // accounted for
    for (int i = 0; i < batch_size; ++i) {
        h_seq_limit_len_[i] = state_->seq_len_limit[i] + (max_context_len - state_->h_context_length[i]);
    }
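    // e.g. with max_context_len = 10, a slot with context length 7 and seq_len_limit 20 gets
    // h_seq_limit_len = 20 + (10 - 7) = 23, compensating for the 3 steps of padding before its
    // first generated token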
    Copy(h_seq_limit_len_, batch_size, seq_limit_len_);
    Copy(state_->h_finished, batch_size, finished_buf_);
    for (int i = 0; i < batch_size; ++i) {
        Tensor& output_ids         = state_->requests[i]->outputs[rank_].at("output_ids");
        int*    req_output_ids_ptr = output_ids.getPtr<int>();
        int*    req_seqlen_ptr     = state_->requests[i]->outputs[rank_].getPtr<int>("sequence_length");

        h_request_output_ids_ptrs_[i] = req_output_ids_ptr;
        h_request_output_ids_lens_[i] = output_ids.shape.at(2);
        h_request_seqlen_ptrs_[i]     = req_seqlen_ptr;

        FT_CHECK(h_request_output_ids_ptrs_[i]);
        FT_CHECK(h_request_output_ids_lens_[i]);
        FT_CHECK(h_request_seqlen_ptrs_[i]);
    }
    Copy(h_request_output_ids_ptrs_, batch_size, request_output_ids_ptrs_);
    Copy(h_request_output_ids_lens_, batch_size, request_output_ids_lens_);
    Copy(h_request_seqlen_ptrs_, batch_size, request_seqlen_ptrs_);

    Copy(state_->h_rope_theta, batch_size, rope_theta_);

    // ! range of step_ [1, 2 * session_len]
    // consider a sequence with context_len == session_len and another sequence with context_len == 1 and
    // request_output_len == session_len - 1 => step_ will loop in [session_len, 2 * session_len)
    const int start_step = max_context_len;

    if (rank_ == 0) {
        TM_LOG_INFO("[initGen] batch_size = %d", (int)batch_size);
        TM_LOG_INFO("[initGen] max_context_len = %d", (int)max_context_len);
        if (debug_) {
            TM_LOG_INFO("[initGen] slot  sequence_id  context_len  seq_limit_len  finished");
            for (int i = 0; i < batch_size; ++i) {
                TM_LOG_INFO("[initGen] %4d  %11ld  %11d  %13d  %8d",
                            i,
                            (long)state_->sequences[i]->id,
                            state_->h_context_length[i],
                            (int)h_seq_limit_len_[i],
                            (int)state_->h_finished[i]);
            }
        }
    }

    return GenerationState{max_context_len, start_step, sum_seq_len, max_seq_len};
}

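// One decoding step for the whole active batch: embedding lookup of the previous tokens, decoder
// forward over the paged KV cache, logits projection, and dynamic decode (sampling, stop criteria,
// stop/bad words). Returns false once dynamic decode signals that generation should stop.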
template<typename T>
bool LlamaBatch<T>::Generate(GenerationState& g)
{
    NvtxScope scope("Generate");
    const int batch_size = state_->active_size;

    constexpr int kLogInterval = 10;
    if (rank_ == 0 && (g.step - 1) % kLogInterval == 0) {
        TM_LOG_INFO("------------------------- step = %d -------------------------", g.step - 1);
    }

    const bool is_first_step = (g.step == g.max_init_ctx_len);

    std::vector<int> prev;
    if (debug_ && rank_ == 0 && is_first_step) {
        prev.resize(batch_size);
        Copy(token_ids_buf_ + (g.step - 1) * batch_size, batch_size, prev.data());
    }

    // embeddingLookup(step_ - 1);
    model_->embeddingLookup(decoder_input_buf_,  //
                            token_ids_buf_,
                            batch_size,
                            g.step - 1);
    model_->decoderForward(decoder_output_buf_,
                           k_block_ptrs_,
                           v_block_ptrs_,
                           decoder_input_buf_,
                           sequence_lengths_,
                           finished_buf_,
                           cu_block_counts_,
                           rope_theta_,
                           g.step,
                           0,
                           g.sum_seq_len,
                           g.max_seq_len,
                           batch_size);
    model_->postDecodeEmbedding(logits_buf_,  //
                                local_logits_buf_,
                                decoder_output_buf_,
                                batch_size);
    /// sync for better NVTX visualization, THIS IS NOT NEEDED
    // check_cuda_error(cudaStreamSynchronize(stream_));

    // stop-words & bad-words require the matched tokens to be contiguous, so item size > 1 is
    // not supported yet.
    bool should_stop{};
    model_->dynamicDecode(token_ids_buf_,
                          finished_buf_,
                          sequence_lengths_,
                          &should_stop,
                          state_->curand_state,
                          &inputs_,
                          &outputs_,
                          logits_buf_,
                          seq_limit_len_,
                          context_length_buf_,
                          d_end_ids_buf_,
                          g.step,
                          0,
                          g.max_init_ctx_len,
                          session_len_ * 2,
                          batch_size);

    if (debug_ && rank_ == 0) {
        std::vector<int> curr(batch_size);
        Copy(token_ids_buf_ + g.step * batch_size, batch_size, curr.data());
        cudaStreamSynchronize(stream_);

        if (is_first_step) {
            std::stringstream sprev;
            for (int k = 0; k < prev.size(); ++k) {
                sprev << std::setw(6) << prev[k];
            }
            TM_LOG_INFO("[ lookup ] step = %d, [%s]", g.step - 1, sprev.str().c_str());
Li Zhang's avatar
Li Zhang committed
1022
1023
1024
1025
1026
1027
        }

        std::stringstream scurr;
        for (int k = 0; k < curr.size(); ++k) {
            scurr << std::setw(6) << curr[k];
        }
Li Zhang's avatar
Li Zhang committed
1028
        TM_LOG_INFO("[generate] step = %d, [%s]", g.step - 1, scurr.str().c_str());
Li Zhang's avatar
Li Zhang committed
1029
1030
1031
    }

    ////////////////////////////////////////////////
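    // One more token has been generated for every sequence, so `step` and
    // `max_seq_len` advance by 1 while `sum_seq_len` grows by `batch_size`,
    // matching the values passed to the next `decoderForward` call.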
    /// ! increase the counters
    g.step += 1;
    g.max_seq_len += 1;
    g.sum_seq_len += batch_size;

    return !should_stop;
}

template<typename T>
void LlamaBatch<T>::ContextDecode()
{
    NvtxScope  _("prefill");
    const auto batch_size = state_->active_size;

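    // Scan for swapped-in sequences whose KV cache lags behind their context
    // length; the missing tokens are the ones prefilled below.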
    int base = -1;
    for (int i = 0; i < batch_size; ++i) {
        if (state_->is_swap_in[i]) {
            const auto& seq = *state_->sequences[i];
            dbg(std::tuple(i, state_->h_context_length[i], seq.cache_len));
            if (const int missing = state_->h_context_length[i] - seq.cache_len; missing > 1) {
                base = base < 0 ? i : base;
                dbg(seq.tokens, seq.cache_len);
                Copy(state_->output_ids + i * session_len_ + seq.cache_len, missing, input_ids_buf_ + i * session_len_);
                // leave out the last input token here; it is processed by the decoder during generation
                h_input_length_buf_[i] = missing - 1;
            }
        }
    }
    if (base < 0) {
        // TM_LOG_INFO("[decodeContext] Context decoding is not needed.");
        return;
    }

    const int context_decode_count = batch_size - base;

    Copy(state_->h_context_length, batch_size, context_length_buf_);
    Copy(state_->h_rope_theta, batch_size, rope_theta_);
    Copy(h_input_length_buf_, batch_size, input_length_buf_);

    // check_cuda_error(cudaStreamSynchronize(stream_));
    // const auto tick = std::chrono::high_resolution_clock::now();

    if (rank_ == 0) {
        TM_LOG_INFO("[decodeContext] base = %d, count = %d", base, context_decode_count);
    }
    // decrease the context lengths by 1 to skip the last input token (it is processed by the decoder during generation)
    invokePlusScalar(context_length_buf_ + base, -1, context_decode_count, stream_);

    // find sub-batch offsets
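    // Greedy partition: keep growing the current sub-batch while both the total
    // input token count and `max_context_len * size` fit within
    // `max_context_token_num_`; otherwise a new sub-batch starts at sequence `i`.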
    std::vector<int> offsets{base};
    std::vector<int> max_context_cnts;
    int              accum_size        = 0;
    int              accum_input_count = 0;
    int              max_context_count = 0;
    for (int i = base; i < batch_size; ++i) {
        int size          = accum_size + 1;
        int input_count   = accum_input_count + h_input_length_buf_[i];
        int context_count = std::max(max_context_count, state_->h_context_length[i] - 1);
        // we have `cu_seqlens` on q so no padding for input is needed
        // kernels are expecting uniform k/v cache length -> `max_context_count * size <= max_context_token_num_`
        if (input_count <= max_context_token_num_ && context_count * size <= max_context_token_num_) {
            accum_size        = size;
            accum_input_count = input_count;
            max_context_count = context_count;
        }
        else {
            offsets.push_back(i);
            max_context_cnts.push_back(max_context_count);
            accum_size        = 1;
            accum_input_count = h_input_length_buf_[i];
            max_context_count = state_->h_context_length[i] - 1;
        }
    }
    offsets.push_back(batch_size);
    max_context_cnts.push_back(max_context_count);

    dbg(offsets, max_context_cnts);

    // context decode on sub-batches
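    // Each sub-batch stages its K/V into the temporary buffers with a uniform
    // per-sequence stride of `max_context_cnts[k]`, as the kernels expect.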
    for (int k = 0; k < offsets.size() - 1; ++k) {
        int              first          = offsets[k];
        int              last           = offsets[k + 1];
        int              sub_batch_size = last - first;
        T*               k_ptr          = tmp_k_cache_buf_;
        T*               v_ptr          = tmp_v_cache_buf_;
        std::vector<int> decode_indices{};
        std::vector<int> decode_lengths{};
        int              max_input_len{};
        auto             input_ids = context_decoder_ids_buf_;
        TM_LOG_INFO("first = %d, last = %d", first, last);
        for (int i = first; i < last; ++i) {
            // TM_LOG_INFO("session_len = %d, input_length = %d", session_len_, h_input_length_buf_[i]);
            input_ids = Copy(input_ids_buf_ + i * session_len_, h_input_length_buf_[i], input_ids);
            dbg(i, h_input_length_buf_[i]);
            h_tmp_k_ptrs_[i] = k_ptr;
            h_tmp_v_ptrs_[i] = v_ptr;
            k_ptr += model_->local_kv_head_num_ * max_context_cnts[k] * model_->size_per_head_;
            v_ptr += model_->local_kv_head_num_ * max_context_cnts[k] * model_->size_per_head_;
            decode_indices.push_back(i);
            decode_lengths.push_back(h_input_length_buf_[i]);
            max_input_len = std::max(max_input_len, h_input_length_buf_[i]);
        }
        int token_count = input_ids - context_decoder_ids_buf_;
        dbg(token_count, max_input_len, max_context_cnts[k]);

        Copy(h_tmp_k_ptrs_ + first, sub_batch_size, tmp_k_ptrs_ + first);
        Copy(h_tmp_v_ptrs_ + first, sub_batch_size, tmp_v_ptrs_ + first);

        if (rank_ == 0) {
            TM_LOG_INFO(
                "[decodeContext] offset = %d, batch_size = %d, token_num = %d, max_input_len = %d, max_context_len = %d",
                first,
                sub_batch_size,
                token_count,
                max_input_len,
                max_context_cnts[k]);
        }

        dbg(first, last);
        dbg(k_block_ptrs_, v_block_ptrs_);

        model_->contextDecode(nullptr,
                              k_block_ptrs_,
                              v_block_ptrs_,
                              tmp_k_ptrs_ + first,
                              tmp_v_ptrs_ + first,
                              context_decoder_input_buf_,
                              context_decoder_output_buf_,
                              context_decoder_ids_buf_,
                              input_length_buf_ + first,
                              context_length_buf_ + first,
                              cu_block_counts_ + first,
                              rope_theta_ + first,
                              token_count,
                              max_input_len,
                              max_context_cnts[k],
                              max_context_cnts[k],
                              sub_batch_size);

        // compute logits of inputs if requested
        OutputContextLogits(context_decoder_output_buf_, decode_indices, decode_lengths);
    }

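    // restore the true context lengths that were decremented before prefill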
    invokePlusScalar(context_length_buf_ + base, 1, context_decode_count, stream_);

    std::fill(h_input_length_buf_ + base, h_input_length_buf_ + batch_size, 0);

    // `SequenceManager` needs real-time value of cache length
    for (int i = base; i < batch_size; ++i) {
        if (state_->requests[i]) {
            FT_CHECK(state_->sequences[i]);
            state_->sequences[i]->cache_len = state_->h_context_length[i] - 1;  // -1 since we skip last token
        }
    }

    // check_cuda_error(cudaStreamSynchronize(stream_));
    // const auto tock = std::chrono::high_resolution_clock::now();
    // if (rank_ == 0) {
    //     TM_LOG_INFO("[decodeContext] %.2f ms", std::chrono::duration<float, std::milli>(tock - tick).count());
    // }
}

template<typename T>
void LlamaBatch<T>::OutputContextLogits(T*                      context_decoder_output,
                                        const std::vector<int>& indices,
                                        const std::vector<int>& lengths)
{
    std::vector<float*> output_logits;
    int                 num_token = 0;
    {
        bool is_return_logits = false;
        for (int k = 0; k < indices.size(); ++k) {
            auto& request = state_->requests[indices[k]];
            output_logits.push_back(request->outputs[rank_].getPtr<float>("logits", nullptr));
            num_token += lengths[k];
            if (output_logits.back()) {
                is_return_logits = true;
            }
        }
        if (!is_return_logits) {
            return;
        }
    }

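    // Lazily allocate the context-logits buffers on first use; under tensor
    // parallelism a second buffer holds this rank's shard of the vocabulary.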
    if (context_logits_buf_ == nullptr) {
        NcclGuard guard(model_->tensor_para_, stream_, true);
        context_logits_buf_ =
            (float*)allocator_->malloc(sizeof(float) * model_->vocab_size_padded_ * max_context_token_num_);
        const auto tp = model_->tensor_para_.world_size_;
        if (tp > 1) {
            FT_CHECK(model_->vocab_size_padded_ % tp == 0);
            const auto local_vocab_size = model_->vocab_size_padded_ / tp;
            local_context_logits_buf_ =
                (float*)allocator_->malloc(sizeof(float) * local_vocab_size * max_context_token_num_);
        }
    }

    model_->postDecodeEmbedding(context_logits_buf_, local_context_logits_buf_, context_decoder_output, num_token);

    auto logits = context_logits_buf_;

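    // Per-token rows in `context_logits_buf_` are `vocab_size_padded_` apart;
    // only requests that provided a "logits" output buffer receive a copy.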
    for (int k = 0; k < indices.size(); ++k) {
        if (output_logits[k]) {
            Copy(logits, model_->vocab_size_ * lengths[k], output_logits[k]);
        }
        logits += model_->vocab_size_padded_ * lengths[k];
    }
}

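// Gather the tokens generated in this round into the output buffers, sync the
// results to the host and build the streaming/completion signals for the batch.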
template<typename T>
auto LlamaBatch<T>::Finish(GenerationState& g, int& finished_count) -> std::vector<Signal>
{
    NvtxScope scope("Finish");
    const int batch_size = state_->active_size;

    // [s,b] -> [b,s] and skip padding in [context_len, max_context_len)
    invokeGatherOutput(state_->output_ids,
                       token_ids_buf_,
                       context_length_buf_,
                       g.max_init_ctx_len,
                       g.step,
                       session_len_,
                       batch_size,
                       stream_);
    sync_check_cuda_error();

    Copy(state_->output_ids, batch_size * session_len_, h_output_ids_);
    Copy(finished_buf_, batch_size, state_->h_finished);
    Copy(sequence_lengths_, batch_size, state_->h_context_length);

    check_cuda_error(cudaStreamSynchronize(stream_));

    // `SequenceManager` needs real-time value of cache length
    // ! Must be done before incrementing `h_context_length` because the generated token is NOT kv-cached yet
    for (int i = 0; i < batch_size; ++i) {
        if (state_->requests[i]) {
            FT_CHECK(state_->sequences[i]);
            state_->sequences[i]->cache_len = state_->h_context_length[i];
        }
    }

    // invariant: context_length = sequence_length + 1, so that `h_context_length` includes all tokens
    // (including the one just generated)
    for (int i = 0; i < batch_size; ++i) {
        ++state_->h_context_length[i];
    }

    {  // set output token ids and sequence length
        int* output_ptr = h_output_ids_;
        for (int i = 0; i < batch_size; ++i) {
            if (state_->requests[i] && (state_->requests[i]->stream_cb || state_->h_finished[i])) {
                const int count = state_->h_context_length[i];
                // TODO: sync history output tokens when receiving the request and copy only the last token here
                std::copy(output_ptr, output_ptr + count, h_request_output_ids_ptrs_[i]);
                *h_request_seqlen_ptrs_[i] = count;
            }
            output_ptr += session_len_;
        }
    }

    if (debug_ && rank_ == 0) {
        std::stringstream ss;
        for (int i = 0; i < batch_size; ++i) {
            ss << (i ? ", " : "") << "(" << state_->h_context_length[i] << "," << state_->h_finished[i] << ")";
        }
        TM_LOG_INFO("[finish] [%s]", ss.str().c_str());
    }

    std::vector<Signal> signals;
    {
        NvtxScope _("stream_and_completion_signal");
        for (int i = 0; i < batch_size; ++i) {
            if (state_->requests[i]) {
                if (state_->h_finished[i]) {
                    // Interrupt finished sequences and move the request handle into the signal closure
                    signals.push_back(Interrupt(i));
                    ++finished_count;
                }
                else if (state_->requests[i]->stream_cb) {
                    // Create signals by copying the request handles for non-finished streaming requests
                    signals.push_back([this, r = state_->requests[i]] {
                        if (rank_ == 0) {
                            r->stream_cb(&r->outputs[rank_].get());
                        }
                    });
                }
            }
        }
        if (finished_count) {
            // synchronize for interrupted sequences
            check_cuda_error(cudaStreamSynchronize(stream_));
        }
    }
    return signals;
}

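// Detach slot `index` from the batch. Ending sequences are erased from the
// sequence manager; suspended ones persist their tokens and sampling RNG state
// so that they can be resumed later.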
template<typename T>
auto LlamaBatch<T>::Interrupt(int index, bool force_stop, bool force_end) -> Signal
{
    if (rank_ == 0) {
        TM_LOG_INFO("[Interrupt] slot = %d, id = %ld", index, (long)state_->requests[index]->id);
    }

    if (debug_ && rank_ == 0) {
        std::vector<int> tokens(state_->h_context_length[index]);
        Copy(state_->output_ids + index * session_len_, tokens.size(), tokens.data());
        cudaStreamSynchronize(stream_);
        std::stringstream ss;
        for (const auto& t : tokens) {
            ss << " " << t;
        }
        TM_LOG_INFO("[Interrupt] slot %d, tokens [%s]", index, ss.str().c_str());
    }

    if (state_->requests[index]->end_flag || force_end) {
        // Sequence is ending this round or a stop request is issued to end it
        FT_CHECK(sequence_manager_->Erase(state_->requests[index]->id));
    }
    else {
        const int output_len = state_->h_context_length[index];
        auto&     seq        = *state_->sequences[index];

        // Update token IDs
        seq.tokens.resize(output_len);
        const auto output_ids_data = state_->requests[index]->outputs[rank_].at("output_ids").getPtr<int>();
        std::copy_n(output_ids_data, output_len, seq.tokens.data());

        // Save random state in host memory
        seq.random_state.resize(sizeof(curandState_t));
        // This async copy must be synchronized by the caller
        Copy(state_->curand_state + index, 1, (curandState_t*)seq.random_state.data());

        // Set the unlock flag for the corresponding blocks; they will be unlocked in the next `Materialize()`
        sequence_manager_->UpdateAndSetUnlock(seq);
    }

    state_->sequences[index] = nullptr;

    // move the request handle into the signal
    return [this, r = std::move(state_->requests[index])] {
        if (rank_ == 0) {
            r->signal.set_value(0);
        }
    };
}

template<typename T>
void LlamaBatch<T>::InternalThreadEntry(int device_id)
{
    // TM_LOG_INFO("[InternalThreadEntry] %d", (int)rank_);
    check_cuda_error(cudaSetDevice(device_id));

    auto& shared_state = model_->shared_state_;

    auto& request_queue  = shared_state->request_queue;
    auto& infer_requests = shared_state->infer_requests;
    auto& stop_requests  = shared_state->stop_requests;

    // number of sequences that have finished but are still counted in the state's size
    int finished_count = 0;

    GenerationState g{};

    constexpr int request_interval = 1;
    long          request_counter  = 0;

    while (1) {
        if (rank_ == 0) {
            const int  free_slot_count = max_batch_size_ - state_->size + finished_count;
            const bool is_empty        = (free_slot_count == max_batch_size_);
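            // Poll the request queue; `dequeue` blocks only when no sequences
            // are active, so an idle engine sleeps instead of spinning.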
            stop_requests.clear();
            infer_requests.clear();
            if (is_empty || request_counter % request_interval == 0) {
                // Block if batch is empty
                request_queue.dequeue(stop_requests, infer_requests, free_slot_count, is_empty, shared_state->abort);
                if (!shared_state->abort) {
                    RejectInvalidRequests(stop_requests, infer_requests);
                }
            }
        }

        NvtxScope scope("mainloop");

        // wait while rank-0 is dequeueing
        shared_state->barrier->wait();

        if (shared_state->abort) {
            TM_LOG_INFO("[InternalThreadEntry] stop requested.");
            return;
        }

        auto signals = ProcessStopRequests(stop_requests);

        // Shared `priority` field will be assigned by rank-0
        ProcessInferRequests(infer_requests);

        // Wait while shared `requests` is being used
        shared_state->barrier->wait();

        SendSignals(std::move(signals));

        auto modified = Initialize();
        // finished sequences are handled by `Initialize()`
        finished_count = 0;

        if (state_->active_size) {

            ContextDecode();

            if (modified) {
                g = InitializeGeneration();
                InitializeSampling();
            }

            for (int i = 0; i < step_length_; ++i) {
                if (!Generate(g)) {
                    break;
                }
            }

            if (auto signals = Finish(g, finished_count); !signals.empty()) {
                if (finished_count) {
                    // Finished requests and corresponding output tensors will be released when notified
                    // wait for all ranks to ensure no rank (except for output thread) will access related
                    // resources
                    shared_state->barrier->wait();
                }
                SendSignals(std::move(signals));
            }
        }

        ++request_counter;
    }

    FT_CHECK(0);
}

template<typename T>
void LlamaBatch<T>::SendSignals(std::vector<Signal> signals)
{
    if (rank_ != 0 || signals.empty()) {
        return;
    }
    {
        std::lock_guard lock{output_mutex_};
        output_signals_.insert(output_signals_.end(),  //
                               std::move_iterator{signals.begin()},
                               std::move_iterator{signals.end()});
    }
    output_cv_.notify_one();
}

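// Typical driver usage (illustrative sketch only; the submission step below is
// hypothetical and not part of this file):
//
//   batch.Start();  // spawns the worker thread and, on rank 0, the output thread
//   // ... submit requests through the shared state's RequestQueue ...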
template<typename T>
void LlamaBatch<T>::Start()
{
    TM_LOG_INFO("LlamaBatch<T>::Start()");
    int device_id = -1;
    check_cuda_error(cudaGetDevice(&device_id));
    internal_thread_ = std::thread(&LlamaBatch::InternalThreadEntry, this, device_id);
    if (rank_ == 0) {
        output_thread_ = std::thread(&LlamaBatch::OutputThreadEntry, this);
    }
}

template<typename T>
void LlamaBatch<T>::OutputThreadEntry()
{
    while (true) {
        std::vector<Signal> signals;
        {
            // Wait for signals to come
            std::unique_lock lock(output_mutex_);
            output_cv_.wait(lock, [&] { return !output_signals_.empty() || output_stop_token_; });
            if (output_stop_token_) {
                TM_LOG_INFO("[OutputThreadEntry] stop requested.");
                return;
            }
            signals = std::move(output_signals_);
        }
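        // `ffi_lock_(1)` / `ffi_lock_(0)` bracket the callbacks below; presumably
        // this serializes re-entry into the FFI layer (e.g. taking the Python GIL)
        // before user callbacks run on this thread.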
        if (rank_ == 0 && model_->ffi_lock_) {
            model_->ffi_lock_(1);
        }
        // invoke stream cbs & signals
        for (const auto& s : signals) {
            s();
        }
        if (rank_ == 0 && model_->ffi_lock_) {
            model_->ffi_lock_(0);
        }
    }
}

template class LlamaBatch<half>;
template class LlamaBatch<float>;

}  // namespace turbomind