// Copyright (c) OpenMMLab. All rights reserved.

#include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/kernels/decoding_kernels.h"
#include "src/turbomind/kernels/sampling_topk_kernels.h"
#include "src/turbomind/macro.h"
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/LlamaV2.h"
#include "src/turbomind/models/llama/Request.h"
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/llama_kernels.h"
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/debug_utils.h"
#include "src/turbomind/utils/gemm_test/gemm_func.h"
#include "src/turbomind/utils/logger.h"
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <iomanip>
#include <iterator>
#include <mutex>
#include <numeric>
#include <sstream>
#include <unordered_map>
#include <utility>

namespace turbomind {

void ClearState(BatchState& s)
{
    std::fill_n(s.requests.begin(), s.size, nullptr);
    std::fill_n(s.sequences.begin(), s.size, nullptr);
    s.size = s.active_size = 0;
}

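// Validates incoming requests before they enter the batch. A request is
// rejected with kConflict when the same id occurs more than once in this
// round, with kInvalid when it both starts and stops a session (or continues
// an unknown sequence), and with kTooLong when history + input would exceed
// `session_len_`. Rejected entries are signaled and compacted out in place.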
template<typename T>
void LlamaBatch<T>::RejectInvalidRequests(Requests& stop_reqs, Requests& infer_reqs)
{
    std::unordered_map<uint64_t, int> occurrence;

    auto count_occurrence = [&occurrence](const Requests& rs) {
        for (const auto& r : rs) {
            ++occurrence[r->id];
        }
    };

    auto reject = [](const char* type, std::shared_ptr<Request>& req, int ec) {
        TM_LOG_WARNING(
            "[RejectInvalidRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec);
        req->signal.set_value(ec);
        req.reset();
    };

    auto handle_conflict_or_invalid = [this, &occurrence, &reject](Requests& rs, const char* type) {
        for (auto& r : rs) {
            if (r) {
                int ec = 0;

                const int  input_length = r->inputs[rank_].getVal<int>("input_lengths", 0);
                const auto get_offset   = [&](int token_count) {
                    return std::max(0, std::min(token_count, r->inputs[rank_].getVal<int>("step", token_count)));
                };

                if (occurrence[r->id] != 1) {
                    ec = Request::kConflict;
                }
                else if (r->start_flag && r->stop_flag) {
                    ec = Request::kInvalid;
                }
                else if (input_length > session_len_) {
                    ec = Request::kTooLong;
                }
                else if (!r->start_flag) {
                    if (auto seq = sequence_manager_->Get(r->id); seq == nullptr) {
                        ec = Request::kInvalid;
                    }
                    else if (get_offset(seq->tokens.size()) + input_length > session_len_) {
                        ec = Request::kTooLong;
                    }
                }

                if (ec) {
                    reject(type, r, ec);
                }
            }
        }
    };

    auto drop_invalid = [](Requests& rs) {
        int count = 0;
        for (int i = 0; i < rs.size(); ++i) {
            if (rs[i]) {
                rs[count++] = std::move(rs[i]);
            }
        }
        rs.resize(count);
    };

    count_occurrence(stop_reqs);
    count_occurrence(infer_reqs);

    if (!stop_reqs.empty()) {
        handle_conflict_or_invalid(stop_reqs, "stop");

        // invalidate stop-only requests for inactive sequences
        for (auto& r : stop_reqs) {
            if (r && r->end_flag == false) {
                int ec = Request::kInactive;
                for (int i = 0; i < state_->size; ++i) {
                    if (state_->requests[i] && state_->requests[i]->id == r->id) {
                        ec = 0;
                        break;
                    }
                }
                if (ec) {
                    reject("stop", r, ec);
                }
            }
        }

        drop_invalid(stop_reqs);
    }

    if (!infer_reqs.empty()) {
        handle_conflict_or_invalid(infer_reqs, "infer");

        // invalidate requests for busy sequences
        for (auto& r : infer_reqs) {
            if (r) {
                for (int i = 0; i < state_->size; ++i) {
                    if (state_->requests[i] && state_->requests[i]->id == r->id) {
                        reject("infer", r, Request::kBusy);
                        break;
                    }
                }
            }
        }

        drop_invalid(infer_reqs);
    }
}

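// Stops matching active sequences (optionally erasing them when `end_flag` is
// set) and, for inactive sequences, erases them from the sequence manager
// directly. Completion is reported back through each request's promise, which
// only rank 0 fulfills.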
template<typename T>
auto LlamaBatch<T>::ProcessStopRequests(const Requests& requests) -> std::vector<Signal>
{
    NvtxScope           scope("stop_request");
    std::vector<Signal> signals;
    int                 count = 0;
    for (const auto& r : requests) {
        int ec = Request::kFail;
        // find matching active sequence
        for (int i = 0; i < state_->size; ++i) {
            // stop & optionally erase active sequence
            if (state_->requests[i] && state_->requests[i]->id == r->id) {
                ec = 0;
160
                signals.push_back(Interrupt(i, true, r->end_flag));
                ++count;
                break;
            }
        }
        // no matching active sequence; try to erase an inactive one instead, in which case there is no active request to interrupt
        if (ec && r->end_flag) {
            if (sequence_manager_->Erase(r->id)) {
                ec = 0;
            }
        }
        signals.push_back([=] {
            if (rank_ == 0) {
                r->signal.set_value(ec);
            }
        });
    }
    if (count) {
        check_cuda_error(cudaStreamSynchronize(stream_));
    }
    return signals;
}

template<typename T>
void LlamaBatch<T>::ProcessInferRequests(const Requests& requests)
{
    NvtxScope scope("infer_request");
    auto&     state = *incoming_;

    FT_CHECK(state.size == 0);
    FT_CHECK(state.active_size == 0);

    std::vector<int> existing_idx;

    int idx = 0;
    for (const auto& r : requests) {
        FT_CHECK(!state.requests[idx]);

        if (rank_ == 0) {
            TM_LOG_WARNING("[ProcessInferRequests] Request for %ld received.", (long)r->id);
        }

        state.requests[idx] = r;

        // get sequence for the request
        state.sequences[idx] = r->start_flag ? sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id);
        FT_CHECK(state.sequences[idx]);

        auto& seq = *state.sequences[idx];

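        // A non-negative `step` rolls the sequence back: tokens beyond `step`
        // are dropped and the cached KV length is clamped to match.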
        if (int step = r->inputs[rank_].getVal<int>("step", -1); step >= 0) {
            if (step <= seq.tokens.size()) {
                seq.tokens.resize(step);
                seq.cache_len = std::min(seq.cache_len, step);
            }
            else if (rank_ == 0) {
                TM_LOG_WARNING(
                    "[ProcessInferRequests] Skipping invalid step (%d) setting for ID %ld", step, (long)seq.id);
            }
        }

        const int  input_length = r->inputs[rank_].getVal<int>("input_lengths");
        const int* input_ids    = r->inputs[rank_].getPtr<int>("input_ids");

        // `output_ids` contains all token ids of the sequences
        const auto output_ids_base = state.output_ids + session_len_ * idx;
        auto       output_ids      = output_ids_base;

        // copy history tokens
        if (!seq.tokens.empty()) {
            output_ids = Copy(seq.tokens.data(), seq.tokens.size(), output_ids);
        }

        // copy input tokens
        if (input_length) {
            output_ids = Copy(input_ids, input_length, output_ids);
        }

        // total context length (history + input)
        state.h_context_length[idx] = output_ids - output_ids_base;
        state.h_finished[idx]       = false;

        const int request_output_len = state.requests[idx]->inputs[rank_].getVal<int>("request_output_len");
        state.seq_len_limit[idx]     = state.h_context_length[idx] + request_output_len;
        // `length_criterion` sets the finish flag when step >= seq_limit_len; however, when step == seq_limit_len
        // the actual sequence length is seq_limit_len + 1, so seq_limit_len must be truncated to session_len - 1
        if (state.seq_len_limit[idx] >= session_len_) {
            state.seq_len_limit[idx] = session_len_ - 1;
            if (rank_ == 0) {
                const int trunc_output_len = state.seq_len_limit[idx] - state.h_context_length[idx];
                TM_LOG_WARNING(
                    "[ProcessInferRequests] [%ld] total sequence length (%d + %d) exceeds `session_len` (%d), `request_output_len` is truncated to %d",
                    (long)seq.id,
                    state.h_context_length[idx],
                    request_output_len,
                    (int)session_len_,
                    trunc_output_len);
            }
        }

        // compute rope scaling factor
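        // The adjustment below follows the NTK-aware RoPE scaling rule
        // theta' = theta * s^(d/(d-2)): scaling the rotary base instead of
        // interpolating positions extends the usable context window.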
        if (r->start_flag) {
            seq.rope_theta      = model_->attn_params_.rotary_embedding_base;
            auto scaling_factor = 1.f;
            if (r->inputs[rank_].isExist("rope_scaling_factor")) {  // runtime scaling factor
                scaling_factor = r->inputs[rank_].getVal<float>("rope_scaling_factor");
            }
            else if (model_->attn_params_.rope_scaling_factor >= 1.f) {  // infer by `seq_len_limit`
                scaling_factor   = model_->attn_params_.rope_scaling_factor;
                auto max_seq_len = state.seq_len_limit[idx];
                auto max_pos_emb = model_->attn_params_.max_position_embeddings;
                if (max_seq_len > max_pos_emb) {
                    scaling_factor = scaling_factor * max_seq_len / max_pos_emb - (scaling_factor - 1);
                    // scaling_factor = std::max(exp2f(ceilf(log2f((float)max_seq_len / max_pos_emb) + 1.f))
                    // - 1.f, 1.f);
                }
            }
            if (scaling_factor != 1.f) {
                float rope_dim = model_->attn_params_.rotary_embedding_dim;
                seq.rope_theta *= powf(scaling_factor, rope_dim / (rope_dim - 2.f));
                TM_LOG_INFO("[ProcessInferRequests] %ld rope_scaling_factor: %f, rope_theta = %f",
                            (long)seq.id,
                            scaling_factor,
                            seq.rope_theta);
            }
        }
        state.h_rope_theta[idx] = seq.rope_theta;

        if (r->start_flag) {
            // prepare to initialize random state for new sequence
            h_random_seed_[idx] = r->inputs[rank_].getVal<unsigned long long>("random_seed", 0);
        }
        else {
            // Recover device states if not a new sequence
            h_curand_state_[existing_idx.size()] = *(curandState_t*)seq.random_state.data();
            existing_idx.push_back(idx);
        }

        // ! SHARED STATE IS MODIFIED, BARRIER SYNCHRONIZATION REQUIRED
        // assign priority based on arrival time
        if (rank_ == 0) {
            r->priority = request_count_++;
        }

        // increment pointer
        idx++;
    }

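    // Sampling RNG state: new sequences are seeded on device from
    // `random_seed`, while resumed sequences restore the curandState_t
    // snapshot kept in `Sequence::random_state` and scatter it back to the
    // proper batch slots.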
    state.size = idx;

    // when there are new sequences
    if (state.size != existing_idx.size()) {
        // copy random seeds to device
        Copy(h_random_seed_, state.size, d_random_seed_);
        // initialize random states
        invokeCurandBatchInitialize(state.curand_state, state.size, d_random_seed_, stream_);
        sync_check_cuda_error();
    }

    if (!existing_idx.empty()) {
        // copy existing curand states to device
        Copy(h_curand_state_, existing_idx.size(), d_curand_state_);
        // insert the states to their correct positions in the batch
        IndexedCopy({}, existing_idx, std::tuple{d_curand_state_, state.curand_state, 1});
    }
}

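// Reconciles batch state with the sequence manager once per iteration: gathers
// sequences from the current and incoming states, lets the manager materialize
// KV blocks (possibly swapping sequences in/out), partitions active sequences
// to the front with swap-ins last (ordered by missing context length), and
// compacts everything into the back buffer before swapping it in. Returns true
// when generation and sampling must be re-initialized.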
template<typename T>
bool LlamaBatch<T>::Initialize()
{
    NvtxScope                                scope("initialize");
    std::vector<const Sequence*>             sequences;
    std::vector<Sequence::Status>            status;
    std::vector<uint64_t>                    priorities;
    std::vector<int>                         context_lengths;
    std::vector<std::pair<BatchState*, int>> coords;

    // count the holes introduced by finished requests from the previous iteration or by stop requests from
    // the current iteration
    int holes{};
    int active_holes{};
    for (int i = 0; i < state_->size; ++i) {
        if (!state_->requests[i]) {
            ++holes;
            if (i < state_->active_size) {
                ++active_holes;
            }
        }
    }

    // dbg(holes, active_holes);

    auto process = [&](BatchState* state) {
        for (int i = 0; i < state->size; ++i) {
            if (auto& r = state->requests[i]) {
                sequences.push_back(state->sequences[i]);
                status.push_back(state->sequences[i]->status);
                priorities.push_back(r->priority);
                context_lengths.push_back(state->h_context_length[i]);
                coords.emplace_back(state, i);
                // clear swap-in flags
                state->is_swap_in[i] = 0;
            }
        }
    };

    process(state_);
    process(incoming_);

    auto outcome = sequence_manager_->Materialize(sequences, context_lengths, priorities, step_length_);

    if (outcome.allocation || outcome.swap_in || outcome.swap_out) {
        dbg(outcome);
    }

    bool exchange = outcome.swap_in + outcome.swap_out > 0;

    std::vector<int> idxs(sequences.size());
    std::iota(idxs.begin(), idxs.end(), 0);

    if (exchange || holes || incoming_->size) {
        // put active ones first
        auto active_end = std::stable_partition(idxs.begin(), idxs.end(), [&](int idx) {
            return sequences[idx]->status == Sequence::kActive;  // present status
        });

        // even all free blocks combined are not enough to hold a single sequence
        if (!sequences.empty()) {
            FT_CHECK_WITH_INFO(active_end != idxs.begin(), "Not enough blocks.");
        }

        // move swap-ins to the back
        auto swapin_beg = std::stable_partition(idxs.begin(), active_end, [&](int idx) {
            return status[idx] == Sequence::kActive;  // past status
        });

        // sort swap-ins according to missing length
        if (swapin_beg != active_end) {
            std::vector<int> missing_len(sequences.size());
            for (int i = 0; i < sequences.size(); ++i) {
                missing_len[i] = context_lengths[i] - sequences[i]->cache_len;
            }
            std::stable_sort(swapin_beg, active_end, [&](int i, int j) { return missing_len[i] < missing_len[j]; });
        }

        // Copy sequence states to back buffer
        FT_CHECK(back_->size == 0 && back_->active_size == 0);
        std::vector<std::tuple<BatchState*, BatchState*, int, int>> cpys;
        for (const auto& i : idxs) {
            auto& s = *sequences[i];
            if (exchange) {
                const auto& [state, idx] = coords[i];
                // mark swap-ins
                if (status[i] != Sequence::kActive && s.status == Sequence::kActive) {
                    state->is_swap_in[idx] = 1;
                }
            }
            if (s.status == Sequence::kActive) {
                ++back_->active_size;
            }
            cpys.emplace_back(coords[i].first, back_, coords[i].second, back_->size++);
        }
        CopyState(cpys);
        // Swap the buffers
        std::swap(state_, back_);

        ClearState(*back_);
        ClearState(*incoming_);
    }

    FT_CHECK(state_->size <= max_batch_size_);

    /// Update block ptrs when there were
    //  1. swap-in or swap-out
    //  2. holes in the active buffer
    //  3. new allocations (for existing active sequences)
    if (exchange || active_holes || outcome.allocation) {
        // Prepare intermediate buffers
        h_cu_block_counts_[0] = 0;

        auto k_ptrs = h_k_block_ptrs_;
        auto v_ptrs = h_v_block_ptrs_;

        const int batch_size = state_->active_size;

        for (int i = 0; i < batch_size; ++i) {
            const auto& seq = *state_->sequences[i];

            // cumulative num of blocks
            h_cu_block_counts_[i + 1] = h_cu_block_counts_[i] + seq.blocks.size();

            FT_CHECK_WITH_INFO(h_cu_block_counts_[i + 1] <= sequence_manager_->max_block_count(),
                               std::to_string(h_cu_block_counts_[i + 1]));

            k_ptrs = std::transform(seq.blocks.cbegin(), seq.blocks.cend(), k_ptrs, [&](auto p) {
                return reinterpret_cast<uintptr_t>(sequence_manager_->OffsetKey(p->data));
            });
            v_ptrs = std::transform(seq.blocks.cbegin(), seq.blocks.cend(), v_ptrs, [&](auto p) {
                return reinterpret_cast<uintptr_t>(sequence_manager_->OffsetVal(p->data));
            });
        }

        static_assert(sizeof(uintptr_t) == sizeof(void*));

        Copy(h_cu_block_counts_, batch_size + 1, cu_block_counts_);
        Copy(h_k_block_ptrs_, h_cu_block_counts_[batch_size], k_block_ptrs_);
        Copy(h_v_block_ptrs_, h_cu_block_counts_[batch_size], v_block_ptrs_);
    }

    /// Layout of the buffers is changed, generation & sampling need to be re-initialized for correctness when there
    /// were
    //  1. swap-in or swap-out
    //  2. holes in the active buffer
    return exchange || active_holes;
}

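// Batches per-slot state migration: entries are sorted so that copies sharing
// the same (source, destination) state pair form contiguous groups, and each
// group is issued as a single `IndexedCopy` over the device tensors; the
// host-side metadata is then copied entry by entry.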
template<typename T>
void LlamaBatch<T>::CopyState(const std::vector<std::tuple<BatchState*, BatchState*, int, int>>& desc)
{
    std::vector<int> idxs(desc.size());
    std::iota(idxs.begin(), idxs.end(), 0);

    std::sort(idxs.begin(), idxs.end(), [&](int i, int j) { return desc[i] < desc[j]; });

    auto get_signature = [&](int i) -> std::pair<BatchState*, BatchState*> {
        return std::make_pair(std::get<0>(desc[idxs[i]]), std::get<1>(desc[idxs[i]]));
    };

    std::vector<int> offsets;
    auto             current = get_signature(0);
    offsets.push_back(0);
    for (int i = 0; i < idxs.size(); ++i) {
        if (auto signature = get_signature(i); signature != current) {
            current = signature;
            offsets.push_back(i);
        }
    }
    offsets.push_back(idxs.size());

    for (int bi = 1; bi < offsets.size(); ++bi) {
        int beg = offsets[bi - 1];
        int end = offsets[bi];

        if (beg == end) {
            continue;
        }

        auto [s, d] = get_signature(beg);

        std::vector<int> s_idx;
        std::vector<int> d_idx;
        for (int i = beg; i < end; ++i) {
            s_idx.push_back(std::get<2>(desc[idxs[i]]));
            d_idx.push_back(std::get<3>(desc[idxs[i]]));
        }

        IndexedCopy(s_idx,
                    d_idx,
                    std::tuple{s->output_ids, d->output_ids, session_len_},
                    std::tuple{s->curand_state, d->curand_state, 1});
    }

    for (const auto& [s, d, si, di] : desc) {
        d->h_context_length[di] = s->h_context_length[si];
        d->h_finished[di]       = s->h_finished[si];
        d->h_rope_theta[di]     = s->h_rope_theta[si];
        d->seq_len_limit[di]    = s->seq_len_limit[si];
        d->sequences[di]        = s->sequences[si];
        d->is_swap_in[di]       = s->is_swap_in[si];
        d->requests[di]         = s->requests[si];
    }
}

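// Allocates the per-iteration working buffers, sized by the batch size,
// `session_len` and `max_context_token_num_`; the host-side staging and
// sampling scratch lives in `AllocatePersistantBuffer` instead.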
template<typename T>
void LlamaBatch<T>::AllocateBuffer(size_t batch_size, size_t session_len)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    const size_t batchxbeam = batch_size;

    const size_t hidden_units      = model_->hidden_units_;
    const size_t vocab_size        = model_->vocab_size_padded_;
    const size_t head_dim          = model_->size_per_head_;
    const size_t local_kv_head_num = model_->local_kv_head_num_;
    // +1 padding; BlockIterator does not use a predicate to stop, so it needs one extra slot
    const size_t max_block_count = sequence_manager_->max_block_count() + 1;

    context_decoder_input_buf_ =
        (T*)allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * max_context_token_num_ * hidden_units, false);
    context_decoder_output_buf_ =
        (T*)allocator_->reMalloc(context_decoder_output_buf_, sizeof(T) * max_context_token_num_ * hidden_units, false);
    context_decoder_ids_buf_ =
        (int*)allocator_->reMalloc(context_decoder_ids_buf_, sizeof(int) * max_context_token_num_, false);

    tmp_k_cache_buf_ = (T*)allocator_->reMalloc(
        tmp_k_cache_buf_, sizeof(T) * max_context_token_num_ * local_kv_head_num * head_dim, false);
    tmp_v_cache_buf_ = (T*)allocator_->reMalloc(
        tmp_v_cache_buf_, sizeof(T) * max_context_token_num_ * local_kv_head_num * head_dim, false);

    tmp_k_ptrs_ = (void**)allocator_->reMalloc(tmp_k_ptrs_, sizeof(void*) * batch_size, false);
    tmp_v_ptrs_ = (void**)allocator_->reMalloc(tmp_v_ptrs_, sizeof(void*) * batch_size, false);

    decoder_input_buf_  = (T*)allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units, false);
    decoder_output_buf_ = (T*)allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units, false);

    input_ids_buf_      = (int*)allocator_->reMalloc(input_ids_buf_, sizeof(int) * batchxbeam * session_len, true);
    input_length_buf_   = (int*)allocator_->reMalloc(input_length_buf_, sizeof(int) * batchxbeam);
    context_length_buf_ = (int*)allocator_->reMalloc(context_length_buf_, sizeof(int) * batchxbeam);

    sequence_lengths_ = (int*)allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false);

    cu_block_counts_ = (int*)allocator_->reMalloc(cu_block_counts_, sizeof(int) * (batch_size + 1));
    k_block_ptrs_    = (uintptr_t*)allocator_->reMalloc(k_block_ptrs_, sizeof(uintptr_t) * max_block_count);
    v_block_ptrs_    = (uintptr_t*)allocator_->reMalloc(v_block_ptrs_, sizeof(uintptr_t) * max_block_count);

    logits_buf_       = (float*)allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size, false);
    local_logits_buf_ = (float*)allocator_->reMalloc(local_logits_buf_, sizeof(float) * batchxbeam * vocab_size, false);

    token_ids_buf_ = (int*)allocator_->reMalloc(token_ids_buf_, sizeof(int) * batchxbeam * session_len * 2, true);

    finished_buf_  = (bool*)allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false);
    seq_limit_len_ = (uint32_t*)allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false);

    request_output_ids_ptrs_ = (int**)allocator_->reMalloc(request_output_ids_ptrs_, sizeof(int*) * batch_size, true);
    request_output_ids_lens_ = (int*)allocator_->reMalloc(request_output_ids_lens_, sizeof(int) * batch_size, true);
    request_seqlen_ptrs_     = (int**)allocator_->reMalloc(request_seqlen_ptrs_, sizeof(int*) * batch_size, true);

    rope_theta_ = (float*)allocator_->reMalloc(rope_theta_, sizeof(float) * batch_size, false);

    is_allocate_buffer_ = true;
}

template<typename T>
void LlamaBatch<T>::AllocatePersistantBuffer(size_t max_batch_size)
{
    d_stop_words_ = (int*)allocator_->reMalloc(d_stop_words_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true);
    d_bad_words_  = (int*)allocator_->reMalloc(d_bad_words_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true);
    h_stop_words_ =
        (int*)allocator_->reMalloc(h_stop_words_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true, true);
    h_bad_words_ =
        (int*)allocator_->reMalloc(h_bad_words_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true, true);

    h_runtime_top_k_ = (int*)allocator_->reMalloc(h_runtime_top_k_, sizeof(int) * max_batch_size, true, true);
    h_runtime_top_p_ = (float*)allocator_->reMalloc(h_runtime_top_p_, sizeof(float) * max_batch_size, true, true);
    h_temperature_   = (float*)allocator_->reMalloc(h_temperature_, sizeof(float) * max_batch_size, true, true);
    h_repetition_penalty_ =
        (float*)allocator_->reMalloc(h_repetition_penalty_, sizeof(float) * max_batch_size, true, true);

    h_random_seed_ = (unsigned long long*)allocator_->reMalloc(
        h_random_seed_, sizeof(unsigned long long) * max_batch_size, true, true);
    d_random_seed_ = (unsigned long long*)allocator_->reMalloc(
        d_random_seed_, sizeof(unsigned long long) * max_batch_size, true, false);

    h_curand_state_ =
        (curandState_t*)allocator_->reMalloc(h_curand_state_, sizeof(curandState_t) * max_batch_size, true, true);
    d_curand_state_ =
        (curandState_t*)allocator_->reMalloc(d_curand_state_, sizeof(curandState_t) * max_batch_size, true, false);

    d_end_ids_buf_ = (int*)allocator_->reMalloc(d_end_ids_buf_, sizeof(int) * max_batch_size, false);
    h_end_ids_buf_ = (int*)allocator_->reMalloc(h_end_ids_buf_, sizeof(int) * max_batch_size, false, true);

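    // (name, host buffer, device buffer) triplets used by `InitializeSampling`;
    // a null device pointer means the parameter is consumed directly from host
    // memory by the sampling layer.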
    sampling_params_ = {
        {"stop_words_list", (std::byte*)h_stop_words_, (std::byte*)d_stop_words_},
        {"bad_words_list", (std::byte*)h_bad_words_, (std::byte*)d_bad_words_},
        {"runtime_top_k", (std::byte*)h_runtime_top_k_, nullptr},
        {"runtime_top_p", (std::byte*)h_runtime_top_p_, nullptr},
        {"temperature", (std::byte*)h_temperature_, nullptr},
        {"repetition_penalty", (std::byte*)h_repetition_penalty_, nullptr},
    };

    for (auto& s : states_) {
        s.output_ids = (int*)allocator_->reMalloc(s.output_ids, sizeof(int) * max_batch_size * session_len_, true);
        s.curand_state =
            (curandState_t*)allocator_->reMalloc(s.curand_state, sizeof(curandState_t) * max_batch_size, true);
    }

    const size_t max_block_count = sequence_manager_->max_block_count();

    {
        NcclGuard barrier(model_->tensor_para_, stream_, true);
        h_input_ids_buf_ =
            (int*)allocator_->reMalloc(h_input_ids_buf_, sizeof(int) * max_batch_size * session_len_, false, true);
        h_input_length_buf_ =
            (int*)allocator_->reMalloc(h_input_length_buf_, sizeof(int) * max_batch_size, false, true);

        h_tmp_k_ptrs_ = (void**)allocator_->reMalloc(h_tmp_k_ptrs_, sizeof(void*) * max_batch_size, false, true);
        h_tmp_v_ptrs_ = (void**)allocator_->reMalloc(h_tmp_v_ptrs_, sizeof(void*) * max_batch_size, false, true);

        h_cu_block_counts_ =
            (int*)allocator_->reMalloc(h_cu_block_counts_, sizeof(int) * (max_batch_size + 1), false, true);
        h_k_block_ptrs_ =
            (uintptr_t*)allocator_->reMalloc(h_k_block_ptrs_, sizeof(uintptr_t) * max_block_count, false, true);
        h_v_block_ptrs_ =
            (uintptr_t*)allocator_->reMalloc(h_v_block_ptrs_, sizeof(uintptr_t) * max_block_count, false, true);

        for (auto& s : states_) {
            s.h_context_length =
                (int*)allocator_->reMalloc(s.h_context_length, sizeof(int) * max_batch_size, false, true);
            s.h_finished   = (bool*)allocator_->reMalloc(s.h_finished, sizeof(bool) * max_batch_size * 2, false, true);
            s.h_rope_theta = (float*)allocator_->reMalloc(s.h_rope_theta, sizeof(float) * max_batch_size, false, true);
        }

        h_seq_limit_len_ =
            (uint32_t*)allocator_->reMalloc(h_seq_limit_len_, sizeof(uint32_t) * max_batch_size, false, true);

        h_request_output_ids_ptrs_ =
            (int**)allocator_->reMalloc(h_request_output_ids_ptrs_, sizeof(int*) * max_batch_size, true, true);
        h_request_output_ids_lens_ =
            (int*)allocator_->reMalloc(h_request_output_ids_lens_, sizeof(int) * max_batch_size, true, true);
        h_request_seqlen_ptrs_ =
            (int**)allocator_->reMalloc(h_request_seqlen_ptrs_, sizeof(int*) * max_batch_size, true, true);

        h_output_ids_ =
            (int*)allocator_->reMalloc(h_output_ids_, sizeof(int) * max_batch_size * session_len_, false, true);
    }

    is_allocate_persistant_buffer_ = true;
}

template<typename T>
void LlamaBatch<T>::FreeBuffer()
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    if (is_allocate_buffer_) {
        allocator_->free((void**)&context_decoder_input_buf_);
        allocator_->free((void**)&context_decoder_output_buf_);
        allocator_->free((void**)&context_decoder_ids_buf_);

        allocator_->free((void**)&tmp_k_cache_buf_);
        allocator_->free((void**)&tmp_v_cache_buf_);
        allocator_->free((void**)&tmp_k_ptrs_);
        allocator_->free((void**)&tmp_v_ptrs_);

        allocator_->free((void**)&decoder_input_buf_);
        allocator_->free((void**)&decoder_output_buf_);

        allocator_->free((void**)&input_ids_buf_);
        allocator_->free((void**)&input_length_buf_);
        allocator_->free((void**)&context_length_buf_);

        allocator_->free((void**)&sequence_lengths_);

        allocator_->free((void**)&cu_block_counts_);
        allocator_->free((void**)&k_block_ptrs_);
        allocator_->free((void**)&v_block_ptrs_);

        allocator_->free((void**)&logits_buf_);
        allocator_->free((void**)&local_logits_buf_);

        if (local_context_logits_buf_) {
            allocator_->free((void**)&local_context_logits_buf_);
        }
        if (context_logits_buf_) {
            allocator_->free((void**)&context_logits_buf_);
        }

        allocator_->free((void**)&token_ids_buf_);

        allocator_->free((void**)&d_end_ids_buf_);
        allocator_->free((void**)&h_end_ids_buf_, true);

        allocator_->free((void**)&finished_buf_);
        allocator_->free((void**)&seq_limit_len_);

        allocator_->free((void**)&request_output_ids_ptrs_);
        allocator_->free((void**)&request_output_ids_lens_);
        allocator_->free((void**)&request_seqlen_ptrs_);

        allocator_->free((void**)&rope_theta_);

        is_allocate_buffer_ = false;
    }

    if (is_allocate_persistant_buffer_) {

        allocator_->free((void**)&d_stop_words_);
        allocator_->free((void**)&h_stop_words_, true);
        allocator_->free((void**)&d_bad_words_);
        allocator_->free((void**)&h_bad_words_, true);
        allocator_->free((void**)&d_random_seed_);
        allocator_->free((void**)&h_random_seed_, true);
        allocator_->free((void**)&d_curand_state_);
        allocator_->free((void**)&h_curand_state_, true);

        for (auto& s : states_) {
            allocator_->free((void**)&s.h_context_length, true);
            allocator_->free((void**)&s.h_finished, true);
            allocator_->free((void**)&s.h_rope_theta, true);
            allocator_->free((void**)&s.output_ids);
            allocator_->free((void**)&s.curand_state);
        }
        allocator_->free((void**)&h_tmp_k_ptrs_, true);
        allocator_->free((void**)&h_tmp_v_ptrs_, true);
        allocator_->free((void**)&h_cu_block_counts_, true);
        allocator_->free((void**)&h_k_block_ptrs_, true);
        allocator_->free((void**)&h_v_block_ptrs_, true);
        allocator_->free((void**)&h_input_ids_buf_, true);
        allocator_->free((void**)&h_input_length_buf_, true);
        allocator_->free((void**)&h_seq_limit_len_, true);

        allocator_->free((void**)&h_request_output_ids_ptrs_, true);
        allocator_->free((void**)&h_request_output_ids_lens_, true);
        allocator_->free((void**)&h_request_seqlen_ptrs_, true);

        allocator_->free((void**)&h_output_ids_, true);

        is_allocate_persistant_buffer_ = false;
    }
}

template<typename T>
LlamaBatch<T>::LlamaBatch(int                              max_batch_size,
                          int                              max_context_token_num,
                          int                              session_len,
                          std::unique_ptr<SequenceManager> sequence_manager,
                          LlamaV2<T>*                      model):
    max_batch_size_(max_batch_size),
    max_context_token_num_(max_context_token_num),
    session_len_(session_len),
    rank_(model->tensor_para_.rank_),
    debug_(model->debug_),
    step_length_(model->step_length_),
    sequence_manager_(std::move(sequence_manager)),
    model_(model),
    data_type_(getTensorType<T>())
{
    stream_         = model_->stream_;
    allocator_      = model_->allocator_;
    cublas_wrapper_ = model_->cublas_wrapper_;

    for (auto& s : states_) {
        s.requests.resize(max_batch_size);
        s.sequences.resize(max_batch_size);
        s.seq_len_limit.resize(max_batch_size);
        s.is_swap_in.resize(max_batch_size);
    }

    state_    = &states_[0];
    back_     = &states_[1];
    incoming_ = &states_[2];

    AllocateBuffer(max_batch_size, session_len_);
    AllocatePersistantBuffer(max_batch_size);
}

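// Gathers per-request sampling parameters into batch-contiguous buffers. The
// first request carrying a parameter serves as the exemplar for its shape and
// type; slots of requests that lack it stay zero-initialized. The assembled
// tensor map is handed to the dynamic decode layer.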
template<typename T>
void LlamaBatch<T>::InitializeSampling()
{
    NvtxScope _("InitSampling");
    const int batch_size = state_->active_size;
    TensorMap inputs;
    for (const auto& [name, h_ptr, d_ptr] : sampling_params_) {
        // find an exemplar that matches the param name
        const Tensor* ptr{};
        for (int i = 0; i < batch_size; ++i) {
            if (state_->requests[i]->inputs[rank_].isExist(name)) {
                ptr = &state_->requests[i]->inputs[rank_].at(name);
                break;
            }
        }
        // fill the parameter for the whole batch
        if (ptr) {
            const auto& ref   = *ptr;
            auto        shape = ref.shape;
            FT_CHECK(shape[0] == 1);
            shape[0]                = batch_size;
            const int size_in_bytes = ref.sizeBytes();
            memset(h_ptr, 0, size_in_bytes * batch_size);
            for (int i = 0; i < batch_size; ++i) {
                if (state_->requests[i]->inputs[rank_].isExist(name)) {
                    Tensor& src = state_->requests[i]->inputs[rank_].at(name);
                    FT_CHECK(ref.shape == src.shape);
                    std::copy_n(src.getPtr<std::byte>(), size_in_bytes, h_ptr + size_in_bytes * i);
                }
            }
            if (d_ptr) {
                Copy(h_ptr, batch_size * size_in_bytes, d_ptr);
            }
            inputs.insert({name, {d_ptr ? MEMORY_GPU : MEMORY_CPU, ref.type, shape, d_ptr ? d_ptr : h_ptr}});
            if (debug_ && rank_ == 0) {
                TM_LOG_INFO("[initializeSampling] %s", format({name, inputs.at(name)}).c_str());
            }
        }
    }

    // init for eos
    std::fill_n(h_end_ids_buf_, batch_size, model_->end_id_);
    Copy(h_end_ids_buf_, batch_size, d_end_ids_buf_);
    inputs.insert({"end_id", {MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, d_end_ids_buf_}});

    inputs_ = std::move(inputs);

    model_->dynamic_decode_layer_->setup(batch_size, 1, &inputs_);
}

template<typename T>
auto LlamaBatch<T>::InitializeGeneration() -> GenerationState
{
    NvtxScope _("InitGen");
    const int batch_size      = state_->active_size;
    const int max_context_len = *std::max_element(state_->h_context_length, state_->h_context_length + batch_size);

    Copy(state_->h_context_length, batch_size, context_length_buf_);  // also referenced in `SetOutputTensors`
    Copy(context_length_buf_, batch_size, sequence_lengths_);
    // `sequence_lengths_` will be increased by dynamic decode
    // note that "sequence length" has different semantics in the decoder and in the output
    // - in the decoder it means the length of the sequence whose KV cache has already been computed
    // - in the output it means the length of all tokens (the last generated token has no k/v cache computed yet)
    invokePlusScalar(sequence_lengths_, -1, batch_size, stream_);
    sync_check_cuda_error();

    Clear(token_ids_buf_, batch_size * session_len_);
    invokeTransposeAxis01(token_ids_buf_, state_->output_ids, batch_size, session_len_, 1, stream_);
    sync_check_cuda_error();

    // token_ids_buf_[s, b]
    // ABCDe            ABCDe     e
    // ABCDEFGHIJk      ABCDEFGHIJk
    // ABCDEFGHi    ->  ABCDEFGHi i
    // ABCDEFGh         ABCDEFGh  h
    // ABCd             ABCd      d
    invokePadLastTokenIds(token_ids_buf_, context_length_buf_, max_context_len, batch_size, stream_);
    sync_check_cuda_error();

    // used for dispatching split-k decoding kernels
    const int sum_seq_len =
        std::accumulate(state_->h_context_length, state_->h_context_length + batch_size, -batch_size);
    const int max_seq_len = *std::max_element(state_->h_context_length, state_->h_context_length + batch_size) - 1;

    // seq_limit_len_, will be compared to `step` instead of `sequence_length`, so padding len should be accounted
    // for
    for (int i = 0; i < batch_size; ++i) {
        h_seq_limit_len_[i] = state_->seq_len_limit[i] + (max_context_len - state_->h_context_length[i]);
    }
    Copy(h_seq_limit_len_, batch_size, seq_limit_len_);
    Copy(state_->h_finished, batch_size, finished_buf_);

    for (int i = 0; i < batch_size; ++i) {
        Tensor& output_ids         = state_->requests[i]->outputs[rank_].at("output_ids");
        int*    req_output_ids_ptr = output_ids.getPtr<int>();
        int*    req_seqlen_ptr     = state_->requests[i]->outputs[rank_].getPtr<int>("sequence_length");

        h_request_output_ids_ptrs_[i] = req_output_ids_ptr;
        h_request_output_ids_lens_[i] = output_ids.shape.at(2);
        h_request_seqlen_ptrs_[i]     = req_seqlen_ptr;

        FT_CHECK(h_request_output_ids_ptrs_[i]);
        FT_CHECK(h_request_output_ids_lens_[i]);
        FT_CHECK(h_request_seqlen_ptrs_[i]);
    }
    Copy(h_request_output_ids_ptrs_, batch_size, request_output_ids_ptrs_);
    Copy(h_request_output_ids_lens_, batch_size, request_output_ids_lens_);
    Copy(h_request_seqlen_ptrs_, batch_size, request_seqlen_ptrs_);

    Copy(state_->h_rope_theta, batch_size, rope_theta_);

    // ! range of step_ [1, 2 * session_len]
    // consider a sequence with context_len == session_len and another sequence with context_len == 1 and
    // request_output_len == session_len - 1 => step_ will loop in [session_len, 2 * session_len)
    const int start_step = max_context_len;

    if (rank_ == 0) {
        TM_LOG_INFO("[initGen] batch_size = %d", (int)batch_size);
        TM_LOG_INFO("[initGen] max_context_len = %d", (int)max_context_len);

        if (debug_) {
            TM_LOG_INFO("[initGen] slot  sequence_id  context_len  seq_limit_len  finished");
            for (int i = 0; i < batch_size; ++i) {
                TM_LOG_INFO("[initGen] %4d  %11ld  %11d  %13d  %8d",
                            i,
                            (long)state_->sequences[i]->id,
                            state_->h_context_length[i],
                            (int)h_seq_limit_len_[i],
                            (int)state_->h_finished[i]);
            }
        }
    }

    return GenerationState{max_context_len, start_step, sum_seq_len, max_seq_len};
}

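// Runs a single decoding step for the whole active batch: embedding lookup of
// the previous tokens, batched decoder attention over the paged KV cache,
// logits projection, then sampling and stop-criteria evaluation in
// `dynamicDecode`. Returns false when dynamic decode signals the batch should
// stop.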
template<typename T>
bool LlamaBatch<T>::Generate(GenerationState& g)
{
    NvtxScope scope("Generate");
    const int batch_size = state_->active_size;

    constexpr int kLogInterval = 10;
947
    if (rank_ == 0 && (g.step - 1) % kLogInterval == 0) {
        TM_LOG_INFO("------------------------- step = %d -------------------------", g.step - 1);
    }

    const bool is_first_step = (g.step == g.max_init_ctx_len);

    std::vector<int> prev;
    if (debug_ && rank_ == 0 && is_first_step) {
        prev.resize(batch_size);
        Copy(token_ids_buf_ + (g.step - 1) * batch_size, batch_size, prev.data());
    }

    // embeddingLookup(step_ - 1);
    model_->embeddingLookup(decoder_input_buf_,  //
                            token_ids_buf_,
                            batch_size,
                            g.step - 1);

    model_->decoderForward(decoder_output_buf_,
                           k_block_ptrs_,
                           v_block_ptrs_,
                           decoder_input_buf_,
                           sequence_lengths_,
                           finished_buf_,
                           cu_block_counts_,
                           rope_theta_,
                           g.step,
                           0,
                           g.sum_seq_len,
                           g.max_seq_len,
                           batch_size);

    model_->postDecodeEmbedding(logits_buf_,  //
                                local_logits_buf_,
                                decoder_output_buf_,
                                batch_size);

    /// sync for better NVTX visualization, THIS IS NOT NEEDED
    // check_cuda_error(cudaStreamSynchronize(stream_));

    // stop-words & bad-words require the matched tokens to be contiguous, so item size > 1 is
    // not supported yet.
    bool should_stop{};
    model_->dynamicDecode(token_ids_buf_,
                          finished_buf_,
                          sequence_lengths_,
                          &should_stop,
                          state_->curand_state,
                          &inputs_,
                          &outputs_,
                          logits_buf_,
                          seq_limit_len_,
                          context_length_buf_,
                          d_end_ids_buf_,
                          g.step,
                          0,
                          g.max_init_ctx_len,
                          session_len_ * 2,
                          batch_size);

    if (debug_ && rank_ == 0) {
        std::vector<int> curr(batch_size);

        Copy(token_ids_buf_ + g.step * batch_size, batch_size, curr.data());
        cudaStreamSynchronize(stream_);

        if (is_first_step) {
            std::stringstream sprev;
            for (int k = 0; k < prev.size(); ++k) {
                sprev << std::setw(6) << prev[k];
            }
            TM_LOG_INFO("[ lookup ] step = %d, [%s]", g.step - 1, sprev.str().c_str());
        }

        std::stringstream scurr;
        for (int k = 0; k < curr.size(); ++k) {
            scurr << std::setw(6) << curr[k];
        }
        TM_LOG_INFO("[generate] step = %d, [%s]", g.step - 1, scurr.str().c_str());
    }

    ////////////////////////////////////////////////
    /// ! increase the counters: one more step decoded, one more token per active sequence
    g.step += 1;
    g.max_seq_len += 1;
    g.sum_seq_len += batch_size;

    return !should_stop;
}

template<typename T>
void LlamaBatch<T>::ContextDecode()
{
    NvtxScope  _("prefill");
    const auto batch_size = state_->active_size;

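    // Find swapped-in sequences whose KV-cache lags behind their context; stage the
    // missing tokens (except the last) in `input_ids_buf_` for prefill. `base` is the
    // first slot that needs it.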
    int base = -1;
    for (int i = 0; i < batch_size; ++i) {
        if (state_->is_swap_in[i]) {
            const auto& seq = *state_->sequences[i];
            dbg(std::tuple(i, state_->h_context_length[i], seq.cache_len));
            if (const int missing = state_->h_context_length[i] - seq.cache_len; missing > 1) {
                base = base < 0 ? i : base;
                dbg(seq.tokens, seq.cache_len);
                Copy(state_->output_ids + i * session_len_ + seq.cache_len, missing, input_ids_buf_ + i * session_len_);
                // reduce the input/context length by 1 to skip the last input token (it will be processed by the decoder later)
                h_input_length_buf_[i] = missing - 1;
            }
        }
    }
    if (base < 0) {
        // TM_LOG_INFO("[decodeContext] Context decoding is not needed.");
        return;
    }

    const int context_decode_count = batch_size - base;

    Copy(state_->h_context_length, batch_size, context_length_buf_);
    Copy(state_->h_rope_theta, batch_size, rope_theta_);
    Copy(h_input_length_buf_, batch_size, input_length_buf_);

    // check_cuda_error(cudaStreamSynchronize(stream_));
    // const auto tick = std::chrono::high_resolution_clock::now();

    if (rank_ == 0) {
        TM_LOG_INFO("[decodeContext] base = %d, count = %d", base, context_decode_count);
    }
    // reduce the input/context length by 1 to skip the last input token (it will be processed by the decoder later)
    invokePlusScalar(context_length_buf_ + base, -1, context_decode_count, stream_);

    // find sub-batch offsets
    std::vector<int> offsets{base};
    std::vector<int> max_context_cnts;
    int              accum_size        = 0;
    int              accum_input_count = 0;
    int              max_context_count = 0;
    for (int i = base; i < batch_size; ++i) {
        int size          = accum_size + 1;
        int input_count   = accum_input_count + h_input_length_buf_[i];
        int context_count = std::max(max_context_count, state_->h_context_length[i] - 1);
        // we have `cu_seqlens` on q so no padding for input is needed
        // kernels are expecting uniform k/v cache length -> `max_context_count * size <= max_context_token_num_`
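        // e.g. (hypothetical numbers) with `max_context_token_num_` = 8192, a sequence with a
        // 4097-token context must form a sub-batch alone, while four with <= 2048 tokens can share one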
        if (input_count <= max_context_token_num_ && context_count * size <= max_context_token_num_) {
            accum_size        = size;
            accum_input_count = input_count;
            max_context_count = context_count;
        }
        else {
            offsets.push_back(i);
            max_context_cnts.push_back(max_context_count);
            accum_size        = 1;
            accum_input_count = h_input_length_buf_[i];
            max_context_count = state_->h_context_length[i] - 1;
        }
    }
    offsets.push_back(batch_size);
    max_context_cnts.push_back(max_context_count);

    dbg(offsets, max_context_cnts);

    // context decode on sub-batches
    for (int k = 0; k < offsets.size() - 1; ++k) {
        int              first          = offsets[k];
        int              last           = offsets[k + 1];
        int              sub_batch_size = last - first;
        T*               k_ptr          = tmp_k_cache_buf_;
        T*               v_ptr          = tmp_v_cache_buf_;
        std::vector<int> decode_indices{};
        std::vector<int> decode_lengths{};
        int              max_input_len{};
        auto             input_ids = context_decoder_ids_buf_;
        TM_LOG_INFO("first = %d, last = %d", first, last);
        for (int i = first; i < last; ++i) {
            // TM_LOG_INFO("session_len = %d, input_length = %d", session_len_, h_input_length_buf_[i]);
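            // gather this sequence's input ids and carve a contiguous slab out of the
            // temporary K/V buffers (local_kv_head_num_ * max_context_cnts[k] * size_per_head_ each)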
            input_ids = Copy(input_ids_buf_ + i * session_len_, h_input_length_buf_[i], input_ids);
            dbg(i, h_input_length_buf_[i]);
            h_tmp_k_ptrs_[i] = k_ptr;
            h_tmp_v_ptrs_[i] = v_ptr;
            k_ptr += model_->local_kv_head_num_ * max_context_cnts[k] * model_->size_per_head_;
            v_ptr += model_->local_kv_head_num_ * max_context_cnts[k] * model_->size_per_head_;
            decode_indices.push_back(i);
            decode_lengths.push_back(h_input_length_buf_[i]);
            max_input_len = std::max(max_input_len, h_input_length_buf_[i]);
        }
        int token_count = input_ids - context_decoder_ids_buf_;
        dbg(token_count, max_input_len, max_context_cnts[k]);

        Copy(h_tmp_k_ptrs_ + first, sub_batch_size, tmp_k_ptrs_ + first);
        Copy(h_tmp_v_ptrs_ + first, sub_batch_size, tmp_v_ptrs_ + first);

        if (rank_ == 0) {
            TM_LOG_INFO(
                "[decodeContext] offset = %d, batch_size = %d, token_num = %d, max_input_len = %d, max_context_len = %d",
                base,
                sub_batch_size,
                token_count,
                max_input_len,
                max_context_cnts[k]);
        }

        dbg(first, last);
        dbg(k_block_ptrs_, v_block_ptrs_);

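        // Prefill: run the model over all staged input tokens of the sub-batch at once,
        // writing K/V into both the temporary buffers and the paged cache blocks.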
        model_->contextDecode(nullptr,
                              k_block_ptrs_,
                              v_block_ptrs_,
                              tmp_k_ptrs_ + first,
                              tmp_v_ptrs_ + first,
                              context_decoder_input_buf_,
                              context_decoder_output_buf_,
                              context_decoder_ids_buf_,
                              input_length_buf_ + first,
                              context_length_buf_ + first,
                              cu_block_counts_ + first,
                              rope_theta_ + first,
                              token_count,
                              max_input_len,
                              max_context_cnts[k],
                              max_context_cnts[k],
                              sub_batch_size);

        // compute logits of inputs if requested
        OutputContextLogits(context_decoder_output_buf_, decode_indices, decode_lengths);
    }

    invokePlusScalar(context_length_buf_ + base, 1, context_decode_count, stream_);

    std::fill(h_input_length_buf_ + base, h_input_length_buf_ + batch_size, 0);

    // `SequenceManager` needs real-time value of cache length
    for (int i = base; i < batch_size; ++i) {
        if (state_->requests[i]) {
            FT_CHECK(state_->sequences[i]);
            state_->sequences[i]->cache_len = state_->h_context_length[i] - 1;  // -1 since we skip last token
        }
    }

    // check_cuda_error(cudaStreamSynchronize(stream_));
    // const auto tock = std::chrono::high_resolution_clock::now();
    // if (rank_ == 0) {
    //     TM_LOG_INFO("[decodeContext] %.2f ms", std::chrono::duration<float, std::milli>(tock - tick).count());
    // }
}

template<typename T>
void LlamaBatch<T>::OutputContextLogits(T*                      context_decoder_output,
                                        const std::vector<int>& indices,
                                        const std::vector<int>& lengths)
{
    std::vector<float*> output_logits;
    int                 num_token = 0;
    {
        bool is_return_logits = false;
        for (int k = 0; k < indices.size(); ++k) {
            auto& request = state_->requests[indices[k]];
            output_logits.push_back(request->outputs[rank_].getPtr<float>("logits", nullptr));
            num_token += lengths[k];
            if (output_logits.back()) {
                is_return_logits = true;
            }
        }
        if (!is_return_logits) {
            return;
        }
    }

    if (context_logits_buf_ == nullptr) {
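        // Allocated lazily on the first request that asks for logits; the guard presumably
        // keeps TP ranks in sync so the allocation does not race pending NCCL work.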
        NcclGuard guard(model_->tensor_para_, stream_, true);
        context_logits_buf_ =
            (float*)allocator_->malloc(sizeof(float) * model_->vocab_size_padded_ * max_context_token_num_);
        const auto tp = model_->tensor_para_.world_size_;
        if (tp > 1) {
            FT_CHECK(model_->vocab_size_padded_ % tp == 0);
            const auto local_vocab_size = model_->vocab_size_padded_ / tp;
            local_context_logits_buf_ =
                (float*)allocator_->malloc(sizeof(float) * local_vocab_size * max_context_token_num_);
        }
    }

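    // One projection over all prompt tokens of the sub-batch yields their logits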
    model_->postDecodeEmbedding(context_logits_buf_, local_context_logits_buf_, context_decoder_output, num_token);

    auto logits = context_logits_buf_;

    for (int k = 0; k < indices.size(); ++k) {
        if (output_logits[k]) {
            Copy(logits, model_->vocab_size_ * lengths[k], output_logits[k]);
        }
        logits += model_->vocab_size_padded_ * lengths[k];
    }
}

template<typename T>
auto LlamaBatch<T>::Finish(GenerationState& g, int& finished_count) -> std::vector<Signal>
{
    NvtxScope scope("Finish");
    const int batch_size = state_->active_size;

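    // `token_ids_buf_` is written step-major ([s, b]) during generation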
    // [s,b] -> [b,s] and skip padding in [context_len, max_context_len)
    invokeGatherOutput(state_->output_ids,
                       token_ids_buf_,
                       context_length_buf_,
                       g.max_init_ctx_len,
                       g.step,
                       session_len_,
                       batch_size,
                       stream_);
    sync_check_cuda_error();

    Copy(state_->output_ids, batch_size * session_len_, h_output_ids_);
    Copy(finished_buf_, batch_size, state_->h_finished);
    Copy(sequence_lengths_, batch_size, state_->h_context_length);

    check_cuda_error(cudaStreamSynchronize(stream_));

    // `SequenceManager` needs real-time value of cache length
    // ! Must be done before incrementing `h_context_length` because the generated token is NOT kv-cached yet
    for (int i = 0; i < batch_size; ++i) {
        if (state_->requests[i]) {
            FT_CHECK(state_->sequences[i]);
            state_->sequences[i]->cache_len = state_->h_context_length[i];
        }
    }

    // invariant: context_length = sequence_length + 1, so that h_context_length includes all tokens
    // (including the one just generated)
    for (int i = 0; i < batch_size; ++i) {
        ++state_->h_context_length[i];
    }

    {  // set output tokens ids and sequence length
        int* output_ptr = h_output_ids_;
        for (int i = 0; i < batch_size; ++i) {
            if (state_->requests[i] && (state_->requests[i]->stream_cb || state_->h_finished[i])) {
                const int count = state_->h_context_length[i];
                // TODO: sync history output tokens when receiving the request and copy only the last token here
                std::copy(output_ptr, output_ptr + count, h_request_output_ids_ptrs_[i]);
                *h_request_seqlen_ptrs_[i] = count;
            }
            output_ptr += session_len_;
        }
    }

    if (debug_ && rank_ == 0) {
        std::stringstream ss;
        for (int i = 0; i < batch_size; ++i) {
            ss << (i ? ", " : "") << "(" << state_->h_context_length[i] << "," << state_->h_finished[i] << ")";
        }
        TM_LOG_INFO("[finish] [%s]", ss.str().c_str());
    }

    std::vector<Signal> signals;
    {
        NvtxScope _("stream_and_completion_signal");
        for (int i = 0; i < batch_size; ++i) {
            if (state_->requests[i]) {
                if (state_->h_finished[i]) {
                    // Interrupt finished sequences and move the request handle into the signal closure
                    signals.push_back(Interrupt(i));
                    ++finished_count;
                }
                else if (state_->requests[i]->stream_cb) {
                    // Create signals by copying the request handles for non-finished streaming requests
                    signals.push_back([this, r = state_->requests[i]] {
                        if (rank_ == 0) {
                            r->stream_cb(&r->outputs[rank_].get());
                        }
                    });
                }
            }
        }
        if (finished_count) {
            // synchronize for interrupted sequences
            check_cuda_error(cudaStreamSynchronize(stream_));
        }
    }
    return signals;
}

template<typename T>
auto LlamaBatch<T>::Interrupt(int index, bool force_stop, bool force_end) -> Signal
{
    if (rank_ == 0) {
        TM_LOG_INFO("[Interrupt] slot = %d, id = %lu", index, (long)state_->requests[index]->id);
    }

    if (debug_ && rank_ == 0) {
        std::vector<int> tokens(state_->h_context_length[index]);
        Copy(state_->output_ids + index * session_len_, tokens.size(), tokens.data());
        cudaStreamSynchronize(stream_);
        std::stringstream ss;
        for (const auto& t : tokens) {
            ss << " " << t;
        }
        TM_LOG_INFO("[Interrupt] slot %d, tokens [%s]", index, ss.str().c_str());
    }

    if (state_->requests[index]->end_flag || force_end) {
        // Sequence is ending this round or a stop request is issued to end it
        FT_CHECK(sequence_manager_->Erase(state_->requests[index]->id));
    }
    else {
        const int output_len = state_->h_context_length[index];
        auto&     seq        = *state_->sequences[index];

        // Update token IDs
        seq.tokens.resize(output_len);
        const auto output_ids_data = state_->requests[index]->outputs[rank_].at("output_ids").getPtr<int>();
        std::copy_n(output_ids_data, output_len, seq.tokens.data());

        // Save random state in host memory
        seq.random_state.resize(sizeof(curandState_t));
        // This async copy must be synchronized by the caller
        Copy(state_->curand_state + index, 1, (curandState_t*)seq.random_state.data());

        // Set unlock flag for corresponding blocks, will be unlocked in the next `Materialize()`
        sequence_manager_->UpdateAndSetUnlock(seq);
    }

    state_->sequences[index] = nullptr;

    // move the request handle into the signal
    return [this, r = std::move(state_->requests[index])] {
        if (rank_ == 0) {
            r->signal.set_value(0);
        }
    };
}

template<typename T>
void LlamaBatch<T>::InternalThreadEntry(int device_id)
{
    // TM_LOG_INFO("[InternalThreadEntry] %d", (int)rank_);
    check_cuda_error(cudaSetDevice(device_id));

    auto& shared_state = model_->shared_state_;

    auto& request_queue  = shared_state->request_queue;
    auto& infer_requests = shared_state->infer_requests;
    auto& stop_requests  = shared_state->stop_requests;

    // number of sequences that were removed but are still counted in the state's size
    int finished_count = 0;

    GenerationState g{};

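    // Poll the request queue every `request_interval` iterations; rank-0 blocks in
    // `dequeue` only when the batch is empty.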
    constexpr int request_interval = 1;
    long          request_counter  = 0;

    while (1) {
        if (rank_ == 0) {
            const int  free_slot_count = max_batch_size_ - state_->size + finished_count;
            const bool is_empty        = (free_slot_count == max_batch_size_);
            stop_requests.clear();
            infer_requests.clear();
            if (is_empty || request_counter % request_interval == 0) {
                // Block if batch is empty
                request_queue.dequeue(stop_requests, infer_requests, free_slot_count, is_empty, shared_state->abort);
                if (!shared_state->abort) {
                    RejectInvalidRequests(stop_requests, infer_requests);
                }
            }
        }

        NvtxScope scope("mainloop");

        // wait while rank-0 is dequeueing
        shared_state->barrier->wait();

        if (shared_state->abort) {
            TM_LOG_INFO("[InternalThreadEntry] stop requested.");
            return;
        }

        auto signals = ProcessStopRequests(stop_requests);

        // Shared `priority` field will be assigned by rank-0
        ProcessInferRequests(infer_requests);

        // Wait while shared `requests` is being used
        shared_state->barrier->wait();

        SendSignals(std::move(signals));

        auto modified = Initialize();
        // finished sequences are handled by `Initialize()`
        finished_count = 0;

        ContextDecode();

        if (state_->active_size) {
            if (modified) {
                g = InitializeGeneration();
                InitializeSampling();
            }
            for (int i = 0; i < step_length_; ++i) {
                if (!Generate(g)) {
                    break;
                }
            }
            if (auto signals = Finish(g, finished_count); !signals.empty()) {
                if (finished_count) {
                    // Finished requests and corresponding output tensors will be released when notified
                    // wait for all ranks to ensure no rank (except for output thread) will access related
                    // resources
                    shared_state->barrier->wait();
                }
                SendSignals(std::move(signals));
            }
        }

        ++request_counter;
    }

    FT_CHECK(0);
}

template<typename T>
void LlamaBatch<T>::SendSignals(std::vector<Signal> signals)
{
    if (rank_ != 0 || signals.empty()) {
        return;
    }
    {
        std::lock_guard lock{output_mutex_};
        output_signals_.insert(output_signals_.end(),  //
                               std::move_iterator{signals.begin()},
                               std::move_iterator{signals.end()});
    }
    output_cv_.notify_one();
}

template<typename T>
void LlamaBatch<T>::Start()
{
    TM_LOG_INFO("LlamaBatch<T>::Start()");
    int device_id = -1;
    check_cuda_error(cudaGetDevice(&device_id));
    internal_thread_ = std::thread(&LlamaBatch::InternalThreadEntry, this, device_id);
    if (rank_ == 0) {
        output_thread_ = std::thread(&LlamaBatch::OutputThreadEntry, this);
    }
}

template<typename T>
void LlamaBatch<T>::OutputThreadEntry()
{
    while (true) {
        std::vector<Signal> signals;
        {
            // Wait for signals to come
            std::unique_lock lock(output_mutex_);
            output_cv_.wait(lock, [&] { return !output_signals_.empty() || output_stop_token_; });
            if (output_stop_token_) {
                TM_LOG_INFO("[OutputThreadEntry] stop requested.");
                return;
            }
            signals = std::move(output_signals_);
        }
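        // `ffi_lock_`, when set, presumably guards re-entry into the FFI layer
        // (e.g. taking the Python GIL) around user callbacks from this native thread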
        if (rank_ == 0 && model_->ffi_lock_) {
            model_->ffi_lock_(1);
        }
        // invoke stream cbs & signals
        for (const auto& s : signals) {
            s();
        }
        if (rank_ == 0 && model_->ffi_lock_) {
            model_->ffi_lock_(0);
        }
    }
}

template class LlamaBatch<half>;
template class LlamaBatch<float>;

}  // namespace turbomind