LlamaBatch.cc 59 KB
Newer Older
Li Zhang's avatar
Li Zhang committed
1
2
// Copyright (c) OpenMMLab. All rights reserved.

lvhan028's avatar
lvhan028 committed
3
4
#include "src/turbomind/models/llama/LlamaBatch.h"
#include "src/turbomind/kernels/decoding_kernels.h"
Li Zhang's avatar
Li Zhang committed
5
#include "src/turbomind/kernels/sampling_topk_kernels.h"
Chen Xin's avatar
Chen Xin committed
6
#include "src/turbomind/macro.h"
lvhan028's avatar
lvhan028 committed
7
8
9
#include "src/turbomind/models/llama/LlamaNcclGuard.h"
#include "src/turbomind/models/llama/LlamaV2.h"
#include "src/turbomind/models/llama/Request.h"
Li Zhang's avatar
Li Zhang committed
10
11
#include "src/turbomind/models/llama/SequenceManager.h"
#include "src/turbomind/models/llama/llama_kernels.h"
lvhan028's avatar
lvhan028 committed
12
13
#include "src/turbomind/models/llama/llama_utils.h"
#include "src/turbomind/utils/Tensor.h"
Li Zhang's avatar
Li Zhang committed
14
15
16
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/debug_utils.h"
#include "src/turbomind/utils/gemm_test/gemm_func.h"
lvhan028's avatar
lvhan028 committed
17
#include "src/turbomind/utils/logger.h"
Li Zhang's avatar
Li Zhang committed
18
19
#include <algorithm>
#include <cmath>
Li Zhang's avatar
Li Zhang committed
20
#include <cstddef>
Li Zhang's avatar
Li Zhang committed
21
22
#include <cstdint>
#include <iomanip>
Li Zhang's avatar
Li Zhang committed
23
#include <iterator>
Li Zhang's avatar
Li Zhang committed
24
25
#include <mutex>
#include <numeric>
Li Zhang's avatar
Li Zhang committed
26
27
#include <sstream>
#include <unordered_map>
Li Zhang's avatar
Li Zhang committed
28
#include <utility>
Li Zhang's avatar
Li Zhang committed
29

lvhan028's avatar
lvhan028 committed
30
namespace turbomind {
Li Zhang's avatar
Li Zhang committed
31

Li Zhang's avatar
Li Zhang committed
32
33
34
35
36
37
38
void ClearState(BatchState& s)
{
    // Release every slot that is currently in use so the referenced requests /
    // sequences are dropped, then reset both counters to mark the state empty.
    for (int i = 0; i < s.size; ++i) {
        s.requests[i]  = nullptr;
        s.sequences[i] = nullptr;
    }
    s.active_size = 0;
    s.size        = 0;
}

Li Zhang's avatar
Li Zhang committed
39
// Validates incoming stop/infer requests and rejects the invalid ones in place.
// Rejected entries are signalled with an error code and reset to nullptr, then
// compacted out of the vectors, so only servable requests remain on return.
template<typename T>
void LlamaBatch<T>::RejectInvalidRequests(Requests& stop_reqs, Requests& infer_reqs)
{
    // request id -> number of occurrences across both vectors (duplicates conflict)
    std::unordered_map<uint64_t, int> occurrence;

    auto count_occurrence = [&occurrence](const Requests& rs) {
        for (const auto& r : rs) {
            ++occurrence[r->id];
        }
    };

    // Signal the error code to the requester and drop our reference
    auto reject = [](const char* type, std::shared_ptr<Request>& req, int ec) {
        TM_LOG_WARNING(
            "[RejectInvalidRequests] Skipping invalid %s request for id %ld, code = %d", type, (long)req->id, ec);
        req->signal.set_value(ec);
        req.reset();
    };

    // Checks shared by both request kinds: duplicate ids, contradictory flags,
    // over-long inputs, and (for continuations) unknown or over-long sequences.
    auto handle_conflict_or_invalid = [this, &occurrence, &reject](Requests& rs, const char* type) {
        for (auto& r : rs) {
            if (r) {
                int ec = 0;

                const int  input_length = r->inputs[rank_].getVal<int>("input_lengths", 0);
                // effective history offset: `step` clamped into [0, token_count]
                const auto get_offset   = [&](int token_count) {
                    return std::max(0, std::min(token_count, r->inputs[rank_].getVal<int>("step", token_count)));
                };

                if (occurrence[r->id] != 1) {
                    // same id appears more than once in this round
                    ec = Request::kConflict;
                }
                else if (r->start_flag && r->stop_flag) {
                    // cannot both start and stop a session in one request
                    ec = Request::kInvalid;
                }
                else if (input_length > session_len_) {
                    ec = Request::kTooLong;
                }
                else if (!r->start_flag) {
                    // continuation of an existing session: it must exist and
                    // history + new input must still fit in `session_len_`
                    if (auto seq = sequence_manager_->Get(r->id); seq == nullptr) {
                        ec = Request::kInvalid;
                    }
                    else if (get_offset(seq->tokens.size()) + input_length > session_len_) {
                        ec = Request::kTooLong;
                    }
                }

                if (ec) {
                    reject(type, r, ec);
                }
            }
        }
    };

    // Compact the vector, keeping only non-null (accepted) requests
    auto drop_invalid = [](Requests& rs) {
        int count = 0;
        for (int i = 0; i < rs.size(); ++i) {
            if (rs[i]) {
                rs[count++] = std::move(rs[i]);
            }
        }
        rs.resize(count);
    };

    count_occurrence(stop_reqs);
    count_occurrence(infer_reqs);

    if (!stop_reqs.empty()) {
        handle_conflict_or_invalid(stop_reqs, "stop");

        // invalidate stop-only requests for inactive sequences
        for (auto& r : stop_reqs) {
            if (r && r->end_flag == false) {
                int ec = Request::kInactive;
                // a stop (without end) is only meaningful for an active slot
                for (int i = 0; i < state_->size; ++i) {
                    if (state_->requests[i] && state_->requests[i]->id == r->id) {
                        ec = 0;
                        break;
                    }
                }
                if (ec) {
                    reject("stop", r, ec);
                }
            }
        }

        drop_invalid(stop_reqs);
    }

    if (!infer_reqs.empty()) {
        handle_conflict_or_invalid(infer_reqs, "infer");

        // invalidate requests for busy sequences
        for (auto& r : infer_reqs) {
            if (r) {
                // an infer request for an id already in the active state is busy
                for (int i = 0; i < state_->size; ++i) {
                    if (state_->requests[i] && state_->requests[i]->id == r->id) {
                        reject("infer", r, Request::kBusy);
                        break;
                    }
                }
            }
        }

        drop_invalid(infer_reqs);
    }
}

// Handles stop requests against the current batch state. For each request it
// interrupts the matching active slot (optionally erasing the sequence when
// `end_flag` is set), or erases an inactive sequence directly. Returns deferred
// signals; the completion code is only set by rank 0 when a signal runs.
template<typename T>
auto LlamaBatch<T>::ProcessStopRequests(const Requests& requests) -> std::vector<Signal>
{
    NvtxScope           scope("stop_request");
    std::vector<Signal> signals;
    int                 count = 0;  // number of active slots interrupted
    for (const auto& r : requests) {
        int ec = Request::kFail;
        // find matching active sequence
        for (int i = 0; i < state_->size; ++i) {
            // stop & optionally erase active sequence
            if (state_->requests[i] && state_->requests[i]->id == r->id) {
                ec = 0;
                signals.push_back(Interrupt(i, true, r->end_flag));
                ++count;
                break;
            }
        }
        // mismatch, try erase inactive sequence, in this case there is no active request to interrupt
        if (ec && r->end_flag) {
            if (sequence_manager_->Erase(r->id)) {
                ec = 0;
            }
        }
        // capture by value: `r` and `ec` must survive until the signal is invoked
        signals.push_back([=] {
            if (rank_ == 0) {
                r->signal.set_value(ec);
            }
        });
    }
    if (count) {
        // Interrupt() may enqueue async device work; wait for it before the
        // caller fires the completion signals
        check_cuda_error(cudaStreamSynchronize(stream_));
    }
    return signals;
}
akhoroshev's avatar
akhoroshev committed
181

Li Zhang's avatar
Li Zhang committed
182
183
184
// Admits validated infer requests into the `incoming_` batch state: binds each
// request to a (new or existing) sequence, materializes history + input tokens
// into the per-slot output buffer, computes length limits and rope theta, and
// initializes / restores per-slot curand sampling states on the device.
// Precondition: `incoming_` is empty (checked below).
template<typename T>
void LlamaBatch<T>::ProcessInferRequests(const Requests& requests)
{
    NvtxScope scope("infer_request");
    auto&     state = *incoming_;

    FT_CHECK(state.size == 0);
    FT_CHECK(state.active_size == 0);

    // slots that continue an existing sequence (their curand state is restored)
    std::vector<int> existing_idx;

    int idx = 0;
    for (const auto& r : requests) {
        FT_CHECK(!state.requests[idx]);

        if (rank_ == 0) {
            TM_LOG_WARNING("[ProcessInferRequests] Request for %ld received.", (long)r->id);
        }

        state.requests[idx] = r;

        // get sequence for the request
        state.sequences[idx] = r->start_flag ? sequence_manager_->Create(r->id) : sequence_manager_->Get(r->id);
        FT_CHECK(state.sequences[idx]);

        auto& seq = *state.sequences[idx];

        // optional `step` rewinds the sequence history (and cached KV length)
        if (int step = r->inputs[rank_].getVal<int>("step", -1); step >= 0) {
            /// TODO: revise step setting
            if (step <= seq.tokens.size()) {
                seq.tokens.resize(step);
                seq.cache_len = std::min(seq.cache_len, step);
            }
            else if (rank_ == 0) {
                TM_LOG_WARNING(
                    "[ProcessInferRequests] Skipping invalid step (%d) setting for ID %ld", step, (long)seq.id);
            }
        }

        const int  input_length = r->inputs[rank_].getVal<int>("input_lengths");
        const int* input_ids    = r->inputs[rank_].getPtr<int>("input_ids");

        // `output_ids` contains all token ids of the sequences
        const auto output_ids_base = state.output_ids + session_len_ * idx;
        auto       output_ids      = output_ids_base;

        // copy history tokens
        if (!seq.tokens.empty()) {
            output_ids = Copy(seq.tokens.data(), seq.tokens.size(), output_ids);
        }

        // copy input tokens
        if (input_length) {
            output_ids = Copy(input_ids, input_length, output_ids);
        }

        // total context length (history + input)
        state.h_context_length[idx] = output_ids - output_ids_base;
        state.h_finished[idx]       = false;

        const int request_output_len = state.requests[idx]->inputs[rank_].getVal<int>("request_output_len");
        state.seq_len_limit[idx]     = state.h_context_length[idx] + request_output_len;
        // `length_criterion` sets finish flag when step >= seq_limit_len, however when step == seq_limit_len
        // the actual sequence length is seq_limit_len + 1, hence seq_limit_len must truncated to session_len - 1
        if (state.seq_len_limit[idx] >= session_len_) {
            state.seq_len_limit[idx] = session_len_ - 1;
            if (rank_ == 0) {
                const int trunc_output_len = state.seq_len_limit[idx] - state.h_context_length[idx];
                TM_LOG_WARNING(
                    "[ProcessInferRequests] [%ld] total sequence length (%d + %d) exceeds `session_len` (%d), `request_output_len` is truncated to %d",
                    (long)seq.id,
                    state.h_context_length[idx],
                    request_output_len,
                    (int)session_len_,
                    trunc_output_len);
            }
        }

        // compute rope scaling factor
        if (r->start_flag) {
            seq.rope_theta      = model_->attn_params_.rotary_embedding_base;
            auto scaling_factor = 1.f;
            if (r->inputs[rank_].isExist("rope_scaling_factor")) {  // runtime scaling factor
                scaling_factor = r->inputs[rank_].getVal<float>("rope_scaling_factor");
            }
            else if (model_->attn_params_.rope_scaling_factor >= 1.f) {  // infer by `seq_len_limit`
                scaling_factor   = model_->attn_params_.rope_scaling_factor;
                auto max_seq_len = state.seq_len_limit[idx];
                auto max_pos_emb = model_->attn_params_.max_position_embeddings;
                if (max_seq_len > max_pos_emb) {
                    // linear-ntk style interpolation of the configured factor
                    scaling_factor = scaling_factor * max_seq_len / max_pos_emb - (scaling_factor - 1);
                    // scaling_factor = std::max(exp2f(ceilf(log2f((float)max_seq_len / max_pos_emb) + 1.f))
                    // - 1.f, 1.f);
                }
            }
            if (scaling_factor != 1.f) {
                float rope_dim = model_->attn_params_.rotary_embedding_dim;
                seq.rope_theta *= powf(scaling_factor, rope_dim / (rope_dim - 2.f));
                TM_LOG_INFO("[ProcessInferRequests] %ld rope_scaling_factor: %f, rope_theta = %f",
                            (long)seq.id,
                            scaling_factor,
                            seq.rope_theta);
            }
        }
        state.h_rope_theta[idx] = seq.rope_theta;

        if (r->start_flag) {
            // prepare to initialize random state for new sequence
            h_random_seed_[idx] = r->inputs[rank_].getVal<unsigned long long>("random_seed", 0);
        }
        else {
            // Recover device states if not a new sequence
            h_curand_state_[existing_idx.size()] = *(curandState_t*)seq.random_state.data();
            existing_idx.push_back(idx);
        }

        // ! SHARED STATE IS MODIFIED, BARRIER SYNCHRONIZATION REQUIRED
        // assign priority based on arrival time
        if (rank_ == 0) {
            r->priority = request_count_++;
        }

        // increment pointer
        idx++;
    }

    state.size = idx;

    // when there are new sequences
    if (state.size != existing_idx.size()) {
        // copy random seeds to device
        Copy(h_random_seed_, state.size, d_random_seed_);
        // initialize random states
        invokeCurandBatchInitialize(state.curand_state, state.size, d_random_seed_, stream_);
        sync_check_cuda_error();
    }

    if (!existing_idx.empty()) {
        // copy existing curand states to device
        Copy(h_curand_state_, existing_idx.size(), d_curand_state_);
        // insert the states to their correct positions in the batch
        IndexedCopy({}, existing_idx, std::tuple{d_curand_state_, state.curand_state, 1});
    }
}

// Rebuilds the batch layout for the next iteration: gathers live slots from the
// current and incoming states, lets the sequence manager decide swaps/allocations,
// reorders slots (active first, swap-ins sorted by missing length), compacts them
// into the back buffer, and refreshes the device-side KV block pointer tables.
// Returns true when generation/sampling must be re-initialized (layout changed).
template<typename T>
bool LlamaBatch<T>::Initialize()
{
    NvtxScope                                scope("initialize");
    std::vector<const Sequence*>             sequences;
    std::vector<Sequence::Status>            status;          // status BEFORE Materialize
    std::vector<uint64_t>                    priorities;
    std::vector<int>                         context_lengths;
    std::vector<std::pair<BatchState*, int>> coords;          // (owning state, slot index)

    // count the holes introduced by finished requests in from previous iteration or stop requests from
    // current iteration
    int holes{};
    int active_holes{};
    for (int i = 0; i < state_->size; ++i) {
        if (!state_->requests[i]) {
            ++holes;
            if (i < state_->active_size) {
                ++active_holes;
            }
        }
    }

    // dbg(holes, active_holes);

    // collect all live (request-bearing) slots from a state
    auto process = [&](BatchState* state) {
        for (int i = 0; i < state->size; ++i) {
            if (auto& r = state->requests[i]) {
                sequences.push_back(state->sequences[i]);
                status.push_back(state->sequences[i]->status);
                priorities.push_back(r->priority);
                context_lengths.push_back(state->h_context_length[i]);
                coords.emplace_back(state, i);
                // clear swap-in flags
                state->is_swap_in[i] = 0;
            }
        }
    };

    process(state_);
    process(incoming_);

    // may swap sequences in/out and allocate KV blocks; updates each
    // sequence's status as a side effect
    auto outcome = sequence_manager_->Materialize(sequences, context_lengths, priorities, step_length_);

    if (outcome.allocation || outcome.swap_in || outcome.swap_out) {
        dbg(outcome);
    }

    bool exchange = outcome.swap_in + outcome.swap_out > 0;

    std::vector<int> idxs(sequences.size());
    std::iota(idxs.begin(), idxs.end(), 0);

    if (exchange || holes || incoming_->size) {
        // put active ones first
        auto active_end = std::stable_partition(idxs.begin(), idxs.end(), [&](int idx) {
            return sequences[idx]->status == Sequence::kActive;  // present status
        });

        // all blocks are not enough to hold a single sequence
        if (!sequences.empty()) {
            FT_CHECK_WITH_INFO(active_end != idxs.begin(), "No enough blocks.");
        }

        // move swap-ins to the back
        auto swapin_beg = std::stable_partition(idxs.begin(), active_end, [&](int idx) {
            return status[idx] == Sequence::kActive;  // past status
        });

        // sort swap-ins according to missing length
        if (swapin_beg != active_end) {
            std::vector<int> missing_len(sequences.size());
            for (int i = 0; i < sequences.size(); ++i) {
                missing_len[i] = context_lengths[i] - sequences[i]->cache_len;
            }
            std::stable_sort(swapin_beg, active_end, [&](int i, int j) { return missing_len[i] < missing_len[j]; });
        }

        // Copy sequence states to back buffer
        FT_CHECK(back_->size == 0 && back_->active_size == 0);
        std::vector<std::tuple<BatchState*, BatchState*, int, int>> cpys;
        for (const auto& i : idxs) {
            auto& s = *sequences[i];
            if (exchange) {
                const auto& [state, idx] = coords[i];
                // mark swap-ins
                if (status[i] != Sequence::kActive && s.status == Sequence::kActive) {
                    state->is_swap_in[idx] = 1;
                }
            }
            if (s.status == Sequence::kActive) {
                ++back_->active_size;
            }
            cpys.emplace_back(coords[i].first, back_, coords[i].second, back_->size++);
        }
        CopyState(cpys);
        // Swap the buffers
        std::swap(state_, back_);

        ClearState(*back_);
        ClearState(*incoming_);
    }

    FT_CHECK(state_->size <= max_batch_size_);

    /// Update block ptrs when there were
    //  1. swap-in or swap-out
    //  2. holes in the active buffer
    //  3. new allocations (for existing active sequences)
    if (exchange || active_holes || outcome.allocation) {
        // Prepare intermediate buffers
        h_cu_block_counts_[0] = 0;

        auto k_ptrs = h_k_block_ptrs_;
        auto v_ptrs = h_v_block_ptrs_;

        const int batch_size = state_->active_size;

        for (int i = 0; i < batch_size; ++i) {
            const auto& seq = *state_->sequences[i];

            // cumulative num of blocks
            h_cu_block_counts_[i + 1] = h_cu_block_counts_[i] + seq.blocks.size();

            FT_CHECK_WITH_INFO(h_cu_block_counts_[i + 1] <= sequence_manager_->max_block_count(),
                               std::to_string(h_cu_block_counts_[i + 1]));

            // flatten per-sequence block lists into contiguous K/V pointer arrays
            k_ptrs = std::transform(seq.blocks.cbegin(), seq.blocks.cend(), k_ptrs, [&](auto p) {
                return reinterpret_cast<uintptr_t>(sequence_manager_->OffsetKey(p->data));
            });
            v_ptrs = std::transform(seq.blocks.cbegin(), seq.blocks.cend(), v_ptrs, [&](auto p) {
                return reinterpret_cast<uintptr_t>(sequence_manager_->OffsetVal(p->data));
            });
        }

        static_assert(sizeof(uintptr_t) == sizeof(void*));

        Copy(h_cu_block_counts_, batch_size + 1, cu_block_counts_);
        Copy(h_k_block_ptrs_, h_cu_block_counts_[batch_size], k_block_ptrs_);
        Copy(h_v_block_ptrs_, h_cu_block_counts_[batch_size], v_block_ptrs_);
    }

    /// Layout of the buffers is changed, generation & sampling need to be re-initialized for correctness when there
    /// were
    //  1. swap-in or swap-out
    //  2. holes in the active buffer
    return exchange || active_holes;
}

template<typename T>
Li Zhang's avatar
Li Zhang committed
477
void LlamaBatch<T>::CopyState(const std::vector<std::tuple<BatchState*, BatchState*, int, int>>& desc)
Li Zhang's avatar
Li Zhang committed
478
{
Li Zhang's avatar
Li Zhang committed
479
480
    std::vector<int> idxs(desc.size());
    std::iota(idxs.begin(), idxs.end(), 0);
Li Zhang's avatar
Li Zhang committed
481

Li Zhang's avatar
Li Zhang committed
482
    std::sort(idxs.begin(), idxs.end(), [&](int i, int j) { return desc[i] < desc[j]; });
Li Zhang's avatar
Li Zhang committed
483

Li Zhang's avatar
Li Zhang committed
484
485
486
    auto get_signature = [&](int i) -> std::pair<BatchState*, BatchState*> {
        return std::make_pair(std::get<0>(desc[idxs[i]]), std::get<1>(desc[idxs[i]]));
    };
Li Zhang's avatar
Li Zhang committed
487

Li Zhang's avatar
Li Zhang committed
488
489
490
491
492
493
494
495
496
497
    std::vector<int> offsets;
    auto             current = get_signature(0);
    offsets.push_back(0);
    for (int i = 0; i < idxs.size(); ++i) {
        if (auto signature = get_signature(i); signature != current) {
            current = signature;
            offsets.push_back(i);
        }
    }
    offsets.push_back(idxs.size());
Li Zhang's avatar
Li Zhang committed
498

Li Zhang's avatar
Li Zhang committed
499
500
501
    for (int bi = 1; bi < offsets.size(); ++bi) {
        int beg = offsets[bi - 1];
        int end = offsets[bi];
Li Zhang's avatar
Li Zhang committed
502

Li Zhang's avatar
Li Zhang committed
503
504
505
        if (beg == end) {
            continue;
        }
Li Zhang's avatar
Li Zhang committed
506

Li Zhang's avatar
Li Zhang committed
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
        auto [s, d] = get_signature(beg);

        std::vector<int> s_idx;
        std::vector<int> d_idx;
        for (int i = beg; i < end; ++i) {
            s_idx.push_back(std::get<2>(desc[idxs[i]]));
            d_idx.push_back(std::get<3>(desc[idxs[i]]));
        }

        IndexedCopy(s_idx,
                    d_idx,
                    std::tuple{s->output_ids, d->output_ids, session_len_},
                    std::tuple{s->curand_state, d->curand_state, 1});
    }

    for (const auto& [s, d, si, di] : desc) {
        d->h_context_length[di] = s->h_context_length[si];
        d->h_finished[di]       = s->h_finished[si];
        d->h_rope_theta[di]     = s->h_rope_theta[si];
        d->seq_len_limit[di]    = s->seq_len_limit[si];
        d->sequences[di]        = s->sequences[si];
        d->is_swap_in[di]       = s->is_swap_in[si];
        d->requests[di]         = s->requests[si];
    }
Li Zhang's avatar
Li Zhang committed
531
532
533
534
}

// (Re)allocates the per-iteration working buffers sized by `batch_size` and
// `session_len`. Uses allocator_->reMalloc so repeated calls resize in place;
// buffers flagged `true` are zero-initialized. Sets is_allocate_buffer_ so
// the destructor/FreeBuffer knows there is something to release.
template<typename T>
void LlamaBatch<T>::AllocateBuffer(size_t batch_size, size_t session_len)
{
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    const size_t batchxbeam = batch_size;  // beam width is 1 here

    const size_t hidden_units      = model_->hidden_units_;
    const size_t vocab_size        = model_->vocab_size_padded_;
    const size_t head_dim          = model_->size_per_head_;
    const size_t local_kv_head_num = model_->local_kv_head_num_;
    // +1 padding, BlockIterator does not use predicate
    const size_t max_block_count = sequence_manager_->max_block_count() + 1;

    // context (prefill) activations, sized by the max token budget
    context_decoder_input_buf_ =
        (T*)allocator_->reMalloc(context_decoder_input_buf_, sizeof(T) * max_context_token_num_ * hidden_units, false);
    context_decoder_output_buf_ =
        (T*)allocator_->reMalloc(context_decoder_output_buf_, sizeof(T) * max_context_token_num_ * hidden_units, false);
    context_decoder_ids_buf_ =
        (int*)allocator_->reMalloc(context_decoder_ids_buf_, sizeof(int) * max_context_token_num_, false);

    // temporary K/V staging buffers and their per-sequence pointer tables
    tmp_k_cache_buf_ = (T*)allocator_->reMalloc(
        tmp_k_cache_buf_, sizeof(T) * max_context_token_num_ * local_kv_head_num * head_dim, false);
    tmp_v_cache_buf_ = (T*)allocator_->reMalloc(
        tmp_v_cache_buf_, sizeof(T) * max_context_token_num_ * local_kv_head_num * head_dim, false);

    tmp_k_ptrs_ = (void**)allocator_->reMalloc(tmp_k_ptrs_, sizeof(void*) * batch_size, false);
    tmp_v_ptrs_ = (void**)allocator_->reMalloc(tmp_v_ptrs_, sizeof(void*) * batch_size, false);

    // decode-step activations (one token per slot)
    decoder_input_buf_  = (T*)allocator_->reMalloc(decoder_input_buf_, sizeof(T) * batchxbeam * hidden_units, false);
    decoder_output_buf_ = (T*)allocator_->reMalloc(decoder_output_buf_, sizeof(T) * batchxbeam * hidden_units, false);

    input_ids_buf_      = (int*)allocator_->reMalloc(input_ids_buf_, sizeof(int) * batchxbeam * session_len, true);
    input_length_buf_   = (int*)allocator_->reMalloc(input_length_buf_, sizeof(int) * batchxbeam);
    context_length_buf_ = (int*)allocator_->reMalloc(context_length_buf_, sizeof(int) * batchxbeam);

    sequence_lengths_ = (int*)allocator_->reMalloc(sequence_lengths_, sizeof(int) * batchxbeam, false);

    // device-side KV block bookkeeping (filled from the host arrays in Initialize)
    cu_block_counts_ = (int*)allocator_->reMalloc(cu_block_counts_, sizeof(int) * (batch_size + 1));
    k_block_ptrs_    = (uintptr_t*)allocator_->reMalloc(k_block_ptrs_, sizeof(uintptr_t) * max_block_count);
    v_block_ptrs_    = (uintptr_t*)allocator_->reMalloc(v_block_ptrs_, sizeof(uintptr_t) * max_block_count);

    logits_buf_       = (float*)allocator_->reMalloc(logits_buf_, sizeof(float) * batchxbeam * vocab_size, false);
    local_logits_buf_ = (float*)allocator_->reMalloc(local_logits_buf_, sizeof(float) * batchxbeam * vocab_size, false);

    // double-sized: holds both the transposed and output token layouts
    token_ids_buf_ = (int*)allocator_->reMalloc(token_ids_buf_, sizeof(int) * batchxbeam * session_len * 2, true);

    finished_buf_  = (bool*)allocator_->reMalloc(finished_buf_, sizeof(bool) * batchxbeam, false);
    seq_limit_len_ = (uint32_t*)allocator_->reMalloc(seq_limit_len_, sizeof(uint32_t) * batch_size, false);

    // per-request output tensors (pointer + length tables, zero-initialized)
    request_output_ids_ptrs_ = (int**)allocator_->reMalloc(request_output_ids_ptrs_, sizeof(int*) * batch_size, true);
    request_output_ids_lens_ = (int*)allocator_->reMalloc(request_output_ids_lens_, sizeof(int) * batch_size, true);
    request_seqlen_ptrs_     = (int**)allocator_->reMalloc(request_seqlen_ptrs_, sizeof(int*) * batch_size, true);

    rope_theta_ = (float*)allocator_->reMalloc(rope_theta_, sizeof(float) * batch_size, false);

    is_allocate_buffer_ = true;
}

template<typename T>
void LlamaBatch<T>::AllocatePersistantBuffer(size_t max_batch_size)
{
    // Allocates buffers that live for the lifetime of the batch object and are
    // sized by `max_batch_size` only (unlike the per-iteration buffers in
    // `AllocateBuffer`). `reMalloc(ptr, bytes, zero_init, is_host)` — the third
    // argument requests zero-fill, the fourth (where present) selects a host
    // (pinned) allocation instead of a device one.

    // Device- and host-side storage for stop-word / bad-word token lists,
    // `kMaxStopBadWordsLen` slots per request.
    d_stop_words_ = (int*)allocator_->reMalloc(d_stop_words_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true);
    d_bad_words_  = (int*)allocator_->reMalloc(d_bad_words_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true);
    h_stop_words_ =
        (int*)allocator_->reMalloc(h_stop_words_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true, true);
    h_bad_words_ =
        (int*)allocator_->reMalloc(h_bad_words_, sizeof(int) * max_batch_size * kMaxStopBadWordsLen, true, true);

    // Host staging buffers for per-request sampling parameters; they are
    // gathered from request tensors in `InitializeSampling`.
    h_runtime_top_k_ = (int*)allocator_->reMalloc(h_runtime_top_k_, sizeof(int) * max_batch_size, true, true);
    h_runtime_top_p_ = (float*)allocator_->reMalloc(h_runtime_top_p_, sizeof(float) * max_batch_size, true, true);
    h_temperature_   = (float*)allocator_->reMalloc(h_temperature_, sizeof(float) * max_batch_size, true, true);
    h_repetition_penalty_ =
        (float*)allocator_->reMalloc(h_repetition_penalty_, sizeof(float) * max_batch_size, true, true);

    // Random seeds and cuRAND states, mirrored on host and device.
    h_random_seed_ = (unsigned long long*)allocator_->reMalloc(
        h_random_seed_, sizeof(unsigned long long) * max_batch_size, true, true);
    d_random_seed_ = (unsigned long long*)allocator_->reMalloc(
        d_random_seed_, sizeof(unsigned long long) * max_batch_size, true, false);

    h_curand_state_ =
        (curandState_t*)allocator_->reMalloc(h_curand_state_, sizeof(curandState_t) * max_batch_size, true, true);
    d_curand_state_ =
        (curandState_t*)allocator_->reMalloc(d_curand_state_, sizeof(curandState_t) * max_batch_size, true, false);

    // Per-slot EOS token id, filled in `InitializeSampling` and consumed by
    // dynamic decode on device.
    d_end_ids_buf_ = (int*)allocator_->reMalloc(d_end_ids_buf_, sizeof(int) * max_batch_size, false);
    h_end_ids_buf_ = (int*)allocator_->reMalloc(h_end_ids_buf_, sizeof(int) * max_batch_size, false, true);

    // Name -> (host staging ptr, optional device ptr) table; iterated in
    // `InitializeSampling` to batch up per-request sampling params. A null
    // device pointer means the param is consumed directly from host memory.
    sampling_params_ = {
        {"stop_words_list", (std::byte*)h_stop_words_, (std::byte*)d_stop_words_},
        {"bad_words_list", (std::byte*)h_bad_words_, (std::byte*)d_bad_words_},
        {"runtime_top_k", (std::byte*)h_runtime_top_k_, nullptr},
        {"runtime_top_p", (std::byte*)h_runtime_top_p_, nullptr},
        {"temperature", (std::byte*)h_temperature_, nullptr},
        {"repetition_penalty", (std::byte*)h_repetition_penalty_, nullptr},
    };

    // Per-state (active/back/incoming) device buffers.
    for (auto& s : states_) {
        s.output_ids = (int*)allocator_->reMalloc(s.output_ids, sizeof(int) * max_batch_size * session_len_, true);
        s.curand_state =
            (curandState_t*)allocator_->reMalloc(s.curand_state, sizeof(curandState_t) * max_batch_size, true);
    }

    const size_t max_block_count = sequence_manager_->max_block_count();

    {
        // NOTE(review): host allocations below are wrapped in an NCCL guard —
        // presumably to serialize with collective ops on other ranks; confirm.
        NcclGuard barrier(model_->tensor_para_, stream_, true);
        h_input_ids_buf_ =
            (int*)allocator_->reMalloc(h_input_ids_buf_, sizeof(int) * max_batch_size * session_len_, false, true);
        h_input_length_buf_ =
            (int*)allocator_->reMalloc(h_input_length_buf_, sizeof(int) * max_batch_size, false, true);

        // Host mirrors of the temporary k/v pointer arrays used in prefill.
        h_tmp_k_ptrs_ = (void**)allocator_->reMalloc(h_tmp_k_ptrs_, sizeof(void*) * max_batch_size, false, true);
        h_tmp_v_ptrs_ = (void**)allocator_->reMalloc(h_tmp_v_ptrs_, sizeof(void*) * max_batch_size, false, true);

        // Host mirrors of the KV-cache block bookkeeping (cumulative counts
        // need one extra slot for the leading zero of the prefix sum).
        h_cu_block_counts_ =
            (int*)allocator_->reMalloc(h_cu_block_counts_, sizeof(int) * (max_batch_size + 1), false, true);
        h_k_block_ptrs_ =
            (uintptr_t*)allocator_->reMalloc(h_k_block_ptrs_, sizeof(uintptr_t) * max_block_count, false, true);
        h_v_block_ptrs_ =
            (uintptr_t*)allocator_->reMalloc(h_v_block_ptrs_, sizeof(uintptr_t) * max_block_count, false, true);

        // Host-side per-slot metadata for each batch state.
        for (auto& s : states_) {
            s.h_context_length =
                (int*)allocator_->reMalloc(s.h_context_length, sizeof(int) * max_batch_size, false, true);
            s.h_finished   = (bool*)allocator_->reMalloc(s.h_finished, sizeof(bool) * max_batch_size * 2, false, true);
            s.h_rope_theta = (float*)allocator_->reMalloc(s.h_rope_theta, sizeof(float) * max_batch_size, false, true);
        }

        h_seq_limit_len_ =
            (uint32_t*)allocator_->reMalloc(h_seq_limit_len_, sizeof(uint32_t) * max_batch_size, false, true);

        // Host mirrors of the per-request output tensor pointers/lengths that
        // are gathered in `InitializeGeneration`.
        h_request_output_ids_ptrs_ =
            (int**)allocator_->reMalloc(h_request_output_ids_ptrs_, sizeof(int*) * max_batch_size, true, true);
        h_request_output_ids_lens_ =
            (int*)allocator_->reMalloc(h_request_output_ids_lens_, sizeof(int) * max_batch_size, true, true);
        h_request_seqlen_ptrs_ =
            (int**)allocator_->reMalloc(h_request_seqlen_ptrs_, sizeof(int*) * max_batch_size, true, true);

        h_output_ids_ =
            (int*)allocator_->reMalloc(h_output_ids_, sizeof(int) * max_batch_size * session_len_, false, true);
    }

    is_allocate_persistant_buffer_ = true;
}

template<typename T>
void LlamaBatch<T>::FreeBuffer()
{
    // Releases everything the two Allocate* functions acquired, guarded by the
    // corresponding `is_allocate_*` flags so repeated calls (e.g. from the
    // destructor after an explicit free) are safe. `free(ptr, true)` releases a
    // host allocation; the single-argument form releases device memory.
    TM_LOG_DEBUG(__PRETTY_FUNCTION__);
    if (is_allocate_buffer_) {
        // --- transient (per-`AllocateBuffer`) device buffers ---
        allocator_->free((void**)&context_decoder_input_buf_);
        allocator_->free((void**)&context_decoder_output_buf_);
        allocator_->free((void**)&context_decoder_ids_buf_);

        allocator_->free((void**)&tmp_k_cache_buf_);
        allocator_->free((void**)&tmp_v_cache_buf_);
        allocator_->free((void**)&tmp_k_ptrs_);
        allocator_->free((void**)&tmp_v_ptrs_);

        allocator_->free((void**)&decoder_input_buf_);
        allocator_->free((void**)&decoder_output_buf_);

        allocator_->free((void**)&input_ids_buf_);
        allocator_->free((void**)&input_length_buf_);
        allocator_->free((void**)&context_length_buf_);

        allocator_->free((void**)&sequence_lengths_);

        allocator_->free((void**)&cu_block_counts_);
        allocator_->free((void**)&k_block_ptrs_);
        allocator_->free((void**)&v_block_ptrs_);

        allocator_->free((void**)&logits_buf_);
        allocator_->free((void**)&local_logits_buf_);

        // Context-logits buffers are allocated lazily in `OutputContextLogits`,
        // so they may still be null here.
        if (local_context_logits_buf_) {
            allocator_->free((void**)&local_context_logits_buf_);
        }
        if (context_logits_buf_) {
            allocator_->free((void**)&context_logits_buf_);
        }

        allocator_->free((void**)&token_ids_buf_);

        // NOTE(review): the end-id buffers are allocated in
        // `AllocatePersistantBuffer` but freed here under `is_allocate_buffer_`
        // — verify the two flags always toggle together, otherwise this can
        // leak or double-manage them.
        allocator_->free((void**)&d_end_ids_buf_);
        allocator_->free((void**)&h_end_ids_buf_, true);

        allocator_->free((void**)&finished_buf_);
        allocator_->free((void**)&seq_limit_len_);

        allocator_->free((void**)&request_output_ids_ptrs_);
        allocator_->free((void**)&request_output_ids_lens_);
        allocator_->free((void**)&request_seqlen_ptrs_);

        allocator_->free((void**)&rope_theta_);

        is_allocate_buffer_ = false;
    }

    if (is_allocate_persistant_buffer_) {
        // --- persistent buffers (see `AllocatePersistantBuffer`) ---
        allocator_->free((void**)&d_stop_words_);
        allocator_->free((void**)&h_stop_words_, true);
        allocator_->free((void**)&d_bad_words_);
        allocator_->free((void**)&h_bad_words_, true);
        allocator_->free((void**)&d_random_seed_);
        allocator_->free((void**)&h_random_seed_, true);
        allocator_->free((void**)&d_curand_state_);
        allocator_->free((void**)&h_curand_state_, true);

        // Per-state host metadata and device buffers.
        for (auto& s : states_) {
            allocator_->free((void**)&s.h_context_length, true);
            allocator_->free((void**)&s.h_finished, true);
            allocator_->free((void**)&s.h_rope_theta, true);
            allocator_->free((void**)&s.output_ids);
            allocator_->free((void**)&s.curand_state);
        }
        allocator_->free((void**)&h_tmp_k_ptrs_, true);
        allocator_->free((void**)&h_tmp_v_ptrs_, true);
        allocator_->free((void**)&h_cu_block_counts_, true);
        allocator_->free((void**)&h_k_block_ptrs_, true);
        allocator_->free((void**)&h_v_block_ptrs_, true);
        allocator_->free((void**)&h_input_ids_buf_, true);
        allocator_->free((void**)&h_input_length_buf_, true);
        allocator_->free((void**)&h_seq_limit_len_, true);

        allocator_->free((void**)&h_request_output_ids_ptrs_, true);
        allocator_->free((void**)&h_request_output_ids_lens_, true);
        allocator_->free((void**)&h_request_seqlen_ptrs_, true);

        allocator_->free((void**)&h_output_ids_, true);

        is_allocate_persistant_buffer_ = false;
    }
}

template<typename T>
/// Constructs the batch scheduler/executor for a LlamaV2 model instance.
/// @param max_batch_size        upper bound on concurrent sequences per batch
/// @param max_context_token_num budget of context tokens per prefill sub-batch
/// @param session_len           maximum token length of a single session
/// @param sequence_manager      KV-cache block manager (ownership transferred)
/// @param model                 non-owning pointer to the model; must outlive
///                              this object (stream/allocator are borrowed from it)
LlamaBatch<T>::LlamaBatch(int                              max_batch_size,
                          int                              max_context_token_num,
                          int                              session_len,
                          std::unique_ptr<SequenceManager> sequence_manager,
                          LlamaV2<T>*                      model):
    max_batch_size_(max_batch_size),
    max_context_token_num_(max_context_token_num),
    session_len_(session_len),
    rank_(model->tensor_para_.rank_),
    debug_(model->debug_),
    step_length_(model->step_length_),
    sequence_manager_(std::move(sequence_manager)),
    model_(model),
    data_type_(getTensorType<T>())
{
    // Borrow execution resources from the model (non-owning).
    stream_         = model_->stream_;
    allocator_      = model_->allocator_;
    cublas_wrapper_ = model_->cublas_wrapper_;

    // Pre-size the per-slot vectors of every batch state.
    for (auto& s : states_) {
        s.requests.resize(max_batch_size);
        s.sequences.resize(max_batch_size);
        s.seq_len_limit.resize(max_batch_size);
        s.is_swap_in.resize(max_batch_size);
    }

    // Fixed roles: [0] currently-active batch, [1] double-buffered back state,
    // [2] staging area for newly arrived requests.
    state_    = &states_[0];
    back_     = &states_[1];
    incoming_ = &states_[2];

    AllocateBuffer(max_batch_size, session_len_);
    AllocatePersistantBuffer(max_batch_size);
}

template<typename T>
/// Gathers per-request sampling parameters (top-k/top-p/temperature/
/// stop-words/...) into batched host/device buffers and hands them to the
/// dynamic decode layer. Must run after the active batch (`state_`) is final.
void LlamaBatch<T>::InitializeSampling()
{
    NvtxScope _("InitSampling");
    const int batch_size = state_->active_size;
    TensorMap inputs;
    // `sampling_params_` maps a param name to its host staging buffer and an
    // optional device buffer (see `AllocatePersistantBuffer`).
    for (const auto& [name, h_ptr, d_ptr] : sampling_params_) {
        // find an exemplar that matches the param name
        const Tensor* ptr{};
        for (int i = 0; i < batch_size; ++i) {
            if (state_->requests[i]->inputs[rank_].isExist(name)) {
                ptr = &state_->requests[i]->inputs[rank_].at(name);
                break;
            }
        }
        // fill the batch of the param
        if (ptr) {
            const auto& ref   = *ptr;
            auto        shape = ref.shape;
            // Each request supplies the param with a leading batch dim of 1;
            // the batched tensor just scales that dim to `batch_size`.
            FT_CHECK(shape[0] == 1);
            shape[0]                = batch_size;
            const int size_in_bytes = ref.sizeBytes();
            // Zero the whole staging area first: requests that don't supply
            // the param keep zeroed (default) slots.
            memset(h_ptr, 0, size_in_bytes * batch_size);
            for (int i = 0; i < batch_size; ++i) {
                if (state_->requests[i]->inputs[rank_].isExist(name)) {
                    Tensor& src = state_->requests[i]->inputs[rank_].at(name);
                    // All providers must agree on the per-request shape.
                    FT_CHECK(ref.shape == src.shape);
                    std::copy_n(src.getPtr<std::byte>(), size_in_bytes, h_ptr + size_in_bytes * i);
                }
            }
            if (d_ptr) {
                Copy(h_ptr, batch_size * size_in_bytes, d_ptr);
            }
            // Expose the batched param from device memory when available,
            // otherwise straight from the host staging buffer.
            inputs.insert({name, {d_ptr ? MEMORY_GPU : MEMORY_CPU, ref.type, shape, d_ptr ? d_ptr : h_ptr}});
            if (debug_ && rank_ == 0) {
                TM_LOG_INFO("[initializeSampling] %s", format({name, inputs.at(name)}).c_str());
            }
        }
    }

    // init for eos
    std::fill_n(h_end_ids_buf_, batch_size, model_->end_id_);
    Copy(h_end_ids_buf_, batch_size, d_end_ids_buf_);
    inputs.insert({"end_id", {MEMORY_GPU, TYPE_INT32, {(size_t)batch_size}, d_end_ids_buf_}});

    // Keep the TensorMap alive for the duration of the generation loop; the
    // decode layer holds pointers into it.
    inputs_ = std::move(inputs);

    model_->dynamic_decode_layer_->setup(batch_size, 1, &inputs_);
}

template<typename T>
/// Prepares device-side state for the token-by-token generation loop:
/// uploads lengths/finish flags, transposes output ids to step-major layout,
/// pads shorter sequences to a common start step, and caches per-request
/// output pointers. Returns the initial `GenerationState` counters.
auto LlamaBatch<T>::InitializeGeneration() -> GenerationState
{
    NvtxScope _("InitGen");
    const int batch_size      = state_->active_size;
    const int max_context_len = *std::max_element(state_->h_context_length, state_->h_context_length + batch_size);

    Copy(state_->h_context_length, batch_size, context_length_buf_);  // also referenced in `SetOutputTensors`
    Copy(context_length_buf_, batch_size, sequence_lengths_);
    // `sequence_lengths_` will be increased by dynamic decode
    // note that in decoder and in output "sequence length" has different semantic
    // - in decoder it means length of sequence that has kv cache already computed
    // - in output it means length of all tokens (the last generated token does not have k/v cache computed yet)
    invokePlusScalar(sequence_lengths_, -1, batch_size, stream_);
    sync_check_cuda_error();

    // Rebuild the step-major token buffer from the batch-major per-slot ids.
    Clear(token_ids_buf_, batch_size * session_len_);
    invokeTransposeAxis01(token_ids_buf_, state_->output_ids, batch_size, session_len_, 1, stream_);
    sync_check_cuda_error();

    // token_ids_buf_[s, b]
    // ABCDe            ABCDe     e
    // ABCDEFGHIJk      ABCDEFGHIJk
    // ABCDEFGHi    ->  ABCDEFGHi i
    // ABCDEFGh         ABCDEFGh  h
    // ABCd             ABCd      d
    invokePadLastTokenIds(token_ids_buf_, context_length_buf_, max_context_len, batch_size, stream_);
    sync_check_cuda_error();

    // used for dispatching split-k decoding kernels
    const int sum_seq_len =
        std::accumulate(state_->h_context_length, state_->h_context_length + batch_size, -batch_size);
    const int max_seq_len = *std::max_element(state_->h_context_length, state_->h_context_length + batch_size) - 1;

    // seq_limit_len_, will be compared to `step` instead of `sequence_length`, so padding len should be accounted
    // for
    for (int i = 0; i < batch_size; ++i) {
        h_seq_limit_len_[i] = state_->seq_len_limit[i] + (max_context_len - state_->h_context_length[i]);
    }
    Copy(h_seq_limit_len_, batch_size, seq_limit_len_);
    Copy(state_->h_finished, batch_size, finished_buf_);

    // Cache raw pointers into each request's output tensors so kernels can
    // scatter results directly; validity is asserted up front.
    for (int i = 0; i < batch_size; ++i) {
        Tensor& output_ids         = state_->requests[i]->outputs[rank_].at("output_ids");
        int*    req_output_ids_ptr = output_ids.getPtr<int>();
        int*    req_seqlen_ptr     = state_->requests[i]->outputs[rank_].getPtr<int>("sequence_length");

        h_request_output_ids_ptrs_[i] = req_output_ids_ptr;
        h_request_output_ids_lens_[i] = output_ids.shape.at(2);  // capacity of the request's output-ids dim
        h_request_seqlen_ptrs_[i]     = req_seqlen_ptr;

        FT_CHECK(h_request_output_ids_ptrs_[i]);
        FT_CHECK(h_request_output_ids_lens_[i]);
        FT_CHECK(h_request_seqlen_ptrs_[i]);
    }
    Copy(h_request_output_ids_ptrs_, batch_size, request_output_ids_ptrs_);
    Copy(h_request_output_ids_lens_, batch_size, request_output_ids_lens_);
    Copy(h_request_seqlen_ptrs_, batch_size, request_seqlen_ptrs_);

    Copy(state_->h_rope_theta, batch_size, rope_theta_);

    // ! range of step_ [1, 2 * session_len]
    // consider a sequence with context_len == session_len and another sequence with context_len == 1 and
    // request_output_len == session_len - 1 => step_ will loop in [session_len, 2 * session_len)
    const int start_step = max_context_len;

    if (rank_ == 0) {
        TM_LOG_INFO("[initGen] batch_size = %d", (int)batch_size);
        TM_LOG_INFO("[initGen] max_context_len = %d", (int)max_context_len);

        if (debug_) {
            TM_LOG_INFO("[initGen] slot  sequence_id  context_len  seq_limit_len  finished");
            for (int i = 0; i < batch_size; ++i) {
                TM_LOG_INFO("[initGen] %4d  %11ld  %11d  %13d  %8d",
                            i,
                            (long)state_->sequences[i]->id,
                            state_->h_context_length[i],
                            (int)h_seq_limit_len_[i],
                            (int)state_->h_finished[i]);
            }
        }
    }

    return GenerationState{max_context_len, start_step, sum_seq_len, max_seq_len};
}

template<typename T>
/// Runs one decoding step for the whole active batch: embedding lookup ->
/// decoder forward -> logits -> dynamic decode (sampling + stop handling),
/// then advances the counters in `g`.
/// @param g mutable per-loop generation state (step/seq-len counters)
/// @return true to continue generating, false when dynamic decode signals stop
bool LlamaBatch<T>::Generate(GenerationState& g)
{
    NvtxScope scope("Generate");
    const int batch_size = state_->active_size;

    // Log a step banner every `kLogInterval` steps on rank 0 only.
    constexpr int kLogInterval = 10;
    if (rank_ == 0 && (g.step - 1) % kLogInterval == 0) {
        TM_LOG_INFO("------------------------- step = %d -------------------------", g.step - 1);
    }

    // First step after prefill: `step` still equals the initial max context len.
    const bool is_first_step = (g.step == g.max_init_ctx_len);

    // Snapshot the previous step's tokens for the debug dump below.
    std::vector<int> prev;
    if (debug_ && rank_ == 0 && is_first_step) {
        prev.resize(batch_size);
        Copy(token_ids_buf_ + (g.step - 1) * batch_size, batch_size, prev.data());
    }

    // embeddingLookup(step_ - 1);
    model_->embeddingLookup(decoder_input_buf_,  //
                            token_ids_buf_,
                            batch_size,
                            g.step - 1);

    model_->decoderForward(decoder_output_buf_,
                           k_block_ptrs_,
                           v_block_ptrs_,
                           decoder_input_buf_,
                           sequence_lengths_,
                           finished_buf_,
                           cu_block_counts_,
                           rope_theta_,
                           g.step,
                           0,
                           g.sum_seq_len,
                           g.max_seq_len,
                           batch_size);

    model_->postDecodeEmbedding(logits_buf_,  //
                                local_logits_buf_,
                                decoder_output_buf_,
                                batch_size);

    /// sync for better NVTX visualization, THIS IS NOT NEEDED
    // check_cuda_error(cudaStreamSynchronize(stream_));

    // stop-words & bad-words require the matched tokens to be contiguous, so item size > 1 is
    // not supported yet.
    bool should_stop{};
    model_->dynamicDecode(token_ids_buf_,
                          finished_buf_,
                          sequence_lengths_,
                          &should_stop,
                          state_->curand_state,
                          &inputs_,
                          &outputs_,
                          logits_buf_,
                          seq_limit_len_,
                          context_length_buf_,
                          d_end_ids_buf_,
                          g.step,
                          0,
                          g.max_init_ctx_len,
                          session_len_ * 2,
                          batch_size);

    // Debug dump: previous-step tokens (first step only) and the tokens just
    // sampled for every slot.
    if (debug_ && rank_ == 0) {
        std::vector<int> curr(batch_size);

        Copy(token_ids_buf_ + g.step * batch_size, batch_size, curr.data());
        // ensure async copies above have landed before reading `curr`
        cudaStreamSynchronize(stream_);

        if (is_first_step) {
            std::stringstream sprev;
            for (int k = 0; k < prev.size(); ++k) {
                sprev << std::setw(6) << prev[k];
            }
            TM_LOG_INFO("[ lookup ] step = %d, [%s]", g.step - 1, sprev.str().c_str());
        }

        std::stringstream scurr;
        for (int k = 0; k < curr.size(); ++k) {
            scurr << std::setw(6) << curr[k];
        }
        TM_LOG_INFO("[generate] step = %d, [%s]", g.step - 1, scurr.str().c_str());
    }

    ////////////////////////////////////////////////
    /// ! increase the counters
    g.step += 1;
    g.max_seq_len += 1;
    g.sum_seq_len += batch_size;

    return !should_stop;
}

template<typename T>
/// Prefill pass: runs context decoding for every swapped-in sequence whose KV
/// cache is missing more than one token. Sequences are packed into sub-batches
/// bounded by `max_context_token_num_`, and the last input token of each
/// sequence is deliberately left to the regular decoder step.
void LlamaBatch<T>::ContextDecode()
{
    NvtxScope  _("prefill");
    const auto batch_size = state_->active_size;

    // `base` = index of the first slot needing prefill; slots before it are
    // fully cached. Slots needing prefill get their missing tokens staged into
    // `input_ids_buf_` and their input length recorded.
    int base = -1;
    for (int i = 0; i < batch_size; ++i) {
        if (state_->is_swap_in[i]) {
            const auto& seq = *state_->sequences[i];
            dbg(std::tuple(i, state_->h_context_length[i], seq.cache_len));
            if (const int missing = state_->h_context_length[i] - seq.cache_len; missing > 1) {
                base = base < 0 ? i : base;
                dbg(seq.tokens, seq.cache_len);
                Copy(state_->output_ids + i * session_len_ + seq.cache_len, missing, input_ids_buf_ + i * session_len_);
                // subtract input/context len by 1 to skip last input token (will process with decoder later)
                h_input_length_buf_[i] = missing - 1;
            }
        }
    }
    if (base < 0) {
        // TM_LOG_INFO("[decodeContext] Context decoding is not needed.");
        return;
    }

    const int context_decode_count = batch_size - base;

    // Upload host-side metadata for the whole batch before launching kernels.
    Copy(state_->h_context_length, batch_size, context_length_buf_);
    Copy(state_->h_rope_theta, batch_size, rope_theta_);
    Copy(h_input_length_buf_, batch_size, input_length_buf_);

    // check_cuda_error(cudaStreamSynchronize(stream_));
    // const auto tick = std::chrono::high_resolution_clock::now();

    if (rank_ == 0) {
        TM_LOG_INFO("[decodeContext] base = %d, count = %d", base, context_decode_count);
    }
    // subtract input/context len by 1 to skip last input token (will process with decoder later)
    invokePlusScalar(context_length_buf_ + base, -1, context_decode_count, stream_);

    // find sub-batch offsets
    // Greedy packing: extend the current sub-batch while both the total input
    // token count and (uniform KV length x sub-batch size) fit in the budget.
    std::vector<int> offsets{base};
    std::vector<int> max_context_cnts;
    int              accum_size        = 0;
    int              accum_input_count = 0;
    int              max_context_count = 0;
    for (int i = base; i < batch_size; ++i) {
        int size          = accum_size + 1;
        int input_count   = accum_input_count + h_input_length_buf_[i];
        int context_count = std::max(max_context_count, state_->h_context_length[i] - 1);
        // we have `cu_seqlens` on q so no padding for input is needed
        // kernels are expecting uniform k/v cache length -> `max_context_count * size <= max_context_token_num_`
        if (input_count <= max_context_token_num_ && context_count * size <= max_context_token_num_) {
            accum_size        = size;
            accum_input_count = input_count;
            max_context_count = context_count;
        }
        else {
            // budget exceeded: close the current sub-batch and start a new one at `i`
            offsets.push_back(i);
            max_context_cnts.push_back(max_context_count);
            accum_size        = 1;
            accum_input_count = h_input_length_buf_[i];
            max_context_count = state_->h_context_length[i] - 1;
        }
    }
    offsets.push_back(batch_size);
    max_context_cnts.push_back(max_context_count);

    dbg(offsets, max_context_cnts);

    // context decode on sub-batches
    for (int k = 0; k < offsets.size() - 1; ++k) {
        int              first          = offsets[k];
        int              last           = offsets[k + 1];
        int              sub_batch_size = last - first;
        T*               k_ptr          = tmp_k_cache_buf_;
        T*               v_ptr          = tmp_v_cache_buf_;
        std::vector<int> decode_indices{};
        std::vector<int> decode_lengths{};
        int              max_input_len{};
        auto             input_ids = context_decoder_ids_buf_;
        TM_LOG_INFO("first = %d, last = %d", first, last);
        // Pack input ids contiguously and carve per-slot slices out of the
        // temporary k/v scratch, each sized for the sub-batch's max context.
        for (int i = first; i < last; ++i) {
            // TM_LOG_INFO("session_len = %d, input_length = %d", session_len_, h_input_length_buf_[i]);
            input_ids = Copy(input_ids_buf_ + i * session_len_, h_input_length_buf_[i], input_ids);
            dbg(i, h_input_length_buf_[i]);
            h_tmp_k_ptrs_[i] = k_ptr;
            h_tmp_v_ptrs_[i] = v_ptr;
            k_ptr += model_->local_kv_head_num_ * max_context_cnts[k] * model_->size_per_head_;
            v_ptr += model_->local_kv_head_num_ * max_context_cnts[k] * model_->size_per_head_;
            decode_indices.push_back(i);
            decode_lengths.push_back(h_input_length_buf_[i]);
            max_input_len = std::max(max_input_len, h_input_length_buf_[i]);
        }
        // total packed input tokens = distance advanced in the ids buffer
        int token_count = input_ids - context_decoder_ids_buf_;
        dbg(token_count, max_input_len, max_context_cnts[k]);

        Copy(h_tmp_k_ptrs_ + first, sub_batch_size, tmp_k_ptrs_ + first);
        Copy(h_tmp_v_ptrs_ + first, sub_batch_size, tmp_v_ptrs_ + first);

        if (rank_ == 0) {
            TM_LOG_INFO(
                "[decodeContext] offset = %d, batch_size = %d, token_num = %d, max_input_len = %d, max_context_len = %d",
                base,
                sub_batch_size,
                token_count,
                max_input_len,
                max_context_cnts[k]);
        }

        dbg(first, last);
        dbg(k_block_ptrs_, v_block_ptrs_);

        model_->contextDecode(nullptr,
                              k_block_ptrs_,
                              v_block_ptrs_,
                              tmp_k_ptrs_ + first,
                              tmp_v_ptrs_ + first,
                              context_decoder_input_buf_,
                              context_decoder_output_buf_,
                              context_decoder_ids_buf_,
                              input_length_buf_ + first,
                              context_length_buf_ + first,
                              cu_block_counts_ + first,
                              rope_theta_ + first,
                              token_count,
                              max_input_len,
                              max_context_cnts[k],
                              max_context_cnts[k],
                              sub_batch_size);

        // compute logits of inputs if requested
        OutputContextLogits(context_decoder_output_buf_, decode_indices, decode_lengths);
    }

    // Undo the earlier -1 on the device-side context lengths.
    invokePlusScalar(context_length_buf_ + base, 1, context_decode_count, stream_);

    // Prefill consumed the staged inputs; reset lengths for the decode loop.
    std::fill(h_input_length_buf_ + base, h_input_length_buf_ + batch_size, 0);

    // `SequenceManager` needs real-time value of cache length
    for (int i = base; i < batch_size; ++i) {
        if (state_->requests[i]) {
            FT_CHECK(state_->sequences[i]);
            state_->sequences[i]->cache_len = state_->h_context_length[i] - 1;  // -1 since we skip last token
        }
    }

    // check_cuda_error(cudaStreamSynchronize(stream_));
    // const auto tock = std::chrono::high_resolution_clock::now();
    // if (rank_ == 0) {
    //     TM_LOG_INFO("[decodeContext] %.2f ms", std::chrono::duration<float, std::milli>(tock - tick).count());
    // }
}

1191
template<typename T>
Li Zhang's avatar
Li Zhang committed
1192
void LlamaBatch<T>::OutputContextLogits(T*                      context_decoder_output,
1193
1194
1195
1196
1197
1198
1199
1200
                                        const std::vector<int>& indices,
                                        const std::vector<int>& lengths)
{
    std::vector<float*> output_logits;
    int                 num_token = 0;
    {
        bool is_return_logits = false;
        for (int k = 0; k < indices.size(); ++k) {
Li Zhang's avatar
Li Zhang committed
1201
            auto& request = state_->requests[indices[k]];
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
            output_logits.push_back(request->outputs[rank_].getPtr<float>("logits", nullptr));
            num_token += lengths[k];
            if (output_logits.back()) {
                is_return_logits = true;
            }
        }
        if (!is_return_logits) {
            return;
        }
    }

    if (context_logits_buf_ == nullptr) {
Li Zhang's avatar
Li Zhang committed
1214
        NcclGuard guard(model_->tensor_para_, stream_, true);
Chen Xin's avatar
Chen Xin committed
1215
        context_logits_buf_ =
Li Zhang's avatar
Li Zhang committed
1216
1217
            (float*)allocator_->malloc(sizeof(float) * model_->vocab_size_padded_ * max_context_token_num_);
        const auto tp = model_->tensor_para_.world_size_;
1218
        if (tp > 1) {
Li Zhang's avatar
Li Zhang committed
1219
1220
            FT_CHECK(model_->vocab_size_padded_ % tp == 0);
            const auto local_vocab_size = model_->vocab_size_padded_ / tp;
1221
1222
1223
1224
1225
            local_context_logits_buf_ =
                (float*)allocator_->malloc(sizeof(float) * local_vocab_size * max_context_token_num_);
        }
    }

Li Zhang's avatar
Li Zhang committed
1226
    model_->postDecodeEmbedding(context_logits_buf_, local_context_logits_buf_, context_decoder_output, num_token);
1227
1228
1229
1230
1231

    auto logits = context_logits_buf_;

    for (int k = 0; k < indices.size(); ++k) {
        if (output_logits[k]) {
Li Zhang's avatar
Li Zhang committed
1232
            Copy(logits, model_->vocab_size_ * lengths[k], output_logits[k]);
1233
        }
Li Zhang's avatar
Li Zhang committed
1234
        logits += model_->vocab_size_padded_ * lengths[k];
1235
1236
1237
    }
}

Li Zhang's avatar
Li Zhang committed
1238
template<typename T>
auto LlamaBatch<T>::Finish(GenerationState& g, int& finished_count) -> std::vector<Signal>
{
    // Post-generation bookkeeping for the active batch: gathers output ids into [b, s]
    // layout, syncs lengths/finish flags to the host, publishes tokens to request output
    // buffers, and builds the list of completion/streaming signals to dispatch.
    // `finished_count` is incremented once per sequence interrupted here.
    NvtxScope scope("Finish");
    const int batch_size = state_->active_size;

    // [s,b] -> [b,s] and skip padding in [context_len, max_context_len)
    invokeGatherOutput(state_->output_ids,
                       token_ids_buf_,
                       context_length_buf_,
                       g.max_init_ctx_len,
                       g.step,
                       session_len_,
                       batch_size,
                       stream_);
    sync_check_cuda_error();

    // Async device-to-host copies; all are ordered on `stream_` and become visible to the
    // host only after the synchronize below.
    Copy(state_->output_ids, batch_size * session_len_, h_output_ids_);
    Copy(finished_buf_, batch_size, state_->h_finished);
    Copy(sequence_lengths_, batch_size, state_->h_context_length);

    check_cuda_error(cudaStreamSynchronize(stream_));

    // invariant: context_length = sequence_length + 1
    for (int i = 0; i < batch_size; ++i) {
        ++state_->h_context_length[i];
    }

    {  // set output tokens ids and sequence length
        int* output_ptr = h_output_ids_;
        for (int i = 0; i < batch_size; ++i) {
            // Only publish for streaming requests or sequences that just finished
            if (state_->requests[i] && (state_->requests[i]->stream_cb || state_->h_finished[i])) {
                const int count = state_->h_context_length[i] - 1 + int(g.step != g.max_init_ctx_len);
                // TODO: sync history output tokens at when receiving the request and copy only the last token here
                std::copy(output_ptr, output_ptr + count, h_request_output_ids_ptrs_[i]);
                *h_request_seqlen_ptrs_[i] = count;
            }
            output_ptr += session_len_;  // rows of h_output_ids_ are session_len_ wide
        }
    }

    if (debug_ && rank_ == 0) {
        std::stringstream ss;
        for (int i = 0; i < batch_size; ++i) {
            ss << (i ? ", " : "") << "(" << state_->h_context_length[i] << "," << state_->h_finished[i] << ")";
        }
        TM_LOG_INFO("[finish] [%s]", ss.str().c_str());
    }

    // `SequenceManager` needs real-time value of cache length
    for (int i = 0; i < batch_size; ++i) {
        if (state_->requests[i]) {
            FT_CHECK(state_->sequences[i]);
            state_->sequences[i]->cache_len = state_->h_context_length[i];
        }
    }

    std::vector<Signal> signals;
    {
        NvtxScope _("stream_and_completion_signal");
        for (int i = 0; i < batch_size; ++i) {
            if (state_->requests[i]) {
                if (state_->h_finished[i]) {
                    // Interrupt finished sequences and move the request handle into the signal closure
                    signals.push_back(Interrupt(i));
                    ++finished_count;
                }
                else if (state_->requests[i]->stream_cb) {
                    // Create signals by copying the request handles for non-finished streaming requests
                    signals.push_back([this, r = state_->requests[i]] {
                        if (rank_ == 0) {
                            r->stream_cb(&r->outputs[rank_].get());
                        }
                    });
                }
            }
        }
        if (finished_count) {
            // synchronize for interrupted sequences
            // (Interrupt() issues an async copy of the curand state that must complete
            //  before the sequence data is handed back — see Interrupt)
            check_cuda_error(cudaStreamSynchronize(stream_));
        }
    }
    return signals;
}

template<typename T>
Li Zhang's avatar
Li Zhang committed
1324
auto LlamaBatch<T>::Interrupt(int index, bool force_stop, bool force_end) -> Signal
Li Zhang's avatar
Li Zhang committed
1325
1326
{
    if (rank_ == 0) {
Li Zhang's avatar
Li Zhang committed
1327
        TM_LOG_INFO("[Interrupt] slot = %d, id = %lu", index, (long)state_->requests[index]->id);
Li Zhang's avatar
Li Zhang committed
1328
1329
1330
    }

    if (debug_ && rank_ == 0) {
Li Zhang's avatar
Li Zhang committed
1331
1332
        std::vector<int> tokens(state_->h_context_length[index]);
        Copy(state_->output_ids + index * session_len_, tokens.size(), tokens.data());
Li Zhang's avatar
Li Zhang committed
1333
1334
1335
1336
1337
        cudaStreamSynchronize(stream_);
        std::stringstream ss;
        for (const auto& t : tokens) {
            ss << " " << t;
        }
Li Zhang's avatar
Li Zhang committed
1338
        TM_LOG_INFO("[Interrupt] slot %d, tokens [%s]", index, ss.str().c_str());
Li Zhang's avatar
Li Zhang committed
1339
1340
    }

Li Zhang's avatar
Li Zhang committed
1341
1342
1343
    if (state_->requests[index]->end_flag || force_end) {
        // Sequence is ending this round or a stop request is issued to end it
        FT_CHECK(sequence_manager_->Erase(state_->requests[index]->id));
Li Zhang's avatar
Li Zhang committed
1344
1345
    }
    else {
Li Zhang's avatar
Li Zhang committed
1346
1347
1348
        // Account for the last generated token if not a stop request (which doesn't generate)
        const int output_len = state_->h_context_length[index] + 1 - static_cast<int>(force_stop);
        auto&     seq        = *state_->sequences[index];
Li Zhang's avatar
Li Zhang committed
1349

Li Zhang's avatar
Li Zhang committed
1350
        // Update token IDs
Li Zhang's avatar
Li Zhang committed
1351
1352
        seq.tokens.resize(output_len);
        const auto output_ids_data = state_->requests[index]->outputs[rank_].at("output_ids").getPtr<int>();
Li Zhang's avatar
Li Zhang committed
1353
        std::copy_n(output_ids_data, output_len, seq.tokens.data());
Li Zhang's avatar
Li Zhang committed
1354

Li Zhang's avatar
Li Zhang committed
1355
1356
1357
1358
        // Save random state in host memory
        seq.random_state.resize(sizeof(curandState_t));
        // This async copy must be synchronized by the caller
        Copy(state_->curand_state + index, 1, (curandState_t*)seq.random_state.data());
Li Zhang's avatar
Li Zhang committed
1359

Li Zhang's avatar
Li Zhang committed
1360
        // Set unlock flag for corresponding blocks, will be unlocked in the next `Materialize()`
Li Zhang's avatar
Li Zhang committed
1361
1362
1363
1364
        sequence_manager_->UpdateAndSetUnlock(seq);
    }

    state_->sequences[index] = nullptr;
Li Zhang's avatar
Li Zhang committed
1365
1366
1367
1368
1369
1370
1371

    // move the request handle into the signal
    return [this, r = std::move(state_->requests[index])] {
        if (rank_ == 0) {
            r->signal.set_value(0);
        }
    };
Li Zhang's avatar
Li Zhang committed
1372
1373
1374
1375
1376
}

template<typename T>
void LlamaBatch<T>::InternalThreadEntry(int device_id)
{
    // Main per-rank worker loop. Rank 0 dequeues requests; all ranks rendezvous on the
    // shared barrier so they process identical batches in lock-step. The statement order
    // (dequeue -> barrier -> process -> barrier -> signal) is the synchronization
    // contract between ranks — do not reorder.
    // TM_LOG_INFO("[InternalThreadEntry] %d", (int)rank_);
    check_cuda_error(cudaSetDevice(device_id));

    auto& shared_state = model_->shared_state_;

    auto& request_queue  = shared_state->request_queue;
    auto& infer_requests = shared_state->infer_requests;
    auto& stop_requests  = shared_state->stop_requests;

    // sequences that are removed but still counted in state's size
    int finished_count = 0;

    GenerationState g{};

    // Dequeue every `request_interval` iterations (currently every iteration)
    constexpr int request_interval = 1;
    long          request_counter  = 0;

    while (1) {
        if (rank_ == 0) {
            // Slots freed by sequences finished last round are available again
            const int  free_slot_count = max_batch_size_ - state_->size + finished_count;
            const bool is_empty        = (free_slot_count == max_batch_size_);
            stop_requests.clear();
            infer_requests.clear();
            if (is_empty || request_counter % request_interval == 0) {
                // Block if batch is empty
                request_queue.dequeue(stop_requests, infer_requests, free_slot_count, is_empty, shared_state->abort);
                if (!shared_state->abort) {
                    RejectInvalidRequests(stop_requests, infer_requests);
                }
            }
        }

        NvtxScope scope("mainloop");

        // wait while rank-0 is dequeueing
        shared_state->barrier->wait();

        if (shared_state->abort) {
            TM_LOG_INFO("[InternalThreadEntry] stop requested.");
            return;
        }

        auto signals = ProcessStopRequests(stop_requests);

        // Shared `priority` field will be assigned by rank-0
        ProcessInferRequests(infer_requests);

        // Wait while shared `requests` is being used
        shared_state->barrier->wait();

        SendSignals(std::move(signals));

        auto modified = Initialize();
        // finished sequences is handled by `Initialize()`
        finished_count = 0;

        ContextDecode();

        if (state_->active_size) {
            if (modified) {
                // Batch composition changed — rebuild generation & sampling state
                g = InitializeGeneration();
                InitializeSampling();
            }
            for (int i = 0; i < step_length_; ++i) {
                if (!Generate(g)) {
                    break;
                }
            }
            if (auto signals = Finish(g, finished_count); !signals.empty()) {
                if (finished_count) {
                    // Finished requests and corresponding output tensors will be released when notified
                    // wait for all ranks to ensure no rank (except for output thread) will access related
                    // resources
                    shared_state->barrier->wait();
                }
                SendSignals(std::move(signals));
            }
        }

        ++request_counter;
    }

    // unreachable: the loop only exits via the abort `return` above
    FT_CHECK(0);
}

template<typename T>
Li Zhang's avatar
Li Zhang committed
1463
void LlamaBatch<T>::SendSignals(std::vector<Signal> signals)
Li Zhang's avatar
Li Zhang committed
1464
{
Li Zhang's avatar
Li Zhang committed
1465
1466
1467
1468
1469
1470
1471
1472
    if (rank_ != 0 || signals.empty()) {
        return;
    }
    {
        std::lock_guard lock{output_mutex_};
        output_signals_.insert(output_signals_.end(),  //
                               std::move_iterator{signals.begin()},
                               std::move_iterator{signals.end()});
Li Zhang's avatar
Li Zhang committed
1473
    }
Li Zhang's avatar
Li Zhang committed
1474
    output_cv_.notify_one();
Li Zhang's avatar
Li Zhang committed
1475
1476
1477
1478
1479
}

template<typename T>
void LlamaBatch<T>::Start()
{
    // Launch the worker thread on the caller's current CUDA device; rank 0
    // additionally runs the output-dispatch thread.
    TM_LOG_INFO("LlamaBatch<T>::Start()");
    int dev = -1;
    check_cuda_error(cudaGetDevice(&dev));
    internal_thread_ = std::thread(&LlamaBatch::InternalThreadEntry, this, dev);
    if (rank_ == 0) {
        output_thread_ = std::thread(&LlamaBatch::OutputThreadEntry, this);
    }
}
Li Zhang's avatar
Li Zhang committed
1488

Li Zhang's avatar
Li Zhang committed
1489
1490
1491
1492
template<typename T>
void LlamaBatch<T>::OutputThreadEntry()
{
    // Dedicated thread (rank 0 only — see Start()) that drains queued signals and runs
    // them outside the main generation loop. Signals invoke user stream callbacks and
    // fulfill completion promises.
    while (true) {
        std::vector<Signal> signals;
        {
            // Wait for signals to come
            std::unique_lock lock(output_mutex_);
            output_cv_.wait(lock, [&] { return !output_signals_.empty() || output_stop_token_; });
            if (output_stop_token_) {
                TM_LOG_INFO("[OutputThreadEntry] stop requested.");
                return;
            }
            // Take the whole queue; callbacks run below without holding the lock
            signals = std::move(output_signals_);
        }
        // NOTE(review): presumably `ffi_lock_` acquires the Python GIL (or similar)
        // around user callbacks — confirm against the bindings.
        if (rank_ == 0 && model_->ffi_lock_) {
            model_->ffi_lock_(1);
        }
        // invoke stream cbs & signals
        for (const auto& s : signals) {
            s();
        }
        if (rank_ == 0 && model_->ffi_lock_) {
            model_->ffi_lock_(0);
        }
    }
}

template class LlamaBatch<half>;
template class LlamaBatch<float>;

lvhan028's avatar
lvhan028 committed
1520
}  // namespace turbomind