// Copyright (c) OpenMMLab. All rights reserved.

#pragma once

#include "src/turbomind/models/llama/BlockManager.h"
6
#include <functional>
Li Zhang's avatar
Li Zhang committed
7
8
9
10
11
12
13
14
15
16
17
18
19

namespace turbomind {

struct Sequence {

    enum Status
    {
        kCached = 0,
        kLocked,
        kActive
    };

    uint64_t id;
20
    Status   status = kCached;
Li Zhang's avatar
Li Zhang committed
21

Li Zhang's avatar
Li Zhang committed
22
23
    BlockIds  blocks;
    UniqueIds block_unique_ids;
Li Zhang's avatar
Li Zhang committed
24

25
26
    int input_length = 0;

Li Zhang's avatar
Li Zhang committed
27
28
    mutable std::vector<int> tokens;  // update by user

29
    mutable int cache_len = 0;
Li Zhang's avatar
Li Zhang committed
30
31

    // additional data kept round-to-round
32
    mutable std::vector<uint8_t> random_state;  // update by user
Li Zhang's avatar
Li Zhang committed
33

34
35
    mutable float rope_theta = 0.f;

Chen Xin's avatar
Chen Xin committed
36
    // embedding data
37
    mutable std::vector<std::vector<uint8_t>> input_embeddings;
Chen Xin's avatar
Chen Xin committed
38
39
    mutable std::vector<std::pair<int, int>>    input_embedding_ranges;

Li Zhang's avatar
Li Zhang committed
40
    explicit Sequence(uint64_t _id): id(_id) {}
Li Zhang's avatar
Li Zhang committed
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56

    friend std::ostream& operator<<(std::ostream& os, const Sequence& seq);
};

// A list of raw pointers to read-only `Sequence` objects.
using Sequences = std::vector<const Sequence*>;

// Writes a single-line, comma-separated summary of `seq` to `os`
// (id, status, token/block counts, cache length, random-state size)
// and returns `os` so calls can be chained.
inline std::ostream& operator<<(std::ostream& os, const Sequence& seq)
{
    const auto token_count       = seq.tokens.size();
    const auto block_count       = seq.blocks.size();
    const auto random_state_size = seq.random_state.size();

    os << "id=" << seq.id;
    os << ", status=" << seq.status;
    os << ", token_count=" << token_count;
    os << ", block_count=" << block_count;
    os << ", cache_len=" << seq.cache_len;
    os << ", random_state_size=" << random_state_size;

    return os;
}

class SequenceManager {
public:
zhouxiang's avatar
zhouxiang committed
57
58
59
60
61
62
63
64
65
66
    explicit SequenceManager(size_t         layer_num,
                             size_t         head_num,
                             size_t         head_dim,
                             size_t         block_seq_len,
                             double         block_count,
                             int            chunk_size,
                             size_t         elem_bits,
                             int            rank,
                             IAllocator*    allocator,
                             GetFreeMemSize get_free_size);
Li Zhang's avatar
Li Zhang committed
67
68
69
70

    SequenceManager(const SequenceManager&)     = delete;
    SequenceManager(SequenceManager&&) noexcept = default;

Li Zhang's avatar
Li Zhang committed
71
    [[nodiscard]] const Sequence* Create(uint64_t id);
Li Zhang's avatar
Li Zhang committed
72

Li Zhang's avatar
Li Zhang committed
73
    [[nodiscard]] const Sequence* Get(uint64_t id);
Li Zhang's avatar
Li Zhang committed
74

Li Zhang's avatar
Li Zhang committed
75
    [[nodiscard]] bool Contains(uint64_t id);
Li Zhang's avatar
Li Zhang committed
76

Li Zhang's avatar
Li Zhang committed
77
    [[nodiscard]] bool Erase(uint64_t id);
Li Zhang's avatar
Li Zhang committed
78
79
80
81
82
83
84
85
86

    void UpdateAndSetUnlock(const Sequence& seq);

    struct Outcome {
        int allocation;
        int swap_in;
        int swap_out;
    };

87
88
    using AdjustInputCount = std::function<std::pair<int, int>(const Sequences&, const std::vector<int>&)>;

Li Zhang's avatar
Li Zhang committed
89
90
91
    [[nodiscard]] Outcome Materialize(Sequences                    sequences,
                                      std::vector<int>             context_lengths,
                                      const std::vector<uint64_t>& priorities,
92
93
                                      int                          step_length,
                                      AdjustInputCount             adjust);
Li Zhang's avatar
Li Zhang committed
94

Li Zhang's avatar
Li Zhang committed
95
    [[nodiscard]] void* GetKeyPtr(int block_id)
Li Zhang's avatar
Li Zhang committed
96
    {
Li Zhang's avatar
Li Zhang committed
97
        return block_manager_->block(block_id).data;
Li Zhang's avatar
Li Zhang committed
98
99
    }

Li Zhang's avatar
Li Zhang committed
100
    [[nodiscard]] void* GetValPtr(int block_id)
Li Zhang's avatar
Li Zhang committed
101
    {
102
        return (uint8_t*)GetKeyPtr(block_id) + val_offset_;
Li Zhang's avatar
Li Zhang committed
103
104
105
106
107
108
109
110
    }

    int max_block_count() const noexcept
    {
        return block_manager_->max_block_count();
    }

private:
111
    void Erase(std::map<uint64_t, Sequence>::iterator& it);
Li Zhang's avatar
Li Zhang committed
112

Li Zhang's avatar
Li Zhang committed
113
114
115
116
117
118
119
120
121
122
123
124
    void CommitUnlockAndFree();

    void VerifyAndLockCached(const Sequences& sequences);

    std::vector<int> CountRequiredBlocks(const Sequences&        sequences,  //
                                         const std::vector<int>& context_lengths,
                                         int                     step_length);

    static void SortByPriority(Sequences&                   sequences,  //
                               std::vector<int>&            context_lengths,
                               const std::vector<uint64_t>& priorities);

Li Zhang's avatar
Li Zhang committed
125
126
127
128
    static void AssignAndActivate(const Sequences&        sequences,  //
                                  const std::vector<int>& counts,
                                  const BlockIds&         blocks,
                                  const UniqueIds&        unique_ids);
Li Zhang's avatar
Li Zhang committed
129
130
131
132
133
134
135
136
137
138
139

private:
    int    block_seq_len_;
    int    rank_;
    size_t val_offset_{};

    // Use `std::map` to avoid reference invalidation
    std::map<uint64_t, Sequence> sequences_;

    std::unique_ptr<BlockManager> block_manager_;

Li Zhang's avatar
Li Zhang committed
140
141
    BlockIds unlocked_;
    BlockIds freed_;
Li Zhang's avatar
Li Zhang committed
142
143
144
145
146
147
148
149
150
};

// Streams the three counters of `oc` in the form
// "allocation: A, swap-in: I, swap-out: O" and returns `os`.
inline std::ostream& operator<<(std::ostream& os, const SequenceManager::Outcome& oc)
{
    os << "allocation: " << oc.allocation;
    os << ", swap-in: " << oc.swap_in;
    os << ", swap-out: " << oc.swap_out;

    return os;
}

}  // namespace turbomind