// Copyright (c) OpenMMLab. All rights reserved.

#pragma once

#include "src/turbomind/models/llama/BlockManager.h"

#include <cstddef>
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <ostream>
#include <utility>
#include <vector>

namespace turbomind {

struct Sequence {

    enum Status
    {
        kCached = 0,
        kLocked,
        kActive
    };

    uint64_t id;
20
    Status   status = kCached;
Li Zhang's avatar
Li Zhang committed
21

Li Zhang's avatar
Li Zhang committed
22
23
    BlockIds  blocks;
    UniqueIds block_unique_ids;
Li Zhang's avatar
Li Zhang committed
24

25
26
    int input_length = 0;

Li Zhang's avatar
Li Zhang committed
27
28
    mutable std::vector<int> tokens;  // update by user

29
    mutable int cache_len = 0;
Li Zhang's avatar
Li Zhang committed
30
31
32
33

    // additional data kept round-to-round
    mutable std::vector<std::byte> random_state;  // update by user

34
35
    mutable float rope_theta = 0.f;

Chen Xin's avatar
Chen Xin committed
36
37
38
39
    // embedding data
    mutable std::vector<std::vector<std::byte>> input_embeddings;
    mutable std::vector<std::pair<int, int>>    input_embedding_ranges;

Li Zhang's avatar
Li Zhang committed
40
    explicit Sequence(uint64_t _id): id(_id) {}
Li Zhang's avatar
Li Zhang committed
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69

    friend std::ostream& operator<<(std::ostream& os, const Sequence& seq);
};

// Vector of pointers to const `Sequence`. Fields that `Sequence` declares
// `mutable` remain writable through these pointers.
using Sequences = std::vector<const Sequence*>;

// Writes a one-line, comma-separated summary of `seq` to `os`: id, status (as its
// integer value), token count, block count, cache length and random-state size.
inline std::ostream& operator<<(std::ostream& os, const Sequence& seq)
{
    os << "id=" << seq.id;
    os << ", status=" << seq.status;
    os << ", token_count=" << seq.tokens.size();
    os << ", block_count=" << seq.blocks.size();
    os << ", cache_len=" << seq.cache_len;
    os << ", random_state_size=" << seq.random_state.size();
    return os;
}

class SequenceManager {
public:
    explicit SequenceManager(size_t      layer_num,
                             size_t      head_num,
                             size_t      head_dim,
                             size_t      block_seq_len,
                             double      block_count,
                             int         chunk_size,
                             size_t      elem_bits,
                             int         rank,
                             IAllocator* allocator);

    SequenceManager(const SequenceManager&)     = delete;
    SequenceManager(SequenceManager&&) noexcept = default;

Li Zhang's avatar
Li Zhang committed
70
    [[nodiscard]] const Sequence* Create(uint64_t id);
Li Zhang's avatar
Li Zhang committed
71

Li Zhang's avatar
Li Zhang committed
72
    [[nodiscard]] const Sequence* Get(uint64_t id);
Li Zhang's avatar
Li Zhang committed
73

Li Zhang's avatar
Li Zhang committed
74
    [[nodiscard]] bool Contains(uint64_t id);
Li Zhang's avatar
Li Zhang committed
75

Li Zhang's avatar
Li Zhang committed
76
    [[nodiscard]] bool Erase(uint64_t id);
Li Zhang's avatar
Li Zhang committed
77
78
79
80
81
82
83
84
85

    void UpdateAndSetUnlock(const Sequence& seq);

    struct Outcome {
        int allocation;
        int swap_in;
        int swap_out;
    };

86
87
    using AdjustInputCount = std::function<std::pair<int, int>(const Sequences&, const std::vector<int>&)>;

Li Zhang's avatar
Li Zhang committed
88
89
90
    [[nodiscard]] Outcome Materialize(Sequences                    sequences,
                                      std::vector<int>             context_lengths,
                                      const std::vector<uint64_t>& priorities,
91
92
                                      int                          step_length,
                                      AdjustInputCount             adjust);
Li Zhang's avatar
Li Zhang committed
93

Li Zhang's avatar
Li Zhang committed
94
    [[nodiscard]] void* GetKeyPtr(int block_id)
Li Zhang's avatar
Li Zhang committed
95
    {
Li Zhang's avatar
Li Zhang committed
96
        return block_manager_->block(block_id).data;
Li Zhang's avatar
Li Zhang committed
97
98
    }

Li Zhang's avatar
Li Zhang committed
99
    [[nodiscard]] void* GetValPtr(int block_id)
Li Zhang's avatar
Li Zhang committed
100
    {
Li Zhang's avatar
Li Zhang committed
101
        return (std::byte*)GetKeyPtr(block_id) + val_offset_;
Li Zhang's avatar
Li Zhang committed
102
103
104
105
106
107
108
109
    }

    int max_block_count() const noexcept
    {
        return block_manager_->max_block_count();
    }

private:
Li Zhang's avatar
Li Zhang committed
110
111
    void Erase(std::map<uint64_t, Sequence>::iterator it);

Li Zhang's avatar
Li Zhang committed
112
113
114
115
116
117
118
119
120
121
122
123
    void CommitUnlockAndFree();

    void VerifyAndLockCached(const Sequences& sequences);

    std::vector<int> CountRequiredBlocks(const Sequences&        sequences,  //
                                         const std::vector<int>& context_lengths,
                                         int                     step_length);

    static void SortByPriority(Sequences&                   sequences,  //
                               std::vector<int>&            context_lengths,
                               const std::vector<uint64_t>& priorities);

Li Zhang's avatar
Li Zhang committed
124
125
126
127
    static void AssignAndActivate(const Sequences&        sequences,  //
                                  const std::vector<int>& counts,
                                  const BlockIds&         blocks,
                                  const UniqueIds&        unique_ids);
Li Zhang's avatar
Li Zhang committed
128
129
130
131
132
133
134
135
136
137
138

private:
    int    block_seq_len_;
    int    rank_;
    size_t val_offset_{};

    // Use `std::map` to avoid reference invalidation
    std::map<uint64_t, Sequence> sequences_;

    std::unique_ptr<BlockManager> block_manager_;

Li Zhang's avatar
Li Zhang committed
139
140
    BlockIds unlocked_;
    BlockIds freed_;
Li Zhang's avatar
Li Zhang committed
141
142
143
144
145
146
147
148
149
};

// Writes the three `Outcome` counters to `os` as
// "allocation: A, swap-in: B, swap-out: C".
inline std::ostream& operator<<(std::ostream& os, const SequenceManager::Outcome& oc)
{
    os << "allocation: " << oc.allocation;
    os << ", swap-in: " << oc.swap_in;
    os << ", swap-out: " << oc.swap_out;
    return os;
}

}  // namespace turbomind