// Copyright (c) OpenMMLab. All rights reserved.

#pragma once

#include "src/turbomind/models/llama/BlockManager.h"

#include <cstddef>
#include <cstdint>
#include <functional>
#include <map>
#include <memory>
#include <ostream>
#include <utility>
#include <vector>

namespace turbomind {

struct Sequence {

    enum Status
    {
        kCached = 0,
        kLocked,
        kActive
    };

    uint64_t id;
20
    Status   status = kCached;
Li Zhang's avatar
Li Zhang committed
21
22
23
24

    std::vector<const Block*> blocks;
    std::vector<uint64_t>     block_unique_ids;

25
26
    int input_length = 0;

Li Zhang's avatar
Li Zhang committed
27
28
    mutable std::vector<int> tokens;  // update by user

29
    mutable int cache_len = 0;
Li Zhang's avatar
Li Zhang committed
30
31
32
33

    // additional data kept round-to-round
    mutable std::vector<std::byte> random_state;  // update by user

34
35
36
    mutable float rope_theta = 0.f;

    Sequence(uint64_t _id): id(_id) {}
Li Zhang's avatar
Li Zhang committed
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

    friend std::ostream& operator<<(std::ostream& os, const Sequence& seq);
};

// A list of pointers to const `Sequence` objects.
using Sequences = std::vector<const Sequence*>;

/// Writes a single-line, comma-separated summary of `seq` to `os` and returns `os`.
///
/// Container members are reported by size only. `status` is streamed as-is; no
/// dedicated inserter for `Sequence::Status` is visible in this header, so the
/// unscoped enum is presumably printed as its integer value.
inline std::ostream& operator<<(std::ostream& os, const Sequence& seq)
{
    os << "id=" << seq.id;
    os << ", status=" << seq.status;
    os << ", token_count=" << seq.tokens.size();
    os << ", block_count=" << seq.blocks.size();
    os << ", cache_len=" << seq.cache_len;
    os << ", random_state_size=" << seq.random_state.size();
    return os;
}

class SequenceManager {
public:
    explicit SequenceManager(size_t      layer_num,
                             size_t      head_num,
                             size_t      head_dim,
                             size_t      block_seq_len,
                             double      block_count,
                             int         chunk_size,
                             size_t      elem_bits,
                             int         rank,
                             IAllocator* allocator);

    SequenceManager(const SequenceManager&)     = delete;
    SequenceManager(SequenceManager&&) noexcept = default;

Li Zhang's avatar
Li Zhang committed
66
    [[nodiscard]] const Sequence* Create(uint64_t id);
Li Zhang's avatar
Li Zhang committed
67

Li Zhang's avatar
Li Zhang committed
68
    [[nodiscard]] const Sequence* Get(uint64_t id);
Li Zhang's avatar
Li Zhang committed
69

Li Zhang's avatar
Li Zhang committed
70
    [[nodiscard]] bool Contains(uint64_t id);
Li Zhang's avatar
Li Zhang committed
71

Li Zhang's avatar
Li Zhang committed
72
    [[nodiscard]] bool Erase(uint64_t id);
Li Zhang's avatar
Li Zhang committed
73
74
75
76
77
78
79
80
81

    void UpdateAndSetUnlock(const Sequence& seq);

    struct Outcome {
        int allocation;
        int swap_in;
        int swap_out;
    };

82
83
    using AdjustInputCount = std::function<std::pair<int, int>(const Sequences&, const std::vector<int>&)>;

Li Zhang's avatar
Li Zhang committed
84
85
86
    [[nodiscard]] Outcome Materialize(Sequences                    sequences,
                                      std::vector<int>             context_lengths,
                                      const std::vector<uint64_t>& priorities,
87
88
                                      int                          step_length,
                                      AdjustInputCount             adjust);
Li Zhang's avatar
Li Zhang committed
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144

    void* OffsetKey(void* block_ptr)
    {
        return block_ptr;
    }

    void* OffsetVal(void* block_ptr)
    {
        return (std::byte*)block_ptr + val_offset_;
    }

    int max_block_count() const noexcept
    {
        return block_manager_->max_block_count();
    }

private:
    void CommitUnlockAndFree();

    void VerifyAndLockCached(const Sequences& sequences);

    std::vector<int> CountRequiredBlocks(const Sequences&        sequences,  //
                                         const std::vector<int>& context_lengths,
                                         int                     step_length);

    static void SortByPriority(Sequences&                   sequences,  //
                               std::vector<int>&            context_lengths,
                               const std::vector<uint64_t>& priorities);

    static void AssignAndActivate(const Sequences&                 sequences,  //
                                  const std::vector<int>&          block_counts,
                                  const std::vector<const Block*>& blocks);

private:
    int    block_seq_len_;
    int    rank_;
    size_t val_offset_{};

    bool need_verify_{};

    // Use `std::map` to avoid reference invalidation
    std::map<uint64_t, Sequence> sequences_;

    std::unique_ptr<BlockManager> block_manager_;

    std::vector<const Block*> unlocked_;
    std::vector<const Block*> freed_;
};

/// Writes the three counters of `oc` to `os` as
/// "allocation: A, swap-in: B, swap-out: C" and returns `os`.
inline std::ostream& operator<<(std::ostream& os, const SequenceManager::Outcome& oc)
{
    os << "allocation: " << oc.allocation;
    os << ", swap-in: " << oc.swap_in;
    os << ", swap-out: " << oc.swap_out;
    return os;
}

}  // namespace turbomind