// Copyright (c) OpenMMLab. All rights reserved.

#pragma once

#include "src/turbomind/models/llama/BlockManager.h"
#include <functional>

namespace turbomind {

struct Sequence {

    enum Status
    {
        kCached = 0,
        kLocked,
        kActive
    };

    uint64_t id;
20
    Status   status = kCached;
Li Zhang's avatar
Li Zhang committed
21

Li Zhang's avatar
Li Zhang committed
22
23
    BlockIds  blocks;
    UniqueIds block_unique_ids;
Li Zhang's avatar
Li Zhang committed
24

25
26
    int input_length = 0;

Li Zhang's avatar
Li Zhang committed
27
28
    mutable std::vector<int> tokens;  // update by user

29
    mutable int cache_len = 0;
Li Zhang's avatar
Li Zhang committed
30
31
32
33

    // additional data kept round-to-round
    mutable std::vector<std::byte> random_state;  // update by user

34
35
    mutable float rope_theta = 0.f;

Li Zhang's avatar
Li Zhang committed
36
    explicit Sequence(uint64_t _id): id(_id) {}
Li Zhang's avatar
Li Zhang committed
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65

    friend std::ostream& operator<<(std::ostream& os, const Sequence& seq);
};

// A list of pointers to const `Sequence` objects.
using Sequences = std::vector<const Sequence*>;

/// Writes a single-line, comma-separated `key=value` summary of `seq` to `os`
/// (id, status as its integer value, token/block counts, cache length and the
/// size of the saved random state) and returns `os` for chaining.
inline std::ostream& operator<<(std::ostream& os, const Sequence& seq)
{
    os << "id=" << seq.id;
    os << ", status=" << seq.status;
    os << ", token_count=" << seq.tokens.size();
    os << ", block_count=" << seq.blocks.size();
    os << ", cache_len=" << seq.cache_len;
    return os << ", random_state_size=" << seq.random_state.size();
}

class SequenceManager {
public:
    explicit SequenceManager(size_t      layer_num,
                             size_t      head_num,
                             size_t      head_dim,
                             size_t      block_seq_len,
                             double      block_count,
                             int         chunk_size,
                             size_t      elem_bits,
                             int         rank,
                             IAllocator* allocator);

    SequenceManager(const SequenceManager&)     = delete;
    SequenceManager(SequenceManager&&) noexcept = default;

Li Zhang's avatar
Li Zhang committed
66
    [[nodiscard]] const Sequence* Create(uint64_t id);
Li Zhang's avatar
Li Zhang committed
67

Li Zhang's avatar
Li Zhang committed
68
    [[nodiscard]] const Sequence* Get(uint64_t id);
Li Zhang's avatar
Li Zhang committed
69

Li Zhang's avatar
Li Zhang committed
70
    [[nodiscard]] bool Contains(uint64_t id);
Li Zhang's avatar
Li Zhang committed
71

Li Zhang's avatar
Li Zhang committed
72
    [[nodiscard]] bool Erase(uint64_t id);
Li Zhang's avatar
Li Zhang committed
73
74
75
76
77
78
79
80
81

    void UpdateAndSetUnlock(const Sequence& seq);

    struct Outcome {
        int allocation;
        int swap_in;
        int swap_out;
    };

82
83
    using AdjustInputCount = std::function<std::pair<int, int>(const Sequences&, const std::vector<int>&)>;

Li Zhang's avatar
Li Zhang committed
84
85
86
    [[nodiscard]] Outcome Materialize(Sequences                    sequences,
                                      std::vector<int>             context_lengths,
                                      const std::vector<uint64_t>& priorities,
87
88
                                      int                          step_length,
                                      AdjustInputCount             adjust);
Li Zhang's avatar
Li Zhang committed
89

Li Zhang's avatar
Li Zhang committed
90
    [[nodiscard]] void* GetKeyPtr(int block_id)
Li Zhang's avatar
Li Zhang committed
91
    {
Li Zhang's avatar
Li Zhang committed
92
        return block_manager_->block(block_id).data;
Li Zhang's avatar
Li Zhang committed
93
94
    }

Li Zhang's avatar
Li Zhang committed
95
    [[nodiscard]] void* GetValPtr(int block_id)
Li Zhang's avatar
Li Zhang committed
96
    {
Li Zhang's avatar
Li Zhang committed
97
        return (std::byte*)GetKeyPtr(block_id) + val_offset_;
Li Zhang's avatar
Li Zhang committed
98
99
100
101
102
103
104
105
    }

    int max_block_count() const noexcept
    {
        return block_manager_->max_block_count();
    }

private:
Li Zhang's avatar
Li Zhang committed
106
107
    void Erase(std::map<uint64_t, Sequence>::iterator it);

Li Zhang's avatar
Li Zhang committed
108
109
110
111
112
113
114
115
116
117
118
119
    void CommitUnlockAndFree();

    void VerifyAndLockCached(const Sequences& sequences);

    std::vector<int> CountRequiredBlocks(const Sequences&        sequences,  //
                                         const std::vector<int>& context_lengths,
                                         int                     step_length);

    static void SortByPriority(Sequences&                   sequences,  //
                               std::vector<int>&            context_lengths,
                               const std::vector<uint64_t>& priorities);

Li Zhang's avatar
Li Zhang committed
120
121
122
123
    static void AssignAndActivate(const Sequences&        sequences,  //
                                  const std::vector<int>& counts,
                                  const BlockIds&         blocks,
                                  const UniqueIds&        unique_ids);
Li Zhang's avatar
Li Zhang committed
124
125
126
127
128
129
130
131
132
133
134

private:
    int    block_seq_len_;
    int    rank_;
    size_t val_offset_{};

    // Use `std::map` to avoid reference invalidation
    std::map<uint64_t, Sequence> sequences_;

    std::unique_ptr<BlockManager> block_manager_;

Li Zhang's avatar
Li Zhang committed
135
136
    BlockIds unlocked_;
    BlockIds freed_;
Li Zhang's avatar
Li Zhang committed
137
138
139
140
141
142
143
144
145
};

/// Writes the three `Outcome` counters to `os` as
/// "allocation: A, swap-in: I, swap-out: O" and returns `os` for chaining.
inline std::ostream& operator<<(std::ostream& os, const SequenceManager::Outcome& oc)
{
    os << "allocation: " << oc.allocation;
    os << ", swap-in: " << oc.swap_in;
    os << ", swap-out: " << oc.swap_out;
    return os;
}

}  // namespace turbomind