SequenceManager.h 3.92 KB
Newer Older
Li Zhang's avatar
Li Zhang committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
// Copyright (c) OpenMMLab. All rights reserved.

#pragma once

#include "src/turbomind/models/llama/BlockManager.h"

namespace turbomind {

struct Sequence {

    enum Status
    {
        kCached = 0,
        kLocked,
        kActive
    };

    uint64_t id;
    Status   status;

    std::vector<const Block*> blocks;
    std::vector<uint64_t>     block_unique_ids;

    mutable std::vector<int> tokens;  // update by user

    mutable int cache_len;

    // additional data kept round-to-round
    mutable std::vector<std::byte> random_state;  // update by user

    mutable float rope_theta;

    friend std::ostream& operator<<(std::ostream& os, const Sequence& seq);
};

using Sequences = std::vector<const Sequence*>;

inline std::ostream& operator<<(std::ostream& os, const Sequence& seq)
{
    os << "id=" << seq.id << ", status=" << seq.status << ", token_count=" << seq.tokens.size()
       << ", block_count=" << seq.blocks.size() << ", cache_len=" << seq.cache_len
       << ", random_state_size=" << seq.random_state.size();
    return os;
}

class SequenceManager {
public:
    explicit SequenceManager(size_t      layer_num,
                             size_t      head_num,
                             size_t      head_dim,
                             size_t      block_seq_len,
                             double      block_count,
                             int         chunk_size,
                             size_t      elem_bits,
                             int         rank,
                             IAllocator* allocator);

    SequenceManager(const SequenceManager&)     = delete;
    SequenceManager(SequenceManager&&) noexcept = default;

Li Zhang's avatar
Li Zhang committed
61
    [[nodiscard]] const Sequence* Create(uint64_t id);
Li Zhang's avatar
Li Zhang committed
62

Li Zhang's avatar
Li Zhang committed
63
    [[nodiscard]] const Sequence* Get(uint64_t id);
Li Zhang's avatar
Li Zhang committed
64

Li Zhang's avatar
Li Zhang committed
65
    [[nodiscard]] bool Contains(uint64_t id);
Li Zhang's avatar
Li Zhang committed
66

Li Zhang's avatar
Li Zhang committed
67
    [[nodiscard]] bool Erase(uint64_t id);
Li Zhang's avatar
Li Zhang committed
68
69
70
71
72
73
74
75
76

    void UpdateAndSetUnlock(const Sequence& seq);

    struct Outcome {
        int allocation;
        int swap_in;
        int swap_out;
    };

Li Zhang's avatar
Li Zhang committed
77
78
79
80
    [[nodiscard]] Outcome Materialize(Sequences                    sequences,
                                      std::vector<int>             context_lengths,
                                      const std::vector<uint64_t>& priorities,
                                      int                          step_length);
Li Zhang's avatar
Li Zhang committed
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136

    void* OffsetKey(void* block_ptr)
    {
        return block_ptr;
    }

    void* OffsetVal(void* block_ptr)
    {
        return (std::byte*)block_ptr + val_offset_;
    }

    int max_block_count() const noexcept
    {
        return block_manager_->max_block_count();
    }

private:
    void CommitUnlockAndFree();

    void VerifyAndLockCached(const Sequences& sequences);

    std::vector<int> CountRequiredBlocks(const Sequences&        sequences,  //
                                         const std::vector<int>& context_lengths,
                                         int                     step_length);

    static void SortByPriority(Sequences&                   sequences,  //
                               std::vector<int>&            context_lengths,
                               const std::vector<uint64_t>& priorities);

    static void AssignAndActivate(const Sequences&                 sequences,  //
                                  const std::vector<int>&          block_counts,
                                  const std::vector<const Block*>& blocks);

private:
    int    block_seq_len_;
    int    rank_;
    size_t val_offset_{};

    bool need_verify_{};

    // Use `std::map` to avoid reference invalidation
    std::map<uint64_t, Sequence> sequences_;

    std::unique_ptr<BlockManager> block_manager_;

    std::vector<const Block*> unlocked_;
    std::vector<const Block*> freed_;
};

inline std::ostream& operator<<(std::ostream& os, const SequenceManager::Outcome& oc)
{
    os << "allocation: " << oc.allocation << ", swap-in: " << oc.swap_in << ", swap-out: " << oc.swap_out;
    return os;
}

}  // namespace turbomind