BlockManager.h 3.68 KB
Newer Older
Li Zhang's avatar
Li Zhang committed
1
2
3
4
// Copyright (c) OpenMMLab. All rights reserved.

#pragma once

zhouxiang's avatar
zhouxiang committed
5
#include "src/turbomind/models/llama/Barrier.h"
Li Zhang's avatar
Li Zhang committed
6
7
8
9
#include "src/turbomind/utils/allocator.h"
#include "src/turbomind/utils/cuda_utils.h"
#include "src/turbomind/utils/logger.h"
#include <algorithm>
zhouxiang's avatar
zhouxiang committed
10
#include <atomic>
Li Zhang's avatar
Li Zhang committed
11
12
#include <cstdint>
#include <cuda_runtime.h>
zhouxiang's avatar
zhouxiang committed
13
#include <functional>
Li Zhang's avatar
Li Zhang committed
14
15
16
#include <iterator>
#include <numeric>
#include <queue>
Li Zhang's avatar
Li Zhang committed
17
#include <sstream>
Li Zhang's avatar
Li Zhang committed
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#include <unordered_map>
#include <vector>

namespace turbomind {

// [L, H, S, D]

// [L, S/x, H, x, D]

/// Bookkeeping record for one block of the pool.
///
/// All fields carry default member initializers so that a default-initialized
/// `Block` is well defined (use_count == 0 and timestamp == 0, i.e. it
/// satisfies `is_free`). The struct remains an aggregate in C++14 and later.
struct Block {
    int      id{};         // fixed linear id in the pool
    int      use_count{};  // active sequences using the block
    uint64_t unique_id{};  // unique for every block allocation
    uint64_t timestamp{};  // 0 for a block that has not been written yet
    void*    data{};       // pointer associated with this block (storage is managed elsewhere)

    // Stream formatter; declared here, defined outside this header.
    friend std::ostream& operator<<(std::ostream& os, const Block& block);

    /// Returns the text produced by streaming `b` through operator<<.
    friend std::string to_string(const Block& b)
    {
        std::ostringstream buffer;
        buffer << b;
        return buffer.str();
    }
};

Li Zhang's avatar
Li Zhang committed
43
44
45
// Sequence of block ids; element type matches Block::id.
using BlockIds  = std::vector<int>;
// Sequence of per-allocation ids; element type matches Block::unique_id.
using UniqueIds = std::vector<uint64_t>;

Li Zhang's avatar
Li Zhang committed
46
47
inline bool is_active(const Block& block)
{
Li Zhang's avatar
Li Zhang committed
48
49
    // timestamp may be 0 for newly allocated block that has not been written
    return block.use_count > 0;
Li Zhang's avatar
Li Zhang committed
50
51
52
53
}

inline bool is_cached(const Block& block)
{
Li Zhang's avatar
Li Zhang committed
54
    return block.use_count == 0 && block.timestamp != 0;
Li Zhang's avatar
Li Zhang committed
55
56
57
58
}

inline bool is_free(const Block& block)
{
Li Zhang's avatar
Li Zhang committed
59
    return block.use_count == 0 && block.timestamp == 0;
Li Zhang's avatar
Li Zhang committed
60
61
62
63
64
65
66
67
68
}

/// Plain summary of a block pool: three counters plus a list of use counts.
///
/// The counters are zero-initialized so that a default-initialized Snapshot
/// never exposes indeterminate values. Field order is unchanged, so positional
/// aggregate initialization keeps working.
struct Snapshot {
    int              active{};   // number of active blocks
    int              cached{};   // number of cached blocks
    int              free{};     // number of free blocks
    std::vector<int> use_count;  // use counts (presumably one entry per block -- confirm against TakeSnapshot)
};

zhouxiang's avatar
zhouxiang committed
69
70
71
72
// Callable taking no arguments and returning a size_t; judging by the name it
// reports the amount of free memory.
using GetFreeMemSize = std::function<size_t()>;

// Declared here; the definition lives outside this header.
// NOTE(review): presumably uses `barrier` and `value` to agree on a single
// free-memory size across several participants -- confirm against the definition.
size_t GetSyncFreeMemSize(Barrier& barrier, std::atomic<size_t>& value);

Li Zhang's avatar
Li Zhang committed
73
74
class BlockManager {
public:
zhouxiang's avatar
zhouxiang committed
75
76
    explicit BlockManager(
        size_t block_size, double block_count, int chunk_size, IAllocator* allocator, GetFreeMemSize get_free_size);
Li Zhang's avatar
Li Zhang committed
77
78
79
80

    ~BlockManager();

    // free -> active (use_count = 1, ref_count = 1)
Li Zhang's avatar
Li Zhang committed
81
    [[nodiscard]] std::pair<BlockIds, UniqueIds> Allocate(int count);
Li Zhang's avatar
Li Zhang committed
82
83

    // cached -> active (use_count += 1)
Li Zhang's avatar
Li Zhang committed
84
    [[maybe_unused]] int Lock(const BlockIds& ids);
Li Zhang's avatar
Li Zhang committed
85
86

    // active -> cached (use_count -= 1)
Li Zhang's avatar
Li Zhang committed
87
    [[maybe_unused]] int Unlock(const BlockIds& ids);
Li Zhang's avatar
Li Zhang committed
88
89
90
91
92

    // cached -> free (ref_count = 0)
    void Evict(int count);

    // cached -> free (ref_count -= 1)
Li Zhang's avatar
Li Zhang committed
93
    void Free(BlockIds bs);
Li Zhang's avatar
Li Zhang committed
94
95

    // increase timestamp in reversed order
Li Zhang's avatar
Li Zhang committed
96
97
98
    void Touch(const BlockIds& bs);

    [[nodiscard]] int Verify(const BlockIds& block_ids, const UniqueIds& unique_ids);
Li Zhang's avatar
Li Zhang committed
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121

    Snapshot TakeSnapshot();

    int max_block_count() const noexcept
    {
        return max_block_count_;
    }

    int active_count() const noexcept
    {
        return active_ids_.size();
    }

    int cached_count() const noexcept
    {
        return cached_ids_.size();
    }

    int free_count() const noexcept
    {
        return (max_block_count_ - blocks_.size()) + free_ids_.size();
    }

Li Zhang's avatar
Li Zhang committed
122
123
124
125
126
127
128
129
130
131
    Block& block(int idx)
    {
        return blocks_[idx];
    }

    int unique_id(int idx)
    {
        return blocks_[idx].unique_id;
    }

Li Zhang's avatar
Li Zhang committed
132
133
134
    friend std::ostream& operator<<(std::ostream& os, const BlockManager&);

private:
zhouxiang's avatar
zhouxiang committed
135
    static size_t GetBlockCount(size_t block_size, double ratio, GetFreeMemSize get_free_size);
Li Zhang's avatar
Li Zhang committed
136
137

    // move indices between sets
Li Zhang's avatar
Li Zhang committed
138
    static void Move(BlockIds& src, const BlockIds& delta, BlockIds& dst);
Li Zhang's avatar
Li Zhang committed
139
140
141
142
143
144
145
146
147
148
149
150

    // allocate a chunk of blocks
    bool Malloc();

private:
    size_t      block_size_;
    int         max_block_count_{};
    int         chunk_size_{};
    IAllocator* allocator_;

    std::vector<void*> chunks_;

Li Zhang's avatar
Li Zhang committed
151
152
153
    BlockIds active_ids_;
    BlockIds cached_ids_;
    BlockIds free_ids_;
Li Zhang's avatar
Li Zhang committed
154
155
156
157
158
159
160
161

    std::vector<Block> blocks_;  // < 100k

    uint64_t unique_id_{1};
    uint64_t timestamp_{1};
};

}  // namespace turbomind