// kvcache.cpp — creation, duplication, and destruction of per-device KV caches.
#include "../cache.hpp"
/// Allocates a fresh KV cache sharded across `ndev` devices.
///
/// For every device in `dev_ids`, one K buffer and one V buffer of shape
/// [max_len, nkvh_/ndev, dk|dv] is allocated per layer. `nkvh_` is the
/// total number of KV heads; each device holds an even share of them.
/// Caller owns the returned cache and releases it via dropKVCache().
__INFINI_C struct KVCache *createKVCache(
    size_t nlayers,
    size_t max_len,
    size_t nkvh_,
    size_t dk,
    size_t dv,
    infiniDtype_t dtype,
    infiniDevice_t device,
    int *dev_ids,
    size_t ndev) {

    // Per-device head count: KV heads are split evenly across devices.
    const size_t nkvh = nkvh_ / ndev;

    // Buffer layout per layer: [max_len, heads-per-device, head_dim].
    const std::vector<size_t> shape_k{max_len, nkvh, dk};
    const std::vector<size_t> shape_v{max_len, nkvh, dv};

    KVCache *cache = new KVCache();
    for (size_t idev = 0; idev < ndev; idev++) {
        // Make this device current so the buffers below land on it.
        RUN_INFINI(infinirtSetDevice(device, dev_ids[idev]));
        std::vector<std::shared_ptr<Tensor>> kcache;
        std::vector<std::shared_ptr<Tensor>> vcache;
        kcache.reserve(nlayers);
        vcache.reserve(nlayers);
        for (size_t layer = 0; layer < nlayers; layer++) {
            kcache.push_back(Tensor::buffer(dtype, shape_k));
            vcache.push_back(Tensor::buffer(dtype, shape_v));
        }
        cache->k.push_back(kcache);
        cache->v.push_back(vcache);
    }

    return cache;
}

/// Deep-copies a KV cache, transferring only the first `seq_len` positions
/// of each layer's K/V buffer (positions past seq_len in the duplicate are
/// freshly allocated and uninitialized, like in createKVCache).
///
/// BUG FIX: the original nested a second (shadowing) `layer` loop inside
/// the outer one, which (a) allocated nlayers^2 tensors per device,
/// (b) pushed nlayers vectors per device into new_kv_cache->k/v so the
/// outer index no longer meant "device", and (c) indexed
/// new_kv_cache->k[idev][layer] in the memcpy, copying into the wrong
/// (first) per-device vector and leaving later buffers unfilled.
/// The structure now mirrors createKVCache: one tensor list per device,
/// one tensor per layer, copied right after allocation.
__INFINI_C struct KVCache *duplicateKVCache(const KVCache *kv_cache, size_t seq_len) {
    auto ndev = kv_cache->k.size();
    auto nlayers = kv_cache->k[0].size();
    auto device = kv_cache->k[0][0]->deviceType();
    auto dtype = kv_cache->k[0][0]->dtype();
    auto shape_k = kv_cache->k[0][0]->shape();
    auto shape_v = kv_cache->v[0][0]->shape();
    // Buffers are laid out [max_len, nkvh, d*]; the first seq_len rows are
    // one contiguous span, so a single memcpy of seq_len*nkvh*d*dsize works.
    auto size_k = seq_len * shape_k[1] * shape_k[2] * dsize(dtype);
    auto size_v = seq_len * shape_v[1] * shape_v[2] * dsize(dtype);
    KVCache *new_kv_cache = new KVCache();
    for (size_t idev = 0; idev < ndev; idev++) {
        // Allocate and copy on the same device that owns the source buffers.
        RUN_INFINI(infinirtSetDevice(device, kv_cache->k[idev][0]->deviceId()));
        auto kcache = std::vector<std::shared_ptr<Tensor>>();
        auto vcache = std::vector<std::shared_ptr<Tensor>>();
        kcache.reserve(nlayers);
        vcache.reserve(nlayers);
        for (size_t layer = 0; layer < nlayers; layer++) {
            kcache.push_back(Tensor::buffer(dtype, shape_k));
            vcache.push_back(Tensor::buffer(dtype, shape_v));
            RUN_INFINI(infinirtMemcpy(kcache[layer]->data(),
                                      kv_cache->k[idev][layer]->data(),
                                      size_k,
                                      INFINIRT_MEMCPY_D2D));
            RUN_INFINI(infinirtMemcpy(vcache[layer]->data(),
                                      kv_cache->v[idev][layer]->data(),
                                      size_v,
                                      INFINIRT_MEMCPY_D2D));
        }
        new_kv_cache->k.push_back(kcache);
        new_kv_cache->v.push_back(vcache);
    }
    return new_kv_cache;
}

/// Releases a KV cache created by createKVCache()/duplicateKVCache().
///
/// Each device's buffers are reset with that device made current first, so
/// the device-memory frees run in the right device context; then the cache
/// object itself is deleted.
///
/// Robustness fix: the original unconditionally dereferenced
/// kv_cache->k[0][0], which is UB for a null pointer or an empty cache.
/// Like free(NULL), dropping a null cache is now a no-op, and empty
/// per-device lists are skipped.
__INFINI_C void dropKVCache(KVCache *kv_cache) {
    if (kv_cache == nullptr) {
        return;
    }
    auto ndev = kv_cache->k.size();
    for (size_t idev = 0; idev < ndev; idev++) {
        auto nlayers = kv_cache->k[idev].size();
        if (nlayers == 0) {
            continue;
        }
        // Free this shard's buffers with its owning device made current.
        auto device = kv_cache->k[idev][0]->deviceType();
        RUN_INFINI(infinirtSetDevice(device, kv_cache->k[idev][0]->deviceId()));
        for (size_t layer = 0; layer < nlayers; layer++) {
            kv_cache->k[idev][layer].reset();
            kv_cache->v[idev][layer].reset();
        }
    }
    delete kv_cache;
}