OpenDAS / ktransformers · Commits

Commit 877aec85 (unverified), authored Apr 09, 2025 by Yuhao Tsui, committed by GitHub on Apr 09, 2025

    Merge branch 'kvcache-ai:main' into main

Parents: 84164f58, 9037bf30
Changes: 251 · Showing 20 changed files with 3473 additions and 0 deletions (+3473, -0)
csrc/balance_serve/kvc2/src/cache_entry.hh                 +182   -0
csrc/balance_serve/kvc2/src/common.h                       +0     -0
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp        +135   -0
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh         +54    -0
csrc/balance_serve/kvc2/src/defs.h                         +35    -0
csrc/balance_serve/kvc2/src/gpu_cache.cpp                  +282   -0
csrc/balance_serve/kvc2/src/gpu_cache.hh                   +74    -0
csrc/balance_serve/kvc2/src/hasher.hpp                     +40    -0
csrc/balance_serve/kvc2/src/io_helper.hpp                  +155   -0
csrc/balance_serve/kvc2/src/kvc2.h                         +138   -0
csrc/balance_serve/kvc2/src/kvc2_utils.py                  +64    -0
csrc/balance_serve/kvc2/src/metrics.cpp                    +141   -0
csrc/balance_serve/kvc2/src/metrics.h                      +77    -0
csrc/balance_serve/kvc2/src/model_config.h                 +119   -0
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp   +125   -0
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h     +54    -0
csrc/balance_serve/kvc2/src/prefix.cpp                     +1744  -0
csrc/balance_serve/kvc2/src/utils/all.hpp                  +3     -0
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp           +14    -0
csrc/balance_serve/kvc2/src/utils/easy_format.hpp          +37    -0
csrc/balance_serve/kvc2/src/cache_entry.hh
0 → 100644
#ifndef __CACHE_ENTRY_HH_
#define __CACHE_ENTRY_HH_

#include "async_store.hh"
#include "cuda_stream_manager.hh"
#include "defs.h"
#include "hasher.hpp"
#include "io_helper.hpp"
#include "page_aligned_memory_pool.h"
#include "utils/periodic_task.hpp"

#include <atomic>
#include <list>
#include <memory>

#include "utils/mutex_extend.hpp"

namespace kvc2 {

using CacheBlockKey = TokensHash;

class CacheEntryManager;
struct DoubleVerticalBlocksHandle;
class GPUPageCache;

struct ConcurrentControlUnit {
  std::atomic_size_t ref_count = 0;
  std::atomic_bool dirty = false;
  TransferControl<std::mutex> tc;

  bool can_desert();
  void debug();
};

enum IOOption {
  IO_ForceRead,
  IO_ForceWrite,
  IO_Read,
  IO_Write,
};

inline std::string to_string(IOOption op) {
  switch (op) {
    case IO_ForceRead:
      return "IO_ForceRead";
    case IO_ForceWrite:
      return "IO_ForceWrite";
    case IO_Read:
      return "IO_Read";
    case IO_Write:
      return "IO_Write";
    default:
      return "Unknown";
  }
}

struct CacheBlockEntry {
  friend CacheEntryManager;
  using MutexT = non_recursive_mutex;
  // using MutexT = std::mutex;
  MutexT lock;

  // for cache
  bool with_key = true;
  CacheBlockKey hash = 0;
  CacheBlockKey hash_check = 0;
  CacheInfo cache_info;
  CacheEntryManager* manager = nullptr;

  // for memory pool
  void* data = nullptr;
  size_t size = 0;
  ConcurrentControlUnit cpu_cc;

  // for disk
  size_t layer = -1;
  size_t idx = -1;

  // for gpu
  std::optional<size_t> gpu_block_idx = std::nullopt;
  ConcurrentControlUnit gpu_cc;

  CacheBlockEntry() = default;
  CacheBlockEntry(const CacheBlockEntry& other) = delete;
  CacheBlockEntry& operator=(const CacheBlockEntry& other) = delete;
  CacheBlockEntry(CacheBlockEntry&& other) = delete;
  CacheBlockEntry& operator=(CacheBlockEntry&& other) = delete;
  ~CacheBlockEntry();

 private:
  bool alloc_on_cpu();

 public:
  void free_on_cpu();
  bool alloc_on_cpu_no_lock();
  bool inc_ref_or_alloc_on_cpu();
  void set_key(TokensHash key, std::shared_ptr<CacheBlockEntry> me);

  std::unique_lock<MutexT> try_lock();
  std::lock_guard<MutexT> lock_guard();

  // will not get lock
  void io_with(async_store::IODealer* dealer, IO_Helper<CacheBlockEntry>& io_helper, async_store::ArrayStore* store,
               size_t layer, size_t index, IOOption option);
  void flush_back_async(IO_Helper<CacheBlockEntry>& helper, std::vector<std::atomic_bool*>& dirty_flags);

  void debug();
};

struct CacheBlockEntryCollector {
  std::vector<CacheBlockEntry*> entries;
  std::function<void(CacheBlockEntry*)> exit_fn;

  CacheBlockEntryCollector(std::function<void(CacheBlockEntry*)> exit_fn);
  ~CacheBlockEntryCollector();
  CacheBlockEntryCollector(const CacheBlockEntryCollector& other) = delete;
  CacheBlockEntryCollector(CacheBlockEntryCollector&& other) = delete;
  CacheBlockEntryCollector& operator=(const CacheBlockEntryCollector& other) = delete;
  CacheBlockEntryCollector& operator=(CacheBlockEntryCollector&& other) = delete;
};

struct KVC2;

struct CacheEntryManagerConfig {
  size_t evict_count = 100;
  KVC2* kvc2_top = nullptr;
};

class CacheEntryManager {
 public:
  using Key = CacheBlockKey;
  using BlockPtr = std::shared_ptr<CacheBlockEntry>;

 private:
  friend CacheBlockEntry;
  CacheEntryManagerConfig config;

  std::mutex lock;
  std::list<BlockPtr> usage_list;
  std::unordered_map<Key, std::list<BlockPtr>::iterator> key_entry_map;

  void insert(BlockPtr entry);
  BlockPtr access(const Key& key);
  // void remove(const Key& key);
  void evict(std::function<bool(const BlockPtr&)> filter, std::function<bool()> stop_condition);

 public:
  std::unique_ptr<periodic::PeriodicTask> background_flush_back = nullptr;
  std::shared_ptr<PageAlignedMemoryPool> pool;
  std::shared_ptr<GPUPageCache> gpu_cache;

  CacheEntryManager(CacheEntryManagerConfig config);
  // disable all move and copy
  CacheEntryManager(const CacheEntryManager& other) = delete;
  CacheEntryManager& operator=(const CacheEntryManager& other) = delete;
  CacheEntryManager(CacheEntryManager&& other) = delete;
  CacheEntryManager& operator=(CacheEntryManager&& other) = delete;

  void cpu_background_flush();
  void evict_for_cpu_cache();

  // just get block pointers, not allocate them, will not return nullptr
  BlockPtr get(bool& is_new, size_t size, std::optional<Key> key = std::nullopt);

  void debug();
};

}  // namespace kvc2
#endif
csrc/balance_serve/kvc2/src/common.h
0 → 100644
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp
0 → 100644
#include "cuda_stream_manager.hh"
#include <cuda_runtime.h>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <vector>

#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"

CudaStreamManager::CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device) {
  for (int device_id : device_ids) {
    auto x = std::unique_ptr<DeviceInfo>(new DeviceInfo);
    DeviceInfo& device_info = *x;
    device_info.device_id = device_id;
    device_info.next_stream_index = 0;
    device_info.stop_flag = false;

    // Select the device
    cudaError_t err = cudaSetDevice(device_id);
    if (err != cudaSuccess) {
      SPDLOG_WARN("cudaSetDevice failed on device {}: {}", device_id, cudaGetErrorString(err));
      throw std::runtime_error("cudaSetDevice failed");
    }

    // Create the CUDA streams
    device_info.streams.resize(num_streams_per_device);
    for (int i = 0; i < num_streams_per_device; ++i) {
      err = cudaStreamCreate(&device_info.streams[i]);
      if (err != cudaSuccess) {
        SPDLOG_WARN("Failed to create CUDA stream on device {}: {}", device_id, cudaGetErrorString(err));
        throw std::runtime_error("Failed to create CUDA stream");
      }
    }

    // Start the per-device worker thread
    device_info.worker_thread = std::thread(&CudaStreamManager::deviceWorker, this, std::ref(device_info));
    devices_.push_back(std::move(x));
  }
}

CudaStreamManager::~CudaStreamManager() {
  // Tell every device worker thread to stop
  for (auto& device_info : devices_) {
    device_info->stop_flag.store(true);
    auto request = std::shared_ptr<Request>(new Request);
    request->should_exit = true;
    device_info->request_queue.enqueue(std::move(request));
  }

  // Wait for all threads to finish
  for (auto& device_info : devices_) {
    if (device_info->worker_thread.joinable()) {
      device_info->worker_thread.join();
    }
    // Destroy the CUDA streams
    cudaSetDevice(device_info->device_id);
    for (auto& stream : device_info->streams) {
      cudaStreamDestroy(stream);
    }
  }
}

void CudaStreamManager::submitRequest(std::shared_ptr<Request> request) {
  // Find the matching device
  for (auto& device_info : devices_) {
    if (device_info->device_id == request->device_id) {
      device_info->request_queue.enqueue(request);
      return;
    }
  }
  throw std::runtime_error("Invalid device ID in request");
}

void CudaStreamManager::deviceWorker(DeviceInfo& device_info) {
  // Select the device
  cudaError_t err = cudaSetDevice(device_info.device_id);
  if (err != cudaSuccess) {
    SPDLOG_WARN("cudaSetDevice failed in worker thread for device {}: {}", device_info.device_id,
                cudaGetErrorString(err));
    return;
  }
  while (device_info.stop_flag.load() == false) {
    auto request = device_info.request_queue.dequeue();
    if (request->should_exit) {
      return;
    }
    // Handle the request
    SPDLOG_DEBUG("Getting request on device {}, count {}", device_info.device_id, request->host_mem_addresses.size());
    int stream_index = device_info.next_stream_index;
    cudaStream_t stream = device_info.streams[stream_index];
    device_info.next_stream_index = (device_info.next_stream_index + 1) % device_info.streams.size();

    size_t num_transfers = request->host_mem_addresses.size();
    for (size_t i = 0; i < num_transfers; ++i) {
      void* dst = request->device_mem_addresses[i];
      void* src = request->host_mem_addresses[i];
      if (request->direction == cudaMemcpyDeviceToHost) {
        std::swap(dst, src);
      }
      cudaError_t err = cudaMemcpyAsync(dst, src, request->sizes[i], request->direction, stream);
      if (err != cudaSuccess) {
        SPDLOG_WARN("cudaMemcpyAsync failed on device {}: {}", device_info.device_id, cudaGetErrorString(err));
        // Errors could be handled here as needed; for now simply continue
        continue;
      }
    }

    // Register the completion callback; since the copies are asynchronous, it has to be wrapped
    struct CallbackData {
      std::function<void()> callback;
    };
    CallbackData* cb_data = new CallbackData{request->callback};
    err = cudaLaunchHostFunc(
        stream,
        [](void* data) {
          // SPDLOG_DEBUG("Callback function called");
          CallbackData* cb_data = static_cast<CallbackData*>(data);
          cb_data->callback();
          delete cb_data;
        },
        cb_data);
    if (err != cudaSuccess) {
      SPDLOG_WARN("cudaLaunchHostFunc failed on device {}: {}", device_info.device_id, cudaGetErrorString(err));
      // Handle the error as needed
    }
  }
}
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh
0 → 100644
/*
 * @Author: Xie Weiyu ervinxie@qq.com
 * @Date: 2024-11-19 09:24:47
 * @LastEditors: Xie Weiyu ervinxie@qq.com
 * @LastEditTime: 2024-11-20 02:55:49
 * @FilePath: /kvc2/src/cuda_stream_manager.hh
 * @Description: This is the default file header; set `customMade` and open koroFileHeader to adjust the configuration: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
 */
#pragma once
#include <cuda_runtime.h>

#include <atomic>
#include <functional>
#include <memory>
#include <thread>
#include <vector>

#include "utils/mpsc.hpp"

class CudaStreamManager {
 public:
  // Constructor: takes the list of device IDs to use and the number of streams per device
  CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device);
  ~CudaStreamManager();

  // Request struct
  struct Request {
    bool should_exit = false;
    int device_id;
    std::vector<void*> host_mem_addresses;
    std::vector<void*> device_mem_addresses;
    std::vector<size_t> sizes;
    cudaMemcpyKind direction;
    std::function<void()> callback;
  };

  void submitRequest(std::shared_ptr<Request> request);

 private:
  // Per-device information
  struct DeviceInfo {
    int device_id;
    std::thread worker_thread;
    std::vector<cudaStream_t> streams;
    int next_stream_index;
    MPSCQueueConsumerLock<std::shared_ptr<Request>> request_queue;
    std::atomic_bool stop_flag;
  };

  // Mapping from device ID to DeviceInfo
  std::vector<std::unique_ptr<DeviceInfo>> devices_;

  // Private methods
  void deviceWorker(DeviceInfo& device_info);
};
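A minimal illustrative sketch of driving one copy through this interface, assuming a single CUDA device with ID 0, a pinned host buffer, and that "utils/mpsc.hpp" is on the include path; the callback fires from cudaLaunchHostFunc once the async copies on the chosen stream complete:

#include "cuda_stream_manager.hh"
#include <cuda_runtime.h>
#include <future>

int main() {
  // One worker thread and two streams on device 0.
  CudaStreamManager manager({0}, /*num_streams_per_device=*/2);

  const size_t bytes = 1 << 20;
  void* host = nullptr;
  void* device = nullptr;
  cudaMallocHost(&host, bytes);  // pinned host buffer, required for true async copies
  cudaMalloc(&device, bytes);

  auto request = std::make_shared<CudaStreamManager::Request>();
  request->device_id = 0;
  request->direction = cudaMemcpyHostToDevice;
  request->host_mem_addresses = {host};
  request->device_mem_addresses = {device};
  request->sizes = {bytes};

  std::promise<void> done;
  request->callback = [&] { done.set_value(); };  // invoked via cudaLaunchHostFunc

  manager.submitRequest(request);
  done.get_future().wait();  // block until the copy and callback have run

  cudaFree(device);
  cudaFreeHost(host);
  return 0;
}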
csrc/balance_serve/kvc2/src/defs.h
0 → 100644
#ifndef __DEFS_H_
#define __DEFS_H_

#include <cstdint>
#include <optional>
#include <vector>

#include "model_config.h"

namespace kvc2 {
using kvc2_ptr = void*;
// using data_block_ptr = std::intptr_t;
using data_block_ptr = void*;
using layer_data = std::vector<data_block_ptr>;
using kvc2_handle = void*;

using Token = uint32_t;
using Tokens = std::vector<Token>;
using TokenPtr = std::intptr_t;
using TokenLength = size_t;
using BlockLength = size_t;

struct CacheInfo {
  ModelName model_name;
  bool is_key_cache;
  QuantType quant_type;

  size_t hidden_layer_count();
  std::filesystem::path path(std::optional<size_t> which_layer = std::nullopt);
  bool operator==(const CacheInfo& other) const;
  size_t element_size(size_t block_length);
  size_t hash_value() const;
};

};  // namespace kvc2
#endif
csrc/balance_serve/kvc2/src/gpu_cache.cpp
0 → 100644
#include "gpu_cache.hh"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"

#include "cache_entry.hh"
#include "utils/arithmetic.hpp"

namespace kvc2 {

GPUPageCache::GPUPageCache(GPUPageCacheConfig& config) : config(config) {
  if (torch::cuda::is_available()) {
    size_t gpu_count = torch::cuda::device_count();
    SPDLOG_INFO("Number of available GPUs: {}, want {}", gpu_count, config.gpu_devices_id.size());
    if (gpu_count < config.gpu_devices_id.size()) {
      SPDLOG_ERROR("Not enough GPUs available.");
      exit(0);
    }
    for (auto x : config.gpu_devices_id) {
      gpu_devices.push_back(torch::Device(torch::kCUDA, x));
    }
  } else {
    SPDLOG_ERROR("CUDA is not available on this system.");
    exit(0);
  }

  SPDLOG_WARN("Creating GPU Cache");
  shape.push_back(config.layer_count);
  shape.push_back(config.total_kvcache_pages);
  shape.push_back(config.num_token_per_page);
  if (config.full_kv_cache_on_each_gpu) {
    if (config.gpu_devices_id.size() > 1) {
      SPDLOG_WARN("Replicated KVCache on multiple gpu");
    }
    shape.push_back(config.num_k_heads);
  } else {
    shape.push_back(config.num_k_heads / config.gpu_devices_id.size());
  }
  shape.push_back(config.k_head_dim);
  tensor_size = torch::elementSize(config.tensor_type);
  for (auto& s : shape) {
    tensor_size *= s;
  }
  SPDLOG_INFO("Creating KV Page Cache, Shape ({},{},{},{},{}), Size {} MiB", shape[0], shape[1], shape[2], shape[3],
              shape[4], tensor_size / (1 << 20));

  if (config.k_cache_on) {
    for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
      auto k = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
      k = k.to(gpu_devices[i]);
      k_cache.push_back(k);
      SPDLOG_INFO("K Page Cache of GPU {} is created", config.gpu_devices_id[i]);
    }
    occupations.resize(config.layer_count);
  } else {
    SPDLOG_WARN("Disable K Cache");
    assert(config.gpu_only);
  }

  if (config.v_cache_on) {
    for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
      auto v = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
      v = v.to(gpu_devices[i]);
      v_cache.push_back(v);
      SPDLOG_INFO("V Page Cache of GPU {} is created", config.gpu_devices_id[i]);
    }
    v_occupations.resize(config.layer_count);
  } else {
    SPDLOG_WARN("Disable V Cache");
    // assert(config.gpu_only); // should not assert
  }

  if (config.gpu_only) {
    gpu_only_occupations.resize(config.total_kvcache_pages, false);
  }

  num_free_pages = config.total_kvcache_pages;
  for (size_t i = 0; i < config.layer_count; i++) {
    if (config.k_cache_on)
      occupations[i].resize(config.total_kvcache_pages, nullptr);
    if (config.v_cache_on)
      v_occupations[i].resize(config.total_kvcache_pages, nullptr);
  }

  tp_size.resize(config.gpu_devices_id.size(), shape[2] * shape[3] * shape[4] * c10::elementSize(config.tensor_type));
  tp_offset.resize(config.gpu_devices_id.size(), 0);
  for (size_t i = 1; i < tp_offset.size(); i++) {
    tp_offset[i] = tp_offset[i - 1] + tp_size[i - 1];
  }

  stream_manager =
      std::unique_ptr<CudaStreamManager>(new CudaStreamManager(config.gpu_devices_id, config.num_streams_per_device));
}

bool GPUPageCache::alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at) {
  std::lock_guard<std::mutex> lg(lock);
  auto idx = next_empty_col();
  if (idx.has_value()) {
    // must have entry lock
    auto& k0_entry = k_entries[0][at];
    k0_entry->gpu_block_idx = idx;
    for (size_t l = 0; l < config.layer_count; l++) {
      if (config.k_cache_on) {
        assert(k_entries[l][at]->data != nullptr);
        occupations[l][idx.value()] = k_entries[l][at];
      }
      if (config.v_cache_on) {
        assert(v_entries[l][at]->data != nullptr);
        v_occupations[l][idx.value()] = v_entries[l][at];
      }
    }
    return true;
  } else {
    return false;
  }
}

std::vector<size_t> GPUPageCache::gpu_only_alloc_col(size_t count) {
  assert(config.gpu_only);
  std::lock_guard<std::mutex> lg(lock);
  std::vector<size_t> re;
  for (size_t i = 0; i < config.total_kvcache_pages; i++) {
    if (gpu_only_occupations[i] == false) {
      re.push_back(i);
      if (re.size() == count) {
        break;
      }
    }
  }
  if (re.size() == count) {
    for (auto at : re) {
      gpu_only_occupations[at] = true;
    }
  } else {
    SPDLOG_WARN("GPU ONLY: Cannot allocate {} cols", count);
    re.clear();
  }
  return re;
}

void GPUPageCache::gpu_only_free_cols(std::vector<size_t> cols) {
  assert(config.gpu_only);
  std::lock_guard<std::mutex> lg(lock);
  for (auto at : cols) {
    assert(gpu_only_occupations[at]);
    gpu_only_occupations[at] = false;
  }
}

std::optional<size_t> GPUPageCache::next_empty_col() {
  if (num_free_pages == 0) {
    evict_cols();
    if (num_free_pages == 0) {
      return std::nullopt;
    }
  }
  while (occupations[0][_col_idx] != nullptr) {
    _col_idx = (_col_idx + 1) % config.total_kvcache_pages;
  }
  num_free_pages -= 1;
  return _col_idx;
}

void GPUPageCache::evict_cols() {
  auto evicted_count = 0;
  for (size_t i = 0; i < config.total_kvcache_pages; i++) {
    auto& h = occupations[0][i];
    if (h == nullptr) {
      continue;
    }
    auto lg = h->lock_guard();
    if (h->gpu_cc.can_desert()) {
      h->gpu_cc.tc.reset();
      h = nullptr;
      num_free_pages += 1;
      evicted_count += 1;
    }
  }
  if (evicted_count > 0)
    SPDLOG_INFO("GPU: Evicted {} GPU pages", evicted_count);
}

std::vector<std::unique_lock<CacheBlockEntry::MutexT>> GPUPageCache::try_lock_col(size_t at) {
  std::vector<std::unique_lock<CacheBlockEntry::MutexT>> re;
  if (config.k_cache_on) {
    for (size_t l = 0; l < config.layer_count; l++) {
      if (occupations[l][at] == nullptr) {
        return {};
      }
      auto ul = occupations[l][at]->try_lock();
      if (ul.owns_lock()) {
        re.push_back(std::move(ul));
      } else {
        return {};
      }
    }
  }
  if (config.v_cache_on) {
    for (size_t l = 0; l < config.layer_count; l++) {
      if (v_occupations[l][at] == nullptr) {
        return {};
      }
      auto ul = v_occupations[l][at]->try_lock();
      if (ul.owns_lock()) {
        re.push_back(std::move(ul));
      } else {
        return {};
      }
    }
  }
  return re;
}

std::vector<std::shared_ptr<CudaStreamManager::Request>> GPUPageCache::basic_request(cudaMemcpyKind direction,
                                                                                     std::function<void()> callback) {
  std::vector<std::shared_ptr<CudaStreamManager::Request>> re;
  re.resize(config.gpu_devices_id.size(), nullptr);
  for (size_t i = 0; i < re.size(); i++) {
    re[i] = std::shared_ptr<CudaStreamManager::Request>(new CudaStreamManager::Request);
    re[i]->direction = direction;
    re[i]->device_id = config.gpu_devices_id[i];
    re[i]->callback = callback;
  }
  return re;
}

void GPUPageCache::submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs) {
  for (auto& r : reqs) {
    stream_manager->submitRequest(r);
  }
}

void GPUPageCache::append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
                                         std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
                                         std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles,
                                         size_t at) {
  if (config.k_cache_on == false && config.v_cache_on == false) {
    return;
  }
  auto gpu_block_idx = k_handles[0][at]->gpu_block_idx.value();
  for (size_t layer = 0; layer < config.layer_count; layer++) {
    for (size_t which_gpu = 0; which_gpu < config.gpu_devices_id.size(); which_gpu++) {
      if (config.k_cache_on) {
        assert(k_handles[layer][at]->data != nullptr);
        reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
        reqs[which_gpu]->host_mem_addresses.push_back(
            offset_by_bytes(k_handles[layer][at]->data, tp_offset[which_gpu]));
        reqs[which_gpu]->device_mem_addresses.push_back(k_cache[which_gpu][layer][gpu_block_idx].data_ptr());
      }
      if (config.v_cache_on) {
        assert(v_handles[layer][at]->data != nullptr);
        reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
        reqs[which_gpu]->host_mem_addresses.push_back(
            offset_by_bytes(v_handles[layer][at]->data, tp_offset[which_gpu]));
        reqs[which_gpu]->device_mem_addresses.push_back(v_cache[which_gpu][layer][gpu_block_idx].data_ptr());
      }
    }
  }
  // SPDLOG_DEBUG("GPU: Appended Vertical Handle to Request, count {}", reqs[0]->sizes.size());
}

void GPUPageCache::debug() {
  size_t count = 0;
  for (size_t i = 0; i < config.total_kvcache_pages; i++) {
    if (occupations[0][i] == nullptr) {
      count += 1;
    } else {
      // occupations[0][i]->gpu_cc.debug();
    }
  }
  SPDLOG_DEBUG("Free Page: {}/{}", count, config.total_kvcache_pages);
}

}  // namespace kvc2
csrc/balance_serve/kvc2/src/gpu_cache.hh
0 → 100644
#ifndef __GPU_CACHE_HH_
#define __GPU_CACHE_HH_

#include <torch/torch.h>
#include "cache_entry.hh"
#include "cuda_stream_manager.hh"
#include "defs.h"
#include "kvc2.h"
#include "metrics.h"
#include "utils/periodic_task.hpp"

namespace kvc2 {

class GPUPageCache {
  std::vector<torch::Device> gpu_devices;
  std::vector<int64_t> shape;
  size_t tensor_size;
  std::vector<size_t> tp_offset;
  std::vector<size_t> tp_size;

  // met
  std::shared_ptr<Metrics> met;

  // states
  std::mutex lock;
  size_t num_free_pages;
  std::vector<bool> gpu_only_occupations;
  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> occupations, v_occupations;
  size_t _col_idx = 0;

  // cuda stream manager
  std::optional<size_t> next_empty_col();

 public:
  GPUPageCacheConfig config;
  std::unique_ptr<CudaStreamManager> stream_manager;
  std::vector<torch::Tensor> k_cache;
  std::vector<torch::Tensor> v_cache;
  std::unique_ptr<periodic::PeriodicTask> background_flush_back = nullptr;

  GPUPageCache(GPUPageCacheConfig& config);

  std::vector<size_t> gpu_only_alloc_col(size_t count);
  void gpu_only_free_cols(std::vector<size_t> cols);

  void gpu_background_flush();

  bool alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
                 std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at);
  void evict_cols();
  void flush_col(size_t at);
  std::vector<std::unique_lock<CacheBlockEntry::MutexT>> try_lock_col(size_t at);
  void free_col(size_t at);

  std::vector<std::shared_ptr<CudaStreamManager::Request>> basic_request(cudaMemcpyKind direction,
                                                                         std::function<void()> callback);
  void submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs);
  void append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles, size_t at);

  void debug();
};

}  // namespace kvc2
#endif
csrc/balance_serve/kvc2/src/hasher.hpp
0 → 100644
#ifndef __HASHER_HPP_
#define __HASHER_HPP_

#include "defs.h"
#include "xxhash.h"

namespace kvc2 {
const uint64_t hash_seed = 4123512;
const uint64_t check_hash_seed = 1025753;

using TokensHash = XXH64_hash_t;

struct TokensHasher {
  XXH64_state_t* state;
  TokensHasher() {
    state = XXH64_createState();
    reset();
  }
  ~TokensHasher() { XXH64_freeState(state); }

  TokensHasher(TokensHasher& other) = delete;
  TokensHasher& operator=(TokensHasher& other) = delete;
  TokensHasher(TokensHasher&& other) = delete;
  TokensHasher& operator=(TokensHasher&& other) = delete;

  TokensHash get() { return XXH64_digest(state); }

  void reset(size_t seed = hash_seed) { XXH64_reset(state, seed); }

  TokensHash update(Token* data, TokenLength length) {
    XXH64_update(state, data, length * sizeof(Token));
    return get();
  }

  TokensHash update_raw(void* data, size_t size) {
    XXH64_update(state, data, size);
    return get();
  }

  static TokensHash hash(Token* data, TokenLength length) { return XXH64(data, length * sizeof(Token), hash_seed); }
};

}  // namespace kvc2
#endif
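A small illustrative sketch of the hasher: xxhash's streaming update() yields the same digest as the one-shot static hash() over the same bytes, which is what allows prefix hashes to be extended block by block.

#include "hasher.hpp"
#include <cassert>
#include <vector>

int main() {
  using namespace kvc2;
  std::vector<Token> tokens = {1, 2, 3, 4, 5, 6, 7, 8};

  // One-shot hash of the whole token array.
  TokensHash whole = TokensHasher::hash(tokens.data(), tokens.size());

  // Incremental hashing: feed the array in two blocks; update() returns the
  // digest of everything fed so far, i.e. a running prefix hash.
  TokensHasher hasher;
  hasher.update(tokens.data(), 4);                               // hash of tokens[0..3]
  TokensHash incremental = hasher.update(tokens.data() + 4, 4);  // hash of tokens[0..7]

  assert(whole == incremental);
  return 0;
}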
csrc/balance_serve/kvc2/src/io_helper.hpp
0 → 100644
/**
 * @Description  :
 * @Author       : Xie Weiyu
 * @Date         : 2024-12-11 06:35:31
 * @Version      : 1.0.0
 * @LastEditors  : Xie Weiyu
 * @LastEditTime : 2024-12-11 06:50:55
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once
#include <atomic>
#include <future>
#include <iostream>
#include <mutex>
#include <optional>
#include <string>
#include <vector>

struct BatchPromise {
  std::promise<void> promise;
  std::shared_future<void> fut;
  std::atomic_size_t count;

  inline BatchPromise(size_t count) : count(count) { fut = promise.get_future().share(); }

  inline void inc(size_t count = 1) { this->count.fetch_add(count, std::memory_order_seq_cst); }

  inline void set() {
    if (count.fetch_sub(1, std::memory_order_seq_cst) == 1) {
      promise.set_value();
    }
  }

  inline std::shared_future<void> get_shared_fut() { return fut; }
};

template <typename Lock>
struct TransferControl {
  Lock lock;
  std::optional<std::shared_future<void>> transfer_ok = std::nullopt;
  bool has_data = false;

  TransferControl() {}

  /*
   true,  std::nullopt  : Already has data
   false, shared_future : Transfer already started, should wait for the future
   false, std::nullopt  : should transfer by you
   true,  shared_future : Should not appear
  */
  std::pair<bool, std::optional<std::shared_future<void>>> has_data_or_transfer(std::shared_future<void> shared_fut) {
    std::lock_guard<Lock> lg(lock);
    if (has_data) {
      return {true, std::nullopt};
    } else {
      if (transfer_ok.has_value()) {
        return {false, transfer_ok};
      } else {
        transfer_ok = shared_fut;
        return {false, std::nullopt};
      }
    }
  }

  void set_has_data() {
    std::lock_guard<Lock> lg(lock);
    has_data = true;
    transfer_ok = std::nullopt;
  }

  bool get_has_data() {
    std::lock_guard<Lock> lg(lock);
    if (has_data) {
      return true;
    } else {
      return false;
    }
  }

  void reset() {
    std::lock_guard<Lock> lg(lock);
    transfer_ok = std::nullopt;
    has_data = false;
  }

  std::string debug() {
    std::lock_guard<Lock> lg(lock);
    return std::string("") + (has_data ? "has data" : "no data") + " " +
           (transfer_ok.has_value() ? "transfer " : "no transfer");
  }
};

struct ConcurrentController {
  std::atomic_bool dirty = false;
  std::atomic_size_t ref_count = 0;
  TransferControl<std::mutex> tc;
};

template <typename Unit>
struct IO_Helper {
  BatchPromise batch_promise;
  std::function<void(Unit*)> call_back_on_unit = nullptr;
  std::function<void()> call_back = nullptr;
  std::vector<std::shared_future<void>> futs;
  std::vector<Unit*> units_by_myself;

  IO_Helper(std::function<void(Unit*)> call_back_on_unit, std::function<void()> call_back = nullptr)
      : batch_promise(1), call_back_on_unit(call_back_on_unit), call_back(call_back) {}

  IO_Helper(const IO_Helper& other) = delete;
  IO_Helper& operator=(const IO_Helper& other) = delete;
  IO_Helper(IO_Helper&& other) = delete;
  IO_Helper& operator=(IO_Helper&& other) = delete;

  ~IO_Helper() {
    // std::cout << "Destroy IO helper" << std::endl;
  }

  size_t total_task_count = 0;

  void new_task(size_t count = 1) {
    total_task_count += 1;
    batch_promise.inc(count);
  }

  void finish_add_taks() { batch_promise.set(); }

  bool absorb_tc(Unit* unit, TransferControl<std::mutex>& tc) {
    auto [ok, fut] = tc.has_data_or_transfer(batch_promise.get_shared_fut());
    if (ok) {
      return false;
    } else {
      if (fut.has_value()) {
        futs.push_back(fut.value());
        // printf("Transfer started\n");
        return false;
      } else {
        units_by_myself.push_back(unit);
        // printf("Not Transfer\n");
        return true;
      }
    }
  }

  void wait() {
    for (auto& fut : futs) {
      fut.wait();
    }
    batch_promise.get_shared_fut().wait();
    for (auto& b : units_by_myself) {
      call_back_on_unit(b);
    }
    if (call_back)
      call_back();
  }
};
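An illustrative sketch of how TransferControl and IO_Helper are meant to cooperate, under the assumption that whoever owns a transfer signals batch_promise once per task it created: absorb_tc() returns true only for units this helper must transfer itself; units that already hold data are skipped, and units being transferred by another helper are waited on through the recorded shared future. The Block type below is a stand-in for CacheBlockEntry.

#include "io_helper.hpp"
#include <cstdio>

struct Block {
  int id = 0;
  TransferControl<std::mutex> tc;
};

int main() {
  Block a, b;
  b.id = 1;
  a.tc.set_has_data();  // pretend block 0 is already resident

  // Units this helper ends up owning are marked "has data" once the batch finishes.
  IO_Helper<Block> helper([](Block* blk) { blk->tc.set_has_data(); });

  helper.new_task();                          // one pending I/O task in this batch
  bool need_a = helper.absorb_tc(&a, a.tc);   // false: data already present
  bool need_b = helper.absorb_tc(&b, b.tc);   // true: this helper must transfer it
  std::printf("transfer a: %d, transfer b: %d\n", need_a, need_b);

  // ... the actual I/O for the units absorb_tc() returned true for would run here ...
  helper.batch_promise.set();   // each completed task signals the batch promise once
  helper.finish_add_taks();     // drop the constructor's initial count
  helper.wait();                // waits for foreign transfers, then marks owned units
  return 0;
}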
csrc/balance_serve/kvc2/src/kvc2.h
0 → 100644
#pragma once
#include <torch/torch.h>
#include <cstdint>
#include <optional>
#include <vector>
#include "defs.h"
#include "model_config.h"

namespace kvc2 {

struct GPUPageCacheConfig {
  bool gpu_only;
  std::vector<size_t> gpu_devices_id;
  size_t layer_count;
  size_t total_kvcache_pages;
  size_t num_token_per_page;
  size_t num_k_heads;
  size_t k_head_dim;
  bool full_kv_cache_on_each_gpu = false;
  bool k_cache_on = true;
  bool v_cache_on = true;
  torch::ScalarType tensor_type;

  // for cuda stream manager
  size_t num_streams_per_device = 4;
};

struct KVC2Config {
  bool k_cache_on = true;
  bool v_cache_on = true;
  bool gpu_only = false;
  bool load_from_disk = true;
  bool save_to_disk = true;
  std::string path;
  std::string config_path;
  TokenLength num_token_per_page = 256;
  size_t memory_pool_size = 10e9;
  size_t evict_count = 20;
  std::optional<GPUPageCacheConfig> gpu_cache_config = std::nullopt;
  size_t metrics_port;
  double recompute_ratio = 0.2;
};

class DoubleCacheHandleInterface;

class KVC2Interface {
 public:
  virtual ~KVC2Interface() = default;

  virtual void load() = 0;
  virtual void save() = 0;

  /*
  Raw Insert
  Insert kvcache from kvcache_data to disk.
    info: cache info
    id: start pointer of token array
    length: length of token array
    kvcache_data: data of kvcache
  This will first match the ID array against the existing kvcache, and then insert the unmatched kvcache to disk.
  */
  virtual void raw_insert(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                          const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;

  /*
  Raw Read
  Read kvcache from disk to user specified pointers.
    info: cache info
    id: start pointer of token array
    length: length of token array
    kvcache_data: data of kvcache
    Return: matched length of prefix, in tokens
  This does not read from the memory pool; it reads directly from disk.
  */
  virtual TokenLength raw_read(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                               const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;

  /*
  Lookup
  Lookup kvcache and load it from disk to memory pool if needed.
    info: cache info
    id: start pointer of token array
    length: length of token array
    Return: kvc2_handle, holds kvcache until being released.
      if not found, matched_length will return 0.
      if memory pool is full, return nullptr
  */
  virtual std::shared_ptr<DoubleCacheHandleInterface> lookup(ModelName model_name, QuantType quant_type, Token* id,
                                                             TokenLength length, TokenLength estimated_length) = 0;

  /*
  Lookup and allocate to gpu
  info.is_k_cache does not matter here
  */
  virtual std::shared_ptr<DoubleCacheHandleInterface> lookup_to_gpu(ModelName model_name, QuantType quant_type,
                                                                    Token* id, TokenLength length,
                                                                    TokenLength estimated_length) = 0;

  virtual void lookup_to_gpu_async(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                                   TokenLength estimated_length,
                                   std::function<void(std::shared_ptr<DoubleCacheHandleInterface>)> call_back) = 0;

  virtual std::pair<std::vector<torch::Tensor>, std::vector<torch::Tensor>> get_kvcache() = 0;

  virtual void debug() = 0;
};

std::shared_ptr<KVC2Interface> create_kvc2(KVC2Config config);

enum MatchStatus {
  Exact,
  Partial,
  NotMatchExact,
  NotMatchPartial,
};

class DoubleCacheHandleInterface {
 public:
  virtual ~DoubleCacheHandleInterface() = default;
  virtual TokenLength matched_length() = 0;
  virtual std::vector<MatchStatus> matched_status() = 0;
  virtual std::vector<layer_data> handle_data(bool is_key_cache) = 0;
  virtual bool to_gpu() = 0;
  virtual void to_gpu_async(std::function<void(bool)> call_back) = 0;
  virtual std::vector<size_t> get_gpu_block_idx() = 0;
  virtual std::vector<size_t> get_gpu_attached_block_idx() = 0;
  virtual void append_tokens(Token* tokens, TokenLength length) = 0;  // update generated tokens

  virtual void debug() = 0;
};

};  // namespace kvc2
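A hedged usage sketch of the lookup path through this interface. The store path, config path, port, and the "some-model" / "fp16" names are placeholders; a real setup needs them to match the model and quant config maps loaded from config_path, and needs gpu_cache_config set if GPU pages are wanted.

#include "kvc2.h"
#include <vector>

int main() {
  using namespace kvc2;

  KVC2Config config;
  config.path = "/tmp/kvc2_store";         // placeholder on-disk location
  config.config_path = "/tmp/kvc2_configs";  // placeholder; holds model/quant JSON
  config.metrics_port = 8080;              // no default, so set explicitly

  auto kvc2 = create_kvc2(config);
  kvc2->load();  // restore persisted cache metadata, if any

  // Look up a prompt's KV cache; estimated_length reserves room for generation.
  std::vector<Token> prompt = {1, 2, 3, 4};
  auto handle = kvc2->lookup("some-model", "fp16", prompt.data(), prompt.size(),
                             /*estimated_length=*/prompt.size() + 128);
  if (handle) {
    TokenLength reused = handle->matched_length();  // cached prefix length, in tokens
    (void)reused;
    // The handle pins the matched blocks in the memory pool until it is released.
  }

  kvc2->save();
  return 0;
}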
csrc/balance_serve/kvc2/src/kvc2_utils.py
0 → 100644
import torch
import ctypes


def aligned_tensor(size, alignment=4096):
    num_bytes = size
    mem = ctypes.c_void_p()

    error_code = ctypes.CDLL(None).posix_memalign(
        ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(num_bytes)
    )

    if error_code != 0:
        raise MemoryError(f"posix_memalign failed with error code {error_code}")

    array_type = ctypes.c_int8 * size
    raw_array = array_type.from_address(mem.value)

    tensor = torch.frombuffer(raw_array, dtype=torch.int8)

    if tensor.data_ptr() % alignment != 0:
        raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes")

    return tensor, mem


def alloc_aligned_cache(layer_count, block_count, element_size):
    cache = []
    cache_mem = []
    for i in range(layer_count):
        layer_data = []
        layer_mem = []
        for j in range(block_count):
            tensor, mem_ptr = aligned_tensor(element_size, alignment=4096)
            layer_data.append(tensor)
            layer_mem.append(mem_ptr)
        cache.append(layer_data)
        cache_mem.append(layer_mem)
    return cache, cache_mem


def dealloc_aligned_cache(cache_mem):
    for layer_mem in cache_mem:
        for mem_ptr in layer_mem:
            ctypes.CDLL(None).free(mem_ptr)


def get_tensor_ptr(tensors):
    tensor_ptr = []
    for layer in tensors:
        layer_ptr = []
        for data in layer:
            layer_ptr.append(data.data_ptr())
        tensor_ptr.append(layer_ptr)
    return tensor_ptr


def get_tensor_from_data_ptr(matched_data, element_size):
    re = []
    for layer in matched_data:
        re_layer = []
        for data_ptr in layer:
            array_type = ctypes.c_int8 * element_size
            raw_array = array_type.from_address(data_ptr)
            tensor = torch.frombuffer(raw_array, dtype=torch.int8)
            re_layer.append(tensor)
        re.append(re_layer)
    return re


if __name__ == "__main__":
    pass
csrc/balance_serve/kvc2/src/metrics.cpp
0 → 100644
#include "metrics.h"

namespace kvc2 {

Metrics::Metrics(const MetricsConfig& config)
    : registry_(std::make_shared<prometheus::Registry>()), exposer_(config.endpoint) {
  // Register the prefix_nodes Counter
  auto& prefix_nodes_family = prometheus::BuildCounter()
                                  .Name(std::string(METRIC_PREFIX) + "_prefix_nodes")
                                  .Help("Number of prefix nodes")
                                  .Register(*registry_);
  prefix_nodes = &prefix_nodes_family.Add({});

  // Register the prefix_block_count Counter
  auto& prefix_block_count_family = prometheus::BuildCounter()
                                        .Name(std::string(METRIC_PREFIX) + "_prefix_block_count")
                                        .Help("Number of prefix blocks")
                                        .Register(*registry_);
  prefix_block_count = &prefix_block_count_family.Add({});

  // Shared bucket layout, capped at 10000 ms (10 s)
  std::vector<double> common_buckets = {1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0, 5000.0, 10000.0};

  // Register the raw_insert_time_ms Histogram
  auto& raw_insert_time_ms_family = prometheus::BuildHistogram()
                                        .Name(std::string(METRIC_PREFIX) + "_raw_insert_time_ms")
                                        .Help("function raw insert's time in milliseconds")
                                        .Register(*registry_);
  raw_insert_time_ms = &raw_insert_time_ms_family.Add({}, common_buckets);

  // Register the lookup_time_ms Histogram
  auto& lookup_time_ms_family = prometheus::BuildHistogram()
                                    .Name(std::string(METRIC_PREFIX) + "_lookup_time_ms")
                                    .Help("function lookup's time in milliseconds")
                                    .Register(*registry_);
  lookup_time_ms = &lookup_time_ms_family.Add({}, common_buckets);

  // Register the lookup_prefixmatch_length Histogram
  auto& lookup_prefixmatch_length_family = prometheus::BuildHistogram()
                                               .Name(std::string(METRIC_PREFIX) + "_lookup_prefixmatch_length")
                                               .Help("function lookup's prefix match length")
                                               .Register(*registry_);
  lookup_prefixmatch_length = &lookup_prefixmatch_length_family.Add({}, common_buckets);

  // Register the matched_length_percentage Histogram
  auto& matched_length_percentage_family = prometheus::BuildHistogram()
                                               .Name(std::string(METRIC_PREFIX) + "_matched_length_percentage")
                                               .Help("function matched length percentage")
                                               .Register(*registry_);
  matched_length_percentage = &matched_length_percentage_family.Add({}, common_buckets);

  // Register the disk_usage Gauge
  auto& disk_usage_family =
      prometheus::BuildGauge().Name(std::string(METRIC_PREFIX) + "_disk_usage").Help("disk usage").Register(*registry_);
  disk_usage = &disk_usage_family.Add({});

  // Register the memory_pool_size Gauge
  memory_pool_size_family_ = &prometheus::BuildGauge()
                                  .Name(std::string(METRIC_PREFIX) + "_memory_pool_size")
                                  .Help("memory pool size")
                                  .Register(*registry_);

  // Register the memory_pool_node_count Gauge
  memory_pool_node_count_family_ = &prometheus::BuildGauge()
                                        .Name(std::string(METRIC_PREFIX) + "_memory_pool_node_count")
                                        .Help("memory pool node count")
                                        .Register(*registry_);

  // Register the lru_entry_count Gauge
  lru_entry_count_family_ = &prometheus::BuildGauge()
                                 .Name(std::string(METRIC_PREFIX) + "_lru_entry_count")
                                 .Help("lru entry count")
                                 .Register(*registry_);

  // Register the gpu_page_count Gauge
  gpu_page_count_family_ = &prometheus::BuildGauge()
                                .Name(std::string(METRIC_PREFIX) + "_gpu_page_count")
                                .Help("gpu page count")
                                .Register(*registry_);

  // Register the append_tokens_time_ms Histogram
  auto& append_tokens_time_ms_family = prometheus::BuildHistogram()
                                           .Name(std::string(METRIC_PREFIX) + "_append_tokens_time_ms")
                                           .Help("append tokens time in milliseconds")
                                           .Register(*registry_);
  append_tokens_time_ms = &append_tokens_time_ms_family.Add({}, common_buckets);

  // Register the gpu_flush_back_time_ms Histogram
  auto& gpu_flush_back_time_ms_family = prometheus::BuildHistogram()
                                            .Name(std::string(METRIC_PREFIX) + "_gpu_flush_back_time_ms")
                                            .Help("gpu flush back time in milliseconds")
                                            .Register(*registry_);
  gpu_flush_back_time_ms = &gpu_flush_back_time_ms_family.Add({}, common_buckets);

  // Register the cpu_flush_back_time_ms Histogram
  auto& cpu_flush_back_time_ms_family = prometheus::BuildHistogram()
                                            .Name(std::string(METRIC_PREFIX) + "_cpu_flush_back_time_ms")
                                            .Help("cpu flush back time in milliseconds")
                                            .Register(*registry_);
  cpu_flush_back_time_ms = &cpu_flush_back_time_ms_family.Add({}, common_buckets);

  exposer_.RegisterCollectable(registry_);
}

// Destructor
Metrics::~Metrics() {
  // Stop exposing metrics
  // exposer_.Stop();
}

// Get the memory_pool_size gauge
prometheus::Gauge* Metrics::memory_pool_size(const std::string& type) {
  return &memory_pool_size_family_->Add({{"type", type}});
}

// Get the memory_pool_node_count gauge
prometheus::Gauge* Metrics::memory_pool_node_count(const std::string& type) {
  return &memory_pool_node_count_family_->Add({{"type", type}});
}

// Get the lru_entry_count gauge
prometheus::Gauge* Metrics::lru_entry_count(const std::string& type) {
  return &lru_entry_count_family_->Add({{"type", type}});
}

// Get the gpu_page_count gauge
prometheus::Gauge* Metrics::gpu_page_count(std::string type) {
  return &gpu_page_count_family_->Add({{"type", type}});
}

TimeObserver::TimeObserver(prometheus::Histogram* h) {
  histogram_ = h;
  timer_.start();
}

TimeObserver::~TimeObserver() {
  timer_.stop();
  histogram_->Observe(timer_.elapsedNs() / 1e6);  // ns -> ms
}

}  // namespace kvc2
csrc/balance_serve/kvc2/src/metrics.h
0 → 100644
#pragma once

#include <atomic>
#include <chrono>
#include <memory>
#include <string>
#include <thread>
#include <vector>

#include "prometheus/counter.h"
#include "prometheus/exposer.h"
#include "prometheus/gauge.h"
#include "prometheus/histogram.h"
#include "prometheus/registry.h"

#include "utils/timer.hpp"

namespace kvc2 {

// Metric name prefix macro
#define METRIC_PREFIX "kvc2"

struct MetricsConfig {
  std::string endpoint;  // listen endpoint, e.g. "0.0.0.0:8080"
};

class Metrics {
 public:
  // Constructor takes a MetricsConfig
  Metrics(const MetricsConfig& config);
  ~Metrics();

  // Disable copy and assignment
  Metrics(const Metrics&) = delete;
  Metrics& operator=(const Metrics&) = delete;

  // Metric pointers
  prometheus::Counter* prefix_nodes;
  prometheus::Counter* prefix_block_count;
  prometheus::Histogram* raw_insert_time_ms;
  prometheus::Histogram* lookup_time_ms;
  prometheus::Histogram* lookup_prefixmatch_length;
  prometheus::Histogram* matched_length_percentage;
  prometheus::Gauge* disk_usage;
  prometheus::Gauge* memory_pool_size(const std::string& type);
  prometheus::Gauge* memory_pool_node_count(const std::string& type);
  prometheus::Gauge* lru_entry_count(const std::string& type);
  prometheus::Gauge* gpu_page_count(std::string type);
  prometheus::Histogram* append_tokens_time_ms;
  prometheus::Histogram* gpu_flush_back_time_ms;
  prometheus::Histogram* cpu_flush_back_time_ms;

 private:
  std::shared_ptr<prometheus::Registry> registry_;
  prometheus::Exposer exposer_;

  prometheus::Family<prometheus::Gauge>* memory_pool_size_family_;
  prometheus::Family<prometheus::Gauge>* memory_pool_node_count_family_;
  prometheus::Family<prometheus::Gauge>* lru_entry_count_family_;
  prometheus::Family<prometheus::Gauge>* gpu_page_count_family_;
};

class TimeObserver {
 public:
  TimeObserver(prometheus::Histogram* h);
  ~TimeObserver();

 private:
  Timer timer_;
  prometheus::Histogram* histogram_;
};

}  // namespace kvc2
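A small illustrative sketch of the RAII timing pattern TimeObserver enables; the endpoint value and the record_lookup() helper are assumptions for the example, not part of the library.

#include "metrics.h"

using namespace kvc2;

// Hypothetical call site: time one lookup and record its prefix-match length.
void record_lookup(Metrics& met, size_t matched_tokens) {
  TimeObserver t(met.lookup_time_ms);  // observes the elapsed milliseconds on destruction
  // ... the actual lookup work would run here ...
  met.lookup_prefixmatch_length->Observe(static_cast<double>(matched_tokens));
}

int main() {
  MetricsConfig config;
  config.endpoint = "0.0.0.0:8080";  // example Prometheus scrape endpoint
  Metrics met(config);

  record_lookup(met, 128);
  met.prefix_nodes->Increment();  // counters are exposed at http://<endpoint>/metrics
  return 0;
}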
csrc/balance_serve/kvc2/src/model_config.h
0 → 100644
#ifndef __MODEL_CONFIG_HPP_
#define __MODEL_CONFIG_HPP_

#include "nlohmann/json.hpp"

#include <iostream>
#include <filesystem>
#include <fstream>

using DimSize = size_t;
using URL = std::string;
using ModelName = std::string;

// We must ensure this can be loaded from config.json
class ModelConfig {
 public:
  DimSize hidden_size;
  DimSize intermediate_size;
  size_t max_position_embeddings;
  std::string model_type;
  size_t num_attention_heads;
  size_t num_hidden_layers;
  size_t num_key_value_heads;
  size_t vocab_size;

  NLOHMANN_DEFINE_TYPE_INTRUSIVE(ModelConfig, hidden_size, intermediate_size, max_position_embeddings, model_type,
                                 num_attention_heads, num_hidden_layers, num_key_value_heads, vocab_size);

  void load_from(std::filesystem::path path) {
    std::cout << "Load from " << path << std::endl;
    std::ifstream i(path);
    nlohmann::json j;
    i >> j;
    *this = j.get<ModelConfig>();
  }
};

using QuantType = std::string;
static const QuantType NoQuantType = "";

class QuantConfig {
 public:
  QuantType name;

  // For GEMV
  QuantType type_of_dot_vector = NoQuantType;
  inline bool can_be_used_as_matrix() { return type_of_dot_vector != NoQuantType; }

  bool can_be_used_as_vector;
  double bytes_per_element;
  bool has_scale;
  bool has_min;
  size_t block_element_count;
  size_t block_element_size;

  URL reference = "";

  NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(QuantConfig, name, type_of_dot_vector, can_be_used_as_vector,
                                              bytes_per_element, has_scale, has_min, block_element_count,
                                              block_element_size, reference);
};

inline std::map<QuantType, QuantConfig> quant_configs;
inline std::map<ModelName, ModelConfig> model_configs;

inline void load_quant_configs(std::filesystem::path path) {
  nlohmann::json j;
  if (std::filesystem::exists(path)) {
    std::cout << __FUNCTION__ << " from " << path << std::endl;
    std::ifstream i(path);
    i >> j;
    quant_configs = j.get<std::map<QuantType, QuantConfig>>();
    std::cout << "Loaded Quant Configs" << std::endl;
    for (auto& [k, v] : quant_configs) {
      std::cout << " - " << k << std::endl;
    }
  } else {
    std::cout << __FUNCTION__ << " no file at " << path << std::endl;
  }
}

inline void dump_quant_configs(std::filesystem::path path) {
  std::ofstream o(path);
  nlohmann::json j = quant_configs;
  o << j.dump(4);
}

inline void load_model_configs(std::filesystem::path path) {
  nlohmann::json j;
  if (std::filesystem::exists(path)) {
    std::cout << __FUNCTION__ << " from " << path << std::endl;
    std::ifstream i(path);
    i >> j;
    model_configs = j.get<std::map<ModelName, ModelConfig>>();
    std::cout << "Loaded Model Configs" << std::endl;
    for (auto& [k, v] : model_configs) {
      std::cout << " - " << k << std::endl;
    }
  } else {
    std::cout << __FUNCTION__ << " no file at " << path << std::endl;
  }
}

inline void dump_model_configs(std::filesystem::path path) {
  std::ofstream o(path);
  nlohmann::json j = model_configs;
  o << j.dump(4);
}

#endif
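An illustrative sketch of loading and dumping the inline config maps; the JSON file names and the "some-model" key are placeholders, and each file is expected to map names to config objects matching the std::map types used by the loaders above.

#include "model_config.h"

int main() {
  // Hypothetical paths to JSON files containing the name -> config maps.
  load_model_configs("model_configs.json");
  load_quant_configs("quant_configs.json");

  if (model_configs.count("some-model")) {
    const ModelConfig& m = model_configs["some-model"];
    std::cout << "layers: " << m.num_hidden_layers
              << ", kv heads: " << m.num_key_value_heads << std::endl;
  }

  // Writing the maps back out produces pretty-printed JSON (dump(4)).
  dump_model_configs("model_configs_out.json");
  return 0;
}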
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp
0 → 100644
#include "page_aligned_memory_pool.h"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "utils/arithmetic.hpp"
#include "utils/easy_format.hpp"

/// Constructor
PageAlignedMemoryPool::PageAlignedMemoryPool(size_t size_in_bytes) {
  total_size = (size_in_bytes / PageSize) * PageSize;
  // Aligned allocation. C++17 aligned-new syntax; switch to another approach if the compiler does not support it.
  data = ::operator new[](total_size, std::align_val_t(PageSize));
  total_pages = total_size / PageSize;
  assert(total_pages >= Blocks);
  page_per_block = total_pages / Blocks;
  for (size_t block_index = 0; block_index < Blocks; block_index++) {
    first_page[block_index] = reinterpret_cast<void*>(reinterpret_cast<intptr_t>(data) +
                                                      static_cast<intptr_t>(block_index) * page_per_block * PageSize);
    count_page[block_index] =
        block_index == Blocks - 1 ? (total_pages - page_per_block * (Blocks - 1)) : page_per_block;
    SPDLOG_DEBUG("first_page[{}] = {}, count_page[{}] = {}", block_index,
                 reinterpret_cast<intptr_t>(first_page[block_index]) - reinterpret_cast<intptr_t>(data), block_index,
                 count_page[block_index]);
    bitmap[block_index].resize(count_page[block_index], 0);
  }
  SPDLOG_INFO("PageAlignedMemoryPool with size {} Mbytes, {} pages", total_size / (1 << 20), page_count());
}

/// Destructor
PageAlignedMemoryPool::~PageAlignedMemoryPool() {
  if (data) {
    // Note: must match the alignment used for the allocation
    ::operator delete[](data, std::align_val_t(PageSize));
    data = nullptr;
  }
}

/// Returns the total number of pages
size_t PageAlignedMemoryPool::page_count() {
  return total_size / PageSize;
}

/// Returns the size in bytes rounded up to whole pages
size_t PageAlignedMemoryPool::page_padded_size(size_t size) {
  return div_up(size, PageSize) * PageSize;
}

void* PageAlignedMemoryPool::alloc_in_block(size_t block_index, size_t alloc_size) {
  std::lock_guard<std::mutex> guard(lock[block_index]);
  size_t free_pages = 0;
  for (size_t i = 0; i < count_page[block_index]; i++) {
    if (bitmap[block_index][i] == 0) {
      free_pages++;
      if (free_pages == alloc_size) {
        size_t page_index = i + 1 - free_pages;
        for (size_t page = page_index; page < page_index + alloc_size; page++) {
          bitmap[block_index][page] = 1;
          // SPDLOG_DEBUG("alloc page {} in block {}", page, block_index);
        }
        return reinterpret_cast<void*>(reinterpret_cast<intptr_t>(first_page[block_index]) + page_index * PageSize);
      }
    } else {
      free_pages = 0;
    }
  }
  return nullptr;
}

/// Allocate
void* PageAlignedMemoryPool::alloc(size_t size) {
  size_t alloc_size = div_up(size, PageSize);
  auto cnt = now_block.fetch_add(1, std::memory_order_relaxed);
  for (size_t i = 0; i < Blocks; i++) {
    auto result = alloc_in_block((i + cnt) % Blocks, alloc_size);
    if (result != nullptr) {
      allocated.fetch_add(alloc_size * PageSize, std::memory_order_relaxed);
      alloc_count.fetch_add(1, std::memory_order_relaxed);
      return result;
    }
  }
  return nullptr;
}

/// Free
void PageAlignedMemoryPool::free(void* p, size_t size) {
  auto alloc_size = div_up(size, PageSize);
  size_t block_index = (reinterpret_cast<intptr_t>(p) - reinterpret_cast<intptr_t>(data)) / page_per_block / PageSize;
  size_t page_index = (reinterpret_cast<intptr_t>(p) - reinterpret_cast<intptr_t>(first_page[block_index])) / PageSize;
  std::lock_guard<std::mutex> guard(lock[block_index]);
  for (size_t page = page_index; page < page_index + alloc_size; page++)
    bitmap[block_index][page] = 0;
  allocated.fetch_sub(alloc_size * PageSize, std::memory_order_relaxed);
  free_count.fetch_add(1, std::memory_order_relaxed);
}

// TODO: too slow
std::vector<void*> PageAlignedMemoryPool::alloc_multiple(size_t size, size_t count) {
  std::vector<void*> result;
  for (size_t i = 0; i < count; i++) {
    auto p = alloc(size);
    if (p == nullptr) {
      for (auto ptr : result) {
        free(ptr, size);
      }
      return {};
    }
    result.push_back(p);
  }
  return result;
}

void PageAlignedMemoryPool::defragment() {}

/// Debug printout
std::string PageAlignedMemoryPool::debug() {
  return fmt::format("PageAlignedMemoryPool: total_size: {}MB, allocated: {}, alloc/free count: {}/{}\n",
                     readable_number(total_size), readable_number(size_t(allocated)), size_t(alloc_count),
                     size_t(free_count));
}
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h
0 → 100644
#pragma once

#include <assert.h>
#include <algorithm>  // std::sort
#include <atomic>
#include <bitset>
#include <cstddef>  // size_t
#include <mutex>    // std::mutex
#include <vector>

constexpr size_t PageSize = 4096;

/// Declaration of the PageAlignedMemoryPool class
struct PageAlignedMemoryPool {
 private:
  constexpr static size_t Blocks = 16;
  void* data = nullptr;
  size_t total_size = 0, total_pages = 0;

  std::atomic_size_t now_block = 0;
  std::atomic_size_t allocated = 0;  // allocated_size
  std::atomic_size_t alloc_count = 0;
  std::atomic_size_t free_count = 0;

  std::mutex lock[Blocks];
  size_t page_per_block = 0;
  void* first_page[Blocks];
  size_t count_page[Blocks];
  std::vector<int8_t> bitmap[Blocks];

  void* alloc_in_block(size_t block_index, size_t alloc_size);

 public:
  /// Constructor and destructor
  explicit PageAlignedMemoryPool(size_t size_in_bytes);
  ~PageAlignedMemoryPool();

  /// Disable copy and move
  PageAlignedMemoryPool(PageAlignedMemoryPool&& other) = delete;
  PageAlignedMemoryPool& operator=(PageAlignedMemoryPool&& other) = delete;
  PageAlignedMemoryPool(const PageAlignedMemoryPool& other) = delete;
  PageAlignedMemoryPool& operator=(const PageAlignedMemoryPool& other) = delete;

  /// Member functions
  size_t page_count();
  size_t page_padded_size(size_t size);
  void* alloc(size_t size);
  std::vector<void*> alloc_multiple(size_t size, size_t count);
  void free(void* data, size_t size);
  void defragment();
  std::string debug();
};
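An illustrative usage sketch of the pool's page-granular alloc/free contract, based on the header above and its implementation in page_aligned_memory_pool.cpp; the 64 MiB size is an arbitrary example value.

#include "page_aligned_memory_pool.h"
#include <cstdio>

int main() {
  // 64 MiB pool; the constructor rounds the size down to whole 4 KiB pages
  // and splits them across 16 internally locked blocks.
  PageAlignedMemoryPool pool(64ull << 20);

  // Allocations are rounded up to whole pages, so 5000 bytes occupies 2 pages.
  void* p = pool.alloc(5000);
  if (p != nullptr) {
    std::printf("%s", pool.debug().c_str());
    pool.free(p, 5000);  // free must be given the same size that was allocated
  }

  // alloc_multiple either returns `count` allocations or rolls back and returns {}.
  auto many = pool.alloc_multiple(PageSize, 8);
  for (auto ptr : many) {
    pool.free(ptr, PageSize);
  }
  return 0;
}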
csrc/balance_serve/kvc2/src/prefix.cpp
0 → 100644
View file @
877aec85
#include <immintrin.h>
#include <tbb/concurrent_hash_map.h>
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <functional>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <nlohmann/json.hpp>
#include <optional>
#include <shared_mutex>
#include <unordered_map>
#include <vector>
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "async_store.hh"
#include "cuda_stream_manager.hh"
#include "kvc2.h"
#include "metrics.h"
#include "cache_entry.hh"
#include "gpu_cache.hh"
#include "hasher.hpp"
#include "io_helper.hpp"
#include "page_aligned_memory_pool.h"
#include "utils/arithmetic.hpp"
#include "utils/easy_format.hpp"
#include "utils/periodic_task.hpp"
namespace
kvc2
{
struct
KVC2
;
// will be set when init
TokenLength
NumTokenPerBlock
;
int
EvictCount
;
using
Layer
=
size_t
;
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE
(
CacheInfo
,
model_name
,
is_key_cache
,
quant_type
);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE
(
KVC2Config
,
gpu_only
,
load_from_disk
,
save_to_disk
,
path
,
config_path
,
num_token_per_page
,
memory_pool_size
,
evict_count
,
metrics_port
,
recompute_ratio
);
size_t
CacheInfo
::
hidden_layer_count
()
{
return
model_configs
.
at
(
model_name
).
num_hidden_layers
;
}
std
::
filesystem
::
path
CacheInfo
::
path
(
std
::
optional
<
size_t
>
which_layer
)
{
auto
folder
=
std
::
filesystem
::
path
(
model_name
)
/
quant_type
/
(
is_key_cache
?
"key"
:
"value"
);
if
(
which_layer
.
has_value
())
{
folder
/=
fmt
::
format
(
"layer-{}.kvc"
,
which_layer
.
value
());
}
return
folder
;
}
bool
CacheInfo
::
operator
==
(
const
CacheInfo
&
other
)
const
{
return
model_name
==
other
.
model_name
&&
is_key_cache
==
other
.
is_key_cache
&&
quant_type
==
other
.
quant_type
;
}
size_t
CacheInfo
::
element_size
(
size_t
block_length
)
{
size_t
count
=
model_configs
[
model_name
].
hidden_size
*
block_length
;
auto
&
q
=
quant_configs
[
quant_type
];
return
count
/
q
.
block_element_count
*
q
.
block_element_size
;
}
size_t
CacheInfo
::
hash_value
()
const
{
size_t
x
=
hash_seed
;
x
=
XXH64
(
model_name
.
data
(),
model_name
.
size
(),
x
);
x
=
XXH64
(
"quant_type"
,
10
,
x
);
x
=
XXH64
(
quant_type
.
data
(),
quant_type
.
size
(),
x
);
if
(
is_key_cache
)
{
x
=
XXH64
(
"key"
,
3
,
x
);
}
else
{
x
=
XXH64
(
"value"
,
5
,
x
);
}
return
x
;
}
}
// namespace kvc2
template
<
>
struct
std
::
hash
<
kvc2
::
CacheInfo
>
{
std
::
size_t
operator
()(
const
kvc2
::
CacheInfo
&
s
)
const
noexcept
{
return
s
.
hash_value
();
}
};
namespace
kvc2
{
struct
Location
{
size_t
start_idx
;
// start block index
size_t
length
;
// length of blocks
NLOHMANN_DEFINE_TYPE_INTRUSIVE
(
Location
,
start_idx
,
length
);
Location
cut_tail
(
size_t
offset_from_tail
)
{
Location
re
;
size_t
offset
=
length
-
offset_from_tail
;
re
.
start_idx
=
start_idx
+
offset
;
re
.
length
=
offset_from_tail
;
length
=
offset
;
return
re
;
}
};
struct SegmentLocations {
  std::vector<std::optional<size_t>> offsets;

  void add_location(size_t start_block, Location location) {
    if (location.length + start_block > offsets.size()) {
      offsets.resize(location.length + start_block, std::nullopt);
    }
    for (size_t i = start_block; i < start_block + location.length; i++) {
      offsets[i] = location.start_idx + i - start_block;
    }
  }

  void set_location(size_t start_block, size_t disk_location) {
    if (start_block >= offsets.size()) {
      offsets.resize(start_block + 1, std::nullopt);
    }
    offsets[start_block] = disk_location;
  }

  std::optional<size_t> get_idx(size_t block_idx) const {
    if (block_idx >= offsets.size()) {
      return std::nullopt;
    } else {
      return offsets[block_idx];
    }
  }

  bool has_location(size_t block_idx, size_t length) {
    for (size_t i = block_idx; i < block_idx + length; i++) {
      if (get_idx(i).has_value() == false) {
        return false;
      }
    }
    return true;
  }

  void debug() {
    for (size_t i = 0; i < offsets.size(); ++i) {
      if (offsets[i].has_value()) {
        SPDLOG_DEBUG("Block {} -> Disk Location {}", i, offsets[i].value());
      } else {
        SPDLOG_DEBUG("Block {} -> No Disk Location", i);
      }
    }
  }
};
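// Worked example: add_location(/*start_block=*/2, Location{.start_idx = 100, .length = 3})
// resizes offsets to 5 entries and fills offsets[2] = 100, offsets[3] = 101, offsets[4] = 102,
// i.e. block i maps to disk index location.start_idx + (i - start_block).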
struct CacheDiskLocations {
  std::unordered_map<CacheInfo, Location> location_map;
  NLOHMANN_DEFINE_TYPE_INTRUSIVE(CacheDiskLocations, location_map);

  std::optional<Location> get_location(CacheInfo cache_info, TokenLength local_ids_length) {
    size_t blocks_length = div_up(local_ids_length, NumTokenPerBlock);
    if (location_map.count(cache_info) == 0) {
      return std::nullopt;
    }
    Location re = location_map[cache_info];
    re.length = blocks_length;
    return re;
  }

  std::optional<size_t> get_location_of_a_block(CacheInfo info, size_t local_at) {
    if (location_map.count(info) == 0) {
      return std::nullopt;
    }
    auto loc = location_map[info];
    if (local_at >= loc.length) {
      return std::nullopt;
    }
    return loc.start_idx + local_at;
  }
};
struct DiskCacheAllocator {
 private:
  // metadata
  std::filesystem::path path;
  CacheInfo info;
  std::mutex lock;
  size_t now_idx;

  // store
  size_t capacity;
  std::vector<async_store::ArrayStore*> stores;

  NLOHMANN_DEFINE_TYPE_INTRUSIVE(DiskCacheAllocator, now_idx);

  void update_capacity() {
    capacity = std::numeric_limits<size_t>::max();
    for (auto& store : stores) {
      capacity = std::min(capacity, async_store::capacity(store));
    }
  }

  void extend(size_t to) {
    for (size_t i = 0; i < info.hidden_layer_count(); i++) {
      async_store::extend(stores[i], to);
    }
    update_capacity();
  }

 public:
  async_store::ArrayStore* get_store(int i) { return stores[i]; }

  Location alloc(size_t block_count) {
    std::lock_guard<std::mutex> lg(lock);
    Location re;
    re.start_idx = now_idx;
    re.length = block_count;
    now_idx += block_count;
    if (now_idx >= capacity) {
      extend(capacity * 2);
    }
    return re;
  }

  DiskCacheAllocator(std::filesystem::path path, CacheInfo info) : path(path), info(info) {
    // SPDLOG_DEBUG("Create DiskCacheAllocator {}", path.c_str());
    auto allocator_path = path / info.path();
    if (std::filesystem::exists(allocator_path) == false) {
      std::filesystem::create_directories(allocator_path);
    }
    // restore metadata later in json load
    now_idx = 0;
    for (size_t i = 0; i < info.hidden_layer_count(); i++) {
      // SPDLOG_DEBUG("Create store {} for {}", (path / info.path(i)).c_str(),i);
      auto store = async_store::create_or_open_store(info.element_size(NumTokenPerBlock), 1000, path / info.path(i));
      stores.push_back(store);
    }
    update_capacity();
  }

  ~DiskCacheAllocator() {
    for (auto store : stores) {
      async_store::close_store(store);
    }
  }
};
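// Layout note: each DiskCacheAllocator owns one ArrayStore per hidden layer, created at
// <path>/<model_name>/<quant_type>/<key|value>/layer-<L>.kvc (see CacheInfo::path above) with an
// element size of CacheInfo::element_size(NumTokenPerBlock). alloc() hands out contiguous block
// ranges and doubles the store capacity once now_idx reaches it.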
struct DiskCacheManager {
  KVC2Config config;
  std::mutex lock;
  std::unordered_map<CacheInfo, std::shared_ptr<DiskCacheAllocator>> allocators;

  friend void to_json(nlohmann::json& nlohmann_json_j, const DiskCacheManager& nlohmann_json_t) {
    nlohmann_json_j["config"] = nlohmann_json_t.config;
    nlohmann_json_j["allocators"] = nlohmann::json::array();
    for (auto& [info, allocator] : nlohmann_json_t.allocators) {
      nlohmann_json_j["allocators"].push_back({{"info", info}, {"allocator", *allocator}});
    }
  }

  friend void from_json(const nlohmann::json& nlohmann_json_j, DiskCacheManager& nlohmann_json_t) {
    // SPDLOG_DEBUG("Load DiskCacheManager Json");
    nlohmann_json_j.at("config").get_to(nlohmann_json_t.config);
    for (const auto& allocator_json : nlohmann_json_j.at("allocators")) {
      // SPDLOG_DEBUG("Make Allocator {}",allocator_json.dump());
      CacheInfo info;
      allocator_json.at("info").get_to(info);
      auto allocator = std::make_shared<DiskCacheAllocator>(nlohmann_json_t.config.path, info);
      allocator_json.at("allocator").get_to(*allocator);
      nlohmann_json_t.allocators[info] = allocator;
    }
  };

  DiskCacheManager(KVC2Config config) : config(config) {
    SPDLOG_INFO("DiskCacheManager root path: {}", config.path.c_str());
    if (!std::filesystem::exists(config.path)) {
      std::filesystem::create_directories(config.path);
    }
  }

  std::shared_ptr<DiskCacheAllocator> get_allocator(CacheInfo info) {
    {
      std::lock_guard<std::mutex> lg(lock);
      if (allocators.count(info) == 0) {
        allocators.emplace(info, std::make_shared<DiskCacheAllocator>(config.path, info));
      }
    }
    return allocators.at(info);
  }

  Location allocate(CacheInfo info, size_t cache_block_count) {
    auto allocator = get_allocator(info);
    return allocator->alloc(cache_block_count);
  }
};
struct Prefix {
  uint64_t prefix_id;  // 0 for nullptr, started from 1
  TokenLength start_length;
  Tokens ids;
  CacheDiskLocations locations;
  Prefix* prev = nullptr;

  // No serialization
  bool prev_set = false;

  friend void to_json(nlohmann::json& nlohmann_json_j, const Prefix& nlohmann_json_t) {
    nlohmann_json_j["prefix_id"] = nlohmann_json_t.prefix_id;
    nlohmann_json_j["start_length"] = nlohmann_json_t.start_length;
    nlohmann_json_j["ids"] = nlohmann_json_t.ids;
    if (nlohmann_json_t.prev) {
      nlohmann_json_j["prev"] = nlohmann_json_t.prev->prefix_id;
    } else {
      nlohmann_json_j["prev"] = 0;
    }
    nlohmann_json_j["locations"] = nlohmann_json_t.locations;
  }

  friend void from_json(const nlohmann::json& nlohmann_json_j, Prefix& nlohmann_json_t) {
    nlohmann_json_j.at("prefix_id").get_to(nlohmann_json_t.prefix_id);
    nlohmann_json_j.at("start_length").get_to(nlohmann_json_t.start_length);
    nlohmann_json_j.at("ids").get_to(nlohmann_json_t.ids);
    nlohmann_json_j.at("locations").get_to(nlohmann_json_t.locations);
    auto prev_id = nlohmann_json_j.at("prev").get<uint64_t>();
    nlohmann_json_t.prev = reinterpret_cast<Prefix*>(prev_id);
    nlohmann_json_t.prev_set = false;
  };

  TokenLength local_length() { return ids.size(); }

  TokenLength length() { return start_length + local_length(); }

  Tokens prefix_to(TokenLength length) {
    TokenLength local_length = length - start_length;
    Tokens re;
    if (prev) {
      re = prev->prefix_to(start_length);
    }
    re.insert(re.end(), ids.begin(), ids.begin() + local_length);
    return re;
  }

  Tokens full() { return prefix_to(length()); }

  void update_location(CacheInfo info, Location location) { locations.location_map[info] = location; }

  Prefix* to_first_prefix_without_disk_locations(CacheInfo k_info /*, CacheInfo v_info*/) {
    // just k_info
    auto now_prefix = this;
    while (now_prefix->prev != nullptr) {
      auto& prev = now_prefix->prev;
      auto k_location = prev->locations.get_location(k_info, prev->local_length());
      // auto v_location = prev->locations.get_location(v_info, prev->local_length());
      if (k_location.has_value()) {
        // assert(v_location.has_value());
        // after now_prefix, we need to insert new kv cache.
        break;
      }
      now_prefix = prev;
    }
    return now_prefix;
  }

  void hash_to_with(TokenLength length, TokensHasher& hasher) {
    TokenLength local_length = length - start_length;
    if (prev) {
      prev->hash_to_with(start_length, hasher);
    }
    hasher.update(ids.data(), local_length);
  }

  void debug() {
    fmt::print("Prefix {}, start_length: {}, local_length: {}, prev: {},\n", prefix_id, start_length, local_length(),
               (void*)prev);
  }
};
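// Sketch of the prefix chain: each Prefix stores only the tokens it adds on top of its parent
// (`prev`), starting at token offset `start_length`. For two linked nodes
//   A{start_length = 0,   ids = 256 tokens, prev = nullptr}
//   B{start_length = 256, ids = 128 tokens, prev = &A}
// B.length() == 384 and B.prefix_to(384) walks prev first, returning A's 256 tokens followed by
// B's 128 tokens. Per-CacheInfo disk locations hang off each node in `locations`.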
struct PrefixMatch {
  Prefix* prefix;
  TokenLength match_length;

  std::vector<TokensHash> matched_hashes(CacheInfo info, Layer layer) {
    std::vector<TokensHash> re;
    if (prefix == nullptr)
      return re;
    TokensHasher hasher;
    hasher.reset(info.hash_value());
    hasher.update_raw(&layer, sizeof(layer));
    auto ids = prefix->prefix_to(match_length);
    for (TokenLength i = 0; i < ids.size(); i += NumTokenPerBlock) {
      TokenLength len = std::min(NumTokenPerBlock, ids.size() - i);
      re.push_back(hasher.update(ids.data() + i, len));
    }
    return re;
  }

  void collect_locations(CacheInfo info, SegmentLocations& seg_locs) {
    auto now_prefix = prefix;
    size_t length = match_length;
    while (now_prefix != nullptr) {
      TokenLength local_length = length - now_prefix->start_length;
      auto loc = now_prefix->locations.get_location(info, local_length);
      if (loc.has_value()) {
        seg_locs.add_location(now_prefix->start_length / NumTokenPerBlock, loc.value());
      }
      length = now_prefix->start_length;
      now_prefix = now_prefix->prev;
    }
  }
};
std::string to_string(const MatchStatus& status) {
  switch (status) {
    case Exact:
      return "Exact";
    case Partial:
      return "Partial";
    case NotMatchExact:
      return "NotMatchExact";
    case NotMatchPartial:
      return "NotMatchPartial";
    default:
      return "Unknown";
  }
}
struct MatchByBlock {
  // prefix, block idx at prefix, status
  std::vector<std::tuple<Prefix*, BlockLength, MatchStatus>> matches;

  bool any_match() {
    for (auto& [p, l, m] : matches) {
      if (p) {
        return true;
      }
    }
    return false;
  }

  size_t partial_count() {
    size_t re = 0;
    for (auto& [p, l, m] : matches) {
      if (m == Partial) {
        re++;
      }
    }
    return re;
  }

  bool has_partial() { return partial_count() > 0; }

  std::vector<std::optional<TokensHash>> matched_hashes(CacheInfo info, Layer layer) {
    // TODO: This function might be slow
    std::vector<std::optional<TokensHash>> re(matches.size(), std::nullopt);
    for (size_t i = 0; i < matches.size(); i++) {
      TokensHasher hasher;
      hasher.reset(info.hash_value());
      hasher.update_raw(&layer, sizeof(layer));
      auto& [p, idx, status] = matches[i];
      if (p) {
        p->hash_to_with((idx + 1) * NumTokenPerBlock, hasher);
        re[i] = hasher.get();
      }
    }
    return re;
  }

  void collect_locations(CacheInfo info, SegmentLocations& seg_locs) {
    for (size_t i = 0; i < matches.size(); i++) {
      auto& [p, idx, status] = matches[i];
      if (p) {
        auto local_at = idx - p->start_length / NumTokenPerBlock;
        seg_locs.set_location(i, p->locations.get_location_of_a_block(info, local_at).value());
      }
    }
  }

  std::string debug_string() {
    std::string re = fmt::format("{} Match: ", matches.size());
    for (auto& [p, idx, status] : matches) {
      switch (status) {
        case Exact:
          re += "E";
          break;
        case Partial:
          re += "P";
          break;
        case NotMatchExact:
          re += "N";
          break;
        case NotMatchPartial:
          re += "n";
          break;
        default:
          assert(0);
      }
    }
    return re;
  }
};
struct PrefixTree {
  std::shared_mutex rw_lock;
  std::atomic_uint64_t prefix_id_counter = 1;
  using MapT = std::unordered_map<TokensHash, std::pair<std::shared_ptr<Prefix>, BlockLength>>;
  // Prefix, start_block_idx
  MapT prefix_map;
  std::shared_ptr<Metrics> met;

  std::vector<std::shared_ptr<Prefix>> prefix_refs = {nullptr};  // 0 is nullptr

  friend void to_json(nlohmann::json& nlohmann_json_j, const PrefixTree& nlohmann_json_t) {
    nlohmann_json_j["prefix_id_counter"] = nlohmann_json_t.prefix_id_counter.load();
    nlohmann_json_j["prefix_refs"] = nlohmann::json::array();
    for (auto prefix : nlohmann_json_t.prefix_refs) {
      if (prefix == nullptr)
        continue;
      nlohmann_json_j["prefix_refs"].push_back(*prefix);
    }
  }

  friend void from_json(const nlohmann::json& nlohmann_json_j, PrefixTree& nlohmann_json_t) {
    nlohmann_json_t.prefix_id_counter = nlohmann_json_j.at("prefix_id_counter").get<uint64_t>();
    nlohmann_json_t.prefix_refs.resize(nlohmann_json_t.prefix_id_counter);
    for (size_t i = 1; i < nlohmann_json_t.prefix_id_counter; ++i) {
      auto prefix = std::make_shared<Prefix>();
      nlohmann_json_j.at("prefix_refs")[i - 1].get_to(*prefix);
      nlohmann_json_t.prefix_refs[i] = prefix;
    }
    nlohmann_json_t.init_prevs();
    nlohmann_json_t.init_map();
  };

  void init_prevs() {
    for (auto p : prefix_refs) {
      if (p) {
        if (p->prev_set == false) {
          p->prev = prefix_refs[reinterpret_cast<uint64_t>(p->prev)].get();
          p->prev_set = true;
        }
      }
    }
  }

  void init_map() {
    assert(prefix_map.empty());
    for (auto p : prefix_refs) {
      if (p == nullptr)
        continue;
      auto ids = p->full();
      for (TokenLength i = p->start_length; i < p->length(); i += NumTokenPerBlock) {
        TokenLength end = std::min(i + NumTokenPerBlock, p->length());
        assert(end % NumTokenPerBlock == 0);
        auto hash = TokensHasher::hash(ids.data(), end);
        prefix_map[hash] = {p, end / NumTokenPerBlock - 1};
      }
    }
  }

  // Look up prefix from the map, return the matched prefix and length.
  // If the prefix is not found, match contains nullptr and 0.
  PrefixMatch look_up(Token* data, TokenLength length, bool need_lock = true) {
    std::shared_lock<std::shared_mutex> sl;
    if (need_lock) {
      sl = std::shared_lock<std::shared_mutex>(rw_lock);
    }
    // TODO: prefix cache
  }

  PrefixMatch look_up_or_insert(Token* data, TokenLength length) {
    std::unique_lock<std::shared_mutex> ul(rw_lock);
    auto match = look_up(data, length, false);
    if (match.match_length == length) {
      return match;
    }
    auto new_prefix = new_prefix_node(match.prefix, match.match_length, data, length, false);
    PrefixMatch re;
    re.prefix = new_prefix.get();
    re.match_length = length;
    return re;
  }

  std::shared_ptr<Prefix> new_prefix_node(Prefix* prev, TokenLength prev_match_length, Token* data, TokenLength length,
                                          bool need_lock = true) {
    std::unique_lock<std::shared_mutex> ul;
    if (need_lock)
      ul = std::unique_lock<std::shared_mutex>(rw_lock);
    auto new_prefix = std::make_shared<Prefix>();
    new_prefix->prefix_id = prefix_id_counter.fetch_add(1);
    new_prefix->start_length = prev_match_length;
    new_prefix->ids = Tokens(data + prev_match_length, data + length);
    new_prefix->prev = prev;
    new_prefix->prev_set = true;
    prefix_refs.push_back(new_prefix);
    met->prefix_nodes->Increment();
    met->prefix_block_count->Increment(div_up(length - prev_match_length, NumTokenPerBlock));
    assert(prefix_refs.size() == prefix_id_counter.load());

    TokensHasher hasher;
    hasher.update(data, prev_match_length);
    for (TokenLength i = prev_match_length; i < length; i += NumTokenPerBlock) {
      TokenLength len = std::min(NumTokenPerBlock, length - i);
      auto hash = hasher.update(data + i, len);
      prefix_map[hash] = {new_prefix, i / NumTokenPerBlock};
    }
    return new_prefix;
  }

  void debug() {
    fmt::print("PrefixTree with {} prefixes, prefix counter: {}\n", prefix_map.size(), prefix_id_counter.load());
    for (auto& [hash, prefix] : prefix_map) {
      fmt::print("Hash: {:016x}, start block {}\n", hash, prefix.second);
      prefix.first->debug();
    }
  }
};
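// Keying note: prefix_map is keyed by the hash of all tokens up to a block boundary (see
// init_map() and new_prefix_node()), with the value pointing at the Prefix node that owns that
// block plus the block's index. A lookup for a token sequence can therefore probe the hash of
// each successively longer NumTokenPerBlock-aligned prefix; look_up() above is still a TODO in
// this commit, so the exact probing strategy is not shown here.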
size_t locations_blocks_count(const std::vector<Location>& locations) {
  auto re = 0;
  for (auto& loc : locations) {
    re += loc.length;
  }
  return re;
}
struct DoubleCacheHandle : public DoubleCacheHandleInterface {
  ModelName model_name;
  QuantType quant_type;
  bool is_k_cache_on;
  bool is_v_cache_on;

  CacheInfo k_info() {
    if (is_k_cache_on == false) {
      SPDLOG_WARN("Get K CacheInfo, but K Cache is off");
    }
    return CacheInfo{
        .model_name = model_name,
        .is_key_cache = true,
        .quant_type = quant_type,
    };
  };
  CacheInfo v_info() {
    if (is_v_cache_on == false) {
      SPDLOG_WARN("Get V CacheInfo, but V Cache is off");
    }
    return CacheInfo{
        .model_name = model_name,
        .is_key_cache = false,
        .quant_type = quant_type,
    };
  };
  Tokens ids;
  TokenLength estimated_length;

  bool enable_alt = false;
  PrefixMatch match;
  // MatchByBlock match_by_blocks;

  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> k_cache_handles;
  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> v_cache_handles;

  SegmentLocations k_seg_locs;
  SegmentLocations v_seg_locs;

  KVC2* kvc2_top;

  // for Cache Fusion
  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> attatched_cache_handles;

  std::unique_ptr<CacheBlockEntryCollector> cpu_releaser = nullptr, gpu_releaser = nullptr;

  std::vector<size_t> gpu_only_block_idx;

  virtual ~DoubleCacheHandle();

  // interface
  TokenLength matched_length() override {
    if (enable_alt) {
      assert(0);
    } else {
      return match.match_length;
    }
  }

  MatchStatus status_at(BlockLength i) {
    assert(i < div_up(estimated_length, NumTokenPerBlock));
    if (enable_alt) {
      assert(false);
      // if (i >= match_by_blocks.matches.size()) {
      //   return match_by_blocks.has_partial() ? MatchStatus::NotMatchPartial : MatchStatus::NotMatchExact;
      // }
      // return std::get<2>(match_by_blocks.matches[i]);
    } else {
      if (i < match.match_length / NumTokenPerBlock) {
        return MatchStatus::Exact;
      } else {
        return MatchStatus::NotMatchExact;
      }
    }
  }

  std::vector<MatchStatus> matched_status() override { assert(false); }

  bool any_match() {
    if (enable_alt) {
      assert(false);
      // return match_by_blocks.any_match();
    } else {
      return match.prefix != nullptr;
    }
  }

  BlockLength match_range_length() {
    if (enable_alt) {
      assert(false);
      // return match_by_blocks.matches.size();
    } else {
      return div_up(match.match_length, NumTokenPerBlock);
    }
  }

  std::vector<layer_data> handle_data(bool is_key_cache) override { return export_raw_pointers(is_key_cache); }

  bool to_gpu() override;
  void to_gpu_async(std::function<void(bool)> call_back) override;
  std::vector<size_t> get_gpu_block_idx() override;

  bool alloc_attached_blocks(BlockLength count);
  std::vector<size_t> get_gpu_attached_block_idx() override;

  void append_tokens(Token* tokens, TokenLength length) override;

  void debug() override {}

  void set_cache_info(ModelName model_name, QuantType quant_type, bool turn_on_k_cache, bool turn_on_v_cache) {
    this->model_name = model_name;
    this->quant_type = quant_type;
    if (turn_on_k_cache) {
      is_k_cache_on = true;
      k_cache_handles.resize(k_info().hidden_layer_count());
    } else {
      is_k_cache_on = false;
      k_cache_handles.clear();
    }
    if (turn_on_v_cache) {
      is_v_cache_on = true;
      v_cache_handles.resize(v_info().hidden_layer_count());
    } else {
      is_v_cache_on = false;
      v_cache_handles.clear();
    }
  }
  void check_before_insert() {
    std::optional<size_t> blocks_count = std::nullopt;
    auto check_single_cache = [&blocks_count](CacheInfo cache_info,
                                              std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers,
                                              Tokens& ids) {
      for (size_t i = 0; i < cache_info.hidden_layer_count(); i++) {
        auto& layer = layers[i];
        if (blocks_count.has_value() == false) {
          blocks_count = layer.size();
        } else {
          if (blocks_count.value() != layer.size()) {
            SPDLOG_ERROR("Layer {} has different block count", i);
            throw std::runtime_error("Layer has different block count");
          }
        }
      }
      if (blocks_count.has_value()) {
        if (blocks_count.value() != div_up(ids.size(), NumTokenPerBlock)) {
          SPDLOG_ERROR("Block count not match, ids: {}, blocks: {}", ids.size(), blocks_count.value());
          throw std::runtime_error("Block count not match");
        }
      }
    };
    if (is_k_cache_on)
      check_single_cache(k_info(), k_cache_handles, ids);
    if (is_v_cache_on)
      check_single_cache(v_info(), v_cache_handles, ids);
  }
  template <typename Fn>
  void for_all_cache_block_entry(Fn f) {
    if (is_k_cache_on) {
      for (auto& layer : k_cache_handles) {
        for (auto& block : layer) {
          if (f(block) == false)
            return;
        }
      }
    }
    if (is_v_cache_on) {
      for (auto& layer : v_cache_handles) {
        for (auto& block : layer) {
          if (f(block) == false)
            return;
        }
      }
    }
  }
  // concurrent check ok
  bool alloc_on_cpu() {
    assert(cpu_releaser == nullptr);
    std::unique_ptr<CacheBlockEntryCollector> releaser =
        std::make_unique<CacheBlockEntryCollector>([](CacheBlockEntry* entry) {
          auto lg = entry->lock_guard();
          entry->cpu_cc.ref_count.fetch_sub(1);
        });
    bool ok = true;
    for_all_cache_block_entry([&ok, &releaser](std::shared_ptr<CacheBlockEntry>& block_entry) {
      if (block_entry->inc_ref_or_alloc_on_cpu() == false) {
        ok = false;
        return false;
      } else {
        releaser->entries.push_back(block_entry.get());
      }
      return true;
    });
    if (ok) {
      cpu_releaser = std::move(releaser);
    }
    return ok;
  }
  bool alloc_on_gpu_cols() {
    assert(is_k_cache_on);
    assert(gpu_releaser == nullptr);
    std::unique_ptr<CacheBlockEntryCollector> releaser =
        std::make_unique<CacheBlockEntryCollector>([](CacheBlockEntry* entry) {
          auto lg = entry->lock_guard();
          entry->gpu_cc.ref_count.fetch_sub(1);
        });
    GPUPageCache* gpu_cache = k_cache_handles[0][0]->manager->gpu_cache.get();
    gpu_cache->background_flush_back->wakeUpWait();
    bool ok = true;
    size_t want_count = 0;
    for (size_t i = 0; i < k_cache_handles[0].size(); i++) {
      auto lg = k_cache_handles[0][i]->lock_guard();
      if (k_cache_handles[0][i]->gpu_block_idx.has_value() == false) {
        want_count += 1;
        if (gpu_cache->alloc_col(k_cache_handles, v_cache_handles, i) == false) {
          ok = false;
          break;
        }
      }
      k_cache_handles[0][i]->gpu_cc.ref_count.fetch_add(1);
      releaser->entries.push_back(k_cache_handles[0][i].get());
    }
    if (ok == false) {
      SPDLOG_WARN("Handle cannot allocate {} gpu pages", want_count);
    } else {
      gpu_releaser = std::move(releaser);
    }
    return ok;
  }
  static void segment_io_layer(async_store::IODealer* dealer, IO_Helper<CacheBlockEntry>& io_helper,
                               async_store::ArrayStore* store,
                               std::vector<std::shared_ptr<CacheBlockEntry>>& layer_entries, size_t block_start,
                               size_t length, Layer layer, const SegmentLocations& locations, IOOption option) {
    SPDLOG_TRACE("{} [{}:{}) blocks to/from disk", to_string(option), block_start, block_start + length);
    for (size_t i = block_start; i < block_start + length; i++) {
      if (locations.get_idx(i).has_value()) {
        SPDLOG_TRACE("Location for block {}, {}", i, locations.get_idx(i).value());
        layer_entries[i]->io_with(dealer, io_helper, store, layer, locations.get_idx(i).value(), option);
      }
    }
  }
  std::shared_ptr<IO_Helper<CacheBlockEntry>> segment_io(async_store::IODealer* dealer, DiskCacheManager* manager,
                                                         BlockLength block_start, BlockLength length,
                                                         IOOption option) {
    auto io_helper = std::make_shared<IO_Helper<CacheBlockEntry>>([option](CacheBlockEntry* b) {
      switch (option) {
        case IO_ForceRead:
          break;
        case IO_ForceWrite:
          break;
        case IO_Read: {
          b->cpu_cc.tc.set_has_data();
          break;
        }
        case IO_Write:
          break;
        default:
          assert(0);
      }
    });
    auto single_segment_io = [dealer, manager, block_start, length, option, io_helper](
                                 CacheInfo info, SegmentLocations& seg_locs,
                                 std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers) {
      assert(layers[0].size() >= block_start + length);
      auto allocator = manager->get_allocator(info);
      for (size_t l = 0; l < info.hidden_layer_count(); l++) {
        segment_io_layer(dealer, *io_helper, allocator->get_store(l), layers[l], block_start, length, l, seg_locs,
                         option);
      }
    };
    if (is_k_cache_on)
      single_segment_io(k_info(), k_seg_locs, k_cache_handles);
    if (is_v_cache_on)
      single_segment_io(v_info(), v_seg_locs, v_cache_handles);
    io_helper->finish_add_taks();
    SPDLOG_DEBUG("Segment IO Submitted, total task count {}", io_helper->total_task_count);
    return io_helper;
  }
  std::shared_ptr<IO_Helper<CacheBlockEntry>> gpu_io(GPUPageCache* gpu_cache, BlockLength block_start,
                                                     BlockLength length, IOOption option) {
    auto io_helper = std::make_shared<IO_Helper<CacheBlockEntry>>([option](CacheBlockEntry* b) {
      switch (option) {
        case IO_ForceRead:
          break;
        case IO_ForceWrite:
          break;
        case IO_Read: {
          b->gpu_cc.tc.set_has_data();
          break;
        }
        case IO_Write:
          break;
        default:
          assert(0);
      }
    });
    cudaMemcpyKind direction;
    if (option == IO_Read || option == IO_ForceRead) {
      direction = cudaMemcpyHostToDevice;
    }
    if (option == IO_Write || option == IO_ForceWrite) {
      direction = cudaMemcpyDeviceToHost;
    }
    auto reqs = gpu_cache->basic_request(direction, [io_helper]() { io_helper->batch_promise.set(); });
    for (size_t i = block_start; i < length; i++) {
      auto status = status_at(i);
      if (status == NotMatchExact || status == NotMatchPartial) {
        SPDLOG_DEBUG("GPU: Col Handle not match (Skipped by Alt Match)");
        continue;
      }
      auto ptr = k_cache_handles[0][i].get();
      switch (option) {
        case IO_Read: {
          if (io_helper->absorb_tc(ptr, ptr->gpu_cc.tc) == false) {
            // SPDLOG_DEBUG("GPU: Col Handle need me to wait");
            continue;
          }
          break;
        }
        case IO_ForceRead: {
          break;
        }
        case IO_ForceWrite: {
          break;
        }
        case IO_Write: {
          break;
        }
        default: {
          assert(0);
        }
      }
      SPDLOG_DEBUG("GPU: Col Handle needs me to transfer");
      gpu_cache->append_col_to_request(reqs, k_cache_handles, v_cache_handles, i);
    }
    io_helper->new_task(reqs.size());
    gpu_cache->submit_requests(reqs);
    io_helper->finish_add_taks();
    return io_helper;
  }
  // void set_raw_handles(const std::vector<layer_data>& k, const std::vector<layer_data>& v) {
  //   set_raw_handles(true, k);
  //   set_raw_handles(false, v);
  // }

  void set_raw_handles(bool is_key_cache, const std::vector<layer_data>& layer_data) {
    auto single_set_raw_handles = [layer_data](CacheInfo info,
                                               std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& handles) {
      handles.resize(layer_data.size());
      for (size_t i = 0; i < info.hidden_layer_count(); i++) {
        auto& layer = layer_data[i];
        handles[i].clear();
        for (auto& block_data : layer) {
          auto handle = std::make_shared<CacheBlockEntry>();
          handle->data = reinterpret_cast<void*>(block_data);
          handle->size = info.element_size(NumTokenPerBlock);
          handles[i].push_back(handle);
        }
      }
    };
    if (is_key_cache) {
      is_k_cache_on = true;
      single_set_raw_handles(k_info(), k_cache_handles);
    } else {
      is_v_cache_on = true;
      single_set_raw_handles(v_info(), v_cache_handles);
    }
  }
  std::vector<layer_data> export_raw_pointers(bool is_key_cache) {
    std::vector<layer_data> re;
    auto single_export_raw_pointers = [&re](std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers) {
      for (auto& layer_handle : layers) {
        layer_data layer;
        for (size_t i = 0; i < layer_handle.size(); i++) {
          auto block = layer_handle.at(i);
          layer.push_back(reinterpret_cast<data_block_ptr>(block->data));
        }
        re.push_back(layer);
      }
    };
    if (is_key_cache) {
      if (is_k_cache_on == false) {
        SPDLOG_WARN("Export K Cache, but K Cache is off");
      }
      single_export_raw_pointers(k_cache_handles);
    } else {
      if (is_v_cache_on == false) {
        SPDLOG_WARN("Export V Cache, but V Cache is off");
      }
      single_export_raw_pointers(v_cache_handles);
    }
    return re;
  }
  void get_handles();
  void get_empty_handles();

  void collect_locations() {
    if (enable_alt) {
      assert(false);
      // match_by_blocks.collect_locations(k_info(), k_seg_locs);
      // match_by_blocks.collect_locations(v_info(), v_seg_locs);
    } else {
      if (is_k_cache_on)
        match.collect_locations(k_info(), k_seg_locs);
      if (is_v_cache_on)
        match.collect_locations(v_info(), v_seg_locs);
    }
    if (is_k_cache_on)
      k_seg_locs.debug();
    // v_seg_locs.debug();
  }
};
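// Layout note: k_cache_handles / v_cache_handles are indexed as [layer][block], one CacheBlockEntry
// per (layer, NumTokenPerBlock-sized block). alloc_on_gpu_cols() and gpu_io() treat a whole column
// (all layers of one block index) as the unit of GPU allocation and transfer, which is why only
// k_cache_handles[0][i] is inspected there.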
struct KVC2 : KVC2Interface {
  KVC2Config config;
  std::shared_ptr<Metrics> met;
  std::filesystem::path root;
  std::unique_ptr<PrefixTree> tree;
  std::unique_ptr<DiskCacheManager> disk_cache;
  std::shared_ptr<PageAlignedMemoryPool> memory_pool;
  std::unique_ptr<CacheEntryManager> cache_manager;
  std::unique_ptr<async_store::IODealer> io_dealer;
  std::shared_ptr<GPUPageCache> gpu_cache;

 public:
  void load() override {
    load_quant_configs(root / "quant_configs.json");
    load_model_configs(root / "model_configs.json");
    {
      auto where = root / "tree.json";
      if (std::filesystem::exists(where)) {
        nlohmann::json j;
        std::ifstream i(where);
        i >> j;
        j.get_to(*tree);
        SPDLOG_WARN("Loaded from {}", where.c_str());
      }
    }
    {
      auto where = root / "disk_cache.json";
      if (std::filesystem::exists(where)) {
        nlohmann::json j;
        std::ifstream i(where);
        i >> j;
        j.get_to(*disk_cache);
        SPDLOG_WARN("Loaded from {}", where.c_str());
      }
    }
    {
      auto where = root / "config.json";
      if (std::filesystem::exists(where)) {
        nlohmann::json j;
        std::ifstream i(where);
        i >> j;
        j.get_to(config);
        SPDLOG_WARN("Loaded from {}", where.c_str());
      }
    }
  }
  void save() override {
    if (config.save_to_disk == false) {
      return;
    }
    flush_back();
    {
      nlohmann::json j;
      j = *tree;
      auto where = root / "tree.json";
      std::ofstream o(where);
      o << j;
      SPDLOG_WARN("Serialized to {}", where.c_str());
    }
    {
      nlohmann::json j;
      j = *disk_cache;
      auto where = root / "disk_cache.json";
      std::ofstream o(where);
      o << j;
      SPDLOG_WARN("Serialized to {}", where.c_str());
    }
    {
      nlohmann::json j;
      j = config;
      auto where = root / "config.json";
      std::ofstream o(where);
      o << j;
      SPDLOG_WARN("Serialized to {}", where.c_str());
    }
    dump_quant_configs(root / "quant_configs.json");
    dump_model_configs(root / "model_configs.json");
  }
  void raw_insert(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                  const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) override {
    TimeObserver time_observer(met->raw_insert_time_ms);
    SPDLOG_INFO("Raw Insert");
    if (length % NumTokenPerBlock != 0) {
      SPDLOG_WARN(
          "Try to insert tokens with length {}, which is not a multiple of NumTokenPerBlock({}), getting floor",
          length, NumTokenPerBlock);
      length = length / NumTokenPerBlock * NumTokenPerBlock;
    }
    auto h = std::make_shared<DoubleCacheHandle>();
    h->kvc2_top = this;
    h->set_cache_info(model_name, quant_type, config.k_cache_on, config.v_cache_on);
    h->ids = Tokens(id, id + length);
    if (config.k_cache_on)
      h->set_raw_handles(true, k_cache);
    if (config.v_cache_on)
      h->set_raw_handles(false, v_cache);
    h->check_before_insert();

    h->match = tree->look_up_or_insert(id, length);
    auto now_prefix = h->match.prefix;
    assert(config.k_cache_on);
    if (now_prefix->locations.get_location(h->k_info(), length - now_prefix->start_length).has_value()) {
      assert(now_prefix->locations.get_location(h->v_info(), length - now_prefix->start_length).has_value());
      SPDLOG_INFO("KV Cache Already on disk");
      // already on disk
    } else {
      now_prefix = now_prefix->to_first_prefix_without_disk_locations(h->k_info());
      // insert new kv cache locations
      TokenLength new_length = length - now_prefix->start_length;
      SPDLOG_DEBUG("Inserting new kv cache, length: {}", new_length);
      assert(new_length > 0);
      if (config.v_cache_on) {
        // allocate a big space on disk
        auto k_loc = disk_cache->allocate(h->k_info(), div_up(new_length, NumTokenPerBlock));
        auto v_loc = disk_cache->allocate(h->v_info(), div_up(new_length, NumTokenPerBlock));
        h->k_seg_locs.add_location(now_prefix->start_length / NumTokenPerBlock, k_loc);
        h->v_seg_locs.add_location(now_prefix->start_length / NumTokenPerBlock, v_loc);
        // split it to prefix trees
        for (auto tail = h->match.prefix; tail != now_prefix->prev; tail = tail->prev) {
          TokenLength local_ids_length = tail->local_length();
          tail->update_location(h->k_info(), k_loc.cut_tail(div_up(local_ids_length, NumTokenPerBlock)));
          tail->update_location(h->v_info(), v_loc.cut_tail(div_up(local_ids_length, NumTokenPerBlock)));
        }
        assert(k_loc.length == 0);
        assert(v_loc.length == 0);
      } else {
        // allocate a big space on disk
        auto k_loc = disk_cache->allocate(h->k_info(), div_up(new_length, NumTokenPerBlock));
        h->k_seg_locs.add_location(now_prefix->start_length / NumTokenPerBlock, k_loc);
        // split it to prefix trees
        for (auto tail = h->match.prefix; tail != now_prefix->prev; tail = tail->prev) {
          TokenLength local_ids_length = tail->local_length();
          tail->update_location(h->k_info(), k_loc.cut_tail(div_up(local_ids_length, NumTokenPerBlock)));
        }
        assert(k_loc.length == 0);
      }
      // write new kv cache
      auto disk_io_helper =
          h->segment_io(io_dealer.get(), disk_cache.get(), now_prefix->start_length / NumTokenPerBlock,
                        div_up(new_length, NumTokenPerBlock), IO_ForceWrite);
      disk_io_helper->wait();
    }
  }
  TokenLength raw_read(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                       const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) override {
    SPDLOG_INFO("Raw Read");
    auto h = std::make_shared<DoubleCacheHandle>();
    h->kvc2_top = this;
    h->set_cache_info(model_name, quant_type, config.k_cache_on, config.v_cache_on);
    h->ids = Tokens(id, id + length);
    if (config.k_cache_on)
      h->set_raw_handles(true, k_cache);
    if (config.v_cache_on)
      h->set_raw_handles(false, v_cache);
    h->match = tree->look_up(id, length);
    if (h->match.prefix == nullptr) {
      SPDLOG_INFO("Not Found");
      return 0;
    }
    SPDLOG_DEBUG("Found {}", h->match.match_length);
    h->collect_locations();
    auto disk_io_helper = h->segment_io(io_dealer.get(), disk_cache.get(), 0,
                                        div_up(h->match.match_length, NumTokenPerBlock), IO_ForceRead);
    disk_io_helper->wait();
    return h->match.match_length;
  }
  std::shared_ptr<DoubleCacheHandleInterface> lookup(ModelName model_name, QuantType quant_type, Token* id,
                                                     TokenLength length, TokenLength estimated_length) override {
    TimeObserver time_observer(met->lookup_time_ms);
    auto re = std::make_shared<DoubleCacheHandle>();
    re->set_cache_info(model_name, quant_type, config.k_cache_on, config.v_cache_on);
    re->ids = Tokens(id, id + length);
    re->estimated_length = estimated_length;
    re->kvc2_top = this;
    SPDLOG_DEBUG("Lookup TokenLength {}", length);
    if (config.gpu_only == false) {
      // TODO:
    }
    return re;
  };
  std::shared_ptr<DoubleCacheHandleInterface> lookup_to_gpu(ModelName model_name, QuantType quant_type, Token* id,
                                                            size_t length, size_t estimated_length) override {
    std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
    lookup_to_gpu_async(model_name, quant_type, id, length, estimated_length, [&p](auto re) { p.set_value(re); });
    return p.get_future().get();
  }
  void lookup_to_gpu_async(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                           TokenLength estimated_length,
                           std::function<void(std::shared_ptr<DoubleCacheHandleInterface>)> call_back) override {
    auto re = lookup(model_name, quant_type, id, length, estimated_length);
    if (re == nullptr) {
      call_back(nullptr);
      return;
    }
    auto h = static_cast<DoubleCacheHandle*>(re.get());
    if (config.gpu_only) {
      auto total_block_count = div_up(estimated_length, NumTokenPerBlock);
      h->gpu_only_block_idx = gpu_cache->gpu_only_alloc_col(total_block_count);
      if (h->gpu_only_block_idx.empty()) {
        call_back(nullptr);
      } else {
        call_back(re);
      }
    } else {
      if (h->k_info().hidden_layer_count() != gpu_cache->config.layer_count) {
        SPDLOG_ERROR("GPU Cache Layer Count not match");
        assert(false);
      }
      if (h->alloc_on_gpu_cols() == false) {
        call_back(nullptr);
        return;
      }
      h->to_gpu_async([call_back, re](bool ok) {
        if (ok) {
          call_back(re);
        } else {
          call_back(nullptr);
        }
      });
    }
  }
  std::pair<std::vector<torch::Tensor>, std::vector<torch::Tensor>> get_kvcache() override {
    return {gpu_cache->k_cache, gpu_cache->v_cache};
  }
  void flush_back() {
    gpu_cache->background_flush_back->wakeUpWait();
    cache_manager->background_flush_back->wakeUpWait();
  }
  void debug() override {
    cache_manager->debug();
    tree->debug();
  }

  virtual ~KVC2() { flush_back(); };
  KVC2(KVC2Config config) : config(config) {
    SPDLOG_INFO("Creating KVC2 using these config");
    SPDLOG_INFO(" GPU Only: {}", config.gpu_only);
    SPDLOG_INFO(" Load: {}, Save: {}", config.load_from_disk, config.save_to_disk);
    SPDLOG_INFO(" Path: {}", config.path);
    SPDLOG_INFO(" Config Path: {}", config.config_path);
    SPDLOG_INFO(" Num Token/Page: {}, Memory Pool Size: {}", config.num_token_per_page,
                readable_number(config.memory_pool_size));
    SPDLOG_INFO(" Evict Count: {}, Metrics Port: {}", config.evict_count, config.metrics_port);
    SPDLOG_INFO(" Recompute Ratio: {:.2f}", config.recompute_ratio);
    if (config.gpu_cache_config) {
      const auto& gpu_config = *config.gpu_cache_config;
      SPDLOG_INFO(" GPU Devices: {}", format_vector(gpu_config.gpu_devices_id));
      SPDLOG_INFO(" Layer Count: {}, Total KVCache Pages: {}", gpu_config.layer_count,
                  gpu_config.total_kvcache_pages);
      SPDLOG_INFO(" Num Token/Page: {}, Num K Heads: {}", gpu_config.num_token_per_page, gpu_config.num_k_heads);
      SPDLOG_INFO(" K Head Dim: {}, Tensor Type: {}", gpu_config.k_head_dim,
                  static_cast<int>(gpu_config.tensor_type));
      SPDLOG_INFO(" MemcpyCudaStreams/Device: {}", gpu_config.num_streams_per_device);
    } else {
      SPDLOG_INFO(" GPU Cache Config: None");
    }

    load_model_configs(config.config_path + "/model_configs.json");
    load_quant_configs(config.config_path + "/quant_configs.json");

    // met
    MetricsConfig met_conf;
    met_conf.endpoint = "0.0.0.0:" + std::to_string(config.metrics_port);
    SPDLOG_INFO("Creating kvc2 metrics exporter on {}", met_conf.endpoint);
    met = std::make_shared<Metrics>(met_conf);

    if (config.gpu_only == false) {
      if (config.k_cache_on == false) {
        SPDLOG_ERROR("if k_cache_on is false, gpu_only must be true");
        assert(false);
      }
      root = config.path;
      tree = std::make_unique<PrefixTree>();
      disk_cache = std::make_unique<DiskCacheManager>(config);
      memory_pool = std::make_shared<PageAlignedMemoryPool>(config.memory_pool_size);
      cache_manager = std::unique_ptr<CacheEntryManager>(
          new CacheEntryManager(CacheEntryManagerConfig{.evict_count = config.evict_count, .kvc2_top = this}));
      cache_manager->pool = memory_pool;
      io_dealer = std::make_unique<async_store::IODealer>();
      io_dealer->start_io_thread().detach();
      tree->met = met;
      if (config.gpu_cache_config.has_value()) {
        gpu_cache = std::make_shared<GPUPageCache>(config.gpu_cache_config.value());
        cache_manager->gpu_cache = gpu_cache;
      }
      cache_manager->cpu_background_flush();
      gpu_cache->gpu_background_flush();
    } else {
      SPDLOG_CRITICAL("GPU ONLY MODE, NO PREFIX CACHE");
      gpu_cache = std::make_shared<GPUPageCache>(config.gpu_cache_config.value());
    }
  }
};
std::shared_ptr<KVC2Interface> create_kvc2(KVC2Config config) {
  NumTokenPerBlock = config.num_token_per_page;
  EvictCount = config.evict_count;
  // SPDLOG_WARN("Sizeof KVC2Config {} here", sizeof(KVC2Config));
  return std::make_shared<KVC2>(config);
}
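// Minimal usage sketch (field values below are illustrative; the KVC2Config members used are the
// ones serialized and logged above):
//   KVC2Config cfg;
//   cfg.path = "/data/kvc2";                  // disk cache root
//   cfg.config_path = "/data/kvc2_configs";   // model/quant config jsons
//   cfg.num_token_per_page = 256;
//   cfg.memory_pool_size = 16ull << 30;
//   cfg.evict_count = 16;
//   cfg.metrics_port = 8080;
//   auto kvc2 = kvc2::create_kvc2(cfg);
//   kvc2->load();
//   auto handle = kvc2->lookup_to_gpu(model_name, quant_type, tokens.data(), tokens.size(),
//                                     /*estimated_length=*/tokens.size() + max_new_tokens);
//   // ... run inference, then handle->append_tokens(...) as new tokens are generated ...
//   kvc2->save();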
DoubleCacheHandle::~DoubleCacheHandle() {
  if (kvc2_top->config.gpu_only) {
    kvc2_top->gpu_cache->gpu_only_free_cols(gpu_only_block_idx);
  } else {
    for_all_cache_block_entry([](std::shared_ptr<CacheBlockEntry>& block_entry) {
      block_entry->lock_guard();
      if (block_entry->with_key == false && block_entry->data != nullptr) {
        block_entry->free_on_cpu();
      }
      return true;
    });
  }
};
void DoubleCacheHandle::get_handles() {
  size_t new_count = 0, total_count = 0;
  auto get_info_handles = [this, &new_count, &total_count](
                              CacheInfo info, std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers) {
    auto total_block_count = div_up(estimated_length, NumTokenPerBlock);
    for (size_t l = 0; l < info.hidden_layer_count(); l++) {
      auto hashes = match.matched_hashes(info, l);
      layers[l].resize(total_block_count, nullptr);
      for (size_t i = 0; i < total_block_count; i++) {
        std::optional<CacheEntryManager::Key> key = std::nullopt;
        if (i < hashes.size())
          key = hashes[i];
        bool is_new;
        total_count += 1;
        layers[l][i] = this->kvc2_top->cache_manager->get(is_new, info.element_size(NumTokenPerBlock), key);
        if (is_new)
          new_count += 1;
        layers[l][i]->cache_info = info;
        layers[l][i]->layer = l;
      }
    }
  };
  if (kvc2_top->config.k_cache_on)
    get_info_handles(k_info(), k_cache_handles);
  if (kvc2_top->config.v_cache_on)
    get_info_handles(v_info(), v_cache_handles);
  SPDLOG_INFO("New Handles: {}/{}", new_count, total_count);
}
bool DoubleCacheHandle::to_gpu() {
  std::promise<bool> p;
  to_gpu_async([&p](bool ok) { p.set_value(ok); });
  return p.get_future().get();
}
void DoubleCacheHandle::to_gpu_async(std::function<void(bool)> call_back) {
  if (enable_alt) {
    assert(false);
    // size_t page_size = kvc2_top->config.num_token_per_page;
    // BlockLength count =
    //     div_up(TokenLength(std::ceil(match_by_blocks.partial_count() * page_size *
    //                                  kvc2_top->config.recompute_ratio)),
    //            page_size);
    // if (alloc_attached_blocks(count) == false) {
    //   SPDLOG_WARN("Cannot allocate attached GPU block");
    //   call_back(false);
    //   return;
    // } else {
    //   SPDLOG_INFO("Allocated {} attached GPU blocks", count);
    // }
  }
  // don't wait here
  if (any_match() == false) {
    SPDLOG_INFO("No match, No need to load to gpu");
    call_back(true);
    return;
  }
  auto gpu_io_helper = gpu_io(kvc2_top->gpu_cache.get(), 0, match_range_length(), IO_Read);
  gpu_io_helper->call_back = [call_back]() { call_back(true); };
  // Ok this is very stupid, but I have to do this for now
  std::thread([gpu_io_helper]() { gpu_io_helper->wait(); }).detach();
}
bool DoubleCacheHandle::alloc_attached_blocks(BlockLength count) {
  // attached_vertical_handles.resize(count);
  // for (size_t i = 0; i < count; i++) {
  //   attached_vertical_handles[i] = std::shared_ptr<DoubleVerticalBlocksHandle>(new DoubleVerticalBlocksHandle);
  //   attached_vertical_handles[i]->gpu_only = true;
  // }
  // return kvc2_top->gpu_cache->alloc_pages(attached_vertical_handles);
  return true;
}
std::vector<size_t> DoubleCacheHandle::get_gpu_attached_block_idx() {
  std::vector<size_t> re;
  // for (auto& h : attached_vertical_handles) {
  //   re.push_back(h->gpu_block_idx.value());
  // }
  return re;
}
void CacheBlockEntry::set_key(TokensHash key, std::shared_ptr<CacheBlockEntry> me) {
  assert(with_key == false);
  with_key = true;
  hash = key;
  // SPDLOG_DEBUG("Insert New Gen KVCache, key {}", key);
  std::lock_guard<std::mutex> manager_lg(manager->lock);
  if (manager->key_entry_map.contains(me->hash)) {
    SPDLOG_WARN("Duplicate key {}", me->hash);
  } else {
    manager->insert(me);
  }
}
std::vector<size_t> DoubleCacheHandle::get_gpu_block_idx() {
  if (kvc2_top->config.gpu_only) {
    return gpu_only_block_idx;
  } else {
    std::vector<size_t> re;
    for (auto& handle : k_cache_handles[0]) {
      re.push_back(handle->gpu_block_idx.value());
    }
    return re;
  }
}
/*
length : total length of tokens (including matched tokens)
1. update key, insert CacheBlock hash to lru
2. set dirty flag
3. update prefix tree, allocate new disk location
 */
void DoubleCacheHandle::append_tokens(Token* all_tokens, TokenLength length) {
  if (kvc2_top->config.gpu_only) {
    return;
  }
  TimeObserver time_observer(kvc2_top->met->append_tokens_time_ms);
  if (enable_alt) {
    SPDLOG_WARN("Append Tokens Not Implemented for Alternative Path");
    return;
  }
  if (length > estimated_length) {
    SPDLOG_ERROR("Length {} exceed estimated length {}", length, estimated_length);
    assert(false);
  }
  size_t match_length = matched_length();
  if (length < match_length) {
    SPDLOG_WARN("Length {} less than match length {}", length, match_length);
    assert(false);
  }
  if (length > ids.size()) {
    ids.insert(ids.end(), all_tokens + ids.size(), all_tokens + length);
  }
  static const auto num_token_per_page = kvc2_top->config.num_token_per_page;
  if (match_length % num_token_per_page != 0) {
    SPDLOG_ERROR("Match length {} is not multiple of num_token_per_page {}", match_length, num_token_per_page);
    assert(false);
  }
  if (match_length + num_token_per_page > length) {
    // SPDLOG_DEBUG("append_tokens No need to update");
    return;
  }
  SPDLOG_DEBUG("Append Tokens to {}", length);

  auto pre_match_length = match_length;
  // set gpu dirty flag
  size_t new_added_block_count = 0;
  while (match_length + num_token_per_page <= length) {
    match_length += num_token_per_page;
    new_added_block_count += 1;
  }

  // update prefix tree
  match.prefix = kvc2_top->tree->new_prefix_node(match.prefix, pre_match_length, ids.data(), match_length).get();
  match.match_length = match_length;

  // alloc disk location for new added prefix
  auto disk_cache = kvc2_top->disk_cache.get();
  Location k_loc{0, 0}, v_loc{0, 0};
  if (is_k_cache_on) {
    k_loc = disk_cache->allocate(k_info(), new_added_block_count);
    k_seg_locs.add_location(match.prefix->start_length / NumTokenPerBlock, k_loc);
    match.prefix->update_location(k_info(), k_loc);
  }
  if (is_v_cache_on) {
    v_loc = disk_cache->allocate(v_info(), new_added_block_count);
    v_seg_locs.add_location(match.prefix->start_length / NumTokenPerBlock, v_loc);
    match.prefix->update_location(v_info(), v_loc);
  }

  // update cache handles
  auto update_cache_handles = [this, pre_match_length, length](
                                  CacheInfo info, std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers,
                                  Location loc) {
    TokensHasher hasher;
    for (Layer l = 0; l < info.hidden_layer_count(); l++) {
      hasher.reset(info.hash_value());
      hasher.update_raw(&l, sizeof(l));
      hasher.update(ids.data(), pre_match_length);
      auto page_count_start = pre_match_length / num_token_per_page;
      for (size_t i = pre_match_length; i + num_token_per_page <= length; i += num_token_per_page) {
        auto page_count = i / num_token_per_page;
        hasher.update(ids.data() + i, num_token_per_page);
        auto block = layers[l][page_count];
        {
          auto lg = block->lock_guard();
          block->idx = loc.start_idx + page_count - page_count_start;
          block->set_key(hasher.get(), block);
          if (l == 0 && info.is_key_cache) {
            block->gpu_cc.tc.set_has_data();
          }
          block->gpu_cc.dirty.store(true);
        }
      }
    }
  };
  if (is_k_cache_on) {
    update_cache_handles(k_info(), k_cache_handles, k_loc);
  }
  if (is_v_cache_on) {
    update_cache_handles(v_info(), v_cache_handles, v_loc);
  }
  // kvc2_top->block_cache->debug();
}
void CacheBlockEntry::flush_back_async(IO_Helper<CacheBlockEntry>& helper,
                                       std::vector<std::atomic_bool*>& dirty_flags) {
  auto kvc2_top = manager->config.kvc2_top;
  auto allocator = kvc2_top->disk_cache->get_allocator(cache_info);
  // if (layer == 0) {
  //   SPDLOG_DEBUG("Flush {} to {}", fmt::ptr(this), idx);
  // }
  io_with(kvc2_top->io_dealer.get(), helper, allocator->get_store(layer), layer, idx, IOOption::IO_Write);
  dirty_flags.push_back(&cpu_cc.dirty);
}
void CacheEntryManager::cpu_background_flush() {
  if (background_flush_back.get() == nullptr) {
    SPDLOG_INFO("Starting CPU Background flush");
    background_flush_back = std::unique_ptr<periodic::PeriodicTask>(new periodic::PeriodicTask([this]() {
      // Timer t("CPU Flush");
      std::vector<std::atomic_bool*> dirty_cpus;
      std::vector<std::unique_lock<CacheBlockEntry::MutexT>> entry_uls;
      IO_Helper<CacheBlockEntry> io_helper(nullptr, [&dirty_cpus]() {
        for (auto& flag : dirty_cpus) {
          flag->store(false);
        }
        if (dirty_cpus.size() > 0)
          SPDLOG_DEBUG("{} dirty CPU pages flushed.", dirty_cpus.size());
      });
      {
        std::lock_guard<std::mutex> ul(lock);
        for (auto& e : usage_list) {
          auto ul = e->try_lock();
          if (ul.owns_lock()) {
            if (e->cpu_cc.dirty.load()) {
              entry_uls.push_back(std::move(ul));
              e->flush_back_async(io_helper, dirty_cpus);
            }
          }
          // if (dirty_cpus.size() == 100) {
          //   break;
          // }
        }
      }
      io_helper.finish_add_taks();
      io_helper.wait();
    }));
  } else {
    SPDLOG_ERROR("Flush Thread Already Started");
  }
}
void GPUPageCache::gpu_background_flush() {
  if (background_flush_back.get() == nullptr) {
    SPDLOG_INFO("Starting GPU Background flush");
    background_flush_back = std::unique_ptr<periodic::PeriodicTask>(new periodic::PeriodicTask([this]() {
      // Timer t("GPU Flush");
      std::vector<size_t> dirty_cols;
      std::vector<CacheBlockEntry*> entries;
      std::vector<std::unique_lock<CacheBlockEntry::MutexT>> uls;
      BatchPromise promise(config.gpu_devices_id.size());
      auto reqs = basic_request(cudaMemcpyDeviceToHost, [&promise]() { promise.set(); });
      for (size_t i = 0; i < config.total_kvcache_pages; i++) {
        std::lock_guard<std::mutex> lg(this->lock);
        auto col_uls = try_lock_col(i);
        if (col_uls.empty())
          continue;
        for (size_t l = 0; l < config.layer_count; l++) {
          if (config.k_cache_on &&
              (occupations[l][i]->gpu_cc.dirty.load() == false || occupations[l][i]->cpu_cc.dirty.load()))
            goto next_gpu_page;
          if (config.v_cache_on &&
              (v_occupations[l][i]->gpu_cc.dirty.load() == false || v_occupations[l][i]->cpu_cc.dirty.load()))
            goto next_gpu_page;
        }
        dirty_cols.push_back(i);
        for (size_t l = 0; l < config.layer_count; l++) {
          // occupations[l][i]->alloc_on_cpu_no_lock();
          if (config.k_cache_on)
            entries.push_back(occupations[l][i].get());
          if (config.v_cache_on)
            entries.push_back(v_occupations[l][i].get());
        }
        append_col_to_request(reqs, occupations, v_occupations, i);
        for (auto& ul : col_uls) {
          uls.push_back(std::move(ul));
        }
      next_gpu_page:
        continue;
      }
      submit_requests(reqs);
      promise.get_shared_fut().wait();
      if (dirty_cols.empty() == false)
        SPDLOG_INFO("GPU Flushed Back {} cols", dirty_cols.size());
      for (auto& entry : entries) {
        entry->cpu_cc.tc.set_has_data();
        // we have locks here
        entry->cpu_cc.dirty.store(true);
      }
      for (auto& col : dirty_cols) {
        for (size_t l = 0; l < config.layer_count; l++) {
          if (config.k_cache_on)
            occupations[l][col]->gpu_cc.dirty.store(false);
          if (config.v_cache_on)
            v_occupations[l][col]->gpu_cc.dirty.store(false);
        }
      }
      if (dirty_cols.empty() == false) {
        debug();
      }
    }));
  } else {
    SPDLOG_ERROR("Flush Thread Already Started");
  }
}
}  // namespace kvc2
csrc/balance_serve/kvc2/src/utils/all.hpp
0 → 100644
#pragma once
#include "easy_format.hpp"
#include "timer.hpp"
\ No newline at end of file
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp
0 → 100644
#include <memory>
#include <type_traits>
template <typename T, typename U>
T div_up(T x, U by) {
  static_assert(std::is_integral_v<T>);
  static_assert(std::is_integral_v<U>);
  return (x + by - 1) / by;
}
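// Example: div_up(10, 4) == 3 and div_up(8, 4) == 2; both arguments must be integral types
// (enforced by the static_asserts above). kvc2 uses this to turn token counts into block/page
// counts throughout prefix.cpp.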
template <typename T>
T* offset_by_bytes(T* t, size_t n) {
  return reinterpret_cast<T*>(reinterpret_cast<size_t>(t) + n);
}
csrc/balance_serve/kvc2/src/utils/easy_format.hpp
0 → 100644
#ifndef __EASY_FORMAT_HPP_
#define __EASY_FORMAT_HPP_
#include <array>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>
template <typename T>
inline std::string format_vector(const std::vector<T>& v) {
  std::ostringstream oss;
  if (v.empty())
    return "[]";
  for (size_t i = 0; i < v.size(); ++i) {
    oss << v[i];
    if (i < v.size() - 1)
      oss << ", ";  // comma separated
  }
  return oss.str();
}
inline std::array<std::string, 7> units = {"", "K", "M", "G", "T", "P", "E"};

inline std::string readable_number(size_t size) {
  size_t unit_index = 0;
  double readable_size = size;
  while (readable_size >= 1000 && unit_index < units.size() - 1) {
    readable_size /= 1000;
    unit_index++;
  }
  std::ostringstream ss;
  ss << std::fixed << std::setprecision(2) << readable_size;
  std::string str = ss.str();
  return str + "" + units[unit_index];
}
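// Example: readable_number(123456789) returns "123.46M". Note the units are decimal
// (1000-based), not binary (1024-based).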
#endif
\ No newline at end of file