OpenDAS / ktransformers · Commits

Commit 25cee581, authored Mar 31, 2025 by Atream
    add balance-serve, support concurrence
Parent: 8d0292aa
Changes: 196

Showing 20 changed files with 3424 additions and 0 deletions (+3424 -0)
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp        +135 -0
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh         +54  -0
csrc/balance_serve/kvc2/src/defs.h                         +35  -0
csrc/balance_serve/kvc2/src/gpu_cache.cpp                  +282 -0
csrc/balance_serve/kvc2/src/gpu_cache.hh                   +74  -0
csrc/balance_serve/kvc2/src/hasher.hpp                     +40  -0
csrc/balance_serve/kvc2/src/io_helper.hpp                  +155 -0
csrc/balance_serve/kvc2/src/kvc2.h                         +138 -0
csrc/balance_serve/kvc2/src/kvc2_utils.py                  +64  -0
csrc/balance_serve/kvc2/src/metrics.cpp                    +141 -0
csrc/balance_serve/kvc2/src/metrics.h                      +77  -0
csrc/balance_serve/kvc2/src/model_config.h                 +103 -0
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp   +123 -0
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h     +53  -0
csrc/balance_serve/kvc2/src/prefix.cpp                     +1746 -0
csrc/balance_serve/kvc2/src/utils/all.hpp                  +3   -0
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp           +14  -0
csrc/balance_serve/kvc2/src/utils/easy_format.hpp          +37  -0
csrc/balance_serve/kvc2/src/utils/lock_free_queue.hpp      +60  -0
csrc/balance_serve/kvc2/src/utils/mpsc.hpp                 +90  -0
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp (new file, 0 → 100644)
#include "cuda_stream_manager.hh"
#include <cuda_runtime.h>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <vector>
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
CudaStreamManager
::
CudaStreamManager
(
const
std
::
vector
<
size_t
>&
device_ids
,
int
num_streams_per_device
)
{
for
(
int
device_id
:
device_ids
)
{
auto
x
=
std
::
unique_ptr
<
DeviceInfo
>
(
new
DeviceInfo
);
DeviceInfo
&
device_info
=
*
x
;
device_info
.
device_id
=
device_id
;
device_info
.
next_stream_index
=
0
;
device_info
.
stop_flag
=
false
;
// 设置设备
cudaError_t
err
=
cudaSetDevice
(
device_id
);
if
(
err
!=
cudaSuccess
)
{
SPDLOG_WARN
(
"cudaSetDevice failed on device {}: {}"
,
device_id
,
cudaGetErrorString
(
err
));
throw
std
::
runtime_error
(
"cudaSetDevice failed"
);
}
// 创建 CUDA 流
device_info
.
streams
.
resize
(
num_streams_per_device
);
for
(
int
i
=
0
;
i
<
num_streams_per_device
;
++
i
)
{
err
=
cudaStreamCreate
(
&
device_info
.
streams
[
i
]);
if
(
err
!=
cudaSuccess
)
{
SPDLOG_WARN
(
"Failed to create CUDA stream on device {}: {}"
,
device_id
,
cudaGetErrorString
(
err
));
throw
std
::
runtime_error
(
"Failed to create CUDA stream"
);
}
}
// 启动设备工作线程
device_info
.
worker_thread
=
std
::
thread
(
&
CudaStreamManager
::
deviceWorker
,
this
,
std
::
ref
(
device_info
));
devices_
.
push_back
(
std
::
move
(
x
));
}
}
CudaStreamManager
::~
CudaStreamManager
()
{
// 通知所有设备线程停止
for
(
auto
&
device_info
:
devices_
)
{
device_info
->
stop_flag
.
store
(
true
);
auto
request
=
std
::
shared_ptr
<
Request
>
(
new
Request
);
request
->
should_exit
=
true
;
device_info
->
request_queue
.
enqueue
(
std
::
move
(
request
));
}
// 等待所有线程结束
for
(
auto
&
device_info
:
devices_
)
{
if
(
device_info
->
worker_thread
.
joinable
())
{
device_info
->
worker_thread
.
join
();
}
// 销毁 CUDA 流
cudaSetDevice
(
device_info
->
device_id
);
for
(
auto
&
stream
:
device_info
->
streams
)
{
cudaStreamDestroy
(
stream
);
}
}
}
void
CudaStreamManager
::
submitRequest
(
std
::
shared_ptr
<
Request
>
request
)
{
// 找到对应的设备
for
(
auto
&
device_info
:
devices_
)
{
if
(
device_info
->
device_id
==
request
->
device_id
)
{
device_info
->
request_queue
.
enqueue
(
request
);
return
;
}
}
throw
std
::
runtime_error
(
"Invalid device ID in request"
);
}
void
CudaStreamManager
::
deviceWorker
(
DeviceInfo
&
device_info
)
{
// 设置设备
cudaError_t
err
=
cudaSetDevice
(
device_info
.
device_id
);
if
(
err
!=
cudaSuccess
)
{
SPDLOG_WARN
(
"cudaSetDevice failed in worker thread for device {}: {}"
,
device_info
.
device_id
,
cudaGetErrorString
(
err
));
return
;
}
while
(
device_info
.
stop_flag
.
load
()
==
false
)
{
auto
request
=
device_info
.
request_queue
.
dequeue
();
if
(
request
->
should_exit
)
{
return
;
}
// 处理请求
SPDLOG_DEBUG
(
"Getting request on device {}, count {}"
,
device_info
.
device_id
,
request
->
host_mem_addresses
.
size
());
int
stream_index
=
device_info
.
next_stream_index
;
cudaStream_t
stream
=
device_info
.
streams
[
stream_index
];
device_info
.
next_stream_index
=
(
device_info
.
next_stream_index
+
1
)
%
device_info
.
streams
.
size
();
size_t
num_transfers
=
request
->
host_mem_addresses
.
size
();
for
(
size_t
i
=
0
;
i
<
num_transfers
;
++
i
)
{
void
*
dst
=
request
->
device_mem_addresses
[
i
];
void
*
src
=
request
->
host_mem_addresses
[
i
];
if
(
request
->
direction
==
cudaMemcpyDeviceToHost
)
{
std
::
swap
(
dst
,
src
);
}
cudaError_t
err
=
cudaMemcpyAsync
(
dst
,
src
,
request
->
sizes
[
i
],
request
->
direction
,
stream
);
if
(
err
!=
cudaSuccess
)
{
SPDLOG_WARN
(
"cudaMemcpyAsync failed on device {}: {}"
,
device_info
.
device_id
,
cudaGetErrorString
(
err
));
// 可以根据需要处理错误,这里简单地继续
continue
;
}
}
// 添加回调函数,因为是异步,所以需要包起来
struct
CallbackData
{
std
::
function
<
void
()
>
callback
;
};
CallbackData
*
cb_data
=
new
CallbackData
{
request
->
callback
};
err
=
cudaLaunchHostFunc
(
stream
,
[](
void
*
data
)
{
// SPDLOG_DEBUG("Callback function called");
CallbackData
*
cb_data
=
static_cast
<
CallbackData
*>
(
data
);
cb_data
->
callback
();
delete
cb_data
;
},
cb_data
);
if
(
err
!=
cudaSuccess
)
{
SPDLOG_WARN
(
"cudaLaunchHostFunc failed on device {}: {}"
,
device_info
.
device_id
,
cudaGetErrorString
(
err
));
// 根据需要处理错误
}
}
}
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh (new file, 0 → 100644)
/*
 * @Author: Xie Weiyu ervinxie@qq.com
 * @Date: 2024-11-19 09:24:47
 * @LastEditors: Xie Weiyu ervinxie@qq.com
 * @LastEditTime: 2024-11-20 02:55:49
 * @FilePath: /kvc2/src/cuda_stream_manager.hh
 * @Description: This is the default file header; set `customMade` and open koroFileHeader to configure it: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
 */
#pragma once
#include <cuda_runtime.h>
#include <atomic>
#include <functional>
#include <memory>
#include <thread>
#include <vector>
#include "utils/mpsc.hpp"

class CudaStreamManager {
 public:
  // Constructor: takes the list of device IDs to use and the number of streams per device
  CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device);
  ~CudaStreamManager();

  // Request structure
  struct Request {
    bool should_exit = false;
    int device_id;
    std::vector<void*> host_mem_addresses;
    std::vector<void*> device_mem_addresses;
    std::vector<size_t> sizes;
    cudaMemcpyKind direction;
    std::function<void()> callback;
  };

  void submitRequest(std::shared_ptr<Request> request);

 private:
  // Per-device information
  struct DeviceInfo {
    int device_id;
    std::thread worker_thread;
    std::vector<cudaStream_t> streams;
    int next_stream_index;
    MPSCQueueConsumerLock<std::shared_ptr<Request>> request_queue;
    std::atomic_bool stop_flag;
  };

  // Device ID to DeviceInfo mapping
  std::vector<std::unique_ptr<DeviceInfo>> devices_;

  // Private methods
  void deviceWorker(DeviceInfo& device_info);
};
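For orientation, here is a minimal sketch (not part of the commit) of how a transfer could be issued through this interface; only CudaStreamManager and its Request struct come from the header above, the buffers and helper function are placeholders.

// Sketch: issue one host-to-device copy and block until its completion callback runs.
#include <cuda_runtime.h>
#include <future>
#include <memory>
#include "cuda_stream_manager.hh"

void copy_one_block(CudaStreamManager& manager, void* host_ptr, void* device_ptr, size_t bytes) {
  auto req = std::make_shared<CudaStreamManager::Request>();
  req->device_id = 0;                             // assumes device 0 was passed to the constructor
  req->host_mem_addresses = {host_ptr};
  req->device_mem_addresses = {device_ptr};
  req->sizes = {bytes};
  req->direction = cudaMemcpyHostToDevice;        // deviceWorker swaps src/dst for DeviceToHost
  std::promise<void> done;
  req->callback = [&done] { done.set_value(); };  // fired via cudaLaunchHostFunc after the copies
  manager.submitRequest(req);
  done.get_future().wait();
}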
csrc/balance_serve/kvc2/src/defs.h (new file, 0 → 100644)
#ifndef __DEFS_H_
#define __DEFS_H_

#include <cstdint>
#include <optional>
#include <vector>
#include "model_config.h"

namespace kvc2 {
using kvc2_ptr = void*;
// using data_block_ptr = std::intptr_t;
using data_block_ptr = void*;
using layer_data = std::vector<data_block_ptr>;
using kvc2_handle = void*;

using Token = uint32_t;
using Tokens = std::vector<Token>;
using TokenPtr = std::intptr_t;
using TokenLength = size_t;
using BlockLength = size_t;

struct CacheInfo {
  ModelName model_name;
  bool is_key_cache;
  QuantType quant_type;

  size_t hidden_layer_count();
  std::filesystem::path path(std::optional<size_t> which_layer = std::nullopt);
  bool operator==(const CacheInfo& other) const;
  size_t element_size(size_t block_length);
  size_t hash_value() const;
};

};  // namespace kvc2
#endif
csrc/balance_serve/kvc2/src/gpu_cache.cpp (new file, 0 → 100644)
#include "gpu_cache.hh"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "cache_entry.hh"
#include "utils/arithmetic.hpp"
namespace
kvc2
{
GPUPageCache
::
GPUPageCache
(
GPUPageCacheConfig
&
config
)
:
config
(
config
)
{
if
(
torch
::
cuda
::
is_available
())
{
size_t
gpu_count
=
torch
::
cuda
::
device_count
();
SPDLOG_INFO
(
"Number of available GPUs: {}, want {}"
,
gpu_count
,
config
.
gpu_devices_id
.
size
());
if
(
gpu_count
<
config
.
gpu_devices_id
.
size
())
{
SPDLOG_ERROR
(
"Not enough GPUs available."
);
exit
(
0
);
}
for
(
auto
x
:
config
.
gpu_devices_id
)
{
gpu_devices
.
push_back
(
torch
::
Device
(
torch
::
kCUDA
,
x
));
}
}
else
{
SPDLOG_ERROR
(
"CUDA is not available on this system."
);
exit
(
0
);
}
SPDLOG_WARN
(
"Creating GPU Cache"
);
shape
.
push_back
(
config
.
layer_count
);
shape
.
push_back
(
config
.
total_kvcache_pages
);
shape
.
push_back
(
config
.
num_token_per_page
);
if
(
config
.
full_kv_cache_on_each_gpu
)
{
if
(
config
.
gpu_devices_id
.
size
()
>
1
)
{
SPDLOG_WARN
(
"Replicated KVCache on multiple gpu"
);
}
shape
.
push_back
(
config
.
num_k_heads
);
}
else
{
shape
.
push_back
(
config
.
num_k_heads
/
config
.
gpu_devices_id
.
size
());
}
shape
.
push_back
(
config
.
k_head_dim
);
tensor_size
=
torch
::
elementSize
(
config
.
tensor_type
);
for
(
auto
&
s
:
shape
)
{
tensor_size
*=
s
;
}
SPDLOG_INFO
(
"Creating KV Page Cache, Shape ({},{},{},{},{}), Size {} MiB"
,
shape
[
0
],
shape
[
1
],
shape
[
2
],
shape
[
3
],
shape
[
4
],
tensor_size
/
(
1
<<
20
));
if
(
config
.
k_cache_on
)
{
for
(
size_t
i
=
0
;
i
<
config
.
gpu_devices_id
.
size
();
i
++
)
{
auto
k
=
torch
::
zeros
(
shape
,
torch
::
TensorOptions
().
dtype
(
config
.
tensor_type
));
k
=
k
.
to
(
gpu_devices
[
i
]);
k_cache
.
push_back
(
k
);
SPDLOG_INFO
(
"K Page Cache of GPU {} is created"
,
config
.
gpu_devices_id
[
i
]);
}
occupations
.
resize
(
config
.
layer_count
);
}
else
{
SPDLOG_WARN
(
"Disalbe K Cache"
);
assert
(
config
.
gpu_only
);
}
if
(
config
.
v_cache_on
)
{
for
(
size_t
i
=
0
;
i
<
config
.
gpu_devices_id
.
size
();
i
++
)
{
auto
v
=
torch
::
zeros
(
shape
,
torch
::
TensorOptions
().
dtype
(
config
.
tensor_type
));
v
=
v
.
to
(
gpu_devices
[
i
]);
v_cache
.
push_back
(
v
);
SPDLOG_INFO
(
"V Page Cache of GPU {} is created"
,
config
.
gpu_devices_id
[
i
]);
}
v_occupations
.
resize
(
config
.
layer_count
);
}
else
{
SPDLOG_WARN
(
"Disalbe V Cache"
);
// assert(config.gpu_only); // should not assert
}
if
(
config
.
gpu_only
)
{
gpu_only_occupations
.
resize
(
config
.
total_kvcache_pages
,
false
);
}
num_free_pages
=
config
.
total_kvcache_pages
;
for
(
size_t
i
=
0
;
i
<
config
.
layer_count
;
i
++
)
{
if
(
config
.
k_cache_on
)
occupations
[
i
].
resize
(
config
.
total_kvcache_pages
,
nullptr
);
if
(
config
.
v_cache_on
)
v_occupations
[
i
].
resize
(
config
.
total_kvcache_pages
,
nullptr
);
}
tp_size
.
resize
(
config
.
gpu_devices_id
.
size
(),
shape
[
2
]
*
shape
[
3
]
*
shape
[
4
]
*
c10
::
elementSize
(
config
.
tensor_type
));
tp_offset
.
resize
(
config
.
gpu_devices_id
.
size
(),
0
);
for
(
size_t
i
=
1
;
i
<
tp_offset
.
size
();
i
++
)
{
tp_offset
[
i
]
=
tp_offset
[
i
-
1
]
+
tp_size
[
i
-
1
];
}
stream_manager
=
std
::
unique_ptr
<
CudaStreamManager
>
(
new
CudaStreamManager
(
config
.
gpu_devices_id
,
config
.
num_streams_per_device
));
}
bool
GPUPageCache
::
alloc_col
(
std
::
vector
<
std
::
vector
<
std
::
shared_ptr
<
CacheBlockEntry
>>>&
k_entries
,
std
::
vector
<
std
::
vector
<
std
::
shared_ptr
<
CacheBlockEntry
>>>&
v_entries
,
size_t
at
)
{
std
::
lock_guard
<
std
::
mutex
>
lg
(
lock
);
auto
idx
=
next_empty_col
();
if
(
idx
.
has_value
())
{
// must have entry lock
auto
&
k0_entry
=
k_entries
[
0
][
at
];
k0_entry
->
gpu_block_idx
=
idx
;
for
(
size_t
l
=
0
;
l
<
config
.
layer_count
;
l
++
)
{
if
(
config
.
k_cache_on
)
{
assert
(
k_entries
[
l
][
at
]
->
data
!=
nullptr
);
occupations
[
l
][
idx
.
value
()]
=
k_entries
[
l
][
at
];
}
if
(
config
.
v_cache_on
)
{
assert
(
v_entries
[
l
][
at
]
->
data
!=
nullptr
);
v_occupations
[
l
][
idx
.
value
()]
=
v_entries
[
l
][
at
];
}
}
return
true
;
}
else
{
return
false
;
}
}
std
::
vector
<
size_t
>
GPUPageCache
::
gpu_only_alloc_col
(
size_t
count
)
{
assert
(
config
.
gpu_only
);
std
::
lock_guard
<
std
::
mutex
>
lg
(
lock
);
std
::
vector
<
size_t
>
re
;
for
(
size_t
i
=
0
;
i
<
config
.
total_kvcache_pages
;
i
++
)
{
if
(
gpu_only_occupations
[
i
]
==
false
)
{
re
.
push_back
(
i
);
if
(
re
.
size
()
==
count
)
{
break
;
}
}
}
if
(
re
.
size
()
==
count
)
{
for
(
auto
at
:
re
)
{
gpu_only_occupations
[
at
]
=
true
;
}
}
else
{
SPDLOG_WARN
(
"GPU ONLY: Cannot allocate {} cols"
,
count
);
re
.
clear
();
}
return
re
;
}
void
GPUPageCache
::
gpu_only_free_cols
(
std
::
vector
<
size_t
>
cols
)
{
assert
(
config
.
gpu_only
);
std
::
lock_guard
<
std
::
mutex
>
lg
(
lock
);
for
(
auto
at
:
cols
)
{
assert
(
gpu_only_occupations
[
at
]);
gpu_only_occupations
[
at
]
=
false
;
}
}
std
::
optional
<
size_t
>
GPUPageCache
::
next_empty_col
()
{
if
(
num_free_pages
==
0
)
{
evict_cols
();
if
(
num_free_pages
==
0
)
{
return
std
::
nullopt
;
}
}
while
(
occupations
[
0
][
_col_idx
]
!=
nullptr
)
{
_col_idx
=
(
_col_idx
+
1
)
%
config
.
total_kvcache_pages
;
}
num_free_pages
-=
1
;
return
_col_idx
;
}
void
GPUPageCache
::
evict_cols
()
{
auto
evicted_count
=
0
;
for
(
size_t
i
=
0
;
i
<
config
.
total_kvcache_pages
;
i
++
)
{
auto
&
h
=
occupations
[
0
][
i
];
if
(
h
==
nullptr
)
{
continue
;
}
auto
lg
=
h
->
lock_guard
();
if
(
h
->
gpu_cc
.
can_desert
())
{
h
->
gpu_cc
.
tc
.
reset
();
h
=
nullptr
;
num_free_pages
+=
1
;
evicted_count
+=
1
;
}
}
if
(
evicted_count
>
0
)
SPDLOG_INFO
(
"GPU: Evicted {} GPU pages"
,
evicted_count
);
}
std
::
vector
<
std
::
unique_lock
<
CacheBlockEntry
::
MutexT
>>
GPUPageCache
::
try_lock_col
(
size_t
at
)
{
std
::
vector
<
std
::
unique_lock
<
CacheBlockEntry
::
MutexT
>>
re
;
if
(
config
.
k_cache_on
)
{
for
(
size_t
l
=
0
;
l
<
config
.
layer_count
;
l
++
)
{
if
(
occupations
[
l
][
at
]
==
nullptr
)
{
return
{};
}
auto
ul
=
occupations
[
l
][
at
]
->
try_lock
();
if
(
ul
.
owns_lock
())
{
re
.
push_back
(
std
::
move
(
ul
));
}
else
{
return
{};
}
}
}
if
(
config
.
v_cache_on
)
{
for
(
size_t
l
=
0
;
l
<
config
.
layer_count
;
l
++
)
{
if
(
v_occupations
[
l
][
at
]
==
nullptr
)
{
return
{};
}
auto
ul
=
v_occupations
[
l
][
at
]
->
try_lock
();
if
(
ul
.
owns_lock
())
{
re
.
push_back
(
std
::
move
(
ul
));
}
else
{
return
{};
}
}
}
return
re
;
}
std
::
vector
<
std
::
shared_ptr
<
CudaStreamManager
::
Request
>>
GPUPageCache
::
basic_request
(
cudaMemcpyKind
direction
,
std
::
function
<
void
()
>
callback
)
{
std
::
vector
<
std
::
shared_ptr
<
CudaStreamManager
::
Request
>>
re
;
re
.
resize
(
config
.
gpu_devices_id
.
size
(),
nullptr
);
for
(
size_t
i
=
0
;
i
<
re
.
size
();
i
++
)
{
re
[
i
]
=
std
::
shared_ptr
<
CudaStreamManager
::
Request
>
(
new
CudaStreamManager
::
Request
);
re
[
i
]
->
direction
=
direction
;
re
[
i
]
->
device_id
=
config
.
gpu_devices_id
[
i
];
re
[
i
]
->
callback
=
callback
;
}
return
re
;
}
void
GPUPageCache
::
submit_requests
(
std
::
vector
<
std
::
shared_ptr
<
CudaStreamManager
::
Request
>>
reqs
)
{
for
(
auto
&
r
:
reqs
)
{
stream_manager
->
submitRequest
(
r
);
}
}
void
GPUPageCache
::
append_col_to_request
(
std
::
vector
<
std
::
shared_ptr
<
CudaStreamManager
::
Request
>>&
reqs
,
std
::
vector
<
std
::
vector
<
std
::
shared_ptr
<
CacheBlockEntry
>>>&
k_handles
,
std
::
vector
<
std
::
vector
<
std
::
shared_ptr
<
CacheBlockEntry
>>>&
v_handles
,
size_t
at
)
{
if
(
config
.
k_cache_on
==
false
&&
config
.
v_cache_on
==
false
)
{
return
;
}
auto
gpu_block_idx
=
k_handles
[
0
][
at
]
->
gpu_block_idx
.
value
();
for
(
size_t
layer
=
0
;
layer
<
config
.
layer_count
;
layer
++
)
{
for
(
size_t
which_gpu
=
0
;
which_gpu
<
config
.
gpu_devices_id
.
size
();
which_gpu
++
)
{
if
(
config
.
k_cache_on
)
{
assert
(
k_handles
[
layer
][
at
]
->
data
!=
nullptr
);
reqs
[
which_gpu
]
->
sizes
.
push_back
(
tp_size
[
which_gpu
]);
reqs
[
which_gpu
]
->
host_mem_addresses
.
push_back
(
offset_by_bytes
(
k_handles
[
layer
][
at
]
->
data
,
tp_offset
[
which_gpu
]));
reqs
[
which_gpu
]
->
device_mem_addresses
.
push_back
(
k_cache
[
which_gpu
][
layer
][
gpu_block_idx
].
data_ptr
());
}
if
(
config
.
v_cache_on
)
{
assert
(
v_handles
[
layer
][
at
]
->
data
!=
nullptr
);
reqs
[
which_gpu
]
->
sizes
.
push_back
(
tp_size
[
which_gpu
]);
reqs
[
which_gpu
]
->
host_mem_addresses
.
push_back
(
offset_by_bytes
(
v_handles
[
layer
][
at
]
->
data
,
tp_offset
[
which_gpu
]));
reqs
[
which_gpu
]
->
device_mem_addresses
.
push_back
(
v_cache
[
which_gpu
][
layer
][
gpu_block_idx
].
data_ptr
());
}
}
}
// SPDLOG_DEBUG("GPU: Appended Vertical Handle to Request, count {}", reqs[0]->sizes.size());
}
void
GPUPageCache
::
debug
()
{
size_t
count
=
0
;
for
(
size_t
i
=
0
;
i
<
config
.
total_kvcache_pages
;
i
++
)
{
if
(
occupations
[
0
][
i
]
==
nullptr
)
{
count
+=
1
;
}
else
{
// occupations[0][i]->gpu_cc.debug();
}
}
SPDLOG_DEBUG
(
"Free Page: {}/{}"
,
count
,
config
.
total_kvcache_pages
);
}
}
// namespace kvc2
csrc/balance_serve/kvc2/src/gpu_cache.hh (new file, 0 → 100644)
#ifndef __GPU_CACHE_HH_
#define __GPU_CACHE_HH_
#include <torch/torch.h>
#include "cache_entry.hh"
#include "cuda_stream_manager.hh"
#include "defs.h"
#include "kvc2.h"
#include "metrics.h"
#include "utils/periodic_task.hpp"

namespace kvc2 {
class GPUPageCache {
  std::vector<torch::Device> gpu_devices;

  std::vector<int64_t> shape;
  size_t tensor_size;
  std::vector<size_t> tp_offset;
  std::vector<size_t> tp_size;

  // met
  std::shared_ptr<Metrics> met;

  // states
  std::mutex lock;
  size_t num_free_pages;
  std::vector<bool> gpu_only_occupations;
  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> occupations, v_occupations;
  size_t _col_idx = 0;

  // cuda stream manager
  std::optional<size_t> next_empty_col();

 public:
  GPUPageCacheConfig config;
  std::unique_ptr<CudaStreamManager> stream_manager;
  std::vector<torch::Tensor> k_cache;
  std::vector<torch::Tensor> v_cache;
  std::unique_ptr<periodic::PeriodicTask> background_flush_back = nullptr;

  GPUPageCache(GPUPageCacheConfig& config);

  std::vector<size_t> gpu_only_alloc_col(size_t count);
  void gpu_only_free_cols(std::vector<size_t> cols);

  void gpu_background_flush();

  bool alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
                 std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at);
  void evict_cols();
  void flush_col(size_t at);
  std::vector<std::unique_lock<CacheBlockEntry::MutexT>> try_lock_col(size_t at);
  void free_col(size_t at);

  std::vector<std::shared_ptr<CudaStreamManager::Request>> basic_request(cudaMemcpyKind direction,
                                                                         std::function<void()> callback);
  void submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs);
  void append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles, size_t at);

  void debug();
};
}  // namespace kvc2
#endif
\ No newline at end of file
csrc/balance_serve/kvc2/src/hasher.hpp (new file, 0 → 100644)
#ifndef __HASHER_HPP_
#define __HASHER_HPP_

#include "defs.h"
#include "xxhash.h"

namespace kvc2 {

const uint64_t hash_seed = 4123512;
const uint64_t check_hash_seed = 1025753;

using TokensHash = XXH64_hash_t;

struct TokensHasher {
  XXH64_state_t* state;

  TokensHasher() {
    state = XXH64_createState();
    reset();
  }
  ~TokensHasher() { XXH64_freeState(state); }

  TokensHasher(TokensHasher& other) = delete;
  TokensHasher& operator=(TokensHasher& other) = delete;
  TokensHasher(TokensHasher&& other) = delete;
  TokensHasher& operator=(TokensHasher&& other) = delete;

  TokensHash get() { return XXH64_digest(state); }

  void reset(size_t seed = hash_seed) { XXH64_reset(state, seed); }

  TokensHash update(Token* data, TokenLength length) {
    XXH64_update(state, data, length * sizeof(Token));
    return get();
  }

  TokensHash update_raw(void* data, size_t size) {
    XXH64_update(state, data, size);
    return get();
  }

  static TokensHash hash(Token* data, TokenLength length) { return XXH64(data, length * sizeof(Token), hash_seed); }
};

}  // namespace kvc2
#endif
\ No newline at end of file
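A minimal sketch (not part of the commit) of the incremental hashing pattern this header supports: one hasher is reused across blocks so each digest covers the whole prefix hashed so far, which is how the prefix matching code further down keys its blocks. The helper function and block size are illustrative only.

// Sketch: per-block prefix hashes over a token sequence.
#include <algorithm>
#include <vector>
#include "hasher.hpp"

std::vector<kvc2::TokensHash> block_hashes(kvc2::Tokens& ids, size_t block) {
  std::vector<kvc2::TokensHash> re;
  kvc2::TokensHasher hasher;  // seeded with hash_seed by default
  for (size_t i = 0; i < ids.size(); i += block) {
    size_t len = std::min(block, ids.size() - i);
    re.push_back(hasher.update(ids.data() + i, len));  // digest of the entire prefix up to this block
  }
  return re;
}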
csrc/balance_serve/kvc2/src/io_helper.hpp (new file, 0 → 100644)
/**
 * @Description  :
 * @Author       : Xie Weiyu
 * @Date         : 2024-12-11 06:35:31
 * @Version      : 1.0.0
 * @LastEditors  : Xie Weiyu
 * @LastEditTime : 2024-12-11 06:50:55
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once
#include <atomic>
#include <future>
#include <iostream>
#include <mutex>
#include <optional>
#include <string>
#include <vector>

struct BatchPromise {
  std::promise<void> promise;
  std::shared_future<void> fut;
  std::atomic_size_t count;

  inline BatchPromise(size_t count) : count(count) { fut = promise.get_future().share(); }

  inline void inc(size_t count = 1) { this->count.fetch_add(count, std::memory_order_seq_cst); }

  inline void set() {
    if (count.fetch_sub(1, std::memory_order_seq_cst) == 1) {
      promise.set_value();
    }
  }

  inline std::shared_future<void> get_shared_fut() { return fut; }
};

template <typename Lock>
struct TransferControl {
  Lock lock;
  std::optional<std::shared_future<void>> transfer_ok = std::nullopt;
  bool has_data = false;

  TransferControl() {}

  /*
   true, std::nullopt : Already has data
   false, shared_future : Transfer already started, should wait for the future
   false, std::nullopt : should transfer by you
   true, shared_future: Should not appear
  */
  std::pair<bool, std::optional<std::shared_future<void>>> has_data_or_transfer(std::shared_future<void> shared_fut) {
    std::lock_guard<Lock> lg(lock);
    if (has_data) {
      return {true, std::nullopt};
    } else {
      if (transfer_ok.has_value()) {
        return {false, transfer_ok};
      } else {
        transfer_ok = shared_fut;
        return {false, std::nullopt};
      }
    }
  }

  void set_has_data() {
    std::lock_guard<Lock> lg(lock);
    has_data = true;
    transfer_ok = std::nullopt;
  }

  bool get_has_data() {
    std::lock_guard<Lock> lg(lock);
    if (has_data) {
      return true;
    } else {
      return false;
    }
  }

  void reset() {
    std::lock_guard<Lock> lg(lock);
    transfer_ok = std::nullopt;
    has_data = false;
  }

  std::string debug() {
    std::lock_guard<Lock> lg(lock);
    return std::string("") + (has_data ? "has data" : "no data") + " " +
           (transfer_ok.has_value() ? "transfer " : "no transfer");
  }
};

struct ConcurrentController {
  std::atomic_bool dirty = false;
  std::atomic_size_t ref_count = 0;
  TransferControl<std::mutex> tc;
};

template <typename Unit>
struct IO_Helper {
  BatchPromise batch_promise;
  std::function<void(Unit*)> call_back_on_unit = nullptr;
  std::function<void()> call_back = nullptr;
  std::vector<std::shared_future<void>> futs;
  std::vector<Unit*> units_by_myself;

  IO_Helper(std::function<void(Unit*)> call_back_on_unit, std::function<void()> call_back = nullptr)
      : batch_promise(1), call_back_on_unit(call_back_on_unit), call_back(call_back) {}
  IO_Helper(const IO_Helper& other) = delete;
  IO_Helper& operator=(const IO_Helper& other) = delete;
  IO_Helper(IO_Helper&& other) = delete;
  IO_Helper& operator=(IO_Helper&& other) = delete;

  ~IO_Helper() {
    // std::cout<<"Destory IO helper"<<std::endl;
  }

  size_t total_task_count = 0;
  void new_task(size_t count = 1) {
    total_task_count += 1;
    batch_promise.inc(count);
  }
  void finish_add_taks() { batch_promise.set(); }

  bool absorb_tc(Unit* unit, TransferControl<std::mutex>& tc) {
    auto [ok, fut] = tc.has_data_or_transfer(batch_promise.get_shared_fut());
    if (ok) {
      return false;
    } else {
      if (fut.has_value()) {
        futs.push_back(fut.value());
        // printf("Transfer started\n");
        return false;
      } else {
        units_by_myself.push_back(unit);
        // printf("Not Transfer\n");
        return true;
      }
    }
  }

  void wait() {
    for (auto& fut : futs) {
      fut.wait();
    }
    batch_promise.get_shared_fut().wait();
    for (auto& b : units_by_myself) {
      call_back_on_unit(b);
    }
    if (call_back)
      call_back();
  }
};
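A rough usage sketch (not part of the commit, inferred from the semantics above): absorb_tc deduplicates transfers across concurrent callers, so only the caller that sees "no data, no transfer started" actually performs the I/O, while everyone else waits on the shared future. The Unit struct and load_units function below are placeholders.

// Sketch: one caller owns each transfer, others just wait.
#include <vector>
#include "io_helper.hpp"

struct Unit {
  ConcurrentController cc;  // owns the per-unit TransferControl
  int payload = 0;
};

void load_units(std::vector<Unit*>& units) {
  IO_Helper<Unit> helper([](Unit* u) { u->cc.tc.set_has_data(); });  // run per unit we transferred ourselves
  for (auto* u : units) {
    if (helper.absorb_tc(u, u->cc.tc)) {
      helper.new_task();
      // ... enqueue the actual read for u here; its completion should call helper.batch_promise.set() ...
    }
  }
  helper.finish_add_taks();  // release the initial count of 1
  helper.wait();             // waits for our batch and for transfers started by other callers
}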
csrc/balance_serve/kvc2/src/kvc2.h (new file, 0 → 100644)
#pragma once
#include <torch/torch.h>
#include <cstdint>
#include <optional>
#include <vector>
#include "defs.h"
#include "model_config.h"

namespace kvc2 {

struct GPUPageCacheConfig {
  bool gpu_only;
  std::vector<size_t> gpu_devices_id;
  size_t layer_count;
  size_t total_kvcache_pages;
  size_t num_token_per_page;
  size_t num_k_heads;
  size_t k_head_dim;
  bool full_kv_cache_on_each_gpu = false;
  bool k_cache_on = true;
  bool v_cache_on = true;
  torch::ScalarType tensor_type;

  // for cuda stream manager
  size_t num_streams_per_device = 4;
};

struct KVC2Config {
  bool k_cache_on = true;
  bool v_cache_on = true;
  bool gpu_only = false;
  bool load_from_disk = true;
  bool save_to_disk = true;
  std::string path;
  std::string config_path;
  TokenLength num_token_per_page = 256;
  size_t memory_pool_size = 10e9;
  size_t evict_count = 20;
  std::optional<GPUPageCacheConfig> gpu_cache_config = std::nullopt;
  size_t metrics_port;
  double recompute_ratio = 0.2;
};

class DoubleCacheHandleInterface;

class KVC2Interface {
 public:
  virtual ~KVC2Interface() = default;
  virtual void load() = 0;
  virtual void save() = 0;

  /*
  Raw Insert
    Insert kvcache from kvcache_data to disk.
    info: cache info
    id: start pointer of token array
    length: length of token array
    kvcache_data: data of kvcache
    This will firstly match the ID array with the existing kvcache, and then insert the unmatched kvcache to disk.
  */
  virtual void raw_insert(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                          const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;

  /*
  Raw Read
    Read kvcache from disk to user specified pointers.
    info: cache info
    id: start pointer of token array
    length: length of token array
    kvcache_data: data of kvcache
    Return: matched length of prefix, in tokens
    This will not read from memory pool, it directly read from disk.
  */
  virtual TokenLength raw_read(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                               const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;

  /*
  Lookup
    Lookup kvcache and load it from disk to memory pool if needed.
    info: cache info
    id: start pointer of token array
    length: length of token array
    Return: kvc2_handle, holds kvcache until being released.
    if not found, matched_length will return 0.
    if memory pool is full, return nullptr
  */
  virtual std::shared_ptr<DoubleCacheHandleInterface> lookup(ModelName model_name, QuantType quant_type, Token* id,
                                                             TokenLength length, TokenLength estimated_length) = 0;

  /*
  Lookup and allocate to gpu
    info.is_k_cache does not matter here
  */
  virtual std::shared_ptr<DoubleCacheHandleInterface> lookup_to_gpu(ModelName model_name, QuantType quant_type,
                                                                    Token* id, TokenLength length,
                                                                    TokenLength estimated_length) = 0;

  virtual void lookup_to_gpu_async(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                                   TokenLength estimated_length,
                                   std::function<void(std::shared_ptr<DoubleCacheHandleInterface>)> call_back) = 0;

  virtual std::pair<std::vector<torch::Tensor>, std::vector<torch::Tensor>> get_kvcache() = 0;

  virtual void debug() = 0;
};

std::shared_ptr<KVC2Interface> create_kvc2(KVC2Config config);

enum MatchStatus {
  Exact,
  Partial,
  NotMatchExact,
  NotMatchPartial,
};

class DoubleCacheHandleInterface {
 public:
  virtual ~DoubleCacheHandleInterface() = default;
  virtual TokenLength matched_length() = 0;
  virtual std::vector<MatchStatus> matched_status() = 0;
  virtual std::vector<layer_data> handle_data(bool is_key_cache) = 0;
  virtual bool to_gpu() = 0;
  virtual void to_gpu_async(std::function<void(bool)> call_back) = 0;
  virtual std::vector<size_t> get_gpu_block_idx() = 0;
  virtual std::vector<size_t> get_gpu_attached_block_idx() = 0;
  virtual void append_tokens(Token* tokens, TokenLength length) = 0;  // update generated tokens
  virtual void debug() = 0;
};

};  // namespace kvc2
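The doc comments above describe the intended call flow. A minimal sketch (not part of the commit) of that flow follows; the paths, port, model name, and quant type are placeholder values, and only the types and methods declared in kvc2.h are assumed to exist.

// Sketch: look up the longest cached prefix and pin it into the GPU page cache.
#include "kvc2.h"

void example(kvc2::Token* ids, kvc2::TokenLength len) {
  kvc2::KVC2Config cfg;
  cfg.path = "/tmp/kvc2";               // hypothetical on-disk cache directory
  cfg.config_path = "/tmp/kvc2-configs";
  cfg.metrics_port = 8080;              // placeholder metrics endpoint
  auto kvc2 = kvc2::create_kvc2(cfg);
  kvc2->load();
  auto handle = kvc2->lookup_to_gpu("some-model", "BF16", ids, len, /*estimated_length=*/len + 256);
  if (handle && handle->matched_length() > 0) {
    auto block_idx = handle->get_gpu_block_idx();  // GPU pages holding the matched prefix
    (void)block_idx;
  }
  kvc2->save();
}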
csrc/balance_serve/kvc2/src/kvc2_utils.py (new file, 0 → 100644)
import torch
import ctypes


def aligned_tensor(size, alignment=4096):
    num_bytes = size
    mem = ctypes.c_void_p()
    error_code = ctypes.CDLL(None).posix_memalign(
        ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(num_bytes)
    )

    if error_code != 0:
        raise MemoryError(f"posix_memalign failed with error code {error_code}")

    array_type = (ctypes.c_int8 * size)
    raw_array = array_type.from_address(mem.value)

    tensor = torch.frombuffer(raw_array, dtype=torch.int8)

    if tensor.data_ptr() % alignment != 0:
        raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes")

    return tensor, mem


def alloc_aligned_cache(layer_count, block_count, element_size):
    cache = []
    cache_mem = []
    for i in range(layer_count):
        layer_data = []
        layer_mem = []
        for j in range(block_count):
            tensor, mem_ptr = aligned_tensor(element_size, alignment=4096)
            layer_data.append(tensor)
            layer_mem.append(mem_ptr)
        cache.append(layer_data)
        cache_mem.append(layer_mem)
    return cache, cache_mem


def dealloc_aligned_cache(cache_mem):
    for layer_mem in cache_mem:
        for mem_ptr in layer_mem:
            ctypes.CDLL(None).free(mem_ptr)


def get_tensor_ptr(tensors):
    tensor_ptr = []
    for layer in tensors:
        layer_ptr = []
        for data in layer:
            layer_ptr.append(data.data_ptr())
        tensor_ptr.append(layer_ptr)
    return tensor_ptr


def get_tensor_from_data_ptr(matched_data, element_size):
    re = []
    for layer in matched_data:
        re_layer = []
        for data_ptr in layer:
            array_type = (ctypes.c_int8 * element_size)
            raw_array = array_type.from_address(data_ptr)
            tensor = torch.frombuffer(raw_array, dtype=torch.int8)
            re_layer.append(tensor)
        re.append(re_layer)
    return re


if __name__ == "__main__":
    pass
\ No newline at end of file
csrc/balance_serve/kvc2/src/metrics.cpp (new file, 0 → 100644)
#include "metrics.h"
namespace
kvc2
{
Metrics
::
Metrics
(
const
MetricsConfig
&
config
)
:
registry_
(
std
::
make_shared
<
prometheus
::
Registry
>
()),
exposer_
(
config
.
endpoint
)
{
// 注册 prefix_nodes Counter
auto
&
prefix_nodes_family
=
prometheus
::
BuildCounter
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_prefix_nodes"
)
.
Help
(
"Number of prefix nodes"
)
.
Register
(
*
registry_
);
prefix_nodes
=
&
prefix_nodes_family
.
Add
({});
// 注册 prefix_block_count Counter
auto
&
prefix_block_count_family
=
prometheus
::
BuildCounter
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_prefix_block_count"
)
.
Help
(
"Number of prefix blocks"
)
.
Register
(
*
registry_
);
prefix_block_count
=
&
prefix_block_count_family
.
Add
({});
// 定义统一的桶大小,最大为 10000 ms (10 s)
std
::
vector
<
double
>
common_buckets
=
{
1.0
,
5.0
,
10.0
,
50.0
,
100.0
,
500.0
,
1000.0
,
5000.0
,
10000.0
};
// 注册 raw_insert_time_ms Histogram
auto
&
raw_insert_time_ms_family
=
prometheus
::
BuildHistogram
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_raw_insert_time_ms"
)
.
Help
(
"function raw insert's time in milliseconds"
)
.
Register
(
*
registry_
);
raw_insert_time_ms
=
&
raw_insert_time_ms_family
.
Add
({},
common_buckets
);
// 注册 lookup_time_ms Histogram
auto
&
lookup_time_ms_family
=
prometheus
::
BuildHistogram
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_lookup_time_ms"
)
.
Help
(
"function lookup's time in milliseconds"
)
.
Register
(
*
registry_
);
lookup_time_ms
=
&
lookup_time_ms_family
.
Add
({},
common_buckets
);
// 注册 lookup_prefixmatch_length Histogram
auto
&
lookup_prefixmatch_length_family
=
prometheus
::
BuildHistogram
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_lookup_prefixmatch_length"
)
.
Help
(
"function lookup's prefix match length"
)
.
Register
(
*
registry_
);
lookup_prefixmatch_length
=
&
lookup_prefixmatch_length_family
.
Add
({},
common_buckets
);
// 注册 matched_length_percentage Histogram
auto
&
matched_length_percentage_family
=
prometheus
::
BuildHistogram
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_matched_length_percentage"
)
.
Help
(
"function matched length percentage"
)
.
Register
(
*
registry_
);
matched_length_percentage
=
&
matched_length_percentage_family
.
Add
({},
common_buckets
);
// 注册 disk_usage Gauge
auto
&
disk_usage_family
=
prometheus
::
BuildGauge
().
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_disk_usage"
).
Help
(
"disk usage"
).
Register
(
*
registry_
);
disk_usage
=
&
disk_usage_family
.
Add
({});
// 注册 memory_pool_size Gauge
memory_pool_size_family_
=
&
prometheus
::
BuildGauge
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_memory_pool_size"
)
.
Help
(
"memory pool size"
)
.
Register
(
*
registry_
);
// 注册 memory_pool_node_count Gauge
memory_pool_node_count_family_
=
&
prometheus
::
BuildGauge
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_memory_pool_node_count"
)
.
Help
(
"memory pool node count"
)
.
Register
(
*
registry_
);
// 注册 lru_entry_count Gauge
lru_entry_count_family_
=
&
prometheus
::
BuildGauge
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_lru_entry_count"
)
.
Help
(
"lru entry count"
)
.
Register
(
*
registry_
);
// 注册 gpu_page_count Gauge
gpu_page_count_family_
=
&
prometheus
::
BuildGauge
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_gpu_page_count"
)
.
Help
(
"gpu page count"
)
.
Register
(
*
registry_
);
// 注册 append_tokens_time_ms Histogram
auto
&
append_tokens_time_ms_family
=
prometheus
::
BuildHistogram
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_append_tokens_time_ms"
)
.
Help
(
"append tokens time in milliseconds"
)
.
Register
(
*
registry_
);
append_tokens_time_ms
=
&
append_tokens_time_ms_family
.
Add
({},
common_buckets
);
// 注册 gpu_flush_back_time_ms Histogram
auto
&
gpu_flush_back_time_ms_family
=
prometheus
::
BuildHistogram
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_gpu_flush_back_time_ms"
)
.
Help
(
"gpu flush back time in milliseconds"
)
.
Register
(
*
registry_
);
gpu_flush_back_time_ms
=
&
gpu_flush_back_time_ms_family
.
Add
({},
common_buckets
);
// 注册 cpu_flush_back_time_ms Histogram
auto
&
cpu_flush_back_time_ms_family
=
prometheus
::
BuildHistogram
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_cpu_flush_back_time_ms"
)
.
Help
(
"cpu flush back time in milliseconds"
)
.
Register
(
*
registry_
);
cpu_flush_back_time_ms
=
&
cpu_flush_back_time_ms_family
.
Add
({},
common_buckets
);
exposer_
.
RegisterCollectable
(
registry_
);
}
// 析构函数
Metrics
::~
Metrics
()
{
// 停止指标暴露
// exposer_.Stop();
}
// 获取 memory_pool_size 指标
prometheus
::
Gauge
*
Metrics
::
memory_pool_size
(
const
std
::
string
&
type
)
{
return
&
memory_pool_size_family_
->
Add
({{
"type"
,
type
}});
}
// 获取 memory_pool_node_count 指标
prometheus
::
Gauge
*
Metrics
::
memory_pool_node_count
(
const
std
::
string
&
type
)
{
return
&
memory_pool_node_count_family_
->
Add
({{
"type"
,
type
}});
}
// 获取 lru_entry_count 指标
prometheus
::
Gauge
*
Metrics
::
lru_entry_count
(
const
std
::
string
&
type
)
{
return
&
lru_entry_count_family_
->
Add
({{
"type"
,
type
}});
}
// 获取 gpu_page_count 指标
prometheus
::
Gauge
*
Metrics
::
gpu_page_count
(
std
::
string
type
)
{
return
&
gpu_page_count_family_
->
Add
({{
"type"
,
type
}});
}
TimeObserver
::
TimeObserver
(
prometheus
::
Histogram
*
h
)
{
histogram_
=
h
;
timer_
.
start
();
}
TimeObserver
::~
TimeObserver
()
{
timer_
.
stop
();
histogram_
->
Observe
(
timer_
.
elapsedNs
()
/
1e6
);
// ns -> ms
}
}
// namespace kvc2
\ No newline at end of file
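A small sketch (not part of the commit) of how the TimeObserver defined above is meant to be used: it is an RAII scope timer, so the histogram receives one observation in milliseconds when the scope exits. The function name is illustrative.

// Sketch: time a lookup with the RAII observer.
#include "metrics.h"

void timed_lookup(kvc2::Metrics& met) {
  kvc2::TimeObserver observe(met.lookup_time_ms);  // starts the timer
  // ... do the lookup work here ...
}                                                  // destructor records the elapsed milliseconds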
csrc/balance_serve/kvc2/src/metrics.h (new file, 0 → 100644)
#pragma once

#include "prometheus/counter.h"
#include "prometheus/exposer.h"
#include "prometheus/gauge.h"
#include "prometheus/histogram.h"
#include "prometheus/registry.h"

#include <atomic>
#include <chrono>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include "utils/timer.hpp"

namespace kvc2 {

// Metric name prefix
#define METRIC_PREFIX "kvc2"

struct MetricsConfig {
  std::string endpoint;  // listen endpoint, e.g. "0.0.0.0:8080"
};

class Metrics {
 public:
  // The constructor takes a MetricsConfig
  Metrics(const MetricsConfig& config);
  ~Metrics();

  // Non-copyable, non-assignable
  Metrics(const Metrics&) = delete;
  Metrics& operator=(const Metrics&) = delete;

  // Metric pointers
  prometheus::Counter* prefix_nodes;
  prometheus::Counter* prefix_block_count;

  prometheus::Histogram* raw_insert_time_ms;
  prometheus::Histogram* lookup_time_ms;
  prometheus::Histogram* lookup_prefixmatch_length;
  prometheus::Histogram* matched_length_percentage;

  prometheus::Gauge* disk_usage;
  prometheus::Gauge* memory_pool_size(const std::string& type);
  prometheus::Gauge* memory_pool_node_count(const std::string& type);
  prometheus::Gauge* lru_entry_count(const std::string& type);
  prometheus::Gauge* gpu_page_count(std::string type);

  prometheus::Histogram* append_tokens_time_ms;
  prometheus::Histogram* gpu_flush_back_time_ms;
  prometheus::Histogram* cpu_flush_back_time_ms;

 private:
  std::shared_ptr<prometheus::Registry> registry_;
  prometheus::Exposer exposer_;

  prometheus::Family<prometheus::Gauge>* memory_pool_size_family_;
  prometheus::Family<prometheus::Gauge>* memory_pool_node_count_family_;
  prometheus::Family<prometheus::Gauge>* lru_entry_count_family_;
  prometheus::Family<prometheus::Gauge>* gpu_page_count_family_;
};

class TimeObserver {
 public:
  TimeObserver(prometheus::Histogram* h);
  ~TimeObserver();

 private:
  Timer timer_;
  prometheus::Histogram* histogram_;
};

}  // namespace kvc2
\ No newline at end of file
csrc/balance_serve/kvc2/src/model_config.h (new file, 0 → 100644)
#ifndef __MODEL_CONFIG_HPP_
#define __MODEL_CONFIG_HPP_

#include <iostream>
#include "nlohmann/json.hpp"

#include <filesystem>
#include <fstream>

using DimSize = size_t;
using URL = std::string;
using ModelName = std::string;

// We must assure this can be load by config.json
class ModelConfig {
 public:
  DimSize hidden_size;
  DimSize intermediate_size;
  size_t max_position_embeddings;
  std::string model_type;
  size_t num_attention_heads;
  size_t num_hidden_layers;
  size_t num_key_value_heads;
  size_t vocab_size;

  NLOHMANN_DEFINE_TYPE_INTRUSIVE(ModelConfig, hidden_size, intermediate_size, max_position_embeddings, model_type,
                                 num_attention_heads, num_hidden_layers, num_key_value_heads, vocab_size);

  void load_from(std::filesystem::path path) {
    std::ifstream i(path);
    nlohmann::json j;
    i >> j;
    *this = j.get<ModelConfig>();
  }
};

using QuantType = std::string;
static const QuantType NoQuantType = "";

class QuantConfig {
 public:
  QuantType name;
  // For GEMV
  QuantType type_of_dot_vector = NoQuantType;
  inline bool can_be_used_as_matrix() { return type_of_dot_vector != NoQuantType; }

  bool can_be_used_as_vector;

  double bytes_per_element;

  bool has_scale;
  bool has_min;

  size_t block_element_count;
  size_t block_element_size;

  URL reference = "";

  NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(QuantConfig, name, type_of_dot_vector, can_be_used_as_vector,
                                              bytes_per_element, has_scale, has_min, block_element_count,
                                              block_element_size, reference);
};

inline std::map<QuantType, QuantConfig> quant_configs;
inline std::map<ModelName, ModelConfig> model_configs;

inline void load_quant_configs(std::filesystem::path path) {
  std::cout << __FUNCTION__ << " from " << path << std::endl;
  std::ifstream i(path);
  nlohmann::json j;
  i >> j;
  quant_configs = j.get<std::map<QuantType, QuantConfig>>();
  std::cout << "Loaded Quant Configs" << std::endl;
  for (auto& [k, v] : quant_configs) {
    std::cout << " - " << k << std::endl;
  }
}

inline void dump_quant_configs(std::filesystem::path path) {
  std::ofstream o(path);
  nlohmann::json j = quant_configs;
  o << j.dump(4);
}

inline void load_model_configs(std::filesystem::path path) {
  std::cout << __FUNCTION__ << " from " << path << std::endl;
  std::ifstream i(path);
  nlohmann::json j;
  i >> j;
  model_configs = j.get<std::map<ModelName, ModelConfig>>();
  std::cout << "Loaded Model Configs" << std::endl;
  for (auto& [k, v] : model_configs) {
    std::cout << " - " << k << std::endl;
  }
}

inline void dump_model_configs(std::filesystem::path path) {
  std::ofstream o(path);
  nlohmann::json j = model_configs;
  o << j.dump(4);
}

#endif
\ No newline at end of file
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp (new file, 0 → 100644)
#include "page_aligned_memory_pool.h"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "utils/arithmetic.hpp"
#include "utils/easy_format.hpp"
/// 构造函数
PageAlignedMemoryPool
::
PageAlignedMemoryPool
(
size_t
size_in_bytes
)
{
total_size
=
(
size_in_bytes
/
PageSize
)
*
PageSize
;
// 对齐分配。C++17 对齐方式写法,如果编译器不支持可以改用其它方法
data
=
::
operator
new
[](
total_size
,
std
::
align_val_t
(
PageSize
));
total_pages
=
total_size
/
PageSize
;
assert
(
total_pages
>=
Blocks
);
page_per_block
=
total_pages
/
Blocks
;
for
(
size_t
block_index
=
0
;
block_index
<
Blocks
;
block_index
++
)
{
first_page
[
block_index
]
=
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
intptr_t
>
(
data
)
+
static_cast
<
intptr_t
>
(
block_index
)
*
page_per_block
*
PageSize
);
count_page
[
block_index
]
=
block_index
==
Blocks
-
1
?
(
total_pages
-
page_per_block
*
(
Blocks
-
1
))
:
page_per_block
;
SPDLOG_DEBUG
(
"first_page[{}] = {}, count_page[{}] = {}"
,
block_index
,
reinterpret_cast
<
intptr_t
>
(
first_page
[
block_index
])
-
reinterpret_cast
<
intptr_t
>
(
data
),
block_index
,
count_page
[
block_index
]);
bitmap
[
block_index
].
resize
(
count_page
[
block_index
],
0
);
}
SPDLOG_INFO
(
"PageAlignedMemoryPool with size {} Mbytes, {} pages"
,
total_size
/
(
1
<<
20
),
page_count
());
}
/// 析构函数
PageAlignedMemoryPool
::~
PageAlignedMemoryPool
()
{
if
(
data
)
{
// 注意:需要与分配时的对齐方式对应
::
operator
delete
[](
data
,
std
::
align_val_t
(
PageSize
));
data
=
nullptr
;
}
}
/// 返回总页数
size_t
PageAlignedMemoryPool
::
page_count
()
{
return
total_size
/
PageSize
;
}
/// 返回按整页对齐后的字节数
size_t
PageAlignedMemoryPool
::
page_padded_size
(
size_t
size
)
{
return
div_up
(
size
,
PageSize
)
*
PageSize
;
}
void
*
PageAlignedMemoryPool
::
alloc_in_block
(
size_t
block_index
,
size_t
alloc_size
)
{
std
::
lock_guard
<
std
::
mutex
>
guard
(
lock
[
block_index
]);
size_t
free_pages
=
0
;
for
(
size_t
i
=
0
;
i
<
count_page
[
block_index
];
i
++
)
{
if
(
bitmap
[
block_index
][
i
]
==
0
)
{
free_pages
++
;
if
(
free_pages
==
alloc_size
)
{
size_t
page_index
=
i
+
1
-
free_pages
;
for
(
size_t
page
=
page_index
;
page
<
page_index
+
alloc_size
;
page
++
)
{
bitmap
[
block_index
][
page
]
=
1
;
// SPDLOG_DEBUG("alloc page {} in block {}", page, block_index);
}
return
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
intptr_t
>
(
first_page
[
block_index
])
+
page_index
*
PageSize
);
}
}
else
{
free_pages
=
0
;
}
}
return
nullptr
;
}
/// 分配函数
void
*
PageAlignedMemoryPool
::
alloc
(
size_t
size
)
{
size_t
alloc_size
=
div_up
(
size
,
PageSize
);
auto
cnt
=
now_block
.
fetch_add
(
1
,
std
::
memory_order_relaxed
);
for
(
size_t
i
=
0
;
i
<
Blocks
;
i
++
)
{
auto
result
=
alloc_in_block
((
i
+
cnt
)
%
Blocks
,
alloc_size
);
if
(
result
!=
nullptr
)
{
allocated
.
fetch_add
(
alloc_size
*
PageSize
,
std
::
memory_order_relaxed
);
alloc_count
.
fetch_add
(
1
,
std
::
memory_order_relaxed
);
return
result
;
}
}
return
nullptr
;
}
/// 释放函数
void
PageAlignedMemoryPool
::
free
(
void
*
p
,
size_t
size
)
{
auto
alloc_size
=
div_up
(
size
,
PageSize
);
size_t
block_index
=
(
reinterpret_cast
<
intptr_t
>
(
p
)
-
reinterpret_cast
<
intptr_t
>
(
data
))
/
page_per_block
/
PageSize
;
size_t
page_index
=
(
reinterpret_cast
<
intptr_t
>
(
p
)
-
reinterpret_cast
<
intptr_t
>
(
first_page
[
block_index
]))
/
PageSize
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
lock
[
block_index
]);
for
(
size_t
page
=
page_index
;
page
<
page_index
+
alloc_size
;
page
++
)
bitmap
[
block_index
][
page
]
=
0
;
allocated
.
fetch_sub
(
alloc_size
*
PageSize
,
std
::
memory_order_relaxed
);
free_count
.
fetch_add
(
1
,
std
::
memory_order_relaxed
);
}
// TODO: too slow
std
::
vector
<
void
*>
PageAlignedMemoryPool
::
alloc_multiple
(
size_t
size
,
size_t
count
)
{
std
::
vector
<
void
*>
result
;
for
(
size_t
i
=
0
;
i
<
count
;
i
++
)
{
auto
p
=
alloc
(
size
);
if
(
p
==
nullptr
)
{
for
(
auto
ptr
:
result
)
{
free
(
ptr
,
size
);
}
return
{};
}
result
.
push_back
(
p
);
}
return
result
;
}
void
PageAlignedMemoryPool
::
defragment
()
{}
/// 调试打印
std
::
string
PageAlignedMemoryPool
::
debug
()
{
return
fmt
::
format
(
"PageAlignedMemoryPool: total_size: {}MB, allocated: {}, alloc/free count: {}/{}
\n
"
,
readable_number
(
total_size
),
readable_number
(
size_t
(
allocated
)),
size_t
(
alloc_count
),
size_t
(
free_count
));
}
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h (new file, 0 → 100644)
#pragma once
#include <algorithm>  // std::sort
#include <cstddef>    // size_t
#include <mutex>      // std::mutex
#include <vector>
#include <assert.h>
#include <bitset>
#include <atomic>

constexpr size_t PageSize = 4096;

/// Declaration of PageAlignedMemoryPool
struct PageAlignedMemoryPool {
 private:
  constexpr static size_t Blocks = 16;

  void* data = nullptr;
  size_t total_size = 0, total_pages = 0;

  std::atomic_size_t now_block = 0;
  std::atomic_size_t allocated = 0;  // allocated_size
  std::atomic_size_t alloc_count = 0;
  std::atomic_size_t free_count = 0;

  std::mutex lock[Blocks];
  size_t page_per_block = 0;
  void* first_page[Blocks];
  size_t count_page[Blocks];
  std::vector<int8_t> bitmap[Blocks];

  void* alloc_in_block(size_t block_index, size_t alloc_size);

 public:
  /// Constructor and destructor
  explicit PageAlignedMemoryPool(size_t size_in_bytes);
  ~PageAlignedMemoryPool();

  /// Non-copyable, non-movable
  PageAlignedMemoryPool(PageAlignedMemoryPool&& other) = delete;
  PageAlignedMemoryPool& operator=(PageAlignedMemoryPool&& other) = delete;
  PageAlignedMemoryPool(const PageAlignedMemoryPool& other) = delete;
  PageAlignedMemoryPool& operator=(const PageAlignedMemoryPool& other) = delete;

  /// Member functions
  size_t page_count();
  size_t page_padded_size(size_t size);
  void* alloc(size_t size);
  std::vector<void*> alloc_multiple(size_t size, size_t count);
  void free(void* data, size_t size);
  void defragment();
  std::string debug();
};
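A brief sketch (not part of the commit) of how this pool behaves from the caller's side: requests are rounded up to whole 4 KiB pages, allocations are all-or-nothing for alloc_multiple, and free must be given the same size that was originally requested. The sizes below are arbitrary examples.

// Sketch: basic allocate/free round trip on the page-aligned pool.
#include <cassert>
#include "page_aligned_memory_pool.h"

void pool_example() {
  PageAlignedMemoryPool pool(64ull << 20);     // 64 MiB pool, internally split into 16 blocks
  void* p = pool.alloc(10000);                 // rounded up to 3 pages (12288 bytes)
  assert(p != nullptr);
  auto many = pool.alloc_multiple(4096, 8);    // 8 single-page allocations, empty vector on failure
  for (auto* q : many) pool.free(q, 4096);
  pool.free(p, 10000);                         // free with the originally requested size
}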
csrc/balance_serve/kvc2/src/prefix.cpp (new file, 0 → 100644)
#include <immintrin.h>
#include <tbb/concurrent_hash_map.h>
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <functional>
#include <list>
#include <map>
#include <memory>
#include <mutex>
#include <nlohmann/json.hpp>
#include <optional>
#include <shared_mutex>
#include <unordered_map>
#include <vector>
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "async_store.hh"
#include "cuda_stream_manager.hh"
#include "kvc2.h"
#include "metrics.h"
#include "cache_entry.hh"
#include "gpu_cache.hh"
#include "hasher.hpp"
#include "io_helper.hpp"
#include "page_aligned_memory_pool.h"
#include "utils/arithmetic.hpp"
#include "utils/easy_format.hpp"
#include "utils/periodic_task.hpp"
namespace
kvc2
{
struct
KVC2
;
// will be set when init
TokenLength
NumTokenPerBlock
;
int
EvictCount
;
using
Layer
=
size_t
;
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE
(
CacheInfo
,
model_name
,
is_key_cache
,
quant_type
);
NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE
(
KVC2Config
,
gpu_only
,
load_from_disk
,
save_to_disk
,
path
,
config_path
,
num_token_per_page
,
memory_pool_size
,
evict_count
,
metrics_port
,
recompute_ratio
);
size_t
CacheInfo
::
hidden_layer_count
()
{
return
model_configs
.
at
(
model_name
).
num_hidden_layers
;
}
std
::
filesystem
::
path
CacheInfo
::
path
(
std
::
optional
<
size_t
>
which_layer
)
{
auto
folder
=
std
::
filesystem
::
path
(
model_name
)
/
quant_type
/
(
is_key_cache
?
"key"
:
"value"
);
if
(
which_layer
.
has_value
())
{
folder
/=
fmt
::
format
(
"layer-{}.kvc"
,
which_layer
.
value
());
}
return
folder
;
}
bool
CacheInfo
::
operator
==
(
const
CacheInfo
&
other
)
const
{
return
model_name
==
other
.
model_name
&&
is_key_cache
==
other
.
is_key_cache
&&
quant_type
==
other
.
quant_type
;
}
size_t
CacheInfo
::
element_size
(
size_t
block_length
)
{
size_t
count
=
model_configs
[
model_name
].
hidden_size
*
block_length
;
auto
&
q
=
quant_configs
[
quant_type
];
return
count
/
q
.
block_element_count
*
q
.
block_element_size
;
}
size_t
CacheInfo
::
hash_value
()
const
{
size_t
x
=
hash_seed
;
x
=
XXH64
(
model_name
.
data
(),
model_name
.
size
(),
x
);
x
=
XXH64
(
"quant_type"
,
10
,
x
);
x
=
XXH64
(
quant_type
.
data
(),
quant_type
.
size
(),
x
);
if
(
is_key_cache
)
{
x
=
XXH64
(
"key"
,
3
,
x
);
}
else
{
x
=
XXH64
(
"value"
,
5
,
x
);
}
return
x
;
}
}
// namespace kvc2
template
<
>
struct
std
::
hash
<
kvc2
::
CacheInfo
>
{
std
::
size_t
operator
()(
const
kvc2
::
CacheInfo
&
s
)
const
noexcept
{
return
s
.
hash_value
();
}
};
namespace
kvc2
{
struct
Location
{
size_t
start_idx
;
// start block index
size_t
length
;
// length of blocks
NLOHMANN_DEFINE_TYPE_INTRUSIVE
(
Location
,
start_idx
,
length
);
Location
cut_tail
(
size_t
offset_from_tail
)
{
Location
re
;
size_t
offset
=
length
-
offset_from_tail
;
re
.
start_idx
=
start_idx
+
offset
;
re
.
length
=
offset_from_tail
;
length
=
offset
;
return
re
;
}
};
struct
SegmentLocations
{
std
::
vector
<
std
::
optional
<
size_t
>>
offsets
;
void
add_location
(
size_t
start_block
,
Location
location
)
{
if
(
location
.
length
+
start_block
>
offsets
.
size
())
{
offsets
.
resize
(
location
.
length
+
start_block
,
std
::
nullopt
);
}
for
(
size_t
i
=
start_block
;
i
<
start_block
+
location
.
length
;
i
++
)
{
offsets
[
i
]
=
location
.
start_idx
+
i
-
start_block
;
}
}
void
set_location
(
size_t
start_block
,
size_t
disk_location
)
{
if
(
start_block
>=
offsets
.
size
())
{
offsets
.
resize
(
start_block
+
1
,
std
::
nullopt
);
}
offsets
[
start_block
]
=
disk_location
;
}
std
::
optional
<
size_t
>
get_idx
(
size_t
block_idx
)
const
{
if
(
block_idx
>=
offsets
.
size
())
{
return
std
::
nullopt
;
}
else
{
return
offsets
[
block_idx
];
}
}
bool
has_location
(
size_t
block_idx
,
size_t
length
)
{
for
(
size_t
i
=
block_idx
;
i
<
block_idx
+
length
;
i
++
)
{
if
(
get_idx
(
i
).
has_value
()
==
false
)
{
return
false
;
}
}
return
true
;
}
void
debug
()
{
for
(
size_t
i
=
0
;
i
<
offsets
.
size
();
++
i
)
{
if
(
offsets
[
i
].
has_value
())
{
SPDLOG_DEBUG
(
"Block {} -> Disk Location {}"
,
i
,
offsets
[
i
].
value
());
}
else
{
SPDLOG_DEBUG
(
"Block {} -> No Disk Location"
,
i
);
}
}
}
};
struct
CacheDiskLocations
{
std
::
unordered_map
<
CacheInfo
,
Location
>
location_map
;
NLOHMANN_DEFINE_TYPE_INTRUSIVE
(
CacheDiskLocations
,
location_map
);
std
::
optional
<
Location
>
get_location
(
CacheInfo
cache_info
,
TokenLength
local_ids_length
)
{
size_t
blocks_length
=
div_up
(
local_ids_length
,
NumTokenPerBlock
);
if
(
location_map
.
count
(
cache_info
)
==
0
)
{
return
std
::
nullopt
;
}
Location
re
=
location_map
[
cache_info
];
re
.
length
=
blocks_length
;
return
re
;
}
std
::
optional
<
size_t
>
get_location_of_a_block
(
CacheInfo
info
,
size_t
local_at
)
{
if
(
location_map
.
count
(
info
)
==
0
)
{
return
std
::
nullopt
;
}
auto
loc
=
location_map
[
info
];
if
(
local_at
>=
loc
.
length
)
{
return
std
::
nullopt
;
}
return
loc
.
start_idx
+
local_at
;
}
};
struct
DiskCacheAllocator
{
private:
// metadata
std
::
filesystem
::
path
path
;
CacheInfo
info
;
std
::
mutex
lock
;
size_t
now_idx
;
// store
size_t
capacity
;
std
::
vector
<
async_store
::
ArrayStore
*>
stores
;
NLOHMANN_DEFINE_TYPE_INTRUSIVE
(
DiskCacheAllocator
,
now_idx
);
void
update_capacity
()
{
capacity
=
std
::
numeric_limits
<
size_t
>::
max
();
for
(
auto
&
store
:
stores
)
{
capacity
=
std
::
min
(
capacity
,
async_store
::
capacity
(
store
));
}
}
void
extend
(
size_t
to
)
{
for
(
size_t
i
=
0
;
i
<
info
.
hidden_layer_count
();
i
++
)
{
async_store
::
extend
(
stores
[
i
],
to
);
}
update_capacity
();
}
public:
async_store
::
ArrayStore
*
get_store
(
int
i
)
{
return
stores
[
i
];
}
Location
alloc
(
size_t
block_count
)
{
std
::
lock_guard
<
std
::
mutex
>
lg
(
lock
);
Location
re
;
re
.
start_idx
=
now_idx
;
re
.
length
=
block_count
;
now_idx
+=
block_count
;
if
(
now_idx
>=
capacity
)
{
extend
(
capacity
*
2
);
}
return
re
;
}
DiskCacheAllocator
(
std
::
filesystem
::
path
path
,
CacheInfo
info
)
:
path
(
path
),
info
(
info
)
{
// SPDLOG_DEBUG("Create DiskCacheAllocator {}", path.c_str());
auto
allocator_path
=
path
/
info
.
path
();
if
(
std
::
filesystem
::
exists
(
allocator_path
)
==
false
)
{
std
::
filesystem
::
create_directories
(
allocator_path
);
}
// restore metadata later in json load
now_idx
=
0
;
for
(
size_t
i
=
0
;
i
<
info
.
hidden_layer_count
();
i
++
)
{
// SPDLOG_DEBUG("Create store {} for {}", (path / info.path(i)).c_str(),i);
auto
store
=
async_store
::
create_or_open_store
(
info
.
element_size
(
NumTokenPerBlock
),
1000
,
path
/
info
.
path
(
i
));
stores
.
push_back
(
store
);
}
update_capacity
();
}
~
DiskCacheAllocator
()
{
for
(
auto
store
:
stores
)
{
async_store
::
close_store
(
store
);
}
}
};
struct DiskCacheManager {
  KVC2Config config;
  std::mutex lock;
  std::unordered_map<CacheInfo, std::shared_ptr<DiskCacheAllocator>> allocators;

  friend void to_json(nlohmann::json& nlohmann_json_j, const DiskCacheManager& nlohmann_json_t) {
    nlohmann_json_j["config"] = nlohmann_json_t.config;
    nlohmann_json_j["allocators"] = nlohmann::json::array();
    for (auto& [info, allocator] : nlohmann_json_t.allocators) {
      nlohmann_json_j["allocators"].push_back({{"info", info}, {"allocator", *allocator}});
    }
  }

  friend void from_json(const nlohmann::json& nlohmann_json_j, DiskCacheManager& nlohmann_json_t) {
    // SPDLOG_DEBUG("Load DiskCacheManager Json");
    nlohmann_json_j.at("config").get_to(nlohmann_json_t.config);
    for (const auto& allocator_json : nlohmann_json_j.at("allocators")) {
      // SPDLOG_DEBUG("Make Allocator {}", allocator_json.dump());
      CacheInfo info;
      allocator_json.at("info").get_to(info);
      auto allocator = std::make_shared<DiskCacheAllocator>(nlohmann_json_t.config.path, info);
      allocator_json.at("allocator").get_to(*allocator);
      nlohmann_json_t.allocators[info] = allocator;
    }
  }

  DiskCacheManager(KVC2Config config) : config(config) {
    SPDLOG_INFO("DiskCacheManager root path: {}", config.path.c_str());
    if (!std::filesystem::exists(config.path)) {
      std::filesystem::create_directories(config.path);
    }
  }

  std::shared_ptr<DiskCacheAllocator> get_allocator(CacheInfo info) {
    {
      std::lock_guard<std::mutex> lg(lock);
      if (allocators.count(info) == 0) {
        allocators.emplace(info, std::make_shared<DiskCacheAllocator>(config.path, info));
      }
    }
    return allocators.at(info);
  }

  Location allocate(CacheInfo info, size_t cache_block_count) {
    auto allocator = get_allocator(info);
    return allocator->alloc(cache_block_count);
  }
};
struct Prefix {
  uint64_t prefix_id;  // 0 for nullptr, started from 1
  TokenLength start_length;
  Tokens ids;
  CacheDiskLocations locations;
  Prefix* prev = nullptr;

  // No serialization
  bool prev_set = false;

  friend void to_json(nlohmann::json& nlohmann_json_j, const Prefix& nlohmann_json_t) {
    nlohmann_json_j["prefix_id"] = nlohmann_json_t.prefix_id;
    nlohmann_json_j["start_length"] = nlohmann_json_t.start_length;
    nlohmann_json_j["ids"] = nlohmann_json_t.ids;
    if (nlohmann_json_t.prev) {
      nlohmann_json_j["prev"] = nlohmann_json_t.prev->prefix_id;
    } else {
      nlohmann_json_j["prev"] = 0;
    }
    nlohmann_json_j["locations"] = nlohmann_json_t.locations;
  }

  friend void from_json(const nlohmann::json& nlohmann_json_j, Prefix& nlohmann_json_t) {
    nlohmann_json_j.at("prefix_id").get_to(nlohmann_json_t.prefix_id);
    nlohmann_json_j.at("start_length").get_to(nlohmann_json_t.start_length);
    nlohmann_json_j.at("ids").get_to(nlohmann_json_t.ids);
    nlohmann_json_j.at("locations").get_to(nlohmann_json_t.locations);
    auto prev_id = nlohmann_json_j.at("prev").get<uint64_t>();
    nlohmann_json_t.prev = reinterpret_cast<Prefix*>(prev_id);
    nlohmann_json_t.prev_set = false;
  }

  TokenLength local_length() { return ids.size(); }

  TokenLength length() { return start_length + local_length(); }

  Tokens prefix_to(TokenLength length) {
    TokenLength local_length = length - start_length;
    Tokens re;
    if (prev) {
      re = prev->prefix_to(start_length);
    }
    re.insert(re.end(), ids.begin(), ids.begin() + local_length);
    return re;
  }

  Tokens full() { return prefix_to(length()); }

  void update_location(CacheInfo info, Location location) { locations.location_map[info] = location; }

  Prefix* to_first_prefix_without_disk_locations(CacheInfo k_info /*, CacheInfo v_info*/) {
    // just k_info
    auto now_prefix = this;
    while (now_prefix->prev != nullptr) {
      auto& prev = now_prefix->prev;
      auto k_location = prev->locations.get_location(k_info, prev->local_length());
      // auto v_location = prev->locations.get_location(v_info, prev->local_length());
      if (k_location.has_value()) {
        // assert(v_location.has_value());
        // after now_prefix, we need to insert new kv cache.
        break;
      }
      now_prefix = prev;
    }
    return now_prefix;
  }

  void hash_to_with(TokenLength length, TokensHasher& hasher) {
    TokenLength local_length = length - start_length;
    if (prev) {
      prev->hash_to_with(start_length, hasher);
    }
    hasher.update(ids.data(), local_length);
  }

  void debug() {
    fmt::print("Prefix {}, start_length: {}, local_length: {}, prev: {},\n", prefix_id, start_length, local_length(),
               (void*)prev);
  }
};
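// [Editor's note, illustrative only] Prefix nodes form a singly linked chain through `prev`; each node
// stores only the tokens past its `start_length`. prefix_to(n) therefore recurses to the parent for the
// first `start_length` tokens and appends its own slice. For example, with a chain
//   root:  start_length = 0,  ids = {t0 .. t15}
//   child: start_length = 16, ids = {t16 .. t31}
// child.prefix_to(24) yields {t0 .. t23} and child.length() == 32.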
struct PrefixMatch {
  Prefix* prefix;
  TokenLength match_length;

  std::vector<TokensHash> matched_hashes(CacheInfo info, Layer layer) {
    std::vector<TokensHash> re;
    if (prefix == nullptr)
      return re;
    TokensHasher hasher;
    hasher.reset(info.hash_value());
    hasher.update_raw(&layer, sizeof(layer));
    auto ids = prefix->prefix_to(match_length);
    for (TokenLength i = 0; i < ids.size(); i += NumTokenPerBlock) {
      TokenLength len = std::min(NumTokenPerBlock, ids.size() - i);
      re.push_back(hasher.update(ids.data() + i, len));
    }
    return re;
  }

  void collect_locations(CacheInfo info, SegmentLocations& seg_locs) {
    auto now_prefix = prefix;
    size_t length = match_length;
    while (now_prefix != nullptr) {
      TokenLength local_length = length - now_prefix->start_length;
      auto loc = now_prefix->locations.get_location(info, local_length);
      if (loc.has_value()) {
        seg_locs.add_location(now_prefix->start_length / NumTokenPerBlock, loc.value());
      }
      length = now_prefix->start_length;
      now_prefix = now_prefix->prev;
    }
  }
};
std::string to_string(const MatchStatus& status) {
  switch (status) {
    case Exact:
      return "Exact";
    case Partial:
      return "Partial";
    case NotMatchExact:
      return "NotMatchExact";
    case NotMatchPartial:
      return "NotMatchPartial";
    default:
      return "Unknown";
  }
}
struct MatchByBlock {
  // prefix, block idx at prefix, status
  std::vector<std::tuple<Prefix*, BlockLength, MatchStatus>> matches;

  bool any_match() {
    for (auto& [p, l, m] : matches) {
      if (p) {
        return true;
      }
    }
    return false;
  }

  size_t partial_count() {
    size_t re = 0;
    for (auto& [p, l, m] : matches) {
      if (m == Partial) {
        re++;
      }
    }
    return re;
  }

  bool has_partial() { return partial_count() > 0; }

  std::vector<std::optional<TokensHash>> matched_hashes(CacheInfo info, Layer layer) {
    // TODO: This function might be slow
    std::vector<std::optional<TokensHash>> re(matches.size(), std::nullopt);
    for (size_t i = 0; i < matches.size(); i++) {
      TokensHasher hasher;
      hasher.reset(info.hash_value());
      hasher.update_raw(&layer, sizeof(layer));
      auto& [p, idx, status] = matches[i];
      if (p) {
        p->hash_to_with((idx + 1) * NumTokenPerBlock, hasher);
        re[i] = hasher.get();
      }
    }
    return re;
  }

  void collect_locations(CacheInfo info, SegmentLocations& seg_locs) {
    for (size_t i = 0; i < matches.size(); i++) {
      auto& [p, idx, status] = matches[i];
      if (p) {
        auto local_at = idx - p->start_length / NumTokenPerBlock;
        seg_locs.set_location(i, p->locations.get_location_of_a_block(info, local_at).value());
      }
    }
  }

  std::string debug_string() {
    std::string re = fmt::format("{} Match: ", matches.size());
    for (auto& [p, idx, status] : matches) {
      switch (status) {
        case Exact:
          re += "E";
          break;
        case Partial:
          re += "P";
          break;
        case NotMatchExact:
          re += "N";
          break;
        case NotMatchPartial:
          re += "n";
          break;
        default:
          assert(0);
      }
    }
    return re;
  }
};
struct PrefixTree {
  std::shared_mutex rw_lock;
  std::atomic_uint64_t prefix_id_counter = 1;
  using MapT = std::unordered_map<TokensHash, std::pair<std::shared_ptr<Prefix>, BlockLength>>;
  // Prefix, start_block_idx
  MapT prefix_map;
  std::shared_ptr<Metrics> met;

  std::vector<std::shared_ptr<Prefix>> prefix_refs = {nullptr};  // 0 is nullptr

  friend void to_json(nlohmann::json& nlohmann_json_j, const PrefixTree& nlohmann_json_t) {
    nlohmann_json_j["prefix_id_counter"] = nlohmann_json_t.prefix_id_counter.load();
    nlohmann_json_j["prefix_refs"] = nlohmann::json::array();
    for (auto prefix : nlohmann_json_t.prefix_refs) {
      if (prefix == nullptr)
        continue;
      nlohmann_json_j["prefix_refs"].push_back(*prefix);
    }
  }

  friend void from_json(const nlohmann::json& nlohmann_json_j, PrefixTree& nlohmann_json_t) {
    nlohmann_json_t.prefix_id_counter = nlohmann_json_j.at("prefix_id_counter").get<uint64_t>();
    nlohmann_json_t.prefix_refs.resize(nlohmann_json_t.prefix_id_counter);
    for (size_t i = 1; i < nlohmann_json_t.prefix_id_counter; ++i) {
      auto prefix = std::make_shared<Prefix>();
      nlohmann_json_j.at("prefix_refs")[i - 1].get_to(*prefix);
      nlohmann_json_t.prefix_refs[i] = prefix;
    }
    nlohmann_json_t.init_prevs();
    nlohmann_json_t.init_map();
  }

  void init_prevs() {
    for (auto p : prefix_refs) {
      if (p) {
        if (p->prev_set == false) {
          p->prev = prefix_refs[reinterpret_cast<uint64_t>(p->prev)].get();
          p->prev_set = true;
        }
      }
    }
  }

  void init_map() {
    assert(prefix_map.empty());
    for (auto p : prefix_refs) {
      if (p == nullptr)
        continue;
      auto ids = p->full();
      for (TokenLength i = p->start_length; i < p->length(); i += NumTokenPerBlock) {
        TokenLength end = std::min(i + NumTokenPerBlock, p->length());
        assert(end % NumTokenPerBlock == 0);
        auto hash = TokensHasher::hash(ids.data(), end);
        prefix_map[hash] = {p, end / NumTokenPerBlock - 1};
      }
    }
  }

  // Look up prefix from the map, return the matched prefix and length.
  // If the prefix is not found, match contains nullptr and 0.
  PrefixMatch look_up(Token* data, TokenLength length, bool need_lock = true) {
    std::shared_lock<std::shared_mutex> sl;
    if (need_lock) {
      sl = std::shared_lock<std::shared_mutex>(rw_lock);
    }
    // TODO: prefix cache
  }

  PrefixMatch look_up_or_insert(Token* data, TokenLength length) {
    std::unique_lock<std::shared_mutex> ul(rw_lock);
    auto match = look_up(data, length, false);
    if (match.match_length == length) {
      return match;
    }
    auto new_prefix = new_prefix_node(match.prefix, match.match_length, data, length, false);
    PrefixMatch re;
    re.prefix = new_prefix.get();
    re.match_length = length;
    return re;
  }

  std::shared_ptr<Prefix> new_prefix_node(Prefix* prev, TokenLength prev_match_length, Token* data, TokenLength length,
                                          bool need_lock = true) {
    std::unique_lock<std::shared_mutex> ul;
    if (need_lock)
      ul = std::unique_lock<std::shared_mutex>(rw_lock);
    auto new_prefix = std::make_shared<Prefix>();
    new_prefix->prefix_id = prefix_id_counter.fetch_add(1);
    new_prefix->start_length = prev_match_length;
    new_prefix->ids = Tokens(data + prev_match_length, data + length);
    new_prefix->prev = prev;
    new_prefix->prev_set = true;
    prefix_refs.push_back(new_prefix);
    met->prefix_nodes->Increment();
    met->prefix_block_count->Increment(div_up(length - prev_match_length, NumTokenPerBlock));
    assert(prefix_refs.size() == prefix_id_counter.load());
    TokensHasher hasher;
    hasher.update(data, prev_match_length);
    for (TokenLength i = prev_match_length; i < length; i += NumTokenPerBlock) {
      TokenLength len = std::min(NumTokenPerBlock, length - i);
      auto hash = hasher.update(data + i, len);
      prefix_map[hash] = {new_prefix, i / NumTokenPerBlock};
    }
    return new_prefix;
  }

  void debug() {
    fmt::print("PrefixTree with {} prefixes, prefix counter: {}\n", prefix_map.size(), prefix_id_counter.load());
    for (auto& [hash, prefix] : prefix_map) {
      fmt::print("Hash: {:016x}, start block {}\n", hash, prefix.second);
      prefix.first->debug();
    }
  }
};
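// [Editor's note, illustrative only] prefix_map is keyed by a rolling hash over whole blocks: for a
// prefix covering tokens [start_length, length), one entry is added per NumTokenPerBlock-sized chunk,
// mapping hash(tokens[0 .. end)) -> {prefix, end / NumTokenPerBlock - 1}. A sketch of the lookup this
// enables (the actual look_up body above is still a TODO), assuming NumTokenPerBlock == 16:
//
//   TokensHasher hasher;
//   auto h16 = hasher.update(data, 16);       // hash of the first full block
//   auto it = prefix_map.find(h16);           // hit -> {prefix covering block 0, block index 0}
//   auto h32 = hasher.update(data + 16, 16);  // extend the same hasher to probe the next block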
size_t locations_blocks_count(const std::vector<Location>& locations) {
  size_t re = 0;
  for (auto& loc : locations) {
    re += loc.length;
  }
  return re;
}
struct DoubleCacheHandle : public DoubleCacheHandleInterface {
  ModelName model_name;
  QuantType quant_type;
  bool is_k_cache_on;
  bool is_v_cache_on;

  CacheInfo k_info() {
    if (is_k_cache_on == false) {
      SPDLOG_WARN("Get K CacheInfo, but K Cache is off");
    }
    return CacheInfo{
        .model_name = model_name,
        .is_key_cache = true,
        .quant_type = quant_type,
    };
  }

  CacheInfo v_info() {
    if (is_v_cache_on == false) {
      SPDLOG_WARN("Get V CacheInfo, but V Cache is off");
    }
    return CacheInfo{
        .model_name = model_name,
        .is_key_cache = false,
        .quant_type = quant_type,
    };
  }
  Tokens ids;
  TokenLength estimated_length;

  bool enable_alt = false;
  PrefixMatch match;
  // MatchByBlock match_by_blocks;

  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> k_cache_handles;
  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> v_cache_handles;
  SegmentLocations k_seg_locs;
  SegmentLocations v_seg_locs;

  KVC2* kvc2_top;

  // for Cache Fusion
  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> attatched_cache_handles;

  std::unique_ptr<CacheBlockEntryCollector> cpu_releaser = nullptr, gpu_releaser = nullptr;

  std::vector<size_t> gpu_only_block_idx;

  virtual ~DoubleCacheHandle();

  // interface
  TokenLength matched_length() override {
    if (enable_alt) {
      assert(0);
    } else {
      return match.match_length;
    }
  }

  MatchStatus status_at(BlockLength i) {
    assert(i < div_up(estimated_length, NumTokenPerBlock));
    if (enable_alt) {
      assert(false);
      // if (i >= match_by_blocks.matches.size()) {
      //   return match_by_blocks.has_partial() ? MatchStatus::NotMatchPartial : MatchStatus::NotMatchExact;
      // }
      // return std::get<2>(match_by_blocks.matches[i]);
    } else {
      if (i < match.match_length / NumTokenPerBlock) {
        return MatchStatus::Exact;
      } else {
        return MatchStatus::NotMatchExact;
      }
    }
  }

  std::vector<MatchStatus> matched_status() override { assert(false); }

  bool any_match() {
    if (enable_alt) {
      assert(false);
      // return match_by_blocks.any_match();
    } else {
      return match.prefix != nullptr;
    }
  }

  BlockLength match_range_length() {
    if (enable_alt) {
      assert(false);
      // return match_by_blocks.matches.size();
    } else {
      return div_up(match.match_length, NumTokenPerBlock);
    }
  }

  std::vector<layer_data> handle_data(bool is_key_cache) override { return export_raw_pointers(is_key_cache); }

  bool to_gpu() override;
  void to_gpu_async(std::function<void(bool)> call_back) override;
  std::vector<size_t> get_gpu_block_idx() override;

  bool alloc_attached_blocks(BlockLength count);
  std::vector<size_t> get_gpu_attached_block_idx() override;

  void append_tokens(Token* tokens, TokenLength length) override;

  void debug() override {}

  void set_cache_info(ModelName model_name, QuantType quant_type, bool turn_on_k_cache, bool turn_on_v_cache) {
    this->model_name = model_name;
    this->quant_type = quant_type;
    if (turn_on_k_cache) {
      is_k_cache_on = true;
      k_cache_handles.resize(k_info().hidden_layer_count());
    } else {
      is_k_cache_on = false;
      k_cache_handles.clear();
    }
    if (turn_on_v_cache) {
      is_v_cache_on = true;
      v_cache_handles.resize(v_info().hidden_layer_count());
    } else {
      is_v_cache_on = false;
      v_cache_handles.clear();
    }
  }

  void check_before_insert() {
    std::optional<size_t> blocks_count = std::nullopt;
    auto check_single_cache = [&blocks_count](CacheInfo cache_info,
                                              std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers,
                                              Tokens& ids) {
      for (size_t i = 0; i < cache_info.hidden_layer_count(); i++) {
        auto& layer = layers[i];
        if (blocks_count.has_value() == false) {
          blocks_count = layer.size();
        } else {
          if (blocks_count.value() != layer.size()) {
            SPDLOG_ERROR("Layer {} has different block count", i);
            throw std::runtime_error("Layer has different block count");
          }
        }
      }
      if (blocks_count.has_value()) {
        if (blocks_count.value() != div_up(ids.size(), NumTokenPerBlock)) {
          SPDLOG_ERROR("Block count not match, ids: {}, blocks: {}", ids.size(), blocks_count.value());
          throw std::runtime_error("Block count not match");
        }
      }
    };
    if (is_k_cache_on)
      check_single_cache(k_info(), k_cache_handles, ids);
    if (is_v_cache_on)
      check_single_cache(v_info(), v_cache_handles, ids);
  }

  template <typename Fn>
  void for_all_cache_block_entry(Fn f) {
    if (is_k_cache_on) {
      for (auto& layer : k_cache_handles) {
        for (auto& block : layer) {
          if (f(block) == false)
            return;
        }
      }
    }
    if (is_v_cache_on) {
      for (auto& layer : v_cache_handles) {
        for (auto& block : layer) {
          if (f(block) == false)
            return;
        }
      }
    }
  }

  // concurrent check ok
  bool alloc_on_cpu() {
    assert(cpu_releaser == nullptr);
    std::unique_ptr<CacheBlockEntryCollector> releaser =
        std::make_unique<CacheBlockEntryCollector>([](CacheBlockEntry* entry) {
          auto lg = entry->lock_guard();
          entry->cpu_cc.ref_count.fetch_sub(1);
        });
    bool ok = true;
    for_all_cache_block_entry([&ok, &releaser](std::shared_ptr<CacheBlockEntry>& block_entry) {
      if (block_entry->inc_ref_or_alloc_on_cpu() == false) {
        ok = false;
        return false;
      } else {
        releaser->entries.push_back(block_entry.get());
      }
      return true;
    });
    if (ok) {
      cpu_releaser = std::move(releaser);
    }
    return ok;
  }

  bool alloc_on_gpu_cols() {
    assert(is_k_cache_on);
    assert(gpu_releaser == nullptr);
    std::unique_ptr<CacheBlockEntryCollector> releaser =
        std::make_unique<CacheBlockEntryCollector>([](CacheBlockEntry* entry) {
          auto lg = entry->lock_guard();
          entry->gpu_cc.ref_count.fetch_sub(1);
        });
    GPUPageCache* gpu_cache = k_cache_handles[0][0]->manager->gpu_cache.get();
    gpu_cache->background_flush_back->wakeUpWait();
    bool ok = true;
    size_t want_count = 0;
    for (size_t i = 0; i < k_cache_handles[0].size(); i++) {
      auto lg = k_cache_handles[0][i]->lock_guard();
      if (k_cache_handles[0][i]->gpu_block_idx.has_value() == false) {
        want_count += 1;
        if (gpu_cache->alloc_col(k_cache_handles, v_cache_handles, i) == false) {
          ok = false;
          break;
        }
      }
      k_cache_handles[0][i]->gpu_cc.ref_count.fetch_add(1);
      releaser->entries.push_back(k_cache_handles[0][i].get());
    }
    if (ok == false) {
      SPDLOG_WARN("Handle cannot allocate {} gpu pages", want_count);
    } else {
      gpu_releaser = std::move(releaser);
    }
    return ok;
  }

  static void segment_io_layer(async_store::IODealer* dealer, IO_Helper<CacheBlockEntry>& io_helper,
                               async_store::ArrayStore* store,
                               std::vector<std::shared_ptr<CacheBlockEntry>>& layer_entries, size_t block_start,
                               size_t length, Layer layer, const SegmentLocations& locations, IOOption option) {
    SPDLOG_TRACE("{} [{}:{}) blocks to/from disk", to_string(option), block_start, block_start + length);
    for (size_t i = block_start; i < block_start + length; i++) {
      if (locations.get_idx(i).has_value()) {
        SPDLOG_TRACE("Location for block {}, {}", i, locations.get_idx(i).value());
        layer_entries[i]->io_with(dealer, io_helper, store, layer, locations.get_idx(i).value(), option);
      }
    }
  }

  std::shared_ptr<IO_Helper<CacheBlockEntry>> segment_io(async_store::IODealer* dealer, DiskCacheManager* manager,
                                                         BlockLength block_start, BlockLength length,
                                                         IOOption option) {
    auto io_helper = std::make_shared<IO_Helper<CacheBlockEntry>>([option](CacheBlockEntry* b) {
      switch (option) {
        case IO_ForceRead:
          break;
        case IO_ForceWrite:
          break;
        case IO_Read: {
          b->cpu_cc.tc.set_has_data();
          break;
        }
        case IO_Write:
          break;
        default:
          assert(0);
      }
    });
    auto single_segment_io = [dealer, manager, block_start, length, option, io_helper](
                                 CacheInfo info, SegmentLocations& seg_locs,
                                 std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers) {
      assert(layers[0].size() >= block_start + length);
      auto allocator = manager->get_allocator(info);
      for (size_t l = 0; l < info.hidden_layer_count(); l++) {
        segment_io_layer(dealer, *io_helper, allocator->get_store(l), layers[l], block_start, length, l, seg_locs,
                         option);
      }
    };
    if (is_k_cache_on)
      single_segment_io(k_info(), k_seg_locs, k_cache_handles);
    if (is_v_cache_on)
      single_segment_io(v_info(), v_seg_locs, v_cache_handles);
    io_helper->finish_add_taks();
    SPDLOG_DEBUG("Segment IO Submitted, total task count {}", io_helper->total_task_count);
    return io_helper;
  }

  std::shared_ptr<IO_Helper<CacheBlockEntry>> gpu_io(GPUPageCache* gpu_cache, BlockLength block_start,
                                                     BlockLength length, IOOption option) {
    auto io_helper = std::make_shared<IO_Helper<CacheBlockEntry>>([option](CacheBlockEntry* b) {
      switch (option) {
        case IO_ForceRead:
          break;
        case IO_ForceWrite:
          break;
        case IO_Read: {
          b->gpu_cc.tc.set_has_data();
          break;
        }
        case IO_Write:
          break;
        default:
          assert(0);
      }
    });
    cudaMemcpyKind direction;
    if (option == IO_Read || option == IO_ForceRead) {
      direction = cudaMemcpyHostToDevice;
    }
    if (option == IO_Write || option == IO_ForceWrite) {
      direction = cudaMemcpyDeviceToHost;
    }
    auto reqs = gpu_cache->basic_request(direction, [io_helper]() { io_helper->batch_promise.set(); });
    for (size_t i = block_start; i < length; i++) {
      auto status = status_at(i);
      if (status == NotMatchExact || status == NotMatchPartial) {
        SPDLOG_DEBUG("GPU: Col Handle not match (Skipped by Alt Match)");
        continue;
      }
      auto ptr = k_cache_handles[0][i].get();
      switch (option) {
        case IO_Read: {
          if (io_helper->absorb_tc(ptr, ptr->gpu_cc.tc) == false) {
            // SPDLOG_DEBUG("GPU: Col Handle need me to wait");
            continue;
          }
          break;
        }
        case IO_ForceRead: {
          break;
        }
        case IO_ForceWrite: {
          break;
        }
        case IO_Write: {
          break;
        }
        default: {
          assert(0);
        }
      }
      SPDLOG_DEBUG("GPU: Col Handle needs me to transfer");
      gpu_cache->append_col_to_request(reqs, k_cache_handles, v_cache_handles, i);
    }
    io_helper->new_task(reqs.size());
    gpu_cache->submit_requests(reqs);
    io_helper->finish_add_taks();
    return io_helper;
  }

  // void set_raw_handles(const std::vector<layer_data>& k, const std::vector<layer_data>& v) {
  //   set_raw_handles(true, k);
  //   set_raw_handles(false, v);
  // }

  void set_raw_handles(bool is_key_cache, const std::vector<layer_data>& layer_data) {
    auto single_set_raw_handles = [layer_data](CacheInfo info,
                                               std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& handles) {
      handles.resize(layer_data.size());
      for (size_t i = 0; i < info.hidden_layer_count(); i++) {
        auto& layer = layer_data[i];
        handles[i].clear();
        for (auto& block_data : layer) {
          auto handle = std::make_shared<CacheBlockEntry>();
          handle->data = reinterpret_cast<void*>(block_data);
          handle->size = info.element_size(NumTokenPerBlock);
          handles[i].push_back(handle);
        }
      }
    };
    if (is_key_cache) {
      is_k_cache_on = true;
      single_set_raw_handles(k_info(), k_cache_handles);
    } else {
      is_v_cache_on = true;
      single_set_raw_handles(v_info(), v_cache_handles);
    }
  }

  std::vector<layer_data> export_raw_pointers(bool is_key_cache) {
    std::vector<layer_data> re;
    auto single_export_raw_pointers = [&re](std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers) {
      for (auto& layer_handle : layers) {
        layer_data layer;
        for (size_t i = 0; i < layer_handle.size(); i++) {
          auto block = layer_handle.at(i);
          layer.push_back(reinterpret_cast<data_block_ptr>(block->data));
        }
        re.push_back(layer);
      }
    };
    if (is_key_cache) {
      if (is_k_cache_on == false) {
        SPDLOG_WARN("Export K Cache, but K Cache is off");
      }
      single_export_raw_pointers(k_cache_handles);
    } else {
      if (is_v_cache_on == false) {
        SPDLOG_WARN("Export V Cache, but V Cache is off");
      }
      single_export_raw_pointers(v_cache_handles);
    }
    return re;
  }

  void get_handles();
  void get_empty_handles();

  void collect_locations() {
    if (enable_alt) {
      assert(false);
      // match_by_blocks.collect_locations(k_info(), k_seg_locs);
      // match_by_blocks.collect_locations(v_info(), v_seg_locs);
    } else {
      if (is_k_cache_on)
        match.collect_locations(k_info(), k_seg_locs);
      if (is_v_cache_on)
        match.collect_locations(v_info(), v_seg_locs);
    }
    if (is_k_cache_on)
      k_seg_locs.debug();
    // v_seg_locs.debug();
  }
};
struct KVC2 : KVC2Interface {
  KVC2Config config;
  std::shared_ptr<Metrics> met;

  std::filesystem::path root;
  std::unique_ptr<PrefixTree> tree;
  std::unique_ptr<DiskCacheManager> disk_cache;

  std::shared_ptr<PageAlignedMemoryPool> memory_pool;
  std::unique_ptr<CacheEntryManager> cache_manager;
  std::unique_ptr<async_store::IODealer> io_dealer;

  std::shared_ptr<GPUPageCache> gpu_cache;

 public:
  void load() override {
    load_quant_configs(root / "quant_configs.json");
    load_model_configs(root / "model_configs.json");
    {
      auto where = root / "tree.json";
      if (std::filesystem::exists(where)) {
        nlohmann::json j;
        std::ifstream i(where);
        i >> j;
        j.get_to(*tree);
        SPDLOG_WARN("Loaded from {}", where.c_str());
      }
    }
    {
      auto where = root / "disk_cache.json";
      if (std::filesystem::exists(where)) {
        nlohmann::json j;
        std::ifstream i(where);
        i >> j;
        j.get_to(*disk_cache);
        SPDLOG_WARN("Loaded from {}", where.c_str());
      }
    }
    {
      auto where = root / "config.json";
      if (std::filesystem::exists(where)) {
        nlohmann::json j;
        std::ifstream i(where);
        i >> j;
        j.get_to(config);
        SPDLOG_WARN("Loaded from {}", where.c_str());
      }
    }
  }

  void save() override {
    if (config.save_to_disk == false) {
      return;
    }
    flush_back();
    {
      nlohmann::json j;
      j = *tree;
      auto where = root / "tree.json";
      std::ofstream o(where);
      o << j;
      SPDLOG_WARN("Serialized to {}", where.c_str());
    }
    {
      nlohmann::json j;
      j = *disk_cache;
      auto where = root / "disk_cache.json";
      std::ofstream o(where);
      o << j;
      SPDLOG_WARN("Serialized to {}", where.c_str());
    }
    {
      nlohmann::json j;
      j = config;
      auto where = root / "config.json";
      std::ofstream o(where);
      o << j;
      SPDLOG_WARN("Serialized to {}", where.c_str());
    }
    dump_quant_configs(root / "quant_configs.json");
    dump_model_configs(root / "model_configs.json");
  }

  void raw_insert(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                  const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) override {
    TimeObserver time_observer(met->raw_insert_time_ms);
    SPDLOG_INFO("Raw Insert");
    if (length % NumTokenPerBlock != 0) {
      SPDLOG_WARN(
          "Try to insert tokens with length {}, which is not a multiple of NumTokenPerBlock({}), getting floor",
          length, NumTokenPerBlock);
      length = length / NumTokenPerBlock * NumTokenPerBlock;
    }
    auto h = std::make_shared<DoubleCacheHandle>();
    h->kvc2_top = this;
    h->set_cache_info(model_name, quant_type, config.k_cache_on, config.v_cache_on);
    h->ids = Tokens(id, id + length);
    if (config.k_cache_on)
      h->set_raw_handles(true, k_cache);
    if (config.v_cache_on)
      h->set_raw_handles(false, v_cache);
    h->check_before_insert();

    h->match = tree->look_up_or_insert(id, length);
    auto now_prefix = h->match.prefix;
    assert(config.k_cache_on);
    if (now_prefix->locations.get_location(h->k_info(), length - now_prefix->start_length).has_value()) {
      assert(now_prefix->locations.get_location(h->v_info(), length - now_prefix->start_length).has_value());
      SPDLOG_INFO("KV Cache Already on disk");
      // already on disk
    } else {
      now_prefix = now_prefix->to_first_prefix_without_disk_locations(h->k_info());
      // insert new kv cache locations
      TokenLength new_length = length - now_prefix->start_length;
      SPDLOG_DEBUG("Inserting new kv cache, length: {}", new_length);
      assert(new_length > 0);
      if (config.v_cache_on) {
        // allocate a big space on disk
        auto k_loc = disk_cache->allocate(h->k_info(), div_up(new_length, NumTokenPerBlock));
        auto v_loc = disk_cache->allocate(h->v_info(), div_up(new_length, NumTokenPerBlock));
        h->k_seg_locs.add_location(now_prefix->start_length / NumTokenPerBlock, k_loc);
        h->v_seg_locs.add_location(now_prefix->start_length / NumTokenPerBlock, v_loc);
        // split it to prefix trees
        for (auto tail = h->match.prefix; tail != now_prefix->prev; tail = tail->prev) {
          TokenLength local_ids_length = tail->local_length();
          tail->update_location(h->k_info(), k_loc.cut_tail(div_up(local_ids_length, NumTokenPerBlock)));
          tail->update_location(h->v_info(), v_loc.cut_tail(div_up(local_ids_length, NumTokenPerBlock)));
        }
        assert(k_loc.length == 0);
        assert(v_loc.length == 0);
      } else {
        // allocate a big space on disk
        auto k_loc = disk_cache->allocate(h->k_info(), div_up(new_length, NumTokenPerBlock));
        h->k_seg_locs.add_location(now_prefix->start_length / NumTokenPerBlock, k_loc);
        // split it to prefix trees
        for (auto tail = h->match.prefix; tail != now_prefix->prev; tail = tail->prev) {
          TokenLength local_ids_length = tail->local_length();
          tail->update_location(h->k_info(), k_loc.cut_tail(div_up(local_ids_length, NumTokenPerBlock)));
        }
        assert(k_loc.length == 0);
      }

      // write new kv cache
      auto disk_io_helper =
          h->segment_io(io_dealer.get(), disk_cache.get(), now_prefix->start_length / NumTokenPerBlock,
                        div_up(new_length, NumTokenPerBlock), IO_ForceWrite);
      disk_io_helper->wait();
    }
  }

  TokenLength raw_read(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                       const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) override {
    SPDLOG_INFO("Raw Read");
    auto h = std::make_shared<DoubleCacheHandle>();
    h->kvc2_top = this;
    h->set_cache_info(model_name, quant_type, config.k_cache_on, config.v_cache_on);
    h->ids = Tokens(id, id + length);
    if (config.k_cache_on)
      h->set_raw_handles(true, k_cache);
    if (config.v_cache_on)
      h->set_raw_handles(false, v_cache);

    h->match = tree->look_up(id, length);
    if (h->match.prefix == nullptr) {
      SPDLOG_INFO("Not Found");
      return 0;
    }
    SPDLOG_DEBUG("Found {}", h->match.match_length);
    h->collect_locations();
    auto disk_io_helper = h->segment_io(io_dealer.get(), disk_cache.get(), 0,
                                        div_up(h->match.match_length, NumTokenPerBlock), IO_ForceRead);
    disk_io_helper->wait();
    return h->match.match_length;
  }

  std::shared_ptr<DoubleCacheHandleInterface> lookup(ModelName model_name, QuantType quant_type, Token* id,
                                                     TokenLength length, TokenLength estimated_length) override {
    TimeObserver time_observer(met->lookup_time_ms);
    auto re = std::make_shared<DoubleCacheHandle>();
    re->set_cache_info(model_name, quant_type, config.k_cache_on, config.v_cache_on);
    re->ids = Tokens(id, id + length);
    re->estimated_length = estimated_length;
    re->kvc2_top = this;
    SPDLOG_DEBUG("Lookup TokenLength {}", length);
    if (config.gpu_only == false) {
      // TODO:
    }
    return re;
  }

  std::shared_ptr<DoubleCacheHandleInterface> lookup_to_gpu(ModelName model_name, QuantType quant_type, Token* id,
                                                            size_t length, size_t estimated_length) override {
    std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
    lookup_to_gpu_async(model_name, quant_type, id, length, estimated_length, [&p](auto re) { p.set_value(re); });
    return p.get_future().get();
  }

  void lookup_to_gpu_async(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                           TokenLength estimated_length,
                           std::function<void(std::shared_ptr<DoubleCacheHandleInterface>)> call_back) override {
    auto re = lookup(model_name, quant_type, id, length, estimated_length);
    if (re == nullptr) {
      call_back(nullptr);
      return;
    }
    auto h = static_cast<DoubleCacheHandle*>(re.get());
    if (config.gpu_only) {
      auto total_block_count = div_up(estimated_length, NumTokenPerBlock);
      h->gpu_only_block_idx = gpu_cache->gpu_only_alloc_col(total_block_count);
      if (h->gpu_only_block_idx.empty()) {
        call_back(nullptr);
      } else {
        call_back(re);
      }
    } else {
      if (h->k_info().hidden_layer_count() != gpu_cache->config.layer_count) {
        SPDLOG_ERROR("GPU Cache Layer Count not match");
        assert(false);
      }
      if (h->alloc_on_gpu_cols() == false) {
        call_back(nullptr);
        return;
      }
      h->to_gpu_async([call_back, re](bool ok) {
        if (ok) {
          call_back(re);
        } else {
          call_back(nullptr);
        }
      });
    }
  }

  std::pair<std::vector<torch::Tensor>, std::vector<torch::Tensor>> get_kvcache() override {
    return {gpu_cache->k_cache, gpu_cache->v_cache};
  }

  void flush_back() {
    gpu_cache->background_flush_back->wakeUpWait();
    cache_manager->background_flush_back->wakeUpWait();
  }

  void debug() override {
    cache_manager->debug();
    tree->debug();
  }

  virtual ~KVC2() { flush_back(); }

  KVC2(KVC2Config config) : config(config) {
    SPDLOG_INFO("Creating KVC2 using these config");
    SPDLOG_INFO(" GPU Only: {}", config.gpu_only);
    SPDLOG_INFO(" Load: {}, Save: {}", config.load_from_disk, config.save_to_disk);
    SPDLOG_INFO(" Path: {}", config.path);
    SPDLOG_INFO(" Config Path: {}", config.config_path);
    SPDLOG_INFO(" Num Token/Page: {}, Memory Pool Size: {}", config.num_token_per_page,
                readable_number(config.memory_pool_size));
    SPDLOG_INFO(" Evict Count: {}, Metrics Port: {}", config.evict_count, config.metrics_port);
    SPDLOG_INFO(" Recompute Ratio: {:.2f}", config.recompute_ratio);
    if (config.gpu_cache_config) {
      const auto& gpu_config = *config.gpu_cache_config;
      SPDLOG_INFO(" GPU Devices: {}", format_vector(gpu_config.gpu_devices_id));
      SPDLOG_INFO(" Layer Count: {}, Total KVCache Pages: {}", gpu_config.layer_count,
                  gpu_config.total_kvcache_pages);
      SPDLOG_INFO(" Num Token/Page: {}, Num K Heads: {}", gpu_config.num_token_per_page, gpu_config.num_k_heads);
      SPDLOG_INFO(" K Head Dim: {}, Tensor Type: {}", gpu_config.k_head_dim,
                  static_cast<int>(gpu_config.tensor_type));
      SPDLOG_INFO(" MemcpyCudaStreams/Device: {}", gpu_config.num_streams_per_device);
    } else {
      SPDLOG_INFO(" GPU Cache Config: None");
    }

    load_model_configs(config.config_path + "/model_configs.json");
    load_quant_configs(config.config_path + "/quant_configs.json");

    // met
    MetricsConfig met_conf;
    met_conf.endpoint = "0.0.0.0:" + std::to_string(config.metrics_port);
    SPDLOG_INFO("Creating kvc2 metrics exporter on {}", met_conf.endpoint);
    met = std::make_shared<Metrics>(met_conf);

    if (config.gpu_only == false) {
      if (config.k_cache_on == false) {
        SPDLOG_ERROR("if k_cache_on is false, gpu_only must be true");
        assert(false);
      }
      root = config.path;
      tree = std::make_unique<PrefixTree>();
      disk_cache = std::make_unique<DiskCacheManager>(config);
      memory_pool = std::make_shared<PageAlignedMemoryPool>(config.memory_pool_size);
      cache_manager = std::unique_ptr<CacheEntryManager>(
          new CacheEntryManager(CacheEntryManagerConfig{.evict_count = config.evict_count, .kvc2_top = this}));
      cache_manager->pool = memory_pool;
      io_dealer = std::make_unique<async_store::IODealer>();
      io_dealer->start_io_thread().detach();
      tree->met = met;

      if (config.gpu_cache_config.has_value()) {
        gpu_cache = std::make_shared<GPUPageCache>(config.gpu_cache_config.value());
        cache_manager->gpu_cache = gpu_cache;
      }
      cache_manager->cpu_background_flush();
      gpu_cache->gpu_background_flush();
    } else {
      SPDLOG_CRITICAL("GPU ONLY MODE, NO PREFIX CACHE");
      gpu_cache = std::make_shared<GPUPageCache>(config.gpu_cache_config.value());
    }
  }
};
std::shared_ptr<KVC2Interface> create_kvc2(KVC2Config config) {
  NumTokenPerBlock = config.num_token_per_page;
  EvictCount = config.evict_count;
  // SPDLOG_WARN("Sizeof KVC2Config {} here", sizeof(KVC2Config));
  return std::make_shared<KVC2>(config);
}
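// [Editor's note, illustrative only] A hedged end-to-end sketch of the public entry points defined
// above; the field names come from this file, while `model`, `quant`, `tokens`, `n_tokens` and
// `n_estimated` are placeholder values.
//
//   KVC2Config cfg;
//   cfg.path = "/tmp/kvc2";          // disk cache root
//   cfg.num_token_per_page = 16;     // becomes the global NumTokenPerBlock
//   cfg.gpu_only = false;
//   auto kvc2 = create_kvc2(cfg);
//   kvc2->load();                    // restores tree.json / disk_cache.json if present
//   auto handle = kvc2->lookup_to_gpu(model, quant, tokens, n_tokens, n_estimated);
//   if (handle) { auto idx = handle->get_gpu_block_idx(); }
//   kvc2->save();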
DoubleCacheHandle::~DoubleCacheHandle() {
  if (kvc2_top->config.gpu_only) {
    kvc2_top->gpu_cache->gpu_only_free_cols(gpu_only_block_idx);
  } else {
    for_all_cache_block_entry([](std::shared_ptr<CacheBlockEntry>& block_entry) {
      block_entry->lock_guard();
      if (block_entry->with_key == false && block_entry->data != nullptr) {
        block_entry->free_on_cpu();
      }
      return true;
    });
  }
}
void DoubleCacheHandle::get_handles() {
  size_t new_count = 0, total_count = 0;
  auto get_info_handles = [this, &new_count, &total_count](
                              CacheInfo info, std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers) {
    auto total_block_count = div_up(estimated_length, NumTokenPerBlock);
    for (size_t l = 0; l < info.hidden_layer_count(); l++) {
      auto hashes = match.matched_hashes(info, l);
      layers[l].resize(total_block_count, nullptr);
      for (size_t i = 0; i < total_block_count; i++) {
        std::optional<CacheEntryManager::Key> key = std::nullopt;
        if (i < hashes.size())
          key = hashes[i];
        bool is_new;
        total_count += 1;
        layers[l][i] = this->kvc2_top->cache_manager->get(is_new, info.element_size(NumTokenPerBlock), key);
        if (is_new)
          new_count += 1;
        layers[l][i]->cache_info = info;
        layers[l][i]->layer = l;
      }
    }
  };
  if (kvc2_top->config.k_cache_on)
    get_info_handles(k_info(), k_cache_handles);
  if (kvc2_top->config.v_cache_on)
    get_info_handles(v_info(), v_cache_handles);
  SPDLOG_INFO("New Handles: {}/{}", new_count, total_count);
}
bool DoubleCacheHandle::to_gpu() {
  std::promise<bool> p;
  to_gpu_async([&p](bool ok) { p.set_value(ok); });
  return p.get_future().get();
}
void DoubleCacheHandle::to_gpu_async(std::function<void(bool)> call_back) {
  if (enable_alt) {
    assert(false);
    // size_t page_size = kvc2_top->config.num_token_per_page;
    // BlockLength count =
    //     div_up(TokenLength(std::ceil(match_by_blocks.partial_count() * page_size *
    //                                  kvc2_top->config.recompute_ratio)),
    //            page_size);
    // if (alloc_attached_blocks(count) == false) {
    //   SPDLOG_WARN("Cannot allocate attached GPU block");
    //   call_back(false);
    //   return;
    // } else {
    //   SPDLOG_INFO("Allocated {} attached GPU blocks", count);
    // }
  }
  // don't wait here
  if (any_match() == false) {
    SPDLOG_INFO("No match, No need to load to gpu");
    call_back(true);
    return;
  }
  auto gpu_io_helper = gpu_io(kvc2_top->gpu_cache.get(), 0, match_range_length(), IO_Read);
  gpu_io_helper->call_back = [call_back]() { call_back(true); };
  // Ok this is very stupid, but I have to do this for now
  std::thread([gpu_io_helper]() { gpu_io_helper->wait(); }).detach();
}
bool DoubleCacheHandle::alloc_attached_blocks(BlockLength count) {
  // attached_vertical_handles.resize(count);
  // for (size_t i = 0; i < count; i++) {
  //   attached_vertical_handles[i] = std::shared_ptr<DoubleVerticalBlocksHandle>(new DoubleVerticalBlocksHandle);
  //   attached_vertical_handles[i]->gpu_only = true;
  // }
  // return kvc2_top->gpu_cache->alloc_pages(attached_vertical_handles);
  return true;
}

std::vector<size_t> DoubleCacheHandle::get_gpu_attached_block_idx() {
  std::vector<size_t> re;
  // for (auto& h : attached_vertical_handles) {
  //   re.push_back(h->gpu_block_idx.value());
  // }
  return re;
}
void CacheBlockEntry::set_key(TokensHash key, std::shared_ptr<CacheBlockEntry> me) {
  assert(with_key == false);
  with_key = true;
  hash = key;
  // SPDLOG_DEBUG("Insert New Gen KVCache, key {}", key);
  std::lock_guard<std::mutex> manager_lg(manager->lock);
  if (manager->key_entry_map.contains(me->hash)) {
    SPDLOG_WARN("Duplicate key {}", me->hash);
  } else {
    manager->insert(me);
  }
}
std::vector<size_t> DoubleCacheHandle::get_gpu_block_idx() {
  if (kvc2_top->config.gpu_only) {
    return gpu_only_block_idx;
  } else {
    std::vector<size_t> re;
    for (auto& handle : k_cache_handles[0]) {
      re.push_back(handle->gpu_block_idx.value());
    }
    return re;
  }
}
/*
length : total length of tokens (including matched tokens)
1. update key, insert CacheBlock hash to lru
2. set dirty flag
3. update prefix tree, allocate new disk location
*/
void DoubleCacheHandle::append_tokens(Token* all_tokens, TokenLength length) {
  if (kvc2_top->config.gpu_only) {
    return;
  }
  TimeObserver time_observer(kvc2_top->met->append_tokens_time_ms);
  if (enable_alt) {
    SPDLOG_WARN("Append Tokens Not Implemented for Alternative Path");
    return;
  }
  if (length > estimated_length) {
    SPDLOG_ERROR("Length {} exceed estimated length {}", length, estimated_length);
    assert(false);
  }
  size_t match_length = matched_length();
  if (length < match_length) {
    SPDLOG_WARN("Length {} less than match length {}", length, match_length);
    assert(false);
  }
  if (length > ids.size()) {
    ids.insert(ids.end(), all_tokens + ids.size(), all_tokens + length);
  }
  static const auto num_token_per_page = kvc2_top->config.num_token_per_page;
  if (match_length % num_token_per_page != 0) {
    SPDLOG_ERROR("Match length {} is not multiple of num_token_per_page {}", match_length, num_token_per_page);
    assert(false);
  }
  if (match_length + num_token_per_page > length) {
    // SPDLOG_DEBUG("append_tokens No need to update");
    return;
  }

  SPDLOG_DEBUG("Append Tokens to {}", length);
  auto pre_match_length = match_length;

  // set gpu dirty flag
  size_t new_added_block_count = 0;
  while (match_length + num_token_per_page <= length) {
    match_length += num_token_per_page;
    new_added_block_count += 1;
  }

  // update prefix tree
  match.prefix = kvc2_top->tree->new_prefix_node(match.prefix, pre_match_length, ids.data(), match_length).get();
  match.match_length = match_length;

  // alloc disk location for new added prefix
  auto disk_cache = kvc2_top->disk_cache.get();
  Location k_loc{0, 0}, v_loc{0, 0};
  if (is_k_cache_on) {
    k_loc = disk_cache->allocate(k_info(), new_added_block_count);
    k_seg_locs.add_location(match.prefix->start_length / NumTokenPerBlock, k_loc);
    match.prefix->update_location(k_info(), k_loc);
  }
  if (is_v_cache_on) {
    v_loc = disk_cache->allocate(v_info(), new_added_block_count);
    v_seg_locs.add_location(match.prefix->start_length / NumTokenPerBlock, v_loc);
    match.prefix->update_location(v_info(), v_loc);
  }

  // update cache handles
  auto update_cache_handles = [this, pre_match_length, length](
                                  CacheInfo info, std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& layers,
                                  Location loc) {
    TokensHasher hasher;
    for (Layer l = 0; l < info.hidden_layer_count(); l++) {
      hasher.reset(info.hash_value());
      hasher.update_raw(&l, sizeof(l));
      hasher.update(ids.data(), pre_match_length);
      auto page_count_start = pre_match_length / num_token_per_page;
      for (size_t i = pre_match_length; i + num_token_per_page <= length; i += num_token_per_page) {
        auto page_count = i / num_token_per_page;
        hasher.update(ids.data() + i, num_token_per_page);
        auto block = layers[l][page_count];
        {
          auto lg = block->lock_guard();
          block->idx = loc.start_idx + page_count - page_count_start;
          block->set_key(hasher.get(), block);
          if (l == 0 && info.is_key_cache) {
            block->gpu_cc.tc.set_has_data();
          }
          block->gpu_cc.dirty.store(true);
        }
      }
    }
  };
  if (is_k_cache_on) {
    update_cache_handles(k_info(), k_cache_handles, k_loc);
  }
  if (is_v_cache_on) {
    update_cache_handles(v_info(), v_cache_handles, v_loc);
  }
  // kvc2_top->block_cache->debug();
}
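// [Editor's note, illustrative only] The block-boundary arithmetic above, worked through with
// num_token_per_page == 16: if matched_length() == 32 and length == 57, the while loop advances
// match_length 32 -> 48 (one full new page) and stops, since 48 + 16 > 57. new_added_block_count == 1,
// so exactly one new disk block per cache is allocated, and the trailing 9 tokens wait for a later call.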
void CacheBlockEntry::flush_back_async(IO_Helper<CacheBlockEntry>& helper,
                                       std::vector<std::atomic_bool*>& dirty_flags) {
  auto kvc2_top = manager->config.kvc2_top;
  auto allocator = kvc2_top->disk_cache->get_allocator(cache_info);
  // if (layer == 0) {
  //   SPDLOG_DEBUG("Flush {} to {}", fmt::ptr(this), idx);
  // }
  io_with(kvc2_top->io_dealer.get(), helper, allocator->get_store(layer), layer, idx, IOOption::IO_Write);
  dirty_flags.push_back(&cpu_cc.dirty);
}
void CacheEntryManager::cpu_background_flush() {
  if (background_flush_back.get() == nullptr) {
    SPDLOG_INFO("Starting CPU Background flush");
    background_flush_back = std::unique_ptr<periodic::PeriodicTask>(new periodic::PeriodicTask([this]() {
      // Timer t("CPU Flush");
      std::vector<std::atomic_bool*> dirty_cpus;
      std::vector<std::unique_lock<CacheBlockEntry::MutexT>> entry_uls;
      IO_Helper<CacheBlockEntry> io_helper(nullptr, [&dirty_cpus]() {
        for (auto& flag : dirty_cpus) {
          flag->store(false);
        }
        if (dirty_cpus.size() > 0)
          SPDLOG_DEBUG("{} dirty CPU pages flushed.", dirty_cpus.size());
      });
      {
        std::lock_guard<std::mutex> ul(lock);
        for (auto& e : usage_list) {
          auto ul = e->try_lock();
          if (ul.owns_lock()) {
            if (e->cpu_cc.dirty.load()) {
              entry_uls.push_back(std::move(ul));
              e->flush_back_async(io_helper, dirty_cpus);
            }
          }
          // if (dirty_cpus.size() == 100) {
          //   break;
          // }
        }
      }
      io_helper.finish_add_taks();
      io_helper.wait();
    }));
  } else {
    SPDLOG_ERROR("Flush Thread Already Started");
  }
}
void GPUPageCache::gpu_background_flush() {
  if (background_flush_back.get() == nullptr) {
    SPDLOG_INFO("Starting GPU Background flush");
    background_flush_back = std::unique_ptr<periodic::PeriodicTask>(new periodic::PeriodicTask([this]() {
      // Timer t("GPU Flush");
      std::vector<size_t> dirty_cols;
      std::vector<CacheBlockEntry*> entries;
      std::vector<std::unique_lock<CacheBlockEntry::MutexT>> uls;
      BatchPromise promise(config.gpu_devices_id.size());
      auto reqs = basic_request(cudaMemcpyDeviceToHost, [&promise]() { promise.set(); });
      for (size_t i = 0; i < config.total_kvcache_pages; i++) {
        std::lock_guard<std::mutex> lg(this->lock);
        auto col_uls = try_lock_col(i);
        if (col_uls.empty())
          continue;
        for (size_t l = 0; l < config.layer_count; l++) {
          if (config.k_cache_on &&
              (occupations[l][i]->gpu_cc.dirty.load() == false || occupations[l][i]->cpu_cc.dirty.load()))
            goto next_gpu_page;
          if (config.v_cache_on &&
              (v_occupations[l][i]->gpu_cc.dirty.load() == false || v_occupations[l][i]->cpu_cc.dirty.load()))
            goto next_gpu_page;
        }
        dirty_cols.push_back(i);
        for (size_t l = 0; l < config.layer_count; l++) {
          // occupations[l][i]->alloc_on_cpu_no_lock();
          if (config.k_cache_on)
            entries.push_back(occupations[l][i].get());
          if (config.v_cache_on)
            entries.push_back(v_occupations[l][i].get());
        }
        append_col_to_request(reqs, occupations, v_occupations, i);
        for (auto& ul : col_uls) {
          uls.push_back(std::move(ul));
        }
      next_gpu_page:
        continue;
      }
      submit_requests(reqs);
      promise.get_shared_fut().wait();
      if (dirty_cols.empty() == false)
        SPDLOG_INFO("GPU Flushed Back {} cols", dirty_cols.size());
      for (auto& entry : entries) {
        entry->cpu_cc.tc.set_has_data();
        // we have locks here
        entry->cpu_cc.dirty.store(true);
      }
      for (auto& col : dirty_cols) {
        for (size_t l = 0; l < config.layer_count; l++) {
          if (config.k_cache_on)
            occupations[l][col]->gpu_cc.dirty.store(false);
          if (config.v_cache_on)
            v_occupations[l][col]->gpu_cc.dirty.store(false);
        }
      }
      if (dirty_cols.empty() == false) {
        debug();
      }
    }));
  } else {
    SPDLOG_ERROR("Flush Thread Already Started");
  }
}

}  // namespace kvc2
csrc/balance_serve/kvc2/src/utils/all.hpp
0 → 100644
View file @
25cee581
#pragma once
#include "easy_format.hpp"
#include "timer.hpp"
\ No newline at end of file
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp
0 → 100644
View file @
25cee581
#include <memory>
#include <type_traits>
template <typename T, typename U>
T div_up(T x, U by) {
  static_assert(std::is_integral_v<T>);
  static_assert(std::is_integral_v<U>);
  return (x + by - 1) / by;
}

template <typename T>
T* offset_by_bytes(T* t, size_t n) {
  return reinterpret_cast<T*>(reinterpret_cast<size_t>(t) + n);
}
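// [Editor's note, illustrative only] div_up rounds an integer division up; it is what turns a token
// count into a NumTokenPerBlock-sized block count elsewhere in kvc2. For example:
//
//   assert(div_up(57, 16) == 4);   // 57 tokens need 4 blocks of 16
//   assert(div_up(64, 16) == 4);   // exact multiples are unchanged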
csrc/balance_serve/kvc2/src/utils/easy_format.hpp
0 → 100644
View file @
25cee581
#ifndef __EASY_FORMAT_HPP_
#define __EASY_FORMAT_HPP_
#include <array>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>
template <typename T>
inline std::string format_vector(const std::vector<T>& v) {
  std::ostringstream oss;
  if (v.empty())
    return "[]";
  for (size_t i = 0; i < v.size(); ++i) {
    oss << v[i];
    if (i < v.size() - 1)
      oss << ", ";  // comma separated
  }
  return oss.str();
}

inline std::array<std::string, 7> units = {"", "K", "M", "G", "T", "P", "E"};

inline std::string readable_number(size_t size) {
  size_t unit_index = 0;
  double readable_size = size;
  while (readable_size >= 1000 && unit_index < units.size() - 1) {
    readable_size /= 1000;
    unit_index++;
  }
  std::ostringstream ss;
  ss << std::fixed << std::setprecision(2) << readable_size;
  std::string str = ss.str();
  return str + "" + units[unit_index];
}
#endif
\ No newline at end of file
csrc/balance_serve/kvc2/src/utils/lock_free_queue.hpp
0 → 100644
View file @
25cee581
#include <atomic>
#include <future>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>
template <typename T>
class MPSCQueue {
  struct Node {
    std::shared_ptr<T> data;
    std::atomic<Node*> next;
    Node() : next(nullptr) {}
    Node(std::shared_ptr<T> data_) : data(std::move(data_)), next(nullptr) {}
  };

  std::atomic<Node*> head;
  Node* tail;

 public:
  std::atomic_size_t enqueue_count = 0;
  size_t dequeue_count = 0;

  MPSCQueue() {
    Node* dummy = new Node();
    head.store(dummy, std::memory_order_relaxed);
    tail = dummy;
  }

  ~MPSCQueue() {
    // clean up the remaining nodes
    Node* node = tail;
    while (node) {
      Node* next = node->next.load(std::memory_order_relaxed);
      delete node;
      node = next;
    }
  }

  // called by producers
  void enqueue(std::shared_ptr<T> data) {
    enqueue_count.fetch_add(1);
    Node* node = new Node(std::move(data));
    Node* prev_head = head.exchange(node, std::memory_order_acq_rel);
    prev_head->next.store(node, std::memory_order_release);
  }

  // called by the single consumer
  std::shared_ptr<T> dequeue() {
    Node* next = tail->next.load(std::memory_order_acquire);
    if (next) {
      std::shared_ptr<T> res = std::move(next->data);
      delete tail;
      tail = next;
      dequeue_count += 1;
      return res;
    }
    return nullptr;
  }
};
\ No newline at end of file
csrc/balance_serve/kvc2/src/utils/mpsc.hpp
0 → 100644
View file @
25cee581
#include <atomic>
#include <cassert>
#include <iostream>
#include <optional>
#include <semaphore>
template <typename T>
class MPSCQueue {
  struct Node {
    T data;
    std::atomic<Node*> next;
    Node() : next(nullptr) {}
    Node(T data_) : data(std::move(data_)), next(nullptr) {}
  };

  std::atomic<Node*> head;
  Node* tail;

 public:
  std::atomic_size_t enqueue_count = 0;
  size_t dequeue_count = 0;

  MPSCQueue() {
    Node* dummy = new Node();
    head.store(dummy, std::memory_order_seq_cst);
    tail = dummy;
  }

  ~MPSCQueue() {
    Node* node = tail;
    while (node) {
      Node* next = node->next.load(std::memory_order_seq_cst);
      delete node;
      node = next;
    }
  }

  // called by producers
  void enqueue(T data) {
    enqueue_count.fetch_add(1);
    Node* node = new Node(std::move(data));
    Node* prev_head = head.exchange(node, std::memory_order_seq_cst);
    prev_head->next.store(node, std::memory_order_seq_cst);
  }

  // called by the single consumer
  std::optional<T> dequeue() {
    Node* next = tail->next.load(std::memory_order_seq_cst);
    if (next) {
      T res = std::move(next->data);
      delete tail;
      tail = next;
      dequeue_count += 1;
      return res;
    }
    return std::nullopt;
  }

  size_t size() { return enqueue_count.load() - dequeue_count; }
};

template <typename T>
class MPSCQueueConsumerLock {
  MPSCQueue<T> queue;
  std::counting_semaphore<> sema{0};

 public:
  void enqueue(T data) {
    queue.enqueue(std::move(data));
    // std::atomic_thread_fence(std::memory_order_seq_cst);  // Inserting this because the memory order might be
    // wrong, I am also not that sure about this.
    sema.release();
  }

  T dequeue() {
    auto re = queue.dequeue();
    if (re.has_value()) {
      while (sema.try_acquire() == false) {
        std::cerr << __FILE__ << ":" << __FUNCTION__ << " sema try acquire should be success, retrying, please check"
                  << std::endl;
        // assert(false);
      }
      return re.value();
    }
    sema.acquire();
    return queue.dequeue().value();
  }

  size_t size() { return queue.size(); }
};
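// [Editor's note, illustrative only] A minimal usage sketch of MPSCQueueConsumerLock (requires <thread>):
// any number of producer threads enqueue, while a single consumer blocks on the semaphore until an
// item is available.
//
//   MPSCQueueConsumerLock<int> q;
//   std::thread producer([&] { for (int i = 0; i < 10; i++) q.enqueue(i); });
//   std::thread consumer([&] { for (int i = 0; i < 10; i++) { int v = q.dequeue(); (void)v; } });
//   producer.join(); consumer.join();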