Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ktransformers
Commits
877aec85
Unverified
Commit
877aec85
authored
Apr 09, 2025
by
Yuhao Tsui
Committed by
GitHub
Apr 09, 2025
Browse files
Merge branch 'kvcache-ai:main' into main
parents
84164f58
9037bf30
Changes
251
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3473 additions
and
0 deletions
+3473
-0
csrc/balance_serve/kvc2/src/cache_entry.hh
csrc/balance_serve/kvc2/src/cache_entry.hh
+182
-0
csrc/balance_serve/kvc2/src/common.h
csrc/balance_serve/kvc2/src/common.h
+0
-0
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp
+135
-0
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh
+54
-0
csrc/balance_serve/kvc2/src/defs.h
csrc/balance_serve/kvc2/src/defs.h
+35
-0
csrc/balance_serve/kvc2/src/gpu_cache.cpp
csrc/balance_serve/kvc2/src/gpu_cache.cpp
+282
-0
csrc/balance_serve/kvc2/src/gpu_cache.hh
csrc/balance_serve/kvc2/src/gpu_cache.hh
+74
-0
csrc/balance_serve/kvc2/src/hasher.hpp
csrc/balance_serve/kvc2/src/hasher.hpp
+40
-0
csrc/balance_serve/kvc2/src/io_helper.hpp
csrc/balance_serve/kvc2/src/io_helper.hpp
+155
-0
csrc/balance_serve/kvc2/src/kvc2.h
csrc/balance_serve/kvc2/src/kvc2.h
+138
-0
csrc/balance_serve/kvc2/src/kvc2_utils.py
csrc/balance_serve/kvc2/src/kvc2_utils.py
+64
-0
csrc/balance_serve/kvc2/src/metrics.cpp
csrc/balance_serve/kvc2/src/metrics.cpp
+141
-0
csrc/balance_serve/kvc2/src/metrics.h
csrc/balance_serve/kvc2/src/metrics.h
+77
-0
csrc/balance_serve/kvc2/src/model_config.h
csrc/balance_serve/kvc2/src/model_config.h
+119
-0
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp
+125
-0
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h
+54
-0
csrc/balance_serve/kvc2/src/prefix.cpp
csrc/balance_serve/kvc2/src/prefix.cpp
+1744
-0
csrc/balance_serve/kvc2/src/utils/all.hpp
csrc/balance_serve/kvc2/src/utils/all.hpp
+3
-0
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp
+14
-0
csrc/balance_serve/kvc2/src/utils/easy_format.hpp
csrc/balance_serve/kvc2/src/utils/easy_format.hpp
+37
-0
No files found.
csrc/balance_serve/kvc2/src/cache_entry.hh
0 → 100644
View file @
877aec85
#ifndef __CACHE_ENTRY_HH_
#define __CACHE_ENTRY_HH_
#include "async_store.hh"
#include "cuda_stream_manager.hh"
#include "defs.h"
#include "hasher.hpp"
#include "io_helper.hpp"
#include "page_aligned_memory_pool.h"
#include "utils/periodic_task.hpp"
#include <atomic>
#include <list>
#include <memory>
#include "utils/mutex_extend.hpp"
namespace kvc2 {

// Hash of a token prefix; used as the cache lookup key.
using CacheBlockKey = TokensHash;

class CacheEntryManager;
struct DoubleVerticalBlocksHandle;
class GPUPageCache;

// Reference count + dirty flag + transfer state for one copy (CPU or GPU)
// of a cache block.
struct ConcurrentControlUnit {
  std::atomic_size_t ref_count = 0;  // active users of this copy
  std::atomic_bool dirty = false;    // modified since the last flush-back
  TransferControl<std::mutex> tc;    // coordinates an in-flight transfer of this copy
  // Whether this copy may be dropped; used by the eviction passes.
  // (Defined in the .cpp — presumably requires ref_count == 0; confirm there.)
  bool can_desert();
  void debug();
};
// I/O mode selector for CacheBlockEntry::io_with. "Force" variants bypass
// the usual has-data/transfer bookkeeping.
enum IOOption {
  IO_ForceRead,
  IO_ForceWrite,
  IO_Read,
  IO_Write,
};

// Human-readable name of an IOOption; "Unknown" for out-of-range values.
inline std::string to_string(IOOption op) {
  if (op == IO_ForceRead) {
    return "IO_ForceRead";
  }
  if (op == IO_ForceWrite) {
    return "IO_ForceWrite";
  }
  if (op == IO_Read) {
    return "IO_Read";
  }
  if (op == IO_Write) {
    return "IO_Write";
  }
  return "Unknown";
}
// One cache block: a fixed-size span of KV data for a single layer. Tracks
// where the block currently lives (CPU pool / disk slot / GPU page) plus the
// per-location concurrency state. Non-copyable and non-movable — entries are
// shared via shared_ptr and referenced by raw pointer from occupation tables.
struct CacheBlockEntry {
  friend CacheEntryManager;
  using MutexT = non_recursive_mutex;
  // using MutexT = std::mutex;
  MutexT lock;

  // for cache
  bool with_key = true;          // false for anonymous (keyless) blocks
  CacheBlockKey hash = 0;        // primary lookup hash
  CacheBlockKey hash_check = 0;  // secondary hash, to detect collisions
  CacheInfo cache_info;
  CacheEntryManager* manager = nullptr;  // non-owning back-pointer

  // for memory pool
  void* data = nullptr;  // CPU buffer from PageAlignedMemoryPool, or null
  size_t size = 0;       // buffer size in bytes
  ConcurrentControlUnit cpu_cc;

  // for disk
  size_t layer = -1;  // -1 (i.e. SIZE_MAX) = not bound to a disk slot
  size_t idx = -1;

  // for gpu
  std::optional<size_t> gpu_block_idx = std::nullopt;  // GPU page column, if resident
  ConcurrentControlUnit gpu_cc;

  CacheBlockEntry() = default;
  CacheBlockEntry(const CacheBlockEntry& other) = delete;
  CacheBlockEntry& operator=(const CacheBlockEntry& other) = delete;
  CacheBlockEntry(CacheBlockEntry&& other) = delete;
  CacheBlockEntry& operator=(CacheBlockEntry&& other) = delete;
  ~CacheBlockEntry();

 private:
  bool alloc_on_cpu();

 public:
  void free_on_cpu();
  bool alloc_on_cpu_no_lock();
  bool inc_ref_or_alloc_on_cpu();
  void set_key(TokensHash key, std::shared_ptr<CacheBlockEntry> me);

  std::unique_lock<MutexT> try_lock();
  // Returns a guard by value — relies on C++17 guaranteed copy elision,
  // since std::lock_guard is neither copyable nor movable.
  std::lock_guard<MutexT> lock_guard();

  // will not get lock
  void io_with(async_store::IODealer* dealer, IO_Helper<CacheBlockEntry>& io_helper, async_store::ArrayStore* store,
               size_t layer, size_t index, IOOption option);
  void flush_back_async(IO_Helper<CacheBlockEntry>& helper, std::vector<std::atomic_bool*>& dirty_flags);

  void debug();
};
// Scope-exit helper over a set of entries: collects raw entry pointers and
// applies `exit_fn` when destroyed (destructor defined in the .cpp —
// presumably runs exit_fn on every collected entry; confirm there).
// Non-copyable and non-movable.
struct CacheBlockEntryCollector {
  std::vector<CacheBlockEntry*> entries;
  std::function<void(CacheBlockEntry*)> exit_fn;

  CacheBlockEntryCollector(std::function<void(CacheBlockEntry*)> exit_fn);
  ~CacheBlockEntryCollector();
  CacheBlockEntryCollector(const CacheBlockEntryCollector& other) = delete;
  CacheBlockEntryCollector(CacheBlockEntryCollector&& other) = delete;
  CacheBlockEntryCollector& operator=(const CacheBlockEntryCollector& other) = delete;
  CacheBlockEntryCollector& operator=(CacheBlockEntryCollector&& other) = delete;
};
struct KVC2;

// Construction parameters for CacheEntryManager.
struct CacheEntryManagerConfig {
  size_t evict_count = 100;  // entries to evict per eviction pass
  KVC2* kvc2_top = nullptr;  // non-owning back-pointer to the owning KVC2
};
// Registry of cache block entries keyed by token-prefix hash, with a usage
// list for eviction ordering. Owns the CPU memory pool and (optionally) the
// GPU page cache. Non-copyable and non-movable.
class CacheEntryManager {
 public:
  using Key = CacheBlockKey;
  using BlockPtr = std::shared_ptr<CacheBlockEntry>;

 private:
  friend CacheBlockEntry;
  CacheEntryManagerConfig config;

  std::mutex lock;  // guards usage_list and key_entry_map
  // Usage ordering for eviction; key_entry_map points into this list.
  std::list<BlockPtr> usage_list;
  std::unordered_map<Key, std::list<BlockPtr>::iterator> key_entry_map;

  void insert(BlockPtr entry);
  BlockPtr access(const Key& key);
  // void remove(const Key& key);
  // Evict entries accepted by `filter` until `stop_condition` returns true.
  void evict(std::function<bool(const BlockPtr&)> filter, std::function<bool()> stop_condition);

 public:
  std::unique_ptr<periodic::PeriodicTask> background_flush_back = nullptr;
  std::shared_ptr<PageAlignedMemoryPool> pool;
  std::shared_ptr<GPUPageCache> gpu_cache;

  CacheEntryManager(CacheEntryManagerConfig config);
  // disable all move and copy
  CacheEntryManager(const CacheEntryManager& other) = delete;
  CacheEntryManager& operator=(const CacheEntryManager& other) = delete;
  CacheEntryManager(CacheEntryManager&& other) = delete;
  CacheEntryManager& operator=(CacheEntryManager&& other) = delete;

  void cpu_background_flush();
  void evict_for_cpu_cache();
  // just get block pointers, not allocate them, will not return nullptr
  BlockPtr get(bool& is_new, size_t size, std::optional<Key> key = std::nullopt);
  void debug();
};
}
// namespace kvc2
#endif
\ No newline at end of file
csrc/balance_serve/kvc2/src/common.h
0 → 100644
View file @
877aec85
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp
0 → 100644
View file @
877aec85
#include "cuda_stream_manager.hh"
#include <cuda_runtime.h>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <vector>
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
// Creates one worker thread and `num_streams_per_device` CUDA streams per
// requested device. Throws std::runtime_error if a device cannot be selected
// or a stream cannot be created.
CudaStreamManager::CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device) {
  // NOTE(review): the size_t ids are narrowed into `int device_id` (and into
  // DeviceInfo::device_id) — harmless for realistic device counts, but confirm.
  for (int device_id : device_ids) {
    auto x = std::unique_ptr<DeviceInfo>(new DeviceInfo);
    DeviceInfo& device_info = *x;
    device_info.device_id = device_id;
    device_info.next_stream_index = 0;
    device_info.stop_flag = false;
    // Select the device.
    cudaError_t err = cudaSetDevice(device_id);
    if (err != cudaSuccess) {
      SPDLOG_WARN("cudaSetDevice failed on device {}: {}", device_id, cudaGetErrorString(err));
      // NOTE(review): throwing after earlier iterations already started worker
      // threads destroys joinable std::threads during unwinding, which calls
      // std::terminate — confirm whether partial-construction failure matters.
      throw std::runtime_error("cudaSetDevice failed");
    }
    // Create the CUDA streams for this device.
    device_info.streams.resize(num_streams_per_device);
    for (int i = 0; i < num_streams_per_device; ++i) {
      err = cudaStreamCreate(&device_info.streams[i]);
      if (err != cudaSuccess) {
        SPDLOG_WARN("Failed to create CUDA stream on device {}: {}", device_id, cudaGetErrorString(err));
        throw std::runtime_error("Failed to create CUDA stream");
      }
    }
    // Start the per-device worker thread. The DeviceInfo lives behind a
    // unique_ptr, so its address stays stable after the move below.
    device_info.worker_thread = std::thread(&CudaStreamManager::deviceWorker, this, std::ref(device_info));
    devices_.push_back(std::move(x));
  }
}
CudaStreamManager::~CudaStreamManager() {
  // Tell every device worker to stop.
  for (auto& device_info : devices_) {
    device_info->stop_flag.store(true);
    // Also enqueue a poison request so a worker blocked in dequeue() wakes up.
    auto request = std::shared_ptr<Request>(new Request);
    request->should_exit = true;
    device_info->request_queue.enqueue(std::move(request));
  }
  // Wait for all worker threads to finish.
  for (auto& device_info : devices_) {
    if (device_info->worker_thread.joinable()) {
      device_info->worker_thread.join();
    }
    // Destroy this device's CUDA streams (on the device that owns them).
    cudaSetDevice(device_info->device_id);
    for (auto& stream : device_info->streams) {
      cudaStreamDestroy(stream);
    }
  }
}
void
CudaStreamManager
::
submitRequest
(
std
::
shared_ptr
<
Request
>
request
)
{
// 找到对应的设备
for
(
auto
&
device_info
:
devices_
)
{
if
(
device_info
->
device_id
==
request
->
device_id
)
{
device_info
->
request_queue
.
enqueue
(
request
);
return
;
}
}
throw
std
::
runtime_error
(
"Invalid device ID in request"
);
}
// Per-device worker loop: dequeues transfer requests, issues the batched
// async memcpys on a round-robin stream, and schedules the request's
// completion callback on that stream.
void CudaStreamManager::deviceWorker(DeviceInfo& device_info) {
  // Bind this thread to its CUDA device.
  cudaError_t err = cudaSetDevice(device_info.device_id);
  if (err != cudaSuccess) {
    SPDLOG_WARN("cudaSetDevice failed in worker thread for device {}: {}", device_info.device_id,
                cudaGetErrorString(err));
    return;
  }
  while (device_info.stop_flag.load() == false) {
    auto request = device_info.request_queue.dequeue();
    if (request->should_exit) {
      return;
    }
    // Process the request.
    SPDLOG_DEBUG("Getting request on device {}, count {}", device_info.device_id,
                 request->host_mem_addresses.size());
    // Round-robin stream selection.
    int stream_index = device_info.next_stream_index;
    cudaStream_t stream = device_info.streams[stream_index];
    device_info.next_stream_index = (device_info.next_stream_index + 1) % device_info.streams.size();
    size_t num_transfers = request->host_mem_addresses.size();
    for (size_t i = 0; i < num_transfers; ++i) {
      // Addresses are stored as (host, device) pairs; swap them for D2H.
      void* dst = request->device_mem_addresses[i];
      void* src = request->host_mem_addresses[i];
      if (request->direction == cudaMemcpyDeviceToHost) {
        std::swap(dst, src);
      }
      cudaError_t err = cudaMemcpyAsync(dst, src, request->sizes[i], request->direction, stream);
      if (err != cudaSuccess) {
        SPDLOG_WARN("cudaMemcpyAsync failed on device {}: {}", device_info.device_id, cudaGetErrorString(err));
        // Errors could be handled as needed; for now simply continue.
        continue;
      }
    }
    // The callback fires asynchronously, so wrap it in a heap-allocated
    // holder that the host function deletes after invoking it.
    struct CallbackData {
      std::function<void()> callback;
    };
    CallbackData* cb_data = new CallbackData{request->callback};
    err = cudaLaunchHostFunc(
        stream,
        [](void* data) {
          // SPDLOG_DEBUG("Callback function called");
          CallbackData* cb_data = static_cast<CallbackData*>(data);
          cb_data->callback();
          delete cb_data;
        },
        cb_data);
    if (err != cudaSuccess) {
      SPDLOG_WARN("cudaLaunchHostFunc failed on device {}: {}", device_info.device_id, cudaGetErrorString(err));
      // FIX: previously cb_data leaked here and the callback was never
      // invoked, which could leave waiters blocked forever. Fall back to
      // synchronous completion: wait for the issued copies, then notify.
      delete cb_data;
      cudaStreamSynchronize(stream);
      if (request->callback) {
        request->callback();
      }
    }
  }
}
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh
0 → 100644
View file @
877aec85
/*
* @Author: Xie Weiyu ervinxie@qq.com
* @Date: 2024-11-19 09:24:47
* @LastEditors: Xie Weiyu ervinxie@qq.com
* @LastEditTime: 2024-11-20 02:55:49
* @FilePath: /kvc2/src/cuda_stream_manager.hh
* @Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
*/
#pragma once
#include <cuda_runtime.h>
#include <atomic>
#include <functional>
#include <memory>
#include <thread>
#include <vector>
#include "utils/mpsc.hpp"
// Owns a pool of CUDA streams and one worker thread per managed device;
// executes batched async host<->device copies with a completion callback.
class CudaStreamManager {
 public:
  // Takes the device IDs to manage and the number of streams per device.
  CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device);
  ~CudaStreamManager();

  // One batched transfer: parallel arrays of host/device addresses and sizes,
  // a copy direction, and a callback invoked when the batch completes.
  struct Request {
    bool should_exit = false;  // poison value used to shut a worker down
    int device_id;             // target device; must match a managed device
    std::vector<void*> host_mem_addresses;
    std::vector<void*> device_mem_addresses;
    std::vector<size_t> sizes;  // bytes, per address pair
    cudaMemcpyKind direction;
    std::function<void()> callback;
  };

  void submitRequest(std::shared_ptr<Request> request);

 private:
  // Per-device state: its worker thread, streams, and request queue.
  struct DeviceInfo {
    int device_id;
    std::thread worker_thread;
    std::vector<cudaStream_t> streams;
    int next_stream_index;  // round-robin cursor over `streams`
    MPSCQueueConsumerLock<std::shared_ptr<Request>> request_queue;
    std::atomic_bool stop_flag;
  };

  // Managed devices (stable addresses: worker threads hold references).
  std::vector<std::unique_ptr<DeviceInfo>> devices_;

  // Worker-thread entry point for one device.
  void deviceWorker(DeviceInfo& device_info);
};
csrc/balance_serve/kvc2/src/defs.h
0 → 100644
View file @
877aec85
#ifndef __DEFS_H_
#define __DEFS_H_
#include <cstdint>
#include <optional>
#include <vector>
#include "model_config.h"
namespace kvc2 {

// Opaque handle/pointer aliases used across the kvc2 API surface.
using kvc2_ptr = void*;
// using data_block_ptr = std::intptr_t;
using data_block_ptr = void*;
using layer_data = std::vector<data_block_ptr>;  // one block pointer per page, for a single layer
using kvc2_handle = void*;

using Token = uint32_t;
using Tokens = std::vector<Token>;
using TokenPtr = std::intptr_t;
using TokenLength = size_t;  // lengths measured in tokens
using BlockLength = size_t;  // lengths measured in blocks/pages

// Identifies one K or V cache family (model + K/V side + quantization).
struct CacheInfo {
  ModelName model_name;
  bool is_key_cache;  // true = key cache, false = value cache
  QuantType quant_type;

  size_t hidden_layer_count();
  // Backing path on disk; when given, `which_layer` selects a per-layer file.
  // NOTE(review): std::filesystem is not included by this header directly —
  // presumably pulled in via model_config.h; verify.
  std::filesystem::path path(std::optional<size_t> which_layer = std::nullopt);
  bool operator==(const CacheInfo& other) const;
  size_t element_size(size_t block_length);
  size_t hash_value() const;
};
};
// namespace kvc2
#endif
csrc/balance_serve/kvc2/src/gpu_cache.cpp
0 → 100644
View file @
877aec85
#include "gpu_cache.hh"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "cache_entry.hh"
#include "utils/arithmetic.hpp"
namespace
kvc2
{
// Builds the GPU-resident KV page cache: allocates one K and/or V tensor of
// shape (layer, page, token, head, head_dim) per GPU, initializes the
// occupation tables, computes the per-GPU tensor-parallel shard sizes and
// offsets, and starts the CUDA stream manager.
GPUPageCache::GPUPageCache(GPUPageCacheConfig& config) : config(config) {
  if (torch::cuda::is_available()) {
    size_t gpu_count = torch::cuda::device_count();
    SPDLOG_INFO("Number of available GPUs: {}, want {}", gpu_count, config.gpu_devices_id.size());
    if (gpu_count < config.gpu_devices_id.size()) {
      SPDLOG_ERROR("Not enough GPUs available.");
      // NOTE(review): exits with status 0 on a fatal error — a nonzero status
      // would be conventional; left unchanged to avoid surprising callers.
      exit(0);
    }
    for (auto x : config.gpu_devices_id) {
      gpu_devices.push_back(torch::Device(torch::kCUDA, x));
    }
  } else {
    SPDLOG_ERROR("CUDA is not available on this system.");
    exit(0);
  }
  SPDLOG_WARN("Creating GPU Cache");
  shape.push_back(config.layer_count);
  shape.push_back(config.total_kvcache_pages);
  shape.push_back(config.num_token_per_page);
  if (config.full_kv_cache_on_each_gpu) {
    if (config.gpu_devices_id.size() > 1) {
      SPDLOG_WARN("Replicated KVCache on multiple gpu");
    }
    shape.push_back(config.num_k_heads);
  } else {
    // Tensor-parallel: shard the heads evenly across the GPUs.
    shape.push_back(config.num_k_heads / config.gpu_devices_id.size());
  }
  shape.push_back(config.k_head_dim);
  tensor_size = torch::elementSize(config.tensor_type);
  for (auto& s : shape) {
    tensor_size *= s;
  }
  SPDLOG_INFO("Creating KV Page Cache, Shape ({},{},{},{},{}), Size {} MiB", shape[0], shape[1], shape[2], shape[3],
              shape[4], tensor_size / (1 << 20));
  if (config.k_cache_on) {
    for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
      auto k = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
      k = k.to(gpu_devices[i]);
      k_cache.push_back(k);
      SPDLOG_INFO("K Page Cache of GPU {} is created", config.gpu_devices_id[i]);
    }
    occupations.resize(config.layer_count);
  } else {
    // FIX: log message typo "Disalbe" -> "Disable".
    SPDLOG_WARN("Disable K Cache");
    assert(config.gpu_only);
  }
  if (config.v_cache_on) {
    for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
      auto v = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
      v = v.to(gpu_devices[i]);
      v_cache.push_back(v);
      SPDLOG_INFO("V Page Cache of GPU {} is created", config.gpu_devices_id[i]);
    }
    v_occupations.resize(config.layer_count);
  } else {
    // FIX: log message typo "Disalbe" -> "Disable".
    SPDLOG_WARN("Disable V Cache");
    // assert(config.gpu_only); // should not assert
  }
  if (config.gpu_only) {
    gpu_only_occupations.resize(config.total_kvcache_pages, false);
  }
  num_free_pages = config.total_kvcache_pages;
  for (size_t i = 0; i < config.layer_count; i++) {
    if (config.k_cache_on)
      occupations[i].resize(config.total_kvcache_pages, nullptr);
    if (config.v_cache_on)
      v_occupations[i].resize(config.total_kvcache_pages, nullptr);
  }
  // Per-GPU shard: (tokens per page) * (heads per GPU) * head_dim elements.
  tp_size.resize(config.gpu_devices_id.size(), shape[2] * shape[3] * shape[4] * c10::elementSize(config.tensor_type));
  tp_offset.resize(config.gpu_devices_id.size(), 0);
  for (size_t i = 1; i < tp_offset.size(); i++) {
    tp_offset[i] = tp_offset[i - 1] + tp_size[i - 1];
  }
  stream_manager = std::unique_ptr<CudaStreamManager>(
      new CudaStreamManager(config.gpu_devices_id, config.num_streams_per_device));
}
// Claims one free GPU page column and records the given CPU-side entries
// (one per layer) as its occupants. Returns false if no column is available
// even after eviction. Caller must already hold the entry locks
// (see "must have entry lock" below); this function takes the table lock.
bool GPUPageCache::alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at) {
  std::lock_guard<std::mutex> lg(lock);
  auto idx = next_empty_col();
  if (idx.has_value()) {
    // must have entry lock
    // The GPU column index is recorded only on the layer-0 K entry.
    auto& k0_entry = k_entries[0][at];
    k0_entry->gpu_block_idx = idx;
    for (size_t l = 0; l < config.layer_count; l++) {
      if (config.k_cache_on) {
        assert(k_entries[l][at]->data != nullptr);
        occupations[l][idx.value()] = k_entries[l][at];
      }
      if (config.v_cache_on) {
        assert(v_entries[l][at]->data != nullptr);
        v_occupations[l][idx.value()] = v_entries[l][at];
      }
    }
    return true;
  } else {
    return false;
  }
}
// Allocate exactly `count` free GPU-only page columns. Returns the chosen
// column indices, or an empty vector (after a warning) when fewer than
// `count` columns are free. All-or-nothing: partial allocations are rolled
// back by never marking them.
std::vector<size_t> GPUPageCache::gpu_only_alloc_col(size_t count) {
  assert(config.gpu_only);
  std::lock_guard<std::mutex> guard(lock);
  std::vector<size_t> picked;
  for (size_t page = 0; page < config.total_kvcache_pages; page++) {
    if (gpu_only_occupations[page]) {
      continue;
    }
    picked.push_back(page);
    if (picked.size() == count) {
      break;
    }
  }
  if (picked.size() == count) {
    for (size_t page : picked) {
      gpu_only_occupations[page] = true;
    }
  } else {
    SPDLOG_WARN("GPU ONLY: Cannot allocate {} cols", count);
    picked.clear();
  }
  return picked;
}
// Release previously allocated GPU-only page columns. Each column must
// currently be marked occupied (checked by assert).
void GPUPageCache::gpu_only_free_cols(std::vector<size_t> cols) {
  assert(config.gpu_only);
  std::lock_guard<std::mutex> guard(lock);
  for (size_t page : cols) {
    assert(gpu_only_occupations[page]);
    gpu_only_occupations[page] = false;
  }
}
// Returns the index of a free page column, evicting if necessary; nullopt if
// eviction frees nothing. Caller must hold `lock`.
std::optional<size_t> GPUPageCache::next_empty_col() {
  if (num_free_pages == 0) {
    evict_cols();
    if (num_free_pages == 0) {
      return std::nullopt;
    }
  }
  // Round-robin scan from the cursor; terminates because num_free_pages > 0
  // guarantees at least one column with occupations[0][col] == nullptr
  // (layer 0 occupancy is the free/used indicator).
  while (occupations[0][_col_idx] != nullptr) {
    _col_idx = (_col_idx + 1) % config.total_kvcache_pages;
  }
  num_free_pages -= 1;
  return _col_idx;
}
// Sweeps all page columns and frees those whose layer-0 entry reports it can
// be deserted. Caller must hold `lock` (called from next_empty_col).
void GPUPageCache::evict_cols() {
  auto evicted_count = 0;
  for (size_t i = 0; i < config.total_kvcache_pages; i++) {
    auto& h = occupations[0][i];
    if (h == nullptr) {
      continue;
    }
    // Lock the entry while resetting its GPU transfer state.
    auto lg = h->lock_guard();
    if (h->gpu_cc.can_desert()) {
      h->gpu_cc.tc.reset();
      // NOTE(review): only the layer-0 slot is cleared; slots for the other
      // layers (and v_occupations) keep their shared_ptrs until alloc_col
      // overwrites them — confirm this is intended. Also note `h = nullptr`
      // drops a reference while `lg` still guards the entry's mutex;
      // presumably other owners keep the entry alive — verify.
      h = nullptr;
      num_free_pages += 1;
      evicted_count += 1;
    }
  }
  if (evicted_count > 0)
    SPDLOG_INFO("GPU: Evicted {} GPU pages", evicted_count);
}
// Try to lock every entry (all layers, K then V tables) occupying page
// column `at`. On success returns the owned locks; on any missing entry or
// failed try-lock returns an empty vector, releasing locks already taken.
std::vector<std::unique_lock<CacheBlockEntry::MutexT>> GPUPageCache::try_lock_col(size_t at) {
  std::vector<std::unique_lock<CacheBlockEntry::MutexT>> acquired;

  // Attempt to lock one table's column across all layers; false aborts.
  auto lock_table = [&](std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& table) -> bool {
    for (size_t layer = 0; layer < config.layer_count; layer++) {
      auto& entry = table[layer][at];
      if (entry == nullptr) {
        return false;
      }
      auto ul = entry->try_lock();
      if (!ul.owns_lock()) {
        return false;
      }
      acquired.push_back(std::move(ul));
    }
    return true;
  };

  if (config.k_cache_on && !lock_table(occupations)) {
    return {};
  }
  if (config.v_cache_on && !lock_table(v_occupations)) {
    return {};
  }
  return acquired;
}
// Build one empty transfer request per managed GPU, all sharing the same
// copy direction and completion callback. Addresses/sizes are appended later
// by append_col_to_request.
std::vector<std::shared_ptr<CudaStreamManager::Request>> GPUPageCache::basic_request(
    cudaMemcpyKind direction, std::function<void()> callback) {
  const size_t device_count = config.gpu_devices_id.size();
  std::vector<std::shared_ptr<CudaStreamManager::Request>> requests;
  requests.reserve(device_count);
  for (size_t i = 0; i < device_count; i++) {
    auto req = std::make_shared<CudaStreamManager::Request>();
    req->direction = direction;
    req->device_id = config.gpu_devices_id[i];
    req->callback = callback;
    requests.push_back(std::move(req));
  }
  return requests;
}
// Hand each per-device request over to the stream manager's worker threads.
void GPUPageCache::submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs) {
  for (size_t i = 0; i < reqs.size(); i++) {
    stream_manager->submitRequest(reqs[i]);
  }
}
// Appends the host/device address pairs for page column `at` (all layers,
// K and V) to the per-GPU requests built by basic_request. Each GPU receives
// its tensor-parallel shard: tp_offset[g] bytes into the host page, tp_size[g]
// bytes long, targeting k_cache/v_cache[g][layer][gpu_block_idx].
void GPUPageCache::append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
                                         std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
                                         std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles,
                                         size_t at) {
  if (config.k_cache_on == false && config.v_cache_on == false) {
    return;
  }
  // The column's GPU page index is stored on the layer-0 K entry (set in
  // alloc_col); .value() assumes the column has been allocated.
  auto gpu_block_idx = k_handles[0][at]->gpu_block_idx.value();
  for (size_t layer = 0; layer < config.layer_count; layer++) {
    for (size_t which_gpu = 0; which_gpu < config.gpu_devices_id.size(); which_gpu++) {
      if (config.k_cache_on) {
        assert(k_handles[layer][at]->data != nullptr);
        reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
        reqs[which_gpu]->host_mem_addresses.push_back(
            offset_by_bytes(k_handles[layer][at]->data, tp_offset[which_gpu]));
        reqs[which_gpu]->device_mem_addresses.push_back(k_cache[which_gpu][layer][gpu_block_idx].data_ptr());
      }
      if (config.v_cache_on) {
        assert(v_handles[layer][at]->data != nullptr);
        reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
        reqs[which_gpu]->host_mem_addresses.push_back(
            offset_by_bytes(v_handles[layer][at]->data, tp_offset[which_gpu]));
        reqs[which_gpu]->device_mem_addresses.push_back(v_cache[which_gpu][layer][gpu_block_idx].data_ptr());
      }
    }
  }
  // SPDLOG_DEBUG("GPU: Appended Vertical Handle to Request, count {}", reqs[0]->sizes.size());
}
// Log how many page columns are currently free (layer-0 occupancy is the
// free/used indicator).
void GPUPageCache::debug() {
  size_t free_pages = 0;
  for (size_t page = 0; page < config.total_kvcache_pages; page++) {
    if (occupations[0][page] == nullptr) {
      free_pages += 1;
    }
    // else: occupations[0][page]->gpu_cc.debug();
  }
  SPDLOG_DEBUG("Free Page: {}/{}", free_pages, config.total_kvcache_pages);
}
}
// namespace kvc2
csrc/balance_serve/kvc2/src/gpu_cache.hh
0 → 100644
View file @
877aec85
#ifndef __GPU_CACHE_HH_
#define __GPU_CACHE_HH_
#include <torch/torch.h>
#include "cache_entry.hh"
#include "cuda_stream_manager.hh"
#include "defs.h"
#include "kvc2.h"
#include "metrics.h"
#include "utils/periodic_task.hpp"
namespace
kvc2
{
// GPU-resident page cache of KV tensors, sharded (or replicated) across the
// configured GPUs. Pages ("columns") are allocated/evicted here; transfers
// are executed by the owned CudaStreamManager.
class GPUPageCache {
  std::vector<torch::Device> gpu_devices;
  std::vector<int64_t> shape;     // (layer, page, token, head, head_dim)
  size_t tensor_size;             // bytes of one full cache tensor
  std::vector<size_t> tp_offset;  // per-GPU byte offset into a host page
  std::vector<size_t> tp_size;    // per-GPU byte size of a page shard

  // met
  std::shared_ptr<Metrics> met;

  // states
  std::mutex lock;  // guards the occupation state below
  size_t num_free_pages;
  std::vector<bool> gpu_only_occupations;  // used only in gpu_only mode
  // Per-layer, per-page occupant entries for K and V caches.
  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> occupations, v_occupations;
  size_t _col_idx = 0;  // round-robin cursor for the free-column scan

  // cuda stream manager
  std::optional<size_t> next_empty_col();

 public:
  GPUPageCacheConfig config;
  std::unique_ptr<CudaStreamManager> stream_manager;
  std::vector<torch::Tensor> k_cache;  // one tensor per GPU
  std::vector<torch::Tensor> v_cache;  // one tensor per GPU
  std::unique_ptr<periodic::PeriodicTask> background_flush_back = nullptr;

  GPUPageCache(GPUPageCacheConfig& config);

  std::vector<size_t> gpu_only_alloc_col(size_t count);
  void gpu_only_free_cols(std::vector<size_t> cols);

  void gpu_background_flush();

  bool alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
                 std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at);
  void evict_cols();
  void flush_col(size_t at);
  std::vector<std::unique_lock<CacheBlockEntry::MutexT>> try_lock_col(size_t at);
  void free_col(size_t at);

  std::vector<std::shared_ptr<CudaStreamManager::Request>> basic_request(cudaMemcpyKind direction,
                                                                         std::function<void()> callback);
  void submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs);
  void append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles, size_t at);

  void debug();
};
}
// namespace kvc2
#endif
\ No newline at end of file
csrc/balance_serve/kvc2/src/hasher.hpp
0 → 100644
View file @
877aec85
#ifndef __HASHER_HPP_
#define __HASHER_HPP_
#include "defs.h"
#include "xxhash.h"
namespace
kvc2
{
// Seeds for the two independent hashes: `hash_seed` for the primary lookup
// key, `check_hash_seed` for the collision-check hash.
const uint64_t hash_seed = 4123512;
const uint64_t check_hash_seed = 1025753;

using TokensHash = XXH64_hash_t;

// Incremental XXH64 hasher over token sequences. Non-copyable and
// non-movable because it owns a raw XXH64 state.
struct TokensHasher {
  XXH64_state_t* state;  // owned; released in the destructor

  TokensHasher() {
    state = XXH64_createState();
    reset();
  }
  ~TokensHasher() { XXH64_freeState(state); }
  TokensHasher(TokensHasher& other) = delete;
  TokensHasher& operator=(TokensHasher& other) = delete;
  TokensHasher(TokensHasher&& other) = delete;
  TokensHasher& operator=(TokensHasher&& other) = delete;

  // Digest of everything fed so far.
  TokensHash get() { return XXH64_digest(state); }

  // Restart hashing with the given seed (primary seed by default).
  void reset(size_t seed = hash_seed) { XXH64_reset(state, seed); }

  // Feed `length` tokens and return the running digest.
  TokensHash update(Token* data, TokenLength length) {
    XXH64_update(state, data, length * sizeof(Token));
    return get();
  }

  // Feed raw bytes and return the running digest.
  TokensHash update_raw(void* data, size_t size) {
    XXH64_update(state, data, size);
    return get();
  }

  // One-shot hash of a token array with the primary seed.
  static TokensHash hash(Token* data, TokenLength length) { return XXH64(data, length * sizeof(Token), hash_seed); }
};
}
// namespace kvc2
#endif
\ No newline at end of file
csrc/balance_serve/kvc2/src/io_helper.hpp
0 → 100644
View file @
877aec85
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-12-11 06:35:31
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-12-11 06:50:55
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <atomic>
#include <future>
#include <iostream>
#include <mutex>
#include <optional>
#include <string>
#include <vector>
// Completion latch for a batch of asynchronous tasks: holds a pending-task
// count and fulfils a shared future exactly once, when the count drops to
// zero. Start it at 1 and call set() after registration to avoid firing
// before all tasks are added.
struct BatchPromise {
  std::promise<void> promise;
  std::shared_future<void> fut;
  std::atomic_size_t count;

  inline BatchPromise(size_t count) : count(count) {
    fut = promise.get_future().share();
  }

  // Register `count` additional pending tasks.
  inline void inc(size_t count = 1) {
    this->count.fetch_add(count, std::memory_order_seq_cst);
  }

  // Complete one task; the final completion fulfils the promise.
  inline void set() {
    size_t previous = count.fetch_sub(1, std::memory_order_seq_cst);
    if (previous == 1) {
      promise.set_value();
    }
  }

  // Future that becomes ready once the whole batch has completed.
  inline std::shared_future<void> get_shared_fut() { return fut; }
};
// Tracks, under a caller-chosen lock type, whether a unit's payload is
// resident (`has_data`) and whether a transfer that will make it resident is
// already in flight (`transfer_ok`).
template <typename Lock>
struct TransferControl {
  Lock lock;
  std::optional<std::shared_future<void>> transfer_ok = std::nullopt;
  bool has_data = false;

  TransferControl() {}

  /*
   true, std::nullopt : Already has data
   false, shared_future : Transfer already started, should wait for the future
   false, std::nullopt : should transfer by you
   true, shared_future: Should not appear
  */
  std::pair<bool, std::optional<std::shared_future<void>>> has_data_or_transfer(
      std::shared_future<void> shared_fut) {
    std::lock_guard<Lock> guard(lock);
    if (has_data) {
      return {true, std::nullopt};
    }
    if (transfer_ok.has_value()) {
      return {false, transfer_ok};
    }
    // No data and no transfer yet: claim it with the caller's future.
    transfer_ok = shared_fut;
    return {false, std::nullopt};
  }

  // Payload arrived: mark resident and drop the in-flight marker.
  void set_has_data() {
    std::lock_guard<Lock> guard(lock);
    has_data = true;
    transfer_ok = std::nullopt;
  }

  bool get_has_data() {
    std::lock_guard<Lock> guard(lock);
    return has_data;
  }

  // Back to the initial state: no data, no transfer in flight.
  void reset() {
    std::lock_guard<Lock> guard(lock);
    transfer_ok = std::nullopt;
    has_data = false;
  }

  std::string debug() {
    std::lock_guard<Lock> guard(lock);
    return std::string("") + (has_data ? "has data" : "no data") + " " +
           (transfer_ok.has_value() ? "transfer " : "no transfer");
  }
};
// Dirty flag + reference count + transfer state bundled together.
// Same shape as kvc2::ConcurrentControlUnit; kept separate in this
// standalone helper header.
struct ConcurrentController {
  std::atomic_bool dirty = false;      // modified since last flush
  std::atomic_size_t ref_count = 0;    // active users
  TransferControl<std::mutex> tc;      // in-flight transfer coordination
};
// Aggregates a batch of per-unit I/O operations: deduplicates work via each
// unit's TransferControl, waits for transfers started by others, and runs
// the callbacks for units transferred by this helper. Non-copyable and
// non-movable (owns a BatchPromise).
template <typename Unit>
struct IO_Helper {
  // Starts at 1; finish_add_taks() releases the registration hold so the
  // future only fires after all tasks were added AND completed.
  BatchPromise batch_promise;
  std::function<void(Unit*)> call_back_on_unit = nullptr;  // per-unit completion hook
  std::function<void()> call_back = nullptr;               // whole-batch completion hook
  std::vector<std::shared_future<void>> futs;  // futures of transfers started by others
  std::vector<Unit*> units_by_myself;          // units this helper must transfer

  IO_Helper(std::function<void(Unit*)> call_back_on_unit, std::function<void()> call_back = nullptr)
      : batch_promise(1), call_back_on_unit(call_back_on_unit), call_back(call_back) {}
  IO_Helper(const IO_Helper& other) = delete;
  IO_Helper& operator=(const IO_Helper& other) = delete;
  IO_Helper(IO_Helper&& other) = delete;
  IO_Helper& operator=(IO_Helper&& other) = delete;
  ~IO_Helper() {
    // std::cout<<"Destory IO helper"<<std::endl;
  }

  size_t total_task_count = 0;
  // Register `count` pending completions with the batch promise.
  // NOTE(review): total_task_count grows by 1 while the promise grows by
  // `count` — confirm whether the statistic should also add `count`.
  void new_task(size_t count = 1) {
    total_task_count += 1;
    batch_promise.inc(count);
  }
  // Drop the initial registration hold (see batch_promise comment).
  // (Name keeps the original "taks" spelling: it is part of the interface.)
  void finish_add_taks() { batch_promise.set(); }

  // Inspect a unit's transfer state. Returns true when *this* helper must
  // perform the transfer; false when the unit already has data or another
  // in-flight transfer was recorded (its future is queued for wait()).
  bool absorb_tc(Unit* unit, TransferControl<std::mutex>& tc) {
    auto [ok, fut] = tc.has_data_or_transfer(batch_promise.get_shared_fut());
    if (ok) {
      return false;
    } else {
      if (fut.has_value()) {
        futs.push_back(fut.value());
        // printf("Transfer started\n");
        return false;
      } else {
        units_by_myself.push_back(unit);
        // printf("Not Transfer\n");
        return true;
      }
    }
  }

  // Block until all foreign transfers and this helper's own batch finish,
  // then run the per-unit and whole-batch callbacks.
  void wait() {
    for (auto& fut : futs) {
      fut.wait();
    }
    batch_promise.get_shared_fut().wait();
    for (auto& b : units_by_myself) {
      call_back_on_unit(b);
    }
    if (call_back)
      call_back();
  }
};
csrc/balance_serve/kvc2/src/kvc2.h
0 → 100644
View file @
877aec85
#pragma once
#include <torch/torch.h>
#include <cstdint>
#include <optional>
#include <vector>
#include "defs.h"
#include "model_config.h"
namespace
kvc2
{
// Configuration for the GPU page cache (see GPUPageCache).
struct GPUPageCacheConfig {
  bool gpu_only;                       // manage pages purely on GPU (no CPU/disk tiers)
  std::vector<size_t> gpu_devices_id;  // CUDA device ids the cache spans
  size_t layer_count;                  // hidden layers
  size_t total_kvcache_pages;          // pages per layer
  size_t num_token_per_page;
  size_t num_k_heads;
  size_t k_head_dim;
  bool full_kv_cache_on_each_gpu = false;  // replicate instead of sharding heads across GPUs
  bool k_cache_on = true;
  bool v_cache_on = true;
  torch::ScalarType tensor_type;  // dtype of the cache tensors

  // for cuda stream manager
  size_t num_streams_per_device = 4;
};
// Top-level configuration for a KVC2 instance.
struct KVC2Config {
  bool k_cache_on = true;      // maintain the key cache
  bool v_cache_on = true;      // maintain the value cache
  bool gpu_only = false;       // keep cache only on GPU (skip CPU pool / disk)
  bool load_from_disk = true;  // load persisted cache in load()
  bool save_to_disk = true;    // persist cache in save()
  std::string path;            // storage location of the persisted kvcache
  std::string config_path;     // location of model/quant config files
  TokenLength num_token_per_page = 256;  // tokens per cache page
  // CPU memory pool size in bytes. Was `10e9` (a double implicitly converted
  // to size_t); an integer literal of the same value avoids the conversion.
  size_t memory_pool_size = 10'000'000'000;
  size_t evict_count = 20;     // entries evicted per eviction round
  std::optional<GPUPageCacheConfig> gpu_cache_config = std::nullopt;
  // Port of the metrics endpoint. Bug fix: previously uninitialized, so a
  // default-constructed config read an indeterminate value; default to 0.
  size_t metrics_port = 0;
  double recompute_ratio = 0.2;  // fraction of matched prefix to recompute -- TODO confirm
};
class
DoubleCacheHandleInterface
;
// Main KVC2 service interface: persistent KV-cache storage backed by an
// in-memory pool and an optional GPU page cache.
class KVC2Interface {
 public:
  virtual ~KVC2Interface() = default;

  // Load persisted cache state (see KVC2Config::load_from_disk / path).
  virtual void load() = 0;
  // Persist cache state (see KVC2Config::save_to_disk / path).
  virtual void save() = 0;

  /*
  Raw Insert
  Insert kvcache from kvcache_data to disk.
    info: cache info
    id: start pointer of token array
    length: length of token array
    kvcache_data: data of kvcache
  This will firstly match the ID array with the existing kvcache, and then insert the unmatched kvcache to disk.
 */
  virtual void raw_insert(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                          const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;

  /*
  Raw Read
  Read kvcache from disk to user specified pointers.
    info: cache info
    id: start pointer of token array
    length: length of token array
    kvcache_data: data of kvcache
    Return: matched length of prefix, in tokens
  This will not read from memory pool, it directly read from disk.
  */
  virtual TokenLength raw_read(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                               const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;

  /*
  Lookup
  Lookup kvcache and load it from disk to memory pool if needed.
    info: cache info
    id: start pointer of token array
    length: length of token array
    Return: kvc2_handle, holds kvcache until being released.
    if not found, matched_length will return 0.
    if memory pool is full, return nullptr
  */
  virtual std::shared_ptr<DoubleCacheHandleInterface> lookup(ModelName model_name, QuantType quant_type, Token* id,
                                                             TokenLength length, TokenLength estimated_length) = 0;

  /*
  Lookup and allocate to gpu
  info.is_k_cache does not matter here
  */
  virtual std::shared_ptr<DoubleCacheHandleInterface> lookup_to_gpu(ModelName model_name, QuantType quant_type,
                                                                    Token* id, TokenLength length,
                                                                    TokenLength estimated_length) = 0;

  // Asynchronous variant of lookup_to_gpu: call_back receives the resulting
  // handle when the lookup completes (nullptr semantics as in lookup -- TODO confirm).
  virtual void lookup_to_gpu_async(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                                   TokenLength estimated_length,
                                   std::function<void(std::shared_ptr<DoubleCacheHandleInterface>)> call_back) = 0;

  // Presumably returns the (k, v) tensor lists of the GPU page cache -- TODO confirm.
  virtual std::pair<std::vector<torch::Tensor>, std::vector<torch::Tensor>> get_kvcache() = 0;

  virtual void debug() = 0;
};
// Factory: create a KVC2 instance from the given configuration.
std::shared_ptr<KVC2Interface> create_kvc2(KVC2Config config);

// Match status of a lookup; presumably reported per cache block -- TODO
// confirm the exact semantics of the NotMatch* variants.
enum MatchStatus {
  Exact,
  Partial,
  NotMatchExact,
  NotMatchPartial,
};
// Handle returned by lookup*: pins the matched kvcache until released.
class DoubleCacheHandleInterface {
 public:
  virtual ~DoubleCacheHandleInterface() = default;

  // Length (in tokens) of the matched prefix.
  virtual TokenLength matched_length() = 0;
  // Match status per block of the looked-up sequence.
  virtual std::vector<MatchStatus> matched_status() = 0;
  // Per-layer data pointers of the key (is_key_cache) or value cache.
  virtual std::vector<layer_data> handle_data(bool is_key_cache) = 0;
  // Move/attach this handle's pages to GPU; returns success -- TODO confirm.
  virtual bool to_gpu() = 0;
  // Async variant; call_back receives the success flag.
  virtual void to_gpu_async(std::function<void(bool)> call_back) = 0;
  // Indices of GPU blocks backing this handle.
  virtual std::vector<size_t> get_gpu_block_idx() = 0;
  // Indices of additionally attached GPU blocks -- TODO confirm semantics.
  virtual std::vector<size_t> get_gpu_attached_block_idx() = 0;
  virtual void append_tokens(Token* tokens, TokenLength length) = 0;  // update generated tokens

  virtual void debug() = 0;
};
};
// namespace kvc2
csrc/balance_serve/kvc2/src/kvc2_utils.py
0 → 100644
View file @
877aec85
import
torch
import
ctypes
def aligned_tensor(size, alignment=4096):
    """Allocate `size` bytes of `alignment`-aligned memory via posix_memalign
    and wrap it in an int8 torch tensor sharing the buffer.

    Returns (tensor, mem): `mem` is the ctypes.c_void_p owning the buffer and
    must later be released with libc free (see dealloc_aligned_cache).

    Raises MemoryError if allocation fails, ValueError if the tensor is not
    aligned (bug fix: the buffer is now freed before raising, where it
    previously leaked on this defensive error path).
    """
    libc = ctypes.CDLL(None)
    mem = ctypes.c_void_p()
    error_code = libc.posix_memalign(
        ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(size)
    )
    if error_code != 0:
        raise MemoryError(f"posix_memalign failed with error code {error_code}")

    raw_array = (ctypes.c_int8 * size).from_address(mem.value)
    tensor = torch.frombuffer(raw_array, dtype=torch.int8)

    if tensor.data_ptr() % alignment != 0:
        libc.free(mem)  # do not leak the buffer on failure
        raise ValueError(
            f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes"
        )
    return tensor, mem
def alloc_aligned_cache(layer_count, block_count, element_size):
    """Allocate layer_count x block_count page-aligned int8 tensors of
    element_size bytes each.

    Returns (cache, cache_mem): nested lists of tensors and of the raw
    pointers that must later be passed to dealloc_aligned_cache.
    """
    cache = []
    cache_mem = []
    for _ in range(layer_count):
        allocations = [
            aligned_tensor(element_size, alignment=4096) for _ in range(block_count)
        ]
        cache.append([tensor for tensor, _ in allocations])
        cache_mem.append([mem for _, mem in allocations])
    return cache, cache_mem
def dealloc_aligned_cache(cache_mem):
    """Release every raw pointer produced by alloc_aligned_cache."""
    libc = ctypes.CDLL(None)  # hoisted: one CDLL lookup instead of one per pointer
    for layer_mem in cache_mem:
        for mem_ptr in layer_mem:
            libc.free(mem_ptr)
def get_tensor_ptr(tensors):
    """Collect data_ptr() of every tensor, preserving the nested
    layer/block layout of `tensors`."""
    return [[block.data_ptr() for block in layer] for layer in tensors]
def get_tensor_from_data_ptr(matched_data, element_size):
    """Rewrap raw data pointers as int8 tensors of `element_size` bytes,
    mirroring the nested layer/block layout of `matched_data`.

    The returned tensors alias the original memory; the caller keeps
    ownership of the underlying buffers.
    """
    def _wrap(data_ptr):
        buffer = (ctypes.c_int8 * element_size).from_address(data_ptr)
        return torch.frombuffer(buffer, dtype=torch.int8)

    return [[_wrap(ptr) for ptr in layer] for layer in matched_data]
if
__name__
==
"__main__"
:
pass
\ No newline at end of file
csrc/balance_serve/kvc2/src/metrics.cpp
0 → 100644
View file @
877aec85
#include "metrics.h"
namespace
kvc2
{
// Register every metric family with the registry and start exposing it on
// config.endpoint.
// Refactor: the original repeated the BuildCounter/BuildHistogram/BuildGauge
// chain fifteen times; the three local helpers below register exactly the
// same names, help texts, label sets, buckets and in the same order.
Metrics::Metrics(const MetricsConfig& config)
    : registry_(std::make_shared<prometheus::Registry>()), exposer_(config.endpoint) {
  // Shared histogram buckets, capped at 10000 ms (10 s).
  const std::vector<double> common_buckets = {1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0, 5000.0, 10000.0};

  // Register a counter named METRIC_PREFIX + suffix and return its single
  // unlabeled series (owned by registry_).
  auto counter = [&](const std::string& suffix, const std::string& help) {
    return &prometheus::BuildCounter().Name(std::string(METRIC_PREFIX) + suffix).Help(help).Register(*registry_).Add({});
  };
  // Same for a histogram using the shared buckets.
  auto histogram = [&](const std::string& suffix, const std::string& help) {
    return &prometheus::BuildHistogram()
                .Name(std::string(METRIC_PREFIX) + suffix)
                .Help(help)
                .Register(*registry_)
                .Add({}, common_buckets);
  };
  // Register a gauge family; callers Add() labeled series later.
  auto gauge_family = [&](const std::string& suffix, const std::string& help) {
    return &prometheus::BuildGauge().Name(std::string(METRIC_PREFIX) + suffix).Help(help).Register(*registry_);
  };

  prefix_nodes = counter("_prefix_nodes", "Number of prefix nodes");
  prefix_block_count = counter("_prefix_block_count", "Number of prefix blocks");

  raw_insert_time_ms = histogram("_raw_insert_time_ms", "function raw insert's time in milliseconds");
  lookup_time_ms = histogram("_lookup_time_ms", "function lookup's time in milliseconds");
  // NOTE(review): these two reuse the millisecond time buckets although they
  // record lengths/percentages; kept as in the original.
  lookup_prefixmatch_length = histogram("_lookup_prefixmatch_length", "function lookup's prefix match length");
  matched_length_percentage = histogram("_matched_length_percentage", "function matched length percentage");

  disk_usage = &gauge_family("_disk_usage", "disk usage")->Add({});

  memory_pool_size_family_ = gauge_family("_memory_pool_size", "memory pool size");
  memory_pool_node_count_family_ = gauge_family("_memory_pool_node_count", "memory pool node count");
  lru_entry_count_family_ = gauge_family("_lru_entry_count", "lru entry count");
  gpu_page_count_family_ = gauge_family("_gpu_page_count", "gpu page count");

  append_tokens_time_ms = histogram("_append_tokens_time_ms", "append tokens time in milliseconds");
  gpu_flush_back_time_ms = histogram("_gpu_flush_back_time_ms", "gpu flush back time in milliseconds");
  cpu_flush_back_time_ms = histogram("_cpu_flush_back_time_ms", "cpu flush back time in milliseconds");

  exposer_.RegisterCollectable(registry_);
}
// Destructor. The exposer is shut down by its own destructor -- presumably;
// the explicit Stop() below was left disabled in the original.
Metrics::~Metrics() {
  // exposer_.Stop();
}
// Get (or create) the memory_pool_size gauge for the given "type" label.
prometheus::Gauge* Metrics::memory_pool_size(const std::string& type) {
  return &memory_pool_size_family_->Add({{"type", type}});
}
// Get (or create) the memory_pool_node_count gauge for the given "type" label.
prometheus::Gauge* Metrics::memory_pool_node_count(const std::string& type) {
  return &memory_pool_node_count_family_->Add({{"type", type}});
}
// Get (or create) the lru_entry_count gauge for the given "type" label.
prometheus::Gauge* Metrics::lru_entry_count(const std::string& type) {
  return &lru_entry_count_family_->Add({{"type", type}});
}
// Get (or create) the gpu_page_count gauge for the given "type" label.
// NOTE(review): takes std::string by value, unlike the const& siblings above;
// aligning it requires changing the declaration in metrics.h in lockstep.
prometheus::Gauge* Metrics::gpu_page_count(std::string type) {
  return &gpu_page_count_family_->Add({{"type", type}});
}
// Begin a timed scope: remember the target histogram and start the timer.
TimeObserver::TimeObserver(prometheus::Histogram* h) : histogram_(h) {
  timer_.start();
}
// End of the timed scope: stop the timer and record the elapsed time.
TimeObserver::~TimeObserver() {
  timer_.stop();
  histogram_->Observe(timer_.elapsedNs() / 1e6);  // ns -> ms
}
}
// namespace kvc2
\ No newline at end of file
csrc/balance_serve/kvc2/src/metrics.h
0 → 100644
View file @
877aec85
#pragma once
#include <atomic>
#include <chrono>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include "prometheus/counter.h"
#include "prometheus/exposer.h"
#include "prometheus/gauge.h"
#include "prometheus/histogram.h"
#include "prometheus/registry.h"
#include "utils/timer.hpp"
namespace
kvc2
{
// 指标前缀宏定义
#define METRIC_PREFIX "kvc2"
// Configuration for the metrics exposer.
struct MetricsConfig {
  std::string endpoint;  // listen endpoint, e.g. "0.0.0.0:8080"
};
// Prometheus metrics for KVC2: owns a registry, serves it over HTTP via an
// Exposer, and hands out non-owning pointers to individual time series.
class Metrics {
 public:
  // Construct from a MetricsConfig: registers all metric families and starts
  // exposing them on config.endpoint.
  Metrics(const MetricsConfig& config);
  ~Metrics();

  // Non-copyable, non-assignable.
  Metrics(const Metrics&) = delete;
  Metrics& operator=(const Metrics&) = delete;

  // Metric series pointers (owned by registry_; do not delete).
  prometheus::Counter* prefix_nodes;        // number of prefix nodes
  prometheus::Counter* prefix_block_count;  // number of prefix blocks
  prometheus::Histogram* raw_insert_time_ms;         // raw insert duration (ms)
  prometheus::Histogram* lookup_time_ms;             // lookup duration (ms)
  prometheus::Histogram* lookup_prefixmatch_length;  // lookup prefix-match length
  prometheus::Histogram* matched_length_percentage;  // matched-length percentage

  prometheus::Gauge* disk_usage;  // disk usage

  // Labeled gauges: each call returns the series for the given "type" label.
  prometheus::Gauge* memory_pool_size(const std::string& type);
  prometheus::Gauge* memory_pool_node_count(const std::string& type);
  prometheus::Gauge* lru_entry_count(const std::string& type);
  prometheus::Gauge* gpu_page_count(std::string type);

  prometheus::Histogram* append_tokens_time_ms;   // append-tokens duration (ms)
  prometheus::Histogram* gpu_flush_back_time_ms;  // GPU flush-back duration (ms)
  prometheus::Histogram* cpu_flush_back_time_ms;  // CPU flush-back duration (ms)

 private:
  std::shared_ptr<prometheus::Registry> registry_;  // storage for all series
  prometheus::Exposer exposer_;                     // HTTP metrics endpoint

  // Gauge families backing the labeled accessors above.
  prometheus::Family<prometheus::Gauge>* memory_pool_size_family_;
  prometheus::Family<prometheus::Gauge>* memory_pool_node_count_family_;
  prometheus::Family<prometheus::Gauge>* lru_entry_count_family_;
  prometheus::Family<prometheus::Gauge>* gpu_page_count_family_;
};
// RAII scope timer: starts a timer on construction and records the elapsed
// time (milliseconds; see metrics.cpp) into the histogram on destruction.
class TimeObserver {
 public:
  TimeObserver(prometheus::Histogram* h);
  ~TimeObserver();

 private:
  Timer timer_;                      // measures the scope's lifetime
  prometheus::Histogram* histogram_; // non-owning; outlives this observer
};
}
// namespace kvc2
\ No newline at end of file
csrc/balance_serve/kvc2/src/model_config.h
0 → 100644
View file @
877aec85
#ifndef __MODEL_CONFIG_HPP_
#define __MODEL_CONFIG_HPP_
#include "nlohmann/json.hpp"
#include <iostream>
#include <filesystem>
#include <fstream>
// Basic aliases used throughout the model/quant configuration code.
using DimSize = size_t;         // tensor dimension size
using URL = std::string;        // reference link (see QuantConfig::reference)
using ModelName = std::string;  // key into model_configs
// We must assure this can be load by config.json
// (field names mirror the keys of a model's config.json so nlohmann can
// deserialize it directly).
class ModelConfig {
 public:
  DimSize hidden_size;
  DimSize intermediate_size;
  size_t max_position_embeddings;
  std::string model_type;
  size_t num_attention_heads;
  size_t num_hidden_layers;
  size_t num_key_value_heads;
  size_t vocab_size;

  NLOHMANN_DEFINE_TYPE_INTRUSIVE(ModelConfig, hidden_size, intermediate_size, max_position_embeddings, model_type,
                                 num_attention_heads, num_hidden_layers, num_key_value_heads, vocab_size);

  // Parse this config from the JSON file at `path`.
  // NOTE(review): no existence check (unlike load_model_configs below); with a
  // missing file the json extraction fails on the empty stream.
  void load_from(std::filesystem::path path) {
    std::cout << "Load from " << path << std::endl;
    std::ifstream i(path);
    nlohmann::json j;
    i >> j;
    *this = j.get<ModelConfig>();
  }
};
// A quantization scheme is identified by its name.
using QuantType = std::string;
// Sentinel meaning "no quantization".
// NOTE(review): `static` in a header gives each TU its own copy; `inline
// constexpr`/`inline const` would share one definition.
static const QuantType NoQuantType = "";
// Describes one quantization scheme (deserialized from the quant config file).
class QuantConfig {
 public:
  QuantType name;  // scheme identifier; also the key in quant_configs

  // For GEMV
  QuantType type_of_dot_vector = NoQuantType;  // vector quant type this matrix type dots with
  // A scheme is usable as the matrix side of a dot product only if it names
  // a partner vector type.
  inline bool can_be_used_as_matrix() { return type_of_dot_vector != NoQuantType; }
  bool can_be_used_as_vector;

  double bytes_per_element;    // storage cost per element (may be fractional)
  bool has_scale;              // block carries a scale factor
  bool has_min;                // block carries a minimum value
  size_t block_element_count;  // elements per quantization block
  size_t block_element_size;   // bytes per quantization block

  URL reference = "";  // link to the scheme's documentation

  NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(QuantConfig, name, type_of_dot_vector, can_be_used_as_vector,
                                              bytes_per_element, has_scale, has_min, block_element_count,
                                              block_element_size, reference);
};
// Global registries, populated by load_quant_configs / load_model_configs.
inline std::map<QuantType, QuantConfig> quant_configs;
inline std::map<ModelName, ModelConfig> model_configs;
inline
void
load_quant_configs
(
std
::
filesystem
::
path
path
)
{
nlohmann
::
json
j
;
if
(
std
::
filesystem
::
exists
(
path
))
{
std
::
cout
<<
__FUNCTION__
<<
" from "
<<
path
<<
std
::
endl
;
std
::
ifstream
i
(
path
);
i
>>
j
;
quant_configs
=
j
.
get
<
std
::
map
<
QuantType
,
QuantConfig
>>
();
std
::
cout
<<
"Loaded Quant Configs"
<<
std
::
endl
;
for
(
auto
&
[
k
,
v
]
:
quant_configs
)
{
std
::
cout
<<
" - "
<<
k
<<
std
::
endl
;
}
}
else
{
std
::
cout
<<
__FUNCTION__
<<
" no file at "
<<
path
<<
std
::
endl
;
}
}
inline
void
dump_quant_configs
(
std
::
filesystem
::
path
path
)
{
std
::
ofstream
o
(
path
);
nlohmann
::
json
j
=
quant_configs
;
o
<<
j
.
dump
(
4
);
}
inline
void
load_model_configs
(
std
::
filesystem
::
path
path
)
{
nlohmann
::
json
j
;
if
(
std
::
filesystem
::
exists
(
path
))
{
std
::
cout
<<
__FUNCTION__
<<
" from "
<<
path
<<
std
::
endl
;
std
::
ifstream
i
(
path
);
i
>>
j
;
model_configs
=
j
.
get
<
std
::
map
<
ModelName
,
ModelConfig
>>
();
std
::
cout
<<
"Loaded Model Configs"
<<
std
::
endl
;
for
(
auto
&
[
k
,
v
]
:
model_configs
)
{
std
::
cout
<<
" - "
<<
k
<<
std
::
endl
;
}
}
else
{
std
::
cout
<<
__FUNCTION__
<<
" no file at "
<<
path
<<
std
::
endl
;
}
}
inline
void
dump_model_configs
(
std
::
filesystem
::
path
path
)
{
std
::
ofstream
o
(
path
);
nlohmann
::
json
j
=
model_configs
;
o
<<
j
.
dump
(
4
);
}
#endif
\ No newline at end of file
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp
0 → 100644
View file @
877aec85
#include "page_aligned_memory_pool.h"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "utils/arithmetic.hpp"
#include "utils/easy_format.hpp"
/// Constructor: carve `size_in_bytes` (rounded down to whole pages) into
/// `Blocks` independently locked regions to reduce allocation contention.
PageAlignedMemoryPool::PageAlignedMemoryPool(size_t size_in_bytes) {
  total_size = (size_in_bytes / PageSize) * PageSize;
  // Aligned allocation via C++17 aligned operator new[]; if the compiler
  // lacks support, another method is needed. (translated)
  data = ::operator new[](total_size, std::align_val_t(PageSize));
  total_pages = total_size / PageSize;
  assert(total_pages >= Blocks);
  page_per_block = total_pages / Blocks;
  for (size_t block_index = 0; block_index < Blocks; block_index++) {
    first_page[block_index] = reinterpret_cast<void*>(reinterpret_cast<intptr_t>(data) +
                                                      static_cast<intptr_t>(block_index) * page_per_block * PageSize);
    // The last block absorbs the remainder pages, so it may be larger than
    // page_per_block.
    count_page[block_index] =
        block_index == Blocks - 1 ? (total_pages - page_per_block * (Blocks - 1)) : page_per_block;
    SPDLOG_DEBUG("first_page[{}] = {}, count_page[{}] = {}", block_index,
                 reinterpret_cast<intptr_t>(first_page[block_index]) - reinterpret_cast<intptr_t>(data), block_index,
                 count_page[block_index]);
    bitmap[block_index].resize(count_page[block_index], 0);  // 0 = free, 1 = in use
  }
  SPDLOG_INFO("PageAlignedMemoryPool with size {} Mbytes, {} pages", total_size / (1 << 20), page_count());
}
/// Destructor: release the backing storage.
PageAlignedMemoryPool::~PageAlignedMemoryPool() {
  if (data) {
    // Note: must match the aligned form of operator new[] used in the
    // constructor. (translated)
    ::operator delete[](data, std::align_val_t(PageSize));
    data = nullptr;
  }
}
/// Total number of pages managed by the pool.
size_t PageAlignedMemoryPool::page_count() {
  return total_size / PageSize;
}
/// Round `size` up to a whole number of pages, returned in bytes.
size_t PageAlignedMemoryPool::page_padded_size(size_t size) {
  return div_up(size, PageSize) * PageSize;
}
// Scan block `block_index` for `alloc_size` consecutive free pages.
// On success the pages are marked used and a pointer to the first page is
// returned; returns nullptr if no run of that length exists in this block.
void* PageAlignedMemoryPool::alloc_in_block(size_t block_index, size_t alloc_size) {
  std::lock_guard<std::mutex> guard(lock[block_index]);
  size_t free_pages = 0;  // length of the current run of free pages
  for (size_t i = 0; i < count_page[block_index]; i++) {
    if (bitmap[block_index][i] == 0) {
      free_pages++;
      if (free_pages == alloc_size) {
        size_t page_index = i + 1 - free_pages;  // first page of the run
        for (size_t page = page_index; page < page_index + alloc_size; page++) {
          bitmap[block_index][page] = 1;  // mark in use
          // SPDLOG_DEBUG("alloc page {} in block {}", page, block_index);
        }
        return reinterpret_cast<void*>(reinterpret_cast<intptr_t>(first_page[block_index]) + page_index * PageSize);
      }
    } else {
      free_pages = 0;  // run broken by an in-use page
    }
  }
  return nullptr;
}
/// Allocate `size` bytes, rounded up to whole pages.
/// Blocks are tried round-robin starting from an atomically advanced cursor
/// to spread contention; returns nullptr if no block can satisfy the request.
void* PageAlignedMemoryPool::alloc(size_t size) {
  size_t alloc_size = div_up(size, PageSize);  // pages needed
  auto cnt = now_block.fetch_add(1, std::memory_order_relaxed);
  for (size_t i = 0; i < Blocks; i++) {
    auto result = alloc_in_block((i + cnt) % Blocks, alloc_size);
    if (result != nullptr) {
      allocated.fetch_add(alloc_size * PageSize, std::memory_order_relaxed);
      alloc_count.fetch_add(1, std::memory_order_relaxed);
      return result;
    }
  }
  return nullptr;
}
/// 释放函数
void
PageAlignedMemoryPool
::
free
(
void
*
p
,
size_t
size
)
{
auto
alloc_size
=
div_up
(
size
,
PageSize
);
size_t
block_index
=
(
reinterpret_cast
<
intptr_t
>
(
p
)
-
reinterpret_cast
<
intptr_t
>
(
data
))
/
page_per_block
/
PageSize
;
size_t
page_index
=
(
reinterpret_cast
<
intptr_t
>
(
p
)
-
reinterpret_cast
<
intptr_t
>
(
first_page
[
block_index
]))
/
PageSize
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
lock
[
block_index
]);
for
(
size_t
page
=
page_index
;
page
<
page_index
+
alloc_size
;
page
++
)
bitmap
[
block_index
][
page
]
=
0
;
allocated
.
fetch_sub
(
alloc_size
*
PageSize
,
std
::
memory_order_relaxed
);
free_count
.
fetch_add
(
1
,
std
::
memory_order_relaxed
);
}
// TODO: too slow (each allocation rescans the block bitmaps).
// Allocate `count` independent regions of `size` bytes each, all-or-nothing:
// on any failure every region obtained so far is released and an empty
// vector is returned.
std::vector<void*> PageAlignedMemoryPool::alloc_multiple(size_t size, size_t count) {
  std::vector<void*> pages;
  pages.reserve(count);
  while (pages.size() < count) {
    void* page = alloc(size);
    if (page == nullptr) {
      // Roll back partial progress before reporting failure.
      for (void* acquired : pages) {
        free(acquired, size);
      }
      return {};
    }
    pages.push_back(page);
  }
  return pages;
}
/// Defragmentation is not implemented yet (intentional no-op).
void PageAlignedMemoryPool::defragment() {}
/// Human-readable usage summary for logging.
std::string PageAlignedMemoryPool::debug() {
  return fmt::format("PageAlignedMemoryPool: total_size: {}MB, allocated: {}, alloc/free count: {}/{}\n",
                     readable_number(total_size), readable_number(size_t(allocated)), size_t(alloc_count),
                     size_t(free_count));
}
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h
0 → 100644
View file @
877aec85
#pragma once
#include <assert.h>
#include <algorithm> // std::sort
#include <atomic>
#include <bitset>
#include <cstddef> // size_t
#include <mutex> // std::mutex
#include <vector>
constexpr size_t PageSize = 4096;

/// Thread-safe page-granular memory pool.
/// The backing storage is split into `Blocks` regions, each protected by its
/// own mutex and tracked by a per-page bitmap, so concurrent allocations in
/// different blocks do not contend on a single lock.
struct PageAlignedMemoryPool {
 private:
  constexpr static size_t Blocks = 16;  // number of independently locked regions

  void* data = nullptr;                    // page-aligned backing storage
  size_t total_size = 0, total_pages = 0;  // bytes / pages actually managed

  std::atomic_size_t now_block = 0;   // round-robin cursor used by alloc()
  std::atomic_size_t allocated = 0;   // allocated_size
  std::atomic_size_t alloc_count = 0; // successful alloc() calls
  std::atomic_size_t free_count = 0;  // free() calls

  std::mutex lock[Blocks];             // one lock per block
  size_t page_per_block = 0;           // pages per block (last block may hold more)
  void* first_page[Blocks];            // first page of each block
  size_t count_page[Blocks];           // page count of each block
  std::vector<int8_t> bitmap[Blocks];  // per-page flags: 0 = free, 1 = in use

  // Find `alloc_size` consecutive free pages inside one block.
  void* alloc_in_block(size_t block_index, size_t alloc_size);

 public:
  /// Constructor and destructor.
  explicit PageAlignedMemoryPool(size_t size_in_bytes);
  ~PageAlignedMemoryPool();

  /// Non-copyable and non-movable.
  PageAlignedMemoryPool(PageAlignedMemoryPool&& other) = delete;
  PageAlignedMemoryPool& operator=(PageAlignedMemoryPool&& other) = delete;
  PageAlignedMemoryPool(const PageAlignedMemoryPool& other) = delete;
  PageAlignedMemoryPool& operator=(const PageAlignedMemoryPool& other) = delete;

  /// Member functions.
  size_t page_count();                   // total pages managed
  size_t page_padded_size(size_t size);  // round `size` up to whole pages
  void* alloc(size_t size);              // allocate; nullptr on failure
  std::vector<void*> alloc_multiple(size_t size, size_t count);  // all-or-nothing batch
  void free(void* data, size_t size);    // release a prior allocation
  void defragment();                     // not implemented (no-op)
  // Usage summary string.
  // NOTE(review): std::string is used but <string> is not included here;
  // relies on a transitive include.
  std::string debug();
};
csrc/balance_serve/kvc2/src/prefix.cpp
0 → 100644
View file @
877aec85
This diff is collapsed.
Click to expand it.
csrc/balance_serve/kvc2/src/utils/all.hpp
0 → 100644
View file @
877aec85
#pragma once
#include "easy_format.hpp"
#include "timer.hpp"
\ No newline at end of file
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp
0 → 100644
View file @
877aec85
#include <memory>
#include <type_traits>
// Integer ceiling division: smallest q such that q * denominator >= numerator
// (for non-negative inputs). Both operands must be integral.
template <typename T, typename U>
T div_up(T numerator, U denominator) {
  static_assert(std::is_integral_v<T>);
  static_assert(std::is_integral_v<U>);
  return static_cast<T>((numerator + denominator - 1) / denominator);
}
// Return `t` advanced by `n` raw bytes (the result is still typed as T*).
template <typename T>
T* offset_by_bytes(T* t, size_t n) {
  auto raw = reinterpret_cast<size_t>(t);
  raw += n;
  return reinterpret_cast<T*>(raw);
}
csrc/balance_serve/kvc2/src/utils/easy_format.hpp
0 → 100644
View file @
877aec85
#ifndef __EASY_FORMAT_HPP_
#define __EASY_FORMAT_HPP_
#include <array>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>
// Join the elements of `v` with ", ".
// NOTE: preserves the original quirk that an empty vector renders as "[]"
// while a non-empty one has no surrounding brackets.
template <typename T>
inline std::string format_vector(const std::vector<T>& v) {
  if (v.empty())
    return "[]";
  std::ostringstream out;
  bool first = true;
  for (const auto& item : v) {
    if (!first)
      out << ", ";  // separator between elements
    first = false;
    out << item;
  }
  return out.str();
}
// SI unit suffixes, one per factor of 1000.
inline std::array<std::string, 7> units = {"", "K", "M", "G", "T", "P", "E"};

// Render `size` with two decimals and an SI suffix, e.g. 1500 -> "1.50K".
inline std::string readable_number(size_t size) {
  double value = size;
  size_t unit = 0;
  // Scale down by 1000 until the value fits or suffixes run out.
  while (value >= 1000 && unit < units.size() - 1) {
    value /= 1000;
    ++unit;
  }
  std::ostringstream out;
  out << std::fixed << std::setprecision(2) << value;
  return out.str() + units[unit];
}
#endif
\ No newline at end of file
Prev
1
2
3
4
5
6
…
13
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment