Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ktransformers
Commits
25cee581
"tests/nn/vscode:/vscode.git/clone" did not exist on "1c8d219d0a3e6364ded7d6970b755b06b7aa8e05"
Commit
25cee581
authored
Mar 31, 2025
by
Atream
Browse files
add balance-serve, support concurrence
parent
8d0292aa
Changes
196
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3424 additions
and
0 deletions
+3424
-0
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp
+135
-0
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh
+54
-0
csrc/balance_serve/kvc2/src/defs.h
csrc/balance_serve/kvc2/src/defs.h
+35
-0
csrc/balance_serve/kvc2/src/gpu_cache.cpp
csrc/balance_serve/kvc2/src/gpu_cache.cpp
+282
-0
csrc/balance_serve/kvc2/src/gpu_cache.hh
csrc/balance_serve/kvc2/src/gpu_cache.hh
+74
-0
csrc/balance_serve/kvc2/src/hasher.hpp
csrc/balance_serve/kvc2/src/hasher.hpp
+40
-0
csrc/balance_serve/kvc2/src/io_helper.hpp
csrc/balance_serve/kvc2/src/io_helper.hpp
+155
-0
csrc/balance_serve/kvc2/src/kvc2.h
csrc/balance_serve/kvc2/src/kvc2.h
+138
-0
csrc/balance_serve/kvc2/src/kvc2_utils.py
csrc/balance_serve/kvc2/src/kvc2_utils.py
+64
-0
csrc/balance_serve/kvc2/src/metrics.cpp
csrc/balance_serve/kvc2/src/metrics.cpp
+141
-0
csrc/balance_serve/kvc2/src/metrics.h
csrc/balance_serve/kvc2/src/metrics.h
+77
-0
csrc/balance_serve/kvc2/src/model_config.h
csrc/balance_serve/kvc2/src/model_config.h
+103
-0
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp
+123
-0
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h
+53
-0
csrc/balance_serve/kvc2/src/prefix.cpp
csrc/balance_serve/kvc2/src/prefix.cpp
+1746
-0
csrc/balance_serve/kvc2/src/utils/all.hpp
csrc/balance_serve/kvc2/src/utils/all.hpp
+3
-0
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp
+14
-0
csrc/balance_serve/kvc2/src/utils/easy_format.hpp
csrc/balance_serve/kvc2/src/utils/easy_format.hpp
+37
-0
csrc/balance_serve/kvc2/src/utils/lock_free_queue.hpp
csrc/balance_serve/kvc2/src/utils/lock_free_queue.hpp
+60
-0
csrc/balance_serve/kvc2/src/utils/mpsc.hpp
csrc/balance_serve/kvc2/src/utils/mpsc.hpp
+90
-0
No files found.
csrc/balance_serve/kvc2/src/cuda_stream_manager.cpp
0 → 100644
View file @
25cee581
#include "cuda_stream_manager.hh"
#include <cuda_runtime.h>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <vector>
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
// Creates `num_streams_per_device` CUDA streams and one worker thread for
// every device in `device_ids`.
//
// Throws std::runtime_error if a device cannot be selected or a stream cannot
// be created. NOTE(review): if that happens after some devices were already
// set up, their DeviceInfo destructors run with joinable threads — confirm
// MPSC shutdown semantics; preserved as-is here.
CudaStreamManager::CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device) {
  // Fix: iterate with size_t (was `int`, an implicit size_t -> int narrowing);
  // the conversion to the `int` CUDA expects is now explicit.
  for (size_t device_id : device_ids) {
    auto x = std::make_unique<DeviceInfo>();
    DeviceInfo& device_info = *x;
    device_info.device_id = static_cast<int>(device_id);
    device_info.next_stream_index = 0;
    device_info.stop_flag = false;
    // Select the device so the streams below are created on it.
    cudaError_t err = cudaSetDevice(device_info.device_id);
    if (err != cudaSuccess) {
      SPDLOG_WARN("cudaSetDevice failed on device {}: {}", device_id, cudaGetErrorString(err));
      throw std::runtime_error("cudaSetDevice failed");
    }
    // Create the per-device CUDA stream pool.
    device_info.streams.resize(num_streams_per_device);
    for (int i = 0; i < num_streams_per_device; ++i) {
      err = cudaStreamCreate(&device_info.streams[i]);
      if (err != cudaSuccess) {
        SPDLOG_WARN("Failed to create CUDA stream on device {}: {}", device_id, cudaGetErrorString(err));
        throw std::runtime_error("Failed to create CUDA stream");
      }
    }
    // Start the worker thread that drains this device's request queue.
    device_info.worker_thread = std::thread(&CudaStreamManager::deviceWorker, this, std::ref(device_info));
    devices_.push_back(std::move(x));
  }
}
// Shuts down all device workers: every queue receives a poison request (in
// addition to the stop flag), every worker thread is joined, and finally the
// CUDA streams of each device are destroyed.
CudaStreamManager::~CudaStreamManager() {
  // Phase 1: signal every worker to stop.
  for (auto& dev : devices_) {
    dev->stop_flag.store(true);
    auto exit_req = std::make_shared<Request>();
    exit_req->should_exit = true;
    dev->request_queue.enqueue(std::move(exit_req));
  }
  // Phase 2: join the workers, then tear down their streams.
  for (auto& dev : devices_) {
    if (dev->worker_thread.joinable()) {
      dev->worker_thread.join();
    }
    cudaSetDevice(dev->device_id);
    for (auto& s : dev->streams) {
      cudaStreamDestroy(s);
    }
  }
}
// Routes `request` to the queue of the device it names; throws if no managed
// device has that ID.
void CudaStreamManager::submitRequest(std::shared_ptr<Request> request) {
  for (auto& dev : devices_) {
    if (dev->device_id != request->device_id) {
      continue;
    }
    dev->request_queue.enqueue(request);
    return;
  }
  throw std::runtime_error("Invalid device ID in request");
}
// Per-device worker loop. Pins the thread to `device_info.device_id`, then
// consumes Requests from the device's queue until the stop flag is set or a
// poison (`should_exit`) request arrives.
//
// For each request: pick the next stream round-robin, enqueue one async copy
// per address pair (addresses are stored H2D-oriented; dst/src are swapped
// for D2H), then enqueue a host callback on the same stream so the request's
// callback fires after all copies on that stream complete.
void CudaStreamManager::deviceWorker(DeviceInfo& device_info) {
  // Bind this thread to the device; all stream work below assumes it.
  cudaError_t err = cudaSetDevice(device_info.device_id);
  if (err != cudaSuccess) {
    SPDLOG_WARN("cudaSetDevice failed in worker thread for device {}: {}", device_info.device_id,
                cudaGetErrorString(err));
    return;
  }
  while (device_info.stop_flag.load() == false) {
    auto request = device_info.request_queue.dequeue();  // blocks until work arrives
    if (request->should_exit) {
      return;  // poison pill enqueued by the destructor
    }
    // Process the request.
    SPDLOG_DEBUG("Getting request on device {}, count {}", device_info.device_id,
                 request->host_mem_addresses.size());
    // Round-robin stream selection.
    int stream_index = device_info.next_stream_index;
    cudaStream_t stream = device_info.streams[stream_index];
    device_info.next_stream_index = (device_info.next_stream_index + 1) % device_info.streams.size();
    size_t num_transfers = request->host_mem_addresses.size();
    for (size_t i = 0; i < num_transfers; ++i) {
      void* dst = request->device_mem_addresses[i];
      void* src = request->host_mem_addresses[i];
      if (request->direction == cudaMemcpyDeviceToHost) {
        std::swap(dst, src);  // flip roles for device-to-host copies
      }
      cudaError_t copy_err = cudaMemcpyAsync(dst, src, request->sizes[i], request->direction, stream);
      if (copy_err != cudaSuccess) {
        SPDLOG_WARN("cudaMemcpyAsync failed on device {}: {}", device_info.device_id, cudaGetErrorString(copy_err));
        // Best-effort: skip this transfer and continue with the rest.
        continue;
      }
    }
    // The callback must be heap-wrapped because cudaLaunchHostFunc only
    // carries a raw void* payload; the host function deletes the wrapper
    // after invoking the callback.
    struct CallbackData {
      std::function<void()> callback;
    };
    CallbackData* cb_data = new CallbackData{request->callback};
    err = cudaLaunchHostFunc(
        stream,
        [](void* data) {
          CallbackData* cb = static_cast<CallbackData*>(data);
          cb->callback();
          delete cb;
        },
        cb_data);
    if (err != cudaSuccess) {
      SPDLOG_WARN("cudaLaunchHostFunc failed on device {}: {}", device_info.device_id, cudaGetErrorString(err));
      // Fix: the host function will never run in this error path, so reclaim
      // the wrapper here instead of leaking it. Note the request's callback
      // is NOT invoked on failure — waiters must tolerate that (pre-existing
      // behavior).
      delete cb_data;
    }
  }
}
csrc/balance_serve/kvc2/src/cuda_stream_manager.hh
0 → 100644
View file @
25cee581
/*
* @Author: Xie Weiyu ervinxie@qq.com
* @Date: 2024-11-19 09:24:47
* @LastEditors: Xie Weiyu ervinxie@qq.com
* @LastEditTime: 2024-11-20 02:55:49
* @FilePath: /kvc2/src/cuda_stream_manager.hh
* @Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
*/
#pragma once
#include <cuda_runtime.h>
#include <atomic>
#include <functional>
#include <memory>
#include <thread>
#include <vector>
#include "utils/mpsc.hpp"
// Owns one worker thread plus a pool of CUDA streams per managed device and
// executes batched asynchronous host<->device copies, invoking a completion
// callback per batch (see cuda_stream_manager.cpp).
class CudaStreamManager {
 public:
  // Takes the device IDs to manage and how many streams to create per device.
  CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device);
  ~CudaStreamManager();

  // One batch of copy work for a single device. The address vectors and
  // `sizes` are parallel arrays; `callback` runs after the whole batch has
  // been processed on its stream.
  struct Request {
    bool should_exit = false;  // poison pill: tells the device worker to exit
    int device_id;
    std::vector<void*> host_mem_addresses;
    std::vector<void*> device_mem_addresses;
    std::vector<size_t> sizes;
    cudaMemcpyKind direction;  // H2D or D2H; addresses are stored H2D-oriented
    std::function<void()> callback;
  };

  void submitRequest(std::shared_ptr<Request> request);

 private:
  // Per-device state: worker thread, stream pool with its round-robin cursor,
  // and the MPSC queue the worker drains.
  struct DeviceInfo {
    int device_id;
    std::thread worker_thread;
    std::vector<cudaStream_t> streams;
    int next_stream_index;
    MPSCQueueConsumerLock<std::shared_ptr<Request>> request_queue;
    std::atomic_bool stop_flag;
  };

  // Device ID to DeviceInfo mapping.
  std::vector<std::unique_ptr<DeviceInfo>> devices_;

  // Worker loop body; one thread per device runs this.
  void deviceWorker(DeviceInfo& device_info);
};
csrc/balance_serve/kvc2/src/defs.h
0 → 100644
View file @
25cee581
#ifndef __DEFS_H_
#define __DEFS_H_
#include <cstdint>
#include <optional>
#include <vector>
#include "model_config.h"
namespace kvc2 {
// Opaque handle to a KVC2 instance.
using kvc2_ptr = void*;
// using data_block_ptr = std::intptr_t;
// Address of one raw cache block.
using data_block_ptr = void*;
// One layer's worth of cache-block pointers.
using layer_data = std::vector<data_block_ptr>;
using kvc2_handle = void*;
using Token = uint32_t;
using Tokens = std::vector<Token>;
using TokenPtr = std::intptr_t;
using TokenLength = size_t;  // length measured in tokens
using BlockLength = size_t;  // length measured in blocks

// Identifies one (model, K-or-V, quantization) cache and its on-disk layout.
struct CacheInfo {
  ModelName model_name;
  bool is_key_cache;  // true for the key cache, false for the value cache
  QuantType quant_type;

  size_t hidden_layer_count();
  // On-disk path of the cache, or of a single layer when `which_layer` is set.
  // NOTE(review): std::filesystem appears to be pulled in via model_config.h —
  // confirm, this header does not include <filesystem> itself.
  std::filesystem::path path(std::optional<size_t> which_layer = std::nullopt);
  bool operator==(const CacheInfo& other) const;
  // Size in bytes of one cache element spanning `block_length` tokens.
  size_t element_size(size_t block_length);
  size_t hash_value() const;
};
};  // namespace kvc2
#endif
csrc/balance_serve/kvc2/src/gpu_cache.cpp
0 → 100644
View file @
25cee581
#include "gpu_cache.hh"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "cache_entry.hh"
#include "utils/arithmetic.hpp"
namespace
kvc2
{
// Builds the paged GPU KV cache described by `config`:
//  - verifies CUDA is available and enough GPUs exist (exits otherwise —
//    NOTE(review): exits with status 0 even on error; preserved as-is),
//  - computes the 5-D tensor shape [layers, pages, tokens/page, heads,
//    head_dim]; heads are split across GPUs unless full_kv_cache_on_each_gpu,
//  - allocates one zero-initialized K and/or V tensor per GPU,
//  - sizes the per-layer occupation tables and the tensor-parallel
//    size/offset tables used to slice host blocks across GPUs,
//  - creates the CudaStreamManager used for all later transfers.
GPUPageCache::GPUPageCache(GPUPageCacheConfig& config) : config(config) {
  if (torch::cuda::is_available()) {
    size_t gpu_count = torch::cuda::device_count();
    SPDLOG_INFO("Number of available GPUs: {}, want {}", gpu_count, config.gpu_devices_id.size());
    if (gpu_count < config.gpu_devices_id.size()) {
      SPDLOG_ERROR("Not enough GPUs available.");
      exit(0);
    }
    for (auto x : config.gpu_devices_id) {
      gpu_devices.push_back(torch::Device(torch::kCUDA, x));
    }
  } else {
    SPDLOG_ERROR("CUDA is not available on this system.");
    exit(0);
  }
  SPDLOG_WARN("Creating GPU Cache");
  // Shape: [layer, page, token-in-page, head, head_dim].
  shape.push_back(config.layer_count);
  shape.push_back(config.total_kvcache_pages);
  shape.push_back(config.num_token_per_page);
  if (config.full_kv_cache_on_each_gpu) {
    if (config.gpu_devices_id.size() > 1) {
      SPDLOG_WARN("Replicated KVCache on multiple gpu");
    }
    shape.push_back(config.num_k_heads);
  } else {
    // Tensor-parallel: each GPU holds an equal slice of the heads.
    shape.push_back(config.num_k_heads / config.gpu_devices_id.size());
  }
  shape.push_back(config.k_head_dim);
  tensor_size = torch::elementSize(config.tensor_type);
  for (auto& s : shape) {
    tensor_size *= s;
  }
  SPDLOG_INFO("Creating KV Page Cache, Shape ({},{},{},{},{}), Size {} MiB", shape[0], shape[1], shape[2], shape[3],
              shape[4], tensor_size / (1 << 20));
  if (config.k_cache_on) {
    // One K tensor per GPU, created on CPU then moved to the device.
    for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
      auto k = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
      k = k.to(gpu_devices[i]);
      k_cache.push_back(k);
      SPDLOG_INFO("K Page Cache of GPU {} is created", config.gpu_devices_id[i]);
    }
    occupations.resize(config.layer_count);
  } else {
    SPDLOG_WARN("Disalbe K Cache");
    assert(config.gpu_only);
  }
  if (config.v_cache_on) {
    // One V tensor per GPU.
    for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
      auto v = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
      v = v.to(gpu_devices[i]);
      v_cache.push_back(v);
      SPDLOG_INFO("V Page Cache of GPU {} is created", config.gpu_devices_id[i]);
    }
    v_occupations.resize(config.layer_count);
  } else {
    SPDLOG_WARN("Disalbe V Cache");
    // assert(config.gpu_only); // should not assert
  }
  if (config.gpu_only) {
    gpu_only_occupations.resize(config.total_kvcache_pages, false);
  }
  num_free_pages = config.total_kvcache_pages;
  // Occupation tables: one entry pointer per (layer, page), null = free.
  for (size_t i = 0; i < config.layer_count; i++) {
    if (config.k_cache_on)
      occupations[i].resize(config.total_kvcache_pages, nullptr);
    if (config.v_cache_on)
      v_occupations[i].resize(config.total_kvcache_pages, nullptr);
  }
  // Per-GPU transfer size = page bytes for this GPU's head slice; offsets are
  // the prefix sums, i.e. where each GPU's slice starts inside a host block.
  tp_size.resize(config.gpu_devices_id.size(), shape[2] * shape[3] * shape[4] * c10::elementSize(config.tensor_type));
  tp_offset.resize(config.gpu_devices_id.size(), 0);
  for (size_t i = 1; i < tp_offset.size(); i++) {
    tp_offset[i] = tp_offset[i - 1] + tp_size[i - 1];
  }
  stream_manager =
      std::unique_ptr<CudaStreamManager>(new CudaStreamManager(config.gpu_devices_id, config.num_streams_per_device));
}
// Allocates one GPU page ("column") for the cache block at index `at` of the
// given per-layer entry tables. On success the page index is recorded in the
// layer-0 K entry's gpu_block_idx and every layer's entry is registered in
// the occupation tables. Returns false when no page could be obtained (even
// after eviction). Caller must already hold the individual entry locks.
bool GPUPageCache::alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at) {
  std::lock_guard<std::mutex> lg(lock);  // guards occupation tables and page counter
  auto idx = next_empty_col();
  if (idx.has_value()) {
    // must have entry lock
    // The layer-0 K entry carries the page index for the whole column.
    auto& k0_entry = k_entries[0][at];
    k0_entry->gpu_block_idx = idx;
    for (size_t l = 0; l < config.layer_count; l++) {
      if (config.k_cache_on) {
        assert(k_entries[l][at]->data != nullptr);
        occupations[l][idx.value()] = k_entries[l][at];
      }
      if (config.v_cache_on) {
        assert(v_entries[l][at]->data != nullptr);
        v_occupations[l][idx.value()] = v_entries[l][at];
      }
    }
    return true;
  } else {
    return false;
  }
}
// Reserves `count` free GPU-only pages. Returns their indices, or an empty
// vector (after a warning) when fewer than `count` pages are free. All-or-
// nothing: pages are only marked occupied when the full request fits.
std::vector<size_t> GPUPageCache::gpu_only_alloc_col(size_t count) {
  assert(config.gpu_only);
  std::lock_guard<std::mutex> lg(lock);
  std::vector<size_t> free_pages;
  for (size_t page = 0; page < config.total_kvcache_pages; ++page) {
    if (gpu_only_occupations[page]) {
      continue;
    }
    free_pages.push_back(page);
    if (free_pages.size() == count) {
      break;
    }
  }
  if (free_pages.size() != count) {
    SPDLOG_WARN("GPU ONLY: Cannot allocate {} cols", count);
    free_pages.clear();
    return free_pages;
  }
  for (size_t page : free_pages) {
    gpu_only_occupations[page] = true;
  }
  return free_pages;
}
// Returns previously allocated GPU-only pages to the free pool. Each index
// must currently be marked occupied.
void GPUPageCache::gpu_only_free_cols(std::vector<size_t> cols) {
  assert(config.gpu_only);
  std::lock_guard<std::mutex> lg(lock);
  for (const auto& page : cols) {
    assert(gpu_only_occupations[page]);
    gpu_only_occupations[page] = false;
  }
}
// Finds the next free GPU page, evicting if the free counter hits zero.
// Returns nullopt only when eviction freed nothing. Caller must hold `lock`.
// NOTE(review): correctness of the scan relies on the invariant that
// num_free_pages > 0 implies some occupations[0][x] == nullptr; otherwise the
// while loop below would spin forever — confirm all updaters keep these in sync.
std::optional<size_t> GPUPageCache::next_empty_col() {
  if (num_free_pages == 0) {
    evict_cols();
    if (num_free_pages == 0) {
      return std::nullopt;
    }
  }
  // Circular scan from the last handed-out index (clock-style allocation);
  // layer 0's K occupation row is the authority on whether a page is free.
  while (occupations[0][_col_idx] != nullptr) {
    _col_idx = (_col_idx + 1) % config.total_kvcache_pages;
  }
  num_free_pages -= 1;
  return _col_idx;
}
// Scans every page and releases those whose owning entry allows desertion
// (gpu_cc.can_desert()), resetting the entry's transfer state and clearing
// the page. Caller must hold `lock`.
// NOTE(review): only occupations[0][i] is cleared here; other layers' rows
// (and v_occupations) keep their stale pointers until alloc_col overwrites
// them on reuse — presumably intentional since layer 0 is the free/used
// authority, but confirm.
void GPUPageCache::evict_cols() {
  auto evicted_count = 0;
  for (size_t i = 0; i < config.total_kvcache_pages; i++) {
    auto& h = occupations[0][i];
    if (h == nullptr) {
      continue;
    }
    auto lg = h->lock_guard();  // per-entry lock while mutating its state
    if (h->gpu_cc.can_desert()) {
      h->gpu_cc.tc.reset();  // back to "no data, no transfer"
      h = nullptr;           // frees the page (h aliases occupations[0][i])
      num_free_pages += 1;
      evicted_count += 1;
    }
  }
  if (evicted_count > 0)
    SPDLOG_INFO("GPU: Evicted {} GPU pages", evicted_count);
}
// Attempts to lock every entry occupying column `at` across all layers of
// the K table (if enabled) and then the V table (if enabled). Returns the
// collected locks on success; returns an empty vector — releasing anything
// already acquired — if any slot is empty or any try_lock fails.
std::vector<std::unique_lock<CacheBlockEntry::MutexT>> GPUPageCache::try_lock_col(size_t at) {
  std::vector<std::unique_lock<CacheBlockEntry::MutexT>> acquired;
  // Shared routine for one occupation table; returns false on the first
  // missing entry or lock failure.
  auto lock_layers = [&](auto& table) {
    for (size_t layer = 0; layer < config.layer_count; ++layer) {
      auto& entry = table[layer][at];
      if (entry == nullptr) {
        return false;
      }
      auto guard = entry->try_lock();
      if (!guard.owns_lock()) {
        return false;
      }
      acquired.push_back(std::move(guard));
    }
    return true;
  };
  if (config.k_cache_on && !lock_layers(occupations)) {
    return {};
  }
  if (config.v_cache_on && !lock_layers(v_occupations)) {
    return {};
  }
  return acquired;
}
// Builds one empty CudaStreamManager::Request per managed GPU, pre-filled
// with the copy direction, the target device ID, and the shared completion
// callback. Address/size lists are appended later (append_col_to_request).
std::vector<std::shared_ptr<CudaStreamManager::Request>> GPUPageCache::basic_request(
    cudaMemcpyKind direction, std::function<void()> callback) {
  std::vector<std::shared_ptr<CudaStreamManager::Request>> requests;
  requests.reserve(config.gpu_devices_id.size());
  for (size_t gpu = 0; gpu < config.gpu_devices_id.size(); ++gpu) {
    auto req = std::make_shared<CudaStreamManager::Request>();
    req->direction = direction;
    req->device_id = config.gpu_devices_id[gpu];
    req->callback = callback;
    requests.push_back(std::move(req));
  }
  return requests;
}
// Hands every per-GPU request to the stream manager for asynchronous execution.
void GPUPageCache::submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs) {
  for (auto& request : reqs) {
    stream_manager->submitRequest(request);
  }
}
// Appends the transfer list for column `at` to the per-GPU requests built by
// basic_request: for every layer and every GPU, one (host address, device
// address, size) triple per enabled cache (K and/or V). Host addresses are
// offset by tp_offset so each GPU receives only its head slice; device
// addresses point at tensor [layer][gpu_block_idx] of that GPU's cache.
// No-op when both caches are disabled. The column's GPU page index must
// already be set (alloc_col) on the layer-0 K handle.
void GPUPageCache::append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
                                         std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
                                         std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles,
                                         size_t at) {
  if (config.k_cache_on == false && config.v_cache_on == false) {
    return;
  }
  auto gpu_block_idx = k_handles[0][at]->gpu_block_idx.value();
  for (size_t layer = 0; layer < config.layer_count; layer++) {
    for (size_t which_gpu = 0; which_gpu < config.gpu_devices_id.size(); which_gpu++) {
      if (config.k_cache_on) {
        assert(k_handles[layer][at]->data != nullptr);
        reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
        reqs[which_gpu]->host_mem_addresses.push_back(
            offset_by_bytes(k_handles[layer][at]->data, tp_offset[which_gpu]));
        reqs[which_gpu]->device_mem_addresses.push_back(k_cache[which_gpu][layer][gpu_block_idx].data_ptr());
      }
      if (config.v_cache_on) {
        assert(v_handles[layer][at]->data != nullptr);
        reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
        reqs[which_gpu]->host_mem_addresses.push_back(
            offset_by_bytes(v_handles[layer][at]->data, tp_offset[which_gpu]));
        reqs[which_gpu]->device_mem_addresses.push_back(v_cache[which_gpu][layer][gpu_block_idx].data_ptr());
      }
    }
  }
  // SPDLOG_DEBUG("GPU: Appended Vertical Handle to Request, count {}", reqs[0]->sizes.size());
}
// Logs how many GPU pages are currently free, judged by layer 0's K
// occupation row (the free/used authority).
void GPUPageCache::debug() {
  size_t free_count = 0;
  for (size_t page = 0; page < config.total_kvcache_pages; page++) {
    if (occupations[0][page] != nullptr) {
      // occupations[0][page]->gpu_cc.debug();
      continue;
    }
    free_count += 1;
  }
  SPDLOG_DEBUG("Free Page: {}/{}", free_count, config.total_kvcache_pages);
}
}
// namespace kvc2
csrc/balance_serve/kvc2/src/gpu_cache.hh
0 → 100644
View file @
25cee581
#ifndef __GPU_CACHE_HH_
#define __GPU_CACHE_HH_
#include <torch/torch.h>
#include "cache_entry.hh"
#include "cuda_stream_manager.hh"
#include "defs.h"
#include "kvc2.h"
#include "metrics.h"
#include "utils/periodic_task.hpp"
namespace
kvc2
{
// Paged KV cache resident on one or more GPUs. Pages ("columns") are shared
// across all layers: a column holds one block per layer for K and/or V.
// Transfers between host blocks and GPU tensors go through CudaStreamManager.
class GPUPageCache {
  std::vector<torch::Device> gpu_devices;
  // Tensor shape [layers, pages, tokens/page, heads(-per-gpu), head_dim].
  std::vector<int64_t> shape;
  size_t tensor_size;  // bytes of one full cache tensor
  // Tensor-parallel slicing of a host block: byte offset and size per GPU.
  std::vector<size_t> tp_offset;
  std::vector<size_t> tp_size;
  // met
  std::shared_ptr<Metrics> met;
  // states
  std::mutex lock;  // guards the occupation tables and counters below
  size_t num_free_pages;
  std::vector<bool> gpu_only_occupations;  // per-page used flag (gpu_only mode)
  // Per-(layer, page) owning entry; nullptr = free. Layer 0 of `occupations`
  // is the authority for page allocation.
  std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> occupations, v_occupations;
  size_t _col_idx = 0;  // clock hand for the circular free-page scan
  // cuda stream manager
  std::optional<size_t> next_empty_col();

 public:
  GPUPageCacheConfig config;
  std::unique_ptr<CudaStreamManager> stream_manager;
  std::vector<torch::Tensor> k_cache;  // one tensor per GPU
  std::vector<torch::Tensor> v_cache;  // one tensor per GPU
  std::unique_ptr<periodic::PeriodicTask> background_flush_back = nullptr;

  GPUPageCache(GPUPageCacheConfig& config);

  // GPU-only page bookkeeping (no host backing store).
  std::vector<size_t> gpu_only_alloc_col(size_t count);
  void gpu_only_free_cols(std::vector<size_t> cols);
  void gpu_background_flush();

  // Allocate a column for the entries at index `at`; see gpu_cache.cpp.
  bool alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
                 std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at);
  void evict_cols();
  void flush_col(size_t at);
  // Try to lock every entry in column `at`; empty result means failure.
  std::vector<std::unique_lock<CacheBlockEntry::MutexT>> try_lock_col(size_t at);
  void free_col(size_t at);

  // Build/submit per-GPU transfer requests.
  std::vector<std::shared_ptr<CudaStreamManager::Request>> basic_request(cudaMemcpyKind direction,
                                                                         std::function<void()> callback);
  void submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs);
  void append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
                             std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles, size_t at);
  void debug();
};
}
// namespace kvc2
#endif
\ No newline at end of file
csrc/balance_serve/kvc2/src/hasher.hpp
0 → 100644
View file @
25cee581
#ifndef __HASHER_HPP_
#define __HASHER_HPP_
#include "defs.h"
#include "xxhash.h"
namespace
kvc2
{
// Seeds for the two hash domains used by kvc2 (content hash vs. check hash).
const uint64_t hash_seed = 4123512;
const uint64_t check_hash_seed = 1025753;
using TokensHash = XXH64_hash_t;

// Incremental XXH64 hasher over token sequences. Non-copyable and
// non-movable because it owns a raw XXH64 state.
struct TokensHasher {
  XXH64_state_t* state;
  TokensHasher() {
    state = XXH64_createState();
    reset();
  }
  ~TokensHasher() { XXH64_freeState(state); }
  TokensHasher(TokensHasher& other) = delete;
  TokensHasher& operator=(TokensHasher& other) = delete;
  TokensHasher(TokensHasher&& other) = delete;
  TokensHasher& operator=(TokensHasher&& other) = delete;
  // Current digest of everything fed in since the last reset().
  TokensHash get() { return XXH64_digest(state); }
  // Restart hashing, optionally with a different seed.
  void reset(size_t seed = hash_seed) { XXH64_reset(state, seed); }
  // Feed `length` tokens and return the updated digest.
  TokensHash update(Token* data, TokenLength length) {
    XXH64_update(state, data, length * sizeof(Token));
    return get();
  }
  // Feed arbitrary bytes and return the updated digest.
  TokensHash update_raw(void* data, size_t size) {
    XXH64_update(state, data, size);
    return get();
  }
  // One-shot hash of a token array with the default seed.
  static TokensHash hash(Token* data, TokenLength length) { return XXH64(data, length * sizeof(Token), hash_seed); }
};
}
// namespace kvc2
#endif
\ No newline at end of file
csrc/balance_serve/kvc2/src/io_helper.hpp
0 → 100644
View file @
25cee581
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-12-11 06:35:31
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-12-11 06:50:55
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <atomic>
#include <future>
#include <iostream>
#include <mutex>
#include <optional>
#include <string>
#include <vector>
/// Counts down over a batch of tasks and fulfills a shared future when the
/// last outstanding task calls set(). Start the counter at 1 plus a final
/// set() (or rely on inc()/set() pairing) so the future cannot fire early.
struct BatchPromise {
  std::promise<void> promise;    // fulfilled exactly once, by the final set()
  std::shared_future<void> fut;  // shared view of `promise`, handed to waiters
  std::atomic_size_t count;      // number of outstanding completions

  inline BatchPromise(size_t count) : fut(promise.get_future().share()), count(count) {}

  /// Registers `count` additional outstanding completions.
  inline void inc(size_t count = 1) { this->count += count; }

  /// Records one completion; the call that drops the counter to zero
  /// fulfills the future.
  inline void set() {
    if (count.fetch_sub(1) == 1) {
      promise.set_value();
    }
  }

  /// Future that becomes ready once every registered completion arrived.
  inline std::shared_future<void> get_shared_fut() { return fut; }
};
/// Tracks whether a unit's data is present, in flight, or absent.
/// Every method serializes on the embedded `lock`.
template <typename Lock>
struct TransferControl {
  Lock lock;
  std::optional<std::shared_future<void>> transfer_ok = std::nullopt;  // set while a transfer is in flight
  bool has_data = false;                                               // true once the data has landed

  TransferControl() {}

  /*
  true, std::nullopt : Already has data
  false, shared_future : Transfer already started, should wait for the future
  false, std::nullopt : should transfer by you
  true, shared_future: Should not appear
  */
  std::pair<bool, std::optional<std::shared_future<void>>> has_data_or_transfer(
      std::shared_future<void> shared_fut) {
    std::lock_guard<Lock> guard(lock);
    if (has_data) {
      return {true, std::nullopt};
    }
    if (transfer_ok.has_value()) {
      return {false, transfer_ok};
    }
    // Nobody is transferring yet: record the caller's future and tell the
    // caller it must perform the transfer itself.
    transfer_ok = shared_fut;
    return {false, std::nullopt};
  }

  /// Marks the data present and clears the in-flight marker.
  void set_has_data() {
    std::lock_guard<Lock> guard(lock);
    has_data = true;
    transfer_ok = std::nullopt;
  }

  bool get_has_data() {
    std::lock_guard<Lock> guard(lock);
    return has_data;
  }

  /// Back to the initial "no data, no transfer" state.
  void reset() {
    std::lock_guard<Lock> guard(lock);
    transfer_ok = std::nullopt;
    has_data = false;
  }

  std::string debug() {
    std::lock_guard<Lock> guard(lock);
    std::string out(has_data ? "has data" : "no data");
    out += " ";
    out += transfer_ok.has_value() ? "transfer " : "no transfer";
    return out;
  }
};
// Shared per-block concurrency state: a dirty flag, a reference count, and
// the transfer state machine used to coordinate loads of the same block.
struct ConcurrentController {
  std::atomic_bool dirty = false;       // block modified since last flush
  std::atomic_size_t ref_count = 0;     // active users of the block
  TransferControl<std::mutex> tc;       // present / in-flight / absent state
};
/// Coordinates one batch of asynchronous IO over `Unit`s. Units whose
/// transfer this helper claims (absorb_tc returned true) are collected in
/// units_by_myself; transfers already in flight elsewhere contribute their
/// futures to `futs`. wait() blocks on both, then runs the callbacks.
template <typename Unit>
struct IO_Helper {
  BatchPromise batch_promise;  // completes when all of this helper's tasks finish
  std::function<void(Unit*)> call_back_on_unit = nullptr;  // run per self-transferred unit after wait
  std::function<void()> call_back = nullptr;               // run once at the end of wait()
  std::vector<std::shared_future<void>> futs;  // futures of transfers owned by other helpers
  std::vector<Unit*> units_by_myself;          // units this helper must transfer itself
  // batch_promise starts at 1 so the future cannot fire before
  // finish_add_taks() drops the initial token.
  IO_Helper(std::function<void(Unit*)> call_back_on_unit, std::function<void()> call_back = nullptr)
      : batch_promise(1), call_back_on_unit(call_back_on_unit), call_back(call_back) {}
  IO_Helper(const IO_Helper& other) = delete;
  IO_Helper& operator=(const IO_Helper& other) = delete;
  IO_Helper(IO_Helper&& other) = delete;
  IO_Helper& operator=(IO_Helper&& other) = delete;
  ~IO_Helper() {
    // std::cout<<"Destory IO helper"<<std::endl;
  }
  size_t total_task_count = 0;
  // Registers a new task that will call batch_promise.set() `count` times.
  // NOTE(review): total_task_count grows by 1 while the promise grows by
  // `count` — reads as "one task with `count` completions"; confirm callers
  // never rely on total_task_count counting completions.
  void new_task(size_t count = 1) {
    total_task_count += 1;
    batch_promise.inc(count);
  }
  // Drops the constructor's initial token; call once after all new_task()
  // calls. (Name typo "taks" kept — it is part of the public interface.)
  void finish_add_taks() { batch_promise.set(); }
  // Decides who transfers a unit. Returns true iff this helper must do the
  // transfer itself; false when the data is already present or another
  // helper's in-flight future was adopted into `futs`.
  bool absorb_tc(Unit* unit, TransferControl<std::mutex>& tc) {
    auto [ok, fut] = tc.has_data_or_transfer(batch_promise.get_shared_fut());
    if (ok) {
      return false;
    } else {
      if (fut.has_value()) {
        futs.push_back(fut.value());
        // printf("Transfer started\n");
        return false;
      } else {
        units_by_myself.push_back(unit);
        // printf("Not Transfer\n");
        return true;
      }
    }
  }
  // Blocks until every adopted future and this helper's own batch complete,
  // then invokes the per-unit callback for self-transferred units and the
  // final callback (if any).
  void wait() {
    for (auto& fut : futs) {
      fut.wait();
    }
    batch_promise.get_shared_fut().wait();
    for (auto& b : units_by_myself) {
      call_back_on_unit(b);
    }
    if (call_back)
      call_back();
  }
};
csrc/balance_serve/kvc2/src/kvc2.h
0 → 100644
View file @
25cee581
#pragma once
#include <torch/torch.h>
#include <cstdint>
#include <optional>
#include <vector>
#include "defs.h"
#include "model_config.h"
namespace
kvc2
{
// Configuration of the on-GPU paged KV cache.
struct GPUPageCacheConfig {
  bool gpu_only;                        // no host backing store; pages managed on GPU only
  std::vector<size_t> gpu_devices_id;   // CUDA device IDs to spread the cache over
  size_t layer_count;
  size_t total_kvcache_pages;
  size_t num_token_per_page;
  size_t num_k_heads;
  size_t k_head_dim;
  bool full_kv_cache_on_each_gpu = false;  // replicate all heads per GPU instead of slicing
  bool k_cache_on = true;
  bool v_cache_on = true;
  torch::ScalarType tensor_type;           // element dtype of the cache tensors
  // for cuda stream manager
  size_t num_streams_per_device = 4;
};
// Top-level KVC2 configuration.
struct KVC2Config {
  bool k_cache_on = true;
  bool v_cache_on = true;
  bool gpu_only = false;
  bool load_from_disk = true;
  bool save_to_disk = true;
  std::string path;         // on-disk cache directory
  std::string config_path;  // model config location
  TokenLength num_token_per_page = 256;
  size_t memory_pool_size = 10e9;  // host pool, bytes (10e9 == 1e10)
  size_t evict_count = 20;
  std::optional<GPUPageCacheConfig> gpu_cache_config = std::nullopt;
  // NOTE(review): no default — left uninitialized unless the caller sets it.
  size_t metrics_port;
  double recompute_ratio = 0.2;
};
// Public interface of the KVC2 KV-cache store (implementation created by
// create_kvc2). All token arrays are passed as (pointer, length) pairs.
class KVC2Interface {
 public:
  virtual ~KVC2Interface() = default;
  // Load/save the on-disk cache index.
  virtual void load() = 0;
  virtual void save() = 0;
  /*
  Raw Insert
  Insert kvcache from kvcache_data to disk.
  info: cache info
  id: start pointer of token array
  length: length of token array
  kvcache_data: data of kvcache
  This will firstly match the ID array with the existing kvcache, and then insert the unmatched kvcache to disk.
  */
  virtual void raw_insert(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                          const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;
  /*
  Raw Read
  Read kvcache from disk to user specified pointers.
  info: cache info
  id: start pointer of token array
  length: length of token array
  kvcache_data: data of kvcache
  Return: matched length of prefix, in tokens
  This will not read from memory pool, it directly read from disk.
  */
  virtual TokenLength raw_read(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                               const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;
  /*
  Lookup
  Lookup kvcache and load it from disk to memory pool if needed.
  info: cache info
  id: start pointer of token array
  length: length of token array
  Return: kvc2_handle, holds kvcache until being released.
  if not found, matched_length will return 0.
  if memory pool is full, return nullptr
  */
  virtual std::shared_ptr<DoubleCacheHandleInterface> lookup(ModelName model_name, QuantType quant_type, Token* id,
                                                             TokenLength length, TokenLength estimated_length) = 0;
  /*
  Lookup and allocate to gpu
  info.is_k_cache does not matter here
  */
  virtual std::shared_ptr<DoubleCacheHandleInterface> lookup_to_gpu(ModelName model_name, QuantType quant_type,
                                                                    Token* id, TokenLength length,
                                                                    TokenLength estimated_length) = 0;
  // Async variant of lookup_to_gpu; `call_back` receives the handle (or
  // presumably nullptr on failure — confirm in the implementation).
  virtual void lookup_to_gpu_async(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
                                   TokenLength estimated_length,
                                   std::function<void(std::shared_ptr<DoubleCacheHandleInterface>)> call_back) = 0;
  // Returns the (K tensors, V tensors) of the GPU page cache.
  virtual std::pair<std::vector<torch::Tensor>, std::vector<torch::Tensor>> get_kvcache() = 0;
  virtual void debug() = 0;
};
// Factory for the KVC2 implementation behind KVC2Interface.
std::shared_ptr<KVC2Interface> create_kvc2(KVC2Config config);

// Per-block match result reported by DoubleCacheHandleInterface::matched_status.
enum MatchStatus {
  Exact,
  Partial,
  NotMatchExact,
  NotMatchPartial,
};
// Handle over a matched K/V cache region; keeps the underlying cache blocks
// alive until the handle is released.
class DoubleCacheHandleInterface {
 public:
  virtual ~DoubleCacheHandleInterface() = default;
  // Length of the matched token prefix (0 when nothing matched).
  virtual TokenLength matched_length() = 0;
  // Per-block match status.
  virtual std::vector<MatchStatus> matched_status() = 0;
  // Host-side block pointers for the K (true) or V (false) cache.
  virtual std::vector<layer_data> handle_data(bool is_key_cache) = 0;
  // Synchronous / asynchronous transfer to GPU; the async callback receives
  // success or failure.
  virtual bool to_gpu() = 0;
  virtual void to_gpu_async(std::function<void(bool)> call_back) = 0;
  virtual std::vector<size_t> get_gpu_block_idx() = 0;
  virtual std::vector<size_t> get_gpu_attached_block_idx() = 0;
  virtual void append_tokens(Token* tokens, TokenLength length) = 0;  // update generated tokens
  virtual void debug() = 0;
};
};
// namespace kvc2
csrc/balance_serve/kvc2/src/kvc2_utils.py
0 → 100644
View file @
25cee581
import
torch
import
ctypes
def aligned_tensor(size, alignment=4096):
    """Allocate `size` bytes aligned to `alignment` and wrap them as an int8 tensor.

    Returns (tensor, mem): `mem` is the ctypes pointer that owns the buffer and
    must later be released with libc free (see dealloc_aligned_cache).
    """
    buf_ptr = ctypes.c_void_p()
    rc = ctypes.CDLL(None).posix_memalign(
        ctypes.byref(buf_ptr),
        ctypes.c_size_t(alignment),
        ctypes.c_size_t(size),
    )
    if rc != 0:
        raise MemoryError(f"posix_memalign failed with error code {rc}")
    backing = (ctypes.c_int8 * size).from_address(buf_ptr.value)
    wrapped = torch.frombuffer(backing, dtype=torch.int8)
    if wrapped.data_ptr() % alignment != 0:
        raise ValueError(
            f"Tensor data_ptr {wrapped.data_ptr()} is not aligned to {alignment} bytes"
        )
    return wrapped, buf_ptr
def alloc_aligned_cache(layer_count, block_count, element_size):
    """Allocate a layer_count x block_count grid of page-aligned int8 tensors.

    Returns (cache, cache_mem): tensors and, in the same layout, the owning
    ctypes pointers to pass to dealloc_aligned_cache.
    """
    tensors = []
    owners = []
    for _ in range(layer_count):
        pairs = [aligned_tensor(element_size, alignment=4096) for _ in range(block_count)]
        tensors.append([t for t, _ in pairs])
        owners.append([m for _, m in pairs])
    return tensors, owners
def dealloc_aligned_cache(cache_mem):
    """Free every pointer previously returned by aligned_tensor/alloc_aligned_cache.

    Any tensors still viewing these buffers become invalid after this call.
    """
    for row in cache_mem:
        for owner in row:
            ctypes.CDLL(None).free(owner)
def get_tensor_ptr(tensors):
    """Map a nested list of tensors to the same-shaped nested list of data pointers."""
    return [[block.data_ptr() for block in layer] for layer in tensors]
def get_tensor_from_data_ptr(matched_data, element_size):
    """Re-wrap raw addresses as int8 tensors of `element_size` bytes each.

    Zero-copy: each returned tensor views the memory at the given address;
    the caller must keep the underlying allocation alive.
    """
    def _wrap(addr):
        view = (ctypes.c_int8 * element_size).from_address(addr)
        return torch.frombuffer(view, dtype=torch.int8)

    return [[_wrap(addr) for addr in layer] for layer in matched_data]
# No CLI behavior yet; importing this module has no side effects.
if __name__ == "__main__":
    pass
\ No newline at end of file
csrc/balance_serve/kvc2/src/metrics.cpp
0 → 100644
View file @
25cee581
#include "metrics.h"
namespace kvc2 {

// Registers every kvc2 metric with a shared registry and exposes them over
// HTTP at config.endpoint. The repeated Build*().Name().Help().Register()
// boilerplate of the original is factored into small local builders; metric
// names, help strings, buckets, and registration order are unchanged.
Metrics::Metrics(const MetricsConfig& config)
    : registry_(std::make_shared<prometheus::Registry>()), exposer_(config.endpoint) {
  const std::string prefix = METRIC_PREFIX;

  // Build a label-less counter.
  auto make_counter = [&](const std::string& name, const std::string& help) {
    auto& family = prometheus::BuildCounter().Name(prefix + name).Help(help).Register(*registry_);
    return &family.Add({});
  };

  prefix_nodes = make_counter("_prefix_nodes", "Number of prefix nodes");
  prefix_block_count = make_counter("_prefix_block_count", "Number of prefix blocks");

  // Shared histogram buckets, capped at 10000 ms (10 s).
  std::vector<double> common_buckets = {1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0, 5000.0, 10000.0};

  // Build a label-less histogram using the shared buckets.
  auto make_histogram = [&](const std::string& name, const std::string& help) {
    auto& family = prometheus::BuildHistogram().Name(prefix + name).Help(help).Register(*registry_);
    return &family.Add({}, common_buckets);
  };

  raw_insert_time_ms = make_histogram("_raw_insert_time_ms", "function raw insert's time in milliseconds");
  lookup_time_ms = make_histogram("_lookup_time_ms", "function lookup's time in milliseconds");
  // NOTE(review): these two record lengths/percentages but reuse the
  // millisecond buckets, exactly as in the original configuration.
  lookup_prefixmatch_length = make_histogram("_lookup_prefixmatch_length", "function lookup's prefix match length");
  matched_length_percentage = make_histogram("_matched_length_percentage", "function matched length percentage");

  auto& disk_usage_family =
      prometheus::BuildGauge().Name(prefix + "_disk_usage").Help("disk usage").Register(*registry_);
  disk_usage = &disk_usage_family.Add({});

  // Build a gauge family; gauges are added lazily per "type" label via the
  // accessor methods (memory_pool_size(type), etc.).
  auto make_gauge_family = [&](const std::string& name, const std::string& help) {
    return &prometheus::BuildGauge().Name(prefix + name).Help(help).Register(*registry_);
  };

  memory_pool_size_family_ = make_gauge_family("_memory_pool_size", "memory pool size");
  memory_pool_node_count_family_ = make_gauge_family("_memory_pool_node_count", "memory pool node count");
  lru_entry_count_family_ = make_gauge_family("_lru_entry_count", "lru entry count");
  gpu_page_count_family_ = make_gauge_family("_gpu_page_count", "gpu page count");

  append_tokens_time_ms = make_histogram("_append_tokens_time_ms", "append tokens time in milliseconds");
  gpu_flush_back_time_ms = make_histogram("_gpu_flush_back_time_ms", "gpu flush back time in milliseconds");
  cpu_flush_back_time_ms = make_histogram("_cpu_flush_back_time_ms", "cpu flush back time in milliseconds");

  exposer_.RegisterCollectable(registry_);
}
// Destructor. Cleanup is left to the members' own destructors.
Metrics::~Metrics() {
  // Stop metric exposition (currently relies on Exposer's destructor instead).
  // exposer_.Stop();
}
// Returns the memory_pool_size gauge labelled {"type": type}; the family
// creates the gauge on first use and returns the same instance afterwards.
prometheus::Gauge* Metrics::memory_pool_size(const std::string& type) {
  auto& gauge = memory_pool_size_family_->Add({{"type", type}});
  return &gauge;
}
// Returns the memory_pool_node_count gauge labelled {"type": type}.
prometheus::Gauge* Metrics::memory_pool_node_count(const std::string& type) {
  auto& gauge = memory_pool_node_count_family_->Add({{"type", type}});
  return &gauge;
}
// Returns the lru_entry_count gauge labelled {"type": type}.
prometheus::Gauge* Metrics::lru_entry_count(const std::string& type) {
  auto& gauge = lru_entry_count_family_->Add({{"type", type}});
  return &gauge;
}
// Returns the gpu_page_count gauge labelled {"type": type}.
// NOTE(review): takes std::string by value, unlike the sibling getters; the
// header declares it this way, so the signature is kept.
prometheus::Gauge* Metrics::gpu_page_count(std::string type) {
  auto& gauge = gpu_page_count_family_->Add({{"type", type}});
  return &gauge;
}
// Begins timing; the elapsed time is recorded into `h` when this observer
// is destroyed.
TimeObserver::TimeObserver(prometheus::Histogram* h) : histogram_(h) {
  timer_.start();
}
// Stops the timer and records the elapsed time into the histogram.
TimeObserver::~TimeObserver() {
  timer_.stop();
  const double elapsed_ms = timer_.elapsedNs() / 1e6;  // ns -> ms
  histogram_->Observe(elapsed_ms);
}

}  // namespace kvc2
\ No newline at end of file
csrc/balance_serve/kvc2/src/metrics.h
0 → 100644
View file @
25cee581
#pragma once
#include "prometheus/counter.h"
#include "prometheus/exposer.h"
#include "prometheus/gauge.h"
#include "prometheus/histogram.h"
#include "prometheus/registry.h"
#include <atomic>
#include <chrono>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include "utils/timer.hpp"
namespace kvc2 {

// Prefix applied to every metric name.
#define METRIC_PREFIX "kvc2"

// Configuration for the metrics subsystem.
struct MetricsConfig {
  std::string endpoint;  // listen endpoint, e.g. "0.0.0.0:8080"
};
// Aggregates all kvc2 Prometheus metrics and exposes them over HTTP.
// All metric pointers are owned by registry_ and stay valid for the
// lifetime of this object.
class Metrics {
 public:
  // Constructs all metrics and starts exposing them at config.endpoint.
  Metrics(const MetricsConfig& config);
  ~Metrics();

  // Non-copyable, non-assignable.
  Metrics(const Metrics&) = delete;
  Metrics& operator=(const Metrics&) = delete;

  // Label-less metrics, created once in the constructor.
  prometheus::Counter* prefix_nodes;
  prometheus::Counter* prefix_block_count;
  prometheus::Histogram* raw_insert_time_ms;
  prometheus::Histogram* lookup_time_ms;
  prometheus::Histogram* lookup_prefixmatch_length;
  prometheus::Histogram* matched_length_percentage;
  prometheus::Gauge* disk_usage;

  // Labelled gauges: each call returns the gauge for the given "type" label.
  prometheus::Gauge* memory_pool_size(const std::string& type);
  prometheus::Gauge* memory_pool_node_count(const std::string& type);
  prometheus::Gauge* lru_entry_count(const std::string& type);
  prometheus::Gauge* gpu_page_count(std::string type);

  prometheus::Histogram* append_tokens_time_ms;
  prometheus::Histogram* gpu_flush_back_time_ms;
  prometheus::Histogram* cpu_flush_back_time_ms;

 private:
  std::shared_ptr<prometheus::Registry> registry_;
  prometheus::Exposer exposer_;
  // Gauge families backing the labelled accessors above.
  prometheus::Family<prometheus::Gauge>* memory_pool_size_family_;
  prometheus::Family<prometheus::Gauge>* memory_pool_node_count_family_;
  prometheus::Family<prometheus::Gauge>* lru_entry_count_family_;
  prometheus::Family<prometheus::Gauge>* gpu_page_count_family_;
};
// RAII timer: starts timing on construction and, on destruction, records the
// elapsed time (in milliseconds) into the given histogram.
class TimeObserver {
 public:
  TimeObserver(prometheus::Histogram* h);
  ~TimeObserver();

 private:
  Timer timer_;
  prometheus::Histogram* histogram_;  // not owned; must outlive this observer
};

}  // namespace kvc2
\ No newline at end of file
csrc/balance_serve/kvc2/src/model_config.h
0 → 100644
View file @
25cee581
#ifndef __MODEL_CONFIG_HPP_
#define __MODEL_CONFIG_HPP_
#include <cstddef>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>

#include "nlohmann/json.hpp"
// Semantic aliases used by the model/quant config structures below.
using DimSize = size_t;
using URL = std::string;
using ModelName = std::string;
// We must assure this can be load by config.json
// Subset of a HuggingFace-style config.json relevant to kvc2.
class ModelConfig {
 public:
  DimSize hidden_size;
  DimSize intermediate_size;
  size_t max_position_embeddings;
  std::string model_type;
  size_t num_attention_heads;
  size_t num_hidden_layers;
  size_t num_key_value_heads;
  size_t vocab_size;

  NLOHMANN_DEFINE_TYPE_INTRUSIVE(ModelConfig, hidden_size, intermediate_size, max_position_embeddings, model_type,
                                 num_attention_heads, num_hidden_layers, num_key_value_heads, vocab_size);

  // Loads all fields from the JSON file at `path`.
  // Throws std::runtime_error if the file cannot be opened (previously the
  // failure was silent and produced a confusing JSON parse error);
  // nlohmann::json exceptions propagate on malformed content.
  void load_from(std::filesystem::path path) {
    std::ifstream i(path);
    if (!i) {
      throw std::runtime_error("ModelConfig::load_from: cannot open " + path.string());
    }
    nlohmann::json j;
    i >> j;
    *this = j.get<ModelConfig>();
  }
};
// Quantization type identifier; the empty string means "not quantized".
using QuantType = std::string;
static const QuantType NoQuantType = "";
// Describes one quantization scheme and its storage layout.
class QuantConfig {
 public:
  QuantType name;

  // For GEMV
  QuantType type_of_dot_vector = NoQuantType;
  // A quant type is usable as a matrix only if a dot-vector type is set.
  inline bool can_be_used_as_matrix() { return type_of_dot_vector != NoQuantType; }
  bool can_be_used_as_vector;

  double bytes_per_element;
  bool has_scale;
  bool has_min;
  size_t block_element_count;
  size_t block_element_size;
  URL reference = "";  // external documentation link, may be empty

  NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(QuantConfig, name, type_of_dot_vector, can_be_used_as_vector,
                                              bytes_per_element, has_scale, has_min, block_element_count,
                                              block_element_size, reference);
};
// Global registries, populated by load_quant_configs / load_model_configs.
inline std::map<QuantType, QuantConfig> quant_configs;
inline std::map<ModelName, ModelConfig> model_configs;
// Loads the global quant_configs map from the JSON file at `path` and logs
// the loaded keys. Throws std::runtime_error if the file cannot be opened
// (previously an unopenable file fell through to a confusing parse error);
// nlohmann::json exceptions propagate on malformed content.
inline void load_quant_configs(std::filesystem::path path) {
  std::cout << __FUNCTION__ << " from " << path << std::endl;
  std::ifstream i(path);
  if (!i) {
    throw std::runtime_error("load_quant_configs: cannot open " + path.string());
  }
  nlohmann::json j;
  i >> j;
  quant_configs = j.get<std::map<QuantType, QuantConfig>>();
  std::cout << "Loaded Quant Configs" << std::endl;
  for (auto& [k, v] : quant_configs) {
    std::cout << " - " << k << std::endl;
  }
}
inline
void
dump_quant_configs
(
std
::
filesystem
::
path
path
)
{
std
::
ofstream
o
(
path
);
nlohmann
::
json
j
=
quant_configs
;
o
<<
j
.
dump
(
4
);
}
// Loads the global model_configs map from the JSON file at `path` and logs
// the loaded model names. Throws std::runtime_error if the file cannot be
// opened; nlohmann::json exceptions propagate on malformed content.
inline void load_model_configs(std::filesystem::path path) {
  std::cout << __FUNCTION__ << " from " << path << std::endl;
  std::ifstream i(path);
  if (!i) {
    throw std::runtime_error("load_model_configs: cannot open " + path.string());
  }
  nlohmann::json j;
  i >> j;
  model_configs = j.get<std::map<ModelName, ModelConfig>>();
  std::cout << "Loaded Model Configs" << std::endl;
  for (auto& [k, v] : model_configs) {
    std::cout << " - " << k << std::endl;
  }
}
inline
void
dump_model_configs
(
std
::
filesystem
::
path
path
)
{
std
::
ofstream
o
(
path
);
nlohmann
::
json
j
=
model_configs
;
o
<<
j
.
dump
(
4
);
}
#endif
\ No newline at end of file
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp
0 → 100644
View file @
25cee581
#include "page_aligned_memory_pool.h"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "utils/arithmetic.hpp"
#include "utils/easy_format.hpp"
/// Constructor: rounds `size_in_bytes` down to whole pages, allocates one
/// page-aligned buffer, and carves it into `Blocks` sub-regions, each with
/// its own free-page bitmap (guarded elsewhere by a per-block mutex).
PageAlignedMemoryPool::PageAlignedMemoryPool(size_t size_in_bytes) {
  total_size = (size_in_bytes / PageSize) * PageSize;
  // Aligned allocation. C++17 aligned operator new; if the compiler lacks
  // support, switch to another method.
  data = ::operator new[](total_size, std::align_val_t(PageSize));
  total_pages = total_size / PageSize;
  assert(total_pages >= Blocks);
  page_per_block = total_pages / Blocks;
  for (size_t block_index = 0; block_index < Blocks; block_index++) {
    first_page[block_index] = reinterpret_cast<void*>(reinterpret_cast<intptr_t>(data) +
                                                      static_cast<intptr_t>(block_index) * page_per_block * PageSize);
    // The last block absorbs the remainder pages when total_pages does not
    // divide evenly by Blocks.
    count_page[block_index] =
        block_index == Blocks - 1 ? (total_pages - page_per_block * (Blocks - 1)) : page_per_block;
    SPDLOG_DEBUG("first_page[{}] = {}, count_page[{}] = {}", block_index,
                 reinterpret_cast<intptr_t>(first_page[block_index]) - reinterpret_cast<intptr_t>(data), block_index,
                 count_page[block_index]);
    // 0 = free, 1 = in use.
    bitmap[block_index].resize(count_page[block_index], 0);
  }
  SPDLOG_INFO("PageAlignedMemoryPool with size {} Mbytes, {} pages", total_size / (1 << 20), page_count());
}
/// Destructor: releases the aligned backing buffer, if any.
PageAlignedMemoryPool::~PageAlignedMemoryPool() {
  if (data == nullptr) {
    return;
  }
  // Must pair with the aligned operator new[] used in the constructor.
  ::operator delete[](data, std::align_val_t(PageSize));
  data = nullptr;
}
/// Total number of pages managed by the pool.
size_t PageAlignedMemoryPool::page_count() {
  return total_size / PageSize;
}
/// Rounds `size` up to a whole number of pages, returned in bytes.
size_t PageAlignedMemoryPool::page_padded_size(size_t size) {
  return div_up(size, PageSize) * PageSize;
}
// Scans block `block_index` for a run of `alloc_size` consecutive free pages;
// if found, marks them used and returns the address of the first page,
// otherwise returns nullptr. Holds the block's mutex for the whole scan.
void* PageAlignedMemoryPool::alloc_in_block(size_t block_index, size_t alloc_size) {
  std::lock_guard<std::mutex> guard(lock[block_index]);
  size_t free_pages = 0;  // length of the current run of free pages
  for (size_t i = 0; i < count_page[block_index]; i++) {
    if (bitmap[block_index][i] == 0) {
      free_pages++;
      if (free_pages == alloc_size) {
        // Run of alloc_size free pages ends at i; mark them all as used.
        size_t page_index = i + 1 - free_pages;
        for (size_t page = page_index; page < page_index + alloc_size; page++) {
          bitmap[block_index][page] = 1;
          // SPDLOG_DEBUG("alloc page {} in block {}", page, block_index);
        }
        return reinterpret_cast<void*>(reinterpret_cast<intptr_t>(first_page[block_index]) + page_index * PageSize);
      }
    } else {
      // Run broken by an allocated page; restart the count.
      free_pages = 0;
    }
  }
  return nullptr;
}
/// Allocation: reserves `size` bytes rounded up to whole pages. Starts from
/// an atomically advanced round-robin cursor to spread contention across
/// blocks, then tries every block once. Returns nullptr when no block has a
/// large-enough contiguous free run.
void* PageAlignedMemoryPool::alloc(size_t size) {
  size_t alloc_size = div_up(size, PageSize);  // pages needed
  auto cnt = now_block.fetch_add(1, std::memory_order_relaxed);
  for (size_t i = 0; i < Blocks; i++) {
    auto result = alloc_in_block((i + cnt) % Blocks, alloc_size);
    if (result != nullptr) {
      // Bookkeeping counters are advisory; relaxed ordering suffices.
      allocated.fetch_add(alloc_size * PageSize, std::memory_order_relaxed);
      alloc_count.fetch_add(1, std::memory_order_relaxed);
      return result;
    }
  }
  return nullptr;
}
/// 释放函数
void
PageAlignedMemoryPool
::
free
(
void
*
p
,
size_t
size
)
{
auto
alloc_size
=
div_up
(
size
,
PageSize
);
size_t
block_index
=
(
reinterpret_cast
<
intptr_t
>
(
p
)
-
reinterpret_cast
<
intptr_t
>
(
data
))
/
page_per_block
/
PageSize
;
size_t
page_index
=
(
reinterpret_cast
<
intptr_t
>
(
p
)
-
reinterpret_cast
<
intptr_t
>
(
first_page
[
block_index
]))
/
PageSize
;
std
::
lock_guard
<
std
::
mutex
>
guard
(
lock
[
block_index
]);
for
(
size_t
page
=
page_index
;
page
<
page_index
+
alloc_size
;
page
++
)
bitmap
[
block_index
][
page
]
=
0
;
allocated
.
fetch_sub
(
alloc_size
*
PageSize
,
std
::
memory_order_relaxed
);
free_count
.
fetch_add
(
1
,
std
::
memory_order_relaxed
);
}
// TODO: too slow
/// Allocates `count` regions of `size` bytes each. All-or-nothing: if any
/// allocation fails, everything acquired so far is released and an empty
/// vector is returned.
std::vector<void*> PageAlignedMemoryPool::alloc_multiple(size_t size, size_t count) {
  std::vector<void*> pointers;
  pointers.reserve(count);
  while (pointers.size() < count) {
    void* p = alloc(size);
    if (p != nullptr) {
      pointers.push_back(p);
      continue;
    }
    // Out of space: roll back all allocations made by this call.
    for (void* q : pointers) {
      free(q, size);
    }
    return {};
  }
  return pointers;
}
/// Defragmentation placeholder — currently a no-op.
void PageAlignedMemoryPool::defragment() {}
/// Debug print: human-readable summary of pool state for logging.
std::string PageAlignedMemoryPool::debug() {
  return fmt::format("PageAlignedMemoryPool: total_size: {}MB, allocated: {}, alloc/free count: {}/{}\n",
                     readable_number(total_size), readable_number(size_t(allocated)), size_t(alloc_count),
                     size_t(free_count));
}
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h
0 → 100644
View file @
25cee581
#pragma once
#include <algorithm> // std::sort
#include <cstddef> // size_t
#include <mutex> // std::mutex
#include <vector>
#include <assert.h>
#include <bitset>
#include <atomic>
// Size of one page in bytes; all pool allocations are page-granular.
constexpr size_t PageSize = 4096;

/// Page-aligned memory pool. The backing buffer is split into `Blocks`
/// sub-regions, each with its own mutex and free-page bitmap, so allocations
/// in different blocks can proceed concurrently.
struct PageAlignedMemoryPool {
 private:
  constexpr static size_t Blocks = 16;

  void* data = nullptr;  // page-aligned backing buffer, owned
  size_t total_size = 0, total_pages = 0;

  std::atomic_size_t now_block = 0;  // round-robin cursor used by alloc()
  std::atomic_size_t allocated = 0;
  // allocated_size
  std::atomic_size_t alloc_count = 0;
  std::atomic_size_t free_count = 0;

  std::mutex lock[Blocks];        // one mutex per block
  size_t page_per_block = 0;
  void* first_page[Blocks];       // first page of each block
  size_t count_page[Blocks];      // pages per block; last block takes the remainder
  std::vector<int8_t> bitmap[Blocks];  // per-block page map: 0 = free, 1 = used

  // Tries to reserve `alloc_size` consecutive pages inside one block.
  void* alloc_in_block(size_t block_index, size_t alloc_size);

 public:
  /// Constructor and destructor
  explicit PageAlignedMemoryPool(size_t size_in_bytes);
  ~PageAlignedMemoryPool();

  /// Copy and move are disabled.
  PageAlignedMemoryPool(PageAlignedMemoryPool&& other) = delete;
  PageAlignedMemoryPool& operator=(PageAlignedMemoryPool&& other) = delete;
  PageAlignedMemoryPool(const PageAlignedMemoryPool& other) = delete;
  PageAlignedMemoryPool& operator=(const PageAlignedMemoryPool& other) = delete;

  /// Member functions
  size_t page_count();
  size_t page_padded_size(size_t size);
  void* alloc(size_t size);
  std::vector<void*> alloc_multiple(size_t size, size_t count);
  void free(void* data, size_t size);
  void defragment();
  std::string debug();
};
csrc/balance_serve/kvc2/src/prefix.cpp
0 → 100644
View file @
25cee581
This diff is collapsed.
Click to expand it.
csrc/balance_serve/kvc2/src/utils/all.hpp
0 → 100644
View file @
25cee581
#pragma once
#include "easy_format.hpp"
#include "timer.hpp"
\ No newline at end of file
csrc/balance_serve/kvc2/src/utils/arithmetic.hpp
0 → 100644
View file @
25cee581
#include <memory>
#include <type_traits>
// Integer ceiling division: for non-negative x, the smallest value q with
// q * by >= x. Made constexpr (the file already relies on C++17 traits) so it
// can feed array sizes and other compile-time contexts; [[nodiscard]] because
// discarding the result is always a bug.
template <typename T, typename U>
[[nodiscard]] constexpr T div_up(T x, U by) {
  static_assert(std::is_integral_v<T>);
  static_assert(std::is_integral_v<U>);
  return (x + by - 1) / by;
}
// Returns `t` advanced by `n` bytes, preserving the pointee type.
// Uses char* arithmetic instead of round-tripping through size_t: an integer
// type is not guaranteed to hold a pointer value (that is what uintptr_t is
// for), while byte-wise char* arithmetic is well-defined.
template <typename T>
T* offset_by_bytes(T* t, size_t n) {
  return reinterpret_cast<T*>(reinterpret_cast<char*>(t) + n);
}
csrc/balance_serve/kvc2/src/utils/easy_format.hpp
0 → 100644
View file @
25cee581
#ifndef __EASY_FORMAT_HPP_
#define __EASY_FORMAT_HPP_
#include <array>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>
// Joins the vector's elements with ", " (comma separator).
// NOTE(review): an empty vector renders as "[]" while non-empty output has no
// brackets — asymmetric, but kept exactly as-is to preserve existing output.
template <typename T>
inline std::string format_vector(const std::vector<T>& v) {
  if (v.empty())
    return "[]";
  std::ostringstream out;
  bool first = true;
  for (const auto& item : v) {
    if (!first)
      out << ", ";
    out << item;
    first = false;
  }
  return out.str();
}
// Magnitude suffixes for readable_number, in 1000-based steps.
inline std::array<std::string, 7> units = {"", "K", "M", "G", "T", "P", "E"};

// Renders `size` as a human-readable 1000-based string with two decimals,
// e.g. 1500 -> "1.50K", 999 -> "999.00".
inline std::string readable_number(size_t size) {
  double value = static_cast<double>(size);
  size_t unit_index = 0;
  while (value >= 1000 && unit_index < units.size() - 1) {
    value /= 1000;
    ++unit_index;
  }
  std::ostringstream formatted;
  formatted << std::fixed << std::setprecision(2) << value;
  return formatted.str() + units[unit_index];
}
#endif
\ No newline at end of file
csrc/balance_serve/kvc2/src/utils/lock_free_queue.hpp
0 → 100644
View file @
25cee581
#include <atomic>
#include <future>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>
// Multi-producer single-consumer lock-free queue of shared_ptr<T>.
// Producers race on `head` via an atomic exchange; the single consumer walks
// `tail`. A dummy node keeps head/tail always non-null.
// NOTE(review): this header has no include guard, and a class with the same
// name also lives in utils/mpsc.hpp — including both in one TU would clash.
template <typename T>
class MPSCQueue {
  struct Node {
    std::shared_ptr<T> data;
    std::atomic<Node*> next;
    Node() : next(nullptr) {}
    Node(std::shared_ptr<T> data_) : data(std::move(data_)), next(nullptr) {}
  };

  std::atomic<Node*> head;  // producers' end: most recently enqueued node
  Node* tail;               // consumer's end: dummy / last-consumed node

 public:
  std::atomic_size_t enqueue_count = 0;  // atomic: incremented by many producers
  size_t dequeue_count = 0;              // plain: touched by the single consumer only

  MPSCQueue() {
    Node* dummy = new Node();
    head.store(dummy, std::memory_order_relaxed);
    tail = dummy;
  }

  ~MPSCQueue() {
    // Clean up the remaining nodes, walking from the consumer side.
    Node* node = tail;
    while (node) {
      Node* next = node->next.load(std::memory_order_relaxed);
      delete node;
      node = next;
    }
  }

  // Producer side: swap the new node into `head`, then link the previous
  // head to it (release pairs with the consumer's acquire load of `next`).
  void enqueue(std::shared_ptr<T> data) {
    enqueue_count.fetch_add(1);
    Node* node = new Node(std::move(data));
    Node* prev_head = head.exchange(node, std::memory_order_acq_rel);
    prev_head->next.store(node, std::memory_order_release);
  }

  // Consumer side: returns the next element, or nullptr if the queue
  // currently appears empty (a concurrent enqueue may not be linked yet).
  std::shared_ptr<T> dequeue() {
    Node* next = tail->next.load(std::memory_order_acquire);
    if (next) {
      std::shared_ptr<T> res = std::move(next->data);
      delete tail;
      tail = next;
      dequeue_count += 1;
      return res;
    }
    return nullptr;
  }
};
\ No newline at end of file
csrc/balance_serve/kvc2/src/utils/mpsc.hpp
0 → 100644
View file @
25cee581
#include <atomic>
#include <cassert>
#include <iostream>
#include <optional>
#include <semaphore>
// Multi-producer single-consumer queue holding T by value.
// Same design as the shared_ptr variant in lock_free_queue.hpp (dummy node,
// producers exchange on `head`, single consumer walks `tail`), but stores T
// directly, uses seq_cst ordering throughout, and returns std::optional<T>.
// NOTE(review): same class name as lock_free_queue.hpp's queue and no include
// guard — including both headers in one TU would be a redefinition.
template <typename T>
class MPSCQueue {
  struct Node {
    T data;
    std::atomic<Node*> next;
    Node() : next(nullptr) {}
    Node(T data_) : data(std::move(data_)), next(nullptr) {}
  };

  std::atomic<Node*> head;  // producers' end: most recently enqueued node
  Node* tail;               // consumer's end: dummy / last-consumed node

 public:
  std::atomic_size_t enqueue_count = 0;  // atomic: many producers
  size_t dequeue_count = 0;              // plain: single consumer only

  MPSCQueue() {
    Node* dummy = new Node();
    head.store(dummy, std::memory_order_seq_cst);
    tail = dummy;
  }

  ~MPSCQueue() {
    Node* node = tail;
    while (node) {
      Node* next = node->next.load(std::memory_order_seq_cst);
      delete node;
      node = next;
    }
  }

  // Producer side.
  void enqueue(T data) {
    enqueue_count.fetch_add(1);
    Node* node = new Node(std::move(data));
    Node* prev_head = head.exchange(node, std::memory_order_seq_cst);
    prev_head->next.store(node, std::memory_order_seq_cst);
  }

  // Consumer side: nullopt when the queue currently appears empty.
  std::optional<T> dequeue() {
    Node* next = tail->next.load(std::memory_order_seq_cst);
    if (next) {
      T res = std::move(next->data);
      delete tail;
      tail = next;
      dequeue_count += 1;
      return res;
    }
    return std::nullopt;
  }

  // Approximate size: enqueue_count may be ahead of the nodes actually linked.
  size_t size() { return enqueue_count.load() - dequeue_count; }
};
// Blocking wrapper around MPSCQueue: dequeue() waits on a counting semaphore
// until an element is available, so the single consumer can sleep instead of
// spinning on an empty queue.
template <typename T>
class MPSCQueueConsumerLock {
  MPSCQueue<T> queue;
  std::counting_semaphore<> sema{0};  // one permit per enqueued element

 public:
  void enqueue(T data) {
    queue.enqueue(std::move(data));
    // std::atomic_thread_fence(std::memory_order_seq_cst);// Inserting this because the memory order might be wrong, I
    // am also not that sure about this.
    sema.release();
  }

  T dequeue() {
    auto re = queue.dequeue();
    if (re.has_value()) {
      // We took an element, so consume its permit to keep permits and
      // elements in sync. The permit may not be posted yet (the producer
      // releases after linking the node), hence the retry loop.
      while (sema.try_acquire() == false) {
        std::cerr << __FILE__ << ":" << __FUNCTION__ << " sema try acquire should be success, retrying, please check"
                  << std::endl;
        // assert(false);
      }
      return re.value();
    }
    // Queue looked empty: block until a producer posts a permit; by then the
    // corresponding node is linked and dequeue() must yield a value.
    sema.acquire();
    return queue.dequeue().value();
  }

  size_t size() { return queue.size(); }
};
Prev
1
2
3
4
5
6
…
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment