Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ox696c
ktransformers
Commits
64de7843
Commit
64de7843
authored
Apr 08, 2025
by
qiyuxinlin
Browse files
format kvc2, delete quant_configs, move model_configs to ~/.ktransformers
parent
9dd24ecd
Changes
31
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
744 additions
and
625 deletions
+744
-625
csrc/balance_serve/kvc2/src/async_store.cpp
csrc/balance_serve/kvc2/src/async_store.cpp
+9
-9
csrc/balance_serve/kvc2/src/gpu_cache.cpp
csrc/balance_serve/kvc2/src/gpu_cache.cpp
+5
-5
csrc/balance_serve/kvc2/src/metrics.h
csrc/balance_serve/kvc2/src/metrics.h
+5
-5
csrc/balance_serve/kvc2/src/model_config.h
csrc/balance_serve/kvc2/src/model_config.h
+38
-22
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp
+11
-9
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h
+8
-7
csrc/balance_serve/kvc2/src/prefix.cpp
csrc/balance_serve/kvc2/src/prefix.cpp
+11
-13
csrc/balance_serve/kvc2/test/kvc2test/common.hpp
csrc/balance_serve/kvc2/test/kvc2test/common.hpp
+4
-4
csrc/balance_serve/kvc2/test/kvc2test/flush-back.cpp
csrc/balance_serve/kvc2/test/kvc2test/flush-back.cpp
+2
-2
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-mt-without-vcache.cpp
...serve/kvc2/test/kvc2test/lookup-gpu-mt-without-vcache.cpp
+1
-1
csrc/balance_serve/kvc2/test/kvc2test/lookup-without-vcache.cpp
...alance_serve/kvc2/test/kvc2test/lookup-without-vcache.cpp
+1
-2
csrc/balance_serve/kvc2/test/page_pool_test.cpp
csrc/balance_serve/kvc2/test/page_pool_test.cpp
+7
-9
csrc/balance_serve/kvc2/test/test_periodic_task.cpp
csrc/balance_serve/kvc2/test/test_periodic_task.cpp
+97
-105
csrc/balance_serve/sched/bind.cpp
csrc/balance_serve/sched/bind.cpp
+105
-64
csrc/balance_serve/sched/metrics.cpp
csrc/balance_serve/sched/metrics.cpp
+68
-56
csrc/balance_serve/sched/metrics.h
csrc/balance_serve/sched/metrics.h
+31
-28
csrc/balance_serve/sched/model_config.h
csrc/balance_serve/sched/model_config.h
+28
-22
csrc/balance_serve/sched/scheduler.cpp
csrc/balance_serve/sched/scheduler.cpp
+283
-239
csrc/balance_serve/sched/scheduler.h
csrc/balance_serve/sched/scheduler.h
+29
-21
csrc/balance_serve/sched/utils/arithmetic.hpp
csrc/balance_serve/sched/utils/arithmetic.hpp
+1
-2
No files found.
csrc/balance_serve/kvc2/src/async_store.cpp
View file @
64de7843
...
...
@@ -35,23 +35,23 @@ struct ArrayStore {
if
(
to
<=
size
)
{
return
;
}
//TODO: extend file
//
TODO: extend file
size
=
to
;
//LOG_INFO("Extend file to `, size `", to, size_in_bytes());
//
LOG_INFO("Extend file to `, size `", to, size_in_bytes());
}
ArrayStore
(
size_t
element_size
,
size_t
size
,
std
::
filesystem
::
path
data_path
)
:
element_size
(
element_size
),
element_size_aligned
((
element_size
+
DeviceBlockSize
-
1
)
/
DeviceBlockSize
),
data_path
(
data_path
)
{
//TODO: prefix cache
//
TODO: prefix cache
}
void
read
(
size_t
index
,
void
*
buffer
)
{
//TODO: read from file
//
TODO: read from file
}
void
write
(
size_t
index
,
void
*
buffer
)
{
//TODO: write to file
//
TODO: write to file
}
};
...
...
@@ -98,15 +98,15 @@ struct IODealerImpl {
IODealerImpl
(
bool
use_io_uring
,
int
IO_DEPTH
)
:
use_io_uring
(
use_io_uring
),
IO_DEPTH
(
IO_DEPTH
)
{}
void
queue_consumer
()
{
//TODO:
//
TODO:
}
void
io_perf
()
{
//TODO:
//
TODO:
}
void
io_dealer
()
{
//TODO:
//
TODO:
}
};
...
...
@@ -130,7 +130,7 @@ void IODealer::stop() {
if
(
io_impl
->
stop
)
{
return
;
}
//LOG_INFO("Stopping IO Dealer");
//
LOG_INFO("Stopping IO Dealer");
io_impl
->
stop
=
true
;
}
...
...
csrc/balance_serve/kvc2/src/gpu_cache.cpp
View file @
64de7843
...
...
@@ -77,7 +77,6 @@ GPUPageCache::GPUPageCache(GPUPageCacheConfig& config) : config(config) {
gpu_only_occupations
.
resize
(
config
.
total_kvcache_pages
,
false
);
}
num_free_pages
=
config
.
total_kvcache_pages
;
for
(
size_t
i
=
0
;
i
<
config
.
layer_count
;
i
++
)
{
if
(
config
.
k_cache_on
)
...
...
@@ -248,18 +247,19 @@ void GPUPageCache::append_col_to_request(std::vector<std::shared_ptr<CudaStreamM
auto
gpu_block_idx
=
k_handles
[
0
][
at
]
->
gpu_block_idx
.
value
();
for
(
size_t
layer
=
0
;
layer
<
config
.
layer_count
;
layer
++
)
{
for
(
size_t
which_gpu
=
0
;
which_gpu
<
config
.
gpu_devices_id
.
size
();
which_gpu
++
)
{
if
(
config
.
k_cache_on
)
{
assert
(
k_handles
[
layer
][
at
]
->
data
!=
nullptr
);
reqs
[
which_gpu
]
->
sizes
.
push_back
(
tp_size
[
which_gpu
]);
reqs
[
which_gpu
]
->
host_mem_addresses
.
push_back
(
offset_by_bytes
(
k_handles
[
layer
][
at
]
->
data
,
tp_offset
[
which_gpu
]));
reqs
[
which_gpu
]
->
host_mem_addresses
.
push_back
(
offset_by_bytes
(
k_handles
[
layer
][
at
]
->
data
,
tp_offset
[
which_gpu
]));
reqs
[
which_gpu
]
->
device_mem_addresses
.
push_back
(
k_cache
[
which_gpu
][
layer
][
gpu_block_idx
].
data_ptr
());
}
if
(
config
.
v_cache_on
)
{
assert
(
v_handles
[
layer
][
at
]
->
data
!=
nullptr
);
reqs
[
which_gpu
]
->
sizes
.
push_back
(
tp_size
[
which_gpu
]);
reqs
[
which_gpu
]
->
host_mem_addresses
.
push_back
(
offset_by_bytes
(
v_handles
[
layer
][
at
]
->
data
,
tp_offset
[
which_gpu
]));
reqs
[
which_gpu
]
->
host_mem_addresses
.
push_back
(
offset_by_bytes
(
v_handles
[
layer
][
at
]
->
data
,
tp_offset
[
which_gpu
]));
reqs
[
which_gpu
]
->
device_mem_addresses
.
push_back
(
v_cache
[
which_gpu
][
layer
][
gpu_block_idx
].
data_ptr
());
}
}
...
...
csrc/balance_serve/kvc2/src/metrics.h
View file @
64de7843
#pragma once
#include "prometheus/counter.h"
#include "prometheus/exposer.h"
#include "prometheus/gauge.h"
#include "prometheus/histogram.h"
#include "prometheus/registry.h"
#include <atomic>
#include <chrono>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include "prometheus/counter.h"
#include "prometheus/exposer.h"
#include "prometheus/gauge.h"
#include "prometheus/histogram.h"
#include "prometheus/registry.h"
#include "utils/timer.hpp"
...
...
csrc/balance_serve/kvc2/src/model_config.h
View file @
64de7843
#ifndef __MODEL_CONFIG_HPP_
#define __MODEL_CONFIG_HPP_
#include <iostream>
#include "nlohmann/json.hpp"
#include <iostream>
#include <filesystem>
#include <fstream>
...
...
@@ -13,7 +13,7 @@ using ModelName = std::string;
// We must assure this can be load by config.json
class
ModelConfig
{
public:
public:
DimSize
hidden_size
;
DimSize
intermediate_size
;
size_t
max_position_embeddings
;
...
...
@@ -23,10 +23,13 @@ class ModelConfig {
size_t
num_key_value_heads
;
size_t
vocab_size
;
NLOHMANN_DEFINE_TYPE_INTRUSIVE
(
ModelConfig
,
hidden_size
,
intermediate_size
,
max_position_embeddings
,
model_type
,
num_attention_heads
,
num_hidden_layers
,
num_key_value_heads
,
vocab_size
);
NLOHMANN_DEFINE_TYPE_INTRUSIVE
(
ModelConfig
,
hidden_size
,
intermediate_size
,
max_position_embeddings
,
model_type
,
num_attention_heads
,
num_hidden_layers
,
num_key_value_heads
,
vocab_size
);
void
load_from
(
std
::
filesystem
::
path
path
)
{
std
::
cout
<<
"Load from "
<<
path
<<
std
::
endl
;
std
::
ifstream
i
(
path
);
nlohmann
::
json
j
;
i
>>
j
;
...
...
@@ -38,12 +41,14 @@ using QuantType = std::string;
static
const
QuantType
NoQuantType
=
""
;
class
QuantConfig
{
public:
public:
QuantType
name
;
// For GEMV
QuantType
type_of_dot_vector
=
NoQuantType
;
inline
bool
can_be_used_as_matrix
()
{
return
type_of_dot_vector
!=
NoQuantType
;
}
inline
bool
can_be_used_as_matrix
()
{
return
type_of_dot_vector
!=
NoQuantType
;
}
bool
can_be_used_as_vector
;
...
...
@@ -56,8 +61,11 @@ class QuantConfig {
URL
reference
=
""
;
NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT
(
QuantConfig
,
name
,
type_of_dot_vector
,
can_be_used_as_vector
,
bytes_per_element
,
has_scale
,
has_min
,
block_element_count
,
NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT
(
QuantConfig
,
name
,
type_of_dot_vector
,
can_be_used_as_vector
,
bytes_per_element
,
has_scale
,
has_min
,
block_element_count
,
block_element_size
,
reference
);
};
...
...
@@ -65,15 +73,19 @@ inline std::map<QuantType, QuantConfig> quant_configs;
inline
std
::
map
<
ModelName
,
ModelConfig
>
model_configs
;
inline
void
load_quant_configs
(
std
::
filesystem
::
path
path
)
{
nlohmann
::
json
j
;
if
(
std
::
filesystem
::
exists
(
path
))
{
std
::
cout
<<
__FUNCTION__
<<
" from "
<<
path
<<
std
::
endl
;
std
::
ifstream
i
(
path
);
nlohmann
::
json
j
;
i
>>
j
;
quant_configs
=
j
.
get
<
std
::
map
<
QuantType
,
QuantConfig
>>
();
std
::
cout
<<
"Loaded Quant Configs"
<<
std
::
endl
;
for
(
auto
&
[
k
,
v
]
:
quant_configs
)
{
for
(
auto
&
[
k
,
v
]
:
quant_configs
)
{
std
::
cout
<<
" - "
<<
k
<<
std
::
endl
;
}
}
else
{
std
::
cout
<<
__FUNCTION__
<<
" no file at "
<<
path
<<
std
::
endl
;
}
}
inline
void
dump_quant_configs
(
std
::
filesystem
::
path
path
)
{
...
...
@@ -83,15 +95,19 @@ inline void dump_quant_configs(std::filesystem::path path) {
}
inline
void
load_model_configs
(
std
::
filesystem
::
path
path
)
{
nlohmann
::
json
j
;
if
(
std
::
filesystem
::
exists
(
path
))
{
std
::
cout
<<
__FUNCTION__
<<
" from "
<<
path
<<
std
::
endl
;
std
::
ifstream
i
(
path
);
nlohmann
::
json
j
;
i
>>
j
;
model_configs
=
j
.
get
<
std
::
map
<
ModelName
,
ModelConfig
>>
();
std
::
cout
<<
"Loaded Model Configs"
<<
std
::
endl
;
for
(
auto
&
[
k
,
v
]
:
model_configs
)
{
for
(
auto
&
[
k
,
v
]
:
model_configs
)
{
std
::
cout
<<
" - "
<<
k
<<
std
::
endl
;
}
}
else
{
std
::
cout
<<
__FUNCTION__
<<
" no file at "
<<
path
<<
std
::
endl
;
}
}
inline
void
dump_model_configs
(
std
::
filesystem
::
path
path
)
{
...
...
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.cpp
View file @
64de7843
...
...
@@ -17,13 +17,14 @@ PageAlignedMemoryPool::PageAlignedMemoryPool(size_t size_in_bytes) {
assert
(
total_pages
>=
Blocks
);
page_per_block
=
total_pages
/
Blocks
;
for
(
size_t
block_index
=
0
;
block_index
<
Blocks
;
block_index
++
)
{
first_page
[
block_index
]
=
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
intptr_t
>
(
data
)
+
static_cast
<
intptr_t
>
(
block_index
)
*
page_per_block
*
PageSize
);
for
(
size_t
block_index
=
0
;
block_index
<
Blocks
;
block_index
++
)
{
first_page
[
block_index
]
=
reinterpret_cast
<
void
*>
(
reinterpret_cast
<
intptr_t
>
(
data
)
+
static_cast
<
intptr_t
>
(
block_index
)
*
page_per_block
*
PageSize
);
count_page
[
block_index
]
=
block_index
==
Blocks
-
1
?
(
total_pages
-
page_per_block
*
(
Blocks
-
1
))
:
page_per_block
;
SPDLOG_DEBUG
(
"first_page[{}] = {}, count_page[{}] = {}"
,
block_index
,
reinterpret_cast
<
intptr_t
>
(
first_page
[
block_index
])
-
reinterpret_cast
<
intptr_t
>
(
data
),
block_index
,
count_page
[
block_index
]);
SPDLOG_DEBUG
(
"first_page[{}] = {}, count_page[{}] = {}"
,
block_index
,
reinterpret_cast
<
intptr_t
>
(
first_page
[
block_index
])
-
reinterpret_cast
<
intptr_t
>
(
data
),
block_index
,
count_page
[
block_index
]);
bitmap
[
block_index
].
resize
(
count_page
[
block_index
],
0
);
}
SPDLOG_INFO
(
"PageAlignedMemoryPool with size {} Mbytes, {} pages"
,
total_size
/
(
1
<<
20
),
page_count
());
...
...
@@ -53,7 +54,7 @@ void* PageAlignedMemoryPool::alloc_in_block(size_t block_index, size_t alloc_siz
size_t
free_pages
=
0
;
for
(
size_t
i
=
0
;
i
<
count_page
[
block_index
];
i
++
)
{
if
(
bitmap
[
block_index
][
i
]
==
0
)
{
free_pages
++
;
free_pages
++
;
if
(
free_pages
==
alloc_size
)
{
size_t
page_index
=
i
+
1
-
free_pages
;
for
(
size_t
page
=
page_index
;
page
<
page_index
+
alloc_size
;
page
++
)
{
...
...
@@ -73,7 +74,7 @@ void* PageAlignedMemoryPool::alloc_in_block(size_t block_index, size_t alloc_siz
void
*
PageAlignedMemoryPool
::
alloc
(
size_t
size
)
{
size_t
alloc_size
=
div_up
(
size
,
PageSize
);
auto
cnt
=
now_block
.
fetch_add
(
1
,
std
::
memory_order_relaxed
);
for
(
size_t
i
=
0
;
i
<
Blocks
;
i
++
)
{
for
(
size_t
i
=
0
;
i
<
Blocks
;
i
++
)
{
auto
result
=
alloc_in_block
((
i
+
cnt
)
%
Blocks
,
alloc_size
);
if
(
result
!=
nullptr
)
{
allocated
.
fetch_add
(
alloc_size
*
PageSize
,
std
::
memory_order_relaxed
);
...
...
@@ -119,5 +120,6 @@ void PageAlignedMemoryPool::defragment() {}
/// 调试打印
std
::
string
PageAlignedMemoryPool
::
debug
()
{
return
fmt
::
format
(
"PageAlignedMemoryPool: total_size: {}MB, allocated: {}, alloc/free count: {}/{}
\n
"
,
readable_number
(
total_size
),
readable_number
(
size_t
(
allocated
)),
size_t
(
alloc_count
),
size_t
(
free_count
));
readable_number
(
total_size
),
readable_number
(
size_t
(
allocated
)),
size_t
(
alloc_count
),
size_t
(
free_count
));
}
csrc/balance_serve/kvc2/src/page_aligned_memory_pool.h
View file @
64de7843
#pragma once
#include <assert.h>
#include <algorithm> // std::sort
#include <atomic>
#include <bitset>
#include <cstddef> // size_t
#include <mutex> // std::mutex
#include <vector>
#include <assert.h>
#include <bitset>
#include <atomic>
constexpr
size_t
PageSize
=
4096
;
...
...
@@ -26,10 +26,11 @@ struct PageAlignedMemoryPool {
std
::
mutex
lock
[
Blocks
];
size_t
page_per_block
=
0
;
void
*
first_page
[
Blocks
];
void
*
first_page
[
Blocks
];
size_t
count_page
[
Blocks
];
std
::
vector
<
int8_t
>
bitmap
[
Blocks
];
void
*
alloc_in_block
(
size_t
block_index
,
size_t
alloc_size
);
public:
/// 构造函数和析构函数
explicit
PageAlignedMemoryPool
(
size_t
size_in_bytes
);
...
...
csrc/balance_serve/kvc2/src/prefix.cpp
View file @
64de7843
...
...
@@ -339,7 +339,7 @@ struct Prefix {
void
update_location
(
CacheInfo
info
,
Location
location
)
{
locations
.
location_map
[
info
]
=
location
;
}
Prefix
*
to_first_prefix_without_disk_locations
(
CacheInfo
k_info
/*, CacheInfo v_info*/
)
{
// just k_info
Prefix
*
to_first_prefix_without_disk_locations
(
CacheInfo
k_info
/*, CacheInfo v_info*/
)
{
// just k_info
auto
now_prefix
=
this
;
while
(
now_prefix
->
prev
!=
nullptr
)
{
auto
&
prev
=
now_prefix
->
prev
;
...
...
@@ -561,7 +561,7 @@ struct PrefixTree {
if
(
need_lock
)
{
sl
=
std
::
shared_lock
<
std
::
shared_mutex
>
(
rw_lock
);
}
//TODO: prefix cache
//
TODO: prefix cache
}
PrefixMatch
look_up_or_insert
(
Token
*
data
,
TokenLength
length
)
{
...
...
@@ -579,7 +579,6 @@ struct PrefixTree {
return
re
;
}
std
::
shared_ptr
<
Prefix
>
new_prefix_node
(
Prefix
*
prev
,
TokenLength
prev_match_length
,
Token
*
data
,
TokenLength
length
,
bool
need_lock
=
true
)
{
std
::
unique_lock
<
std
::
shared_mutex
>
ul
;
...
...
@@ -700,9 +699,7 @@ struct DoubleCacheHandle : public DoubleCacheHandleInterface {
}
}
}
std
::
vector
<
MatchStatus
>
matched_status
()
override
{
assert
(
false
);
}
std
::
vector
<
MatchStatus
>
matched_status
()
override
{
assert
(
false
);
}
bool
any_match
()
{
if
(
enable_alt
)
{
...
...
@@ -1066,7 +1063,6 @@ struct DoubleCacheHandle : public DoubleCacheHandleInterface {
};
struct
KVC2
:
KVC2Interface
{
KVC2Config
config
;
std
::
shared_ptr
<
Metrics
>
met
;
...
...
@@ -1261,7 +1257,7 @@ struct KVC2 : KVC2Interface {
re
->
kvc2_top
=
this
;
SPDLOG_DEBUG
(
"Lookup TokenLength {}"
,
length
);
if
(
config
.
gpu_only
==
false
)
{
//TODO:
//
TODO:
}
return
re
;
};
...
...
@@ -1694,9 +1690,11 @@ void GPUPageCache::gpu_background_flush() {
if
(
col_uls
.
empty
())
continue
;
for
(
size_t
l
=
0
;
l
<
config
.
layer_count
;
l
++
)
{
if
(
config
.
k_cache_on
&&
(
occupations
[
l
][
i
]
->
gpu_cc
.
dirty
.
load
()
==
false
||
occupations
[
l
][
i
]
->
cpu_cc
.
dirty
.
load
()))
if
(
config
.
k_cache_on
&&
(
occupations
[
l
][
i
]
->
gpu_cc
.
dirty
.
load
()
==
false
||
occupations
[
l
][
i
]
->
cpu_cc
.
dirty
.
load
()))
goto
next_gpu_page
;
if
(
config
.
v_cache_on
&&
(
v_occupations
[
l
][
i
]
->
gpu_cc
.
dirty
.
load
()
==
false
||
v_occupations
[
l
][
i
]
->
cpu_cc
.
dirty
.
load
()))
if
(
config
.
v_cache_on
&&
(
v_occupations
[
l
][
i
]
->
gpu_cc
.
dirty
.
load
()
==
false
||
v_occupations
[
l
][
i
]
->
cpu_cc
.
dirty
.
load
()))
goto
next_gpu_page
;
}
...
...
csrc/balance_serve/kvc2/test/kvc2test/common.hpp
View file @
64de7843
...
...
@@ -139,11 +139,11 @@ std::vector<Token> random_ids(size_t length, std::mt19937& gen) {
return
re
;
}
std
::
vector
<
layer_data
>
slice
(
std
::
vector
<
layer_data
>&
h1
,
size_t
start
,
size_t
end
){
std
::
vector
<
layer_data
>
slice
(
std
::
vector
<
layer_data
>&
h1
,
size_t
start
,
size_t
end
)
{
std
::
vector
<
layer_data
>
re
;
for
(
auto
&
l
:
h1
){
for
(
auto
&
l
:
h1
)
{
layer_data
new_layer
;
new_layer
.
insert
(
new_layer
.
end
(),
l
.
begin
()
+
start
,
l
.
begin
()
+
end
);
new_layer
.
insert
(
new_layer
.
end
(),
l
.
begin
()
+
start
,
l
.
begin
()
+
end
);
re
.
push_back
(
new_layer
);
}
return
re
;
...
...
csrc/balance_serve/kvc2/test/kvc2test/flush-back.cpp
View file @
64de7843
...
...
@@ -7,9 +7,9 @@ int main(int argc, char* argv[]) {
config
.
gpu_cache_config
->
total_kvcache_pages
=
12
;
auto
kvc2
=
kvc2
::
create_kvc2
(
config
);
// #pragma omp parallel for
// #pragma omp parallel for
for
(
size_t
ti
=
0
;
ti
<
2
;
ti
++
)
{
SPDLOG_WARN
(
"Test {}"
,
ti
);
SPDLOG_WARN
(
"Test {}"
,
ti
);
auto
[
kcache
,
vcache
]
=
kvc2
->
get_kvcache
();
std
::
mt19937
gen
(
ti
+
123
);
size_t
total_page
=
10
;
...
...
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-mt-without-vcache.cpp
View file @
64de7843
csrc/balance_serve/kvc2/test/kvc2test/lookup-without-vcache.cpp
View file @
64de7843
...
...
@@ -11,7 +11,6 @@
#include "common.hpp"
int
main
(
int
argc
,
char
*
argv
[])
{
qw25_7B_gpu_config
.
v_cache_on
=
false
;
config
.
gpu_cache_config
=
qw25_7B_gpu_config
;
config
.
v_cache_on
=
false
;
...
...
csrc/balance_serve/kvc2/test/page_pool_test.cpp
View file @
64de7843
#include <unistd.h>
#include <iostream>
#include <random>
#include <thread>
#include <vector>
#include <random>
#include <unistd.h>
#include "page_aligned_memory_pool.cpp"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
// 每个线程执行的任务
void
thread_task
(
PageAlignedMemoryPool
&
pool
)
{
std
::
mt19937
gen
(
123
);
...
...
@@ -36,7 +35,6 @@ void thread_task(PageAlignedMemoryPool& pool) {
int
main
(
int
argc
,
char
*
argv
[])
{
spdlog
::
set_level
(
spdlog
::
level
::
debug
);
// 创建一个内存池
PageAlignedMemoryPool
pool
(
40ll
*
1024
*
1024
*
1024
);
// 40 G
...
...
csrc/balance_serve/kvc2/test/test_periodic_task.cpp
View file @
64de7843
#include "utils/periodic_task.hpp"
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstdio>
#include <future>
#include <iostream>
#include <thread>
#include <future>
#include <atomic>
#include <cassert>
#include "utils/periodic_task.hpp"
// 1. 任务是否按预期执行
void
testPeriodicTaskExecution
()
{
std
::
atomic
<
int
>
execution_count
{
0
};
auto
task
=
[
&
execution_count
]()
{
execution_count
++
;
};
auto
task
=
[
&
execution_count
]()
{
execution_count
++
;
};
periodic
::
PeriodicTask
periodic_task
(
task
,
std
::
chrono
::
milliseconds
(
50
));
...
...
@@ -26,9 +24,7 @@ void testPeriodicTaskExecution() {
// 2. 提前唤醒任务的功能
void
testWakeUpImmediately
()
{
std
::
atomic
<
int
>
execution_count
{
0
};
auto
task
=
[
&
execution_count
]()
{
execution_count
++
;
};
auto
task
=
[
&
execution_count
]()
{
execution_count
++
;
};
periodic
::
PeriodicTask
periodic_task
(
task
,
std
::
chrono
::
milliseconds
(
200
));
...
...
@@ -63,9 +59,7 @@ void testWakeUpWait() {
// 4. 任务抛出异常的处理
void
testTaskExceptionHandling
()
{
auto
task
=
[]()
{
throw
std
::
runtime_error
(
"Test exception"
);
};
auto
task
=
[]()
{
throw
std
::
runtime_error
(
"Test exception"
);
};
periodic
::
PeriodicTask
periodic_task
(
task
,
std
::
chrono
::
milliseconds
(
200
));
...
...
@@ -98,9 +92,7 @@ void testTaskStop() {
// 6. 高频唤醒的情况下任务执行是否正常
void
testHighFrequencyWakeUp
()
{
std
::
atomic
<
int
>
execution_count
{
0
};
auto
task
=
[
&
execution_count
]()
{
execution_count
++
;
};
auto
task
=
[
&
execution_count
]()
{
execution_count
++
;
};
periodic
::
PeriodicTask
periodic_task
(
task
,
std
::
chrono
::
milliseconds
(
200
));
...
...
csrc/balance_serve/sched/bind.cpp
View file @
64de7843
#include "scheduler.h"
#include <memory>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <memory>
#include "scheduler.h"
#include <torch/extension.h>
...
...
@@ -16,19 +16,25 @@ PYBIND11_MODULE(sched_ext, m) {
.
def_readwrite
(
"layer_count"
,
&
scheduler
::
ModelSettings
::
layer_count
)
.
def_readwrite
(
"num_k_heads"
,
&
scheduler
::
ModelSettings
::
num_k_heads
)
.
def_readwrite
(
"k_head_dim"
,
&
scheduler
::
ModelSettings
::
k_head_dim
)
.
def_readwrite
(
"bytes_per_params"
,
&
scheduler
::
ModelSettings
::
bytes_per_params
)
.
def_readwrite
(
"bytes_per_kv_cache_element"
,
&
scheduler
::
ModelSettings
::
bytes_per_kv_cache_element
)
.
def_readwrite
(
"bytes_per_params"
,
&
scheduler
::
ModelSettings
::
bytes_per_params
)
.
def_readwrite
(
"bytes_per_kv_cache_element"
,
&
scheduler
::
ModelSettings
::
bytes_per_kv_cache_element
)
.
def
(
"params_size"
,
&
scheduler
::
ModelSettings
::
params_nbytes
)
.
def
(
"bytes_per_token_kv_cache"
,
&
scheduler
::
ModelSettings
::
bytes_per_token_kv_cache
)
.
def
(
"bytes_per_token_kv_cache"
,
&
scheduler
::
ModelSettings
::
bytes_per_token_kv_cache
)
// 添加 pickle 支持
.
def
(
py
::
pickle
(
[](
const
scheduler
::
ModelSettings
&
self
)
{
// __getstate__
return
py
::
make_tuple
(
self
.
params_count
,
self
.
layer_count
,
self
.
num_k_heads
,
self
.
k_head_dim
,
self
.
bytes_per_params
,
self
.
bytes_per_kv_cache_element
);
[](
const
scheduler
::
ModelSettings
&
self
)
{
// __getstate__
return
py
::
make_tuple
(
self
.
params_count
,
self
.
layer_count
,
self
.
num_k_heads
,
self
.
k_head_dim
,
self
.
bytes_per_params
,
self
.
bytes_per_kv_cache_element
);
},
[](
py
::
tuple
t
)
{
// __setstate__
if
(
t
.
size
()
!=
6
)
throw
std
::
runtime_error
(
"Invalid state! t.size() = "
+
std
::
to_string
(
t
.
size
()));
throw
std
::
runtime_error
(
"Invalid state! t.size() = "
+
std
::
to_string
(
t
.
size
()));
scheduler
::
ModelSettings
ms
;
ms
.
params_count
=
t
[
0
].
cast
<
size_t
>
();
ms
.
layer_count
=
t
[
1
].
cast
<
size_t
>
();
...
...
@@ -42,20 +48,22 @@ PYBIND11_MODULE(sched_ext, m) {
py
::
class_
<
scheduler
::
SampleOptions
>
(
m
,
"SampleOptions"
)
.
def
(
py
::
init
<>
())
.
def_readwrite
(
"temperature"
,
&
scheduler
::
SampleOptions
::
temperature
)
.
def_readwrite
(
"top_p"
,
&
scheduler
::
SampleOptions
::
top_p
)
// 确保 top_p 也能被访问
.
def_readwrite
(
"top_p"
,
&
scheduler
::
SampleOptions
::
top_p
)
// 确保 top_p 也能被访问
.
def
(
py
::
pickle
(
[](
const
scheduler
::
SampleOptions
&
self
)
{
return
py
::
make_tuple
(
self
.
temperature
,
self
.
top_p
);
// 序列化 temperature 和 top_p
[](
const
scheduler
::
SampleOptions
&
self
)
{
return
py
::
make_tuple
(
self
.
temperature
,
self
.
top_p
);
// 序列化 temperature 和 top_p
},
[](
py
::
tuple
t
)
{
if
(
t
.
size
()
!=
2
)
// 确保解包时参数数量匹配
throw
std
::
runtime_error
(
"Invalid state! t.size() = "
+
std
::
to_string
(
t
.
size
()));
throw
std
::
runtime_error
(
"Invalid state! t.size() = "
+
std
::
to_string
(
t
.
size
()));
scheduler
::
SampleOptions
so
;
so
.
temperature
=
t
[
0
].
cast
<
double
>
();
so
.
top_p
=
t
[
1
].
cast
<
double
>
();
// 反序列化 top_p
return
so
;
}
));
}));
py
::
class_
<
scheduler
::
Settings
>
(
m
,
"Settings"
)
.
def
(
py
::
init
<>
())
...
...
@@ -65,33 +73,43 @@ PYBIND11_MODULE(sched_ext, m) {
.
def_readwrite
(
"page_size"
,
&
scheduler
::
Settings
::
page_size
)
.
def_readwrite
(
"gpu_device_id"
,
&
scheduler
::
Settings
::
gpu_device_id
)
.
def_readwrite
(
"gpu_memory_size"
,
&
scheduler
::
Settings
::
gpu_memory_size
)
.
def_readwrite
(
"memory_utilization_percentage"
,
&
scheduler
::
Settings
::
memory_utilization_percentage
)
.
def_readwrite
(
"memory_utilization_percentage"
,
&
scheduler
::
Settings
::
memory_utilization_percentage
)
.
def_readwrite
(
"max_batch_size"
,
&
scheduler
::
Settings
::
max_batch_size
)
.
def_readwrite
(
"recommended_chunk_prefill_token_count"
,
.
def_readwrite
(
"recommended_chunk_prefill_token_count"
,
&
scheduler
::
Settings
::
recommended_chunk_prefill_token_count
)
.
def_readwrite
(
"sample_options"
,
&
scheduler
::
Settings
::
sample_options
)
.
def_readwrite
(
"sched_metrics_port"
,
&
scheduler
::
Settings
::
sched_metrics_port
)
.
def_readwrite
(
"sched_metrics_port"
,
&
scheduler
::
Settings
::
sched_metrics_port
)
.
def_readwrite
(
"gpu_only"
,
&
scheduler
::
Settings
::
gpu_only
)
.
def_readwrite
(
"use_self_defined_head_dim"
,
&
scheduler
::
Settings
::
use_self_defined_head_dim
)
.
def_readwrite
(
"self_defined_head_dim"
,
&
scheduler
::
Settings
::
self_defined_head_dim
)
.
def_readwrite
(
"full_kv_cache_on_each_gpu"
,
&
scheduler
::
Settings
::
full_kv_cache_on_each_gpu
)
.
def_readwrite
(
"use_self_defined_head_dim"
,
&
scheduler
::
Settings
::
use_self_defined_head_dim
)
.
def_readwrite
(
"self_defined_head_dim"
,
&
scheduler
::
Settings
::
self_defined_head_dim
)
.
def_readwrite
(
"full_kv_cache_on_each_gpu"
,
&
scheduler
::
Settings
::
full_kv_cache_on_each_gpu
)
.
def_readwrite
(
"k_cache_on"
,
&
scheduler
::
Settings
::
k_cache_on
)
.
def_readwrite
(
"v_cache_on"
,
&
scheduler
::
Settings
::
v_cache_on
)
.
def_readwrite
(
"kvc2_config_path"
,
&
scheduler
::
Settings
::
kvc2_config_path
)
.
def_readwrite
(
"kvc2_root_path"
,
&
scheduler
::
Settings
::
kvc2_root_path
)
.
def_readwrite
(
"memory_pool_size_GB"
,
&
scheduler
::
Settings
::
memory_pool_size_GB
)
.
def_readwrite
(
"memory_pool_size_GB"
,
&
scheduler
::
Settings
::
memory_pool_size_GB
)
.
def_readwrite
(
"evict_count"
,
&
scheduler
::
Settings
::
evict_count
)
.
def_readwrite
(
"strategy_name"
,
&
scheduler
::
Settings
::
strategy_name
)
.
def_readwrite
(
"kvc2_metrics_port"
,
&
scheduler
::
Settings
::
kvc2_metrics_port
)
.
def_readwrite
(
"kvc2_metrics_port"
,
&
scheduler
::
Settings
::
kvc2_metrics_port
)
.
def_readwrite
(
"load_from_disk"
,
&
scheduler
::
Settings
::
load_from_disk
)
.
def_readwrite
(
"save_to_disk"
,
&
scheduler
::
Settings
::
save_to_disk
)
// derived
.
def_readwrite
(
"gpu_device_count"
,
&
scheduler
::
Settings
::
gpu_device_count
)
.
def_readwrite
(
"total_kvcache_pages"
,
&
scheduler
::
Settings
::
total_kvcache_pages
)
.
def_readwrite
(
"total_kvcache_pages"
,
&
scheduler
::
Settings
::
total_kvcache_pages
)
.
def_readwrite
(
"devices"
,
&
scheduler
::
Settings
::
devices
)
.
def
(
"auto_derive"
,
&
scheduler
::
Settings
::
auto_derive
);
py
::
class_
<
scheduler
::
BatchQueryTodo
,
std
::
shared_ptr
<
scheduler
::
BatchQueryTodo
>>
(
m
,
"BatchQueryTodo"
)
py
::
class_
<
scheduler
::
BatchQueryTodo
,
std
::
shared_ptr
<
scheduler
::
BatchQueryTodo
>>
(
m
,
"BatchQueryTodo"
)
.
def
(
py
::
init
<>
())
.
def_readwrite
(
"query_ids"
,
&
scheduler
::
BatchQueryTodo
::
query_ids
)
.
def_readwrite
(
"query_tokens"
,
&
scheduler
::
BatchQueryTodo
::
query_tokens
)
...
...
@@ -99,31 +117,42 @@ PYBIND11_MODULE(sched_ext, m) {
.
def_readwrite
(
"block_indexes"
,
&
scheduler
::
BatchQueryTodo
::
block_indexes
)
.
def_readwrite
(
"attn_masks"
,
&
scheduler
::
BatchQueryTodo
::
attn_masks
)
.
def_readwrite
(
"rope_ranges"
,
&
scheduler
::
BatchQueryTodo
::
rope_ranges
)
.
def_readwrite
(
"sample_options"
,
&
scheduler
::
BatchQueryTodo
::
sample_options
)
.
def_readwrite
(
"prefill_mini_batches"
,
&
scheduler
::
BatchQueryTodo
::
prefill_mini_batches
)
.
def_readwrite
(
"decode_mini_batches"
,
&
scheduler
::
BatchQueryTodo
::
decode_mini_batches
)
.
def_readwrite
(
"sample_options"
,
&
scheduler
::
BatchQueryTodo
::
sample_options
)
.
def_readwrite
(
"prefill_mini_batches"
,
&
scheduler
::
BatchQueryTodo
::
prefill_mini_batches
)
.
def_readwrite
(
"decode_mini_batches"
,
&
scheduler
::
BatchQueryTodo
::
decode_mini_batches
)
.
def_readwrite
(
"stop_criteria"
,
&
scheduler
::
BatchQueryTodo
::
stop_criteria
)
.
def
(
"debug"
,
&
scheduler
::
BatchQueryTodo
::
debug
)
.
def
(
py
::
pickle
(
[](
const
scheduler
::
BatchQueryTodo
&
self
)
{
return
py
::
make_tuple
(
self
.
query_ids
,
self
.
query_tokens
,
self
.
query_lengths
,
self
.
block_indexes
,
self
.
attn_masks
,
self
.
rope_ranges
,
self
.
sample_options
,
self
.
prefill_mini_batches
,
[](
const
scheduler
::
BatchQueryTodo
&
self
)
{
return
py
::
make_tuple
(
self
.
query_ids
,
self
.
query_tokens
,
self
.
query_lengths
,
self
.
block_indexes
,
self
.
attn_masks
,
self
.
rope_ranges
,
self
.
sample_options
,
self
.
prefill_mini_batches
,
self
.
decode_mini_batches
,
self
.
stop_criteria
);
},
[](
py
::
tuple
t
)
{
if
(
t
.
size
()
!=
10
)
throw
std
::
runtime_error
(
"Invalid state! t.size() = "
+
std
::
to_string
(
t
.
size
()));
throw
std
::
runtime_error
(
"Invalid state! t.size() = "
+
std
::
to_string
(
t
.
size
()));
scheduler
::
BatchQueryTodo
bqt
;
bqt
.
query_ids
=
t
[
0
].
cast
<
std
::
vector
<
scheduler
::
QueryID
>>
();
bqt
.
query_tokens
=
t
[
1
].
cast
<
std
::
vector
<
torch
::
Tensor
>>
();
bqt
.
query_lengths
=
t
[
2
].
cast
<
std
::
vector
<
scheduler
::
TokenLength
>>
();
bqt
.
query_lengths
=
t
[
2
].
cast
<
std
::
vector
<
scheduler
::
TokenLength
>>
();
bqt
.
block_indexes
=
t
[
3
].
cast
<
std
::
vector
<
torch
::
Tensor
>>
();
bqt
.
attn_masks
=
t
[
4
].
cast
<
std
::
optional
<
torch
::
Tensor
>>
();
bqt
.
rope_ranges
=
t
[
5
].
cast
<
std
::
optional
<
torch
::
Tensor
>>
();
bqt
.
sample_options
=
t
[
6
].
cast
<
std
::
vector
<
scheduler
::
SampleOptions
>>
();
bqt
.
prefill_mini_batches
=
t
[
7
].
cast
<
std
::
vector
<
scheduler
::
PrefillTask
>>
();
bqt
.
decode_mini_batches
=
t
[
8
].
cast
<
std
::
vector
<
std
::
vector
<
scheduler
::
QueryID
>>>
();
bqt
.
stop_criteria
=
t
[
9
].
cast
<
std
::
vector
<
std
::
vector
<
std
::
vector
<
int
>>>>
();
bqt
.
sample_options
=
t
[
6
].
cast
<
std
::
vector
<
scheduler
::
SampleOptions
>>
();
bqt
.
prefill_mini_batches
=
t
[
7
].
cast
<
std
::
vector
<
scheduler
::
PrefillTask
>>
();
bqt
.
decode_mini_batches
=
t
[
8
].
cast
<
std
::
vector
<
std
::
vector
<
scheduler
::
QueryID
>>>
();
bqt
.
stop_criteria
=
t
[
9
].
cast
<
std
::
vector
<
std
::
vector
<
std
::
vector
<
int
>>>>
();
return
bqt
;
}));
...
...
@@ -133,16 +162,20 @@ PYBIND11_MODULE(sched_ext, m) {
.
def_readwrite
(
"ok"
,
&
scheduler
::
QueryUpdate
::
ok
)
.
def_readwrite
(
"is_prefill"
,
&
scheduler
::
QueryUpdate
::
is_prefill
)
.
def_readwrite
(
"decode_done"
,
&
scheduler
::
QueryUpdate
::
decode_done
)
.
def_readwrite
(
"active_position"
,
&
scheduler
::
QueryUpdate
::
active_position
)
.
def_readwrite
(
"generated_token"
,
&
scheduler
::
QueryUpdate
::
generated_token
)
.
def_readwrite
(
"active_position"
,
&
scheduler
::
QueryUpdate
::
active_position
)
.
def_readwrite
(
"generated_token"
,
&
scheduler
::
QueryUpdate
::
generated_token
)
.
def
(
py
::
pickle
(
[](
const
scheduler
::
QueryUpdate
&
self
)
{
return
py
::
make_tuple
(
self
.
id
,
self
.
ok
,
self
.
is_prefill
,
self
.
decode_done
,
self
.
active_position
,
[](
const
scheduler
::
QueryUpdate
&
self
)
{
return
py
::
make_tuple
(
self
.
id
,
self
.
ok
,
self
.
is_prefill
,
self
.
decode_done
,
self
.
active_position
,
self
.
generated_token
);
},
[](
py
::
tuple
t
)
{
if
(
t
.
size
()
!=
6
)
throw
std
::
runtime_error
(
"Invalid state! t.size() = "
+
std
::
to_string
(
t
.
size
()));
throw
std
::
runtime_error
(
"Invalid state! t.size() = "
+
std
::
to_string
(
t
.
size
()));
scheduler
::
QueryUpdate
qu
;
qu
.
id
=
t
[
0
].
cast
<
scheduler
::
QueryID
>
();
qu
.
ok
=
t
[
1
].
cast
<
bool
>
();
...
...
@@ -156,8 +189,7 @@ PYBIND11_MODULE(sched_ext, m) {
py
::
class_
<
scheduler
::
InferenceContext
>
(
m
,
"InferenceContext"
)
.
def
(
py
::
init
<>
())
.
def_readwrite
(
"k_cache"
,
&
scheduler
::
InferenceContext
::
k_cache
)
.
def_readwrite
(
"v_cache"
,
&
scheduler
::
InferenceContext
::
v_cache
)
;
.
def_readwrite
(
"v_cache"
,
&
scheduler
::
InferenceContext
::
v_cache
);
py
::
class_
<
scheduler
::
QueryAdd
>
(
m
,
"QueryAdd"
)
.
def
(
py
::
init
<>
())
...
...
@@ -173,15 +205,18 @@ PYBIND11_MODULE(sched_ext, m) {
.
def
(
"serialize"
,
&
scheduler
::
QueryAdd
::
serialize
)
.
def_static
(
"deserialize"
,
&
scheduler
::
QueryAdd
::
deserialize
)
.
def
(
py
::
pickle
(
[](
const
scheduler
::
QueryAdd
&
self
)
{
[](
const
scheduler
::
QueryAdd
&
self
)
{
return
py
::
make_tuple
(
self
.
query_token
,
// self.attn_mask,
self
.
query_length
,
self
.
estimated_length
,
self
.
sample_options
,
self
.
user_id
,
self
.
SLO_TTFT_ms
,
self
.
SLO_TBT_ms
,
self
.
stop_criteria
);
self
.
query_length
,
self
.
estimated_length
,
self
.
sample_options
,
self
.
user_id
,
self
.
SLO_TTFT_ms
,
self
.
SLO_TBT_ms
,
self
.
stop_criteria
);
},
[](
py
::
tuple
t
)
{
if
(
t
.
size
()
!=
8
)
throw
std
::
runtime_error
(
"Invalid state! t.size() = "
+
std
::
to_string
(
t
.
size
()));
throw
std
::
runtime_error
(
"Invalid state! t.size() = "
+
std
::
to_string
(
t
.
size
()));
scheduler
::
QueryAdd
qa
;
qa
.
query_token
=
t
[
0
].
cast
<
std
::
vector
<
scheduler
::
Token
>>
();
// qa.attn_mask = t[1].cast<torch::Tensor>();
...
...
@@ -195,14 +230,20 @@ PYBIND11_MODULE(sched_ext, m) {
return
qa
;
}));
py
::
class_
<
scheduler
::
Scheduler
,
std
::
shared_ptr
<
scheduler
::
Scheduler
>>
(
m
,
"Scheduler"
)
py
::
class_
<
scheduler
::
Scheduler
,
std
::
shared_ptr
<
scheduler
::
Scheduler
>>
(
m
,
"Scheduler"
)
.
def
(
"init"
,
&
scheduler
::
Scheduler
::
init
)
.
def
(
"run"
,
&
scheduler
::
Scheduler
::
run
)
.
def
(
"stop"
,
&
scheduler
::
Scheduler
::
stop
)
.
def
(
"add_query"
,
&
scheduler
::
Scheduler
::
add_query
,
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def
(
"cancel_query"
,
&
scheduler
::
Scheduler
::
cancel_query
,
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def
(
"update_last_batch"
,
&
scheduler
::
Scheduler
::
update_last_batch
,
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def
(
"get_inference_context"
,
&
scheduler
::
Scheduler
::
get_inference_context
);
.
def
(
"add_query"
,
&
scheduler
::
Scheduler
::
add_query
,
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def
(
"cancel_query"
,
&
scheduler
::
Scheduler
::
cancel_query
,
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def
(
"update_last_batch"
,
&
scheduler
::
Scheduler
::
update_last_batch
,
py
::
call_guard
<
py
::
gil_scoped_release
>
())
.
def
(
"get_inference_context"
,
&
scheduler
::
Scheduler
::
get_inference_context
);
m
.
def
(
"create_scheduler"
,
&
scheduler
::
create_scheduler
,
"Create a new Scheduler instance"
);
m
.
def
(
"create_scheduler"
,
&
scheduler
::
create_scheduler
,
"Create a new Scheduler instance"
);
}
csrc/balance_serve/sched/metrics.cpp
View file @
64de7843
...
...
@@ -2,89 +2,101 @@
#include <iostream>
// 构造函数
Metrics
::
Metrics
(
const
MetricsConfig
&
config
)
Metrics
::
Metrics
(
const
MetricsConfig
&
config
)
:
registry_
(
std
::
make_shared
<
prometheus
::
Registry
>
()),
exposer_
(
config
.
endpoint
),
stop_uptime_thread_
(
false
),
exposer_
(
config
.
endpoint
),
stop_uptime_thread_
(
false
),
start_time_
(
std
::
chrono
::
steady_clock
::
now
())
{
// 定义统一的桶大小,最大为 10000 ms (10 s)
std
::
vector
<
double
>
common_buckets
=
{
0.001
,
0.005
,
0.01
,
0.05
,
0.1
,
0.5
,
1.0
,
5.0
,
std
::
vector
<
double
>
common_buckets
=
{
0.001
,
0.005
,
0.01
,
0.05
,
0.1
,
0.5
,
1.0
,
5.0
,
10.0
,
50.0
,
100.0
,
500.0
,
1000.0
,
5000.0
,
10000.0
};
// 毫秒
// 注册 TTFT_ms Histogram
auto
&
TTFT_family
=
prometheus
::
BuildHistogram
()
auto
&
TTFT_family
=
prometheus
::
BuildHistogram
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_TTFT_ms"
)
.
Help
(
"Time to first token in milliseconds"
)
.
Register
(
*
registry_
);
TTFT_ms
=
&
TTFT_family
.
Add
({{
"model"
,
config
.
model_name
}},
common_buckets
);
// 注册 TBT_ms Histogram
auto
&
TBT_family
=
prometheus
::
BuildHistogram
()
auto
&
TBT_family
=
prometheus
::
BuildHistogram
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_TBT_ms"
)
.
Help
(
"Time between tokens in milliseconds"
)
.
Register
(
*
registry_
);
TBT_ms
=
&
TBT_family
.
Add
({{
"model"
,
config
.
model_name
}},
common_buckets
);
// 注册 schedule_time Histogram
auto
&
schedule_time_family
=
prometheus
::
BuildHistogram
()
auto
&
schedule_time_family
=
prometheus
::
BuildHistogram
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_schedule_time_ms"
)
.
Help
(
"Time to generate schedule in milliseconds"
)
.
Register
(
*
registry_
);
schedule_time
=
&
schedule_time_family
.
Add
({{
"model"
,
config
.
model_name
}},
common_buckets
);
schedule_time
=
&
schedule_time_family
.
Add
({{
"model"
,
config
.
model_name
}},
common_buckets
);
// 注册 generated_tokens Counter
auto
&
generated_tokens_family
=
prometheus
::
BuildCounter
()
auto
&
generated_tokens_family
=
prometheus
::
BuildCounter
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_generated_tokens_total"
)
.
Help
(
"Total generated tokens"
)
.
Register
(
*
registry_
);
generated_tokens
=
&
generated_tokens_family
.
Add
({{
"model"
,
config
.
model_name
}});
generated_tokens
=
&
generated_tokens_family
.
Add
({{
"model"
,
config
.
model_name
}});
// 注册 throughput_query Gauge
auto
&
throughput_query_family
=
prometheus
::
BuildGauge
()
auto
&
throughput_query_family
=
prometheus
::
BuildGauge
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_throughput_query"
)
.
Help
(
"Throughput per second based on queries"
)
.
Register
(
*
registry_
);
throughput_query
=
&
throughput_query_family
.
Add
({{
"model"
,
config
.
model_name
}});
throughput_query
=
&
throughput_query_family
.
Add
({{
"model"
,
config
.
model_name
}});
// 注册 throughput_generated_tokens Gauge
auto
&
throughput_generated_tokens_family
=
prometheus
::
BuildGauge
()
auto
&
throughput_generated_tokens_family
=
prometheus
::
BuildGauge
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_throughput_generated_tokens"
)
.
Help
(
"Throughput per second based on generated tokens"
)
.
Register
(
*
registry_
);
throughput_generated_tokens
=
&
throughput_generated_tokens_family
.
Add
({{
"model"
,
config
.
model_name
}});
throughput_generated_tokens
=
&
throughput_generated_tokens_family
.
Add
({{
"model"
,
config
.
model_name
}});
// 注册 event_count Counter family
event_count_family_
=
&
prometheus
::
BuildCounter
()
event_count_family_
=
&
prometheus
::
BuildCounter
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_event_count_total"
)
.
Help
(
"Count of various events"
)
.
Register
(
*
registry_
);
batch_count_family_
=
&
prometheus
::
BuildCounter
()
batch_count_family_
=
&
prometheus
::
BuildCounter
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_batch_count_total"
)
.
Help
(
"Count of various batch by status"
)
.
Register
(
*
registry_
);
// 注册 query_count Counter family
query_count_family_
=
&
prometheus
::
BuildCounter
()
query_count_family_
=
&
prometheus
::
BuildCounter
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_query_count_total"
)
.
Help
(
"Count of queries by status"
)
.
Register
(
*
registry_
);
// 注册 uptime_ms Gauge
auto
&
uptime_family
=
prometheus
::
BuildGauge
()
auto
&
uptime_family
=
prometheus
::
BuildGauge
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_uptime_ms"
)
.
Help
(
"Uptime of the scheduler in milliseconds"
)
.
Register
(
*
registry_
);
uptime_ms
=
&
uptime_family
.
Add
({{
"model"
,
config
.
model_name
}});
// 注册 GPU 利用率 Gauges
auto
&
gpu_util_family
=
prometheus
::
BuildGauge
()
auto
&
gpu_util_family
=
prometheus
::
BuildGauge
()
.
Name
(
std
::
string
(
METRIC_PREFIX
)
+
"_gpu_utilization_ratio"
)
.
Help
(
"Current GPU utilization ratio (0 to 1)"
)
.
Register
(
*
registry_
);
for
(
size_t
i
=
0
;
i
<
config
.
gpu_count
;
++
i
)
{
gpu_utilization_gauges
.
push_back
(
&
gpu_util_family
.
Add
(
{{
"gpu_id"
,
std
::
to_string
(
i
)},
{
"model"
,
config
.
model_name
}}));
gpu_utilization_gauges
.
push_back
(
&
gpu_util_family
.
Add
(
{{
"gpu_id"
,
std
::
to_string
(
i
)},
{
"model"
,
config
.
model_name
}}));
}
// 将 Registry 注册到 Exposer 中
...
...
@@ -95,16 +107,15 @@ Metrics::Metrics(const MetricsConfig& config)
}
// 析构函数
Metrics
::~
Metrics
()
{
StopUptimeUpdater
();
}
Metrics
::~
Metrics
()
{
StopUptimeUpdater
();
}
// 启动 uptime 更新线程
void
Metrics
::
StartUptimeUpdater
()
{
uptime_thread_
=
std
::
thread
([
this
]()
{
while
(
!
stop_uptime_thread_
)
{
auto
now
=
std
::
chrono
::
steady_clock
::
now
();
std
::
chrono
::
duration
<
double
,
std
::
milli
>
uptime_duration
=
now
-
start_time_
;
std
::
chrono
::
duration
<
double
,
std
::
milli
>
uptime_duration
=
now
-
start_time_
;
uptime_ms
->
Set
(
uptime_duration
.
count
());
// fn_every_sec(this);
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
seconds
(
1
));
...
...
@@ -121,15 +132,16 @@ void Metrics::StopUptimeUpdater() {
}
// 获取 event_count 指标
prometheus
::
Counter
*
Metrics
::
event_count
(
const
std
::
string
&
type
)
{
prometheus
::
Counter
*
Metrics
::
event_count
(
const
std
::
string
&
type
)
{
return
&
event_count_family_
->
Add
({{
"type"
,
type
}});
// 可根据需要添加更多标签
}
// 获取 query_count 指标
prometheus
::
Counter
*
Metrics
::
query_count
(
const
std
::
string
&
status
)
{
return
&
query_count_family_
->
Add
({{
"status"
,
status
}});
// 可根据需要添加更多标签
prometheus
::
Counter
*
Metrics
::
query_count
(
const
std
::
string
&
status
)
{
return
&
query_count_family_
->
Add
(
{{
"status"
,
status
}});
// 可根据需要添加更多标签
}
prometheus
::
Counter
*
Metrics
::
batch_count
(
const
std
::
string
&
type
)
{
prometheus
::
Counter
*
Metrics
::
batch_count
(
const
std
::
string
&
type
)
{
return
&
batch_count_family_
->
Add
({{
"type"
,
type
}});
}
csrc/balance_serve/sched/metrics.h
View file @
64de7843
#ifndef Metrics_H
#define Metrics_H
#include <atomic>
#include <chrono>
#include <memory>
#include <prometheus/counter.h>
#include <prometheus/exposer.h>
#include <prometheus/gauge.h>
#include <prometheus/histogram.h>
#include <prometheus/registry.h>
#include <atomic>
#include <chrono>
#include <memory>
#include <string>
#include <thread>
#include <vector>
...
...
@@ -27,40 +27,40 @@ struct MetricsConfig {
// Metrics 类,根据配置初始化 Prometheus 指标
class
Metrics
{
public:
public:
// 构造函数传入 MetricsConfig
Metrics
(
const
MetricsConfig
&
config
);
Metrics
(
const
MetricsConfig
&
config
);
~
Metrics
();
// 禁止拷贝和赋值
Metrics
(
const
Metrics
&
)
=
delete
;
Metrics
&
operator
=
(
const
Metrics
&
)
=
delete
;
Metrics
(
const
Metrics
&
)
=
delete
;
Metrics
&
operator
=
(
const
Metrics
&
)
=
delete
;
std
::
function
<
void
(
Metrics
*
)
>
fn_every_sec
;
std
::
function
<
void
(
Metrics
*
)
>
fn_every_sec
;
// 指标指针
prometheus
::
Gauge
*
uptime_ms
;
prometheus
::
Histogram
*
TTFT_ms
;
prometheus
::
Histogram
*
TBT_ms
;
prometheus
::
Histogram
*
schedule_time
;
prometheus
::
Gauge
*
throughput_query
;
prometheus
::
Gauge
*
throughput_generated_tokens
;
prometheus
::
Counter
*
generated_tokens
;
std
::
vector
<
prometheus
::
Gauge
*>
gpu_utilization_gauges
;
prometheus
::
Gauge
*
uptime_ms
;
prometheus
::
Histogram
*
TTFT_ms
;
prometheus
::
Histogram
*
TBT_ms
;
prometheus
::
Histogram
*
schedule_time
;
prometheus
::
Gauge
*
throughput_query
;
prometheus
::
Gauge
*
throughput_generated_tokens
;
prometheus
::
Counter
*
generated_tokens
;
std
::
vector
<
prometheus
::
Gauge
*>
gpu_utilization_gauges
;
// 计数器家族
prometheus
::
Counter
*
event_count
(
const
std
::
string
&
type
);
prometheus
::
Counter
*
query_count
(
const
std
::
string
&
status
);
prometheus
::
Counter
*
batch_count
(
const
std
::
string
&
type
);
prometheus
::
Counter
*
event_count
(
const
std
::
string
&
type
);
prometheus
::
Counter
*
query_count
(
const
std
::
string
&
status
);
prometheus
::
Counter
*
batch_count
(
const
std
::
string
&
type
);
private:
private:
std
::
shared_ptr
<
prometheus
::
Registry
>
registry_
;
prometheus
::
Exposer
exposer_
;
// 计数器家族
prometheus
::
Family
<
prometheus
::
Counter
>
*
event_count_family_
;
prometheus
::
Family
<
prometheus
::
Counter
>
*
batch_count_family_
;
prometheus
::
Family
<
prometheus
::
Counter
>
*
query_count_family_
;
prometheus
::
Family
<
prometheus
::
Counter
>
*
event_count_family_
;
prometheus
::
Family
<
prometheus
::
Counter
>
*
batch_count_family_
;
prometheus
::
Family
<
prometheus
::
Counter
>
*
query_count_family_
;
// 线程和控制变量用于更新 uptime_ms
std
::
thread
uptime_thread_
;
...
...
@@ -76,9 +76,12 @@ class Metrics {
};
struct
HistogramTimerWrapper
{
prometheus
::
Histogram
*
histogram
;
prometheus
::
Histogram
*
histogram
;
Timer
timer
;
inline
HistogramTimerWrapper
(
prometheus
::
Histogram
*
histogram
)
:
histogram
(
histogram
),
timer
()
{
timer
.
start
();
}
inline
HistogramTimerWrapper
(
prometheus
::
Histogram
*
histogram
)
:
histogram
(
histogram
),
timer
()
{
timer
.
start
();
}
inline
~
HistogramTimerWrapper
()
{
histogram
->
Observe
(
timer
.
elapsedMs
());
}
};
...
...
csrc/balance_serve/sched/model_config.h
View file @
64de7843
#ifndef __MODEL_CONFIG_HPP_
#define __MODEL_CONFIG_HPP_
#include <iostream>
#include "nlohmann/json.hpp"
#include <iostream>
#include <filesystem>
#include <fstream>
...
...
@@ -13,7 +13,7 @@ using ModelName = std::string;
// We must assure this can be load by config.json
class
ModelConfig
{
public:
public:
DimSize
hidden_size
;
DimSize
intermediate_size
;
size_t
max_position_embeddings
;
...
...
@@ -23,10 +23,13 @@ class ModelConfig {
size_t
num_key_value_heads
;
size_t
vocab_size
;
NLOHMANN_DEFINE_TYPE_INTRUSIVE
(
ModelConfig
,
hidden_size
,
intermediate_size
,
max_position_embeddings
,
model_type
,
num_attention_heads
,
num_hidden_layers
,
num_key_value_heads
,
vocab_size
);
NLOHMANN_DEFINE_TYPE_INTRUSIVE
(
ModelConfig
,
hidden_size
,
intermediate_size
,
max_position_embeddings
,
model_type
,
num_attention_heads
,
num_hidden_layers
,
num_key_value_heads
,
vocab_size
);
void
load_from
(
std
::
filesystem
::
path
path
)
{
std
::
cout
<<
"Load from "
<<
path
<<
std
::
endl
;
std
::
ifstream
i
(
path
);
nlohmann
::
json
j
;
i
>>
j
;
...
...
@@ -38,12 +41,14 @@ using QuantType = std::string;
static
const
QuantType
NoQuantType
=
""
;
class
QuantConfig
{
public:
public:
QuantType
name
;
// For GEMV
QuantType
type_of_dot_vector
=
NoQuantType
;
inline
bool
can_be_used_as_matrix
()
{
return
type_of_dot_vector
!=
NoQuantType
;
}
inline
bool
can_be_used_as_matrix
()
{
return
type_of_dot_vector
!=
NoQuantType
;
}
bool
can_be_used_as_vector
;
...
...
@@ -56,8 +61,11 @@ class QuantConfig {
URL
reference
=
""
;
NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT
(
QuantConfig
,
name
,
type_of_dot_vector
,
can_be_used_as_vector
,
bytes_per_element
,
has_scale
,
has_min
,
block_element_count
,
NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT
(
QuantConfig
,
name
,
type_of_dot_vector
,
can_be_used_as_vector
,
bytes_per_element
,
has_scale
,
has_min
,
block_element_count
,
block_element_size
,
reference
);
};
...
...
@@ -70,15 +78,14 @@ inline void load_quant_configs(std::filesystem::path path) {
std
::
cout
<<
__FUNCTION__
<<
" from "
<<
path
<<
std
::
endl
;
std
::
ifstream
i
(
path
);
i
>>
j
;
}
else
{
std
::
cout
<<
__FUNCTION__
<<
" create new at "
<<
path
<<
std
::
endl
;
}
quant_configs
=
j
.
get
<
std
::
map
<
QuantType
,
QuantConfig
>>
();
std
::
cout
<<
"Loaded Quant Configs"
<<
std
::
endl
;
for
(
auto
&
[
k
,
v
]
:
quant_configs
)
{
for
(
auto
&
[
k
,
v
]
:
quant_configs
)
{
std
::
cout
<<
" - "
<<
k
<<
std
::
endl
;
}
}
else
{
std
::
cout
<<
__FUNCTION__
<<
" no file at "
<<
path
<<
std
::
endl
;
}
}
inline
void
dump_quant_configs
(
std
::
filesystem
::
path
path
)
{
...
...
@@ -93,15 +100,14 @@ inline void load_model_configs(std::filesystem::path path) {
std
::
cout
<<
__FUNCTION__
<<
" from "
<<
path
<<
std
::
endl
;
std
::
ifstream
i
(
path
);
i
>>
j
;
}
else
{
std
::
cout
<<
__FUNCTION__
<<
" create new at "
<<
path
<<
std
::
endl
;
}
model_configs
=
j
.
get
<
std
::
map
<
ModelName
,
ModelConfig
>>
();
std
::
cout
<<
"Loaded Model Configs"
<<
std
::
endl
;
for
(
auto
&
[
k
,
v
]
:
model_configs
)
{
for
(
auto
&
[
k
,
v
]
:
model_configs
)
{
std
::
cout
<<
" - "
<<
k
<<
std
::
endl
;
}
}
else
{
std
::
cout
<<
__FUNCTION__
<<
" no file at "
<<
path
<<
std
::
endl
;
}
}
inline
void
dump_model_configs
(
std
::
filesystem
::
path
path
)
{
...
...
csrc/balance_serve/sched/scheduler.cpp
View file @
64de7843
This diff is collapsed.
Click to expand it.
csrc/balance_serve/sched/scheduler.h
View file @
64de7843
#pragma once
#include
<torch/torch
.h
>
#include
"model_config
.h
"
#include <cstdint>
#include <memory>
#include <optional>
#include <torch/torch.h>
#include <vector>
#include "model_config.h"
namespace
scheduler
{
...
...
@@ -28,7 +28,9 @@ struct ModelSettings {
double
bytes_per_kv_cache_element
;
inline
size_t
params_nbytes
()
{
return
params_count
*
bytes_per_params
;
}
inline
size_t
bytes_per_token_kv_cache
()
{
return
bytes_per_kv_cache_element
*
num_k_heads
*
k_head_dim
;
}
inline
size_t
bytes_per_token_kv_cache
()
{
return
bytes_per_kv_cache_element
*
num_k_heads
*
k_head_dim
;
}
};
struct
SampleOptions
{
...
...
@@ -37,7 +39,8 @@ struct SampleOptions {
};
struct
Settings
{
// something is aukward here, kvc2 only use model_name and quant_type to get model infos.
// something is aukward here, kvc2 only use model_name and quant_type to get
// model infos.
ModelName
model_name
;
QuantType
quant_type
;
// model_setting is ignore by kvc2
...
...
@@ -79,14 +82,16 @@ struct Settings {
void
auto_derive
();
};
using
PrefillTask
=
std
::
tuple
<
QueryID
,
TokenLength
,
TokenLength
>
;
// id, start, length
using
PrefillTask
=
std
::
tuple
<
QueryID
,
TokenLength
,
TokenLength
>
;
// id, start, length
struct
BatchQueryTodo
{
// query
std
::
vector
<
QueryID
>
query_ids
;
std
::
vector
<
torch
::
Tensor
>
query_tokens
;
std
::
vector
<
TokenLength
>
query_lengths
;
std
::
vector
<
torch
::
Tensor
>
block_indexes
;
// (max_num_blocks_per_seq), dtype torch.int32.
std
::
vector
<
torch
::
Tensor
>
block_indexes
;
// (max_num_blocks_per_seq), dtype torch.int32.
std
::
optional
<
torch
::
Tensor
>
attn_masks
;
std
::
optional
<
torch
::
Tensor
>
rope_ranges
;
std
::
vector
<
SampleOptions
>
sample_options
;
...
...
@@ -94,8 +99,10 @@ struct BatchQueryTodo {
// mini batches, adjacent two mini batches are executed together
// tasks count must be <=2, because of flash infer attention
std
::
vector
<
PrefillTask
>
prefill_mini_batches
;
// prefill minibatch only has 1 prefill
std
::
vector
<
std
::
vector
<
QueryID
>>
decode_mini_batches
;
// decode minibatch has multiple decode
std
::
vector
<
PrefillTask
>
prefill_mini_batches
;
// prefill minibatch only has 1 prefill
std
::
vector
<
std
::
vector
<
QueryID
>>
decode_mini_batches
;
// decode minibatch has multiple decode
std
::
string
debug
();
bool
empty
();
...
...
@@ -141,11 +148,11 @@ struct QueryAdd {
int
SLO_TBT_ms
=
MAX_SLO_TIME
;
std
::
string
serialize
();
static
QueryAdd
deserialize
(
const
std
::
string
&
input
);
static
QueryAdd
deserialize
(
const
std
::
string
&
input
);
};
class
Scheduler
{
public:
public:
virtual
void
init
(
Settings
settings
)
=
0
;
virtual
void
run
()
=
0
;
...
...
@@ -156,7 +163,8 @@ class Scheduler {
virtual
void
cancel_query
(
QueryID
id
)
=
0
;
// inference loop call this
virtual
std
::
shared_ptr
<
BatchQueryTodo
>
update_last_batch
(
BatchQueryUpdate
updates
)
=
0
;
virtual
std
::
shared_ptr
<
BatchQueryTodo
>
update_last_batch
(
BatchQueryUpdate
updates
)
=
0
;
virtual
InferenceContext
get_inference_context
()
=
0
;
virtual
~
Scheduler
()
=
default
;
...
...
csrc/balance_serve/sched/utils/arithmetic.hpp
View file @
64de7843
#include <type_traits>
template
<
typename
T
,
typename
U
>
T
div_up
(
T
x
,
U
by
)
{
template
<
typename
T
,
typename
U
>
T
div_up
(
T
x
,
U
by
)
{
static_assert
(
std
::
is_integral_v
<
T
>
);
static_assert
(
std
::
is_integral_v
<
U
>
);
return
(
x
+
by
-
1
)
/
by
;
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment