Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ktransformers
Commits
25cee581
Commit
25cee581
authored
Mar 31, 2025
by
Atream
Browse files
add balance-serve, support concurrence
parent
8d0292aa
Changes
196
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1680 additions
and
0 deletions
+1680
-0
csrc/balance_serve/kvc2/src/utils/mutex_extend.hpp
csrc/balance_serve/kvc2/src/utils/mutex_extend.hpp
+70
-0
csrc/balance_serve/kvc2/src/utils/periodic_task.hpp
csrc/balance_serve/kvc2/src/utils/periodic_task.hpp
+102
-0
csrc/balance_serve/kvc2/src/utils/spin_lock.hpp
csrc/balance_serve/kvc2/src/utils/spin_lock.hpp
+36
-0
csrc/balance_serve/kvc2/src/utils/timer.hpp
csrc/balance_serve/kvc2/src/utils/timer.hpp
+128
-0
csrc/balance_serve/kvc2/test/CMakeLists.txt
csrc/balance_serve/kvc2/test/CMakeLists.txt
+78
-0
csrc/balance_serve/kvc2/test/hashmap_test.cpp
csrc/balance_serve/kvc2/test/hashmap_test.cpp
+11
-0
csrc/balance_serve/kvc2/test/kvc2_export_header_test.cpp
csrc/balance_serve/kvc2/test/kvc2_export_header_test.cpp
+87
-0
csrc/balance_serve/kvc2/test/kvc2_export_load_test.cpp
csrc/balance_serve/kvc2/test/kvc2_export_load_test.cpp
+87
-0
csrc/balance_serve/kvc2/test/kvc2_test_utils.cpp
csrc/balance_serve/kvc2/test/kvc2_test_utils.cpp
+117
-0
csrc/balance_serve/kvc2/test/kvc2test/CMakeLists.txt
csrc/balance_serve/kvc2/test/kvc2test/CMakeLists.txt
+26
-0
csrc/balance_serve/kvc2/test/kvc2test/append-tokens.cpp
csrc/balance_serve/kvc2/test/kvc2test/append-tokens.cpp
+52
-0
csrc/balance_serve/kvc2/test/kvc2test/check-flush-back.cpp
csrc/balance_serve/kvc2/test/kvc2test/check-flush-back.cpp
+36
-0
csrc/balance_serve/kvc2/test/kvc2test/common.hpp
csrc/balance_serve/kvc2/test/kvc2test/common.hpp
+233
-0
csrc/balance_serve/kvc2/test/kvc2test/flush-back.cpp
csrc/balance_serve/kvc2/test/kvc2test/flush-back.cpp
+57
-0
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt-gpu.cpp
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt-gpu.cpp
+125
-0
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt.cpp
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt.cpp
+97
-0
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-async.cpp
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-async.cpp
+49
-0
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-mt-without-vcache.cpp
...serve/kvc2/test/kvc2test/lookup-gpu-mt-without-vcache.cpp
+61
-0
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-mt.cpp
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-mt.cpp
+68
-0
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu.cpp
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu.cpp
+160
-0
No files found.
csrc/balance_serve/kvc2/src/utils/mutex_extend.hpp
0 → 100644
View file @
25cee581
#ifndef MUTEX_EXTEND_HPP_  // renamed: identifiers starting with "__" are reserved
#define MUTEX_EXTEND_HPP_
#include <atomic>
#include <chrono>
#include <iostream>
#include <mutex>      // std::mutex (previously only available transitively)
#include <stdexcept>  // std::runtime_error (previously only available transitively)
#include <thread>

// A strictly non-recursive mutex: a thread that already holds the lock gets
// `false` from try_lock() and an exception from lock() instead of deadlocking.
// The current holder is tracked in an atomic thread id.
class non_recursive_mutex {
 public:
  non_recursive_mutex() = default;

  // Attempt to acquire without blocking.
  // Returns false if the lock is busy OR if the calling thread already owns it.
  bool try_lock() {
    std::thread::id this_id = std::this_thread::get_id();
    // Re-entry by the current owner is rejected rather than deadlocking.
    if (owner.load(std::memory_order_acquire) == this_id) {
      return false;
    }
    if (mtx.try_lock()) {
      owner.store(this_id, std::memory_order_release);  // record the new owner
      return true;
    }
    return false;
  }

  // Block until the lock is acquired (try_lock + yield loop).
  // Throws std::runtime_error on attempted recursive locking.
  void lock() {
    std::thread::id this_id = std::this_thread::get_id();
    while (true) {
      if (owner.load(std::memory_order_acquire) == this_id) {
        throw std::runtime_error("Thread is trying to lock a mutex it already holds");
      }
      if (mtx.try_lock()) {
        owner.store(this_id, std::memory_order_release);  // record the new owner
        return;
      }
      // Lock is busy: yield instead of spinning at full speed.
      std::this_thread::yield();
    }
  }

  // Release the lock. Throws std::runtime_error if the caller is not the owner.
  void unlock() {
    std::thread::id this_id = std::this_thread::get_id();
    if (owner.load(std::memory_order_acquire) == this_id) {
      // Clear the owner before releasing the underlying mutex.
      owner.store(std::thread::id(), std::memory_order_release);
      mtx.unlock();
    } else {
      throw std::runtime_error("Thread attempting to unlock a mutex it doesn't own");
    }
  }

 private:
  std::mutex mtx;                      // the actual mutex
  std::atomic<std::thread::id> owner;  // current holder; default-constructed id == unowned
};
#endif  // MUTEX_EXTEND_HPP_
csrc/balance_serve/kvc2/src/utils/periodic_task.hpp
0 → 100644
View file @
25cee581
#ifndef PERIODIC_TASK_HPP
#define PERIODIC_TASK_HPP
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <future>
#include <iostream>
#include <mutex>
#include <stop_token>
#include <thread>
#include <utility>
#include <vector>

namespace periodic {

// Runs `func` on a background jthread every `interval_ms`, or immediately when
// wakeUp()/wakeUpWait() is called. wakeUpWait() returns a future that becomes
// ready after the next completed run of `func`.
class PeriodicTask {
 public:
  explicit PeriodicTask(std::function<void()> func,
                        std::chrono::milliseconds interval_ms = std::chrono::milliseconds(100))
      : func_(std::move(func)),
        interval_(interval_ms),
        worker_([this](std::stop_token stoken) { this->run(stoken); }) {}

  ~PeriodicTask() {
    // Request stop while holding mutex_: the worker either sees the request in
    // its wait predicate or is already blocked in wait_for, so the notify below
    // cannot be lost between the predicate check and the actual wait.
    {
      std::lock_guard<std::mutex> lock(mutex_);
      worker_.request_stop();
    }
    cv_.notify_one();
    // jthread joins automatically on destruction.
  }

  // Ask the worker to run the task now instead of waiting for the next tick.
  void wakeUp() {
    {
      // The flag must be flipped under the SAME mutex the condition variable
      // waits with (previously a separate wakeup_mutex_ was used, so a notify
      // could race the predicate check and the wake-up was delayed by up to a
      // full interval).
      std::lock_guard<std::mutex> lock(mutex_);
      wake_up_requested_ = true;
    }
    cv_.notify_one();
  }

  // Like wakeUp(), but also returns a future fulfilled after the task has run.
  std::future<void> wakeUpWait() {
    std::promise<void> promise;
    std::future<void> future = promise.get_future();
    {
      std::lock_guard<std::mutex> lock(promise_mutex_);
      wakeup_promises_.push_back(std::move(promise));
    }
    wakeUp();
    return future;
  }

 private:
  // Worker loop: wait for a tick, a wake-up, or a stop request; run func_ with
  // mutex_ released so wakeUp()/wakeUpWait() never block behind a running task.
  void run(std::stop_token stoken) {
    while (!stoken.stop_requested()) {
      {
        std::unique_lock lock(mutex_);
        cv_.wait_for(lock, interval_,
                     [this, &stoken] { return wake_up_requested_.load() || stoken.stop_requested(); });
        if (stoken.stop_requested()) {
          break;
        }
        wake_up_requested_ = false;  // consume the request before running
      }
      try {
        func_();
      } catch (...) {
        std::cerr << "Error in task function." << std::endl;
      }
      notifyPromises();
    }
  }

  // Fulfil every promise handed out by wakeUpWait() since the last run.
  void notifyPromises() {
    std::lock_guard<std::mutex> lock(promise_mutex_);
    for (auto& promise : wakeup_promises_) {
      promise.set_value();
    }
    wakeup_promises_.clear();
  }

  std::function<void()> func_;
  std::chrono::milliseconds interval_;
  std::mutex mutex_;  // guards wake_up_requested_ transitions and the cv wait
  std::condition_variable cv_;
  std::vector<std::promise<void>> wakeup_promises_;
  std::mutex promise_mutex_;  // guards wakeup_promises_
  std::atomic<bool> wake_up_requested_ = false;
  std::jthread worker_;  // declared last so it starts after all other members exist
};

}  // namespace periodic

#endif  // PERIODIC_TASK_HPP
csrc/balance_serve/kvc2/src/utils/spin_lock.hpp
0 → 100644
View file @
25cee581
/*
 * @Author: Xie Weiyu ervinxie@qq.com
 * @Date: 2024-11-21 06:35:47
 * @Description: Test-and-set spin lock with bounded exponential backoff.
 */
#pragma once  // the header previously had no include guard at all

#include <atomic>
#include <chrono>
#include <thread>

// Spin lock built on std::atomic_flag. Contending threads sleep with
// exponentially growing delays (1us doubling up to 1024us) instead of
// burning a core in a tight test_and_set loop.
class SpinLock {
 public:
  SpinLock() { flag.clear(); }

  // Acquire the lock, backing off exponentially while it is held.
  void lock() {
    const int max_delay = 1024;  // backoff cap, in microseconds
    int delay = 1;               // initial backoff, in microseconds
    while (flag.test_and_set(std::memory_order_acquire)) {
      std::this_thread::sleep_for(std::chrono::microseconds(delay));
      delay *= 2;
      if (delay > max_delay) {
        delay = max_delay;
      }
    }
  }

  // Release the lock.
  void unlock() { flag.clear(std::memory_order_release); }

 private:
  std::atomic_flag flag = ATOMIC_FLAG_INIT;
};
csrc/balance_serve/kvc2/src/utils/timer.hpp
0 → 100644
View file @
25cee581
#pragma once
#include <cassert>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include "easy_format.hpp"
// Format `value` in fixed-point notation with exactly two digits after the
// decimal point (the "R2" in the name).
inline std::string doubleToStringR2(double value) {
  std::stringstream formatted;
  formatted << std::fixed << std::setprecision(2) << value;
  return formatted.str();
}
// Accumulating stopwatch. Two modes:
//  - explicit: start()/stop() pairs accumulate into m_runningNs (merge() folds
//    another timer's total in);
//  - scoped: Timer("label") starts immediately and prints "label <ms> ms" on
//    destruction.
// Uses steady_clock (monotonic) rather than high_resolution_clock, which is not
// guaranteed to be monotonic and can go backwards on clock adjustments.
class Timer {
 public:
  std::string name;
  bool tmp_timer = false;  // true only for the scoped (print-on-destruct) mode

  Timer() {}
  Timer(std::string name) : name(name), tmp_timer(true) { start(); }
  ~Timer() {
    if (tmp_timer) {
      std::cout << name << " " << elapsedMs() << " ms" << std::endl;
    }
  }

  // Begin a measurement interval. Asserts the timer is not already running.
  void start() {
    m_startTime = std::chrono::steady_clock::now();
    assert(m_isRunning == false);
    m_isRunning = true;
  }

  // End the current interval and add its duration to the running total.
  void stop() {
    m_endTime = std::chrono::steady_clock::now();
    assert(m_isRunning == true);
    m_isRunning = false;
    m_runningNs += elapsedNs();  // elapsedNs() now uses m_endTime (not running)
  }

  // Nanoseconds of the current (or last) interval: start -> now if running,
  // start -> stop time otherwise.
  double elapsedNs() {
    std::chrono::time_point<std::chrono::steady_clock> endTime;
    if (m_isRunning) {
      endTime = std::chrono::steady_clock::now();
    } else {
      endTime = m_endTime;
    }
    return std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - m_startTime).count();
  }

  void printElapsedMilliseconds() { std::cout << elapsedNs() / 1e6 << " ms" << std::endl; }

  // Render a nanosecond count with a human-friendly unit (ns/us/ms/sec/min/h).
  // Flattened from the original six-level nested if/else; thresholds unchanged
  // (>= 1000 between ns/us/ms/sec, >= 60 between sec/min/h).
  static std::string ns_to_string(double duration) {
    double value = duration;
    if (value < 1000) {
      return doubleToStringR2(value) + " ns";
    }
    value /= 1000.0;
    if (value < 1000) {
      return doubleToStringR2(value) + " us";
    }
    value /= 1000.0;
    if (value < 1000) {
      return doubleToStringR2(value) + " ms";
    }
    value /= 1000.0;
    if (value < 60.0) {
      return doubleToStringR2(value) + " sec";
    }
    value /= 60.0;
    if (value < 60.0) {
      return doubleToStringR2(value) + " min";
    }
    return doubleToStringR2(value / 60.0) + " h";
  }

  // Total accumulated time across all completed start()/stop() intervals.
  double runningTimeNs() { return m_runningNs; }
  std::string runningTime() { return ns_to_string(m_runningNs); }

  std::string elapsedTime() { return ns_to_string(elapsedNs()); }
  double elapsedMs() { return elapsedNs() / 1e6; }

  // Throughput of `op_cnt` operations over the current elapsed interval.
  std::string report_throughput(size_t op_cnt) {
    double ops = op_cnt / elapsedMs() * 1000;
    return readable_number(ops) + "op/s";  // readable_number from easy_format.hpp
  }

  // Fold another (stopped) timer's accumulated total into this one.
  void merge(Timer& other) {
    assert(m_isRunning == false);
    assert(other.m_isRunning == false);
    m_runningNs += other.runningTimeNs();
  }

 private:
  std::chrono::time_point<std::chrono::steady_clock> m_startTime;
  std::chrono::time_point<std::chrono::steady_clock> m_endTime;
  bool m_isRunning = false;
  double m_runningNs = 0.0;  // sum of all completed intervals, in nanoseconds
};
// Tiny named-counter aggregator: inc() accumulates by name, print() dumps all.
class Counter {
 public:
  Counter() {}

  // Exposed directly so callers can read totals without an accessor.
  std::map<std::string, size_t> counters;

  // Add `num` to the counter called `name` (created at zero on first use).
  void inc(const char* name, size_t num) { counters[name] += num; };

  // Write every counter to stdout as "<name> : <value>".
  void print() {
    for (auto& entry : counters) {
      std::cout << entry.first << " : " << entry.second << std::endl;
    }
  };
};
csrc/balance_serve/kvc2/test/CMakeLists.txt
0 → 100644
View file @
25cee581
# Build rules for the kvc2 unit/perf test executables.
# Debug-friendly flags for everything in this directory.
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fopenmp")
# set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -pthread")

add_subdirectory(kvc2test)

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src)

add_executable(hashmap_test hashmap_test.cpp)
target_link_libraries(hashmap_test PRIVATE TBB::tbb)

add_executable(xxHash_test xxHash_test.cpp)
target_link_libraries(xxHash_test PRIVATE xxhash)

# Declare one test executable linked against async_store (+ gflags).
function(add_async_store_executable source_file)
  get_filename_component(target_name ${source_file} NAME_WE)
  # Use the source file's basename (without extension) as the target name.
  add_executable(${target_name} ${source_file})
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/nlohmann/single_include)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
  target_link_libraries(${target_name} PRIVATE async_store gflags)
endfunction()

add_async_store_executable(async_store_test.cpp)

# Declare one test executable linked against kvc2 + async_store (+ gflags).
function(add_kvc2_executable source_file)
  get_filename_component(target_name ${source_file} NAME_WE)
  # Use the source file's basename (without extension) as the target name.
  add_executable(${target_name} ${source_file})
  # target_compile_options(${target_name} PRIVATE -fopenmp -fno-strict-aliasing)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/nlohmann/single_include)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
  target_link_libraries(${target_name} PRIVATE kvc2 async_store gflags)
endfunction()

add_kvc2_executable(test_lock_free_queue.cpp)
add_kvc2_executable(test_queue_perf.cpp)

# Disable deprecated test
# add_kvc2_executable(prefix_test.cpp)
# add_kvc2_executable(kvcache_disk_insert_read_test.cpp)
# add_kvc2_executable(kvcache_mem_eviction_test.cpp)
# add_kvc2_executable(kvcache_mem_insert_read_test.cpp)
# add_kvc2_executable(kvcache_save_load_test.cpp)
# add_kvc2_executable(kvc2_export_header_test.cpp)
# add_kvc2_executable(kvc2_export_load_test.cpp)

# Extra deps for async_store_test beyond what the helper function adds.
target_include_directories(async_store_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..//third_party/nlohmann/single_include)
target_include_directories(async_store_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..//third_party/spdlog/include)
target_link_libraries(async_store_test PRIVATE xxhash)

add_executable(test_std_list test_std_list.cpp)

# CUDA-dependent targets.
add_executable(test_cuda_stream test_cuda_stream.cpp)
target_include_directories(test_cuda_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(test_cuda_stream PRIVATE CUDA::cudart)

add_executable(test_cuda_stream_manager test_cuda_stream_manager.cpp)
target_include_directories(test_cuda_stream_manager PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_link_libraries(test_cuda_stream_manager PRIVATE cuda_stream_manager)

# Header-only utility tests.
add_executable(test_periodic_task test_periodic_task.cpp)
target_include_directories(test_periodic_task PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)

add_executable(test_page_pool page_pool_test.cpp)
target_include_directories(test_page_pool PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_include_directories(test_page_pool PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
\ No newline at end of file
csrc/balance_serve/kvc2/test/hashmap_test.cpp
0 → 100644
View file @
25cee581
#include <tbb/concurrent_hash_map.h>
#include <iostream>
int
main
()
{
tbb
::
concurrent_hash_map
<
int
,
int
>
map
;
map
.
insert
({
1
,
2
});
decltype
(
map
)
::
accessor
a
;
std
::
cout
<<
map
.
find
(
a
,
1
)
<<
std
::
endl
;
return
0
;
}
csrc/balance_serve/kvc2/test/kvc2_export_header_test.cpp
0 → 100644
View file @
25cee581
#include "kvc2.h"
#include "kvc2_test_utils.cpp"
// End-to-end test of the kvc2 export-header API: raw_insert/raw_read round
// trips with full, prefix, partial and disjoint token-id overlaps, then save().
// Usage: <binary> <disk_cache_path> (parsed by init() in kvc2_test_utils.cpp).
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  std::mt19937 gen(123);  // fixed seed so generated ids/data are reproducible
  KVC2Config config = {
      .path = FLAGS_disk_cache_path,
      // NOTE(review): hard-coded developer path ("conifg" also looks like a typo
      // for "config") -- confirm before running outside the author's machine.
      .config_path = std::string("/home/xwy/conifg"),
      .block_length = BlockLength,
      .memory_pool_size = size_t(10e9),
      .evict_count = 20,
  };
  auto kvcc = create_kvc2(config);
  auto io = kvcc->start_io_thread();
  SPDLOG_INFO("Disk Test");
  // Insert 10 blocks of random KV data keyed by 10*BlockLength random ids.
  auto ids = random_ids(10 * BlockLength, gen);
  auto h1 = random_kvcache(qwen_cache_info, 10, gen);
  kvcc->raw_insert(qwen_cache_info, reinterpret_cast<TokenPtr>(ids.data()), ids.size(), h1);
  // complete same: reading the identical sequence must return identical data
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids.data()), ids.size(), h2);
    cmp_handle_data(qwen_cache_info, h1, h2);
  }
  // complete prefix: a 3-block prefix must match the first 3 blocks
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 3 * BlockLength);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
    cmp_handle_data(qwen_cache_info, h1, h2, 3);
  }
  // common prefix: 5 matching blocks + 2.5 blocks of fresh random ids;
  // only the 5-block common prefix is expected to match
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 5 * BlockLength);
    auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
    ids2.insert(ids2.end(), rids.begin(), rids.end());
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
    cmp_handle_data(qwen_cache_info, h1, h2, 5);
  }
  // no prefix: unrelated ids; only checks that raw_read does not crash
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = random_ids(10 * BlockLength, gen);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
  }
  // insert partly new: reuse the first 5 blocks (ids and data), randomize the rest
  auto h2 = random_kvcache(qwen_cache_info, 10, gen);
  copy_kvcache(h1, h2, 0, 5);
  auto ids2 = random_ids(10 * BlockLength, gen);
  for (size_t i = 0; i < 5 * BlockLength; i++) {
    ids2[i] = ids[i];
  }
  kvcc->raw_insert(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
  // read new part: a 7-block prefix (plus one trailing token that deliberately
  // breaks block alignment) must match the freshly inserted handle
  {
    auto h3 = empty_kvcache(qwen_cache_info, 10);
    auto ids3 = std::vector<Token>(ids2.begin(), ids2.begin() + 7 * BlockLength);
    ids3.push_back(123);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids3.data()), ids3.size(), h3);
    cmp_handle_data(qwen_cache_info, h3, h2, 7);
  }
  kvcc->save();  // persist metadata so kvc2_export_load_test can re-open it
  kvcc->stop_io_thread();
  io.join();
  SPDLOG_WARN("{} Test Passed", __FILE__);
  return 0;
}
\ No newline at end of file
csrc/balance_serve/kvc2/test/kvc2_export_load_test.cpp
0 → 100644
View file @
25cee581
#include "kvc2.h"
#include "kvc2_test_utils.cpp"
// Companion to kvc2_export_header_test: re-opens the cache written by that test
// (kvcc->load()) and repeats the prefix-read checks against reference data
// regenerated with the same RNG seed (123).
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  std::mt19937 gen(123);  // must match the writer's seed so ids line up with disk
  KVC2Config config = {
      .path = FLAGS_disk_cache_path,
      .block_length = BlockLength,
      .memory_pool_size = size_t(10e9),
      .evict_count = 20,
  };
  auto kvcc = create_kvc2(config);
  kvcc->load();  // restore what the export_header test persisted
  auto io = kvcc->start_io_thread();
  SPDLOG_INFO("Disk Test");
  auto ids = random_ids(10 * BlockLength, gen);
  auto h1 = empty_kvcache(qwen_cache_info, 10);
  // kvcc->raw_insert(qwen_cache_info, reinterpret_cast<IDptr>(ids.data()), ids.size(), h1);
  // complete same: h1 is filled from disk instead of being inserted here
  {
    // auto h2 = empty_kvcache(qwen_cache_info, 10);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids.data()), ids.size(), h1);
    // cmp_handle_data(qwen_cache_info, h1, h2);
  }
  // complete prefix: a 3-block prefix must match the first 3 blocks of h1
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 3 * BlockLength);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
    cmp_handle_data(qwen_cache_info, h1, h2, 3);
  }
  // common prefix: 5 matching blocks + 2.5 blocks of fresh random ids
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 5 * BlockLength);
    auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
    ids2.insert(ids2.end(), rids.begin(), rids.end());
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
    cmp_handle_data(qwen_cache_info, h1, h2, 5);
  }
  // no prefix: unrelated ids; only checks that raw_read does not crash
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = random_ids(10 * BlockLength, gen);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
  }
  // insert partly new: reuse the first 5 blocks (ids and data), randomize the rest
  auto h2 = random_kvcache(qwen_cache_info, 10, gen);
  copy_kvcache(h1, h2, 0, 5);
  auto ids2 = random_ids(10 * BlockLength, gen);
  for (size_t i = 0; i < 5 * BlockLength; i++) {
    ids2[i] = ids[i];
  }
  kvcc->raw_insert(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
  // read new part: a 7-block prefix (plus one trailing token that deliberately
  // breaks block alignment) must match the freshly inserted handle
  {
    auto h3 = empty_kvcache(qwen_cache_info, 10);
    auto ids3 = std::vector<Token>(ids2.begin(), ids2.begin() + 7 * BlockLength);
    ids3.push_back(123);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids3.data()), ids3.size(), h3);
    cmp_handle_data(qwen_cache_info, h3, h2, 7);
  }
  kvcc->stop_io_thread();
  io.join();
  SPDLOG_WARN("{} Test Passed", __FILE__);
  return 0;
}
\ No newline at end of file
csrc/balance_serve/kvc2/test/kvc2_test_utils.cpp
0 → 100644
View file @
25cee581
#include <cassert>  // assert (was only available transitively)
#include <cstring>  // memcmp (was only available transitively)
#include <optional>
#include <random>

#include "kvc2.h"

#define FMT_HEADER_ONLY
#include <spdlog/spdlog.h>
// Number of tokens per cache block used by every test in this file.
const int BlockLength = 256;
// Disk cache directory; set from argv[1] by init().
std::string FLAGS_disk_cache_path;
// Parse the single positional argument <disk_cache_path> into the global
// FLAGS_disk_cache_path. Exits the process on bad usage.
void init(int argc, char* argv[]) {
  if (argc != 2) {
    // The path is taken positionally from argv[1]; the previous usage text
    // advertised a gflags-style "--disk_cache_path=xxx", which would have been
    // stored verbatim (flag prefix and all) as the path.
    fmt::print("Usage: {} <disk_cache_path>\n", argv[0]);
    exit(1);
  }
  FLAGS_disk_cache_path = argv[1];
  if (FLAGS_disk_cache_path.empty()) {
    fmt::print("disk_cache_path is empty");
    exit(1);
  }
}
using
namespace
kvc2
;
// Allocate one 4096-byte-aligned data block sized for BlockLength tokens of
// `info`'s cache layout. Contents are uninitialized; ownership is deliberately
// leaked (test helper).
data_block_ptr empty_block(CacheInfo info) {
  const size_t block_bytes = info.element_size(BlockLength);
  std::byte* storage = new (std::align_val_t(4096)) std::byte[block_bytes];
  return reinterpret_cast<data_block_ptr>(storage);
}
// Allocate one block and fill it with pseudo-random 64-bit words drawn from `gen`.
data_block_ptr random_block(CacheInfo info, std::mt19937& gen) {
  data_block_ptr block = empty_block(info);
  auto* words = (uint64_t*)block;
  const size_t word_count = info.element_size(BlockLength) / 8;
  for (size_t w = 0; w < word_count; ++w) {
    words[w] = gen();
  }
  return block;
}
// Build one layer of `block_count` random blocks from a generator seeded with `seed`.
layer_data random_blocks(CacheInfo info, size_t block_count, size_t seed) {
  std::mt19937 rng(seed);
  layer_data blocks;
  for (size_t n = 0; n < block_count; ++n) {
    blocks.push_back(random_block(info, rng));
  }
  return blocks;
}
// Build one layer of `block_count` uninitialized blocks.
layer_data empty_blocks(CacheInfo info, size_t block_count) {
  layer_data blocks;
  for (size_t n = 0; n < block_count; ++n) {
    blocks.push_back(empty_block(info));
  }
  return blocks;
}
// Copy `length` block pointers starting at `block_start` from every layer of
// `from` into the same positions of `to` (shallow pointer copy).
void copy_kvcache(std::vector<layer_data>& from, std::vector<layer_data>& to, size_t block_start, size_t length) {
  const size_t layer_count = from.size();
  for (size_t layer = 0; layer < layer_count; ++layer) {
    for (size_t off = 0; off < length; ++off) {
      const size_t idx = block_start + off;
      to[layer][idx] = from[layer][idx];
    }
  }
}
// Generate a full random kvcache: one layer of `block_count` random blocks per
// hidden layer of `info`, filled in parallel with OpenMP.
// Fix: the original called gen() from inside the `omp parallel for`, racing on
// the shared std::mt19937 (mt19937 is not thread-safe) and making the result
// non-deterministic. Seeds are now drawn serially before the parallel region,
// matching the approach already used in kvc2test/common.hpp.
std::vector<layer_data> random_kvcache(CacheInfo info, size_t block_count, std::mt19937& gen) {
  std::vector<layer_data> re;
  re.resize(info.hidden_layer_count());
  fmt::print("Generating random kvcache, layer {}\n", info.hidden_layer_count());
  // Draw one seed per layer while still single-threaded.
  std::vector<size_t> seeds;
  for (size_t i = 0; i < info.hidden_layer_count(); i++) {
    seeds.push_back(gen());
  }
#pragma omp parallel for
  for (size_t i = 0; i < info.hidden_layer_count(); i++) {
    re[i] = random_blocks(info, block_count, seeds[i]);
  }
  return re;
}
// Generate an all-empty kvcache: one layer of `block_count` blocks per hidden
// layer of `info`, allocated in parallel with OpenMP.
std::vector<layer_data> empty_kvcache(CacheInfo info, size_t block_count) {
  std::vector<layer_data> result;
  result.resize(info.hidden_layer_count());
  fmt::print("Generating empty kvcache, layer {}\n", info.hidden_layer_count());
#pragma omp parallel for
  for (size_t layer = 0; layer < info.hidden_layer_count(); layer++) {
    result[layer] = empty_blocks(info, block_count);
  }
  return result;
}
std
::
vector
<
Token
>
random_ids
(
size_t
length
,
std
::
mt19937
&
gen
)
{
std
::
vector
<
Token
>
re
;
for
(
size_t
i
=
0
;
i
<
length
;
i
++
)
{
re
.
push_back
(
gen
());
}
return
re
;
}
// Cache identity used by every test in this file: the key cache of the
// BF16-quantized qwen2-72b-instruct model.
CacheInfo qwen_cache_info = {
    .model_name = "qwen2-72b-instruct",
    .is_key_cache = true,
    .quant_type = "BF16",
};
// Compare two kvcache handles layer by layer.
// blocks: when set, only the first `blocks` blocks of each layer are compared;
//         otherwise layers must have equal block counts and are compared fully.
// NOTE(review): every comparison happens inside assert(), so this whole check
// is compiled out under NDEBUG builds -- the test would silently pass.
void cmp_handle_data(CacheInfo info, std::vector<layer_data>& h1, std::vector<layer_data>& h2,
                     std::optional<size_t> blocks = std::nullopt) {
  assert(h1.size() == h2.size());
  for (size_t i = 0; i < h1.size(); i++) {
    auto& b1 = h1[i];
    auto& b2 = h2[i];
    if (blocks.has_value() == false) {
      assert(b1.size() == b2.size());
    }
    // signed int is adequate for test-sized block counts
    int cmp_to = blocks.has_value() ? blocks.value() : b1.size();
    for (int j = 0; j < cmp_to; j++) {
      auto e1 = reinterpret_cast<void*>(b1[j]);
      auto e2 = reinterpret_cast<void*>(b2[j]);
      // byte-wise equality over one whole block
      assert(memcmp(e1, e2, info.element_size(BlockLength)) == 0);
    }
  }
  fmt::print("KVCacheHandle cmp ok\n");
}
csrc/balance_serve/kvc2/test/kvc2test/CMakeLists.txt
0 → 100644
View file @
25cee581
# Build rules for the kvc2 integration tests in this directory.
# Debug-friendly flags for every test below.
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fopenmp")

# Declare one test executable per source file, linked against kvc2 + async_store.
function(add_kvc2_test source_file)
  get_filename_component(target_name ${source_file} NAME_WE)
  # Use the source file's basename (without extension) as the target name.
  add_executable(${target_name} ${source_file})
  # target_compile_options(${target_name} PRIVATE -fopenmp -fno-strict-aliasing)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/nlohmann/single_include)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/spdlog/include)
  target_link_libraries(${target_name} PRIVATE kvc2 async_store)
endfunction()

add_kvc2_test(raw_insert_read.cpp)
add_kvc2_test(lookup.cpp)
add_kvc2_test(lookup-alt.cpp)
add_kvc2_test(lookup-alt-gpu.cpp)
add_kvc2_test(lookup-mt.cpp)
add_kvc2_test(lookup-gpu.cpp)
add_kvc2_test(lookup-gpu-mt.cpp)
add_kvc2_test(lookup-gpu-async.cpp)
add_kvc2_test(append-tokens.cpp)
add_kvc2_test(flush-back.cpp)
add_kvc2_test(check-flush-back.cpp)
add_kvc2_test(lookup-without-vcache.cpp)
add_kvc2_test(lookup-gpu-mt-without-vcache.cpp)
csrc/balance_serve/kvc2/test/kvc2test/append-tokens.cpp
0 → 100644
View file @
25cee581
#include <future>
#include "common.hpp"
// Exercises DoubleCacheHandleInterface::append_tokens: three concurrent workers
// each look up a 3-page prompt of a 10-page token sequence, hand-fill the
// unmatched GPU pages, extend the handle to the full sequence, and verify a
// second full-length lookup sees everything matched.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
#pragma omp parallel for
  for (size_t ti = 0; ti < 3; ti++) {
    auto [kcache, vcache] = kvc2->get_kvcache();
    std::mt19937 gen(ti + 123);  // distinct seed per worker -> distinct sequences
    size_t total_page = 10;
    TokenLength total_length = total_page * config.num_token_per_page;
    auto tokens = random_ids(total_length, gen);
    TokenLength prompt_length = 3 * config.num_token_per_page;
    auto k1 = random_kvcache(total_page, gen);
    auto v1 = random_kvcache(total_page, gen);
    // First lookup: only the prompt prefix is supplied; total_length is the
    // estimated final length (presumably sizing the GPU reservation -- confirm).
    {
      std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
      kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), prompt_length, total_length,
                                [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
      auto fut = p.get_future();
      fut.wait();
      auto h = fut.get();
      // Matched length must end on a page boundary.
      assert(h->matched_length() % config.num_token_per_page == 0);
      size_t matched_block = h->matched_length() / config.num_token_per_page;
      auto block_idx = h->get_gpu_block_idx();
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
      // Manually populate the GPU pages past the matched prefix...
      for (size_t at = matched_block; at < block_idx.size(); at++) {
        copy_cpu_gpu(block_idx, kcache, vcache, k1, v1, at);
      }
      // ...then declare the full sequence valid on the handle.
      h->append_tokens(tokens.data(), total_length);
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, total_page);
    }
    // Second lookup: the whole sequence must now be fully matched.
    {
      std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
      kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
                                [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
      auto fut = p.get_future();
      fut.wait();
      auto h = fut.get();
      assert(h->matched_length() == total_length);
      size_t matched_block = h->matched_length() / config.num_token_per_page;
      auto block_idx = h->get_gpu_block_idx();
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
    }
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
csrc/balance_serve/kvc2/test/kvc2test/check-flush-back.cpp
0 → 100644
View file @
25cee581
#include <future>
#include "common.hpp"
// Re-opens an existing cache (kvc2->load()) with a deliberately small GPU pool
// (12 pages) and verifies previously persisted data can still be looked up to
// GPU and matches regenerated reference data.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  // Shrink the GPU pool so consecutive iterations exercise page reuse.
  config.gpu_cache_config->total_kvcache_pages = 12;
  auto kvc2 = kvc2::create_kvc2(config);
  kvc2->load();  // restore what the flush-back test persisted
  // #pragma omp parallel for
  for (size_t ti = 0; ti < 2; ti++) {
    SPDLOG_WARN("Test {}", ti);
    auto [kcache, vcache] = kvc2->get_kvcache();
    // Seeds presumably mirror the writer's (flush-back.cpp) so identical
    // reference data can be regenerated here -- verify against that file.
    std::mt19937 gen(ti + 123);
    size_t total_page = 10;
    TokenLength total_length = total_page * config.num_token_per_page;
    auto tokens = random_ids(total_length, gen);
    auto k1 = random_kvcache(total_page, gen);
    auto v1 = random_kvcache(total_page, gen);
    {
      std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
      kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
                                [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
      auto fut = p.get_future();
      fut.wait();
      auto h = fut.get();
      // Everything that was flushed back must be found again.
      assert(h->matched_length() == total_length);
      size_t matched_block = h->matched_length() / config.num_token_per_page;
      auto block_idx = h->get_gpu_block_idx();
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
    }
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
csrc/balance_serve/kvc2/test/kvc2test/common.hpp
0 → 100644
View file @
25cee581
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 06:02:41
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-12-11 07:34:10
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <random>
#include <thread>
#include "kvc2.h"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
using
namespace
kvc2
;
// Return `t` advanced by `n` BYTES (not elements), keeping the pointee type.
// Fix: the original round-tripped the pointer through size_t, which is not
// guaranteed to be wide enough to hold a pointer; char* arithmetic is the
// well-defined way to do byte offsets.
template <typename T>
T* offset_by_bytes(T* t, size_t n) {
  return reinterpret_cast<T*>(reinterpret_cast<char*>(t) + n);
}
// Shared state for the kvc2test binaries. NOTE(review): these are non-inline
// globals in a header -- safe only while each test binary includes common.hpp
// from exactly one translation unit; confirm before reusing elsewhere.
std::string FLAGS_disk_cache_path;  // set from argv[1] by init()
kvc2::KVC2Config config;            // populated by init()

// GPU page-cache configuration used by the tests (sized for Qwen2.5-7B across
// two GPU devices).
kvc2::GPUPageCacheConfig qw25_7B_gpu_config{
    .gpu_only = false,
    .gpu_devices_id = {0, 1},
    .layer_count = 28,
    .total_kvcache_pages = 40,
    .num_token_per_page = 256,
    .num_k_heads = 4,
    .k_head_dim = 896,
    .full_kv_cache_on_each_gpu = false,
    .k_cache_on = true,
    .v_cache_on = true,
    .tensor_type = torch::kBFloat16,
    .num_streams_per_device = 4,
};
// Model/quantization identity every test looks up and compares against.
ModelName test_model_name = "Qwen2.5-7B-Instruct";
QuantType test_quant_type = "FP16";
CacheInfo test_cache_info{
    .model_name = test_model_name,
    .is_key_cache = true,
    .quant_type = test_quant_type,
};
// Parse the single positional argument <disk_cache_path>, load the quant/model
// config JSON files from ./config (relative to the working directory), and fill
// the global `config`. Exits the process on bad usage.
void init(int argc, char* argv[]) {
  if (argc != 2) {
    fmt::print("Usage: {} <disk_cache_path>\n", argv[0]);
    exit(1);
  }
  load_quant_configs("./config/quant_configs.json");
  load_model_configs("./config/model_configs.json");
  FLAGS_disk_cache_path = argv[1];
  if (FLAGS_disk_cache_path.empty()) {
    fmt::print("disk_cache_path is empty\n");
    exit(1);
  }
  config.path = FLAGS_disk_cache_path;
  config.config_path = "./config";
  config.gpu_cache_config = qw25_7B_gpu_config;
}
// Allocate one 4096-byte-aligned, zero-filled block sized for a full page of
// the test cache configuration. Ownership is deliberately leaked (test helper).
data_block_ptr empty_block() {
  const size_t block_bytes = test_cache_info.element_size(config.num_token_per_page);
  std::byte* storage = new (std::align_val_t(4096)) std::byte[block_bytes];
  memset(storage, 0, block_bytes);
  return reinterpret_cast<data_block_ptr>(storage);
}
// Allocate one block and fill it with pseudo-random 64-bit words from `gen`.
data_block_ptr random_block(std::mt19937& gen) {
  data_block_ptr block = empty_block();
  auto* words = (uint64_t*)block;
  const size_t word_count = test_cache_info.element_size(config.num_token_per_page) / 8;
  for (size_t w = 0; w < word_count; ++w) {
    words[w] = gen();
  }
  return block;
}
// Build one layer of `block_count` random blocks from a generator seeded with `seed`.
layer_data random_blocks(size_t block_count, size_t seed) {
  std::mt19937 rng(seed);
  layer_data blocks;
  for (size_t n = 0; n < block_count; ++n) {
    blocks.push_back(random_block(rng));
  }
  return blocks;
}
// Build one layer of `block_count` zeroed blocks.
layer_data empty_blocks(size_t block_count) {
  layer_data blocks;
  for (size_t n = 0; n < block_count; ++n) {
    blocks.push_back(empty_block());
  }
  return blocks;
}
// Copy `length` block pointers starting at `block_start` from every layer of
// `from` into the same positions of `to` (shallow pointer copy).
void copy_kvcache(std::vector<layer_data>& from, std::vector<layer_data>& to, size_t block_start, size_t length) {
  const size_t layer_count = from.size();
  for (size_t layer = 0; layer < layer_count; ++layer) {
    for (size_t off = 0; off < length; ++off) {
      const size_t idx = block_start + off;
      to[layer][idx] = from[layer][idx];
    }
  }
}
std
::
vector
<
layer_data
>
random_kvcache
(
size_t
block_count
,
std
::
mt19937
&
gen
)
{
std
::
vector
<
layer_data
>
re
;
re
.
resize
(
test_cache_info
.
hidden_layer_count
());
fmt
::
print
(
"Generating random kvcache, layer {}
\n
"
,
test_cache_info
.
hidden_layer_count
());
std
::
vector
<
std
::
mt19937
>
gens
;
for
(
size_t
i
=
0
;
i
<
test_cache_info
.
hidden_layer_count
();
i
++
)
{
gens
.
push_back
(
std
::
mt19937
(
gen
()));
}
#pragma omp parallel for
for
(
size_t
i
=
0
;
i
<
test_cache_info
.
hidden_layer_count
();
i
++
)
{
re
[
i
]
=
random_blocks
(
block_count
,
gens
[
i
]());
}
return
re
;
}
std
::
vector
<
layer_data
>
empty_kvcache
(
size_t
block_count
)
{
std
::
vector
<
layer_data
>
re
;
re
.
resize
(
test_cache_info
.
hidden_layer_count
());
fmt
::
print
(
"Generating empty kvcache, layer {}
\n
"
,
test_cache_info
.
hidden_layer_count
());
#pragma omp parallel for
for
(
size_t
i
=
0
;
i
<
test_cache_info
.
hidden_layer_count
();
i
++
)
{
re
[
i
]
=
empty_blocks
(
block_count
);
}
return
re
;
}
std
::
vector
<
Token
>
random_ids
(
size_t
length
,
std
::
mt19937
&
gen
)
{
std
::
vector
<
Token
>
re
;
for
(
size_t
i
=
0
;
i
<
length
;
i
++
)
{
re
.
push_back
(
gen
());
}
return
re
;
}
std
::
vector
<
layer_data
>
slice
(
std
::
vector
<
layer_data
>&
h1
,
size_t
start
,
size_t
end
){
std
::
vector
<
layer_data
>
re
;
for
(
auto
&
l
:
h1
){
layer_data
new_layer
;
new_layer
.
insert
(
new_layer
.
end
(),
l
.
begin
()
+
start
,
l
.
begin
()
+
end
);
re
.
push_back
(
new_layer
);
}
return
re
;
}
void
cmp_handle_data
(
std
::
vector
<
layer_data
>
h1
,
std
::
vector
<
layer_data
>
h2
,
std
::
optional
<
size_t
>
blocks
=
std
::
nullopt
)
{
assert
(
h1
.
size
()
==
h2
.
size
());
for
(
size_t
i
=
0
;
i
<
h1
.
size
();
i
++
)
{
auto
&
b1
=
h1
[
i
];
auto
&
b2
=
h2
[
i
];
if
(
blocks
.
has_value
()
==
false
)
{
assert
(
b1
.
size
()
==
b2
.
size
());
}
int
cmp_to
=
blocks
.
has_value
()
?
blocks
.
value
()
:
b1
.
size
();
for
(
int
j
=
0
;
j
<
cmp_to
;
j
++
)
{
auto
e1
=
reinterpret_cast
<
void
*>
(
b1
[
j
]);
auto
e2
=
reinterpret_cast
<
void
*>
(
b2
[
j
]);
assert
(
memcmp
(
e1
,
e2
,
test_cache_info
.
element_size
(
config
.
num_token_per_page
))
==
0
);
}
}
fmt
::
print
(
"KVCacheHandle cmp ok
\n
"
);
}
// Copy logical block `at` of both K and V from every GPU shard back into the
// CPU-side caches. The byte-offset arithmetic implies each GPU holds a
// contiguous 1/gpu_count slice of a block; slice `gpu_idx` lands at offset
// gpu_idx * element_size_per_gpu inside the CPU block.
void copy_gpu_cpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
                  std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k_cpu,
                  std::vector<layer_data>& v_cpu, size_t at) {
  const size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
  const size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
  // Fetch one GPU shard of one layer into its offset within the CPU block.
  auto fetch_shard = [&](std::vector<torch::Tensor>& gpu_cache, std::vector<layer_data>& cpu_cache,
                         size_t layer, size_t gpu_idx) {
    auto staging = gpu_cache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
    void* src = staging.data_ptr();
    void* dst = offset_by_bytes(cpu_cache[layer][at], gpu_idx * element_size_per_gpu);
    memcpy(dst, src, element_size_per_gpu);
  };
  for (size_t layer = 0; layer < test_cache_info.hidden_layer_count(); ++layer) {
    for (size_t gpu_idx = 0; gpu_idx < gpu_count; ++gpu_idx) {
      fetch_shard(kcache, k_cpu, layer, gpu_idx);
      fetch_shard(vcache, v_cpu, layer, gpu_idx);
    }
  }
}
// Push logical block `at` of both K and V from the CPU-side caches onto every
// GPU shard. Each GPU receives its contiguous 1/gpu_count byte slice of the
// block.
void copy_cpu_gpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
                  std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k_cpu,
                  std::vector<layer_data>& v_cpu, size_t at) {
  const size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
  const size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
  // Stage through a CPU tensor (same shape/dtype as the device slot),
  // overwrite its payload with this GPU's byte slice, then copy_() it back.
  auto push_shard = [&](std::vector<torch::Tensor>& gpu_cache, std::vector<layer_data>& cpu_cache,
                        size_t layer, size_t gpu_idx) {
    auto staging = gpu_cache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
    void* dst = staging.data_ptr();
    void* src = offset_by_bytes(cpu_cache[layer][at], gpu_idx * element_size_per_gpu);
    memcpy(dst, src, element_size_per_gpu);
    gpu_cache[gpu_idx][layer][block_idx[at]].copy_(staging);
  };
  for (size_t layer = 0; layer < test_cache_info.hidden_layer_count(); ++layer) {
    for (size_t gpu_idx = 0; gpu_idx < gpu_count; ++gpu_idx) {
      push_shard(kcache, k_cpu, layer, gpu_idx);
      push_shard(vcache, v_cpu, layer, gpu_idx);
    }
  }
}
// Read up to `num_blocks` blocks back from the GPU caches (via the handle's
// block index list) and assert they match the expected CPU-side k1/v1.
void cmp_handle_gpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
                    std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k1,
                    std::vector<layer_data>& v1, size_t num_blocks) {
  auto k_readback = empty_kvcache(num_blocks);
  auto v_readback = empty_kvcache(num_blocks);
  const size_t copy_count = std::min(block_idx.size(), num_blocks);
  for (size_t j = 0; j < copy_count; ++j) {
    copy_gpu_cpu(block_idx, kcache, vcache, k_readback, v_readback, j);
  }
  cmp_handle_data(k1, k_readback, num_blocks);
  cmp_handle_data(v1, v_readback, num_blocks);
}
csrc/balance_serve/kvc2/test/kvc2test/flush-back.cpp
0 → 100644
View file @
25cee581
#include <future>
#include "common.hpp"
int
main
(
int
argc
,
char
*
argv
[])
{
init
(
argc
,
argv
);
spdlog
::
set_level
(
spdlog
::
level
::
debug
);
config
.
gpu_cache_config
->
total_kvcache_pages
=
12
;
auto
kvc2
=
kvc2
::
create_kvc2
(
config
);
// #pragma omp parallel for
for
(
size_t
ti
=
0
;
ti
<
2
;
ti
++
)
{
SPDLOG_WARN
(
"Test {}"
,
ti
);
auto
[
kcache
,
vcache
]
=
kvc2
->
get_kvcache
();
std
::
mt19937
gen
(
ti
+
123
);
size_t
total_page
=
10
;
TokenLength
total_length
=
total_page
*
config
.
num_token_per_page
;
auto
tokens
=
random_ids
(
total_length
,
gen
);
TokenLength
prompt_length
=
3
*
config
.
num_token_per_page
;
auto
k1
=
random_kvcache
(
total_page
,
gen
);
auto
v1
=
random_kvcache
(
total_page
,
gen
);
{
std
::
promise
<
std
::
shared_ptr
<
DoubleCacheHandleInterface
>>
p
;
kvc2
->
lookup_to_gpu_async
(
test_model_name
,
test_quant_type
,
tokens
.
data
(),
prompt_length
,
total_length
,
[
&
p
](
std
::
shared_ptr
<
DoubleCacheHandleInterface
>
h
)
{
p
.
set_value
(
h
);
});
auto
fut
=
p
.
get_future
();
fut
.
wait
();
auto
h
=
fut
.
get
();
assert
(
h
->
matched_length
()
%
config
.
num_token_per_page
==
0
);
size_t
matched_block
=
h
->
matched_length
()
/
config
.
num_token_per_page
;
auto
block_idx
=
h
->
get_gpu_block_idx
();
cmp_handle_gpu
(
block_idx
,
kcache
,
vcache
,
k1
,
v1
,
matched_block
);
for
(
size_t
at
=
matched_block
;
at
<
block_idx
.
size
();
at
++
)
{
copy_cpu_gpu
(
block_idx
,
kcache
,
vcache
,
k1
,
v1
,
at
);
}
h
->
append_tokens
(
tokens
.
data
(),
total_length
);
cmp_handle_gpu
(
block_idx
,
kcache
,
vcache
,
k1
,
v1
,
total_page
);
}
{
std
::
promise
<
std
::
shared_ptr
<
DoubleCacheHandleInterface
>>
p
;
kvc2
->
lookup_to_gpu_async
(
test_model_name
,
test_quant_type
,
tokens
.
data
(),
total_length
,
total_length
,
[
&
p
](
std
::
shared_ptr
<
DoubleCacheHandleInterface
>
h
)
{
p
.
set_value
(
h
);
});
auto
fut
=
p
.
get_future
();
fut
.
wait
();
auto
h
=
fut
.
get
();
assert
(
h
->
matched_length
()
==
total_length
);
size_t
matched_block
=
h
->
matched_length
()
/
config
.
num_token_per_page
;
auto
block_idx
=
h
->
get_gpu_block_idx
();
cmp_handle_gpu
(
block_idx
,
kcache
,
vcache
,
k1
,
v1
,
matched_block
);
}
}
kvc2
->
save
();
SPDLOG_CRITICAL
(
"All Test Passed: {}"
,
argv
[
0
]);
return
0
;
}
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt-gpu.cpp
0 → 100644
View file @
25cee581
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:29:45
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:56:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include <future>
#include "common.hpp"
int
main
(
int
argc
,
char
*
argv
[])
{
init
(
argc
,
argv
);
spdlog
::
set_level
(
spdlog
::
level
::
trace
);
auto
kvc2
=
kvc2
::
create_kvc2
(
config
);
std
::
mt19937
gen
(
123
);
std
::
vector
<
std
::
vector
<
Token
>>
ids
;
std
::
vector
<
std
::
vector
<
layer_data
>>
k
,
v
;
for
(
size_t
i
=
0
;
i
<
10
;
i
++
)
{
ids
.
push_back
(
random_ids
(
1
*
config
.
num_token_per_page
,
gen
));
k
.
push_back
(
random_kvcache
(
1
,
gen
));
v
.
push_back
(
random_kvcache
(
1
,
gen
));
kvc2
->
raw_insert
(
test_model_name
,
test_quant_type
,
ids
[
i
].
data
(),
ids
[
i
].
size
(),
k
[
i
],
v
[
i
]);
}
kvc2
->
debug
();
{
// all match
std
::
vector
<
Token
*>
chunks
;
std
::
vector
<
TokenLength
>
lengths
;
for
(
size_t
i
=
0
;
i
<
10
;
i
++
)
{
chunks
.
push_back
(
ids
[
i
].
data
());
lengths
.
push_back
(
ids
[
i
].
size
());
}
std
::
promise
<
std
::
shared_ptr
<
DoubleCacheHandleInterface
>>
p
;
kvc2
->
lookup_alt_to_gpu_async
(
test_model_name
,
test_quant_type
,
chunks
,
lengths
,
15
*
config
.
num_token_per_page
,
[
&
p
](
std
::
shared_ptr
<
DoubleCacheHandleInterface
>
h
)
{
p
.
set_value
(
h
);
});
auto
fut
=
p
.
get_future
();
fut
.
wait
();
auto
h
=
fut
.
get
();
auto
hk
=
h
->
handle_data
(
true
);
auto
hv
=
h
->
handle_data
(
false
);
for
(
size_t
i
=
0
;
i
<
10
;
i
++
)
{
cmp_handle_data
(
slice
(
hk
,
i
,
i
+
1
),
k
[
i
],
1
);
cmp_handle_data
(
slice
(
hv
,
i
,
i
+
1
),
v
[
i
],
1
);
}
auto
block_idx
=
h
->
get_gpu_block_idx
();
auto
[
kcache
,
vcache
]
=
kvc2
->
get_kvcache
();
for
(
size_t
i
=
0
;
i
<
10
;
i
++
)
{
std
::
vector
<
size_t
>
blocks
=
{
block_idx
[
i
]};
cmp_handle_gpu
(
blocks
,
kcache
,
vcache
,
k
[
i
],
v
[
i
],
1
);
}
}
{
// no match in the middle
std
::
vector
<
Token
*>
chunks
;
std
::
vector
<
TokenLength
>
lengths
;
std
::
vector
<
std
::
vector
<
Token
>>
new_ids
;
for
(
size_t
i
=
0
;
i
<
10
;
i
++
)
{
new_ids
.
push_back
(
random_ids
(
1
*
config
.
num_token_per_page
,
gen
));
}
for
(
size_t
i
=
0
;
i
<
10
;
i
++
)
{
if
(
i
==
1
||
i
==
5
||
i
==
6
)
{
chunks
.
push_back
(
new_ids
[
i
].
data
());
}
else
{
chunks
.
push_back
(
ids
[
i
].
data
());
}
lengths
.
push_back
(
ids
[
i
].
size
());
}
std
::
promise
<
std
::
shared_ptr
<
DoubleCacheHandleInterface
>>
p
;
kvc2
->
lookup_alt_to_gpu_async
(
test_model_name
,
test_quant_type
,
chunks
,
lengths
,
15
*
config
.
num_token_per_page
,
[
&
p
](
std
::
shared_ptr
<
DoubleCacheHandleInterface
>
h
)
{
p
.
set_value
(
h
);
});
auto
fut
=
p
.
get_future
();
fut
.
wait
();
auto
h
=
fut
.
get
();
auto
statuses
=
h
->
matched_status
();
for
(
size_t
i
=
0
;
i
<
10
;
i
++
)
{
if
(
i
==
1
)
{
assert
(
statuses
[
i
]
==
MatchStatus
::
NotMatchExact
);
}
else
if
(
i
==
5
||
i
==
6
)
{
assert
(
statuses
[
i
]
==
MatchStatus
::
NotMatchPartial
);
}
else
if
(
i
==
0
)
{
assert
(
statuses
[
i
]
==
MatchStatus
::
Exact
);
}
else
{
assert
(
statuses
[
i
]
==
MatchStatus
::
Partial
);
}
}
auto
hk
=
h
->
handle_data
(
true
);
auto
hv
=
h
->
handle_data
(
false
);
for
(
size_t
i
=
0
;
i
<
10
;
i
++
)
{
if
(
i
==
1
||
i
==
5
||
i
==
6
)
{
}
else
{
cmp_handle_data
(
slice
(
hk
,
i
,
i
+
1
),
k
[
i
],
1
);
cmp_handle_data
(
slice
(
hv
,
i
,
i
+
1
),
v
[
i
],
1
);
}
}
auto
block_idx
=
h
->
get_gpu_block_idx
();
auto
[
kcache
,
vcache
]
=
kvc2
->
get_kvcache
();
for
(
size_t
i
=
0
;
i
<
10
;
i
++
)
{
if
(
i
==
1
||
i
==
5
||
i
==
6
)
{
}
else
{
std
::
vector
<
size_t
>
blocks
=
{
block_idx
[
i
]};
cmp_handle_gpu
(
blocks
,
kcache
,
vcache
,
k
[
i
],
v
[
i
],
1
);
}
}
}
SPDLOG_CRITICAL
(
"All Test Passed: {}"
,
argv
[
0
]);
return
0
;
}
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt.cpp
0 → 100644
View file @
25cee581
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:29:45
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:56:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// lookup_alt (synchronous) test: insert ten independent one-page sequences,
// then look them up as chunk lists — first all matching, then with chunks
// 1, 5 and 6 replaced by fresh random pages.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::trace);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 rng(123);
  std::vector<std::vector<Token>> ids;
  std::vector<std::vector<layer_data>> k, v;
  for (size_t i = 0; i < 10; i++) {
    ids.push_back(random_ids(1 * config.num_token_per_page, rng));
    k.push_back(random_kvcache(1, rng));
    v.push_back(random_kvcache(1, rng));
    kvc2->raw_insert(test_model_name, test_quant_type, ids[i].data(), ids[i].size(), k[i], v[i]);
  }
  kvc2->debug();
  {
    // all match
    std::vector<Token*> chunks;
    std::vector<TokenLength> lengths;
    for (size_t i = 0; i < 10; i++) {
      chunks.push_back(ids[i].data());
      lengths.push_back(ids[i].size());
    }
    auto handle = kvc2->lookup_alt(test_model_name, test_quant_type, chunks, lengths,
                                   15 * config.num_token_per_page);
    auto hk = handle->handle_data(true);
    auto hv = handle->handle_data(false);
    for (size_t i = 0; i < 10; i++) {
      cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
      cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
    }
  }
  {
    // no match in the middle
    auto is_replaced = [](size_t i) { return i == 1 || i == 5 || i == 6; };
    std::vector<Token*> chunks;
    std::vector<TokenLength> lengths;
    std::vector<std::vector<Token>> new_ids;
    for (size_t i = 0; i < 10; i++) {
      new_ids.push_back(random_ids(1 * config.num_token_per_page, rng));
    }
    for (size_t i = 0; i < 10; i++) {
      chunks.push_back(is_replaced(i) ? new_ids[i].data() : ids[i].data());
      lengths.push_back(ids[i].size());
    }
    auto handle = kvc2->lookup_alt(test_model_name, test_quant_type, chunks, lengths,
                                   15 * config.num_token_per_page);
    // Chunk 1 is the first mismatch (exact miss); 5 and 6 miss after a
    // matching prefix resumed; 0 matched exactly; the rest are partial.
    auto statuses = handle->matched_status();
    for (size_t i = 0; i < 10; i++) {
      if (i == 1) {
        assert(statuses[i] == MatchStatus::NotMatchExact);
      } else if (i == 5 || i == 6) {
        assert(statuses[i] == MatchStatus::NotMatchPartial);
      } else if (i == 0) {
        assert(statuses[i] == MatchStatus::Exact);
      } else {
        assert(statuses[i] == MatchStatus::Partial);
      }
    }
    auto hk = handle->handle_data(true);
    auto hv = handle->handle_data(false);
    for (size_t i = 0; i < 10; i++) {
      if (is_replaced(i)) continue;
      cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
      cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
    }
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-async.cpp
0 → 100644
View file @
25cee581
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 07:51:09
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include <future>
#include "common.hpp"
int
main
(
int
argc
,
char
*
argv
[])
{
init
(
argc
,
argv
);
spdlog
::
set_level
(
spdlog
::
level
::
debug
);
auto
kvc2
=
kvc2
::
create_kvc2
(
config
);
std
::
mt19937
gen
(
123
);
auto
ids1
=
random_ids
(
10
*
config
.
num_token_per_page
,
gen
);
auto
k1
=
random_kvcache
(
10
,
gen
);
auto
v1
=
random_kvcache
(
10
,
gen
);
kvc2
->
raw_insert
(
test_model_name
,
test_quant_type
,
ids1
.
data
(),
ids1
.
size
(),
k1
,
v1
);
// complete same
#pragma omp parallel for
for
(
size_t
ti
=
0
;
ti
<
3
;
ti
++
)
{
std
::
promise
<
std
::
shared_ptr
<
DoubleCacheHandleInterface
>>
p
;
kvc2
->
lookup_to_gpu_async
(
test_model_name
,
test_quant_type
,
ids1
.
data
(),
ids1
.
size
(),
ids1
.
size
()
+
2
*
config
.
num_token_per_page
,
[
&
p
](
std
::
shared_ptr
<
DoubleCacheHandleInterface
>
h
)
{
p
.
set_value
(
h
);
});
auto
fut
=
p
.
get_future
();
fut
.
wait
();
auto
h
=
fut
.
get
();
auto
k
=
h
->
handle_data
(
true
);
auto
v
=
h
->
handle_data
(
false
);
cmp_handle_data
(
k1
,
k
,
10
);
cmp_handle_data
(
v1
,
v
,
10
);
auto
block_idx
=
h
->
get_gpu_block_idx
();
auto
[
kcache
,
vcache
]
=
kvc2
->
get_kvcache
();
cmp_handle_gpu
(
block_idx
,
kcache
,
vcache
,
k1
,
v1
,
10
);
}
SPDLOG_CRITICAL
(
"All Test Passed: {}"
,
argv
[
0
]);
return
0
;
}
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-mt-without-vcache.cpp
0 → 100644
View file @
25cee581
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 07:51:09
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// lookup_to_gpu test with the V cache disabled: only K data is inserted and
// verified; three lookups run concurrently via OpenMP.
int main(int argc, char* argv[]) {
  // Disable the V cache before init() so the whole stack runs K-only.
  qw25_7B_gpu_config.v_cache_on = false;
  config.gpu_cache_config = qw25_7B_gpu_config;
  config.v_cache_on = false;
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 rng(123);
  auto ids1 = random_ids(10 * config.num_token_per_page, rng);
  auto k1 = random_kvcache(10, rng);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, {});
  // complete same
#pragma omp parallel for
  for (size_t ti = 0; ti < 3; ti++) {
    auto handle = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                                      ids1.size() + 2 * config.num_token_per_page);
    auto k = handle->handle_data(true);
    cmp_handle_data(k1, k, 10);
    auto block_idx = handle->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(15);
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    // Read every assigned GPU block back into the CPU buffer, shard by shard.
    for (size_t layer = 0; layer < k_from_gpu.size(); layer++) {
      for (size_t j = 0; j < block_idx.size(); j++) {
        size_t b_idx = block_idx[j];
        for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
          auto kt = kcache[gpu_idx][layer][b_idx].to(torch::kCPU);
          void* src = kt.data_ptr();
          void* dst = offset_by_bytes(k_from_gpu[layer][j], gpu_idx * element_size_per_gpu);
          memcpy(dst, src, element_size_per_gpu);
        }
      }
    }
    cmp_handle_data(k1, k_from_gpu, 10);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-mt.cpp
0 → 100644
View file @
25cee581
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 07:51:09
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// Multithreaded lookup_to_gpu test: insert one 10-page sequence, then three
// OpenMP threads look it up concurrently and verify handle data plus the
// GPU-resident K and V pages via explicit read-back.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 rng(123);
  auto ids1 = random_ids(10 * config.num_token_per_page, rng);
  auto k1 = random_kvcache(10, rng);
  auto v1 = random_kvcache(10, rng);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
  // complete same
#pragma omp parallel for
  for (size_t ti = 0; ti < 3; ti++) {
    auto handle = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                                      ids1.size() + 2 * config.num_token_per_page);
    auto k = handle->handle_data(true);
    auto v = handle->handle_data(false);
    cmp_handle_data(k1, k, 10);
    cmp_handle_data(v1, v, 10);
    auto block_idx = handle->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(15);
    auto v_from_gpu = empty_kvcache(15);
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    // Read every assigned GPU block back into CPU buffers, shard by shard.
    for (size_t layer = 0; layer < k_from_gpu.size(); layer++) {
      for (size_t j = 0; j < block_idx.size(); j++) {
        size_t b_idx = block_idx[j];
        for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
          {
            auto kt = kcache[gpu_idx][layer][b_idx].to(torch::kCPU);
            void* src = kt.data_ptr();
            void* dst = offset_by_bytes(k_from_gpu[layer][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
          {
            auto vt = vcache[gpu_idx][layer][b_idx].to(torch::kCPU);
            void* src = vt.data_ptr();
            void* dst = offset_by_bytes(v_from_gpu[layer][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
        }
      }
    }
    cmp_handle_data(k1, k_from_gpu, 10);
    cmp_handle_data(v1, v_from_gpu, 10);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu.cpp
0 → 100644
View file @
25cee581
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 08:38:33
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// lookup_to_gpu test: full-sequence match, then a prefix lookup sized to the
// whole GPU pool (exercising eviction). Verifies handle data and GPU pages.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 rng(123);
  auto ids1 = random_ids(10 * config.num_token_per_page, rng);
  auto k1 = random_kvcache(10, rng);
  auto v1 = random_kvcache(10, rng);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
  // complete same
  {
    auto handle = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                                      ids1.size() + 5 * config.num_token_per_page);
    auto k = handle->handle_data(true);
    auto v = handle->handle_data(false);
    cmp_handle_data(k1, k, 10);
    cmp_handle_data(v1, v, 10);
    auto block_idx = handle->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(15);
    auto v_from_gpu = empty_kvcache(15);
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    // Read every assigned GPU block back into CPU buffers, shard by shard.
    for (size_t layer = 0; layer < k_from_gpu.size(); layer++) {
      for (size_t j = 0; j < block_idx.size(); j++) {
        size_t b_idx = block_idx[j];
        for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
          {
            auto kt = kcache[gpu_idx][layer][b_idx].to(torch::kCPU);
            void* src = kt.data_ptr();
            void* dst = offset_by_bytes(k_from_gpu[layer][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
          {
            auto vt = vcache[gpu_idx][layer][b_idx].to(torch::kCPU);
            void* src = vt.data_ptr();
            void* dst = offset_by_bytes(v_from_gpu[layer][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
        }
      }
    }
    cmp_handle_data(k1, k_from_gpu, 10);
    cmp_handle_data(v1, v_from_gpu, 10);
  }
  // prefix and evict
  {
    auto handle = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(),
                                      config.num_token_per_page * 3,
                                      config.gpu_cache_config->total_kvcache_pages * config.num_token_per_page);
    auto k = handle->handle_data(true);
    auto v = handle->handle_data(false);
    cmp_handle_data(k1, k, 3);
    cmp_handle_data(v1, v, 3);
    auto block_idx = handle->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(3);
    auto v_from_gpu = empty_kvcache(3);
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    // Only the 3 matched prefix blocks are read back and compared.
    for (size_t layer = 0; layer < k_from_gpu.size(); layer++) {
      for (size_t j = 0; j < 3; j++) {
        size_t b_idx = block_idx[j];
        for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
          {
            auto kt = kcache[gpu_idx][layer][b_idx].to(torch::kCPU);
            void* src = kt.data_ptr();
            void* dst = offset_by_bytes(k_from_gpu[layer][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
          {
            auto vt = vcache[gpu_idx][layer][b_idx].to(torch::kCPU);
            void* src = vt.data_ptr();
            void* dst = offset_by_bytes(v_from_gpu[layer][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
        }
      }
    }
    cmp_handle_data(k1, k_from_gpu, 3);
    cmp_handle_data(v1, v_from_gpu, 3);
  }
  // // complete prefix
  // {
  //   std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
  //                         ids2.size() + 3 * config.num_token_per_page);
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k1, k, 3);
  //   cmp_handle_data(v1, v, 3);
  // }
  // // common prefix
  // {
  //   std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
  //   auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
  //   ids2.insert(ids2.end(), rids.begin(), rids.end());
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k1, k, 3);
  //   cmp_handle_data(v1, v, 3);
  // }
  // // no prefix
  // {
  //   std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
  //   assert(h->matched_length() == 0);
  // }
  // // insert partly new
  // auto k2 = random_kvcache(10, gen);
  // auto v2 = random_kvcache(10, gen);
  // copy_kvcache(k1, k2, 0, 5);
  // copy_kvcache(v1, v2, 0, 5);
  // auto ids2 = random_ids(10 * config.num_token_per_page, gen);
  // for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
  //   ids2[i] = ids1[i];
  // }
  // kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
  // // read new part
  // {
  //   std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
  //                         ids.size() + 7 * config.num_token_per_page);
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k, k2, 7);
  //   cmp_handle_data(v, v2, 7);
  // }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
Prev
1
2
3
4
5
6
7
…
10
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment