ox696c / ktransformers / Commits / 877aec85

Unverified commit 877aec85, authored Apr 09, 2025 by Yuhao Tsui, committed by GitHub Apr 09, 2025

Merge branch 'kvcache-ai:main' into main

Parents: 84164f58, 9037bf30
Changes: 251 files in this commit; showing 20 changed files with 1602 additions and 0 deletions (+1602, -0)
csrc/balance_serve/kvc2/src/utils/lock_free_queue.hpp  +60 -0
csrc/balance_serve/kvc2/src/utils/mpsc.hpp  +90 -0
csrc/balance_serve/kvc2/src/utils/mutex_extend.hpp  +70 -0
csrc/balance_serve/kvc2/src/utils/periodic_task.hpp  +102 -0
csrc/balance_serve/kvc2/src/utils/spin_lock.hpp  +36 -0
csrc/balance_serve/kvc2/src/utils/timer.hpp  +128 -0
csrc/balance_serve/kvc2/test/CMakeLists.txt  +78 -0
csrc/balance_serve/kvc2/test/hashmap_test.cpp  +11 -0
csrc/balance_serve/kvc2/test/kvc2_export_header_test.cpp  +87 -0
csrc/balance_serve/kvc2/test/kvc2_export_load_test.cpp  +87 -0
csrc/balance_serve/kvc2/test/kvc2_test_utils.cpp  +117 -0
csrc/balance_serve/kvc2/test/kvc2test/CMakeLists.txt  +26 -0
csrc/balance_serve/kvc2/test/kvc2test/append-tokens.cpp  +52 -0
csrc/balance_serve/kvc2/test/kvc2test/check-flush-back.cpp  +36 -0
csrc/balance_serve/kvc2/test/kvc2test/common.hpp  +233 -0
csrc/balance_serve/kvc2/test/kvc2test/flush-back.cpp  +57 -0
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt-gpu.cpp  +125 -0
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt.cpp  +97 -0
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-async.cpp  +49 -0
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-mt-without-vcache.cpp  +61 -0
csrc/balance_serve/kvc2/src/utils/lock_free_queue.hpp
0 → 100644
```cpp
#include <atomic>
#include <future>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>

template <typename T>
class MPSCQueue {
  struct Node {
    std::shared_ptr<T> data;
    std::atomic<Node*> next;
    Node() : next(nullptr) {}
    Node(std::shared_ptr<T> data_) : data(std::move(data_)), next(nullptr) {}
  };

  std::atomic<Node*> head;
  Node* tail;

 public:
  std::atomic_size_t enqueue_count = 0;
  size_t dequeue_count = 0;

  MPSCQueue() {
    Node* dummy = new Node();
    head.store(dummy, std::memory_order_relaxed);
    tail = dummy;
  }

  ~MPSCQueue() {
    // Clean up the remaining nodes
    Node* node = tail;
    while (node) {
      Node* next = node->next.load(std::memory_order_relaxed);
      delete node;
      node = next;
    }
  }

  // Called by producers
  void enqueue(std::shared_ptr<T> data) {
    enqueue_count.fetch_add(1);
    Node* node = new Node(std::move(data));
    Node* prev_head = head.exchange(node, std::memory_order_acq_rel);
    prev_head->next.store(node, std::memory_order_release);
  }

  // Called by the consumer
  std::shared_ptr<T> dequeue() {
    Node* next = tail->next.load(std::memory_order_acquire);
    if (next) {
      std::shared_ptr<T> res = std::move(next->data);
      delete tail;
      tail = next;
      dequeue_count += 1;
      return res;
    }
    return nullptr;
  }
};
```
\ No newline at end of file
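Reviewer note: this header is a non-intrusive MPSC queue in the style of Vyukov's design: producers swap the atomic `head` with `exchange` and then publish the `next` link, while the single consumer advances `tail` with no CAS loop. A minimal usage sketch, not part of this commit and assuming the header is reachable as "lock_free_queue.hpp":

```cpp
// Sketch only: four producers, one consumer.
#include <iostream>
#include <memory>
#include <thread>
#include <vector>
#include "lock_free_queue.hpp"

int main() {
  MPSCQueue<int> q;
  std::vector<std::thread> producers;
  for (int t = 0; t < 4; t++) {
    producers.emplace_back([&q, t] {
      for (int i = 0; i < 1000; i++) q.enqueue(std::make_shared<int>(t * 1000 + i));
    });
  }
  for (auto& p : producers) p.join();
  // Single consumer drains the queue; dequeue() returns nullptr when empty.
  size_t n = 0;
  while (q.dequeue() != nullptr) n++;
  std::cout << "dequeued " << n << " items\n";  // expect 4000
  return 0;
}
```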
csrc/balance_serve/kvc2/src/utils/mpsc.hpp
0 → 100644
```cpp
#include <atomic>
#include <cassert>
#include <iostream>
#include <optional>
#include <semaphore>

template <typename T>
class MPSCQueue {
  struct Node {
    T data;
    std::atomic<Node*> next;
    Node() : next(nullptr) {}
    Node(T data_) : data(std::move(data_)), next(nullptr) {}
  };

  std::atomic<Node*> head;
  Node* tail;

 public:
  std::atomic_size_t enqueue_count = 0;
  size_t dequeue_count = 0;

  MPSCQueue() {
    Node* dummy = new Node();
    head.store(dummy, std::memory_order_seq_cst);
    tail = dummy;
  }

  ~MPSCQueue() {
    Node* node = tail;
    while (node) {
      Node* next = node->next.load(std::memory_order_seq_cst);
      delete node;
      node = next;
    }
  }

  // Called by producers
  void enqueue(T data) {
    enqueue_count.fetch_add(1);
    Node* node = new Node(std::move(data));
    Node* prev_head = head.exchange(node, std::memory_order_seq_cst);
    prev_head->next.store(node, std::memory_order_seq_cst);
  }

  // Called by the consumer
  std::optional<T> dequeue() {
    Node* next = tail->next.load(std::memory_order_seq_cst);
    if (next) {
      T res = std::move(next->data);
      delete tail;
      tail = next;
      dequeue_count += 1;
      return res;
    }
    return std::nullopt;
  }

  size_t size() { return enqueue_count.load() - dequeue_count; }
};

template <typename T>
class MPSCQueueConsumerLock {
  MPSCQueue<T> queue;
  std::counting_semaphore<> sema{0};

 public:
  void enqueue(T data) {
    queue.enqueue(std::move(data));
    // std::atomic_thread_fence(std::memory_order_seq_cst);  // Inserting this because the memory order might be wrong, I
    // am also not that sure about this.
    sema.release();
  }

  T dequeue() {
    auto re = queue.dequeue();
    if (re.has_value()) {
      while (sema.try_acquire() == false) {
        std::cerr << __FILE__ << ":" << __FUNCTION__
                  << " sema try acquire should be success, retrying, please check" << std::endl;
        // assert(false);
      }
      return re.value();
    }
    sema.acquire();
    return queue.dequeue().value();
  }

  size_t size() { return queue.size(); }
};
```
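Note that this header defines a second class also named `MPSCQueue` (value-based, `std::optional` returns), so it cannot be included together with lock_free_queue.hpp in one translation unit. `MPSCQueueConsumerLock` layers a `std::counting_semaphore` on top so the consumer blocks instead of polling. A minimal sketch of the blocking consumer, not part of this commit:

```cpp
// Sketch only: one producer, one blocking consumer.
#include <iostream>
#include <thread>
#include "mpsc.hpp"

int main() {
  MPSCQueueConsumerLock<int> q;
  std::thread producer([&q] {
    for (int i = 1; i <= 10; i++) q.enqueue(i);
  });
  long sum = 0;
  for (int i = 0; i < 10; i++) {
    sum += q.dequeue();  // blocks on the internal semaphore until an item arrives
  }
  producer.join();
  std::cout << "sum = " << sum << "\n";  // 1 + 2 + ... + 10 = 55
  return 0;
}
```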
csrc/balance_serve/kvc2/src/utils/mutex_extend.hpp
0 → 100644
```cpp
#ifndef __MUTEX_EXTEND_HPP_
#define __MUTEX_EXTEND_HPP_

#include <atomic>
#include <chrono>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <thread>

class non_recursive_mutex {
 public:
  non_recursive_mutex() = default;

  // Non-recursive locking via try_lock
  bool try_lock() {
    std::thread::id this_id = std::this_thread::get_id();
    // Check whether the current thread already holds the lock
    if (owner.load(std::memory_order_acquire) == this_id) {
      return false;  // If it is the current thread, fail
    }
    // Try to acquire the lock
    if (mtx.try_lock()) {
      owner.store(this_id, std::memory_order_release);  // Record the lock owner
      return true;
    }
    return false;
  }

  // lock blocks until the lock is acquired
  void lock() {
    std::thread::id this_id = std::this_thread::get_id();
    while (true) {
      // Check whether the current thread already holds the lock
      if (owner.load(std::memory_order_acquire) == this_id) {
        throw std::runtime_error("Thread is trying to lock a mutex it already holds");
      }
      // Try to acquire the lock
      if (mtx.try_lock()) {
        owner.store(this_id, std::memory_order_release);  // Record the lock owner
        return;
      }
      // If the lock was not acquired, yield briefly to avoid busy waiting
      std::this_thread::yield();
    }
  }

  // Unlock
  void unlock() {
    std::thread::id this_id = std::this_thread::get_id();
    // Ensure only the thread holding the lock can unlock it
    if (owner.load(std::memory_order_acquire) == this_id) {
      owner.store(std::thread::id(), std::memory_order_release);  // Clear the lock owner
      mtx.unlock();
    } else {
      throw std::runtime_error("Thread attempting to unlock a mutex it doesn't own");
    }
  }

 private:
  std::mutex mtx;                      // The underlying mutex
  std::atomic<std::thread::id> owner;  // Atomically records the current lock owner
};

#endif
```
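`non_recursive_mutex` turns a same-thread re-lock, which would deadlock a plain `std::mutex`, into an explicit `std::runtime_error`. Since it provides `lock`/`try_lock`/`unlock` it satisfies the standard Lockable requirements and works with `std::lock_guard`. A minimal sketch of the failure mode, not part of this commit:

```cpp
#include <iostream>
#include <stdexcept>
#include "mutex_extend.hpp"

int main() {
  non_recursive_mutex m;
  m.lock();
  try {
    m.lock();  // same thread: detected via the owner field and thrown, not deadlocked
  } catch (const std::runtime_error& e) {
    std::cout << "caught: " << e.what() << "\n";
  }
  m.unlock();
  return 0;
}
```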
csrc/balance_serve/kvc2/src/utils/periodic_task.hpp
0 → 100644
```cpp
#ifndef PERIODIC_TASK_HPP
#define PERIODIC_TASK_HPP

#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <future>
#include <iostream>
#include <mutex>
#include <stop_token>
#include <thread>
#include <utility>
#include <vector>

namespace periodic {

class PeriodicTask {
 public:
  explicit PeriodicTask(std::function<void()> func,
                        std::chrono::milliseconds interval_ms = std::chrono::milliseconds(100))
      : func_(std::move(func)),
        interval_(interval_ms),
        worker_([this](std::stop_token stoken) { this->run(stoken); }) {
    // std::cout << "PeriodicTask created with interval: " << interval_.count() << " ms" << std::endl;
  }

  ~PeriodicTask() {
    worker_.request_stop();
    cv_.notify_one();  // Ensure worker wakes up when destroyed
    // std::cout << "PeriodicTask destructor called, stopping worker." << std::endl;
  }

  void wakeUp() {
    {
      std::lock_guard<std::mutex> lock(wakeup_mutex_);
      wake_up_requested_ = true;
    }
    cv_.notify_one();  // Notify worker thread to wake up immediately
    // std::cout << "wakeUp() called: worker thread will wake up." << std::endl;
  }

  std::future<void> wakeUpWait() {
    std::promise<void> promise;
    std::future<void> future = promise.get_future();
    {
      std::lock_guard<std::mutex> lock(promise_mutex_);
      wakeup_promises_.push_back(std::move(promise));
    }
    wakeUp();
    return future;
  }

 private:
  void run(std::stop_token stoken) {
    while (!stoken.stop_requested()) {
      std::unique_lock lock(mutex_);
      // Wait for either the time interval or a wake-up signal
      cv_.wait_for(lock, interval_, [this] { return wake_up_requested_.load(); });
      if (stoken.stop_requested())
        break;
      // If the wake-up was triggered, reset the flag and process the task
      {
        std::lock_guard<std::mutex> lock(wakeup_mutex_);
        wake_up_requested_ = false;
      }
      try {
        // std::cout << "Running task function." << std::endl;
        func_();
      } catch (...) {
        std::cerr << "Error in task function." << std::endl;
      }
      notifyPromises();
    }
  }

  void notifyPromises() {
    std::lock_guard<std::mutex> lock(promise_mutex_);
    // std::cout << "Notifying all waiting promises." << std::endl;
    for (auto& promise : wakeup_promises_) {
      promise.set_value();
    }
    wakeup_promises_.clear();
  }

  std::function<void()> func_;
  std::chrono::milliseconds interval_;
  std::mutex mutex_;
  std::condition_variable cv_;
  std::vector<std::promise<void>> wakeup_promises_;
  std::mutex promise_mutex_;
  std::mutex wakeup_mutex_;
  std::atomic<bool> wake_up_requested_ = false;
  std::jthread worker_;
};

}  // namespace periodic

#endif  // PERIODIC_TASK_HPP
```
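`PeriodicTask` runs `func_` on a `std::jthread` once per `interval_`; `wakeUp()` forces an early run and `wakeUpWait()` additionally returns a `std::future<void>` that `notifyPromises()` fulfills after that run finishes. A minimal sketch, not part of this commit:

```cpp
#include <atomic>
#include <chrono>
#include <iostream>
#include "periodic_task.hpp"

int main() {
  std::atomic<int> runs{0};
  periodic::PeriodicTask task([&runs] { runs.fetch_add(1); },
                              std::chrono::milliseconds(50));
  // Force one immediate run and wait until it has finished.
  task.wakeUpWait().wait();
  std::cout << "ran at least " << runs.load() << " time(s)\n";
  return 0;  // destructor requests stop; the jthread joins automatically
}
```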
csrc/balance_serve/kvc2/src/utils/spin_lock.hpp
0 → 100644
```cpp
/*
 * @Author: Xie Weiyu ervinxie@qq.com
 * @Date: 2024-11-21 06:35:47
 * @LastEditors: Xie Weiyu ervinxie@qq.com
 * @LastEditTime: 2024-11-21 06:35:50
 * @FilePath: /kvc2/src/utils/spin_lock.hpp
 * @Description: This is the default setting; set `customMade` and open koroFileHeader to view and edit the configuration:
 * https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
 */
#include <atomic>
#include <chrono>
#include <thread>

class SpinLock {
 public:
  SpinLock() { flag.clear(); }

  void lock() {
    const int max_delay = 1024;  // Maximum delay in microseconds
    int delay = 1;               // Initial delay in microseconds
    while (flag.test_and_set(std::memory_order_acquire)) {
      std::this_thread::sleep_for(std::chrono::microseconds(delay));
      delay *= 2;
      if (delay > max_delay) {
        delay = max_delay;
      }
    }
  }

  void unlock() { flag.clear(std::memory_order_release); }

 private:
  std::atomic_flag flag = ATOMIC_FLAG_INIT;
};
```
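This `SpinLock` sleeps with exponential backoff (1 us doubling up to roughly 1 ms per retry) rather than busy-waiting on `test_and_set`, which trades wake-up latency for much lower contention. A minimal sketch guarding a shared counter, not part of this commit:

```cpp
#include <iostream>
#include <thread>
#include <vector>
#include "spin_lock.hpp"

int main() {
  SpinLock lock;
  long counter = 0;
  std::vector<std::thread> threads;
  for (int t = 0; t < 4; t++) {
    threads.emplace_back([&] {
      for (int i = 0; i < 10000; i++) {
        lock.lock();
        counter++;  // critical section
        lock.unlock();
      }
    });
  }
  for (auto& th : threads) th.join();
  std::cout << counter << "\n";  // expect 40000
  return 0;
}
```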
csrc/balance_serve/kvc2/src/utils/timer.hpp
0 → 100644
```cpp
#pragma once
#include <cassert>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <map>
#include <sstream>
#include <string>

#include "easy_format.hpp"

inline std::string doubleToStringR2(double value) {
  std::stringstream stream;
  stream << std::fixed << std::setprecision(2) << value;
  return stream.str();
}

class Timer {
 public:
  std::string name;
  bool tmp_timer = false;

  Timer() {}
  Timer(std::string name) : name(name), tmp_timer(true) { start(); }
  ~Timer() {
    if (tmp_timer) {
      std::cout << name << " " << elapsedMs() << " ms" << std::endl;
    }
  }

  void start() {
    m_startTime = std::chrono::high_resolution_clock::now();
    assert(m_isRunning == false);
    m_isRunning = true;
  }

  void stop() {
    m_endTime = std::chrono::high_resolution_clock::now();
    assert(m_isRunning == true);
    m_isRunning = false;
    m_runningNs += elapsedNs();
  }

  double elapsedNs() {
    std::chrono::time_point<std::chrono::high_resolution_clock> endTime;
    if (m_isRunning) {
      endTime = std::chrono::high_resolution_clock::now();
    } else {
      endTime = m_endTime;
    }
    return std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - m_startTime).count();
  }

  void printElapsedMilliseconds() { std::cout << elapsedNs() / 1e6 << " ms" << std::endl; }

  static std::string ns_to_string(double duration) {
    auto nano_sec = duration;
    if (nano_sec >= 1000) {
      auto mirco_sec = nano_sec / 1000.0;
      if (mirco_sec >= 1000) {
        auto milli_sec = mirco_sec / 1000.0;
        if (milli_sec >= 1000) {
          auto seconds = milli_sec / 1000.0;
          if (seconds >= 60.0) {
            auto minutes = seconds / 60.0;
            if (minutes >= 60.0) {
              auto hours = minutes / 60.0;
              return doubleToStringR2(hours) + " h";
            } else {
              return doubleToStringR2(minutes) + " min";
            }
          } else {
            return doubleToStringR2(seconds) + " sec";
          }
        } else {
          return doubleToStringR2(milli_sec) + " ms";
        }
      } else {
        return doubleToStringR2(mirco_sec) + " us";
      }
    } else {
      return doubleToStringR2(nano_sec) + " ns";
    }
  }

  double runningTimeNs() { return m_runningNs; }

  std::string runningTime() {
    auto duration = m_runningNs;
    return ns_to_string(duration);
  }

  std::string elapsedTime() { return ns_to_string(elapsedNs()); }

  double elapsedMs() { return elapsedNs() / 1e6; }

  std::string report_throughput(size_t op_cnt) {
    double ops = op_cnt / elapsedMs() * 1000;
    return readable_number(ops) + "op/s";
  }

  void merge(Timer& other) {
    assert(m_isRunning == false);
    assert(other.m_isRunning == false);
    m_runningNs += other.runningTimeNs();
  }

 private:
  std::chrono::time_point<std::chrono::high_resolution_clock> m_startTime;
  std::chrono::time_point<std::chrono::high_resolution_clock> m_endTime;
  bool m_isRunning = false;
  double m_runningNs = 0.0;
};

class Counter {
 public:
  Counter() {}
  std::map<std::string, size_t> counters;

  void inc(const char* name, size_t num) { counters[name] += num; };

  void print() {
    for (auto& p : counters) {
      std::cout << p.first << " : " << p.second << std::endl;
    }
  };
};
```
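`Timer` accumulates `start()`/`stop()` intervals in `m_runningNs` and pretty-prints durations through `ns_to_string`; the named constructor turns it into a scope-exit stopwatch. A minimal sketch, not part of this commit, assuming `easy_format.hpp` (which supplies the `readable_number` used by `report_throughput`) is on the include path:

```cpp
#include <chrono>
#include <iostream>
#include <thread>
#include "timer.hpp"

int main() {
  Timer t;
  t.start();
  std::this_thread::sleep_for(std::chrono::milliseconds(20));
  t.stop();
  std::cout << "elapsed: " << t.elapsedTime() << "\n";  // e.g. "20.10 ms"

  {
    Timer scoped("scoped work");  // prints "scoped work <n> ms" at scope exit
    std::this_thread::sleep_for(std::chrono::milliseconds(5));
  }

  Counter c;
  c.inc("reads", 3);
  c.inc("reads", 2);
  c.print();  // reads : 5
  return 0;
}
```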
csrc/balance_serve/kvc2/test/CMakeLists.txt
0 → 100644
```cmake
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fopenmp")
# set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -pthread")

add_subdirectory(kvc2test)

include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src)

add_executable(hashmap_test hashmap_test.cpp)
target_link_libraries(hashmap_test PRIVATE TBB::tbb)

add_executable(xxHash_test xxHash_test.cpp)
target_link_libraries(xxHash_test PRIVATE xxhash)

function(add_async_store_executable source_file)
  get_filename_component(target_name ${source_file} NAME_WE) # Use the file name without extension as the target name
  add_executable(${target_name} ${source_file})
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/nlohmann/single_include)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
  target_link_libraries(${target_name} PRIVATE async_store gflags)
endfunction()

add_async_store_executable(async_store_test.cpp)

function(add_kvc2_executable source_file)
  get_filename_component(target_name ${source_file} NAME_WE) # Use the file name without extension as the target name
  add_executable(${target_name} ${source_file})
  # target_compile_options(${target_name} PRIVATE -fopenmp -fno-strict-aliasing)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/nlohmann/single_include)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
  target_link_libraries(${target_name} PRIVATE kvc2 async_store gflags)
endfunction()

add_kvc2_executable(test_lock_free_queue.cpp)
add_kvc2_executable(test_queue_perf.cpp)

# Disable deprecated test
# add_kvc2_executable(prefix_test.cpp)
# add_kvc2_executable(kvcache_disk_insert_read_test.cpp)
# add_kvc2_executable(kvcache_mem_eviction_test.cpp)
# add_kvc2_executable(kvcache_mem_insert_read_test.cpp)
# add_kvc2_executable(kvcache_save_load_test.cpp)
# add_kvc2_executable(kvc2_export_header_test.cpp)
# add_kvc2_executable(kvc2_export_load_test.cpp)

target_include_directories(async_store_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..//third_party/nlohmann/single_include)
target_include_directories(async_store_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..//third_party/spdlog/include)
target_link_libraries(async_store_test PRIVATE xxhash)

add_executable(test_std_list test_std_list.cpp)

add_executable(test_cuda_stream test_cuda_stream.cpp)
target_include_directories(test_cuda_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(test_cuda_stream PRIVATE CUDA::cudart)

add_executable(test_cuda_stream_manager test_cuda_stream_manager.cpp)
target_include_directories(test_cuda_stream_manager PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_link_libraries(test_cuda_stream_manager PRIVATE cuda_stream_manager)

add_executable(test_periodic_task test_periodic_task.cpp)
target_include_directories(test_periodic_task PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)

add_executable(test_page_pool page_pool_test.cpp)
target_include_directories(test_page_pool PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_include_directories(test_page_pool PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
```
\ No newline at end of file
csrc/balance_serve/kvc2/test/hashmap_test.cpp
0 → 100644
```cpp
#include <tbb/concurrent_hash_map.h>

#include <iostream>

int main() {
  tbb::concurrent_hash_map<int, int> map;
  map.insert({1, 2});
  decltype(map)::accessor a;
  std::cout << map.find(a, 1) << std::endl;
  return 0;
}
```
csrc/balance_serve/kvc2/test/kvc2_export_header_test.cpp
0 → 100644
#include "kvc2.h"
#include "kvc2_test_utils.cpp"
int
main
(
int
argc
,
char
*
argv
[])
{
init
(
argc
,
argv
);
spdlog
::
set_level
(
spdlog
::
level
::
debug
);
std
::
mt19937
gen
(
123
);
KVC2Config
config
=
{
.
path
=
FLAGS_disk_cache_path
,
.
config_path
=
std
::
string
(
"/home/xwy/conifg"
),
.
block_length
=
BlockLength
,
.
memory_pool_size
=
size_t
(
10e9
),
.
evict_count
=
20
,
};
auto
kvcc
=
create_kvc2
(
config
);
auto
io
=
kvcc
->
start_io_thread
();
SPDLOG_INFO
(
"Disk Test"
);
auto
ids
=
random_ids
(
10
*
BlockLength
,
gen
);
auto
h1
=
random_kvcache
(
qwen_cache_info
,
10
,
gen
);
kvcc
->
raw_insert
(
qwen_cache_info
,
reinterpret_cast
<
TokenPtr
>
(
ids
.
data
()),
ids
.
size
(),
h1
);
// complete same
{
auto
h2
=
empty_kvcache
(
qwen_cache_info
,
10
);
kvcc
->
raw_read
(
qwen_cache_info
,
reinterpret_cast
<
TokenPtr
>
(
ids
.
data
()),
ids
.
size
(),
h2
);
cmp_handle_data
(
qwen_cache_info
,
h1
,
h2
);
}
// complete prefix
{
auto
h2
=
empty_kvcache
(
qwen_cache_info
,
10
);
auto
ids2
=
std
::
vector
<
Token
>
(
ids
.
begin
(),
ids
.
begin
()
+
3
*
BlockLength
);
kvcc
->
raw_read
(
qwen_cache_info
,
reinterpret_cast
<
TokenPtr
>
(
ids2
.
data
()),
ids2
.
size
(),
h2
);
cmp_handle_data
(
qwen_cache_info
,
h1
,
h2
,
3
);
}
// common prefix
{
auto
h2
=
empty_kvcache
(
qwen_cache_info
,
10
);
auto
ids2
=
std
::
vector
<
Token
>
(
ids
.
begin
(),
ids
.
begin
()
+
5
*
BlockLength
);
auto
rids
=
random_ids
(
BlockLength
*
2
+
BlockLength
/
2
,
gen
);
ids2
.
insert
(
ids2
.
end
(),
rids
.
begin
(),
rids
.
end
());
kvcc
->
raw_read
(
qwen_cache_info
,
reinterpret_cast
<
TokenPtr
>
(
ids2
.
data
()),
ids2
.
size
(),
h2
);
cmp_handle_data
(
qwen_cache_info
,
h1
,
h2
,
5
);
}
// no prefix
{
auto
h2
=
empty_kvcache
(
qwen_cache_info
,
10
);
auto
ids2
=
random_ids
(
10
*
BlockLength
,
gen
);
kvcc
->
raw_read
(
qwen_cache_info
,
reinterpret_cast
<
TokenPtr
>
(
ids2
.
data
()),
ids2
.
size
(),
h2
);
}
// insert partly new
auto
h2
=
random_kvcache
(
qwen_cache_info
,
10
,
gen
);
copy_kvcache
(
h1
,
h2
,
0
,
5
);
auto
ids2
=
random_ids
(
10
*
BlockLength
,
gen
);
for
(
size_t
i
=
0
;
i
<
5
*
BlockLength
;
i
++
)
{
ids2
[
i
]
=
ids
[
i
];
}
kvcc
->
raw_insert
(
qwen_cache_info
,
reinterpret_cast
<
TokenPtr
>
(
ids2
.
data
()),
ids2
.
size
(),
h2
);
// read new part
{
auto
h3
=
empty_kvcache
(
qwen_cache_info
,
10
);
auto
ids3
=
std
::
vector
<
Token
>
(
ids2
.
begin
(),
ids2
.
begin
()
+
7
*
BlockLength
);
ids3
.
push_back
(
123
);
kvcc
->
raw_read
(
qwen_cache_info
,
reinterpret_cast
<
TokenPtr
>
(
ids3
.
data
()),
ids3
.
size
(),
h3
);
cmp_handle_data
(
qwen_cache_info
,
h3
,
h2
,
7
);
}
kvcc
->
save
();
kvcc
->
stop_io_thread
();
io
.
join
();
SPDLOG_WARN
(
"{} Test Passed"
,
__FILE__
);
return
0
;
}
\ No newline at end of file
csrc/balance_serve/kvc2/test/kvc2_export_load_test.cpp
0 → 100644
#include "kvc2.h"
#include "kvc2_test_utils.cpp"
int
main
(
int
argc
,
char
*
argv
[])
{
init
(
argc
,
argv
);
spdlog
::
set_level
(
spdlog
::
level
::
debug
);
std
::
mt19937
gen
(
123
);
KVC2Config
config
=
{
.
path
=
FLAGS_disk_cache_path
,
.
block_length
=
BlockLength
,
.
memory_pool_size
=
size_t
(
10e9
),
.
evict_count
=
20
,
};
auto
kvcc
=
create_kvc2
(
config
);
kvcc
->
load
();
auto
io
=
kvcc
->
start_io_thread
();
SPDLOG_INFO
(
"Disk Test"
);
auto
ids
=
random_ids
(
10
*
BlockLength
,
gen
);
auto
h1
=
empty_kvcache
(
qwen_cache_info
,
10
);
// kvcc->raw_insert(qwen_cache_info, reinterpret_cast<IDptr>(ids.data()), ids.size(), h1);
// complete same
{
// auto h2 = empty_kvcache(qwen_cache_info, 10);
kvcc
->
raw_read
(
qwen_cache_info
,
reinterpret_cast
<
TokenPtr
>
(
ids
.
data
()),
ids
.
size
(),
h1
);
// cmp_handle_data(qwen_cache_info, h1, h2);
}
// complete prefix
{
auto
h2
=
empty_kvcache
(
qwen_cache_info
,
10
);
auto
ids2
=
std
::
vector
<
Token
>
(
ids
.
begin
(),
ids
.
begin
()
+
3
*
BlockLength
);
kvcc
->
raw_read
(
qwen_cache_info
,
reinterpret_cast
<
TokenPtr
>
(
ids2
.
data
()),
ids2
.
size
(),
h2
);
cmp_handle_data
(
qwen_cache_info
,
h1
,
h2
,
3
);
}
// common prefix
{
auto
h2
=
empty_kvcache
(
qwen_cache_info
,
10
);
auto
ids2
=
std
::
vector
<
Token
>
(
ids
.
begin
(),
ids
.
begin
()
+
5
*
BlockLength
);
auto
rids
=
random_ids
(
BlockLength
*
2
+
BlockLength
/
2
,
gen
);
ids2
.
insert
(
ids2
.
end
(),
rids
.
begin
(),
rids
.
end
());
kvcc
->
raw_read
(
qwen_cache_info
,
reinterpret_cast
<
TokenPtr
>
(
ids2
.
data
()),
ids2
.
size
(),
h2
);
cmp_handle_data
(
qwen_cache_info
,
h1
,
h2
,
5
);
}
// no prefix
{
auto
h2
=
empty_kvcache
(
qwen_cache_info
,
10
);
auto
ids2
=
random_ids
(
10
*
BlockLength
,
gen
);
kvcc
->
raw_read
(
qwen_cache_info
,
reinterpret_cast
<
TokenPtr
>
(
ids2
.
data
()),
ids2
.
size
(),
h2
);
}
// insert partly new
auto
h2
=
random_kvcache
(
qwen_cache_info
,
10
,
gen
);
copy_kvcache
(
h1
,
h2
,
0
,
5
);
auto
ids2
=
random_ids
(
10
*
BlockLength
,
gen
);
for
(
size_t
i
=
0
;
i
<
5
*
BlockLength
;
i
++
)
{
ids2
[
i
]
=
ids
[
i
];
}
kvcc
->
raw_insert
(
qwen_cache_info
,
reinterpret_cast
<
TokenPtr
>
(
ids2
.
data
()),
ids2
.
size
(),
h2
);
// read new part
{
auto
h3
=
empty_kvcache
(
qwen_cache_info
,
10
);
auto
ids3
=
std
::
vector
<
Token
>
(
ids2
.
begin
(),
ids2
.
begin
()
+
7
*
BlockLength
);
ids3
.
push_back
(
123
);
kvcc
->
raw_read
(
qwen_cache_info
,
reinterpret_cast
<
TokenPtr
>
(
ids3
.
data
()),
ids3
.
size
(),
h3
);
cmp_handle_data
(
qwen_cache_info
,
h3
,
h2
,
7
);
}
kvcc
->
stop_io_thread
();
io
.
join
();
SPDLOG_WARN
(
"{} Test Passed"
,
__FILE__
);
return
0
;
}
\ No newline at end of file
csrc/balance_serve/kvc2/test/kvc2_test_utils.cpp
0 → 100644
```cpp
#include <optional>
#include <random>

#include "kvc2.h"

#define FMT_HEADER_ONLY
#include <spdlog/spdlog.h>

const int BlockLength = 256;
std::string FLAGS_disk_cache_path;

void init(int argc, char* argv[]) {
  if (argc != 2) {
    fmt::print("Usage: {} --disk_cache_path=xxx\n", argv[0]);
    exit(1);
  }
  FLAGS_disk_cache_path = argv[1];
  if (FLAGS_disk_cache_path.empty()) {
    fmt::print("disk_cache_path is empty");
    exit(1);
  }
}

using namespace kvc2;

data_block_ptr empty_block(CacheInfo info) {
  auto re = new (std::align_val_t(4096)) std::byte[info.element_size(BlockLength)];
  return reinterpret_cast<data_block_ptr>(re);
}

data_block_ptr random_block(CacheInfo info, std::mt19937& gen) {
  auto re = empty_block(info);
  uint64_t* d = (uint64_t*)re;
  for (size_t i = 0; i < info.element_size(BlockLength) / 8; i++) {
    d[i] = gen();
  }
  return re;
}

layer_data random_blocks(CacheInfo info, size_t block_count, size_t seed) {
  std::mt19937 gen(seed);
  layer_data re;
  for (size_t i = 0; i < block_count; i++) {
    re.push_back(random_block(info, gen));
  }
  return re;
}

layer_data empty_blocks(CacheInfo info, size_t block_count) {
  layer_data re;
  for (size_t i = 0; i < block_count; i++) {
    re.push_back(empty_block(info));
  }
  return re;
}

void copy_kvcache(std::vector<layer_data>& from, std::vector<layer_data>& to, size_t block_start, size_t length) {
  for (size_t i = 0; i < from.size(); i++) {
    for (size_t j = 0; j < length; j++) {
      to[i][block_start + j] = from[i][block_start + j];
    }
  }
}

std::vector<layer_data> random_kvcache(CacheInfo info, size_t block_count, std::mt19937& gen) {
  std::vector<layer_data> re;
  re.resize(info.hidden_layer_count());
  fmt::print("Generating random kvcache, layer {}\n", info.hidden_layer_count());
#pragma omp parallel for
  for (size_t i = 0; i < info.hidden_layer_count(); i++) {
    re[i] = random_blocks(info, block_count, gen());
  }
  return re;
}

std::vector<layer_data> empty_kvcache(CacheInfo info, size_t block_count) {
  std::vector<layer_data> re;
  re.resize(info.hidden_layer_count());
  fmt::print("Generating empty kvcache, layer {}\n", info.hidden_layer_count());
#pragma omp parallel for
  for (size_t i = 0; i < info.hidden_layer_count(); i++) {
    re[i] = empty_blocks(info, block_count);
  }
  return re;
}

std::vector<Token> random_ids(size_t length, std::mt19937& gen) {
  std::vector<Token> re;
  for (size_t i = 0; i < length; i++) {
    re.push_back(gen());
  }
  return re;
}

CacheInfo qwen_cache_info = {
    .model_name = "qwen2-72b-instruct",
    .is_key_cache = true,
    .quant_type = "BF16",
};

void cmp_handle_data(CacheInfo info, std::vector<layer_data>& h1, std::vector<layer_data>& h2,
                     std::optional<size_t> blocks = std::nullopt) {
  assert(h1.size() == h2.size());
  for (size_t i = 0; i < h1.size(); i++) {
    auto& b1 = h1[i];
    auto& b2 = h2[i];
    if (blocks.has_value() == false) {
      assert(b1.size() == b2.size());
    }
    int cmp_to = blocks.has_value() ? blocks.value() : b1.size();
    for (int j = 0; j < cmp_to; j++) {
      auto e1 = reinterpret_cast<void*>(b1[j]);
      auto e2 = reinterpret_cast<void*>(b2[j]);
      assert(memcmp(e1, e2, info.element_size(BlockLength)) == 0);
    }
  }
  fmt::print("KVCacheHandle cmp ok\n");
}
```
csrc/balance_serve/kvc2/test/kvc2test/CMakeLists.txt
0 → 100644
```cmake
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fopenmp")

function(add_kvc2_test source_file)
  get_filename_component(target_name ${source_file} NAME_WE) # Use the file name without extension as the target name
  add_executable(${target_name} ${source_file})
  # target_compile_options(${target_name} PRIVATE -fopenmp -fno-strict-aliasing)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/nlohmann/single_include)
  target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/spdlog/include)
  target_link_libraries(${target_name} PRIVATE kvc2 async_store)
endfunction()

add_kvc2_test(raw_insert_read.cpp)
add_kvc2_test(lookup.cpp)
add_kvc2_test(lookup-alt.cpp)
add_kvc2_test(lookup-alt-gpu.cpp)
add_kvc2_test(lookup-mt.cpp)
add_kvc2_test(lookup-gpu.cpp)
add_kvc2_test(lookup-gpu-mt.cpp)
add_kvc2_test(lookup-gpu-async.cpp)
add_kvc2_test(append-tokens.cpp)
add_kvc2_test(flush-back.cpp)
add_kvc2_test(check-flush-back.cpp)
add_kvc2_test(lookup-without-vcache.cpp)
add_kvc2_test(lookup-gpu-mt-without-vcache.cpp)
```
csrc/balance_serve/kvc2/test/kvc2test/append-tokens.cpp
0 → 100644
```cpp
#include <future>

#include "common.hpp"

int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);

#pragma omp parallel for
  for (size_t ti = 0; ti < 3; ti++) {
    auto [kcache, vcache] = kvc2->get_kvcache();
    std::mt19937 gen(ti + 123);
    size_t total_page = 10;
    TokenLength total_length = total_page * config.num_token_per_page;
    auto tokens = random_ids(total_length, gen);
    TokenLength prompt_length = 3 * config.num_token_per_page;
    auto k1 = random_kvcache(total_page, gen);
    auto v1 = random_kvcache(total_page, gen);
    {
      std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
      kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), prompt_length, total_length,
                                [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
      auto fut = p.get_future();
      fut.wait();
      auto h = fut.get();
      assert(h->matched_length() % config.num_token_per_page == 0);
      size_t matched_block = h->matched_length() / config.num_token_per_page;
      auto block_idx = h->get_gpu_block_idx();
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
      for (size_t at = matched_block; at < block_idx.size(); at++) {
        copy_cpu_gpu(block_idx, kcache, vcache, k1, v1, at);
      }
      h->append_tokens(tokens.data(), total_length);
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, total_page);
    }
    {
      std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
      kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
                                [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
      auto fut = p.get_future();
      fut.wait();
      auto h = fut.get();
      assert(h->matched_length() == total_length);
      size_t matched_block = h->matched_length() / config.num_token_per_page;
      auto block_idx = h->get_gpu_block_idx();
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
    }
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
```
csrc/balance_serve/kvc2/test/kvc2test/check-flush-back.cpp
0 → 100644
```cpp
#include <future>

#include "common.hpp"

int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  config.gpu_cache_config->total_kvcache_pages = 12;
  auto kvc2 = kvc2::create_kvc2(config);
  kvc2->load();

  // #pragma omp parallel for
  for (size_t ti = 0; ti < 2; ti++) {
    SPDLOG_WARN("Test {}", ti);
    auto [kcache, vcache] = kvc2->get_kvcache();
    std::mt19937 gen(ti + 123);
    size_t total_page = 10;
    TokenLength total_length = total_page * config.num_token_per_page;
    auto tokens = random_ids(total_length, gen);
    auto k1 = random_kvcache(total_page, gen);
    auto v1 = random_kvcache(total_page, gen);
    {
      std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
      kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
                                [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
      auto fut = p.get_future();
      fut.wait();
      auto h = fut.get();
      assert(h->matched_length() == total_length);
      size_t matched_block = h->matched_length() / config.num_token_per_page;
      auto block_idx = h->get_gpu_block_idx();
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
    }
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
```
csrc/balance_serve/kvc2/test/kvc2test/common.hpp
0 → 100644
```cpp
/**
 * @Description :
 * @Author : Xie Weiyu
 * @Date : 2024-11-22 06:02:41
 * @Version : 1.0.0
 * @LastEditors : Xie Weiyu
 * @LastEditTime : 2024-12-11 07:34:10
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#pragma once
#include <random>
#include <thread>

#include "kvc2.h"

#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"

using namespace kvc2;

template <typename T>
T* offset_by_bytes(T* t, size_t n) {
  return reinterpret_cast<T*>(reinterpret_cast<size_t>(t) + n);
}

std::string FLAGS_disk_cache_path;
kvc2::KVC2Config config;

kvc2::GPUPageCacheConfig qw25_7B_gpu_config{
    .gpu_only = false,
    .gpu_devices_id = {0, 1},
    .layer_count = 28,
    .total_kvcache_pages = 40,
    .num_token_per_page = 256,
    .num_k_heads = 4,
    .k_head_dim = 896,
    .full_kv_cache_on_each_gpu = false,
    .k_cache_on = true,
    .v_cache_on = true,
    .tensor_type = torch::kBFloat16,
    .num_streams_per_device = 4,
};

ModelName test_model_name = "Qwen2.5-7B-Instruct";
QuantType test_quant_type = "FP16";
CacheInfo test_cache_info{
    .model_name = test_model_name,
    .is_key_cache = true,
    .quant_type = test_quant_type,
};

void init(int argc, char* argv[]) {
  if (argc != 2) {
    fmt::print("Usage: {} <disk_cache_path>\n", argv[0]);
    exit(1);
  }
  load_quant_configs("./config/quant_configs.json");
  load_model_configs("./config/model_configs.json");
  FLAGS_disk_cache_path = argv[1];
  if (FLAGS_disk_cache_path.empty()) {
    fmt::print("disk_cache_path is empty\n");
    exit(1);
  }
  config.path = FLAGS_disk_cache_path;
  config.config_path = "./config";
  config.gpu_cache_config = qw25_7B_gpu_config;
}

data_block_ptr empty_block() {
  auto re = new (std::align_val_t(4096)) std::byte[test_cache_info.element_size(config.num_token_per_page)];
  memset(re, 0, test_cache_info.element_size(config.num_token_per_page));
  return reinterpret_cast<data_block_ptr>(re);
}

data_block_ptr random_block(std::mt19937& gen) {
  auto re = empty_block();
  uint64_t* d = (uint64_t*)re;
  for (size_t i = 0; i < test_cache_info.element_size(config.num_token_per_page) / 8; i++) {
    d[i] = gen();
  }
  return re;
}

layer_data random_blocks(size_t block_count, size_t seed) {
  std::mt19937 gen(seed);
  layer_data re;
  for (size_t i = 0; i < block_count; i++) {
    re.push_back(random_block(gen));
  }
  return re;
}

layer_data empty_blocks(size_t block_count) {
  layer_data re;
  for (size_t i = 0; i < block_count; i++) {
    re.push_back(empty_block());
  }
  return re;
}

void copy_kvcache(std::vector<layer_data>& from, std::vector<layer_data>& to, size_t block_start, size_t length) {
  for (size_t i = 0; i < from.size(); i++) {
    for (size_t j = 0; j < length; j++) {
      to[i][block_start + j] = from[i][block_start + j];
    }
  }
}

std::vector<layer_data> random_kvcache(size_t block_count, std::mt19937& gen) {
  std::vector<layer_data> re;
  re.resize(test_cache_info.hidden_layer_count());
  fmt::print("Generating random kvcache, layer {}\n", test_cache_info.hidden_layer_count());
  std::vector<std::mt19937> gens;
  for (size_t i = 0; i < test_cache_info.hidden_layer_count(); i++) {
    gens.push_back(std::mt19937(gen()));
  }
#pragma omp parallel for
  for (size_t i = 0; i < test_cache_info.hidden_layer_count(); i++) {
    re[i] = random_blocks(block_count, gens[i]());
  }
  return re;
}

std::vector<layer_data> empty_kvcache(size_t block_count) {
  std::vector<layer_data> re;
  re.resize(test_cache_info.hidden_layer_count());
  fmt::print("Generating empty kvcache, layer {}\n", test_cache_info.hidden_layer_count());
#pragma omp parallel for
  for (size_t i = 0; i < test_cache_info.hidden_layer_count(); i++) {
    re[i] = empty_blocks(block_count);
  }
  return re;
}

std::vector<Token> random_ids(size_t length, std::mt19937& gen) {
  std::vector<Token> re;
  for (size_t i = 0; i < length; i++) {
    re.push_back(gen());
  }
  return re;
}

std::vector<layer_data> slice(std::vector<layer_data>& h1, size_t start, size_t end) {
  std::vector<layer_data> re;
  for (auto& l : h1) {
    layer_data new_layer;
    new_layer.insert(new_layer.end(), l.begin() + start, l.begin() + end);
    re.push_back(new_layer);
  }
  return re;
}

void cmp_handle_data(std::vector<layer_data> h1, std::vector<layer_data> h2,
                     std::optional<size_t> blocks = std::nullopt) {
  assert(h1.size() == h2.size());
  for (size_t i = 0; i < h1.size(); i++) {
    auto& b1 = h1[i];
    auto& b2 = h2[i];
    if (blocks.has_value() == false) {
      assert(b1.size() == b2.size());
    }
    int cmp_to = blocks.has_value() ? blocks.value() : b1.size();
    for (int j = 0; j < cmp_to; j++) {
      auto e1 = reinterpret_cast<void*>(b1[j]);
      auto e2 = reinterpret_cast<void*>(b2[j]);
      assert(memcmp(e1, e2, test_cache_info.element_size(config.num_token_per_page)) == 0);
    }
  }
  fmt::print("KVCacheHandle cmp ok\n");
}

void copy_gpu_cpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
                  std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k_cpu,
                  std::vector<layer_data>& v_cpu, size_t at) {
  size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
  size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
  for (size_t layer = 0; layer < test_cache_info.hidden_layer_count(); layer++) {
    for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
      {
        auto kt = kcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
        void* src = kt.data_ptr();
        void* dst = offset_by_bytes(k_cpu[layer][at], gpu_idx * element_size_per_gpu);
        memcpy(dst, src, element_size_per_gpu);
      }
      {
        auto vt = vcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
        void* src = vt.data_ptr();
        void* dst = offset_by_bytes(v_cpu[layer][at], gpu_idx * element_size_per_gpu);
        memcpy(dst, src, element_size_per_gpu);
      }
    }
  }
}

void copy_cpu_gpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
                  std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k_cpu,
                  std::vector<layer_data>& v_cpu, size_t at) {
  size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
  size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
  for (size_t layer = 0; layer < test_cache_info.hidden_layer_count(); layer++) {
    for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
      {
        auto kt = kcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
        void* dst = kt.data_ptr();
        void* src = offset_by_bytes(k_cpu[layer][at], gpu_idx * element_size_per_gpu);
        memcpy(dst, src, element_size_per_gpu);
        kcache[gpu_idx][layer][block_idx[at]].copy_(kt);
      }
      {
        auto vt = vcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
        void* dst = vt.data_ptr();
        void* src = offset_by_bytes(v_cpu[layer][at], gpu_idx * element_size_per_gpu);
        memcpy(dst, src, element_size_per_gpu);
        vcache[gpu_idx][layer][block_idx[at]].copy_(vt);
      }
    }
  }
}

void cmp_handle_gpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
                    std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k1,
                    std::vector<layer_data>& v1, size_t num_blocks) {
  auto k_from_gpu = empty_kvcache(num_blocks);
  auto v_from_gpu = empty_kvcache(num_blocks);
  for (size_t j = 0; j < std::min(block_idx.size(), num_blocks); j++) {
    copy_gpu_cpu(block_idx, kcache, vcache, k_from_gpu, v_from_gpu, j);
  }
  cmp_handle_data(k1, k_from_gpu, num_blocks);
  cmp_handle_data(v1, v_from_gpu, num_blocks);
}
```
csrc/balance_serve/kvc2/test/kvc2test/flush-back.cpp
0 → 100644
```cpp
#include <future>

#include "common.hpp"

int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  config.gpu_cache_config->total_kvcache_pages = 12;
  auto kvc2 = kvc2::create_kvc2(config);

  // #pragma omp parallel for
  for (size_t ti = 0; ti < 2; ti++) {
    SPDLOG_WARN("Test {}", ti);
    auto [kcache, vcache] = kvc2->get_kvcache();
    std::mt19937 gen(ti + 123);
    size_t total_page = 10;
    TokenLength total_length = total_page * config.num_token_per_page;
    auto tokens = random_ids(total_length, gen);
    TokenLength prompt_length = 3 * config.num_token_per_page;
    auto k1 = random_kvcache(total_page, gen);
    auto v1 = random_kvcache(total_page, gen);
    {
      std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
      kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), prompt_length, total_length,
                                [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
      auto fut = p.get_future();
      fut.wait();
      auto h = fut.get();
      assert(h->matched_length() % config.num_token_per_page == 0);
      size_t matched_block = h->matched_length() / config.num_token_per_page;
      auto block_idx = h->get_gpu_block_idx();
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
      for (size_t at = matched_block; at < block_idx.size(); at++) {
        copy_cpu_gpu(block_idx, kcache, vcache, k1, v1, at);
      }
      h->append_tokens(tokens.data(), total_length);
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, total_page);
    }
    {
      std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
      kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
                                [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
      auto fut = p.get_future();
      fut.wait();
      auto h = fut.get();
      assert(h->matched_length() == total_length);
      size_t matched_block = h->matched_length() / config.num_token_per_page;
      auto block_idx = h->get_gpu_block_idx();
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
    }
  }
  kvc2->save();
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
```
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt-gpu.cpp
0 → 100644
```cpp
/**
 * @Description :
 * @Author : Xie Weiyu
 * @Date : 2024-11-22 08:29:45
 * @Version : 1.0.0
 * @LastEditors : Xie Weiyu
 * @LastEditTime : 2024-11-22 09:56:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include <future>

#include "common.hpp"

int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::trace);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);

  std::vector<std::vector<Token>> ids;
  std::vector<std::vector<layer_data>> k, v;
  for (size_t i = 0; i < 10; i++) {
    ids.push_back(random_ids(1 * config.num_token_per_page, gen));
    k.push_back(random_kvcache(1, gen));
    v.push_back(random_kvcache(1, gen));
    kvc2->raw_insert(test_model_name, test_quant_type, ids[i].data(), ids[i].size(), k[i], v[i]);
  }
  kvc2->debug();

  {
    // all match
    std::vector<Token*> chunks;
    std::vector<TokenLength> lengths;
    for (size_t i = 0; i < 10; i++) {
      chunks.push_back(ids[i].data());
      lengths.push_back(ids[i].size());
    }
    std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
    kvc2->lookup_alt_to_gpu_async(test_model_name, test_quant_type, chunks, lengths,
                                  15 * config.num_token_per_page,
                                  [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
    auto fut = p.get_future();
    fut.wait();
    auto h = fut.get();
    auto hk = h->handle_data(true);
    auto hv = h->handle_data(false);
    for (size_t i = 0; i < 10; i++) {
      cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
      cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
    }
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    for (size_t i = 0; i < 10; i++) {
      std::vector<size_t> blocks = {block_idx[i]};
      cmp_handle_gpu(blocks, kcache, vcache, k[i], v[i], 1);
    }
  }

  {
    // no match in the middle
    std::vector<Token*> chunks;
    std::vector<TokenLength> lengths;
    std::vector<std::vector<Token>> new_ids;
    for (size_t i = 0; i < 10; i++) {
      new_ids.push_back(random_ids(1 * config.num_token_per_page, gen));
    }
    for (size_t i = 0; i < 10; i++) {
      if (i == 1 || i == 5 || i == 6) {
        chunks.push_back(new_ids[i].data());
      } else {
        chunks.push_back(ids[i].data());
      }
      lengths.push_back(ids[i].size());
    }
    std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
    kvc2->lookup_alt_to_gpu_async(test_model_name, test_quant_type, chunks, lengths,
                                  15 * config.num_token_per_page,
                                  [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
    auto fut = p.get_future();
    fut.wait();
    auto h = fut.get();
    auto statuses = h->matched_status();
    for (size_t i = 0; i < 10; i++) {
      if (i == 1) {
        assert(statuses[i] == MatchStatus::NotMatchExact);
      } else if (i == 5 || i == 6) {
        assert(statuses[i] == MatchStatus::NotMatchPartial);
      } else if (i == 0) {
        assert(statuses[i] == MatchStatus::Exact);
      } else {
        assert(statuses[i] == MatchStatus::Partial);
      }
    }
    auto hk = h->handle_data(true);
    auto hv = h->handle_data(false);
    for (size_t i = 0; i < 10; i++) {
      if (i == 1 || i == 5 || i == 6) {
      } else {
        cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
        cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
      }
    }
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    for (size_t i = 0; i < 10; i++) {
      if (i == 1 || i == 5 || i == 6) {
      } else {
        std::vector<size_t> blocks = {block_idx[i]};
        cmp_handle_gpu(blocks, kcache, vcache, k[i], v[i], 1);
      }
    }
  }

  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
```
csrc/balance_serve/kvc2/test/kvc2test/lookup-alt.cpp
0 → 100644
```cpp
/**
 * @Description :
 * @Author : Xie Weiyu
 * @Date : 2024-11-22 08:29:45
 * @Version : 1.0.0
 * @LastEditors : Xie Weiyu
 * @LastEditTime : 2024-11-22 09:56:12
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "common.hpp"

int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::trace);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);

  std::vector<std::vector<Token>> ids;
  std::vector<std::vector<layer_data>> k, v;
  for (size_t i = 0; i < 10; i++) {
    ids.push_back(random_ids(1 * config.num_token_per_page, gen));
    k.push_back(random_kvcache(1, gen));
    v.push_back(random_kvcache(1, gen));
    kvc2->raw_insert(test_model_name, test_quant_type, ids[i].data(), ids[i].size(), k[i], v[i]);
  }
  kvc2->debug();

  {
    // all match
    std::vector<Token*> chunks;
    std::vector<TokenLength> lengths;
    for (size_t i = 0; i < 10; i++) {
      chunks.push_back(ids[i].data());
      lengths.push_back(ids[i].size());
    }
    auto h = kvc2->lookup_alt(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page);
    auto hk = h->handle_data(true);
    auto hv = h->handle_data(false);
    for (size_t i = 0; i < 10; i++) {
      cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
      cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
    }
  }

  {
    // no match in the middle
    std::vector<Token*> chunks;
    std::vector<TokenLength> lengths;
    std::vector<std::vector<Token>> new_ids;
    for (size_t i = 0; i < 10; i++) {
      new_ids.push_back(random_ids(1 * config.num_token_per_page, gen));
    }
    for (size_t i = 0; i < 10; i++) {
      if (i == 1 || i == 5 || i == 6) {
        chunks.push_back(new_ids[i].data());
      } else {
        chunks.push_back(ids[i].data());
      }
      lengths.push_back(ids[i].size());
    }
    auto h = kvc2->lookup_alt(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page);
    auto statuses = h->matched_status();
    for (size_t i = 0; i < 10; i++) {
      if (i == 1) {
        assert(statuses[i] == MatchStatus::NotMatchExact);
      } else if (i == 5 || i == 6) {
        assert(statuses[i] == MatchStatus::NotMatchPartial);
      } else if (i == 0) {
        assert(statuses[i] == MatchStatus::Exact);
      } else {
        assert(statuses[i] == MatchStatus::Partial);
      }
    }
    auto hk = h->handle_data(true);
    auto hv = h->handle_data(false);
    for (size_t i = 0; i < 10; i++) {
      if (i == 1 || i == 5 || i == 6) {
      } else {
        cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
        cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
      }
    }
  }

  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
```
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-async.cpp
0 → 100644
```cpp
/**
 * @Description :
 * @Author : Xie Weiyu
 * @Date : 2024-11-22 09:52:48
 * @Version : 1.0.0
 * @LastEditors : Xie Weiyu
 * @LastEditTime : 2024-11-25 07:51:09
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include <future>

#include "common.hpp"

int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);

  auto ids1 = random_ids(10 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(10, gen);
  auto v1 = random_kvcache(10, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);

// complete same
#pragma omp parallel for
  for (size_t ti = 0; ti < 3; ti++) {
    std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
    kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                              ids1.size() + 2 * config.num_token_per_page,
                              [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
    auto fut = p.get_future();
    fut.wait();
    auto h = fut.get();
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 10);
    cmp_handle_data(v1, v, 10);
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, 10);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
```
csrc/balance_serve/kvc2/test/kvc2test/lookup-gpu-mt-without-vcache.cpp
0 → 100644
```cpp
/**
 * @Description :
 * @Author : Xie Weiyu
 * @Date : 2024-11-22 09:52:48
 * @Version : 1.0.0
 * @LastEditors : Xie Weiyu
 * @LastEditTime : 2024-11-25 07:51:09
 * @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
 **/
#include "common.hpp"

int main(int argc, char* argv[]) {
  qw25_7B_gpu_config.v_cache_on = false;
  config.gpu_cache_config = qw25_7B_gpu_config;
  config.v_cache_on = false;
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);

  auto ids1 = random_ids(10 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(10, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, {});

// complete same
#pragma omp parallel for
  for (size_t ti = 0; ti < 3; ti++) {
    auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                                 ids1.size() + 2 * config.num_token_per_page);
    auto k = h->handle_data(true);
    cmp_handle_data(k1, k, 10);
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(15);
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    for (size_t i = 0; i < k_from_gpu.size(); i++) {
      for (size_t j = 0; j < block_idx.size(); j++) {
        size_t b_idx = block_idx[j];
        for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
          {
            auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
            void* src = kt.data_ptr();
            void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
        }
      }
    }
    cmp_handle_data(k1, k_from_gpu, 10);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
```