Commit 25cee581 authored by Atream's avatar Atream
Browse files

add balance-serve, support concurrency

parent 8d0292aa
#ifndef __MUTEX_EXTEND_HPP_
#define __MUTEX_EXTEND_HPP_
#include <atomic>
#include <chrono>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <thread>
// Mutex that detects attempts by the owning thread to lock it again:
// try_lock() returns false, lock() throws, and unlock() throws when called
// by a thread that does not currently hold the lock.
class non_recursive_mutex {
 public:
  non_recursive_mutex() = default;

  // Non-blocking acquire. Returns false if the lock is contended OR if the
  // calling thread already owns it (the non-recursive guarantee).
  bool try_lock() {
    std::thread::id this_id = std::this_thread::get_id();
    // Reject re-entry from the current owner.
    if (owner.load(std::memory_order_acquire) == this_id) {
      return false;  // already held by this thread
    }
    // Attempt the acquisition.
    if (mtx.try_lock()) {
      owner.store(this_id, std::memory_order_release);  // record the new owner
      return true;
    }
    return false;
  }

  // Blocking acquire: retries (with yield) until the lock is obtained.
  // Throws if the calling thread already holds the lock.
  void lock() {
    std::thread::id this_id = std::this_thread::get_id();
    while (true) {
      // Detect recursive locking by the current thread.
      if (owner.load(std::memory_order_acquire) == this_id) {
        throw std::runtime_error("Thread is trying to lock a mutex it already holds");
      }
      // Attempt the acquisition.
      if (mtx.try_lock()) {
        owner.store(this_id, std::memory_order_release);  // record the new owner
        return;
      }
      // Lock not obtained: yield instead of busy-spinning.
      std::this_thread::yield();
    }
  }

  // Release the lock; only the owning thread may unlock.
  void unlock() {
    std::thread::id this_id = std::this_thread::get_id();
    // Clear ownership before releasing the underlying mutex so a new owner
    // never observes a stale id.
    if (owner.load(std::memory_order_acquire) == this_id) {
      owner.store(std::thread::id(), std::memory_order_release);  // no owner
      mtx.unlock();
    } else {
      throw std::runtime_error("Thread attempting to unlock a mutex it doesn't own");
    }
  }

 private:
  std::mutex mtx;                      // underlying mutual exclusion
  std::atomic<std::thread::id> owner;  // id of the thread currently holding mtx
};
#endif
#ifndef PERIODIC_TASK_HPP
#define PERIODIC_TASK_HPP
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <future>
#include <iostream>
#include <mutex>
#include <stop_token>
#include <thread>
#include <utility>
#include <vector>
namespace periodic {
// Runs `func` on a worker thread every `interval_ms`, or sooner when
// wakeUp()/wakeUpWait() is called. Destruction requests stop and the
// std::jthread member joins the worker automatically.
class PeriodicTask {
 public:
  // Starts the worker thread immediately.
  explicit PeriodicTask(std::function<void()> func,
                        std::chrono::milliseconds interval_ms = std::chrono::milliseconds(100))
      : func_(std::move(func)), interval_(interval_ms), worker_([this](std::stop_token stoken) { this->run(stoken); }) {
    // std::cout << "PeriodicTask created with interval: " << interval_.count() << " ms" << std::endl;
  }
  ~PeriodicTask() {
    worker_.request_stop();
    cv_.notify_one();  // Ensure worker wakes up when destroyed
    // NOTE(review): promises still queued in wakeup_promises_ are destroyed
    // unfulfilled here, so their futures observe std::future_error
    // (broken_promise) — confirm callers tolerate that on shutdown.
    // std::cout << "PeriodicTask destructor called, stopping worker." << std::endl;
  }
  // Request one immediate run of the task (fire-and-forget).
  void wakeUp() {
    {
      std::lock_guard<std::mutex> lock(wakeup_mutex_);
      wake_up_requested_ = true;
    }
    cv_.notify_one();  // Notify worker thread to wake up immediately
    // NOTE(review): the cv wait uses mutex_ while this flag is set under
    // wakeup_mutex_, so a notify can land between the worker's predicate
    // check and its block; the wait_for timeout bounds the delay to one
    // interval rather than losing the wake-up outright.
    // std::cout << "wakeUp() called: worker thread will wake up." << std::endl;
  }
  // Request an immediate run and return a future that is fulfilled after the
  // next completed execution of the task.
  std::future<void> wakeUpWait() {
    std::promise<void> promise;
    std::future<void> future = promise.get_future();
    {
      std::lock_guard<std::mutex> lock(promise_mutex_);
      wakeup_promises_.push_back(std::move(promise));
    }
    wakeUp();
    return future;
  }

 private:
  // Worker loop: wait out the interval (or an explicit wake-up), run the
  // task, then fulfil any promises queued by wakeUpWait().
  void run(std::stop_token stoken) {
    while (!stoken.stop_requested()) {
      std::unique_lock lock(mutex_);
      // Wait for either the time interval or a wake-up signal
      cv_.wait_for(lock, interval_, [this] { return wake_up_requested_.load(); });
      if (stoken.stop_requested())
        break;
      // If the wake-up was triggered, reset the flag and process the task
      {
        std::lock_guard<std::mutex> lock(wakeup_mutex_);
        wake_up_requested_ = false;
      }
      try {
        // std::cout << "Running task function." << std::endl;
        func_();
      } catch (...) {
        // Swallow task exceptions so one failure does not kill the worker.
        std::cerr << "Error in task function." << std::endl;
      }
      notifyPromises();
    }
  }
  // Fulfil and clear every promise queued since the last run.
  void notifyPromises() {
    std::lock_guard<std::mutex> lock(promise_mutex_);
    // std::cout << "Notifying all waiting promises." << std::endl;
    for (auto& promise : wakeup_promises_) {
      promise.set_value();
    }
    wakeup_promises_.clear();
  }

  std::function<void()> func_;                       // task body
  std::chrono::milliseconds interval_;               // periodic run interval
  std::mutex mutex_;                                 // paired with cv_ for the timed wait
  std::condition_variable cv_;
  std::vector<std::promise<void>> wakeup_promises_;  // fulfilled after each run
  std::mutex promise_mutex_;                         // guards wakeup_promises_
  std::mutex wakeup_mutex_;                          // guards writes to wake_up_requested_
  std::atomic<bool> wake_up_requested_ = false;
  std::jthread worker_;                              // auto-joins on destruction
};
} // namespace periodic
#endif // PERIODIC_TASK_HPP
/*
* @Author: Xie Weiyu ervinxie@qq.com
* @Date: 2024-11-21 06:35:47
* @LastEditors: Xie Weiyu ervinxie@qq.com
* @LastEditTime: 2024-11-21 06:35:50
* @FilePath: /kvc2/src/utils/spin_lock.hpp
* @Description: 这是默认设置,请设置`customMade`, 打开koroFileHeader查看配置 进行设置:
* https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
*/
#include <atomic>
#include <chrono>
#include <thread>
// Test-and-set spin lock with exponential sleep back-off.
// Satisfies BasicLockable, so it works with std::lock_guard / std::unique_lock.
class SpinLock {
 public:
  SpinLock() { flag.clear(); }

  // Spin until the flag is acquired, doubling the sleep between attempts
  // (capped at 1024 us) so a long wait does not burn a whole core.
  void lock() {
    constexpr int kMaxDelayUs = 1024;
    int backoff_us = 1;
    while (flag.test_and_set(std::memory_order_acquire)) {
      std::this_thread::sleep_for(std::chrono::microseconds(backoff_us));
      backoff_us *= 2;
      if (backoff_us > kMaxDelayUs) {
        backoff_us = kMaxDelayUs;
      }
    }
  }

  // Release the lock.
  void unlock() { flag.clear(std::memory_order_release); }

 private:
  std::atomic_flag flag = ATOMIC_FLAG_INIT;
};
#pragma once
#include <cassert>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include "easy_format.hpp"
// Format `value` with exactly two digits after the decimal point.
inline std::string doubleToStringR2(double value) {
  std::ostringstream out;
  out << std::fixed << std::setprecision(2) << value;
  return out.str();
}
// Nanosecond-resolution stopwatch over std::chrono::high_resolution_clock.
// Two usage modes:
//  * Timer("label") — scoped timer: starts immediately and prints
//    "<label> N ms" when destroyed.
//  * Timer()        — manual timer: drive with start()/stop(); each stop()
//    accumulates the span into a running total (see runningTime()).
class Timer {
 public:
  std::string name;
  bool tmp_timer = false;

  Timer() {}
  Timer(std::string name) : name(name), tmp_timer(true) { start(); }
  ~Timer() {
    if (tmp_timer) {
      std::cout << name << " " << elapsedMs() << " ms" << std::endl;
    }
  }

  // Begin a measurement span; the timer must not already be running.
  void start() {
    m_startTime = std::chrono::high_resolution_clock::now();
    assert(m_isRunning == false);
    m_isRunning = true;
  }

  // End the current span and fold it into the accumulated running time.
  void stop() {
    m_endTime = std::chrono::high_resolution_clock::now();
    assert(m_isRunning == true);
    m_isRunning = false;
    m_runningNs += elapsedNs();
  }

  // Nanoseconds since start(): "now" while running, m_endTime after stop().
  double elapsedNs() {
    const auto endPoint = m_isRunning ? std::chrono::high_resolution_clock::now() : m_endTime;
    return std::chrono::duration_cast<std::chrono::nanoseconds>(endPoint - m_startTime).count();
  }

  void printElapsedMilliseconds() { std::cout << elapsedNs() / 1e6 << " ms" << std::endl; }

  // Render a nanosecond count with a human-friendly unit (ns/us/ms/sec/min/h).
  static std::string ns_to_string(double duration) {
    double v = duration;
    if (v < 1000) {
      return doubleToStringR2(v) + " ns";
    }
    v /= 1000.0;
    if (v < 1000) {
      return doubleToStringR2(v) + " us";
    }
    v /= 1000.0;
    if (v < 1000) {
      return doubleToStringR2(v) + " ms";
    }
    v /= 1000.0;
    if (v < 60.0) {
      return doubleToStringR2(v) + " sec";
    }
    v /= 60.0;
    if (v < 60.0) {
      return doubleToStringR2(v) + " min";
    }
    return doubleToStringR2(v / 60.0) + " h";
  }

  double runningTimeNs() { return m_runningNs; }
  std::string runningTime() { return ns_to_string(m_runningNs); }
  std::string elapsedTime() { return ns_to_string(elapsedNs()); }
  double elapsedMs() { return elapsedNs() / 1e6; }

  // Ops-per-second string for `op_cnt` operations over the elapsed span.
  std::string report_throughput(size_t op_cnt) {
    double ops = op_cnt / elapsedMs() * 1000;
    return readable_number(ops) + "op/s";
  }

  // Fold another (stopped) timer's accumulated time into this one.
  void merge(Timer& other) {
    assert(m_isRunning == false);
    assert(other.m_isRunning == false);
    m_runningNs += other.runningTimeNs();
  }

 private:
  std::chrono::time_point<std::chrono::high_resolution_clock> m_startTime;
  std::chrono::time_point<std::chrono::high_resolution_clock> m_endTime;
  bool m_isRunning = false;
  double m_runningNs = 0.0;
};
// Named integer counters, printable for quick debugging.
class Counter {
 public:
  Counter() {}
  std::map<std::string, size_t> counters;
  // Add `num` to counter `name`, creating it at zero on first use.
  void inc(const char* name, size_t num) { counters[name] += num; };
  // Dump every counter as "name : value" in std::map (sorted) order.
  void print() {
    for (const auto& [key, value] : counters) {
      std::cout << key << " : " << value << std::endl;
    }
  };
};
# Common flags for the test binaries; the commented -O3 line is the release variant.
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fopenmp")
# set(CMAKE_CXX_FLAGS "-O3 -march=native -Wall -Wextra -pthread")
add_subdirectory(kvc2test)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src)
# Standalone smoke tests for third-party dependencies.
add_executable(hashmap_test hashmap_test.cpp)
target_link_libraries(hashmap_test PRIVATE TBB::tbb)
add_executable(xxHash_test xxHash_test.cpp)
target_link_libraries(xxHash_test PRIVATE xxhash)
# Declare one async_store test executable per source file.
function(add_async_store_executable source_file)
get_filename_component(target_name ${source_file} NAME_WE) # file name without extension becomes the target name
add_executable(${target_name} ${source_file})
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/nlohmann/single_include)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
target_link_libraries(${target_name} PRIVATE async_store gflags)
endfunction()
add_async_store_executable(async_store_test.cpp)
# Declare one kvc2 test executable per source file.
function(add_kvc2_executable source_file)
get_filename_component(target_name ${source_file} NAME_WE) # file name without extension becomes the target name
add_executable(${target_name} ${source_file})
# target_compile_options(${target_name} PRIVATE -fopenmp -fno-strict-aliasing)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/nlohmann/single_include)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
target_link_libraries(${target_name} PRIVATE kvc2 async_store gflags)
endfunction()
add_kvc2_executable(test_lock_free_queue.cpp)
add_kvc2_executable(test_queue_perf.cpp)
# Disable deprecated test
# add_kvc2_executable(prefix_test.cpp)
# add_kvc2_executable(kvcache_disk_insert_read_test.cpp)
# add_kvc2_executable(kvcache_mem_eviction_test.cpp)
# add_kvc2_executable(kvcache_mem_insert_read_test.cpp)
# add_kvc2_executable(kvcache_save_load_test.cpp)
# add_kvc2_executable(kvc2_export_header_test.cpp)
# add_kvc2_executable(kvc2_export_load_test.cpp)
# NOTE(review): "..//" below contains a doubled slash — harmless, but worth tidying.
target_include_directories(async_store_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..//third_party/nlohmann/single_include)
target_include_directories(async_store_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/..//third_party/spdlog/include)
target_link_libraries(async_store_test PRIVATE xxhash)
add_executable(test_std_list test_std_list.cpp)
add_executable(test_cuda_stream test_cuda_stream.cpp)
target_include_directories(test_cuda_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(test_cuda_stream PRIVATE CUDA::cudart)
add_executable(test_cuda_stream_manager test_cuda_stream_manager.cpp)
target_include_directories(test_cuda_stream_manager PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_link_libraries(test_cuda_stream_manager PRIVATE cuda_stream_manager)
add_executable(test_periodic_task test_periodic_task.cpp)
target_include_directories(test_periodic_task PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
add_executable(test_page_pool page_pool_test.cpp)
target_include_directories(test_page_pool PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../src)
target_include_directories(test_page_pool PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../third_party/spdlog/include)
\ No newline at end of file
#include <tbb/concurrent_hash_map.h>
#include <iostream>
int main() {
tbb::concurrent_hash_map<int, int> map;
map.insert({1, 2});
decltype(map)::accessor a;
std::cout << map.find(a, 1) << std::endl;
return 0;
}
#include "kvc2.h"
#include "kvc2_test_utils.cpp"
// Disk-path test for kvc2 raw_insert/raw_read: insert random kvcache blocks
// and read them back under exact, prefix, partially-shared, and disjoint
// token sequences, then persist the index with save().
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  std::mt19937 gen(123);  // fixed seed: the companion load test regenerates the same ids/data
  KVC2Config config = {
      .path = FLAGS_disk_cache_path,
      // NOTE(review): "conifg" looks like a typo for "config" — confirm this path exists.
      .config_path = std::string("/home/xwy/conifg"),
      .block_length = BlockLength,
      .memory_pool_size = size_t(10e9),
      .evict_count = 20,
  };
  auto kvcc = create_kvc2(config);
  auto io = kvcc->start_io_thread();
  SPDLOG_INFO("Disk Test");
  // Insert 10 blocks of random kvcache under random token ids.
  auto ids = random_ids(10 * BlockLength, gen);
  auto h1 = random_kvcache(qwen_cache_info, 10, gen);
  kvcc->raw_insert(qwen_cache_info, reinterpret_cast<TokenPtr>(ids.data()), ids.size(), h1);
  // complete same: reading the exact token sequence must reproduce h1
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids.data()), ids.size(), h2);
    cmp_handle_data(qwen_cache_info, h1, h2);
  }
  // complete prefix: a 3-block prefix must match the first 3 blocks of h1
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 3 * BlockLength);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
    cmp_handle_data(qwen_cache_info, h1, h2, 3);
  }
  // common prefix: 5 shared blocks followed by random tokens — only the
  // shared prefix is expected to be served from cache
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 5 * BlockLength);
    auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
    ids2.insert(ids2.end(), rids.begin(), rids.end());
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
    cmp_handle_data(qwen_cache_info, h1, h2, 5);
  }
  // no prefix: unrelated ids — nothing compared, must simply not crash
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = random_ids(10 * BlockLength, gen);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
  }
  // insert partly new: first 5 blocks shared with h1, the rest fresh
  auto h2 = random_kvcache(qwen_cache_info, 10, gen);
  copy_kvcache(h1, h2, 0, 5);
  auto ids2 = random_ids(10 * BlockLength, gen);
  for (size_t i = 0; i < 5 * BlockLength; i++) {
    ids2[i] = ids[i];
  }
  kvcc->raw_insert(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
  // read new part: a 7-block prefix plus one stray token at the end
  {
    auto h3 = empty_kvcache(qwen_cache_info, 10);
    auto ids3 = std::vector<Token>(ids2.begin(), ids2.begin() + 7 * BlockLength);
    ids3.push_back(123);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids3.data()), ids3.size(), h3);
    cmp_handle_data(qwen_cache_info, h3, h2, 7);
  }
  kvcc->save();  // persist the index so the load test can reuse this cache dir
  kvcc->stop_io_thread();
  io.join();
  SPDLOG_WARN("{} Test Passed", __FILE__);
  return 0;
}
\ No newline at end of file
#include "kvc2.h"
#include "kvc2_test_utils.cpp"
// Load-side counterpart of the insert test: create_kvc2 + load() reopens the
// cache directory written earlier, then replays the same (seed-123) token
// sequences and checks the data read from disk.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  std::mt19937 gen(123);  // same seed as the insert test, so identical ids/data are regenerated
  KVC2Config config = {
      .path = FLAGS_disk_cache_path,
      .block_length = BlockLength,
      .memory_pool_size = size_t(10e9),
      .evict_count = 20,
  };
  auto kvcc = create_kvc2(config);
  kvcc->load();  // read back the index saved by the insert test
  auto io = kvcc->start_io_thread();
  SPDLOG_INFO("Disk Test");
  auto ids = random_ids(10 * BlockLength, gen);
  auto h1 = empty_kvcache(qwen_cache_info, 10);
  // kvcc->raw_insert(qwen_cache_info, reinterpret_cast<IDptr>(ids.data()), ids.size(), h1);
  // complete same: here h1 is filled FROM disk instead of being inserted
  {
    // auto h2 = empty_kvcache(qwen_cache_info, 10);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids.data()), ids.size(), h1);
    // cmp_handle_data(qwen_cache_info, h1, h2);
  }
  // complete prefix: 3-block prefix must match what was loaded into h1
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 3 * BlockLength);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
    cmp_handle_data(qwen_cache_info, h1, h2, 3);
  }
  // common prefix: 5 shared blocks then random tokens
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = std::vector<Token>(ids.begin(), ids.begin() + 5 * BlockLength);
    auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
    ids2.insert(ids2.end(), rids.begin(), rids.end());
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
    cmp_handle_data(qwen_cache_info, h1, h2, 5);
  }
  // no prefix: unrelated ids, must not crash
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    auto ids2 = random_ids(10 * BlockLength, gen);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
  }
  // insert partly new: first 5 blocks shared with the loaded h1
  auto h2 = random_kvcache(qwen_cache_info, 10, gen);
  copy_kvcache(h1, h2, 0, 5);
  auto ids2 = random_ids(10 * BlockLength, gen);
  for (size_t i = 0; i < 5 * BlockLength; i++) {
    ids2[i] = ids[i];
  }
  kvcc->raw_insert(qwen_cache_info, reinterpret_cast<TokenPtr>(ids2.data()), ids2.size(), h2);
  // read new part: 7-block prefix plus one stray token
  {
    auto h3 = empty_kvcache(qwen_cache_info, 10);
    auto ids3 = std::vector<Token>(ids2.begin(), ids2.begin() + 7 * BlockLength);
    ids3.push_back(123);
    kvcc->raw_read(qwen_cache_info, reinterpret_cast<TokenPtr>(ids3.data()), ids3.size(), h3);
    cmp_handle_data(qwen_cache_info, h3, h2, 7);
  }
  kvcc->stop_io_thread();
  io.join();
  SPDLOG_WARN("{} Test Passed", __FILE__);
  return 0;
}
\ No newline at end of file
#include <optional>
#include <random>
#include "kvc2.h"
#define FMT_HEADER_ONLY
#include <spdlog/spdlog.h>
const int BlockLength = 256;
std::string FLAGS_disk_cache_path;
// Parse command-line arguments: the single positional argument is the disk
// cache path. Exits the process with status 1 on bad usage.
void init(int argc, char* argv[]) {
  if (argc != 2) {
    // The path is consumed positionally from argv[1]; the old usage string
    // advertised a --disk_cache_path=xxx flag that was never parsed.
    fmt::print("Usage: {} <disk_cache_path>\n", argv[0]);
    exit(1);
  }
  FLAGS_disk_cache_path = argv[1];
  if (FLAGS_disk_cache_path.empty()) {
    fmt::print("disk_cache_path is empty\n");  // newline was missing
    exit(1);
  }
}
using namespace kvc2;
// Allocate one uninitialised block of info.element_size(BlockLength) bytes,
// aligned to 4096 bytes.
data_block_ptr empty_block(CacheInfo info) {
  std::byte* buf = new (std::align_val_t(4096)) std::byte[info.element_size(BlockLength)];
  return reinterpret_cast<data_block_ptr>(buf);
}
// Allocate one block and fill it with 64-bit random words from `gen`.
data_block_ptr random_block(CacheInfo info, std::mt19937& gen) {
  data_block_ptr block = empty_block(info);
  auto* words = reinterpret_cast<uint64_t*>(block);
  const size_t word_count = info.element_size(BlockLength) / 8;
  for (size_t w = 0; w < word_count; w++) {
    words[w] = gen();
  }
  return block;
}
// Build `block_count` random blocks using a fresh generator seeded with `seed`.
layer_data random_blocks(CacheInfo info, size_t block_count, size_t seed) {
  std::mt19937 rng(seed);
  layer_data blocks;
  blocks.reserve(block_count);
  for (size_t made = 0; made < block_count; made++) {
    blocks.push_back(random_block(info, rng));
  }
  return blocks;
}
// Allocate `block_count` uninitialised blocks.
layer_data empty_blocks(CacheInfo info, size_t block_count) {
  layer_data blocks;
  blocks.reserve(block_count);
  for (size_t made = 0; made < block_count; made++) {
    blocks.push_back(empty_block(info));
  }
  return blocks;
}
// Copy block pointers [block_start, block_start + length) of every layer from
// `from` into `to`. Only the data_block_ptr values are copied, so both
// handles end up sharing the same underlying buffers.
void copy_kvcache(std::vector<layer_data>& from, std::vector<layer_data>& to, size_t block_start, size_t length) {
  for (size_t layer = 0; layer < from.size(); layer++) {
    for (size_t off = 0; off < length; off++) {
      const size_t at = block_start + off;
      to[layer][at] = from[layer][at];
    }
  }
}
// Generate one random kvcache buffer per hidden layer.
// FIX: std::mt19937 is not thread-safe, so all per-layer seeds are drawn
// sequentially BEFORE the OpenMP loop — the old code called gen() inside
// `#pragma omp parallel for`, a data race on the shared generator (the
// sibling helper in common.hpp already pre-seeds this way).
std::vector<layer_data> random_kvcache(CacheInfo info, size_t block_count, std::mt19937& gen) {
  std::vector<layer_data> re;
  re.resize(info.hidden_layer_count());
  fmt::print("Generating random kvcache, layer {}\n", info.hidden_layer_count());
  std::vector<size_t> seeds;
  seeds.reserve(info.hidden_layer_count());
  for (size_t i = 0; i < info.hidden_layer_count(); i++) {
    seeds.push_back(gen());
  }
#pragma omp parallel for
  for (size_t i = 0; i < info.hidden_layer_count(); i++) {
    re[i] = random_blocks(info, block_count, seeds[i]);
  }
  return re;
}
// Allocate an uninitialised kvcache handle: `block_count` blocks for every
// hidden layer, allocated in parallel.
std::vector<layer_data> empty_kvcache(CacheInfo info, size_t block_count) {
  const size_t layers = info.hidden_layer_count();
  std::vector<layer_data> result(layers);
  fmt::print("Generating empty kvcache, layer {}\n", info.hidden_layer_count());
#pragma omp parallel for
  for (size_t i = 0; i < layers; i++) {
    result[i] = empty_blocks(info, block_count);
  }
  return result;
}
std::vector<Token> random_ids(size_t length, std::mt19937& gen) {
std::vector<Token> re;
for (size_t i = 0; i < length; i++) {
re.push_back(gen());
}
return re;
}
// Cache-info fixture shared by every test in this file (BF16 key cache of a
// qwen2-72b-instruct model).
CacheInfo qwen_cache_info = {
    .model_name = "qwen2-72b-instruct",
    .is_key_cache = true,
    .quant_type = "BF16",
};
// Compare two kvcache handles layer by layer.
// If `blocks` is given, only the first `blocks` blocks of each layer are
// compared; otherwise the layers must have equal block counts and are
// compared in full. Aborts via assert on any mismatch.
void cmp_handle_data(CacheInfo info, std::vector<layer_data>& h1, std::vector<layer_data>& h2,
                     std::optional<size_t> blocks = std::nullopt) {
  assert(h1.size() == h2.size());
  for (size_t i = 0; i < h1.size(); i++) {
    auto& b1 = h1[i];
    auto& b2 = h2[i];
    if (blocks.has_value() == false) {
      assert(b1.size() == b2.size());
    }
    // size_t avoids the old signed/unsigned comparison (int j vs size_t cmp_to).
    size_t cmp_to = blocks.has_value() ? blocks.value() : b1.size();
    for (size_t j = 0; j < cmp_to; j++) {
      auto e1 = reinterpret_cast<void*>(b1[j]);
      auto e2 = reinterpret_cast<void*>(b2[j]);
      assert(memcmp(e1, e2, info.element_size(BlockLength)) == 0);
    }
  }
  fmt::print("KVCacheHandle cmp ok\n");
}
# Flags for the kvc2 unit-test binaries.
set(CMAKE_CXX_FLAGS "-Og -march=native -Wall -Wextra -g -fopenmp")
# Declare one kvc2 test executable per source file.
function(add_kvc2_test source_file)
get_filename_component(target_name ${source_file} NAME_WE) # file name without extension becomes the target name
add_executable(${target_name} ${source_file})
# target_compile_options(${target_name} PRIVATE -fopenmp -fno-strict-aliasing)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../src)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/nlohmann/single_include)
target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../third_party/spdlog/include)
target_link_libraries(${target_name} PRIVATE kvc2 async_store)
endfunction()
add_kvc2_test(raw_insert_read.cpp)
add_kvc2_test(lookup.cpp)
add_kvc2_test(lookup-alt.cpp)
add_kvc2_test(lookup-alt-gpu.cpp)
add_kvc2_test(lookup-mt.cpp)
add_kvc2_test(lookup-gpu.cpp)
add_kvc2_test(lookup-gpu-mt.cpp)
add_kvc2_test(lookup-gpu-async.cpp)
add_kvc2_test(append-tokens.cpp)
add_kvc2_test(flush-back.cpp)
add_kvc2_test(check-flush-back.cpp)
add_kvc2_test(lookup-without-vcache.cpp)
add_kvc2_test(lookup-gpu-mt-without-vcache.cpp)
#include <future>
#include "common.hpp"
// Async GPU lookup test: three OpenMP tasks each insert/look up their own
// random sequence, fill unmatched GPU blocks from a CPU reference, append
// the tokens, and verify the whole sequence matches on a second lookup.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
#pragma omp parallel for
  for (size_t ti = 0; ti < 3; ti++) {
    auto [kcache, vcache] = kvc2->get_kvcache();
    std::mt19937 gen(ti + 123);  // per-task seed keeps the data sets disjoint
    size_t total_page = 10;
    TokenLength total_length = total_page * config.num_token_per_page;
    auto tokens = random_ids(total_length, gen);
    TokenLength prompt_length = 3 * config.num_token_per_page;
    auto k1 = random_kvcache(total_page, gen);
    auto v1 = random_kvcache(total_page, gen);
    // First lookup: only the prompt prefix may match; fill the remaining GPU
    // blocks from the CPU reference and append the full token sequence.
    {
      std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
      kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), prompt_length, total_length,
                                [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
      auto fut = p.get_future();
      fut.wait();
      auto h = fut.get();
      // The matched length must fall on a page boundary.
      assert(h->matched_length() % config.num_token_per_page == 0);
      size_t matched_block = h->matched_length() / config.num_token_per_page;
      auto block_idx = h->get_gpu_block_idx();
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
      for (size_t at = matched_block; at < block_idx.size(); at++) {
        copy_cpu_gpu(block_idx, kcache, vcache, k1, v1, at);
      }
      h->append_tokens(tokens.data(), total_length);
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, total_page);
    }
    // Second lookup: after append_tokens the whole sequence must match.
    {
      std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
      kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
                                [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
      auto fut = p.get_future();
      fut.wait();
      auto h = fut.get();
      assert(h->matched_length() == total_length);
      size_t matched_block = h->matched_length() / config.num_token_per_page;
      auto block_idx = h->get_gpu_block_idx();
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
    }
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
#include <future>
#include "common.hpp"
// Reader side of the flush-back pair: load() the cache written by the
// flush-back test, regenerate the same seeded data, and verify that full
// lookups are served correctly from the persisted store.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  config.gpu_cache_config->total_kvcache_pages = 12;  // same small pool as the writer test
  auto kvc2 = kvc2::create_kvc2(config);
  kvc2->load();  // reopen the index/data persisted by the flush-back test
  // #pragma omp parallel for
  for (size_t ti = 0; ti < 2; ti++) {
    SPDLOG_WARN("Test {}", ti);
    auto [kcache, vcache] = kvc2->get_kvcache();
    std::mt19937 gen(ti + 123);  // must match the seeds used when the data was written
    size_t total_page = 10;
    TokenLength total_length = total_page * config.num_token_per_page;
    auto tokens = random_ids(total_length, gen);
    auto k1 = random_kvcache(total_page, gen);
    auto v1 = random_kvcache(total_page, gen);
    {
      std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
      kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
                                [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
      auto fut = p.get_future();
      fut.wait();
      auto h = fut.get();
      // Everything was persisted, so the full sequence must be found.
      assert(h->matched_length() == total_length);
      size_t matched_block = h->matched_length() / config.num_token_per_page;
      auto block_idx = h->get_gpu_block_idx();
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
    }
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 06:02:41
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-12-11 07:34:10
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <random>
#include <thread>
#include "kvc2.h"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
using namespace kvc2;
// Advance a typed pointer by a raw byte count.
// char-based pointer arithmetic is the well-defined way to do this; the old
// round-trip through size_t assumed sizeof(size_t) == sizeof(void*).
template <typename T>
T* offset_by_bytes(T* t, size_t n) {
  return reinterpret_cast<T*>(reinterpret_cast<char*>(t) + n);
}
// Global fixtures shared by every kvc2test binary.
std::string FLAGS_disk_cache_path;  // set from argv[1] in init()
kvc2::KVC2Config config;            // populated in init()
// GPU page-cache layout for a Qwen2.5-7B-style model split across 2 GPUs.
kvc2::GPUPageCacheConfig qw25_7B_gpu_config{
    .gpu_only = false,
    .gpu_devices_id = {0, 1},
    .layer_count = 28,
    .total_kvcache_pages = 40,
    .num_token_per_page = 256,
    .num_k_heads = 4,
    .k_head_dim = 896,
    .full_kv_cache_on_each_gpu = false,
    .k_cache_on = true,
    .v_cache_on = true,
    .tensor_type = torch::kBFloat16,
    .num_streams_per_device = 4,
};
ModelName test_model_name = "Qwen2.5-7B-Instruct";
QuantType test_quant_type = "FP16";
// Key-cache info fixture used throughout these tests.
CacheInfo test_cache_info{
    .model_name = test_model_name,
    .is_key_cache = true,
    .quant_type = test_quant_type,
};
// Parse argv (single positional disk cache path), load the quant/model
// registries from ./config, and populate the global `config` used by the
// tests. Exits the process with status 1 on bad usage.
void init(int argc, char* argv[]) {
  if (argc != 2) {
    fmt::print("Usage: {} <disk_cache_path>\n", argv[0]);
    exit(1);
  }
  // Registries loaded before any kvc2 instance is created below in the tests.
  load_quant_configs("./config/quant_configs.json");
  load_model_configs("./config/model_configs.json");
  FLAGS_disk_cache_path = argv[1];
  if (FLAGS_disk_cache_path.empty()) {
    fmt::print("disk_cache_path is empty\n");
    exit(1);
  }
  config.path = FLAGS_disk_cache_path;
  config.config_path = "./config";
  config.gpu_cache_config = qw25_7B_gpu_config;
}
// Allocate one zero-filled, 4096-byte-aligned block sized from the global
// test_cache_info / config fixtures.
data_block_ptr empty_block() {
  const size_t nbytes = test_cache_info.element_size(config.num_token_per_page);
  std::byte* buf = new (std::align_val_t(4096)) std::byte[nbytes];
  memset(buf, 0, nbytes);
  return reinterpret_cast<data_block_ptr>(buf);
}
// Allocate one block and overwrite it with 64-bit random words from `gen`.
data_block_ptr random_block(std::mt19937& gen) {
  data_block_ptr block = empty_block();
  auto* words = reinterpret_cast<uint64_t*>(block);
  const size_t word_count = test_cache_info.element_size(config.num_token_per_page) / 8;
  for (size_t w = 0; w < word_count; w++) {
    words[w] = gen();
  }
  return block;
}
// Build `block_count` random blocks from a fresh generator seeded with `seed`.
layer_data random_blocks(size_t block_count, size_t seed) {
  std::mt19937 rng(seed);
  layer_data blocks;
  blocks.reserve(block_count);
  for (size_t made = 0; made < block_count; made++) {
    blocks.push_back(random_block(rng));
  }
  return blocks;
}
// Allocate `block_count` zeroed blocks.
layer_data empty_blocks(size_t block_count) {
  layer_data blocks;
  blocks.reserve(block_count);
  for (size_t made = 0; made < block_count; made++) {
    blocks.push_back(empty_block());
  }
  return blocks;
}
// Copy block pointers [block_start, block_start + length) of every layer from
// `from` into `to`. Only the data_block_ptr values are copied, so both
// handles end up sharing the same underlying buffers.
void copy_kvcache(std::vector<layer_data>& from, std::vector<layer_data>& to, size_t block_start, size_t length) {
  for (size_t layer = 0; layer < from.size(); layer++) {
    for (size_t off = 0; off < length; off++) {
      const size_t at = block_start + off;
      to[layer][at] = from[layer][at];
    }
  }
}
// Generate a random kvcache for every hidden layer. Per-layer generators are
// seeded sequentially from `gen` up front, so the OpenMP loop never touches
// the shared generator.
std::vector<layer_data> random_kvcache(size_t block_count, std::mt19937& gen) {
  const size_t layers = test_cache_info.hidden_layer_count();
  std::vector<layer_data> result(layers);
  fmt::print("Generating random kvcache, layer {}\n", test_cache_info.hidden_layer_count());
  std::vector<std::mt19937> gens;
  gens.reserve(layers);
  for (size_t i = 0; i < layers; i++) {
    gens.push_back(std::mt19937(gen()));
  }
#pragma omp parallel for
  for (size_t i = 0; i < layers; i++) {
    result[i] = random_blocks(block_count, gens[i]());
  }
  return result;
}
// Allocate a zeroed kvcache handle: `block_count` blocks for every hidden
// layer, allocated in parallel.
std::vector<layer_data> empty_kvcache(size_t block_count) {
  const size_t layers = test_cache_info.hidden_layer_count();
  std::vector<layer_data> result(layers);
  fmt::print("Generating empty kvcache, layer {}\n", test_cache_info.hidden_layer_count());
#pragma omp parallel for
  for (size_t i = 0; i < layers; i++) {
    result[i] = empty_blocks(block_count);
  }
  return result;
}
std::vector<Token> random_ids(size_t length, std::mt19937& gen) {
std::vector<Token> re;
for (size_t i = 0; i < length; i++) {
re.push_back(gen());
}
return re;
}
std::vector<layer_data> slice(std::vector<layer_data>& h1,size_t start,size_t end){
std::vector<layer_data> re;
for(auto&l:h1){
layer_data new_layer;
new_layer.insert(new_layer.end(),l.begin()+start,l.begin()+end);
re.push_back(new_layer);
}
return re;
}
void cmp_handle_data(std::vector<layer_data> h1, std::vector<layer_data> h2,
std::optional<size_t> blocks = std::nullopt) {
assert(h1.size() == h2.size());
for (size_t i = 0; i < h1.size(); i++) {
auto& b1 = h1[i];
auto& b2 = h2[i];
if (blocks.has_value() == false) {
assert(b1.size() == b2.size());
}
int cmp_to = blocks.has_value() ? blocks.value() : b1.size();
for (int j = 0; j < cmp_to; j++) {
auto e1 = reinterpret_cast<void*>(b1[j]);
auto e2 = reinterpret_cast<void*>(b2[j]);
assert(memcmp(e1, e2, test_cache_info.element_size(config.num_token_per_page)) == 0);
}
}
fmt::print("KVCacheHandle cmp ok\n");
}
// Copy GPU block `block_idx[at]` of every layer into CPU blocks
// k_cpu[layer][at] / v_cpu[layer][at]. Each GPU holds
// element_size / gpu_count bytes of a block, so the per-GPU slices are
// reassembled side by side in the CPU buffer.
void copy_gpu_cpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
                  std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k_cpu, std::vector<layer_data>& v_cpu,
                  size_t at) {
  size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
  size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
  for (size_t layer = 0; layer < test_cache_info.hidden_layer_count(); layer++) {
    for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
      {
        // Stage the GPU key tensor on CPU, then memcpy its bytes into this
        // GPU's slice of the CPU-side key block.
        auto kt = kcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
        void* src = kt.data_ptr();
        void* dst = offset_by_bytes(k_cpu[layer][at], gpu_idx * element_size_per_gpu);
        memcpy(dst, src, element_size_per_gpu);
      }
      {
        // Same for the value tensor.
        auto vt = vcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
        void* src = vt.data_ptr();
        void* dst = offset_by_bytes(v_cpu[layer][at], gpu_idx * element_size_per_gpu);
        memcpy(dst, src, element_size_per_gpu);
      }
    }
  }
}
// Inverse of copy_gpu_cpu: copy CPU blocks k_cpu/v_cpu at index `at` into GPU
// block `block_idx[at]` of every layer, splitting each block's bytes evenly
// across the GPUs.
void copy_cpu_gpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
                  std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k_cpu, std::vector<layer_data>& v_cpu,
                  size_t at) {
  size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
  size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
  for (size_t layer = 0; layer < test_cache_info.hidden_layer_count(); layer++) {
    for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
      {
        // Stage on CPU (.to(kCPU) yields a copy), overwrite the staged bytes,
        // then copy_ writes them back into the GPU tensor.
        auto kt = kcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
        void* dst = kt.data_ptr();
        void* src = offset_by_bytes(k_cpu[layer][at], gpu_idx * element_size_per_gpu);
        memcpy(dst, src, element_size_per_gpu);
        kcache[gpu_idx][layer][block_idx[at]].copy_(kt);
      }
      {
        auto vt = vcache[gpu_idx][layer][block_idx[at]].to(torch::kCPU);
        void* dst = vt.data_ptr();
        void* src = offset_by_bytes(v_cpu[layer][at], gpu_idx * element_size_per_gpu);
        memcpy(dst, src, element_size_per_gpu);
        vcache[gpu_idx][layer][block_idx[at]].copy_(vt);
      }
    }
  }
}
// Read the first `num_blocks` GPU blocks back to CPU buffers and compare
// them with the reference kvcaches k1/v1.
void cmp_handle_gpu(std::vector<size_t>& block_idx, std::vector<torch::Tensor>& kcache,
                    std::vector<torch::Tensor>& vcache, std::vector<layer_data>& k1, std::vector<layer_data>& v1,
                    size_t num_blocks) {
  auto k_from_gpu = empty_kvcache(num_blocks);
  auto v_from_gpu = empty_kvcache(num_blocks);
  // std::min guards against a handle that maps fewer blocks than requested.
  for (size_t j = 0; j < std::min(block_idx.size(), num_blocks); j++) {
    copy_gpu_cpu(block_idx, kcache, vcache, k_from_gpu, v_from_gpu, j);
  }
  cmp_handle_data(k1, k_from_gpu, num_blocks);
  cmp_handle_data(v1, v_from_gpu, num_blocks);
}
#include <future>
#include "common.hpp"
// Writer side of the flush-back pair: with only 12 GPU pages, two 10-page
// rounds cannot both stay resident, so pages must be flushed back; save()
// persists everything for check-flush-back to verify.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  config.gpu_cache_config->total_kvcache_pages = 12;  // deliberately smaller than 2 rounds x 10 pages
  auto kvc2 = kvc2::create_kvc2(config);
  // #pragma omp parallel for
  for (size_t ti = 0; ti < 2; ti++) {
    SPDLOG_WARN("Test {}",ti);
    auto [kcache, vcache] = kvc2->get_kvcache();
    std::mt19937 gen(ti + 123);  // seeds must match check-flush-back, which re-derives this data
    size_t total_page = 10;
    TokenLength total_length = total_page * config.num_token_per_page;
    auto tokens = random_ids(total_length, gen);
    TokenLength prompt_length = 3 * config.num_token_per_page;
    auto k1 = random_kvcache(total_page, gen);
    auto v1 = random_kvcache(total_page, gen);
    // Lookup with only the prompt prefix, fill the remaining GPU blocks from
    // the CPU reference, then append the full token sequence.
    {
      std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
      kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), prompt_length, total_length,
                                [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
      auto fut = p.get_future();
      fut.wait();
      auto h = fut.get();
      // The matched length must fall on a page boundary.
      assert(h->matched_length() % config.num_token_per_page == 0);
      size_t matched_block = h->matched_length() / config.num_token_per_page;
      auto block_idx = h->get_gpu_block_idx();
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
      for (size_t at = matched_block; at < block_idx.size(); at++) {
        copy_cpu_gpu(block_idx, kcache, vcache, k1, v1, at);
      }
      h->append_tokens(tokens.data(), total_length);
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, total_page);
    }
    // Second lookup: the appended sequence must now fully match.
    {
      std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
      kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, tokens.data(), total_length, total_length,
                                [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
      auto fut = p.get_future();
      fut.wait();
      auto h = fut.get();
      assert(h->matched_length() == total_length);
      size_t matched_block = h->matched_length() / config.num_token_per_page;
      auto block_idx = h->get_gpu_block_idx();
      cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, matched_block);
    }
  }
  kvc2->save();  // persist so check-flush-back can verify the data from disk
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:29:45
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:56:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include <future>
#include "common.hpp"
int main(int argc, char* argv[]) {
  // Test driver for lookup_alt_to_gpu_async: inserts ten independent
  // one-page sequences, then checks chunk-wise match statuses and that CPU
  // and GPU copies of matched pages agree.
  init(argc, argv);
  spdlog::set_level(spdlog::level::trace);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  std::vector<std::vector<Token>> ids;
  std::vector<std::vector<layer_data>> k, v;
  // Insert ten single-page sequences with independent random K/V contents.
  for (size_t i = 0; i < 10; i++) {
    ids.push_back(random_ids(1 * config.num_token_per_page, gen));
    k.push_back(random_kvcache(1, gen));
    v.push_back(random_kvcache(1, gen));
    kvc2->raw_insert(test_model_name, test_quant_type, ids[i].data(), ids[i].size(), k[i], v[i]);
  }
  kvc2->debug();
  {
    // all match
    // Look up exactly the ten inserted chunks; every page should hit.
    std::vector<Token*> chunks;
    std::vector<TokenLength> lengths;
    for (size_t i = 0; i < 10; i++) {
      chunks.push_back(ids[i].data());
      lengths.push_back(ids[i].size());
    }
    // Bridge the async callback to synchronous flow via promise/future.
    std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
    kvc2->lookup_alt_to_gpu_async(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page,
                                  [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
    auto fut = p.get_future();
    fut.wait();
    auto h = fut.get();
    // CPU-side handle data must reproduce each inserted page.
    auto hk = h->handle_data(true);
    auto hv = h->handle_data(false);
    for (size_t i = 0; i < 10; i++) {
      cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
      cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
    }
    // The same pages must also have been uploaded to their GPU blocks.
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    for (size_t i = 0; i < 10; i++) {
      std::vector<size_t> blocks = {block_idx[i]};
      cmp_handle_gpu(blocks, kcache, vcache, k[i], v[i], 1);
    }
  }
  {
    // no match in the middle
    // Replace chunks 1, 5 and 6 with fresh random tokens so they cannot hit
    // the cache, and verify the per-chunk match statuses.
    std::vector<Token*> chunks;
    std::vector<TokenLength> lengths;
    std::vector<std::vector<Token>> new_ids;
    for (size_t i = 0; i < 10; i++) {
      new_ids.push_back(random_ids(1 * config.num_token_per_page, gen));
    }
    for (size_t i = 0; i < 10; i++) {
      if (i == 1 || i == 5 || i == 6) {
        chunks.push_back(new_ids[i].data());
      } else {
        chunks.push_back(ids[i].data());
      }
      lengths.push_back(ids[i].size());
    }
    std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
    kvc2->lookup_alt_to_gpu_async(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page,
                                  [&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
    auto fut = p.get_future();
    fut.wait();
    auto h = fut.get();
    // Expected pattern (per the assertions below): chunk 0 is Exact, the
    // first replaced chunk (1) is NotMatchExact, 5/6 are NotMatchPartial,
    // the remaining matched chunks are Partial.
    auto statuses = h->matched_status();
    for (size_t i = 0; i < 10; i++) {
      if (i == 1) {
        assert(statuses[i] == MatchStatus::NotMatchExact);
      } else if (i == 5 || i == 6) {
        assert(statuses[i] == MatchStatus::NotMatchPartial);
      } else if (i == 0) {
        assert(statuses[i] == MatchStatus::Exact);
      } else {
        assert(statuses[i] == MatchStatus::Partial);
      }
    }
    // Data checks are only meaningful for chunks that were not replaced.
    auto hk = h->handle_data(true);
    auto hv = h->handle_data(false);
    for (size_t i = 0; i < 10; i++) {
      if (i == 1 || i == 5 || i == 6) {
      } else {
        cmp_handle_data(slice(hk, i, i + 1), k[i], 1);
        cmp_handle_data(slice(hv, i, i + 1), v[i], 1);
      }
    }
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    for (size_t i = 0; i < 10; i++) {
      if (i == 1 || i == 5 || i == 6) {
      } else {
        std::vector<size_t> blocks = {block_idx[i]};
        cmp_handle_gpu(blocks, kcache, vcache, k[i], v[i], 1);
      }
    }
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:29:45
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:56:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
int main(int argc, char* argv[]) {
  // Synchronous lookup_alt test: insert ten one-page sequences, then check
  // full-match data and per-chunk statuses when some chunks are replaced.
  init(argc, argv);
  spdlog::set_level(spdlog::level::trace);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 rng(123);
  std::vector<std::vector<Token>> ids;
  std::vector<std::vector<layer_data>> k, v;
  for (size_t seq = 0; seq < 10; seq++) {
    ids.push_back(random_ids(1 * config.num_token_per_page, rng));
    k.push_back(random_kvcache(1, rng));
    v.push_back(random_kvcache(1, rng));
    kvc2->raw_insert(test_model_name, test_quant_type, ids[seq].data(), ids[seq].size(), k[seq], v[seq]);
  }
  kvc2->debug();
  {
    // all match
    std::vector<Token*> chunks;
    std::vector<TokenLength> lengths;
    for (size_t seq = 0; seq < 10; seq++) {
      chunks.push_back(ids[seq].data());
      lengths.push_back(ids[seq].size());
    }
    auto h = kvc2->lookup_alt(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page);
    auto hk = h->handle_data(true);
    auto hv = h->handle_data(false);
    // Every chunk was inserted verbatim, so every page must round-trip.
    for (size_t seq = 0; seq < 10; seq++) {
      cmp_handle_data(slice(hk, seq, seq + 1), k[seq], 1);
      cmp_handle_data(slice(hv, seq, seq + 1), v[seq], 1);
    }
  }
  {
    // no match in the middle
    // Chunks 1, 5 and 6 are swapped for fresh random tokens.
    auto replaced = [](size_t seq) { return seq == 1 || seq == 5 || seq == 6; };
    std::vector<Token*> chunks;
    std::vector<TokenLength> lengths;
    std::vector<std::vector<Token>> new_ids;
    for (size_t seq = 0; seq < 10; seq++) {
      new_ids.push_back(random_ids(1 * config.num_token_per_page, rng));
    }
    for (size_t seq = 0; seq < 10; seq++) {
      chunks.push_back(replaced(seq) ? new_ids[seq].data() : ids[seq].data());
      lengths.push_back(ids[seq].size());
    }
    auto h = kvc2->lookup_alt(test_model_name, test_quant_type, chunks, lengths, 15 * config.num_token_per_page);
    auto statuses = h->matched_status();
    // Chunk 0: Exact; first replaced chunk: NotMatchExact; 5/6: NotMatchPartial;
    // every other matched chunk: Partial.
    for (size_t seq = 0; seq < 10; seq++) {
      if (seq == 1) {
        assert(statuses[seq] == MatchStatus::NotMatchExact);
      } else if (seq == 5 || seq == 6) {
        assert(statuses[seq] == MatchStatus::NotMatchPartial);
      } else if (seq == 0) {
        assert(statuses[seq] == MatchStatus::Exact);
      } else {
        assert(statuses[seq] == MatchStatus::Partial);
      }
    }
    auto hk = h->handle_data(true);
    auto hv = h->handle_data(false);
    // Only the untouched chunks carry meaningful data to compare.
    for (size_t seq = 0; seq < 10; seq++) {
      if (!replaced(seq)) {
        cmp_handle_data(slice(hk, seq, seq + 1), k[seq], 1);
        cmp_handle_data(slice(hv, seq, seq + 1), v[seq], 1);
      }
    }
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 07:51:09
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include <future>
#include "common.hpp"
int main(int argc, char* argv[]) {
init(argc, argv);
spdlog::set_level(spdlog::level::debug);
auto kvc2 = kvc2::create_kvc2(config);
std::mt19937 gen(123);
auto ids1 = random_ids(10 * config.num_token_per_page, gen);
auto k1 = random_kvcache(10, gen);
auto v1 = random_kvcache(10, gen);
kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
// complete same
#pragma omp parallel for
for (size_t ti = 0; ti < 3; ti++) {
std::promise<std::shared_ptr<DoubleCacheHandleInterface>> p;
kvc2->lookup_to_gpu_async(test_model_name, test_quant_type, ids1.data(), ids1.size(),
ids1.size() + 2 * config.num_token_per_page,
[&p](std::shared_ptr<DoubleCacheHandleInterface> h) { p.set_value(h); });
auto fut = p.get_future();
fut.wait();
auto h = fut.get();
auto k = h->handle_data(true);
auto v = h->handle_data(false);
cmp_handle_data(k1, k, 10);
cmp_handle_data(v1, v, 10);
auto block_idx = h->get_gpu_block_idx();
auto [kcache, vcache] = kvc2->get_kvcache();
cmp_handle_gpu(block_idx, kcache, vcache, k1, v1, 10);
}
SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 07:51:09
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
int main(int argc, char* argv[]) {
  // Variant with the V cache disabled: only K pages are stored and checked.
  // Config must be adjusted before init/create so the flags take effect.
  qw25_7B_gpu_config.v_cache_on = false;
  config.gpu_cache_config = qw25_7B_gpu_config;
  config.v_cache_on = false;
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 rng(123);
  auto ids1 = random_ids(10 * config.num_token_per_page, rng);
  auto k1 = random_kvcache(10, rng);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, {});
  // complete same
#pragma omp parallel for
  for (size_t ti = 0; ti < 3; ti++) {
    auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                                 ids1.size() + 2 * config.num_token_per_page);
    auto k = h->handle_data(true);
    cmp_handle_data(k1, k, 10);
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(15);
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t per_gpu_bytes = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    // Reassemble each page on the CPU by concatenating the per-GPU slices.
    for (size_t layer = 0; layer < k_from_gpu.size(); layer++) {
      for (size_t page = 0; page < block_idx.size(); page++) {
        size_t b_idx = block_idx[page];
        for (size_t dev = 0; dev < gpu_count; dev++) {
          auto kt = kcache[dev][layer][b_idx].to(torch::kCPU);
          void* dst = offset_by_bytes(k_from_gpu[layer][page], dev * per_gpu_bytes);
          memcpy(dst, kt.data_ptr(), per_gpu_bytes);
        }
      }
    }
    cmp_handle_data(k1, k_from_gpu, 10);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 07:51:09
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
int main(int argc, char* argv[]) {
  // Concurrency test (synchronous API): three OMP threads look up the same
  // sequence and verify CPU data plus a manual GPU read-back of K and V.
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 rng(123);
  auto ids1 = random_ids(10 * config.num_token_per_page, rng);
  auto k1 = random_kvcache(10, rng);
  auto v1 = random_kvcache(10, rng);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
  // complete same
#pragma omp parallel for
  for (size_t ti = 0; ti < 3; ti++) {
    auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                                 ids1.size() + 2 * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 10);
    cmp_handle_data(v1, v, 10);
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(15);
    auto v_from_gpu = empty_kvcache(15);
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t per_gpu_bytes = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    // Copy one (layer, page) slice from one GPU into its slot of the CPU buffer.
    auto fetch = [&](const auto& cache, auto& dst, size_t layer, size_t page, size_t dev) {
      size_t b_idx = block_idx[page];
      auto t = cache[dev][layer][b_idx].to(torch::kCPU);
      memcpy(offset_by_bytes(dst[layer][page], dev * per_gpu_bytes), t.data_ptr(), per_gpu_bytes);
    };
    for (size_t layer = 0; layer < k_from_gpu.size(); layer++) {
      for (size_t page = 0; page < block_idx.size(); page++) {
        for (size_t dev = 0; dev < gpu_count; dev++) {
          fetch(kcache, k_from_gpu, layer, page, dev);
          fetch(vcache, v_from_gpu, layer, page, dev);
        }
      }
    }
    cmp_handle_data(k1, k_from_gpu, 10);
    cmp_handle_data(v1, v_from_gpu, 10);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 08:38:33
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
int main(int argc, char* argv[]) {
  // lookup_to_gpu test: full-match verification followed by a prefix lookup
  // that forces the GPU pool to evict.
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  auto ids1 = random_ids(10 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(10, gen);
  auto v1 = random_kvcache(10, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);

  // Copies `pages` GPU cache pages (GPU block ids in `block_idx`) back into
  // the per-layer CPU buffers of `dst`, laying the per-GPU slices of each
  // page side by side. Shared by both verification sections below, which
  // previously duplicated this triple loop.
  auto read_back = [&](const auto& cache, auto& dst, const auto& block_idx, size_t pages) {
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    for (size_t layer = 0; layer < dst.size(); layer++) {
      for (size_t page = 0; page < pages; page++) {
        size_t b_idx = block_idx[page];
        for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
          auto t = cache[gpu_idx][layer][b_idx].to(torch::kCPU);
          void* dst_ptr = offset_by_bytes(dst[layer][page], gpu_idx * element_size_per_gpu);
          memcpy(dst_ptr, t.data_ptr(), element_size_per_gpu);
        }
      }
    }
  };

  // complete same: the whole inserted sequence must match and be uploaded.
  {
    auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                                 ids1.size() + 5 * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 10);
    cmp_handle_data(v1, v, 10);
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(15);
    auto v_from_gpu = empty_kvcache(15);
    read_back(kcache, k_from_gpu, block_idx, block_idx.size());
    read_back(vcache, v_from_gpu, block_idx, block_idx.size());
    cmp_handle_data(k1, k_from_gpu, 10);
    cmp_handle_data(v1, v_from_gpu, 10);
  }
  // prefix and evict: a 3-page prefix lookup with an estimate spanning the
  // whole pool, so previously resident pages must be evicted.
  {
    auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), config.num_token_per_page * 3,
                                 config.gpu_cache_config->total_kvcache_pages * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 3);
    cmp_handle_data(v1, v, 3);
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(3);
    auto v_from_gpu = empty_kvcache(3);
    read_back(kcache, k_from_gpu, block_idx, 3);
    read_back(vcache, v_from_gpu, block_idx, 3);
    cmp_handle_data(k1, k_from_gpu, 3);
    cmp_handle_data(v1, v_from_gpu, 3);
  }
  // // complete prefix
  // {
  //   std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
  //                         ids2.size() + 3 * config.num_token_per_page);
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k1, k, 3);
  //   cmp_handle_data(v1, v, 3);
  // }
  // // common prefix
  // {
  //   std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
  //   auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
  //   ids2.insert(ids2.end(), rids.begin(), rids.end());
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k1, k, 3);
  //   cmp_handle_data(v1, v, 3);
  // }
  // // no prefix
  // {
  //   std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
  //   assert(h->matched_length() == 0);
  // }
  // // insert partly new
  // auto k2 = random_kvcache(10, gen);
  // auto v2 = random_kvcache(10, gen);
  // copy_kvcache(k1, k2, 0, 5);
  // copy_kvcache(v1, v2, 0, 5);
  // auto ids2 = random_ids(10 * config.num_token_per_page, gen);
  // for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
  //   ids2[i] = ids1[i];
  // }
  // kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
  // // read new part
  // {
  //   std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
  //                         ids.size() + 7 * config.num_token_per_page);
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k, k2, 7);
  //   cmp_handle_data(v, v2, 7);
  // }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment