Unverified Commit a41d2163 authored by wang jiahao's avatar wang jiahao Committed by GitHub
Browse files

Merge pull request #1013 from kvcache-ai/work-concurrent

In version v0.2.4, we’ve added the highly desired multi-concurrency support to the community through a major refactor of the whole architecture.
parents f142f4df 4ed9744e
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 07:51:09
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// Concurrency test: three OpenMP threads simultaneously call lookup_to_gpu on
// the same fully matching token sequence, then verify both the returned handle
// data and the raw on-GPU cache blocks against the originally inserted K/V data.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  // Insert 10 pages of random K/V cache; fixed seed keeps the test reproducible.
  auto ids1 = random_ids(10 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(10, gen);
  auto v1 = random_kvcache(10, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
  // complete same
  #pragma omp parallel for
  for (size_t ti = 0; ti < 3; ti++) {
    auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                                 ids1.size() + 2 * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    // All 10 inserted pages must match through the handle view.
    cmp_handle_data(k1, k, 10);
    cmp_handle_data(v1, v, 10);
    // Copy the matched blocks back from every GPU into host buffers so the
    // on-GPU contents themselves are checked, not just the handle pointers.
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(15);
    auto v_from_gpu = empty_kvcache(15);
    // Each page is striped across all GPUs; element_size_per_gpu is one GPU's share.
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    for (size_t i = 0; i < k_from_gpu.size(); i++) {
      for (size_t j = 0; j < block_idx.size(); j++) {
        size_t b_idx = block_idx[j];
        for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
          {
            auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
            void* src = kt.data_ptr();
            void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
          {
            auto vt = vcache[gpu_idx][i][b_idx].to(torch::kCPU);
            void* src = vt.data_ptr();
            void* dst = offset_by_bytes(v_from_gpu[i][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
        }
      }
    }
    // Only the first 10 pages were inserted; compare that prefix.
    cmp_handle_data(k1, k_from_gpu, 10);
    cmp_handle_data(v1, v_from_gpu, 10);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 09:52:48
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-25 08:38:33
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// lookup_to_gpu tests: (1) exact-match lookup of a fully inserted sequence,
// (2) a prefix lookup that requests the entire GPU cache capacity, forcing
// page eviction. Both paths verify handle data AND the raw GPU block contents.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  // Insert 10 pages of deterministic random K/V data.
  auto ids1 = random_ids(10 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(10, gen);
  auto v1 = random_kvcache(10, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
  // complete same
  {
    auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                                 ids1.size() + 5 * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 10);
    cmp_handle_data(v1, v, 10);
    // Read the matched pages back from every GPU to verify on-device contents.
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(15);
    auto v_from_gpu = empty_kvcache(15);
    // Pages are striped across GPUs; each GPU holds element_size_per_gpu bytes per page.
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    for (size_t i = 0; i < k_from_gpu.size(); i++) {
      for (size_t j = 0; j < block_idx.size(); j++) {
        size_t b_idx = block_idx[j];
        for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
          {
            auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
            void* src = kt.data_ptr();
            void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
          {
            auto vt = vcache[gpu_idx][i][b_idx].to(torch::kCPU);
            void* src = vt.data_ptr();
            void* dst = offset_by_bytes(v_from_gpu[i][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
        }
      }
    }
    cmp_handle_data(k1, k_from_gpu, 10);
    cmp_handle_data(v1, v_from_gpu, 10);
  }
  // prefix and evict: look up only 3 pages but ask for the whole GPU cache,
  // which must evict previously resident pages to satisfy the estimate.
  {
    auto h = kvc2->lookup_to_gpu(test_model_name, test_quant_type, ids1.data(), config.num_token_per_page * 3,
                                 config.gpu_cache_config->total_kvcache_pages * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 3);
    cmp_handle_data(v1, v, 3);
    auto block_idx = h->get_gpu_block_idx();
    auto [kcache, vcache] = kvc2->get_kvcache();
    auto k_from_gpu = empty_kvcache(3);
    auto v_from_gpu = empty_kvcache(3);
    size_t gpu_count = config.gpu_cache_config->gpu_devices_id.size();
    size_t element_size_per_gpu = test_cache_info.element_size(config.num_token_per_page) / gpu_count;
    for (size_t i = 0; i < k_from_gpu.size(); i++) {
      // Only the 3 matched pages are validated here.
      for (size_t j = 0; j < 3; j++) {
        size_t b_idx = block_idx[j];
        for (size_t gpu_idx = 0; gpu_idx < gpu_count; gpu_idx++) {
          {
            auto kt = kcache[gpu_idx][i][b_idx].to(torch::kCPU);
            void* src = kt.data_ptr();
            void* dst = offset_by_bytes(k_from_gpu[i][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
          {
            auto vt = vcache[gpu_idx][i][b_idx].to(torch::kCPU);
            void* src = vt.data_ptr();
            void* dst = offset_by_bytes(v_from_gpu[i][j], gpu_idx * element_size_per_gpu);
            memcpy(dst, src, element_size_per_gpu);
          }
        }
      }
    }
    cmp_handle_data(k1, k_from_gpu, 3);
    cmp_handle_data(v1, v_from_gpu, 3);
  }
  // The scenarios below are disabled for this binary; kept for reference.
  // // complete prefix
  // {
  //   std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
  //                         ids2.size() + 3 * config.num_token_per_page);
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k1, k, 3);
  //   cmp_handle_data(v1, v, 3);
  // }
  // // common prefix
  // {
  //   std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
  //   auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
  //   ids2.insert(ids2.end(), rids.begin(), rids.end());
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k1, k, 3);
  //   cmp_handle_data(v1, v, 3);
  // }
  // // no prefix
  // {
  //   std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
  //   assert(h->matched_length() == 0);
  // }
  // // insert partly new
  // auto k2 = random_kvcache(10, gen);
  // auto v2 = random_kvcache(10, gen);
  // copy_kvcache(k1, k2, 0, 5);
  // copy_kvcache(v1, v2, 0, 5);
  // auto ids2 = random_ids(10 * config.num_token_per_page, gen);
  // for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
  //   ids2[i] = ids1[i];
  // }
  // kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
  // // read new part
  // {
  //   std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
  //                         ids.size() + 7 * config.num_token_per_page);
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k, k2, 7);
  //   cmp_handle_data(v, v2, 7);
  // }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:48:40
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:53:06
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
/// Run `f` concurrently on 10 threads and block until all of them finish.
/// @param f  Callable with no arguments; copied once per thread.
template <typename F>
void test_multi(F f) {
  std::vector<std::thread> threads;
  threads.reserve(10);
  for (size_t i = 0; i < 10; i++) {
    // Bug fix: push_back(lambda) does not compile because std::thread's
    // converting constructor is explicit; emplace_back constructs the
    // thread in place from the callable.
    threads.emplace_back(f);
  }
  for (auto& t : threads) {
    t.join();
  }
}
// Concurrent lookup test: 10 OpenMP threads look up the same fully inserted
// 3-page sequence at once. Under contention lookup may return nullptr, which
// is tolerated (logged) rather than treated as failure.
// NOTE(review): test_multi above is defined but not used here — the OpenMP
// pragma drives the concurrency instead; confirm whether test_multi is dead code.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  // Insert 3 pages of deterministic random K/V data.
  auto ids1 = random_ids(3 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(3, gen);
  auto v1 = random_kvcache(3, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
  // complete same
  {
  #pragma omp parallel for
    for (size_t i = 0; i < 10; i++) {
      auto h = kvc2->lookup(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                            ids1.size() + 10 * config.num_token_per_page);
      if (h == nullptr) {
        // Acceptable under concurrency: resources may be temporarily exhausted.
        SPDLOG_WARN("Thread[{}]: h is nullptr", i);
      } else {
        auto k = h->handle_data(true);
        auto v = h->handle_data(false);
        cmp_handle_data(k1, k, 3);
        cmp_handle_data(v1, v, 3);
      }
    }
  }
  // The scenarios below are disabled for this binary; kept for reference.
  // // complete prefix
  // {
  //   std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size() + 3 *
  //   config.num_token_per_page); auto k = h->handle_data(true); auto v = h->handle_data(false); cmp_handle_data(k1,
  //   k, 3); cmp_handle_data(v1, v, 3);
  // }
  // // common prefix
  // {
  //   std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
  //   auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
  //   ids2.insert(ids2.end(), rids.begin(), rids.end());
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
  //   auto k = h->handle_data(true);
  //   auto v = h->handle_data(false);
  //   cmp_handle_data(k1, k, 3);
  //   cmp_handle_data(v1, v, 3);
  // }
  // // no prefix
  // {
  //   std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
  //   assert(h->matched_length() == 0);
  // }
  // // insert partly new
  // auto k2 = random_kvcache(10, gen);
  // auto v2 = random_kvcache(10, gen);
  // copy_kvcache(k1, k2, 0, 5);
  // copy_kvcache(v1, v2, 0, 5);
  // auto ids2 = random_ids(10 * config.num_token_per_page, gen);
  // for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
  //   ids2[i] = ids1[i];
  // }
  // kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
  // // read new part
  // {
  //   std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
  //   auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(), ids.size() + 7 *
  //   config.num_token_per_page); auto k = h->handle_data(true); auto v = h->handle_data(false); cmp_handle_data(k,
  //   k2, 7); cmp_handle_data(v, v2, 7);
  // }
  kvc2->debug();
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:29:45
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:56:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// Key-cache-only test: disables the value cache (v_cache_on = false) and runs
// the standard lookup scenarios — exact match, complete prefix, common prefix,
// no prefix, and partial re-insert — checking only the K side.
int main(int argc, char* argv[]) {
  // Must be configured before init(): turn off the value cache globally.
  qw25_7B_gpu_config.v_cache_on = false;
  config.gpu_cache_config = qw25_7B_gpu_config;
  config.v_cache_on = false;
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  // Insert 10 pages of key cache; V side passed as empty.
  auto ids1 = random_ids(10 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(10, gen);
  // auto v1 = random_kvcache(10, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, {});
  // complete same
  {
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                          ids1.size() + 10 * config.num_token_per_page);
    auto k = h->handle_data(true);
    cmp_handle_data(k1, k, 10);
  }
  // complete prefix
  {
    std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
                          ids2.size() + 3 * config.num_token_per_page);
    auto k = h->handle_data(true);
    cmp_handle_data(k1, k, 3);
  }
  // common prefix: 3 matching pages followed by random tokens.
  {
    std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
    auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
    ids2.insert(ids2.end(), rids.begin(), rids.end());
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
    auto k = h->handle_data(true);
    cmp_handle_data(k1, k, 3);
  }
  // no prefix: entirely random ids must match nothing.
  {
    std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
    assert(h->matched_length() == 0);
  }
  // insert partly new: first 5 pages shared with ids1, last 5 pages fresh.
  auto k2 = random_kvcache(10, gen);
  copy_kvcache(k1, k2, 0, 5);
  auto ids2 = random_ids(10 * config.num_token_per_page, gen);
  for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
    ids2[i] = ids1[i];
  }
  kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, {});
  // read new part
  {
    std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
                          ids.size() + 7 * config.num_token_per_page);
    auto k = h->handle_data(true);
    cmp_handle_data(k, k2, 7);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 08:29:45
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 09:56:12
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// Full K+V lookup test: exact match, complete prefix, common prefix, no
// prefix, and partial re-insert, validating both key and value caches.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  // Insert 10 pages of deterministic random K/V data.
  auto ids1 = random_ids(10 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(10, gen);
  auto v1 = random_kvcache(10, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
  // complete same
  {
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids1.data(), ids1.size(),
                          ids1.size() + 10 * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 10);
    cmp_handle_data(v1, v, 10);
  }
  // complete prefix
  {
    std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(),
                          ids2.size() + 3 * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 3);
    cmp_handle_data(v1, v, 3);
  }
  // common prefix: 3 matching pages followed by random tokens.
  {
    std::vector<Token> ids2(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
    auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
    ids2.insert(ids2.end(), rids.begin(), rids.end());
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k1, k, 3);
    cmp_handle_data(v1, v, 3);
  }
  // no prefix: entirely random ids must match nothing.
  {
    std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids2.data(), ids2.size(), ids2.size());
    assert(h->matched_length() == 0);
  }
  // insert partly new: first 5 pages shared with ids1, last 5 pages fresh.
  auto k2 = random_kvcache(10, gen);
  auto v2 = random_kvcache(10, gen);
  copy_kvcache(k1, k2, 0, 5);
  copy_kvcache(v1, v2, 0, 5);
  auto ids2 = random_ids(10 * config.num_token_per_page, gen);
  for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
    ids2[i] = ids1[i];
  }
  kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
  // read new part
  {
    std::vector<Token> ids(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
    auto h = kvc2->lookup(test_model_name, test_quant_type, ids.data(), ids.size(),
                          ids.size() + 7 * config.num_token_per_page);
    auto k = h->handle_data(true);
    auto v = h->handle_data(false);
    cmp_handle_data(k, k2, 7);
    cmp_handle_data(v, v2, 7);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-11-22 06:00:16
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-11-22 07:30:46
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#include "common.hpp"
// raw_read test: inserts K/V data, then reads it back through raw_read and
// checks the reported matched length for exact match, complete prefix,
// common prefix, no prefix, and a partial re-insert.
int main(int argc, char* argv[]) {
  init(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  auto kvc2 = kvc2::create_kvc2(config);
  std::mt19937 gen(123);
  // Insert 10 pages of deterministic random K/V data.
  auto ids1 = random_ids(10 * config.num_token_per_page, gen);
  auto k1 = random_kvcache(10, gen);
  auto v1 = random_kvcache(10, gen);
  kvc2->raw_insert(test_model_name, test_quant_type, ids1.data(), ids1.size(), k1, v1);
  // complete same: all tokens must be matched and data identical.
  {
    auto k2 = empty_kvcache(10);
    auto v2 = empty_kvcache(10);
    auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids1.data(), ids1.size(), k2, v2);
    assert(l2 == ids1.size());
    cmp_handle_data(k1, k2);
    cmp_handle_data(v1, v2);
  }
  // complete prefix: only the first 3 pages are requested and must match.
  {
    auto k2 = empty_kvcache(10);
    auto v2 = empty_kvcache(10);
    std::vector<Token> ids2 = std::vector<Token>(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
    auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
    assert(l2 == 3 * config.num_token_per_page);
    cmp_handle_data(k1, k2, 3);
    cmp_handle_data(v1, v2, 3);
  }
  // common prefix: 3 matching pages then random tokens; match stops at 3 pages.
  {
    auto k2 = empty_kvcache(10);
    auto v2 = empty_kvcache(10);
    std::vector<Token> ids2 = std::vector<Token>(ids1.begin(), ids1.begin() + 3 * config.num_token_per_page);
    auto rids = random_ids(config.num_token_per_page * 2 + config.num_token_per_page / 2, gen);
    ids2.insert(ids2.end(), rids.begin(), rids.end());
    auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
    assert(l2 == 3 * config.num_token_per_page);
    cmp_handle_data(k1, k2, 3);
    cmp_handle_data(v1, v2, 3);
  }
  // no prefix: random ids match zero tokens.
  {
    auto k2 = empty_kvcache(1);
    auto v2 = empty_kvcache(1);
    std::vector<Token> ids2 = random_ids(config.num_token_per_page, gen);
    auto l2 = kvc2->raw_read(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
    assert(l2 == 0);
  }
  // insert partly new: first 5 pages shared with ids1, last 5 pages fresh.
  auto k2 = random_kvcache(10, gen);
  auto v2 = random_kvcache(10, gen);
  copy_kvcache(k1, k2, 0, 5);
  copy_kvcache(v1, v2, 0, 5);
  auto ids2 = random_ids(10 * config.num_token_per_page, gen);
  for (size_t i = 0; i < 5 * config.num_token_per_page; i++) {
    ids2[i] = ids1[i];
  }
  kvc2->raw_insert(test_model_name, test_quant_type, ids2.data(), ids2.size(), k2, v2);
  // read new part: 7 pages of the new sequence must all be readable.
  {
    auto k = empty_kvcache(10);
    auto v = empty_kvcache(10);
    std::vector<Token> ids = std::vector<Token>(ids2.begin(), ids2.begin() + 7 * config.num_token_per_page);
    auto l = kvc2->raw_read(test_model_name, test_quant_type, ids.data(), ids.size(), k, v);
    assert(l == 7 * config.num_token_per_page);
    cmp_handle_data(k, k2, 7);
    cmp_handle_data(v, v2, 7);
  }
  SPDLOG_CRITICAL("All Test Passed: {}", argv[0]);
  return 0;
}
#include "kvcache_test_utils.cpp"
// Handle-based API test (older KVC2 interface): raw_insert/raw_read with
// KVCacheHandle objects, checking match_length for the standard scenarios.
int main(int argc, char* argv[]) {
  parse_and_check(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  std::mt19937 gen(123);
  KVC2 kvc2(FLAGS_disk_cache_path);
  // auto io = kvc2.io_dealer->start_io_thread();
  // IO thread is detached here, so it cannot be joined on shutdown; stop()
  // below is what terminates it.
  kvc2.io_dealer->start_io_thread().detach();
  auto h1 = random_kvcache(qwen_cache_info, 10, gen);
  h1.ids = random_ids(10 * BlockLength, gen);
  kvc2.raw_insert(h1);
  // complete same
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = h1.ids;
    kvc2.raw_read(h2);
    assert(static_cast<size_t>(h2.match.match_length) == h1.ids.size());
    cmp_handle_data(h1, h2);
  }
  // complete prefix
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 3 * BlockLength);
    kvc2.raw_read(h2);
    assert(h2.match.match_length == 3 * BlockLength);
    cmp_handle_data(h1, h2, 3);
  }
  // common prefix: 5 matching blocks then random ids; match stops at 5.
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 5 * BlockLength);
    auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
    h2.ids.insert(h2.ids.end(), rids.begin(), rids.end());
    kvc2.raw_read(h2);
    assert(h2.match.match_length == 5 * BlockLength);
    cmp_handle_data(h1, h2, 5);
  }
  // no prefix
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = random_ids(10 * BlockLength, gen);
    kvc2.raw_read(h2);
    assert(h2.match.match_length == 0);
  }
  // insert partly new: first 5 blocks shared with h1, rest fresh.
  auto h2 = random_kvcache(qwen_cache_info, 10, gen);
  copy_kvcache(h1, h2, 0, 5);
  h2.ids = random_ids(10 * BlockLength, gen);
  for (size_t i = 0; i < 5 * BlockLength; i++) {
    h2.ids[i] = h1.ids[i];
  }
  kvc2.raw_insert(h2);
  // read new part: 7 blocks plus one unmatched trailing id.
  {
    auto h = empty_kvcache(qwen_cache_info, 10);
    h.ids = std::vector<ID>(h2.ids.begin(), h2.ids.begin() + 7 * BlockLength);
    h.ids.push_back(123);
    kvc2.raw_read(h);
    assert(h.match.match_length == 7 * BlockLength);
    cmp_handle_data(h, h2, 7);
  }
  kvc2.tree->debug();
  kvc2.io_dealer->stop();
  // io.join();
  SPDLOG_WARN("{} Test Passed", __FILE__);
  return 0;
}
\ No newline at end of file
#include "kvcache_test_utils.cpp"
// Cache-eviction test: inserts 10 sequences, then (1) looks each up one at a
// time — eviction lets all succeed — and (2) holds handles alive so that once
// capacity is reached, later lookups must fail with nullptr.
int main(int argc, char* argv[]) {
  parse_and_check(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  std::mt19937 gen(123);
  KVC2 kvc2(FLAGS_disk_cache_path);
  auto io = kvc2.io_dealer->start_io_thread();
  SPDLOG_WARN("Insert 10 x 10 KVCache");
  std::vector<KVCacheHandle> handles(10);
  for (int i = 0; i < 10; i++) {
    handles[i] = random_kvcache(qwen_cache_info, 10, gen);
    auto& h1 = handles[i];
    h1.ids = random_ids(10 * BlockLength, gen);
    kvc2.raw_insert(h1);
  }
  SPDLOG_WARN("Cache Eviction Test");
  {
    // Handles are dropped immediately, so eviction can always make room.
    for (int i = 0; i < 10; i++) {
      auto& h = handles[i];
      SPDLOG_WARN("Lookup {}", i);
      auto x = kvc2.lookup(qwen_cache_info, h.ids.data(), h.ids.size());
      cmp_handle_data(h, *x);
    }
    SPDLOG_WARN("Simple Eviction OK");
  }
  {
    // Keep the first lookups alive; after 5 the cache is pinned full and
    // further lookups must fail (nullptr) because nothing can be evicted.
    std::vector<std::shared_ptr<KVCacheHandle>> lookup_handles;
    for (int i = 0; i < 10; i++) {
      auto& h = handles[i];
      SPDLOG_WARN("Lookup {}", i);
      auto x = kvc2.lookup(qwen_cache_info, h.ids.data(), h.ids.size());
      if (i >= 5) {
        assert(x == nullptr);
        continue;
      }
      lookup_handles.push_back(x);
      cmp_handle_data(h, *x);
    }
    SPDLOG_WARN("Cannot Eviction OK");
  }
  kvc2.io_dealer->stop();
  io.join();
  SPDLOG_WARN("{} Test Passed", __FILE__);
  return 0;
}
\ No newline at end of file
#include "kvcache_test_utils.cpp"
// Disk + memory test: exercises raw_insert/raw_read against disk, then the
// in-memory block cache via lookup, dumping block-cache state along the way.
int main(int argc, char* argv[]) {
  parse_and_check(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  std::mt19937 gen(123);
  KVC2 kvc2(FLAGS_disk_cache_path);
  auto io = kvc2.io_dealer->start_io_thread();
  SPDLOG_INFO("Disk Test");
  auto h1 = random_kvcache(qwen_cache_info, 10, gen);
  h1.ids = random_ids(10 * BlockLength, gen);
  kvc2.raw_insert(h1);
  // complete same
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = h1.ids;
    kvc2.raw_read(h2);
    assert(static_cast<size_t>(h2.match.match_length) == h1.ids.size());
    cmp_handle_data(h1, h2);
  }
  // complete prefix
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 3 * BlockLength);
    kvc2.raw_read(h2);
    assert(h2.match.match_length == 3 * BlockLength);
    cmp_handle_data(h1, h2, 3);
  }
  // common prefix: 5 matching blocks then random ids; match stops at 5.
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 5 * BlockLength);
    auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
    h2.ids.insert(h2.ids.end(), rids.begin(), rids.end());
    kvc2.raw_read(h2);
    assert(h2.match.match_length == 5 * BlockLength);
    cmp_handle_data(h1, h2, 5);
  }
  // no prefix
  {
    auto h2 = empty_kvcache(qwen_cache_info, 10);
    h2.ids = random_ids(10 * BlockLength, gen);
    kvc2.raw_read(h2);
    assert(h2.match.match_length == 0);
  }
  // insert partly new: first 5 blocks shared with h1, rest fresh.
  auto h2 = random_kvcache(qwen_cache_info, 10, gen);
  copy_kvcache(h1, h2, 0, 5);
  h2.ids = random_ids(10 * BlockLength, gen);
  for (size_t i = 0; i < 5 * BlockLength; i++) {
    h2.ids[i] = h1.ids[i];
  }
  kvc2.raw_insert(h2);
  // read new part: 7 blocks plus one unmatched trailing id.
  {
    auto h = empty_kvcache(qwen_cache_info, 10);
    h.ids = std::vector<ID>(h2.ids.begin(), h2.ids.begin() + 7 * BlockLength);
    h.ids.push_back(123);
    kvc2.raw_read(h);
    assert(h.match.match_length == 7 * BlockLength);
    cmp_handle_data(h, h2, 7);
  }
  SPDLOG_WARN("Memory Test");
  {
    // Full lookup should hit the in-memory block cache.
    auto h = kvc2.lookup(qwen_cache_info, h1.ids.data(), h1.ids.size());
    assert(h);
    cmp_handle_data(h1, *h);
    kvc2.block_cache->debug();
  }
  // Debug dump after the handle above has been released.
  kvc2.block_cache->debug();
  {
    // Partial (5-block) lookup should also be served from cache.
    auto h = kvc2.lookup(qwen_cache_info, h1.ids.data(), 5 * BlockLength);
    assert(h);
    cmp_handle_data(h1, *h, 5);
    kvc2.block_cache->debug();
  }
  kvc2.block_cache->debug();
  kvc2.io_dealer->stop();
  io.join();
  SPDLOG_WARN("{} Test Passed", __FILE__);
  return 0;
}
\ No newline at end of file
#include "kvcache_test_utils.cpp"
// Persistence test: insert 10 sequences into one KVC2 instance and save();
// then construct a fresh instance from the same path, load(), and verify all
// raw_read scenarios still pass against the reloaded data.
int main(int argc, char* argv[]) {
  parse_and_check(argc, argv);
  spdlog::set_level(spdlog::level::debug);
  std::mt19937 gen(123);
  std::vector<KVCacheHandle> handles(10);
  {
    // First instance: populate and persist.
    KVC2 kvc2(FLAGS_disk_cache_path);
    auto io = kvc2.io_dealer->start_io_thread();
    SPDLOG_WARN("Insert 10 x 10 KVCache");
    for (int i = 0; i < 10; i++) {
      handles[i] = random_kvcache(qwen_cache_info, 10, gen);
      auto& h1 = handles[i];
      h1.ids = random_ids(10 * BlockLength, gen);
      kvc2.raw_insert(h1);
    }
    kvc2.save();
    kvc2.tree->debug();
    kvc2.io_dealer->stop();
    io.join();
  }
  {
    // Second instance: reload from disk and re-run the read scenarios.
    KVC2 kvc2(FLAGS_disk_cache_path);
    auto io = kvc2.io_dealer->start_io_thread();
    kvc2.load();
    kvc2.tree->debug();
    auto& h1 = handles[0];
    // complete same
    {
      auto h2 = empty_kvcache(qwen_cache_info, 10);
      h2.ids = h1.ids;
      kvc2.raw_read(h2);
      assert(static_cast<size_t>(h2.match.match_length) == h1.ids.size());
      cmp_handle_data(h1, h2);
    }
    // complete prefix
    {
      auto h2 = empty_kvcache(qwen_cache_info, 10);
      h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 3 * BlockLength);
      kvc2.raw_read(h2);
      assert(h2.match.match_length == 3 * BlockLength);
      cmp_handle_data(h1, h2, 3);
    }
    // common prefix: 5 matching blocks then random ids; match stops at 5.
    {
      auto h2 = empty_kvcache(qwen_cache_info, 10);
      h2.ids = std::vector<ID>(h1.ids.begin(), h1.ids.begin() + 5 * BlockLength);
      auto rids = random_ids(BlockLength * 2 + BlockLength / 2, gen);
      h2.ids.insert(h2.ids.end(), rids.begin(), rids.end());
      kvc2.raw_read(h2);
      assert(h2.match.match_length == 5 * BlockLength);
      cmp_handle_data(h1, h2, 5);
    }
    // no prefix
    {
      auto h2 = empty_kvcache(qwen_cache_info, 10);
      h2.ids = random_ids(10 * BlockLength, gen);
      kvc2.raw_read(h2);
      assert(h2.match.match_length == 0);
    }
    // insert partly new into the reloaded store.
    auto h2 = random_kvcache(qwen_cache_info, 10, gen);
    copy_kvcache(h1, h2, 0, 5);
    h2.ids = random_ids(10 * BlockLength, gen);
    for (size_t i = 0; i < 5 * BlockLength; i++) {
      h2.ids[i] = h1.ids[i];
    }
    kvc2.raw_insert(h2);
    // read new part: 7 blocks plus one unmatched trailing id.
    {
      auto h = empty_kvcache(qwen_cache_info, 10);
      h.ids = std::vector<ID>(h2.ids.begin(), h2.ids.begin() + 7 * BlockLength);
      h.ids.push_back(123);
      kvc2.raw_read(h);
      assert(h.match.match_length == 7 * BlockLength);
      cmp_handle_data(h, h2, 7);
    }
    kvc2.io_dealer->stop();
    io.join();
  }
  SPDLOG_WARN("{} Test Passed", __FILE__);
  return 0;
}
\ No newline at end of file
#include <iostream>
#include <thread>
#include <vector>
#include <random>
#include <unistd.h>
#include "page_aligned_memory_pool.cpp"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
// Worker body for the pool stress test: repeatedly allocate a random
// page-multiple sized chunk from the shared pool and immediately return it.
void thread_task(PageAlignedMemoryPool& pool) {
  std::mt19937 rng(123);
  // Chunks deliberately freed right away; this list stays empty.
  std::vector<std::pair<void*, size_t>> held;
  const size_t kIterations = 40000;
  for (size_t iter = 0; iter != kIterations; ++iter) {
    // Random size: 1..100 units of 16 KiB (4096 * 4 bytes).
    const size_t bytes = (rng() % 100 + 1) * 4096 * 4;
    if (void* chunk = pool.alloc(bytes)) {
      pool.free(chunk, bytes);
    }
  }
  // Drain anything still held (no-op with the immediate-free strategy above).
  for (auto& entry : held) {
    pool.free(entry.first, entry.second);
  }
}
int main(int argc, char* argv[]) {
spdlog::set_level(spdlog::level::debug);
// 创建一个内存池
PageAlignedMemoryPool pool(40ll * 1024 * 1024 * 1024); // 40 G
// 创建线程
const int num_threads = 32;
std::vector<std::thread> threads;
for (int i = 0; i < num_threads; ++i) {
threads.emplace_back(thread_task, std::ref(pool));
}
// 等待所有线程完成
for (auto& t : threads) {
t.join();
}
// 输出调试信息
std::cout << pool.debug() << std::endl;
return 0;
}
\ No newline at end of file
# Lookup-only smoke test: open an existing kvc2 store, load its persisted
# index, and look up a fixed random id sequence at growing prefix lengths.
# NOTE(review): indentation below was reconstructed from a whitespace-mangled
# source — verify block structure against the original file.
import sys
sys.path.append('./build')
sys.path.append('./src')
import torch
import kvc2_ext
from kvc2_utils import get_tensor_from_data_ptr
# Create a kvc2 instance
path = "/mnt/data/kvc2"
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
kvc2_ext.load(kvc2_instance)
# Start IO thread
print("Start IO thread")
kvc2_ext.start_io_thread(kvc2_instance)
print("IO thread started")
# Create CacheInfoInput
test_info = kvc2_ext.CacheInfoInput()
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
test_info.quant_type = kvc2_ext.QuantType.QT_F32
print("Element size: ", test_info.element_size())
# Generate random test IDs (length = 2560)
# Fixed seed: must match the ids used by the insert script that populated the store.
torch.manual_seed(123)
length = 2560
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
block_count = (length+255) // 256
# print("Test ID: ", test_id)
# Generate test data based on element size and hidden layer count
element_size = test_info.element_size()
hidden_layer_count = test_info.hidden_layer_count()
def read_cmp_and_release(kvc2_instance,cache_info,ids,length):
    # Look up `length` tokens, report the matched length, materialize the
    # matched data as tensors, then release the handle.
    handle = kvc2_ext.lookup(kvc2_instance, cache_info, ids, length)
    if kvc2_ext.is_nullptr(handle):
        print("Handle is nullptr.")
        exit()
    matched_length = kvc2_ext.matched_length(handle)
    matched_data = kvc2_ext.handle_data(handle)
    print('Matched length: ', matched_length)
    if matched_length >0:
        print(f'First layer address {[hex(x) for x in matched_data[0]]}')
        read_data = get_tensor_from_data_ptr(matched_data,element_size)
        print("Just read check ok.")
    kvc2_ext.release(handle)
# Probe the store at every 128-token prefix length up to the full sequence.
l = 128
while l<=length:
    read_cmp_and_release(kvc2_instance,test_info,test_id.data_ptr(),l)
    l+=128
kvc2_ext.destroy_kvc2(kvc2_instance)
print("Test completed successfully.")
# Insert-then-lookup test: write random key-cache data for a fixed id
# sequence, then look it up at growing prefix lengths and byte-compare the
# read-back tensors; finally persist the store with save().
# NOTE(review): indentation below was reconstructed from a whitespace-mangled
# source — verify block structure against the original file.
import sys
sys.path.append('./build')
sys.path.append('./src')
import torch
import kvc2_ext
from kvc2_utils import alloc_aligned_cache,dealloc_aligned_cache,get_tensor_ptr,get_tensor_from_data_ptr
# Create a kvc2 instance
path = "/mnt/data/kvc2"
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
# Start IO thread
print("Start IO thread")
kvc2_ext.start_io_thread(kvc2_instance)
print("IO thread started")
# Create CacheInfoInput
test_info = kvc2_ext.CacheInfoInput()
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
test_info.quant_type = kvc2_ext.QuantType.QT_F32
print("Element size: ", test_info.element_size())
# Generate random test IDs (length = 2560)
torch.manual_seed(123)
length = 2560
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
block_count = (length+255) // 256
# print("Test ID: ", test_id)
# Generate test data based on element size and hidden layer count
element_size = test_info.element_size()
hidden_layer_count = test_info.hidden_layer_count()
# Page-aligned host buffers: one block of `element_size` bytes per layer/page.
write_data,write_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
# print(test_data,test_data_mem)
print('Generate Insert Data')
for layer in write_data:
    for data in layer:
        random_values = torch.randint(0, 256, (element_size,), dtype=torch.uint8)
        data.copy_(random_values)
print('Insert New data')
# Insert raw data
kvc2_ext.raw_insert(kvc2_instance, test_info, test_id.data_ptr(), length, get_tensor_ptr(write_data))
def read_cmp_and_release(kvc2_instance,cache_info,ids,length):
    # Look up `length` tokens and assert the read-back tensors equal the
    # data written above, then release the handle.
    handle = kvc2_ext.lookup(kvc2_instance, cache_info, ids, length)
    if kvc2_ext.is_nullptr(handle):
        print("Handle is nullptr.")
        exit()
    matched_length = kvc2_ext.matched_length(handle)
    matched_data = kvc2_ext.handle_data(handle)
    print('Matched length: ', matched_length)
    if matched_length >0:
        print(f'First layer address {[hex(x) for x in matched_data[0]]}')
        read_data = get_tensor_from_data_ptr(matched_data,element_size)
        for layer_w,layer_r in zip(write_data,read_data):
            for data_w,data_r in zip(layer_w,layer_r):
                # print(data_w,data_r)
                assert torch.equal(data_w,data_r)
        print("Lookup read check ok.")
    kvc2_ext.release(handle)
# Probe every 128-token prefix length up to the full sequence.
l = 128
while l<=length:
    read_cmp_and_release(kvc2_instance,test_info,test_id.data_ptr(),l)
    l+=128
dealloc_aligned_cache(write_data_mem)
# Persist the index so the lookup-only script can reload it.
kvc2_ext.save(kvc2_instance)
kvc2_ext.destroy_kvc2(kvc2_instance)
print("Test completed successfully.")
import sys
sys.path.append('./build')
sys.path.append('./src')
import torch
import kvc2_ext
from kvc2_utils import alloc_aligned_cache,dealloc_aligned_cache,get_tensor_ptr,get_tensor_from_data_ptr
# Create a kvc2 instance backed by on-disk storage at `path`.
path = "/mnt/data/kvc2"
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
# Start IO thread
print("Start IO thread")
kvc2_ext.start_io_thread(kvc2_instance)
print("IO thread started")
# Create CacheInfoInput: DeepseekV2 model, key cache, float32 quantization.
test_info = kvc2_ext.CacheInfoInput()
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
test_info.quant_type = kvc2_ext.QuantType.QT_F32
print("Element size: ", test_info.element_size())
# Generate random test IDs (length = 2560)
# NOTE(review): unlike the seeded sibling test, no torch.manual_seed is set
# here, so ids/data differ between runs — confirm this is intended.
length = 2560
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
block_count = (length+255) // 256  # ceil-divide: presumably 256 tokens per block — TODO confirm
# print("Test ID: ", test_id)
# Generate test data based on element size and hidden layer count
element_size = test_info.element_size()
hidden_layer_count = test_info.hidden_layer_count()
write_data,write_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
# print(test_data,test_data_mem)
print('Generate Insert Data')
# Fill every block of every layer with random bytes as the reference payload.
for layer in write_data:
    for data in layer:
        random_values = torch.randint(0, 256, (element_size,), dtype=torch.uint8)
        data.copy_(random_values)
print('Insert New data')
# Insert raw data
kvc2_ext.raw_insert(kvc2_instance, test_info, test_id.data_ptr(), length, get_tensor_ptr(write_data))
# Look up the full id sequence that was just inserted and read it back.
handle = kvc2_ext.lookup(kvc2_instance, test_info, test_id.data_ptr(), length)
matched_length = kvc2_ext.matched_length(handle)
matched_data = kvc2_ext.handle_data(handle)
print('Matched length: ', matched_length)
print(f'Match data layer {len(matched_data)}')
print(f'Match layer block count {len(matched_data[0])}')
# Wrap the returned raw pointers as tensors and compare with what was written.
read_data = get_tensor_from_data_ptr(matched_data,element_size)
for layer_w,layer_r in zip(write_data,read_data):
    for data_w,data_r in zip(layer_w,layer_r):
        # print(data_w,data_r)
        assert torch.equal(data_w,data_r)
print("Lookup read check ok.")
# NOTE(review): `handle` is never released and destroy_kvc2 is never called
# here, while the sibling test releases its handles — confirm this is intended.
dealloc_aligned_cache(write_data_mem)
kvc2_ext.save(kvc2_instance)
print("Test completed successfully.")
import sys
sys.path.append('./build')
sys.path.append('./src')
import torch
import kvc2_ext
from kvc2_utils import alloc_aligned_cache,dealloc_aligned_cache,get_tensor_ptr
# Create a kvc2 instance backed by on-disk storage at `path`.
path = "/mnt/data/kvc2"
kvc2_instance = kvc2_ext.create_kvc2(path,int(10e9)) # 10 G memory pool
# Start IO thread
print("Start IO thread")
kvc2_ext.start_io_thread(kvc2_instance)
print("IO thread started")
# Create CacheInfoInput: DeepseekV2 model, key cache, float32 quantization.
test_info = kvc2_ext.CacheInfoInput()
test_info.model_type = kvc2_ext.ModelType.MT_DeepseekV2
test_info.cache_type = kvc2_ext.CacheType.CT_KeyCache
test_info.quant_type = kvc2_ext.QuantType.QT_F32
print("Element size: ", test_info.element_size())
# Generate random test IDs (length = 2560)
length = 2560
test_id = torch.randint(0, 65536, (length,), dtype=torch.uint16).contiguous()
block_count = (length+255) // 256  # ceil-divide: presumably 256 tokens per block — TODO confirm
# print("Test ID: ", test_id)
# Generate test data based on element size and hidden layer count
element_size = test_info.element_size()
hidden_layer_count = test_info.hidden_layer_count()
write_data,write_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
# print(test_data,test_data_mem)
print('Generate Insert Data')
# Fill every block of every layer with random bytes as the reference payload.
for layer in write_data:
    for data in layer:
        random_values = torch.randint(0, 256, (element_size,), dtype=torch.uint8)
        data.copy_(random_values)
print('Insert New data')
# Insert raw data
kvc2_ext.raw_insert(kvc2_instance, test_info, test_id.data_ptr(), length, get_tensor_ptr(write_data))
# Allocate a second aligned buffer and read the same ids back via raw_read.
read_data,read_data_mem = alloc_aligned_cache(hidden_layer_count,block_count,element_size)
print('Raw read')
matched_length = kvc2_ext.raw_read(kvc2_instance, test_info, test_id.data_ptr(), length,get_tensor_ptr(read_data))
print('Matched length: ', matched_length)
# Verify the read-back bytes match what was inserted.
for layer_w,layer_r in zip(write_data,read_data):
    for data_w,data_r in zip(layer_w,layer_r):
        # print(data_w,data_r)
        assert torch.equal(data_w,data_r)
print("Raw read check ok.")
# Tear down: free both buffers and persist the cache.
dealloc_aligned_cache(write_data_mem)
dealloc_aligned_cache(read_data_mem)
kvc2_ext.save(kvc2_instance)
print("Test completed successfully.")
import ctypes
import torch
def aligned_tensor(size, alignment=4096):
    """Allocate `size` bytes with `alignment`-byte alignment and wrap them
    in a 1-D torch.int8 tensor (zero-copy view of the allocation).

    Args:
        size: number of bytes to allocate; must be positive.
        alignment: requested alignment in bytes; must be a power of two
            (posix_memalign additionally requires a multiple of sizeof(void*)).

    Returns:
        (tensor, mem): `tensor` views the allocation; `mem` is the ctypes
        pointer that owns it. The caller must release it with
        ctypes.CDLL(None).free(mem), and only after `tensor` is no longer
        used — the tensor dangles once the memory is freed.

    Raises:
        ValueError: on non-positive size, invalid alignment, or if the
            returned pointer is (unexpectedly) misaligned.
        MemoryError: if posix_memalign reports failure.
    """
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")
    if alignment <= 0 or (alignment & (alignment - 1)) != 0:
        raise ValueError(f"alignment must be a power of two, got {alignment}")
    mem = ctypes.c_void_p()
    # posix_memalign(&mem, alignment, size) returns 0 on success, errno otherwise.
    error_code = ctypes.CDLL(None).posix_memalign(
        ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(size)
    )
    if error_code != 0:
        raise MemoryError(f"posix_memalign failed with error code {error_code}")
    # View the raw allocation as a ctypes array so torch can wrap it in place.
    raw_array = (ctypes.c_int8 * size).from_address(mem.value)
    tensor = torch.frombuffer(raw_array, dtype=torch.int8)
    if tensor.data_ptr() % alignment != 0:
        raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes")
    return tensor, mem
# Demo: allocate ~5 MB of 4096-byte-aligned memory and inspect the wrapper.
size = 5124380
tensor, mem_ptr = aligned_tensor(size, alignment=4096)
print(f"Tensor: {tensor}, size: {tensor.size()}, dataptr: {tensor.data_ptr()}")
print(f"Tensor memory alignment: {tensor.data_ptr() % 4096 == 0}")
print(f"Allocated memory address: {mem_ptr.value}")
# Release the backing memory. NOTE(review): `tensor` still points at the
# freed buffer after this call — it must not be used again.
ctypes.CDLL(None).free(mem_ptr)
#include <cuda_runtime.h>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <vector>
// Spreads batched async CUDA memcpys over a fixed pool of streams
// (round-robin) and runs a user callback on the stream once a batch's
// copies have been enqueued ahead of it.
class CudaStreamManager {
 public:
  // Creates `num_streams` CUDA streams; throws std::runtime_error if any
  // stream creation fails (already-created streams are destroyed first).
  CudaStreamManager(int num_streams);
  ~CudaStreamManager();

  // One batched copy request: parallel arrays of host/device pointers and
  // byte sizes, a single copy direction, and a completion callback.
  struct Request {
    std::vector<void*> host_mem_addresses;
    std::vector<void*> device_mem_addresses;
    std::vector<size_t> sizes;
    cudaMemcpyKind direction;
    std::function<void()> callback;
  };

  // Enqueues all copies of `request` on the next stream in round-robin
  // order, then schedules the callback behind them. Throws on CUDA errors.
  // NOTE(review): no locking around the round-robin cursor — presumably
  // single-threaded use; confirm with callers.
  void submitRequest(const Request& request);

 private:
  int num_streams_;
  std::vector<cudaStream_t> streams_;
  int next_stream_index_;  // round-robin cursor for stream selection
};
// Builds the stream pool one stream at a time. If any creation fails,
// every stream created so far is destroyed before throwing, so the
// half-constructed object does not leak CUDA resources.
CudaStreamManager::CudaStreamManager(int num_streams) : num_streams_(num_streams), next_stream_index_(0) {
  streams_.reserve(num_streams_);
  for (int i = 0; i < num_streams_; ++i) {
    cudaStream_t stream;
    cudaError_t status = cudaStreamCreate(&stream);
    if (status != cudaSuccess) {
      std::cerr << "Failed to create CUDA stream: " << cudaGetErrorString(status) << std::endl;
      for (cudaStream_t created : streams_) {
        cudaStreamDestroy(created);
      }
      throw std::runtime_error("Failed to create CUDA stream");
    }
    streams_.push_back(stream);
  }
}
// Destroys every stream created by the constructor.
CudaStreamManager::~CudaStreamManager() {
  for (cudaStream_t stream : streams_) {
    cudaStreamDestroy(stream);
  }
}
// Enqueues all copies of `request` on the next round-robin stream, then
// schedules the request's callback behind them via cudaLaunchHostFunc.
// Throws std::runtime_error on any CUDA error.
// NOTE(review): next_stream_index_ is updated without synchronization —
// assumes single-threaded submission; confirm with callers.
void CudaStreamManager::submitRequest(const Request& request) {
  int stream_index = next_stream_index_;
  cudaStream_t stream = streams_[stream_index];
  next_stream_index_ = (next_stream_index_ + 1) % num_streams_;

  // Enqueue one async copy per page; all share the same stream and direction.
  size_t num_transfers = request.host_mem_addresses.size();
  for (size_t i = 0; i < num_transfers; ++i) {
    cudaError_t err = cudaMemcpyAsync(request.device_mem_addresses[i], request.host_mem_addresses[i], request.sizes[i],
                                      request.direction, stream);
    if (err != cudaSuccess) {
      std::cerr << "cudaMemcpyAsync failed: " << cudaGetErrorString(err) << std::endl;
      throw std::runtime_error("cudaMemcpyAsync failed");
    }
  }

  // The callback must outlive this function, so it is heap-allocated and
  // deleted by the host-func trampoline after it runs.
  struct CallbackData {
    std::function<void()> callback;
  };
  CallbackData* cb_data = new CallbackData{request.callback};
  cudaError_t err = cudaLaunchHostFunc(
      stream,
      [](void* data) {
        CallbackData* cb = static_cast<CallbackData*>(data);
        cb->callback();
        delete cb;
      },
      cb_data);
  if (err != cudaSuccess) {
    // FIX: the trampoline will never run on this path, so reclaim cb_data
    // here — the original code leaked it when cudaLaunchHostFunc failed.
    delete cb_data;
    std::cerr << "cudaLaunchHostFunc failed: " << cudaGetErrorString(err) << std::endl;
    throw std::runtime_error("cudaLaunchHostFunc failed");
  }
}
// Example usage
// Single-device smoke test: copy ten 4KB host pages to the device through
// CudaStreamManager and report completion via the request callback.
int main() {
  try {
    CudaStreamManager stream_manager(4);  // Create a manager with 4 streams

    // Prepare host and device memory
    const size_t num_pages = 10;
    std::vector<void*> host_mem_addresses(num_pages);
    std::vector<void*> device_mem_addresses(num_pages);
    std::vector<size_t> sizes(num_pages, 4096);  // 4KB pages

    // Allocate host memory
    for (size_t i = 0; i < num_pages; ++i) {
      host_mem_addresses[i] = malloc(4096);
      if (!host_mem_addresses[i]) {
        throw std::runtime_error("Failed to allocate host memory");
      }
      // Initialize data if necessary
    }

    // Allocate device memory
    for (size_t i = 0; i < num_pages; ++i) {
      cudaError_t err = cudaMalloc(&device_mem_addresses[i], 4096);
      if (err != cudaSuccess) {
        std::cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << std::endl;
        throw std::runtime_error("cudaMalloc failed");
      }
    }

    // Create a request: copy every page host -> device in one batch.
    CudaStreamManager::Request request;
    request.host_mem_addresses = host_mem_addresses;
    request.device_mem_addresses = device_mem_addresses;
    request.sizes = sizes;
    request.direction = cudaMemcpyHostToDevice;
    request.callback = []() { std::cout << "Data transfer completed!" << std::endl; };

    // Submit the request
    stream_manager.submitRequest(request);

    // Wait for all streams to complete
    cudaError_t err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
      std::cerr << "cudaDeviceSynchronize failed: " << cudaGetErrorString(err) << std::endl;
      throw std::runtime_error("cudaDeviceSynchronize failed");
    }

    // Clean up
    // NOTE(review): allocations made before a throw are leaked on the
    // exception path; tolerable for a test binary since the process exits.
    for (size_t i = 0; i < num_pages; ++i) {
      free(host_mem_addresses[i]);
      cudaFree(device_mem_addresses[i]);
    }
  } catch (const std::exception& e) {
    std::cerr << "Exception: " << e.what() << std::endl;
    return 1;
  }
  return 0;
}
#include "cuda_stream_manager.hh"

#include <cuda_runtime.h>

#include <chrono>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <thread>
#include <vector>
// Multi-device smoke test: broadcasts the same ten host pages to every CUDA
// device through CudaStreamManager and waits for per-device callbacks.
int main() {
  try {
    // Discover the available CUDA devices.
    int num_devices = 0;
    cudaError_t err = cudaGetDeviceCount(&num_devices);
    if (err != cudaSuccess) {
      std::cerr << "cudaGetDeviceCount failed: " << cudaGetErrorString(err) << std::endl;
      return 1;
    }
    if (num_devices < 1) {
      std::cerr << "未找到 CUDA 设备。" << std::endl;
      return 1;
    }
    std::vector<size_t> device_ids;
    for (int i = 0; i < num_devices; ++i) {
      device_ids.push_back(i);
    }
    const size_t num_pages = 10;
    const size_t page_size = 4096;  // 4KB per page
    // Create a CudaStreamManager instance managing all devices.
    CudaStreamManager stream_manager(device_ids, 4);
    // Host/device memory bookkeeping: one pointer list per device.
    std::vector<std::vector<void*>> host_mem_addresses(num_devices);
    std::vector<std::vector<void*>> device_mem_addresses(num_devices);
    // Allocate host memory.
    for (size_t i = 0; i < num_pages; ++i) {
      void* host_ptr = malloc(page_size);
      if (!host_ptr) {
        throw std::runtime_error("Failed to allocate host memory");
      }
      // Initialize data here if needed.
      // Register the SAME host page in every device's list (shared source).
      for (int device_id = 0; device_id < num_devices; ++device_id) {
        host_mem_addresses[device_id].push_back(host_ptr);
      }
    }
    // Allocate device memory on each device.
    for (int device_id = 0; device_id < num_devices; ++device_id) {
      err = cudaSetDevice(device_id);
      if (err != cudaSuccess) {
        std::cerr << "cudaSetDevice failed: " << cudaGetErrorString(err) << std::endl;
        throw std::runtime_error("cudaSetDevice failed");
      }
      for (size_t i = 0; i < num_pages; ++i) {
        void* device_ptr;
        err = cudaMalloc(&device_ptr, page_size);
        if (err != cudaSuccess) {
          std::cerr << "cudaMalloc failed on device " << device_id << ": " << cudaGetErrorString(err) << std::endl;
          throw std::runtime_error("cudaMalloc failed");
        }
        device_mem_addresses[device_id].push_back(device_ptr);
      }
    }
    // Build and submit one host-to-device request per device.
    for (int device_id = 0; device_id < num_devices; ++device_id) {
      auto request = std::shared_ptr<CudaStreamManager::Request>(new CudaStreamManager::Request);
      request->device_id = device_id;
      request->host_mem_addresses = host_mem_addresses[device_id];
      request->device_mem_addresses = device_mem_addresses[device_id];
      request->sizes = std::vector<size_t>(num_pages, page_size);
      request->direction = cudaMemcpyHostToDevice;
      request->callback = [device_id]() {
        std::cout << "Device " << device_id << " data transfer completed!" << std::endl;
      };
      stream_manager.submitRequest(request);
    }
    // Sleep long enough for all requests to be processed.
    // NOTE(review): time-based waiting is racy; a real application should
    // use a proper synchronization mechanism instead.
    std::this_thread::sleep_for(std::chrono::seconds(5));
    // Clean up host memory.
    for (size_t i = 0; i < num_pages; ++i) {
      free(host_mem_addresses[0][i]);  // all devices share the same host pages; free once
    }
    // Clean up device memory.
    for (int device_id = 0; device_id < num_devices; ++device_id) {
      err = cudaSetDevice(device_id);
      if (err != cudaSuccess) {
        std::cerr << "cudaSetDevice failed during cleanup: " << cudaGetErrorString(err) << std::endl;
        continue;
      }
      for (void* ptr : device_mem_addresses[device_id]) {
        cudaFree(ptr);
      }
    }
  } catch (const std::exception& e) {
    std::cerr << "异常: " << e.what() << std::endl;
    return 1;
  }
  return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment