"tests/nn/vscode:/vscode.git/clone" did not exist on "1c8d219d0a3e6364ded7d6970b755b06b7aa8e05"
Commit 25cee581 authored by Atream's avatar Atream
Browse files

add balance-serve, support concurrence

parent 8d0292aa
#include "cuda_stream_manager.hh"
#include <cuda_runtime.h>
#include <functional>
#include <iostream>
#include <stdexcept>
#include <vector>
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
// #define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
CudaStreamManager::CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device) {
for (int device_id : device_ids) {
auto x = std::unique_ptr<DeviceInfo>(new DeviceInfo);
DeviceInfo& device_info = *x;
device_info.device_id = device_id;
device_info.next_stream_index = 0;
device_info.stop_flag = false;
// Set the device
cudaError_t err = cudaSetDevice(device_id);
if (err != cudaSuccess) {
SPDLOG_WARN("cudaSetDevice failed on device {}: {}", device_id, cudaGetErrorString(err));
throw std::runtime_error("cudaSetDevice failed");
}
// Create the CUDA streams
device_info.streams.resize(num_streams_per_device);
for (int i = 0; i < num_streams_per_device; ++i) {
err = cudaStreamCreate(&device_info.streams[i]);
if (err != cudaSuccess) {
SPDLOG_WARN("Failed to create CUDA stream on device {}: {}", device_id, cudaGetErrorString(err));
throw std::runtime_error("Failed to create CUDA stream");
}
}
// Start the worker thread for this device
device_info.worker_thread = std::thread(&CudaStreamManager::deviceWorker, this, std::ref(device_info));
devices_.push_back(std::move(x));
}
}
CudaStreamManager::~CudaStreamManager() {
// Notify all device worker threads to stop
for (auto& device_info : devices_) {
device_info->stop_flag.store(true);
auto request = std::shared_ptr<Request>(new Request);
request->should_exit = true;
device_info->request_queue.enqueue(std::move(request));
}
// Wait for all worker threads to finish
for (auto& device_info : devices_) {
if (device_info->worker_thread.joinable()) {
device_info->worker_thread.join();
}
// Destroy the CUDA streams
cudaSetDevice(device_info->device_id);
for (auto& stream : device_info->streams) {
cudaStreamDestroy(stream);
}
}
}
void CudaStreamManager::submitRequest(std::shared_ptr<Request> request) {
// Find the matching device
for (auto& device_info : devices_) {
if (device_info->device_id == request->device_id) {
device_info->request_queue.enqueue(request);
return;
}
}
throw std::runtime_error("Invalid device ID in request");
}
void CudaStreamManager::deviceWorker(DeviceInfo& device_info) {
// Bind this worker thread to its device
cudaError_t err = cudaSetDevice(device_info.device_id);
if (err != cudaSuccess) {
SPDLOG_WARN("cudaSetDevice failed in worker thread for device {}: {}", device_info.device_id,
cudaGetErrorString(err));
return;
}
while (device_info.stop_flag.load() == false) {
auto request = device_info.request_queue.dequeue();
if (request->should_exit) {
return;
}
// Process the request
SPDLOG_DEBUG("Getting request on device {}, count {}", device_info.device_id, request->host_mem_addresses.size());
int stream_index = device_info.next_stream_index;
cudaStream_t stream = device_info.streams[stream_index];
device_info.next_stream_index = (device_info.next_stream_index + 1) % device_info.streams.size();
size_t num_transfers = request->host_mem_addresses.size();
for (size_t i = 0; i < num_transfers; ++i) {
void* dst = request->device_mem_addresses[i];
void* src = request->host_mem_addresses[i];
if (request->direction == cudaMemcpyDeviceToHost) {
std::swap(dst, src);
}
cudaError_t err = cudaMemcpyAsync(dst, src, request->sizes[i], request->direction, stream);
if (err != cudaSuccess) {
SPDLOG_WARN("cudaMemcpyAsync failed on device {}: {}", device_info.device_id, cudaGetErrorString(err));
// Errors could be handled as needed; here we simply continue
continue;
}
}
// Register the callback; the copies are asynchronous, so wrap it in heap-allocated data
struct CallbackData {
std::function<void()> callback;
};
CallbackData* cb_data = new CallbackData{request->callback};
err = cudaLaunchHostFunc(
stream,
[](void* data) {
// SPDLOG_DEBUG("Callback function called");
CallbackData* cb_data = static_cast<CallbackData*>(data);
cb_data->callback();
delete cb_data;
},
cb_data);
if (err != cudaSuccess) {
SPDLOG_WARN("cudaLaunchHostFunc failed on device {}: {}", device_info.device_id, cudaGetErrorString(err));
// Handle the error as needed
}
}
}
/*
* @Author: Xie Weiyu ervinxie@qq.com
* @Date: 2024-11-19 09:24:47
* @LastEditors: Xie Weiyu ervinxie@qq.com
* @LastEditTime: 2024-11-20 02:55:49
* @FilePath: /kvc2/src/cuda_stream_manager.hh
* @Description: This is the default header. Set `customMade` and open koroFileHeader to configure: https://github.com/OBKoro1/koro1FileHeader/wiki/%E9%85%8D%E7%BD%AE
*/
#pragma once
#include <cuda_runtime.h>
#include <atomic>
#include <functional>
#include <memory>
#include <thread>
#include <vector>
#include "utils/mpsc.hpp"
class CudaStreamManager {
public:
// Constructor: takes the list of device IDs to use and the number of streams per device
CudaStreamManager(const std::vector<size_t>& device_ids, int num_streams_per_device);
~CudaStreamManager();
// Request structure
struct Request {
bool should_exit = false;
int device_id;
std::vector<void*> host_mem_addresses;
std::vector<void*> device_mem_addresses;
std::vector<size_t> sizes;
cudaMemcpyKind direction;
std::function<void()> callback;
};
void submitRequest(std::shared_ptr<Request> request);
private:
// Per-device information
struct DeviceInfo {
int device_id;
std::thread worker_thread;
std::vector<cudaStream_t> streams;
int next_stream_index;
MPSCQueueConsumerLock<std::shared_ptr<Request>> request_queue;
std::atomic_bool stop_flag;
};
// Mapping from device ID to DeviceInfo
std::vector<std::unique_ptr<DeviceInfo>> devices_;
// Private methods
void deviceWorker(DeviceInfo& device_info);
};
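// Illustrative usage sketch (not from the original file; `host_buf`, `dev_buf` and `bytes`
// are assumed to be an existing pinned host buffer, a device buffer on GPU 0, and their size):
//
//   CudaStreamManager mgr({0}, /*num_streams_per_device=*/4);
//   auto req = std::make_shared<CudaStreamManager::Request>();
//   req->device_id = 0;
//   req->direction = cudaMemcpyHostToDevice;
//   req->host_mem_addresses.push_back(host_buf);
//   req->device_mem_addresses.push_back(dev_buf);
//   req->sizes.push_back(bytes);
//   req->callback = [] { /* runs once all copies on the chosen stream have completed */ };
//   mgr.submitRequest(req);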
#ifndef __DEFS_H_
#define __DEFS_H_
#include <cstdint>
#include <optional>
#include <vector>
#include "model_config.h"
namespace kvc2 {
using kvc2_ptr = void*;
// using data_block_ptr = std::intptr_t;
using data_block_ptr = void*;
using layer_data = std::vector<data_block_ptr>;
using kvc2_handle = void*;
using Token = uint32_t;
using Tokens = std::vector<Token>;
using TokenPtr = std::intptr_t;
using TokenLength = size_t;
using BlockLength = size_t;
struct CacheInfo {
ModelName model_name;
bool is_key_cache;
QuantType quant_type;
size_t hidden_layer_count();
std::filesystem::path path(std::optional<size_t> which_layer = std::nullopt);
bool operator==(const CacheInfo& other) const;
size_t element_size(size_t block_length);
size_t hash_value() const;
};
}; // namespace kvc2
#endif
#include "gpu_cache.hh"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "cache_entry.hh"
#include "utils/arithmetic.hpp"
namespace kvc2 {
GPUPageCache::GPUPageCache(GPUPageCacheConfig& config) : config(config) {
if (torch::cuda::is_available()) {
size_t gpu_count = torch::cuda::device_count();
SPDLOG_INFO("Number of available GPUs: {}, want {}", gpu_count, config.gpu_devices_id.size());
if (gpu_count < config.gpu_devices_id.size()) {
SPDLOG_ERROR("Not enough GPUs available.");
exit(0);
}
for (auto x : config.gpu_devices_id) {
gpu_devices.push_back(torch::Device(torch::kCUDA, x));
}
} else {
SPDLOG_ERROR("CUDA is not available on this system.");
exit(0);
}
SPDLOG_WARN("Creating GPU Cache");
shape.push_back(config.layer_count);
shape.push_back(config.total_kvcache_pages);
shape.push_back(config.num_token_per_page);
if (config.full_kv_cache_on_each_gpu) {
if (config.gpu_devices_id.size() > 1) {
SPDLOG_WARN("Replicated KVCache on multiple gpu");
}
shape.push_back(config.num_k_heads);
} else {
shape.push_back(config.num_k_heads / config.gpu_devices_id.size());
}
shape.push_back(config.k_head_dim);
tensor_size = torch::elementSize(config.tensor_type);
for (auto& s : shape) {
tensor_size *= s;
}
SPDLOG_INFO("Creating KV Page Cache, Shape ({},{},{},{},{}), Size {} MiB", shape[0], shape[1], shape[2], shape[3],
shape[4], tensor_size / (1 << 20));
if (config.k_cache_on) {
for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
auto k = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
k = k.to(gpu_devices[i]);
k_cache.push_back(k);
SPDLOG_INFO("K Page Cache of GPU {} is created", config.gpu_devices_id[i]);
}
occupations.resize(config.layer_count);
} else {
SPDLOG_WARN("Disalbe K Cache");
assert(config.gpu_only);
}
if (config.v_cache_on) {
for (size_t i = 0; i < config.gpu_devices_id.size(); i++) {
auto v = torch::zeros(shape, torch::TensorOptions().dtype(config.tensor_type));
v = v.to(gpu_devices[i]);
v_cache.push_back(v);
SPDLOG_INFO("V Page Cache of GPU {} is created", config.gpu_devices_id[i]);
}
v_occupations.resize(config.layer_count);
} else {
SPDLOG_WARN("Disalbe V Cache");
// assert(config.gpu_only); // should not assert
}
if (config.gpu_only) {
gpu_only_occupations.resize(config.total_kvcache_pages, false);
}
num_free_pages = config.total_kvcache_pages;
for (size_t i = 0; i < config.layer_count; i++) {
if (config.k_cache_on)
occupations[i].resize(config.total_kvcache_pages, nullptr);
if (config.v_cache_on)
v_occupations[i].resize(config.total_kvcache_pages, nullptr);
}
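// Tensor-parallel layout note: tp_size[i] is the byte size of the slice of one page of one
// layer that GPU i holds (num_token_per_page * heads_on_that_gpu * k_head_dim elements), and
// tp_offset[i] is the cumulative byte offset used when copying GPU i's head slice from a
// host cache block.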
tp_size.resize(config.gpu_devices_id.size(), shape[2] * shape[3] * shape[4] * c10::elementSize(config.tensor_type));
tp_offset.resize(config.gpu_devices_id.size(), 0);
for (size_t i = 1; i < tp_offset.size(); i++) {
tp_offset[i] = tp_offset[i - 1] + tp_size[i - 1];
}
stream_manager =
std::unique_ptr<CudaStreamManager>(new CudaStreamManager(config.gpu_devices_id, config.num_streams_per_device));
}
bool GPUPageCache::alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at) {
std::lock_guard<std::mutex> lg(lock);
auto idx = next_empty_col();
if (idx.has_value()) {
// must have entry lock
auto& k0_entry = k_entries[0][at];
k0_entry->gpu_block_idx = idx;
for (size_t l = 0; l < config.layer_count; l++) {
if (config.k_cache_on) {
assert(k_entries[l][at]->data != nullptr);
occupations[l][idx.value()] = k_entries[l][at];
}
if (config.v_cache_on) {
assert(v_entries[l][at]->data != nullptr);
v_occupations[l][idx.value()] = v_entries[l][at];
}
}
return true;
} else {
return false;
}
}
std::vector<size_t> GPUPageCache::gpu_only_alloc_col(size_t count) {
assert(config.gpu_only);
std::lock_guard<std::mutex> lg(lock);
std::vector<size_t> re;
for (size_t i = 0; i < config.total_kvcache_pages; i++) {
if (gpu_only_occupations[i] == false) {
re.push_back(i);
if (re.size() == count) {
break;
}
}
}
if (re.size() == count) {
for (auto at : re) {
gpu_only_occupations[at] = true;
}
} else {
SPDLOG_WARN("GPU ONLY: Cannot allocate {} cols", count);
re.clear();
}
return re;
}
void GPUPageCache::gpu_only_free_cols(std::vector<size_t> cols) {
assert(config.gpu_only);
std::lock_guard<std::mutex> lg(lock);
for (auto at : cols) {
assert(gpu_only_occupations[at]);
gpu_only_occupations[at] = false;
}
}
std::optional<size_t> GPUPageCache::next_empty_col() {
if (num_free_pages == 0) {
evict_cols();
if (num_free_pages == 0) {
return std::nullopt;
}
}
while (occupations[0][_col_idx] != nullptr) {
_col_idx = (_col_idx + 1) % config.total_kvcache_pages;
}
num_free_pages -= 1;
return _col_idx;
}
void GPUPageCache::evict_cols() {
auto evicted_count = 0;
for (size_t i = 0; i < config.total_kvcache_pages; i++) {
auto& h = occupations[0][i];
if (h == nullptr) {
continue;
}
auto lg = h->lock_guard();
if (h->gpu_cc.can_desert()) {
h->gpu_cc.tc.reset();
h = nullptr;
num_free_pages += 1;
evicted_count += 1;
}
}
if (evicted_count > 0)
SPDLOG_INFO("GPU: Evicted {} GPU pages", evicted_count);
}
std::vector<std::unique_lock<CacheBlockEntry::MutexT>> GPUPageCache::try_lock_col(size_t at) {
std::vector<std::unique_lock<CacheBlockEntry::MutexT>> re;
if (config.k_cache_on) {
for (size_t l = 0; l < config.layer_count; l++) {
if (occupations[l][at] == nullptr) {
return {};
}
auto ul = occupations[l][at]->try_lock();
if (ul.owns_lock()) {
re.push_back(std::move(ul));
} else {
return {};
}
}
}
if (config.v_cache_on) {
for (size_t l = 0; l < config.layer_count; l++) {
if (v_occupations[l][at] == nullptr) {
return {};
}
auto ul = v_occupations[l][at]->try_lock();
if (ul.owns_lock()) {
re.push_back(std::move(ul));
} else {
return {};
}
}
}
return re;
}
std::vector<std::shared_ptr<CudaStreamManager::Request>> GPUPageCache::basic_request(cudaMemcpyKind direction,
std::function<void()> callback) {
std::vector<std::shared_ptr<CudaStreamManager::Request>> re;
re.resize(config.gpu_devices_id.size(), nullptr);
for (size_t i = 0; i < re.size(); i++) {
re[i] = std::shared_ptr<CudaStreamManager::Request>(new CudaStreamManager::Request);
re[i]->direction = direction;
re[i]->device_id = config.gpu_devices_id[i];
re[i]->callback = callback;
}
return re;
}
void GPUPageCache::submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs) {
for (auto& r : reqs) {
stream_manager->submitRequest(r);
}
}
void GPUPageCache::append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles,
size_t at) {
if (config.k_cache_on == false && config.v_cache_on == false) {
return;
}
auto gpu_block_idx = k_handles[0][at]->gpu_block_idx.value();
for (size_t layer = 0; layer < config.layer_count; layer++) {
for (size_t which_gpu = 0; which_gpu < config.gpu_devices_id.size(); which_gpu++) {
if (config.k_cache_on) {
assert(k_handles[layer][at]->data != nullptr);
reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
reqs[which_gpu]->host_mem_addresses.push_back(offset_by_bytes(k_handles[layer][at]->data, tp_offset[which_gpu]));
reqs[which_gpu]->device_mem_addresses.push_back(k_cache[which_gpu][layer][gpu_block_idx].data_ptr());
}
if (config.v_cache_on) {
assert(v_handles[layer][at]->data != nullptr);
reqs[which_gpu]->sizes.push_back(tp_size[which_gpu]);
reqs[which_gpu]->host_mem_addresses.push_back(offset_by_bytes(v_handles[layer][at]->data, tp_offset[which_gpu]));
reqs[which_gpu]->device_mem_addresses.push_back(v_cache[which_gpu][layer][gpu_block_idx].data_ptr());
}
}
}
// SPDLOG_DEBUG("GPU: Appended Vertical Handle to Request, count {}", reqs[0]->sizes.size());
}
void GPUPageCache::debug() {
size_t count = 0;
for (size_t i = 0; i < config.total_kvcache_pages; i++) {
if (occupations[0][i] == nullptr) {
count += 1;
} else {
// occupations[0][i]->gpu_cc.debug();
}
}
SPDLOG_DEBUG("Free Page: {}/{}", count, config.total_kvcache_pages);
}
} // namespace kvc2
#ifndef __GPU_CACHE_HH_
#define __GPU_CACHE_HH_
#include <torch/torch.h>
#include "cache_entry.hh"
#include "cuda_stream_manager.hh"
#include "defs.h"
#include "kvc2.h"
#include "metrics.h"
#include "utils/periodic_task.hpp"
namespace kvc2 {
class GPUPageCache {
std::vector<torch::Device> gpu_devices;
std::vector<int64_t> shape;
size_t tensor_size;
std::vector<size_t> tp_offset;
std::vector<size_t> tp_size;
// met
std::shared_ptr<Metrics> met;
// states
std::mutex lock;
size_t num_free_pages;
std::vector<bool> gpu_only_occupations;
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>> occupations,v_occupations;
size_t _col_idx = 0;
// cuda stream manager
std::optional<size_t> next_empty_col();
public:
GPUPageCacheConfig config;
std::unique_ptr<CudaStreamManager> stream_manager;
std::vector<torch::Tensor> k_cache;
std::vector<torch::Tensor> v_cache;
std::unique_ptr<periodic::PeriodicTask> background_flush_back =nullptr;
GPUPageCache(GPUPageCacheConfig& config);
std::vector<size_t> gpu_only_alloc_col(size_t count);
void gpu_only_free_cols(std::vector<size_t> cols);
void gpu_background_flush();
bool alloc_col(std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_entries,
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_entries, size_t at);
void evict_cols();
void flush_col(size_t at);
std::vector<std::unique_lock<CacheBlockEntry::MutexT>> try_lock_col(size_t at);
void free_col(size_t at);
std::vector<std::shared_ptr<CudaStreamManager::Request>> basic_request(cudaMemcpyKind direction,
std::function<void()> callback);
void submit_requests(std::vector<std::shared_ptr<CudaStreamManager::Request>> reqs);
void append_col_to_request(std::vector<std::shared_ptr<CudaStreamManager::Request>>& reqs,
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& k_handles,
std::vector<std::vector<std::shared_ptr<CacheBlockEntry>>>& v_handles, size_t at);
void debug();
};
} // namespace kvc2
#endif
#ifndef __HASHER_HPP_
#define __HASHER_HPP_
#include "defs.h"
#include "xxhash.h"
namespace kvc2 {
const uint64_t hash_seed = 4123512;
const uint64_t check_hash_seed = 1025753;
using TokensHash = XXH64_hash_t;
struct TokensHasher {
XXH64_state_t* state;
TokensHasher() {
state = XXH64_createState();
reset();
}
~TokensHasher() { XXH64_freeState(state); }
TokensHasher(TokensHasher& other) = delete;
TokensHasher& operator=(TokensHasher& other) = delete;
TokensHasher(TokensHasher&& other) = delete;
TokensHasher& operator=(TokensHasher&& other) = delete;
TokensHash get() { return XXH64_digest(state); }
void reset(size_t seed = hash_seed) { XXH64_reset(state, seed); }
TokensHash update(Token* data, TokenLength length) {
XXH64_update(state, data, length * sizeof(Token));
return get();
}
TokensHash update_raw(void* data, size_t size) {
XXH64_update(state, data, size);
return get();
}
static TokensHash hash(Token* data, TokenLength length) { return XXH64(data, length * sizeof(Token), hash_seed); }
};
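// Example (illustrative; `tokens` is assumed to be a Tokens vector):
//
//   TokensHasher h;
//   TokensHash prefix_hash = h.update(tokens.data(), prefix_len);   // hash of the prefix
//   TokensHash full_hash = h.update(tokens.data() + prefix_len,     // extended incrementally,
//                                   tokens.size() - prefix_len);    // equals hashing all tokens at once
//   // One-shot equivalent: TokensHasher::hash(tokens.data(), tokens.size())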
} // namespace kvc2
#endif
/**
* @Description :
* @Author : Xie Weiyu
* @Date : 2024-12-11 06:35:31
* @Version : 1.0.0
* @LastEditors : Xie Weiyu
* @LastEditTime : 2024-12-11 06:50:55
* @Copyright (c) 2024 by KVCache.AI, All Rights Reserved.
**/
#pragma once
#include <atomic>
#include <future>
#include <iostream>
#include <mutex>
#include <optional>
#include <string>
#include <vector>
struct BatchPromise {
std::promise<void> promise;
std::shared_future<void> fut;
std::atomic_size_t count;
inline BatchPromise(size_t count) : count(count) { fut = promise.get_future().share(); }
inline void inc(size_t count = 1) { this->count.fetch_add(count, std::memory_order_seq_cst); }
inline void set() {
if (count.fetch_sub(1, std::memory_order_seq_cst) == 1) {
promise.set_value();
}
}
inline std::shared_future<void> get_shared_fut() { return fut; }
};
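// Illustrative pattern (guard-count style, matching how IO_Helper below uses it; `launch`
// is a hypothetical async submit function):
//
//   BatchPromise bp(1);            // the initial count of 1 is a guard held by the submitter
//   for (auto& t : tasks) {
//     bp.inc();                    // one per launched task
//     launch(t, [&bp] { bp.set(); });
//   }
//   bp.set();                      // drop the guard after all tasks are registered
//   bp.get_shared_fut().wait();    // returns once every task has called set()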
template <typename Lock>
struct TransferControl {
Lock lock;
std::optional<std::shared_future<void>> transfer_ok = std::nullopt;
bool has_data = false;
TransferControl() {}
/*
true, std::nullopt : Already has data
false, shared_future : Transfer already started, should wait for the future
false, std::nullopt : should transfer by you
true, shared_future: Should not appear
*/
std::pair<bool, std::optional<std::shared_future<void>>> has_data_or_transfer(std::shared_future<void> shared_fut) {
std::lock_guard<Lock> lg(lock);
if (has_data) {
return {true, std::nullopt};
} else {
if (transfer_ok.has_value()) {
return {false, transfer_ok};
} else {
transfer_ok = shared_fut;
return {false, std::nullopt};
}
}
}
void set_has_data() {
std::lock_guard<Lock> lg(lock);
has_data = true;
transfer_ok = std::nullopt;
}
bool get_has_data() {
std::lock_guard<Lock> lg(lock);
if (has_data) {
return true;
} else {
return false;
}
}
void reset() {
std::lock_guard<Lock> lg(lock);
transfer_ok = std::nullopt;
has_data = false;
}
std::string debug() {
std::lock_guard<Lock> lg(lock);
return std::string("") + (has_data ? "has data" : "no data") + " " +
(transfer_ok.has_value() ? "transfer " : "no transfer");
}
};
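// Typical caller behaviour: pass your own shared_future into has_data_or_transfer().
// On {false, nullopt} you own the transfer and call set_has_data() when it finishes;
// on {false, future} someone else started the transfer first, so wait on the returned future;
// on {true, nullopt} the data is already present and nothing needs to be done.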
struct ConcurrentController {
std::atomic_bool dirty = false;
std::atomic_size_t ref_count = 0;
TransferControl<std::mutex> tc;
};
template <typename Unit>
struct IO_Helper {
BatchPromise batch_promise;
std::function<void(Unit*)> call_back_on_unit = nullptr;
std::function<void()> call_back = nullptr;
std::vector<std::shared_future<void>> futs;
std::vector<Unit*> units_by_myself;
IO_Helper(std::function<void(Unit*)> call_back_on_unit, std::function<void()> call_back = nullptr)
: batch_promise(1), call_back_on_unit(call_back_on_unit), call_back(call_back) {}
IO_Helper(const IO_Helper& other) = delete;
IO_Helper& operator=(const IO_Helper& other) = delete;
IO_Helper(IO_Helper&& other) = delete;
IO_Helper& operator=(IO_Helper&& other) = delete;
~IO_Helper() {
// std::cout<<"Destory IO helper"<<std::endl;
}
size_t total_task_count = 0;
void new_task(size_t count = 1) {
total_task_count += 1;
batch_promise.inc(count);
}
void finish_add_taks() { batch_promise.set(); }
bool absorb_tc(Unit* unit, TransferControl<std::mutex>& tc) {
auto [ok, fut] = tc.has_data_or_transfer(batch_promise.get_shared_fut());
if (ok) {
return false;
} else {
if (fut.has_value()) {
futs.push_back(fut.value());
// printf("Transfer started\n");
return false;
} else {
units_by_myself.push_back(unit);
// printf("Not Transfer\n");
return true;
}
}
}
void wait() {
for (auto& fut : futs) {
fut.wait();
}
batch_promise.get_shared_fut().wait();
for (auto& b : units_by_myself) {
call_back_on_unit(b);
}
if (call_back)
call_back();
}
};
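// Reading of the flow above: for each unit, the caller invokes new_task() and then
// absorb_tc(unit, tc). A return of true means no transfer is in flight for that unit, so
// this helper should issue it (wait() later runs call_back_on_unit on exactly those units);
// false means the unit already has data or another helper's transfer is pending, whose
// shared_future gets collected and waited on in wait(). finish_add_taks() releases the
// initial guard count of the BatchPromise once all tasks have been registered.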
#pragma once
#include <torch/torch.h>
#include <cstdint>
#include <optional>
#include <vector>
#include "defs.h"
#include "model_config.h"
namespace kvc2 {
struct GPUPageCacheConfig {
bool gpu_only;
std::vector<size_t> gpu_devices_id;
size_t layer_count;
size_t total_kvcache_pages;
size_t num_token_per_page;
size_t num_k_heads;
size_t k_head_dim;
bool full_kv_cache_on_each_gpu = false;
bool k_cache_on = true;
bool v_cache_on = true;
torch::ScalarType tensor_type;
// for cuda stream manager
size_t num_streams_per_device = 4;
};
struct KVC2Config {
bool k_cache_on = true;
bool v_cache_on = true;
bool gpu_only = false;
bool load_from_disk = true;
bool save_to_disk = true;
std::string path;
std::string config_path;
TokenLength num_token_per_page = 256;
size_t memory_pool_size = 10e9;
size_t evict_count = 20;
std::optional<GPUPageCacheConfig> gpu_cache_config = std::nullopt;
size_t metrics_port;
double recompute_ratio = 0.2;
};
class DoubleCacheHandleInterface;
class KVC2Interface {
public:
virtual ~KVC2Interface() = default;
virtual void load() = 0;
virtual void save() = 0;
/*
Raw Insert
Insert kvcache from kvcache_data to disk.
info: cache info
id: start pointer of token array
length: length of token array
kvcache_data: data of kvcache
This first matches the token ID array against the existing kvcache, then inserts the unmatched part to disk.
*/
virtual void raw_insert(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;
/*
Raw Read
Read kvcache from disk to user specified pointers.
info: cache info
id: start pointer of token array
length: length of token array
kvcache_data: data of kvcache
Return: matched length of prefix, in tokens
This does not read from the memory pool; it reads directly from disk.
*/
virtual TokenLength raw_read(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
const std::vector<layer_data>& k_cache, const std::vector<layer_data>& v_cache) = 0;
/*
Lookup
Lookup kvcache and load it from disk to memory pool if needed.
info: cache info
id: start pointer of token array
length: length of token array
Return: kvc2_handle, holds kvcache until being released.
If not found, matched_length() will return 0.
If the memory pool is full, nullptr is returned.
*/
virtual std::shared_ptr<DoubleCacheHandleInterface> lookup(ModelName model_name, QuantType quant_type, Token* id,
TokenLength length, TokenLength estimated_length) = 0;
/*
Lookup and allocate to gpu
info.is_k_cache does not matter here
*/
virtual std::shared_ptr<DoubleCacheHandleInterface> lookup_to_gpu(ModelName model_name, QuantType quant_type,
Token* id, TokenLength length,
TokenLength estimated_length) = 0;
virtual void lookup_to_gpu_async(ModelName model_name, QuantType quant_type, Token* id, TokenLength length,
TokenLength estimated_length,
std::function<void(std::shared_ptr<DoubleCacheHandleInterface>)> call_back) = 0;
virtual std::pair<std::vector<torch::Tensor>, std::vector<torch::Tensor>> get_kvcache() = 0;
virtual void debug() = 0;
};
std::shared_ptr<KVC2Interface> create_kvc2(KVC2Config config);
enum MatchStatus {
Exact,
Partial,
NotMatchExact,
NotMatchPartial,
};
class DoubleCacheHandleInterface {
public:
virtual ~DoubleCacheHandleInterface() = default;
virtual TokenLength matched_length() = 0;
virtual std::vector<MatchStatus> matched_status() = 0;
virtual std::vector<layer_data> handle_data(bool is_key_cache) = 0;
virtual bool to_gpu() = 0;
virtual void to_gpu_async(std::function<void(bool)> call_back) = 0;
virtual std::vector<size_t> get_gpu_block_idx() = 0;
virtual std::vector<size_t> get_gpu_attached_block_idx() = 0;
virtual void append_tokens(Token* tokens, TokenLength length) = 0; // update generated tokens
virtual void debug() = 0;
};
}; // namespace kvc2
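// Illustrative end-to-end sketch (hypothetical paths and model/quant names; assumes the
// model and quant configs have been loaded and `tokens` is a Tokens vector):
//
//   KVC2Config cfg;
//   cfg.path = "/tmp/kvc2";                        // hypothetical disk cache directory
//   cfg.config_path = "/tmp/kvc2_configs";         // hypothetical config directory
//   cfg.gpu_cache_config = GPUPageCacheConfig{};   // fill in layer/head/page sizes as needed
//   auto kvc2 = create_kvc2(cfg);
//   kvc2->load();
//   auto handle = kvc2->lookup_to_gpu("some-model", "some-quant", tokens.data(),
//                                     tokens.size(), /*estimated_length=*/tokens.size() + 1024);
//   if (handle) {
//     auto matched = handle->matched_length();     // reusable prefix length, in tokens
//     auto pages = handle->get_gpu_block_idx();    // GPU page indices backing the handle
//   }
//   kvc2->save();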
import torch
import ctypes
def aligned_tensor(size, alignment=4096):
num_bytes = size
mem = ctypes.c_void_p()
error_code = ctypes.CDLL(None).posix_memalign(
ctypes.byref(mem), ctypes.c_size_t(alignment), ctypes.c_size_t(num_bytes)
)
if error_code != 0:
raise MemoryError(f"posix_memalign failed with error code {error_code}")
array_type = (ctypes.c_int8 * size)
raw_array = array_type.from_address(mem.value)
tensor = torch.frombuffer(raw_array, dtype=torch.int8)
if tensor.data_ptr() % alignment != 0:
raise ValueError(f"Tensor data_ptr {tensor.data_ptr()} is not aligned to {alignment} bytes")
return tensor, mem
def alloc_aligned_cache(layer_count,block_count,element_size):
cache = []
cache_mem = []
for i in range(layer_count):
layer_data = []
layer_mem = []
for j in range(block_count):
tensor, mem_ptr = aligned_tensor(element_size, alignment=4096)
layer_data.append(tensor)
layer_mem.append(mem_ptr)
cache.append(layer_data)
cache_mem.append(layer_mem)
return cache,cache_mem
def dealloc_aligned_cache(cache_mem):
for layer_mem in cache_mem:
for mem_ptr in layer_mem:
ctypes.CDLL(None).free(mem_ptr)
def get_tensor_ptr(tensors):
tensor_ptr = []
for layer in tensors:
layer_ptr = []
for data in layer:
layer_ptr.append(data.data_ptr())
tensor_ptr.append(layer_ptr)
return tensor_ptr
def get_tensor_from_data_ptr(matched_data,element_size):
re = []
for layer in matched_data:
re_layer = []
for data_ptr in layer:
array_type = (ctypes.c_int8 * element_size)
raw_array = array_type.from_address(data_ptr)
tensor = torch.frombuffer(raw_array, dtype=torch.int8)
re_layer.append(tensor)
re.append(re_layer)
return re
if __name__ == "__main__":
pass
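    # Minimal smoke test (illustrative only): allocate a small aligned cache, check the
    # 4096-byte alignment of every block, then free the raw buffers.
    cache, cache_mem = alloc_aligned_cache(layer_count=2, block_count=3, element_size=4096)
    for layer in get_tensor_ptr(cache):
        for ptr in layer:
            assert ptr % 4096 == 0
    dealloc_aligned_cache(cache_mem)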
#include "metrics.h"
namespace kvc2 {
Metrics::Metrics(const MetricsConfig& config)
: registry_(std::make_shared<prometheus::Registry>()), exposer_(config.endpoint) {
// Register the prefix_nodes Counter
auto& prefix_nodes_family = prometheus::BuildCounter()
.Name(std::string(METRIC_PREFIX) + "_prefix_nodes")
.Help("Number of prefix nodes")
.Register(*registry_);
prefix_nodes = &prefix_nodes_family.Add({});
// Register the prefix_block_count Counter
auto& prefix_block_count_family = prometheus::BuildCounter()
.Name(std::string(METRIC_PREFIX) + "_prefix_block_count")
.Help("Number of prefix blocks")
.Register(*registry_);
prefix_block_count = &prefix_block_count_family.Add({});
// Use a common set of histogram buckets, up to 10000 ms (10 s)
std::vector<double> common_buckets = {1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0, 5000.0, 10000.0};
// Register the raw_insert_time_ms Histogram
auto& raw_insert_time_ms_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_raw_insert_time_ms")
.Help("function raw insert's time in milliseconds")
.Register(*registry_);
raw_insert_time_ms = &raw_insert_time_ms_family.Add({}, common_buckets);
// Register the lookup_time_ms Histogram
auto& lookup_time_ms_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_lookup_time_ms")
.Help("function lookup's time in milliseconds")
.Register(*registry_);
lookup_time_ms = &lookup_time_ms_family.Add({}, common_buckets);
// Register the lookup_prefixmatch_length Histogram
auto& lookup_prefixmatch_length_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_lookup_prefixmatch_length")
.Help("function lookup's prefix match length")
.Register(*registry_);
lookup_prefixmatch_length = &lookup_prefixmatch_length_family.Add({}, common_buckets);
// Register the matched_length_percentage Histogram
auto& matched_length_percentage_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_matched_length_percentage")
.Help("function matched length percentage")
.Register(*registry_);
matched_length_percentage = &matched_length_percentage_family.Add({}, common_buckets);
// Register the disk_usage Gauge
auto& disk_usage_family =
prometheus::BuildGauge().Name(std::string(METRIC_PREFIX) + "_disk_usage").Help("disk usage").Register(*registry_);
disk_usage = &disk_usage_family.Add({});
// Register the memory_pool_size Gauge
memory_pool_size_family_ = &prometheus::BuildGauge()
.Name(std::string(METRIC_PREFIX) + "_memory_pool_size")
.Help("memory pool size")
.Register(*registry_);
// Register the memory_pool_node_count Gauge
memory_pool_node_count_family_ = &prometheus::BuildGauge()
.Name(std::string(METRIC_PREFIX) + "_memory_pool_node_count")
.Help("memory pool node count")
.Register(*registry_);
// Register the lru_entry_count Gauge
lru_entry_count_family_ = &prometheus::BuildGauge()
.Name(std::string(METRIC_PREFIX) + "_lru_entry_count")
.Help("lru entry count")
.Register(*registry_);
// Register the gpu_page_count Gauge
gpu_page_count_family_ = &prometheus::BuildGauge()
.Name(std::string(METRIC_PREFIX) + "_gpu_page_count")
.Help("gpu page count")
.Register(*registry_);
// Register the append_tokens_time_ms Histogram
auto& append_tokens_time_ms_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_append_tokens_time_ms")
.Help("append tokens time in milliseconds")
.Register(*registry_);
append_tokens_time_ms = &append_tokens_time_ms_family.Add({}, common_buckets);
// Register the gpu_flush_back_time_ms Histogram
auto& gpu_flush_back_time_ms_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_gpu_flush_back_time_ms")
.Help("gpu flush back time in milliseconds")
.Register(*registry_);
gpu_flush_back_time_ms = &gpu_flush_back_time_ms_family.Add({}, common_buckets);
// Register the cpu_flush_back_time_ms Histogram
auto& cpu_flush_back_time_ms_family = prometheus::BuildHistogram()
.Name(std::string(METRIC_PREFIX) + "_cpu_flush_back_time_ms")
.Help("cpu flush back time in milliseconds")
.Register(*registry_);
cpu_flush_back_time_ms = &cpu_flush_back_time_ms_family.Add({}, common_buckets);
exposer_.RegisterCollectable(registry_);
}
// Destructor
Metrics::~Metrics() {
// Stop exposing metrics
// exposer_.Stop();
}
// Get the memory_pool_size metric
prometheus::Gauge* Metrics::memory_pool_size(const std::string& type) {
return &memory_pool_size_family_->Add({{"type", type}});
}
// Get the memory_pool_node_count metric
prometheus::Gauge* Metrics::memory_pool_node_count(const std::string& type) {
return &memory_pool_node_count_family_->Add({{"type", type}});
}
// Get the lru_entry_count metric
prometheus::Gauge* Metrics::lru_entry_count(const std::string& type) {
return &lru_entry_count_family_->Add({{"type", type}});
}
// Get the gpu_page_count metric
prometheus::Gauge* Metrics::gpu_page_count(std::string type) {
return &gpu_page_count_family_->Add({{"type", type}});
}
TimeObserver::TimeObserver(prometheus::Histogram* h) {
histogram_ = h;
timer_.start();
}
TimeObserver::~TimeObserver() {
timer_.stop();
histogram_->Observe(timer_.elapsedNs() / 1e6); // ns -> ms
}
} // namespace kvc2
#pragma once
#include "prometheus/counter.h"
#include "prometheus/exposer.h"
#include "prometheus/gauge.h"
#include "prometheus/histogram.h"
#include "prometheus/registry.h"
#include <atomic>
#include <chrono>
#include <memory>
#include <string>
#include <thread>
#include <vector>
#include "utils/timer.hpp"
namespace kvc2 {
// Metric name prefix macro
#define METRIC_PREFIX "kvc2"
struct MetricsConfig {
std::string endpoint; // listen endpoint, e.g. "0.0.0.0:8080"
};
class Metrics {
public:
// The constructor takes a MetricsConfig
Metrics(const MetricsConfig& config);
~Metrics();
// Non-copyable and non-assignable
Metrics(const Metrics&) = delete;
Metrics& operator=(const Metrics&) = delete;
// Metric pointers
prometheus::Counter* prefix_nodes;
prometheus::Counter* prefix_block_count;
prometheus::Histogram* raw_insert_time_ms;
prometheus::Histogram* lookup_time_ms;
prometheus::Histogram* lookup_prefixmatch_length;
prometheus::Histogram* matched_length_percentage;
prometheus::Gauge* disk_usage;
prometheus::Gauge* memory_pool_size(const std::string& type);
prometheus::Gauge* memory_pool_node_count(const std::string& type);
prometheus::Gauge* lru_entry_count(const std::string& type);
prometheus::Gauge* gpu_page_count(std::string type);
prometheus::Histogram* append_tokens_time_ms;
prometheus::Histogram* gpu_flush_back_time_ms;
prometheus::Histogram* cpu_flush_back_time_ms;
private:
std::shared_ptr<prometheus::Registry> registry_;
prometheus::Exposer exposer_;
prometheus::Family<prometheus::Gauge>* memory_pool_size_family_;
prometheus::Family<prometheus::Gauge>* memory_pool_node_count_family_;
prometheus::Family<prometheus::Gauge>* lru_entry_count_family_;
prometheus::Family<prometheus::Gauge>* gpu_page_count_family_;
};
class TimeObserver {
public:
TimeObserver(prometheus::Histogram* h);
~TimeObserver();
private:
Timer timer_;
prometheus::Histogram* histogram_;
};
} // namespace kvc2
#ifndef __MODEL_CONFIG_HPP_
#define __MODEL_CONFIG_HPP_
#include <iostream>
#include "nlohmann/json.hpp"
#include <filesystem>
#include <fstream>
using DimSize = size_t;
using URL = std::string;
using ModelName = std::string;
// We must assure this can be load by config.json
class ModelConfig {
public:
DimSize hidden_size;
DimSize intermediate_size;
size_t max_position_embeddings;
std::string model_type;
size_t num_attention_heads;
size_t num_hidden_layers;
size_t num_key_value_heads;
size_t vocab_size;
NLOHMANN_DEFINE_TYPE_INTRUSIVE(ModelConfig, hidden_size, intermediate_size, max_position_embeddings, model_type,
num_attention_heads, num_hidden_layers, num_key_value_heads, vocab_size);
void load_from(std::filesystem::path path) {
std::ifstream i(path);
nlohmann::json j;
i >> j;
*this = j.get<ModelConfig>();
}
};
using QuantType = std::string;
static const QuantType NoQuantType = "";
class QuantConfig {
public:
QuantType name;
// For GEMV
QuantType type_of_dot_vector = NoQuantType;
inline bool can_be_used_as_matrix() { return type_of_dot_vector != NoQuantType; }
bool can_be_used_as_vector;
double bytes_per_element;
bool has_scale;
bool has_min;
size_t block_element_count;
size_t block_element_size;
URL reference = "";
NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(QuantConfig, name, type_of_dot_vector, can_be_used_as_vector,
bytes_per_element, has_scale, has_min, block_element_count,
block_element_size, reference);
};
inline std::map<QuantType, QuantConfig> quant_configs;
inline std::map<ModelName, ModelConfig> model_configs;
inline void load_quant_configs(std::filesystem::path path) {
std::cout << __FUNCTION__ << " from " << path << std::endl;
std::ifstream i(path);
nlohmann::json j;
i >> j;
quant_configs = j.get<std::map<QuantType, QuantConfig>>();
std::cout << "Loaded Quant Configs" << std::endl;
for (auto& [k, v] : quant_configs) {
std::cout << " - " << k << std::endl;
}
}
inline void dump_quant_configs(std::filesystem::path path) {
std::ofstream o(path);
nlohmann::json j = quant_configs;
o << j.dump(4);
}
inline void load_model_configs(std::filesystem::path path) {
std::cout << __FUNCTION__ << " from " << path << std::endl;
std::ifstream i(path);
nlohmann::json j;
i >> j;
model_configs = j.get<std::map<ModelName, ModelConfig>>();
std::cout << "Loaded Model Configs" << std::endl;
for (auto& [k, v] : model_configs) {
std::cout << " - " << k << std::endl;
}
}
inline void dump_model_configs(std::filesystem::path path) {
std::ofstream o(path);
nlohmann::json j = model_configs;
o << j.dump(4);
}
#endif
#include "page_aligned_memory_pool.h"
#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_DEBUG
#define FMT_HEADER_ONLY
#include "spdlog/spdlog.h"
#include "utils/arithmetic.hpp"
#include "utils/easy_format.hpp"
/// Constructor
PageAlignedMemoryPool::PageAlignedMemoryPool(size_t size_in_bytes) {
total_size = (size_in_bytes / PageSize) * PageSize;
// Aligned allocation using the C++17 aligned operator new; use another method if the compiler does not support it
data = ::operator new[](total_size, std::align_val_t(PageSize));
total_pages = total_size / PageSize;
assert(total_pages >= Blocks);
page_per_block = total_pages / Blocks;
for (size_t block_index = 0; block_index < Blocks; block_index ++) {
first_page[block_index] = reinterpret_cast<void*>(reinterpret_cast<intptr_t>(data) + static_cast<intptr_t>(block_index) * page_per_block * PageSize);
count_page[block_index] =
block_index == Blocks - 1 ? (total_pages - page_per_block * (Blocks - 1)) : page_per_block;
SPDLOG_DEBUG("first_page[{}] = {}, count_page[{}] = {}",
block_index, reinterpret_cast<intptr_t>(first_page[block_index]) - reinterpret_cast<intptr_t>(data),
block_index, count_page[block_index]);
bitmap[block_index].resize(count_page[block_index], 0);
}
SPDLOG_INFO("PageAlignedMemoryPool with size {} Mbytes, {} pages", total_size / (1 << 20), page_count());
}
/// Destructor
PageAlignedMemoryPool::~PageAlignedMemoryPool() {
if (data) {
// Note: must match the alignment used at allocation time
::operator delete[](data, std::align_val_t(PageSize));
data = nullptr;
}
}
/// Returns the total number of pages
size_t PageAlignedMemoryPool::page_count() {
return total_size / PageSize;
}
/// Returns the size in bytes rounded up to whole pages
size_t PageAlignedMemoryPool::page_padded_size(size_t size) {
return div_up(size, PageSize) * PageSize;
}
void* PageAlignedMemoryPool::alloc_in_block(size_t block_index, size_t alloc_size) {
std::lock_guard<std::mutex> guard(lock[block_index]);
size_t free_pages = 0;
for (size_t i = 0; i < count_page[block_index]; i++) {
if (bitmap[block_index][i] == 0) {
free_pages ++;
if (free_pages == alloc_size) {
size_t page_index = i + 1 - free_pages;
for (size_t page = page_index; page < page_index + alloc_size; page++) {
bitmap[block_index][page] = 1;
// SPDLOG_DEBUG("alloc page {} in block {}", page, block_index);
}
return reinterpret_cast<void*>(reinterpret_cast<intptr_t>(first_page[block_index]) + page_index * PageSize);
}
} else {
free_pages = 0;
}
}
return nullptr;
}
/// Allocation
void* PageAlignedMemoryPool::alloc(size_t size) {
size_t alloc_size = div_up(size, PageSize);
auto cnt = now_block.fetch_add(1, std::memory_order_relaxed);
for (size_t i = 0; i < Blocks; i ++) {
auto result = alloc_in_block((i + cnt) % Blocks, alloc_size);
if (result != nullptr) {
allocated.fetch_add(alloc_size * PageSize, std::memory_order_relaxed);
alloc_count.fetch_add(1, std::memory_order_relaxed);
return result;
}
}
return nullptr;
}
/// Deallocation
void PageAlignedMemoryPool::free(void* p, size_t size) {
auto alloc_size = div_up(size, PageSize);
size_t block_index = (reinterpret_cast<intptr_t>(p) - reinterpret_cast<intptr_t>(data)) / page_per_block / PageSize;
size_t page_index = (reinterpret_cast<intptr_t>(p) - reinterpret_cast<intptr_t>(first_page[block_index])) / PageSize;
std::lock_guard<std::mutex> guard(lock[block_index]);
for (size_t page = page_index; page < page_index + alloc_size; page++)
bitmap[block_index][page] = 0;
allocated.fetch_sub(alloc_size * PageSize, std::memory_order_relaxed);
free_count.fetch_add(1, std::memory_order_relaxed);
}
// TODO: too slow
std::vector<void*> PageAlignedMemoryPool::alloc_multiple(size_t size, size_t count) {
std::vector<void*> result;
for (size_t i = 0; i < count; i++) {
auto p = alloc(size);
if (p == nullptr) {
for (auto ptr : result) {
free(ptr, size);
}
return {};
}
result.push_back(p);
}
return result;
}
void PageAlignedMemoryPool::defragment() {}
/// Debug print
std::string PageAlignedMemoryPool::debug() {
return fmt::format("PageAlignedMemoryPool: total_size: {}MB, allocated: {}, alloc/free count: {}/{}\n",
readable_number(total_size), readable_number(size_t(allocated)), size_t(alloc_count), size_t(free_count));
}
#pragma once
#include <algorithm> // std::sort
#include <cstddef> // size_t
#include <mutex> // std::mutex
#include <vector>
#include <assert.h>
#include <bitset>
#include <atomic>
constexpr size_t PageSize = 4096;
/// Declaration of the PageAlignedMemoryPool class
struct PageAlignedMemoryPool {
private:
constexpr static size_t Blocks = 16;
void* data = nullptr;
size_t total_size = 0, total_pages = 0;
std::atomic_size_t now_block = 0;
std::atomic_size_t allocated = 0; // allocated_size
std::atomic_size_t alloc_count = 0;
std::atomic_size_t free_count = 0;
std::mutex lock[Blocks];
size_t page_per_block = 0;
void *first_page[Blocks];
size_t count_page[Blocks];
std::vector<int8_t> bitmap[Blocks];
void* alloc_in_block(size_t block_index, size_t alloc_size);
public:
/// Constructor and destructor
explicit PageAlignedMemoryPool(size_t size_in_bytes);
~PageAlignedMemoryPool();
/// Copy and move are disabled
PageAlignedMemoryPool(PageAlignedMemoryPool&& other) = delete;
PageAlignedMemoryPool& operator=(PageAlignedMemoryPool&& other) = delete;
PageAlignedMemoryPool(const PageAlignedMemoryPool& other) = delete;
PageAlignedMemoryPool& operator=(const PageAlignedMemoryPool& other) = delete;
/// Member functions
size_t page_count();
size_t page_padded_size(size_t size);
void* alloc(size_t size);
std::vector<void*> alloc_multiple(size_t size, size_t count);
void free(void* data, size_t size);
void defragment();
std::string debug();
};
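// Illustrative usage (sizes are arbitrary):
//
//   PageAlignedMemoryPool pool(64ull << 20);   // 64 MiB pool of 4 KiB pages
//   void* p = pool.alloc(3 * PageSize + 1);    // rounded up to 4 pages; nullptr if full
//   if (p != nullptr) {
//     // ... use the page-aligned buffer ...
//     pool.free(p, 3 * PageSize + 1);          // pass the same size that was used for alloc
//   }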
#pragma once
#include "easy_format.hpp"
#include "timer.hpp"
#include <memory>
#include <type_traits>
template <typename T, typename U>
T div_up(T x, U by) {
static_assert(std::is_integral_v<T>);
static_assert(std::is_integral_v<U>);
return (x + by - 1) / by;
}
template <typename T>
T* offset_by_bytes(T* t, size_t n) {
return reinterpret_cast<T*>(reinterpret_cast<size_t>(t) + n);
}
#ifndef __EASY_FORMAT_HPP_
#define __EASY_FORMAT_HPP_
#include <array>
#include <iomanip>
#include <sstream>
#include <string>
#include <vector>
template <typename T>
inline std::string format_vector(const std::vector<T>& v) {
std::ostringstream oss;
if (v.empty())
return "[]";
for (size_t i = 0; i < v.size(); ++i) {
oss << v[i];
if (i < v.size() - 1)
oss << ", "; // 逗号分隔
}
return oss.str();
}
inline std::array<std::string, 7> units = {"", "K", "M", "G", "T", "P", "E"};
inline std::string readable_number(size_t size) {
size_t unit_index = 0;
double readable_size = size;
while (readable_size >= 1000 && unit_index < units.size() - 1) {
readable_size /= 1000;
unit_index++;
}
std::ostringstream ss;
ss << std::fixed << std::setprecision(2) << readable_size;
std::string str = ss.str();
return str + "" + units[unit_index];
}
#endif
#include <atomic>
#include <future>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>
template <typename T>
class MPSCQueue {
struct Node {
std::shared_ptr<T> data;
std::atomic<Node*> next;
Node() : next(nullptr) {}
Node(std::shared_ptr<T> data_) : data(std::move(data_)), next(nullptr) {}
};
std::atomic<Node*> head;
Node* tail;
public:
std::atomic_size_t enqueue_count = 0;
size_t dequeue_count = 0;
MPSCQueue() {
Node* dummy = new Node();
head.store(dummy, std::memory_order_relaxed);
tail = dummy;
}
~MPSCQueue() {
// Clean up remaining nodes
Node* node = tail;
while (node) {
Node* next = node->next.load(std::memory_order_relaxed);
delete node;
node = next;
}
}
// Called by producers
void enqueue(std::shared_ptr<T> data) {
enqueue_count.fetch_add(1);
Node* node = new Node(std::move(data));
Node* prev_head = head.exchange(node, std::memory_order_acq_rel);
prev_head->next.store(node, std::memory_order_release);
}
// Called by the consumer
std::shared_ptr<T> dequeue() {
Node* next = tail->next.load(std::memory_order_acquire);
if (next) {
std::shared_ptr<T> res = std::move(next->data);
delete tail;
tail = next;
dequeue_count += 1;
return res;
}
return nullptr;
}
};
#include <atomic>
#include <cassert>
#include <iostream>
#include <optional>
#include <semaphore>
template <typename T>
class MPSCQueue {
struct Node {
T data;
std::atomic<Node*> next;
Node() : next(nullptr) {}
Node(T data_) : data(std::move(data_)), next(nullptr) {}
};
std::atomic<Node*> head;
Node* tail;
public:
std::atomic_size_t enqueue_count = 0;
size_t dequeue_count = 0;
MPSCQueue() {
Node* dummy = new Node();
head.store(dummy, std::memory_order_seq_cst);
tail = dummy;
}
~MPSCQueue() {
Node* node = tail;
while (node) {
Node* next = node->next.load(std::memory_order_seq_cst);
delete node;
node = next;
}
}
// Called by producers
void enqueue(T data) {
enqueue_count.fetch_add(1);
Node* node = new Node(std::move(data));
Node* prev_head = head.exchange(node, std::memory_order_seq_cst);
prev_head->next.store(node, std::memory_order_seq_cst);
}
// Called by the consumer
std::optional<T> dequeue() {
Node* next = tail->next.load(std::memory_order_seq_cst);
if (next) {
T res = std::move(next->data);
delete tail;
tail = next;
dequeue_count += 1;
return res;
}
return std::nullopt;
}
size_t size() { return enqueue_count.load() - dequeue_count; }
};
template <typename T>
class MPSCQueueConsumerLock {
MPSCQueue<T> queue;
std::counting_semaphore<> sema{0};
public:
void enqueue(T data) {
queue.enqueue(std::move(data));
// std::atomic_thread_fence(std::memory_order_seq_cst); // might be needed if the memory
// ordering here turns out to be wrong; not entirely sure about this.
sema.release();
}
T dequeue() {
auto re = queue.dequeue();
if (re.has_value()) {
while (sema.try_acquire() == false) {
std::cerr << __FILE__ << ":" << __FUNCTION__ << " sema try acquire should be success, retrying, please check"
<< std::endl;
// assert(false);
}
return re.value();
}
sema.acquire();
return queue.dequeue().value();
}
size_t size() { return queue.size(); }
};
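// Illustrative usage: many producers enqueue, a single consumer blocks on dequeue.
//
//   MPSCQueueConsumerLock<int> q;
//   std::thread producer([&q] { q.enqueue(42); });
//   int v = q.dequeue();   // blocks on the semaphore until an item is available
//   producer.join();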