#include "paged_compiler.hpp"

// NOTE(review): the original second #include lost its target during text
// extraction; <vector>/<cstdint>/<cstddef>/<memory>/<tuple> are what this
// translation unit actually uses — confirm against the original file.
#include <cstddef>
#include <cstdint>
#include <memory>
#include <tuple>
#include <vector>

namespace {
// TODO: replace with Tensor::zeros when it is available.
// Fills the tensor's device memory with zero bytes via a host->device copy.
inline void set_zeros(infinicore::Tensor &tensor) {
    std::vector<uint8_t> zeros(tensor->nbytes(), 0);
    infinicore::context::memcpyH2D(tensor->data(), zeros.data(), tensor->nbytes(), false);
}
} // namespace

namespace infinilm::engine {

/// Pre-computes the candidate decode batch sizes for which execution graphs
/// will be captured. Small sizes are enumerated densely, larger ones with
/// growing strides to bound the number of captured graphs:
///   1..31 step 1, 32..56 step 8, 64..112 step 16, 128..224 step 32, 256..512 step 64.
// NOTE(review): the shared_ptr's template argument was lost in extraction;
// InfinilmModel is inferred from model_->forward(InfinilmModel::Input) usage
// below — confirm against paged_compiler.hpp.
PagedCompiler::PagedCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier)
    : GraphCompiler(model, barrier) {
    for (size_t b = 1; b < 32; b++) {
        decode_batch_sizes_.push_back(b);
    }
    for (size_t b = 32; b < 64; b += 8) {
        decode_batch_sizes_.push_back(b);
    }
    for (size_t b = 64; b < 128; b += 16) {
        decode_batch_sizes_.push_back(b);
    }
    for (size_t b = 128; b < 256; b += 32) {
        decode_batch_sizes_.push_back(b);
    }
    for (size_t b = 256; b <= 512; b += 64) {
        decode_batch_sizes_.push_back(b);
    }
}

/// Captures one execution graph per candidate decode batch size.
///
/// For each batch size `b`, dummy input tensors are allocated and zeroed, a
/// model forward pass over them is recorded into a graph, and the captured
/// (inputs, graph, output) triple is stored in compiled_map_decode_ for
/// replay by get_compiled(). Does nothing when the model's cache config is
/// not a paged config.
void PagedCompiler::compile() {
    // NOTE(review): the dynamic_cast's target type was lost in extraction;
    // PagedCacheConfig is inferred from the num_blocks()/block_size() calls —
    // confirm the concrete type name against the cache-config headers.
    const auto *paged_config = dynamic_cast<const PagedCacheConfig *>(model_->get_cache_config());
    if (paged_config == nullptr) {
        return; // Not a paged cache: nothing to pre-compile.
    }

    const size_t nblocks = paged_config->num_blocks();
    // Block granularity of the paged KV cache (e.g. 16 or 32 tokens/block).
    const int block_size = paged_config->block_size();

    compiled_map_decode_.clear();

    // One shared zero-initialized holder sized for all nblocks; each batch
    // size views it as a {b, nblocks / b} strided block table below.
    block_tables_holder_ = infinicore::Tensor::empty(
        {nblocks}, infinicore::DataType::I32, infinicore::context::getDevice());
    set_zeros(block_tables_holder_);

    for (size_t b : decode_batch_sizes_) {
        size_t block_per_req = nblocks / b;

        InfinilmModel::Input input;
        input.input_ids = infinicore::Tensor::empty(
            {1, b}, infinicore::DataType::I64, infinicore::context::getDevice());
        input.position_ids = infinicore::Tensor::empty(
            {b}, infinicore::DataType::I64, infinicore::context::getDevice());
        input.total_sequence_lengths = infinicore::Tensor::empty(
            {b}, infinicore::DataType::I32, infinicore::context::getDevice());
        set_zeros(input.input_ids.value());
        set_zeros(input.position_ids.value());
        set_zeros(input.total_sequence_lengths.value());

        // Dummy lengths: every request has exactly one token in the cache.
        std::vector<int32_t> total_sequence_lengths_vec(b, 1);
        infinicore::context::memcpyH2D(input.total_sequence_lengths.value()->data(),
                                       total_sequence_lengths_vec.data(),
                                       b * sizeof(int32_t), false);

        // Offsets [0, 1, ..., b]: one new token per request (pure decode).
        input.input_offsets = infinicore::Tensor::empty(
            {b + 1}, infinicore::DataType::I32, infinicore::context::getDevice());
        std::vector<int32_t> input_offsets_vec(b + 1, 0);
        for (size_t i = 0; i <= b; i++) {
            input_offsets_vec[i] = static_cast<int32_t>(i);
        }
        infinicore::context::memcpyH2D(input.input_offsets.value()->data(),
                                       input_offsets_vec.data(),
                                       (b + 1) * sizeof(int32_t), false);

        // cu_seqlens mirrors input_offsets for the single-token decode case.
        input.cu_seqlens = infinicore::Tensor::empty(
            {b + 1}, infinicore::DataType::I32, infinicore::context::getDevice());
        infinicore::context::memcpyH2D(input.cu_seqlens.value()->data(),
                                       input_offsets_vec.data(),
                                       (b + 1) * sizeof(int32_t), false);

        // View the shared holder as a {b, block_per_req} block table.
        input.block_tables = block_tables_holder_->as_strided(
            {b, block_per_req}, {(ptrdiff_t)block_per_req, 1});

        input.slot_mapping = infinicore::Tensor::empty(
            {b}, infinicore::DataType::I64, infinicore::context::getDevice());
        set_zeros(input.slot_mapping.value());

        // Physical limits derived directly from the dummy tensor shapes:
        // 1. For varlen attention, the max query length per request is
        //    input_ids' dim-0 extent (1 here: one token per decode step). If
        //    a prefill path allocated {seq_len, b} instead, this would adapt
        //    automatically.
        input.max_seqlen_q = input.input_ids.value()->size(0);
        // 2. Hard upper bound on the key length: blocks per request times
        //    tokens per block.
        input.max_seqlen_k = block_per_req * block_size;

        // Record the forward pass into a graph; the barriers keep all ranks
        // in lock-step around the capture window.
        barrier_->wait();
        infinicore::context::startGraphRecording();
        auto output = model_->forward(input);
        auto graph = infinicore::context::stopGraphRecording();
        barrier_->wait();

        auto shared_output = std::shared_ptr<InfinilmModel::Output>(
            new InfinilmModel::Output{infinicore::graph::GraphTensor(output.logits)});
        compiled_map_decode_[b] = CompiledResult{std::move(input),
                                                 std::make_tuple(graph, shared_output)};
    }
}

/// Looks up a pre-compiled decode graph matching the batch size of `input`.
///
/// On a hit, copies the runtime input tensors into the graph's captured
/// input tensors and returns {graph, output}; on a miss — or when the batch
/// is not decode-only, or the cache is not paged — returns {nullptr, nullptr}
/// so the caller can fall back to eager execution.
PagedCompiler::Compiled PagedCompiler::get_compiled(const InfinilmModel::Input &input) {
    // dynamic_cast of a null pointer yields null, so one checked cast covers
    // both the "no cache config" and "not a paged config" cases.
    // NOTE(review): cast target reconstructed as PagedCacheConfig — confirm.
    if (dynamic_cast<const PagedCacheConfig *>(model_->get_cache_config()) == nullptr) {
        return {nullptr, nullptr};
    }

    size_t batch_size = input.block_tables.value()->size(0);
    size_t block_per_req = input.block_tables.value()->size(1);

    // Only decode-only batches are supported: exactly one token per request,
    // i.e. input_ids is {1, batch_size}.
    if (batch_size != input.input_ids.value()->size(1)) {
        return {nullptr, nullptr};
    }

    auto result = compiled_map_decode_.find(batch_size);
    if (result == compiled_map_decode_.end()) {
        return {nullptr, nullptr};
    }

    // Refresh the captured input tensors with the caller's runtime values.
    auto &graph_input = result->second.input;
    graph_input.input_ids.value()->copy_from(input.input_ids.value());
    graph_input.position_ids.value()->copy_from(input.position_ids.value());
    graph_input.total_sequence_lengths.value()->copy_from(input.total_sequence_lengths.value());
    graph_input.input_offsets.value()->copy_from(input.input_offsets.value());
    graph_input.cu_seqlens.value()->copy_from(input.cu_seqlens.value());
    // The captured table may be wider than the caller's; narrow dim 1 to the
    // caller's block_per_req before copying.
    graph_input.block_tables.value()->narrow({{1, 0, block_per_req}})->copy_from(input.block_tables.value());
    graph_input.slot_mapping.value()->copy_from(input.slot_mapping.value());

    auto graph = std::get<0>(result->second.compiled);
    auto shared_output = std::shared_ptr<InfinilmModel::Output>(new InfinilmModel::Output{
        std::get<1>(result->second.compiled)->logits->resume_from_blob_()});
    return std::make_tuple(graph, shared_output);
}

} // namespace infinilm::engine