Commit 098fab15 authored by yaoht

Adapt to DCU

parent cfe4b1a8
Pipeline #3511 failed
#include "paged_compiler.hpp"
#include <spdlog/spdlog.h>
namespace {
// Todo: replace with Tensor::zeros when it is available
inline void set_zeros(infinicore::Tensor &tensor) {
......@@ -31,6 +33,11 @@ PagedCompiler::PagedCompiler(const std::shared_ptr<InfinilmModel> &model, RankBa
void PagedCompiler::compile() {
if (model_->get_cache_config() != nullptr && dynamic_cast<const cache::PagedKVCacheConfig *>(model_->get_cache_config())) {
size_t nblocks = dynamic_cast<const cache::PagedKVCacheConfig *>(model_->get_cache_config())->num_blocks();
// Read the KV block size from the paged-cache config (e.g. 16 or 32).
int block_size = dynamic_cast<const cache::PagedKVCacheConfig *>(model_->get_cache_config())->block_size();
size_t max_batch_size = *std::max_element(decode_batch_sizes_.begin(), decode_batch_sizes_.end());
compiled_map_decode_.clear();
block_tables_holder_ = infinicore::Tensor::empty(
......@@ -59,6 +66,15 @@ void PagedCompiler::compile() {
input.slot_mapping = infinicore::Tensor::empty({b}, infinicore::DataType::I64, infinicore::context::getDevice());
set_zeros(input.slot_mapping.value());
// Derive the physical upper bounds directly from the dummy tensors' shapes.
// 1. For varlen prefill, the max query length is just the total token count of
//    input_ids (here 1*b, i.e. b). If the prefill loop allocates input_ids as
//    {seq_len, b}, this line still adapts automatically.
input.max_seqlen_q = input.input_ids.value()->size(0); // assumes dim 0 is seq_len, dim 1 is batch
// 2. Hard safe bound for max_seqlen_k = blocks allocated per request * tokens per block.
input.max_seqlen_k = block_per_req * block_size;
barrier_->wait();
infinicore::context::startGraphRecording();
auto output = model_->forward(input);
......
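As a side note (not part of this diff), a tiny Python sketch of the bound arithmetic the compiler relies on when it builds the dummy prefill input for graph capture; all numbers and names below are illustrative assumptions:

    # assumed: b captured requests, block_per_req KV blocks reserved per request
    b, block_per_req, block_size = 4, 17, 16
    max_seqlen_q = 1 * b                        # the dummy input_ids hold one token per request here
    max_seqlen_k = block_per_req * block_size   # 17 * 16 = 272 tokens is the most K the kernel may address
    print(max_seqlen_q, max_seqlen_k)           # -> 4 272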
#include "static_batching_compiler.hpp"
#include "../../cache/cache.hpp"
#include <spdlog/spdlog.h>
namespace infinilm::engine {
StaticBatchingCompiler::StaticBatchingCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier)
......
#include "infer_engine.hpp"
#include "../cache/cache.hpp"
#include "spdlog/spdlog.h"
#include <algorithm>
namespace infinilm::engine {
//------------------------------------------------------
......@@ -109,14 +112,14 @@ std::vector<std::unordered_map<std::string, infinicore::nn::Parameter>> InferEng
// forward
//------------------------------------------------------
infinilm::InfinilmModel::Input
InferEngine::Input::to_model_input(infinicore::Device device) const {
InferEngine::Input::to_model_input(infinicore::Device device, const cache::CacheConfig *cache_config) const {
auto to_device = [&](const std::optional<infinicore::Tensor> &t)
-> std::optional<infinicore::Tensor> {
return t.has_value() ? t.value()->to(device) : t;
};
return {
InfinilmModel::Input out{
to_device(input_ids), // @todo: on device in the future
to_device(position_ids),
to_device(past_sequence_lengths), // @todo: on device in the future
......@@ -126,6 +129,27 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
to_device(block_tables),
to_device(slot_mapping),
};
// Paged prefill FA: bounds without reading GPU tensor payloads (shape / cache config only).
if (cache_config != nullptr) {
if (const auto *paged = dynamic_cast<const cache::PagedKVCacheConfig *>(cache_config)) {
if (out.block_tables.has_value()) {
out.max_seqlen_k = static_cast<int>(out.block_tables.value()->size(1) * paged->block_size());
}
if (out.slot_mapping.has_value()) {
out.max_seqlen_q = static_cast<int>(out.slot_mapping.value()->size(0));
} else if (out.input_ids.has_value()) {
const auto &sh = out.input_ids.value()->shape();
if (sh.size() == 1) {
out.max_seqlen_q = static_cast<int>(sh[0]);
} else if (sh.size() >= 2) {
out.max_seqlen_q = static_cast<int>(std::max(sh[0], sh[1]));
}
}
}
}
return out;
}
InferEngine::Output InferEngine::forward(const InferEngine::Input &input) {
......
......@@ -47,7 +47,11 @@ RankWorker::RankWorker(const InfinilmModel::Config &model_config,
// Wait until the worker thread finishes initialization (model created)
std::unique_lock<std::mutex> lk(mutex_);
cv_.wait(lk, [&] { return init_done_; });
cv_.wait(lk, [&] { return init_done_ || should_exit_; });
if (should_exit_) {
thread_.join();
throw std::runtime_error("RankWorker initialization failed (see error above)");
}
}
RankWorker::RankWorker(
......@@ -75,7 +79,11 @@ RankWorker::RankWorker(
thread_ = std::thread(&RankWorker::thread_loop, this);
// Wait until the worker thread finishes initialization (model created)
std::unique_lock<std::mutex> lk(mutex_);
cv_.wait(lk, [&] { return init_done_; });
cv_.wait(lk, [&] { return init_done_ || should_exit_; });
if (should_exit_) {
thread_.join();
throw std::runtime_error("RankWorker initialization failed (see error above)");
}
}
std::string RankWorker::info() const {
......@@ -327,7 +335,8 @@ void RankWorker::thread_loop() {
infinicore::Tensor logits;
// Try to get compiled graph
if (compiler_ != nullptr) {
auto [graph, output] = compiler_->get_compiled(local_args.to_model_input(infinicore::Device::cpu()));
auto [graph, output] = compiler_->get_compiled(
local_args.to_model_input(infinicore::Device::cpu(), model_->get_cache_config()));
if (graph != nullptr && output != nullptr) {
graph->run();
logits = output->logits;
......@@ -335,7 +344,7 @@ void RankWorker::thread_loop() {
}
// Fall back to eager mode
if (!logits) {
auto model_args = local_args.to_model_input(rank_info_.device);
auto model_args = local_args.to_model_input(rank_info_.device, model_->get_cache_config());
logits = model_->forward(model_args).logits;
}
......@@ -436,10 +445,12 @@ void RankWorker::thread_loop() {
compiler_.reset();
} catch (const std::exception &e) {
// Top-level exception: ensure any waiters are woken and the thread exits cleanly.
// Must set init_done_=true so constructor's cv_.wait(init_done_) can unblock.
{
std::lock_guard<std::mutex> lk(mutex_);
should_exit_ = true;
job_done_ = true;
init_done_ = true;
}
cv_.notify_all();
spdlog::error("[{}] fatal exception in thread_loop: {} \n", info(), e.what());
......
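Side note (illustration only, not the C++ code above): the startup handshake the worker now uses can be sketched in Python as follows; all names here are assumptions for the sketch.

    import threading

    class Worker:
        # Constructor blocks until the worker thread either finishes init or records a failure.
        def __init__(self, fail=False):
            self._cv = threading.Condition()
            self._init_done = False
            self._should_exit = False
            self._fail = fail
            self._thread = threading.Thread(target=self._loop)
            self._thread.start()
            with self._cv:
                # Waiting only on init_done would hang forever if initialization throws.
                self._cv.wait_for(lambda: self._init_done or self._should_exit)
            if self._should_exit:
                self._thread.join()
                raise RuntimeError("worker initialization failed")

        def _loop(self):
            try:
                if self._fail:
                    raise RuntimeError("model creation failed")  # stand-in for model setup
                with self._cv:
                    self._init_done = True
                    self._cv.notify_all()
            except Exception:
                with self._cv:
                    self._should_exit = True
                    self._init_done = True  # wake any waiter even on failure
                    self._cv.notify_all()

    Worker(fail=False)   # returns normally; Worker(fail=True) raises from the constructor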
......@@ -53,7 +53,9 @@ public:
float top_p{1};
infinilm::InfinilmModel::Input to_model_input(infinicore::Device device) const;
/// Fills max_seqlen_q/k for paged FA prefill using tensor shapes + cache config only (no tensor D2H).
infinilm::InfinilmModel::Input to_model_input(infinicore::Device device,
const cache::CacheConfig *cache_config = nullptr) const;
};
struct Output {
......
......@@ -33,6 +33,9 @@ public:
std::optional<infinicore::Tensor> block_tables;
/// Slot ids for each token `[seq]`. Used for paged cache.
std::optional<infinicore::Tensor> slot_mapping;
// [Added] Set by the upper-level engine when it builds the model input.
int max_seqlen_q = 0;
int max_seqlen_k = 0;
};
struct Output {
......
......@@ -181,6 +181,7 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta
// 4. Apply RoPE to Q and K
auto q_rope = infinicore::Tensor::empty({batch_size, num_attention_heads_, seq_len, head_dim_}, q_reshaped->dtype(), q_reshaped->device())->permute({0, 2, 1, 3});
rotary_emb_->forward(q_rope, q_reshaped, pos_ids_for_rope); // [bs, seq_len, n_q_head, head_dim]
rotary_emb_->forward(k_reshaped, pos_ids_for_rope, true); // [bs, seq_len, n_kv_head, head_dim]
......@@ -248,7 +249,7 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const {
std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const {
ASSERT(block_tables.has_value());
ASSERT(slot_mapping.has_value());
......@@ -305,9 +306,25 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
// 6. Compute attention
infinicore::Tensor attn_output = infinicore::Tensor::empty({seq_len, num_attention_heads_, head_dim_}, q_reshaped->dtype(), q_reshaped->device());
if (is_prefill) {
if (attention_backend_ == backends::AttentionBackend::FlashAttn) {
// Compute actual max sequence lengths from the cumulative seqlen tensors.
// Passing max_position_embeddings_ here causes flash-attn's splitkv kernel to
// compute far too many K-block iterations, reading block_table entries that do
// not exist and then using the garbage values as KV-cache block indices,
// resulting in an out-of-bounds GPU memory access (VMFault).
// Alternative (kept for reference): read total_sequence_lengths back to the CPU and compute the bounds there.
// auto total_lens_cpu = total_sequence_lengths.value()->to(infinicore::Device::cpu());
// const auto *total_lens_ptr = reinterpret_cast<const int32_t *>(total_lens_cpu->data());
// int n_reqs = static_cast<int>(total_sequence_lengths.value()->shape()[0]);
// int max_seqlen_k = 0;
// for (int i = 0; i < n_reqs; ++i) {
// max_seqlen_k = std::max(max_seqlen_k, total_lens_ptr[i]);
// }
// // max_seqlen_q: with batch_size==1 the flattened seq_len equals the per-request length.
// int max_seqlen_q = static_cast<int>(seq_len);
infinicore::op::mha_varlen_(
attn_output,
q_reshaped,
......@@ -316,8 +333,8 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
input_offsets.value(),
cu_seqlens.value(),
block_tables.value(),
max_position_embeddings_,
max_position_embeddings_,
static_cast<int>(seq_len),
max_seqlen_k,
std::nullopt,
scaling_);
} else {
......@@ -377,16 +394,15 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const {
std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const {
if (!rotary_emb_) {
throw std::runtime_error("LlamaAttention: rotary_emb not configured");
}
infinicore::Tensor output;
if (auto paged_kv_cache = std::dynamic_pointer_cast<cache::PagedKVCache>(kv_cache)) {
output = forward_paged_(hidden_states, position_ids, paged_kv_cache, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping);
output = forward_paged_(hidden_states, position_ids, paged_kv_cache, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping, max_seqlen_q, max_seqlen_k);
} else {
output = forward_(hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths);
}
return output;
......
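To make the overrun concrete, a rough back-of-the-envelope sketch (all numbers assumed for illustration): the varlen/splitkv path sizes its K iteration from max_seqlen_k, so it walks roughly ceil(max_seqlen_k / block_size) block_table columns per request.

    from math import ceil

    block_size = 64                    # tokens per KV block (the Hygon runs below use 64)
    actual_max_k = 4096 + 256          # longest cached prefix any request can actually reach
    model_max_pos = 131072             # an assumed long-context max_position_embeddings_

    cols_allocated = ceil(actual_max_k / block_size)     # 68 block_table columns really exist
    cols_old_bound = ceil(model_max_pos / block_size)    # 2048 columns the kernel would try to read
    cols_new_bound = ceil(actual_max_k / block_size)     # 68 with the tightened max_seqlen_k
    print(cols_allocated, cols_old_bound, cols_new_bound)  # -> 68 2048 68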
......@@ -78,7 +78,7 @@ public:
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const;
std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const;
/**
* @brief Get the layer index
......@@ -110,7 +110,7 @@ private:
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const;
std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const;
protected:
// Projection layers
......
......@@ -61,13 +61,13 @@ LlamaDecoderLayer::forward(infinicore::Tensor &hidden_states,
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const {
std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const {
// 1. Attention layer normalization
input_layernorm_->forward_inplace(hidden_states, residual);
// 2. Self-attention
hidden_states = self_attn_->forward(
hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping);
hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths, input_offsets, cu_seqlens, block_tables, slot_mapping, max_seqlen_q, max_seqlen_k);
// 3. Post-attention layer normalization
post_attention_layernorm_->forward_inplace(hidden_states, residual);
......
......@@ -77,7 +77,7 @@ public:
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mappin) const;
std::optional<infinicore::Tensor> slot_mappin, int max_seqlen_q, int max_seqlen_k) const;
/**
* @brief Get the layer index
......
......@@ -61,10 +61,13 @@ LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const {
auto cu_seqlens = input.cu_seqlens;
auto block_tables = input.block_tables;
auto slot_mapping = input.slot_mapping;
// Pass the precomputed varlen bounds through from the engine input.
int max_seqlen_q = input.max_seqlen_q;
int max_seqlen_k = input.max_seqlen_k;
// 1. Forward through base model to get hidden states
auto hidden_states = model_->forward(
input_ids, position_ids, past_sequence_lengths, total_sequence_length, input_offsets, cu_seqlens, block_tables, slot_mapping);
input_ids, position_ids, past_sequence_lengths, total_sequence_length, input_offsets, cu_seqlens, block_tables, slot_mapping, max_seqlen_q, max_seqlen_k);
// 2. Apply language modeling head to get logits
auto logits = lm_head_->forward(hidden_states);
......
......@@ -96,7 +96,8 @@ infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const {
std::optional<infinicore::Tensor> slot_mapping,
int max_seqlen_q, int max_seqlen_k) const {
// 1. Embed tokens: input_ids -> [batch, seq_len, hidden_size]
auto hidden_states = embed_tokens_->forward(input_ids);
......@@ -114,7 +115,7 @@ infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
input_offsets,
cu_seqlens,
block_tables,
slot_mapping);
slot_mapping, max_seqlen_q, max_seqlen_k);
}
norm_->forward_inplace(hidden_states, residual);
......
......@@ -77,7 +77,7 @@ public:
std::optional<infinicore::Tensor> input_offsets,
std::optional<infinicore::Tensor> cu_seqlens,
std::optional<infinicore::Tensor> block_tables,
std::optional<infinicore::Tensor> slot_mapping) const;
std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const;
void reset_cache(const cache::CacheConfig *cache_config);
......
......@@ -9,6 +9,11 @@ namespace py = pybind11;
PYBIND11_MODULE(_infinilm, m) {
m.doc() = "InfiniLM Llama model Python bindings";
// Import _infinicore first so that its pybind11 types
// (DataType, Device::Type, etc.) are registered in the
// global type map before _infinilm tries to use them.
py::module_::import("infinicore");
infinilm::cache::bind_cache(m);
infinilm::models::llama::bind_llama(m);
infinilm::engine::distributed::bind_dist_config(m);
......
......@@ -34,14 +34,14 @@ inline void bind_infer_engine(py::module &m) {
.def(py::init([](
const InfinilmModel::Config &cfg,
const distributed::DistConfig &dist,
infinicore::Device::Type dev,
infinicore::Device::Type device_type,
std::shared_ptr<const infinilm::cache::CacheConfig> cache_cfg,
bool enable_graph_compiling,
const std::string &attention_backend) {
return std::make_shared<InferEngine>(
cfg,
dist,
dev,
device_type,
cache_cfg ? cache_cfg.get() : nullptr,
enable_graph_compiling,
infinilm::backends::parse_attention_backend(attention_backend));
......@@ -80,14 +80,14 @@ inline void bind_infer_engine(py::module &m) {
.def(py::init([](
const std::string &model_path,
const distributed::DistConfig &dist,
infinicore::Device::Type dev,
infinicore::Device::Type device_type,
std::shared_ptr<const infinilm::cache::CacheConfig> cache_cfg,
bool enable_graph_compiling,
const std::string &attention_backend) {
return std::make_shared<InferEngine>(
model_path,
dist,
dev,
device_type,
cache_cfg ? cache_cfg.get() : nullptr,
enable_graph_compiling,
infinilm::backends::parse_attention_backend(attention_backend));
......
......@@ -46,7 +46,6 @@ inline void bind_llama(py::module &m) {
py::class_<LlamaConfig, InfinilmModel::Config> llama_config(m, "LlamaConfig");
llama_config
.def(py::init<>())
// TODO: Change this to `dtype` after updating InfiniCore pybind11 exposing mechanism.
.def_readwrite("_dtype", &LlamaConfig::dtype)
.def_readwrite("vocab_size", &LlamaConfig::vocab_size)
.def_readwrite("hidden_size", &LlamaConfig::hidden_size)
......
......@@ -360,9 +360,16 @@ class TestModel:
# Autoregressive generation
# ---------------------------------------------------------------------------- #
input_ids_infini = infinicore.from_list(input_ids_list)
import os
import ctypes
# Pull roctracer start/stop out of the HIP runtime library so tracing can be limited to generate().
rocm_path = os.environ.get('ROCM_PATH')
lib_path = os.path.join(rocm_path, 'hip/lib/libgalaxyhip.so')
lib = ctypes.CDLL(lib_path)
roctracer_stop = lib.roctracer_stop
roctracer_start = lib.roctracer_start
t1 = time.time()
print("=================== start generate ====================")
roctracer_start()
output_ids = self.model.generate(
input_ids_infini,
GenerationConfig(
......@@ -375,6 +382,7 @@ class TestModel:
),
_measure_and_log_time=True,
)
roctracer_stop()
t2 = time.time()
numpy_output_ids = np.array(
......
import infinicore
from transformers import AutoTokenizer
from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.distributed import DistConfig
from infinilm.infer_engine import GenerationConfig, InferEngine
from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
import argparse
import sys
import time
import os
import json
from collections import OrderedDict
import numpy as np
from tqdm import tqdm
import torch
from torch.profiler import ProfilerActivity
from torch.profiler import profile as torch_prof
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
DATA_TYPE_BYTES = {
"bfloat16": 2,
"float16": 2,
"float32": 4,
}
_PAGED_KV_BLOCK_SIZE = 256
# BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128]
# INPUT_LENS = [32, 256, 1024, 4096]
# OUTPUT_LENS = [256, 1024, 4096]
def read_json_file(file_path):
"""Load and return JSON content from file_path."""
with open(file_path, "r") as file:
return json.load(file)
def parse_list(value: str):
"""Parse parse_list argument: can be a single int or a list of ints.
Examples:
"1" -> 1
"[1,2,4]" -> [1, 2, 4]
"1,2,4" -> [1, 2, 4]
"""
value = value.strip()
# Try to parse as JSON list first
if value.startswith("[") and value.endswith("]"):
try:
result = json.loads(value)
if isinstance(result, list):
return [int(x) for x in result]
return int(result)
except (json.JSONDecodeError, ValueError):
pass
# Try to parse as comma-separated values
if "," in value:
try:
return [int(x.strip()) for x in value.split(",")]
except ValueError:
pass
# Try to parse as a single integer
try:
return int(value)
except ValueError:
raise argparse.ArgumentTypeError(
f"batch-size must be an int or list[int], got: {value}"
)
def get_test_cases(
model_path: str,
batch_size_list: list[int],
input_len_list: list[int],
output_len_list: list[int],
):
"""Generate cases ordered by ascending KV cache memory usage."""
model_path = os.path.expanduser(model_path)
# Load model config to derive attention dimensions
config = read_json_file(os.path.join(model_path, "config.json"))
head_dim = config.get(
"head_dim", config.get("hidden_size") // config.get("num_attention_heads")
)
# KV heads and layers drive cache size
num_key_value_heads = config.get("num_key_value_heads")
num_hidden_layers = config.get("num_hidden_layers")
# Enumerate all batch/input/output combinations and compute KV cache size
case_list = []
for batch_size in batch_size_list:
for input_len in input_len_list:
for output_len in output_len_list:
for data_type in ["bfloat16"]:
data_type_bytes = DATA_TYPE_BYTES[data_type]
total_seq_len = input_len + output_len
kvcache_memory_bytes = (
data_type_bytes
* (batch_size * total_seq_len * num_key_value_heads * head_dim)
* num_hidden_layers
)
kvcache_memory_gb = kvcache_memory_bytes / (1024 * 1024 * 1024)
case_list.append(
{
"idx": len(case_list),
"batch_size": batch_size,
"input_len": input_len,
"output_len": output_len,
"data_type": data_type,
"kvcache_memory": round(kvcache_memory_gb, 3),
}
)
# Sort by KV cache size and wrap in OrderedDict with index keys
case_dict = OrderedDict(
(idx, case)
for idx, case in enumerate(
sorted(case_list, key=lambda case: case["kvcache_memory"])
)
)
return case_dict
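For intuition, one worked instance of the sizing formula above; the config numbers are assumed (8 KV heads, head_dim 128, 32 layers), and the script only uses this value to order the cases.

    data_type_bytes = 2                                   # bfloat16
    batch_size, input_len, output_len = 1, 4096, 256
    num_key_value_heads, head_dim, num_hidden_layers = 8, 128, 32
    total_seq_len = input_len + output_len                # 4352
    kvcache_memory_bytes = (
        data_type_bytes
        * (batch_size * total_seq_len * num_key_value_heads * head_dim)
        * num_hidden_layers
    )
    print(round(kvcache_memory_bytes / 1024**3, 3))       # -> 0.266 (GiB)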
def get_args():
parser = argparse.ArgumentParser(description="run Llama args")
parser.add_argument(
"--cpu",
action="store_true",
help="Run cpu test",
)
parser.add_argument(
"--nvidia",
action="store_true",
help="Run nvidia test",
)
parser.add_argument(
"--qy",
action="store_true",
help="Run qy test",
)
parser.add_argument(
"--metax",
action="store_true",
help="Run metax test",
)
parser.add_argument(
"--moore",
action="store_true",
help="Run moore test",
)
parser.add_argument(
"--iluvatar",
action="store_true",
help="Run iluvatar test",
)
parser.add_argument(
"--cambricon",
action="store_true",
help="Run cambricon test",
)
parser.add_argument(
"--ali",
action="store_true",
help="Run alippu test",
)
parser.add_argument(
"--hygon",
action="store_true",
help="Run hygon test",
)
parser.add_argument(
"--model",
type=str,
required=True,
help="model path",
)
parser.add_argument(
"--batch-size",
type=parse_list,
default=1,
help="number of prompts in a batch (can be an int or a list of ints, e.g., '1' or '[1,2,4]' or '1,2,4')",
)
parser.add_argument(
"--tensor-parallel-size",
"--tp",
type=int,
default=1,
help="total rank for tensor parallel",
)
parser.add_argument(
"--input-len",
type=parse_list,
default=10,
help="output tokens",
)
parser.add_argument(
"--output-len",
type=parse_list,
default=20,
help="output tokens",
)
parser.add_argument(
"--skip-load",
action="store_true",
help="skip loading model weights",
)
parser.add_argument(
"--top-k",
type=int,
default=1,
help="top k sampling",
)
parser.add_argument(
"--top-p",
type=float,
default=1.0,
help="top p sampling",
)
parser.add_argument(
"--temperature",
type=float,
default=1.0,
help="sampling temperature",
)
parser.add_argument(
"--enable-paged-attn",
action="store_true",
help="use paged cache",
)
parser.add_argument(
"--paged_kv_block_size",
type=int,
default=256,
help="num tokens each kv block can hold",
)
parser.add_argument(
"--enable-graph",
action="store_true",
help="enable graph compiling",
)
parser.add_argument(
"--warmup",
action="store_true",
help="Perform a warmup run before benchmarking/inference.",
)
parser.add_argument(
"--attn",
type=str,
default="default",
choices=["default", "flash-attn"],
help="attention backend to use: 'default' or 'flash-attn'",
)
return parser.parse_args()
with open("examples/bench_prompt.md", "r") as f:
prompt = f.read()
def repeat_prompt(input_ids: list[int], target_length: int):
num = len(input_ids)
repeat_times = (target_length + num - 1) // num
return (input_ids * repeat_times)[:target_length]
class TestModel:
model: infinicore.nn.Module
tokenizer: AutoTokenizer
input_ids_list: list[int]
def __init__(
self,
model_path,
infini_device=infinicore.device("cpu", 0),
tp=1,
skip_load=False,
cache_config=None,
enable_graph=False,
attn_backend="default",
) -> None:
model_path = os.path.expanduser(model_path)
# ---------------------------------------------------------------------------- #
# Create the model
# ---------------------------------------------------------------------------- #
model = InferEngine(
model_path,
device=infini_device,
distributed_config=DistConfig(tp),
cache_config=cache_config,
enable_graph_compiling=enable_graph,
attention_backend=attn_backend,
)
# ---------------------------------------------------------------------------- #
# Load weights
# ---------------------------------------------------------------------------- #
if not skip_load:
load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype)
# ---------------------------------------------------------------------------- #
# Create the tokenizer
# ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
if tokenizer.pad_token is None:
if tokenizer.eos_token is not None:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
else:
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
# ---------------------------------------------------------------------------- #
# Tokenize the input
# ---------------------------------------------------------------------------- #
input_content = [
tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
]
# print(input_content, end="", flush=True)
# Call the tokenizer directly (batch_encode_plus is deprecated in Transformers >= 5.0)
encoding = tokenizer(
input_content,
padding=True,
truncation=True,
max_length=8192,
)
input_ids_list = encoding["input_ids"]
self.model = model
self.tokenizer = tokenizer
self.input_ids_list = input_ids_list
def run(
self,
batch_size: int,
input_len: int,
output_len: int,
top_k=1,
top_p=1.0,
temperature=1.0,
):
input_ids = repeat_prompt(self.input_ids_list[0], target_length=input_len)
input_ids_list = [input_ids] * batch_size
# ---------------------------------------------------------------------------- #
# Autoregressive generation
# ---------------------------------------------------------------------------- #
input_ids_infini = infinicore.from_list(input_ids_list)
t1 = time.time()
print("=================== start generate ====================")
output_ids = self.model.generate(
input_ids_infini,
GenerationConfig(
max_new_tokens=output_len,
eos_token_id=[],
top_k=top_k,
top_p=top_p,
temperature=temperature,
stop_on_eos=False,
),
_measure_and_log_time=True,
)
t2 = time.time()
numpy_output_ids = np.array(
[output_id.to_numpy()[0] for output_id in output_ids]
)
print(self.tokenizer.decode(numpy_output_ids, skip_special_tokens=True))
print(
f"total_time: {round((t2 - t1) * 1000, 2)} ms",
)
if __name__ == "__main__":
args = get_args()
print(args)
# Parse command line arguments
device_str = "cpu"
if args.cpu:
device_str = "cpu"
elif args.nvidia:
device_str = "cuda"
elif args.qy:
device_str = "cuda"
elif args.metax:
device_str = "cuda"
elif args.moore:
device_str = "musa"
elif args.iluvatar:
device_str = "cuda"
elif args.cambricon:
device_str = "mlu"
elif args.ali:
device_str = "cuda"
elif args.hygon:
device_str = "cuda"
else:
print(
"python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tp=1 --input-len=50 --output-len=50"
)
sys.exit(1)
_PAGED_KV_BLOCK_SIZE = args.paged_kv_block_size
# -------------------------------------------------------- #
# Parse arguments
# -------------------------------------------------------- #
model_path = args.model
infini_device = infinicore.device(device_str, 0)
tp = args.tensor_parallel_size
skip_load = args.skip_load
batch_size = args.batch_size
input_len = args.input_len
output_len = args.output_len
enable_paged_attn = args.enable_paged_attn
enable_graph = args.enable_graph
if isinstance(batch_size, int):
batch_size = [batch_size]
if isinstance(input_len, int):
input_len = [input_len]
if isinstance(output_len, int):
output_len = [output_len]
cases_dict = get_test_cases(model_path, batch_size, input_len, output_len)
# -------------------------------------------------------- #
# Run the test cases
# -------------------------------------------------------- #
if enable_paged_attn:
paged_kv_block_size = _PAGED_KV_BLOCK_SIZE
max_num_blocks = max(
[
(
(c_["input_len"] + c_["output_len"] + (paged_kv_block_size - 1))
// paged_kv_block_size
)
* c_["batch_size"]
for _, c_ in cases_dict.items()
]
)
cache_config = PagedKVCacheConfig(max_num_blocks, paged_kv_block_size)
else:
cache_config = None
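For one concrete case (numbers assumed for illustration), the paged block budget computed above works out as:

    paged_kv_block_size = 256
    input_len, output_len, batch_size = 4096, 256, 1
    blocks_per_req = (input_len + output_len + paged_kv_block_size - 1) // paged_kv_block_size
    max_num_blocks = blocks_per_req * batch_size
    print(blocks_per_req, max_num_blocks)   # -> 17 17  (4352 tokens fit in 17 blocks of 256)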
test = TestModel(
model_path,
infini_device=infini_device,
tp=tp,
skip_load=skip_load,
cache_config=cache_config,
enable_graph=enable_graph,
attn_backend=args.attn,
)
# ---------------------------------------------------------------------------- #
# Warmup
# ---------------------------------------------------------------------------- #
if args.warmup:
warmup_steps = 1
# warmup cache capacity
warmup_cache_len = 128
warmup_batch = len(test.input_ids_list)
test.model.reset_cache(
StaticKVCacheConfig(
max_batch_size=warmup_batch,
max_cache_len=warmup_cache_len,
)
)
avg_prompt_len = min(64, max(len(ids) for ids in test.input_ids_list))
warmup_ids = [
ids[:avg_prompt_len] if len(ids) >= avg_prompt_len else ids
for ids in test.input_ids_list
]
input_ids_infini = infinicore.from_list(warmup_ids)
print("=================== warmup start ===================")
for _ in range(warmup_steps):
_ = test.model.generate(
input_ids_infini,
GenerationConfig(
max_new_tokens=5, # decode kernel warmup
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
stop_on_eos=False,
),
_measure_and_log_time=False,
)
print("=================== warmup done ====================")
# reset cache back to benchmark config
if cache_config is not None:
test.model.reset_cache(cache_config)
# ---------------------------------------------------------------------------- #
# Warmup done
# ---------------------------------------------------------------------------- #
for idx, case in tqdm(cases_dict.items(), desc="Processing cases"):
tqdm.write(f"\033[92mProcessing : {case}\033[0m")
batch_size = case["batch_size"]
input_len = case["input_len"]
output_len = case["output_len"]
if not enable_paged_attn:
# reset cache if static kvcache is used
initial_capacity = input_len + output_len
test.model.reset_cache(
StaticKVCacheConfig(
max_batch_size=batch_size, max_cache_len=initial_capacity
)
)
# run test one case
# test.run(
# batch_size=batch_size,
# input_len=input_len,
# output_len=output_len,
# top_k=args.top_k,
# top_p=args.top_p,
# temperature=args.temperature,
# )
# run test one case
log_file = "/home/qi-yuan-2026/z_prof_logs/8b_fa_1_4096_256_prof.log"
json_file = "/home/qi-yuan-2026/z_prof_logs/8b_fa_1_4096_256_prof.json"
# log_file=f"/root/qi_yuan/z_prof_logs/70b_1_32_256_prof.log"
# json_file=f"/root/qi_yuan/z_prof_logs/70b_1_32_256_prof.json"
with torch_prof(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, profile_memory=False, with_stack=True) as prof:
with torch.no_grad():
test.run(
batch_size=batch_size,
input_len=input_len,
output_len=output_len,
top_k=args.top_k,
top_p=args.top_p,
temperature=args.temperature,
)
with open(log_file, "w") as f:
f.write(prof.key_averages().table(sort_by="self_cuda_time_total"))
print(prof.key_averages().table(sort_by="self_cuda_time_total"))
prof.export_chrome_trace(json_file)
#!/bin/bash
xmake f --use-kv-caching=true -cv
pip install -e . --no-build-isolation
\ No newline at end of file
cd InfiniLM-fa
# Basic inference
python examples/jiuge.py --hygon --model_path=../models/9g_8b_thinking_llama/
# With paged attention + flash-attn (use block_size=64 for Hygon)
python examples/jiuge.py --hygon --enable-paged-attn --paged_kv_block_size=64 --attn=flash-attn --model_path=../models/9g_8b_thinking_llama/
# With graph compilation
python examples/jiuge.py --hygon --enable-paged-attn --paged_kv_block_size=64 --attn=flash-attn --enable-graph --model_path=../models/9g_8b_thinking_llama/
# Multi-GPU (tensor parallel)
python examples/jiuge.py --hygon --tp=4 --model_path=../models/9g_8b_thinking_llama/
# Custom options
python examples/jiuge.py --hygon --model_path=<path> \
--max_new_tokens=100 \
--batch_size=1 \
--prompt="How are you" \
--top_k=1 --top_p=1.0 --temperature=1.0
python examples/bench.py --hygon --warmup --model=../models/9g_8b_thinking_llama/ \
--enable-paged-attn --attn=flash-attn --enable-graph \
--input-len=32,256,4096 --output-len=256,1024,2048,4096 \
--batch-size=1 --skip-load >> perf_hygon_bs_1.txt
python examples/bench.py --hygon --warmup --model=../models/9g_8b_thinking_llama/ \
--enable-paged-attn --attn=flash-attn --enable-graph \
--input-len=32,256 --output-len=256,1024,2048,4096 \
--batch-size=16 --skip-load >> perf_hygon_bs_16.txt
\ No newline at end of file