jerrrrry / infinilm · Commit 098fab15

Commit 098fab15, authored Apr 21, 2026 by yaoht
Commit message: Adapt to DCU (适配dcu)
Parent: cfe4b1a8
Pipeline #3511 failed with stages in 0 seconds

Showing 20 changed files with 727 additions and 30 deletions (+727, -30)
csrc/engine/compiler/paged_compiler.cpp (+16, -0)
csrc/engine/compiler/static_batching_compiler.cpp (+1, -0)
csrc/engine/infer_engine.cpp (+26, -2)
csrc/engine/rank_worker.cpp (+15, -4)
csrc/engine/rank_worker.hpp (+3, -1)
csrc/models/infinilm_model.hpp (+3, -0)
csrc/models/llama/llama_attention.cpp (+23, -7)
csrc/models/llama/llama_attention.hpp (+2, -2)
csrc/models/llama/llama_decoder_layer.cpp (+2, -2)
csrc/models/llama/llama_decoder_layer.hpp (+1, -1)
csrc/models/llama/llama_for_causal_lm.cpp (+5, -2)
csrc/models/llama/llama_model.cpp (+3, -2)
csrc/models/llama/llama_model.hpp (+1, -1)
csrc/pybind11/bindings.cc (+5, -0)
csrc/pybind11/engine/engine.hpp (+4, -4)
csrc/pybind11/models/llama.hpp (+0, -1)
examples/bench.py (+9, -1)
examples/bench_prof.py (+573, -0)
scripts/build.sh (+5, -0)
scripts/run_infer.sh (+30, -0)
csrc/engine/compiler/paged_compiler.cpp

#include "paged_compiler.hpp"

#include <spdlog/spdlog.h>

namespace {
// Todo: replace with Tensor::zeros when it is available
inline void set_zeros(infinicore::Tensor &tensor) {
...

@@ -31,6 +33,11 @@ PagedCompiler::PagedCompiler(const std::shared_ptr<InfinilmModel> &model, RankBa
void PagedCompiler::compile() {
    if (model_->get_cache_config() != nullptr
        && dynamic_cast<const cache::PagedKVCacheConfig *>(model_->get_cache_config())) {
        size_t nblocks = dynamic_cast<const cache::PagedKVCacheConfig *>(model_->get_cache_config())->num_blocks();
        // Read the block size from the cache config (e.g. 16 or 32).
        int block_size = dynamic_cast<const cache::PagedKVCacheConfig *>(model_->get_cache_config())->block_size();
        size_t max_batch_size = *std::max_element(decode_batch_sizes_.begin(), decode_batch_sizes_.end());
        compiled_map_decode_.clear();
        block_tables_holder_ = infinicore::Tensor::empty(
...

@@ -59,6 +66,15 @@ void PagedCompiler::compile() {
            input.slot_mapping = infinicore::Tensor::empty({b}, infinicore::DataType::I64, infinicore::context::getDevice());
            set_zeros(input.slot_mapping.value());
            // Derive the physical limits directly from the dummy tensors' shapes.
            // 1. For varlen, the maximum query length is the total token count of input_ids (here 1*b / b).
            //    If the prefill loop allocates input_ids as {seq_len, b}, this still adapts automatically.
            input.max_seqlen_q = input.input_ids.value()->size(0); // assumes dim 0 is seq_len, dim 1 is batch
            // 2. Hard safety bound for the key length: blocks allocated per request * tokens per block.
            input.max_seqlen_k = block_per_req * block_size;

            barrier_->wait();
            infinicore::context::startGraphRecording();
            auto output = model_->forward(input);
...
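Note on the bound arithmetic used during graph capture: the limits must come from the cache configuration and the dummy tensors' shapes alone, since the dummy tensors hold no real data. A minimal Python sketch of that arithmetic, with purely illustrative numbers (block_size and blocks-per-request here are not taken from this repository):

# Hypothetical numbers, for illustration only.
block_size = 64          # tokens per KV-cache block (from PagedKVCacheConfig)
blocks_per_request = 32  # block-table rows reserved per request

# Upper bound on key length: a request can never address more tokens
# than the blocks reserved for it can hold.
max_seqlen_k = blocks_per_request * block_size       # 2048

# Upper bound on query length: the dummy input_ids shape already encodes
# how many tokens the captured graph will ever see per step.
dummy_input_ids_shape = (1, 8)                       # (seq_len, batch) for a decode graph
max_seqlen_q = dummy_input_ids_shape[0]              # 1
print(max_seqlen_q, max_seqlen_k)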
csrc/engine/compiler/static_batching_compiler.cpp

#include "static_batching_compiler.hpp"

#include "../../cache/cache.hpp"

#include <spdlog/spdlog.h>

namespace infinilm::engine {

StaticBatchingCompiler::StaticBatchingCompiler(const std::shared_ptr<InfinilmModel> &model, RankBarrier *barrier)
...
csrc/engine/infer_engine.cpp

#include "infer_engine.hpp"

#include "../cache/cache.hpp"
#include "spdlog/spdlog.h"

#include <algorithm>

namespace infinilm::engine {

//------------------------------------------------------
...

@@ -109,14 +112,14 @@ std::vector<std::unordered_map<std::string, infinicore::nn::Parameter>> InferEng
// forward
//------------------------------------------------------
infinilm::InfinilmModel::Input
-InferEngine::Input::to_model_input(infinicore::Device device) const {
+InferEngine::Input::to_model_input(infinicore::Device device, const cache::CacheConfig *cache_config) const {
    auto to_device = [&](const std::optional<infinicore::Tensor> &t) -> std::optional<infinicore::Tensor> {
        return t.has_value() ? t.value()->to(device) : t;
    };
-   return {
+   InfinilmModel::Input out{
        to_device(input_ids),             // @todo: on device in the future
        to_device(position_ids),
        to_device(past_sequence_lengths), // @todo: on device in the future
...

@@ -126,6 +129,27 @@ InferEngine::Input::to_model_input(infinicore::Device device) const {
        to_device(block_tables),
        to_device(slot_mapping),
    };
    // Paged prefill FA: bounds without reading GPU tensor payloads (shape / cache config only).
    if (cache_config != nullptr) {
        if (const auto *paged = dynamic_cast<const cache::PagedKVCacheConfig *>(cache_config)) {
            if (out.block_tables.has_value()) {
                out.max_seqlen_k = static_cast<int>(out.block_tables.value()->size(1) * paged->block_size());
            }
            if (out.slot_mapping.has_value()) {
                out.max_seqlen_q = static_cast<int>(out.slot_mapping.value()->size(0));
            } else if (out.input_ids.has_value()) {
                const auto &sh = out.input_ids.value()->shape();
                if (sh.size() == 1) {
                    out.max_seqlen_q = static_cast<int>(sh[0]);
                } else if (sh.size() >= 2) {
                    out.max_seqlen_q = static_cast<int>(std::max(sh[0], sh[1]));
                }
            }
        }
    }
    return out;
}

InferEngine::Output InferEngine::forward(const InferEngine::Input &input) {
...
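Note: the shape-only derivation above can be read as a few lines of Python. The names mirror the C++ members, but this is only an illustration of the arithmetic, not the project's API, and the shapes are made up:

import numpy as np

# Illustrative shapes; in the engine these come from the request batch.
block_size = 64
block_tables = np.zeros((4, 32), dtype=np.int64)   # [num_requests, blocks_per_request]
slot_mapping = np.zeros((300,), dtype=np.int64)    # one slot id per input token

# max_seqlen_k: no request can attend to more keys than its block-table row can address.
max_seqlen_k = block_tables.shape[1] * block_size   # 32 * 64 = 2048

# max_seqlen_q: the flattened query length is exactly the number of slots being written.
max_seqlen_q = slot_mapping.shape[0]                # 300
print(max_seqlen_q, max_seqlen_k)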
csrc/engine/rank_worker.cpp

...

@@ -47,7 +47,11 @@ RankWorker::RankWorker(const InfinilmModel::Config &model_config,
    // Wait until the worker thread finishes initialization (model created)
    std::unique_lock<std::mutex> lk(mutex_);
-   cv_.wait(lk, [&] { return init_done_; });
+   cv_.wait(lk, [&] { return init_done_ || should_exit_; });
    if (should_exit_) {
        thread_.join();
        throw std::runtime_error("RankWorker initialization failed (see error above)");
    }
}

RankWorker::RankWorker(
...

@@ -75,7 +79,11 @@ RankWorker::RankWorker(
    thread_ = std::thread(&RankWorker::thread_loop, this);
    // Wait until the worker thread finishes initialization (model created)
    std::unique_lock<std::mutex> lk(mutex_);
-   cv_.wait(lk, [&] { return init_done_; });
+   cv_.wait(lk, [&] { return init_done_ || should_exit_; });
    if (should_exit_) {
        thread_.join();
        throw std::runtime_error("RankWorker initialization failed (see error above)");
    }
}

std::string RankWorker::info() const {
...

@@ -327,7 +335,8 @@ void RankWorker::thread_loop() {
    infinicore::Tensor logits;
    // Try to get compiled graph
    if (compiler_ != nullptr) {
-       auto [graph, output] = compiler_->get_compiled(local_args.to_model_input(infinicore::Device::cpu()));
+       auto [graph, output] = compiler_->get_compiled(
+           local_args.to_model_input(infinicore::Device::cpu(), model_->get_cache_config()));
        if (graph != nullptr && output != nullptr) {
            graph->run();
            logits = output->logits;
...

@@ -335,7 +344,7 @@ void RankWorker::thread_loop() {
    }
    // Fall back to eager mode
    if (!logits) {
-       auto model_args = local_args.to_model_input(rank_info_.device);
+       auto model_args = local_args.to_model_input(rank_info_.device, model_->get_cache_config());
        logits = model_->forward(model_args).logits;
    }
...

@@ -436,10 +445,12 @@ void RankWorker::thread_loop() {
        compiler_.reset();
    } catch (const std::exception &e) {
        // Top-level exception: ensure any waiters are woken and the thread exits cleanly.
        // Must set init_done_=true so constructor's cv_.wait(init_done_) can unblock.
        {
            std::lock_guard<std::mutex> lk(mutex_);
            should_exit_ = true;
            job_done_ = true;
            init_done_ = true;
        }
        cv_.notify_all();
        spdlog::error("[{}] fatal exception in thread_loop: {}\n", info(), e.what());
...
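Note: the initialization-failure handling above follows a standard condition-variable pattern: the waiter's predicate must also cover the failure flag, and the failing thread must set every flag a waiter could block on before notifying. A small Python sketch of the same pattern (threading.Condition standing in for std::condition_variable; this is an illustration, not the project's API):

import threading

cond = threading.Condition()
init_done = False
should_exit = False

def worker():
    global init_done, should_exit
    try:
        raise RuntimeError("model creation failed")  # simulate a fatal init error
    except Exception:
        with cond:
            # Set every flag a waiter might be blocked on, then wake everyone,
            # so the constructor cannot hang forever on a dead worker thread.
            should_exit = True
            init_done = True
            cond.notify_all()

t = threading.Thread(target=worker)
t.start()
with cond:
    cond.wait_for(lambda: init_done or should_exit)
if should_exit:
    t.join()
    print("worker initialization failed; the C++ constructor throws here")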
csrc/engine/rank_worker.hpp

...

@@ -53,7 +53,9 @@ public:
        float top_p{1};

-       infinilm::InfinilmModel::Input to_model_input(infinicore::Device device) const;
+       /// Fills max_seqlen_q/k for paged FA prefill using tensor shapes + cache config only (no tensor D2H).
+       infinilm::InfinilmModel::Input to_model_input(infinicore::Device device,
+                                                     const cache::CacheConfig *cache_config = nullptr) const;
    };

    struct Output {
...
csrc/models/infinilm_model.hpp

...

@@ -33,6 +33,9 @@ public:
        std::optional<infinicore::Tensor> block_tables;
        /// Slot ids for each token `[seq]`. Used for paged cache.
        std::optional<infinicore::Tensor> slot_mapping;
        // Newly added: filled in by the upper-level engine when it builds the input tensors.
        int max_seqlen_q = 0;
        int max_seqlen_k = 0;
    };

    struct Output {
...
csrc/models/llama/llama_attention.cpp

...

@@ -181,6 +181,7 @@ infinicore::Tensor LlamaAttention::forward_(const infinicore::Tensor &hidden_sta
    // 4. Apply RoPE to Q and K
    auto q_rope = infinicore::Tensor::empty({batch_size, num_attention_heads_, seq_len, head_dim_},
                                            q_reshaped->dtype(), q_reshaped->device())
                      ->permute({0, 2, 1, 3});
    rotary_emb_->forward(q_rope, q_reshaped, pos_ids_for_rope); // [bs, seq_len, n_q_head, head_dim]
    rotary_emb_->forward(k_reshaped, pos_ids_for_rope, true);   // [bs, seq_len, n_kv_head, head_dim]
...

@@ -248,7 +249,7 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
    std::optional<infinicore::Tensor> input_offsets,
    std::optional<infinicore::Tensor> cu_seqlens,
    std::optional<infinicore::Tensor> block_tables,
-   std::optional<infinicore::Tensor> slot_mapping) const {
+   std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const {
    ASSERT(block_tables.has_value());
    ASSERT(slot_mapping.has_value());
...

@@ -305,9 +306,25 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
    // 6. Compute attention
    infinicore::Tensor attn_output = infinicore::Tensor::empty({seq_len, num_attention_heads_, head_dim_},
                                                               q_reshaped->dtype(), q_reshaped->device());
    if (is_prefill) {
        if (attention_backend_ == backends::AttentionBackend::FlashAttn) {
            // Compute actual max sequence lengths from the cumulative seqlen tensors.
            // Passing max_position_embeddings_ here causes flash-attn's splitkv kernel to
            // compute far too many K-block iterations, reading block_table entries that do
            // not exist and then using the garbage values as KV-cache block indices,
            // resulting in an out-of-bounds GPU memory access (VMFault).
            // auto total_lens_cpu = total_sequence_lengths.value()->to(infinicore::Device::cpu());
            // const auto *total_lens_ptr = reinterpret_cast<const int32_t *>(total_lens_cpu->data());
            // int n_reqs = static_cast<int>(total_sequence_lengths.value()->shape()[0]);
            // int max_seqlen_k = 0;
            // for (int i = 0; i < n_reqs; ++i) {
            //     max_seqlen_k = std::max(max_seqlen_k, total_lens_ptr[i]);
            // }
            // // max_seqlen_q: with batch_size==1 the flattened seq_len equals the per-request length.
            // int max_seqlen_q = static_cast<int>(seq_len);
            infinicore::op::mha_varlen_(
                attn_output,
                q_reshaped,
...

@@ -316,8 +333,8 @@ infinicore::Tensor LlamaAttention::forward_paged_(const infinicore::Tensor &hidd
                input_offsets.value(),
                cu_seqlens.value(),
                block_tables.value(),
-               max_position_embeddings_,
-               max_position_embeddings_,
+               static_cast<int>(seq_len),
+               max_seqlen_k,
                std::nullopt,
                scaling_);
        } else {
...

@@ -377,16 +394,15 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
    std::optional<infinicore::Tensor> input_offsets,
    std::optional<infinicore::Tensor> cu_seqlens,
    std::optional<infinicore::Tensor> block_tables,
-   std::optional<infinicore::Tensor> slot_mapping) const {
+   std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const {
    if (!rotary_emb_) {
        throw std::runtime_error("LlamaAttention: rotary_emb not configured");
    }
    infinicore::Tensor output;
    if (auto paged_kv_cache = std::dynamic_pointer_cast<cache::PagedKVCache>(kv_cache)) {
-       output = forward_paged_(hidden_states, position_ids, paged_kv_cache, total_sequence_lengths,
-                               input_offsets, cu_seqlens, block_tables, slot_mapping);
+       output = forward_paged_(hidden_states, position_ids, paged_kv_cache, total_sequence_lengths,
+                               input_offsets, cu_seqlens, block_tables, slot_mapping, max_seqlen_q, max_seqlen_k);
    } else {
        output = forward_(hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths);
    }
    return output;
...
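Note: the VMFault comment above is easiest to see with numbers. A rough Python sketch of the failure mode, with illustrative values (none of these constants come from this repository):

import math

block_size = 64
actual_seq_len = 300              # tokens actually present in the request
max_position_embeddings = 131072  # what used to be passed as max_seqlen_k

blocks_allocated = math.ceil(actual_seq_len / block_size)                        # 5 valid block-table entries
blocks_walked_with_bad_bound = math.ceil(max_position_embeddings / block_size)   # 2048 iterations

# Every iteration past the 5 valid entries reads an uninitialized block-table slot
# and treats it as a KV-cache block index -> out-of-bounds access (VMFault).
print(blocks_allocated, blocks_walked_with_bad_bound)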
csrc/models/llama/llama_attention.hpp

...

@@ -78,7 +78,7 @@ public:
        std::optional<infinicore::Tensor> input_offsets,
        std::optional<infinicore::Tensor> cu_seqlens,
        std::optional<infinicore::Tensor> block_tables,
-       std::optional<infinicore::Tensor> slot_mapping) const;
+       std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const;

    /**
     * @brief Get the layer index
...

@@ -110,7 +110,7 @@ private:
        std::optional<infinicore::Tensor> input_offsets,
        std::optional<infinicore::Tensor> cu_seqlens,
        std::optional<infinicore::Tensor> block_tables,
-       std::optional<infinicore::Tensor> slot_mapping) const;
+       std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const;

protected:
    // Projection layers
...
csrc/models/llama/llama_decoder_layer.cpp

...

@@ -61,13 +61,13 @@ LlamaDecoderLayer::forward(infinicore::Tensor &hidden_states,
    std::optional<infinicore::Tensor> input_offsets,
    std::optional<infinicore::Tensor> cu_seqlens,
    std::optional<infinicore::Tensor> block_tables,
-   std::optional<infinicore::Tensor> slot_mapping) const {
+   std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const {
    // 1. Attention layer normalization
    input_layernorm_->forward_inplace(hidden_states, residual);

    // 2. Self-attention
    hidden_states = self_attn_->forward(
-       hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths,
-       input_offsets, cu_seqlens, block_tables, slot_mapping);
+       hidden_states, position_ids, kv_cache, past_sequence_lengths, total_sequence_lengths,
+       input_offsets, cu_seqlens, block_tables, slot_mapping, max_seqlen_q, max_seqlen_k);

    // 3. Post-attention layer normalization
    post_attention_layernorm_->forward_inplace(hidden_states, residual);
...
csrc/models/llama/llama_decoder_layer.hpp

...

@@ -77,7 +77,7 @@ public:
        std::optional<infinicore::Tensor> input_offsets,
        std::optional<infinicore::Tensor> cu_seqlens,
        std::optional<infinicore::Tensor> block_tables,
-       std::optional<infinicore::Tensor> slot_mappin) const;
+       std::optional<infinicore::Tensor> slot_mappin, int max_seqlen_q, int max_seqlen_k) const;

    /**
     * @brief Get the layer index
...
csrc/models/llama/llama_for_causal_lm.cpp

...

@@ -61,10 +61,13 @@ LlamaForCausalLM::Output LlamaForCausalLM::forward(const Input &input) const {
    auto cu_seqlens = input.cu_seqlens;
    auto block_tables = input.block_tables;
    auto slot_mapping = input.slot_mapping;
    int max_seqlen_q = input.max_seqlen_q; // fetched from the input
    int max_seqlen_k = input.max_seqlen_k; // fetched from the input

    // 1. Forward through base model to get hidden states
    auto hidden_states = model_->forward(
-       input_ids, position_ids, past_sequence_lengths, total_sequence_length,
-       input_offsets, cu_seqlens, block_tables, slot_mapping);
+       input_ids, position_ids, past_sequence_lengths, total_sequence_length,
+       input_offsets, cu_seqlens, block_tables, slot_mapping, max_seqlen_q, max_seqlen_k);

    // 2. Apply language modeling head to get logits
    auto logits = lm_head_->forward(hidden_states);
...
csrc/models/llama/llama_model.cpp

...

@@ -96,7 +96,8 @@ infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
    std::optional<infinicore::Tensor> input_offsets,
    std::optional<infinicore::Tensor> cu_seqlens,
    std::optional<infinicore::Tensor> block_tables,
-   std::optional<infinicore::Tensor> slot_mapping) const {
+   std::optional<infinicore::Tensor> slot_mapping,
+   int max_seqlen_q, int max_seqlen_k) const {
    // 1. Embed tokens: input_ids -> [batch, seq_len, hidden_size]
    auto hidden_states = embed_tokens_->forward(input_ids);
...

@@ -114,7 +115,7 @@ infinicore::Tensor LlamaModel::forward(const infinicore::Tensor &input_ids,
            input_offsets,
            cu_seqlens,
            block_tables,
-           slot_mapping);
+           slot_mapping, max_seqlen_q, max_seqlen_k);
    }

    norm_->forward_inplace(hidden_states, residual);
...
csrc/models/llama/llama_model.hpp

...

@@ -77,7 +77,7 @@ public:
        std::optional<infinicore::Tensor> input_offsets,
        std::optional<infinicore::Tensor> cu_seqlens,
        std::optional<infinicore::Tensor> block_tables,
-       std::optional<infinicore::Tensor> slot_mapping) const;
+       std::optional<infinicore::Tensor> slot_mapping, int max_seqlen_q, int max_seqlen_k) const;

    void reset_cache(const cache::CacheConfig *cache_config);
...
csrc/pybind11/bindings.cc

...

@@ -9,6 +9,11 @@ namespace py = pybind11;
PYBIND11_MODULE(_infinilm, m) {
    m.doc() = "InfiniLM Llama model Python bindings";

    // Import _infinicore first so that its pybind11 types
    // (DataType, Device::Type, etc.) are registered in the
    // global type map before _infinilm tries to use them.
    py::module_::import("infinicore");

    infinilm::cache::bind_cache(m);
    infinilm::models::llama::bind_llama(m);
    infinilm::engine::distributed::bind_dist_config(m);
...
csrc/pybind11/engine/engine.hpp

...

@@ -34,14 +34,14 @@ inline void bind_infer_engine(py::module &m) {
        .def(py::init([](const InfinilmModel::Config &cfg,
                         const distributed::DistConfig &dist,
-                        infinicore::Device::Type dev,
+                        infinicore::Device::Type device_type,
                         std::shared_ptr<const infinilm::cache::CacheConfig> cache_cfg,
                         bool enable_graph_compiling,
                         const std::string &attention_backend) {
            return std::make_shared<InferEngine>(
-               cfg, dist, dev,
+               cfg, dist, device_type,
                cache_cfg ? cache_cfg.get() : nullptr,
                enable_graph_compiling,
                infinilm::backends::parse_attention_backend(attention_backend));
...

@@ -80,14 +80,14 @@ inline void bind_infer_engine(py::module &m) {
        .def(py::init([](const std::string &model_path,
                         const distributed::DistConfig &dist,
-                        infinicore::Device::Type dev,
+                        infinicore::Device::Type device_type,
                         std::shared_ptr<const infinilm::cache::CacheConfig> cache_cfg,
                         bool enable_graph_compiling,
                         const std::string &attention_backend) {
            return std::make_shared<InferEngine>(
-               model_path, dist, dev,
+               model_path, dist, device_type,
                cache_cfg ? cache_cfg.get() : nullptr,
                enable_graph_compiling,
                infinilm::backends::parse_attention_backend(attention_backend));
...
csrc/pybind11/models/llama.hpp

...

@@ -46,7 +46,6 @@ inline void bind_llama(py::module &m) {
    py::class_<LlamaConfig, InfinilmModel::Config> llama_config(m, "LlamaConfig");
    llama_config.def(py::init<>())
        // TODO: Change this to `dtype` after updating InfiniCore pybind11 exposing mechanism.
        .def_readwrite("_dtype", &LlamaConfig::dtype)
        .def_readwrite("vocab_size", &LlamaConfig::vocab_size)
        .def_readwrite("hidden_size", &LlamaConfig::hidden_size)
...
examples/bench.py

...

@@ -360,9 +360,16 @@ class TestModel:
        # Autoregressive generation
        # ---------------------------------------------------------------------------- #
        input_ids_infini = infinicore.from_list(input_ids_list)

        import os
        import ctypes
        rocm_path = os.environ.get('ROCM_PATH')
        lib_path = os.path.join(rocm_path, 'hip/lib/libgalaxyhip.so')
        lib = ctypes.CDLL(lib_path)
        roctracer_stop = lib.roctracer_stop
        roctracer_start = lib.roctracer_start

        t1 = time.time()
        print("=================== start generate ====================")
        roctracer_start()
        output_ids = self.model.generate(
            input_ids_infini,
            GenerationConfig(
...

@@ -375,6 +382,7 @@ class TestModel:
            ),
            _measure_and_log_time=True,
        )
        roctracer_stop()
        t2 = time.time()

        numpy_output_ids = np.array(
...
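Note: the roctracer hook added here assumes ROCM_PATH is set and that libgalaxyhip.so exports roctracer_start/roctracer_stop. A defensive variant of the same idea (a sketch, not part of this commit) would fall back to no-ops when the library is unavailable:

import ctypes
import os

def _load_roctracer_hooks():
    """Return (start, stop) callables; fall back to no-ops if the ROC tracer
    library cannot be loaded (e.g. ROCM_PATH unset or a non-DCU machine)."""
    rocm_path = os.environ.get("ROCM_PATH")
    if rocm_path:
        try:
            lib = ctypes.CDLL(os.path.join(rocm_path, "hip/lib/libgalaxyhip.so"))
            return lib.roctracer_start, lib.roctracer_stop
        except (OSError, AttributeError):
            pass
    noop = lambda: None
    return noop, noop

roctracer_start, roctracer_stop = _load_roctracer_hooks()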
examples/bench_prof.py (new file, mode 100755)

import infinicore
from transformers import AutoTokenizer

from infinilm.modeling_utils import load_model_state_dict_by_file
from infinilm.distributed import DistConfig
from infinilm.infer_engine import GenerationConfig, InferEngine
from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig

import argparse
import sys
import time
import os
import json
from collections import OrderedDict
import numpy as np
from tqdm import tqdm

import torch
from torch.profiler import ProfilerActivity
from torch.profiler import profile as torch_prof

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))

DATA_TYPE_BYTES = {
    "bfloat16": 2,
    "float16": 2,
    "float32": 4,
}

_PAGED_KV_BLOCK_SIZE = 256

# BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128]
# INPUT_LENS = [32, 256, 1024, 4096]
# OUTPUT_LENS = [256, 1024, 4096]
def read_json_file(file_path):
    """Load and return JSON content from file_path."""
    with open(file_path, "r") as file:
        return json.load(file)


def parse_list(value: str):
    """Parse parse_list argument: can be a single int or a list of ints.

    Examples:
        "1" -> 1
        "[1,2,4]" -> [1, 2, 4]
        "1,2,4" -> [1, 2, 4]
    """
    value = value.strip()

    # Try to parse as JSON list first
    if value.startswith("[") and value.endswith("]"):
        try:
            result = json.loads(value)
            if isinstance(result, list):
                return [int(x) for x in result]
            return int(result)
        except (json.JSONDecodeError, ValueError):
            pass

    # Try to parse as comma-separated values
    if "," in value:
        try:
            return [int(x.strip()) for x in value.split(",")]
        except ValueError:
            pass

    # Try to parse as a single integer
    try:
        return int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"batch-size must be an int or list[int], got: {value}"
        )
def get_test_cases(
    model_path: str,
    batch_size_list: list[int],
    input_len_list: list[int],
    output_len_list: list[int],
):
    """Generate cases ordered by ascending KV cache memory usage."""
    model_path = os.path.expanduser(model_path)

    # Load model config to derive attention dimensions
    config = read_json_file(os.path.join(model_path, "config.json"))
    head_dim = config.get(
        "head_dim", config.get("hidden_size") // config.get("num_attention_heads")
    )
    # KV heads and layers drive cache size
    num_key_value_heads = config.get("num_key_value_heads")
    num_hidden_layers = config.get("num_hidden_layers")

    # Enumerate all batch/input/output combinations and compute KV cache size
    case_list = []
    for batch_size in batch_size_list:
        for input_len in input_len_list:
            for output_len in output_len_list:
                for data_type in ["bfloat16"]:
                    data_type_bytes = DATA_TYPE_BYTES[data_type]
                    total_seq_len = input_len + output_len
                    kvcache_memory_bytes = (
                        data_type_bytes
                        * (batch_size * total_seq_len * num_key_value_heads * head_dim)
                        * num_hidden_layers
                    )
                    kvcache_memory_gb = kvcache_memory_bytes / (1024 * 1024 * 1024)
                    case_list.append(
                        {
                            "idx": len(case_list),
                            "batch_size": batch_size,
                            "input_len": input_len,
                            "output_len": output_len,
                            "data_type": data_type,
                            "kvcache_memory": round(kvcache_memory_gb, 3),
                        }
                    )

    # Sort by KV cache size and wrap in OrderedDict with index keys
    case_dict = OrderedDict(
        (idx, case)
        for idx, case in enumerate(
            sorted(case_list, key=lambda case: case["kvcache_memory"])
        )
    )
    return case_dict
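Note: as a worked example of the sizing formula above (one batch × total_seq_len × kv_heads × head_dim activation per layer), using illustrative 8B-class values rather than anything read from this repository; a cache that stores K and V separately would need roughly twice this:

DATA_TYPE_BYTES = {"bfloat16": 2}

# Illustrative config values (roughly 8B-class Llama); not read from config.json here.
num_key_value_heads = 8
head_dim = 128
num_hidden_layers = 32

batch_size, input_len, output_len = 1, 4096, 256
total_seq_len = input_len + output_len

kvcache_memory_bytes = (
    DATA_TYPE_BYTES["bfloat16"]
    * (batch_size * total_seq_len * num_key_value_heads * head_dim)
    * num_hidden_layers
)
print(round(kvcache_memory_bytes / (1024 ** 3), 3), "GiB")   # ~0.266 GiB with these numbers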
def get_args():
    parser = argparse.ArgumentParser(description="run Llama args")
    parser.add_argument(
        "--cpu",
        action="store_true",
        help="Run cpu test",
    )
    parser.add_argument(
        "--nvidia",
        action="store_true",
        help="Run nvidia test",
    )
    parser.add_argument(
        "--qy",
        action="store_true",
        help="Run qy test",
    )
    parser.add_argument(
        "--metax",
        action="store_true",
        help="Run metax test",
    )
    parser.add_argument(
        "--moore",
        action="store_true",
        help="Run moore test",
    )
    parser.add_argument(
        "--iluvatar",
        action="store_true",
        help="Run iluvatar test",
    )
    parser.add_argument(
        "--cambricon",
        action="store_true",
        help="Run cambricon test",
    )
    parser.add_argument(
        "--ali",
        action="store_true",
        help="Run alippu test",
    )
    parser.add_argument(
        "--hygon",
        action="store_true",
        help="Run hygon test",
    )
    parser.add_argument(
        "--model",
        type=str,
        required=True,
        help="model path",
    )
    parser.add_argument(
        "--batch-size",
        type=parse_list,
        default=1,
        help="number of prompts in a batch (can be an int or a list of ints, e.g., '1' or '[1,2,4]' or '1,2,4')",
    )
    parser.add_argument(
        "--tensor-parallel-size",
        "--tp",
        type=int,
        default=1,
        help="total rank for tensor parallel",
    )
    parser.add_argument(
        "--input-len",
        type=parse_list,
        default=10,
        help="input tokens",
    )
    parser.add_argument(
        "--output-len",
        type=parse_list,
        default=20,
        help="output tokens",
    )
    parser.add_argument(
        "--skip-load",
        action="store_true",
        help="skip loading model weights",
    )
    parser.add_argument(
        "--top-k",
        type=int,
        default=1,
        help="top k sampling",
    )
    parser.add_argument(
        "--top-p",
        type=float,
        default=1.0,
        help="top p sampling",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help="sampling temperature",
    )
    parser.add_argument(
        "--enable-paged-attn",
        action="store_true",
        help="use paged cache",
    )
    parser.add_argument(
        "--paged_kv_block_size",
        type=int,
        default=256,
        help="num tokens each kv block can hold",
    )
    parser.add_argument(
        "--enable-graph",
        action="store_true",
        help="enable graph compiling",
    )
    parser.add_argument(
        "--warmup",
        action="store_true",
        help="Perform a warmup run before benchmarking/inference.",
    )
    parser.add_argument(
        "--attn",
        type=str,
        default="default",
        choices=["default", "flash-attn"],
        help="attention backend to use: 'default' or 'flash-attn'",
    )
    return parser.parse_args()
with open("examples/bench_prompt.md", "r") as f:
    prompt = f.read()


def repeat_prompt(input_ids: list[int], target_length: int):
    num = len(input_ids)
    repeat_times = (target_length + num - 1) // num
    return (input_ids * repeat_times)[:target_length]
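repeat_prompt simply tiles the prompt's token ids until the requested length is reached and then truncates. A tiny self-contained usage sketch:

# Self-contained copy of the helper above, just to show its behaviour.
def repeat_prompt(input_ids, target_length):
    num = len(input_ids)
    repeat_times = (target_length + num - 1) // num
    return (input_ids * repeat_times)[:target_length]

prompt_ids = [11, 22, 33]
batch = [repeat_prompt(prompt_ids, 7)] * 2   # what TestModel.run builds for batch_size=2
print(batch)   # [[11, 22, 33, 11, 22, 33, 11], [11, 22, 33, 11, 22, 33, 11]]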
class TestModel:
    model: infinicore.nn.Module
    tokenizer: AutoTokenizer
    input_ids_list: list[int]

    def __init__(
        self,
        model_path,
        infini_device=infinicore.device("cpu", 0),
        tp=1,
        skip_load=False,
        cache_config=None,
        enable_graph=False,
        attn_backend="default",
    ) -> None:
        model_path = os.path.expanduser(model_path)

        # ---------------------------------------------------------------------------- #
        # Create the model
        # ---------------------------------------------------------------------------- #
        model = InferEngine(
            model_path,
            device=infini_device,
            distributed_config=DistConfig(tp),
            cache_config=cache_config,
            enable_graph_compiling=enable_graph,
            attention_backend=attn_backend,
        )

        # ---------------------------------------------------------------------------- #
        # Load the weights
        # ---------------------------------------------------------------------------- #
        if not skip_load:
            load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype)

        # ---------------------------------------------------------------------------- #
        # Create the tokenizer
        # ---------------------------------------------------------------------------- #
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        if tokenizer.pad_token is None:
            if tokenizer.eos_token is not None:
                tokenizer.pad_token = tokenizer.eos_token
                tokenizer.pad_token_id = tokenizer.eos_token_id
            else:
                tokenizer.add_special_tokens({"pad_token": "[PAD]"})

        # ---------------------------------------------------------------------------- #
        # Tokenize the prompt
        # ---------------------------------------------------------------------------- #
        input_content = [
            tokenizer.apply_chat_template(
                conversation=[{"role": "user", "content": prompt}],
                add_generation_prompt=True,
                tokenize=False,
            )
        ]
        # print(input_content, end="", flush=True)

        # Support Transformers >= 5.0 for batch_encode_plus deprecation
        encoding = tokenizer(
            input_content,
            padding=True,
            truncation=True,
            max_length=8192,
        )
        input_ids_list = encoding["input_ids"]

        self.model = model
        self.tokenizer = tokenizer
        self.input_ids_list = input_ids_list
    def run(
        self,
        batch_size: int,
        input_len: int,
        output_len: int,
        top_k=1,
        top_p=1.0,
        temperature=1.0,
    ):
        input_ids = repeat_prompt(self.input_ids_list[0], target_length=input_len)
        input_ids_list = [input_ids] * batch_size

        # ---------------------------------------------------------------------------- #
        # Autoregressive generation
        # ---------------------------------------------------------------------------- #
        input_ids_infini = infinicore.from_list(input_ids_list)

        t1 = time.time()
        print("=================== start generate ====================")
        output_ids = self.model.generate(
            input_ids_infini,
            GenerationConfig(
                max_new_tokens=output_len,
                eos_token_id=[],
                top_k=top_k,
                top_p=top_p,
                temperature=temperature,
                stop_on_eos=False,
            ),
            _measure_and_log_time=True,
        )
        t2 = time.time()

        numpy_output_ids = np.array(
            [output_id.to_numpy()[0] for output_id in output_ids]
        )
        print(self.tokenizer.decode(numpy_output_ids, skip_special_tokens=True))
        print(
            f"total_time: {round((t2 - t1) * 1000, 2)} ms",
        )
if __name__ == "__main__":
    args = get_args()
    print(args)

    # Parse command line arguments
    device_str = "cpu"
    if args.cpu:
        device_str = "cpu"
    elif args.nvidia:
        device_str = "cuda"
    elif args.qy:
        device_str = "cuda"
    elif args.metax:
        device_str = "cuda"
    elif args.moore:
        device_str = "musa"
    elif args.iluvatar:
        device_str = "cuda"
    elif args.cambricon:
        device_str = "mlu"
    elif args.ali:
        device_str = "cuda"
    elif args.hygon:
        device_str = "cuda"
    else:
        print(
            "python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tp=1 --input-len=50 --output-len=50"
        )
        sys.exit(1)

    _PAGED_KV_BLOCK_SIZE = args.paged_kv_block_size
    # -------------------------------------------------------- #
    # Unpack arguments
    # -------------------------------------------------------- #
    model_path = args.model
    infini_device = infinicore.device(device_str, 0)
    tp = args.tensor_parallel_size
    skip_load = args.skip_load
    batch_size = args.batch_size
    input_len = args.input_len
    output_len = args.output_len
    enable_paged_attn = args.enable_paged_attn
    enable_graph = args.enable_graph

    if isinstance(batch_size, int):
        batch_size = [batch_size]
    if isinstance(input_len, int):
        input_len = [input_len]
    if isinstance(output_len, int):
        output_len = [output_len]

    cases_dict = get_test_cases(model_path, batch_size, input_len, output_len)

    # -------------------------------------------------------- #
    # Run the tests
    # -------------------------------------------------------- #
    if enable_paged_attn:
        paged_kv_block_size = _PAGED_KV_BLOCK_SIZE
        max_num_blocks = max(
            [
                (
                    (c_["input_len"] + c_["output_len"] + (paged_kv_block_size - 1))
                    // paged_kv_block_size
                )
                * c_["batch_size"]
                for _, c_ in cases_dict.items()
            ]
        )
        cache_config = PagedKVCacheConfig(max_num_blocks, paged_kv_block_size)
    else:
        cache_config = None
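Note: a worked example of the block budget above, with illustrative cases: each case needs ceil((input_len + output_len) / block_size) blocks per request times its batch size, and the pool is sized for the worst case.

import math

paged_kv_block_size = 256
cases = [
    {"batch_size": 1, "input_len": 4096, "output_len": 256},
    {"batch_size": 16, "input_len": 256, "output_len": 1024},
]
max_num_blocks = max(
    math.ceil((c["input_len"] + c["output_len"]) / paged_kv_block_size) * c["batch_size"]
    for c in cases
)
print(max_num_blocks)   # case 1 needs 17 blocks, case 2 needs 5*16 = 80 -> pool of 80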
    test = TestModel(
        model_path,
        infini_device=infini_device,
        tp=tp,
        skip_load=skip_load,
        cache_config=cache_config,
        enable_graph=enable_graph,
        attn_backend=args.attn,
    )

    # ---------------------------------------------------------------------------- #
    # Warmup
    # ---------------------------------------------------------------------------- #
    if args.warmup:
        warmup_steps = 1
        # warmup cache capacity
        warmup_cache_len = 128
        warmup_batch = len(test.input_ids_list)
        test.model.reset_cache(
            StaticKVCacheConfig(
                max_batch_size=warmup_batch,
                max_cache_len=warmup_cache_len,
            )
        )
        avg_prompt_len = min(64, max(len(ids) for ids in test.input_ids_list))
        warmup_ids = [
            ids[:avg_prompt_len] if len(ids) >= avg_prompt_len else ids
            for ids in test.input_ids_list
        ]
        input_ids_infini = infinicore.from_list(warmup_ids)
        print("=================== warmup start ===================")
        for _ in range(warmup_steps):
            _ = test.model.generate(
                input_ids_infini,
                GenerationConfig(
                    max_new_tokens=5,  # decode kernel warmup
                    temperature=args.temperature,
                    top_k=args.top_k,
                    top_p=args.top_p,
                    stop_on_eos=False,
                ),
                _measure_and_log_time=False,
            )
        print("=================== warmup done ====================")
        # reset cache back to benchmark config
        if cache_config is not None:
            test.model.reset_cache(cache_config)
    # ---------------------------------------------------------------------------- #
    # Warmup done
    # ---------------------------------------------------------------------------- #
    for idx, case in tqdm(cases_dict.items(), desc="Processing cases"):
        tqdm.write(f"\033[92mProcessing : {case}\033[0m")
        batch_size = case["batch_size"]
        input_len = case["input_len"]
        output_len = case["output_len"]

        if not enable_paged_attn:
            # reset cache if static kvcache is used
            initial_capacity = input_len + output_len
            test.model.reset_cache(
                StaticKVCacheConfig(
                    max_batch_size=batch_size, max_cache_len=initial_capacity
                )
            )

        # run test one case
        # test.run(
        #     batch_size=batch_size,
        #     input_len=input_len,
        #     output_len=output_len,
        #     top_k=args.top_k,
        #     top_p=args.top_p,
        #     temperature=args.temperature,
        # )

        # run test one case (under the torch profiler)
        log_file = f"/home/qi-yuan-2026/z_prof_logs/8b_fa_1_4096_256_prof.log"
        json_file = f"/home/qi-yuan-2026/z_prof_logs/8b_fa_1_4096_256_prof.json"
        # log_file = f"/root/qi_yuan/z_prof_logs/70b_1_32_256_prof.log"
        # json_file = f"/root/qi_yuan/z_prof_logs/70b_1_32_256_prof.json"

        with torch_prof(
            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
            record_shapes=True,
            profile_memory=False,
            with_stack=True,
        ) as prof:
            with torch.no_grad():
                test.run(
                    batch_size=batch_size,
                    input_len=input_len,
                    output_len=output_len,
                    top_k=args.top_k,
                    top_p=args.top_p,
                    temperature=args.temperature,
                )

        with open(log_file, "w") as log_fp:
            log_fp.write(prof.key_averages().table(sort_by="self_cuda_time_total"))
        print(prof.key_averages().table(sort_by="self_cuda_time_total"))
        prof.export_chrome_trace(json_file)
scripts/build.sh (new file, mode 100755)

#!/bin/bash
xmake f --use-kv-caching=true -cv
pip install -e . --no-build-isolation
\ No newline at end of file
scripts/run_infer.sh (new file, mode 100755)

cd InfiniLM-fa

# Basic inference
python examples/jiuge.py --hygon --model_path=../models/9g_8b_thinking_llama/

# With paged attention + flash-attn (use block_size=64 for Hygon)
python examples/jiuge.py --hygon --enable-paged-attn --paged_kv_block_size=64 --attn=flash-attn --model_path=../models/9g_8b_thinking_llama/

# With graph compilation
python examples/jiuge.py --hygon --enable-paged-attn --paged_kv_block_size=64 --attn=flash-attn --enable-graph --model_path=../models/9g_8b_thinking_llama/

# Multi-GPU (tensor parallel)
python examples/jiuge.py --hygon --tp=4 --model_path=../models/9g_8b_thinking_llama/

# Custom options
python examples/jiuge.py --hygon --model_path=<path> \
    --max_new_tokens=100 \
    --batch_size=1 \
    --prompt="How are you" \
    --top_k=1 --top_p=1.0 --temperature=1.0

python examples/bench.py --hygon --warmup --model=../models/9g_8b_thinking_llama/ \
    --enable-paged-attn --attn=flash-attn --enable-graph \
    --input-len=32,256,4096 --output-len=256,1024,2048,4096 \
    --batch-size=1 --skip-load >> perf_hygon_bs_1.txt

python examples/bench.py --hygon --warmup --model=../models/9g_8b_thinking_llama/ \
    --enable-paged-attn --attn=flash-attn --enable-graph \
    --input-len=32,256 --output-len=256,1024,2048,4096 \
    --batch-size=16 --skip-load >> perf_hygon_bs_16.txt
\ No newline at end of file