jerrrrry / infinilm

Commit 38ac084d
Authored Dec 03, 2025 by PanZezhong

issue/95: rename the pybind target to _infinilm

Parent: ae3ebe19
Showing 9 changed files with 60 additions and 74 deletions (+60 -74).
csrc/cache/kv_cache.hpp                               +11  -11
csrc/models/llama/llama_attention.cpp                 +31  -38
csrc/models/pybind11/models.cc                         +2   -2
python/infinilm/lib/__init__.py                        +2   -2
python/infinilm/models/llama/backends/cpp.py           +3   -3
setup.py                                               +2   -2
test/models/llama/test_intermediate_validation.py      +2   -2
test/models/llama/test_llama_inference.py              +5  -10
xmake.lua                                              +2   -4
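The rename is purely a packaging change: the pybind11 extension that used to build and import as _infinilm_llama now builds and imports as _infinilm, while the classes it exposes stay the same. As a rough sketch of the import path after this commit (class names and the wrapper call are taken from the diffs below; anything beyond what the diffs show is left out):

# Minimal sketch of the post-rename import path, assuming the package is installed.
from infinilm.lib import _infinilm  # previously: from infinilm.lib import _infinilm_llama

cpp_config = _infinilm.LlamaConfig()   # C++ config object exposed through pybind11
registry = _infinilm.HookRegistry()    # used by test_intermediate_validation.py
# backends/cpp.py constructs the model roughly as:
#   self._model = _infinilm.LlamaForCausalLM(config._underlying, device._underlying, dtype)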
csrc/cache/kv_cache.hpp
#pragma once

-#include "infinicore/tensor.hpp"
 #include "infinicore/device.hpp"
+#include "infinicore/tensor.hpp"
 #include <algorithm>
-#include <utility>
 #include <memory>
+#include <utility>

namespace infinilm::cache {

...

@@ -18,11 +18,11 @@ namespace infinilm::cache {
 * that needs KV caching for attention mechanisms.
 */
struct KVCache {
    infinicore::Tensor k_cache; // [n_kv_head, capacity, head_dim]
    infinicore::Tensor v_cache; // [n_kv_head, capacity, head_dim]
    size_t cache_position;      // Current position in cache
    size_t max_capacity;        // Maximum capacity of cache
    bool initialized;           // Whether cache has been initialized

    KVCache() : cache_position(0), max_capacity(0), initialized(false),

...

@@ -41,12 +41,12 @@ struct KVCache {
 * @param device Device
 */
void ensure_capacity(size_t num_kv_heads, size_t head_dim, size_t seq_len,
                     infinicore::DataType dtype, const infinicore::Device &device) {
    size_t required_capacity = cache_position + seq_len;

    // Lazy initialization
    if (!initialized) {
        max_capacity = std::max(required_capacity, size_t(4096)); // Start with at least 4096
        k_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim}, dtype, device);
        v_cache = infinicore::Tensor::empty({num_kv_heads, max_capacity, head_dim},

...

@@ -94,7 +94,7 @@ struct KVCache {
        // Ensure capacity
        ensure_capacity(num_kv_heads, head_dim, seq_len, k_new->dtype(), k_new->device());

        // Copy new k/v into cache at current position
        auto k_dst = k_cache->narrow({{1, cache_position, seq_len}});

...

@@ -113,4 +113,4 @@ struct KVCache {
    }
};

-} // namespace infinilm::models::common
+} // namespace infinilm::cache
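For readers following the cache logic rather than the reformatting, here is a rough Python model of what ensure_capacity plus the copy-at-cache_position step amount to. It is an illustrative sketch with NumPy standing in for infinicore::Tensor, not the real API; the 4096 floor, the [n_kv_head, capacity, head_dim] layout, and the narrow-at-dim-1 copy come from the header above, while the update() return value mirrors how llama_attention.cpp consumes external_cache->update().

import numpy as np

class KVCacheSketch:
    """Illustrative NumPy model of the KVCache growth/append logic (not the real API)."""

    def __init__(self):
        self.k_cache = None       # [n_kv_head, capacity, head_dim]
        self.v_cache = None
        self.cache_position = 0   # current write position along the capacity dim
        self.max_capacity = 0
        self.initialized = False

    def ensure_capacity(self, num_kv_heads, head_dim, seq_len, dtype):
        required = self.cache_position + seq_len
        if not self.initialized:
            self.max_capacity = max(required, 4096)  # start with at least 4096 slots
            self.k_cache = np.empty((num_kv_heads, self.max_capacity, head_dim), dtype=dtype)
            self.v_cache = np.empty((num_kv_heads, self.max_capacity, head_dim), dtype=dtype)
            self.initialized = True
        # The real header also handles required > max_capacity; that branch is elided in the diff.

    def update(self, k_new, v_new):
        # k_new, v_new: [n_kv_head, seq_len, head_dim], written along dim 1 at cache_position.
        n_kv, seq_len, head_dim = k_new.shape
        self.ensure_capacity(n_kv, head_dim, seq_len, k_new.dtype)
        pos = self.cache_position
        self.k_cache[:, pos:pos + seq_len, :] = k_new  # mirrors k_cache->narrow({{1, pos, seq_len}})
        self.v_cache[:, pos:pos + seq_len, :] = v_new
        self.cache_position += seq_len
        # Return the filled prefix, i.e. [n_kv_head, total_seq_len, head_dim].
        return (self.k_cache[:, :self.cache_position, :],
                self.v_cache[:, :self.cache_position, :])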
csrc/models/llama/llama_attention.cpp
...

@@ -3,17 +3,17 @@
 #include "infinicore/nn/rope.hpp"
 #include "infinicore/ops.hpp"
 #include "infinicore/ops/mul.hpp"
-#include <spdlog/spdlog.h>
-#include <algorithm>
 #include <cmath>
 #include <cstring>
-#include <stdexcept>
 #include <iostream>
+#include <algorithm>
+#include <spdlog/spdlog.h>
+#include <stdexcept>

namespace infinilm::models::llama {

LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Device &device,
                               infinicore::DataType dtype)
    : hidden_size_(config.hidden_size),
      num_attention_heads_(config.num_attention_heads),
      num_key_value_heads_(config.num_key_value_heads),

...

@@ -22,19 +22,18 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config, const infinicore::Devi
      use_bias_(config.attention_bias) {
    // Initialize projection layers
    INFINICORE_NN_MODULE_INIT(q_proj, hidden_size_, hidden_size_, use_bias_, dtype, device);
    INFINICORE_NN_MODULE_INIT(k_proj, hidden_size_, kv_dim_, use_bias_, dtype, device);
    INFINICORE_NN_MODULE_INIT(v_proj, hidden_size_, kv_dim_, use_bias_, dtype, device);
    INFINICORE_NN_MODULE_INIT(o_proj, hidden_size_, hidden_size_, use_bias_, dtype, device);
}

infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_states,
                                           const infinicore::Tensor &position_ids,
                                           void *kv_cache) const {
    if (!rotary_emb_) {
        throw std::runtime_error("LlamaAttention: rotary_emb not configured");
    }

...

@@ -45,12 +44,11 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
    size_t seq_len = shape[1];

    // 1. Project Q, K, V
    auto q = q_proj_->forward(hidden_states_mutable); // [batch, seq_len, hidden_size]
    auto k = k_proj_->forward(hidden_states_mutable); // [batch, seq_len, kv_dim]
    auto v = v_proj_->forward(hidden_states_mutable); // [batch, seq_len, kv_dim]

    // 2. Reshape for multi-head attention

...

@@ -84,7 +82,6 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
    auto q_for_rope = q_reshaped->view({batch_size * seq_len, num_attention_heads_, head_dim_})->contiguous();
    auto k_for_rope = k_reshaped->view({batch_size * seq_len, num_key_value_heads_, head_dim_})->contiguous();

    // Call RoPE on full batch (matching Python pattern)
    auto q_rope_out = rotary_emb_->forward(q_for_rope, pos_ids_for_rope);
    auto k_rope_out = rotary_emb_->forward(k_for_rope, pos_ids_for_rope);

...

@@ -98,8 +95,7 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
    auto output_tensor = infinicore::Tensor::empty(
        {batch_size, seq_len, hidden_size_},
        q->dtype(), q->device());

    for (size_t b = 0; b < batch_size; ++b) {
        // Extract batch item from RoPE output (already computed above for full batch)

...

@@ -110,13 +106,13 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
        // Convert to [n_head, seq_len, head_dim] for cache
        // Ensure contiguous after permute for F16 compatibility with cache operations
        auto q_rope = q_batch->permute({1, 0, 2})->contiguous();     // [n_q_head, seq_len, head_dim]
        auto k_rope = k_batch->permute({1, 0, 2})->contiguous();     // [n_kv_head, seq_len, head_dim]
        auto v_permuted = v_batch->permute({1, 0, 2})->contiguous(); // [n_kv_head, seq_len, head_dim]

        // 5. Prepare KV caches
-        infinicore::Tensor k_total = infinicore::Tensor::empty({1, 1, 1}, k_rope->dtype(), k_rope->device());
-        infinicore::Tensor v_total = infinicore::Tensor::empty({1, 1, 1}, v_permuted->dtype(), v_permuted->device());
+        infinicore::Tensor k_total;
+        infinicore::Tensor v_total;
        if (external_cache != nullptr) {
            auto [k_total_tmp, v_total_tmp] = external_cache->update(k_rope, v_permuted);
            k_total = k_total_tmp;

...

@@ -136,11 +132,11 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
        // Extract from KV cache (k_total and v_total are [n_kv_head, total_seq_len, head_dim])
        // Python: key_states_total.narrow(0, i, 1).view((total_seq_len, num_key_value_heads, head_dim))
        // Python's narrow+view ensures contiguous memory, so we need to ensure contiguous before permute
        auto k_for_attn = k_total->permute({1, 0, 2}); // [total_seq_len, n_kv_head, head_dim]
        auto v_for_attn = v_total->permute({1, 0, 2}); // [total_seq_len, n_kv_head, head_dim]

        // q_batch is already [seq_len, n_q_head, head_dim] from above
        auto q_for_attn = q_batch; // [seq_len, n_q_head, head_dim]

        // Python: grouped_query_attention calls repeat_kv if ngroup > 1
        // Python: repeat_kv expands [total_seq_len, num_key_value_heads, head_dim] -> [total_seq_len, num_attention_heads, head_dim]

...

@@ -154,15 +150,13 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
            auto k_strides = k_for_attn->strides();
            auto k_strided = k_for_attn->as_strided(
                {total_seq_len, n_kv_head, ngroup, head_dim},
                {k_strides[0], k_strides[1], 0, k_strides[2]});
            k_for_attn = k_strided->contiguous()->view({total_seq_len, n_kv_head * ngroup, head_dim});

            auto v_strides = v_for_attn->strides();
            auto v_strided = v_for_attn->as_strided(
                {total_seq_len, n_kv_head, ngroup, head_dim},
                {v_strides[0], v_strides[1], 0, v_strides[2]});
            v_for_attn = v_strided->contiguous()->view({total_seq_len, n_kv_head * ngroup, head_dim});
        }

...

@@ -170,26 +164,25 @@ infinicore::Tensor LlamaAttention::forward(const infinicore::Tensor &hidden_stat
        // Python: Q = querys.permute((1, 0, 2)) # [num_heads, seq_len, head_dim]
        // Python: K = keys # [total_seq_len, num_heads, head_dim] (NO permute!)
        // Python: V = values.permute((1, 0, 2)) # [num_heads, total_seq_len, head_dim]
        auto Q = q_for_attn->permute({1, 0, 2}); // [n_q_head, seq_len, head_dim]
        auto K = k_for_attn;                     // [total_seq_len, n_q_head, head_dim] - keep as-is (matching Python)
        auto V = v_for_attn->permute({1, 0, 2}); // [n_q_head, total_seq_len, head_dim]

        // Python: attn_weight = Q @ K.permute((1, 2, 0))
        // Python: K.permute((1, 2, 0)) transforms [total_seq_len, num_heads, head_dim] -> [num_heads, head_dim, total_seq_len]
        auto K_transposed = K->permute({1, 2, 0}); // [n_q_head, head_dim, total_seq_len]

        // Use GEMM with alpha=scaling to combine scaling with matrix multiplication
        // This is more efficient than doing matmul followed by mul
        float scaling = 1.0f / std::sqrt(static_cast<float>(head_dim_));
        auto attn_weight = infinicore::op::matmul(Q, K_transposed, scaling); // [n_q_head, seq_len, total_seq_len]
        infinicore::op::causal_softmax_(attn_weight, attn_weight);
        auto out = infinicore::op::matmul(attn_weight, V); // [n_q_head, seq_len, head_dim]

        // Python: return out.permute((1, 0, 2)).contiguous() # [seq_len, num_heads, head_dim]
        auto attn_output = out->permute({1, 0, 2})->contiguous(); // [seq_len, n_q_head, head_dim]

        // Python: attn_output_i.copy_(attention_i)
        // Python: attn_output = attn_output.view(hidden_states_shape) # [bs, seq_len, hidden_size]

...
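The `// Python:` comments above describe the reference layout this C++ path mirrors. As a sanity sketch of that math for a single batch item, with NumPy in place of the tensor library and a naive masked softmax standing in for causal_softmax_ (the shapes and permutes follow the comments; this is not the infinicore API, and the mask offset assumes the new queries occupy the last seq_len positions of the cached sequence):

import numpy as np

def causal_softmax(w):
    # w: [n_head, seq_len, total_seq_len]; each query may attend to cached positions
    # up to and including its own (assumed) absolute position.
    n_head, seq_len, total = w.shape
    offset = total - seq_len
    mask = np.arange(total)[None, :] > (np.arange(seq_len)[:, None] + offset)
    w = np.where(mask[None, :, :], -np.inf, w)
    w = w - w.max(axis=-1, keepdims=True)
    e = np.exp(w)
    return e / e.sum(axis=-1, keepdims=True)

def attention_one_batch(q, k_total, v_total, head_dim):
    # q:       [seq_len, n_head, head_dim]         (q_for_attn)
    # k_total: [total_seq_len, n_head, head_dim]   (K kept as-is, after any repeat_kv expansion)
    # v_total: [total_seq_len, n_head, head_dim]
    Q = q.transpose(1, 0, 2)                  # [n_head, seq_len, head_dim]
    K_t = k_total.transpose(1, 2, 0)          # [n_head, head_dim, total_seq_len]
    V = v_total.transpose(1, 0, 2)            # [n_head, total_seq_len, head_dim]
    scaling = 1.0 / np.sqrt(head_dim)         # folded into the GEMM alpha in the C++ code
    attn = causal_softmax(Q @ K_t * scaling)  # [n_head, seq_len, total_seq_len]
    out = attn @ V                            # [n_head, seq_len, head_dim]
    return out.transpose(1, 0, 2)             # [seq_len, n_head, head_dim]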
csrc/models/pybind11/models.cc
-#include <pybind11/pybind11.h>
 #include "models/llama.hpp"
+#include <pybind11/pybind11.h>

namespace py = pybind11;

-PYBIND11_MODULE(_infinilm_llama, m) {
+PYBIND11_MODULE(_infinilm, m) {
    m.doc() = "InfiniLM Llama model Python bindings";
    infinilm::models::llama::bind_llama(m);

...
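Because PYBIND11_MODULE now declares the module as _infinilm, the compiled extension must be imported under that name. A minimal smoke check, assuming the built .so has been installed somewhere on sys.path (e.g. python/infinilm/lib, per the files below):

# Import smoke test for the renamed extension module.
import _infinilm
print(_infinilm.__doc__)  # "InfiniLM Llama model Python bindings"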
python/infinilm/lib/__init__.py
...

@@ -14,6 +14,6 @@ if str(_lib_dir) not in sys.path:
# Import the compiled C++ module
# The .so file should be installed in this directory by xmake
-import _infinilm_llama
+import _infinilm

-__all__ = ["_infinilm_llama"]
+__all__ = ["_infinilm"]
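Only the two renamed lines of python/infinilm/lib/__init__.py are part of this commit; the hunk header shows that an _lib_dir / sys.path shim sits above them. A hypothetical reconstruction of that shim, shown only for context (the surrounding lines and the Path-based _lib_dir definition are assumptions, not part of the diff):

# Hypothetical sketch of python/infinilm/lib/__init__.py after this commit;
# only the import and __all__ lines marked +/- above actually changed.
import sys
from pathlib import Path

_lib_dir = Path(__file__).parent  # where xmake installs the .so (assumed definition)
if str(_lib_dir) not in sys.path:
    sys.path.insert(0, str(_lib_dir))

# Import the compiled C++ module
import _infinilm

__all__ = ["_infinilm"]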
python/infinilm/models/llama/backends/cpp.py
from ....generation.utils import GenerationMixin
import infinicore
from infinilm.models.llama.configuration_llama import LlamaConfig as _LlamaConfig
-from infinilm.lib import _infinilm_llama
+from infinilm.lib import _infinilm
import json
import os
from typing import Optional, Union

...

@@ -49,7 +49,7 @@ class LlamaConfig:
    def _underlying(self):
        """Get underlying C++ config object, creating it if needed"""
        if self._cpp_config is None:
-            self._cpp_config = _infinilm_llama.LlamaConfig()
+            self._cpp_config = _infinilm.LlamaConfig()
            # Copy attributes from Python config to C++ config
            for key in dir(self._python_config):

...

@@ -107,7 +107,7 @@ class LlamaForCausalLM(GenerationMixin):
        self.use_cache = False
        self._device = device
-        self._model = _infinilm_llama.LlamaForCausalLM(
+        self._model = _infinilm.LlamaForCausalLM(
            config._underlying,
            device._underlying,
            dtype
        )

...
setup.py
...

@@ -9,8 +9,8 @@ from setuptools.command.egg_info import egg_info
def build_cpp_module():
    """Build and install the C++ extension module"""
-    subprocess.run(["xmake", "build", "_infinilm_llama"], check=True)
-    subprocess.run(["xmake", "install", "_infinilm_llama"], check=True)
+    subprocess.run(["xmake", "build", "_infinilm"], check=True)
+    subprocess.run(["xmake", "install", "_infinilm"], check=True)

class Build(build):

...
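build_cpp_module now drives xmake with the renamed target. The diff shows a Build command subclass but not its body; a typical wiring, given only as an assumption about how the xmake step attaches to the setuptools build (the import path for build and the run() override are not shown in the diff):

# Hypothetical sketch of hooking build_cpp_module into the setuptools build command.
import subprocess
from setuptools.command.build import build  # assumed import; setup.py's actual import is not shown

def build_cpp_module():
    """Build and install the C++ extension module (as in setup.py after this commit)."""
    subprocess.run(["xmake", "build", "_infinilm"], check=True)
    subprocess.run(["xmake", "install", "_infinilm"], check=True)

class Build(build):
    def run(self):
        build_cpp_module()  # build/install the pybind11 extension first (assumed ordering)
        super().run()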
test/models/llama/test_intermediate_validation.py
...

@@ -27,7 +27,7 @@ except ImportError as e:
try:
    from infinilm.models.llama import LlamaConfig, LlamaForCausalLM, Device
-    import _infinilm_llama  # Import C++ bindings for HookRegistry
+    import _infinilm  # Import C++ bindings for HookRegistry
except ImportError as e:
    print(f"Error: InfiniLM Python package not found. Please install it: {e}")
    sys.exit(1)

...

@@ -756,7 +756,7 @@ def test_intermediate_validation(
    infini_position_ids = torch_to_infinicore_tensor(position_ids, infini_device)

    # Create hook registry and register hooks
-    hook_registry = _infinilm_llama.HookRegistry()
+    hook_registry = _infinilm.HookRegistry()

    def make_infinilm_hook(name):
        def hook(hook_name, tensor, layer_idx):

...
test/models/llama/test_llama_inference.py
...

@@ -36,7 +36,7 @@ except ImportError as e:
    print(f"Error: InfiniLM Python package not found. Please install it:")
    print(f"  pip install -e .")
    print(f"  or")
-    print(f"  xmake build _infinilm_llama && xmake install _infinilm_llama")
+    print(f"  xmake build _infinilm && xmake install _infinilm")
    print(f"  Error: {e}")
    sys.exit(1)

...

@@ -487,9 +487,6 @@ def validate_inference(
def main():
    """Main test function"""
-    # Default model path
-    # default_model_dir = "/var/qy_home/zenghua/.cache/modelscope/hub/models/LLM-Research/Llama-3.2-1B-Instruct"
-    default_model_dir = "/var/qy_home/zenghua/.cache/modelscope/hub/models/AI-ModelScope/TinyLlama-1.1B-Chat-v1.0"

    # Default prompt
    default_prompt = "Hello, how are you?"

...

@@ -545,8 +542,6 @@ def main():
            sys.exit(1)
        i += 1

-    if model_dir is None:
-        model_dir = default_model_dir

    if not os.path.exists(model_dir):
        print(f"Error: Model directory not found: {model_dir}")

...

@@ -560,11 +555,11 @@ def main():
        )
        print(f"  Examples: cpu, cuda, cuda:0, cuda:1")
        print(f"\nExamples:")
-        print(f"  {sys.argv[0]} {default_model_dir}")
-        print(f'  {sys.argv[0]} {default_model_dir} --prompt "What is AI?"')
-        print(f"  {sys.argv[0]} {default_model_dir} --device cuda:0")
+        print(f"  {sys.argv[0]} dir/to/model")
+        print(f'  {sys.argv[0]} dir/to/model --prompt "What is AI?"')
+        print(f"  {sys.argv[0]} dir/to/model --device cuda:0")
-        print(f'  {sys.argv[0]} {default_model_dir} --prompt "What is AI?" --device cuda:0')
+        print(f'  {sys.argv[0]} dir/to/model --prompt "What is AI?" --device cuda:0')
        sys.exit(1)

...
xmake.lua
...

@@ -32,8 +32,7 @@ target("infinicore_infer")
    add_installfiles("include/infinicore_infer/models/*.h", {prefixdir = "include/infinicore_infer/models"})
target_end()

-- Python bindings for Llama model
-target("_infinilm_llama")
+target("_infinilm")
    add_packages("pybind11")
    set_default(false)
    add_rules("python.module", {soabi = true})

...

@@ -52,8 +51,7 @@ target("_infinilm_llama")
    add_links("infinicore_cpp_api", "infiniop", "infinirt", "infiniccl")

    -- Add Llama model files
-    add_files("csrc/models/llama/llama_*.cpp")
-    add_files("csrc/models/debug_utils/*.cpp")
+    add_files("csrc/models/*/*.cpp")
    add_files("csrc/models/pybind11/models.cc")
    set_installdir("python/infinilm")

...