Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinilm
Commits
3d328d61
Unverified
Commit
3d328d61
authored
Dec 06, 2025
by
PanZezhong1725
Committed by
GitHub
Dec 06, 2025
Browse files
issue/92 添加InferEngine,支持多线程推理
parent
0794f307
Changes
24
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
36 additions
and
19 deletions
+36
-19
python/infinilm/models/llama/__init__.py
python/infinilm/models/llama/__init__.py
+3
-0
python/infinilm/models/llama/backends/cpp.py
python/infinilm/models/llama/backends/cpp.py
+30
-14
python/infinilm/models/llama/modeling_llama.py
python/infinilm/models/llama/modeling_llama.py
+2
-4
setup.py
setup.py
+1
-1
No files found.
python/infinilm/models/llama/__init__.py
View file @
3d328d61
...
@@ -13,6 +13,7 @@ class AutoLlamaModel:
...
@@ -13,6 +13,7 @@ class AutoLlamaModel:
device
:
infinicore
.
device
,
device
:
infinicore
.
device
,
dtype
=
infinicore
.
dtype
,
dtype
=
infinicore
.
dtype
,
backend
=
"python"
,
backend
=
"python"
,
**
kwargs
,
):
):
if
backend
==
"python"
:
if
backend
==
"python"
:
from
.
import
modeling_llama
from
.
import
modeling_llama
...
@@ -21,6 +22,7 @@ class AutoLlamaModel:
...
@@ -21,6 +22,7 @@ class AutoLlamaModel:
model_path
,
model_path
,
device
=
device
,
device
=
device
,
dtype
=
dtype
,
dtype
=
dtype
,
**
kwargs
,
)
)
elif
backend
==
"cpp"
:
elif
backend
==
"cpp"
:
...
@@ -30,6 +32,7 @@ class AutoLlamaModel:
...
@@ -30,6 +32,7 @@ class AutoLlamaModel:
model_path
,
model_path
,
device
=
device
,
device
=
device
,
dtype
=
dtype
,
dtype
=
dtype
,
**
kwargs
,
)
)
raise
KeyError
(
"invalid backend"
)
raise
KeyError
(
"invalid backend"
)
python/infinilm/models/llama/backends/cpp.py
View file @
3d328d61
...
@@ -2,6 +2,7 @@ from ....generation.utils import GenerationMixin
...
@@ -2,6 +2,7 @@ from ....generation.utils import GenerationMixin
import
infinicore
import
infinicore
from
infinilm.models.llama.configuration_llama
import
LlamaConfig
as
_LlamaConfig
from
infinilm.models.llama.configuration_llama
import
LlamaConfig
as
_LlamaConfig
from
infinilm.lib
import
_infinilm
from
infinilm.lib
import
_infinilm
from
infinilm.distributed
import
DistConfig
import
json
import
json
import
os
import
os
from
typing
import
Optional
,
Union
from
typing
import
Optional
,
Union
...
@@ -85,7 +86,13 @@ class LlamaConfig:
...
@@ -85,7 +86,13 @@ class LlamaConfig:
class
LlamaForCausalLM
(
GenerationMixin
):
class
LlamaForCausalLM
(
GenerationMixin
):
"""Llama model for causal language modeling"""
"""Llama model for causal language modeling"""
def
__init__
(
self
,
config
,
device
=
None
,
dtype
=
None
):
def
__init__
(
self
,
config
,
device
=
None
,
dtype
=
None
,
distributed_config
=
DistConfig
(
1
),
):
"""
"""
Create LlamaForCausalLM
Create LlamaForCausalLM
...
@@ -96,10 +103,7 @@ class LlamaForCausalLM(GenerationMixin):
...
@@ -96,10 +103,7 @@ class LlamaForCausalLM(GenerationMixin):
"""
"""
super
().
__init__
()
super
().
__init__
()
if
isinstance
(
config
,
dict
):
self
.
config
=
config
config
=
LlamaConfig
(
**
config
)
elif
not
isinstance
(
config
,
LlamaConfig
):
config
=
LlamaConfig
(
**
config
)
if
device
is
None
:
if
device
is
None
:
device
=
infinicore
.
device
()
device
=
infinicore
.
device
()
...
@@ -107,8 +111,11 @@ class LlamaForCausalLM(GenerationMixin):
...
@@ -107,8 +111,11 @@ class LlamaForCausalLM(GenerationMixin):
self
.
use_cache
=
False
self
.
use_cache
=
False
self
.
_device
=
device
self
.
_device
=
device
self
.
_model
=
_infinilm
.
LlamaForCausalLM
(
# self._model = _infinilm.LlamaForCausalLM(
config
.
_underlying
,
device
.
_underlying
,
dtype
# config._underlying, device._underlying, dtype
# )
self
.
_model
=
_infinilm
.
InferEngine
(
config
.
_underlying
,
distributed_config
.
_underlying
,
device
.
_underlying
.
type
)
)
def
state_dict
(
self
):
def
state_dict
(
self
):
...
@@ -122,7 +129,9 @@ class LlamaForCausalLM(GenerationMixin):
...
@@ -122,7 +129,9 @@ class LlamaForCausalLM(GenerationMixin):
Args:
Args:
state_dict: Dictionary mapping parameter names to InfiniCore tensors, numpy arrays, or torch tensors
state_dict: Dictionary mapping parameter names to InfiniCore tensors, numpy arrays, or torch tensors
"""
"""
self
.
_model
.
load_state_dict
(
state_dict
,
self
.
_device
.
_underlying
)
# self._model.load_state_dict(state_dict, self._device._underlying)
for
name
,
param
in
state_dict
.
items
():
self
.
_model
.
load_param
(
name
,
param
.
_underlying
)
def
get_parameter
(
self
,
name
):
def
get_parameter
(
self
,
name
):
"""
"""
...
@@ -136,15 +145,21 @@ class LlamaForCausalLM(GenerationMixin):
...
@@ -136,15 +145,21 @@ class LlamaForCausalLM(GenerationMixin):
"""
"""
return
self
.
_model
.
get_parameter
(
name
)
return
self
.
_model
.
get_parameter
(
name
)
@
property
#
@property
def
config
(
self
):
#
def config(self):
"""Get model configuration"""
#
"""Get model configuration"""
return
self
.
_model
.
config
()
#
return self._model.config()
def
forward
(
self
,
input_ids
,
position_ids
,
*
args
,
**
kwargs
):
def
forward
(
self
,
input_ids
,
position_ids
,
*
args
,
**
kwargs
):
kv_caches
=
None
kv_caches
=
None
# return infinicore.Tensor(
# self._model.forward(input_ids, position_ids, kv_caches)
# )
return
infinicore
.
Tensor
(
return
infinicore
.
Tensor
(
self
.
_model
.
forward
(
input_ids
,
position_ids
,
kv_caches
)
self
.
_model
.
generate
(
input_ids
.
_underlying
,
position_ids
.
_underlying
,
)
)
)
def
__call__
(
self
,
input_ids
,
position_ids
,
*
args
,
**
kwargs
):
def
__call__
(
self
,
input_ids
,
position_ids
,
*
args
,
**
kwargs
):
...
@@ -156,6 +171,7 @@ class LlamaForCausalLM(GenerationMixin):
...
@@ -156,6 +171,7 @@ class LlamaForCausalLM(GenerationMixin):
model_path
:
Union
[
str
,
os
.
PathLike
],
model_path
:
Union
[
str
,
os
.
PathLike
],
device
:
Optional
[
infinicore
.
device
]
=
None
,
device
:
Optional
[
infinicore
.
device
]
=
None
,
dtype
:
Optional
[
infinicore
.
dtype
]
=
None
,
dtype
:
Optional
[
infinicore
.
dtype
]
=
None
,
**
kwargs
,
):
):
"""
"""
Load a pretrained LlamaForCausalLM model from a directory.
Load a pretrained LlamaForCausalLM model from a directory.
...
@@ -176,4 +192,4 @@ class LlamaForCausalLM(GenerationMixin):
...
@@ -176,4 +192,4 @@ class LlamaForCausalLM(GenerationMixin):
config_dict
=
json
.
load
(
f
)
config_dict
=
json
.
load
(
f
)
config
=
LlamaConfig
(
config_dict
)
config
=
LlamaConfig
(
config_dict
)
return
cls
(
config
,
device
=
device
,
dtype
=
dtype
)
return
cls
(
config
,
device
=
device
,
dtype
=
dtype
,
**
kwargs
)
python/infinilm/models/llama/modeling_llama.py
View file @
3d328d61
...
@@ -17,7 +17,6 @@ import json
...
@@ -17,7 +17,6 @@ import json
import
os
import
os
from
typing
import
Optional
,
Union
from
typing
import
Optional
,
Union
from
transformers.utils
import
logging
import
infinicore
import
infinicore
...
@@ -25,8 +24,6 @@ from ...cache_utils import Cache, DynamicCache
...
@@ -25,8 +24,6 @@ from ...cache_utils import Cache, DynamicCache
from
...generation.utils
import
GenerationMixin
from
...generation.utils
import
GenerationMixin
from
.configuration_llama
import
LlamaConfig
from
.configuration_llama
import
LlamaConfig
logger
=
logging
.
get_logger
(
__name__
)
def
repeat_kv
(
keys
:
infinicore
.
Tensor
,
values
:
infinicore
.
Tensor
,
ngroup
:
int
):
def
repeat_kv
(
keys
:
infinicore
.
Tensor
,
values
:
infinicore
.
Tensor
,
ngroup
:
int
):
total_seq_len
,
num_heads
,
head_dim
=
keys
.
shape
total_seq_len
,
num_heads
,
head_dim
=
keys
.
shape
...
@@ -399,6 +396,7 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
...
@@ -399,6 +396,7 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
bias
=
False
,
bias
=
False
,
**
kwargs
,
**
kwargs
,
)
)
self
.
device
=
kwargs
.
get
(
"device"
,
infinicore
.
device
(
"cpu"
))
def
forward
(
def
forward
(
self
,
self
,
...
@@ -410,7 +408,7 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
...
@@ -410,7 +408,7 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
):
):
last_token
=
self
.
model
(
last_token
=
self
.
model
(
input_ids
,
input_ids
,
position_ids
,
position_ids
.
to
(
self
.
device
)
,
past_key_values
=
past_key_values
,
past_key_values
=
past_key_values
,
use_cache
=
use_cache
,
use_cache
=
use_cache
,
**
kwargs
,
**
kwargs
,
...
...
setup.py
View file @
3d328d61
...
@@ -37,7 +37,7 @@ setup(
...
@@ -37,7 +37,7 @@ setup(
version
=
"0.1.0"
,
version
=
"0.1.0"
,
description
=
"InfiniLM model implementations"
,
description
=
"InfiniLM model implementations"
,
package_dir
=
{
""
:
"python"
},
package_dir
=
{
""
:
"python"
},
packages
=
[
"infinilm"
,
"infinilm.models"
,
"infinilm.lib"
],
packages
=
[
"infinilm"
,
"infinilm.models"
,
"infinilm.lib"
,
"infinilm.distributed"
],
cmdclass
=
{
cmdclass
=
{
"build"
:
Build
,
"build"
:
Build
,
"develop"
:
Develop
,
"develop"
:
Develop
,
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment