jerrrrry / infinilm · Commits · 13a4154a

Unverified commit 13a4154a, authored Dec 23, 2025 by Jiacheng Huang, committed by GitHub on Dec 23, 2025.
issue/150 Remove `dtype` passing from the inference interface
Parent: 91c06fd9

Showing 20 changed files with 55 additions and 88 deletions (+55, -88):
csrc/models/llama/llama_attention.cpp                 +1  -1
csrc/models/llama/llama_attention.hpp                 +0  -1
csrc/models/llama/llama_config.hpp                    +3  -0
csrc/models/llama/llama_decoder_layer.cpp             +6  -5
csrc/models/llama/llama_decoder_layer.hpp             +0  -1
csrc/models/llama/llama_for_causal_lm.cpp             +3  -2
csrc/models/llama/llama_for_causal_lm.hpp             +0  -2
csrc/models/llama/llama_mlp.cpp                       +1  -1
csrc/models/llama/llama_mlp.hpp                       +0  -1
csrc/models/llama/llama_model.cpp                     +3  -2
csrc/models/llama/llama_model.hpp                     +0  -1
csrc/models/model_factory.cpp                         +1  -1
csrc/pybind11/models/llama.hpp                        +2  -0
examples/bench.py                                     +1  -20
examples/jiuge.py                                     +1  -18
examples/llama.py                                     +1  -18
python/infinilm/models/llama/__init__.py              +0  -2
python/infinilm/models/llama/configuration_llama.py   +9  -0
python/infinilm/models/llama/modeling_llama.py        +22 -9
test/bench/test_benchmark.py                          +1  -3
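Taken together, the diffs below remove `dtype` from every inference-facing constructor and entry point: the data type now lives on `LlamaConfig` (populated from `torch_dtype` in the checkpoint's config.json) and is read internally wherever modules are built. As a rough sketch of the resulting Python call pattern (placeholder model path; `load_model_state_dict_by_file` is assumed to be imported the same way the example scripts import it, which this diff does not show):

    import infinicore
    import infinilm

    model_path = "/path/to/llama"  # placeholder checkpoint directory with config.json
    model = infinilm.AutoLlamaModel.from_pretrained(
        model_path,
        device=infinicore.device("cpu", 0),
        backend="cpp",
        # no dtype= argument anymore; it is derived from torch_dtype in config.json
    )
    # weights are loaded in whatever dtype the config resolved
    load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype)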
csrc/models/llama/llama_attention.cpp

@@ -16,7 +16,6 @@ namespace infinilm::models::llama {
 LlamaAttention::LlamaAttention(const LlamaConfig &config,
                                const infinicore::Device &device,
                                size_t layer_idx,
-                               infinicore::DataType dtype,
                                engine::distributed::RankInfo rank_info)
     : layer_idx_(layer_idx),
       hidden_size_(config.hidden_size),
@@ -27,6 +26,7 @@ LlamaAttention::LlamaAttention(const LlamaConfig &config,
       use_bias_(config.attention_bias),
       use_output_bias_(config.attention_output_bias),
       max_position_embeddings_(config.max_position_embeddings),
       rank_info_(rank_info) {
+    const auto &dtype{config.dtype};
     int tp_rank = rank_info.tp_rank;
     int tp_size = rank_info.tp_size;
csrc/models/llama/llama_attention.hpp

@@ -38,7 +38,6 @@ public:
     LlamaAttention(const LlamaConfig &config,
                    const infinicore::Device &device,
                    size_t layer_idx,
-                   infinicore::DataType dtype = infinicore::DataType::F32,
                    engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());

     /**
csrc/models/llama/llama_config.hpp

@@ -16,6 +16,9 @@ namespace infinilm::models::llama {
  * It follows the same structure as HuggingFace's LlamaConfig.
  */
 struct LlamaConfig : public InfinilmModel::Config {
+    // Data type
+    infinicore::DataType dtype = infinicore::DataType::F32;
+
     // Vocabulary and embedding
     size_t vocab_size = 32000;  // Vocabulary size
     size_t hidden_size = 4096;  // Hidden dimension size
csrc/models/llama/llama_decoder_layer.cpp

@@ -7,8 +7,9 @@ namespace infinilm::models::llama {
 LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config,
                                      const infinicore::Device &device,
                                      size_t layer_idx,
-                                     infinicore::DataType dtype,
-                                     engine::distributed::RankInfo rank_info)
-    : layer_idx_(layer_idx), rank_info_(rank_info){
+                                     engine::distributed::RankInfo rank_info)
+    : layer_idx_(layer_idx), rank_info_(rank_info) {
+    const auto &dtype{config.dtype};
+
     // Initialize layer normalization layers
     INFINICORE_NN_MODULE_INIT(input_layernorm, config.hidden_size, config.rms_norm_eps,
@@ -16,8 +17,8 @@ LlamaDecoderLayer::LlamaDecoderLayer(const LlamaConfig &config,
                               dtype, device);

     // Initialize attention and MLP modules
-    INFINICORE_NN_MODULE_INIT(self_attn, config, device, layer_idx, dtype, rank_info_);
-    INFINICORE_NN_MODULE_INIT(mlp, config, device, dtype, rank_info_);
+    INFINICORE_NN_MODULE_INIT(self_attn, config, device, layer_idx, rank_info_);
+    INFINICORE_NN_MODULE_INIT(mlp, config, device, rank_info_);
 }

 infinicore::Tensor LlamaDecoderLayer::forward(const infinicore::Tensor &hidden_states,
csrc/models/llama/llama_decoder_layer.hpp

@@ -36,7 +36,6 @@ public:
     LlamaDecoderLayer(const LlamaConfig &config,
                       const infinicore::Device &device,
                       size_t layer_idx,
-                      infinicore::DataType dtype = infinicore::DataType::F32,
                       engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());

     /**
csrc/models/llama/llama_for_causal_lm.cpp

@@ -8,14 +8,15 @@ namespace infinilm::models::llama {
 LlamaForCausalLM::LlamaForCausalLM(const LlamaConfig &config,
                                    const infinicore::Device &device,
-                                   infinicore::DataType dtype,
                                    engine::distributed::RankInfo rank_info) {
     // Initialize module's device_ member
     device_ = device;

+    const auto &dtype{config.dtype};
+
     // Initialize base model
-    INFINICORE_NN_MODULE_INIT(model, config, device, dtype, rank_info);
+    INFINICORE_NN_MODULE_INIT(model, config, device, rank_info);

     // Initialize language modeling head
     // Note: If tie_word_embeddings is true, we would share weights with embed_tokens
csrc/models/llama/llama_for_causal_lm.hpp

@@ -27,11 +27,9 @@ public:
      *
      * @param config Model configuration
      * @param device Device to create tensors on
-     * @param dtype Optional data type for model parameters (defaults to BF16)
      */
     LlamaForCausalLM(const LlamaConfig &config,
                      const infinicore::Device &device,
-                     infinicore::DataType dtype = infinicore::DataType::BF16,
                      engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());

     /**
...
csrc/models/llama/llama_mlp.cpp
View file @
13a4154a
...
...
@@ -6,11 +6,11 @@ namespace infinilm::models::llama {
LlamaMLP
::
LlamaMLP
(
const
LlamaConfig
&
config
,
const
infinicore
::
Device
&
device
,
infinicore
::
DataType
dtype
,
engine
::
distributed
::
RankInfo
rank_info
)
:
hidden_size_
(
config
.
hidden_size
),
intermediate_size_
(
config
.
intermediate_size
),
use_bias_
(
config
.
mlp_bias
),
rank_info_
(
rank_info
)
{
const
auto
&
dtype
{
config
.
dtype
};
int
tp_rank
=
rank_info
.
tp_rank
;
int
tp_size
=
rank_info
.
tp_size
;
...
...
csrc/models/llama/llama_mlp.hpp
View file @
13a4154a
...
...
@@ -35,7 +35,6 @@ public:
*/
LlamaMLP
(
const
LlamaConfig
&
config
,
const
infinicore
::
Device
&
device
,
infinicore
::
DataType
dtype
=
infinicore
::
DataType
::
F32
,
engine
::
distributed
::
RankInfo
rank_info
=
engine
::
distributed
::
RankInfo
());
/**
...
...
csrc/models/llama/llama_model.cpp
View file @
13a4154a
...
...
@@ -9,9 +9,10 @@ namespace infinilm::models::llama {
LlamaModel
::
LlamaModel
(
const
LlamaConfig
&
config
,
const
infinicore
::
Device
&
device
,
infinicore
::
DataType
dtype
,
engine
::
distributed
::
RankInfo
rank_info
)
:
config_
(
config
)
{
const
auto
&
dtype
{
config
.
dtype
};
// Initialize token embeddings
INFINICORE_NN_MODULE_INIT
(
embed_tokens
,
config
.
vocab_size
,
config
.
hidden_size
,
std
::
nullopt
,
dtype
,
device
);
...
...
@@ -23,7 +24,7 @@ LlamaModel::LlamaModel(const LlamaConfig &config,
layers_
.
reserve
(
config
.
num_hidden_layers
);
for
(
size_t
i
=
0
;
i
<
config
.
num_hidden_layers
;
++
i
)
{
layers_
.
push_back
(
this
->
register_module
<
LlamaDecoderLayer
>
(
"layers."
+
std
::
to_string
(
i
),
config
,
device
,
i
,
dtype
,
rank_info
));
"layers."
+
std
::
to_string
(
i
),
config
,
device
,
i
,
rank_info
));
}
// Initialize final layer normalization
...
...
csrc/models/llama/llama_model.hpp

@@ -40,7 +40,6 @@ public:
      */
     LlamaModel(const LlamaConfig &config,
                const infinicore::Device &device,
-               infinicore::DataType dtype = infinicore::DataType::F32,
                engine::distributed::RankInfo rank_info = engine::distributed::RankInfo());

     /**
csrc/models/model_factory.cpp

@@ -10,7 +10,7 @@ std::shared_ptr<InfinilmModel> InfinilmModelFactory::createModel(
     if (const auto llama_config_ptr = dynamic_cast<const models::llama::LlamaConfig *>(&config)) {
         const auto &llama_config = *llama_config_ptr;
         auto model = std::make_shared<models::llama::LlamaForCausalLM>(
-            llama_config, rank_info.device, infinicore::DataType::BF16, rank_info);
+            llama_config, rank_info.device, rank_info);
         if (cache_ptr != nullptr) {
             model->model().set_external_cache(cache_ptr);
csrc/pybind11/models/llama.hpp

@@ -45,6 +45,8 @@ inline void bind_llama(py::module &m) {
     py::class_<LlamaConfig, InfinilmModel::Config> llama_config(m, "LlamaConfig");
     llama_config
         .def(py::init<>())
+        // TODO: Change this to `dtype` after updating InfiniCore pybind11 exposing mechanism.
+        .def_readwrite("_dtype", &LlamaConfig::dtype)
         .def_readwrite("vocab_size", &LlamaConfig::vocab_size)
         .def_readwrite("hidden_size", &LlamaConfig::hidden_size)
         .def_readwrite("intermediate_size", &LlamaConfig::intermediate_size)
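The binding deliberately exposes the C++ `dtype` field under the name `_dtype` (the TODO above notes it should become `dtype` once InfiniCore's pybind11 exposing mechanism is updated). On the Python side, `configuration_llama.py` (further down) mirrors its `infinicore` dtype into this attribute. A small sketch of the correspondence, assuming `_underlying` converts an `infinicore` dtype to the bound C++ `DataType` as that file uses it:

    import infinicore
    from infinilm.lib import _infinilm

    cfg = _infinilm.LlamaConfig()
    # what the Python-level LlamaConfig effectively does for torch_dtype="bfloat16":
    cfg._dtype = infinicore.bfloat16._underlying  # infinicore dtype -> infinicore::DataType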
examples/bench.py

@@ -141,14 +141,6 @@ def get_args():
         required=True,
         help="model path",
     )
-    parser.add_argument(
-        "--dtype",
-        type=str,
-        default="bfloat16",
-        help="bfloat16",
-    )
     parser.add_argument(
         "--batch-size",
         type=parse_list,
@@ -195,7 +187,6 @@ class TestModel:
     def __init__(
         self,
         model_path,
-        infini_dtype=infinicore.bfloat16,
         infini_device=infinicore.device("cpu", 0),
         tp=1,
     ) -> None:
@@ -206,7 +197,6 @@ class TestModel:
         model = infinilm.AutoLlamaModel.from_pretrained(
             model_path,
             device=infini_device,
-            dtype=infini_dtype,
             backend="cpp",
             distributed_config=DistConfig(tp),
         )
@@ -214,7 +204,7 @@ class TestModel:
         # ---------------------------------------------------------------------------- #
         # Load weights
         # ---------------------------------------------------------------------------- #
-        load_model_state_dict_by_file(model, model_path, dtype=infini_dtype)
+        load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype)

         # ---------------------------------------------------------------------------- #
         # Create tokenizer
@@ -289,14 +279,6 @@ if __name__ == "__main__":
     model_path = args.model
     infini_device = infinicore.device(device_str, 0)
-    if args.dtype == "float32":
-        infini_dtype = infinicore.float32
-    elif args.dtype == "bfloat16":
-        infini_dtype = infinicore.bfloat16
-    elif args.dtype == "float16":
-        infini_dtype = infinicore.float16
-    else:
-        raise ValueError(f"Unsupported dtype: {args.dtype}")
     tp = args.tensor_parallel_size
@@ -321,7 +303,6 @@ if __name__ == "__main__":
     test = TestModel(
         model_path,
-        infini_dtype=infini_dtype,
         infini_device=infini_device,
         tp=tp,
     )
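With `infini_dtype` dropped from `TestModel.__init__`, benchmark callers now pass only the path, device, and tensor-parallel degree; a minimal sketch matching the final hunk above (placeholder path):

    import infinicore

    test = TestModel(
        "/path/to/llama",  # placeholder model directory
        infini_device=infinicore.device("cpu", 0),
        tp=1,
    )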
examples/jiuge.py

@@ -58,12 +58,6 @@ def get_args():
         default="cpp",
         help="python or cpp model",
     )
-    parser.add_argument(
-        "--dtype",
-        type=str,
-        default="bfloat16",
-        help="float32, float16, bfloat16",
-    )
     parser.add_argument(
         "--batch-size",
         type=int,
@@ -90,7 +84,6 @@ def test(
     prompts: str | list[str],
     model_path,
     max_new_tokens=100,
-    infini_dtype=infinicore.bfloat16,
     infini_device=infinicore.device("cpu", 0),
     backend="python",
     tp=1,
@@ -102,7 +95,6 @@ def test(
     model = infinilm.AutoLlamaModel.from_pretrained(
         model_path,
         device=infini_device,
-        dtype=infini_dtype,
         backend=backend,
         distributed_config=DistConfig(tp),
     )
@@ -110,7 +102,7 @@ def test(
     # ---------------------------------------------------------------------------- #
     # Load weights
     # ---------------------------------------------------------------------------- #
-    load_model_state_dict_by_file(model, model_path, dtype=infini_dtype)
+    load_model_state_dict_by_file(model, model_path, dtype=model.config.dtype)

     # ---------------------------------------------------------------------------- #
     # Create tokenizer
@@ -203,21 +195,12 @@ if __name__ == "__main__":
     tp = args.tp
     infini_device = infinicore.device(device_str, 0)
-    if args.dtype == "float32":
-        infini_dtype = infinicore.float32
-    elif args.dtype == "bfloat16":
-        infini_dtype = infinicore.bfloat16
-    elif args.dtype == "float16":
-        infini_dtype = infinicore.float16
-    else:
-        raise ValueError(f"Unsupported dtype: {args.dtype}")

     test(
         prompts,
         model_path,
         max_new_tokens,
         infini_device=infini_device,
-        infini_dtype=infini_dtype,
         backend=backend,
         tp=tp,
     )
examples/llama.py

@@ -57,12 +57,6 @@ def get_args():
         default="python",
         help="python or cpp model",
     )
-    parser.add_argument(
-        "--dtype",
-        type=str,
-        default="float32",
-        help="float32, float16, bfloat16",
-    )
     parser.add_argument(
         "--batch_size",
         type=int,
@@ -83,7 +77,6 @@ def test(
     prompts: str | list[str],
     model_path,
     max_new_tokens=100,
-    infini_dtype=infinicore.bfloat16,
     infini_device=infinicore.device("cpu", 0),
     backend="python",
 ):
@@ -94,7 +87,6 @@ def test(
     model = infinilm.AutoLlamaModel.from_pretrained(
         model_path,
         device=infini_device,
-        dtype=infini_dtype,
         backend=backend,
     )
@@ -104,7 +96,7 @@ def test(
     model_param_infini = get_model_state_dict(
         model_path,
         device=infini_device,
-        dtype=infini_dtype,
+        dtype=model.config.dtype,
     )
     model.load_state_dict(model_param_infini, strict=True)
@@ -201,20 +193,11 @@ if __name__ == "__main__":
     backend = args.backend
     infini_device = infinicore.device(device_str, 0)
-    if args.dtype == "float32":
-        infini_dtype = infinicore.float32
-    elif args.dtype == "bfloat16":
-        infini_dtype = infinicore.bfloat16
-    elif args.dtype == "float16":
-        infini_dtype = infinicore.float16
-    else:
-        raise ValueError(f"Unsupported dtype: {args.dtype}")

     test(
         prompts,
         model_path,
         max_new_tokens,
         infini_device=infini_device,
-        infini_dtype=infini_dtype,
         backend=backend,
     )
python/infinilm/models/llama/__init__.py

@@ -30,7 +30,6 @@ class AutoLlamaModel:
             instance = modeling_llama.LlamaForCausalLM.from_pretrained(
                 model_path,
                 device=device,
-                dtype=dtype,
                 **kwargs,
             )
@@ -45,7 +44,6 @@ class AutoLlamaModel:
             instance = cpp.LlamaForCausalLM.from_pretrained(
                 model_path,
                 device=device,
-                dtype=dtype,
                 **kwargs,
             )
         else:
python/infinilm/models/llama/configuration_llama.py

@@ -15,6 +15,8 @@
 """LLaMA model configuration"""

+import infinicore
+
 from infinilm.lib import _infinilm

 from ...configuration_utils import PretrainedConfig
@@ -179,6 +181,7 @@ class LlamaConfig(PretrainedConfig, _infinilm.LlamaConfig):
         attention_dropout=0.0,
         mlp_bias=False,
         head_dim=None,
+        torch_dtype=None,
         **kwargs,
     ):
         _infinilm.LlamaConfig.__init__(self)
@@ -225,6 +228,12 @@ class LlamaConfig(PretrainedConfig, _infinilm.LlamaConfig):
             self.rope_scaling["rope_type"] = self.rope_scaling["type"]
         # rope_config_validation(self)

+        if torch_dtype in {"float32", "bfloat16", "float16"}:
+            self.dtype = getattr(infinicore, torch_dtype)
+            self._dtype = self.dtype._underlying
+        else:
+            raise ValueError(f"Unsupported dtype: {torch_dtype}")
+
         PretrainedConfig.__init__(
             pad_token_id=pad_token_id,
             bos_token_id=bos_token_id,
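Note the strictness of the added branch: `torch_dtype` defaults to `None`, and anything outside the three supported strings (including that default) raises, so a checkpoint's config.json must now carry a usable `torch_dtype`. A quick illustration (a hypothetical minimal construction; all other constructor arguments keep their defaults):

    import infinicore
    from infinilm.models.llama.configuration_llama import LlamaConfig

    cfg = LlamaConfig(torch_dtype="bfloat16")
    assert cfg.dtype is infinicore.bfloat16  # resolved via getattr(infinicore, torch_dtype)
    LlamaConfig()  # would raise ValueError: Unsupported dtype: None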
python/infinilm/models/llama/modeling_llama.py

@@ -98,15 +98,16 @@ class LlamaMLP(infinicore.nn.Module):
         hidden_size = config.hidden_size
         intermediate_size = config.intermediate_size
         mlp_bias = config.mlp_bias
+        dtype = config.dtype

         self.gate_proj = infinicore.nn.Linear(
-            hidden_size, intermediate_size, bias=mlp_bias, **kwargs
+            hidden_size, intermediate_size, bias=mlp_bias, dtype=dtype, **kwargs
         )
         self.up_proj = infinicore.nn.Linear(
-            hidden_size, intermediate_size, bias=mlp_bias, **kwargs
+            hidden_size, intermediate_size, bias=mlp_bias, dtype=dtype, **kwargs
         )
         self.down_proj = infinicore.nn.Linear(
-            intermediate_size, hidden_size, bias=mlp_bias, **kwargs
+            intermediate_size, hidden_size, bias=mlp_bias, dtype=dtype, **kwargs
         )
         self.act_fn = infinicore.nn.functional.silu
@@ -133,10 +134,13 @@ class LlamaAttention(infinicore.nn.Module):
         self.scaling = self.head_dim**-0.5
+        dtype = config.dtype

         self.q_proj = infinicore.nn.Linear(
             self.hidden_size,
             self.num_attention_heads * self.head_dim,
             bias=attention_bias,
+            dtype=dtype,
             **kwargs,
         )
@@ -144,6 +148,7 @@ class LlamaAttention(infinicore.nn.Module):
             self.hidden_size,
             self.num_key_value_heads * self.head_dim,
             bias=attention_bias,
+            dtype=dtype,
             **kwargs,
         )
@@ -151,6 +156,7 @@ class LlamaAttention(infinicore.nn.Module):
             self.hidden_size,
             self.num_key_value_heads * self.head_dim,
             bias=attention_bias,
+            dtype=dtype,
             **kwargs,
         )
@@ -158,6 +164,7 @@ class LlamaAttention(infinicore.nn.Module):
             self.num_attention_heads * self.head_dim,
             self.hidden_size,
             bias=False,
+            dtype=dtype,
             **kwargs,
         )
@@ -258,13 +265,16 @@ class LlamaDecoderLayer(infinicore.nn.Module):
         super().__init__()
         hidden_size = config.hidden_size
         rms_norm_eps = config.rms_norm_eps
+        dtype = config.dtype

         self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx, **kwargs)
         self.mlp = LlamaMLP(config=config, **kwargs)
-        self.input_layernorm = LlamaRMSNorm(hidden_size, eps=rms_norm_eps, **kwargs)
+        self.input_layernorm = LlamaRMSNorm(
+            hidden_size, eps=rms_norm_eps, dtype=dtype, **kwargs
+        )
         self.post_attention_layernorm = LlamaRMSNorm(
-            hidden_size, eps=rms_norm_eps, **kwargs
+            hidden_size, eps=rms_norm_eps, dtype=dtype, **kwargs
         )

     def forward(
@@ -317,7 +327,7 @@ class LlamaModel(infinicore.nn.Module):
         )
         self.embed_tokens = infinicore.nn.Embedding(
-            config.vocab_size, config.hidden_size, **kwargs
+            config.vocab_size, config.hidden_size, dtype=config.dtype, **kwargs
         )
         self.layers = infinicore.nn.ModuleList(
@@ -326,12 +336,15 @@ class LlamaModel(infinicore.nn.Module):
                 for layer_idx in range(config.num_hidden_layers)
             ]
         )
-        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps, **kwargs)
+        self.norm = LlamaRMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps, dtype=config.dtype, **kwargs
+        )
         self.rope_instance = infinicore.nn.RoPE(
             max_position_embeddings=config.max_position_embeddings,
             rope_theta=config.rope_theta,
             head_dim=head_dim,
+            dtype=config.dtype,
             **kwargs,
         )
@@ -394,6 +407,7 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
             config.hidden_size,
             config.vocab_size,
             bias=False,
+            dtype=config.dtype,
             **kwargs,
         )
         self.device = kwargs.get("device", infinicore.device("cpu"))
@@ -420,7 +434,6 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
         cls,
         model_path: Optional[Union[str, os.PathLike]],
         device: infinicore.device,
-        dtype=infinicore.dtype,
     ):
         def load_config_json(dir_path_: str):
             with open(os.path.join(dir_path_, "config.json"), "r") as f:
@@ -430,4 +443,4 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
         config_dict = load_config_json(os.path.join(model_path))
         config = LlamaConfig(**config_dict)
-        return LlamaForCausalLM(config, device=device, dtype=dtype)
+        return LlamaForCausalLM(config, device=device)
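The pattern across this file is uniform: every parameter-bearing submodule is now constructed with an explicit `dtype=config.dtype` rather than inheriting whatever `**kwargs` happened to carry, and `from_pretrained` no longer accepts a dtype at all. A condensed sketch of the construction pattern (a hypothetical helper; `infinicore.nn.Linear` accepting `dtype=` is taken from the diff itself):

    import infinicore

    def build_lm_head(config, **kwargs):
        # dtype comes from the config, never from the caller
        return infinicore.nn.Linear(
            config.hidden_size,
            config.vocab_size,
            bias=False,
            dtype=config.dtype,
            **kwargs,
        )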
test/bench/test_benchmark.py

@@ -77,7 +77,6 @@ class InfiniLMBenchmark(BaseBenchmark):
         # When CUDA_VISIBLE_DEVICES=5 is set, CUDA only sees device 5 as device 0
         # So device index 0 will automatically map to the first visible device
         self.device = infinicore.device(device_name, 0)
-        self.dtype = infinicore.bfloat16

         # Load config and tokenizer
         with open(os.path.join(model_dir_path, "config.json"), "r") as f:
@@ -117,7 +116,6 @@ class InfiniLMBenchmark(BaseBenchmark):
         self.model = AutoLlamaModel.from_pretrained(
             model_dir_path,
             device=self.device,
-            dtype=self.dtype,
             backend=backend,
             distributed_config=DistConfig(ndev),
         )
@@ -130,7 +128,7 @@ class InfiniLMBenchmark(BaseBenchmark):
         load_model_state_dict_by_file(
             self.model,
             model_dir_path,
-            dtype=self.dtype,
+            dtype=self.model.config.dtype,
         )
         print("Model loaded successfully")