jerrrrry / infinilm · Commits · 9c256a17

Commit 9c256a17 (unverified), authored Dec 08, 2025 by Ceng, committed by GitHub on Dec 08, 2025

issue/106 Adapt model 9G7B

parent 39bea30a

Showing 4 changed files with 678 additions and 118 deletions (+678 −118)
python/infinilm/models/llama/configuration_llama.py    +1    -1
python/infinilm/models/llama/modeling_llama.py          +1    -1
test/bench/test_benchmark.py                            +504  -0
test/models/llama/test_forward_validation.py            +172  -116
python/infinilm/models/llama/configuration_llama.py

@@ -173,7 +173,7 @@ class LlamaConfig(PretrainedConfig):
         tie_word_embeddings=False,
         rope_theta=10000.0,
         rope_scaling=None,
-        attention_bias=False,
+        attention_bias=True,
         attention_dropout=0.0,
         mlp_bias=False,
         head_dim=None,
python/infinilm/models/llama/modeling_llama.py

@@ -157,7 +157,7 @@ class LlamaAttention(infinicore.nn.Module):
         self.o_proj = infinicore.nn.Linear(
             self.num_attention_heads * self.head_dim,
             self.hidden_size,
-            bias=attention_bias,
+            bias=False,
             **kwargs,
         )
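Taken together, these two one-line edits make attention_bias default to True while o_proj is always built without a bias term; the remaining attention projections (q/k/v, assuming they follow the usual Llama pattern of bias=attention_bias, which is not shown in this diff) therefore gain bias weights, presumably to match the 9G7B checkpoint this commit adapts. A minimal illustrative sketch of the new default (not part of the commit; the import path is inferred from the changed file's location):

# Illustrative sketch only, not part of this commit.
# Import path inferred from python/infinilm/models/llama/configuration_llama.py.
from infinilm.models.llama.configuration_llama import LlamaConfig

cfg = LlamaConfig()
print(cfg.attention_bias)                 # True  (new default introduced by this commit)

cfg = LlamaConfig(attention_bias=False)   # the previous behaviour can still be requested per model
print(cfg.attention_bias)                 # False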
test/bench/test_benchmark.py (new file, mode 100644)
import sys
import os
import argparse
import time
import re

from datasets import load_dataset

import infinicore
import infinilm
from infinilm.models.llama import AutoLlamaModel
from infinilm.modeling_utils import get_model_state_dict
from infinilm.distributed import DistConfig
from abc import ABC, abstractmethod


class BaseBenchmark(ABC):
    """Base class for benchmark evaluation with common tokenizer and generation utilities"""

    def encode_text(self, text):
        """Encode text to token IDs - reused across backends"""
        return self.tokenizer.encode(text)

    def decode_token(self, token_id):
        """Decode token ID to text - reused across backends"""
        return self.tokenizer.decode(token_id)

    @abstractmethod
    def render_input_content(self, *args, **kwargs):
        """Render input content - benchmark-specific implementation"""
        pass

    @abstractmethod
    def generate(self, *args, **kwargs):
        """Generate response - benchmark-specific implementation"""
        pass

    @abstractmethod
    def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_):
        """Backend-specific generation implementation"""
        pass
class InfiniLMBenchmark(BaseBenchmark):
    """Wrapper class for InfiniLM cpp backend for benchmark evaluation"""

    def __init__(self, model_dir_path, device_type_str="cpu", ndev=1, backend="cpp", benchmark="ceval"):
        import transformers

        self.benchmark = benchmark

        # Map device type string to infinicore device
        device_map = {
            "cpu": "cpu",
            "nvidia": "cuda",
            "cambricon": "cambricon",
            "ascend": "ascend",
            "metax": "metax",
            "moore": "moore",
            "iluvatar": "iluvatar",
            "kunlun": "kunlun",
            "hygon": "hygon",
        }
        device_name = device_map.get(device_type_str.lower(), "cpu")

        # CUDA_VISIBLE_DEVICES is automatically respected by CUDA runtime API
        # When CUDA_VISIBLE_DEVICES=5 is set, CUDA only sees device 5 as device 0
        # So device index 0 will automatically map to the first visible device
        self.device = infinicore.device(device_name, 0)
        self.dtype = infinicore.bfloat16

        # Load config and tokenizer
        with open(os.path.join(model_dir_path, "config.json"), "r") as f:
            import json

            self.config_dict = json.load(f)

        # Align tokenizer initialization with jiuge backend (010)
        # Match the exact same initialization logic based on model type
        model_type = self.config_dict.get("model_type", "")
        if model_type == "llama":
            # For llama models: no trust_remote_code (matches jiuge line 465)
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path)
        elif model_type in ["fm9g", "minicpm", "fm9g7b"]:
            # For fm9g/minicpm/fm9g7b models: use trust_remote_code=True (matches jiuge lines 493-495, 518-520)
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path, trust_remote_code=True)
        elif model_type in ["qwen2", "qwen3"]:
            # For qwen2/qwen3 models: no trust_remote_code (matches jiuge lines 534-536)
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path)
        else:
            # Default: use trust_remote_code=True for other models
            self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_dir_path, trust_remote_code=True)

        eos_token_id = self.config_dict.get("eos_token_id")
        self.eos_token_id = (
            [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id
        )

        # Create model with cpp backend
        print("Loading model with cpp backend...")
        self.model = AutoLlamaModel.from_pretrained(
            model_dir_path,
            device=self.device,
            dtype=self.dtype,
            backend=backend,
            distributed_config=DistConfig(ndev),
        )
        # Enable KV cache for generation
        self.model.use_cache = True

        # Load weights
        print("Loading model weights...")
        model_param_infini = get_model_state_dict(
            model_dir_path,
            device=self.device,
            dtype=self.dtype,
        )
        self.model.load_state_dict(model_param_infini)
        print("Model loaded successfully")

    def max_context_len(self):
        return self.config_dict.get("max_position_embeddings", 2048)

    def render_input_content(self, *args, **kwargs):
        """Render input content based on benchmark type"""
        if self.benchmark == "ceval":
            return self._render_ceval(*args, **kwargs)
        elif self.benchmark == "mmlu":
            return self._render_mmlu(*args, **kwargs)
        else:
            raise ValueError(f"Unknown benchmark: {self.benchmark}")

    def _render_ceval(self, conversation):
        """Render C-Eval conversation to input content"""
        return (
            self.tokenizer.apply_chat_template(
                conversation=conversation,
                add_generation_prompt=True,
                tokenize=False,
            )
            + "正确答案是"
        )

    def _render_mmlu(self, question, choices):
        """Render MMLU question and choices to input content"""
        choices_text = "\n".join([f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)])
        instruction = (
            "You are a multiple-choice question solver. "
            "Select the correct option and respond with only the letter A, B, C, or D."
        )
        prompt = f"{instruction}\n\nQuestion: {question}\n{choices_text}\nAnswer:"
        # Use chat template if available, otherwise return plain text
        if hasattr(self.tokenizer, 'apply_chat_template'):
            conversation = [
                {"role": "system", "content": instruction},
                {"role": "user", "content": f"{question}\n{choices_text}\nAnswer:"},
            ]
            try:
                return self.tokenizer.apply_chat_template(
                    conversation=conversation,
                    add_generation_prompt=True,
                    tokenize=False,
                )
            except Exception:
                return prompt
        return prompt

    def generate(self, *args, max_steps=500, topp_=1.0, topk_=1, temperature_=1.0):
        """Generate response based on benchmark type"""
        # Render input content
        input_content = self.render_input_content(*args)
        print(input_content, end="", flush=True)

        # Encode input
        tokens = self.encode_text(input_content)

        # Delegate to backend-specific generation implementation
        output_content, avg_time = self._generate_step(tokens, max_steps, topp_, topk_, temperature_)
        return output_content, avg_time

    def _generate_step(self, tokens, max_steps, topp_, topk_, temperature_):
        """
        InfiniLM cpp backend-specific generation implementation

        NOTE: Validation confirmed input configs are identical between backends.
        The issue was that manual generation loop called InferEngine.generate() which
        doesn't maintain KV cache. Solution: Use model's built-in generate() method
        which properly handles KV cache through GenerationMixin.
        """
        # Convert tokens to infinicore format
        input_ids_list = [tokens]
        input_ids = infinicore.from_list(input_ids_list, dtype=infinicore.int64).to(self.device)

        # Use model's built-in generate() method which properly handles KV cache
        # Pass sampling parameters (temperature, topk, topp) via kwargs
        output_tokens_list, output_content = self.model.generate(
            input_ids=input_ids,
            max_new_tokens=max_steps,
            tokenizer=self.tokenizer,
            stop_on_eos=True,
            temperature=temperature_,
            topk=topk_,
            topp=topp_,
        )

        # Calculate average time (GenerationMixin doesn't return timing info)
        # We'll use a placeholder since the timing info isn't available
        print("\n")
        avg_time = 0.0  # GenerationMixin doesn't expose per-step timing
        print(f"Time per step: N/A (using GenerationMixin.generate)")
        return output_content, avg_time

    def destroy_model_instance(self):
        # Cleanup if needed
        del self.model
        print("Model destroyed")
def extract_answer_ceval(output_content, answer):
    """Extract predicted answer from C-Eval output"""
    output_upper = output_content.upper().strip()
    position = 0
    ABCD = output_upper[position:position + 2]
    return answer in ABCD


def extract_answer_mmlu(output_content):
    """Extract predicted answer from MMLU output (returns 0-3 index or None)"""
    output_upper = output_content.upper().strip()
    # Find first meaningful token
    match = re.search(r"\b([ABCD])\b", output_upper)
    if match:
        return ord(match.group(1)) - ord('A')
    else:
        match_num = re.search(r"\b([0-3])\b", output_upper)
        if match_num:
            return int(match_num.group(1))
    return None
def test():
    # Parse arguments manually to handle device flags properly
    if len(sys.argv) < 4:
        print(
            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N]"
        )
        sys.exit(1)

    # Parse device flag (first argument)
    device_flag = sys.argv[1]
    model_path = sys.argv[2]

    # Parse optional arguments
    backend = "cpp"
    ndev = 1
    benchmark = None
    subject = None  # For MMLU
    dataset_name = "middle_school_mathematics"  # For C-Eval
    num_samples = None
    max_new_tokens = 500

    i = 3
    while i < len(sys.argv):
        if sys.argv[i] == "--bench" and i + 1 < len(sys.argv):
            benchmark = sys.argv[i + 1]
            i += 2
        elif sys.argv[i] == "--backend" and i + 1 < len(sys.argv):
            backend = sys.argv[i + 1]
            i += 2
        elif sys.argv[i] == "--ndev" and i + 1 < len(sys.argv):
            ndev = int(sys.argv[i + 1])
            i += 2
        elif sys.argv[i] == "--subject" and i + 1 < len(sys.argv):
            subject = sys.argv[i + 1]
            i += 2
        elif sys.argv[i] == "--dataset" and i + 1 < len(sys.argv):
            dataset_name = sys.argv[i + 1]
            i += 2
        elif sys.argv[i] == "--num_samples" and i + 1 < len(sys.argv):
            num_samples = int(sys.argv[i + 1])
            i += 2
        elif sys.argv[i] == "--max_new_tokens" and i + 1 < len(sys.argv):
            max_new_tokens = int(sys.argv[i + 1])
            i += 2
        else:
            i += 1

    if benchmark is None:
        print("Error: --bench argument is required. Choose 'ceval' or 'mmlu'")
        sys.exit(1)
    if benchmark not in ["ceval", "mmlu"]:
        print(f"Error: Unknown benchmark '{benchmark}'. Choose 'ceval' or 'mmlu'")
        sys.exit(1)

    # Parse device type
    device_type_str = "cpu"
    if device_flag == "--cpu":
        device_type_str = "cpu"
    elif device_flag == "--nvidia":
        device_type_str = "nvidia"
    elif device_flag == "--cambricon":
        device_type_str = "cambricon"
    elif device_flag == "--ascend":
        device_type_str = "ascend"
    elif device_flag == "--metax":
        device_type_str = "metax"
    elif device_flag == "--moore":
        device_type_str = "moore"
    elif device_flag == "--iluvatar":
        device_type_str = "iluvatar"
    elif device_flag == "--kunlun":
        device_type_str = "kunlun"
    elif device_flag == "--hygon":
        device_type_str = "hygon"
    else:
        print(
            "Usage: python test_benchmark.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> --bench [ceval|mmlu] [--backend cpp] [--ndev N] [--subject SUBJECT] [--num_samples N] [--max_new_tokens N]"
        )
        sys.exit(1)

    # Load dataset based on benchmark
    if benchmark == "ceval":
        # Load C-Eval dataset
        # https://huggingface.co/datasets/ceval/ceval-exam/tree/main/middle_school_geography
        print(f"Loading C-Eval dataset (dataset: {dataset_name})...")
        try:
            dataset = load_dataset(r"ceval/ceval-exam", name=dataset_name)
            samples = dataset["val"]
            # Convert Dataset to list if needed
            if hasattr(samples, 'to_list'):
                samples = samples.to_list()
            else:
                samples = list(samples)
        except Exception as e:
            print(f"Error loading dataset: {e}")
            print("Available datasets: middle_school_mathematics, high_school_history, high_school_chinese, high_school_physics, middle_school_geography, middle_school_physics")
            sys.exit(1)
    elif benchmark == "mmlu":
        # Load MMLU dataset
        # https://huggingface.co/datasets/cais/mmlu
        if subject is None:
            subject = "all"
        print(f"Loading MMLU dataset (subject: {subject})...")
        try:
            if subject == "all":
                dataset = load_dataset("cais/mmlu", "all")
                # Combine all subjects into a single dataset
                samples = []
                for subject_name in dataset.keys():
                    if subject_name in ["train", "validation", "test"]:
                        continue
                    # Convert Dataset to list
                    test_data = dataset[subject_name]["test"]
                    if hasattr(test_data, 'to_list'):
                        samples.extend(test_data.to_list())
                    else:
                        samples.extend(list(test_data))
            else:
                dataset = load_dataset("cais/mmlu", subject)
                test_data = dataset["test"]
                # Convert Dataset to list
                if hasattr(test_data, 'to_list'):
                    samples = test_data.to_list()
                else:
                    samples = list(test_data)
        except Exception as e:
            print(f"Error loading dataset: {e}")
            print("Available subjects: abstract_algebra, anatomy, astronomy, business_ethics, etc.")
            print("Use --subject all to load all subjects")
            sys.exit(1)

    print(f"Loaded {len(samples)} samples")

    # Limit number of samples if specified
    if num_samples is not None and num_samples > 0:
        original_count = len(samples)
        samples = samples[:num_samples]
        print(f"Limited to {len(samples)} samples for validation (from {original_count} total)")

    # Create model based on backend
    if backend != "010":
        model = InfiniLMBenchmark(model_path, device_type_str, ndev, backend, benchmark)
    else:
        print(f"test 010 backend by scripts/test_ceval.py")
        exit(0)

    # Test with first sample if available
    if len(samples) > 0:
        sample = samples[0]
        if benchmark == "ceval":
            input_content = f"'question': {sample['question']},'A': {sample['A']}, 'B': {sample['B']}, 'C': {sample['C']},'D': {sample['D']}。"
            test_conversation = [
                {
                    "role": "system",
                    "content": "请从question的A,B,C,D四个选项中选择正确的选项。例如,标准答案:A。",
                },
                {"role": "user", "content": input_content},
            ]
            test_output, _ = model.generate(
                test_conversation, max_steps=max_new_tokens, topp_=1.0, topk_=1, temperature_=1.0
            )
        elif benchmark == "mmlu":
            question = sample['question']
            choices = sample['choices']
            test_output, _ = model.generate(
                question, choices, max_steps=max_new_tokens, topp_=1.0, topk_=1, temperature_=1.0
            )
        print(f"\nTest output: {test_output}")

    answers_list = []
    for idx, sample in enumerate(samples):
        if benchmark == "ceval":
            input_content = f"'question': {sample['question']},'A': {sample['A']}, 'B': {sample['B']}, 'C': {sample['C']},'D': {sample['D']}。"
            conversation = [
                {
                    "role": "system",
                    "content": "请从question的A,B,C,D四个选项中选择正确的选项。例如,标准答案:A。",
                },
                {"role": "user", "content": input_content},
            ]
            answer = sample["answer"]
            output_content, avg_time = model.generate(
                conversation, max_steps=max_new_tokens, topp_=1.0, topk_=1, temperature_=1.0
            )
            is_correct = extract_answer_ceval(output_content, answer)
            answers_list.append({
                "id": sample.get("id", idx),
                "output_content": output_content,
                "answer": answer,
                "is_correct": is_correct,
            })
            if benchmark == "ceval":
                print("标准答案:", answer)
        elif benchmark == "mmlu":
            question = sample['question']
            choices = sample['choices']
            answer_idx = sample['answer']  # MMLU answer is 0-3 index
            output_content, avg_time = model.generate(
                question, choices, max_steps=max_new_tokens, topp_=1.0, topk_=1, temperature_=1.0
            )
            predicted_answer = extract_answer_mmlu(output_content)
            # Convert answer index to letter for display
            answer_letter = chr(65 + answer_idx) if answer_idx < 4 else "?"
            predicted_letter = chr(65 + predicted_answer) if predicted_answer is not None and predicted_answer < 4 else "?"
            print(f"Sample {idx}: Correct answer: {answer_letter} ({answer_idx}), Predicted: {predicted_letter} ({predicted_answer})")
            answers_list.append({
                "id": idx,
                "output_content": output_content,
                "answer": answer_idx,
                "predicted": predicted_answer,
            })

    model.destroy_model_instance()
    print("-------------------------------------------------------------")

    # Evaluate results
    true_num = 0
    all_num = 0
    for cont in answers_list:
        id = cont["id"]
        all_num = all_num + 1
        if benchmark == "ceval":
            answer = cont["answer"]
            is_correct = cont["is_correct"]
            if is_correct:
                true_num = true_num + 1
                print(f"id {id}: ", "正确")
            else:
                print(f"id {id}: ", "错误")
        elif benchmark == "mmlu":
            answer = cont["answer"]
            predicted = cont["predicted"]
            if predicted is not None and predicted == answer:
                true_num = true_num + 1
                print(f"id {id}: Correct")
            else:
                answer_letter = chr(65 + answer) if answer < 4 else "?"
                predicted_letter = chr(65 + predicted) if predicted is not None and predicted < 4 else "?"
                print(f"id {id}: Wrong (correct: {answer_letter}, predicted: {predicted_letter})")

    accuracy = true_num / all_num if all_num > 0 else 0.0
    if benchmark == "ceval":
        print(f"成绩: {true_num}/{all_num}", accuracy)
    else:
        print(f"Accuracy: {true_num}/{all_num} = {accuracy:.2%}")


if __name__ == "__main__":
    test()
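For reference, two example invocations that follow the usage string printed by the script above; the model directory path and sample count below are placeholders, not values taken from the commit:

python test/bench/test_benchmark.py --nvidia /path/to/model_dir --bench ceval --dataset middle_school_mathematics --num_samples 10
python test/bench/test_benchmark.py --cpu /path/to/model_dir --bench mmlu --subject all --num_samples 10 --max_new_tokens 32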
test/models/llama/test_forward_validation.py
@@ -4,7 +4,6 @@ Test script to validate forward pass across different backends and dtypes.
 Tests:
 1. Python backend with bfloat16
-2. C++ backend with float32
 3. C++ backend with bfloat16

 This script runs a prefill step (full sequence forward pass with KV cache)
@@ -81,6 +80,12 @@ def get_args():
         default="How are you",
         help="Test prompt (default: 'How are you')",
     )
+    parser.add_argument(
+        "--num_decode_steps",
+        type=int,
+        default=2,
+        help="Number of decode steps to run after prefill (default: 2)",
+    )
     return parser.parse_args()
@@ -116,9 +121,9 @@ def create_inputs(prompt, tokenizer, device, backend="cpp"):
     return input_ids_infini, position_ids_infini, input_content

-def run_forward_pass(model, input_ids, position_ids, backend, dtype):
-    """Run prefill and first decode step with KV cache, return decode step logits."""
-    print(f"  Running forward pass (prefill + first decode step)...")
+def run_forward_pass(model, input_ids, position_ids, backend, dtype, num_decode_steps=2):
+    """Run prefill and multiple decode steps with KV cache, return all decode step logits."""
+    print(f"  Running forward pass (prefill + {num_decode_steps} decode step(s))...")
     try:
         # Get the underlying model
@@ -162,19 +167,6 @@ def run_forward_pass(model, input_ids, position_ids, backend, dtype):
             print(f"  Prefill logits stats: min={prefill_logits_np.min():.6f}, max={prefill_logits_np.max():.6f}, mean={prefill_logits_np.mean():.6f}")

-            # Step 2: Decode - run forward pass with single token
-            # Get the predicted token from prefill
-            if np.isnan(prefill_logits_np).any():
-                # If prefill has NaN, use a default token to continue testing decode step
-                print(f"  ⚠ WARNING: Using default token 29902 due to NaN in prefill logits")
-                predicted_token_id = 29902
-            else:
-                predicted_token_id = int(prefill_logits_np.argmax(axis=-1)[0, 0])
-
-            print(f"  Step 2: Decode (next_token_id={predicted_token_id})...")
-
             # Get device from input_ids
             if hasattr(input_ids, "device"):
                 input_device = input_ids.device
@@ -182,19 +174,59 @@ def run_forward_pass(model, input_ids, position_ids, backend, dtype):
                 input_device = getattr(position_ids, "device", infinicore.device("cpu", 0))

-            # Create single token input for decode step
-            decode_input_ids = infinicore.from_list([[predicted_token_id]], device=input_device)
-            # Create position_ids for decode step (should be seq_len, since we've processed seq_len tokens)
+            # Initialize decode logits list
+            decode_logits_list = []
             seq_len = input_ids.shape[1]
-            decode_position_ids = infinicore.from_list([[seq_len]], dtype=infinicore.int64, device=input_device)
-            # Run decode step - C++ backend manages cache internally
-            decode_logits = underlying_model.forward(decode_input_ids, decode_position_ids)
+            current_token_id = None
+
+            # Run multiple decode steps
+            for decode_step in range(num_decode_steps):
+                # Get the predicted token from previous step
+                if decode_step == 0:
+                    # First decode step: use token from prefill
+                    if np.isnan(prefill_logits_np).any():
+                        print(f"  ⚠ WARNING: Using default token 29902 due to NaN in prefill logits")
+                        current_token_id = 29902
+                    else:
+                        current_token_id = int(prefill_logits_np.argmax(axis=-1)[0, 0])
+                else:
+                    # Subsequent decode steps: use token from previous decode
+                    prev_logits_np = decode_logits_list[-1]
+                    if np.isnan(prev_logits_np).any():
+                        print(f"  ⚠ WARNING: Using default token 29902 due to NaN in decode step {decode_step} logits")
+                        current_token_id = 29902
+                    else:
+                        current_token_id = int(prev_logits_np.argmax(axis=-1)[0, 0])
+
+                print(f"  Step {decode_step + 2}: Decode step {decode_step + 1} (next_token_id={current_token_id})...")
+
+                # Create single token input for decode step
+                decode_input_ids = infinicore.from_list([[current_token_id]], device=input_device)
+                # Create position_ids for decode step
+                decode_position_ids = infinicore.from_list([[seq_len + decode_step]], dtype=infinicore.int64, device=input_device)
+                # Run decode step - C++ backend manages cache internally
+                decode_logits = underlying_model.forward(decode_input_ids, decode_position_ids)
+
+                # Convert decode logits to numpy
+                decode_logits_np = infinicore_to_numpy(decode_logits)
+                decode_logits_list.append(decode_logits_np)
+                print(f"  ✓ Decode step {decode_step + 1} completed, logits shape: {decode_logits_np.shape}")
+
+                # Check decode logits for issues
+                if np.isnan(decode_logits_np).any():
+                    print(f"  ⚠ WARNING: Decode step {decode_step + 1} logits contain NaN values!")
+                    print(f"    NaN count: {np.isnan(decode_logits_np).sum()}")
+                if np.isinf(decode_logits_np).any():
+                    print(f"  ⚠ WARNING: Decode step {decode_step + 1} logits contain Inf values!")
+                    print(f"    Inf count: {np.isinf(decode_logits_np).sum()}")
+                if not np.isnan(decode_logits_np).any():
+                    print(f"  Decode step {decode_step + 1} logits stats: min={decode_logits_np.min():.6f}, max={decode_logits_np.max():.6f}, mean={decode_logits_np.mean():.6f}")
         else:
             # Python backend uses DynamicCache
             # Get model config
@@ -217,12 +249,6 @@ def run_forward_pass(model, input_ids, position_ids, backend, dtype):
             print(f"  ✓ Prefill completed, logits shape: {prefill_logits_np.shape}")

-            # Step 2: Decode - run forward pass with single token
-            # Get the predicted token from prefill
-            predicted_token_id = int(prefill_logits_np.argmax(axis=-1)[0, 0])
-            print(f"  Step 2: Decode (next_token_id={predicted_token_id})...")
-
             # Get device from input_ids
             if hasattr(input_ids, "device"):
                 input_device = input_ids.device
@@ -231,48 +257,87 @@ def run_forward_pass(model, input_ids, position_ids, backend, dtype):
                 input_device = getattr(position_ids, "device", infinicore.device("cpu", 0))

-            # Create single token input for decode step
-            decode_input_ids = infinicore.from_list([[predicted_token_id]], device=input_device)
-            # Create position_ids for decode step (should be seq_len, since we've processed seq_len tokens)
+            # Initialize decode logits list
+            decode_logits_list = []
             seq_len = input_ids.shape[1]
-            decode_position_ids = infinicore.from_list([[seq_len]], dtype=infinicore.int64, device=input_device)
-
-            # Run decode step with KV cache
-            decode_logits = underlying_model.forward(decode_input_ids, decode_position_ids, past_key_values=past_key_values, use_cache=True)
-
-            # Convert decode logits to numpy for analysis
-            logits_np = infinicore_to_numpy(decode_logits)
-
-            print(f"  ✓ Forward pass completed (prefill + decode)")
-            print(f"  Decode logits shape: {logits_np.shape}")
-            print(f"  Decode logits dtype: {logits_np.dtype}")
-            print(f"  Decode logits stats: min={logits_np.min():.6f}, max={logits_np.max():.6f}, mean={logits_np.mean():.6f}")
-
-            # Check for issues
-            if np.isnan(logits_np).any():
-                print(f"  ⚠ WARNING: Logits contain NaN values!")
-                return None, True
-            if np.isinf(logits_np).any():
-                print(f"  ⚠ WARNING: Logits contain Inf values!")
-                return None, True
-
-            # Check if logits are too small (might indicate model not working)
-            if np.abs(logits_np).max() < 1.0:
-                print(f"  ⚠ WARNING: Logits are very small (max abs: {np.abs(logits_np).max():.6f})")
-
-            # Get predicted token from decode step
-            predicted_token = int(logits_np.argmax(axis=-1)[0, 0])
-            print(f"  Predicted token ID from decode: {predicted_token}")
-
-            return logits_np, False
+            current_token_id = None
+
+            # Run multiple decode steps
+            for decode_step in range(num_decode_steps):
+                # Get the predicted token from previous step
+                if decode_step == 0:
+                    # First decode step: use token from prefill
+                    if np.isnan(prefill_logits_np).any():
+                        print(f"  ⚠ WARNING: Using default token 29902 due to NaN in prefill logits")
+                        current_token_id = 29902
+                    else:
+                        current_token_id = int(prefill_logits_np.argmax(axis=-1)[0, 0])
+                else:
+                    # Subsequent decode steps: use token from previous decode
+                    prev_logits_np = decode_logits_list[-1]
+                    if np.isnan(prev_logits_np).any():
+                        print(f"  ⚠ WARNING: Using default token 29902 due to NaN in decode step {decode_step} logits")
+                        current_token_id = 29902
+                    else:
+                        current_token_id = int(prev_logits_np.argmax(axis=-1)[0, 0])
+
+                print(f"  Step {decode_step + 2}: Decode step {decode_step + 1} (next_token_id={current_token_id})...")
+
+                # Create single token input for decode step
+                decode_input_ids = infinicore.from_list([[current_token_id]], device=input_device)
+                # Create position_ids for decode step
+                decode_position_ids = infinicore.from_list([[seq_len + decode_step]], dtype=infinicore.int64, device=input_device)
+                # Run decode step with KV cache
+                decode_logits = underlying_model.forward(decode_input_ids, decode_position_ids, past_key_values=past_key_values, use_cache=True)
+
+                # Convert decode logits to numpy
+                decode_logits_np = infinicore_to_numpy(decode_logits)
+                decode_logits_list.append(decode_logits_np)
+                print(f"  ✓ Decode step {decode_step + 1} completed, logits shape: {decode_logits_np.shape}")
+
+                # Check decode logits for issues
+                if np.isnan(decode_logits_np).any():
+                    print(f"  ⚠ WARNING: Decode step {decode_step + 1} logits contain NaN values!")
+                    print(f"    NaN count: {np.isnan(decode_logits_np).sum()}")
+                if np.isinf(decode_logits_np).any():
+                    print(f"  ⚠ WARNING: Decode step {decode_step + 1} logits contain Inf values!")
+                    print(f"    Inf count: {np.isinf(decode_logits_np).sum()}")
+                if not np.isnan(decode_logits_np).any():
+                    print(f"  Decode step {decode_step + 1} logits stats: min={decode_logits_np.min():.6f}, max={decode_logits_np.max():.6f}, mean={decode_logits_np.mean():.6f}")
+
+        # Summary of all decode steps
+        print(f"  ✓ Forward pass completed (prefill + {num_decode_steps} decode step(s))")
+        for i, logits_np in enumerate(decode_logits_list):
+            print(f"  Decode step {i + 1} logits shape: {logits_np.shape}, dtype: {logits_np.dtype}")
+
+        # Check for issues in all decode steps
+        has_error = False
+        for i, logits_np in enumerate(decode_logits_list):
+            if np.isnan(logits_np).any():
+                print(f"  ⚠ WARNING: Decode step {i + 1} logits contain NaN values!")
+                print(f"    NaN count: {np.isnan(logits_np).sum()}")
+                has_error = True
+            if np.isinf(logits_np).any():
+                print(f"  ⚠ WARNING: Decode step {i + 1} logits contain Inf values!")
+                print(f"    Inf count: {np.isinf(logits_np).sum()}")
+                has_error = True
+            if np.abs(logits_np).max() < 1.0:
+                print(f"  ⚠ WARNING: Decode step {i + 1} logits are very small (max abs: {np.abs(logits_np).max():.6f})")
+
+        # Get predicted token from last decode step
+        if decode_logits_list and not np.isnan(decode_logits_list[-1]).any():
+            predicted_token = int(decode_logits_list[-1].argmax(axis=-1)[0, 0])
+            print(f"  Predicted token ID from decode step {num_decode_steps}: {predicted_token}")
+
+        # Return tuple of all decode logits
+        return tuple(decode_logits_list), has_error

     except Exception as e:
         print(f"  ✗ Forward pass failed: {e}")
@@ -353,7 +418,7 @@ def infinicore_to_numpy(tensor):
     return result

-def test_configuration(model_path, device, backend, dtype, prompt):
+def test_configuration(model_path, device, backend, dtype, prompt, num_decode_steps=2):
     """Test a specific backend/dtype configuration."""
     print("\n" + "=" * 80)
     print(f"Testing: Backend={backend}, Dtype={dtype}")
@@ -377,7 +442,7 @@ def test_configuration(model_path, device, backend, dtype, prompt):
     # Load tokenizer
     print("\n1. Loading tokenizer...")
     try:
-        tokenizer = AutoTokenizer.from_pretrained(model_path)
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
         print(f"  ✓ Tokenizer loaded")
     except Exception as e:
         print(f"  ✗ Failed to load tokenizer: {e}")
@@ -428,25 +493,25 @@
         traceback.print_exc()
         return None, True

-    # Run forward pass (prefill + decode step)
-    print(f"\n5. Running forward pass (prefill + first decode step)...")
-    logits, has_error = run_forward_pass(model, input_ids, position_ids, backend, dtype)
+    # Run forward pass (prefill + multiple decode steps)
+    print(f"\n5. Running forward pass (prefill + {num_decode_steps} decode step(s))...")
+    logits_tuple, has_error = run_forward_pass(model, input_ids, position_ids, backend, dtype, num_decode_steps)

     if has_error:
         return None, True

-    return logits, False
+    return logits_tuple, False


-def compare_logits(logits1, logits2, name1, name2):
+def compare_logits(logits1, logits2, name1, name2, step_name="logits"):
     """Compare two logits arrays."""
     print(f"\n{'='*80}")
-    print(f"Comparing: {name1} vs {name2}")
+    print(f"Comparing: {name1} vs {name2} ({step_name})")
     print(f"{'='*80}")

     if logits1 is None or logits2 is None:
-        print("  ✗ Cannot compare: one or both logits are None")
+        print(f"  ✗ Cannot compare: one or both {step_name} are None")
         return False

     if logits1.shape != logits2.shape:
@@ -469,9 +534,9 @@ def compare_logits(logits1, logits2, name1, name2):
     is_close = np.allclose(logits1, logits2, rtol=rtol, atol=atol)

     if is_close:
-        print(f"  ✓ Logits are close (within tolerance)")
+        print(f"  ✓ {step_name.capitalize()} are close (within tolerance)")
     else:
-        print(f"  ⚠ Logits differ significantly")
+        print(f"  ⚠ {step_name.capitalize()} differ significantly")
         # Show top differences
         flat_diff = diff.flatten()
         top_indices = np.argsort(flat_diff)[-10:][::-1]
@@ -493,6 +558,7 @@ def main():
     print(f"Model path: {args.model_path}")
     print(f"Device: {args.device}")
     print(f"Prompt: {args.prompt}")
+    print(f"Number of decode steps: {args.num_decode_steps}")
     print("=" * 80)

     results = {}
@@ -502,25 +568,16 @@
     print("TEST 1: Python Backend + BFloat16")
     print("=" * 80)
     logits_py_bf16, error = test_configuration(
-        args.model_path, args.device, "python", "bfloat16", args.prompt
+        args.model_path, args.device, "python", "bfloat16", args.prompt, args.num_decode_steps
     )
     results["python_bf16"] = (logits_py_bf16, error)

-    # Test 2: C++ backend with float32
-    print("\n\n" + "=" * 80)
-    print("TEST 2: C++ Backend + Float32")
-    print("=" * 80)
-    logits_cpp_f32, error = test_configuration(
-        args.model_path, args.device, "cpp", "float32", args.prompt
-    )
-    results["cpp_f32"] = (logits_cpp_f32, error)
-
     # Test 3: C++ backend with bfloat16
     print("\n\n" + "=" * 80)
     print("TEST 3: C++ Backend + BFloat16")
     print("=" * 80)
     logits_cpp_bf16, error = test_configuration(
-        args.model_path, args.device, "cpp", "bfloat16", args.prompt
+        args.model_path, args.device, "cpp", "bfloat16", args.prompt, args.num_decode_steps
     )
     results["cpp_bf16"] = (logits_cpp_bf16, error)
@@ -533,23 +590,22 @@
-    # Compare Python BF16 vs C++ BF16 (should be similar)
-    if not results["python_bf16"][1] and not results["cpp_bf16"][1]:
-        is_close = compare_logits(
-            results["python_bf16"][0], results["cpp_bf16"][0], "Python BF16", "C++ BF16"
-        )
-        comparisons.append(("Python BF16 vs C++ BF16", is_close))
-
-    # Compare C++ F32 vs C++ BF16 (should be similar but with some differences)
-    if not results["cpp_f32"][1] and not results["cpp_bf16"][1]:
-        is_close = compare_logits(
-            results["cpp_f32"][0], results["cpp_bf16"][0], "C++ F32", "C++ BF16"
-        )
-        comparisons.append(("C++ F32 vs C++ BF16", is_close))
+    py_logits = results["python_bf16"][0]
+    cpp_logits = results["cpp_bf16"][0]
+    if py_logits is not None and cpp_logits is not None:
+        # Compare all decode steps
+        num_steps = min(len(py_logits), len(cpp_logits))
+        for step_idx in range(num_steps):
+            step_name = f"decode step {step_idx + 1}"
+            is_close = compare_logits(
+                py_logits[step_idx], cpp_logits[step_idx], "Python BF16", "C++ BF16", step_name
+            )
+            comparisons.append((f"Python BF16 vs C++ BF16 ({step_name})", is_close))

     # Summary
     print("\n\n" + "=" * 80)
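For reference, a sketch of how the updated validation script might be invoked; --prompt and --num_decode_steps correspond to the argparse options shown in this diff, while the model-path and device flag spellings are assumptions (they are not visible in the excerpt):

python test/models/llama/test_forward_validation.py --model_path /path/to/model_dir --device cpu --prompt "How are you" --num_decode_steps 3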