zk / GroundingDINO-DCU-Optimized · Commits

Commit f1a225f3, authored Apr 16, 2026 by zk
Commit message: update
Parent: 552b62f3
Showing 10 changed files with 3 additions and 672 deletions (+3 / -672).
- .gitignore (+1 / -0)
- export_onnx.py (+2 / -2)
- export_onnx_migraphx_debug.py (+0 / -66)
- migraphx_infer/migraphx_infer.bash (+0 / -0)
- migraphx_infer/migraphx_infer.py (+0 / -0)
- onnx_inference1_migraphx_xiongke.py (+0 / -270)
- onnx_inference_test.py (+0 / -322)
- resnet/profile.json_2026-04-03_11-42-27.json (+0 / -4)
- resnet/profile.json_2026-04-03_11-43-38.json (+0 / -4)
- resnet/profile.json_2026-04-03_11-44-14.json (+0 / -4)
.gitignore

```diff
@@ -158,6 +158,7 @@ tmp/
 xiongke_log.txt
 migraphx_log.txt
 weights/
+weights_400x600/
 checkpoints/
 # Ignore environment installation packages
```
export_onnx.py

```diff
@@ -54,8 +54,8 @@ torch.onnx.export(
     output_names=["logits", "boxes"],
     dynamic_axes=None,  # export with static dimensions
     opset_version=17,
-    verbose=False  # verbose logging off; set to True for debugging
-    # do_constant_folding=True  # constant folding, improves later simplification
+    verbose=False,  # verbose logging off; set to True for debugging
+    do_constant_folding=True  # constant folding, improves later simplification
 )
 print(f"ONNX model successfully exported to: {onnx_output_path}")
```
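The only functional change here is enabling do_constant_folding, which the comment ties to better simplification. A minimal sketch, assuming the export path used elsewhere in this repository, of checking and simplifying the exported graph with onnx and onnxsim (both are already imported by export_onnx_migraphx_debug.py below); the simplified output filename is a hypothetical choice:

```python
# Post-export check and simplification sketch; paths are assumptions.
import onnx
from onnxsim import simplify

onnx_output_path = "weights/ground.onnx"             # path used by the export scripts in this repo
simplified_path = "weights/ground_simplified.onnx"   # hypothetical output name

model = onnx.load(onnx_output_path)
onnx.checker.check_model(model)                      # structural validation of the exported graph

# onnx-simplifier benefits from the constant folding enabled in this commit
model_simplified, ok = simplify(model)
assert ok, "onnx-simplifier could not validate the simplified graph"
onnx.save(model_simplified, simplified_path)
print(f"Simplified model written to {simplified_path}")
```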
export_onnx_migraphx_debug.py (deleted, 100644 → 0)

```python
import torch
import onnx
from onnxsim import simplify
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict

config_file = './groundingdino/config/GroundingDINO_SwinB_cfg.py'
checkpoint_path = './weights/groundingdino_swinb_cogcoor.pth'


def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    args = SLConfig.fromfile(model_config_path)
    args.device = "cuda" if not cpu_only else "cpu"
    # modified config
    args.use_checkpoint = False
    args.use_transformer_ckpt = False
    model = build_model(args)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    _ = model.eval()
    return model


# Load the model
model = load_model(config_file, checkpoint_path, cpu_only=True)

# Prompt used for real inference, plus the corresponding masks
caption = "car ."
input_ids = model.tokenizer([caption], return_tensors="pt")["input_ids"]
position_ids = torch.tensor([[0, 0, 1, 0]])
token_type_ids = torch.tensor([[0, 0, 0, 0]])
attention_mask = torch.tensor([[True, True, True, True]])
text_token_mask = torch.tensor([[[True, False, False, False],
                                 [False, True, True, False],
                                 [False, True, True, False],
                                 [False, False, False, True]]])

# Fixed input resolution
img = torch.randn(1, 3, 800, 1200)

# The ONNX model can support dynamic inputs; comment this out when converting to an engine
dynamic_axes = {
    "input_ids": {0: "batch_size", 1: "seq_len"},
    "attention_mask": {0: "batch_size", 1: "seq_len"},
    "position_ids": {0: "batch_size", 1: "seq_len"},
    "token_type_ids": {0: "batch_size", 1: "seq_len"},
    "text_token_mask": {0: "batch_size", 1: "seq_len", 2: "seq_len"},
    "img": {0: "batch_size", 2: "height", 3: "width"},
    "logits": {0: "batch_size"},
    "boxes": {0: "batch_size"}
}

# Export the raw ONNX model
onnx_output_path = "weights/ground.onnx"
torch.onnx.export(
    model,
    f=onnx_output_path,
    args=(img, input_ids, attention_mask, position_ids, token_type_ids, text_token_mask),
    input_names=["img", "input_ids", "attention_mask", "position_ids", "token_type_ids", "text_token_mask"],
    output_names=["logits", "boxes"],
    opset_version=17,
    verbose=False,  # verbose logging off; set to True for debugging
    do_constant_folding=True  # constant folding, improves later simplification
)
```
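Note that the deleted debug script defines dynamic_axes but never passes it to torch.onnx.export, so the graph above is exported with static shapes. A minimal sketch of the dynamic-shape variant, reusing the names defined above (the output path is hypothetical); the script's own comment recommends static shapes when converting to an engine:

```python
# Sketch only: same export call, but with the dynamic_axes dict actually applied.
torch.onnx.export(
    model,
    f="weights/ground_dynamic.onnx",  # hypothetical output path
    args=(img, input_ids, attention_mask, position_ids, token_type_ids, text_token_mask),
    input_names=["img", "input_ids", "attention_mask", "position_ids",
                 "token_type_ids", "text_token_mask"],
    output_names=["logits", "boxes"],
    dynamic_axes=dynamic_axes,        # the dict defined above
    opset_version=17,
    do_constant_folding=True
)
```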
onnx_inference1_migraphx.bash → migraphx_infer/migraphx_infer.bash (file moved)

onnx_inference1_migraphx.py → migraphx_infer/migraphx_infer.py (file moved)
onnx_inference1_migraphx_xiongke.py (deleted, 100644 → 0)

```python
import cv2
import numpy as np
import torch
import time
import os

os.environ["MIGRAPHX_SAVE_TEMPS"] = "1"
os.environ["MIGRAPHX_TRACE"] = "1"
os.environ["MIGRAPHX_LOG_LEVEL"] = "DEBUG"

import migraphx
from transformers import BertTokenizer
from groundingdino.util.inference import load_image
from groundingdino.models.GroundingDINO.bertwarper import generate_masks_with_special_tokens_and_transfer_map


# =========================
# Utility functions
# =========================
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def preprocess_caption(caption: str) -> str:
    result = caption.lower().strip()
    if result.endswith("."):
        return result
    return result + "."


def to_mgx(x):
    if x.dtype == np.int64:
        return migraphx.argument(x.astype(np.int64))
    elif x.dtype == np.bool_:
        return migraphx.argument(x.astype(np.bool_))
    else:
        return migraphx.argument(x.astype(np.float32))


def _mgx_shape_to_numpy(shape):
    """Convert a migraphx shape to a numpy dtype and lens."""
    shape_str = str(shape)
    if "int64_type" in shape_str:
        dtype = np.int64
    elif "bool_type" in shape_str:
        dtype = np.bool_
    elif "half_type" in shape_str:
        dtype = np.float16
    else:
        dtype = np.float32
    return dtype, list(shape.lens())


# =========================
# 🚀 MIGraphX inference class (with caching)
# =========================
class MIGraphXModel:
    def __init__(self, onnx_path, cache_path="weights/ground_xiongke.mxr", force_recompile=False):
        self.cache_path = cache_path
        # ====== Load the compiled cache first if it exists ======
        if os.path.exists(cache_path) and not force_recompile:
            print(f"⚡ Loading precompiled model: {cache_path}")
            self.model = migraphx.load(cache_path)
        else:
            print("🔍 Building MIGraphX from ONNX")
            self.model = migraphx.parse_onnx(onnx_path)
            print(self.model)

            # ====================== 2. Print model input/output info ======================
            print("=== Model inputs ===")
            inputs = self.model.get_inputs()
            for key, value in inputs.items():
                print(f"{key}: {value}")
            print("\n=== Model outputs ===")
            outputs = self.model.get_outputs()
            for key, value in outputs.items():
                print(f"{key}: {value}")

            # Get input node names and shapes
            inputName = list(self.model.get_inputs().keys())[0]
            inputShape = inputs[inputName].lens()
            print(f"\nInput node name: {inputName}")
            print(f"Input shape (N, C, H, W): {inputShape}")
            inputName1 = list(self.model.get_inputs().keys())[1]
            inputShape1 = inputs[inputName1].lens()
            print(f"\nInput node name: {inputName1}")
            print(f"Input shape (N, C, H, W): {inputShape1}")
            """
            === Model inputs ===
            text_token_mask: bool_type, {1, 4, 4}, {16, 4, 1}
            token_type_ids: int64_type, {1, 4}, {4, 1}
            position_ids: int64_type, {1, 4}, {4, 1}
            attention_mask: bool_type, {1, 4}, {4, 1}
            input_ids: int64_type, {1, 4}, {4, 1}
            img: float_type, {1, 3, 800, 1200}, {2880000, 960000, 1200, 1}
            === Model outputs ===
            boxes: float_type, {1, 900, 4}, {3600, 4, 1}
            logits: float_type, {1, 900, 256}, {230400, 256, 1}
            Input node name: text_token_mask
            Input shape (N, C, H, W): [1, 4, 4]
            """

            # print("\n⚡ Quantize the model (FP16)")
            # migraphx.quantize_fp16(self.model)
            # passes = [
            #     migraphx.pass_dead_code_elimination(),  # remove unused nodes/constants
            #     migraphx.pass_eliminate_contiguous(),  # merge adjacent contiguous ops
            #     migraphx.pass_simplify_reshapes(),  # merge/simplify reshapes
            #     migraphx.pass_simplify_algebra(),  # simplify algebraic expressions (add/mul/..)
            #     migraphx.pass_eliminate_identity(),  # remove Identity ops
            #     migraphx.pass_common_subexpression_elimination(),  # CSE
            # ]
            # self.model.apply_passes(passes)

            print("⚙️ Compiling MIGraphX (GPU)")
            self.model.compile(t=migraphx.get_target("gpu"), device_id=5)
            # offload_copy=False, fast_math=False, exhaustive_tune=False

            # ====== Save the cache ======
            print(f"💾 Saving compiled model to: {cache_path}")
            migraphx.save(self.model, cache_path)

        self.param_names = self.model.get_parameter_names()
        self.input_shapes = self.model.get_inputs()
        print("✅ Input nodes:", self.param_names)

    def infer(self, input_dict):
        mgx_inputs = {k: to_mgx(v) for k, v in input_dict.items()}
        # Some .mxr files produced with disabled passes expose extra internal alias
        # parameters (e.g. main:#output_*). If they are missing, running the model
        # may trigger a VMFault, so fill them with zero buffers based on their shapes.
        auto_filled = []
        for name in self.param_names:
            if name in mgx_inputs:
                continue
            if name not in self.input_shapes:
                continue
            dtype, lens = _mgx_shape_to_numpy(self.input_shapes[name])
            mgx_inputs[name] = to_mgx(np.zeros(lens, dtype=dtype))
            auto_filled.append((name, lens, dtype.__name__))
        if auto_filled:
            print("⚠️ Auto-filled internal input parameters:")
            for item in auto_filled:
                print(f"  - {item[0]} shape={item[1]} dtype={item[2]}")

        start = time.time()
        result = self.model.run(mgx_inputs)
        infer_time = time.time() - start
        outputs = [np.array(r) for r in result]
        return outputs, infer_time


# =========================
# Inference function
# =========================
def predict(model, tokenizer, image, caption, box_threshold, text_threshold, is_benchmark=False):
    caption = preprocess_caption(caption)
    captions = [caption]
    tokenized = tokenizer(captions, padding="longest", return_tensors="pt")
    specical_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])
    (text_self_attention_masks, position_ids, _) = generate_masks_with_special_tokens_and_transfer_map(
        tokenized, specical_tokens, tokenizer)

    max_text_len = 256
    if text_self_attention_masks.shape[1] > max_text_len:
        text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len]
        position_ids = position_ids[:, :max_text_len]
        tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len]
        tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len]
        tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len]

    input_dict = {
        "img": np.expand_dims(np.asarray(image), axis=0).astype(np.float32),
        "input_ids": np.asarray(tokenized["input_ids"]).astype(np.int64),
        "attention_mask": np.asarray(tokenized["attention_mask"]).astype(np.bool_),
        "position_ids": np.asarray(position_ids).astype(np.int64),
        "token_type_ids": np.asarray(tokenized["token_type_ids"]).astype(np.int64),
        "text_token_mask": np.asarray(text_self_attention_masks).astype(np.bool_)
    }

    outputs, infer_time = model.infer(input_dict)
    if not is_benchmark:
        print(f"Inference time: {infer_time * 1000:.2f} ms")

    logits = sigmoid(outputs[0][0])
    boxes = outputs[1][0]

    max_values = np.max(logits, axis=1)
    mask = max_values > box_threshold
    logits = logits[mask]
    boxes = boxes[mask]
    phrases = ["object"] * len(boxes)
    return boxes, np.max(logits, axis=1), phrases


# =========================
# Benchmark
# =========================
def benchmark(model, tokenizer, image, caption, box_th, text_th, warmup=5, runs=10):
    print("\n🔥 Warm-up")
    for _ in range(warmup):
        predict(model, tokenizer, image, caption, box_th, text_th, True)
    print("\n🚀 Benchmark")
    times = []
    for i in range(runs):
        start = time.time()
        predict(model, tokenizer, image, caption, box_th, text_th, True)
        times.append(time.time() - start)
    print(f"\nAverage latency: {np.mean(times) * 1000:.2f} ms")
    print(f"FPS: {1 / np.mean(times):.2f}")


# =========================
# Main
# =========================
if __name__ == "__main__":
    # model_path = "weights/ground.onnx"
    model_path = "weights/ground_fixed.onnx"
    cache_path = "weights/ground_xiongke.mxr"  # ⭐ cache file
    img_path = "images/in/car_1.jpg"
    TEXT_PROMPT = "car ."
    BOX_TRESHOLD = 0.35
    TEXT_TRESHOLD = 0.25

    # 🚀 Load the model (cached automatically)
    model = MIGraphXModel(
        model_path,
        cache_path=cache_path,
        force_recompile=False  # set to True to force recompilation
    )

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    image_source, image = load_image(img_path)

    benchmark(model, tokenizer, image, TEXT_PROMPT, BOX_TRESHOLD, TEXT_TRESHOLD)
    boxes, confs, phrases = predict(model, tokenizer, image, TEXT_PROMPT, BOX_TRESHOLD, TEXT_TRESHOLD)
    print("Detection results:", phrases)
```
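predict() above returns normalized (cx, cy, w, h) boxes and generic "object" phrases. A minimal sketch of converting them to pixel-space rectangles for visualization, mirroring the drawing loop in onnx_inference_test.py below (the output filename is hypothetical):

```python
# Sketch: draw the MIGraphX detections on the original image, assuming the
# names (img_path, boxes, confs, phrases) from the main block above.
import cv2

ori_img = cv2.imread(img_path)
img_h, img_w = ori_img.shape[:2]
for (cx, cy, w, h), conf, cls in zip(boxes, confs, phrases):
    # convert normalized center/size to pixel corner coordinates
    x1 = int((cx - w / 2) * img_w)
    y1 = int((cy - h / 2) * img_h)
    x2 = int((cx + w / 2) * img_w)
    y2 = int((cy + h / 2) * img_h)
    cv2.rectangle(ori_img, (x1, y1), (x2, y2), (0, 0, 255), 2)
    cv2.putText(ori_img, f"{cls} {conf:.2f}", (x1, max(y1 - 10, 0)),
                cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)
cv2.imwrite("images/out/result_migraphx.jpg", ori_img)  # hypothetical output path
```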
onnx_inference_test.py (deleted, 100644 → 0)

```python
from typing import Tuple, List, Dict
import cv2
import numpy as np
import torch
import onnxruntime as ort
from transformers import BertTokenizer, AutoTokenizer
import bisect
import time
from groundingdino.util.inference import load_image
from groundingdino.models.GroundingDINO.bertwarper import generate_masks_with_special_tokens_and_transfer_map

# Adds inference latency and related metrics


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def get_phrases_from_posmap(posmap: np.ndarray, tokenized: Dict, tokenizer: AutoTokenizer,
                            left_idx: int = 0, right_idx: int = 255):
    assert isinstance(posmap, np.ndarray), "posmap must be np.ndarray"
    if posmap.ndim == 1:
        # Zero out the elements in the given range
        posmap[:left_idx + 1] = False
        posmap[right_idx:] = False
        # Indices of the non-zero elements
        non_zero_idx = np.nonzero(posmap)[0]
        token_ids = [tokenized["input_ids"][i] for i in non_zero_idx]
        return tokenizer.decode(token_ids)
    else:
        raise NotImplementedError("posmap must be 1-dim")


def preprocess_caption(caption: str) -> str:
    result = caption.lower().strip()
    if result.endswith("."):
        return result
    return result + "."


# Key optimization: take the tokenizer as a parameter instead of loading it inside
def predict(
        ort_session,
        tokenizer: AutoTokenizer,  # tokenizer preloaded by the caller
        image: np.array,
        caption: str,
        box_threshold: float,
        text_threshold: float,
        device: str = "cpu",
        remove_combined: bool = False,
        is_benchmark: bool = False  # new: marks benchmark runs (suppresses logging)
) -> Tuple[torch.Tensor, torch.Tensor, List[str]]:
    # 1. Text preprocessing
    t0 = time.time()
    caption = preprocess_caption(caption=caption)
    if not is_benchmark:
        print(f"Caption processing took {(time.time() - t0):.3f}s")
    captions = [caption]

    # 3. Encode the text
    t0 = time.time()
    # Remove the performance sink of re-loading the tokenizer on every call
    tokenized = tokenizer(captions, padding="longest", return_tensors="pt").to(device)
    specical_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])
    if not is_benchmark:
        print(f"Word embedding took {(time.time() - t0):.3f}s")

    # 4. Generate attention masks and position ids
    t0 = time.time()
    (
        text_self_attention_masks,
        position_ids,
        cate_to_token_mask_list,
    ) = generate_masks_with_special_tokens_and_transfer_map(tokenized, specical_tokens, tokenizer)
    if not is_benchmark:
        print(f"Generate attention masks took {(time.time() - t0):.3f}s")

    # 5. Truncate overly long text
    max_text_len = 256
    if text_self_attention_masks.shape[1] > max_text_len:
        text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len]
        position_ids = position_ids[:, :max_text_len]
        tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len]
        tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len]
        tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len]

    # 6. Run the model
    # attention_mask: True = visible → False = masked
    attention_mask = tokenized["attention_mask"].float()
    attention_mask = (1 - attention_mask) * -1e9  # crucial!
    attention_mask = np.asarray(attention_mask)

    # text_token_mask gets the same treatment (if it participates in attention)
    text_self_attention_masks = text_self_attention_masks.float()
    text_self_attention_masks = (1 - text_self_attention_masks) * -1e9

    input_dict = {
        "img": np.expand_dims(np.asarray(image), axis=0),
        "input_ids": np.asarray(tokenized["input_ids"]),
        "attention_mask": attention_mask,
        "position_ids": np.asarray(position_ids),
        "token_type_ids": np.asarray(tokenized["token_type_ids"]),
        "text_token_mask": np.asarray(text_self_attention_masks)
    }
    # input_dict = {
    #     "img": np.expand_dims(np.asarray(image), axis=0),
    #     "input_ids": np.asarray(tokenized["input_ids"]),
    #     "attention_mask": attention_mask,
    #     "position_ids": np.asarray(position_ids),
    #     "token_type_ids": np.asarray(tokenized["token_type_ids"]),
    #     "text_token_mask": np.asarray(text_self_attention_masks)
    # }

    t0 = time.time()
    outputs = ort_session.run(['logits', 'boxes'], input_dict)
    infer_time = time.time() - t0
    if not is_benchmark:
        print(f"Inference time: {infer_time:.3f}s")

    # 7. Collect predictions
    prediction_logits = np.apply_along_axis(sigmoid, -1, outputs[0][0])
    prediction_boxes = outputs[1][0]

    if not is_benchmark:
        print(f"\n=== Debug Info ===")
        print(f"Prediction logits shape: {prediction_logits.shape}")
        print(f"Prediction boxes shape: {prediction_boxes.shape}")
        print(f"Max logit value: {np.max(prediction_logits):.4f}")
        print(f"Mean logit value: {np.mean(prediction_logits):.4f}")

    # 8. Apply the thresholds
    max_values = np.max(prediction_logits, axis=1)
    mask = max_values > box_threshold
    logits = prediction_logits[mask]
    boxes = prediction_boxes[mask]

    # 9. Match the text
    tokenized = tokenizer(caption)

    # 10. Handle special tokens
    if remove_combined:
        sep_idx = [i for i in range(len(tokenized['input_ids'])) if tokenized['input_ids'][i] in [101, 102, 1012]]
        phrases = []
        for logit in logits:
            max_idx = logit.argmax()
            insert_idx = bisect.bisect_left(sep_idx, max_idx)
            right_idx = sep_idx[insert_idx]
            left_idx = sep_idx[insert_idx - 1]
            phrases.append(
                get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer, left_idx, right_idx).replace('.', '')
            )
    else:
        phrases = [
            get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer).replace('.', '')
            for logit in logits
        ]

    return boxes, np.max(logits, axis=1), phrases


# New: full performance test (warm-up + measured inference)
def benchmark_performance(ort_session, tokenizer, image, caption, box_threshold, text_threshold,
                          warmup_runs=5, test_runs=10, device="cpu"):
    """
    Performance test: warm-up followed by measured inference.
    :param warmup_runs: number of warm-up runs
    :param test_runs: number of measured runs
    """
    print("=" * 60)
    print("📊 Starting performance test (warm-up + measured runs)")
    print("=" * 60)

    # 1. Warm-up phase
    print(f"\n🔥 Warm-up phase ({warmup_runs} runs), not counted in the statistics")
    warmup_start = time.time()
    for i in range(warmup_runs):
        t0 = time.time()
        predict(ort_session, tokenizer, image, caption, box_threshold, text_threshold, device, is_benchmark=True)
        warmup_time = time.time() - t0
        print(f"Warm-up {i + 1}/{warmup_runs}, took: {warmup_time * 1000:.2f} ms")
    total_warmup_time = time.time() - warmup_start
    print(f"\nWarm-up done, total: {total_warmup_time:.3f}s, average per run: {total_warmup_time / warmup_runs * 1000:.2f} ms")

    # 2. Measured inference phase
    print(f"\n🚀 Measured inference phase ({test_runs} runs), performance statistics")
    test_start = time.time()
    infer_times = []  # per-run latency
    for i in range(test_runs):
        t0 = time.time()
        predict(ort_session, tokenizer, image, caption, box_threshold, text_threshold, device, is_benchmark=True)
        infer_time = time.time() - t0
        infer_times.append(infer_time)
        print(f"Measured run {i + 1}/{test_runs}, took: {infer_time * 1000:.2f} ms")

    # 3. Compute the metrics
    total_test_time = time.time() - test_start
    avg_infer_time = np.mean(infer_times)
    std_infer_time = np.std(infer_times)
    max_infer_time = np.max(infer_times)
    min_infer_time = np.min(infer_times)
    fps = test_runs / total_test_time

    # 4. Report
    print("\n" + "=" * 60)
    print("📈 Performance report (measured runs only)")
    print("=" * 60)
    print(f"Runs: {test_runs}")
    print(f"Total inference time: {total_test_time:.3f}s")
    print(f"Average latency: {avg_infer_time * 1000:.2f} ms (±{std_infer_time * 1000:.2f} ms)")
    print(f"Max latency: {max_infer_time * 1000:.2f} ms")
    print(f"Min latency: {min_infer_time * 1000:.2f} ms")
    print(f"Average FPS: {fps:.2f} frames/s")
    print("=" * 60)

    return {
        "warmup_runs": warmup_runs,
        "test_runs": test_runs,
        "avg_infer_time_ms": avg_infer_time * 1000,
        "std_infer_time_ms": std_infer_time * 1000,
        "max_infer_time_ms": max_infer_time * 1000,
        "min_infer_time_ms": min_infer_time * 1000,
        "fps": fps
    }


if __name__ == '__main__':
    # Configuration
    model_path = 'weights/ground_test.onnx'
    img_path = 'images/in/car_1.jpg'
    TEXT_PROMPT = "car ."
    BOX_TRESHOLD = 0.35
    TEXT_TRESHOLD = 0.25
    DEVICE = "cpu"
    WARMUP_RUNS = 5  # warm-up runs
    TEST_RUNS = 10   # measured runs

    # Load the image
    image_source, image = load_image(img_path)

    # Load the ONNX model (with optimizations enabled)
    print("🔍 Loading ONNX model")
    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL  # enable all graph optimizations
    sess_options.log_severity_level = 3  # reduce log output
    # sess_options.enable_profiling = True  # enable profiling
    ort_session = ort.InferenceSession(
        model_path,
        sess_options=sess_options,
        providers=['ROCMExecutionProvider']
        # provider_options=[{
        #     "device_id": 0,
        #     "migraphx_fp16_enable": "False",
        #     "migraphx_int8_enable": "False",
        #     # try disabling MIGraphX internal optimizations
        #     "migraphx_save_compiled_model": "False",
        # }]
    )

    # Show the active execution provider
    current_provider = ort_session.get_providers()
    print(f"✅ Model loaded, current execution provider: {current_provider}")

    # Preload the tokenizer (only once; the key optimization)
    print("\n📝 Preloading BERT tokenizer (loaded only once)")
    t0 = time.time()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    print(f"✅ Tokenizer loaded, took: {(time.time() - t0):.3f}s")

    # Step 1: run the full performance test (warm-up + measured runs)
    performance_result = benchmark_performance(
        ort_session, tokenizer, image, TEXT_PROMPT, BOX_TRESHOLD, TEXT_TRESHOLD,
        WARMUP_RUNS, TEST_RUNS, DEVICE
    )

    # Step 2: run one full inference (verbose logging, save the result image)
    print("\n" + "=" * 60)
    print("🎯 Final inference (verbose logging + saved result)")
    print("=" * 60)
    boxes, confs, phrases = predict(ort_session, tokenizer, image, TEXT_PROMPT, BOX_TRESHOLD, TEXT_TRESHOLD, DEVICE)

    # Draw and save the result image
    ori_img = cv2.imread(img_path)
    img_h = ori_img.shape[0]
    img_w = ori_img.shape[1]
    for i in range(len(boxes)):
        one_box = boxes[i]
        one_conf = confs[i]
        one_cls = phrases[i]
        x1 = int((one_box[0] - one_box[2] / 2) * img_w)
        y1 = int((one_box[1] - one_box[3] / 2) * img_h)
        x2 = int((one_box[0] + one_box[2] / 2) * img_w)
        y2 = int((one_box[1] + one_box[3] / 2) * img_h)
        cv2.rectangle(ori_img, (x1, y1), (x2, y2), (0, 0, 255), 2)
        cv2.putText(ori_img, f'{one_cls} {one_conf:.2f}', (x1 - 15, y1 - 15),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX, color=(255, 255, 255), fontScale=1.5, thickness=3)

    # Save the result
    cv2.imwrite('./images/out/result.jpg', ori_img)
    print(f"\n✅ Result saved to: ./images/out/result.jpg")
    print(f"✅ Detected targets: {phrases} ({len(boxes)} total)")

    # profile_file = ort_session.end_profiling()
    # print(f"\n📊 Profiling file generated: {profile_file}")
```
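The script leaves ONNX Runtime profiling commented out; the deleted resnet/profile.json_* files below are the kind of output it produces. A minimal sketch of enabling it, with the file prefix and execution provider chosen as assumptions rather than anything this commit configures:

```python
# Sketch of the profiling path left commented out above; ONNX Runtime appends a
# timestamp to the prefix, which matches the deleted file names below.
import onnxruntime as ort

sess_options = ort.SessionOptions()
sess_options.enable_profiling = True
sess_options.profile_file_prefix = "resnet/profile.json"  # assumed prefix

session = ort.InferenceSession("weights/ground_test.onnx",
                               sess_options=sess_options,
                               providers=["CPUExecutionProvider"])  # placeholder provider
# ... run session.run(...) as in the script above ...
profile_file = session.end_profiling()  # writes a Chrome-trace JSON and returns its path
print(f"Profiling file generated: {profile_file}")
```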
resnet/profile.json_2026-04-03_11-42-27.json (deleted, 100644 → 0)

```json
[
  {"cat": "Session", "pid": 1773812, "tid": 1773812, "dur": 91000, "ts": 20, "ph": "X", "name": "model_loading_uri", "args": {}},
  {"cat": "Session", "pid": 1773812, "tid": 1773812, "dur": 4754984, "ts": 91208, "ph": "X", "name": "session_initialization", "args": {}}
]
```
resnet/profile.json_2026-04-03_11-43-38.json (deleted, 100644 → 0)

```json
[
  {"cat": "Session", "pid": 1774715, "tid": 1774715, "dur": 73864, "ts": 6, "ph": "X", "name": "model_loading_uri", "args": {}},
  {"cat": "Session", "pid": 1774715, "tid": 1774715, "dur": 4308806, "ts": 73985, "ph": "X", "name": "session_initialization", "args": {}}
]
```
resnet/profile.json_2026-04-03_11-44-14.json (deleted, 100644 → 0)

```json
[
  {"cat": "Session", "pid": 1775618, "tid": 1775618, "dur": 76845, "ts": 5, "ph": "X", "name": "model_loading_uri", "args": {}},
  {"cat": "Session", "pid": 1775618, "tid": 1775618, "dur": 4130635, "ts": 76963, "ph": "X", "name": "session_initialization", "args": {}}
]
```
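The three deleted profiles are Chrome-trace style event lists with durations in microseconds. A small helper sketch for summarizing them; the summarize_profile name is made up for illustration:

```python
# Sum per-event durations from a Chrome-trace profile like the deleted files above.
import json
from collections import defaultdict


def summarize_profile(path):
    with open(path) as f:
        events = json.load(f)
    totals = defaultdict(int)
    for event in events:
        totals[event["name"]] += event["dur"]  # "dur" is in microseconds
    for name, dur_us in sorted(totals.items(), key=lambda kv: -kv[1]):
        print(f"{name}: {dur_us / 1000:.1f} ms")


# e.g. summarize_profile("resnet/profile.json_2026-04-03_11-42-27.json") would report
# session_initialization ≈ 4755.0 ms and model_loading_uri ≈ 91.0 ms
```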