Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
zk
GroundingDINO-DCU-Optimized
Commits
ca23112b
Commit
ca23112b
authored
Apr 15, 2026
by
zk
Browse files
适配了输入400x600,删除垃圾文件
parent
74fbd52c
Changes
34
Show whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
3 additions
and
855 deletions
+3
-855
mha/run.sh
mha/run.sh
+0
-14
migraphx_infer/test.py
migraphx_infer/test.py
+0
-13
migraphx_infer/test1.py
migraphx_infer/test1.py
+0
-107
migraphx_infer/test2.py
migraphx_infer/test2.py
+0
-113
migraphx_infer/test3.py
migraphx_infer/test3.py
+0
-15
migraphx_infer/test4.py
migraphx_infer/test4.py
+0
-6
migraphx_infer/test5.py
migraphx_infer/test5.py
+0
-77
migraphx_infer/test6.py
migraphx_infer/test6.py
+0
-11
migraphx_infer/test7.py
migraphx_infer/test7.py
+0
-27
onnx_inference1.py
onnx_inference1.py
+1
-1
onnx_inference1_HIP.py
onnx_inference1_HIP.py
+0
-375
onnx_inference1_migraphx_test.py
onnx_inference1_migraphx_test.py
+0
-14
tmprllsblav/_remote_module_non_scriptable.py
tmprllsblav/_remote_module_non_scriptable.py
+0
-81
utils/utils.py
utils/utils.py
+2
-1
No files found.
mha/run.sh
deleted
100644 → 0
View file @
74fbd52c
# python3 -m onnxruntime.transformers.optimizer \
# --input ../weights/ground.onnx \
# --output ./mha.onnx \
# --use_multi_head_attention \
# # --num_heads 12 \
# # --hidden_size 256 \
# --model_type bert \
# --disable_skip_layer_norm \
# --disable_gelu \
# --use_gpu \
# --disable_embed_layer_norm \
# --use_mask_index \
# --use_raw_attention_mask
# Run the ONNX Runtime transformer optimizer (BERT model type, GPU enabled)
# on ../weights/ground.onnx and write the result to ./ground.onnx.
python3 -m onnxruntime.transformers.optimizer \
    --input ../weights/ground.onnx \
    --output ./ground.onnx \
    --model_type bert \
    --use_gpu
\ No newline at end of file
migraphx_infer/test.py
deleted
100644 → 0
View file @
74fbd52c
"""Report tensors in weights/ground.onnx that have a dimension whose dim_value is 0."""
import onnx

model = onnx.load("weights/ground.onnx")

# Intermediate value_info entries are scanned first, then the graph outputs.
# Each entry pairs the warning label with the collection it applies to.
sections = (
    ("⚠️ zero dim in value_info:", model.graph.value_info),
    ("⚠️ zero dim in output:", model.graph.output),
)

for label, value_infos in sections:
    for vi in value_infos:
        dims = [d.dim_value for d in vi.type.tensor_type.shape.dim]
        if 0 in dims:
            print(label, vi.name, dims)
\ No newline at end of file
migraphx_infer/test1.py
deleted
100644 → 0
View file @
74fbd52c
"""Print a summary of weights/ground_simplified.onnx.

The summary covers the graph name, the first opset version, the inputs and
outputs with their element types and shapes, the 20 most common operator
types, and any control-flow operators found in the graph.
"""
from collections import Counter

import onnx


def _dims_of(value_info):
    """Return each dimension as its dim_value when positive, else its dim_param."""
    return [
        dim.dim_value if dim.dim_value > 0 else dim.dim_param
        for dim in value_info.type.tensor_type.shape.dim
    ]


def _print_tensors(title, value_infos):
    """Print a section title followed by one line per tensor description."""
    print(title)
    for info in value_infos:
        shape = _dims_of(info)
        print(f"{info.name}: {info.type.tensor_type.elem_type}, shape={shape}")


model = onnx.load("weights/ground_simplified.onnx")

# Basic information.
print(f"模型名称: {model.graph.name}")
print(f"opset 版本: {model.opset_import[0].version}")

# Graph inputs and outputs.
_print_tensors("\n=== 输入 ===", model.graph.input)
_print_tensors("\n=== 输出 ===", model.graph.output)

# Operator-type histogram.
op_counts = Counter(node.op_type for node in model.graph.node)
print("\n=== 算子统计 (前20) ===")
for op, count in op_counts.most_common(20):
    print(f"{op}: {count}")

# Check whether any control-flow operators are present.
control_ops = [op for op in op_counts if op in ["If", "Loop", "Scan", "SequenceMap"]]
if control_ops:
    print(f"\n⚠️ 包含控制流算子: {control_ops}")
'''
模型名称: main_graph
opset 版本: 17
=== 输入 ===
img: 1, shape=[1, 3, 800, 1200]
input_ids: 7, shape=[1, 4]
attention_mask: 9, shape=[1, 4]
position_ids: 7, shape=[1, 4]
token_type_ids: 7, shape=[1, 4]
text_token_mask: 9, shape=[1, 4, 4]
=== 输出 ===
logits: 1, shape=['Gatherlogits_dim_0', 'Gatherlogits_dim_1', 'Gatherlogits_dim_2']
boxes: 1, shape=['Gatherboxes_dim_0', 'Gatherboxes_dim_1', 4]
=== 算子统计 (前20) ===
Constant: 7315
Unsqueeze: 1919
Concat: 1051
Reshape: 916
Shape: 843
Gather: 762
Add: 716
Slice: 603
MatMul: 528
Mul: 513
Transpose: 507
Cast: 459
Div: 265
Where: 230
Expand: 223
ConstantOfShape: 218
Equal: 183
LayerNormalization: 147
Sub: 79
Softmax: 78
# 经过简化后:
=== 输入 ===
img: 1, shape=[1, 3, 800, 1200]
input_ids: 7, shape=[1, 4]
attention_mask: 9, shape=[1, 4]
position_ids: 7, shape=[1, 4]
token_type_ids: 7, shape=[1, 4]
text_token_mask: 9, shape=[1, 4, 4]
=== 输出 ===
logits: 1, shape=[1, 900, 256]
boxes: 1, shape=[1, 900, 4]
=== 算子统计 (前20) ===
Reshape: 703
Add: 679
MatMul: 527
Transpose: 459
Mul: 204
Slice: 194
Gather: 155
Unsqueeze: 152
LayerNormalization: 147
Concat: 97
Div: 96
Softmax: 78
Clip: 57
Relu: 48
GridSample: 48
Sub: 36
Erf: 36
Where: 35
Pad: 25
Sin: 25
'''
\ No newline at end of file
migraphx_infer/test2.py
deleted
100644 → 0
View file @
74fbd52c
"""List every initializer in weights/ground_sim_fp16.onnx larger than 10 MB."""
import onnx
from onnx import numpy_helper

# Load the model to inspect.
model = onnx.load("weights/ground_sim_fp16.onnx")

print("=== 检查所有常量张量大小 ===")

bytes_per_mb = 1024 * 1024
for init in model.graph.initializer:
    shape = tuple(init.dims)

    # Number of elements is the product of all dimensions.
    elem_count = 1
    for extent in shape:
        elem_count *= extent

    # Size in MB = element count * bytes per element.
    itemsize = onnx.helper.tensor_dtype_to_np_dtype(init.data_type).itemsize
    size_mb = (elem_count * itemsize) / bytes_per_mb

    # Only constants above 10 MB are reported (adjust the threshold as needed).
    if size_mb <= 10:
        continue

    print(f"⚠️ 超大常量:{init.name}")
    print(f" 形状:{shape}")
    print(f" 大小:{size_mb:.2f} MB\n")
"""
=== ground.onnx检查所有常量张量大小 ===
⚠️ 超大常量:bert.embeddings.word_embeddings.weight
形状:(30522, 768)
大小:89.42 MB
⚠️ 超大常量:onnx::MatMul_25479
形状:(1024, 3072)
大小:12.00 MB
⚠️ 超大常量:onnx::MatMul_25503
形状:(1024, 4096)
大小:16.00 MB
⚠️ 超大常量:onnx::MatMul_25504
形状:(4096, 1024)
大小:16.00 MB
⚠️ 超大常量:onnx::MatMul_25513
形状:(1024, 3072)
大小:12.00 MB
⚠️ 超大常量:onnx::MatMul_25541
形状:(1024, 4096)
大小:16.00 MB
⚠️ 超大常量:onnx::MatMul_25542
形状:(4096, 1024)
大小:16.00 MB
ground_simplified.onnx
=== 检查所有常量张量大小 ===
⚠️ 超大常量:bert.embeddings.word_embeddings.weight
形状:(30522, 768)
大小:89.42 MB
⚠️ 超大常量:onnx::MatMul_25479
形状:(1024, 3072)
大小:12.00 MB
⚠️ 超大常量:onnx::MatMul_25503
形状:(1024, 4096)
大小:16.00 MB
⚠️ 超大常量:onnx::MatMul_25504
形状:(4096, 1024)
大小:16.00 MB
⚠️ 超大常量:onnx::MatMul_25513
形状:(1024, 3072)
大小:12.00 MB
⚠️ 超大常量:onnx::MatMul_25541
形状:(1024, 4096)
大小:16.00 MB
⚠️ 超大常量:onnx::MatMul_25542
形状:(4096, 1024)
大小:16.00 MB
⚠️ 超大常量:/backbone/backbone.0/layers.0/blocks.1/attn/Unsqueeze_7_output_0
形状:(1, 425, 1, 144, 144)
大小:33.62 MB
⚠️ 超大常量:/transformer/Concat_10_output_0
形状:(1, 19947, 256)
大小:19.48 MB
⚠️ 超大常量:/transformer/enc_out_class_embed/ConstantOfShape_output_0
形状:(1, 19947, 256)
大小:19.48 MB
"""
"""=== ground_fp16.onnx检查所有常量张量大小 ===
⚠️ 超大常量:bert.embeddings.word_embeddings.weight
形状:(30522, 768)
大小:44.71 MB
"""
""" ground_sim_fp16.onnx
=== 检查所有常量张量大小 ===
⚠️ 超大常量:bert.embeddings.word_embeddings.weight
形状:(30522, 768)
大小:44.71 MB
⚠️ 超大常量:/backbone/backbone.0/layers.0/blocks.1/attn/Unsqueeze_7_output_0
形状:(1, 425, 1, 144, 144)
大小:16.81 MB
"""
\ No newline at end of file
migraphx_infer/test3.py
deleted
100644 → 0
View file @
74fbd52c
"""Re-save weights/ground.onnx with all tensors stored as external data."""
import os
import sys

import onnx
import onnx.external_data_helper as ex

# Show which interpreter and which onnx package are in use.
print("🟢 python :", sys.executable)
print("🟢 onnx :", onnx.__file__)  # path, to confirm which package is loaded
print("🟢 version:", onnx.__version__)  # must be >= 1.12

src = "weights/ground.onnx"
dst = "weights/ground_external.onnx"

m = onnx.load(src)

# Mark every tensor as external data, all written to ground_weights.bin.
ex.convert_model_to_external_data(
    m,
    all_tensors_to_one_file=True,
    location="ground_weights.bin",
)
onnx.save_model(m, dst)
print("[✅] external ONNX →", dst)
\ No newline at end of file
migraphx_infer/test4.py
deleted
100644 → 0
View file @
74fbd52c
"""Run ONNX shape inference on weights/ground.onnx and save the annotated model."""
import onnx
from onnx import shape_inference

source_path = "weights/ground.onnx"
target_path = "weights/ground_shape.onnx"

model = onnx.load(source_path)
# infer_shapes returns a model carrying the inferred shape information.
model = shape_inference.infer_shapes(model)
onnx.save(model, target_path)
\ No newline at end of file
migraphx_infer/test5.py
deleted
100644 → 0
View file @
74fbd52c
import
onnx
from
onnx
import
helper
INPUT_MODEL
=
"weights/ground_simplified.onnx"
OUTPUT_MODEL
=
"weights/ground_fix.onnx"
def add_identity(graph, input_name, suffix, new_nodes, processed):
    """Return the name of an Identity-wrapped alias for ``input_name``.

    The alias is ``input_name + suffix``. The first time a given
    ``input_name`` is seen, an ONNX ``Identity`` node mapping ``input_name``
    to the alias is appended to ``new_nodes`` and ``input_name`` is recorded
    in ``processed``; later calls for the same name only return the alias.

    Args:
        graph: Unused by this function; kept for the existing call signature.
        input_name: Name of the tensor to wrap.
        suffix: Text appended to ``input_name`` to form the alias.
        new_nodes: List that receives any newly created Identity node.
        processed: Set of input names that already have an Identity node.

    Returns:
        The alias tensor name.
    """
    new_name = input_name + suffix
    if input_name not in processed:
        new_nodes.append(
            helper.make_node(
                "Identity",
                inputs=[input_name],
                outputs=[new_name],
                name=new_name + "_identity",
            )
        )
        processed.add(input_name)
    return new_name
def patch_model(model):
    """Route selected node inputs through Identity nodes and return the model.

    The following inputs are rewired to an ``Identity`` alias created by
    ``add_identity`` (suffix ``"_block"``):

    * ``Gather``: input 1 (the indices).
    * any op whose type starts with "scatter" (case-insensitive):
      inputs 0, 1 and 2 (data, indices, updates).
    * ``Where``: inputs 0, 1 and 2.

    Each new Identity node is inserted immediately before the first node
    that consumes it. Inserting them all at the front of the graph would
    place an Identity ahead of the node producing its input, which breaks
    the topological ordering ONNX requires of ``graph.node``.

    Args:
        model: The ONNX model to patch; it is modified in place.

    Returns:
        The same ``model`` object.
    """
    graph = model.graph
    processed = set()
    # (index of the consuming node in the original list, Identity node)
    insertions = []

    for position, node in enumerate(graph.node):
        if node.op_type == "Gather":
            slots = (1,)
        elif node.op_type.lower().startswith("scatter"):
            # scatternd(data, indices, updates)
            slots = (0, 1, 2)
        elif node.op_type == "Where":
            slots = (0, 1, 2)
        else:
            continue

        created = []
        for slot in slots:
            node.input[slot] = add_identity(
                graph, node.input[slot], "_block", created, processed
            )
        insertions.extend((position, identity) for identity in created)

    # Insert back to front so earlier indices stay valid. Identities that
    # share a consumer keep their creation order.
    for position, identity in reversed(insertions):
        graph.node.insert(position, identity)

    return model
def main():
    """Load INPUT_MODEL, patch it with patch_model, and save it to OUTPUT_MODEL."""
    print("🔍 加载模型...")
    source_model = onnx.load(INPUT_MODEL)

    print("⚙️ 全面阻断 constant folding(Gather + ScatterND + Where)...")
    patched_model = patch_model(source_model)

    print("💾 保存模型...")
    onnx.save(patched_model, OUTPUT_MODEL)

    print("✅ 完成:", OUTPUT_MODEL)
if
__name__
==
"__main__"
:
main
()
\ No newline at end of file
migraphx_infer/test6.py
deleted
100644 → 0
View file @
74fbd52c
"""Flag integer initializers named "*Constant*" in weights/ground.onnx that
contain negative values or values above 10000 (suspicious indices)."""
import numpy as np  # was missing: the dtype check below raised NameError
import onnx
from onnx import numpy_helper

model = onnx.load("weights/ground.onnx")

for init in model.graph.initializer:
    if "Constant" not in init.name:
        continue

    arr = numpy_helper.to_array(init)
    # Only integer tensors are candidate index tensors.
    if arr.dtype not in [np.int32, np.int64]:
        continue

    if (arr < 0).any() or (arr > 10000).any():
        print("🚨 可疑 index:", init.name, arr)
\ No newline at end of file
migraphx_infer/test7.py
deleted
100644 → 0
View file @
74fbd52c
"""Inspect the index tensors of Gather nodes in weights/ground.onnx.

For each Gather whose indices (input 1) come from a graph initializer, print
the dtype, min, max and shape of the indices, and flag negative values or
values above 10000.
"""
import onnx
import numpy as np
from onnx import numpy_helper

model = onnx.load("weights/ground.onnx")

# Index the initializer protos by name. Tensors are converted to numpy only
# when a Gather actually references them, so the large weight tensors are not
# all materialized just to look up a few index tensors.
init_protos = {init.name: init for init in model.graph.initializer}
converted = {}

for node in model.graph.node:
    if node.op_type != "Gather":
        continue

    index_name = node.input[1]
    if index_name not in init_protos:
        continue

    if index_name not in converted:
        converted[index_name] = numpy_helper.to_array(init_protos[index_name])
    idx = converted[index_name]

    print("\n🚨 Gather index:", index_name)
    print("dtype:", idx.dtype)
    # min()/max() raise ValueError on a zero-size array, so skip them there.
    if idx.size:
        print("min:", idx.min())
        print("max:", idx.max())
    print("shape:", idx.shape)

    if (idx < 0).any():
        print("❌ NEGATIVE index")
    if (idx > 10000).any():
        print("❌ SUSPICIOUS LARGE index")
\ No newline at end of file
onnx_inference1.py
View file @
ca23112b
...
...
@@ -240,7 +240,7 @@ def benchmark_performance(
if
__name__
==
'__main__'
:
# 配置参数
model_path
=
'weights/ground.onnx'
model_path
=
'weights
_400x600
/ground.onnx'
img_path
=
'images/in/car_1.jpg'
TEXT_PROMPT
=
"car ."
BOX_TRESHOLD
=
0.35
...
...
onnx_inference1_HIP.py
deleted
100644 → 0
View file @
74fbd52c
from
typing
import
Tuple
,
List
,
Dict
import
os
import
cv2
import
numpy
as
np
import
torch
import
onnxruntime
as
ort
from
transformers
import
BertTokenizer
,
AutoTokenizer
import
bisect
import
time
import
warnings
warnings
.
filterwarnings
(
'ignore'
)
from
groundingdino.util.inference
import
load_image
from
groundingdino.models.GroundingDINO.bertwarper
import
generate_masks_with_special_tokens_and_transfer_map
# 加入推理延迟等指标
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), computed with numpy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def get_phrases_from_posmap(
    posmap: np.ndarray,
    tokenized: Dict,
    tokenizer: "AutoTokenizer",
    left_idx: int = 0,
    right_idx: int = 255,
):
    """Decode the tokens selected by a 1-D position map.

    Positions ``0..left_idx`` and ``right_idx..end`` are always excluded.
    The remaining non-zero positions of ``posmap`` are used as indices into
    ``tokenized["input_ids"]``, and the selected ids are decoded with
    ``tokenizer.decode``.

    The caller's ``posmap`` is left untouched; masking is applied to a copy.

    Args:
        posmap: 1-D numpy array; non-zero entries mark selected positions.
        tokenized: Mapping with an indexable ``"input_ids"`` entry.
        tokenizer: Object providing ``decode(token_ids)``.
        left_idx: Last position (inclusive) masked out at the start.
        right_idx: First position masked out at the end.

    Returns:
        Whatever ``tokenizer.decode`` returns for the selected token ids.

    Raises:
        AssertionError: If ``posmap`` is not a numpy array.
        NotImplementedError: If ``posmap`` is not 1-dimensional.
    """
    assert isinstance(posmap, np.ndarray), "posmap must be np.ndarray"
    if posmap.ndim != 1:
        raise NotImplementedError("posmap must be 1-dim")

    # Work on a copy so the caller's array is not modified.
    selected = posmap.copy()
    selected[: left_idx + 1] = False
    selected[right_idx:] = False

    non_zero_idx = np.nonzero(selected)[0]
    token_ids = [tokenized["input_ids"][i] for i in non_zero_idx]
    return tokenizer.decode(token_ids)
def preprocess_caption(caption: str) -> str:
    """Lower-case and strip ``caption``, appending "." unless it already ends with one."""
    normalized = caption.lower().strip()
    return normalized if normalized.endswith(".") else normalized + "."
# 核心优化:固定尺寸内存池(800x1200),batch_size=1
class HIPMemoryPool:
    """Fixed-shape, pre-allocated input buffers reused across inference calls.

    All buffers are numpy arrays with a leading batch dimension of 1 and are
    allocated once in ``__init__``. ``device`` is stored but not used by this
    class.

    Buffers held in ``self.pool``:
        "img":             float32, shape ``(1,) + img_shape``
        "input_ids":       int64,   shape ``(1, max_text_len)``
        "attention_mask":  bool,    shape ``(1, max_text_len)``
        "position_ids":    int64,   shape ``(1, max_text_len)``
        "token_type_ids":  int64,   shape ``(1, max_text_len)``
        "text_token_mask": bool,    shape ``(1, max_text_len, max_text_len)``
    """

    # Keys of the text-related buffers; these are cleared before each update.
    _TEXT_KEYS = (
        "input_ids",
        "attention_mask",
        "position_ids",
        "token_type_ids",
        "text_token_mask",
    )

    def __init__(self, img_shape=(3, 800, 1200), max_text_len=256, device="cpu"):
        self.img_shape = img_shape  # fixed image shape, without the batch dim
        self.max_text_len = max_text_len
        self.device = device
        self.pool = {}
        # Allocate everything up front so later updates never allocate.
        self._preallocate_all_buffers()

    def _preallocate_all_buffers(self):
        """Allocate every zero-initialised buffer at its fixed shape (batch size 1)."""
        text_row = (1, self.max_text_len)
        # tuple() lets img_shape be given as a list as well as a tuple.
        self.pool["img"] = np.zeros((1,) + tuple(self.img_shape), dtype=np.float32)
        self.pool["input_ids"] = np.zeros(text_row, dtype=np.int64)
        self.pool["attention_mask"] = np.zeros(text_row, dtype=bool)
        self.pool["position_ids"] = np.zeros(text_row, dtype=np.int64)
        self.pool["token_type_ids"] = np.zeros(text_row, dtype=np.int64)
        self.pool["text_token_mask"] = np.zeros(
            (1, self.max_text_len, self.max_text_len), dtype=bool
        )

    @staticmethod
    def _to_numpy(value):
        """Convert a tensor-like object (with ``.cpu().numpy()``) or array-like to numpy."""
        if hasattr(value, "cpu"):
            return value.cpu().numpy()
        return np.asarray(value)

    def update_img_buffer(self, image: np.ndarray):
        """Copy ``image`` into the image buffer and return the buffer.

        Raises:
            ValueError: If ``image.shape`` differs from ``self.img_shape``.
        """
        if tuple(image.shape) != tuple(self.img_shape):
            raise ValueError(f"图片尺寸必须为{self.img_shape},当前为{image.shape}")
        self.pool["img"][0] = image
        return self.pool["img"]

    def update_text_buffers(self, tokenized, position_ids, text_self_attention_masks):
        """Copy the text inputs into the fixed buffers and return them.

        Inputs longer than ``max_text_len`` are truncated. All text buffers
        are zeroed first, so a prompt shorter than the previous one does not
        inherit stale token ids or mask bits from the earlier call.

        Args:
            tokenized: Mapping with "input_ids", "attention_mask" and
                "token_type_ids", each indexable as ``[0, :n]``.
            position_ids: Position ids, indexable as ``[0, :n]``.
            text_self_attention_masks: Mask indexable as ``[0, :n, :n]``.

        Returns:
            Dict mapping each text input name to its (shared) buffer.
        """
        for key in self._TEXT_KEYS:
            self.pool[key].fill(0)

        to_numpy = self._to_numpy
        text_len = min(tokenized["input_ids"].shape[1], self.max_text_len)
        self.pool["input_ids"][0, :text_len] = to_numpy(
            tokenized["input_ids"][0, :text_len]
        )
        self.pool["attention_mask"][0, :text_len] = to_numpy(
            tokenized["attention_mask"][0, :text_len]
        ).astype(bool)
        self.pool["position_ids"][0, :text_len] = to_numpy(position_ids[0, :text_len])
        self.pool["token_type_ids"][0, :text_len] = to_numpy(
            tokenized["token_type_ids"][0, :text_len]
        )

        # Text self-attention mask.
        mask_len = min(text_self_attention_masks.shape[1], self.max_text_len)
        self.pool["text_token_mask"][0, :mask_len, :mask_len] = to_numpy(
            text_self_attention_masks[0, :mask_len, :mask_len]
        )

        return {key: self.pool[key] for key in self._TEXT_KEYS}
# 核心推理函数(适配固定尺寸+batch_size=1)
def predict(
    ort_session,
    tokenizer: AutoTokenizer,
    memory_pool: HIPMemoryPool,
    image: np.ndarray,
    caption: str,
    box_threshold: float,
    text_threshold: float,
    device: str = "cpu",
    remove_combined: bool = False,
    is_benchmark: bool = False,
) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    """Run one inference (batch size 1) and return the filtered detections.

    Steps: normalise the caption, tokenize it, build the text attention mask
    and position ids, truncate to the pool's ``max_text_len``, copy all
    inputs into ``memory_pool``, run the ONNX session for the "logits" and
    "boxes" outputs, apply a sigmoid to the logits, keep rows whose maximum
    score exceeds ``box_threshold``, and decode a phrase per kept row.

    Args:
        ort_session: Session object exposing ``run(output_names, input_dict)``.
        tokenizer: Tokenizer used for encoding the caption and decoding phrases.
        memory_pool: Pre-allocated buffers the inputs are copied into.
        image: Image array accepted by ``memory_pool.update_img_buffer``.
        caption: Text prompt.
        box_threshold: Minimum per-row maximum score for a box to be kept.
        text_threshold: Per-token score threshold used to build each phrase.
        device: Device the tokenized tensors are moved to.
        remove_combined: If True, restrict each phrase to the span between the
            separator tokens (ids 101, 102, 1012) surrounding the best token.
        is_benchmark: If True, suppress timing and debug prints.

    Returns:
        ``(boxes, scores, phrases)``: the kept boxes, the maximum score of
        each kept row, and one decoded phrase per kept row.
    """
    # 1. Text preprocessing.
    caption = preprocess_caption(caption=caption)

    # 2. Encode the text.
    tokenized = tokenizer([caption], padding="longest", return_tensors="pt").to(device)
    special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])

    # 3. Build the attention mask and position ids.
    (
        text_self_attention_masks,
        position_ids,
        cate_to_token_mask_list,
    ) = generate_masks_with_special_tokens_and_transfer_map(
        tokenized, special_tokens, tokenizer
    )

    # 4. Truncate over-long text to the buffer length.
    max_text_len = memory_pool.max_text_len
    if text_self_attention_masks.shape[1] > max_text_len:
        text_self_attention_masks = text_self_attention_masks[
            :, :max_text_len, :max_text_len
        ]
        position_ids = position_ids[:, :max_text_len]
        for key in ("input_ids", "attention_mask", "token_type_ids"):
            tokenized[key] = tokenized[key][:, :max_text_len]

    # 5. Copy the inputs into the fixed-size pool.
    img_input = memory_pool.update_img_buffer(image)
    text_inputs = memory_pool.update_text_buffers(
        tokenized, position_ids, text_self_attention_masks
    )
    input_dict = {
        "img": img_input,
        "input_ids": text_inputs["input_ids"],
        "attention_mask": text_inputs["attention_mask"],
        "position_ids": text_inputs["position_ids"],
        "token_type_ids": text_inputs["token_type_ids"],
        "text_token_mask": text_inputs["text_token_mask"],
    }

    # 6. Run the model. perf_counter is monotonic and high resolution, which
    #    suits interval timing better than wall-clock time.time().
    t0 = time.perf_counter()
    outputs = ort_session.run(['logits', 'boxes'], input_dict)
    infer_time = time.perf_counter() - t0
    if not is_benchmark:
        print(f"Inference time: {infer_time:.3f}s")

    # 7. Post-process the predictions. sigmoid is elementwise, so applying it
    #    to the whole array equals applying it row by row, without the
    #    per-row Python call overhead of np.apply_along_axis.
    prediction_logits = sigmoid(outputs[0][0])
    prediction_boxes = outputs[1][0]

    if not is_benchmark:
        print("\n=== Debug Info ===")
        print(f"Prediction logits shape: {prediction_logits.shape}")
        print(f"Prediction boxes shape: {prediction_boxes.shape}")
        print(f"Max logit value: {np.max(prediction_logits):.4f}")
        print(f"Mean logit value: {np.mean(prediction_logits):.4f}")

    # 8. Keep rows whose best score exceeds the box threshold.
    max_values = np.max(prediction_logits, axis=1)
    mask = max_values > box_threshold
    logits = prediction_logits[mask]
    boxes = prediction_boxes[mask]

    # 9. Build the text label for every kept row.
    tokenized_caption = tokenizer(caption)
    if remove_combined:
        sep_idx = [
            i
            for i, token_id in enumerate(tokenized_caption['input_ids'])
            if token_id in [101, 102, 1012]
        ]
        phrases = []
        for logit in logits:
            max_idx = logit.argmax()
            insert_idx = bisect.bisect_left(sep_idx, max_idx)
            right_idx = sep_idx[insert_idx] if insert_idx < len(sep_idx) else len(logit)
            left_idx = sep_idx[insert_idx - 1] if insert_idx > 0 else 0
            phrases.append(
                get_phrases_from_posmap(
                    logit > text_threshold,
                    tokenized_caption,
                    tokenizer,
                    left_idx,
                    right_idx,
                ).replace('.', '')
            )
    else:
        phrases = [
            get_phrases_from_posmap(
                logit > text_threshold, tokenized_caption, tokenizer
            ).replace('.', '')
            for logit in logits
        ]

    return boxes, np.max(logits, axis=1), phrases
# 性能测试函数(适配batch_size=1)
def benchmark_performance(
    ort_session,
    tokenizer,
    memory_pool,
    image,
    caption,
    box_threshold,
    text_threshold,
    warmup_runs=5,
    test_runs=10,
    device="cpu",
    batch_size=1,
):
    """Time repeated ``predict`` calls and print and return latency statistics.

    ``warmup_runs`` untimed-for-statistics calls are made first, followed by
    ``test_runs`` measured calls. Each call goes through ``predict`` with
    ``is_benchmark=True``, so the measured time covers the whole ``predict``
    call. Intervals use ``time.perf_counter`` (monotonic, high resolution)
    so clock adjustments cannot distort the results.

    Args:
        ort_session, tokenizer, memory_pool, image, caption, box_threshold,
            text_threshold, device: Passed through to ``predict``.
        warmup_runs: Number of warm-up calls.
        test_runs: Number of measured calls; must be at least 1.
        batch_size: Only echoed back in the returned dict.

    Returns:
        Dict with "warmup_runs", "test_runs", "batch_size",
        "avg_infer_time_ms", "std_infer_time_ms", "max_infer_time_ms",
        "min_infer_time_ms" and "fps" (``test_runs`` divided by the total
        duration of the measured loop).

    Raises:
        ValueError: If ``test_runs`` is less than 1.
    """
    # Fail before the warm-up instead of in np.max() on an empty list after it.
    if test_runs < 1:
        raise ValueError("test_runs must be at least 1")

    def timed_predict():
        """Run one benchmark-mode predict call and return its duration in seconds."""
        start = time.perf_counter()
        predict(
            ort_session,
            tokenizer,
            memory_pool,
            image,
            caption,
            box_threshold,
            text_threshold,
            device,
            is_benchmark=True,
        )
        return time.perf_counter() - start

    banner = "=" * 60
    print(banner)
    print("📊 开始性能测试(固定800x1200,batch_size=1)")
    print(banner)

    # 1. Warm-up phase.
    print(f"\n🔥 预热阶段({warmup_runs}次)- 加载HIP模块")
    warmup_start = time.perf_counter()
    for i in range(warmup_runs):
        warmup_time = timed_predict()
        print(f"预热 {i+1}/{warmup_runs} - 耗时: {warmup_time*1000:.2f}ms")
    total_warmup_time = time.perf_counter() - warmup_start
    print(f"\n预热完成 - 总耗时: {total_warmup_time:.3f}s (HIP模块已加载完成)")

    # 2. Measured runs, one image per call.
    print(f"\n🚀 实际推理测试({test_runs}次,batch_size=1)")
    test_start = time.perf_counter()
    infer_times = []
    for i in range(test_runs):
        infer_time = timed_predict()
        infer_times.append(infer_time)
        print(f"实际推理 {i+1}/{test_runs} - 耗时: {infer_time*1000:.2f}ms")

    # Aggregate statistics.
    total_test_time = time.perf_counter() - test_start
    avg_infer_time = np.mean(infer_times)
    std_infer_time = np.std(infer_times)
    max_infer_time = np.max(infer_times)
    min_infer_time = np.min(infer_times)
    fps = test_runs / total_test_time

    # Report.
    print("\n" + banner)
    print("📈 优化后性能测试报告(固定800x1200)")
    print(banner)
    print(f"测试次数: {test_runs}次 (batch_size=1)")
    print(f"总推理耗时: {total_test_time:.3f}s")
    print(f"平均推理耗时: {avg_infer_time*1000:.2f}ms (±{std_infer_time*1000:.2f}ms)")
    print(f"最大推理耗时: {max_infer_time*1000:.2f}ms")
    print(f"最小推理耗时: {min_infer_time*1000:.2f}ms")
    print(f"平均FPS: {fps:.2f} 帧/秒")
    print(banner)

    return {
        "warmup_runs": warmup_runs,
        "test_runs": test_runs,
        "batch_size": batch_size,
        "avg_infer_time_ms": avg_infer_time * 1000,
        "std_infer_time_ms": std_infer_time * 1000,
        "max_infer_time_ms": max_infer_time * 1000,
        "min_infer_time_ms": min_infer_time * 1000,
        "fps": fps,
    }
if __name__ == '__main__':
    # Script entry point: load the ONNX model, benchmark it on one image,
    # run a final inference, and save the annotated image.

    # ========== Fixed configuration (800x1200, batch_size=1) ==========
    model_path = 'weights/ground.onnx'
    img_path = 'images/in/car_1.jpg'
    TEXT_PROMPT = "car ."
    BOX_TRESHOLD = 0.35
    TEXT_TRESHOLD = 0.25
    DEVICE = "cpu"  # change to "rocm" for real use
    WARMUP_RUNS = 5  # number of warm-up runs
    TEST_RUNS = 10  # number of measured runs
    BATCH_SIZE = 1  # fixed at 1
    IMG_SHAPE = (3, 800, 1200)  # fixed export shape (C, H, W)
    MAX_TEXT_LEN = 256

    # ========== ONNX Runtime session options ==========
    print("🔍 加载ONNX模型(固定800x1200,batch_size=1)")
    sess_options = ort.SessionOptions()
    # Enable all graph optimizations.
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    sess_options.enable_cpu_mem_arena = False
    sess_options.enable_mem_pattern = True
    sess_options.log_severity_level = 3

    # Execution providers: ROCm first, CPU as the second entry.
    # NOTE(review): 'cudnn_conv_algo_search' looks like a CUDA-provider
    # option; confirm the ROCm provider accepts it.
    providers = [
        (
            'ROCMExecutionProvider',
            {
                'device_id': 0,
                'arena_extend_strategy': 'kNextPowerOfTwo',
                'gpu_mem_limit': 8 * 1024 * 1024 * 1024,  # 8 GB of GPU memory
                'cudnn_conv_algo_search': 'EXHAUSTIVE',
                'do_copy_in_default_stream': True,
            },
        ),
        'CPUExecutionProvider',
    ]

    # ========== Load the model (once) ==========
    ort_session = ort.InferenceSession(
        model_path,
        sess_options=sess_options,
        providers=providers,
    )
    current_provider = ort_session.get_providers()
    print(f"✅ 模型加载完成 - 当前执行引擎: {current_provider}")

    # ========== Load the tokenizer (once) ==========
    print("\n📝 预加载BERT Tokenizer")
    t0 = time.perf_counter()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    print(f"✅ Tokenizer加载完成 - 耗时: {(time.perf_counter() - t0):.3f}s")

    # ========== Create the fixed-size buffer pool ==========
    print("\n🗃️ 初始化固定尺寸内存池(800x1200)")
    memory_pool = HIPMemoryPool(
        img_shape=IMG_SHAPE,
        max_text_len=MAX_TEXT_LEN,
        device=DEVICE,
    )
    print(f"✅ 内存池初始化完成 - 固定尺寸: {IMG_SHAPE}")

    # ========== Load the image and force it to the fixed shape ==========
    print("\n🖼️ 加载并预处理测试图片(强制800x1200)")
    image_source, image = load_image(img_path)
    # The resize below needs a numpy array: transpose(1, 2, 0) is the numpy
    # axis-permutation form and cv2.resize takes numpy input. Convert first
    # so the branch also works when load_image returns a tensor
    # (presumably a CPU torch tensor, as in GroundingDINO — confirm).
    image = np.asarray(image, dtype=np.float32)
    if tuple(image.shape) != IMG_SHAPE:
        print(f"⚠️ 图片尺寸 {image.shape} 不符,强制调整为 {IMG_SHAPE}")
        hwc = np.ascontiguousarray(image.transpose(1, 2, 0))
        # cv2.resize takes (width, height).
        resized = cv2.resize(hwc, (IMG_SHAPE[2], IMG_SHAPE[1]))
        image = resized.transpose(2, 0, 1)
    print(f"✅ 图片加载完成 - 最终尺寸: {image.shape}")

    # ========== Benchmark ==========
    performance_result = benchmark_performance(
        ort_session,
        tokenizer,
        memory_pool,
        image,
        TEXT_PROMPT,
        BOX_TRESHOLD,
        TEXT_TRESHOLD,
        WARMUP_RUNS,
        TEST_RUNS,
        DEVICE,
        BATCH_SIZE,
    )

    # ========== Final inference ==========
    print("\n" + "=" * 60)
    print("🎯 执行最终推理(固定800x1200)")
    print("=" * 60)
    boxes, confs, phrases = predict(
        ort_session,
        tokenizer,
        memory_pool,
        image,
        TEXT_PROMPT,
        BOX_TRESHOLD,
        TEXT_TRESHOLD,
        DEVICE,
    )

    # Draw and save the result.
    os.makedirs('./images/out', exist_ok=True)
    ori_img = cv2.imread(img_path)
    # Resize the original picture to the inference size before drawing.
    ori_img = cv2.resize(ori_img, (IMG_SHAPE[2], IMG_SHAPE[1]))
    img_h, img_w = ori_img.shape[:2]

    for one_box, one_conf, one_cls in zip(boxes, confs, phrases):
        # Convert (cx, cy, w, h) scaled by image size to pixel (x1, y1, x2, y2).
        x1 = int((one_box[0] - one_box[2] / 2) * img_w)
        y1 = int((one_box[1] - one_box[3] / 2) * img_h)
        x2 = int((one_box[0] + one_box[2] / 2) * img_w)
        y2 = int((one_box[1] + one_box[3] / 2) * img_h)

        # Draw the box and its label.
        cv2.rectangle(ori_img, (x1, y1), (x2, y2), (0, 0, 255), 2)
        cv2.putText(
            ori_img,
            f'{one_cls} {one_conf:.2f}',
            (x1 - 15, y1 - 15),
            cv2.FONT_HERSHEY_SIMPLEX,
            1.5,
            (255, 255, 255),
            3,
        )

    # Save the annotated image and print a summary.
    cv2.imwrite('./images/out/result_800x1200.jpg', ori_img)
    print(f"\n✅ 结果已保存至: ./images/out/result_800x1200.jpg")
    print(f"✅ 检测到目标: {phrases} (共{len(boxes)}个)")
    print(
        f"✅ 性能指标: FPS={performance_result['fps']:.2f}, "
        f"平均耗时={performance_result['avg_infer_time_ms']:.2f}ms"
    )
\ No newline at end of file
onnx_inference1_migraphx_test.py
deleted
100644 → 0
View file @
74fbd52c
"""Parse weights/ground_external.onnx with MIGraphX, apply a hand-picked list
of passes, compile for the "gpu" target and save weights/ground.mgx."""
import migraphx as mgx

# Parse only; no optimization happens at this point.
program = mgx.parse_onnx("weights/ground_external.onnx")

# Passes applied manually, in this order.
manual_passes = [
    mgx.pass_dead_code_elimination(),  # drop unused nodes / constants
    mgx.pass_eliminate_contiguous(),  # merge adjacent contiguous ops
    mgx.pass_simplify_reshapes(),  # merge / simplify reshapes
    mgx.pass_simplify_algebra(),  # simplify algebraic expressions (add/mul/..)
    mgx.pass_eliminate_identity(),  # remove Identity ops
    mgx.pass_common_subexpression_elimination(),  # CSE
]
program.apply_passes(manual_passes)

gpu_target = mgx.target("gpu")
program.compile(gpu_target)
program.save("weights/ground.mgx")
\ No newline at end of file
tmprllsblav/_remote_module_non_scriptable.py
deleted
100644 → 0
View file @
74fbd52c
from
typing
import
*
import
torch
import
torch.distributed.rpc
as
rpc
from
torch
import
Tensor
from
torch._jit_internal
import
Future
from
torch.distributed.rpc
import
RRef
from
typing
import
Tuple
# pyre-ignore: unused import
module_interface_cls
=
None
def forward_async(self, *args, **kwargs):
    """Start ``_remote_forward`` on the owner of ``self.module_rref`` and
    return the result of ``rpc.rpc_async`` without waiting on it.

    The remote call receives ``self.module_rref``, ``self.device`` and
    ``self.is_device_map_set`` ahead of the caller's positional arguments,
    plus a copy of the keyword arguments.
    """
    remote_args = (self.module_rref, self.device, self.is_device_map_set) + args
    remote_kwargs = dict(kwargs)
    owner = self.module_rref.owner()
    return rpc.rpc_async(owner, _remote_forward, remote_args, remote_kwargs)
def forward(self, *args, **kwargs):
    """Run ``_remote_forward`` on the owner of ``self.module_rref`` and wait
    for its result.

    Arguments are packed exactly as in the asynchronous variant: the module
    RRef, device and device-map flag come first, followed by the caller's
    positional arguments, with a copy of the keyword arguments.
    """
    remote_args = (self.module_rref, self.device, self.is_device_map_set) + args
    remote_kwargs = dict(kwargs)
    owner = self.module_rref.owner()
    pending = rpc.rpc_async(owner, _remote_forward, remote_args, remote_kwargs)
    return pending.wait()
_generated_methods
=
[
forward_async
,
forward
,
]
def _remote_forward(
        module_rref: RRef[module_interface_cls], device: str, is_device_map_set: bool, *args, **kwargs):
    """Call ``forward`` on the local module behind ``module_rref``.

    * Non-CUDA ``device``: arguments are forwarded unchanged.
    * CUDA ``device``: every Tensor in ``args`` / ``kwargs`` is moved to that
      device first. If ``is_device_map_set`` is false, every Tensor in the
      (iterated) result is moved back to CPU and the result is returned as a
      tuple.

    Explicit loops with tuple concatenation are kept instead of generator
    expressions because, per the original comments, TorchScript does not
    support generator expressions.
    """
    module = module_rref.local_value()
    device = torch.device(device)

    if device.type != "cuda":
        return module.forward(*args, **kwargs)

    # Module lives on a CUDA device: move Tensor positional arguments there.
    args = (*args,)
    out_args: Tuple[()] = ()
    for positional in args:
        if isinstance(positional, Tensor):
            moved = (positional.to(device),)
        else:
            moved = (positional,)
        out_args = out_args + moved

    # Same for Tensor keyword arguments (on a copy of the mapping).
    kwargs = {**kwargs}
    for key, value in kwargs.items():
        if isinstance(value, Tensor):
            kwargs[key] = kwargs[key].to(device)

    if is_device_map_set:
        return module.forward(*out_args, **kwargs)

    # Without a device map only CPU tensors may be sent over the wire, so any
    # Tensor in the output is moved back to CPU.
    ret: Tuple[()] = ()
    for item in module.forward(*out_args, **kwargs):
        if isinstance(item, Tensor):
            on_cpu = (item.cpu(),)
        else:
            on_cpu = (item,)
        ret = ret + on_cpu
    return ret
utils/utils.py
View file @
ca23112b
...
...
@@ -24,7 +24,8 @@ def preprocess_caption(caption: str):
def
load_image
(
image_path
:
str
):
transform
=
T
.
Compose
(
[
T
.
RandomResize
([
800
],
max_size
=
1333
),
# T.RandomResize([800], max_size=1333), # 800x1200输入大小
T
.
RandomResize
([
400
],
max_size
=
1333
),
# 400x600输入大小
T
.
ToTensor
(),
T
.
Normalize
([
0.485
,
0.456
,
0.406
],
[
0.229
,
0.224
,
0.225
]),
]
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment