"tests/benchmarks/vscode:/vscode.git/clone" did not exist on "86c390a91283fab2a29273103fc68804da2d3b76"
Commit a1865640 authored by zk

Add MIGraphX section

parent 0896d47e
To use a lower-resolution image input (e.g. 400x800) to further speed up inference, follow these steps:
1. Modify the export script
Edit `deform_ort/export_onnx_deform.py` and change the image size and export path:
```
img = torch.randn(1, 3, 400, 800).to(device)
onnx_output_path = "../weights_400x800/ground_deform.onnx"
```
2. Export and quantize as usual
```bash
cd deform_ort
python export_onnx_deform.py
python onnx_optimize.py
```
3. Change the inference preprocessing resolution
Edit the `load_image` function in `groundingdino/util/inference.py` and change the `RandomResize` parameter from 800 to 400:
```
T.RandomResize([400], max_size=1333),
```
4. Run ORT inference
Run the inference script, making sure the ONNX model path in the code points to the corresponding model file under `weights_400x800/`:
```bash
python onnx_inference_deform_optim.py
```
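Changing `RandomResize([800])` to `[400]` rescales the shorter image side; the resulting input size follows the DETR-style resize rule. A minimal sketch, assuming the transform behaves like the standard DETR implementation (`target_size` is a hypothetical helper, not part of this repo):

```python
# DETR-style resize rule (assumed): scale so the shorter side equals `size`,
# but if that would push the longer side past `max_size`, shrink `size` first.
def target_size(w, h, size, max_size):
    short, long_side = min(w, h), max(w, h)
    if size / short * long_side > max_size:
        size = int(round(max_size * short / long_side))
    scale = size / short
    return int(round(w * scale)), int(round(h * scale))

print(target_size(1200, 800, 800, 1333))  # 800-short-side setting -> (1200, 800)
print(target_size(1200, 800, 400, 1333))  # 400-short-side setting -> (600, 400)
```

This is why the 400 setting pairs with export sizes like 400x600 or 400x800 depending on the source aspect ratio.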
-----
## 7\. MIGraphX Inference
1. Enter the migraphx_infer folder
```bash
cd migraphx_infer
```
2. Run the ONNX conversion script
This converts the simplified ONNX model into the ONNX model used for MIGraphX inference:
```bash
bash migraphx_export.bash
```
3. If you already have the .mxr file, run the benchmark directly
```bash
bash migraphx_perf.bash
```
-----
## 8\. Benchmark Comparison
*All tests below include 5 warmup rounds and 10 timed rounds.*
> * **Model files**: stored under `../weights/` by default.
> * **Custom op directories**: the full shared-library path for each is `../[directory]/build/libms_deform_attn_ort.so`.
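The 5-warmup / 10-run protocol above can be sketched as a small timing harness (a hypothetical helper, not the repo's `benchmark_performance`):

```python
import time

def benchmark(fn, warmup=5, runs=10):
    # Warmup rounds are discarded: the first calls pay one-off costs
    # (graph optimization, kernel selection, caches).
    for _ in range(warmup):
        fn()
    times = []
    for _ in range(runs):
        t0 = time.perf_counter()
        fn()
        times.append(time.perf_counter() - t0)
    mean_s = sum(times) / len(times)
    return {"mean_ms": mean_s * 1000.0, "fps": 1.0 / mean_s, "runs": len(times)}

stats = benchmark(lambda: sum(range(10_000)))
print(stats["runs"], round(stats["mean_ms"], 3))
```

The latency and FPS columns in the tables below relate the same way: FPS = 1000 / mean latency in ms.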
### 8.1 ORT BW150 Results
Single BW150 card, 800x1200 image input, batch size = 1
| **ORT + Plugin** | +custom op<br>+FP16 pure quantization, scheme B | `ground_deform_fp16_all.onnx` | `ort_plugin_fp16_B` | 87.34 | 11.44 |
| **ORT + Plugin** | +custom op<br>+FP16 fully optimized, scheme C | `ground_deform_fp16_all.onnx` | `ort_plugin_fp16_C` | 84.52 | 11.82 |
### 8.2 ORT BW100 Results
Single BW100 card, 800x1200 image input, batch size = 1
| **ORT + Plugin** | +custom op<br>+FP16 pure quantization, scheme B | `ground_deform_fp16_all.onnx` | `ort_plugin_fp16_B` | 105.35 | 9.49 |
| **ORT + Plugin** | +custom op<br>+FP16 fully optimized, scheme C | `ground_deform_fp16_all.onnx` | `ort_plugin_fp16_C` | 100.91 | 9.90 |
### 8.3 MIGraphX BW100 Results
```
Batch size: 1
Rate: 6.05197 inferences/sec
Total time: 165.235ms (Min: 165.115ms, Max: 165.535ms, Mean: 165.258ms, Median: 165.225ms)
Percentiles (90%, 95%, 99%): (165.358ms, 165.358ms, 165.358ms)
Total instructions time: 205.275ms
Overhead time: 2.32812ms, -40.0399ms
Overhead: 1%, -24%
```
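As a sanity check, the reported rate is consistent with the reported per-inference time at batch size 1:

```python
# Figures copied from the perf output above.
total_ms = 165.235        # reported per-inference time
reported_rate = 6.05197   # reported inferences/sec

# At batch size 1, rate should equal 1000 / latency in ms.
derived_rate = 1000.0 / total_ms
print(round(derived_rate, 3))  # -> 6.052
```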
-----
## Reference Projects
This project referenced the following open-source projects during development:
- [**GroundingDINO**](https://github.com/IDEA-Research/GroundingDINO) - The official GroundingDINO repository, providing the base model and algorithm implementation.
- [**GroundingDINO-TensorRT-and-ONNX-Inference**](https://github.com/wingdzero/GroundingDINO-TensorRT-and-ONNX-Inference) - Reference implementations for TensorRT and ONNX inference deployment of GroundingDINO.
@@ -7,20 +7,35 @@ import onnxruntime as ort
import bisect
import time
import os
from typing import Tuple
import groundingdino.datasets.transforms as T
from PIL import Image
""" """
针对模型前后处理和代码结构进行优化 针对模型前后处理和代码结构进行优化
1.预测结果获取优化prediction_logits = sigmoid(outputs[0][0]) 1.预测结果获取优化prediction_logits = sigmoid(outputs[0][0])
2.输入数据提前获取直接传入,移除了对tokenizer的依赖 2.输入数据提前获取直接传入,移除了对tokenizer的依赖
""" """
from groundingdino.util.inference import load_image
so_options = ort.SessionOptions()
custom_op_lib_path = "../ort_plugin_fp16_C/build/libms_deform_attn_ort.so"
so_options.register_custom_ops_library(custom_op_lib_path)
# Enable ORT graph optimizations
so_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
def load_image(image_path: str) -> Tuple[np.ndarray, torch.Tensor]:
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            # T.RandomResize([400], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image_source = Image.open(image_path).convert("RGB")
    image = np.asarray(image_source)
    image_transformed, _ = transform(image_source, None)
    return image, image_transformed
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
@@ -180,7 +195,7 @@ def benchmark_performance(
if __name__ == '__main__':
    # Configuration
    model_path = '../weights/ground_deform_fp16_all.onnx'
""" """
../weights/ground_deform.onnx 普通版本 ../weights/ground_deform.onnx 普通版本
../weights/ground_deform_sim.onnx 简化版本 ../weights/ground_deform_sim.onnx 简化版本
@@ -264,6 +279,6 @@ if __name__ == '__main__':
    )

    # Save the result
    cv2.imwrite('../weights/result.jpg', ori_img)
    print(f"\n✅ Result saved to: ../weights/result.jpg")
    print(f"✅ Detected: {phrases} ({len(boxes)} total)")
@@ -7,6 +7,9 @@ import onnxruntime as ort
import bisect
import time
import os
from typing import Tuple
import groundingdino.datasets.transforms as T
from PIL import Image
""" """
针对模型前后处理和代码结构进行优化 针对模型前后处理和代码结构进行优化
1.预测结果获取优化prediction_logits = sigmoid(outputs[0][0]) 1.预测结果获取优化prediction_logits = sigmoid(outputs[0][0])
...@@ -14,14 +17,28 @@ import os ...@@ -14,14 +17,28 @@ import os
3.IO binding优化 3.IO binding优化
""" """
from groundingdino.util.inference import load_image
so_options = ort.SessionOptions()
# To capture a detailed ORT profile, uncomment the line below
# so_options.enable_profiling = True
custom_op_lib_path = "../ort_plugin_fp16_C/build/libms_deform_attn_ort.so"
so_options.register_custom_ops_library(custom_op_lib_path)
# Enable ORT graph optimizations
so_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
def load_image(image_path: str) -> Tuple[np.ndarray, torch.Tensor]:
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            # T.RandomResize([400], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image_source = Image.open(image_path).convert("RGB")
    image = np.asarray(image_source)
    image_transformed, _ = transform(image_source, None)
    return image, image_transformed
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
@@ -67,20 +84,17 @@ def predict(
    t0 = time.time()

    # 1. Bind only the image, which changes every frame; the text inputs are already bound
    img_tensor = np.expand_dims(np.asarray(image), axis=0)
    # Tried exporting the ONNX with fp16 inputs and feeding fp16 here, but inference got slower
    # img_tensor = np.expand_dims(np.asarray(image), axis=0).astype(np.float16)
    io_binding.bind_cpu_input('img', img_tensor)

    # 2. Run inference
    ort_session.run_with_iobinding(io_binding)

    # 3. Copy the results from GPU back to CPU
    ort_outputs = io_binding.copy_outputs_to_cpu()

    infer_time = time.time() - t0
    if not is_benchmark:
if not is_benchmark: if not is_benchmark:
@@ -204,7 +218,7 @@ if __name__ == '__main__':
if __name__ == '__main__':
    # Configuration
    model_path = '../weights_opt/ground_deform_opt_fp16_all.onnx'
    img_path = '../images/in/car_1.jpg'
    TEXT_PROMPT = "car ."
    BOX_TRESHOLD = 0.35
@@ -241,6 +255,9 @@ if __name__ == '__main__':
    for key in static_keys:
        io_binding.bind_cpu_input(key, TEXT_CACHE[key])

    io_binding.bind_output('logits')
    io_binding.bind_output('boxes')
    # Step 1: run the full performance test (warmup + timed inference)
    performance_result = benchmark_performance(
        ort_session, io_binding, image, TEXT_CACHE,
@@ -281,6 +298,6 @@ if __name__ == '__main__':
    )

    # Save the result
    cv2.imwrite('../images/out/result.jpg', ori_img)
    print(f"\n✅ Result saved to: ../images/out/result.jpg")
    print(f"✅ Detected: {phrases} ({len(boxes)} total)")
@@ -2,25 +2,26 @@ import onnx
from onnxsim import simplify
from onnxconverter_common import float16
onnx_model_path = "../weights/ground_deform.onnx"
sim_model_path = "../weights_opt/ground_deform_opt.onnx"
fp16_model_path = "../weights_opt/ground_deform_opt_fp16.onnx"
fp16_all_model_path = "../weights_opt/ground_deform_opt_fp16_all.onnx"
custom_op_lib_path = "../ort_plugin_fp16/build/libms_deform_attn_ort.so"
# ==========================================
# Step 1: ONNX Simplify (with the custom op library)
# ==========================================
# print("1️⃣ Running ONNX Simplify...")
# model = onnx.load(onnx_model_path)
# model_simp, check = simplify(model, custom_lib=custom_op_lib_path)
# if check:
#     onnx.save(model_simp, sim_model_path)
#     print(f"✅ Simplify done! Saved to {sim_model_path}")
# else:
#     print("❌ Simplify check failed!")
#     exit()
@@ -30,30 +31,28 @@ else:
# Reload the simplified model
model_to_fp16 = onnx.load(sim_model_path)
print("\n2️⃣ 正在进行 FP16 混合精度转换...")
original_cast_nodes = [node.name for node in model_to_fp16.graph.node if node.op_type == "Cast"] original_cast_nodes = [node.name for node in model_to_fp16.graph.node if node.op_type == "Cast"]
print(f"🔍 查找到 {len(original_cast_nodes)} 个原生 Cast 节点,已全部加入保护名单。") print(f"🔍 查找到 {len(original_cast_nodes)} 个原生 Cast 节点,已全部加入保护名单。")
print("\n2️⃣ 正在进行 FP16 混合精度转换...")
model_fp16 = float16.convert_float_to_float16(
    model_to_fp16,
    op_block_list=["ms_deform_attn"],  # skip the custom attention op (for the fp32 custom-op build)
    node_block_list=original_cast_nodes,  # protect all native Cast nodes
    keep_io_types=True  # keep the model's overall inputs/outputs in FP32
)
onnx.save(model_fp16, fp16_model_path)
print(f"✅ FP16 conversion done (custom op skipped)! Saved to {fp16_model_path}")
print("\n2️⃣ 正在进行纯 FP16 精度转换...")
print("\n2️⃣ 正在进行纯 FP16 精度转换...")
model_fp16_all = float16.convert_float_to_float16(
    model_to_fp16,
    node_block_list=original_cast_nodes,  # protect all native Cast nodes
    keep_io_types=True  # keep the model's overall inputs/outputs in FP32
)
onnx.save(model_fp16_all, fp16_all_model_path)
print(f"✅ FP16 conversion done! Saved to {fp16_all_model_path}")
import json
import sys
from collections import defaultdict

def analyze_profile(json_path):
    print(f"🔍 Parsing profile file: {json_path}\n")
    with open(json_path, 'r') as f:
        data = json.load(f)

    # Handle both possible JSON root formats
    events = data if isinstance(data, list) else data.get('traceEvents', [])

    # Total time per op type (e.g. MatMul, Conv)
    op_type_times = defaultdict(float)
    # Total time per concrete node (e.g. /transformer/encoder/MatMul_1)
    node_name_times = defaultdict(float)
    total_inference_time = 0.0

    for event in events:
        # Only count events that carry a duration (dur) and args
        if 'dur' in event and 'args' in event:
            args = event['args']
            # ORT usually records the op type in args['op_name']
            if 'op_name' in args:
                op_type = args['op_name']
                # event['name'] usually holds the full node path
                node_name = event.get('name', 'Unknown_Node')
                # dur in the JSON is in microseconds; convert to milliseconds
                dur_ms = event['dur'] / 1000.0

                op_type_times[op_type] += dur_ms
                node_name_times[node_name] += dur_ms
                total_inference_time += dur_ms

    # Sort both tallies in descending order
    sorted_op_types = sorted(op_type_times.items(), key=lambda x: x[1], reverse=True)
    sorted_nodes = sorted(node_name_times.items(), key=lambda x: x[1], reverse=True)

    print("="*50)
    print("🏆 Top 10 by total time per op type (OpType)")
    print("="*50)
    for i, (op, time_ms) in enumerate(sorted_op_types[:10]):
        percentage = (time_ms / total_inference_time) * 100 if total_inference_time > 0 else 0
        print(f"{i+1:2d}. {op:<20} | time: {time_ms:>8.3f} ms | share: {percentage:>5.2f}%")

    print("\n" + "="*50)
    print("🎯 Top 15 individual nodes by time")
    print("="*50)
    for i, (node, time_ms) in enumerate(sorted_nodes[:15]):
        percentage = (time_ms / total_inference_time) * 100 if total_inference_time > 0 else 0
        print(f"{i+1:2d}. time: {time_ms:>8.3f} ms ({percentage:>5.2f}%) | node: {node}")

if __name__ == "__main__":
    # Replace this with the JSON file you just generated
    profile_file = "./onnxruntime_profile__2026-04-27_13-58-17.json"
    if len(sys.argv) > 1:
        profile_file = sys.argv[1]
    analyze_profile(profile_file)
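The aggregation at the heart of `analyze_profile` can be exercised on synthetic trace events. The event shape below mirrors ORT's `traceEvents` entries, but the names and durations are made up:

```python
from collections import defaultdict

# Synthetic trace events; dur is in microseconds, as in the real profile JSON.
events = [
    {"name": "/enc/MatMul_1", "dur": 1500, "args": {"op_name": "MatMul"}},
    {"name": "/enc/MatMul_2", "dur": 500, "args": {"op_name": "MatMul"}},
    {"name": "/enc/Conv_1", "dur": 1000, "args": {"op_name": "Conv"}},
    {"name": "no_args_event", "dur": 99},  # skipped: no args/op_name
]

op_type_ms = defaultdict(float)
for ev in events:
    if "dur" in ev and "args" in ev and "op_name" in ev["args"]:
        op_type_ms[ev["args"]["op_name"]] += ev["dur"] / 1000.0  # µs -> ms

ranked = sorted(op_type_ms.items(), key=lambda x: x[1], reverse=True)
print(ranked)  # MatMul ranks first with 2.0 ms total
```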
deform_ort/result.jpg (binary image, 1.35 MB)
@@ -39,8 +39,8 @@ def load_model(model_config_path: str, model_checkpoint_path: str, device: str =
def load_image(image_path: str) -> Tuple[np.ndarray, torch.Tensor]:
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            # T.RandomResize([400], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
...
images/out/result.jpg (binary image, 1.35 MB)
export MIGRAPHX_ENABLE_MIOPEN_CONCAT=1
migraphx-driver perf --onnx \
../weights/ground_opt.onnx \
--fp16 \
--output \
../weights/ground_opt.mxr
MIGRAPHX_LOG=debug migraphx-driver compile \
--onnx weights/ground_external.onnx \
--gpu \
-p dead_code_elimination \
--output weights/ground.mgx
# -p eliminate_contiguous \
# -p simplify_reshapes \
# -p simplify_algebra \
# -p eliminate_identity \
# -p common_subexpression_elimination \
@@ -57,7 +57,7 @@ def _mgx_shape_to_numpy(shape):
# =========================
# 🚀 MIGraphX inference class (with caching)
# =========================
class MIGraphXModel:
    def __init__(self, onnx_path, cache_path="weights/ground_opt.mxr", force_recompile=False):
        self.cache_path = cache_path

        # ====== Prefer loading the cached model ======
@@ -228,10 +228,10 @@ def benchmark(model, tokenizer, image, caption, box_th, text_th, warmup=5, runs=
# =========================
if __name__ == "__main__":
    model_path = "../weights/ground_opt.onnx"
    cache_path = "../weights/ground_opt.mxr"  # ⭐ cache file
    img_path = "../images/in/car_1.jpg"
    TEXT_PROMPT = "car ."
    BOX_TRESHOLD = 0.35
...
import cv2
import numpy as np
import time
import os
import migraphx
from typing import Tuple
import torch
import groundingdino.datasets.transforms as T
from PIL import Image
def load_image(image_path: str) -> Tuple[np.ndarray, torch.Tensor]:
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            # T.RandomResize([400], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    image_source = Image.open(image_path).convert("RGB")
    image = np.asarray(image_source)
    image_transformed, _ = transform(image_source, None)
    return image, image_transformed
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
def _mgx_shape_to_numpy(shape):
    shape_str = str(shape)
    if "int64_type" in shape_str:
        dtype = np.int64
    elif "bool_type" in shape_str:
        dtype = np.bool_
    elif "half_type" in shape_str:
        dtype = np.float16
    else:
        dtype = np.float32
    try:
        dims = list(shape.dims())
    except Exception:
        dims = []
    try:
        lens = list(shape.lens())
    except Exception:
        lens = []
    return dtype, (dims if len(dims) > 0 else lens)
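`_mgx_shape_to_numpy` keys off the shape's string form. That mapping can be checked in isolation with a stub shape object (`FakeShape` is hypothetical, standing in for a real migraphx shape, and `shape_to_numpy` is a trimmed re-implementation of the dtype branch above):

```python
import numpy as np

class FakeShape:
    """Stub mimicking a migraphx shape: stringifies to its type description."""
    def __init__(self, type_str, lens):
        self._s, self._lens = type_str, lens
    def __str__(self):
        return self._s
    def lens(self):
        return self._lens

def shape_to_numpy(shape):
    # Same string-matching fallback chain as _mgx_shape_to_numpy
    s = str(shape)
    if "int64_type" in s:
        dtype = np.int64
    elif "bool_type" in s:
        dtype = np.bool_
    elif "half_type" in s:
        dtype = np.float16
    else:
        dtype = np.float32
    return dtype, list(shape.lens())

dtype, lens = shape_to_numpy(FakeShape("half_type, {1, 3, 800, 1200}", [1, 3, 800, 1200]))
print(dtype, lens)
```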
# =========================
# 🚀 MIGraphX inference class (with caching and lifetime management)
# =========================
class MIGraphXModel:
    def __init__(self, onnx_path, cache_path="weights/ground_opt.mxr", force_recompile=False, device_id=0):
        self.cache_path = cache_path
        if os.path.exists(cache_path) and not force_recompile:
            print(f"⚡ Loading precompiled model: {cache_path}")
            self.model = migraphx.load(cache_path)
        else:
            print("🔍 Building MIGraphX from ONNX")
            self.model = migraphx.parse_onnx(onnx_path)
            print(f"⚙️ Compiling MIGraphX (GPU {device_id})")
            self.model.compile(t=migraphx.get_target("gpu"), device_id=device_id)
            print(f"💾 Saving compiled model to: {cache_path}")
            migraphx.save(self.model, cache_path)
        self.input_shapes = self.model.get_inputs()
    def infer(self, input_dict):
        mgx_inputs = {}
        # [Key fix] Keep the NumPy arrays alive so Python garbage collection
        # does not invalidate the underlying pointers
        self._keep_alive_cache = {}

        provided_names = set(input_dict.keys())
        required_names = {
            k for k in self.input_shapes.keys()
            if not str(k).startswith("main:#output")
        }

        for name in required_names:
            shape = self.input_shapes[name]
            target_dtype, lens = _mgx_shape_to_numpy(shape)

            if name in provided_names:
                # 1. Must be contiguous memory: arrays converted from PyTorch may have mismatched strides
                arr = np.ascontiguousarray(input_dict[name])
                # 2. Force the dtype
                if arr.dtype != target_dtype:
                    arr = arr.astype(target_dtype)
            else:
                # Fill missing inputs with zeros
                arr = np.zeros(lens, dtype=target_dtype)

            # 3. Stash the array in the dict to keep it alive
            self._keep_alive_cache[name] = arr
            # 4. Safely hand the pointer over to migraphx
            mgx_inputs[name] = migraphx.argument(arr)

        start = time.time()
        result = self.model.run(mgx_inputs)
        infer_time = time.time() - start

        outputs = [np.array(r) for r in result]
        # Inference finished; release the memory
        self._keep_alive_cache.clear()
        return outputs, infer_time
# =========================
# Inference function (hard-coded inputs, no tokenizer)
# =========================
def predict(model, image, box_threshold, is_benchmark=False):
    input_dict = {
        "img": np.expand_dims(np.asarray(image), axis=0),
        "position_ids": np.array([[0, 0, 1, 0]]),
        "input_ids": np.array([[101, 2482, 1012, 102]]),
        "token_type_ids": np.array([[0, 0, 0, 0]]),
        "text_token_mask": np.array([[
            [True, False, False, False],
            [False, True, True, False],
            [False, True, True, False],
            [False, False, False, True]
        ]]),
        "attention_mask": np.array([[True, True, True, True]])
    }

    outputs, infer_time = model.infer(input_dict)
    if not is_benchmark:
        print(f"Inference time: {infer_time*1000:.2f} ms")

    logits = sigmoid(outputs[0][0])
    boxes = outputs[1][0]

    max_values = np.max(logits, axis=1)
    mask = max_values > box_threshold
    logits = logits[mask]
    boxes = boxes[mask]
    phrases = ["car"] * len(boxes)
    return boxes, np.max(logits, axis=1), phrases
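The box-filtering step in `predict` (sigmoid, per-query max, threshold) can be checked in isolation on toy arrays; the shapes here are illustrative, not the real model's output sizes:

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# Toy logits: 2 queries x 3 text tokens; toy boxes: one (cx, cy, w, h) per query.
raw = np.array([[2.0, -1.0, 0.0],
                [-3.0, -2.0, -1.0]])
boxes = np.array([[0.5, 0.5, 0.2, 0.2],
                  [0.1, 0.1, 0.05, 0.05]])

logits = sigmoid(raw)
max_values = logits.max(axis=1)  # best token score per query
mask = max_values > 0.35         # BOX_TRESHOLD
print(boxes[mask].shape)         # only the first query clears the threshold
```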
# =========================
# Benchmark
# =========================
def benchmark(model, image, box_th, warmup=5, runs=10):
    print("\n🔥 Warmup")
    for _ in range(warmup):
        predict(model, image, box_th, True)

    print("\n🚀 Timed runs")
    times = []
    for i in range(runs):
        start = time.time()
        predict(model, image, box_th, True)
        times.append(time.time() - start)

    print(f"\nMean latency: {np.mean(times)*1000:.2f} ms")
    print(f"FPS: {1/np.mean(times):.2f}")
# =========================
# Main
# =========================
# if __name__ == "__main__":
#     model_path = "../weights/ground_opt.onnx"
#     cache_path = "../weights/ground_opt.mxr"
#     img_path = "../images/in/car_1.jpg"
#     BOX_TRESHOLD = 0.35
#     DEVICE_ID = 5  # matches the device: 5 / 0 case from the earlier error stack; adjust as needed

#     model = MIGraphXModel(
#         model_path,
#         cache_path=cache_path,
#         force_recompile=False,
#         device_id=DEVICE_ID
#     )

#     image_source, image = load_image(img_path)
#     benchmark(model, image, BOX_TRESHOLD)
#     boxes, confs, phrases = predict(model, image, BOX_TRESHOLD)
#     print("Detections:", phrases)
def test_like_perf(model):
    print("\n" + "="*60)
    print("🛠️ Mimicking the perf tool: testing with exactly aligned dummy data")
    print("="*60)

    mgx_inputs = {}
    keep_alive_cache = []  # keep-alive pool

    # 1. Fabricate data strictly matching the shapes the model requires
    for name, shape in model.get_inputs().items():
        if str(name).startswith("main:#output"):
            continue

        # Resolve the actual required dtype and shape
        target_dtype, lens = _mgx_shape_to_numpy(shape)
        print(f" 📦 Allocating {name}: shape={lens}, dtype={target_dtype.__name__}")

        # All-zeros arrays with exactly matching shapes (mirroring migraphx-driver)
        dummy_data = np.zeros(lens, dtype=target_dtype)
        keep_alive_cache.append(dummy_data)

        # Hand over the pointer
        mgx_inputs[name] = migraphx.argument(dummy_data)

    print("\n🚀 Starting dummy inference test...")
    try:
        start = time.time()
        model.run(mgx_inputs)
        print(f"✅ Dummy inference from Python succeeded, no VMFault! Took: {(time.time()-start)*1000:.2f}ms")
    except Exception as e:
        print(f"❌ Still failing: {e}")
# ------------------
# Call it from the main block like this:
# ------------------
if __name__ == "__main__":
    model_path = "../weights/ground_opt.onnx"
    cache_path = "../weights/ground_opt.mxr"

    model = migraphx.load(cache_path)  # load the known-good mxr directly

    # Run the simulated test
    test_like_perf(model)
migraphx-driver perf --batch 1 \
-n 10 \
--fp16 \
--migraphx ../weights/ground_opt.mxr
import onnx
from onnxsim import simplify
from onnxconverter_common import float16
onnx_model_path = "./weights/ground.onnx"
sim_model_path = "./weights/ground_sim.onnx"
print("1️⃣ 正在进行 ONNX Simplify...")
model = onnx.load(onnx_model_path)
model_simp, check = simplify(model)
if check:
onnx.save(model_simp, sim_model_path)
print(f"✅ Simplify 完成!已保存至 {sim_model_path}")
else:
print("❌ Simplify 验证失败!")
exit()