zk / GroundingDINO-DCU-Optimized · Commits

Commit f1a225f3, authored Apr 16, 2026 by zk
Commit message: update
Parent: 552b62f3
Showing 10 changed files with 3 additions and 672 deletions (+3 / -672).
- .gitignore (+1 / -0)
- export_onnx.py (+2 / -2)
- export_onnx_migraphx_debug.py (+0 / -66)
- migraphx_infer/migraphx_infer.bash (+0 / -0)
- migraphx_infer/migraphx_infer.py (+0 / -0)
- onnx_inference1_migraphx_xiongke.py (+0 / -270)
- onnx_inference_test.py (+0 / -322)
- resnet/profile.json_2026-04-03_11-42-27.json (+0 / -4)
- resnet/profile.json_2026-04-03_11-43-38.json (+0 / -4)
- resnet/profile.json_2026-04-03_11-44-14.json (+0 / -4)
.gitignore

```diff
@@ -158,6 +158,7 @@ tmp/
 xiongke_log.txt
 migraphx_log.txt
 weights/
+weights_400x600/
 checkpoints/
 # Ignore environment installation packages
```
export_onnx.py

```diff
@@ -54,8 +54,8 @@ torch.onnx.export(
     output_names=["logits", "boxes"],
     dynamic_axes=None,  # export with static dimensions
     opset_version=17,
-    verbose=False  # verbose logging off; set to True for debugging
-    # do_constant_folding=True  # constant folding, improves later simplification
+    verbose=False,  # verbose logging off; set to True for debugging
+    do_constant_folding=True  # constant folding, improves later simplification
 )
 print(f"ONNX model successfully exported to: {onnx_output_path}")
```
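The only functional change here is enabling do_constant_folding, which the comment ties to better simplification. A minimal sketch, assuming the export path used elsewhere in this repository, of checking and simplifying the exported graph with onnx and onnxsim (both are already imported by export_onnx_migraphx_debug.py below); the simplified output filename is a hypothetical choice:

```python
# Post-export check and simplification sketch; paths are assumptions.
import onnx
from onnxsim import simplify

onnx_output_path = "weights/ground.onnx"             # path used by the export scripts in this repo
simplified_path = "weights/ground_simplified.onnx"   # hypothetical output name

model = onnx.load(onnx_output_path)
onnx.checker.check_model(model)                      # structural validation of the exported graph

# onnx-simplifier benefits from the constant folding enabled in this commit
model_simplified, ok = simplify(model)
assert ok, "onnx-simplifier could not validate the simplified graph"
onnx.save(model_simplified, simplified_path)
print(f"Simplified model written to {simplified_path}")
```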
export_onnx_migraphx_debug.py (deleted, 100644 → 0)

```python
import torch
import onnx
from onnxsim import simplify
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict

config_file = './groundingdino/config/GroundingDINO_SwinB_cfg.py'
checkpoint_path = './weights/groundingdino_swinb_cogcoor.pth'


def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    args = SLConfig.fromfile(model_config_path)
    args.device = "cuda" if not cpu_only else "cpu"
    # modified config
    args.use_checkpoint = False
    args.use_transformer_ckpt = False
    model = build_model(args)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    _ = model.eval()
    return model


# Load the model
model = load_model(config_file, checkpoint_path, cpu_only=True)

# Prompt used for real inference, plus the corresponding masks
caption = "car ."
input_ids = model.tokenizer([caption], return_tensors="pt")["input_ids"]
position_ids = torch.tensor([[0, 0, 1, 0]])
token_type_ids = torch.tensor([[0, 0, 0, 0]])
attention_mask = torch.tensor([[True, True, True, True]])
text_token_mask = torch.tensor([[[True, False, False, False],
                                 [False, True, True, False],
                                 [False, True, True, False],
                                 [False, False, False, True]]])

# Fixed input resolution
img = torch.randn(1, 3, 800, 1200)

# The ONNX model can support dynamic inputs; comment this out when converting to an engine
dynamic_axes = {
    "input_ids": {0: "batch_size", 1: "seq_len"},
    "attention_mask": {0: "batch_size", 1: "seq_len"},
    "position_ids": {0: "batch_size", 1: "seq_len"},
    "token_type_ids": {0: "batch_size", 1: "seq_len"},
    "text_token_mask": {0: "batch_size", 1: "seq_len", 2: "seq_len"},
    "img": {0: "batch_size", 2: "height", 3: "width"},
    "logits": {0: "batch_size"},
    "boxes": {0: "batch_size"}
}

# Export the raw ONNX model
onnx_output_path = "weights/ground.onnx"
torch.onnx.export(
    model,
    f=onnx_output_path,
    args=(img, input_ids, attention_mask, position_ids, token_type_ids, text_token_mask),
    input_names=["img", "input_ids", "attention_mask", "position_ids", "token_type_ids", "text_token_mask"],
    output_names=["logits", "boxes"],
    opset_version=17,
    verbose=False,  # verbose logging off; set to True for debugging
    do_constant_folding=True  # constant folding, improves later simplification
)
```
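Note that the deleted debug script defines dynamic_axes but never passes it to torch.onnx.export, so the graph above is exported with static shapes. A minimal sketch of the dynamic-shape variant, reusing the names defined above (the output path is hypothetical); the script's own comment recommends static shapes when converting to an engine:

```python
# Sketch only: same export call, but with the dynamic_axes dict actually applied.
torch.onnx.export(
    model,
    f="weights/ground_dynamic.onnx",  # hypothetical output path
    args=(img, input_ids, attention_mask, position_ids, token_type_ids, text_token_mask),
    input_names=["img", "input_ids", "attention_mask", "position_ids",
                 "token_type_ids", "text_token_mask"],
    output_names=["logits", "boxes"],
    dynamic_axes=dynamic_axes,        # the dict defined above
    opset_version=17,
    do_constant_folding=True
)
```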
onnx_inference1_migraphx.bash → migraphx_infer/migraphx_infer.bash (file moved)

onnx_inference1_migraphx.py → migraphx_infer/migraphx_infer.py (file moved)
onnx_inference1_migraphx_xiongke.py (deleted, 100644 → 0)

```python
import cv2
import numpy as np
import torch
import time
import os

os.environ["MIGRAPHX_SAVE_TEMPS"] = "1"
os.environ["MIGRAPHX_TRACE"] = "1"
os.environ["MIGRAPHX_LOG_LEVEL"] = "DEBUG"

import migraphx
from transformers import BertTokenizer
from groundingdino.util.inference import load_image
from groundingdino.models.GroundingDINO.bertwarper import generate_masks_with_special_tokens_and_transfer_map


# =========================
# Utility functions
# =========================
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def preprocess_caption(caption: str) -> str:
    result = caption.lower().strip()
    if result.endswith("."):
        return result
    return result + "."


def to_mgx(x):
    if x.dtype == np.int64:
        return migraphx.argument(x.astype(np.int64))
    elif x.dtype == np.bool_:
        return migraphx.argument(x.astype(np.bool_))
    else:
        return migraphx.argument(x.astype(np.float32))


def _mgx_shape_to_numpy(shape):
    """Convert a migraphx shape to a numpy dtype and lens."""
    shape_str = str(shape)
    if "int64_type" in shape_str:
        dtype = np.int64
    elif "bool_type" in shape_str:
        dtype = np.bool_
    elif "half_type" in shape_str:
        dtype = np.float16
    else:
        dtype = np.float32
    return dtype, list(shape.lens())


# =========================
# 🚀 MIGraphX inference class (with caching)
# =========================
class MIGraphXModel:
    def __init__(self, onnx_path, cache_path="weights/ground_xiongke.mxr", force_recompile=False):
        self.cache_path = cache_path
        # ====== Load the compiled cache first if it exists ======
        if os.path.exists(cache_path) and not force_recompile:
            print(f"⚡ Loading precompiled model: {cache_path}")
            self.model = migraphx.load(cache_path)
        else:
            print("🔍 Building MIGraphX from ONNX")
            self.model = migraphx.parse_onnx(onnx_path)
            print(self.model)

            # ====================== 2. Print model input/output info ======================
            print("=== Model inputs ===")
            inputs = self.model.get_inputs()
            for key, value in inputs.items():
                print(f"{key}: {value}")
            print("\n=== Model outputs ===")
            outputs = self.model.get_outputs()
            for key, value in outputs.items():
                print(f"{key}: {value}")

            # Get input node names and shapes
            inputName = list(self.model.get_inputs().keys())[0]
            inputShape = inputs[inputName].lens()
            print(f"\nInput node name: {inputName}")
            print(f"Input shape (N, C, H, W): {inputShape}")
            inputName1 = list(self.model.get_inputs().keys())[1]
            inputShape1 = inputs[inputName1].lens()
            print(f"\nInput node name: {inputName1}")
            print(f"Input shape (N, C, H, W): {inputShape1}")
            """
            === Model inputs ===
            text_token_mask: bool_type, {1, 4, 4}, {16, 4, 1}
            token_type_ids: int64_type, {1, 4}, {4, 1}
            position_ids: int64_type, {1, 4}, {4, 1}
            attention_mask: bool_type, {1, 4}, {4, 1}
            input_ids: int64_type, {1, 4}, {4, 1}
            img: float_type, {1, 3, 800, 1200}, {2880000, 960000, 1200, 1}
            === Model outputs ===
            boxes: float_type, {1, 900, 4}, {3600, 4, 1}
            logits: float_type, {1, 900, 256}, {230400, 256, 1}
            Input node name: text_token_mask
            Input shape (N, C, H, W): [1, 4, 4]
            """

            # print("\n⚡ Quantize the model (FP16)")
            # migraphx.quantize_fp16(self.model)
            # passes = [
            #     migraphx.pass_dead_code_elimination(),  # remove unused nodes/constants
            #     migraphx.pass_eliminate_contiguous(),  # merge adjacent contiguous ops
            #     migraphx.pass_simplify_reshapes(),  # merge/simplify reshapes
            #     migraphx.pass_simplify_algebra(),  # simplify algebraic expressions (add/mul/..)
            #     migraphx.pass_eliminate_identity(),  # remove Identity ops
            #     migraphx.pass_common_subexpression_elimination(),  # CSE
            # ]
            # self.model.apply_passes(passes)

            print("⚙️ Compiling MIGraphX (GPU)")
            self.model.compile(t=migraphx.get_target("gpu"), device_id=5)
            # offload_copy=False, fast_math=False, exhaustive_tune=False

            # ====== Save the cache ======
            print(f"💾 Saving compiled model to: {cache_path}")
            migraphx.save(self.model, cache_path)

        self.param_names = self.model.get_parameter_names()
        self.input_shapes = self.model.get_inputs()
        print("✅ Input nodes:", self.param_names)

    def infer(self, input_dict):
        mgx_inputs = {k: to_mgx(v) for k, v in input_dict.items()}
        # Some .mxr files produced with disabled passes expose extra internal alias
        # parameters (e.g. main:#output_*). If they are missing, running the model
        # may trigger a VMFault, so fill them with zero buffers based on their shapes.
        auto_filled = []
        for name in self.param_names:
            if name in mgx_inputs:
                continue
            if name not in self.input_shapes:
                continue
            dtype, lens = _mgx_shape_to_numpy(self.input_shapes[name])
            mgx_inputs[name] = to_mgx(np.zeros(lens, dtype=dtype))
            auto_filled.append((name, lens, dtype.__name__))
        if auto_filled:
            print("⚠️ Auto-filled internal input parameters:")
            for item in auto_filled:
                print(f"  - {item[0]} shape={item[1]} dtype={item[2]}")

        start = time.time()
        result = self.model.run(mgx_inputs)
        infer_time = time.time() - start
        outputs = [np.array(r) for r in result]
        return outputs, infer_time


# =========================
# Inference function
# =========================
def predict(model, tokenizer, image, caption, box_threshold, text_threshold, is_benchmark=False):
    caption = preprocess_caption(caption)
    captions = [caption]
    tokenized = tokenizer(captions, padding="longest", return_tensors="pt")
    specical_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])
    (text_self_attention_masks, position_ids, _) = generate_masks_with_special_tokens_and_transfer_map(
        tokenized, specical_tokens, tokenizer)

    max_text_len = 256
    if text_self_attention_masks.shape[1] > max_text_len:
        text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len]
        position_ids = position_ids[:, :max_text_len]
        tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len]
        tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len]
        tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len]

    input_dict = {
        "img": np.expand_dims(np.asarray(image), axis=0).astype(np.float32),
        "input_ids": np.asarray(tokenized["input_ids"]).astype(np.int64),
        "attention_mask": np.asarray(tokenized["attention_mask"]).astype(np.bool_),
        "position_ids": np.asarray(position_ids).astype(np.int64),
        "token_type_ids": np.asarray(tokenized["token_type_ids"]).astype(np.int64),
        "text_token_mask": np.asarray(text_self_attention_masks).astype(np.bool_)
    }

    outputs, infer_time = model.infer(input_dict)
    if not is_benchmark:
        print(f"Inference time: {infer_time * 1000:.2f} ms")

    logits = sigmoid(outputs[0][0])
    boxes = outputs[1][0]

    max_values = np.max(logits, axis=1)
    mask = max_values > box_threshold
    logits = logits[mask]
    boxes = boxes[mask]
    phrases = ["object"] * len(boxes)
    return boxes, np.max(logits, axis=1), phrases


# =========================
# Benchmark
# =========================
def benchmark(model, tokenizer, image, caption, box_th, text_th, warmup=5, runs=10):
    print("\n🔥 Warm-up")
    for _ in range(warmup):
        predict(model, tokenizer, image, caption, box_th, text_th, True)
    print("\n🚀 Benchmark")
    times = []
    for i in range(runs):
        start = time.time()
        predict(model, tokenizer, image, caption, box_th, text_th, True)
        times.append(time.time() - start)
    print(f"\nAverage latency: {np.mean(times) * 1000:.2f} ms")
    print(f"FPS: {1 / np.mean(times):.2f}")


# =========================
# Main
# =========================
if __name__ == "__main__":
    # model_path = "weights/ground.onnx"
    model_path = "weights/ground_fixed.onnx"
    cache_path = "weights/ground_xiongke.mxr"  # ⭐ cache file
    img_path = "images/in/car_1.jpg"
    TEXT_PROMPT = "car ."
    BOX_TRESHOLD = 0.35
    TEXT_TRESHOLD = 0.25

    # 🚀 Load the model (cached automatically)
    model = MIGraphXModel(
        model_path,
        cache_path=cache_path,
        force_recompile=False  # set to True to force recompilation
    )

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    image_source, image = load_image(img_path)

    benchmark(model, tokenizer, image, TEXT_PROMPT, BOX_TRESHOLD, TEXT_TRESHOLD)
    boxes, confs, phrases = predict(model, tokenizer, image, TEXT_PROMPT, BOX_TRESHOLD, TEXT_TRESHOLD)
    print("Detection results:", phrases)
```
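predict() above returns normalized (cx, cy, w, h) boxes and generic "object" phrases. A minimal sketch of converting them to pixel-space rectangles for visualization, mirroring the drawing loop in onnx_inference_test.py below (the output filename is hypothetical):

```python
# Sketch: draw the MIGraphX detections on the original image, assuming the
# names (img_path, boxes, confs, phrases) from the main block above.
import cv2

ori_img = cv2.imread(img_path)
img_h, img_w = ori_img.shape[:2]
for (cx, cy, w, h), conf, cls in zip(boxes, confs, phrases):
    # convert normalized center/size to pixel corner coordinates
    x1 = int((cx - w / 2) * img_w)
    y1 = int((cy - h / 2) * img_h)
    x2 = int((cx + w / 2) * img_w)
    y2 = int((cy + h / 2) * img_h)
    cv2.rectangle(ori_img, (x1, y1), (x2, y2), (0, 0, 255), 2)
    cv2.putText(ori_img, f"{cls} {conf:.2f}", (x1, max(y1 - 10, 0)),
                cv2.FONT_HERSHEY_SIMPLEX, 1.0, (255, 255, 255), 2)
cv2.imwrite("images/out/result_migraphx.jpg", ori_img)  # hypothetical output path
```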
onnx_inference_test.py (deleted, 100644 → 0)

```python
from typing import Tuple, List, Dict
import cv2
import numpy as np
import torch
import onnxruntime as ort
from transformers import BertTokenizer, AutoTokenizer
import bisect
import time
from groundingdino.util.inference import load_image
from groundingdino.models.GroundingDINO.bertwarper import generate_masks_with_special_tokens_and_transfer_map

# Adds inference latency and related metrics


def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def get_phrases_from_posmap(posmap: np.ndarray, tokenized: Dict, tokenizer: AutoTokenizer,
                            left_idx: int = 0, right_idx: int = 255):
    assert isinstance(posmap, np.ndarray), "posmap must be np.ndarray"
    if posmap.ndim == 1:
        # Zero out the elements in the given range
        posmap[:left_idx + 1] = False
        posmap[right_idx:] = False
        # Indices of the non-zero elements
        non_zero_idx = np.nonzero(posmap)[0]
        token_ids = [tokenized["input_ids"][i] for i in non_zero_idx]
        return tokenizer.decode(token_ids)
    else:
        raise NotImplementedError("posmap must be 1-dim")


def preprocess_caption(caption: str) -> str:
    result = caption.lower().strip()
    if result.endswith("."):
        return result
    return result + "."


# Key optimization: take the tokenizer as a parameter instead of loading it inside
def predict(
        ort_session,
        tokenizer: AutoTokenizer,  # tokenizer preloaded by the caller
        image: np.array,
        caption: str,
        box_threshold: float,
        text_threshold: float,
        device: str = "cpu",
        remove_combined: bool = False,
        is_benchmark: bool = False  # new: marks benchmark runs (suppresses logging)
) -> Tuple[torch.Tensor, torch.Tensor, List[str]]:
    # 1. Text preprocessing
    t0 = time.time()
    caption = preprocess_caption(caption=caption)
    if not is_benchmark:
        print(f"Caption processing took {(time.time() - t0):.3f}s")
    captions = [caption]

    # 3. Encode the text
    t0 = time.time()
    # Remove the performance sink of re-loading the tokenizer on every call
    tokenized = tokenizer(captions, padding="longest", return_tensors="pt").to(device)
    specical_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])
    if not is_benchmark:
        print(f"Word embedding took {(time.time() - t0):.3f}s")

    # 4. Generate attention masks and position ids
    t0 = time.time()
    (
        text_self_attention_masks,
        position_ids,
        cate_to_token_mask_list,
    ) = generate_masks_with_special_tokens_and_transfer_map(tokenized, specical_tokens, tokenizer)
    if not is_benchmark:
        print(f"Generate attention masks took {(time.time() - t0):.3f}s")

    # 5. Truncate overly long text
    max_text_len = 256
    if text_self_attention_masks.shape[1] > max_text_len:
        text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len]
        position_ids = position_ids[:, :max_text_len]
        tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len]
        tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len]
        tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len]

    # 6. Run the model
    # attention_mask: True = visible → False = masked
    attention_mask = tokenized["attention_mask"].float()
    attention_mask = (1 - attention_mask) * -1e9  # crucial!
    attention_mask = np.asarray(attention_mask)

    # text_token_mask gets the same treatment (if it participates in attention)
    text_self_attention_masks = text_self_attention_masks.float()
    text_self_attention_masks = (1 - text_self_attention_masks) * -1e9

    input_dict = {
        "img": np.expand_dims(np.asarray(image), axis=0),
        "input_ids": np.asarray(tokenized["input_ids"]),
        "attention_mask": attention_mask,
        "position_ids": np.asarray(position_ids),
        "token_type_ids": np.asarray(tokenized["token_type_ids"]),
        "text_token_mask": np.asarray(text_self_attention_masks)
    }
    # input_dict = {
    #     "img": np.expand_dims(np.asarray(image), axis=0),
    #     "input_ids": np.asarray(tokenized["input_ids"]),
    #     "attention_mask": attention_mask,
    #     "position_ids": np.asarray(position_ids),
    #     "token_type_ids": np.asarray(tokenized["token_type_ids"]),
    #     "text_token_mask": np.asarray(text_self_attention_masks)
    # }

    t0 = time.time()
    outputs = ort_session.run(['logits', 'boxes'], input_dict)
    infer_time = time.time() - t0
    if not is_benchmark:
        print(f"Inference time: {infer_time:.3f}s")

    # 7. Collect predictions
    prediction_logits = np.apply_along_axis(sigmoid, -1, outputs[0][0])
    prediction_boxes = outputs[1][0]

    if not is_benchmark:
        print(f"\n=== Debug Info ===")
        print(f"Prediction logits shape: {prediction_logits.shape}")
        print(f"Prediction boxes shape: {prediction_boxes.shape}")
        print(f"Max logit value: {np.max(prediction_logits):.4f}")
        print(f"Mean logit value: {np.mean(prediction_logits):.4f}")

    # 8. Apply the thresholds
    max_values = np.max(prediction_logits, axis=1)
    mask = max_values > box_threshold
    logits = prediction_logits[mask]
    boxes = prediction_boxes[mask]

    # 9. Match the text
    tokenized = tokenizer(caption)

    # 10. Handle special tokens
    if remove_combined:
        sep_idx = [i for i in range(len(tokenized['input_ids'])) if tokenized['input_ids'][i] in [101, 102, 1012]]
        phrases = []
        for logit in logits:
            max_idx = logit.argmax()
            insert_idx = bisect.bisect_left(sep_idx, max_idx)
            right_idx = sep_idx[insert_idx]
            left_idx = sep_idx[insert_idx - 1]
            phrases.append(
                get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer, left_idx, right_idx).replace('.', '')
            )
    else:
        phrases = [
            get_phrases_from_posmap(logit > text_threshold, tokenized, tokenizer).replace('.', '')
            for logit in logits
        ]

    return boxes, np.max(logits, axis=1), phrases


# New: full performance test (warm-up + measured inference)
def benchmark_performance(ort_session, tokenizer, image, caption, box_threshold, text_threshold,
                          warmup_runs=5, test_runs=10, device="cpu"):
    """
    Performance test: warm-up followed by measured inference.
    :param warmup_runs: number of warm-up runs
    :param test_runs: number of measured runs
    """
    print("=" * 60)
    print("📊 Starting performance test (warm-up + measured runs)")
    print("=" * 60)

    # 1. Warm-up phase
    print(f"\n🔥 Warm-up phase ({warmup_runs} runs), not counted in the statistics")
    warmup_start = time.time()
    for i in range(warmup_runs):
        t0 = time.time()
        predict(ort_session, tokenizer, image, caption, box_threshold, text_threshold, device, is_benchmark=True)
        warmup_time = time.time() - t0
        print(f"Warm-up {i + 1}/{warmup_runs}, took: {warmup_time * 1000:.2f} ms")
    total_warmup_time = time.time() - warmup_start
    print(f"\nWarm-up done, total: {total_warmup_time:.3f}s, average per run: {total_warmup_time / warmup_runs * 1000:.2f} ms")

    # 2. Measured inference phase
    print(f"\n🚀 Measured inference phase ({test_runs} runs), performance statistics")
    test_start = time.time()
    infer_times = []  # per-run latency
    for i in range(test_runs):
        t0 = time.time()
        predict(ort_session, tokenizer, image, caption, box_threshold, text_threshold, device, is_benchmark=True)
        infer_time = time.time() - t0
        infer_times.append(infer_time)
        print(f"Measured run {i + 1}/{test_runs}, took: {infer_time * 1000:.2f} ms")

    # 3. Compute the metrics
    total_test_time = time.time() - test_start
    avg_infer_time = np.mean(infer_times)
    std_infer_time = np.std(infer_times)
    max_infer_time = np.max(infer_times)
    min_infer_time = np.min(infer_times)
    fps = test_runs / total_test_time

    # 4. Report
    print("\n" + "=" * 60)
    print("📈 Performance report (measured runs only)")
    print("=" * 60)
    print(f"Runs: {test_runs}")
    print(f"Total inference time: {total_test_time:.3f}s")
    print(f"Average latency: {avg_infer_time * 1000:.2f} ms (±{std_infer_time * 1000:.2f} ms)")
    print(f"Max latency: {max_infer_time * 1000:.2f} ms")
    print(f"Min latency: {min_infer_time * 1000:.2f} ms")
    print(f"Average FPS: {fps:.2f} frames/s")
    print("=" * 60)

    return {
        "warmup_runs": warmup_runs,
        "test_runs": test_runs,
        "avg_infer_time_ms": avg_infer_time * 1000,
        "std_infer_time_ms": std_infer_time * 1000,
        "max_infer_time_ms": max_infer_time * 1000,
        "min_infer_time_ms": min_infer_time * 1000,
        "fps": fps
    }


if __name__ == '__main__':
    # Configuration
    model_path = 'weights/ground_test.onnx'
    img_path = 'images/in/car_1.jpg'
    TEXT_PROMPT = "car ."
    BOX_TRESHOLD = 0.35
    TEXT_TRESHOLD = 0.25
    DEVICE = "cpu"
    WARMUP_RUNS = 5  # warm-up runs
    TEST_RUNS = 10   # measured runs

    # Load the image
    image_source, image = load_image(img_path)

    # Load the ONNX model (with optimizations enabled)
    print("🔍 Loading ONNX model")
    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL  # enable all graph optimizations
    sess_options.log_severity_level = 3  # reduce log output
    # sess_options.enable_profiling = True  # enable profiling
    ort_session = ort.InferenceSession(
        model_path,
        sess_options=sess_options,
        providers=['ROCMExecutionProvider']
        # provider_options=[{
        #     "device_id": 0,
        #     "migraphx_fp16_enable": "False",
        #     "migraphx_int8_enable": "False",
        #     # try disabling MIGraphX internal optimizations
        #     "migraphx_save_compiled_model": "False",
        # }]
    )

    # Show the active execution provider
    current_provider = ort_session.get_providers()
    print(f"✅ Model loaded, current execution provider: {current_provider}")

    # Preload the tokenizer (only once; the key optimization)
    print("\n📝 Preloading BERT tokenizer (loaded only once)")
    t0 = time.time()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    print(f"✅ Tokenizer loaded, took: {(time.time() - t0):.3f}s")

    # Step 1: run the full performance test (warm-up + measured runs)
    performance_result = benchmark_performance(
        ort_session, tokenizer, image, TEXT_PROMPT, BOX_TRESHOLD, TEXT_TRESHOLD,
        WARMUP_RUNS, TEST_RUNS, DEVICE
    )

    # Step 2: run one full inference (verbose logging, save the result image)
    print("\n" + "=" * 60)
    print("🎯 Final inference (verbose logging + saved result)")
    print("=" * 60)
    boxes, confs, phrases = predict(ort_session, tokenizer, image, TEXT_PROMPT, BOX_TRESHOLD, TEXT_TRESHOLD, DEVICE)

    # Draw and save the result image
    ori_img = cv2.imread(img_path)
    img_h = ori_img.shape[0]
    img_w = ori_img.shape[1]
    for i in range(len(boxes)):
        one_box = boxes[i]
        one_conf = confs[i]
        one_cls = phrases[i]
        x1 = int((one_box[0] - one_box[2] / 2) * img_w)
        y1 = int((one_box[1] - one_box[3] / 2) * img_h)
        x2 = int((one_box[0] + one_box[2] / 2) * img_w)
        y2 = int((one_box[1] + one_box[3] / 2) * img_h)
        cv2.rectangle(ori_img, (x1, y1), (x2, y2), (0, 0, 255), 2)
        cv2.putText(ori_img, f'{one_cls} {one_conf:.2f}', (x1 - 15, y1 - 15),
                    fontFace=cv2.FONT_HERSHEY_SIMPLEX, color=(255, 255, 255), fontScale=1.5, thickness=3)

    # Save the result
    cv2.imwrite('./images/out/result.jpg', ori_img)
    print(f"\n✅ Result saved to: ./images/out/result.jpg")
    print(f"✅ Detected targets: {phrases} ({len(boxes)} total)")

    # profile_file = ort_session.end_profiling()
    # print(f"\n📊 Profiling file generated: {profile_file}")
```
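The script leaves ONNX Runtime profiling commented out; the deleted resnet/profile.json_* files below are the kind of output it produces. A minimal sketch of enabling it, with the file prefix and execution provider chosen as assumptions rather than anything this commit configures:

```python
# Sketch of the profiling path left commented out above; ONNX Runtime appends a
# timestamp to the prefix, which matches the deleted file names below.
import onnxruntime as ort

sess_options = ort.SessionOptions()
sess_options.enable_profiling = True
sess_options.profile_file_prefix = "resnet/profile.json"  # assumed prefix

session = ort.InferenceSession("weights/ground_test.onnx",
                               sess_options=sess_options,
                               providers=["CPUExecutionProvider"])  # placeholder provider
# ... run session.run(...) as in the script above ...
profile_file = session.end_profiling()  # writes a Chrome-trace JSON and returns its path
print(f"Profiling file generated: {profile_file}")
```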
resnet/profile.json_2026-04-03_11-42-27.json (deleted, 100644 → 0)

```json
[
  {"cat": "Session", "pid": 1773812, "tid": 1773812, "dur": 91000, "ts": 20, "ph": "X", "name": "model_loading_uri", "args": {}},
  {"cat": "Session", "pid": 1773812, "tid": 1773812, "dur": 4754984, "ts": 91208, "ph": "X", "name": "session_initialization", "args": {}}
]
```
resnet/profile.json_2026-04-03_11-43-38.json (deleted, 100644 → 0)

```json
[
  {"cat": "Session", "pid": 1774715, "tid": 1774715, "dur": 73864, "ts": 6, "ph": "X", "name": "model_loading_uri", "args": {}},
  {"cat": "Session", "pid": 1774715, "tid": 1774715, "dur": 4308806, "ts": 73985, "ph": "X", "name": "session_initialization", "args": {}}
]
```
resnet/profile.json_2026-04-03_11-44-14.json (deleted, 100644 → 0)

```json
[
  {"cat": "Session", "pid": 1775618, "tid": 1775618, "dur": 76845, "ts": 5, "ph": "X", "name": "model_loading_uri", "args": {}},
  {"cat": "Session", "pid": 1775618, "tid": 1775618, "dur": 4130635, "ts": 76963, "ph": "X", "name": "session_initialization", "args": {}}
]
```
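The three deleted profiles are Chrome-trace style event lists with durations in microseconds. A small helper sketch for summarizing them; the summarize_profile name is made up for illustration:

```python
# Sum per-event durations from a Chrome-trace profile like the deleted files above.
import json
from collections import defaultdict


def summarize_profile(path):
    with open(path) as f:
        events = json.load(f)
    totals = defaultdict(int)
    for event in events:
        totals[event["name"]] += event["dur"]  # "dur" is in microseconds
    for name, dur_us in sorted(totals.items(), key=lambda kv: -kv[1]):
        print(f"{name}: {dur_us / 1000:.1f} ms")


# e.g. summarize_profile("resnet/profile.json_2026-04-03_11-42-27.json") would report
# session_initialization ≈ 4755.0 ms and model_loading_uri ≈ 91.0 ms
```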