Commit ca23112b authored by zk

Adapted to 400x800 input; removed junk files

parent 74fbd52c
# python3 -m onnxruntime.transformers.optimizer \
# --input ../weights/ground.onnx \
# --output ./mha.onnx \
# --use_multi_head_attention \
# # --num_heads 12 \
# # --hidden_size 256 \
# --model_type bert \
# --disable_skip_layer_norm \
# --disable_gelu \
# --use_gpu \
# --disable_embed_layer_norm \
# --use_mask_index \
# --use_raw_attention_mask
python3 -m onnxruntime.transformers.optimizer --input ../weights/ground.onnx --output ./ground.onnx --model_type bert --use_gpu
\ No newline at end of file
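# A minimal sanity-check sketch (not part of this commit): run the original and
# optimized graphs on identical inputs and compare outputs. Input names/shapes follow
# the 800x1200 export inspected later in this commit; note that with --use_gpu some
# fused ops may require the GPU provider instead of CPU.
import numpy as np
import onnxruntime as ort

a = ort.InferenceSession("../weights/ground.onnx", providers=["CPUExecutionProvider"])
b = ort.InferenceSession("./ground.onnx", providers=["CPUExecutionProvider"])
feeds = {
    "img": np.random.rand(1, 3, 800, 1200).astype(np.float32),
    "input_ids": np.ones((1, 4), dtype=np.int64),
    "attention_mask": np.ones((1, 4), dtype=bool),
    "position_ids": np.zeros((1, 4), dtype=np.int64),
    "token_type_ids": np.zeros((1, 4), dtype=np.int64),
    "text_token_mask": np.ones((1, 4, 4), dtype=bool),
}
for x, y in zip(a.run(None, feeds), b.run(None, feeds)):
    print("max abs diff:", np.abs(x - y).max())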
import onnx

model = onnx.load("weights/ground.onnx")
# note: dim_value == 0 can also mean the dimension is symbolic (dim_param set)
for vi in model.graph.value_info:
    dims = [d.dim_value for d in vi.type.tensor_type.shape.dim]
    if any(d == 0 for d in dims):
        print("⚠️ zero dim in value_info:", vi.name, dims)
for vi in model.graph.output:
    dims = [d.dim_value for d in vi.type.tensor_type.shape.dim]
    if any(d == 0 for d in dims):
        print("⚠️ zero dim in output:", vi.name, dims)
\ No newline at end of file
import onnx
from collections import Counter

model = onnx.load("weights/ground_simplified.onnx")

# Basic info
print(f"Model name: {model.graph.name}")
print(f"opset version: {model.opset_import[0].version}")

# Inputs
print("\n=== Inputs ===")
for inp in model.graph.input:
    shape = [d.dim_value if d.dim_value > 0 else d.dim_param for d in inp.type.tensor_type.shape.dim]
    print(f"  {inp.name}: {inp.type.tensor_type.elem_type}, shape={shape}")

# Outputs
print("\n=== Outputs ===")
for out in model.graph.output:
    shape = [d.dim_value if d.dim_value > 0 else d.dim_param for d in out.type.tensor_type.shape.dim]
    print(f"  {out.name}: {out.type.tensor_type.elem_type}, shape={shape}")

# Operator type counts
op_counts = Counter(node.op_type for node in model.graph.node)
print("\n=== Operator counts (top 20) ===")
for op, count in op_counts.most_common(20):
    print(f"  {op}: {count}")

# Check for control-flow operators
control_ops = [op for op in op_counts if op in ["If", "Loop", "Scan", "SequenceMap"]]
if control_ops:
    print(f"\n⚠️ Control-flow operators present: {control_ops}")
'''
Model name: main_graph
opset version: 17

=== Inputs ===  (elem_type codes: 1=float32, 7=int64, 9=bool)
  img: 1, shape=[1, 3, 800, 1200]
  input_ids: 7, shape=[1, 4]
  attention_mask: 9, shape=[1, 4]
  position_ids: 7, shape=[1, 4]
  token_type_ids: 7, shape=[1, 4]
  text_token_mask: 9, shape=[1, 4, 4]

=== Outputs ===
  logits: 1, shape=['Gatherlogits_dim_0', 'Gatherlogits_dim_1', 'Gatherlogits_dim_2']
  boxes: 1, shape=['Gatherboxes_dim_0', 'Gatherboxes_dim_1', 4]

=== Operator counts (top 20) ===
  Constant: 7315
  Unsqueeze: 1919
  Concat: 1051
  Reshape: 916
  Shape: 843
  Gather: 762
  Add: 716
  Slice: 603
  MatMul: 528
  Mul: 513
  Transpose: 507
  Cast: 459
  Div: 265
  Where: 230
  Expand: 223
  ConstantOfShape: 218
  Equal: 183
  LayerNormalization: 147
  Sub: 79
  Softmax: 78

# After simplification:
=== Inputs ===
  img: 1, shape=[1, 3, 800, 1200]
  input_ids: 7, shape=[1, 4]
  attention_mask: 9, shape=[1, 4]
  position_ids: 7, shape=[1, 4]
  token_type_ids: 7, shape=[1, 4]
  text_token_mask: 9, shape=[1, 4, 4]

=== Outputs ===
  logits: 1, shape=[1, 900, 256]
  boxes: 1, shape=[1, 900, 4]

=== Operator counts (top 20) ===
  Reshape: 703
  Add: 679
  MatMul: 527
  Transpose: 459
  Mul: 204
  Slice: 194
  Gather: 155
  Unsqueeze: 152
  LayerNormalization: 147
  Concat: 97
  Div: 96
  Softmax: 78
  Clip: 57
  Relu: 48
  GridSample: 48
  Sub: 36
  Erf: 36
  Where: 35
  Pad: 25
  Sin: 25
'''
\ No newline at end of file
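# For reference, a reduction like the one above (7315 Constants folded away, output
# shapes pinned to [1, 900, ...]) is what onnx-simplifier produces. A minimal sketch
# of how ground_simplified.onnx could be generated, assuming onnxsim is installed:
import onnx
from onnxsim import simplify

model = onnx.load("weights/ground.onnx")
model_simp, ok = simplify(model)  # constant folding + shape inference + graph cleanup
assert ok, "simplified model failed the output check"
onnx.save(model_simp, "weights/ground_simplified.onnx")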
import onnx

# Load the model to inspect
model = onnx.load("weights/ground_sim_fp16.onnx")

print("=== Checking all constant tensor sizes ===")
for init in model.graph.initializer:
    name = init.name
    shape = tuple(init.dims)
    # element count
    elem_count = 1
    for d in shape:
        elem_count *= d
    # size in MB
    dtype_size = onnx.helper.tensor_dtype_to_np_dtype(init.data_type).itemsize
    size_mb = (elem_count * dtype_size) / (1024 * 1024)
    # only print constants larger than 10 MB (tune the threshold as needed)
    if size_mb > 10:
        print(f"⚠️ Oversized constant: {name}")
        print(f"   shape: {shape}")
        print(f"   size: {size_mb:.2f} MB\n")
"""
=== ground.onnx检查所有常量张量大小 ===
⚠️ 超大常量:bert.embeddings.word_embeddings.weight
形状:(30522, 768)
大小:89.42 MB
⚠️ 超大常量:onnx::MatMul_25479
形状:(1024, 3072)
大小:12.00 MB
⚠️ 超大常量:onnx::MatMul_25503
形状:(1024, 4096)
大小:16.00 MB
⚠️ 超大常量:onnx::MatMul_25504
形状:(4096, 1024)
大小:16.00 MB
⚠️ 超大常量:onnx::MatMul_25513
形状:(1024, 3072)
大小:12.00 MB
⚠️ 超大常量:onnx::MatMul_25541
形状:(1024, 4096)
大小:16.00 MB
⚠️ 超大常量:onnx::MatMul_25542
形状:(4096, 1024)
大小:16.00 MB
ground_simplified.onnx
=== 检查所有常量张量大小 ===
⚠️ 超大常量:bert.embeddings.word_embeddings.weight
形状:(30522, 768)
大小:89.42 MB
⚠️ 超大常量:onnx::MatMul_25479
形状:(1024, 3072)
大小:12.00 MB
⚠️ 超大常量:onnx::MatMul_25503
形状:(1024, 4096)
大小:16.00 MB
⚠️ 超大常量:onnx::MatMul_25504
形状:(4096, 1024)
大小:16.00 MB
⚠️ 超大常量:onnx::MatMul_25513
形状:(1024, 3072)
大小:12.00 MB
⚠️ 超大常量:onnx::MatMul_25541
形状:(1024, 4096)
大小:16.00 MB
⚠️ 超大常量:onnx::MatMul_25542
形状:(4096, 1024)
大小:16.00 MB
⚠️ 超大常量:/backbone/backbone.0/layers.0/blocks.1/attn/Unsqueeze_7_output_0
形状:(1, 425, 1, 144, 144)
大小:33.62 MB
⚠️ 超大常量:/transformer/Concat_10_output_0
形状:(1, 19947, 256)
大小:19.48 MB
⚠️ 超大常量:/transformer/enc_out_class_embed/ConstantOfShape_output_0
形状:(1, 19947, 256)
大小:19.48 MB
"""
"""=== ground_fp16.onnx检查所有常量张量大小 ===
⚠️ 超大常量:bert.embeddings.word_embeddings.weight
形状:(30522, 768)
大小:44.71 MB
"""
""" ground_sim_fp16.onnx
=== 检查所有常量张量大小 ===
⚠️ 超大常量:bert.embeddings.word_embeddings.weight
形状:(30522, 768)
大小:44.71 MB
⚠️ 超大常量:/backbone/backbone.0/layers.0/blocks.1/attn/Unsqueeze_7_output_0
形状:(1, 425, 1, 144, 144)
大小:16.81 MB
"""
\ No newline at end of file
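# The fp16 results above halve every float constant (89.42 MB -> 44.71 MB), which is
# consistent with a plain float16 conversion. A minimal sketch, assuming
# onnxconverter-common is installed; keep_io_types is an assumption here, not
# something this commit specifies:
import onnx
from onnxconverter_common import float16

model = onnx.load("weights/ground_simplified.onnx")
# convert float32 tensors to float16; keep graph inputs/outputs as float32
model_fp16 = float16.convert_float_to_float16(model, keep_io_types=True)
onnx.save(model_fp16, "weights/ground_sim_fp16.onnx")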
import sys, os
import onnx
import onnx.external_data_helper as ex

print("🟢 python :", sys.executable)
print("🟢 onnx   :", onnx.__file__)      # package path, to confirm which install is used
print("🟢 version:", onnx.__version__)   # must be >= 1.12

src = "weights/ground.onnx"
dst = "weights/ground_external.onnx"
m = onnx.load(src)

# Store every tensor as external data, all written to one file: ground_weights.bin
ex.convert_model_to_external_data(m,
                                  all_tensors_to_one_file=True,
                                  location="ground_weights.bin")
onnx.save_model(m, dst)
print("[✅] external ONNX →", dst)
\ No newline at end of file
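# Loading the external-data model back is transparent as long as ground_weights.bin
# sits next to the .onnx file; a minimal sketch (not part of this commit):
import onnx

m = onnx.load("weights/ground_external.onnx", load_external_data=False)  # leave raw data on disk
print(len(m.graph.initializer), "initializers (weights still external)")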
import onnx
from onnx import shape_inference
model = onnx.load("weights/ground.onnx")
model = shape_inference.infer_shapes(model)
onnx.save(model, "weights/ground_shape.onnx")
\ No newline at end of file
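# A quick check of what the shape-inference pass added (a minimal sketch, not part
# of this commit):
import onnx

before = onnx.load("weights/ground.onnx")
after = onnx.load("weights/ground_shape.onnx")
print("value_info entries:", len(before.graph.value_info), "->", len(after.graph.value_info))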
import onnx
from onnx import helper

INPUT_MODEL = "weights/ground_simplified.onnx"
OUTPUT_MODEL = "weights/ground_fix.onnx"


def add_identity(graph, input_name, suffix, new_nodes, processed):
    if input_name in processed:
        return input_name + suffix
    new_name = input_name + suffix
    identity_node = helper.make_node(
        "Identity",
        inputs=[input_name],
        outputs=[new_name],
        name=input_name + suffix + "_identity"
    )
    new_nodes.append(identity_node)
    processed.add(input_name)
    return new_name


def patch_model(model):
    graph = model.graph
    new_nodes = []
    processed = set()
    for node in graph.node:
        # 1. Gather (handled previously)
        if node.op_type == "Gather":
            idx = node.input[1]
            node.input[1] = add_identity(graph, idx, "_block", new_nodes, processed)
        # 2. Key step: ScatterND
        if node.op_type.lower().startswith("scatter"):
            # scatternd(data, indices, updates)
            data = node.input[0]
            indices = node.input[1]
            updates = node.input[2]
            node.input[0] = add_identity(graph, data, "_block", new_nodes, processed)
            node.input[1] = add_identity(graph, indices, "_block", new_nodes, processed)
            node.input[2] = add_identity(graph, updates, "_block", new_nodes, processed)
        # 3. Where (can also trigger constant folding)
        if node.op_type == "Where":
            for i in range(3):
                node.input[i] = add_identity(graph, node.input[i], "_block", new_nodes, processed)
    # insert the new Identity nodes at the front of the graph
    # (topologically safe only when the wrapped inputs are initializers/graph inputs)
    for i, n in enumerate(new_nodes):
        graph.node.insert(i, n)
    return model


def main():
    print("🔍 Loading model...")
    model = onnx.load(INPUT_MODEL)
    print("⚙️ Blocking constant folding everywhere (Gather + ScatterND + Where)...")
    model = patch_model(model)
    print("💾 Saving model...")
    onnx.save(model, OUTPUT_MODEL)
    print("✅ Done:", OUTPUT_MODEL)


if __name__ == "__main__":
    main()
\ No newline at end of file
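# A quick post-patch check (a minimal sketch, not part of this commit): confirm the
# patched graph is still structurally valid and count the inserted blockers.
import onnx

m = onnx.load("weights/ground_fix.onnx")
onnx.checker.check_model(m)
n_id = sum(1 for n in m.graph.node if n.op_type == "Identity")
print("Identity nodes in patched graph:", n_id)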
import onnx
import numpy as np
from onnx import numpy_helper

model = onnx.load("weights/ground.onnx")
for init in model.graph.initializer:
    if "Constant" in init.name:
        arr = numpy_helper.to_array(init)
        if arr.dtype in [np.int32, np.int64]:
            if (arr < 0).any() or (arr > 10000).any():
                print("🚨 Suspicious index constant:", init.name, arr)
\ No newline at end of file
import onnx
import numpy as np
from onnx import numpy_helper

model = onnx.load("weights/ground.onnx")

# map of all initializers by name
init_map = {i.name: numpy_helper.to_array(i) for i in model.graph.initializer}

for node in model.graph.node:
    if node.op_type == "Gather":
        index_name = node.input[1]
        if index_name in init_map:
            idx = init_map[index_name]
            print("\n🚨 Gather index:", index_name)
            print("dtype:", idx.dtype)
            print("min:", idx.min())
            print("max:", idx.max())
            print("shape:", idx.shape)
            if (idx < 0).any():
                print("❌ NEGATIVE index")
            if (idx > 10000).any():
                print("❌ SUSPICIOUS LARGE index")
\ No newline at end of file
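# A follow-up sketch (hypothetical, not in this commit): for Gathers whose data
# operand is also an initializer, the max index can be compared against the gathered
# dimension to confirm whether it is genuinely out of range.
import onnx
import numpy as np
from onnx import numpy_helper

model = onnx.load("weights/ground.onnx")
init_map = {i.name: numpy_helper.to_array(i) for i in model.graph.initializer}
for node in model.graph.node:
    if node.op_type != "Gather" or node.input[1] not in init_map:
        continue
    idx = init_map[node.input[1]]
    axis = next((a.i for a in node.attribute if a.name == "axis"), 0)  # Gather default axis is 0
    data = init_map.get(node.input[0])
    if data is not None and idx.max() >= data.shape[axis]:
        print(f"❌ {node.name}: max index {idx.max()} >= dim {data.shape[axis]} (axis {axis})")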
@@ -240,7 +240,7 @@ def benchmark_performance(
 if __name__ == '__main__':
     # Config parameters
-    model_path = 'weights/ground.onnx'
+    model_path = 'weights_400x600/ground.onnx'
     img_path = 'images/in/car_1.jpg'
     TEXT_PROMPT = "car ."
     BOX_TRESHOLD = 0.35
from typing import Tuple, List, Dict
import os
import bisect
import time
import warnings

import cv2
import numpy as np
import torch
import onnxruntime as ort
from transformers import BertTokenizer, AutoTokenizer

warnings.filterwarnings('ignore')

from groundingdino.util.inference import load_image
from groundingdino.models.GroundingDINO.bertwarper import generate_masks_with_special_tokens_and_transfer_map


# adds inference-latency and related metrics
def sigmoid(x):
    return 1 / (1 + np.exp(-x))


def get_phrases_from_posmap(
    posmap: np.ndarray, tokenized: Dict, tokenizer: AutoTokenizer, left_idx: int = 0, right_idx: int = 255
):
    assert isinstance(posmap, np.ndarray), "posmap must be np.ndarray"
    if posmap.ndim == 1:
        # zero out positions outside the [left_idx + 1, right_idx) window
        posmap[:left_idx + 1] = False
        posmap[right_idx:] = False
        # indices of the remaining active positions
        non_zero_idx = np.nonzero(posmap)[0]
        token_ids = [tokenized["input_ids"][i] for i in non_zero_idx]
        return tokenizer.decode(token_ids)
    else:
        raise NotImplementedError("posmap must be 1-dim")


def preprocess_caption(caption: str) -> str:
    result = caption.lower().strip()
    if result.endswith("."):
        return result
    return result + "."


# Core optimization: fixed-size memory pool (800x1200), batch_size=1
class HIPMemoryPool:
    def __init__(self, img_shape=(3, 800, 1200), max_text_len=256, device="cpu"):
        self.img_shape = img_shape  # fixed 800x1200
        self.max_text_len = max_text_len
        self.device = device
        self.pool = {}
        # pre-allocate every buffer (fixed sizes, no dynamic allocation)
        self._preallocate_all_buffers()

    def _preallocate_all_buffers(self):
        """Pre-allocate all buffers at fixed size (800x1200, batch_size=1)."""
        # image buffer (1, 3, 800, 1200) - fixed size
        self.pool["img"] = np.zeros((1,) + self.img_shape, dtype=np.float32)
        # text buffers (batch_size=1, 256)
        self.pool["input_ids"] = np.zeros((1, self.max_text_len), dtype=np.int64)
        self.pool["attention_mask"] = np.zeros((1, self.max_text_len), dtype=bool)
        self.pool["position_ids"] = np.zeros((1, self.max_text_len), dtype=np.int64)
        self.pool["token_type_ids"] = np.zeros((1, self.max_text_len), dtype=np.int64)
        self.pool["text_token_mask"] = np.zeros((1, self.max_text_len, self.max_text_len), dtype=bool)

    def update_img_buffer(self, image: np.ndarray):
        """Refresh the image buffer (fixed 800x1200)."""
        # validate the input size; it must be 800x1200
        if image.shape != self.img_shape:
            raise ValueError(f"Image shape must be {self.img_shape}, got {image.shape}")
        self.pool["img"][0] = image
        return self.pool["img"]

    def update_text_buffers(self, tokenized, position_ids, text_self_attention_masks):
        """Refresh the text buffers (reusing the fixed allocations)."""
        # truncate and copy text data into the pre-allocated buffers
        text_len = min(tokenized["input_ids"].shape[1], self.max_text_len)
        self.pool["input_ids"][0, :text_len] = tokenized["input_ids"][0, :text_len].cpu().numpy()
        self.pool["attention_mask"][0, :text_len] = tokenized["attention_mask"][0, :text_len].cpu().numpy().astype(bool)
        self.pool["position_ids"][0, :text_len] = position_ids[0, :text_len].cpu().numpy()
        self.pool["token_type_ids"][0, :text_len] = tokenized["token_type_ids"][0, :text_len].cpu().numpy()
        # text self-attention mask
        mask_len = min(text_self_attention_masks.shape[1], self.max_text_len)
        self.pool["text_token_mask"][0, :mask_len, :mask_len] = text_self_attention_masks[0, :mask_len, :mask_len].cpu().numpy()
        return {
            "input_ids": self.pool["input_ids"],
            "attention_mask": self.pool["attention_mask"],
            "position_ids": self.pool["position_ids"],
            "token_type_ids": self.pool["token_type_ids"],
            "text_token_mask": self.pool["text_token_mask"]
        }


# Core inference function (fixed input size, batch_size=1)
def predict(
    ort_session,
    tokenizer: AutoTokenizer,
    memory_pool: HIPMemoryPool,
    image: np.ndarray,
    caption: str,
    box_threshold: float,
    text_threshold: float,
    device: str = "cpu",
    remove_combined: bool = False,
    is_benchmark: bool = False
) -> Tuple[np.ndarray, np.ndarray, List[str]]:
    # 1. text preprocessing
    caption = preprocess_caption(caption=caption)
    # 2. tokenize the caption (reusing the tokenizer)
    tokenized = tokenizer([caption], padding="longest", return_tensors="pt").to(device)
    special_tokens = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])
    # 3. build attention masks and position ids
    (
        text_self_attention_masks,
        position_ids,
        cate_to_token_mask_list,
    ) = generate_masks_with_special_tokens_and_transfer_map(
        tokenized, special_tokens, tokenizer)
    # 4. truncate over-long text
    max_text_len = memory_pool.max_text_len
    if text_self_attention_masks.shape[1] > max_text_len:
        text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len]
        position_ids = position_ids[:, :max_text_len]
        tokenized["input_ids"] = tokenized["input_ids"][:, :max_text_len]
        tokenized["attention_mask"] = tokenized["attention_mask"][:, :max_text_len]
        tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :max_text_len]
    # 5. reuse the fixed-size memory pool
    img_input = memory_pool.update_img_buffer(image)
    text_inputs = memory_pool.update_text_buffers(tokenized, position_ids, text_self_attention_masks)
    input_dict = {
        "img": img_input,
        "input_ids": text_inputs["input_ids"],
        "attention_mask": text_inputs["attention_mask"],
        "position_ids": text_inputs["position_ids"],
        "token_type_ids": text_inputs["token_type_ids"],
        "text_token_mask": text_inputs["text_token_mask"]
    }
    # 6. run the model (single timing point, fewer syncs)
    t0 = time.time()
    outputs = ort_session.run(['logits', 'boxes'], input_dict)
    infer_time = time.time() - t0
    if not is_benchmark:
        print(f"Inference time: {infer_time:.3f}s")
    # 7. post-process predictions
    prediction_logits = np.apply_along_axis(sigmoid, -1, outputs[0][0])
    prediction_boxes = outputs[1][0]
    if not is_benchmark:
        print(f"\n=== Debug Info ===")
        print(f"Prediction logits shape: {prediction_logits.shape}")
        print(f"Prediction boxes shape: {prediction_boxes.shape}")
        print(f"Max logit value: {np.max(prediction_logits):.4f}")
        print(f"Mean logit value: {np.mean(prediction_logits):.4f}")
    # 8. filter by box threshold
    max_values = np.max(prediction_logits, axis=1)
    mask = max_values > box_threshold
    logits = prediction_logits[mask]
    boxes = prediction_boxes[mask]
    # 9. build text labels
    tokenized_caption = tokenizer(caption)
    if remove_combined:
        sep_idx = [i for i in range(len(tokenized_caption['input_ids']))
                   if tokenized_caption['input_ids'][i] in [101, 102, 1012]]
        phrases = []
        for logit in logits:
            max_idx = logit.argmax()
            insert_idx = bisect.bisect_left(sep_idx, max_idx)
            right_idx = sep_idx[insert_idx] if insert_idx < len(sep_idx) else len(logit)
            left_idx = sep_idx[insert_idx - 1] if insert_idx > 0 else 0
            phrases.append(
                get_phrases_from_posmap(logit > text_threshold, tokenized_caption,
                                        tokenizer, left_idx, right_idx).replace('.', '')
            )
    else:
        phrases = [
            get_phrases_from_posmap(logit > text_threshold, tokenized_caption, tokenizer).replace('.', '')
            for logit in logits
        ]
    return boxes, np.max(logits, axis=1), phrases


# Benchmark helper (batch_size=1)
def benchmark_performance(
    ort_session, tokenizer, memory_pool, image, caption, box_threshold, text_threshold,
    warmup_runs=5, test_runs=10, device="cpu", batch_size=1
):
    """
    Benchmark: batch_size=1, fixed 800x1200 input.
    """
    print("="*60)
    print("📊 Starting benchmark (fixed 800x1200, batch_size=1)")
    print("="*60)
    # 1. warmup (loads the HIP modules)
    print(f"\n🔥 Warmup ({warmup_runs} runs) - loading HIP modules")
    warmup_start = time.time()
    for i in range(warmup_runs):
        t0 = time.time()
        predict(ort_session, tokenizer, memory_pool, image, caption,
                box_threshold, text_threshold, device, is_benchmark=True)
        warmup_time = time.time() - t0
        print(f"Warmup {i+1}/{warmup_runs} - {warmup_time*1000:.2f} ms")
    total_warmup_time = time.time() - warmup_start
    print(f"\nWarmup done - total: {total_warmup_time:.3f} s (HIP modules loaded)")
    # 2. timed runs (batch_size=1)
    print(f"\n🚀 Timed runs ({test_runs} runs, batch_size=1)")
    test_start = time.time()
    infer_times = []
    # single-image inference (batch_size=1)
    for i in range(test_runs):
        t0 = time.time()
        predict(ort_session, tokenizer, memory_pool, image, caption,
                box_threshold, text_threshold, device, is_benchmark=True)
        infer_time = time.time() - t0
        infer_times.append(infer_time)
        print(f"Run {i+1}/{test_runs} - {infer_time*1000:.2f} ms")
    # aggregate metrics
    total_test_time = time.time() - test_start
    avg_infer_time = np.mean(infer_times)
    std_infer_time = np.std(infer_times)
    max_infer_time = np.max(infer_times)
    min_infer_time = np.min(infer_times)
    fps = test_runs / total_test_time
    # performance report
    print("\n" + "="*60)
    print("📈 Post-optimization benchmark report (fixed 800x1200)")
    print("="*60)
    print(f"Runs: {test_runs} (batch_size=1)")
    print(f"Total inference time: {total_test_time:.3f} s")
    print(f"Average latency: {avg_infer_time*1000:.2f} ms (±{std_infer_time*1000:.2f} ms)")
    print(f"Max latency: {max_infer_time*1000:.2f} ms")
    print(f"Min latency: {min_infer_time*1000:.2f} ms")
    print(f"Average FPS: {fps:.2f}")
    print("="*60)
    return {
        "warmup_runs": warmup_runs,
        "test_runs": test_runs,
        "batch_size": batch_size,
        "avg_infer_time_ms": avg_infer_time*1000,
        "std_infer_time_ms": std_infer_time*1000,
        "max_infer_time_ms": max_infer_time*1000,
        "min_infer_time_ms": min_infer_time*1000,
        "fps": fps
    }


if __name__ == '__main__':
    # ========== Fixed config (800x1200, batch_size=1) ==========
    model_path = 'weights/ground.onnx'
    img_path = 'images/in/car_1.jpg'
    TEXT_PROMPT = "car ."
    BOX_TRESHOLD = 0.35
    TEXT_TRESHOLD = 0.25
    DEVICE = "cpu"  # switch to "rocm" for real runs
    WARMUP_RUNS = 5   # warmup iterations
    TEST_RUNS = 10    # timed iterations
    BATCH_SIZE = 1    # fixed at 1
    IMG_SHAPE = (3, 800, 1200)  # fixed export size
    MAX_TEXT_LEN = 256

    # ========== ONNX Runtime options (targeting ROCm/HIP) ==========
    print("🔍 Loading ONNX model (fixed 800x1200, batch_size=1)")
    sess_options = ort.SessionOptions()
    # enable all graph optimizations
    sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    # sequential execution (load all HIP kernels up front rather than on demand)
    sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
    sess_options.enable_cpu_mem_arena = False
    sess_options.enable_mem_pattern = True
    sess_options.log_severity_level = 3

    # ROCm/HIP provider options
    providers = [
        ('ROCMExecutionProvider', {
            'device_id': 0,
            'arena_extend_strategy': 'kNextPowerOfTwo',
            'gpu_mem_limit': 8 * 1024 * 1024 * 1024,  # 8 GB GPU memory
            'cudnn_conv_algo_search': 'EXHAUSTIVE',
            'do_copy_in_default_stream': True  # fewer stream syncs
        }),
        'CPUExecutionProvider'
    ]

    # ========== Build the session once (avoids the hipModuleLoadData bottleneck) ==========
    ort_session = ort.InferenceSession(
        model_path,
        sess_options=sess_options,
        providers=providers
    )
    current_provider = ort_session.get_providers()
    print(f"✅ Model loaded - active providers: {current_provider}")

    # ========== Load the tokenizer once ==========
    print("\n📝 Loading BERT tokenizer")
    t0 = time.time()
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    print(f"✅ Tokenizer loaded - {(time.time() - t0):.3f} s")

    # ========== Initialize the fixed-size memory pool (800x1200) ==========
    print("\n🗃️ Initializing fixed-size memory pool (800x1200)")
    memory_pool = HIPMemoryPool(img_shape=IMG_SHAPE, max_text_len=MAX_TEXT_LEN, device=DEVICE)
    print(f"✅ Memory pool ready - fixed shape: {IMG_SHAPE}")

    # ========== Load and validate the test image ==========
    print("\n🖼️ Loading and preprocessing the test image (forcing 800x1200)")
    image_source, image = load_image(img_path)
    if isinstance(image, torch.Tensor):
        image = image.numpy()  # cv2.resize below expects a numpy array
    # force 800x1200 to match the export size
    if image.shape != IMG_SHAPE:
        print(f"⚠️ Image shape {image.shape} differs, resizing to {IMG_SHAPE}")
        image = cv2.resize(image.transpose(1, 2, 0), (IMG_SHAPE[2], IMG_SHAPE[1])).transpose(2, 0, 1)
    print(f"✅ Image ready - final shape: {image.shape}")

    # ========== Benchmark ==========
    performance_result = benchmark_performance(
        ort_session, tokenizer, memory_pool, image, TEXT_PROMPT,
        BOX_TRESHOLD, TEXT_TRESHOLD,
        WARMUP_RUNS, TEST_RUNS, DEVICE, BATCH_SIZE
    )

    # ========== Final inference ==========
    print("\n" + "="*60)
    print("🎯 Running the final inference (fixed 800x1200)")
    print("="*60)
    boxes, confs, phrases = predict(
        ort_session, tokenizer, memory_pool, image, TEXT_PROMPT,
        BOX_TRESHOLD, TEXT_TRESHOLD, DEVICE
    )

    # draw and save the result
    os.makedirs('./images/out', exist_ok=True)
    ori_img = cv2.imread(img_path)
    # resize the original image to match the inference size
    ori_img = cv2.resize(ori_img, (IMG_SHAPE[2], IMG_SHAPE[1]))
    img_h, img_w = ori_img.shape[:2]
    for i in range(len(boxes)):
        one_box = boxes[i]
        one_conf = confs[i]
        one_cls = phrases[i]
        # convert box (cx, cy, w, h) -> (x1, y1, x2, y2)
        x1 = int((one_box[0] - one_box[2] / 2) * img_w)
        y1 = int((one_box[1] - one_box[3] / 2) * img_h)
        x2 = int((one_box[0] + one_box[2] / 2) * img_w)
        y2 = int((one_box[1] + one_box[3] / 2) * img_h)
        # draw the box and label
        cv2.rectangle(ori_img, (x1, y1), (x2, y2), (0, 0, 255), 2)
        cv2.putText(
            ori_img, f'{one_cls} {one_conf:.2f}',
            (x1 - 15, y1 - 15),
            cv2.FONT_HERSHEY_SIMPLEX,
            1.5, (255, 255, 255), 3
        )
    # save the annotated image
    cv2.imwrite('./images/out/result_800x1200.jpg', ori_img)
    print(f"\n✅ Result saved to: ./images/out/result_800x1200.jpg")
    print(f"✅ Detections: {phrases} ({len(boxes)} total)")
    print(f"✅ Performance: FPS={performance_result['fps']:.2f}, avg latency={performance_result['avg_infer_time_ms']:.2f} ms")
\ No newline at end of file
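# To see where time goes inside the session rather than around it, ONNX Runtime's
# built-in profiler can be enabled on the same setup; a minimal sketch (not part of
# this commit):
import onnxruntime as ort

opts = ort.SessionOptions()
opts.enable_profiling = True  # emit a chrome://tracing-compatible JSON per session
session = ort.InferenceSession("weights/ground.onnx", sess_options=opts,
                               providers=["CPUExecutionProvider"])
# ... run the warmup/timed loops as above ...
print("profile written to", session.end_profiling())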
import migraphx as mgx

p = mgx.parse_onnx("weights/ground_external.onnx")  # parse only, no optimization yet

passes = [
    mgx.pass_dead_code_elimination(),             # drop unused nodes/constants
    mgx.pass_eliminate_contiguous(),              # fold adjacent contiguous ops
    mgx.pass_simplify_reshapes(),                 # merge/simplify reshapes
    mgx.pass_simplify_algebra(),                  # simplify algebraic expressions (add/mul/...)
    mgx.pass_eliminate_identity(),                # remove Identity ops
    mgx.pass_common_subexpression_elimination(),  # CSE
]
p.apply_passes(passes)  # run the passes manually

p.compile(mgx.get_target("gpu"))
mgx.save(p, "weights/ground.mgx")
\ No newline at end of file
from typing import *
import torch
import torch.distributed.rpc as rpc
from torch import Tensor
from torch._jit_internal import Future
from torch.distributed.rpc import RRef
from typing import Tuple  # pyre-ignore: unused import

module_interface_cls = None


def forward_async(self, *args, **kwargs):
    args = (self.module_rref, self.device, self.is_device_map_set, *args)
    kwargs = {**kwargs}
    return rpc.rpc_async(
        self.module_rref.owner(),
        _remote_forward,
        args,
        kwargs,
    )


def forward(self, *args, **kwargs):
    args = (self.module_rref, self.device, self.is_device_map_set, *args)
    kwargs = {**kwargs}
    ret_fut = rpc.rpc_async(
        self.module_rref.owner(),
        _remote_forward,
        args,
        kwargs,
    )
    return ret_fut.wait()


_generated_methods = [
    forward_async,
    forward,
]


def _remote_forward(
        module_rref: RRef[module_interface_cls], device: str, is_device_map_set: bool, *args, **kwargs):
    module = module_rref.local_value()
    device = torch.device(device)

    if device.type != "cuda":
        return module.forward(*args, **kwargs)

    # If the module is on a cuda device,
    # move any CPU tensor in args or kwargs to the same cuda device.
    # Since torch script does not support generator expression,
    # have to use concatenation instead of
    # ``tuple(i.to(device) if isinstance(i, Tensor) else i for i in *args)``.
    args = (*args,)
    out_args: Tuple[()] = ()
    for arg in args:
        arg = (arg.to(device),) if isinstance(arg, Tensor) else (arg,)
        out_args = out_args + arg

    kwargs = {**kwargs}
    for k, v in kwargs.items():
        if isinstance(v, Tensor):
            kwargs[k] = kwargs[k].to(device)

    if is_device_map_set:
        return module.forward(*out_args, **kwargs)

    # If the device map is empty, then only CPU tensors are allowed to send over wire,
    # so have to move any GPU tensor to CPU in the output.
    # Since torch script does not support generator expression,
    # have to use concatenation instead of
    # ``tuple(i.cpu() if isinstance(i, Tensor) else i for i in module.forward(*out_args, **kwargs))``.
    ret: Tuple[()] = ()
    for i in module.forward(*out_args, **kwargs):
        i = (i.cpu(),) if isinstance(i, Tensor) else (i,)
        ret = ret + i
    return ret
@@ -24,7 +24,8 @@ def preprocess_caption(caption: str):
 def load_image(image_path: str):
     transform = T.Compose(
         [
-            T.RandomResize([800], max_size=1333),
+            # T.RandomResize([800], max_size=1333),  # 800x1200 input size
+            T.RandomResize([400], max_size=1333),  # 400x600 input size
             T.ToTensor(),
             T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
         ]