"test/run_tests_util.py" did not exist on "bcf926ec656688d7eb03159faaddbf56bd4ec8e2"
Commit 34e4011b authored by zk

Initial commit

import argparse
import os
import sys
import time
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
# ================== Core configuration - adjust the batch size here ==================
INFERENCE_BATCH_SIZE = 8 # inference batch size; change this value to change the batch size
# ====================================================================================
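# Illustrative shapes (not from the original comments): with INFERENCE_BATCH_SIZE = 8
# the model receives an image tensor of shape (8, 3, H, W) plus a list of 8 identical
# captions, and returns pred_logits of shape (8, nq, 256) and pred_boxes of shape
# (8, nq, 4), where nq is the number of object queries.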
def plot_boxes_to_image(image_pil, tgt):
H, W = tgt["size"]
boxes = tgt["boxes"]
labels = tgt["labels"]
assert len(boxes) == len(labels), "boxes and labels must have same length"
draw = ImageDraw.Draw(image_pil)
mask = Image.new("L", image_pil.size, 0)
mask_draw = ImageDraw.Draw(mask)
# draw boxes and masks
for box, label in zip(boxes, labels):
# from 0..1 to 0..W, 0..H
box = box * torch.Tensor([W, H, W, H])
# from xywh to xyxy
box[:2] -= box[2:] / 2
box[2:] += box[:2]
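# Worked example (illustrative): a normalized cxcywh box (0.5, 0.5, 0.2, 0.4)
# on a 1000x800 image scales to (500, 400, 200, 320) and converts to
# xyxy = (400, 240, 600, 560).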
# random color
color = tuple(np.random.randint(0, 255, size=3).tolist())
# draw
x0, y0, x1, y1 = box
x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
# draw.text((x0, y0), str(label), fill=color)
font = ImageFont.load_default()
if hasattr(font, "getbbox"):
bbox = draw.textbbox((x0, y0), str(label), font)
else:
w, h = draw.textsize(str(label), font)
bbox = (x0, y0, w + x0, y0 + h)
# bbox = draw.textbbox((x0, y0), str(label))
draw.rectangle(bbox, fill=color)
draw.text((x0, y0), str(label), fill="white")
mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
return image_pil, mask
def load_image(image_path):
# load image
image_pil = Image.open(image_path).convert("RGB") # load image
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
image, _ = transform(image_pil, None) # 3, h, w
return image_pil, image
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
print(load_res)
_ = model.eval()
return model
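# Note: strict=False tolerates key mismatches between checkpoint and model;
# load_res (printed above) is the named tuple of missing/unexpected keys, so an
# empty pair of lists indicates a clean load.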
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
assert text_threshold is not None or token_spans is not None, "text_threshold and token_spans must not both be None!"
caption = caption.lower()
caption = caption.strip()
if not caption.endswith("."):
caption = caption + "."
device = "cuda" if not cpu_only else "cpu"
model = model.to(device)
# ========== Change 1: build the batch ==========
# Replicate the image to build a batch of shape (batch_size, 3, H, W)
image_batch = image.unsqueeze(0).repeat(INFERENCE_BATCH_SIZE, 1, 1, 1).to(device)
# Replicate the text prompt to build a batch of captions (batch_size,)
caption_batch = [caption] * INFERENCE_BATCH_SIZE
# Time the core inference
if device == "cuda":
torch.cuda.synchronize() # wait for pending GPU work so timing is accurate
start_time = time.perf_counter() # high-resolution timer
with torch.no_grad():
outputs = model(image_batch, captions=caption_batch) # batched forward pass
if device == "cuda":
torch.cuda.synchronize() # wait for the GPU forward pass to finish
infer_time = time.perf_counter() - start_time # inference time (seconds)
# Average per-image inference time
avg_single_infer_time = infer_time / INFERENCE_BATCH_SIZE
# ========== Change 2: handle the batched output ==========
# Take the first sample as the result (all samples are identical)
logits = outputs["pred_logits"].sigmoid()[0] # (nq, 256)
boxes = outputs["pred_boxes"][0] # (nq, 4)
# filter output
if token_spans is None:
logits_filt = logits.cpu().clone()
boxes_filt = boxes.cpu().clone()
filt_mask = logits_filt.max(dim=1)[0] > box_threshold
logits_filt = logits_filt[filt_mask] # num_filt, 256
boxes_filt = boxes_filt[filt_mask] # num_filt, 4
# get phrase
tokenlizer = model.tokenizer
tokenized = tokenlizer(caption)
# build pred
pred_phrases = []
for logit, box in zip(logits_filt, boxes_filt):
pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
if with_logits:
pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
else:
pred_phrases.append(pred_phrase)
else:
# given-phrase mode
positive_maps = create_positive_map_from_span(
model.tokenizer(caption),
token_span=token_spans
).to(logits.device) # n_phrase, 256 (the raw image tensor may still be on CPU here, so follow the logits device)
logits_for_phrases = positive_maps @ logits.T # n_phrase, nq
all_logits = []
all_phrases = []
all_boxes = []
for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
# get phrase
phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
# get mask
filt_mask = logit_phr > box_threshold
# filt box
all_boxes.append(boxes[filt_mask])
# filt logits
all_logits.append(logit_phr[filt_mask])
if with_logits:
logit_phr_num = logit_phr[filt_mask]
all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
else:
all_phrases.extend([phrase for _ in range(int(filt_mask.sum()))]) # one label per kept box, not per query
boxes_filt = torch.cat(all_boxes, dim=0).cpu()
pred_phrases = all_phrases
# Return the total batch inference time for downstream performance accounting
return boxes_filt, pred_phrases, infer_time, avg_single_infer_time
def benchmark_performance(model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, warmup_runs=5, test_runs=10):
"""
性能基准测试:预热 + 多次推理计算平均FPS和时延
适配batch推理,计算正确的吞吐量
Args:
warmup_runs: 预热次数(排除初始加载的影响)
test_runs: 正式测试次数
"""
# 1. 预热阶段(忽略耗时)
print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
for i in range(warmup_runs):
_, _, _, _ = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans
)
print(f"预热完成 {i+1}/{warmup_runs}")
# 2. 正式测试阶段
print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
total_batch_time = 0.0 # 总batch推理时间
total_single_time = 0.0 # 总单张推理时间
batch_times = [] # 记录每次batch推理的时延
single_times = [] # 记录每次单张推理的平均时延
for i in range(test_runs):
_, _, batch_infer_time, avg_single_infer_time = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans
)
batch_times.append(batch_infer_time)
single_times.append(avg_single_infer_time)
total_batch_time += batch_infer_time
total_single_time += avg_single_infer_time
print(f"测试 {i+1}/{test_runs} - Batch推理时延: {batch_infer_time*1000:.2f} ms | 单张平均时延: {avg_single_infer_time*1000:.2f} ms")
# 3. Compute performance metrics
avg_batch_time = total_batch_time / test_runs # average batch latency (seconds)
avg_single_time = total_single_time / test_runs # average per-image latency (seconds)
batch_throughput = (test_runs * INFERENCE_BATCH_SIZE) / total_batch_time # overall throughput (images/second)
batch_std_time = np.std(batch_times) # batch latency standard deviation
single_std_time = np.std(single_times) # per-image latency standard deviation
# 4. Print the performance report
print("\n" + "="*60)
print("📊 Performance report (Batch Size = {})".format(INFERENCE_BATCH_SIZE))
print("="*60)
print(f"Environment: {'GPU (CUDA)' if not cpu_only else 'CPU'}")
print(f"Runs: {test_runs} (warm-up: {warmup_runs})")
print(f"Batch Size: {INFERENCE_BATCH_SIZE}")
print(f"Average batch latency: {avg_batch_time*1000:.2f} ms (±{batch_std_time*1000:.2f} ms)")
print(f"Average per-image latency: {avg_single_time*1000:.2f} ms (±{single_std_time*1000:.2f} ms)")
print(f"Max batch latency: {max(batch_times)*1000:.2f} ms | max per-image latency: {max(single_times)*1000:.2f} ms")
print(f"Min batch latency: {min(batch_times)*1000:.2f} ms | min per-image latency: {min(single_times)*1000:.2f} ms")
print(f"Overall throughput: {batch_throughput:.2f} images/s")
print("="*60 + "\n")
return avg_batch_time, avg_single_time, batch_throughput, batch_times, single_times
if __name__ == "__main__":
parser = argparse.ArgumentParser("Grounding DINO 性能测试 (Batch推理)", add_help=True)
parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
parser.add_argument("--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file")
parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True, help="output directory")
parser.add_argument("--box_threshold", type=float, default=0.35, help="box threshold")
parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
parser.add_argument("--token_spans", type=str, default=None, help=
"The positions of start and end positions of phrases of interest. \
For example, a caption is 'a cat and a dog', \
if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
")
parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
# 新增性能测试参数
parser.add_argument("--warmup-runs", type=int, default=5, help="预热运行次数,默认5")
parser.add_argument("--test-runs", type=int, default=10, help="正式测试运行次数,默认10")
args = parser.parse_args()
# cfg
config_file = args.config_file
checkpoint_path = args.checkpoint_path
image_path = args.image_path
text_prompt = args.text_prompt
output_dir = args.output_dir
box_threshold = args.box_threshold
text_threshold = args.text_threshold
token_spans = args.token_spans
warmup_runs = args.warmup_runs
test_runs = args.test_runs
# print basic environment info
print(f"📌 Device: {'GPU' if not args.cpu_only else 'CPU'}")
print(f"📌 Inference batch size: {INFERENCE_BATCH_SIZE}")
if not args.cpu_only and torch.cuda.is_available():
print(f"📌 GPU model: {torch.cuda.get_device_name(0)}")
print(f"📌 GPU index: {torch.cuda.current_device()}")
# make dir
os.makedirs(output_dir, exist_ok=True)
# load image
image_pil, image = load_image(image_path)
# load model
model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
# visualize raw image
image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
# set the text_threshold to None if token_spans is set.
if token_spans is not None:
text_threshold = None
print("Using token_spans. Set the text_threshold to None.")
# run the performance benchmark
avg_batch_time, avg_single_time, throughput, batch_times, single_times = benchmark_performance(
model, image, text_prompt, box_threshold, text_threshold,
args.cpu_only, eval(f"{token_spans}") if token_spans else None,
warmup_runs, test_runs
)
# single inference run and result saving (preserves the original functionality)
print("\n=== Generating the visualization image ===")
boxes_filt, pred_phrases, batch_infer_time, single_infer_time = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold,
cpu_only=args.cpu_only,
token_spans=eval(f"{token_spans}") if token_spans else None
)
# visualize pred
size = image_pil.size
pred_dict = {
"boxes": boxes_filt,
"size": [size[1], size[0]], # H,W
"labels": pred_phrases,
}
image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
image_with_box.save(os.path.join(output_dir, "pred.jpg"))
# save the performance results to a file
performance_file = os.path.join(output_dir, "performance_report.txt")
with open(performance_file, "w", encoding="utf-8") as f:
f.write("="*60 + "\n")
f.write(f"Grounding DINO 性能测试报告 (Batch Size = {INFERENCE_BATCH_SIZE})\n")
f.write("="*60 + "\n")
f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"测试设备: {'GPU' if not args.cpu_only else 'CPU'}\n")
if not args.cpu_only and torch.cuda.is_available():
f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
f.write(f"Batch Size: {INFERENCE_BATCH_SIZE}\n")
f.write(f"预热次数: {warmup_runs}\n")
f.write(f"测试次数: {test_runs}\n")
f.write(f"平均Batch推理时延: {avg_batch_time*1000:.2f} ms\n")
f.write(f"Batch时延标准差: {np.std(batch_times)*1000:.2f} ms\n")
f.write(f"平均单张推理时延: {avg_single_time*1000:.2f} ms\n")
f.write(f"单张时延标准差: {np.std(single_times)*1000:.2f} ms\n")
f.write(f"最大Batch时延: {max(batch_times)*1000:.2f} ms\n")
f.write(f"最小Batch时延: {min(batch_times)*1000:.2f} ms\n")
f.write(f"总吞吐量: {throughput:.2f} 张/秒\n")
f.write(f"最后一次Batch推理时延: {batch_infer_time*1000:.2f} ms\n")
f.write(f"最后一次单张推理时延: {single_infer_time*1000:.2f} ms\n")
print(f"\n✅ 性能报告已保存至: {performance_file}")
print(f"✅ 推理结果图片已保存至: {os.path.join(output_dir, 'pred.jpg')}")
import argparse
import os
import time
from typing import List, Tuple
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.util.utils import get_phrases_from_posmap
from groundingdino.util import get_tokenlizer
from groundingdino.util.slconfig import SLConfig
from groundingdino.models.GroundingDINO.bertwarper import (
generate_masks_with_special_tokens_and_transfer_map,
)
def plot_boxes_to_image(image_pil, tgt):
H, W = tgt["size"]
boxes = tgt["boxes"]
labels = tgt["labels"]
assert len(boxes) == len(labels), "boxes and labels must have same length"
draw = ImageDraw.Draw(image_pil)
mask = Image.new("L", image_pil.size, 0)
mask_draw = ImageDraw.Draw(mask)
for box, label in zip(boxes, labels):
box = box * torch.Tensor([W, H, W, H])
box[:2] -= box[2:] / 2
box[2:] += box[:2]
color = tuple(np.random.randint(0, 255, size=3).tolist())
x0, y0, x1, y1 = box
x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
font = ImageFont.load_default()
if hasattr(font, "getbbox"):
bbox = draw.textbbox((x0, y0), str(label), font)
else:
w, h = draw.textsize(str(label), font)
bbox = (x0, y0, w + x0, y0 + h)
draw.rectangle(bbox, fill=color)
draw.text((x0, y0), str(label), fill="white")
mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
return image_pil, mask
def load_image(image_path):
image_pil = Image.open(image_path).convert("RGB")
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
image, _ = transform(image_pil, None) # 3,h,w
return image_pil, image
def preprocess_caption(caption: str) -> str:
caption = caption.lower().strip()
if not caption.endswith("."):
caption = caption + "."
return caption
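# e.g. preprocess_caption("A cat and a Dog") -> "a cat and a dog."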
def sigmoid(x: np.ndarray) -> np.ndarray:
return 1.0 / (1.0 + np.exp(-x))
def build_text_tensors(
config_file: str,
caption: str,
device: str,
):
cfg = SLConfig.fromfile(config_file)
tokenizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
special_token_ids = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])
caption = preprocess_caption(caption)
tokenized = tokenizer([caption], padding="longest", return_tensors="pt")
tokenized = {k: v.to(device) for k, v in tokenized.items()}
text_self_attention_masks, position_ids, _ = generate_masks_with_special_tokens_and_transfer_map(
tokenized, special_token_ids, tokenizer
)
max_text_len = getattr(cfg, "max_text_len", 256)
if text_self_attention_masks.shape[1] > max_text_len:
s = max_text_len
text_self_attention_masks = text_self_attention_masks[:, :s, :s]
position_ids = position_ids[:, :s]
tokenized["input_ids"] = tokenized["input_ids"][:, :s]
tokenized["attention_mask"] = tokenized["attention_mask"][:, :s]
tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :s]
# Also return the tokenizer and a single-sentence tokenization (to match the behaviour of get_phrases_from_posmap)
tokenized_single = tokenizer(caption)
return (
cfg,
tokenizer,
tokenized_single,
tokenized["input_ids"].to(torch.int64),
tokenized["token_type_ids"].to(torch.int64),
tokenized["attention_mask"].to(torch.int64),
position_ids.to(torch.int64),
text_self_attention_masks,
)
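# Shapes of the tensors returned above (S = token count after the optional truncation
# to max_text_len): input_ids / token_type_ids / attention_mask / position_ids are
# (1, S); text_self_attention_masks is (1, S, S).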
def ort_create_session(onnx_path: str, device: str, num_threads: int = 0):
import onnxruntime as ort
so = ort.SessionOptions()
if num_threads and num_threads > 0:
so.intra_op_num_threads = int(num_threads)
so.inter_op_num_threads = int(num_threads)
providers = ["CPUExecutionProvider"]
if device == "cuda":
# if onnxruntime-gpu is installed, the CUDA provider is enabled automatically
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
return ort.InferenceSession(onnx_path, sess_options=so, providers=providers)
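# Note: if the CUDA provider is requested but unavailable, onnxruntime silently falls
# back to CPUExecutionProvider; sess.get_providers() reports what is actually in use.
# Example (model path is illustrative):
#   sess = ort_create_session("groundingdino.onnx", device="cuda")
#   print(sess.get_providers())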
def onnx_infer_once(
sess,
image: torch.Tensor,
input_ids: torch.Tensor,
token_type_ids: torch.Tensor,
attention_mask: torch.Tensor,
position_ids: torch.Tensor,
text_self_attention_masks: torch.Tensor,
use_cuda_sync: bool,
) -> Tuple[np.ndarray, np.ndarray, float]:
# ORT inputs must be numpy arrays
feeds = {
"image": image[None].detach().cpu().numpy().astype(np.float32),
"input_ids": input_ids.detach().cpu().numpy().astype(np.int64),
"token_type_ids": token_type_ids.detach().cpu().numpy().astype(np.int64),
"attention_mask": attention_mask.detach().cpu().numpy().astype(np.int64),
"position_ids": position_ids.detach().cpu().numpy().astype(np.int64),
"text_self_attention_masks": text_self_attention_masks.detach().cpu().numpy(),
}
if use_cuda_sync:
torch.cuda.synchronize()
start = time.perf_counter()
pred_logits, pred_boxes = sess.run(["pred_logits", "pred_boxes"], feeds)
if use_cuda_sync:
torch.cuda.synchronize()
infer_time = time.perf_counter() - start
return pred_logits, pred_boxes, infer_time
def postprocess_and_phrases(
pred_logits: np.ndarray, # [B,NQ,S]
pred_boxes: np.ndarray, # [B,NQ,4]
tokenized_single,
tokenizer,
box_threshold: float,
text_threshold: float,
with_logits: bool = True,
):
# match the torch version: take batch index 0
logits = sigmoid(pred_logits[0]) # [NQ,S]
boxes = pred_boxes[0] # [NQ,4]
max_per_query = logits.max(axis=1)
mask = max_per_query > box_threshold
logits_filt = logits[mask]
boxes_filt = boxes[mask]
pred_phrases: List[str] = []
for logit in logits_filt:
posmap = torch.from_numpy(logit) > text_threshold
phrase = get_phrases_from_posmap(posmap, tokenized_single, tokenizer)
phrase = phrase.replace(".", "")
if with_logits:
pred_phrases.append(phrase + f"({str(float(logit.max()))[:4]})")
else:
pred_phrases.append(phrase)
return torch.from_numpy(boxes_filt), pred_phrases
def benchmark_performance_onnx(
sess,
image: torch.Tensor,
input_ids: torch.Tensor,
token_type_ids: torch.Tensor,
attention_mask: torch.Tensor,
position_ids: torch.Tensor,
text_self_attention_masks: torch.Tensor,
warmup_runs: int = 5,
test_runs: int = 10,
use_cuda_sync: bool = False,
):
print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
for i in range(warmup_runs):
_ = onnx_infer_once(
sess,
image,
input_ids,
token_type_ids,
attention_mask,
position_ids,
text_self_attention_masks,
use_cuda_sync=use_cuda_sync,
)
print(f"预热完成 {i+1}/{warmup_runs}")
print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
total_time = 0.0
infer_times = []
for i in range(test_runs):
_, _, infer_time = onnx_infer_once(
sess,
image,
input_ids,
token_type_ids,
attention_mask,
position_ids,
text_self_attention_masks,
use_cuda_sync=use_cuda_sync,
)
infer_times.append(infer_time)
total_time += infer_time
print(f"测试 {i+1}/{test_runs} - 单次推理时延: {infer_time*1000:.2f} ms")
avg_infer_time = total_time / test_runs
fps = test_runs / total_time
std_infer_time = float(np.std(infer_times))
print("\n" + "=" * 50)
print("📊 ONNX 性能测试报告")
print("=" * 50)
print(f"测试环境: {'GPU (CUDAExecutionProvider)' if use_cuda_sync else 'CPU/Unknown'}")
print(f"测试次数: {test_runs} 次 (预热 {warmup_runs} 次)")
print(f"平均推理时延: {avg_infer_time*1000:.2f} ms (±{std_infer_time*1000:.2f} ms)")
print(f"最大推理时延: {max(infer_times)*1000:.2f} ms")
print(f"最小推理时延: {min(infer_times)*1000:.2f} ms")
print(f"平均FPS: {fps:.2f} 帧/秒")
print("=" * 50 + "\n")
return avg_infer_time, fps, infer_times
if __name__ == "__main__":
parser = argparse.ArgumentParser("Grounding DINO ONNX 推理与性能测试", add_help=True)
parser.add_argument("--onnx_path", type=str, required=True, help="onnx 模型路径")
parser.add_argument("--config_file", "-c", type=str, required=True, help="用于加载 tokenizer 等配置")
parser.add_argument("--image_path", "-i", type=str, required=True)
parser.add_argument("--text_prompt", "-t", type=str, required=True)
parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True)
parser.add_argument("--box_threshold", type=float, default=0.3)
parser.add_argument("--text_threshold", type=float, default=0.25)
parser.add_argument("--cpu-only", action="store_true")
parser.add_argument("--warmup-runs", type=int, default=5)
parser.add_argument("--test-runs", type=int, default=10)
parser.add_argument("--ort-threads", type=int, default=0, help="onnxruntime 线程数(0=默认)")
args = parser.parse_args()
device = "cpu" if args.cpu_only else ("cuda" if torch.cuda.is_available() else "cpu")
use_cuda_sync = device == "cuda"
print(f"📌 ORT 设备偏好: {device}")
if use_cuda_sync:
print(f"📌 GPU型号: {torch.cuda.get_device_name(0)}")
os.makedirs(args.output_dir, exist_ok=True)
image_pil, image = load_image(args.image_path)
image_pil.save(os.path.join(args.output_dir, "raw_image.jpg"))
(
_cfg,
tokenizer,
tokenized_single,
input_ids,
token_type_ids,
attention_mask,
position_ids,
text_self_attention_masks,
) = build_text_tensors(args.config_file, args.text_prompt, device="cpu")
# Timing sync is more accurate with tensors on the GPU, but the ORT feeds end up as
# numpy (CPU) anyway; we only keep parity with the torch version here:
# timing and visualization logic are preserved, while the model forward itself runs through ORT
sess = ort_create_session(args.onnx_path, device=device, num_threads=args.ort_threads)
avg_infer_time, fps, infer_times = benchmark_performance_onnx(
sess,
image,
input_ids,
token_type_ids,
attention_mask,
position_ids,
text_self_attention_masks,
warmup_runs=args.warmup_runs,
test_runs=args.test_runs,
use_cuda_sync=use_cuda_sync,
)
print("\n=== 生成推理结果图片 ===")
pred_logits, pred_boxes, single_infer_time = onnx_infer_once(
sess,
image,
input_ids,
token_type_ids,
attention_mask,
position_ids,
text_self_attention_masks,
use_cuda_sync=use_cuda_sync,
)
boxes_filt, pred_phrases = postprocess_and_phrases(
pred_logits=pred_logits,
pred_boxes=pred_boxes,
tokenized_single=tokenized_single,
tokenizer=tokenizer,
box_threshold=args.box_threshold,
text_threshold=args.text_threshold,
with_logits=True,
)
size = image_pil.size
pred_dict = {
"boxes": boxes_filt,
"size": [size[1], size[0]], # H,W
"labels": pred_phrases,
}
image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
image_with_box.save(os.path.join(args.output_dir, "pred.jpg"))
performance_file = os.path.join(args.output_dir, "performance_report_onnx.txt")
with open(performance_file, "w", encoding="utf-8") as f:
f.write("=" * 50 + "\n")
f.write("Grounding DINO ONNX 性能测试报告\n")
f.write("=" * 50 + "\n")
f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"推理后端: onnxruntime\n")
f.write(f"设备偏好: {device}\n")
if use_cuda_sync:
f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
f.write(f"预热次数: {args.warmup_runs}\n")
f.write(f"测试次数: {args.test_runs}\n")
f.write(f"平均推理时延: {avg_infer_time*1000:.2f} ms\n")
f.write(f"时延标准差: {np.std(infer_times)*1000:.2f} ms\n")
f.write(f"最大时延: {max(infer_times)*1000:.2f} ms\n")
f.write(f"最小时延: {min(infer_times)*1000:.2f} ms\n")
f.write(f"平均FPS: {fps:.2f} 帧/秒\n")
f.write(f"单次推理时延(最后一次): {single_infer_time*1000:.2f} ms\n")
print(f"\n✅ 性能报告已保存至: {performance_file}")
print(f"✅ 推理结果图片已保存至: {os.path.join(args.output_dir, 'pred.jpg')}")
import argparse
import os
import sys
import time
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
def plot_boxes_to_image(image_pil, tgt):
H, W = tgt["size"]
boxes = tgt["boxes"]
labels = tgt["labels"]
assert len(boxes) == len(labels), "boxes and labels must have same length"
draw = ImageDraw.Draw(image_pil)
mask = Image.new("L", image_pil.size, 0)
mask_draw = ImageDraw.Draw(mask)
# draw boxes and masks
for box, label in zip(boxes, labels):
# from 0..1 to 0..W, 0..H
box = box * torch.Tensor([W, H, W, H])
# from xywh to xyxy
box[:2] -= box[2:] / 2
box[2:] += box[:2]
# random color
color = tuple(np.random.randint(0, 255, size=3).tolist())
# draw
x0, y0, x1, y1 = box
x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
# draw.text((x0, y0), str(label), fill=color)
font = ImageFont.load_default()
if hasattr(font, "getbbox"):
bbox = draw.textbbox((x0, y0), str(label), font)
else:
w, h = draw.textsize(str(label), font)
bbox = (x0, y0, w + x0, y0 + h)
# bbox = draw.textbbox((x0, y0), str(label))
draw.rectangle(bbox, fill=color)
draw.text((x0, y0), str(label), fill="white")
mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
return image_pil, mask
def load_image(image_path):
# load image
image_pil = Image.open(image_path).convert("RGB") # load image
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
image, _ = transform(image_pil, None) # 3, h, w
return image_pil, image
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
print(load_res)
_ = model.eval()
return model
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
assert text_threshold is not None or token_spans is not None, "text_threshold and token_spans must not both be None!"
caption = caption.lower()
caption = caption.strip()
if not caption.endswith("."):
caption = caption + "."
device = "cuda" if not cpu_only else "cpu"
model = model.to(device)
image = image.to(device)
# Time the core inference
if device == "cuda":
torch.cuda.synchronize() # wait for pending GPU work so timing is accurate
start_time = time.perf_counter() # high-resolution timer
with torch.no_grad():
outputs = model(image[None], captions=[caption])
if device == "cuda":
torch.cuda.synchronize() # wait for the GPU forward pass to finish
infer_time = time.perf_counter() - start_time # inference time (seconds)
logits = outputs["pred_logits"].sigmoid()[0] # (nq, 256)
boxes = outputs["pred_boxes"][0] # (nq, 4)
# filter output
if token_spans is None:
logits_filt = logits.cpu().clone()
boxes_filt = boxes.cpu().clone()
filt_mask = logits_filt.max(dim=1)[0] > box_threshold
logits_filt = logits_filt[filt_mask] # num_filt, 256
boxes_filt = boxes_filt[filt_mask] # num_filt, 4
# get phrase
tokenlizer = model.tokenizer
tokenized = tokenlizer(caption)
# build pred
pred_phrases = []
for logit, box in zip(logits_filt, boxes_filt):
pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
if with_logits:
pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
else:
pred_phrases.append(pred_phrase)
else:
# given-phrase mode
positive_maps = create_positive_map_from_span(
model.tokenizer(caption), # use the processed caption, not the global text_prompt
token_span=token_spans
).to(image.device) # n_phrase, 256
logits_for_phrases = positive_maps @ logits.T # n_phrase, nq
all_logits = []
all_phrases = []
all_boxes = []
for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
# get phrase
phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
# get mask
filt_mask = logit_phr > box_threshold
# filt box
all_boxes.append(boxes[filt_mask])
# filt logits
all_logits.append(logit_phr[filt_mask])
if with_logits:
logit_phr_num = logit_phr[filt_mask]
all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
else:
all_phrases.extend([phrase for _ in range(int(filt_mask.sum()))]) # one label per kept box, not per query
boxes_filt = torch.cat(all_boxes, dim=0).cpu()
pred_phrases = all_phrases
return boxes_filt, pred_phrases, infer_time
def benchmark_performance(model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, warmup_runs=5, test_runs=10):
"""
性能基准测试:预热 + 多次推理计算平均FPS和时延
Args:
warmup_runs: 预热次数(排除初始加载的影响)
test_runs: 正式测试次数
"""
# 1. 预热阶段(忽略耗时)
print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
for i in range(warmup_runs):
_, _, _ = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans
)
print(f"预热完成 {i+1}/{warmup_runs}")
# 2. 正式测试阶段
print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
total_time = 0.0
infer_times = [] # 记录每次推理的时延
for i in range(test_runs):
_, _, infer_time = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans
)
infer_times.append(infer_time)
total_time += infer_time
print(f"测试 {i+1}/{test_runs} - 单次推理时延: {infer_time*1000:.2f} ms")
# 3. Compute performance metrics
avg_infer_time = total_time / test_runs # average latency (seconds)
fps = test_runs / total_time # average FPS
std_infer_time = np.std(infer_times) # latency standard deviation (stability)
# 4. Print the performance report
print("\n" + "="*50)
print("📊 Performance report")
print("="*50)
print(f"Environment: {'GPU (CUDA)' if not cpu_only else 'CPU'}")
print(f"Runs: {test_runs} (warm-up: {warmup_runs})")
print(f"Average latency: {avg_infer_time*1000:.2f} ms (±{std_infer_time*1000:.2f} ms)")
print(f"Max latency: {max(infer_times)*1000:.2f} ms")
print(f"Min latency: {min(infer_times)*1000:.2f} ms")
print(f"Average FPS: {fps:.2f} frames/s")
print("="*50 + "\n")
return avg_infer_time, fps, infer_times
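# Note: fps above is the reciprocal of the average forward-pass latency at batch
# size 1; it excludes image loading, pre-processing, and post-processing.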
if __name__ == "__main__":
parser = argparse.ArgumentParser("Grounding DINO 性能测试", add_help=True)
parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
parser.add_argument("--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file")
parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True, help="output directory")
parser.add_argument("--box_threshold", type=float, default=0.35, help="box threshold")
parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
parser.add_argument("--token_spans", type=str, default=None, help=
"The positions of start and end positions of phrases of interest. \
For example, a caption is 'a cat and a dog', \
if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
")
parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
# 新增性能测试参数
parser.add_argument("--warmup-runs", type=int, default=5, help="预热运行次数,默认5")
parser.add_argument("--test-runs", type=int, default=10, help="正式测试运行次数,默认10")
args = parser.parse_args()
# cfg
config_file = args.config_file
checkpoint_path = args.checkpoint_path
image_path = args.image_path
text_prompt = args.text_prompt
output_dir = args.output_dir
box_threshold = args.box_threshold
text_threshold = args.text_threshold
token_spans = args.token_spans
warmup_runs = args.warmup_runs
test_runs = args.test_runs
# print basic environment info
print(f"📌 Device: {'GPU' if not args.cpu_only else 'CPU'}")
if not args.cpu_only and torch.cuda.is_available():
print(f"📌 GPU model: {torch.cuda.get_device_name(0)}")
print(f"📌 GPU index: {torch.cuda.current_device()}")
# make dir
os.makedirs(output_dir, exist_ok=True)
# load image
image_pil, image = load_image(image_path)
# load model
model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
# visualize raw image
image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
# set the text_threshold to None if token_spans is set.
if token_spans is not None:
text_threshold = None
print("Using token_spans. Set the text_threshold to None.")
# run the performance benchmark
avg_infer_time, fps, infer_times = benchmark_performance(
model, image, text_prompt, box_threshold, text_threshold,
args.cpu_only, eval(f"{token_spans}") if token_spans else None,
warmup_runs, test_runs
)
# single inference run and result saving (preserves the original functionality)
print("\n=== Generating the visualization image ===")
boxes_filt, pred_phrases, single_infer_time = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold,
cpu_only=args.cpu_only,
token_spans=eval(f"{token_spans}") if token_spans else None
)
# visualize pred
size = image_pil.size
pred_dict = {
"boxes": boxes_filt,
"size": [size[1], size[0]], # H,W
"labels": pred_phrases,
}
image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
image_with_box.save(os.path.join(output_dir, "pred.jpg"))
# save the performance results to a file
performance_file = os.path.join(output_dir, "performance_report.txt")
with open(performance_file, "w", encoding="utf-8") as f:
f.write("="*50 + "\n")
f.write("Grounding DINO 性能测试报告\n")
f.write("="*50 + "\n")
f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"测试设备: {'GPU' if not args.cpu_only else 'CPU'}\n")
if not args.cpu_only and torch.cuda.is_available():
f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
f.write(f"预热次数: {warmup_runs}\n")
f.write(f"测试次数: {test_runs}\n")
f.write(f"平均推理时延: {avg_infer_time*1000:.2f} ms\n")
f.write(f"时延标准差: {np.std(infer_times)*1000:.2f} ms\n")
f.write(f"最大时延: {max(infer_times)*1000:.2f} ms\n")
f.write(f"最小时延: {min(infer_times)*1000:.2f} ms\n")
f.write(f"平均FPS: {fps:.2f} 帧/秒\n")
f.write(f"单次推理时延(最后一次): {single_infer_time*1000:.2f} ms\n")
print(f"\n✅ 性能报告已保存至: {performance_file}")
print(f"✅ 推理结果图片已保存至: {os.path.join(output_dir, 'pred.jpg')}")
import argparse
import os
import sys
import time
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
def plot_boxes_to_image(image_pil, tgt):
H, W = tgt["size"]
boxes = tgt["boxes"]
labels = tgt["labels"]
assert len(boxes) == len(labels), "boxes and labels must have same length"
draw = ImageDraw.Draw(image_pil)
mask = Image.new("L", image_pil.size, 0)
mask_draw = ImageDraw.Draw(mask)
# draw boxes and masks
for box, label in zip(boxes, labels):
# from 0..1 to 0..W, 0..H
box = box * torch.Tensor([W, H, W, H])
# from xywh to xyxy
box[:2] -= box[2:] / 2
box[2:] += box[:2]
# random color
color = tuple(np.random.randint(0, 255, size=3).tolist())
# draw
x0, y0, x1, y1 = box
x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
# draw.text((x0, y0), str(label), fill=color)
font = ImageFont.load_default()
if hasattr(font, "getbbox"):
bbox = draw.textbbox((x0, y0), str(label), font)
else:
w, h = draw.textsize(str(label), font)
bbox = (x0, y0, w + x0, y0 + h)
# bbox = draw.textbbox((x0, y0), str(label))
draw.rectangle(bbox, fill=color)
draw.text((x0, y0), str(label), fill="white")
mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
return image_pil, mask
def load_image(image_path):
# load image
image_pil = Image.open(image_path).convert("RGB") # load image
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
image, _ = transform(image_pil, None) # 3, h, w
return image_pil, image
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
print(load_res)
_ = model.eval()
return model
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None, use_fp16=False):
assert text_threshold is not None or token_spans is not None, "text_threshold and token_spans must not both be None!"
caption = caption.lower()
caption = caption.strip()
if not caption.endswith("."):
caption = caption + "."
device = "cuda" if not cpu_only else "cpu"
model = model.to(device)
image = image.to(device)
# Time the core inference
if device == "cuda":
torch.cuda.synchronize() # wait for pending GPU work so timing is accurate
start_time = time.perf_counter() # high-resolution timer
with torch.no_grad():
if use_fp16 and device == "cuda":
with torch.autocast(device_type="cuda", dtype=torch.float16):
outputs = model(image[None], captions=[caption])
else:
outputs = model(image[None], captions=[caption])
if device == "cuda":
torch.cuda.synchronize() # wait for the GPU forward pass to finish
infer_time = time.perf_counter() - start_time # inference time (seconds)
# Cast outputs back to FP32 so the downstream CPU operations are safe
logits = outputs["pred_logits"].float().sigmoid()[0]
boxes = outputs["pred_boxes"].float()[0]
# filter output
if token_spans is None:
logits_filt = logits.cpu().clone()
boxes_filt = boxes.cpu().clone()
filt_mask = logits_filt.max(dim=1)[0] > box_threshold
logits_filt = logits_filt[filt_mask] # num_filt, 256
boxes_filt = boxes_filt[filt_mask] # num_filt, 4
# get phrase
tokenlizer = model.tokenizer
tokenized = tokenlizer(caption)
# build pred
pred_phrases = []
for logit, box in zip(logits_filt, boxes_filt):
pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
if with_logits:
pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
else:
pred_phrases.append(pred_phrase)
else:
# given-phrase mode
positive_maps = create_positive_map_from_span(
model.tokenizer(caption), # use the processed caption, not the global text_prompt
token_span=token_spans
).to(image.device) # n_phrase, 256
logits_for_phrases = positive_maps @ logits.T # n_phrase, nq
all_logits = []
all_phrases = []
all_boxes = []
for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
# get phrase
phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
# get mask
filt_mask = logit_phr > box_threshold
# filt box
all_boxes.append(boxes[filt_mask])
# filt logits
all_logits.append(logit_phr[filt_mask])
if with_logits:
logit_phr_num = logit_phr[filt_mask]
all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
else:
all_phrases.extend([phrase for _ in range(int(filt_mask.sum()))]) # one label per kept box, not per query
boxes_filt = torch.cat(all_boxes, dim=0).cpu()
pred_phrases = all_phrases
return boxes_filt, pred_phrases, infer_time
def benchmark_performance(model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, warmup_runs=5, test_runs=10, use_fp16=False):
"""
性能基准测试:预热 + 多次推理计算平均FPS和时延
Args:
warmup_runs: 预热次数(排除初始加载的影响)
test_runs: 正式测试次数
"""
# 1. 预热阶段(忽略耗时)
print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
for i in range(warmup_runs):
_, _, _ = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, use_fp16=use_fp16
)
print(f"预热完成 {i+1}/{warmup_runs}")
# 2. 正式测试阶段
print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
total_time = 0.0
infer_times = [] # 记录每次推理的时延
for i in range(test_runs):
_, _, infer_time = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, use_fp16=use_fp16
)
infer_times.append(infer_time)
total_time += infer_time
print(f"测试 {i+1}/{test_runs} - 单次推理时延: {infer_time*1000:.2f} ms")
# 3. Compute performance metrics
avg_infer_time = total_time / test_runs # average latency (seconds)
fps = test_runs / total_time # average FPS
std_infer_time = np.std(infer_times) # latency standard deviation (stability)
# 4. Print the performance report
print("\n" + "="*50)
print("📊 Performance report")
print("="*50)
print(f"Environment: {'GPU (CUDA)' if not cpu_only else 'CPU'}")
print(f"Runs: {test_runs} (warm-up: {warmup_runs})")
print(f"Average latency: {avg_infer_time*1000:.2f} ms (±{std_infer_time*1000:.2f} ms)")
print(f"Max latency: {max(infer_times)*1000:.2f} ms")
print(f"Min latency: {min(infer_times)*1000:.2f} ms")
print(f"Average FPS: {fps:.2f} frames/s")
print("="*50 + "\n")
return avg_infer_time, fps, infer_times
if __name__ == "__main__":
parser = argparse.ArgumentParser("Grounding DINO 性能测试", add_help=True)
parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
parser.add_argument("--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file")
parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True, help="output directory")
parser.add_argument("--box_threshold", type=float, default=0.35, help="box threshold")
parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
parser.add_argument("--token_spans", type=str, default=None, help=
"The positions of start and end positions of phrases of interest. \
For example, a caption is 'a cat and a dog', \
if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
")
parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
# 新增性能测试参数
parser.add_argument("--warmup-runs", type=int, default=5, help="预热运行次数,默认5")
parser.add_argument("--test-runs", type=int, default=10, help="正式测试运行次数,默认10")
parser.add_argument("--fp16", action="store_true", help="Enable FP16 inference")
args = parser.parse_args()
# cfg
config_file = args.config_file
checkpoint_path = args.checkpoint_path
image_path = args.image_path
text_prompt = args.text_prompt
output_dir = args.output_dir
box_threshold = args.box_threshold
text_threshold = args.text_threshold
token_spans = args.token_spans
warmup_runs = args.warmup_runs
test_runs = args.test_runs
# print basic environment info
print(f"📌 Device: {'GPU' if not args.cpu_only else 'CPU'}")
if not args.cpu_only and torch.cuda.is_available():
print(f"📌 GPU model: {torch.cuda.get_device_name(0)}")
print(f"📌 GPU index: {torch.cuda.current_device()}")
# make dir
os.makedirs(output_dir, exist_ok=True)
# load image
image_pil, image = load_image(image_path)
# load model
model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
# visualize raw image
image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
# set the text_threshold to None if token_spans is set.
if token_spans is not None:
text_threshold = None
print("Using token_spans. Set the text_threshold to None.")
# run the performance benchmark
avg_infer_time, fps, infer_times = benchmark_performance(
model, image, text_prompt, box_threshold, text_threshold,
args.cpu_only, eval(f"{token_spans}") if token_spans else None,
warmup_runs, test_runs, use_fp16=args.fp16
)
# single inference run and result saving (preserves the original functionality)
print("\n=== Generating the visualization image ===")
boxes_filt, pred_phrases, single_infer_time = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold,
cpu_only=args.cpu_only,
token_spans=eval(f"{token_spans}") if token_spans else None, use_fp16=args.fp16
)
# visualize pred
size = image_pil.size
pred_dict = {
"boxes": boxes_filt,
"size": [size[1], size[0]], # H,W
"labels": pred_phrases,
}
image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
image_with_box.save(os.path.join(output_dir, "pred.jpg"))
# save the performance results to a file
performance_file = os.path.join(output_dir, "performance_report.txt")
with open(performance_file, "w", encoding="utf-8") as f:
f.write("="*50 + "\n")
f.write("Grounding DINO 性能测试报告\n")
f.write("="*50 + "\n")
f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"测试设备: {'GPU' if not args.cpu_only else 'CPU'}\n")
if not args.cpu_only and torch.cuda.is_available():
f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
f.write(f"预热次数: {warmup_runs}\n")
f.write(f"测试次数: {test_runs}\n")
f.write(f"平均推理时延: {avg_infer_time*1000:.2f} ms\n")
f.write(f"时延标准差: {np.std(infer_times)*1000:.2f} ms\n")
f.write(f"最大时延: {max(infer_times)*1000:.2f} ms\n")
f.write(f"最小时延: {min(infer_times)*1000:.2f} ms\n")
f.write(f"平均FPS: {fps:.2f} 帧/秒\n")
f.write(f"单次推理时延(最后一次): {single_infer_time*1000:.2f} ms\n")
print(f"\n✅ 性能报告已保存至: {performance_file}")
print(f"✅ 推理结果图片已保存至: {os.path.join(output_dir, 'pred.jpg')}")
import argparse
import os
import sys
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
def plot_boxes_to_image(image_pil, tgt):
H, W = tgt["size"]
boxes = tgt["boxes"]
labels = tgt["labels"]
assert len(boxes) == len(labels), "boxes and labels must have same length"
draw = ImageDraw.Draw(image_pil)
mask = Image.new("L", image_pil.size, 0)
mask_draw = ImageDraw.Draw(mask)
# draw boxes and masks
for box, label in zip(boxes, labels):
# from 0..1 to 0..W, 0..H
box = box * torch.Tensor([W, H, W, H])
# from xywh to xyxy
box[:2] -= box[2:] / 2
box[2:] += box[:2]
# random color
color = tuple(np.random.randint(0, 255, size=3).tolist())
# draw
x0, y0, x1, y1 = box
x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
# draw.text((x0, y0), str(label), fill=color)
font = ImageFont.load_default()
if hasattr(font, "getbbox"):
bbox = draw.textbbox((x0, y0), str(label), font)
else:
w, h = draw.textsize(str(label), font)
bbox = (x0, y0, w + x0, y0 + h)
# bbox = draw.textbbox((x0, y0), str(label))
draw.rectangle(bbox, fill=color)
draw.text((x0, y0), str(label), fill="white")
mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
return image_pil, mask
def load_image(image_path):
# load image
image_pil = Image.open(image_path).convert("RGB") # load image
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
image, _ = transform(image_pil, None) # 3, h, w
return image_pil, image
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
print(load_res)
_ = model.eval()
return model
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
assert text_threshold is not None or token_spans is not None, "text_threshold and token_spans must not both be None!"
caption = caption.lower()
caption = caption.strip()
if not caption.endswith("."):
caption = caption + "."
device = "cuda" if not cpu_only else "cpu"
model = model.to(device)
image = image.to(device)
with torch.no_grad():
outputs = model(image[None], captions=[caption])
logits = outputs["pred_logits"].sigmoid()[0] # (nq, 256)
boxes = outputs["pred_boxes"][0] # (nq, 4)
# filter output
if token_spans is None:
logits_filt = logits.cpu().clone()
boxes_filt = boxes.cpu().clone()
filt_mask = logits_filt.max(dim=1)[0] > box_threshold
logits_filt = logits_filt[filt_mask] # num_filt, 256
boxes_filt = boxes_filt[filt_mask] # num_filt, 4
# get phrase
tokenlizer = model.tokenizer
tokenized = tokenlizer(caption)
# build pred
pred_phrases = []
for logit, box in zip(logits_filt, boxes_filt):
pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
if with_logits:
pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
else:
pred_phrases.append(pred_phrase)
else:
# given-phrase mode
positive_maps = create_positive_map_from_span(
model.tokenizer(caption), # use the processed caption, not the global text_prompt
token_span=token_spans
).to(image.device) # n_phrase, 256
logits_for_phrases = positive_maps @ logits.T # n_phrase, nq
all_logits = []
all_phrases = []
all_boxes = []
for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
# get phrase
phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
# get mask
filt_mask = logit_phr > box_threshold
# filt box
all_boxes.append(boxes[filt_mask])
# filt logits
all_logits.append(logit_phr[filt_mask])
if with_logits:
logit_phr_num = logit_phr[filt_mask]
all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
else:
all_phrases.extend([phrase for _ in range(int(filt_mask.sum()))]) # one label per kept box, not per query
boxes_filt = torch.cat(all_boxes, dim=0).cpu()
pred_phrases = all_phrases
return boxes_filt, pred_phrases
if __name__ == "__main__":
parser = argparse.ArgumentParser("Grounding DINO example", add_help=True)
parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
parser.add_argument(
"--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file"
)
parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
parser.add_argument(
"--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
)
parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
parser.add_argument("--token_spans", type=str, default=None, help=
"The positions of start and end positions of phrases of interest. \
For example, a caption is 'a cat and a dog', \
if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
")
parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
args = parser.parse_args()
# cfg
config_file = args.config_file # change the path of the model config file
checkpoint_path = args.checkpoint_path # change the path of the model
image_path = args.image_path
text_prompt = args.text_prompt
output_dir = args.output_dir
box_threshold = args.box_threshold
text_threshold = args.text_threshold
token_spans = args.token_spans
# make dir
os.makedirs(output_dir, exist_ok=True)
# load image
image_pil, image = load_image(image_path)
# load model
model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
# visualize raw image
image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
# set the text_threshold to None if token_spans is set.
if token_spans is not None:
text_threshold = None
print("Using token_spans. Set the text_threshold to None.")
# run model
boxes_filt, pred_phrases = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only=args.cpu_only, token_spans=eval(f"{token_spans}")
)
# visualize pred
size = image_pil.size
pred_dict = {
"boxes": boxes_filt,
"size": [size[1], size[0]], # H,W
"labels": pred_phrases,
}
image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
image_with_box.save(os.path.join(output_dir, "pred.jpg"))
import argparse
import os
import sys
import time
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, DistributedSampler
from groundingdino.models import build_model
import groundingdino.datasets.transforms as T
from groundingdino.util import box_ops, get_tokenlizer
from groundingdino.util.misc import clean_state_dict, collate_fn
from groundingdino.util.slconfig import SLConfig
# from torchvision.datasets import CocoDetection
import torchvision
from groundingdino.util.vl_utils import build_captions_and_token_span, create_positive_map_from_span
from groundingdino.datasets.cocogrounding_eval import CocoGroundingEvaluator
def load_model(model_config_path: str, model_checkpoint_path: str, device: str = "cuda"):
args = SLConfig.fromfile(model_config_path)
args.device = device
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
model.eval()
return model
class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(self, img_folder, ann_file, transforms):
super().__init__(img_folder, ann_file)
self._transforms = transforms
def __getitem__(self, idx):
img, target = super().__getitem__(idx) # target: list
w, h = img.size
boxes = [obj["bbox"] for obj in target]
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2] # xywh -> xyxy
boxes[:, 0::2].clamp_(min=0, max=w)
boxes[:, 1::2].clamp_(min=0, max=h)
# filt invalid boxes/masks/keypoints
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
boxes = boxes[keep]
target_new = {}
image_id = self.ids[idx]
target_new["image_id"] = image_id
target_new["boxes"] = boxes
target_new["orig_size"] = torch.as_tensor([int(h), int(w)])
if self._transforms is not None:
img, target = self._transforms(img, target_new)
return img, target
class PostProcessCocoGrounding(nn.Module):
""" This module converts the model's output into the format expected by the coco api"""
def __init__(self, num_select=300, coco_api=None, tokenlizer=None) -> None:
super().__init__()
self.num_select = num_select
assert coco_api is not None
category_dict = coco_api.dataset['categories']
cat_list = [item['name'] for item in category_dict]
captions, cat2tokenspan = build_captions_and_token_span(cat_list, True)
tokenspanlist = [cat2tokenspan[cat] for cat in cat_list]
positive_map = create_positive_map_from_span(
tokenlizer(captions), tokenspanlist) # 80, 256. normed
id_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 13, 12: 14, 13: 15, 14: 16, 15: 17, 16: 18, 17: 19, 18: 20, 19: 21, 20: 22, 21: 23, 22: 24, 23: 25, 24: 27, 25: 28, 26: 31, 27: 32, 28: 33, 29: 34, 30: 35, 31: 36, 32: 37, 33: 38, 34: 39, 35: 40, 36: 41, 37: 42, 38: 43, 39: 44, 40: 46,
41: 47, 42: 48, 43: 49, 44: 50, 45: 51, 46: 52, 47: 53, 48: 54, 49: 55, 50: 56, 51: 57, 52: 58, 53: 59, 54: 60, 55: 61, 56: 62, 57: 63, 58: 64, 59: 65, 60: 67, 61: 70, 62: 72, 63: 73, 64: 74, 65: 75, 66: 76, 67: 77, 68: 78, 69: 79, 70: 80, 71: 81, 72: 82, 73: 84, 74: 85, 75: 86, 76: 87, 77: 88, 78: 89, 79: 90}
# build a mapping from label_id to pos_map
new_pos_map = torch.zeros((91, 256))
for k, v in id_map.items():
new_pos_map[v] = positive_map[k]
self.positive_map = new_pos_map
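# id_map converts contiguous class indices (0..79) to the sparse COCO category
# ids (1..90); e.g. index 11 maps to category id 13 ("stop sign" in COCO).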
@torch.no_grad()
def forward(self, outputs, target_sizes, not_to_xyxy=False):
""" Perform the computation
Parameters:
outputs: raw outputs of the model
target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
For evaluation, this must be the original image size (before any data augmentation)
For visualization, this should be the image size after data augment, but before padding
"""
num_select = self.num_select
out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
# pos map to logit
prob_to_token = out_logits.sigmoid() # bs, 100, 256
pos_maps = self.positive_map.to(prob_to_token.device)
# (bs, 100, 256) @ (91, 256).T -> (bs, 100, 91)
prob_to_label = prob_to_token @ pos_maps.T
assert len(out_logits) == len(target_sizes)
assert target_sizes.shape[1] == 2
prob = prob_to_label
topk_values, topk_indexes = torch.topk(
prob.view(out_logits.shape[0], -1), num_select, dim=1)
scores = topk_values
topk_boxes = topk_indexes // prob.shape[2]
labels = topk_indexes % prob.shape[2]
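# The top-k runs over the flattened (num_queries * num_classes) scores, so each
# flat index decodes to a (query, label) pair; e.g. with 91 classes, flat index
# 1003 decodes to query 1003 // 91 = 11 and label 1003 % 91 = 2.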
if not_to_xyxy:
boxes = out_bbox
else:
boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
boxes = torch.gather(
boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
# and from relative [0, 1] to absolute [0, height] coordinates
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
boxes = boxes * scale_fct[:, None, :]
results = [{'scores': s, 'labels': l, 'boxes': b}
for s, l, b in zip(scores, labels, boxes)]
return results
def main(args):
# config
cfg = SLConfig.fromfile(args.config_file)
# build model
model = load_model(args.config_file, args.checkpoint_path)
model = model.to(args.device)
model = model.eval()
# build dataloader
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
dataset = CocoDetection(
args.image_dir, args.anno_path, transforms=transform)
data_loader = DataLoader(
dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn)
# build post processor
tokenlizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
postprocessor = PostProcessCocoGrounding(
num_select=args.num_select, coco_api=dataset.coco, tokenlizer=tokenlizer)
# build evaluator
evaluator = CocoGroundingEvaluator(
dataset.coco, iou_types=("bbox",), useCats=True)
# build captions
category_dict = dataset.coco.dataset['categories']
cat_list = [item['name'] for item in category_dict]
caption = " . ".join(cat_list) + ' .'
print("Input text prompt:", caption)
# run inference
start = time.time()
for i, (images, targets) in enumerate(data_loader):
# get images and captions
images = images.tensors.to(args.device)
bs = images.shape[0]
input_captions = [caption] * bs
# feed to the model
outputs = model(images, captions=input_captions)
orig_target_sizes = torch.stack(
[t["orig_size"] for t in targets], dim=0).to(images.device)
results = postprocessor(outputs, orig_target_sizes)
cocogrounding_res = {
target["image_id"]: output for target, output in zip(targets, results)}
evaluator.update(cocogrounding_res)
        if (i + 1) % 30 == 0:
            used_time = time.time() - start
            eta = len(data_loader) / (i + 1) * used_time - used_time
            print(
                f"processed {i + 1}/{len(data_loader)} images. time: {used_time:.2f}s, ETA: {eta:.2f}s")
evaluator.synchronize_between_processes()
evaluator.accumulate()
evaluator.summarize()
print("Final results:", evaluator.coco_eval["bbox"].stats.tolist())
if __name__ == "__main__":
parser = argparse.ArgumentParser(
"Grounding DINO eval on COCO", add_help=True)
# load model
parser.add_argument("--config_file", "-c", type=str,
required=True, help="path to config file")
parser.add_argument(
"--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file"
)
parser.add_argument("--device", type=str, default="cuda",
help="running device (default: cuda)")
# post processing
parser.add_argument("--num_select", type=int, default=300,
help="number of topk to select")
# coco info
parser.add_argument("--anno_path", type=str,
required=True, help="coco root")
parser.add_argument("--image_dir", type=str,
required=True, help="coco image dir")
parser.add_argument("--num_workers", type=int, default=4,
help="number of workers for dataloader")
args = parser.parse_args()
main(args)
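# Example invocation (script name and paths are illustrative, not taken from this repo):
#   python this_eval_script.py -c groundingdino/config/GroundingDINO_SwinT_OGC.py \
#       -p weights/groundingdino_swint_ogc.pth \
#       --anno_path annotations/instances_val2017.json --image_dir val2017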
import argparse
import os
import sys
import time
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, DistributedSampler
import torchvision
import onnxruntime as ort
import groundingdino.datasets.transforms as T
from groundingdino.util import box_ops, get_tokenlizer
from groundingdino.util.misc import collate_fn
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.vl_utils import build_captions_and_token_span, create_positive_map_from_span
from groundingdino.datasets.cocogrounding_eval import CocoGroundingEvaluator
class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(self, img_folder, ann_file, transforms):
super().__init__(img_folder, ann_file)
self._transforms = transforms
def __getitem__(self, idx):
img, target = super().__getitem__(idx)
w, h = img.size
boxes = [obj["bbox"] for obj in target]
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2] # xywh -> xyxy
boxes[:, 0::2].clamp_(min=0, max=w)
boxes[:, 1::2].clamp_(min=0, max=h)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
boxes = boxes[keep]
target_new = {}
image_id = self.ids[idx]
target_new["image_id"] = image_id
target_new["boxes"] = boxes
target_new["orig_size"] = torch.as_tensor([int(h), int(w)])
if self._transforms is not None:
img, target = self._transforms(img, target_new)
return img, target
class PostProcessCocoGrounding(nn.Module):
"""保持和原代码一致的后处理逻辑"""
def __init__(self, num_select=300, coco_api=None, tokenlizer=None) -> None:
super().__init__()
self.num_select = num_select
assert coco_api is not None
category_dict = coco_api.dataset['categories']
cat_list = [item['name'] for item in category_dict]
captions, cat2tokenspan = build_captions_and_token_span(cat_list, True)
tokenspanlist = [cat2tokenspan[cat] for cat in cat_list]
positive_map = create_positive_map_from_span(
tokenlizer(captions), tokenspanlist) # 80, 256. normed
id_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 13, 12: 14, 13: 15, 14: 16, 15: 17, 16: 18, 17: 19, 18: 20, 19: 21, 20: 22, 21: 23, 22: 24, 23: 25, 24: 27, 25: 28, 26: 31, 27: 32, 28: 33, 29: 34, 30: 35, 31: 36, 32: 37, 33: 38, 34: 39, 35: 40, 36: 41, 37: 42, 38: 43, 39: 44, 40: 46,
41: 47, 42: 48, 43: 49, 44: 50, 45: 51, 46: 52, 47: 53, 48: 54, 49: 55, 50: 56, 51: 57, 52: 58, 53: 59, 54: 60, 55: 61, 56: 62, 57: 63, 58: 64, 59: 65, 60: 67, 61: 70, 62: 72, 63: 73, 64: 74, 65: 75, 66: 76, 67: 77, 68: 78, 69: 79, 70: 80, 71: 81, 72: 82, 73: 84, 74: 85, 75: 86, 76: 87, 77: 88, 78: 89, 79: 90}
new_pos_map = torch.zeros((91, 256))
for k, v in id_map.items():
new_pos_map[v] = positive_map[k]
self.positive_map = new_pos_map
@torch.no_grad()
def forward(self, outputs, target_sizes, not_to_xyxy=False):
num_select = self.num_select
out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
        prob_to_token = torch.from_numpy(out_logits).sigmoid()  # ONNX Runtime returns numpy, so convert here
pos_maps = self.positive_map.to(prob_to_token.device)
prob_to_label = prob_to_token @ pos_maps.T
assert prob_to_label.shape[0] == len(target_sizes)
assert target_sizes.shape[1] == 2
prob = prob_to_label
topk_values, topk_indexes = torch.topk(
prob.view(prob_to_label.shape[0], -1), num_select, dim=1)
scores = topk_values
topk_boxes = topk_indexes // prob.shape[2]
labels = topk_indexes % prob.shape[2]
if not_to_xyxy:
boxes = torch.from_numpy(out_bbox)
else:
boxes = box_ops.box_cxcywh_to_xyxy(torch.from_numpy(out_bbox))
boxes = torch.gather(
boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
boxes = boxes * scale_fct[:, None, :]
results = [{'scores': s, 'labels': l, 'boxes': b}
for s, l, b in zip(scores, labels, boxes)]
return results
def load_onnx_model(onnx_path, device="cuda"):
"""加载ONNX模型并创建推理session"""
providers = ['CPUExecutionProvider']
if device == "cuda" and ort.get_device() == "GPU":
providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
session = ort.InferenceSession(
onnx_path,
providers=providers,
provider_options=[{'device_id': 0}] if "CUDAExecutionProvider" in providers else []
)
return session
def onnx_inference(session, images, captions):
"""ONNX模型推理(需匹配模型输入格式)"""
# 转换为numpy(ONNX Runtime不支持torch tensor)
images_np = images.cpu().numpy().astype(np.float32)
# 注意:此处需根据你的ONNX模型输入名调整(可通过netron查看)
# 假设模型输入为 "images" 和 "captions"(需根据实际情况修改)
input_feed = {
session.get_inputs()[0].name: images_np,
# 如果caption是文本token,需补充token化逻辑,此处假设已处理
# session.get_inputs()[1].name: captions_np
}
# 执行推理
outputs = session.run(None, input_feed)
# 解析输出(需匹配模型输出格式,假设输出为logits和bbox)
# 需根据你的ONNX模型输出调整维度和顺序
pred_logits = outputs[0] # 形状: [bs, 100, 256]
pred_boxes = outputs[1] # 形状: [bs, 100, 4]
return {"pred_logits": pred_logits, "pred_boxes": pred_boxes}
def main(args):
    # Config (mainly used for the tokenizer and post-processing)
    cfg = SLConfig.fromfile(args.config_file)
    # Load the ONNX model
    onnx_session = load_onnx_model(args.onnx_path, args.device)
    # Build the dataloader (same as the original code)
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
dataset = CocoDetection(
args.image_dir, args.anno_path, transforms=transform)
data_loader = DataLoader(
dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn)
    # Build the post-processor (same as the original code)
    tokenlizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
    postprocessor = PostProcessCocoGrounding(
        num_select=args.num_select, coco_api=dataset.coco, tokenlizer=tokenlizer)
    # Build the evaluator (same as the original code)
    evaluator = CocoGroundingEvaluator(
        dataset.coco, iou_types=("bbox",), useCats=True)
    # Build the text prompt (same as the original code)
category_dict = dataset.coco.dataset['categories']
cat_list = [item['name'] for item in category_dict]
caption = " . ".join(cat_list) + ' .'
print("Input text prompt:", caption)
    # Run inference
    start = time.time()
    for i, (images, targets) in enumerate(data_loader):
        # Preprocess images (same as the original code)
        images = images.tensors.to(args.device)
        bs = images.shape[0]
        input_captions = [caption] * bs
        # ONNX inference (replaces the original PyTorch forward pass)
        outputs = onnx_inference(onnx_session, images, input_captions)
        # Post-process (adapted to the ONNX output format)
        orig_target_sizes = torch.stack(
            [t["orig_size"] for t in targets], dim=0).to(args.device)
        results = postprocessor(outputs, orig_target_sizes)
        cocogrounding_res = {
            target["image_id"]: output for target, output in zip(targets, results)}
        evaluator.update(cocogrounding_res)
        # Print progress
        if (i + 1) % 30 == 0:
            used_time = time.time() - start
            eta = len(data_loader) / (i + 1) * used_time - used_time
            print(
                f"processed {i + 1}/{len(data_loader)} images. time: {used_time:.2f}s, ETA: {eta:.2f}s")
    # Aggregate the evaluation metrics
evaluator.synchronize_between_processes()
evaluator.accumulate()
evaluator.summarize()
print("Final results:", evaluator.coco_eval["bbox"].stats.tolist())
if __name__ == "__main__":
parser = argparse.ArgumentParser(
"Grounding DINO ONNX eval on COCO", add_help=True)
    # New: path to the ONNX model
parser.add_argument("--onnx_path", type=str, required=True, help="path to onnx model file")
    # Keep the original config-file argument (used for the tokenizer and post-processing)
parser.add_argument("--config_file", "-c", type=str,
required=True, help="path to config file")
parser.add_argument("--device", type=str, default="cuda",
help="running device (default: cuda)")
    # Post-processing
parser.add_argument("--num_select", type=int, default=300,
help="number of topk to select")
    # COCO dataset
parser.add_argument("--anno_path", type=str,
required=True, help="coco annotation path")
parser.add_argument("--image_dir", type=str,
required=True, help="coco image dir")
parser.add_argument("--num_workers", type=int, default=4,
help="number of workers for dataloader")
args = parser.parse_args()
main(args)
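# Example invocation (script name and paths are illustrative, not taken from this repo):
#   python this_onnx_eval_script.py --onnx_path weights/ground.onnx \
#       -c groundingdino/config/GroundingDINO_SwinB_cfg.py \
#       --anno_path annotations/instances_val2017.json --image_dir val2017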
from groundingdino.util.inference import load_model, load_image, predict, annotate
import torch
import cv2
model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.pyy", "weights/groundingdino_swint_ogc.pth")
model = model.to('cuda:0')
print(torch.cuda.is_available())
print('DONE!')
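# The predict/annotate helpers imported above are never exercised; a minimal
# end-to-end sketch with them might look like this (the image path and the
# thresholds are illustrative assumptions, not taken from this repo):
IMAGE_PATH = "assets/demo.jpg"
TEXT_PROMPT = "car . person ."

image_source, image = load_image(IMAGE_PATH)
boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=0.35,
    text_threshold=0.25,
)
annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
cv2.imwrite("annotated_demo.jpg", annotated_frame)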
name: dino
channels:
- pytorch
- nvidia
- conda-forge
- defaults
dependencies:
- addict=2.4.0=pyhd8ed1ab_2
- aiohttp=3.8.5=py39ha55989b_0
- aiosignal=1.3.1=pyhd8ed1ab_0
- asttokens=2.0.5=pyhd3eb1b0_0
- async-timeout=4.0.3=pyhd8ed1ab_0
- attrs=23.1.0=pyh71513ae_1
- aws-c-auth=0.7.0=h6f3c987_2
- aws-c-cal=0.6.0=h6ba3258_0
- aws-c-common=0.8.23=hcfcfb64_0
- aws-c-compression=0.2.17=h420beca_1
- aws-c-event-stream=0.3.1=had47b81_1
- aws-c-http=0.7.11=h72ba615_0
- aws-c-io=0.13.28=ha35c040_0
- aws-c-mqtt=0.8.14=h4941efa_2
- aws-c-s3=0.3.13=he04eaa7_2
- aws-c-sdkutils=0.1.11=h420beca_1
- aws-checksums=0.1.16=h420beca_1
- aws-crt-cpp=0.20.3=h247a981_4
- aws-sdk-cpp=1.10.57=h1a0519f_17
- backcall=0.2.0=pyhd3eb1b0_0
- blas=2.118=mkl
- blas-devel=3.9.0=18_win64_mkl
- brotli=1.0.9=hcfcfb64_9
- brotli-bin=1.0.9=hcfcfb64_9
- brotli-python=1.0.9=py39h99910a6_9
- bzip2=1.0.8=h8ffe710_4
- c-ares=1.19.1=hcfcfb64_0
- ca-certificates=2023.08.22=haa95532_0
- certifi=2023.7.22=py39haa95532_0
- charset-normalizer=3.2.0=pyhd8ed1ab_0
- click=8.1.7=win_pyh7428d3b_0
- colorama=0.4.6=pyhd8ed1ab_0
- comm=0.1.2=py39haa95532_0
- contourpy=1.1.1=py39h1f6ef14_1
- cuda-cccl=12.2.140=0
- cuda-cudart=11.8.89=0
- cuda-cudart-dev=11.8.89=0
- cuda-cupti=11.8.87=0
- cuda-libraries=11.8.0=0
- cuda-libraries-dev=11.8.0=0
- cuda-nvrtc=11.8.89=0
- cuda-nvrtc-dev=11.8.89=0
- cuda-nvtx=11.8.86=0
- cuda-profiler-api=12.2.140=0
- cuda-runtime=11.8.0=0
- cycler=0.11.0=pyhd8ed1ab_0
- cython=3.0.0=py39h2bbff1b_0
- dataclasses=0.8=pyhc8e2a94_3
- datasets=2.14.5=pyhd8ed1ab_0
- debugpy=1.6.7=py39hd77b12b_0
- decorator=5.1.1=pyhd3eb1b0_0
- dill=0.3.7=pyhd8ed1ab_0
- exceptiongroup=1.0.4=py39haa95532_0
- executing=0.8.3=pyhd3eb1b0_0
- filelock=3.12.4=pyhd8ed1ab_0
- fonttools=4.42.1=py39ha55989b_0
- freeglut=3.2.2=h63175ca_2
- freetype=2.12.1=hdaf720e_2
- frozenlist=1.4.0=py39ha55989b_1
- fsspec=2023.6.0=pyh1a96a4e_0
- gettext=0.21.1=h5728263_0
- glib=2.78.0=h12be248_0
- glib-tools=2.78.0=h12be248_0
- gst-plugins-base=1.22.6=h001b923_1
- gstreamer=1.22.6=hb4038d2_1
- huggingface_hub=0.17.3=pyhd8ed1ab_0
- icu=70.1=h0e60522_0
- idna=3.4=pyhd8ed1ab_0
- importlib-metadata=6.8.0=pyha770c72_0
- importlib-resources=6.1.0=pyhd8ed1ab_0
- importlib_metadata=6.8.0=hd8ed1ab_0
- importlib_resources=6.1.0=pyhd8ed1ab_0
- intel-openmp=2023.2.0=h57928b3_49503
- ipykernel=6.25.0=py39h9909e9c_0
- ipython=8.15.0=py39haa95532_0
- jasper=2.0.33=hc2e4405_1
- jedi=0.18.1=py39haa95532_1
- jinja2=3.1.2=pyhd8ed1ab_1
- joblib=1.3.2=pyhd8ed1ab_0
- jpeg=9e=hcfcfb64_3
- jupyter_client=8.1.0=py39haa95532_0
- jupyter_core=5.3.0=py39haa95532_0
- kiwisolver=1.4.5=py39h1f6ef14_1
- krb5=1.20.1=heb0366b_0
- lcms2=2.14=h90d422f_0
- lerc=4.0.0=h63175ca_0
- libabseil=20230125.3=cxx17_h63175ca_0
- libarrow=12.0.1=h12e5d06_5_cpu
- libblas=3.9.0=18_win64_mkl
- libbrotlicommon=1.0.9=hcfcfb64_9
- libbrotlidec=1.0.9=hcfcfb64_9
- libbrotlienc=1.0.9=hcfcfb64_9
- libcblas=3.9.0=18_win64_mkl
- libclang=15.0.7=default_h77d9078_3
- libclang13=15.0.7=default_h77d9078_3
- libcrc32c=1.1.2=h0e60522_0
- libcublas=11.11.3.6=0
- libcublas-dev=11.11.3.6=0
- libcufft=10.9.0.58=0
- libcufft-dev=10.9.0.58=0
- libcurand=10.3.3.141=0
- libcurand-dev=10.3.3.141=0
- libcurl=8.1.2=h68f0423_0
- libcusolver=11.4.1.48=0
- libcusolver-dev=11.4.1.48=0
- libcusparse=11.7.5.86=0
- libcusparse-dev=11.7.5.86=0
- libdeflate=1.14=hcfcfb64_0
- libevent=2.1.12=h3671451_1
- libffi=3.4.2=h8ffe710_5
- libglib=2.78.0=he8f3873_0
- libgoogle-cloud=2.12.0=h00b2bdc_1
- libgrpc=1.54.3=ha177ca7_0
- libhwloc=2.9.3=default_haede6df_1009
- libiconv=1.17=h8ffe710_0
- liblapack=3.9.0=18_win64_mkl
- liblapacke=3.9.0=18_win64_mkl
- libnpp=11.8.0.86=0
- libnpp-dev=11.8.0.86=0
- libnvjpeg=11.9.0.86=0
- libnvjpeg-dev=11.9.0.86=0
- libogg=1.3.4=h8ffe710_1
- libopencv=4.5.3=py39h488c12c_8
- libpng=1.6.39=h19919ed_0
- libprotobuf=3.21.12=h12be248_2
- libsodium=1.0.18=h62dcd97_0
- libsqlite=3.43.0=hcfcfb64_0
- libssh2=1.11.0=h7dfc565_0
- libthrift=0.18.1=h06f6336_2
- libtiff=4.4.0=hc4f729c_5
- libutf8proc=2.8.0=h82a8f57_0
- libuv=1.44.2=hcfcfb64_1
- libvorbis=1.3.7=h0e60522_0
- libwebp-base=1.3.2=hcfcfb64_0
- libxcb=1.13=hcd874cb_1004
- libxml2=2.11.5=hc3477c8_1
- libzlib=1.2.13=hcfcfb64_5
- lz4-c=1.9.4=hcfcfb64_0
- m2w64-gcc-libgfortran=5.3.0=6
- m2w64-gcc-libs=5.3.0=7
- m2w64-gcc-libs-core=5.3.0=7
- m2w64-gmp=6.1.0=2
- m2w64-libwinpthread-git=5.0.0.4634.697f757=2
- markupsafe=2.1.3=py39ha55989b_1
- matplotlib-base=3.8.0=py39hf19769e_1
- matplotlib-inline=0.1.6=py39haa95532_0
- mkl=2022.1.0=h6a75c08_874
- mkl-devel=2022.1.0=h57928b3_875
- mkl-include=2022.1.0=h6a75c08_874
- mpmath=1.3.0=pyhd8ed1ab_0
- msys2-conda-epoch=20160418=1
- multidict=6.0.4=py39ha55989b_0
- multiprocess=0.70.15=py39ha55989b_1
- munkres=1.1.4=pyh9f0ad1d_0
- nest-asyncio=1.5.6=py39haa95532_0
- networkx=3.1=pyhd8ed1ab_0
- numpy=1.26.0=py39hddb5d58_0
- opencv=4.5.3=py39hcbf5309_8
- openjpeg=2.5.0=hc9384bd_1
- openssl=3.1.3=hcfcfb64_0
- orc=1.9.0=hada7b9e_1
- packaging=23.1=pyhd8ed1ab_0
- pandas=2.1.1=py39h32e6231_0
- parso=0.8.3=pyhd3eb1b0_0
- pcre2=10.40=h17e33f8_0
- pickleshare=0.7.5=pyhd3eb1b0_1003
- pillow=9.2.0=py39h595c93f_3
- pip=23.2.1=pyhd8ed1ab_0
- platformdirs=3.10.0=pyhd8ed1ab_0
- prompt-toolkit=3.0.36=py39haa95532_0
- psutil=5.9.0=py39h2bbff1b_0
- pthread-stubs=0.4=hcd874cb_1001
- pthreads-win32=2.9.1=hfa6e2cd_3
- pure_eval=0.2.2=pyhd3eb1b0_0
- py-opencv=4.5.3=py39h00e5391_8
- pyarrow=12.0.1=py39hca4e8af_5_cpu
- pycocotools=2.0.6=py39hc266a54_1
- pygments=2.15.1=py39haa95532_1
- pyparsing=3.1.1=pyhd8ed1ab_0
- pysocks=1.7.1=pyh0701188_6
- python=3.9.18=h4de0772_0_cpython
- python-dateutil=2.8.2=pyhd8ed1ab_0
- python-tzdata=2023.3=pyhd8ed1ab_0
- python-xxhash=3.3.0=py39ha55989b_1
- python_abi=3.9=4_cp39
- pytorch=2.0.1=py3.9_cuda11.8_cudnn8_0
- pytorch-cuda=11.8=h24eeafa_5
- pytorch-mutex=1.0=cuda
- pytz=2023.3.post1=pyhd8ed1ab_0
- pywin32=305=py39h2bbff1b_0
- pyyaml=6.0.1=py39ha55989b_1
- pyzmq=25.1.0=py39hd77b12b_0
- qt-main=5.15.8=h720456b_6
- re2=2023.03.02=hd4eee63_0
- regex=2023.8.8=py39ha55989b_1
- requests=2.31.0=pyhd8ed1ab_0
- sacremoses=0.0.53=pyhd8ed1ab_0
- safetensors=0.3.3=py39hf21820d_1
- setuptools=68.2.2=pyhd8ed1ab_0
- six=1.16.0=pyh6c4a22f_0
- snappy=1.1.10=hfb803bf_0
- stack_data=0.2.0=pyhd3eb1b0_0
- sympy=1.12=pyh04b8f61_3
- tbb=2021.10.0=h91493d7_1
- timm=0.9.7=pyhd8ed1ab_0
- tk=8.6.13=hcfcfb64_0
- tokenizers=0.13.3=py39hca44cb7_0
- tomli=2.0.1=pyhd8ed1ab_0
- tornado=6.3.2=py39h2bbff1b_0
- tqdm=4.66.1=pyhd8ed1ab_0
- traitlets=5.7.1=py39haa95532_0
- transformers=4.33.2=pyhd8ed1ab_0
- typing-extensions=4.8.0=hd8ed1ab_0
- typing_extensions=4.8.0=pyha770c72_0
- tzdata=2023c=h71feb2d_0
- ucrt=10.0.22621.0=h57928b3_0
- unicodedata2=15.0.0=py39ha55989b_1
- urllib3=2.0.5=pyhd8ed1ab_0
- vc=14.3=h64f974e_17
- vc14_runtime=14.36.32532=hdcecf7f_17
- vs2015_runtime=14.36.32532=h05e6639_17
- wcwidth=0.2.5=pyhd3eb1b0_0
- wheel=0.41.2=pyhd8ed1ab_0
- win_inet_pton=1.1.0=pyhd8ed1ab_6
- xorg-libxau=1.0.11=hcd874cb_0
- xorg-libxdmcp=1.1.3=hcd874cb_0
- xxhash=0.8.2=hcfcfb64_0
- xz=5.2.6=h8d14728_0
- yaml=0.2.5=h8ffe710_2
- yapf=0.40.1=pyhd8ed1ab_0
- yarl=1.9.2=py39ha55989b_0
- zeromq=4.3.4=hd77b12b_0
- zipp=3.17.0=pyhd8ed1ab_0
- zlib=1.2.13=hcfcfb64_5
- zstd=1.5.5=h12be248_0
- pip:
- opencv-python==4.8.0.76
- supervision==0.6.0
- torchaudio==2.0.2
- torchvision==0.15.2
prefix: C:\Users\Makoto\miniconda3\envs\dino
import torch
import onnx
from onnxsim import simplify
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
config_file = './groundingdino/config/GroundingDINO_SwinB_cfg.py'
checkpoint_path = './weights/groundingdino_swinb_cogcoor.pth'
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
# modified config
args.use_checkpoint = False
args.use_transformer_ckpt = False
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
_ = model.eval()
return model
# Load the model
model = load_model(config_file, checkpoint_path, cpu_only=True)
# The prompt used at inference time, plus the matching masks
caption = "car ."
input_ids = model.tokenizer([caption], return_tensors="pt")["input_ids"]
# The tensors below are hand-built for the 4 tokens of "car ." ([CLS] car . [SEP])
position_ids = torch.tensor([[0, 0, 1, 0]])
token_type_ids = torch.tensor([[0, 0, 0, 0]])
attention_mask = torch.tensor([[True, True, True, True]])
text_token_mask = torch.tensor([[[True, False, False, False],
                                 [False, True, True, False],
                                 [False, True, True, False],
                                 [False, False, False, True]]])
# Fixed input resolution
img = torch.randn(1, 3, 800, 1200)
# Export the raw ONNX model
onnx_output_path = "weights/ground.onnx"
simplified_onnx_path = "weights/ground_simplified1.onnx"
torch.onnx.export(
model,
f=onnx_output_path,
args=(img, input_ids, attention_mask, position_ids, token_type_ids, text_token_mask),
input_names=["img", "input_ids", "attention_mask", "position_ids", "token_type_ids", "text_token_mask"],
output_names=["logits", "boxes"],
    dynamic_axes=None,  # static-shape export
opset_version=17,
    verbose=False  # verbose logging off; set to True for debugging
    # do_constant_folding=True  # constant folding, improves later simplification
)
print(f"ONNX模型已成功导出到: {onnx_output_path}")
# # Simplify the model with onnxsim
# print(f"Simplifying ONNX model: {onnx_output_path}")
# try:
#     # Load the raw ONNX model
#     onnx_model = onnx.load(onnx_output_path)
#     # Simplify (BN fusion and constant folding are skipped here)
#     simplified_model, check = simplify(
#         onnx_model,
#         skip_fuse_bn=True,
#         skip_constant_folding=True,
#         dynamic_input_shape=False,
#         input_shapes={  # pin the input shapes so simplification stays exact
#             "img": (1, 3, 800, 1200),
#             "input_ids": tuple(input_ids.shape),
#             "attention_mask": tuple(attention_mask.shape),
#             "position_ids": tuple(position_ids.shape),
#             "token_type_ids": tuple(token_type_ids.shape),
#             "text_token_mask": tuple(text_token_mask.shape)
#         }
#     )
#     # Validate the simplified model
#     assert check, "simplified ONNX model failed validation!"
#     # Save the simplified model
#     onnx.save(simplified_model, simplified_onnx_path)
#     print(f"ONNX simplification done, saved to: {simplified_onnx_path}")
# except Exception as e:
#     print(f"ONNX simplification failed: {e}")
#     print("Falling back to the raw, unsimplified ONNX model")
import torch
import onnx
from onnxsim import simplify
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
config_file = './groundingdino/config/GroundingDINO_SwinB_cfg.py'
checkpoint_path = './weights/groundingdino_swinb_cogcoor.pth'
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
# modified config
args.use_checkpoint = False
args.use_transformer_ckpt = False
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
_ = model.eval()
return model
# Load the model
model = load_model(config_file, checkpoint_path, cpu_only=True)
# ===================== Key change: batched export (see BATCH_SIZE) =====================
BATCH_SIZE = 8
# The prompt used at inference time, plus the matching masks
caption = "car ."
# 1. Expand the text input to BATCH_SIZE
# Repeat the caption BATCH_SIZE times to build the batched text input
input_ids = model.tokenizer([caption]*BATCH_SIZE, return_tensors="pt", padding="longest")["input_ids"]
seq_len = input_ids.shape[1]  # sequence length (adapts to different captions)
# 2. Expand position_ids to BATCH_SIZE
position_ids = torch.tensor([[0, 0, 1, 0]]).repeat(BATCH_SIZE, 1)
# Make position_ids match seq_len (truncate / zero-pad)
if position_ids.shape[1] < seq_len:
pad_len = seq_len - position_ids.shape[1]
position_ids = torch.cat([position_ids, torch.zeros(BATCH_SIZE, pad_len, dtype=torch.long)], dim=1)
else:
position_ids = position_ids[:, :seq_len]
# 3. Expand token_type_ids to BATCH_SIZE
token_type_ids = torch.tensor([[0, 0, 0, 0]]).repeat(BATCH_SIZE, 1)
if token_type_ids.shape[1] < seq_len:
pad_len = seq_len - token_type_ids.shape[1]
token_type_ids = torch.cat([token_type_ids, torch.zeros(BATCH_SIZE, pad_len, dtype=torch.long)], dim=1)
else:
token_type_ids = token_type_ids[:, :seq_len]
# 4. Expand attention_mask to BATCH_SIZE
attention_mask = torch.tensor([[True, True, True, True]]).repeat(BATCH_SIZE, 1)
if attention_mask.shape[1] < seq_len:
    pad_len = seq_len - attention_mask.shape[1]
    # pad positions must not be attended to, so pad with False (zeros)
    attention_mask = torch.cat([attention_mask, torch.zeros(BATCH_SIZE, pad_len, dtype=torch.bool)], dim=1)
else:
attention_mask = attention_mask[:, :seq_len]
# 5. Expand text_token_mask to BATCH_SIZE
text_token_mask = torch.tensor([[[True, False, False, False],
[False, True, True, False],
[False, True, True, False],
[False, False, False, True]]]).repeat(BATCH_SIZE, 1, 1)
# Resize the mask to match seq_len
if text_token_mask.shape[1] < seq_len:
pad_len = seq_len - text_token_mask.shape[1]
    # Pad both the rows and the columns of the mask
pad_row = torch.zeros(BATCH_SIZE, pad_len, text_token_mask.shape[2], dtype=torch.bool)
text_token_mask = torch.cat([text_token_mask, pad_row], dim=1)
pad_col = torch.zeros(BATCH_SIZE, seq_len, pad_len, dtype=torch.bool)
text_token_mask = torch.cat([text_token_mask, pad_col], dim=2)
else:
text_token_mask = text_token_mask[:, :seq_len, :seq_len]
# 6. Expand the image input to BATCH_SIZE: (1, 3, 800, 1200) -> (BATCH_SIZE, 3, 800, 1200)
img = torch.randn(BATCH_SIZE, 3, 800, 1200)
# Print the input shapes to verify the batch size
print("=" * 50)
print(f"Input shape check (batch_size={BATCH_SIZE}):")
print(f"img: {img.shape}")
print(f"input_ids: {input_ids.shape}")
print(f"attention_mask: {attention_mask.shape}")
print(f"position_ids: {position_ids.shape}")
print(f"token_type_ids: {token_type_ids.shape}")
print(f"text_token_mask: {text_token_mask.shape}")
print("=" * 50)
# The ONNX model can support dynamic inputs; comment this out when converting to a TensorRT engine
dynamic_axes = {
"input_ids": {0: "batch_size", 1: "seq_len"},
"attention_mask": {0: "batch_size", 1: "seq_len"},
"position_ids": {0: "batch_size", 1: "seq_len"},
"token_type_ids": {0: "batch_size", 1: "seq_len"},
"text_token_mask": {0: "batch_size", 1: "seq_len", 2: "seq_len"},
"img": {0: "batch_size", 2: "height", 3: "width"},
"logits": {0: "batch_size"},
"boxes": {0: "batch_size"}
}
# Export the raw ONNX model
onnx_output_path = "weights/ground_bs8.onnx"
simplified_onnx_path = "weights/ground_simplified_bs8.onnx"
torch.onnx.export(
model,
f=onnx_output_path,
args=(img, input_ids, attention_mask, position_ids, token_type_ids, text_token_mask),
input_names=["img", "input_ids", "attention_mask", "position_ids", "token_type_ids", "text_token_mask"],
output_names=["logits", "boxes"],
    # dynamic_axes=dynamic_axes,  # recommended to comment out when converting to a TensorRT engine
opset_version=17,
verbose=False,
    do_constant_folding=True  # constant folding, improves later simplification
)
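# Post-export smoke test (a sketch: assumes onnxruntime is installed; the feed
# reuses the batched tensors built above and runs on CPU for portability):
import onnxruntime as ort
sess = ort.InferenceSession(onnx_output_path, providers=["CPUExecutionProvider"])
outs = sess.run(None, {
    "img": img.numpy(),
    "input_ids": input_ids.numpy(),
    "attention_mask": attention_mask.numpy(),
    "position_ids": position_ids.numpy(),
    "token_type_ids": token_type_ids.numpy(),
    "text_token_mask": text_token_mask.numpy(),
})
print("logits:", outs[0].shape, "boxes:", outs[1].shape)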
# # Simplify the model with onnxsim
# print(f"\nSimplifying ONNX model: {onnx_output_path}")
# try:
#     # Load the raw ONNX model
#     onnx_model = onnx.load(onnx_output_path)
#     # Simplify the model
#     simplified_model, check = simplify(
#         onnx_model,
#         dynamic_input_shape=False,  # batch size and resolution are fixed, so False
#         input_shapes={  # pin the batched input shapes
#             "img": (BATCH_SIZE, 3, 800, 1200),
#             "input_ids": tuple(input_ids.shape),
#             "attention_mask": tuple(attention_mask.shape),
#             "position_ids": tuple(position_ids.shape),
#             "token_type_ids": tuple(token_type_ids.shape),
#             "text_token_mask": tuple(text_token_mask.shape)
#         }
#     )
#     # Validate the simplified model
#     assert check, "simplified ONNX model failed validation!"
#     # Save the simplified model
#     onnx.save(simplified_model, simplified_onnx_path)
#     print(f"ONNX simplification done, saved to: {simplified_onnx_path}")
# except Exception as e:
#     print(f"ONNX simplification failed: {e}")
#     print("Falling back to the raw, unsimplified ONNX model")
import torch
import onnx
from onnxsim import simplify
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
config_file = './groundingdino/config/GroundingDINO_SwinB_cfg.py'
checkpoint_path = './weights/groundingdino_swinb_cogcoor.pth'
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
# modified config
args.use_checkpoint = False
args.use_transformer_ckpt = False
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
_ = model.eval()
return model
# Load the model
model = load_model(config_file, checkpoint_path, cpu_only=True)
# The prompt used at inference time, plus the matching masks
caption = "car ."
input_ids = model.tokenizer([caption], return_tensors="pt")["input_ids"]
position_ids = torch.tensor([[0, 0, 1, 0]])
token_type_ids = torch.tensor([[0, 0, 0, 0]])
attention_mask = torch.tensor([[True, True, True, True]])
text_token_mask = torch.tensor([[[True, False, False, False],
[False, True, True, False],
[False, True, True, False],
[False, False, False, True]]])
# Fixed input resolution
img = torch.randn(1, 3, 800, 1200)
# The ONNX model can support dynamic inputs; drop the dynamic_axes argument when converting to a TensorRT engine
dynamic_axes = {
"input_ids": {0: "batch_size", 1: "seq_len"},
"attention_mask": {0: "batch_size", 1: "seq_len"},
"position_ids": {0: "batch_size", 1: "seq_len"},
"token_type_ids": {0: "batch_size", 1: "seq_len"},
"text_token_mask": {0: "batch_size", 1: "seq_len", 2: "seq_len"},
"img": {0: "batch_size", 2: "height", 3: "width"},
"logits": {0: "batch_size"},
"boxes": {0: "batch_size"}
}
# Export the raw ONNX model
onnx_output_path = "weights/ground.onnx"
torch.onnx.export(
    model,
    f=onnx_output_path,
    args=(img, input_ids, attention_mask, position_ids, token_type_ids, text_token_mask),
    input_names=["img", "input_ids", "attention_mask", "position_ids", "token_type_ids", "text_token_mask"],
    output_names=["logits", "boxes"],
    dynamic_axes=dynamic_axes,  # comment out for a static-shape export (e.g. before engine conversion)
    opset_version=17,
    verbose=False,  # verbose logging off; set to True for debugging
    do_constant_folding=True  # constant folding, improves later simplification
)
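# Quick check that the dynamic axes actually landed in the exported graph
# (a sketch; dim_param is non-empty only for dynamic dimensions):
exported = onnx.load(onnx_output_path)
for inp in exported.graph.input:
    dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)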
import onnx
from onnx import helper, TensorProto, numpy_helper
import numpy as np
def convert_fp16_manual(input_path, output_path, keep_io_types=True):
model = onnx.load(input_path)
graph = model.graph
fp32 = TensorProto.FLOAT
fp16 = TensorProto.FLOAT16
    # ========== 1. Collect every name -> type mapping ==========
type_map = {}
for init in graph.initializer:
type_map[init.name] = init.data_type
for inp in graph.input:
type_map[inp.name] = inp.type.tensor_type.elem_type
for out in graph.output:
type_map[out.name] = out.type.tensor_type.elem_type
    # ========== 2. Initializers: FP32 -> FP16 ==========
for i, init in enumerate(graph.initializer):
if init.data_type == fp32:
arr = numpy_helper.to_array(init)
            # Clamp inf / -inf / out-of-range values to the FP16 limit
arr = np.clip(arr, -65504, 65504)
arr = arr.astype(np.float16)
new_init = numpy_helper.from_array(arr, init.name)
graph.initializer[i].CopyFrom(new_init)
type_map[init.name] = fp16
    # ========== 3. Constant nodes: FP32 -> FP16 ==========
for node in graph.node:
if node.op_type != "Constant":
continue
for attr in node.attribute:
if attr.t.data_type == fp32:
arr = numpy_helper.to_array(attr.t)
arr = np.clip(arr, -65504, 65504).astype(np.float16)
attr.t.CopyFrom(numpy_helper.from_array(arr))
type_map[node.output[0]] = fp16
    # ========== 4. Walk the nodes and insert Casts ==========
new_nodes = []
cast_id = [0]
    # Ops that must keep FP32 semantics (their outputs are not converted)
fp32_ops = {"Shape", "NonMaxSuppression", "Range",
"TopK", "SequenceConstruct", "SequenceEmpty"}
for node in graph.node:
if node.op_type == "Constant":
new_nodes.append(node)
continue
        # These ops emit integers or indices; skip them
if node.op_type in fp32_ops:
new_nodes.append(node)
for o in node.output:
                type_map[o] = fp32  # mark as FP32 (actually int64 and the like)
continue
        # ---- Pick the target type from the first input with a known float type ----
target = None
for inp_name in node.input:
if inp_name and inp_name in type_map:
t = type_map[inp_name]
if t in (fp32, fp16):
target = t
break
        # Default target type is FP16
if target is None:
target = fp16
        # ---- Type-check every input ----
for idx, inp_name in enumerate(node.input):
if not inp_name or inp_name not in type_map:
continue
inp_type = type_map[inp_name]
            # Input is FP32 but the target is FP16 -> insert a Cast to FP16
if inp_type == fp32 and target == fp16:
cast_out = f"_cast_{cast_id[0]}"
cast_id[0] += 1
cast_node = helper.make_node(
"Cast", inputs=[inp_name], outputs=[cast_out], to=fp16
)
new_nodes.append(cast_node)
node.input[idx] = cast_out
type_map[cast_out] = fp16
            # Input is FP16 but the target is FP32 -> insert a Cast to FP32
elif inp_type == fp16 and target == fp32:
cast_out = f"_cast_{cast_id[0]}"
cast_id[0] += 1
cast_node = helper.make_node(
"Cast", inputs=[inp_name], outputs=[cast_out], to=fp32
)
new_nodes.append(cast_node)
node.input[idx] = cast_out
type_map[cast_out] = fp32
new_nodes.append(node)
        # ---- Update the output types ----
for o in node.output:
type_map[o] = target
    # ========== 5. Swap in the rewritten node list ==========
del graph.node[:]
graph.node.extend(new_nodes)
    # ========== 6. Fix the graph output type declarations ==========
    if keep_io_types:
        # Keep the original IO dtypes as FP32,
        # so each FP16 output needs a Cast back to FP32
for out in graph.output:
if out.name in type_map and type_map[out.name] == fp16:
cast_out = f"_cast_out_{out.name}"
cast_node = helper.make_node(
"Cast", inputs=[cast_out], outputs=[out.name], to=fp32
)
                # Rename the original output: find the node that produces it
                # and redirect that node's output to the new name
for node in graph.node:
for i, o in enumerate(node.output):
if o == out.name:
node.output[i] = cast_out
break
graph.node.append(cast_node)
type_map[out.name] = fp32
else:
        # Emit the outputs as FP16 as well
for out in graph.output:
if out.name in type_map:
out.type.tensor_type.elem_type = type_map[out.name]
    # ========== 7. Validate and save ==========
onnx.checker.check_model(model)
onnx.save(model, output_path)
print(f"✅ 转换完成 -> {output_path}")
print(f" 节点数: {len(graph.node)}")
print(f" Cast 插入数: {cast_id[0]}")
# ========== Run ==========
convert_fp16_manual(
"weights/ground.onnx",
"weights/ground_fp16.onnx",
keep_io_types=True,
)
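# Parity check between the FP32 and FP16 graphs (a sketch: random inputs only
# bound numerical drift, not task accuracy; the token ids are illustrative,
# in practice reuse model.tokenizer as in the export scripts above):
import onnxruntime as ort
feed = {
    "img": np.random.randn(1, 3, 800, 1200).astype(np.float32),
    "input_ids": np.array([[101, 2482, 1012, 102]], dtype=np.int64),  # assumed ids for "[CLS] car . [SEP]"
    "attention_mask": np.ones((1, 4), dtype=bool),
    "position_ids": np.array([[0, 0, 1, 0]], dtype=np.int64),
    "token_type_ids": np.zeros((1, 4), dtype=np.int64),
    "text_token_mask": np.ones((1, 4, 4), dtype=bool),
}
sess32 = ort.InferenceSession("weights/ground.onnx", providers=["CPUExecutionProvider"])
sess16 = ort.InferenceSession("weights/ground_fp16.onnx", providers=["CPUExecutionProvider"])
for ref, out in zip(sess32.run(None, feed), sess16.run(None, feed)):
    print("max abs diff:", np.abs(ref - out.astype(np.float32)).max())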
import onnx
from onnxconverter_common import float16
# 1. Load the model
model = onnx.load("weights/ground.onnx")
# 2. Convert to FP16
model_fp16 = float16.convert_float_to_float16(
    model,
    keep_io_types=True,
    # op_block_list=["Cast"]
)
# 3. Validate the model
onnx.checker.check_model(model_fp16)
# 4. Save
onnx.save(model_fp16, "weights/ground_fp16.onnx")
print("FP16 model saved!")
import onnx
from onnxruntime.transformers.float16 import convert_float_to_float16
# ===== 1. Paths =====
input_model = "weights/ground.onnx"
output_model = "weights/ground_fp16.onnx"
# ===== 2. Load =====
model = onnx.load(input_model)
# ===== 3. Convert =====
model_fp16 = convert_float_to_float16(
    model,
    keep_io_types=True,  # strongly recommended
)
# ===== 4. Save =====
onnx.save(model_fp16, output_model)
print("ONNX Runtime FP16 conversion done")
from onnxruntime.quantization import quantize_dynamic, QuantType
quantize_dynamic(
model_input="weights/ground.onnx",
model_output="weights/ground_int8.onnx",
weight_type=QuantType.QInt8,
)
print("int8 quantization done!")
import onnx
from onnx import TensorProto
# Load the FP16 model that failed the type check
model = onnx.load("weights/ground_fp16.onnx")
# Targeted fix: force the offending intermediate tensor's declared type to FP16
target_arg = "/backbone/backbone.0/Cast_output_0"
# Walk every tensor type declaration in the graph and patch the conflicting entry
for vi in model.graph.value_info:
    if vi.name == target_arg:
        vi.type.tensor_type.elem_type = TensorProto.FLOAT16
        print(f"Patched: {target_arg} -> FLOAT16")
# Re-validate and save the patched model
onnx.checker.check_model(model)
onnx.save(model, "weights/ground_fp16_fixed.onnx")
print("\nModel repair complete! Load: ground_fp16_fixed.onnx")