Commit 34e4011b authored by zk's avatar zk
Browse files

首次提交

parents
Pipeline #3503 failed with stages
in 0 seconds
This diff is collapsed.
This diff is collapsed.
import argparse
import os
import sys
import time
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
# ====================== 核心配置 - 只需要改这里就能调整batch size ======================
INFERENCE_BATCH_SIZE = 8 # 推理批次大小,修改这个值即可改变batch size
# ====================================================================================
def plot_boxes_to_image(image_pil, tgt):
    """Draw boxes and their labels onto a PIL image.

    `tgt["boxes"]` holds normalized (cx, cy, w, h) boxes; `tgt["size"]` is
    (H, W). Returns the annotated image and a single-channel mask with the
    box regions filled.
    """
    H, W = tgt["size"]
    boxes = tgt["boxes"]
    labels = tgt["labels"]
    assert len(boxes) == len(labels), "boxes and labels must have same length"

    draw = ImageDraw.Draw(image_pil)
    mask = Image.new("L", image_pil.size, 0)
    mask_draw = ImageDraw.Draw(mask)

    scale = torch.Tensor([W, H, W, H])
    for box, label in zip(boxes, labels):
        # scale from normalized coords, then convert cxcywh -> xyxy
        box = box * scale
        box[:2] -= box[2:] / 2
        box[2:] += box[:2]
        # one random color per box
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        x0, y0, x1, y1 = (int(coord) for coord in box)
        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)

        font = ImageFont.load_default()
        if hasattr(font, "getbbox"):
            # modern Pillow: measure via textbbox
            bbox = draw.textbbox((x0, y0), str(label), font)
        else:
            # legacy Pillow fallback
            w, h = draw.textsize(str(label), font)
            bbox = (x0, y0, w + x0, y0 + h)
        draw.rectangle(bbox, fill=color)
        draw.text((x0, y0), str(label), fill="white")

        mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
    return image_pil, mask
def load_image(image_path):
    """Load an image from disk.

    Returns (PIL RGB image, normalized 3xHxW tensor ready for the model).
    """
    image_pil = Image.open(image_path).convert("RGB")
    preprocess = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    tensor, _ = preprocess(image_pil, None)  # 3, h, w
    return image_pil, tensor
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    """Build a GroundingDINO model from a config file and load checkpoint weights.

    The checkpoint is mapped to CPU first; the model is returned in eval mode.
    """
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = "cpu" if cpu_only else "cuda"
    model = build_model(cfg)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    print(load_res)  # surface missing/unexpected keys for the operator
    model.eval()
    return model
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
    """Run a replicated batch of grounded detection and time the forward pass.

    The single input image/caption is tiled INFERENCE_BATCH_SIZE times so the
    model sees a full batch; only sample 0 of the output is post-processed
    (all batch entries are identical copies).

    Returns:
        (boxes_filt, pred_phrases, infer_time, avg_single_infer_time):
        kept boxes, their phrase labels, whole-batch latency in seconds,
        and infer_time / INFERENCE_BATCH_SIZE.
    """
    assert text_threshold is not None or token_spans is not None, "text_threshould and token_spans should not be None at the same time!"
    caption = caption.lower()
    caption = caption.strip()
    if not caption.endswith("."):
        caption = caption + "."
    device = "cuda" if not cpu_only else "cpu"
    model = model.to(device)
    # ========== Step 1: build the batch ==========
    # Replicate the image to shape (batch_size, 3, H, W).
    image_batch = image.unsqueeze(0).repeat(INFERENCE_BATCH_SIZE, 1, 1, 1).to(device)
    # Replicate the text prompt to match (batch_size,).
    caption_batch = [caption] * INFERENCE_BATCH_SIZE
    # Core inference timing.
    torch.cuda.synchronize() if device == "cuda" else None  # drain pending GPU work so the timer is accurate
    start_time = time.perf_counter()  # high-resolution timer
    with torch.no_grad():
        outputs = model(image_batch, captions=caption_batch)  # forward the whole batch
    torch.cuda.synchronize() if device == "cuda" else None  # wait for the GPU to finish inference
    infer_time = time.perf_counter() - start_time  # batch latency (seconds)
    # Average per-image latency.
    avg_single_infer_time = infer_time / INFERENCE_BATCH_SIZE
    # ========== Step 2: post-process the batch output ==========
    # Keep only the first sample (all entries are identical).
    logits = outputs["pred_logits"].sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"][0]  # (nq, 4)
    # filter output
    if token_spans is None:
        logits_filt = logits.cpu().clone()
        boxes_filt = boxes.cpu().clone()
        # keep queries whose best token score clears the box threshold
        filt_mask = logits_filt.max(dim=1)[0] > box_threshold
        logits_filt = logits_filt[filt_mask]  # num_filt, 256
        boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
        # get phrase
        tokenlizer = model.tokenizer
        tokenized = tokenlizer(caption)
        # build pred
        pred_phrases = []
        for logit, box in zip(logits_filt, boxes_filt):
            pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
            if with_logits:
                # append the best score, truncated to ~2 decimals
                pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
            else:
                pred_phrases.append(pred_phrase)
    else:
        # given-phrase mode: score user-provided character spans directly
        positive_maps = create_positive_map_from_span(
            model.tokenizer(caption),
            token_span=token_spans
        ).to(image.device)  # n_phrase, 256
        logits_for_phrases = positive_maps @ logits.T  # n_phrase, nq
        all_logits = []
        all_phrases = []
        all_boxes = []
        for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
            # get phrase
            phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
            # get mask
            filt_mask = logit_phr > box_threshold
            # filt box
            all_boxes.append(boxes[filt_mask])
            # filt logits
            all_logits.append(logit_phr[filt_mask])
            if with_logits:
                logit_phr_num = logit_phr[filt_mask]
                all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
            else:
                # NOTE(review): len(filt_mask) counts ALL queries, not only the
                # kept ones, so phrase count can exceed box count here —
                # looks like it should be filt_mask.sum(); confirm intent.
                all_phrases.extend([phrase for _ in range(len(filt_mask))])
        boxes_filt = torch.cat(all_boxes, dim=0).cpu()
        pred_phrases = all_phrases
    # Also return total batch time, for throughput computation downstream.
    return boxes_filt, pred_phrases, infer_time, avg_single_infer_time
def benchmark_performance(model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, warmup_runs=5, test_runs=10):
    """Performance benchmark: warmup then timed runs of batched inference.

    Computes average batch latency, average per-image latency, and total
    throughput over `test_runs` timed iterations.

    Args:
        warmup_runs: warmup iterations (excluded from timing).
        test_runs: timed iterations.

    Returns:
        (avg_batch_time, avg_single_time, batch_throughput, batch_times,
        single_times): averages in seconds, throughput in images/second,
        plus the raw per-run latency lists.
    """
    # 1. Warmup phase (timings discarded).
    print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
    for i in range(warmup_runs):
        # BUG FIX: pass cpu_only/token_spans by keyword. The original
        # positional call bound cpu_only -> with_logits and
        # token_spans -> cpu_only, silently dropping the spans and
        # breaking device selection.
        get_grounding_output(
            model, image, text_prompt, box_threshold, text_threshold,
            cpu_only=cpu_only, token_spans=token_spans,
        )
        print(f"预热完成 {i+1}/{warmup_runs}")
    # 2. Timed phase.
    print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
    total_batch_time = 0.0   # cumulative batch latency
    total_single_time = 0.0  # cumulative per-image latency
    batch_times = []         # per-run batch latency
    single_times = []        # per-run average per-image latency
    for i in range(test_runs):
        _, _, batch_infer_time, avg_single_infer_time = get_grounding_output(
            model, image, text_prompt, box_threshold, text_threshold,
            cpu_only=cpu_only, token_spans=token_spans,
        )
        batch_times.append(batch_infer_time)
        single_times.append(avg_single_infer_time)
        total_batch_time += batch_infer_time
        total_single_time += avg_single_infer_time
        print(f"测试 {i+1}/{test_runs} - Batch推理时延: {batch_infer_time*1000:.2f} ms | 单张平均时延: {avg_single_infer_time*1000:.2f} ms")
    # 3. Metrics.
    avg_batch_time = total_batch_time / test_runs
    avg_single_time = total_single_time / test_runs
    batch_throughput = (test_runs * INFERENCE_BATCH_SIZE) / total_batch_time  # images/sec
    batch_std_time = np.std(batch_times)
    single_std_time = np.std(single_times)
    # 4. Report.
    print("\n" + "="*60)
    print("📊 性能测试报告 (Batch Size = {})".format(INFERENCE_BATCH_SIZE))
    print("="*60)
    print(f"测试环境: {'GPU (CUDA)' if not cpu_only else 'CPU'}")
    print(f"测试次数: {test_runs} 次 (预热 {warmup_runs} 次)")
    print(f"Batch Size: {INFERENCE_BATCH_SIZE}")
    print(f"平均Batch推理时延: {avg_batch_time*1000:.2f} ms (±{batch_std_time*1000:.2f} ms)")
    print(f"平均单张推理时延: {avg_single_time*1000:.2f} ms (±{single_std_time*1000:.2f} ms)")
    print(f"最大Batch时延: {max(batch_times)*1000:.2f} ms | 最大单张时延: {max(single_times)*1000:.2f} ms")
    print(f"最小Batch时延: {min(batch_times)*1000:.2f} ms | 最小单张时延: {min(single_times)*1000:.2f} ms")
    print(f"总吞吐量: {batch_throughput:.2f} 张/秒")
    print("="*60 + "\n")
    return avg_batch_time, avg_single_time, batch_throughput, batch_times, single_times
if __name__ == "__main__":
    # Local import: only needed for safe parsing of --token_spans.
    import ast

    parser = argparse.ArgumentParser("Grounding DINO 性能测试 (Batch推理)", add_help=True)
    parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
    parser.add_argument("--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file")
    parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
    parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
    parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True, help="output directory")
    parser.add_argument("--box_threshold", type=float, default=0.35, help="box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
    parser.add_argument("--token_spans", type=str, default=None, help=
        "The positions of start and end positions of phrases of interest. \
        For example, a caption is 'a cat and a dog', \
        if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
        if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
        ")
    parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
    # benchmarking parameters
    parser.add_argument("--warmup-runs", type=int, default=5, help="预热运行次数,默认5")
    parser.add_argument("--test-runs", type=int, default=10, help="正式测试运行次数,默认10")
    args = parser.parse_args()

    # cfg
    config_file = args.config_file
    checkpoint_path = args.checkpoint_path
    image_path = args.image_path
    text_prompt = args.text_prompt
    output_dir = args.output_dir
    box_threshold = args.box_threshold
    text_threshold = args.text_threshold
    token_spans = args.token_spans
    warmup_runs = args.warmup_runs
    test_runs = args.test_runs

    # Basic run info.
    print(f"📌 使用设备: {'GPU' if not args.cpu_only else 'CPU'}")
    print(f"📌 推理Batch Size: {INFERENCE_BATCH_SIZE}")
    if not args.cpu_only and torch.cuda.is_available():
        print(f"📌 GPU型号: {torch.cuda.get_device_name(0)}")
        print(f"📌 GPU编号: {torch.cuda.current_device()}")

    # make dir
    os.makedirs(output_dir, exist_ok=True)
    # load image / model
    image_pil, image = load_image(image_path)
    model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
    # visualize raw image
    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
    # set the text_threshold to None if token_spans is set.
    if token_spans is not None:
        text_threshold = None
        print("Using token_spans. Set the text_threshold to None.")

    # SECURITY FIX: parse the user-supplied span literal with
    # ast.literal_eval instead of eval(); parse once and reuse.
    parsed_token_spans = ast.literal_eval(token_spans) if token_spans else None

    # Run the benchmark.
    avg_batch_time, avg_single_time, throughput, batch_times, single_times = benchmark_performance(
        model, image, text_prompt, box_threshold, text_threshold,
        args.cpu_only, parsed_token_spans,
        warmup_runs, test_runs
    )

    # One more inference to produce the visualization (original behavior).
    print("\n=== 生成推理结果图片 ===")
    # BUG FIX: cpu_only/token_spans passed by keyword — the original
    # positional call bound them to with_logits/cpu_only instead.
    boxes_filt, pred_phrases, batch_infer_time, single_infer_time = get_grounding_output(
        model, image, text_prompt, box_threshold, text_threshold,
        cpu_only=args.cpu_only, token_spans=parsed_token_spans,
    )
    # visualize pred
    size = image_pil.size
    pred_dict = {
        "boxes": boxes_filt,
        "size": [size[1], size[0]],  # H,W
        "labels": pred_phrases,
    }
    image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
    image_with_box.save(os.path.join(output_dir, "pred.jpg"))

    # Persist the performance report.
    performance_file = os.path.join(output_dir, "performance_report.txt")
    with open(performance_file, "w", encoding="utf-8") as f:
        f.write("="*60 + "\n")
        f.write(f"Grounding DINO 性能测试报告 (Batch Size = {INFERENCE_BATCH_SIZE})\n")
        f.write("="*60 + "\n")
        f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"测试设备: {'GPU' if not args.cpu_only else 'CPU'}\n")
        if not args.cpu_only and torch.cuda.is_available():
            f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
        f.write(f"Batch Size: {INFERENCE_BATCH_SIZE}\n")
        f.write(f"预热次数: {warmup_runs}\n")
        f.write(f"测试次数: {test_runs}\n")
        f.write(f"平均Batch推理时延: {avg_batch_time*1000:.2f} ms\n")
        f.write(f"Batch时延标准差: {np.std(batch_times)*1000:.2f} ms\n")
        f.write(f"平均单张推理时延: {avg_single_time*1000:.2f} ms\n")
        f.write(f"单张时延标准差: {np.std(single_times)*1000:.2f} ms\n")
        f.write(f"最大Batch时延: {max(batch_times)*1000:.2f} ms\n")
        f.write(f"最小Batch时延: {min(batch_times)*1000:.2f} ms\n")
        f.write(f"总吞吐量: {throughput:.2f} 张/秒\n")
        f.write(f"最后一次Batch推理时延: {batch_infer_time*1000:.2f} ms\n")
        f.write(f"最后一次单张推理时延: {single_infer_time*1000:.2f} ms\n")
    print(f"\n✅ 性能报告已保存至: {performance_file}")
    print(f"✅ 推理结果图片已保存至: {os.path.join(output_dir, 'pred.jpg')}")
import argparse
import os
import time
from typing import List, Optional, Tuple
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.util.utils import get_phrases_from_posmap
from groundingdino.util import get_tokenlizer
from groundingdino.util.slconfig import SLConfig
from groundingdino.models.GroundingDINO.bertwarper import (
generate_masks_with_special_tokens_and_transfer_map,
)
def plot_boxes_to_image(image_pil, tgt):
    """Render labeled boxes (normalized cxcywh in tgt) onto the image.

    Returns (annotated image, L-mode mask with box regions filled).
    """
    H, W = tgt["size"]
    boxes = tgt["boxes"]
    labels = tgt["labels"]
    assert len(boxes) == len(labels), "boxes and labels must have same length"
    draw = ImageDraw.Draw(image_pil)
    mask = Image.new("L", image_pil.size, 0)
    mask_draw = ImageDraw.Draw(mask)
    scale = torch.Tensor([W, H, W, H])
    for box, label in zip(boxes, labels):
        # denormalize, then cxcywh -> xyxy
        box = box * scale
        box[:2] -= box[2:] / 2
        box[2:] += box[:2]
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        x0, y0, x1, y1 = (int(c) for c in box)
        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
        font = ImageFont.load_default()
        if hasattr(font, "getbbox"):
            bbox = draw.textbbox((x0, y0), str(label), font)
        else:
            w, h = draw.textsize(str(label), font)
            bbox = (x0, y0, w + x0, y0 + h)
        draw.rectangle(bbox, fill=color)
        draw.text((x0, y0), str(label), fill="white")
        mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
    return image_pil, mask
def load_image(image_path):
    """Load an image; return (PIL RGB image, normalized 3xHxW tensor)."""
    image_pil = Image.open(image_path).convert("RGB")
    pipeline = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    tensor, _ = pipeline(image_pil, None)  # 3,h,w
    return image_pil, tensor
def preprocess_caption(caption: str) -> str:
    """Normalize a caption: lowercase, strip whitespace, ensure trailing '.'."""
    normalized = caption.strip().lower()
    return normalized if normalized.endswith(".") else normalized + "."
def sigmoid(x: np.ndarray) -> np.ndarray:
    """Elementwise logistic function 1 / (1 + e^-x)."""
    denom = 1.0 + np.exp(-x)
    return 1.0 / denom
def build_text_tensors(
    config_file: str,
    caption: str,
    device: str,
):
    """Tokenize the caption and build the text-side tensors the ONNX model needs.

    Follows GroundingDINO's text preprocessing: tokenize the normalized
    caption, derive the special-token attention masks / position ids, then
    truncate everything to the model's max_text_len.

    Returns:
        (cfg, tokenizer, tokenized_single, input_ids, token_type_ids,
         attention_mask, position_ids, text_self_attention_masks) — the int
        tensors cast to int64 as the ONNX graph expects.
    """
    cfg = SLConfig.fromfile(config_file)
    tokenizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
    # Tokens that delimit phrases for the self-attention masking step.
    special_token_ids = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])
    caption = preprocess_caption(caption)
    tokenized = tokenizer([caption], padding="longest", return_tensors="pt")
    tokenized = {k: v.to(device) for k, v in tokenized.items()}
    text_self_attention_masks, position_ids, _ = generate_masks_with_special_tokens_and_transfer_map(
        tokenized, special_token_ids, tokenizer
    )
    # Truncate to the model's maximum text length (defaults to 256).
    max_text_len = getattr(cfg, "max_text_len", 256)
    if text_self_attention_masks.shape[1] > max_text_len:
        s = max_text_len
        text_self_attention_masks = text_self_attention_masks[:, :s, :s]
        position_ids = position_ids[:, :s]
        tokenized["input_ids"] = tokenized["input_ids"][:, :s]
        tokenized["attention_mask"] = tokenized["attention_mask"][:, :s]
        tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :s]
    # Also return the tokenizer and a "single-sentence" tokenization, matching
    # what get_phrases_from_posmap expects downstream.
    tokenized_single = tokenizer(caption)
    return (
        cfg,
        tokenizer,
        tokenized_single,
        tokenized["input_ids"].to(torch.int64),
        tokenized["token_type_ids"].to(torch.int64),
        tokenized["attention_mask"].to(torch.int64),
        position_ids.to(torch.int64),
        text_self_attention_masks,
    )
def ort_create_session(onnx_path: str, device: str, num_threads: int = 0):
    """Create an onnxruntime InferenceSession.

    num_threads > 0 caps both intra- and inter-op thread pools; with
    device == "cuda" the CUDA provider is requested first (CPU fallback).
    """
    import onnxruntime as ort

    options = ort.SessionOptions()
    if num_threads and num_threads > 0:
        options.intra_op_num_threads = int(num_threads)
        options.inter_op_num_threads = int(num_threads)
    if device == "cuda":
        # Takes effect automatically when onnxruntime-gpu is installed.
        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    else:
        providers = ["CPUExecutionProvider"]
    return ort.InferenceSession(onnx_path, sess_options=options, providers=providers)
def onnx_infer_once(
    sess,
    image: torch.Tensor,
    input_ids: torch.Tensor,
    token_type_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
    text_self_attention_masks: torch.Tensor,
    use_cuda_sync: bool,
) -> Tuple[np.ndarray, np.ndarray, float]:
    """Run one ONNX forward pass; return (pred_logits, pred_boxes, seconds)."""

    def _to_numpy(tensor, dtype=None):
        # ORT feeds must be numpy arrays.
        arr = tensor.detach().cpu().numpy()
        return arr if dtype is None else arr.astype(dtype)

    feeds = {
        "image": _to_numpy(image[None], np.float32),
        "input_ids": _to_numpy(input_ids, np.int64),
        "token_type_ids": _to_numpy(token_type_ids, np.int64),
        "attention_mask": _to_numpy(attention_mask, np.int64),
        "position_ids": _to_numpy(position_ids, np.int64),
        "text_self_attention_masks": _to_numpy(text_self_attention_masks),
    }
    if use_cuda_sync:
        torch.cuda.synchronize()
    start = time.perf_counter()
    pred_logits, pred_boxes = sess.run(["pred_logits", "pred_boxes"], feeds)
    if use_cuda_sync:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start
    return pred_logits, pred_boxes, elapsed
def postprocess_and_phrases(
    pred_logits: np.ndarray,  # [B,NQ,S]
    pred_boxes: np.ndarray,   # [B,NQ,4]
    tokenized_single,
    tokenizer,
    box_threshold: float,
    text_threshold: float,
    with_logits: bool = True,
):
    """Filter query predictions by score and decode a phrase per kept query.

    Mirrors the torch pipeline: only batch element 0 is considered.
    Returns (kept boxes as a torch tensor, list of phrase labels).
    """
    probs = sigmoid(pred_logits[0])  # [NQ,S]
    boxes = pred_boxes[0]            # [NQ,4]
    keep = probs.max(axis=1) > box_threshold
    kept_logits = probs[keep]
    kept_boxes = boxes[keep]

    phrases: List[str] = []
    for row in kept_logits:
        posmap = torch.from_numpy(row) > text_threshold
        phrase = get_phrases_from_posmap(posmap, tokenized_single, tokenizer)
        phrase = phrase.replace(".", "")
        if with_logits:
            phrases.append(phrase + f"({str(float(row.max()))[:4]})")
        else:
            phrases.append(phrase)
    return torch.from_numpy(kept_boxes), phrases
def benchmark_performance_onnx(
    sess,
    image: torch.Tensor,
    input_ids: torch.Tensor,
    token_type_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
    text_self_attention_masks: torch.Tensor,
    warmup_runs: int = 5,
    test_runs: int = 10,
    use_cuda_sync: bool = False,
):
    """Warm up then benchmark the ONNX session; print and return latency stats.

    Returns (avg_infer_time_seconds, fps, per_run_latencies).
    """
    run_args = (
        sess, image, input_ids, token_type_ids,
        attention_mask, position_ids, text_self_attention_masks,
    )
    # Warmup: prime caches/allocations, discard timings.
    print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
    for idx in range(warmup_runs):
        onnx_infer_once(*run_args, use_cuda_sync=use_cuda_sync)
        print(f"预热完成 {idx+1}/{warmup_runs}")

    # Timed runs.
    print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
    total_time = 0.0
    infer_times = []
    for idx in range(test_runs):
        _, _, elapsed = onnx_infer_once(*run_args, use_cuda_sync=use_cuda_sync)
        infer_times.append(elapsed)
        total_time += elapsed
        print(f"测试 {idx+1}/{test_runs} - 单次推理时延: {elapsed*1000:.2f} ms")

    avg_infer_time = total_time / test_runs
    fps = test_runs / total_time
    std_infer_time = float(np.std(infer_times))

    # Report.
    print("\n" + "=" * 50)
    print("📊 ONNX 性能测试报告")
    print("=" * 50)
    print(f"测试环境: {'GPU (CUDAExecutionProvider)' if use_cuda_sync else 'CPU/Unknown'}")
    print(f"测试次数: {test_runs} 次 (预热 {warmup_runs} 次)")
    print(f"平均推理时延: {avg_infer_time*1000:.2f} ms (±{std_infer_time*1000:.2f} ms)")
    print(f"最大推理时延: {max(infer_times)*1000:.2f} ms")
    print(f"最小推理时延: {min(infer_times)*1000:.2f} ms")
    print(f"平均FPS: {fps:.2f} 帧/秒")
    print("=" * 50 + "\n")
    return avg_infer_time, fps, infer_times
if __name__ == "__main__":
    parser = argparse.ArgumentParser("Grounding DINO ONNX 推理与性能测试", add_help=True)
    parser.add_argument("--onnx_path", type=str, required=True, help="onnx 模型路径")
    parser.add_argument("--config_file", "-c", type=str, required=True, help="用于加载 tokenizer 等配置")
    parser.add_argument("--image_path", "-i", type=str, required=True)
    parser.add_argument("--text_prompt", "-t", type=str, required=True)
    parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True)
    parser.add_argument("--box_threshold", type=float, default=0.3)
    parser.add_argument("--text_threshold", type=float, default=0.25)
    parser.add_argument("--cpu-only", action="store_true")
    parser.add_argument("--warmup-runs", type=int, default=5)
    parser.add_argument("--test-runs", type=int, default=10)
    parser.add_argument("--ort-threads", type=int, default=0, help="onnxruntime 线程数(0=默认)")
    args = parser.parse_args()
    # Prefer CUDA unless --cpu-only was given or no GPU is visible.
    device = "cpu" if args.cpu_only else ("cuda" if torch.cuda.is_available() else "cpu")
    use_cuda_sync = device == "cuda"
    print(f"📌 ORT 设备偏好: {device}")
    if use_cuda_sync:
        print(f"📌 GPU型号: {torch.cuda.get_device_name(0)}")
    os.makedirs(args.output_dir, exist_ok=True)
    image_pil, image = load_image(args.image_path)
    image_pil.save(os.path.join(args.output_dir, "raw_image.jpg"))
    # Text tensors are built on CPU: the ORT feeds are numpy arrays anyway.
    (
        _cfg,
        tokenizer,
        tokenized_single,
        input_ids,
        token_type_ids,
        attention_mask,
        position_ids,
        text_self_attention_masks,
    ) = build_text_tensors(args.config_file, args.text_prompt, device="cpu")
    # GPU timing is more accurate with explicit synchronization, but the feeds
    # ultimately go through numpy (CPU). This mirrors the torch version:
    # keep timing + visualization; the model forward itself runs through ORT.
    sess = ort_create_session(args.onnx_path, device=device, num_threads=args.ort_threads)
    avg_infer_time, fps, infer_times = benchmark_performance_onnx(
        sess,
        image,
        input_ids,
        token_type_ids,
        attention_mask,
        position_ids,
        text_self_attention_masks,
        warmup_runs=args.warmup_runs,
        test_runs=args.test_runs,
        use_cuda_sync=use_cuda_sync,
    )
    # One extra inference to produce the visualization output.
    print("\n=== 生成推理结果图片 ===")
    pred_logits, pred_boxes, single_infer_time = onnx_infer_once(
        sess,
        image,
        input_ids,
        token_type_ids,
        attention_mask,
        position_ids,
        text_self_attention_masks,
        use_cuda_sync=use_cuda_sync,
    )
    boxes_filt, pred_phrases = postprocess_and_phrases(
        pred_logits=pred_logits,
        pred_boxes=pred_boxes,
        tokenized_single=tokenized_single,
        tokenizer=tokenizer,
        box_threshold=args.box_threshold,
        text_threshold=args.text_threshold,
        with_logits=True,
    )
    size = image_pil.size
    pred_dict = {
        "boxes": boxes_filt,
        "size": [size[1], size[0]],  # H,W
        "labels": pred_phrases,
    }
    image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
    image_with_box.save(os.path.join(args.output_dir, "pred.jpg"))
    # Persist the performance report.
    performance_file = os.path.join(args.output_dir, "performance_report_onnx.txt")
    with open(performance_file, "w", encoding="utf-8") as f:
        f.write("=" * 50 + "\n")
        f.write("Grounding DINO ONNX 性能测试报告\n")
        f.write("=" * 50 + "\n")
        f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"推理后端: onnxruntime\n")
        f.write(f"设备偏好: {device}\n")
        if use_cuda_sync:
            f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
        f.write(f"预热次数: {args.warmup_runs}\n")
        f.write(f"测试次数: {args.test_runs}\n")
        f.write(f"平均推理时延: {avg_infer_time*1000:.2f} ms\n")
        f.write(f"时延标准差: {np.std(infer_times)*1000:.2f} ms\n")
        f.write(f"最大时延: {max(infer_times)*1000:.2f} ms\n")
        f.write(f"最小时延: {min(infer_times)*1000:.2f} ms\n")
        f.write(f"平均FPS: {fps:.2f} 帧/秒\n")
        f.write(f"单次推理时延(最后一次): {single_infer_time*1000:.2f} ms\n")
    print(f"\n✅ 性能报告已保存至: {performance_file}")
    print(f"✅ 推理结果图片已保存至: {os.path.join(args.output_dir, 'pred.jpg')}")
import argparse
import os
import sys
import time
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
def plot_boxes_to_image(image_pil, tgt):
    """Draw labeled boxes (normalized cxcywh in tgt) onto the image.

    Returns (annotated image, L-mode mask with box regions filled).
    """
    H, W = tgt["size"]
    boxes = tgt["boxes"]
    labels = tgt["labels"]
    assert len(boxes) == len(labels), "boxes and labels must have same length"

    draw = ImageDraw.Draw(image_pil)
    mask = Image.new("L", image_pil.size, 0)
    mask_draw = ImageDraw.Draw(mask)

    scale = torch.Tensor([W, H, W, H])
    for box, label in zip(boxes, labels):
        # denormalize, then convert cxcywh -> xyxy
        box = box * scale
        box[:2] -= box[2:] / 2
        box[2:] += box[:2]
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        x0, y0, x1, y1 = (int(v) for v in box)
        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)

        font = ImageFont.load_default()
        if hasattr(font, "getbbox"):
            # modern Pillow path
            bbox = draw.textbbox((x0, y0), str(label), font)
        else:
            # legacy Pillow fallback
            w, h = draw.textsize(str(label), font)
            bbox = (x0, y0, w + x0, y0 + h)
        draw.rectangle(bbox, fill=color)
        draw.text((x0, y0), str(label), fill="white")

        mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
    return image_pil, mask
def load_image(image_path):
    """Load an image; return (PIL RGB image, normalized 3xHxW tensor)."""
    image_pil = Image.open(image_path).convert("RGB")
    preprocess = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    tensor, _ = preprocess(image_pil, None)  # 3, h, w
    return image_pil, tensor
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    """Build a GroundingDINO model from config and load checkpoint weights."""
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = "cpu" if cpu_only else "cuda"
    model = build_model(cfg)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    print(load_res)  # surface missing/unexpected keys
    model.eval()
    return model
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
    """Run grounded detection on a single image and time the forward pass.

    Args:
        model: GroundingDINO model (moved to the chosen device here).
        image: 3xHxW normalized tensor.
        caption: text prompt; lowercased, stripped, forced to end with '.'.
        box_threshold: minimum per-query max score for keeping a box.
        text_threshold: per-token threshold for phrase decoding (ignored in
            token_spans mode).
        with_logits: append the confidence score to each phrase label.
        cpu_only: force CPU inference.
        token_spans: optional list of character-span groups into the caption;
            when given, phrases are scored by span instead of decoded.

    Returns:
        (boxes_filt, pred_phrases, infer_time): kept boxes (normalized
        cxcywh), their phrase labels, and forward latency in seconds.
    """
    assert text_threshold is not None or token_spans is not None, "text_threshould and token_spans should not be None at the same time!"
    caption = caption.lower().strip()
    if not caption.endswith("."):
        caption = caption + "."
    device = "cuda" if not cpu_only else "cpu"
    model = model.to(device)
    image = image.to(device)
    # Time only the forward pass; synchronize so GPU timings are accurate.
    if device == "cuda":
        torch.cuda.synchronize()
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
    if device == "cuda":
        torch.cuda.synchronize()
    infer_time = time.perf_counter() - start_time
    logits = outputs["pred_logits"].sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"][0]  # (nq, 4)
    # filter output
    if token_spans is None:
        logits_filt = logits.cpu().clone()
        boxes_filt = boxes.cpu().clone()
        filt_mask = logits_filt.max(dim=1)[0] > box_threshold
        logits_filt = logits_filt[filt_mask]  # num_filt, 256
        boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
        # Decode a phrase for each kept query from its token posmap.
        tokenlizer = model.tokenizer
        tokenized = tokenlizer(caption)
        pred_phrases = []
        for logit in logits_filt:
            pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
            if with_logits:
                pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
            else:
                pred_phrases.append(pred_phrase)
    else:
        # given-phrase mode
        # BUG FIX: tokenize the processed local `caption`, not the module
        # global `text_prompt` -- the original broke when called from outside
        # the __main__ script, and the spans index into the processed caption
        # (consistent with the batched variant of this function).
        positive_maps = create_positive_map_from_span(
            model.tokenizer(caption),
            token_span=token_spans
        ).to(image.device)  # n_phrase, 256
        logits_for_phrases = positive_maps @ logits.T  # n_phrase, nq
        all_phrases = []
        all_boxes = []
        for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
            # recover the phrase text from its character spans
            phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
            filt_mask = logit_phr > box_threshold
            all_boxes.append(boxes[filt_mask])
            if with_logits:
                logit_phr_num = logit_phr[filt_mask]
                all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
            else:
                # BUG FIX: emit one label per KEPT box (filt_mask.sum()), not
                # one per query (len(filt_mask)); the mismatch tripped the
                # boxes/labels assertion in plot_boxes_to_image.
                all_phrases.extend([phrase for _ in range(int(filt_mask.sum()))])
        boxes_filt = torch.cat(all_boxes, dim=0).cpu()
        pred_phrases = all_phrases
    return boxes_filt, pred_phrases, infer_time
def benchmark_performance(model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, warmup_runs=5, test_runs=10):
    """Performance benchmark: warmup + timed single-image inference runs.

    Args:
        warmup_runs: warmup iterations (excluded from statistics).
        test_runs: timed iterations.

    Returns:
        (avg_infer_time, fps, infer_times): average latency in seconds,
        average frames/second, and the raw per-run latency list.
    """
    # 1. Warmup phase (timings discarded).
    print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
    for i in range(warmup_runs):
        # BUG FIX: pass cpu_only/token_spans by keyword. The original
        # positional call bound cpu_only -> with_logits and
        # token_spans -> cpu_only, silently dropping the spans and
        # breaking device selection.
        get_grounding_output(
            model, image, text_prompt, box_threshold, text_threshold,
            cpu_only=cpu_only, token_spans=token_spans,
        )
        print(f"预热完成 {i+1}/{warmup_runs}")
    # 2. Timed phase.
    print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
    total_time = 0.0
    infer_times = []  # per-run latency
    for i in range(test_runs):
        _, _, infer_time = get_grounding_output(
            model, image, text_prompt, box_threshold, text_threshold,
            cpu_only=cpu_only, token_spans=token_spans,
        )
        infer_times.append(infer_time)
        total_time += infer_time
        print(f"测试 {i+1}/{test_runs} - 单次推理时延: {infer_time*1000:.2f} ms")
    # 3. Metrics.
    avg_infer_time = total_time / test_runs
    fps = test_runs / total_time
    std_infer_time = np.std(infer_times)  # latency jitter
    # 4. Report.
    print("\n" + "="*50)
    print("📊 性能测试报告")
    print("="*50)
    print(f"测试环境: {'GPU (CUDA)' if not cpu_only else 'CPU'}")
    print(f"测试次数: {test_runs} 次 (预热 {warmup_runs} 次)")
    print(f"平均推理时延: {avg_infer_time*1000:.2f} ms (±{std_infer_time*1000:.2f} ms)")
    print(f"最大推理时延: {max(infer_times)*1000:.2f} ms")
    print(f"最小推理时延: {min(infer_times)*1000:.2f} ms")
    print(f"平均FPS: {fps:.2f} 帧/秒")
    print("="*50 + "\n")
    return avg_infer_time, fps, infer_times
if __name__ == "__main__":
    # Local import: only needed for safe parsing of --token_spans.
    import ast

    parser = argparse.ArgumentParser("Grounding DINO 性能测试", add_help=True)
    parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
    parser.add_argument("--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file")
    parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
    parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
    parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True, help="output directory")
    parser.add_argument("--box_threshold", type=float, default=0.35, help="box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
    parser.add_argument("--token_spans", type=str, default=None, help=
        "The positions of start and end positions of phrases of interest. \
        For example, a caption is 'a cat and a dog', \
        if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
        if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
        ")
    parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
    # benchmarking parameters
    parser.add_argument("--warmup-runs", type=int, default=5, help="预热运行次数,默认5")
    parser.add_argument("--test-runs", type=int, default=10, help="正式测试运行次数,默认10")
    args = parser.parse_args()

    # cfg
    config_file = args.config_file
    checkpoint_path = args.checkpoint_path
    image_path = args.image_path
    text_prompt = args.text_prompt
    output_dir = args.output_dir
    box_threshold = args.box_threshold
    text_threshold = args.text_threshold
    token_spans = args.token_spans
    warmup_runs = args.warmup_runs
    test_runs = args.test_runs

    # Basic run info.
    print(f"📌 使用设备: {'GPU' if not args.cpu_only else 'CPU'}")
    if not args.cpu_only and torch.cuda.is_available():
        print(f"📌 GPU型号: {torch.cuda.get_device_name(0)}")
        print(f"📌 GPU编号: {torch.cuda.current_device()}")

    # make dir
    os.makedirs(output_dir, exist_ok=True)
    # load image / model
    image_pil, image = load_image(image_path)
    model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
    # visualize raw image
    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
    # set the text_threshold to None if token_spans is set.
    if token_spans is not None:
        text_threshold = None
        print("Using token_spans. Set the text_threshold to None.")

    # SECURITY FIX: parse the user-supplied span literal with
    # ast.literal_eval instead of eval(); parse once and reuse.
    parsed_token_spans = ast.literal_eval(token_spans) if token_spans else None

    # Run the benchmark.
    avg_infer_time, fps, infer_times = benchmark_performance(
        model, image, text_prompt, box_threshold, text_threshold,
        args.cpu_only, parsed_token_spans,
        warmup_runs, test_runs
    )

    # One more inference to produce the visualization (original behavior).
    print("\n=== 生成推理结果图片 ===")
    # BUG FIX: cpu_only/token_spans passed by keyword — the original
    # positional call bound them to with_logits/cpu_only instead.
    boxes_filt, pred_phrases, single_infer_time = get_grounding_output(
        model, image, text_prompt, box_threshold, text_threshold,
        cpu_only=args.cpu_only, token_spans=parsed_token_spans,
    )
    # visualize pred
    size = image_pil.size
    pred_dict = {
        "boxes": boxes_filt,
        "size": [size[1], size[0]],  # H,W
        "labels": pred_phrases,
    }
    image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
    image_with_box.save(os.path.join(output_dir, "pred.jpg"))

    # Persist the performance report.
    performance_file = os.path.join(output_dir, "performance_report.txt")
    with open(performance_file, "w", encoding="utf-8") as f:
        f.write("="*50 + "\n")
        f.write("Grounding DINO 性能测试报告\n")
        f.write("="*50 + "\n")
        f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"测试设备: {'GPU' if not args.cpu_only else 'CPU'}\n")
        if not args.cpu_only and torch.cuda.is_available():
            f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
        f.write(f"预热次数: {warmup_runs}\n")
        f.write(f"测试次数: {test_runs}\n")
        f.write(f"平均推理时延: {avg_infer_time*1000:.2f} ms\n")
        f.write(f"时延标准差: {np.std(infer_times)*1000:.2f} ms\n")
        f.write(f"最大时延: {max(infer_times)*1000:.2f} ms\n")
        f.write(f"最小时延: {min(infer_times)*1000:.2f} ms\n")
        f.write(f"平均FPS: {fps:.2f} 帧/秒\n")
        f.write(f"单次推理时延(最后一次): {single_infer_time*1000:.2f} ms\n")
    print(f"\n✅ 性能报告已保存至: {performance_file}")
    print(f"✅ 推理结果图片已保存至: {os.path.join(output_dir, 'pred.jpg')}")
import argparse
import os
import sys
import time
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
def plot_boxes_to_image(image_pil, tgt):
    """Draw predicted boxes and labels onto *image_pil*.

    Args:
        image_pil: PIL image to annotate (modified in place).
        tgt: dict with "size" [H, W], "boxes" (normalized cxcywh tensors)
            and "labels" (one string per box).

    Returns:
        (annotated PIL image, "L"-mode mask with box regions filled white)
    """
    H, W = tgt["size"]
    boxes, labels = tgt["boxes"], tgt["labels"]
    assert len(boxes) == len(labels), "boxes and labels must have same length"

    canvas = ImageDraw.Draw(image_pil)
    mask = Image.new("L", image_pil.size, 0)
    mask_canvas = ImageDraw.Draw(mask)

    for box, label in zip(boxes, labels):
        # scale normalized cxcywh to absolute pixels, then convert to xyxy
        scaled = box * torch.Tensor([W, H, W, H])
        scaled[:2] -= scaled[2:] / 2
        scaled[2:] += scaled[:2]
        x0, y0, x1, y1 = (int(v) for v in scaled)

        # one random color per box
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        canvas.rectangle([x0, y0, x1, y1], outline=color, width=6)

        # label background: textbbox on newer Pillow, textsize fallback on older
        font = ImageFont.load_default()
        if hasattr(font, "getbbox"):
            bbox = canvas.textbbox((x0, y0), str(label), font)
        else:
            w, h = canvas.textsize(str(label), font)
            bbox = (x0, y0, w + x0, y0 + h)
        canvas.rectangle(bbox, fill=color)
        canvas.text((x0, y0), str(label), fill="white")

        mask_canvas.rectangle([x0, y0, x1, y1], fill=255, width=6)

    return image_pil, mask
def load_image(image_path):
    """Load an image from disk and preprocess it for the model.

    Returns the original RGB PIL image together with a normalized
    (3, H, W) tensor resized so the short side is 800 px (long side
    capped at 1333).
    """
    pil_image = Image.open(image_path).convert("RGB")
    preprocess = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    tensor_image, _ = preprocess(pil_image, None)  # 3, h, w
    return pil_image, tensor_image
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    """Build a Grounding DINO model from a config file and load its weights.

    The checkpoint is always deserialized onto the CPU; the caller is
    responsible for moving the model to the target device afterwards.
    """
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = "cpu" if cpu_only else "cuda"
    model = build_model(cfg)
    state = torch.load(model_checkpoint_path, map_location="cpu")
    # strict=False: checkpoints may carry extra (e.g. text-encoder) keys
    report = model.load_state_dict(clean_state_dict(state["model"]), strict=False)
    print(report)
    model.eval()
    return model
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None, use_fp16=False):
    """Run Grounding DINO on one image and return filtered boxes and phrases.

    Args:
        model: Grounding DINO model (moved to the target device here).
        image: preprocessed (3, H, W) tensor.
        caption: text prompt; lowercased/stripped and "."-terminated internally.
        box_threshold: confidence threshold on the per-query max logit.
        text_threshold: per-token threshold (unused when token_spans is given).
        with_logits: append "(score)" to each predicted phrase.
        cpu_only: run on CPU instead of CUDA.
        token_spans: optional [[(start, end), ...], ...] character spans of
            phrases of interest within the caption.
        use_fp16: run the forward pass under CUDA autocast (float16).

    Returns:
        (boxes_filt, pred_phrases, infer_time): boxes are cxcywh in [0, 1] on
        CPU; infer_time is the forward-pass wall time in seconds.
    """
    assert text_threshold is not None or token_spans is not None, "text_threshould and token_spans should not be None at the same time!"
    caption = caption.lower().strip()
    if not caption.endswith("."):
        caption = caption + "."
    device = "cuda" if not cpu_only else "cpu"
    model = model.to(device)
    image = image.to(device)
    # Synchronize before/after the forward pass so the timing reflects the
    # actual GPU work rather than asynchronous kernel launches.
    if device == "cuda":
        torch.cuda.synchronize()
    start_time = time.perf_counter()  # high-resolution timer
    with torch.no_grad():
        if use_fp16 and device == "cuda":
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                outputs = model(image[None], captions=[caption])
        else:
            outputs = model(image[None], captions=[caption])
    if device == "cuda":
        torch.cuda.synchronize()
    infer_time = time.perf_counter() - start_time  # seconds
    # Cast back to FP32 so downstream CPU ops are safe under autocast.
    logits = outputs["pred_logits"].float().sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"].float()[0]  # (nq, 4)
    # filter output
    if token_spans is None:
        logits_filt = logits.cpu().clone()
        boxes_filt = boxes.cpu().clone()
        filt_mask = logits_filt.max(dim=1)[0] > box_threshold
        logits_filt = logits_filt[filt_mask]  # num_filt, 256
        boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
        # get phrase
        tokenlizer = model.tokenizer
        tokenized = tokenlizer(caption)
        # build pred
        pred_phrases = []
        for logit, box in zip(logits_filt, boxes_filt):
            pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
            if with_logits:
                pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
            else:
                pred_phrases.append(pred_phrase)
    else:
        # given-phrase mode
        # BUG FIX: the original tokenized the global `text_prompt`, which is
        # undefined inside this function (NameError if imported); tokenize the
        # local caption the spans refer to instead.
        positive_maps = create_positive_map_from_span(
            model.tokenizer(caption),
            token_span=token_spans
        ).to(image.device)  # n_phrase, 256
        logits_for_phrases = positive_maps @ logits.T  # n_phrase, nq
        all_logits = []
        all_phrases = []
        all_boxes = []
        for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
            # get phrase
            phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
            # get mask
            filt_mask = logit_phr > box_threshold
            # filt box
            all_boxes.append(boxes[filt_mask])
            # filt logits
            all_logits.append(logit_phr[filt_mask])
            if with_logits:
                logit_phr_num = logit_phr[filt_mask]
                all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
            else:
                # BUG FIX: emit one phrase per *kept* box, not one per query
                # (len(filt_mask) is the total query count, not the kept count).
                all_phrases.extend([phrase] * int(filt_mask.sum()))
        boxes_filt = torch.cat(all_boxes, dim=0).cpu()
        pred_phrases = all_phrases
    return boxes_filt, pred_phrases, infer_time
def benchmark_performance(model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, warmup_runs=5, test_runs=10, use_fp16=False):
    """Benchmark inference: warmup runs, then averaged FPS/latency.

    Args:
        warmup_runs: number of warmup iterations (excluded from timing, to
            absorb one-off costs such as CUDA context / kernel compilation).
        test_runs: number of timed iterations.

    Returns:
        (avg_infer_time_seconds, fps, per_run_latency_list)
    """
    # 1. Warmup phase (timings discarded)
    print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
    for i in range(warmup_runs):
        # BUG FIX: pass cpu_only/token_spans by keyword. The original passed
        # them positionally, binding cpu_only to `with_logits` and the spans
        # to `cpu_only` in get_grounding_output's signature.
        _, _, _ = get_grounding_output(
            model, image, text_prompt, box_threshold, text_threshold,
            cpu_only=cpu_only, token_spans=token_spans, use_fp16=use_fp16
        )
        print(f"预热完成 {i+1}/{warmup_runs}")
    # 2. Timed phase
    print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
    total_time = 0.0
    infer_times = []  # per-run latency in seconds
    for i in range(test_runs):
        _, _, infer_time = get_grounding_output(
            model, image, text_prompt, box_threshold, text_threshold,
            cpu_only=cpu_only, token_spans=token_spans, use_fp16=use_fp16
        )
        infer_times.append(infer_time)
        total_time += infer_time
        print(f"测试 {i+1}/{test_runs} - 单次推理时延: {infer_time*1000:.2f} ms")
    # 3. Aggregate metrics
    avg_infer_time = total_time / test_runs  # mean latency (seconds)
    fps = test_runs / total_time  # mean throughput
    std_infer_time = np.std(infer_times)  # latency jitter
    # 4. Report
    print("\n" + "="*50)
    print("📊 性能测试报告")
    print("="*50)
    print(f"测试环境: {'GPU (CUDA)' if not cpu_only else 'CPU'}")
    print(f"测试次数: {test_runs} 次 (预热 {warmup_runs} 次)")
    print(f"平均推理时延: {avg_infer_time*1000:.2f} ms (±{std_infer_time*1000:.2f} ms)")
    print(f"最大推理时延: {max(infer_times)*1000:.2f} ms")
    print(f"最小推理时延: {min(infer_times)*1000:.2f} ms")
    print(f"平均FPS: {fps:.2f} 帧/秒")
    print("="*50 + "\n")
    return avg_infer_time, fps, infer_times
if __name__ == "__main__":
    import ast  # stdlib; used to parse --token_spans safely instead of eval()

    parser = argparse.ArgumentParser("Grounding DINO 性能测试", add_help=True)
    parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
    parser.add_argument("--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file")
    parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
    parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
    parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True, help="output directory")
    parser.add_argument("--box_threshold", type=float, default=0.35, help="box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
    parser.add_argument("--token_spans", type=str, default=None, help=
        "The positions of start and end positions of phrases of interest. \
        For example, a caption is 'a cat and a dog', \
        if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
        if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
        ")
    parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
    # benchmark-specific options
    parser.add_argument("--warmup-runs", type=int, default=5, help="预热运行次数,默认5")
    parser.add_argument("--test-runs", type=int, default=10, help="正式测试运行次数,默认10")
    parser.add_argument("--fp16", action="store_true", help="Enable FP16 inference")
    args = parser.parse_args()
    # cfg
    config_file = args.config_file
    checkpoint_path = args.checkpoint_path
    image_path = args.image_path
    text_prompt = args.text_prompt
    output_dir = args.output_dir
    box_threshold = args.box_threshold
    text_threshold = args.text_threshold
    token_spans = args.token_spans
    warmup_runs = args.warmup_runs
    test_runs = args.test_runs
    # environment info
    print(f"📌 使用设备: {'GPU' if not args.cpu_only else 'CPU'}")
    if not args.cpu_only and torch.cuda.is_available():
        print(f"📌 GPU型号: {torch.cuda.get_device_name(0)}")
        print(f"📌 GPU编号: {torch.cuda.current_device()}")
    # make dir
    os.makedirs(output_dir, exist_ok=True)
    # load image
    image_pil, image = load_image(image_path)
    # load model
    model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
    # visualize raw image
    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
    # set the text_threshold to None if token_spans is set.
    if token_spans is not None:
        text_threshold = None
        print("Using token_spans. Set the text_threshold to None.")
    # SECURITY FIX: parse the span literal once with ast.literal_eval instead
    # of eval() on raw CLI input (eval executes arbitrary code).
    parsed_token_spans = ast.literal_eval(token_spans) if token_spans else None
    # run the performance benchmark
    avg_infer_time, fps, infer_times = benchmark_performance(
        model, image, text_prompt, box_threshold, text_threshold,
        args.cpu_only, parsed_token_spans,
        warmup_runs, test_runs, use_fp16=args.fp16
    )
    # single inference to produce the annotated image
    print("\n=== 生成推理结果图片 ===")
    # BUG FIX: pass cpu_only/token_spans by keyword; positionally they bound
    # to `with_logits` and `cpu_only` in get_grounding_output's signature.
    boxes_filt, pred_phrases, single_infer_time = get_grounding_output(
        model, image, text_prompt, box_threshold, text_threshold,
        cpu_only=args.cpu_only, token_spans=parsed_token_spans, use_fp16=args.fp16
    )
    # visualize pred
    size = image_pil.size
    pred_dict = {
        "boxes": boxes_filt,
        "size": [size[1], size[0]],  # H,W
        "labels": pred_phrases,
    }
    image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
    image_with_box.save(os.path.join(output_dir, "pred.jpg"))
    # write the performance report
    performance_file = os.path.join(output_dir, "performance_report.txt")
    with open(performance_file, "w", encoding="utf-8") as f:
        f.write("="*50 + "\n")
        f.write("Grounding DINO 性能测试报告\n")
        f.write("="*50 + "\n")
        f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"测试设备: {'GPU' if not args.cpu_only else 'CPU'}\n")
        if not args.cpu_only and torch.cuda.is_available():
            f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
        f.write(f"预热次数: {warmup_runs}\n")
        f.write(f"测试次数: {test_runs}\n")
        f.write(f"平均推理时延: {avg_infer_time*1000:.2f} ms\n")
        f.write(f"时延标准差: {np.std(infer_times)*1000:.2f} ms\n")
        f.write(f"最大时延: {max(infer_times)*1000:.2f} ms\n")
        f.write(f"最小时延: {min(infer_times)*1000:.2f} ms\n")
        f.write(f"平均FPS: {fps:.2f} 帧/秒\n")
        f.write(f"单次推理时延(最后一次): {single_infer_time*1000:.2f} ms\n")
    print(f"\n✅ 性能报告已保存至: {performance_file}")
    print(f"✅ 推理结果图片已保存至: {os.path.join(output_dir, 'pred.jpg')}")
\ No newline at end of file
import argparse
import os
import sys
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
def plot_boxes_to_image(image_pil, tgt):
    """Annotate *image_pil* with the boxes/labels in *tgt* and build a mask.

    Args:
        image_pil: PIL image, drawn on in place.
        tgt: dict — "size" is [H, W], "boxes" holds normalized cxcywh
            tensors, "labels" holds one string per box.

    Returns:
        (annotated image, "L"-mode mask with each box region filled white)
    """
    H, W = tgt["size"]
    boxes, labels = tgt["boxes"], tgt["labels"]
    assert len(boxes) == len(labels), "boxes and labels must have same length"

    drawer = ImageDraw.Draw(image_pil)
    mask = Image.new("L", image_pil.size, 0)
    mask_drawer = ImageDraw.Draw(mask)

    for box, label in zip(boxes, labels):
        # normalized cxcywh -> absolute xyxy
        abs_box = box * torch.Tensor([W, H, W, H])
        abs_box[:2] -= abs_box[2:] / 2
        abs_box[2:] += abs_box[:2]
        x0, y0, x1, y1 = (int(coord) for coord in abs_box)

        # a fresh random color for every box
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        drawer.rectangle([x0, y0, x1, y1], outline=color, width=6)

        # compute the label background; Pillow >= 8 exposes getbbox/textbbox
        font = ImageFont.load_default()
        if hasattr(font, "getbbox"):
            bbox = drawer.textbbox((x0, y0), str(label), font)
        else:
            w, h = drawer.textsize(str(label), font)
            bbox = (x0, y0, w + x0, y0 + h)
        drawer.rectangle(bbox, fill=color)
        drawer.text((x0, y0), str(label), fill="white")

        mask_drawer.rectangle([x0, y0, x1, y1], fill=255, width=6)

    return image_pil, mask
def load_image(image_path):
    """Read an image and return (PIL image, normalized (3, H, W) tensor).

    The tensor is resized so the short side is 800 px (long side <= 1333)
    and normalized with ImageNet statistics.
    """
    pil_img = Image.open(image_path).convert("RGB")
    to_tensor = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    img_tensor, _ = to_tensor(pil_img, None)  # 3, h, w
    return pil_img, img_tensor
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    """Instantiate Grounding DINO from config and checkpoint, in eval mode."""
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = "cpu" if cpu_only else "cuda"
    net = build_model(cfg)
    # always deserialize onto CPU; device placement happens at call sites
    ckpt = torch.load(model_checkpoint_path, map_location="cpu")
    result = net.load_state_dict(clean_state_dict(ckpt["model"]), strict=False)
    print(result)
    net.eval()
    return net
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
    """Run Grounding DINO on one image and return filtered boxes and phrases.

    Args:
        model: Grounding DINO model (moved to the target device here).
        image: preprocessed (3, H, W) tensor.
        caption: text prompt; lowercased/stripped and "."-terminated internally.
        box_threshold: confidence threshold on the per-query max logit.
        text_threshold: per-token threshold (unused when token_spans is given).
        with_logits: append "(score)" to each predicted phrase.
        cpu_only: run on CPU instead of CUDA.
        token_spans: optional [[(start, end), ...], ...] character spans of
            phrases of interest within the caption.

    Returns:
        (boxes_filt, pred_phrases): boxes are cxcywh in [0, 1] on CPU.
    """
    assert text_threshold is not None or token_spans is not None, "text_threshould and token_spans should not be None at the same time!"
    caption = caption.lower().strip()
    if not caption.endswith("."):
        caption = caption + "."
    device = "cuda" if not cpu_only else "cpu"
    model = model.to(device)
    image = image.to(device)
    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
    logits = outputs["pred_logits"].sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"][0]  # (nq, 4)
    # filter output
    if token_spans is None:
        logits_filt = logits.cpu().clone()
        boxes_filt = boxes.cpu().clone()
        filt_mask = logits_filt.max(dim=1)[0] > box_threshold
        logits_filt = logits_filt[filt_mask]  # num_filt, 256
        boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
        # get phrase
        tokenlizer = model.tokenizer
        tokenized = tokenlizer(caption)
        # build pred
        pred_phrases = []
        for logit, box in zip(logits_filt, boxes_filt):
            pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
            if with_logits:
                pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
            else:
                pred_phrases.append(pred_phrase)
    else:
        # given-phrase mode
        # BUG FIX: the original tokenized the global `text_prompt`, which is
        # undefined in this function's scope (NameError if imported); tokenize
        # the local caption the spans refer to instead.
        positive_maps = create_positive_map_from_span(
            model.tokenizer(caption),
            token_span=token_spans
        ).to(image.device)  # n_phrase, 256
        logits_for_phrases = positive_maps @ logits.T  # n_phrase, nq
        all_logits = []
        all_phrases = []
        all_boxes = []
        for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
            # get phrase
            phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
            # get mask
            filt_mask = logit_phr > box_threshold
            # filt box
            all_boxes.append(boxes[filt_mask])
            # filt logits
            all_logits.append(logit_phr[filt_mask])
            if with_logits:
                logit_phr_num = logit_phr[filt_mask]
                all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
            else:
                # BUG FIX: emit one phrase per *kept* box, not one per query
                # (len(filt_mask) equals the query count, not the kept count).
                all_phrases.extend([phrase] * int(filt_mask.sum()))
        boxes_filt = torch.cat(all_boxes, dim=0).cpu()
        pred_phrases = all_phrases
    return boxes_filt, pred_phrases
if __name__ == "__main__":
    import ast  # stdlib; safe parsing of the --token_spans literal

    parser = argparse.ArgumentParser("Grounding DINO example", add_help=True)
    parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
    parser.add_argument(
        "--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file"
    )
    parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
    parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
    parser.add_argument(
        "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
    )
    parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
    parser.add_argument("--token_spans", type=str, default=None, help=
        "The positions of start and end positions of phrases of interest. \
        For example, a caption is 'a cat and a dog', \
        if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
        if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
        ")
    parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
    args = parser.parse_args()
    # cfg
    config_file = args.config_file  # change the path of the model config file
    checkpoint_path = args.checkpoint_path  # change the path of the model
    image_path = args.image_path
    text_prompt = args.text_prompt
    output_dir = args.output_dir
    box_threshold = args.box_threshold
    text_threshold = args.text_threshold
    token_spans = args.token_spans
    # make dir
    os.makedirs(output_dir, exist_ok=True)
    # load image
    image_pil, image = load_image(image_path)
    # load model
    model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
    # visualize raw image
    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
    # set the text_threshold to None if token_spans is set.
    if token_spans is not None:
        text_threshold = None
        print("Using token_spans. Set the text_threshold to None.")
    # run model
    # SECURITY FIX: parse the span literal with ast.literal_eval instead of
    # eval() on raw CLI input (eval executes arbitrary code).
    boxes_filt, pred_phrases = get_grounding_output(
        model, image, text_prompt, box_threshold, text_threshold,
        cpu_only=args.cpu_only,
        token_spans=ast.literal_eval(token_spans) if token_spans else None
    )
    # visualize pred
    size = image_pil.size
    pred_dict = {
        "boxes": boxes_filt,
        "size": [size[1], size[0]],  # H,W
        "labels": pred_phrases,
    }
    image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
    image_with_box.save(os.path.join(output_dir, "pred.jpg"))
import argparse
import os
import sys
import time
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, DistributedSampler
from groundingdino.models import build_model
import groundingdino.datasets.transforms as T
from groundingdino.util import box_ops, get_tokenlizer
from groundingdino.util.misc import clean_state_dict, collate_fn
from groundingdino.util.slconfig import SLConfig
# from torchvision.datasets import CocoDetection
import torchvision
from groundingdino.util.vl_utils import build_captions_and_token_span, create_positive_map_from_span
from groundingdino.datasets.cocogrounding_eval import CocoGroundingEvaluator
def load_model(model_config_path: str, model_checkpoint_path: str, device: str = "cuda"):
    """Build Grounding DINO from config, load its checkpoint, set eval mode."""
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = device
    net = build_model(cfg)
    # deserialize onto CPU; the caller moves the model to `device`
    ckpt = torch.load(model_checkpoint_path, map_location="cpu")
    net.load_state_dict(clean_state_dict(ckpt["model"]), strict=False)
    net.eval()
    return net
class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO dataset wrapper that converts raw annotations into the
    {image_id, boxes (xyxy), orig_size} dict the evaluator expects."""
    def __init__(self, img_folder, ann_file, transforms):
        super().__init__(img_folder, ann_file)
        self._transforms = transforms
    def __getitem__(self, idx):
        """Return (image, target) with boxes as clamped xyxy float tensors."""
        img, target = super().__getitem__(idx) # target: list
        # import ipdb; ipdb.set_trace()
        w, h = img.size
        boxes = [obj["bbox"] for obj in target]
        # reshape(-1, 4) keeps the empty-annotation case as a (0, 4) tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        boxes[:, 2:] += boxes[:, :2]  # xywh -> xyxy
        # clamp coordinates to the image bounds (in place)
        boxes[:, 0::2].clamp_(min=0, max=w)
        boxes[:, 1::2].clamp_(min=0, max=h)
        # filt invalid boxes/masks/keypoints
        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
        boxes = boxes[keep]
        target_new = {}
        image_id = self.ids[idx]
        target_new["image_id"] = image_id
        target_new["boxes"] = boxes
        # orig_size is (H, W) — used later to rescale predicted boxes
        target_new["orig_size"] = torch.as_tensor([int(h), int(w)])
        # NOTE(review): when _transforms is None the raw annotation list is
        # returned instead of target_new — confirm callers always pass one.
        if self._transforms is not None:
            img, target = self._transforms(img, target_new)
        return img, target
class PostProcessCocoGrounding(nn.Module):
    """ This module converts the model's output into the format expected by the coco api"""
    def __init__(self, num_select=300, coco_api=None, tokenlizer=None) -> None:
        """Precompute the token positive map for every COCO category.

        Args:
            num_select: number of top-scoring (query, class) pairs to keep.
            coco_api: loaded COCO API object supplying the category list.
            tokenlizer: tokenizer matching the model's text encoder.
        """
        super().__init__()
        self.num_select = num_select
        assert coco_api is not None
        category_dict = coco_api.dataset['categories']
        cat_list = [item['name'] for item in category_dict]
        # Build one caption containing all category names and record which
        # token span belongs to each category.
        captions, cat2tokenspan = build_captions_and_token_span(cat_list, True)
        tokenspanlist = [cat2tokenspan[cat] for cat in cat_list]
        positive_map = create_positive_map_from_span(
            tokenlizer(captions), tokenspanlist)  # 80, 256. normed
        # Mapping from contiguous label index (0..79) to the original COCO
        # category id (1..90, with gaps).
        id_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 13, 12: 14, 13: 15, 14: 16, 15: 17, 16: 18, 17: 19, 18: 20, 19: 21, 20: 22, 21: 23, 22: 24, 23: 25, 24: 27, 25: 28, 26: 31, 27: 32, 28: 33, 29: 34, 30: 35, 31: 36, 32: 37, 33: 38, 34: 39, 35: 40, 36: 41, 37: 42, 38: 43, 39: 44, 40: 46,
                  41: 47, 42: 48, 43: 49, 44: 50, 45: 51, 46: 52, 47: 53, 48: 54, 49: 55, 50: 56, 51: 57, 52: 58, 53: 59, 54: 60, 55: 61, 56: 62, 57: 63, 58: 64, 59: 65, 60: 67, 61: 70, 62: 72, 63: 73, 64: 74, 65: 75, 66: 76, 67: 77, 68: 78, 69: 79, 70: 80, 71: 81, 72: 82, 73: 84, 74: 85, 75: 86, 76: 87, 77: 88, 78: 89, 79: 90}
        # build a mapping from label_id to pos_map (row index = COCO id)
        new_pos_map = torch.zeros((91, 256))
        for k, v in id_map.items():
            new_pos_map[v] = positive_map[k]
        self.positive_map = new_pos_map
    @torch.no_grad()
    def forward(self, outputs, target_sizes, not_to_xyxy=False):
        """ Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
            For evaluation, this must be the original image size (before any data augmentation)
            For visualization, this should be the image size after data augment, but before padding
        Returns:
            one {"scores", "labels", "boxes"} dict per image, boxes in
            absolute xyxy pixel coordinates.
        """
        num_select = self.num_select
        out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
        # pos map to logit
        prob_to_token = out_logits.sigmoid()  # bs, 100, 256
        pos_maps = self.positive_map.to(prob_to_token.device)
        # (bs, 100, 256) @ (91, 256).T -> (bs, 100, 91)
        prob_to_label = prob_to_token @ pos_maps.T
        # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
        # import ipdb; ipdb.set_trace()
        assert len(out_logits) == len(target_sizes)
        assert target_sizes.shape[1] == 2
        prob = prob_to_label
        # top-k over the flattened (query, class) grid, so one query may
        # contribute several class predictions
        topk_values, topk_indexes = torch.topk(
            prob.view(out_logits.shape[0], -1), num_select, dim=1)
        scores = topk_values
        # recover the query index and the class index from the flat index
        topk_boxes = topk_indexes // prob.shape[2]
        labels = topk_indexes % prob.shape[2]
        if not_to_xyxy:
            boxes = out_bbox
        else:
            boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
        boxes = torch.gather(
            boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
        # and from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]
        results = [{'scores': s, 'labels': l, 'boxes': b}
                   for s, l, b in zip(scores, labels, boxes)]
        return results
def main(args):
    """Evaluate Grounding DINO on a COCO-format detection dataset.

    Builds the model, streams the validation set, converts raw outputs to
    COCO results and prints the standard bbox metrics.
    """
    # config
    cfg = SLConfig.fromfile(args.config_file)
    # build model
    model = load_model(args.config_file, args.checkpoint_path)
    model = model.to(args.device)
    model = model.eval()
    # build dataloader
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    dataset = CocoDetection(
        args.image_dir, args.anno_path, transforms=transform)
    data_loader = DataLoader(
        dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn)
    # build post processor
    # FIX: honor --num_select; the original parsed it but never used it.
    tokenlizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
    postprocessor = PostProcessCocoGrounding(
        num_select=args.num_select, coco_api=dataset.coco, tokenlizer=tokenlizer)
    # build evaluator
    evaluator = CocoGroundingEvaluator(
        dataset.coco, iou_types=("bbox",), useCats=True)
    # build captions: one " . "-joined prompt covering every category
    category_dict = dataset.coco.dataset['categories']
    cat_list = [item['name'] for item in category_dict]
    caption = " . ".join(cat_list) + ' .'
    print("Input text prompt:", caption)
    # run inference
    start = time.time()
    for i, (images, targets) in enumerate(data_loader):
        # get images and captions
        images = images.tensors.to(args.device)
        bs = images.shape[0]
        input_captions = [caption] * bs
        # BUG FIX: run the forward pass under no_grad; the original built the
        # autograd graph during pure evaluation, wasting memory.
        with torch.no_grad():
            outputs = model(images, captions=input_captions)
        orig_target_sizes = torch.stack(
            [t["orig_size"] for t in targets], dim=0).to(images.device)
        results = postprocessor(outputs, orig_target_sizes)
        cocogrounding_res = {
            target["image_id"]: output for target, output in zip(targets, results)}
        evaluator.update(cocogrounding_res)
        if (i+1) % 30 == 0:
            used_time = time.time() - start
            # BUG FIX: average over the batches actually processed (i+1);
            # dividing by i+1e-5 overestimated the ETA.
            eta = len(data_loader) / (i + 1) * used_time - used_time
            print(
                f"processed {i}/{len(data_loader)} images. time: {used_time:.2f}s, ETA: {eta:.2f}s")
    evaluator.synchronize_between_processes()
    evaluator.accumulate()
    evaluator.summarize()
    print("Final results:", evaluator.coco_eval["bbox"].stats.tolist())
if __name__ == "__main__":
    # CLI entry point: model paths, device, post-processing and COCO dataset
    # options, then hand off to main().
    parser = argparse.ArgumentParser(
        "Grounding DINO eval on COCO", add_help=True)
    # load model
    parser.add_argument("--config_file", "-c", type=str,
                        required=True, help="path to config file")
    parser.add_argument(
        "--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file"
    )
    parser.add_argument("--device", type=str, default="cuda",
                        help="running device (default: cuda)")
    # post processing
    parser.add_argument("--num_select", type=int, default=300,
                        help="number of topk to select")
    # coco info
    parser.add_argument("--anno_path", type=str,
                        required=True, help="coco root")
    parser.add_argument("--image_dir", type=str,
                        required=True, help="coco image dir")
    parser.add_argument("--num_workers", type=int, default=4,
                        help="number of workers for dataloader")
    args = parser.parse_args()
    main(args)
import argparse
import os
import sys
import time
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, DistributedSampler
import torchvision
import onnxruntime as ort
import groundingdino.datasets.transforms as T
from groundingdino.util import box_ops, get_tokenlizer
from groundingdino.util.misc import collate_fn
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.vl_utils import build_captions_and_token_span, create_positive_map_from_span
from groundingdino.datasets.cocogrounding_eval import CocoGroundingEvaluator
class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO dataset wrapper that converts raw annotations into the
    {image_id, boxes (xyxy), orig_size} dict the evaluator expects."""
    def __init__(self, img_folder, ann_file, transforms):
        super().__init__(img_folder, ann_file)
        self._transforms = transforms
    def __getitem__(self, idx):
        """Return (image, target) with boxes as clamped xyxy float tensors."""
        img, target = super().__getitem__(idx)
        w, h = img.size
        boxes = [obj["bbox"] for obj in target]
        # reshape(-1, 4) keeps the empty-annotation case as a (0, 4) tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        boxes[:, 2:] += boxes[:, :2]  # xywh -> xyxy
        # clamp coordinates to the image bounds (in place)
        boxes[:, 0::2].clamp_(min=0, max=w)
        boxes[:, 1::2].clamp_(min=0, max=h)
        # drop degenerate boxes (zero or negative width/height)
        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
        boxes = boxes[keep]
        target_new = {}
        image_id = self.ids[idx]
        target_new["image_id"] = image_id
        target_new["boxes"] = boxes
        # orig_size is (H, W) — used later to rescale predicted boxes
        target_new["orig_size"] = torch.as_tensor([int(h), int(w)])
        # NOTE(review): when _transforms is None the raw annotation list is
        # returned instead of target_new — confirm callers always pass one.
        if self._transforms is not None:
            img, target = self._transforms(img, target_new)
        return img, target
class PostProcessCocoGrounding(nn.Module):
    """Post-processing matching the PyTorch eval script, adapted to numpy
    outputs coming from ONNX Runtime (inputs are wrapped via from_numpy)."""
    def __init__(self, num_select=300, coco_api=None, tokenlizer=None) -> None:
        """Precompute the token positive map for every COCO category.

        Args:
            num_select: number of top-scoring (query, class) pairs to keep.
            coco_api: loaded COCO API object supplying the category list.
            tokenlizer: tokenizer matching the model's text encoder.
        """
        super().__init__()
        self.num_select = num_select
        assert coco_api is not None
        category_dict = coco_api.dataset['categories']
        cat_list = [item['name'] for item in category_dict]
        # One caption containing all category names + per-category token span.
        captions, cat2tokenspan = build_captions_and_token_span(cat_list, True)
        tokenspanlist = [cat2tokenspan[cat] for cat in cat_list]
        positive_map = create_positive_map_from_span(
            tokenlizer(captions), tokenspanlist)  # 80, 256. normed
        # Contiguous label index (0..79) -> original COCO category id (1..90).
        id_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 13, 12: 14, 13: 15, 14: 16, 15: 17, 16: 18, 17: 19, 18: 20, 19: 21, 20: 22, 21: 23, 22: 24, 23: 25, 24: 27, 25: 28, 26: 31, 27: 32, 28: 33, 29: 34, 30: 35, 31: 36, 32: 37, 33: 38, 34: 39, 35: 40, 36: 41, 37: 42, 38: 43, 39: 44, 40: 46,
                  41: 47, 42: 48, 43: 49, 44: 50, 45: 51, 46: 52, 47: 53, 48: 54, 49: 55, 50: 56, 51: 57, 52: 58, 53: 59, 54: 60, 55: 61, 56: 62, 57: 63, 58: 64, 59: 65, 60: 67, 61: 70, 62: 72, 63: 73, 64: 74, 65: 75, 66: 76, 67: 77, 68: 78, 69: 79, 70: 80, 71: 81, 72: 82, 73: 84, 74: 85, 75: 86, 76: 87, 77: 88, 78: 89, 79: 90}
        new_pos_map = torch.zeros((91, 256))
        for k, v in id_map.items():
            new_pos_map[v] = positive_map[k]
        self.positive_map = new_pos_map
    @torch.no_grad()
    def forward(self, outputs, target_sizes, not_to_xyxy=False):
        """Convert raw (numpy) model outputs into per-image COCO result dicts.

        Args:
            outputs: dict with numpy "pred_logits" and "pred_boxes".
            target_sizes: (bs, 2) tensor of original (H, W) image sizes.
            not_to_xyxy: keep boxes in cxcywh instead of converting to xyxy.
        """
        num_select = self.num_select
        out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
        prob_to_token = torch.from_numpy(out_logits).sigmoid()  # ONNX outputs are numpy
        pos_maps = self.positive_map.to(prob_to_token.device)
        # (bs, nq, 256) @ (91, 256).T -> (bs, nq, 91)
        prob_to_label = prob_to_token @ pos_maps.T
        assert prob_to_label.shape[0] == len(target_sizes)
        assert target_sizes.shape[1] == 2
        prob = prob_to_label
        # top-k over the flattened (query, class) grid
        topk_values, topk_indexes = torch.topk(
            prob.view(prob_to_label.shape[0], -1), num_select, dim=1)
        scores = topk_values
        # recover the query index and class index from the flat index
        topk_boxes = topk_indexes // prob.shape[2]
        labels = topk_indexes % prob.shape[2]
        if not_to_xyxy:
            boxes = torch.from_numpy(out_bbox)
        else:
            boxes = box_ops.box_cxcywh_to_xyxy(torch.from_numpy(out_bbox))
        boxes = torch.gather(
            boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
        # relative [0, 1] -> absolute pixel coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]
        results = [{'scores': s, 'labels': l, 'boxes': b}
                   for s, l, b in zip(scores, labels, boxes)]
        return results
def load_onnx_model(onnx_path, device="cuda"):
    """Create an ONNX Runtime inference session for the exported model.

    Uses the ROCm execution provider when a GPU build of onnxruntime is
    available and device == "cuda"; otherwise falls back to CPU.
    """
    providers = ['CPUExecutionProvider']
    if device == "cuda" and ort.get_device() == "GPU":
        providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
    # BUG FIX: the original gated provider_options on "CUDAExecutionProvider",
    # which is never in `providers`, so device_id was silently dropped. When
    # provider_options is supplied it must contain one dict per provider.
    provider_options = None
    if 'ROCMExecutionProvider' in providers:
        provider_options = [{'device_id': 0}, {}]
    session = ort.InferenceSession(
        onnx_path,
        providers=providers,
        provider_options=provider_options,
    )
    return session
def onnx_inference(session, images, captions):
    """Run the ONNX session on a batch of images.

    NOTE(review): `captions` is accepted but never fed to the session here —
    presumably the exported graph has the text inputs baked in or they are
    handled elsewhere; confirm against the actual ONNX model.
    """
    # ONNX Runtime consumes numpy arrays, not torch tensors
    images_np = images.cpu().numpy().astype(np.float32)
    # NOTE(review): the input name must match the exported graph (inspect it
    # with netron); assumed here that input 0 is the image tensor.
    input_feed = {
        session.get_inputs()[0].name: images_np,
        # if captions are tokenized model inputs, add them here, e.g.
        # session.get_inputs()[1].name: captions_np
    }
    # run inference
    outputs = session.run(None, input_feed)
    # NOTE(review): output order/shapes assumed to be logits then boxes —
    # verify against the exported model.
    pred_logits = outputs[0]  # assumed shape: [bs, 100, 256]
    pred_boxes = outputs[1]  # assumed shape: [bs, 100, 4]
    return {"pred_logits": pred_logits, "pred_boxes": pred_boxes}
def main(args):
    """Evaluate the exported ONNX Grounding DINO model on a COCO dataset.

    Mirrors the PyTorch eval script, replacing the forward pass with an
    ONNX Runtime session.
    """
    # config (only needed for the tokenizer / post-processing)
    cfg = SLConfig.fromfile(args.config_file)
    # load the ONNX model
    onnx_session = load_onnx_model(args.onnx_path, args.device)
    # build the dataloader (same preprocessing as the PyTorch script)
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    dataset = CocoDetection(
        args.image_dir, args.anno_path, transforms=transform)
    data_loader = DataLoader(
        dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn)
    # build the post processor
    # FIX: honor --num_select; the original parsed it but never used it.
    tokenlizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
    postprocessor = PostProcessCocoGrounding(
        num_select=args.num_select, coco_api=dataset.coco, tokenlizer=tokenlizer)
    # build the evaluator
    evaluator = CocoGroundingEvaluator(
        dataset.coco, iou_types=("bbox",), useCats=True)
    # build the text prompt covering every category
    category_dict = dataset.coco.dataset['categories']
    cat_list = [item['name'] for item in category_dict]
    caption = " . ".join(cat_list) + ' .'
    print("Input text prompt:", caption)
    # run inference
    start = time.time()
    for i, (images, targets) in enumerate(data_loader):
        # preprocess images (same as the PyTorch script)
        images = images.tensors.to(args.device)
        bs = images.shape[0]
        input_captions = [caption] * bs
        # ONNX inference (replaces the PyTorch forward pass)
        outputs = onnx_inference(onnx_session, images, input_captions)
        # post-process (adapted to the numpy ONNX outputs)
        orig_target_sizes = torch.stack(
            [t["orig_size"] for t in targets], dim=0).to(args.device)
        results = postprocessor(outputs, orig_target_sizes)
        cocogrounding_res = {
            target["image_id"]: output for target, output in zip(targets, results)}
        evaluator.update(cocogrounding_res)
        # progress report
        if (i+1) % 30 == 0:
            used_time = time.time() - start
            # BUG FIX: average over the batches actually processed (i+1);
            # dividing by i+1e-5 overestimated the ETA.
            eta = len(data_loader) / (i + 1) * used_time - used_time
            print(
                f"processed {i}/{len(data_loader)} images. time: {used_time:.2f}s, ETA: {eta:.2f}s")
    # aggregate the metrics
    evaluator.synchronize_between_processes()
    evaluator.accumulate()
    evaluator.summarize()
    print("Final results:", evaluator.coco_eval["bbox"].stats.tolist())
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        "Grounding DINO ONNX eval on COCO", add_help=True)
    # path to the exported ONNX model
    parser.add_argument("--onnx_path", type=str, required=True, help="path to onnx model file")
    # original config file is kept: it drives the tokenizer and post-processing
    parser.add_argument("--config_file", "-c", type=str,
                        required=True, help="path to config file")
    parser.add_argument("--device", type=str, default="cuda",
                        help="running device (default: cuda)")
    # post-processing options
    parser.add_argument("--num_select", type=int, default=300,
                        help="number of topk to select")
    # COCO dataset options
    parser.add_argument("--anno_path", type=str,
                        required=True, help="coco annotation path")
    parser.add_argument("--image_dir", type=str,
                        required=True, help="coco image dir")
    parser.add_argument("--num_workers", type=int, default=4,
                        help="number of workers for dataloader")
    args = parser.parse_args()
    main(args)
\ No newline at end of file
from groundingdino.util.inference import load_model, load_image, predict, annotate
import torch
import cv2
# Smoke test: load the GroundingDINO SwinT model and confirm CUDA is available.
# BUG FIX: the config path previously ended in ".pyy" (typo). SLConfig loads the
# config as a Python file, so the extension must be ".py" or loading fails.
model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")
model = model.to('cuda:0')
print(torch.cuda.is_available())
print('DONE!')
\ No newline at end of file
This diff is collapsed.
final text_encoder_type: bert-base-uncased
This diff is collapsed.
This diff is collapsed.
import torch
import onnx
from onnxsim import simplify
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
# Paths to the SwinB model config (loaded via SLConfig) and its pretrained
# checkpoint (the COGCOOR SwinB variant, judging by the filename).
config_file = './groundingdino/config/GroundingDINO_SwinB_cfg.py'
checkpoint_path = './weights/groundingdino_swinb_cogcoor.pth'
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    """Build a GroundingDINO model from a config file and restore its weights.

    Args:
        model_config_path: path to an SLConfig-style ``.py`` config file.
        model_checkpoint_path: path to a ``.pth`` checkpoint holding a
            ``"model"`` state dict.
        cpu_only: when True, set the config device to CPU; otherwise CUDA.

    Returns:
        The model in ``eval()`` mode with the checkpoint weights loaded.
    """
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = "cpu" if cpu_only else "cuda"
    # Disable gradient checkpointing — not needed for inference/export.
    cfg.use_checkpoint = False
    cfg.use_transformer_ckpt = False

    net = build_model(cfg)
    state = torch.load(model_checkpoint_path, map_location="cpu")
    # strict=False tolerates key mismatches between checkpoint and model.
    net.load_state_dict(clean_state_dict(state["model"]), strict=False)
    net.eval()
    return net
# Load the model on CPU — tracing for ONNX export is done on CPU here.
model = load_model(config_file, checkpoint_path, cpu_only=True)
# Text prompt used at real inference time, plus its related token masks.
# NOTE(review): the masks below are hard-coded for a 4-token sequence
# (presumably "[CLS] car . [SEP]") — confirm they match the tokenizer output.
caption = "car ."
input_ids = model.tokenizer([caption], return_tensors="pt")["input_ids"]
position_ids = torch.tensor([[0, 0, 1, 0]])
token_type_ids = torch.tensor([[0, 0, 0, 0]])
attention_mask = torch.tensor([[True, True, True, True]])
text_token_mask = torch.tensor([[[True, False, False, False],
[False, True, True, False],
[False, True, True, False],
[False, False, False, True]]])
# Fixed input resolution for the dummy image used during tracing.
img = torch.randn(1, 3, 800, 1200)
# The ONNX model can support dynamic inputs; it is recommended to comment this
# out when converting to a TensorRT engine. (Translated from original comment.)
dynamic_axes = {
"input_ids": {0: "batch_size", 1: "seq_len"},
"attention_mask": {0: "batch_size", 1: "seq_len"},
"position_ids": {0: "batch_size", 1: "seq_len"},
"token_type_ids": {0: "batch_size", 1: "seq_len"},
"text_token_mask": {0: "batch_size", 1: "seq_len", 2: "seq_len"},
"img": {0: "batch_size", 2: "height", 3: "width"},
"logits": {0: "batch_size"},
"boxes": {0: "batch_size"}
}
# Export the raw ONNX model.
# NOTE(review): dynamic_axes is defined above but never passed to
# torch.onnx.export, so this export is static-shape only — confirm whether
# that is intentional (the comment above suggests it is, for engine builds).
onnx_output_path = "weights/ground.onnx"
torch.onnx.export(
model,
f=onnx_output_path,
args=(img, input_ids, attention_mask, position_ids, token_type_ids, text_token_mask),
input_names=["img", "input_ids", "attention_mask", "position_ids", "token_type_ids", "text_token_mask"],
output_names=["logits", "boxes"],
opset_version=17,
verbose=False,  # keep export logs quiet; set True when debugging
do_constant_folding=True  # constant folding improves the simplification step
)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
from onnxruntime.quantization import quantize_dynamic, QuantType
# Apply post-training dynamic quantization: weights stored as signed int8.
fp32_model_path = "weights/ground.onnx"
int8_model_path = "weights/ground_int8.onnx"
quantize_dynamic(
    model_input=fp32_model_path,
    model_output=int8_model_path,
    weight_type=QuantType.QInt8,
)
print("int8 quantization done!")
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment