Commit 34e4011b authored by zk's avatar zk
Browse files

首次提交

parents
Pipeline #3503 failed with stages
in 0 seconds
This diff is collapsed.
This diff is collapsed.
import argparse
import os
import sys
import time
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
# ====================== 核心配置 - 只需要改这里就能调整batch size ======================
INFERENCE_BATCH_SIZE = 8 # 推理批次大小,修改这个值即可改变batch size
# ====================================================================================
def plot_boxes_to_image(image_pil, tgt):
    """Draw boxes and their labels onto a PIL image.

    `tgt["boxes"]` holds normalized (cx, cy, w, h) boxes; `tgt["size"]` is
    (H, W). Returns the annotated image and a single-channel mask with the
    box regions filled.
    """
    H, W = tgt["size"]
    boxes = tgt["boxes"]
    labels = tgt["labels"]
    assert len(boxes) == len(labels), "boxes and labels must have same length"

    draw = ImageDraw.Draw(image_pil)
    mask = Image.new("L", image_pil.size, 0)
    mask_draw = ImageDraw.Draw(mask)

    scale = torch.Tensor([W, H, W, H])
    for box, label in zip(boxes, labels):
        # scale from normalized coords, then convert cxcywh -> xyxy
        box = box * scale
        box[:2] -= box[2:] / 2
        box[2:] += box[:2]
        # one random color per box
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        x0, y0, x1, y1 = (int(coord) for coord in box)
        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)

        font = ImageFont.load_default()
        if hasattr(font, "getbbox"):
            # modern Pillow: measure via textbbox
            bbox = draw.textbbox((x0, y0), str(label), font)
        else:
            # legacy Pillow fallback
            w, h = draw.textsize(str(label), font)
            bbox = (x0, y0, w + x0, y0 + h)
        draw.rectangle(bbox, fill=color)
        draw.text((x0, y0), str(label), fill="white")

        mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
    return image_pil, mask
def load_image(image_path):
    """Load an image from disk.

    Returns (PIL RGB image, normalized 3xHxW tensor ready for the model).
    """
    image_pil = Image.open(image_path).convert("RGB")
    preprocess = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    tensor, _ = preprocess(image_pil, None)  # 3, h, w
    return image_pil, tensor
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    """Build a GroundingDINO model from a config file and load checkpoint weights.

    The checkpoint is mapped to CPU first; the model is returned in eval mode.
    """
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = "cpu" if cpu_only else "cuda"
    model = build_model(cfg)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    print(load_res)  # surface missing/unexpected keys for the operator
    model.eval()
    return model
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
    """Run a replicated batch of grounded detection and time the forward pass.

    The single input image/caption is tiled INFERENCE_BATCH_SIZE times so the
    model sees a full batch; only sample 0 of the output is post-processed
    (all batch entries are identical copies).

    Returns:
        (boxes_filt, pred_phrases, infer_time, avg_single_infer_time):
        kept boxes, their phrase labels, whole-batch latency in seconds,
        and infer_time / INFERENCE_BATCH_SIZE.
    """
    assert text_threshold is not None or token_spans is not None, "text_threshould and token_spans should not be None at the same time!"
    caption = caption.lower()
    caption = caption.strip()
    if not caption.endswith("."):
        caption = caption + "."
    device = "cuda" if not cpu_only else "cpu"
    model = model.to(device)
    # ========== Step 1: build the batch ==========
    # Replicate the image to shape (batch_size, 3, H, W).
    image_batch = image.unsqueeze(0).repeat(INFERENCE_BATCH_SIZE, 1, 1, 1).to(device)
    # Replicate the text prompt to match (batch_size,).
    caption_batch = [caption] * INFERENCE_BATCH_SIZE
    # Core inference timing.
    torch.cuda.synchronize() if device == "cuda" else None  # drain pending GPU work so the timer is accurate
    start_time = time.perf_counter()  # high-resolution timer
    with torch.no_grad():
        outputs = model(image_batch, captions=caption_batch)  # forward the whole batch
    torch.cuda.synchronize() if device == "cuda" else None  # wait for the GPU to finish inference
    infer_time = time.perf_counter() - start_time  # batch latency (seconds)
    # Average per-image latency.
    avg_single_infer_time = infer_time / INFERENCE_BATCH_SIZE
    # ========== Step 2: post-process the batch output ==========
    # Keep only the first sample (all entries are identical).
    logits = outputs["pred_logits"].sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"][0]  # (nq, 4)
    # filter output
    if token_spans is None:
        logits_filt = logits.cpu().clone()
        boxes_filt = boxes.cpu().clone()
        # keep queries whose best token score clears the box threshold
        filt_mask = logits_filt.max(dim=1)[0] > box_threshold
        logits_filt = logits_filt[filt_mask]  # num_filt, 256
        boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
        # get phrase
        tokenlizer = model.tokenizer
        tokenized = tokenlizer(caption)
        # build pred
        pred_phrases = []
        for logit, box in zip(logits_filt, boxes_filt):
            pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
            if with_logits:
                # append the best score, truncated to ~2 decimals
                pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
            else:
                pred_phrases.append(pred_phrase)
    else:
        # given-phrase mode: score user-provided character spans directly
        positive_maps = create_positive_map_from_span(
            model.tokenizer(caption),
            token_span=token_spans
        ).to(image.device)  # n_phrase, 256
        logits_for_phrases = positive_maps @ logits.T  # n_phrase, nq
        all_logits = []
        all_phrases = []
        all_boxes = []
        for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
            # get phrase
            phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
            # get mask
            filt_mask = logit_phr > box_threshold
            # filt box
            all_boxes.append(boxes[filt_mask])
            # filt logits
            all_logits.append(logit_phr[filt_mask])
            if with_logits:
                logit_phr_num = logit_phr[filt_mask]
                all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
            else:
                # NOTE(review): len(filt_mask) counts ALL queries, not only the
                # kept ones, so phrase count can exceed box count here —
                # looks like it should be filt_mask.sum(); confirm intent.
                all_phrases.extend([phrase for _ in range(len(filt_mask))])
        boxes_filt = torch.cat(all_boxes, dim=0).cpu()
        pred_phrases = all_phrases
    # Also return total batch time, for throughput computation downstream.
    return boxes_filt, pred_phrases, infer_time, avg_single_infer_time
def benchmark_performance(model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, warmup_runs=5, test_runs=10):
    """Performance benchmark: warmup then timed runs of batched inference.

    Computes average batch latency, average per-image latency, and total
    throughput over `test_runs` timed iterations.

    Args:
        warmup_runs: warmup iterations (excluded from timing).
        test_runs: timed iterations.

    Returns:
        (avg_batch_time, avg_single_time, batch_throughput, batch_times,
        single_times): averages in seconds, throughput in images/second,
        plus the raw per-run latency lists.
    """
    # 1. Warmup phase (timings discarded).
    print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
    for i in range(warmup_runs):
        # BUG FIX: pass cpu_only/token_spans by keyword. The original
        # positional call bound cpu_only -> with_logits and
        # token_spans -> cpu_only, silently dropping the spans and
        # breaking device selection.
        get_grounding_output(
            model, image, text_prompt, box_threshold, text_threshold,
            cpu_only=cpu_only, token_spans=token_spans,
        )
        print(f"预热完成 {i+1}/{warmup_runs}")
    # 2. Timed phase.
    print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
    total_batch_time = 0.0   # cumulative batch latency
    total_single_time = 0.0  # cumulative per-image latency
    batch_times = []         # per-run batch latency
    single_times = []        # per-run average per-image latency
    for i in range(test_runs):
        _, _, batch_infer_time, avg_single_infer_time = get_grounding_output(
            model, image, text_prompt, box_threshold, text_threshold,
            cpu_only=cpu_only, token_spans=token_spans,
        )
        batch_times.append(batch_infer_time)
        single_times.append(avg_single_infer_time)
        total_batch_time += batch_infer_time
        total_single_time += avg_single_infer_time
        print(f"测试 {i+1}/{test_runs} - Batch推理时延: {batch_infer_time*1000:.2f} ms | 单张平均时延: {avg_single_infer_time*1000:.2f} ms")
    # 3. Metrics.
    avg_batch_time = total_batch_time / test_runs
    avg_single_time = total_single_time / test_runs
    batch_throughput = (test_runs * INFERENCE_BATCH_SIZE) / total_batch_time  # images/sec
    batch_std_time = np.std(batch_times)
    single_std_time = np.std(single_times)
    # 4. Report.
    print("\n" + "="*60)
    print("📊 性能测试报告 (Batch Size = {})".format(INFERENCE_BATCH_SIZE))
    print("="*60)
    print(f"测试环境: {'GPU (CUDA)' if not cpu_only else 'CPU'}")
    print(f"测试次数: {test_runs} 次 (预热 {warmup_runs} 次)")
    print(f"Batch Size: {INFERENCE_BATCH_SIZE}")
    print(f"平均Batch推理时延: {avg_batch_time*1000:.2f} ms (±{batch_std_time*1000:.2f} ms)")
    print(f"平均单张推理时延: {avg_single_time*1000:.2f} ms (±{single_std_time*1000:.2f} ms)")
    print(f"最大Batch时延: {max(batch_times)*1000:.2f} ms | 最大单张时延: {max(single_times)*1000:.2f} ms")
    print(f"最小Batch时延: {min(batch_times)*1000:.2f} ms | 最小单张时延: {min(single_times)*1000:.2f} ms")
    print(f"总吞吐量: {batch_throughput:.2f} 张/秒")
    print("="*60 + "\n")
    return avg_batch_time, avg_single_time, batch_throughput, batch_times, single_times
if __name__ == "__main__":
    # Local import: only needed for safe parsing of --token_spans.
    import ast

    parser = argparse.ArgumentParser("Grounding DINO 性能测试 (Batch推理)", add_help=True)
    parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
    parser.add_argument("--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file")
    parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
    parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
    parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True, help="output directory")
    parser.add_argument("--box_threshold", type=float, default=0.35, help="box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
    parser.add_argument("--token_spans", type=str, default=None, help=
        "The positions of start and end positions of phrases of interest. \
        For example, a caption is 'a cat and a dog', \
        if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
        if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
        ")
    parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
    # benchmarking parameters
    parser.add_argument("--warmup-runs", type=int, default=5, help="预热运行次数,默认5")
    parser.add_argument("--test-runs", type=int, default=10, help="正式测试运行次数,默认10")
    args = parser.parse_args()

    # cfg
    config_file = args.config_file
    checkpoint_path = args.checkpoint_path
    image_path = args.image_path
    text_prompt = args.text_prompt
    output_dir = args.output_dir
    box_threshold = args.box_threshold
    text_threshold = args.text_threshold
    token_spans = args.token_spans
    warmup_runs = args.warmup_runs
    test_runs = args.test_runs

    # Basic run info.
    print(f"📌 使用设备: {'GPU' if not args.cpu_only else 'CPU'}")
    print(f"📌 推理Batch Size: {INFERENCE_BATCH_SIZE}")
    if not args.cpu_only and torch.cuda.is_available():
        print(f"📌 GPU型号: {torch.cuda.get_device_name(0)}")
        print(f"📌 GPU编号: {torch.cuda.current_device()}")

    # make dir
    os.makedirs(output_dir, exist_ok=True)
    # load image / model
    image_pil, image = load_image(image_path)
    model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
    # visualize raw image
    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
    # set the text_threshold to None if token_spans is set.
    if token_spans is not None:
        text_threshold = None
        print("Using token_spans. Set the text_threshold to None.")

    # SECURITY FIX: parse the user-supplied span literal with
    # ast.literal_eval instead of eval(); parse once and reuse.
    parsed_token_spans = ast.literal_eval(token_spans) if token_spans else None

    # Run the benchmark.
    avg_batch_time, avg_single_time, throughput, batch_times, single_times = benchmark_performance(
        model, image, text_prompt, box_threshold, text_threshold,
        args.cpu_only, parsed_token_spans,
        warmup_runs, test_runs
    )

    # One more inference to produce the visualization (original behavior).
    print("\n=== 生成推理结果图片 ===")
    # BUG FIX: cpu_only/token_spans passed by keyword — the original
    # positional call bound them to with_logits/cpu_only instead.
    boxes_filt, pred_phrases, batch_infer_time, single_infer_time = get_grounding_output(
        model, image, text_prompt, box_threshold, text_threshold,
        cpu_only=args.cpu_only, token_spans=parsed_token_spans,
    )
    # visualize pred
    size = image_pil.size
    pred_dict = {
        "boxes": boxes_filt,
        "size": [size[1], size[0]],  # H,W
        "labels": pred_phrases,
    }
    image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
    image_with_box.save(os.path.join(output_dir, "pred.jpg"))

    # Persist the performance report.
    performance_file = os.path.join(output_dir, "performance_report.txt")
    with open(performance_file, "w", encoding="utf-8") as f:
        f.write("="*60 + "\n")
        f.write(f"Grounding DINO 性能测试报告 (Batch Size = {INFERENCE_BATCH_SIZE})\n")
        f.write("="*60 + "\n")
        f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"测试设备: {'GPU' if not args.cpu_only else 'CPU'}\n")
        if not args.cpu_only and torch.cuda.is_available():
            f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
        f.write(f"Batch Size: {INFERENCE_BATCH_SIZE}\n")
        f.write(f"预热次数: {warmup_runs}\n")
        f.write(f"测试次数: {test_runs}\n")
        f.write(f"平均Batch推理时延: {avg_batch_time*1000:.2f} ms\n")
        f.write(f"Batch时延标准差: {np.std(batch_times)*1000:.2f} ms\n")
        f.write(f"平均单张推理时延: {avg_single_time*1000:.2f} ms\n")
        f.write(f"单张时延标准差: {np.std(single_times)*1000:.2f} ms\n")
        f.write(f"最大Batch时延: {max(batch_times)*1000:.2f} ms\n")
        f.write(f"最小Batch时延: {min(batch_times)*1000:.2f} ms\n")
        f.write(f"总吞吐量: {throughput:.2f} 张/秒\n")
        f.write(f"最后一次Batch推理时延: {batch_infer_time*1000:.2f} ms\n")
        f.write(f"最后一次单张推理时延: {single_infer_time*1000:.2f} ms\n")
    print(f"\n✅ 性能报告已保存至: {performance_file}")
    print(f"✅ 推理结果图片已保存至: {os.path.join(output_dir, 'pred.jpg')}")
import argparse
import os
import time
from typing import List, Optional, Tuple
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.util.utils import get_phrases_from_posmap
from groundingdino.util import get_tokenlizer
from groundingdino.util.slconfig import SLConfig
from groundingdino.models.GroundingDINO.bertwarper import (
generate_masks_with_special_tokens_and_transfer_map,
)
def plot_boxes_to_image(image_pil, tgt):
    """Render labeled boxes (normalized cxcywh in tgt) onto the image.

    Returns (annotated image, L-mode mask with box regions filled).
    """
    H, W = tgt["size"]
    boxes = tgt["boxes"]
    labels = tgt["labels"]
    assert len(boxes) == len(labels), "boxes and labels must have same length"
    draw = ImageDraw.Draw(image_pil)
    mask = Image.new("L", image_pil.size, 0)
    mask_draw = ImageDraw.Draw(mask)
    scale = torch.Tensor([W, H, W, H])
    for box, label in zip(boxes, labels):
        # denormalize, then cxcywh -> xyxy
        box = box * scale
        box[:2] -= box[2:] / 2
        box[2:] += box[:2]
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        x0, y0, x1, y1 = (int(c) for c in box)
        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
        font = ImageFont.load_default()
        if hasattr(font, "getbbox"):
            bbox = draw.textbbox((x0, y0), str(label), font)
        else:
            w, h = draw.textsize(str(label), font)
            bbox = (x0, y0, w + x0, y0 + h)
        draw.rectangle(bbox, fill=color)
        draw.text((x0, y0), str(label), fill="white")
        mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
    return image_pil, mask
def load_image(image_path):
    """Load an image; return (PIL RGB image, normalized 3xHxW tensor)."""
    image_pil = Image.open(image_path).convert("RGB")
    pipeline = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    tensor, _ = pipeline(image_pil, None)  # 3,h,w
    return image_pil, tensor
def preprocess_caption(caption: str) -> str:
    """Normalize a caption: lowercase, strip whitespace, ensure trailing '.'."""
    normalized = caption.strip().lower()
    return normalized if normalized.endswith(".") else normalized + "."
def sigmoid(x: np.ndarray) -> np.ndarray:
    """Elementwise logistic function 1 / (1 + e^-x)."""
    denom = 1.0 + np.exp(-x)
    return 1.0 / denom
def build_text_tensors(
    config_file: str,
    caption: str,
    device: str,
):
    """Tokenize the caption and build the text-side tensors the ONNX model needs.

    Follows GroundingDINO's text preprocessing: tokenize the normalized
    caption, derive the special-token attention masks / position ids, then
    truncate everything to the model's max_text_len.

    Returns:
        (cfg, tokenizer, tokenized_single, input_ids, token_type_ids,
         attention_mask, position_ids, text_self_attention_masks) — the int
        tensors cast to int64 as the ONNX graph expects.
    """
    cfg = SLConfig.fromfile(config_file)
    tokenizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
    # Tokens that delimit phrases for the self-attention masking step.
    special_token_ids = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])
    caption = preprocess_caption(caption)
    tokenized = tokenizer([caption], padding="longest", return_tensors="pt")
    tokenized = {k: v.to(device) for k, v in tokenized.items()}
    text_self_attention_masks, position_ids, _ = generate_masks_with_special_tokens_and_transfer_map(
        tokenized, special_token_ids, tokenizer
    )
    # Truncate to the model's maximum text length (defaults to 256).
    max_text_len = getattr(cfg, "max_text_len", 256)
    if text_self_attention_masks.shape[1] > max_text_len:
        s = max_text_len
        text_self_attention_masks = text_self_attention_masks[:, :s, :s]
        position_ids = position_ids[:, :s]
        tokenized["input_ids"] = tokenized["input_ids"][:, :s]
        tokenized["attention_mask"] = tokenized["attention_mask"][:, :s]
        tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :s]
    # Also return the tokenizer and a "single-sentence" tokenization, matching
    # what get_phrases_from_posmap expects downstream.
    tokenized_single = tokenizer(caption)
    return (
        cfg,
        tokenizer,
        tokenized_single,
        tokenized["input_ids"].to(torch.int64),
        tokenized["token_type_ids"].to(torch.int64),
        tokenized["attention_mask"].to(torch.int64),
        position_ids.to(torch.int64),
        text_self_attention_masks,
    )
def ort_create_session(onnx_path: str, device: str, num_threads: int = 0):
    """Create an onnxruntime InferenceSession.

    num_threads > 0 caps both intra- and inter-op thread pools; with
    device == "cuda" the CUDA provider is requested first (CPU fallback).
    """
    import onnxruntime as ort

    options = ort.SessionOptions()
    if num_threads and num_threads > 0:
        options.intra_op_num_threads = int(num_threads)
        options.inter_op_num_threads = int(num_threads)
    if device == "cuda":
        # Takes effect automatically when onnxruntime-gpu is installed.
        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
    else:
        providers = ["CPUExecutionProvider"]
    return ort.InferenceSession(onnx_path, sess_options=options, providers=providers)
def onnx_infer_once(
    sess,
    image: torch.Tensor,
    input_ids: torch.Tensor,
    token_type_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
    text_self_attention_masks: torch.Tensor,
    use_cuda_sync: bool,
) -> Tuple[np.ndarray, np.ndarray, float]:
    """Run one ONNX forward pass; return (pred_logits, pred_boxes, seconds)."""

    def _to_numpy(tensor, dtype=None):
        # ORT feeds must be numpy arrays.
        arr = tensor.detach().cpu().numpy()
        return arr if dtype is None else arr.astype(dtype)

    feeds = {
        "image": _to_numpy(image[None], np.float32),
        "input_ids": _to_numpy(input_ids, np.int64),
        "token_type_ids": _to_numpy(token_type_ids, np.int64),
        "attention_mask": _to_numpy(attention_mask, np.int64),
        "position_ids": _to_numpy(position_ids, np.int64),
        "text_self_attention_masks": _to_numpy(text_self_attention_masks),
    }
    if use_cuda_sync:
        torch.cuda.synchronize()
    start = time.perf_counter()
    pred_logits, pred_boxes = sess.run(["pred_logits", "pred_boxes"], feeds)
    if use_cuda_sync:
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start
    return pred_logits, pred_boxes, elapsed
def postprocess_and_phrases(
    pred_logits: np.ndarray,  # [B,NQ,S]
    pred_boxes: np.ndarray,   # [B,NQ,4]
    tokenized_single,
    tokenizer,
    box_threshold: float,
    text_threshold: float,
    with_logits: bool = True,
):
    """Filter query predictions by score and decode a phrase per kept query.

    Mirrors the torch pipeline: only batch element 0 is considered.
    Returns (kept boxes as a torch tensor, list of phrase labels).
    """
    probs = sigmoid(pred_logits[0])  # [NQ,S]
    boxes = pred_boxes[0]            # [NQ,4]
    keep = probs.max(axis=1) > box_threshold
    kept_logits = probs[keep]
    kept_boxes = boxes[keep]

    phrases: List[str] = []
    for row in kept_logits:
        posmap = torch.from_numpy(row) > text_threshold
        phrase = get_phrases_from_posmap(posmap, tokenized_single, tokenizer)
        phrase = phrase.replace(".", "")
        if with_logits:
            phrases.append(phrase + f"({str(float(row.max()))[:4]})")
        else:
            phrases.append(phrase)
    return torch.from_numpy(kept_boxes), phrases
def benchmark_performance_onnx(
    sess,
    image: torch.Tensor,
    input_ids: torch.Tensor,
    token_type_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: torch.Tensor,
    text_self_attention_masks: torch.Tensor,
    warmup_runs: int = 5,
    test_runs: int = 10,
    use_cuda_sync: bool = False,
):
    """Warm up then benchmark the ONNX session; print and return latency stats.

    Returns (avg_infer_time_seconds, fps, per_run_latencies).
    """
    run_args = (
        sess, image, input_ids, token_type_ids,
        attention_mask, position_ids, text_self_attention_masks,
    )
    # Warmup: prime caches/allocations, discard timings.
    print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
    for idx in range(warmup_runs):
        onnx_infer_once(*run_args, use_cuda_sync=use_cuda_sync)
        print(f"预热完成 {idx+1}/{warmup_runs}")

    # Timed runs.
    print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
    total_time = 0.0
    infer_times = []
    for idx in range(test_runs):
        _, _, elapsed = onnx_infer_once(*run_args, use_cuda_sync=use_cuda_sync)
        infer_times.append(elapsed)
        total_time += elapsed
        print(f"测试 {idx+1}/{test_runs} - 单次推理时延: {elapsed*1000:.2f} ms")

    avg_infer_time = total_time / test_runs
    fps = test_runs / total_time
    std_infer_time = float(np.std(infer_times))

    # Report.
    print("\n" + "=" * 50)
    print("📊 ONNX 性能测试报告")
    print("=" * 50)
    print(f"测试环境: {'GPU (CUDAExecutionProvider)' if use_cuda_sync else 'CPU/Unknown'}")
    print(f"测试次数: {test_runs} 次 (预热 {warmup_runs} 次)")
    print(f"平均推理时延: {avg_infer_time*1000:.2f} ms (±{std_infer_time*1000:.2f} ms)")
    print(f"最大推理时延: {max(infer_times)*1000:.2f} ms")
    print(f"最小推理时延: {min(infer_times)*1000:.2f} ms")
    print(f"平均FPS: {fps:.2f} 帧/秒")
    print("=" * 50 + "\n")
    return avg_infer_time, fps, infer_times
if __name__ == "__main__":
    parser = argparse.ArgumentParser("Grounding DINO ONNX 推理与性能测试", add_help=True)
    parser.add_argument("--onnx_path", type=str, required=True, help="onnx 模型路径")
    parser.add_argument("--config_file", "-c", type=str, required=True, help="用于加载 tokenizer 等配置")
    parser.add_argument("--image_path", "-i", type=str, required=True)
    parser.add_argument("--text_prompt", "-t", type=str, required=True)
    parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True)
    parser.add_argument("--box_threshold", type=float, default=0.3)
    parser.add_argument("--text_threshold", type=float, default=0.25)
    parser.add_argument("--cpu-only", action="store_true")
    parser.add_argument("--warmup-runs", type=int, default=5)
    parser.add_argument("--test-runs", type=int, default=10)
    parser.add_argument("--ort-threads", type=int, default=0, help="onnxruntime 线程数(0=默认)")
    args = parser.parse_args()
    # Prefer CUDA unless --cpu-only was given or no GPU is visible.
    device = "cpu" if args.cpu_only else ("cuda" if torch.cuda.is_available() else "cpu")
    use_cuda_sync = device == "cuda"
    print(f"📌 ORT 设备偏好: {device}")
    if use_cuda_sync:
        print(f"📌 GPU型号: {torch.cuda.get_device_name(0)}")
    os.makedirs(args.output_dir, exist_ok=True)
    image_pil, image = load_image(args.image_path)
    image_pil.save(os.path.join(args.output_dir, "raw_image.jpg"))
    # Text tensors are built on CPU: the ORT feeds are numpy arrays anyway.
    (
        _cfg,
        tokenizer,
        tokenized_single,
        input_ids,
        token_type_ids,
        attention_mask,
        position_ids,
        text_self_attention_masks,
    ) = build_text_tensors(args.config_file, args.text_prompt, device="cpu")
    # GPU timing is more accurate with explicit synchronization, but the feeds
    # ultimately go through numpy (CPU). This mirrors the torch version:
    # keep timing + visualization; the model forward itself runs through ORT.
    sess = ort_create_session(args.onnx_path, device=device, num_threads=args.ort_threads)
    avg_infer_time, fps, infer_times = benchmark_performance_onnx(
        sess,
        image,
        input_ids,
        token_type_ids,
        attention_mask,
        position_ids,
        text_self_attention_masks,
        warmup_runs=args.warmup_runs,
        test_runs=args.test_runs,
        use_cuda_sync=use_cuda_sync,
    )
    # One extra inference to produce the visualization output.
    print("\n=== 生成推理结果图片 ===")
    pred_logits, pred_boxes, single_infer_time = onnx_infer_once(
        sess,
        image,
        input_ids,
        token_type_ids,
        attention_mask,
        position_ids,
        text_self_attention_masks,
        use_cuda_sync=use_cuda_sync,
    )
    boxes_filt, pred_phrases = postprocess_and_phrases(
        pred_logits=pred_logits,
        pred_boxes=pred_boxes,
        tokenized_single=tokenized_single,
        tokenizer=tokenizer,
        box_threshold=args.box_threshold,
        text_threshold=args.text_threshold,
        with_logits=True,
    )
    size = image_pil.size
    pred_dict = {
        "boxes": boxes_filt,
        "size": [size[1], size[0]],  # H,W
        "labels": pred_phrases,
    }
    image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
    image_with_box.save(os.path.join(args.output_dir, "pred.jpg"))
    # Persist the performance report.
    performance_file = os.path.join(args.output_dir, "performance_report_onnx.txt")
    with open(performance_file, "w", encoding="utf-8") as f:
        f.write("=" * 50 + "\n")
        f.write("Grounding DINO ONNX 性能测试报告\n")
        f.write("=" * 50 + "\n")
        f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"推理后端: onnxruntime\n")
        f.write(f"设备偏好: {device}\n")
        if use_cuda_sync:
            f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
        f.write(f"预热次数: {args.warmup_runs}\n")
        f.write(f"测试次数: {args.test_runs}\n")
        f.write(f"平均推理时延: {avg_infer_time*1000:.2f} ms\n")
        f.write(f"时延标准差: {np.std(infer_times)*1000:.2f} ms\n")
        f.write(f"最大时延: {max(infer_times)*1000:.2f} ms\n")
        f.write(f"最小时延: {min(infer_times)*1000:.2f} ms\n")
        f.write(f"平均FPS: {fps:.2f} 帧/秒\n")
        f.write(f"单次推理时延(最后一次): {single_infer_time*1000:.2f} ms\n")
    print(f"\n✅ 性能报告已保存至: {performance_file}")
    print(f"✅ 推理结果图片已保存至: {os.path.join(args.output_dir, 'pred.jpg')}")
import argparse
import os
import sys
import time
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
def plot_boxes_to_image(image_pil, tgt):
    """Draw labeled boxes (normalized cxcywh in tgt) onto the image.

    Returns (annotated image, L-mode mask with box regions filled).
    """
    H, W = tgt["size"]
    boxes = tgt["boxes"]
    labels = tgt["labels"]
    assert len(boxes) == len(labels), "boxes and labels must have same length"

    draw = ImageDraw.Draw(image_pil)
    mask = Image.new("L", image_pil.size, 0)
    mask_draw = ImageDraw.Draw(mask)

    scale = torch.Tensor([W, H, W, H])
    for box, label in zip(boxes, labels):
        # denormalize, then convert cxcywh -> xyxy
        box = box * scale
        box[:2] -= box[2:] / 2
        box[2:] += box[:2]
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        x0, y0, x1, y1 = (int(v) for v in box)
        draw.rectangle([x0, y0, x1, y1], outline=color, width=6)

        font = ImageFont.load_default()
        if hasattr(font, "getbbox"):
            # modern Pillow path
            bbox = draw.textbbox((x0, y0), str(label), font)
        else:
            # legacy Pillow fallback
            w, h = draw.textsize(str(label), font)
            bbox = (x0, y0, w + x0, y0 + h)
        draw.rectangle(bbox, fill=color)
        draw.text((x0, y0), str(label), fill="white")

        mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
    return image_pil, mask
def load_image(image_path):
    """Load an image; return (PIL RGB image, normalized 3xHxW tensor)."""
    image_pil = Image.open(image_path).convert("RGB")
    preprocess = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    tensor, _ = preprocess(image_pil, None)  # 3, h, w
    return image_pil, tensor
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    """Build a GroundingDINO model from config and load checkpoint weights."""
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = "cpu" if cpu_only else "cuda"
    model = build_model(cfg)
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
    print(load_res)  # surface missing/unexpected keys
    model.eval()
    return model
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
    """Run grounded detection on a single image and time the forward pass.

    Args:
        model: GroundingDINO model (moved to the chosen device here).
        image: 3xHxW normalized tensor.
        caption: text prompt; lowercased, stripped, forced to end with '.'.
        box_threshold: minimum per-query max score for keeping a box.
        text_threshold: per-token threshold for phrase decoding (ignored in
            token_spans mode).
        with_logits: append the confidence score to each phrase label.
        cpu_only: force CPU inference.
        token_spans: optional list of character-span groups into the caption;
            when given, phrases are scored by span instead of decoded.

    Returns:
        (boxes_filt, pred_phrases, infer_time): kept boxes (normalized
        cxcywh), their phrase labels, and forward latency in seconds.
    """
    assert text_threshold is not None or token_spans is not None, "text_threshould and token_spans should not be None at the same time!"
    caption = caption.lower().strip()
    if not caption.endswith("."):
        caption = caption + "."
    device = "cuda" if not cpu_only else "cpu"
    model = model.to(device)
    image = image.to(device)
    # Time only the forward pass; synchronize so GPU timings are accurate.
    if device == "cuda":
        torch.cuda.synchronize()
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
    if device == "cuda":
        torch.cuda.synchronize()
    infer_time = time.perf_counter() - start_time
    logits = outputs["pred_logits"].sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"][0]  # (nq, 4)
    # filter output
    if token_spans is None:
        logits_filt = logits.cpu().clone()
        boxes_filt = boxes.cpu().clone()
        filt_mask = logits_filt.max(dim=1)[0] > box_threshold
        logits_filt = logits_filt[filt_mask]  # num_filt, 256
        boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
        # Decode a phrase for each kept query from its token posmap.
        tokenlizer = model.tokenizer
        tokenized = tokenlizer(caption)
        pred_phrases = []
        for logit in logits_filt:
            pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
            if with_logits:
                pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
            else:
                pred_phrases.append(pred_phrase)
    else:
        # given-phrase mode
        # BUG FIX: tokenize the processed local `caption`, not the module
        # global `text_prompt` -- the original broke when called from outside
        # the __main__ script, and the spans index into the processed caption
        # (consistent with the batched variant of this function).
        positive_maps = create_positive_map_from_span(
            model.tokenizer(caption),
            token_span=token_spans
        ).to(image.device)  # n_phrase, 256
        logits_for_phrases = positive_maps @ logits.T  # n_phrase, nq
        all_phrases = []
        all_boxes = []
        for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
            # recover the phrase text from its character spans
            phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
            filt_mask = logit_phr > box_threshold
            all_boxes.append(boxes[filt_mask])
            if with_logits:
                logit_phr_num = logit_phr[filt_mask]
                all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
            else:
                # BUG FIX: emit one label per KEPT box (filt_mask.sum()), not
                # one per query (len(filt_mask)); the mismatch tripped the
                # boxes/labels assertion in plot_boxes_to_image.
                all_phrases.extend([phrase for _ in range(int(filt_mask.sum()))])
        boxes_filt = torch.cat(all_boxes, dim=0).cpu()
        pred_phrases = all_phrases
    return boxes_filt, pred_phrases, infer_time
def benchmark_performance(model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, warmup_runs=5, test_runs=10):
    """Performance benchmark: warmup + timed single-image inference runs.

    Args:
        warmup_runs: warmup iterations (excluded from statistics).
        test_runs: timed iterations.

    Returns:
        (avg_infer_time, fps, infer_times): average latency in seconds,
        average frames/second, and the raw per-run latency list.
    """
    # 1. Warmup phase (timings discarded).
    print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
    for i in range(warmup_runs):
        # BUG FIX: pass cpu_only/token_spans by keyword. The original
        # positional call bound cpu_only -> with_logits and
        # token_spans -> cpu_only, silently dropping the spans and
        # breaking device selection.
        get_grounding_output(
            model, image, text_prompt, box_threshold, text_threshold,
            cpu_only=cpu_only, token_spans=token_spans,
        )
        print(f"预热完成 {i+1}/{warmup_runs}")
    # 2. Timed phase.
    print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
    total_time = 0.0
    infer_times = []  # per-run latency
    for i in range(test_runs):
        _, _, infer_time = get_grounding_output(
            model, image, text_prompt, box_threshold, text_threshold,
            cpu_only=cpu_only, token_spans=token_spans,
        )
        infer_times.append(infer_time)
        total_time += infer_time
        print(f"测试 {i+1}/{test_runs} - 单次推理时延: {infer_time*1000:.2f} ms")
    # 3. Metrics.
    avg_infer_time = total_time / test_runs
    fps = test_runs / total_time
    std_infer_time = np.std(infer_times)  # latency jitter
    # 4. Report.
    print("\n" + "="*50)
    print("📊 性能测试报告")
    print("="*50)
    print(f"测试环境: {'GPU (CUDA)' if not cpu_only else 'CPU'}")
    print(f"测试次数: {test_runs} 次 (预热 {warmup_runs} 次)")
    print(f"平均推理时延: {avg_infer_time*1000:.2f} ms (±{std_infer_time*1000:.2f} ms)")
    print(f"最大推理时延: {max(infer_times)*1000:.2f} ms")
    print(f"最小推理时延: {min(infer_times)*1000:.2f} ms")
    print(f"平均FPS: {fps:.2f} 帧/秒")
    print("="*50 + "\n")
    return avg_infer_time, fps, infer_times
if __name__ == "__main__":
    # Local import: only needed for safe parsing of --token_spans.
    import ast

    parser = argparse.ArgumentParser("Grounding DINO 性能测试", add_help=True)
    parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
    parser.add_argument("--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file")
    parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
    parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
    parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True, help="output directory")
    parser.add_argument("--box_threshold", type=float, default=0.35, help="box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
    parser.add_argument("--token_spans", type=str, default=None, help=
        "The positions of start and end positions of phrases of interest. \
        For example, a caption is 'a cat and a dog', \
        if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
        if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
        ")
    parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
    # benchmarking parameters
    parser.add_argument("--warmup-runs", type=int, default=5, help="预热运行次数,默认5")
    parser.add_argument("--test-runs", type=int, default=10, help="正式测试运行次数,默认10")
    args = parser.parse_args()

    # cfg
    config_file = args.config_file
    checkpoint_path = args.checkpoint_path
    image_path = args.image_path
    text_prompt = args.text_prompt
    output_dir = args.output_dir
    box_threshold = args.box_threshold
    text_threshold = args.text_threshold
    token_spans = args.token_spans
    warmup_runs = args.warmup_runs
    test_runs = args.test_runs

    # Basic run info.
    print(f"📌 使用设备: {'GPU' if not args.cpu_only else 'CPU'}")
    if not args.cpu_only and torch.cuda.is_available():
        print(f"📌 GPU型号: {torch.cuda.get_device_name(0)}")
        print(f"📌 GPU编号: {torch.cuda.current_device()}")

    # make dir
    os.makedirs(output_dir, exist_ok=True)
    # load image / model
    image_pil, image = load_image(image_path)
    model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
    # visualize raw image
    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
    # set the text_threshold to None if token_spans is set.
    if token_spans is not None:
        text_threshold = None
        print("Using token_spans. Set the text_threshold to None.")

    # SECURITY FIX: parse the user-supplied span literal with
    # ast.literal_eval instead of eval(); parse once and reuse.
    parsed_token_spans = ast.literal_eval(token_spans) if token_spans else None

    # Run the benchmark.
    avg_infer_time, fps, infer_times = benchmark_performance(
        model, image, text_prompt, box_threshold, text_threshold,
        args.cpu_only, parsed_token_spans,
        warmup_runs, test_runs
    )

    # One more inference to produce the visualization (original behavior).
    print("\n=== 生成推理结果图片 ===")
    # BUG FIX: cpu_only/token_spans passed by keyword — the original
    # positional call bound them to with_logits/cpu_only instead.
    boxes_filt, pred_phrases, single_infer_time = get_grounding_output(
        model, image, text_prompt, box_threshold, text_threshold,
        cpu_only=args.cpu_only, token_spans=parsed_token_spans,
    )
    # visualize pred
    size = image_pil.size
    pred_dict = {
        "boxes": boxes_filt,
        "size": [size[1], size[0]],  # H,W
        "labels": pred_phrases,
    }
    image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
    image_with_box.save(os.path.join(output_dir, "pred.jpg"))

    # Persist the performance report.
    performance_file = os.path.join(output_dir, "performance_report.txt")
    with open(performance_file, "w", encoding="utf-8") as f:
        f.write("="*50 + "\n")
        f.write("Grounding DINO 性能测试报告\n")
        f.write("="*50 + "\n")
        f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"测试设备: {'GPU' if not args.cpu_only else 'CPU'}\n")
        if not args.cpu_only and torch.cuda.is_available():
            f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
        f.write(f"预热次数: {warmup_runs}\n")
        f.write(f"测试次数: {test_runs}\n")
        f.write(f"平均推理时延: {avg_infer_time*1000:.2f} ms\n")
        f.write(f"时延标准差: {np.std(infer_times)*1000:.2f} ms\n")
        f.write(f"最大时延: {max(infer_times)*1000:.2f} ms\n")
        f.write(f"最小时延: {min(infer_times)*1000:.2f} ms\n")
        f.write(f"平均FPS: {fps:.2f} 帧/秒\n")
        f.write(f"单次推理时延(最后一次): {single_infer_time*1000:.2f} ms\n")
    print(f"\n✅ 性能报告已保存至: {performance_file}")
    print(f"✅ 推理结果图片已保存至: {os.path.join(output_dir, 'pred.jpg')}")
import argparse
import os
import sys
import time
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
def plot_boxes_to_image(image_pil, tgt):
    """Draw predicted boxes and labels onto *image_pil*.

    Args:
        image_pil: PIL image to annotate (modified in place).
        tgt: dict with "size" [H, W], "boxes" (normalized cxcywh tensors)
            and "labels" (one string per box).

    Returns:
        (annotated PIL image, "L"-mode mask with box regions filled white)
    """
    H, W = tgt["size"]
    boxes, labels = tgt["boxes"], tgt["labels"]
    assert len(boxes) == len(labels), "boxes and labels must have same length"

    canvas = ImageDraw.Draw(image_pil)
    mask = Image.new("L", image_pil.size, 0)
    mask_canvas = ImageDraw.Draw(mask)

    for box, label in zip(boxes, labels):
        # scale normalized cxcywh to absolute pixels, then convert to xyxy
        scaled = box * torch.Tensor([W, H, W, H])
        scaled[:2] -= scaled[2:] / 2
        scaled[2:] += scaled[:2]
        x0, y0, x1, y1 = (int(v) for v in scaled)

        # one random color per box
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        canvas.rectangle([x0, y0, x1, y1], outline=color, width=6)

        # label background: textbbox on newer Pillow, textsize fallback on older
        font = ImageFont.load_default()
        if hasattr(font, "getbbox"):
            bbox = canvas.textbbox((x0, y0), str(label), font)
        else:
            w, h = canvas.textsize(str(label), font)
            bbox = (x0, y0, w + x0, y0 + h)
        canvas.rectangle(bbox, fill=color)
        canvas.text((x0, y0), str(label), fill="white")

        mask_canvas.rectangle([x0, y0, x1, y1], fill=255, width=6)

    return image_pil, mask
def load_image(image_path):
    """Load an image from disk and preprocess it for the model.

    Returns the original RGB PIL image together with a normalized
    (3, H, W) tensor resized so the short side is 800 px (long side
    capped at 1333).
    """
    pil_image = Image.open(image_path).convert("RGB")
    preprocess = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    tensor_image, _ = preprocess(pil_image, None)  # 3, h, w
    return pil_image, tensor_image
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    """Build a Grounding DINO model from a config file and load its weights.

    The checkpoint is always deserialized onto the CPU; the caller is
    responsible for moving the model to the target device afterwards.
    """
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = "cpu" if cpu_only else "cuda"
    model = build_model(cfg)
    state = torch.load(model_checkpoint_path, map_location="cpu")
    # strict=False: checkpoints may carry extra (e.g. text-encoder) keys
    report = model.load_state_dict(clean_state_dict(state["model"]), strict=False)
    print(report)
    model.eval()
    return model
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None, use_fp16=False):
    """Run Grounding DINO on one image and return filtered boxes and phrases.

    Args:
        model: Grounding DINO model (moved to the target device here).
        image: preprocessed (3, H, W) tensor.
        caption: text prompt; lowercased/stripped and "."-terminated internally.
        box_threshold: confidence threshold on the per-query max logit.
        text_threshold: per-token threshold (unused when token_spans is given).
        with_logits: append "(score)" to each predicted phrase.
        cpu_only: run on CPU instead of CUDA.
        token_spans: optional [[(start, end), ...], ...] character spans of
            phrases of interest within the caption.
        use_fp16: run the forward pass under CUDA autocast (float16).

    Returns:
        (boxes_filt, pred_phrases, infer_time): boxes are cxcywh in [0, 1] on
        CPU; infer_time is the forward-pass wall time in seconds.
    """
    assert text_threshold is not None or token_spans is not None, "text_threshould and token_spans should not be None at the same time!"
    caption = caption.lower().strip()
    if not caption.endswith("."):
        caption = caption + "."
    device = "cuda" if not cpu_only else "cpu"
    model = model.to(device)
    image = image.to(device)
    # Synchronize before/after the forward pass so the timing reflects the
    # actual GPU work rather than asynchronous kernel launches.
    if device == "cuda":
        torch.cuda.synchronize()
    start_time = time.perf_counter()  # high-resolution timer
    with torch.no_grad():
        if use_fp16 and device == "cuda":
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                outputs = model(image[None], captions=[caption])
        else:
            outputs = model(image[None], captions=[caption])
    if device == "cuda":
        torch.cuda.synchronize()
    infer_time = time.perf_counter() - start_time  # seconds
    # Cast back to FP32 so downstream CPU ops are safe under autocast.
    logits = outputs["pred_logits"].float().sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"].float()[0]  # (nq, 4)
    # filter output
    if token_spans is None:
        logits_filt = logits.cpu().clone()
        boxes_filt = boxes.cpu().clone()
        filt_mask = logits_filt.max(dim=1)[0] > box_threshold
        logits_filt = logits_filt[filt_mask]  # num_filt, 256
        boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
        # get phrase
        tokenlizer = model.tokenizer
        tokenized = tokenlizer(caption)
        # build pred
        pred_phrases = []
        for logit, box in zip(logits_filt, boxes_filt):
            pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
            if with_logits:
                pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
            else:
                pred_phrases.append(pred_phrase)
    else:
        # given-phrase mode
        # BUG FIX: the original tokenized the global `text_prompt`, which is
        # undefined inside this function (NameError if imported); tokenize the
        # local caption the spans refer to instead.
        positive_maps = create_positive_map_from_span(
            model.tokenizer(caption),
            token_span=token_spans
        ).to(image.device)  # n_phrase, 256
        logits_for_phrases = positive_maps @ logits.T  # n_phrase, nq
        all_logits = []
        all_phrases = []
        all_boxes = []
        for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
            # get phrase
            phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
            # get mask
            filt_mask = logit_phr > box_threshold
            # filt box
            all_boxes.append(boxes[filt_mask])
            # filt logits
            all_logits.append(logit_phr[filt_mask])
            if with_logits:
                logit_phr_num = logit_phr[filt_mask]
                all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
            else:
                # BUG FIX: emit one phrase per *kept* box, not one per query
                # (len(filt_mask) is the total query count, not the kept count).
                all_phrases.extend([phrase] * int(filt_mask.sum()))
        boxes_filt = torch.cat(all_boxes, dim=0).cpu()
        pred_phrases = all_phrases
    return boxes_filt, pred_phrases, infer_time
def benchmark_performance(model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, warmup_runs=5, test_runs=10, use_fp16=False):
    """Benchmark inference: warmup runs, then averaged FPS/latency.

    Args:
        warmup_runs: number of warmup iterations (excluded from timing, to
            absorb one-off costs such as CUDA context / kernel compilation).
        test_runs: number of timed iterations.

    Returns:
        (avg_infer_time_seconds, fps, per_run_latency_list)
    """
    # 1. Warmup phase (timings discarded)
    print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
    for i in range(warmup_runs):
        # BUG FIX: pass cpu_only/token_spans by keyword. The original passed
        # them positionally, binding cpu_only to `with_logits` and the spans
        # to `cpu_only` in get_grounding_output's signature.
        _, _, _ = get_grounding_output(
            model, image, text_prompt, box_threshold, text_threshold,
            cpu_only=cpu_only, token_spans=token_spans, use_fp16=use_fp16
        )
        print(f"预热完成 {i+1}/{warmup_runs}")
    # 2. Timed phase
    print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
    total_time = 0.0
    infer_times = []  # per-run latency in seconds
    for i in range(test_runs):
        _, _, infer_time = get_grounding_output(
            model, image, text_prompt, box_threshold, text_threshold,
            cpu_only=cpu_only, token_spans=token_spans, use_fp16=use_fp16
        )
        infer_times.append(infer_time)
        total_time += infer_time
        print(f"测试 {i+1}/{test_runs} - 单次推理时延: {infer_time*1000:.2f} ms")
    # 3. Aggregate metrics
    avg_infer_time = total_time / test_runs  # mean latency (seconds)
    fps = test_runs / total_time  # mean throughput
    std_infer_time = np.std(infer_times)  # latency jitter
    # 4. Report
    print("\n" + "="*50)
    print("📊 性能测试报告")
    print("="*50)
    print(f"测试环境: {'GPU (CUDA)' if not cpu_only else 'CPU'}")
    print(f"测试次数: {test_runs} 次 (预热 {warmup_runs} 次)")
    print(f"平均推理时延: {avg_infer_time*1000:.2f} ms (±{std_infer_time*1000:.2f} ms)")
    print(f"最大推理时延: {max(infer_times)*1000:.2f} ms")
    print(f"最小推理时延: {min(infer_times)*1000:.2f} ms")
    print(f"平均FPS: {fps:.2f} 帧/秒")
    print("="*50 + "\n")
    return avg_infer_time, fps, infer_times
if __name__ == "__main__":
    import ast  # stdlib; used to parse --token_spans safely instead of eval()

    parser = argparse.ArgumentParser("Grounding DINO 性能测试", add_help=True)
    parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
    parser.add_argument("--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file")
    parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
    parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
    parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True, help="output directory")
    parser.add_argument("--box_threshold", type=float, default=0.35, help="box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
    parser.add_argument("--token_spans", type=str, default=None, help=
        "The positions of start and end positions of phrases of interest. \
        For example, a caption is 'a cat and a dog', \
        if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
        if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
        ")
    parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
    # benchmark-specific options
    parser.add_argument("--warmup-runs", type=int, default=5, help="预热运行次数,默认5")
    parser.add_argument("--test-runs", type=int, default=10, help="正式测试运行次数,默认10")
    parser.add_argument("--fp16", action="store_true", help="Enable FP16 inference")
    args = parser.parse_args()
    # cfg
    config_file = args.config_file
    checkpoint_path = args.checkpoint_path
    image_path = args.image_path
    text_prompt = args.text_prompt
    output_dir = args.output_dir
    box_threshold = args.box_threshold
    text_threshold = args.text_threshold
    token_spans = args.token_spans
    warmup_runs = args.warmup_runs
    test_runs = args.test_runs
    # environment info
    print(f"📌 使用设备: {'GPU' if not args.cpu_only else 'CPU'}")
    if not args.cpu_only and torch.cuda.is_available():
        print(f"📌 GPU型号: {torch.cuda.get_device_name(0)}")
        print(f"📌 GPU编号: {torch.cuda.current_device()}")
    # make dir
    os.makedirs(output_dir, exist_ok=True)
    # load image
    image_pil, image = load_image(image_path)
    # load model
    model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
    # visualize raw image
    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
    # set the text_threshold to None if token_spans is set.
    if token_spans is not None:
        text_threshold = None
        print("Using token_spans. Set the text_threshold to None.")
    # SECURITY FIX: parse the span literal once with ast.literal_eval instead
    # of eval() on raw CLI input (eval executes arbitrary code).
    parsed_token_spans = ast.literal_eval(token_spans) if token_spans else None
    # run the performance benchmark
    avg_infer_time, fps, infer_times = benchmark_performance(
        model, image, text_prompt, box_threshold, text_threshold,
        args.cpu_only, parsed_token_spans,
        warmup_runs, test_runs, use_fp16=args.fp16
    )
    # single inference to produce the annotated image
    print("\n=== 生成推理结果图片 ===")
    # BUG FIX: pass cpu_only/token_spans by keyword; positionally they bound
    # to `with_logits` and `cpu_only` in get_grounding_output's signature.
    boxes_filt, pred_phrases, single_infer_time = get_grounding_output(
        model, image, text_prompt, box_threshold, text_threshold,
        cpu_only=args.cpu_only, token_spans=parsed_token_spans, use_fp16=args.fp16
    )
    # visualize pred
    size = image_pil.size
    pred_dict = {
        "boxes": boxes_filt,
        "size": [size[1], size[0]],  # H,W
        "labels": pred_phrases,
    }
    image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
    image_with_box.save(os.path.join(output_dir, "pred.jpg"))
    # write the performance report
    performance_file = os.path.join(output_dir, "performance_report.txt")
    with open(performance_file, "w", encoding="utf-8") as f:
        f.write("="*50 + "\n")
        f.write("Grounding DINO 性能测试报告\n")
        f.write("="*50 + "\n")
        f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"测试设备: {'GPU' if not args.cpu_only else 'CPU'}\n")
        if not args.cpu_only and torch.cuda.is_available():
            f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
        f.write(f"预热次数: {warmup_runs}\n")
        f.write(f"测试次数: {test_runs}\n")
        f.write(f"平均推理时延: {avg_infer_time*1000:.2f} ms\n")
        f.write(f"时延标准差: {np.std(infer_times)*1000:.2f} ms\n")
        f.write(f"最大时延: {max(infer_times)*1000:.2f} ms\n")
        f.write(f"最小时延: {min(infer_times)*1000:.2f} ms\n")
        f.write(f"平均FPS: {fps:.2f} 帧/秒\n")
        f.write(f"单次推理时延(最后一次): {single_infer_time*1000:.2f} ms\n")
    print(f"\n✅ 性能报告已保存至: {performance_file}")
    print(f"✅ 推理结果图片已保存至: {os.path.join(output_dir, 'pred.jpg')}")
\ No newline at end of file
import argparse
import os
import sys
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
def plot_boxes_to_image(image_pil, tgt):
    """Annotate *image_pil* with the boxes/labels in *tgt* and build a mask.

    Args:
        image_pil: PIL image, drawn on in place.
        tgt: dict — "size" is [H, W], "boxes" holds normalized cxcywh
            tensors, "labels" holds one string per box.

    Returns:
        (annotated image, "L"-mode mask with each box region filled white)
    """
    H, W = tgt["size"]
    boxes, labels = tgt["boxes"], tgt["labels"]
    assert len(boxes) == len(labels), "boxes and labels must have same length"

    drawer = ImageDraw.Draw(image_pil)
    mask = Image.new("L", image_pil.size, 0)
    mask_drawer = ImageDraw.Draw(mask)

    for box, label in zip(boxes, labels):
        # normalized cxcywh -> absolute xyxy
        abs_box = box * torch.Tensor([W, H, W, H])
        abs_box[:2] -= abs_box[2:] / 2
        abs_box[2:] += abs_box[:2]
        x0, y0, x1, y1 = (int(coord) for coord in abs_box)

        # a fresh random color for every box
        color = tuple(np.random.randint(0, 255, size=3).tolist())
        drawer.rectangle([x0, y0, x1, y1], outline=color, width=6)

        # compute the label background; Pillow >= 8 exposes getbbox/textbbox
        font = ImageFont.load_default()
        if hasattr(font, "getbbox"):
            bbox = drawer.textbbox((x0, y0), str(label), font)
        else:
            w, h = drawer.textsize(str(label), font)
            bbox = (x0, y0, w + x0, y0 + h)
        drawer.rectangle(bbox, fill=color)
        drawer.text((x0, y0), str(label), fill="white")

        mask_drawer.rectangle([x0, y0, x1, y1], fill=255, width=6)

    return image_pil, mask
def load_image(image_path):
    """Read an image and return (PIL image, normalized (3, H, W) tensor).

    The tensor is resized so the short side is 800 px (long side <= 1333)
    and normalized with ImageNet statistics.
    """
    pil_img = Image.open(image_path).convert("RGB")
    to_tensor = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    img_tensor, _ = to_tensor(pil_img, None)  # 3, h, w
    return pil_img, img_tensor
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    """Instantiate Grounding DINO from config and checkpoint, in eval mode."""
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = "cpu" if cpu_only else "cuda"
    net = build_model(cfg)
    # always deserialize onto CPU; device placement happens at call sites
    ckpt = torch.load(model_checkpoint_path, map_location="cpu")
    result = net.load_state_dict(clean_state_dict(ckpt["model"]), strict=False)
    print(result)
    net.eval()
    return net
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
    """Run Grounding DINO on one image and return filtered boxes and phrases.

    Args:
        model: Grounding DINO model (moved to the target device here).
        image: preprocessed (3, H, W) tensor.
        caption: text prompt; lowercased/stripped and "."-terminated internally.
        box_threshold: confidence threshold on the per-query max logit.
        text_threshold: per-token threshold (unused when token_spans is given).
        with_logits: append "(score)" to each predicted phrase.
        cpu_only: run on CPU instead of CUDA.
        token_spans: optional [[(start, end), ...], ...] character spans of
            phrases of interest within the caption.

    Returns:
        (boxes_filt, pred_phrases): boxes are cxcywh in [0, 1] on CPU.
    """
    assert text_threshold is not None or token_spans is not None, "text_threshould and token_spans should not be None at the same time!"
    caption = caption.lower().strip()
    if not caption.endswith("."):
        caption = caption + "."
    device = "cuda" if not cpu_only else "cpu"
    model = model.to(device)
    image = image.to(device)
    with torch.no_grad():
        outputs = model(image[None], captions=[caption])
    logits = outputs["pred_logits"].sigmoid()[0]  # (nq, 256)
    boxes = outputs["pred_boxes"][0]  # (nq, 4)
    # filter output
    if token_spans is None:
        logits_filt = logits.cpu().clone()
        boxes_filt = boxes.cpu().clone()
        filt_mask = logits_filt.max(dim=1)[0] > box_threshold
        logits_filt = logits_filt[filt_mask]  # num_filt, 256
        boxes_filt = boxes_filt[filt_mask]  # num_filt, 4
        # get phrase
        tokenlizer = model.tokenizer
        tokenized = tokenlizer(caption)
        # build pred
        pred_phrases = []
        for logit, box in zip(logits_filt, boxes_filt):
            pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
            if with_logits:
                pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
            else:
                pred_phrases.append(pred_phrase)
    else:
        # given-phrase mode
        # BUG FIX: the original tokenized the global `text_prompt`, which is
        # undefined in this function's scope (NameError if imported); tokenize
        # the local caption the spans refer to instead.
        positive_maps = create_positive_map_from_span(
            model.tokenizer(caption),
            token_span=token_spans
        ).to(image.device)  # n_phrase, 256
        logits_for_phrases = positive_maps @ logits.T  # n_phrase, nq
        all_logits = []
        all_phrases = []
        all_boxes = []
        for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
            # get phrase
            phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
            # get mask
            filt_mask = logit_phr > box_threshold
            # filt box
            all_boxes.append(boxes[filt_mask])
            # filt logits
            all_logits.append(logit_phr[filt_mask])
            if with_logits:
                logit_phr_num = logit_phr[filt_mask]
                all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
            else:
                # BUG FIX: emit one phrase per *kept* box, not one per query
                # (len(filt_mask) equals the query count, not the kept count).
                all_phrases.extend([phrase] * int(filt_mask.sum()))
        boxes_filt = torch.cat(all_boxes, dim=0).cpu()
        pred_phrases = all_phrases
    return boxes_filt, pred_phrases
if __name__ == "__main__":
    import ast  # stdlib; safe parsing of the --token_spans literal

    parser = argparse.ArgumentParser("Grounding DINO example", add_help=True)
    parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
    parser.add_argument(
        "--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file"
    )
    parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
    parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
    parser.add_argument(
        "--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
    )
    parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
    parser.add_argument("--token_spans", type=str, default=None, help=
        "The positions of start and end positions of phrases of interest. \
        For example, a caption is 'a cat and a dog', \
        if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
        if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
        ")
    parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
    args = parser.parse_args()
    # cfg
    config_file = args.config_file  # change the path of the model config file
    checkpoint_path = args.checkpoint_path  # change the path of the model
    image_path = args.image_path
    text_prompt = args.text_prompt
    output_dir = args.output_dir
    box_threshold = args.box_threshold
    text_threshold = args.text_threshold
    token_spans = args.token_spans
    # make dir
    os.makedirs(output_dir, exist_ok=True)
    # load image
    image_pil, image = load_image(image_path)
    # load model
    model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
    # visualize raw image
    image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
    # set the text_threshold to None if token_spans is set.
    if token_spans is not None:
        text_threshold = None
        print("Using token_spans. Set the text_threshold to None.")
    # run model
    # SECURITY FIX: parse the span literal with ast.literal_eval instead of
    # eval() on raw CLI input (eval executes arbitrary code).
    boxes_filt, pred_phrases = get_grounding_output(
        model, image, text_prompt, box_threshold, text_threshold,
        cpu_only=args.cpu_only,
        token_spans=ast.literal_eval(token_spans) if token_spans else None
    )
    # visualize pred
    size = image_pil.size
    pred_dict = {
        "boxes": boxes_filt,
        "size": [size[1], size[0]],  # H,W
        "labels": pred_phrases,
    }
    image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
    image_with_box.save(os.path.join(output_dir, "pred.jpg"))
import argparse
import os
import sys
import time
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, DistributedSampler
from groundingdino.models import build_model
import groundingdino.datasets.transforms as T
from groundingdino.util import box_ops, get_tokenlizer
from groundingdino.util.misc import clean_state_dict, collate_fn
from groundingdino.util.slconfig import SLConfig
# from torchvision.datasets import CocoDetection
import torchvision
from groundingdino.util.vl_utils import build_captions_and_token_span, create_positive_map_from_span
from groundingdino.datasets.cocogrounding_eval import CocoGroundingEvaluator
def load_model(model_config_path: str, model_checkpoint_path: str, device: str = "cuda"):
    """Build Grounding DINO from config, load its checkpoint, set eval mode."""
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = device
    net = build_model(cfg)
    # deserialize onto CPU; the caller moves the model to `device`
    ckpt = torch.load(model_checkpoint_path, map_location="cpu")
    net.load_state_dict(clean_state_dict(ckpt["model"]), strict=False)
    net.eval()
    return net
class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO dataset wrapper that converts raw annotations into the
    {image_id, boxes (xyxy), orig_size} dict the evaluator expects."""
    def __init__(self, img_folder, ann_file, transforms):
        super().__init__(img_folder, ann_file)
        self._transforms = transforms
    def __getitem__(self, idx):
        """Return (image, target) with boxes as clamped xyxy float tensors."""
        img, target = super().__getitem__(idx) # target: list
        # import ipdb; ipdb.set_trace()
        w, h = img.size
        boxes = [obj["bbox"] for obj in target]
        # reshape(-1, 4) keeps the empty-annotation case as a (0, 4) tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        boxes[:, 2:] += boxes[:, :2]  # xywh -> xyxy
        # clamp coordinates to the image bounds (in place)
        boxes[:, 0::2].clamp_(min=0, max=w)
        boxes[:, 1::2].clamp_(min=0, max=h)
        # filt invalid boxes/masks/keypoints
        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
        boxes = boxes[keep]
        target_new = {}
        image_id = self.ids[idx]
        target_new["image_id"] = image_id
        target_new["boxes"] = boxes
        # orig_size is (H, W) — used later to rescale predicted boxes
        target_new["orig_size"] = torch.as_tensor([int(h), int(w)])
        # NOTE(review): when _transforms is None the raw annotation list is
        # returned instead of target_new — confirm callers always pass one.
        if self._transforms is not None:
            img, target = self._transforms(img, target_new)
        return img, target
class PostProcessCocoGrounding(nn.Module):
    """ This module converts the model's output into the format expected by the coco api"""
    def __init__(self, num_select=300, coco_api=None, tokenlizer=None) -> None:
        """Precompute the token positive map for every COCO category.

        Args:
            num_select: number of top-scoring (query, class) pairs to keep.
            coco_api: loaded COCO API object supplying the category list.
            tokenlizer: tokenizer matching the model's text encoder.
        """
        super().__init__()
        self.num_select = num_select
        assert coco_api is not None
        category_dict = coco_api.dataset['categories']
        cat_list = [item['name'] for item in category_dict]
        # Build one caption containing all category names and record which
        # token span belongs to each category.
        captions, cat2tokenspan = build_captions_and_token_span(cat_list, True)
        tokenspanlist = [cat2tokenspan[cat] for cat in cat_list]
        positive_map = create_positive_map_from_span(
            tokenlizer(captions), tokenspanlist)  # 80, 256. normed
        # Mapping from contiguous label index (0..79) to the original COCO
        # category id (1..90, with gaps).
        id_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 13, 12: 14, 13: 15, 14: 16, 15: 17, 16: 18, 17: 19, 18: 20, 19: 21, 20: 22, 21: 23, 22: 24, 23: 25, 24: 27, 25: 28, 26: 31, 27: 32, 28: 33, 29: 34, 30: 35, 31: 36, 32: 37, 33: 38, 34: 39, 35: 40, 36: 41, 37: 42, 38: 43, 39: 44, 40: 46,
                  41: 47, 42: 48, 43: 49, 44: 50, 45: 51, 46: 52, 47: 53, 48: 54, 49: 55, 50: 56, 51: 57, 52: 58, 53: 59, 54: 60, 55: 61, 56: 62, 57: 63, 58: 64, 59: 65, 60: 67, 61: 70, 62: 72, 63: 73, 64: 74, 65: 75, 66: 76, 67: 77, 68: 78, 69: 79, 70: 80, 71: 81, 72: 82, 73: 84, 74: 85, 75: 86, 76: 87, 77: 88, 78: 89, 79: 90}
        # build a mapping from label_id to pos_map (row index = COCO id)
        new_pos_map = torch.zeros((91, 256))
        for k, v in id_map.items():
            new_pos_map[v] = positive_map[k]
        self.positive_map = new_pos_map
    @torch.no_grad()
    def forward(self, outputs, target_sizes, not_to_xyxy=False):
        """ Perform the computation
        Parameters:
            outputs: raw outputs of the model
            target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
            For evaluation, this must be the original image size (before any data augmentation)
            For visualization, this should be the image size after data augment, but before padding
        Returns:
            one {"scores", "labels", "boxes"} dict per image, boxes in
            absolute xyxy pixel coordinates.
        """
        num_select = self.num_select
        out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
        # pos map to logit
        prob_to_token = out_logits.sigmoid()  # bs, 100, 256
        pos_maps = self.positive_map.to(prob_to_token.device)
        # (bs, 100, 256) @ (91, 256).T -> (bs, 100, 91)
        prob_to_label = prob_to_token @ pos_maps.T
        # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO':
        # import ipdb; ipdb.set_trace()
        assert len(out_logits) == len(target_sizes)
        assert target_sizes.shape[1] == 2
        prob = prob_to_label
        # top-k over the flattened (query, class) grid, so one query may
        # contribute several class predictions
        topk_values, topk_indexes = torch.topk(
            prob.view(out_logits.shape[0], -1), num_select, dim=1)
        scores = topk_values
        # recover the query index and the class index from the flat index
        topk_boxes = topk_indexes // prob.shape[2]
        labels = topk_indexes % prob.shape[2]
        if not_to_xyxy:
            boxes = out_bbox
        else:
            boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
        boxes = torch.gather(
            boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
        # and from relative [0, 1] to absolute [0, height] coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]
        results = [{'scores': s, 'labels': l, 'boxes': b}
                   for s, l, b in zip(scores, labels, boxes)]
        return results
def main(args):
    """Evaluate Grounding DINO on a COCO-format detection dataset.

    Builds the model, streams the validation set, converts raw outputs to
    COCO results and prints the standard bbox metrics.
    """
    # config
    cfg = SLConfig.fromfile(args.config_file)
    # build model
    model = load_model(args.config_file, args.checkpoint_path)
    model = model.to(args.device)
    model = model.eval()
    # build dataloader
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    dataset = CocoDetection(
        args.image_dir, args.anno_path, transforms=transform)
    data_loader = DataLoader(
        dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn)
    # build post processor
    # FIX: honor --num_select; the original parsed it but never used it.
    tokenlizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
    postprocessor = PostProcessCocoGrounding(
        num_select=args.num_select, coco_api=dataset.coco, tokenlizer=tokenlizer)
    # build evaluator
    evaluator = CocoGroundingEvaluator(
        dataset.coco, iou_types=("bbox",), useCats=True)
    # build captions: one " . "-joined prompt covering every category
    category_dict = dataset.coco.dataset['categories']
    cat_list = [item['name'] for item in category_dict]
    caption = " . ".join(cat_list) + ' .'
    print("Input text prompt:", caption)
    # run inference
    start = time.time()
    for i, (images, targets) in enumerate(data_loader):
        # get images and captions
        images = images.tensors.to(args.device)
        bs = images.shape[0]
        input_captions = [caption] * bs
        # BUG FIX: run the forward pass under no_grad; the original built the
        # autograd graph during pure evaluation, wasting memory.
        with torch.no_grad():
            outputs = model(images, captions=input_captions)
        orig_target_sizes = torch.stack(
            [t["orig_size"] for t in targets], dim=0).to(images.device)
        results = postprocessor(outputs, orig_target_sizes)
        cocogrounding_res = {
            target["image_id"]: output for target, output in zip(targets, results)}
        evaluator.update(cocogrounding_res)
        if (i+1) % 30 == 0:
            used_time = time.time() - start
            # BUG FIX: average over the batches actually processed (i+1);
            # dividing by i+1e-5 overestimated the ETA.
            eta = len(data_loader) / (i + 1) * used_time - used_time
            print(
                f"processed {i}/{len(data_loader)} images. time: {used_time:.2f}s, ETA: {eta:.2f}s")
    evaluator.synchronize_between_processes()
    evaluator.accumulate()
    evaluator.summarize()
    print("Final results:", evaluator.coco_eval["bbox"].stats.tolist())
if __name__ == "__main__":
    # CLI entry point: model paths, device, post-processing and COCO dataset
    # options, then hand off to main().
    parser = argparse.ArgumentParser(
        "Grounding DINO eval on COCO", add_help=True)
    # load model
    parser.add_argument("--config_file", "-c", type=str,
                        required=True, help="path to config file")
    parser.add_argument(
        "--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file"
    )
    parser.add_argument("--device", type=str, default="cuda",
                        help="running device (default: cuda)")
    # post processing
    parser.add_argument("--num_select", type=int, default=300,
                        help="number of topk to select")
    # coco info
    parser.add_argument("--anno_path", type=str,
                        required=True, help="coco root")
    parser.add_argument("--image_dir", type=str,
                        required=True, help="coco image dir")
    parser.add_argument("--num_workers", type=int, default=4,
                        help="number of workers for dataloader")
    args = parser.parse_args()
    main(args)
import argparse
import os
import sys
import time
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, DistributedSampler
import torchvision
import onnxruntime as ort
import groundingdino.datasets.transforms as T
from groundingdino.util import box_ops, get_tokenlizer
from groundingdino.util.misc import collate_fn
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.vl_utils import build_captions_and_token_span, create_positive_map_from_span
from groundingdino.datasets.cocogrounding_eval import CocoGroundingEvaluator
class CocoDetection(torchvision.datasets.CocoDetection):
    """COCO dataset wrapper that converts raw annotations into the
    {image_id, boxes (xyxy), orig_size} dict the evaluator expects."""
    def __init__(self, img_folder, ann_file, transforms):
        super().__init__(img_folder, ann_file)
        self._transforms = transforms
    def __getitem__(self, idx):
        """Return (image, target) with boxes as clamped xyxy float tensors."""
        img, target = super().__getitem__(idx)
        w, h = img.size
        boxes = [obj["bbox"] for obj in target]
        # reshape(-1, 4) keeps the empty-annotation case as a (0, 4) tensor
        boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        boxes[:, 2:] += boxes[:, :2]  # xywh -> xyxy
        # clamp coordinates to the image bounds (in place)
        boxes[:, 0::2].clamp_(min=0, max=w)
        boxes[:, 1::2].clamp_(min=0, max=h)
        # drop degenerate boxes (zero or negative width/height)
        keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
        boxes = boxes[keep]
        target_new = {}
        image_id = self.ids[idx]
        target_new["image_id"] = image_id
        target_new["boxes"] = boxes
        # orig_size is (H, W) — used later to rescale predicted boxes
        target_new["orig_size"] = torch.as_tensor([int(h), int(w)])
        # NOTE(review): when _transforms is None the raw annotation list is
        # returned instead of target_new — confirm callers always pass one.
        if self._transforms is not None:
            img, target = self._transforms(img, target_new)
        return img, target
class PostProcessCocoGrounding(nn.Module):
    """Post-processing matching the PyTorch eval script, adapted to numpy
    outputs coming from ONNX Runtime (inputs are wrapped via from_numpy)."""
    def __init__(self, num_select=300, coco_api=None, tokenlizer=None) -> None:
        """Precompute the token positive map for every COCO category.

        Args:
            num_select: number of top-scoring (query, class) pairs to keep.
            coco_api: loaded COCO API object supplying the category list.
            tokenlizer: tokenizer matching the model's text encoder.
        """
        super().__init__()
        self.num_select = num_select
        assert coco_api is not None
        category_dict = coco_api.dataset['categories']
        cat_list = [item['name'] for item in category_dict]
        # One caption containing all category names + per-category token span.
        captions, cat2tokenspan = build_captions_and_token_span(cat_list, True)
        tokenspanlist = [cat2tokenspan[cat] for cat in cat_list]
        positive_map = create_positive_map_from_span(
            tokenlizer(captions), tokenspanlist)  # 80, 256. normed
        # Contiguous label index (0..79) -> original COCO category id (1..90).
        id_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 13, 12: 14, 13: 15, 14: 16, 15: 17, 16: 18, 17: 19, 18: 20, 19: 21, 20: 22, 21: 23, 22: 24, 23: 25, 24: 27, 25: 28, 26: 31, 27: 32, 28: 33, 29: 34, 30: 35, 31: 36, 32: 37, 33: 38, 34: 39, 35: 40, 36: 41, 37: 42, 38: 43, 39: 44, 40: 46,
                  41: 47, 42: 48, 43: 49, 44: 50, 45: 51, 46: 52, 47: 53, 48: 54, 49: 55, 50: 56, 51: 57, 52: 58, 53: 59, 54: 60, 55: 61, 56: 62, 57: 63, 58: 64, 59: 65, 60: 67, 61: 70, 62: 72, 63: 73, 64: 74, 65: 75, 66: 76, 67: 77, 68: 78, 69: 79, 70: 80, 71: 81, 72: 82, 73: 84, 74: 85, 75: 86, 76: 87, 77: 88, 78: 89, 79: 90}
        new_pos_map = torch.zeros((91, 256))
        for k, v in id_map.items():
            new_pos_map[v] = positive_map[k]
        self.positive_map = new_pos_map
    @torch.no_grad()
    def forward(self, outputs, target_sizes, not_to_xyxy=False):
        """Convert raw (numpy) model outputs into per-image COCO result dicts.

        Args:
            outputs: dict with numpy "pred_logits" and "pred_boxes".
            target_sizes: (bs, 2) tensor of original (H, W) image sizes.
            not_to_xyxy: keep boxes in cxcywh instead of converting to xyxy.
        """
        num_select = self.num_select
        out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
        prob_to_token = torch.from_numpy(out_logits).sigmoid()  # ONNX outputs are numpy
        pos_maps = self.positive_map.to(prob_to_token.device)
        # (bs, nq, 256) @ (91, 256).T -> (bs, nq, 91)
        prob_to_label = prob_to_token @ pos_maps.T
        assert prob_to_label.shape[0] == len(target_sizes)
        assert target_sizes.shape[1] == 2
        prob = prob_to_label
        # top-k over the flattened (query, class) grid
        topk_values, topk_indexes = torch.topk(
            prob.view(prob_to_label.shape[0], -1), num_select, dim=1)
        scores = topk_values
        # recover the query index and class index from the flat index
        topk_boxes = topk_indexes // prob.shape[2]
        labels = topk_indexes % prob.shape[2]
        if not_to_xyxy:
            boxes = torch.from_numpy(out_bbox)
        else:
            boxes = box_ops.box_cxcywh_to_xyxy(torch.from_numpy(out_bbox))
        boxes = torch.gather(
            boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
        # relative [0, 1] -> absolute pixel coordinates
        img_h, img_w = target_sizes.unbind(1)
        scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
        boxes = boxes * scale_fct[:, None, :]
        results = [{'scores': s, 'labels': l, 'boxes': b}
                   for s, l, b in zip(scores, labels, boxes)]
        return results
def load_onnx_model(onnx_path, device="cuda"):
    """Create an ONNX Runtime inference session for the exported model.

    Uses the ROCm execution provider when a GPU build of onnxruntime is
    available and device == "cuda"; otherwise falls back to CPU.
    """
    providers = ['CPUExecutionProvider']
    if device == "cuda" and ort.get_device() == "GPU":
        providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
    # BUG FIX: the original gated provider_options on "CUDAExecutionProvider",
    # which is never in `providers`, so device_id was silently dropped. When
    # provider_options is supplied it must contain one dict per provider.
    provider_options = None
    if 'ROCMExecutionProvider' in providers:
        provider_options = [{'device_id': 0}, {}]
    session = ort.InferenceSession(
        onnx_path,
        providers=providers,
        provider_options=provider_options,
    )
    return session
def onnx_inference(session, images, captions):
    """Run the ONNX session on a batch of images.

    NOTE(review): `captions` is accepted but never fed to the session here —
    presumably the exported graph has the text inputs baked in or they are
    handled elsewhere; confirm against the actual ONNX model.
    """
    # ONNX Runtime consumes numpy arrays, not torch tensors
    images_np = images.cpu().numpy().astype(np.float32)
    # NOTE(review): the input name must match the exported graph (inspect it
    # with netron); assumed here that input 0 is the image tensor.
    input_feed = {
        session.get_inputs()[0].name: images_np,
        # if captions are tokenized model inputs, add them here, e.g.
        # session.get_inputs()[1].name: captions_np
    }
    # run inference
    outputs = session.run(None, input_feed)
    # NOTE(review): output order/shapes assumed to be logits then boxes —
    # verify against the exported model.
    pred_logits = outputs[0]  # assumed shape: [bs, 100, 256]
    pred_boxes = outputs[1]  # assumed shape: [bs, 100, 4]
    return {"pred_logits": pred_logits, "pred_boxes": pred_boxes}
def main(args):
    """Evaluate the exported ONNX Grounding DINO model on a COCO dataset.

    Mirrors the PyTorch eval script, replacing the forward pass with an
    ONNX Runtime session.
    """
    # config (only needed for the tokenizer / post-processing)
    cfg = SLConfig.fromfile(args.config_file)
    # load the ONNX model
    onnx_session = load_onnx_model(args.onnx_path, args.device)
    # build the dataloader (same preprocessing as the PyTorch script)
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    dataset = CocoDetection(
        args.image_dir, args.anno_path, transforms=transform)
    data_loader = DataLoader(
        dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn)
    # build the post processor
    # FIX: honor --num_select; the original parsed it but never used it.
    tokenlizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
    postprocessor = PostProcessCocoGrounding(
        num_select=args.num_select, coco_api=dataset.coco, tokenlizer=tokenlizer)
    # build the evaluator
    evaluator = CocoGroundingEvaluator(
        dataset.coco, iou_types=("bbox",), useCats=True)
    # build the text prompt covering every category
    category_dict = dataset.coco.dataset['categories']
    cat_list = [item['name'] for item in category_dict]
    caption = " . ".join(cat_list) + ' .'
    print("Input text prompt:", caption)
    # run inference
    start = time.time()
    for i, (images, targets) in enumerate(data_loader):
        # preprocess images (same as the PyTorch script)
        images = images.tensors.to(args.device)
        bs = images.shape[0]
        input_captions = [caption] * bs
        # ONNX inference (replaces the PyTorch forward pass)
        outputs = onnx_inference(onnx_session, images, input_captions)
        # post-process (adapted to the numpy ONNX outputs)
        orig_target_sizes = torch.stack(
            [t["orig_size"] for t in targets], dim=0).to(args.device)
        results = postprocessor(outputs, orig_target_sizes)
        cocogrounding_res = {
            target["image_id"]: output for target, output in zip(targets, results)}
        evaluator.update(cocogrounding_res)
        # progress report
        if (i+1) % 30 == 0:
            used_time = time.time() - start
            # BUG FIX: average over the batches actually processed (i+1);
            # dividing by i+1e-5 overestimated the ETA.
            eta = len(data_loader) / (i + 1) * used_time - used_time
            print(
                f"processed {i}/{len(data_loader)} images. time: {used_time:.2f}s, ETA: {eta:.2f}s")
    # aggregate the metrics
    evaluator.synchronize_between_processes()
    evaluator.accumulate()
    evaluator.summarize()
    print("Final results:", evaluator.coco_eval["bbox"].stats.tolist())
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        "Grounding DINO ONNX eval on COCO", add_help=True)
    # path to the exported ONNX model
    parser.add_argument("--onnx_path", type=str, required=True, help="path to onnx model file")
    # original config file is kept: it drives the tokenizer and post-processing
    parser.add_argument("--config_file", "-c", type=str,
                        required=True, help="path to config file")
    parser.add_argument("--device", type=str, default="cuda",
                        help="running device (default: cuda)")
    # post-processing options
    parser.add_argument("--num_select", type=int, default=300,
                        help="number of topk to select")
    # COCO dataset options
    parser.add_argument("--anno_path", type=str,
                        required=True, help="coco annotation path")
    parser.add_argument("--image_dir", type=str,
                        required=True, help="coco image dir")
    parser.add_argument("--num_workers", type=int, default=4,
                        help="number of workers for dataloader")
    args = parser.parse_args()
    main(args)
\ No newline at end of file
from groundingdino.util.inference import load_model, load_image, predict, annotate
import torch
import cv2
# Smoke test: load the GroundingDINO SwinT model and confirm CUDA is available.
# BUG FIX: the config path previously ended in ".pyy" (typo). SLConfig loads the
# config as a Python file, so the extension must be ".py" or loading fails.
model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.py", "weights/groundingdino_swint_ogc.pth")
model = model.to('cuda:0')
print(torch.cuda.is_available())
print('DONE!')
\ No newline at end of file
This diff is collapsed.
final text_encoder_type: bert-base-uncased
This diff is collapsed.
This diff is collapsed.
import torch
import onnx
from onnxsim import simplify
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
# Paths to the SwinB model config (loaded via SLConfig) and its pretrained
# checkpoint (the COGCOOR SwinB variant, judging by the filename).
config_file = './groundingdino/config/GroundingDINO_SwinB_cfg.py'
checkpoint_path = './weights/groundingdino_swinb_cogcoor.pth'
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    """Build a GroundingDINO model from a config file and restore its weights.

    Args:
        model_config_path: path to an SLConfig-style ``.py`` config file.
        model_checkpoint_path: path to a ``.pth`` checkpoint holding a
            ``"model"`` state dict.
        cpu_only: when True, set the config device to CPU; otherwise CUDA.

    Returns:
        The model in ``eval()`` mode with the checkpoint weights loaded.
    """
    cfg = SLConfig.fromfile(model_config_path)
    cfg.device = "cpu" if cpu_only else "cuda"
    # Disable gradient checkpointing — not needed for inference/export.
    cfg.use_checkpoint = False
    cfg.use_transformer_ckpt = False

    net = build_model(cfg)
    state = torch.load(model_checkpoint_path, map_location="cpu")
    # strict=False tolerates key mismatches between checkpoint and model.
    net.load_state_dict(clean_state_dict(state["model"]), strict=False)
    net.eval()
    return net
# Load the model on CPU — tracing for ONNX export is done on CPU here.
model = load_model(config_file, checkpoint_path, cpu_only=True)
# Text prompt used at real inference time, plus its related token masks.
# NOTE(review): the masks below are hard-coded for a 4-token sequence
# (presumably "[CLS] car . [SEP]") — confirm they match the tokenizer output.
caption = "car ."
input_ids = model.tokenizer([caption], return_tensors="pt")["input_ids"]
position_ids = torch.tensor([[0, 0, 1, 0]])
token_type_ids = torch.tensor([[0, 0, 0, 0]])
attention_mask = torch.tensor([[True, True, True, True]])
text_token_mask = torch.tensor([[[True, False, False, False],
[False, True, True, False],
[False, True, True, False],
[False, False, False, True]]])
# Fixed input resolution for the dummy image used during tracing.
img = torch.randn(1, 3, 800, 1200)
# The ONNX model can support dynamic inputs; it is recommended to comment this
# out when converting to a TensorRT engine. (Translated from original comment.)
dynamic_axes = {
"input_ids": {0: "batch_size", 1: "seq_len"},
"attention_mask": {0: "batch_size", 1: "seq_len"},
"position_ids": {0: "batch_size", 1: "seq_len"},
"token_type_ids": {0: "batch_size", 1: "seq_len"},
"text_token_mask": {0: "batch_size", 1: "seq_len", 2: "seq_len"},
"img": {0: "batch_size", 2: "height", 3: "width"},
"logits": {0: "batch_size"},
"boxes": {0: "batch_size"}
}
# Export the raw ONNX model.
# NOTE(review): dynamic_axes is defined above but never passed to
# torch.onnx.export, so this export is static-shape only — confirm whether
# that is intentional (the comment above suggests it is, for engine builds).
onnx_output_path = "weights/ground.onnx"
torch.onnx.export(
model,
f=onnx_output_path,
args=(img, input_ids, attention_mask, position_ids, token_type_ids, text_token_mask),
input_names=["img", "input_ids", "attention_mask", "position_ids", "token_type_ids", "text_token_mask"],
output_names=["logits", "boxes"],
opset_version=17,
verbose=False,  # keep export logs quiet; set True when debugging
do_constant_folding=True  # constant folding improves the simplification step
)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
from onnxruntime.quantization import quantize_dynamic, QuantType
# Apply post-training dynamic quantization: weights stored as signed int8.
fp32_model_path = "weights/ground.onnx"
int8_model_path = "weights/ground_int8.onnx"
quantize_dynamic(
    model_input=fp32_model_path,
    model_output=int8_model_path,
    weight_type=QuantType.QInt8,
)
print("int8 quantization done!")
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment