"test/run_tests_util.py" did not exist on "bcf926ec656688d7eb03159faaddbf56bd4ec8e2"
Commit 34e4011b authored by zk

Initial commit

import argparse
import os
import sys
import time
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
# ================== Core configuration - adjust the batch size here ==================
INFERENCE_BATCH_SIZE = 8 # inference batch size; change this value to change the batch size
# ====================================================================================
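# Illustrative shapes (not from the original comments): with INFERENCE_BATCH_SIZE = 8
# the model receives an image tensor of shape (8, 3, H, W) plus a list of 8 identical
# captions, and returns pred_logits of shape (8, nq, 256) and pred_boxes of shape
# (8, nq, 4), where nq is the number of object queries.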
def plot_boxes_to_image(image_pil, tgt):
H, W = tgt["size"]
boxes = tgt["boxes"]
labels = tgt["labels"]
assert len(boxes) == len(labels), "boxes and labels must have same length"
draw = ImageDraw.Draw(image_pil)
mask = Image.new("L", image_pil.size, 0)
mask_draw = ImageDraw.Draw(mask)
# draw boxes and masks
for box, label in zip(boxes, labels):
# from 0..1 to 0..W, 0..H
box = box * torch.Tensor([W, H, W, H])
# from xywh to xyxy
box[:2] -= box[2:] / 2
box[2:] += box[:2]
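# Worked example (illustrative): a normalized cxcywh box (0.5, 0.5, 0.2, 0.4)
# on a 1000x800 image scales to (500, 400, 200, 320) and converts to
# xyxy = (400, 240, 600, 560).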
# random color
color = tuple(np.random.randint(0, 255, size=3).tolist())
# draw
x0, y0, x1, y1 = box
x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
# draw.text((x0, y0), str(label), fill=color)
font = ImageFont.load_default()
if hasattr(font, "getbbox"):
bbox = draw.textbbox((x0, y0), str(label), font)
else:
w, h = draw.textsize(str(label), font)
bbox = (x0, y0, w + x0, y0 + h)
# bbox = draw.textbbox((x0, y0), str(label))
draw.rectangle(bbox, fill=color)
draw.text((x0, y0), str(label), fill="white")
mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
return image_pil, mask
def load_image(image_path):
# load image
image_pil = Image.open(image_path).convert("RGB") # load image
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
image, _ = transform(image_pil, None) # 3, h, w
return image_pil, image
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
print(load_res)
_ = model.eval()
return model
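# Note: strict=False tolerates key mismatches between checkpoint and model;
# load_res (printed above) is the named tuple of missing/unexpected keys, so an
# empty pair of lists indicates a clean load.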
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
assert text_threshold is not None or token_spans is not None, "text_threshold and token_spans must not both be None!"
caption = caption.lower()
caption = caption.strip()
if not caption.endswith("."):
caption = caption + "."
device = "cuda" if not cpu_only else "cpu"
model = model.to(device)
# ========== Change 1: build the batch ==========
# Replicate the image to build a batch of shape (batch_size, 3, H, W)
image_batch = image.unsqueeze(0).repeat(INFERENCE_BATCH_SIZE, 1, 1, 1).to(device)
# Replicate the text prompt to build a batch of captions (batch_size,)
caption_batch = [caption] * INFERENCE_BATCH_SIZE
# Time the core inference
if device == "cuda":
torch.cuda.synchronize() # wait for pending GPU work so timing is accurate
start_time = time.perf_counter() # high-resolution timer
with torch.no_grad():
outputs = model(image_batch, captions=caption_batch) # batched forward pass
if device == "cuda":
torch.cuda.synchronize() # wait for the GPU forward pass to finish
infer_time = time.perf_counter() - start_time # inference time (seconds)
# Average per-image inference time
avg_single_infer_time = infer_time / INFERENCE_BATCH_SIZE
# ========== Change 2: handle the batched output ==========
# Take the first sample as the result (all samples are identical)
logits = outputs["pred_logits"].sigmoid()[0] # (nq, 256)
boxes = outputs["pred_boxes"][0] # (nq, 4)
# filter output
if token_spans is None:
logits_filt = logits.cpu().clone()
boxes_filt = boxes.cpu().clone()
filt_mask = logits_filt.max(dim=1)[0] > box_threshold
logits_filt = logits_filt[filt_mask] # num_filt, 256
boxes_filt = boxes_filt[filt_mask] # num_filt, 4
# get phrase
tokenlizer = model.tokenizer
tokenized = tokenlizer(caption)
# build pred
pred_phrases = []
for logit, box in zip(logits_filt, boxes_filt):
pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
if with_logits:
pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
else:
pred_phrases.append(pred_phrase)
else:
# given-phrase mode
positive_maps = create_positive_map_from_span(
model.tokenizer(caption),
token_span=token_spans
).to(logits.device) # n_phrase, 256 (the raw image tensor may still be on CPU here, so follow the logits device)
logits_for_phrases = positive_maps @ logits.T # n_phrase, nq
all_logits = []
all_phrases = []
all_boxes = []
for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
# get phrase
phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
# get mask
filt_mask = logit_phr > box_threshold
# filt box
all_boxes.append(boxes[filt_mask])
# filt logits
all_logits.append(logit_phr[filt_mask])
if with_logits:
logit_phr_num = logit_phr[filt_mask]
all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
else:
all_phrases.extend([phrase for _ in range(int(filt_mask.sum()))]) # one label per kept box, not per query
boxes_filt = torch.cat(all_boxes, dim=0).cpu()
pred_phrases = all_phrases
# Return the total batch inference time for downstream performance accounting
return boxes_filt, pred_phrases, infer_time, avg_single_infer_time
def benchmark_performance(model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, warmup_runs=5, test_runs=10):
"""
性能基准测试:预热 + 多次推理计算平均FPS和时延
适配batch推理,计算正确的吞吐量
Args:
warmup_runs: 预热次数(排除初始加载的影响)
test_runs: 正式测试次数
"""
# 1. 预热阶段(忽略耗时)
print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
for i in range(warmup_runs):
_, _, _, _ = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans
)
print(f"预热完成 {i+1}/{warmup_runs}")
# 2. 正式测试阶段
print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
total_batch_time = 0.0 # 总batch推理时间
total_single_time = 0.0 # 总单张推理时间
batch_times = [] # 记录每次batch推理的时延
single_times = [] # 记录每次单张推理的平均时延
for i in range(test_runs):
_, _, batch_infer_time, avg_single_infer_time = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans
)
batch_times.append(batch_infer_time)
single_times.append(avg_single_infer_time)
total_batch_time += batch_infer_time
total_single_time += avg_single_infer_time
print(f"测试 {i+1}/{test_runs} - Batch推理时延: {batch_infer_time*1000:.2f} ms | 单张平均时延: {avg_single_infer_time*1000:.2f} ms")
# 3. Compute performance metrics
avg_batch_time = total_batch_time / test_runs # average batch latency (seconds)
avg_single_time = total_single_time / test_runs # average per-image latency (seconds)
batch_throughput = (test_runs * INFERENCE_BATCH_SIZE) / total_batch_time # overall throughput (images/second)
batch_std_time = np.std(batch_times) # batch latency standard deviation
single_std_time = np.std(single_times) # per-image latency standard deviation
# 4. Print the performance report
print("\n" + "="*60)
print("📊 Performance report (Batch Size = {})".format(INFERENCE_BATCH_SIZE))
print("="*60)
print(f"Environment: {'GPU (CUDA)' if not cpu_only else 'CPU'}")
print(f"Runs: {test_runs} (warm-up: {warmup_runs})")
print(f"Batch Size: {INFERENCE_BATCH_SIZE}")
print(f"Average batch latency: {avg_batch_time*1000:.2f} ms (±{batch_std_time*1000:.2f} ms)")
print(f"Average per-image latency: {avg_single_time*1000:.2f} ms (±{single_std_time*1000:.2f} ms)")
print(f"Max batch latency: {max(batch_times)*1000:.2f} ms | max per-image latency: {max(single_times)*1000:.2f} ms")
print(f"Min batch latency: {min(batch_times)*1000:.2f} ms | min per-image latency: {min(single_times)*1000:.2f} ms")
print(f"Overall throughput: {batch_throughput:.2f} images/s")
print("="*60 + "\n")
return avg_batch_time, avg_single_time, batch_throughput, batch_times, single_times
if __name__ == "__main__":
parser = argparse.ArgumentParser("Grounding DINO 性能测试 (Batch推理)", add_help=True)
parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
parser.add_argument("--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file")
parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True, help="output directory")
parser.add_argument("--box_threshold", type=float, default=0.35, help="box threshold")
parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
parser.add_argument("--token_spans", type=str, default=None, help=
"The positions of start and end positions of phrases of interest. \
For example, a caption is 'a cat and a dog', \
if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
")
parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
# 新增性能测试参数
parser.add_argument("--warmup-runs", type=int, default=5, help="预热运行次数,默认5")
parser.add_argument("--test-runs", type=int, default=10, help="正式测试运行次数,默认10")
args = parser.parse_args()
# cfg
config_file = args.config_file
checkpoint_path = args.checkpoint_path
image_path = args.image_path
text_prompt = args.text_prompt
output_dir = args.output_dir
box_threshold = args.box_threshold
text_threshold = args.text_threshold
token_spans = args.token_spans
warmup_runs = args.warmup_runs
test_runs = args.test_runs
# print basic environment info
print(f"📌 Device: {'GPU' if not args.cpu_only else 'CPU'}")
print(f"📌 Inference batch size: {INFERENCE_BATCH_SIZE}")
if not args.cpu_only and torch.cuda.is_available():
print(f"📌 GPU model: {torch.cuda.get_device_name(0)}")
print(f"📌 GPU index: {torch.cuda.current_device()}")
# make dir
os.makedirs(output_dir, exist_ok=True)
# load image
image_pil, image = load_image(image_path)
# load model
model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
# visualize raw image
image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
# set the text_threshold to None if token_spans is set.
if token_spans is not None:
text_threshold = None
print("Using token_spans. Set the text_threshold to None.")
# run the performance benchmark
avg_batch_time, avg_single_time, throughput, batch_times, single_times = benchmark_performance(
model, image, text_prompt, box_threshold, text_threshold,
args.cpu_only, eval(f"{token_spans}") if token_spans else None,
warmup_runs, test_runs
)
# single inference run and result saving (preserves the original functionality)
print("\n=== Generating the visualization image ===")
boxes_filt, pred_phrases, batch_infer_time, single_infer_time = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold,
cpu_only=args.cpu_only,
token_spans=eval(f"{token_spans}") if token_spans else None
)
# visualize pred
size = image_pil.size
pred_dict = {
"boxes": boxes_filt,
"size": [size[1], size[0]], # H,W
"labels": pred_phrases,
}
image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
image_with_box.save(os.path.join(output_dir, "pred.jpg"))
# save the performance results to a file
performance_file = os.path.join(output_dir, "performance_report.txt")
with open(performance_file, "w", encoding="utf-8") as f:
f.write("="*60 + "\n")
f.write(f"Grounding DINO 性能测试报告 (Batch Size = {INFERENCE_BATCH_SIZE})\n")
f.write("="*60 + "\n")
f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"测试设备: {'GPU' if not args.cpu_only else 'CPU'}\n")
if not args.cpu_only and torch.cuda.is_available():
f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
f.write(f"Batch Size: {INFERENCE_BATCH_SIZE}\n")
f.write(f"预热次数: {warmup_runs}\n")
f.write(f"测试次数: {test_runs}\n")
f.write(f"平均Batch推理时延: {avg_batch_time*1000:.2f} ms\n")
f.write(f"Batch时延标准差: {np.std(batch_times)*1000:.2f} ms\n")
f.write(f"平均单张推理时延: {avg_single_time*1000:.2f} ms\n")
f.write(f"单张时延标准差: {np.std(single_times)*1000:.2f} ms\n")
f.write(f"最大Batch时延: {max(batch_times)*1000:.2f} ms\n")
f.write(f"最小Batch时延: {min(batch_times)*1000:.2f} ms\n")
f.write(f"总吞吐量: {throughput:.2f} 张/秒\n")
f.write(f"最后一次Batch推理时延: {batch_infer_time*1000:.2f} ms\n")
f.write(f"最后一次单张推理时延: {single_infer_time*1000:.2f} ms\n")
print(f"\n✅ 性能报告已保存至: {performance_file}")
print(f"✅ 推理结果图片已保存至: {os.path.join(output_dir, 'pred.jpg')}")
import argparse
import os
import time
from typing import List, Tuple
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.util.utils import get_phrases_from_posmap
from groundingdino.util import get_tokenlizer
from groundingdino.util.slconfig import SLConfig
from groundingdino.models.GroundingDINO.bertwarper import (
generate_masks_with_special_tokens_and_transfer_map,
)
def plot_boxes_to_image(image_pil, tgt):
H, W = tgt["size"]
boxes = tgt["boxes"]
labels = tgt["labels"]
assert len(boxes) == len(labels), "boxes and labels must have same length"
draw = ImageDraw.Draw(image_pil)
mask = Image.new("L", image_pil.size, 0)
mask_draw = ImageDraw.Draw(mask)
for box, label in zip(boxes, labels):
box = box * torch.Tensor([W, H, W, H])
box[:2] -= box[2:] / 2
box[2:] += box[:2]
color = tuple(np.random.randint(0, 255, size=3).tolist())
x0, y0, x1, y1 = box
x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
font = ImageFont.load_default()
if hasattr(font, "getbbox"):
bbox = draw.textbbox((x0, y0), str(label), font)
else:
w, h = draw.textsize(str(label), font)
bbox = (x0, y0, w + x0, y0 + h)
draw.rectangle(bbox, fill=color)
draw.text((x0, y0), str(label), fill="white")
mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
return image_pil, mask
def load_image(image_path):
image_pil = Image.open(image_path).convert("RGB")
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
image, _ = transform(image_pil, None) # 3,h,w
return image_pil, image
def preprocess_caption(caption: str) -> str:
caption = caption.lower().strip()
if not caption.endswith("."):
caption = caption + "."
return caption
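# e.g. preprocess_caption("A cat and a Dog") -> "a cat and a dog."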
def sigmoid(x: np.ndarray) -> np.ndarray:
return 1.0 / (1.0 + np.exp(-x))
def build_text_tensors(
config_file: str,
caption: str,
device: str,
):
cfg = SLConfig.fromfile(config_file)
tokenizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
special_token_ids = tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"])
caption = preprocess_caption(caption)
tokenized = tokenizer([caption], padding="longest", return_tensors="pt")
tokenized = {k: v.to(device) for k, v in tokenized.items()}
text_self_attention_masks, position_ids, _ = generate_masks_with_special_tokens_and_transfer_map(
tokenized, special_token_ids, tokenizer
)
max_text_len = getattr(cfg, "max_text_len", 256)
if text_self_attention_masks.shape[1] > max_text_len:
s = max_text_len
text_self_attention_masks = text_self_attention_masks[:, :s, :s]
position_ids = position_ids[:, :s]
tokenized["input_ids"] = tokenized["input_ids"][:, :s]
tokenized["attention_mask"] = tokenized["attention_mask"][:, :s]
tokenized["token_type_ids"] = tokenized["token_type_ids"][:, :s]
# Also return the tokenizer and a single-sentence tokenization (to match the behaviour of get_phrases_from_posmap)
tokenized_single = tokenizer(caption)
return (
cfg,
tokenizer,
tokenized_single,
tokenized["input_ids"].to(torch.int64),
tokenized["token_type_ids"].to(torch.int64),
tokenized["attention_mask"].to(torch.int64),
position_ids.to(torch.int64),
text_self_attention_masks,
)
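# Shapes of the tensors returned above (S = token count after the optional truncation
# to max_text_len): input_ids / token_type_ids / attention_mask / position_ids are
# (1, S); text_self_attention_masks is (1, S, S).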
def ort_create_session(onnx_path: str, device: str, num_threads: int = 0):
import onnxruntime as ort
so = ort.SessionOptions()
if num_threads and num_threads > 0:
so.intra_op_num_threads = int(num_threads)
so.inter_op_num_threads = int(num_threads)
providers = ["CPUExecutionProvider"]
if device == "cuda":
# if onnxruntime-gpu is installed, the CUDA provider is enabled automatically
providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
return ort.InferenceSession(onnx_path, sess_options=so, providers=providers)
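# Note: if the CUDA provider is requested but unavailable, onnxruntime silently falls
# back to CPUExecutionProvider; sess.get_providers() reports what is actually in use.
# Example (model path is illustrative):
#   sess = ort_create_session("groundingdino.onnx", device="cuda")
#   print(sess.get_providers())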
def onnx_infer_once(
sess,
image: torch.Tensor,
input_ids: torch.Tensor,
token_type_ids: torch.Tensor,
attention_mask: torch.Tensor,
position_ids: torch.Tensor,
text_self_attention_masks: torch.Tensor,
use_cuda_sync: bool,
) -> Tuple[np.ndarray, np.ndarray, float]:
# ORT inputs must be numpy arrays
feeds = {
"image": image[None].detach().cpu().numpy().astype(np.float32),
"input_ids": input_ids.detach().cpu().numpy().astype(np.int64),
"token_type_ids": token_type_ids.detach().cpu().numpy().astype(np.int64),
"attention_mask": attention_mask.detach().cpu().numpy().astype(np.int64),
"position_ids": position_ids.detach().cpu().numpy().astype(np.int64),
"text_self_attention_masks": text_self_attention_masks.detach().cpu().numpy(),
}
if use_cuda_sync:
torch.cuda.synchronize()
start = time.perf_counter()
pred_logits, pred_boxes = sess.run(["pred_logits", "pred_boxes"], feeds)
if use_cuda_sync:
torch.cuda.synchronize()
infer_time = time.perf_counter() - start
return pred_logits, pred_boxes, infer_time
def postprocess_and_phrases(
pred_logits: np.ndarray, # [B,NQ,S]
pred_boxes: np.ndarray, # [B,NQ,4]
tokenized_single,
tokenizer,
box_threshold: float,
text_threshold: float,
with_logits: bool = True,
):
# match the torch version: take batch index 0
logits = sigmoid(pred_logits[0]) # [NQ,S]
boxes = pred_boxes[0] # [NQ,4]
max_per_query = logits.max(axis=1)
mask = max_per_query > box_threshold
logits_filt = logits[mask]
boxes_filt = boxes[mask]
pred_phrases: List[str] = []
for logit in logits_filt:
posmap = torch.from_numpy(logit) > text_threshold
phrase = get_phrases_from_posmap(posmap, tokenized_single, tokenizer)
phrase = phrase.replace(".", "")
if with_logits:
pred_phrases.append(phrase + f"({str(float(logit.max()))[:4]})")
else:
pred_phrases.append(phrase)
return torch.from_numpy(boxes_filt), pred_phrases
def benchmark_performance_onnx(
sess,
image: torch.Tensor,
input_ids: torch.Tensor,
token_type_ids: torch.Tensor,
attention_mask: torch.Tensor,
position_ids: torch.Tensor,
text_self_attention_masks: torch.Tensor,
warmup_runs: int = 5,
test_runs: int = 10,
use_cuda_sync: bool = False,
):
print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
for i in range(warmup_runs):
_ = onnx_infer_once(
sess,
image,
input_ids,
token_type_ids,
attention_mask,
position_ids,
text_self_attention_masks,
use_cuda_sync=use_cuda_sync,
)
print(f"预热完成 {i+1}/{warmup_runs}")
print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
total_time = 0.0
infer_times = []
for i in range(test_runs):
_, _, infer_time = onnx_infer_once(
sess,
image,
input_ids,
token_type_ids,
attention_mask,
position_ids,
text_self_attention_masks,
use_cuda_sync=use_cuda_sync,
)
infer_times.append(infer_time)
total_time += infer_time
print(f"测试 {i+1}/{test_runs} - 单次推理时延: {infer_time*1000:.2f} ms")
avg_infer_time = total_time / test_runs
fps = test_runs / total_time
std_infer_time = float(np.std(infer_times))
print("\n" + "=" * 50)
print("📊 ONNX 性能测试报告")
print("=" * 50)
print(f"测试环境: {'GPU (CUDAExecutionProvider)' if use_cuda_sync else 'CPU/Unknown'}")
print(f"测试次数: {test_runs} 次 (预热 {warmup_runs} 次)")
print(f"平均推理时延: {avg_infer_time*1000:.2f} ms (±{std_infer_time*1000:.2f} ms)")
print(f"最大推理时延: {max(infer_times)*1000:.2f} ms")
print(f"最小推理时延: {min(infer_times)*1000:.2f} ms")
print(f"平均FPS: {fps:.2f} 帧/秒")
print("=" * 50 + "\n")
return avg_infer_time, fps, infer_times
if __name__ == "__main__":
parser = argparse.ArgumentParser("Grounding DINO ONNX 推理与性能测试", add_help=True)
parser.add_argument("--onnx_path", type=str, required=True, help="onnx 模型路径")
parser.add_argument("--config_file", "-c", type=str, required=True, help="用于加载 tokenizer 等配置")
parser.add_argument("--image_path", "-i", type=str, required=True)
parser.add_argument("--text_prompt", "-t", type=str, required=True)
parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True)
parser.add_argument("--box_threshold", type=float, default=0.3)
parser.add_argument("--text_threshold", type=float, default=0.25)
parser.add_argument("--cpu-only", action="store_true")
parser.add_argument("--warmup-runs", type=int, default=5)
parser.add_argument("--test-runs", type=int, default=10)
parser.add_argument("--ort-threads", type=int, default=0, help="onnxruntime 线程数(0=默认)")
args = parser.parse_args()
device = "cpu" if args.cpu_only else ("cuda" if torch.cuda.is_available() else "cpu")
use_cuda_sync = device == "cuda"
print(f"📌 ORT 设备偏好: {device}")
if use_cuda_sync:
print(f"📌 GPU型号: {torch.cuda.get_device_name(0)}")
os.makedirs(args.output_dir, exist_ok=True)
image_pil, image = load_image(args.image_path)
image_pil.save(os.path.join(args.output_dir, "raw_image.jpg"))
(
_cfg,
tokenizer,
tokenized_single,
input_ids,
token_type_ids,
attention_mask,
position_ids,
text_self_attention_masks,
) = build_text_tensors(args.config_file, args.text_prompt, device="cpu")
# Timing sync is more accurate with tensors on the GPU, but the ORT feeds end up as
# numpy (CPU) anyway; we only keep parity with the torch version here:
# timing and visualization logic are preserved, while the model forward itself runs through ORT
sess = ort_create_session(args.onnx_path, device=device, num_threads=args.ort_threads)
avg_infer_time, fps, infer_times = benchmark_performance_onnx(
sess,
image,
input_ids,
token_type_ids,
attention_mask,
position_ids,
text_self_attention_masks,
warmup_runs=args.warmup_runs,
test_runs=args.test_runs,
use_cuda_sync=use_cuda_sync,
)
print("\n=== 生成推理结果图片 ===")
pred_logits, pred_boxes, single_infer_time = onnx_infer_once(
sess,
image,
input_ids,
token_type_ids,
attention_mask,
position_ids,
text_self_attention_masks,
use_cuda_sync=use_cuda_sync,
)
boxes_filt, pred_phrases = postprocess_and_phrases(
pred_logits=pred_logits,
pred_boxes=pred_boxes,
tokenized_single=tokenized_single,
tokenizer=tokenizer,
box_threshold=args.box_threshold,
text_threshold=args.text_threshold,
with_logits=True,
)
size = image_pil.size
pred_dict = {
"boxes": boxes_filt,
"size": [size[1], size[0]], # H,W
"labels": pred_phrases,
}
image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
image_with_box.save(os.path.join(args.output_dir, "pred.jpg"))
performance_file = os.path.join(args.output_dir, "performance_report_onnx.txt")
with open(performance_file, "w", encoding="utf-8") as f:
f.write("=" * 50 + "\n")
f.write("Grounding DINO ONNX 性能测试报告\n")
f.write("=" * 50 + "\n")
f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"推理后端: onnxruntime\n")
f.write(f"设备偏好: {device}\n")
if use_cuda_sync:
f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
f.write(f"预热次数: {args.warmup_runs}\n")
f.write(f"测试次数: {args.test_runs}\n")
f.write(f"平均推理时延: {avg_infer_time*1000:.2f} ms\n")
f.write(f"时延标准差: {np.std(infer_times)*1000:.2f} ms\n")
f.write(f"最大时延: {max(infer_times)*1000:.2f} ms\n")
f.write(f"最小时延: {min(infer_times)*1000:.2f} ms\n")
f.write(f"平均FPS: {fps:.2f} 帧/秒\n")
f.write(f"单次推理时延(最后一次): {single_infer_time*1000:.2f} ms\n")
print(f"\n✅ 性能报告已保存至: {performance_file}")
print(f"✅ 推理结果图片已保存至: {os.path.join(args.output_dir, 'pred.jpg')}")
import argparse
import os
import sys
import time
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
def plot_boxes_to_image(image_pil, tgt):
H, W = tgt["size"]
boxes = tgt["boxes"]
labels = tgt["labels"]
assert len(boxes) == len(labels), "boxes and labels must have same length"
draw = ImageDraw.Draw(image_pil)
mask = Image.new("L", image_pil.size, 0)
mask_draw = ImageDraw.Draw(mask)
# draw boxes and masks
for box, label in zip(boxes, labels):
# from 0..1 to 0..W, 0..H
box = box * torch.Tensor([W, H, W, H])
# from xywh to xyxy
box[:2] -= box[2:] / 2
box[2:] += box[:2]
# random color
color = tuple(np.random.randint(0, 255, size=3).tolist())
# draw
x0, y0, x1, y1 = box
x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
# draw.text((x0, y0), str(label), fill=color)
font = ImageFont.load_default()
if hasattr(font, "getbbox"):
bbox = draw.textbbox((x0, y0), str(label), font)
else:
w, h = draw.textsize(str(label), font)
bbox = (x0, y0, w + x0, y0 + h)
# bbox = draw.textbbox((x0, y0), str(label))
draw.rectangle(bbox, fill=color)
draw.text((x0, y0), str(label), fill="white")
mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
return image_pil, mask
def load_image(image_path):
# load image
image_pil = Image.open(image_path).convert("RGB") # load image
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
image, _ = transform(image_pil, None) # 3, h, w
return image_pil, image
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
print(load_res)
_ = model.eval()
return model
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
assert text_threshold is not None or token_spans is not None, "text_threshold and token_spans must not both be None!"
caption = caption.lower()
caption = caption.strip()
if not caption.endswith("."):
caption = caption + "."
device = "cuda" if not cpu_only else "cpu"
model = model.to(device)
image = image.to(device)
# Time the core inference
if device == "cuda":
torch.cuda.synchronize() # wait for pending GPU work so timing is accurate
start_time = time.perf_counter() # high-resolution timer
with torch.no_grad():
outputs = model(image[None], captions=[caption])
if device == "cuda":
torch.cuda.synchronize() # wait for the GPU forward pass to finish
infer_time = time.perf_counter() - start_time # inference time (seconds)
logits = outputs["pred_logits"].sigmoid()[0] # (nq, 256)
boxes = outputs["pred_boxes"][0] # (nq, 4)
# filter output
if token_spans is None:
logits_filt = logits.cpu().clone()
boxes_filt = boxes.cpu().clone()
filt_mask = logits_filt.max(dim=1)[0] > box_threshold
logits_filt = logits_filt[filt_mask] # num_filt, 256
boxes_filt = boxes_filt[filt_mask] # num_filt, 4
# get phrase
tokenlizer = model.tokenizer
tokenized = tokenlizer(caption)
# build pred
pred_phrases = []
for logit, box in zip(logits_filt, boxes_filt):
pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
if with_logits:
pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
else:
pred_phrases.append(pred_phrase)
else:
# given-phrase mode
positive_maps = create_positive_map_from_span(
model.tokenizer(caption), # use the processed caption, not the global text_prompt
token_span=token_spans
).to(image.device) # n_phrase, 256
logits_for_phrases = positive_maps @ logits.T # n_phrase, nq
all_logits = []
all_phrases = []
all_boxes = []
for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
# get phrase
phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
# get mask
filt_mask = logit_phr > box_threshold
# filt box
all_boxes.append(boxes[filt_mask])
# filt logits
all_logits.append(logit_phr[filt_mask])
if with_logits:
logit_phr_num = logit_phr[filt_mask]
all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
else:
all_phrases.extend([phrase for _ in range(int(filt_mask.sum()))]) # one label per kept box, not per query
boxes_filt = torch.cat(all_boxes, dim=0).cpu()
pred_phrases = all_phrases
return boxes_filt, pred_phrases, infer_time
def benchmark_performance(model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, warmup_runs=5, test_runs=10):
"""
性能基准测试:预热 + 多次推理计算平均FPS和时延
Args:
warmup_runs: 预热次数(排除初始加载的影响)
test_runs: 正式测试次数
"""
# 1. 预热阶段(忽略耗时)
print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
for i in range(warmup_runs):
_, _, _ = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans
)
print(f"预热完成 {i+1}/{warmup_runs}")
# 2. 正式测试阶段
print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
total_time = 0.0
infer_times = [] # 记录每次推理的时延
for i in range(test_runs):
_, _, infer_time = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans
)
infer_times.append(infer_time)
total_time += infer_time
print(f"测试 {i+1}/{test_runs} - 单次推理时延: {infer_time*1000:.2f} ms")
# 3. Compute performance metrics
avg_infer_time = total_time / test_runs # average latency (seconds)
fps = test_runs / total_time # average FPS
std_infer_time = np.std(infer_times) # latency standard deviation (stability)
# 4. Print the performance report
print("\n" + "="*50)
print("📊 Performance report")
print("="*50)
print(f"Environment: {'GPU (CUDA)' if not cpu_only else 'CPU'}")
print(f"Runs: {test_runs} (warm-up: {warmup_runs})")
print(f"Average latency: {avg_infer_time*1000:.2f} ms (±{std_infer_time*1000:.2f} ms)")
print(f"Max latency: {max(infer_times)*1000:.2f} ms")
print(f"Min latency: {min(infer_times)*1000:.2f} ms")
print(f"Average FPS: {fps:.2f} frames/s")
print("="*50 + "\n")
return avg_infer_time, fps, infer_times
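# Note: fps above is the reciprocal of the average forward-pass latency at batch
# size 1; it excludes image loading, pre-processing, and post-processing.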
if __name__ == "__main__":
parser = argparse.ArgumentParser("Grounding DINO 性能测试", add_help=True)
parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
parser.add_argument("--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file")
parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True, help="output directory")
parser.add_argument("--box_threshold", type=float, default=0.35, help="box threshold")
parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
parser.add_argument("--token_spans", type=str, default=None, help=
"The positions of start and end positions of phrases of interest. \
For example, a caption is 'a cat and a dog', \
if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
")
parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
# 新增性能测试参数
parser.add_argument("--warmup-runs", type=int, default=5, help="预热运行次数,默认5")
parser.add_argument("--test-runs", type=int, default=10, help="正式测试运行次数,默认10")
args = parser.parse_args()
# cfg
config_file = args.config_file
checkpoint_path = args.checkpoint_path
image_path = args.image_path
text_prompt = args.text_prompt
output_dir = args.output_dir
box_threshold = args.box_threshold
text_threshold = args.text_threshold
token_spans = args.token_spans
warmup_runs = args.warmup_runs
test_runs = args.test_runs
# print basic environment info
print(f"📌 Device: {'GPU' if not args.cpu_only else 'CPU'}")
if not args.cpu_only and torch.cuda.is_available():
print(f"📌 GPU model: {torch.cuda.get_device_name(0)}")
print(f"📌 GPU index: {torch.cuda.current_device()}")
# make dir
os.makedirs(output_dir, exist_ok=True)
# load image
image_pil, image = load_image(image_path)
# load model
model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
# visualize raw image
image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
# set the text_threshold to None if token_spans is set.
if token_spans is not None:
text_threshold = None
print("Using token_spans. Set the text_threshold to None.")
# run the performance benchmark
avg_infer_time, fps, infer_times = benchmark_performance(
model, image, text_prompt, box_threshold, text_threshold,
args.cpu_only, eval(f"{token_spans}") if token_spans else None,
warmup_runs, test_runs
)
# single inference run and result saving (preserves the original functionality)
print("\n=== Generating the visualization image ===")
boxes_filt, pred_phrases, single_infer_time = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold,
cpu_only=args.cpu_only,
token_spans=eval(f"{token_spans}") if token_spans else None
)
# visualize pred
size = image_pil.size
pred_dict = {
"boxes": boxes_filt,
"size": [size[1], size[0]], # H,W
"labels": pred_phrases,
}
image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
image_with_box.save(os.path.join(output_dir, "pred.jpg"))
# save the performance results to a file
performance_file = os.path.join(output_dir, "performance_report.txt")
with open(performance_file, "w", encoding="utf-8") as f:
f.write("="*50 + "\n")
f.write("Grounding DINO 性能测试报告\n")
f.write("="*50 + "\n")
f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"测试设备: {'GPU' if not args.cpu_only else 'CPU'}\n")
if not args.cpu_only and torch.cuda.is_available():
f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
f.write(f"预热次数: {warmup_runs}\n")
f.write(f"测试次数: {test_runs}\n")
f.write(f"平均推理时延: {avg_infer_time*1000:.2f} ms\n")
f.write(f"时延标准差: {np.std(infer_times)*1000:.2f} ms\n")
f.write(f"最大时延: {max(infer_times)*1000:.2f} ms\n")
f.write(f"最小时延: {min(infer_times)*1000:.2f} ms\n")
f.write(f"平均FPS: {fps:.2f} 帧/秒\n")
f.write(f"单次推理时延(最后一次): {single_infer_time*1000:.2f} ms\n")
print(f"\n✅ 性能报告已保存至: {performance_file}")
print(f"✅ 推理结果图片已保存至: {os.path.join(output_dir, 'pred.jpg')}")
import argparse
import os
import sys
import time
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
def plot_boxes_to_image(image_pil, tgt):
H, W = tgt["size"]
boxes = tgt["boxes"]
labels = tgt["labels"]
assert len(boxes) == len(labels), "boxes and labels must have same length"
draw = ImageDraw.Draw(image_pil)
mask = Image.new("L", image_pil.size, 0)
mask_draw = ImageDraw.Draw(mask)
# draw boxes and masks
for box, label in zip(boxes, labels):
# from 0..1 to 0..W, 0..H
box = box * torch.Tensor([W, H, W, H])
# from xywh to xyxy
box[:2] -= box[2:] / 2
box[2:] += box[:2]
# random color
color = tuple(np.random.randint(0, 255, size=3).tolist())
# draw
x0, y0, x1, y1 = box
x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
# draw.text((x0, y0), str(label), fill=color)
font = ImageFont.load_default()
if hasattr(font, "getbbox"):
bbox = draw.textbbox((x0, y0), str(label), font)
else:
w, h = draw.textsize(str(label), font)
bbox = (x0, y0, w + x0, y0 + h)
# bbox = draw.textbbox((x0, y0), str(label))
draw.rectangle(bbox, fill=color)
draw.text((x0, y0), str(label), fill="white")
mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
return image_pil, mask
def load_image(image_path):
# load image
image_pil = Image.open(image_path).convert("RGB") # load image
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
image, _ = transform(image_pil, None) # 3, h, w
return image_pil, image
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
print(load_res)
_ = model.eval()
return model
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None, use_fp16=False):
assert text_threshold is not None or token_spans is not None, "text_threshold and token_spans must not both be None!"
caption = caption.lower()
caption = caption.strip()
if not caption.endswith("."):
caption = caption + "."
device = "cuda" if not cpu_only else "cpu"
model = model.to(device)
image = image.to(device)
# Time the core inference
if device == "cuda":
torch.cuda.synchronize() # wait for pending GPU work so timing is accurate
start_time = time.perf_counter() # high-resolution timer
with torch.no_grad():
if use_fp16 and device == "cuda":
with torch.autocast(device_type="cuda", dtype=torch.float16):
outputs = model(image[None], captions=[caption])
else:
outputs = model(image[None], captions=[caption])
if device == "cuda":
torch.cuda.synchronize() # wait for the GPU forward pass to finish
infer_time = time.perf_counter() - start_time # inference time (seconds)
# Cast outputs back to FP32 so the downstream CPU operations are safe
logits = outputs["pred_logits"].float().sigmoid()[0]
boxes = outputs["pred_boxes"].float()[0]
# filter output
if token_spans is None:
logits_filt = logits.cpu().clone()
boxes_filt = boxes.cpu().clone()
filt_mask = logits_filt.max(dim=1)[0] > box_threshold
logits_filt = logits_filt[filt_mask] # num_filt, 256
boxes_filt = boxes_filt[filt_mask] # num_filt, 4
# get phrase
tokenlizer = model.tokenizer
tokenized = tokenlizer(caption)
# build pred
pred_phrases = []
for logit, box in zip(logits_filt, boxes_filt):
pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
if with_logits:
pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
else:
pred_phrases.append(pred_phrase)
else:
# given-phrase mode
positive_maps = create_positive_map_from_span(
model.tokenizer(caption), # use the processed caption, not the global text_prompt
token_span=token_spans
).to(image.device) # n_phrase, 256
logits_for_phrases = positive_maps @ logits.T # n_phrase, nq
all_logits = []
all_phrases = []
all_boxes = []
for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
# get phrase
phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
# get mask
filt_mask = logit_phr > box_threshold
# filt box
all_boxes.append(boxes[filt_mask])
# filt logits
all_logits.append(logit_phr[filt_mask])
if with_logits:
logit_phr_num = logit_phr[filt_mask]
all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
else:
all_phrases.extend([phrase for _ in range(int(filt_mask.sum()))]) # one label per kept box, not per query
boxes_filt = torch.cat(all_boxes, dim=0).cpu()
pred_phrases = all_phrases
return boxes_filt, pred_phrases, infer_time
def benchmark_performance(model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, warmup_runs=5, test_runs=10, use_fp16=False):
"""
性能基准测试:预热 + 多次推理计算平均FPS和时延
Args:
warmup_runs: 预热次数(排除初始加载的影响)
test_runs: 正式测试次数
"""
# 1. 预热阶段(忽略耗时)
print(f"\n=== 预热阶段 ({warmup_runs} 次) ===")
for i in range(warmup_runs):
_, _, _ = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, use_fp16=use_fp16
)
print(f"预热完成 {i+1}/{warmup_runs}")
# 2. 正式测试阶段
print(f"\n=== 正式测试阶段 ({test_runs} 次) ===")
total_time = 0.0
infer_times = [] # 记录每次推理的时延
for i in range(test_runs):
_, _, infer_time = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only, token_spans, use_fp16=use_fp16
)
infer_times.append(infer_time)
total_time += infer_time
print(f"测试 {i+1}/{test_runs} - 单次推理时延: {infer_time*1000:.2f} ms")
# 3. Compute performance metrics
avg_infer_time = total_time / test_runs # average latency (seconds)
fps = test_runs / total_time # average FPS
std_infer_time = np.std(infer_times) # latency standard deviation (stability)
# 4. Print the performance report
print("\n" + "="*50)
print("📊 Performance report")
print("="*50)
print(f"Environment: {'GPU (CUDA)' if not cpu_only else 'CPU'}")
print(f"Runs: {test_runs} (warm-up: {warmup_runs})")
print(f"Average latency: {avg_infer_time*1000:.2f} ms (±{std_infer_time*1000:.2f} ms)")
print(f"Max latency: {max(infer_times)*1000:.2f} ms")
print(f"Min latency: {min(infer_times)*1000:.2f} ms")
print(f"Average FPS: {fps:.2f} frames/s")
print("="*50 + "\n")
return avg_infer_time, fps, infer_times
if __name__ == "__main__":
parser = argparse.ArgumentParser("Grounding DINO 性能测试", add_help=True)
parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
parser.add_argument("--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file")
parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
parser.add_argument("--output_dir", "-o", type=str, default="outputs", required=True, help="output directory")
parser.add_argument("--box_threshold", type=float, default=0.35, help="box threshold")
parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
parser.add_argument("--token_spans", type=str, default=None, help=
"The positions of start and end positions of phrases of interest. \
For example, a caption is 'a cat and a dog', \
if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
")
parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
# 新增性能测试参数
parser.add_argument("--warmup-runs", type=int, default=5, help="预热运行次数,默认5")
parser.add_argument("--test-runs", type=int, default=10, help="正式测试运行次数,默认10")
parser.add_argument("--fp16", action="store_true", help="Enable FP16 inference")
args = parser.parse_args()
# cfg
config_file = args.config_file
checkpoint_path = args.checkpoint_path
image_path = args.image_path
text_prompt = args.text_prompt
output_dir = args.output_dir
box_threshold = args.box_threshold
text_threshold = args.text_threshold
token_spans = args.token_spans
warmup_runs = args.warmup_runs
test_runs = args.test_runs
# print basic environment info
print(f"📌 Device: {'GPU' if not args.cpu_only else 'CPU'}")
if not args.cpu_only and torch.cuda.is_available():
print(f"📌 GPU model: {torch.cuda.get_device_name(0)}")
print(f"📌 GPU index: {torch.cuda.current_device()}")
# make dir
os.makedirs(output_dir, exist_ok=True)
# load image
image_pil, image = load_image(image_path)
# load model
model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
# visualize raw image
image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
# set the text_threshold to None if token_spans is set.
if token_spans is not None:
text_threshold = None
print("Using token_spans. Set the text_threshold to None.")
# run the performance benchmark
avg_infer_time, fps, infer_times = benchmark_performance(
model, image, text_prompt, box_threshold, text_threshold,
args.cpu_only, eval(f"{token_spans}") if token_spans else None,
warmup_runs, test_runs, use_fp16=args.fp16
)
# single inference run and result saving (preserves the original functionality)
print("\n=== Generating the visualization image ===")
boxes_filt, pred_phrases, single_infer_time = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold,
cpu_only=args.cpu_only,
token_spans=eval(f"{token_spans}") if token_spans else None, use_fp16=args.fp16
)
# visualize pred
size = image_pil.size
pred_dict = {
"boxes": boxes_filt,
"size": [size[1], size[0]], # H,W
"labels": pred_phrases,
}
image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
image_with_box.save(os.path.join(output_dir, "pred.jpg"))
# save the performance results to a file
performance_file = os.path.join(output_dir, "performance_report.txt")
with open(performance_file, "w", encoding="utf-8") as f:
f.write("="*50 + "\n")
f.write("Grounding DINO 性能测试报告\n")
f.write("="*50 + "\n")
f.write(f"测试时间: {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"测试设备: {'GPU' if not args.cpu_only else 'CPU'}\n")
if not args.cpu_only and torch.cuda.is_available():
f.write(f"GPU型号: {torch.cuda.get_device_name(0)}\n")
f.write(f"预热次数: {warmup_runs}\n")
f.write(f"测试次数: {test_runs}\n")
f.write(f"平均推理时延: {avg_infer_time*1000:.2f} ms\n")
f.write(f"时延标准差: {np.std(infer_times)*1000:.2f} ms\n")
f.write(f"最大时延: {max(infer_times)*1000:.2f} ms\n")
f.write(f"最小时延: {min(infer_times)*1000:.2f} ms\n")
f.write(f"平均FPS: {fps:.2f} 帧/秒\n")
f.write(f"单次推理时延(最后一次): {single_infer_time*1000:.2f} ms\n")
print(f"\n✅ 性能报告已保存至: {performance_file}")
print(f"✅ 推理结果图片已保存至: {os.path.join(output_dir, 'pred.jpg')}")
import argparse
import os
import sys
import numpy as np
import torch
from PIL import Image, ImageDraw, ImageFont
import groundingdino.datasets.transforms as T
from groundingdino.models import build_model
from groundingdino.util import box_ops
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict, get_phrases_from_posmap
from groundingdino.util.vl_utils import create_positive_map_from_span
def plot_boxes_to_image(image_pil, tgt):
H, W = tgt["size"]
boxes = tgt["boxes"]
labels = tgt["labels"]
assert len(boxes) == len(labels), "boxes and labels must have same length"
draw = ImageDraw.Draw(image_pil)
mask = Image.new("L", image_pil.size, 0)
mask_draw = ImageDraw.Draw(mask)
# draw boxes and masks
for box, label in zip(boxes, labels):
# from 0..1 to 0..W, 0..H
box = box * torch.Tensor([W, H, W, H])
# from xywh to xyxy
box[:2] -= box[2:] / 2
box[2:] += box[:2]
# random color
color = tuple(np.random.randint(0, 255, size=3).tolist())
# draw
x0, y0, x1, y1 = box
x0, y0, x1, y1 = int(x0), int(y0), int(x1), int(y1)
draw.rectangle([x0, y0, x1, y1], outline=color, width=6)
# draw.text((x0, y0), str(label), fill=color)
font = ImageFont.load_default()
if hasattr(font, "getbbox"):
bbox = draw.textbbox((x0, y0), str(label), font)
else:
w, h = draw.textsize(str(label), font)
bbox = (x0, y0, w + x0, y0 + h)
# bbox = draw.textbbox((x0, y0), str(label))
draw.rectangle(bbox, fill=color)
draw.text((x0, y0), str(label), fill="white")
mask_draw.rectangle([x0, y0, x1, y1], fill=255, width=6)
return image_pil, mask
def load_image(image_path):
# load image
image_pil = Image.open(image_path).convert("RGB") # load image
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
image, _ = transform(image_pil, None) # 3, h, w
return image_pil, image
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
load_res = model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
print(load_res)
_ = model.eval()
return model
def get_grounding_output(model, image, caption, box_threshold, text_threshold=None, with_logits=True, cpu_only=False, token_spans=None):
assert text_threshold is not None or token_spans is not None, "text_threshold and token_spans must not both be None!"
caption = caption.lower()
caption = caption.strip()
if not caption.endswith("."):
caption = caption + "."
device = "cuda" if not cpu_only else "cpu"
model = model.to(device)
image = image.to(device)
with torch.no_grad():
outputs = model(image[None], captions=[caption])
logits = outputs["pred_logits"].sigmoid()[0] # (nq, 256)
boxes = outputs["pred_boxes"][0] # (nq, 4)
# filter output
if token_spans is None:
logits_filt = logits.cpu().clone()
boxes_filt = boxes.cpu().clone()
filt_mask = logits_filt.max(dim=1)[0] > box_threshold
logits_filt = logits_filt[filt_mask] # num_filt, 256
boxes_filt = boxes_filt[filt_mask] # num_filt, 4
# get phrase
tokenlizer = model.tokenizer
tokenized = tokenlizer(caption)
# build pred
pred_phrases = []
for logit, box in zip(logits_filt, boxes_filt):
pred_phrase = get_phrases_from_posmap(logit > text_threshold, tokenized, tokenlizer)
if with_logits:
pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})")
else:
pred_phrases.append(pred_phrase)
else:
# given-phrase mode
positive_maps = create_positive_map_from_span(
model.tokenizer(caption), # use the processed caption, not the global text_prompt
token_span=token_spans
).to(image.device) # n_phrase, 256
logits_for_phrases = positive_maps @ logits.T # n_phrase, nq
all_logits = []
all_phrases = []
all_boxes = []
for (token_span, logit_phr) in zip(token_spans, logits_for_phrases):
# get phrase
phrase = ' '.join([caption[_s:_e] for (_s, _e) in token_span])
# get mask
filt_mask = logit_phr > box_threshold
# filt box
all_boxes.append(boxes[filt_mask])
# filt logits
all_logits.append(logit_phr[filt_mask])
if with_logits:
logit_phr_num = logit_phr[filt_mask]
all_phrases.extend([phrase + f"({str(logit.item())[:4]})" for logit in logit_phr_num])
else:
all_phrases.extend([phrase for _ in range(int(filt_mask.sum()))]) # one label per kept box, not per query
boxes_filt = torch.cat(all_boxes, dim=0).cpu()
pred_phrases = all_phrases
return boxes_filt, pred_phrases
if __name__ == "__main__":
parser = argparse.ArgumentParser("Grounding DINO example", add_help=True)
parser.add_argument("--config_file", "-c", type=str, required=True, help="path to config file")
parser.add_argument(
"--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file"
)
parser.add_argument("--image_path", "-i", type=str, required=True, help="path to image file")
parser.add_argument("--text_prompt", "-t", type=str, required=True, help="text prompt")
parser.add_argument(
"--output_dir", "-o", type=str, default="outputs", required=True, help="output directory"
)
parser.add_argument("--box_threshold", type=float, default=0.3, help="box threshold")
parser.add_argument("--text_threshold", type=float, default=0.25, help="text threshold")
parser.add_argument("--token_spans", type=str, default=None, help=
"The positions of start and end positions of phrases of interest. \
For example, a caption is 'a cat and a dog', \
if you would like to detect 'cat', the token_spans should be '[[[2, 5]], ]', since 'a cat and a dog'[2:5] is 'cat'. \
if you would like to detect 'a cat', the token_spans should be '[[[0, 1], [2, 5]], ]', since 'a cat and a dog'[0:1] is 'a', and 'a cat and a dog'[2:5] is 'cat'. \
")
parser.add_argument("--cpu-only", action="store_true", help="running on cpu only!, default=False")
args = parser.parse_args()
# cfg
config_file = args.config_file # change the path of the model config file
checkpoint_path = args.checkpoint_path # change the path of the model
image_path = args.image_path
text_prompt = args.text_prompt
output_dir = args.output_dir
box_threshold = args.box_threshold
text_threshold = args.text_threshold
token_spans = args.token_spans
# make dir
os.makedirs(output_dir, exist_ok=True)
# load image
image_pil, image = load_image(image_path)
# load model
model = load_model(config_file, checkpoint_path, cpu_only=args.cpu_only)
# visualize raw image
image_pil.save(os.path.join(output_dir, "raw_image.jpg"))
# set the text_threshold to None if token_spans is set.
if token_spans is not None:
text_threshold = None
print("Using token_spans. Set the text_threshold to None.")
# run model
boxes_filt, pred_phrases = get_grounding_output(
model, image, text_prompt, box_threshold, text_threshold, cpu_only=args.cpu_only, token_spans=eval(f"{token_spans}")
)
# visualize pred
size = image_pil.size
pred_dict = {
"boxes": boxes_filt,
"size": [size[1], size[0]], # H,W
"labels": pred_phrases,
}
image_with_box = plot_boxes_to_image(image_pil, pred_dict)[0]
image_with_box.save(os.path.join(output_dir, "pred.jpg"))
import argparse
import os
import sys
import time
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, DistributedSampler
from groundingdino.models import build_model
import groundingdino.datasets.transforms as T
from groundingdino.util import box_ops, get_tokenlizer
from groundingdino.util.misc import clean_state_dict, collate_fn
from groundingdino.util.slconfig import SLConfig
# from torchvision.datasets import CocoDetection
import torchvision
from groundingdino.util.vl_utils import build_captions_and_token_span, create_positive_map_from_span
from groundingdino.datasets.cocogrounding_eval import CocoGroundingEvaluator
def load_model(model_config_path: str, model_checkpoint_path: str, device: str = "cuda"):
args = SLConfig.fromfile(model_config_path)
args.device = device
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
model.eval()
return model
class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(self, img_folder, ann_file, transforms):
super().__init__(img_folder, ann_file)
self._transforms = transforms
def __getitem__(self, idx):
img, target = super().__getitem__(idx) # target: list
w, h = img.size
boxes = [obj["bbox"] for obj in target]
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2] # xywh -> xyxy
boxes[:, 0::2].clamp_(min=0, max=w)
boxes[:, 1::2].clamp_(min=0, max=h)
# filt invalid boxes/masks/keypoints
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
boxes = boxes[keep]
target_new = {}
image_id = self.ids[idx]
target_new["image_id"] = image_id
target_new["boxes"] = boxes
target_new["orig_size"] = torch.as_tensor([int(h), int(w)])
if self._transforms is not None:
img, target = self._transforms(img, target_new)
return img, target
class PostProcessCocoGrounding(nn.Module):
""" This module converts the model's output into the format expected by the coco api"""
def __init__(self, num_select=300, coco_api=None, tokenlizer=None) -> None:
super().__init__()
self.num_select = num_select
assert coco_api is not None
category_dict = coco_api.dataset['categories']
cat_list = [item['name'] for item in category_dict]
captions, cat2tokenspan = build_captions_and_token_span(cat_list, True)
tokenspanlist = [cat2tokenspan[cat] for cat in cat_list]
positive_map = create_positive_map_from_span(
tokenlizer(captions), tokenspanlist) # 80, 256. normed
id_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 13, 12: 14, 13: 15, 14: 16, 15: 17, 16: 18, 17: 19, 18: 20, 19: 21, 20: 22, 21: 23, 22: 24, 23: 25, 24: 27, 25: 28, 26: 31, 27: 32, 28: 33, 29: 34, 30: 35, 31: 36, 32: 37, 33: 38, 34: 39, 35: 40, 36: 41, 37: 42, 38: 43, 39: 44, 40: 46,
41: 47, 42: 48, 43: 49, 44: 50, 45: 51, 46: 52, 47: 53, 48: 54, 49: 55, 50: 56, 51: 57, 52: 58, 53: 59, 54: 60, 55: 61, 56: 62, 57: 63, 58: 64, 59: 65, 60: 67, 61: 70, 62: 72, 63: 73, 64: 74, 65: 75, 66: 76, 67: 77, 68: 78, 69: 79, 70: 80, 71: 81, 72: 82, 73: 84, 74: 85, 75: 86, 76: 87, 77: 88, 78: 89, 79: 90}
# build a mapping from label_id to pos_map
new_pos_map = torch.zeros((91, 256))
for k, v in id_map.items():
new_pos_map[v] = positive_map[k]
self.positive_map = new_pos_map
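# id_map converts contiguous class indices (0..79) to the sparse COCO category
# ids (1..90); e.g. index 11 maps to category id 13 ("stop sign" in COCO).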
@torch.no_grad()
def forward(self, outputs, target_sizes, not_to_xyxy=False):
""" Perform the computation
Parameters:
outputs: raw outputs of the model
target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch
For evaluation, this must be the original image size (before any data augmentation)
For visualization, this should be the image size after data augment, but before padding
"""
num_select = self.num_select
out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
# pos map to logit
prob_to_token = out_logits.sigmoid() # bs, 100, 256
pos_maps = self.positive_map.to(prob_to_token.device)
# (bs, 100, 256) @ (91, 256).T -> (bs, 100, 91)
prob_to_label = prob_to_token @ pos_maps.T
assert len(out_logits) == len(target_sizes)
assert target_sizes.shape[1] == 2
prob = prob_to_label
topk_values, topk_indexes = torch.topk(
prob.view(out_logits.shape[0], -1), num_select, dim=1)
scores = topk_values
topk_boxes = topk_indexes // prob.shape[2]
labels = topk_indexes % prob.shape[2]
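# The top-k runs over the flattened (num_queries * num_classes) scores, so each
# flat index decodes to a (query, label) pair; e.g. with 91 classes, flat index
# 1003 decodes to query 1003 // 91 = 11 and label 1003 % 91 = 2.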
if not_to_xyxy:
boxes = out_bbox
else:
boxes = box_ops.box_cxcywh_to_xyxy(out_bbox)
boxes = torch.gather(
boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
# and from relative [0, 1] to absolute [0, height] coordinates
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
boxes = boxes * scale_fct[:, None, :]
results = [{'scores': s, 'labels': l, 'boxes': b}
for s, l, b in zip(scores, labels, boxes)]
return results
def main(args):
# config
cfg = SLConfig.fromfile(args.config_file)
# build model
model = load_model(args.config_file, args.checkpoint_path)
model = model.to(args.device)
model = model.eval()
# build dataloader
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
dataset = CocoDetection(
args.image_dir, args.anno_path, transforms=transform)
data_loader = DataLoader(
dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn)
# build post processor
tokenlizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
postprocessor = PostProcessCocoGrounding(
num_select=args.num_select, coco_api=dataset.coco, tokenlizer=tokenlizer)
# build evaluator
evaluator = CocoGroundingEvaluator(
dataset.coco, iou_types=("bbox",), useCats=True)
# build captions
category_dict = dataset.coco.dataset['categories']
cat_list = [item['name'] for item in category_dict]
caption = " . ".join(cat_list) + ' .'
print("Input text prompt:", caption)
# run inference
start = time.time()
for i, (images, targets) in enumerate(data_loader):
# get images and captions
images = images.tensors.to(args.device)
bs = images.shape[0]
input_captions = [caption] * bs
# feed to the model
outputs = model(images, captions=input_captions)
orig_target_sizes = torch.stack(
[t["orig_size"] for t in targets], dim=0).to(images.device)
results = postprocessor(outputs, orig_target_sizes)
cocogrounding_res = {
target["image_id"]: output for target, output in zip(targets, results)}
evaluator.update(cocogrounding_res)
        if (i + 1) % 30 == 0:
            used_time = time.time() - start
            eta = len(data_loader) / (i + 1) * used_time - used_time
            print(
                f"processed {i + 1}/{len(data_loader)} images. time: {used_time:.2f}s, ETA: {eta:.2f}s")
evaluator.synchronize_between_processes()
evaluator.accumulate()
evaluator.summarize()
print("Final results:", evaluator.coco_eval["bbox"].stats.tolist())
if __name__ == "__main__":
parser = argparse.ArgumentParser(
"Grounding DINO eval on COCO", add_help=True)
# load model
parser.add_argument("--config_file", "-c", type=str,
required=True, help="path to config file")
parser.add_argument(
"--checkpoint_path", "-p", type=str, required=True, help="path to checkpoint file"
)
parser.add_argument("--device", type=str, default="cuda",
help="running device (default: cuda)")
# post processing
parser.add_argument("--num_select", type=int, default=300,
help="number of topk to select")
# coco info
parser.add_argument("--anno_path", type=str,
required=True, help="coco root")
parser.add_argument("--image_dir", type=str,
required=True, help="coco image dir")
parser.add_argument("--num_workers", type=int, default=4,
help="number of workers for dataloader")
args = parser.parse_args()
main(args)
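# Example invocation (script name and paths are illustrative, not taken from this repo):
#   python this_eval_script.py -c groundingdino/config/GroundingDINO_SwinT_OGC.py \
#       -p weights/groundingdino_swint_ogc.pth \
#       --anno_path annotations/instances_val2017.json --image_dir val2017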
import argparse
import os
import sys
import time
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, DistributedSampler
import torchvision
import onnxruntime as ort
import groundingdino.datasets.transforms as T
from groundingdino.util import box_ops, get_tokenlizer
from groundingdino.util.misc import collate_fn
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.vl_utils import build_captions_and_token_span, create_positive_map_from_span
from groundingdino.datasets.cocogrounding_eval import CocoGroundingEvaluator
class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(self, img_folder, ann_file, transforms):
super().__init__(img_folder, ann_file)
self._transforms = transforms
def __getitem__(self, idx):
img, target = super().__getitem__(idx)
w, h = img.size
boxes = [obj["bbox"] for obj in target]
boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4)
boxes[:, 2:] += boxes[:, :2] # xywh -> xyxy
boxes[:, 0::2].clamp_(min=0, max=w)
boxes[:, 1::2].clamp_(min=0, max=h)
keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
boxes = boxes[keep]
target_new = {}
image_id = self.ids[idx]
target_new["image_id"] = image_id
target_new["boxes"] = boxes
target_new["orig_size"] = torch.as_tensor([int(h), int(w)])
if self._transforms is not None:
img, target = self._transforms(img, target_new)
return img, target
class PostProcessCocoGrounding(nn.Module):
"""保持和原代码一致的后处理逻辑"""
def __init__(self, num_select=300, coco_api=None, tokenlizer=None) -> None:
super().__init__()
self.num_select = num_select
assert coco_api is not None
category_dict = coco_api.dataset['categories']
cat_list = [item['name'] for item in category_dict]
captions, cat2tokenspan = build_captions_and_token_span(cat_list, True)
tokenspanlist = [cat2tokenspan[cat] for cat in cat_list]
positive_map = create_positive_map_from_span(
tokenlizer(captions), tokenspanlist) # 80, 256. normed
id_map = {0: 1, 1: 2, 2: 3, 3: 4, 4: 5, 5: 6, 6: 7, 7: 8, 8: 9, 9: 10, 10: 11, 11: 13, 12: 14, 13: 15, 14: 16, 15: 17, 16: 18, 17: 19, 18: 20, 19: 21, 20: 22, 21: 23, 22: 24, 23: 25, 24: 27, 25: 28, 26: 31, 27: 32, 28: 33, 29: 34, 30: 35, 31: 36, 32: 37, 33: 38, 34: 39, 35: 40, 36: 41, 37: 42, 38: 43, 39: 44, 40: 46,
41: 47, 42: 48, 43: 49, 44: 50, 45: 51, 46: 52, 47: 53, 48: 54, 49: 55, 50: 56, 51: 57, 52: 58, 53: 59, 54: 60, 55: 61, 56: 62, 57: 63, 58: 64, 59: 65, 60: 67, 61: 70, 62: 72, 63: 73, 64: 74, 65: 75, 66: 76, 67: 77, 68: 78, 69: 79, 70: 80, 71: 81, 72: 82, 73: 84, 74: 85, 75: 86, 76: 87, 77: 88, 78: 89, 79: 90}
new_pos_map = torch.zeros((91, 256))
for k, v in id_map.items():
new_pos_map[v] = positive_map[k]
self.positive_map = new_pos_map
@torch.no_grad()
def forward(self, outputs, target_sizes, not_to_xyxy=False):
num_select = self.num_select
out_logits, out_bbox = outputs['pred_logits'], outputs['pred_boxes']
        prob_to_token = torch.from_numpy(out_logits).sigmoid()  # ONNX Runtime returns numpy, so convert here
pos_maps = self.positive_map.to(prob_to_token.device)
prob_to_label = prob_to_token @ pos_maps.T
assert prob_to_label.shape[0] == len(target_sizes)
assert target_sizes.shape[1] == 2
prob = prob_to_label
topk_values, topk_indexes = torch.topk(
prob.view(prob_to_label.shape[0], -1), num_select, dim=1)
scores = topk_values
topk_boxes = topk_indexes // prob.shape[2]
labels = topk_indexes % prob.shape[2]
if not_to_xyxy:
boxes = torch.from_numpy(out_bbox)
else:
boxes = box_ops.box_cxcywh_to_xyxy(torch.from_numpy(out_bbox))
boxes = torch.gather(
boxes, 1, topk_boxes.unsqueeze(-1).repeat(1, 1, 4))
img_h, img_w = target_sizes.unbind(1)
scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1)
boxes = boxes * scale_fct[:, None, :]
results = [{'scores': s, 'labels': l, 'boxes': b}
for s, l, b in zip(scores, labels, boxes)]
return results
def load_onnx_model(onnx_path, device="cuda"):
"""加载ONNX模型并创建推理session"""
providers = ['CPUExecutionProvider']
if device == "cuda" and ort.get_device() == "GPU":
providers = ['ROCMExecutionProvider', 'CPUExecutionProvider']
session = ort.InferenceSession(
onnx_path,
providers=providers,
provider_options=[{'device_id': 0}] if "CUDAExecutionProvider" in providers else []
)
return session
def onnx_inference(session, images, captions):
"""ONNX模型推理(需匹配模型输入格式)"""
# 转换为numpy(ONNX Runtime不支持torch tensor)
images_np = images.cpu().numpy().astype(np.float32)
# 注意:此处需根据你的ONNX模型输入名调整(可通过netron查看)
# 假设模型输入为 "images" 和 "captions"(需根据实际情况修改)
input_feed = {
session.get_inputs()[0].name: images_np,
# 如果caption是文本token,需补充token化逻辑,此处假设已处理
# session.get_inputs()[1].name: captions_np
}
# 执行推理
outputs = session.run(None, input_feed)
# 解析输出(需匹配模型输出格式,假设输出为logits和bbox)
# 需根据你的ONNX模型输出调整维度和顺序
pred_logits = outputs[0] # 形状: [bs, 100, 256]
pred_boxes = outputs[1] # 形状: [bs, 100, 4]
return {"pred_logits": pred_logits, "pred_boxes": pred_boxes}
def main(args):
    # Config (mainly used for the tokenizer and post-processing)
    cfg = SLConfig.fromfile(args.config_file)
    # Load the ONNX model
    onnx_session = load_onnx_model(args.onnx_path, args.device)
    # Build the dataloader (same as the original code)
transform = T.Compose(
[
T.RandomResize([800], max_size=1333),
T.ToTensor(),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
]
)
dataset = CocoDetection(
args.image_dir, args.anno_path, transforms=transform)
data_loader = DataLoader(
dataset, batch_size=1, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn)
    # Build the post-processor (same as the original code)
    tokenlizer = get_tokenlizer.get_tokenlizer(cfg.text_encoder_type)
    postprocessor = PostProcessCocoGrounding(
        num_select=args.num_select, coco_api=dataset.coco, tokenlizer=tokenlizer)
    # Build the evaluator (same as the original code)
    evaluator = CocoGroundingEvaluator(
        dataset.coco, iou_types=("bbox",), useCats=True)
    # Build the text prompt (same as the original code)
category_dict = dataset.coco.dataset['categories']
cat_list = [item['name'] for item in category_dict]
caption = " . ".join(cat_list) + ' .'
print("Input text prompt:", caption)
    # Run inference
    start = time.time()
    for i, (images, targets) in enumerate(data_loader):
        # Preprocess images (same as the original code)
        images = images.tensors.to(args.device)
        bs = images.shape[0]
        input_captions = [caption] * bs
        # ONNX inference (replaces the original PyTorch forward pass)
        outputs = onnx_inference(onnx_session, images, input_captions)
        # Post-process (adapted to the ONNX output format)
        orig_target_sizes = torch.stack(
            [t["orig_size"] for t in targets], dim=0).to(args.device)
        results = postprocessor(outputs, orig_target_sizes)
        cocogrounding_res = {
            target["image_id"]: output for target, output in zip(targets, results)}
        evaluator.update(cocogrounding_res)
        # Print progress
        if (i + 1) % 30 == 0:
            used_time = time.time() - start
            eta = len(data_loader) / (i + 1) * used_time - used_time
            print(
                f"processed {i + 1}/{len(data_loader)} images. time: {used_time:.2f}s, ETA: {eta:.2f}s")
    # Aggregate the evaluation metrics
evaluator.synchronize_between_processes()
evaluator.accumulate()
evaluator.summarize()
print("Final results:", evaluator.coco_eval["bbox"].stats.tolist())
if __name__ == "__main__":
parser = argparse.ArgumentParser(
"Grounding DINO ONNX eval on COCO", add_help=True)
    # New: path to the ONNX model
parser.add_argument("--onnx_path", type=str, required=True, help="path to onnx model file")
    # Keep the original config-file argument (used for the tokenizer and post-processing)
parser.add_argument("--config_file", "-c", type=str,
required=True, help="path to config file")
parser.add_argument("--device", type=str, default="cuda",
help="running device (default: cuda)")
    # Post-processing
parser.add_argument("--num_select", type=int, default=300,
help="number of topk to select")
    # COCO dataset
parser.add_argument("--anno_path", type=str,
required=True, help="coco annotation path")
parser.add_argument("--image_dir", type=str,
required=True, help="coco image dir")
parser.add_argument("--num_workers", type=int, default=4,
help="number of workers for dataloader")
args = parser.parse_args()
main(args)
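# Example invocation (script name and paths are illustrative, not taken from this repo):
#   python this_onnx_eval_script.py --onnx_path weights/ground.onnx \
#       -c groundingdino/config/GroundingDINO_SwinB_cfg.py \
#       --anno_path annotations/instances_val2017.json --image_dir val2017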
from groundingdino.util.inference import load_model, load_image, predict, annotate
import torch
import cv2
model = load_model("groundingdino/config/GroundingDINO_SwinT_OGC.pyy", "weights/groundingdino_swint_ogc.pth")
model = model.to('cuda:0')
print(torch.cuda.is_available())
print('DONE!')
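# The predict/annotate helpers imported above are never exercised; a minimal
# end-to-end sketch with them might look like this (the image path and the
# thresholds are illustrative assumptions, not taken from this repo):
IMAGE_PATH = "assets/demo.jpg"
TEXT_PROMPT = "car . person ."

image_source, image = load_image(IMAGE_PATH)
boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=0.35,
    text_threshold=0.25,
)
annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)
cv2.imwrite("annotated_demo.jpg", annotated_frame)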
name: dino
channels:
- pytorch
- nvidia
- conda-forge
- defaults
dependencies:
- addict=2.4.0=pyhd8ed1ab_2
- aiohttp=3.8.5=py39ha55989b_0
- aiosignal=1.3.1=pyhd8ed1ab_0
- asttokens=2.0.5=pyhd3eb1b0_0
- async-timeout=4.0.3=pyhd8ed1ab_0
- attrs=23.1.0=pyh71513ae_1
- aws-c-auth=0.7.0=h6f3c987_2
- aws-c-cal=0.6.0=h6ba3258_0
- aws-c-common=0.8.23=hcfcfb64_0
- aws-c-compression=0.2.17=h420beca_1
- aws-c-event-stream=0.3.1=had47b81_1
- aws-c-http=0.7.11=h72ba615_0
- aws-c-io=0.13.28=ha35c040_0
- aws-c-mqtt=0.8.14=h4941efa_2
- aws-c-s3=0.3.13=he04eaa7_2
- aws-c-sdkutils=0.1.11=h420beca_1
- aws-checksums=0.1.16=h420beca_1
- aws-crt-cpp=0.20.3=h247a981_4
- aws-sdk-cpp=1.10.57=h1a0519f_17
- backcall=0.2.0=pyhd3eb1b0_0
- blas=2.118=mkl
- blas-devel=3.9.0=18_win64_mkl
- brotli=1.0.9=hcfcfb64_9
- brotli-bin=1.0.9=hcfcfb64_9
- brotli-python=1.0.9=py39h99910a6_9
- bzip2=1.0.8=h8ffe710_4
- c-ares=1.19.1=hcfcfb64_0
- ca-certificates=2023.08.22=haa95532_0
- certifi=2023.7.22=py39haa95532_0
- charset-normalizer=3.2.0=pyhd8ed1ab_0
- click=8.1.7=win_pyh7428d3b_0
- colorama=0.4.6=pyhd8ed1ab_0
- comm=0.1.2=py39haa95532_0
- contourpy=1.1.1=py39h1f6ef14_1
- cuda-cccl=12.2.140=0
- cuda-cudart=11.8.89=0
- cuda-cudart-dev=11.8.89=0
- cuda-cupti=11.8.87=0
- cuda-libraries=11.8.0=0
- cuda-libraries-dev=11.8.0=0
- cuda-nvrtc=11.8.89=0
- cuda-nvrtc-dev=11.8.89=0
- cuda-nvtx=11.8.86=0
- cuda-profiler-api=12.2.140=0
- cuda-runtime=11.8.0=0
- cycler=0.11.0=pyhd8ed1ab_0
- cython=3.0.0=py39h2bbff1b_0
- dataclasses=0.8=pyhc8e2a94_3
- datasets=2.14.5=pyhd8ed1ab_0
- debugpy=1.6.7=py39hd77b12b_0
- decorator=5.1.1=pyhd3eb1b0_0
- dill=0.3.7=pyhd8ed1ab_0
- exceptiongroup=1.0.4=py39haa95532_0
- executing=0.8.3=pyhd3eb1b0_0
- filelock=3.12.4=pyhd8ed1ab_0
- fonttools=4.42.1=py39ha55989b_0
- freeglut=3.2.2=h63175ca_2
- freetype=2.12.1=hdaf720e_2
- frozenlist=1.4.0=py39ha55989b_1
- fsspec=2023.6.0=pyh1a96a4e_0
- gettext=0.21.1=h5728263_0
- glib=2.78.0=h12be248_0
- glib-tools=2.78.0=h12be248_0
- gst-plugins-base=1.22.6=h001b923_1
- gstreamer=1.22.6=hb4038d2_1
- huggingface_hub=0.17.3=pyhd8ed1ab_0
- icu=70.1=h0e60522_0
- idna=3.4=pyhd8ed1ab_0
- importlib-metadata=6.8.0=pyha770c72_0
- importlib-resources=6.1.0=pyhd8ed1ab_0
- importlib_metadata=6.8.0=hd8ed1ab_0
- importlib_resources=6.1.0=pyhd8ed1ab_0
- intel-openmp=2023.2.0=h57928b3_49503
- ipykernel=6.25.0=py39h9909e9c_0
- ipython=8.15.0=py39haa95532_0
- jasper=2.0.33=hc2e4405_1
- jedi=0.18.1=py39haa95532_1
- jinja2=3.1.2=pyhd8ed1ab_1
- joblib=1.3.2=pyhd8ed1ab_0
- jpeg=9e=hcfcfb64_3
- jupyter_client=8.1.0=py39haa95532_0
- jupyter_core=5.3.0=py39haa95532_0
- kiwisolver=1.4.5=py39h1f6ef14_1
- krb5=1.20.1=heb0366b_0
- lcms2=2.14=h90d422f_0
- lerc=4.0.0=h63175ca_0
- libabseil=20230125.3=cxx17_h63175ca_0
- libarrow=12.0.1=h12e5d06_5_cpu
- libblas=3.9.0=18_win64_mkl
- libbrotlicommon=1.0.9=hcfcfb64_9
- libbrotlidec=1.0.9=hcfcfb64_9
- libbrotlienc=1.0.9=hcfcfb64_9
- libcblas=3.9.0=18_win64_mkl
- libclang=15.0.7=default_h77d9078_3
- libclang13=15.0.7=default_h77d9078_3
- libcrc32c=1.1.2=h0e60522_0
- libcublas=11.11.3.6=0
- libcublas-dev=11.11.3.6=0
- libcufft=10.9.0.58=0
- libcufft-dev=10.9.0.58=0
- libcurand=10.3.3.141=0
- libcurand-dev=10.3.3.141=0
- libcurl=8.1.2=h68f0423_0
- libcusolver=11.4.1.48=0
- libcusolver-dev=11.4.1.48=0
- libcusparse=11.7.5.86=0
- libcusparse-dev=11.7.5.86=0
- libdeflate=1.14=hcfcfb64_0
- libevent=2.1.12=h3671451_1
- libffi=3.4.2=h8ffe710_5
- libglib=2.78.0=he8f3873_0
- libgoogle-cloud=2.12.0=h00b2bdc_1
- libgrpc=1.54.3=ha177ca7_0
- libhwloc=2.9.3=default_haede6df_1009
- libiconv=1.17=h8ffe710_0
- liblapack=3.9.0=18_win64_mkl
- liblapacke=3.9.0=18_win64_mkl
- libnpp=11.8.0.86=0
- libnpp-dev=11.8.0.86=0
- libnvjpeg=11.9.0.86=0
- libnvjpeg-dev=11.9.0.86=0
- libogg=1.3.4=h8ffe710_1
- libopencv=4.5.3=py39h488c12c_8
- libpng=1.6.39=h19919ed_0
- libprotobuf=3.21.12=h12be248_2
- libsodium=1.0.18=h62dcd97_0
- libsqlite=3.43.0=hcfcfb64_0
- libssh2=1.11.0=h7dfc565_0
- libthrift=0.18.1=h06f6336_2
- libtiff=4.4.0=hc4f729c_5
- libutf8proc=2.8.0=h82a8f57_0
- libuv=1.44.2=hcfcfb64_1
- libvorbis=1.3.7=h0e60522_0
- libwebp-base=1.3.2=hcfcfb64_0
- libxcb=1.13=hcd874cb_1004
- libxml2=2.11.5=hc3477c8_1
- libzlib=1.2.13=hcfcfb64_5
- lz4-c=1.9.4=hcfcfb64_0
- m2w64-gcc-libgfortran=5.3.0=6
- m2w64-gcc-libs=5.3.0=7
- m2w64-gcc-libs-core=5.3.0=7
- m2w64-gmp=6.1.0=2
- m2w64-libwinpthread-git=5.0.0.4634.697f757=2
- markupsafe=2.1.3=py39ha55989b_1
- matplotlib-base=3.8.0=py39hf19769e_1
- matplotlib-inline=0.1.6=py39haa95532_0
- mkl=2022.1.0=h6a75c08_874
- mkl-devel=2022.1.0=h57928b3_875
- mkl-include=2022.1.0=h6a75c08_874
- mpmath=1.3.0=pyhd8ed1ab_0
- msys2-conda-epoch=20160418=1
- multidict=6.0.4=py39ha55989b_0
- multiprocess=0.70.15=py39ha55989b_1
- munkres=1.1.4=pyh9f0ad1d_0
- nest-asyncio=1.5.6=py39haa95532_0
- networkx=3.1=pyhd8ed1ab_0
- numpy=1.26.0=py39hddb5d58_0
- opencv=4.5.3=py39hcbf5309_8
- openjpeg=2.5.0=hc9384bd_1
- openssl=3.1.3=hcfcfb64_0
- orc=1.9.0=hada7b9e_1
- packaging=23.1=pyhd8ed1ab_0
- pandas=2.1.1=py39h32e6231_0
- parso=0.8.3=pyhd3eb1b0_0
- pcre2=10.40=h17e33f8_0
- pickleshare=0.7.5=pyhd3eb1b0_1003
- pillow=9.2.0=py39h595c93f_3
- pip=23.2.1=pyhd8ed1ab_0
- platformdirs=3.10.0=pyhd8ed1ab_0
- prompt-toolkit=3.0.36=py39haa95532_0
- psutil=5.9.0=py39h2bbff1b_0
- pthread-stubs=0.4=hcd874cb_1001
- pthreads-win32=2.9.1=hfa6e2cd_3
- pure_eval=0.2.2=pyhd3eb1b0_0
- py-opencv=4.5.3=py39h00e5391_8
- pyarrow=12.0.1=py39hca4e8af_5_cpu
- pycocotools=2.0.6=py39hc266a54_1
- pygments=2.15.1=py39haa95532_1
- pyparsing=3.1.1=pyhd8ed1ab_0
- pysocks=1.7.1=pyh0701188_6
- python=3.9.18=h4de0772_0_cpython
- python-dateutil=2.8.2=pyhd8ed1ab_0
- python-tzdata=2023.3=pyhd8ed1ab_0
- python-xxhash=3.3.0=py39ha55989b_1
- python_abi=3.9=4_cp39
- pytorch=2.0.1=py3.9_cuda11.8_cudnn8_0
- pytorch-cuda=11.8=h24eeafa_5
- pytorch-mutex=1.0=cuda
- pytz=2023.3.post1=pyhd8ed1ab_0
- pywin32=305=py39h2bbff1b_0
- pyyaml=6.0.1=py39ha55989b_1
- pyzmq=25.1.0=py39hd77b12b_0
- qt-main=5.15.8=h720456b_6
- re2=2023.03.02=hd4eee63_0
- regex=2023.8.8=py39ha55989b_1
- requests=2.31.0=pyhd8ed1ab_0
- sacremoses=0.0.53=pyhd8ed1ab_0
- safetensors=0.3.3=py39hf21820d_1
- setuptools=68.2.2=pyhd8ed1ab_0
- six=1.16.0=pyh6c4a22f_0
- snappy=1.1.10=hfb803bf_0
- stack_data=0.2.0=pyhd3eb1b0_0
- sympy=1.12=pyh04b8f61_3
- tbb=2021.10.0=h91493d7_1
- timm=0.9.7=pyhd8ed1ab_0
- tk=8.6.13=hcfcfb64_0
- tokenizers=0.13.3=py39hca44cb7_0
- tomli=2.0.1=pyhd8ed1ab_0
- tornado=6.3.2=py39h2bbff1b_0
- tqdm=4.66.1=pyhd8ed1ab_0
- traitlets=5.7.1=py39haa95532_0
- transformers=4.33.2=pyhd8ed1ab_0
- typing-extensions=4.8.0=hd8ed1ab_0
- typing_extensions=4.8.0=pyha770c72_0
- tzdata=2023c=h71feb2d_0
- ucrt=10.0.22621.0=h57928b3_0
- unicodedata2=15.0.0=py39ha55989b_1
- urllib3=2.0.5=pyhd8ed1ab_0
- vc=14.3=h64f974e_17
- vc14_runtime=14.36.32532=hdcecf7f_17
- vs2015_runtime=14.36.32532=h05e6639_17
- wcwidth=0.2.5=pyhd3eb1b0_0
- wheel=0.41.2=pyhd8ed1ab_0
- win_inet_pton=1.1.0=pyhd8ed1ab_6
- xorg-libxau=1.0.11=hcd874cb_0
- xorg-libxdmcp=1.1.3=hcd874cb_0
- xxhash=0.8.2=hcfcfb64_0
- xz=5.2.6=h8d14728_0
- yaml=0.2.5=h8ffe710_2
- yapf=0.40.1=pyhd8ed1ab_0
- yarl=1.9.2=py39ha55989b_0
- zeromq=4.3.4=hd77b12b_0
- zipp=3.17.0=pyhd8ed1ab_0
- zlib=1.2.13=hcfcfb64_5
- zstd=1.5.5=h12be248_0
- pip:
- opencv-python==4.8.0.76
- supervision==0.6.0
- torchaudio==2.0.2
- torchvision==0.15.2
prefix: C:\Users\Makoto\miniconda3\envs\dino
import torch
import onnx
from onnxsim import simplify
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
config_file = './groundingdino/config/GroundingDINO_SwinB_cfg.py'
checkpoint_path = './weights/groundingdino_swinb_cogcoor.pth'
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
# modified config
args.use_checkpoint = False
args.use_transformer_ckpt = False
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
_ = model.eval()
return model
# Load the model
model = load_model(config_file, checkpoint_path, cpu_only=True)
# The prompt used at inference time, plus the matching masks
caption = "car ."
input_ids = model.tokenizer([caption], return_tensors="pt")["input_ids"]
# The tensors below are hand-built for the 4 tokens of "car ." ([CLS] car . [SEP])
position_ids = torch.tensor([[0, 0, 1, 0]])
token_type_ids = torch.tensor([[0, 0, 0, 0]])
attention_mask = torch.tensor([[True, True, True, True]])
text_token_mask = torch.tensor([[[True, False, False, False],
                                 [False, True, True, False],
                                 [False, True, True, False],
                                 [False, False, False, True]]])
# Fixed input resolution
img = torch.randn(1, 3, 800, 1200)
# Export the raw ONNX model
onnx_output_path = "weights/ground.onnx"
simplified_onnx_path = "weights/ground_simplified1.onnx"
torch.onnx.export(
model,
f=onnx_output_path,
args=(img, input_ids, attention_mask, position_ids, token_type_ids, text_token_mask),
input_names=["img", "input_ids", "attention_mask", "position_ids", "token_type_ids", "text_token_mask"],
output_names=["logits", "boxes"],
    dynamic_axes=None,  # static-shape export
opset_version=17,
    verbose=False  # verbose logging off; set to True for debugging
    # do_constant_folding=True  # constant folding, improves later simplification
)
print(f"ONNX模型已成功导出到: {onnx_output_path}")
# # Simplify the model with onnxsim
# print(f"Simplifying ONNX model: {onnx_output_path}")
# try:
#     # Load the raw ONNX model
#     onnx_model = onnx.load(onnx_output_path)
#     # Simplify (BN fusion and constant folding are skipped here)
#     simplified_model, check = simplify(
#         onnx_model,
#         skip_fuse_bn=True,
#         skip_constant_folding=True,
#         dynamic_input_shape=False,
#         input_shapes={  # pin the input shapes so simplification stays exact
#             "img": (1, 3, 800, 1200),
#             "input_ids": tuple(input_ids.shape),
#             "attention_mask": tuple(attention_mask.shape),
#             "position_ids": tuple(position_ids.shape),
#             "token_type_ids": tuple(token_type_ids.shape),
#             "text_token_mask": tuple(text_token_mask.shape)
#         }
#     )
#     # Validate the simplified model
#     assert check, "simplified ONNX model failed validation!"
#     # Save the simplified model
#     onnx.save(simplified_model, simplified_onnx_path)
#     print(f"ONNX simplification done, saved to: {simplified_onnx_path}")
# except Exception as e:
#     print(f"ONNX simplification failed: {e}")
#     print("Falling back to the raw, unsimplified ONNX model")
import torch
import onnx
from onnxsim import simplify
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
config_file = './groundingdino/config/GroundingDINO_SwinB_cfg.py'
checkpoint_path = './weights/groundingdino_swinb_cogcoor.pth'
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
# modified config
args.use_checkpoint = False
args.use_transformer_ckpt = False
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
_ = model.eval()
return model
# Load the model
model = load_model(config_file, checkpoint_path, cpu_only=True)
# ===================== Key change: batched export (see BATCH_SIZE) =====================
BATCH_SIZE = 8
# The prompt used at inference time, plus the matching masks
caption = "car ."
# 1. Expand the text input to BATCH_SIZE
# Repeat the caption BATCH_SIZE times to build the batched text input
input_ids = model.tokenizer([caption]*BATCH_SIZE, return_tensors="pt", padding="longest")["input_ids"]
seq_len = input_ids.shape[1]  # sequence length (adapts to different captions)
# 2. Expand position_ids to BATCH_SIZE
position_ids = torch.tensor([[0, 0, 1, 0]]).repeat(BATCH_SIZE, 1)
# Make position_ids match seq_len (truncate / zero-pad)
if position_ids.shape[1] < seq_len:
pad_len = seq_len - position_ids.shape[1]
position_ids = torch.cat([position_ids, torch.zeros(BATCH_SIZE, pad_len, dtype=torch.long)], dim=1)
else:
position_ids = position_ids[:, :seq_len]
# 3. Expand token_type_ids to BATCH_SIZE
token_type_ids = torch.tensor([[0, 0, 0, 0]]).repeat(BATCH_SIZE, 1)
if token_type_ids.shape[1] < seq_len:
pad_len = seq_len - token_type_ids.shape[1]
token_type_ids = torch.cat([token_type_ids, torch.zeros(BATCH_SIZE, pad_len, dtype=torch.long)], dim=1)
else:
token_type_ids = token_type_ids[:, :seq_len]
# 4. Expand attention_mask to BATCH_SIZE
attention_mask = torch.tensor([[True, True, True, True]]).repeat(BATCH_SIZE, 1)
if attention_mask.shape[1] < seq_len:
    pad_len = seq_len - attention_mask.shape[1]
    # pad positions must not be attended to, so pad with False (zeros)
    attention_mask = torch.cat([attention_mask, torch.zeros(BATCH_SIZE, pad_len, dtype=torch.bool)], dim=1)
else:
attention_mask = attention_mask[:, :seq_len]
# 5. Expand text_token_mask to BATCH_SIZE
text_token_mask = torch.tensor([[[True, False, False, False],
[False, True, True, False],
[False, True, True, False],
[False, False, False, True]]]).repeat(BATCH_SIZE, 1, 1)
# Resize the mask to match seq_len
if text_token_mask.shape[1] < seq_len:
pad_len = seq_len - text_token_mask.shape[1]
    # Pad both the rows and the columns of the mask
pad_row = torch.zeros(BATCH_SIZE, pad_len, text_token_mask.shape[2], dtype=torch.bool)
text_token_mask = torch.cat([text_token_mask, pad_row], dim=1)
pad_col = torch.zeros(BATCH_SIZE, seq_len, pad_len, dtype=torch.bool)
text_token_mask = torch.cat([text_token_mask, pad_col], dim=2)
else:
text_token_mask = text_token_mask[:, :seq_len, :seq_len]
# 6. Expand the image input to BATCH_SIZE: (1, 3, 800, 1200) -> (BATCH_SIZE, 3, 800, 1200)
img = torch.randn(BATCH_SIZE, 3, 800, 1200)
# Print the input shapes to verify the batch size
print("=" * 50)
print(f"Input shape check (batch_size={BATCH_SIZE}):")
print(f"img: {img.shape}")
print(f"input_ids: {input_ids.shape}")
print(f"attention_mask: {attention_mask.shape}")
print(f"position_ids: {position_ids.shape}")
print(f"token_type_ids: {token_type_ids.shape}")
print(f"text_token_mask: {text_token_mask.shape}")
print("=" * 50)
# The ONNX model can support dynamic inputs; comment this out when converting to a TensorRT engine
dynamic_axes = {
"input_ids": {0: "batch_size", 1: "seq_len"},
"attention_mask": {0: "batch_size", 1: "seq_len"},
"position_ids": {0: "batch_size", 1: "seq_len"},
"token_type_ids": {0: "batch_size", 1: "seq_len"},
"text_token_mask": {0: "batch_size", 1: "seq_len", 2: "seq_len"},
"img": {0: "batch_size", 2: "height", 3: "width"},
"logits": {0: "batch_size"},
"boxes": {0: "batch_size"}
}
# Export the raw ONNX model
onnx_output_path = "weights/ground_bs8.onnx"
simplified_onnx_path = "weights/ground_simplified_bs8.onnx"
torch.onnx.export(
model,
f=onnx_output_path,
args=(img, input_ids, attention_mask, position_ids, token_type_ids, text_token_mask),
input_names=["img", "input_ids", "attention_mask", "position_ids", "token_type_ids", "text_token_mask"],
output_names=["logits", "boxes"],
    # dynamic_axes=dynamic_axes,  # recommended to comment out when converting to a TensorRT engine
opset_version=17,
verbose=False,
    do_constant_folding=True  # constant folding, improves later simplification
)
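# Post-export smoke test (a sketch: assumes onnxruntime is installed; the feed
# reuses the batched tensors built above and runs on CPU for portability):
import onnxruntime as ort
sess = ort.InferenceSession(onnx_output_path, providers=["CPUExecutionProvider"])
outs = sess.run(None, {
    "img": img.numpy(),
    "input_ids": input_ids.numpy(),
    "attention_mask": attention_mask.numpy(),
    "position_ids": position_ids.numpy(),
    "token_type_ids": token_type_ids.numpy(),
    "text_token_mask": text_token_mask.numpy(),
})
print("logits:", outs[0].shape, "boxes:", outs[1].shape)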
# # Simplify the model with onnxsim
# print(f"\nSimplifying ONNX model: {onnx_output_path}")
# try:
#     # Load the raw ONNX model
#     onnx_model = onnx.load(onnx_output_path)
#     # Simplify the model
#     simplified_model, check = simplify(
#         onnx_model,
#         dynamic_input_shape=False,  # batch size and resolution are fixed, so False
#         input_shapes={  # pin the batched input shapes
#             "img": (BATCH_SIZE, 3, 800, 1200),
#             "input_ids": tuple(input_ids.shape),
#             "attention_mask": tuple(attention_mask.shape),
#             "position_ids": tuple(position_ids.shape),
#             "token_type_ids": tuple(token_type_ids.shape),
#             "text_token_mask": tuple(text_token_mask.shape)
#         }
#     )
#     # Validate the simplified model
#     assert check, "simplified ONNX model failed validation!"
#     # Save the simplified model
#     onnx.save(simplified_model, simplified_onnx_path)
#     print(f"ONNX simplification done, saved to: {simplified_onnx_path}")
# except Exception as e:
#     print(f"ONNX simplification failed: {e}")
#     print("Falling back to the raw, unsimplified ONNX model")
import torch
import onnx
from onnxsim import simplify
from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict
config_file = './groundingdino/config/GroundingDINO_SwinB_cfg.py'
checkpoint_path = './weights/groundingdino_swinb_cogcoor.pth'
def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
args = SLConfig.fromfile(model_config_path)
args.device = "cuda" if not cpu_only else "cpu"
# modified config
args.use_checkpoint = False
args.use_transformer_ckpt = False
model = build_model(args)
checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
model.load_state_dict(clean_state_dict(checkpoint["model"]), strict=False)
_ = model.eval()
return model
# Load the model
model = load_model(config_file, checkpoint_path, cpu_only=True)
# The prompt used at inference time, plus the matching masks
caption = "car ."
input_ids = model.tokenizer([caption], return_tensors="pt")["input_ids"]
position_ids = torch.tensor([[0, 0, 1, 0]])
token_type_ids = torch.tensor([[0, 0, 0, 0]])
attention_mask = torch.tensor([[True, True, True, True]])
text_token_mask = torch.tensor([[[True, False, False, False],
[False, True, True, False],
[False, True, True, False],
[False, False, False, True]]])
# Fixed input resolution
img = torch.randn(1, 3, 800, 1200)
# The ONNX model can support dynamic inputs; drop the dynamic_axes argument when converting to a TensorRT engine
dynamic_axes = {
"input_ids": {0: "batch_size", 1: "seq_len"},
"attention_mask": {0: "batch_size", 1: "seq_len"},
"position_ids": {0: "batch_size", 1: "seq_len"},
"token_type_ids": {0: "batch_size", 1: "seq_len"},
"text_token_mask": {0: "batch_size", 1: "seq_len", 2: "seq_len"},
"img": {0: "batch_size", 2: "height", 3: "width"},
"logits": {0: "batch_size"},
"boxes": {0: "batch_size"}
}
# Export the raw ONNX model
onnx_output_path = "weights/ground.onnx"
torch.onnx.export(
    model,
    f=onnx_output_path,
    args=(img, input_ids, attention_mask, position_ids, token_type_ids, text_token_mask),
    input_names=["img", "input_ids", "attention_mask", "position_ids", "token_type_ids", "text_token_mask"],
    output_names=["logits", "boxes"],
    dynamic_axes=dynamic_axes,  # comment out for a static-shape export (e.g. before engine conversion)
    opset_version=17,
    verbose=False,  # verbose logging off; set to True for debugging
    do_constant_folding=True  # constant folding, improves later simplification
)
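# Quick check that the dynamic axes actually landed in the exported graph
# (a sketch; dim_param is non-empty only for dynamic dimensions):
exported = onnx.load(onnx_output_path)
for inp in exported.graph.input:
    dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
    print(inp.name, dims)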
import onnx
from onnx import helper, TensorProto, numpy_helper
import numpy as np
def convert_fp16_manual(input_path, output_path, keep_io_types=True):
model = onnx.load(input_path)
graph = model.graph
fp32 = TensorProto.FLOAT
fp16 = TensorProto.FLOAT16
    # ========== 1. Collect every name -> type mapping ==========
type_map = {}
for init in graph.initializer:
type_map[init.name] = init.data_type
for inp in graph.input:
type_map[inp.name] = inp.type.tensor_type.elem_type
for out in graph.output:
type_map[out.name] = out.type.tensor_type.elem_type
    # ========== 2. Initializers: FP32 -> FP16 ==========
for i, init in enumerate(graph.initializer):
if init.data_type == fp32:
arr = numpy_helper.to_array(init)
            # Clamp inf / -inf / out-of-range values to the FP16 limit
arr = np.clip(arr, -65504, 65504)
arr = arr.astype(np.float16)
new_init = numpy_helper.from_array(arr, init.name)
graph.initializer[i].CopyFrom(new_init)
type_map[init.name] = fp16
    # ========== 3. Constant nodes: FP32 -> FP16 ==========
for node in graph.node:
if node.op_type != "Constant":
continue
for attr in node.attribute:
if attr.t.data_type == fp32:
arr = numpy_helper.to_array(attr.t)
arr = np.clip(arr, -65504, 65504).astype(np.float16)
attr.t.CopyFrom(numpy_helper.from_array(arr))
type_map[node.output[0]] = fp16
    # ========== 4. Walk the nodes and insert Casts ==========
new_nodes = []
cast_id = [0]
    # Ops that must keep FP32 semantics (their outputs are not converted)
fp32_ops = {"Shape", "NonMaxSuppression", "Range",
"TopK", "SequenceConstruct", "SequenceEmpty"}
for node in graph.node:
if node.op_type == "Constant":
new_nodes.append(node)
continue
        # These ops emit integers or indices; skip them
if node.op_type in fp32_ops:
new_nodes.append(node)
for o in node.output:
                type_map[o] = fp32  # mark as FP32 (actually int64 and the like)
continue
        # ---- Pick the target type from the first input with a known float type ----
target = None
for inp_name in node.input:
if inp_name and inp_name in type_map:
t = type_map[inp_name]
if t in (fp32, fp16):
target = t
break
        # Default target type is FP16
if target is None:
target = fp16
        # ---- Type-check every input ----
for idx, inp_name in enumerate(node.input):
if not inp_name or inp_name not in type_map:
continue
inp_type = type_map[inp_name]
            # Input is FP32 but the target is FP16 -> insert a Cast to FP16
if inp_type == fp32 and target == fp16:
cast_out = f"_cast_{cast_id[0]}"
cast_id[0] += 1
cast_node = helper.make_node(
"Cast", inputs=[inp_name], outputs=[cast_out], to=fp16
)
new_nodes.append(cast_node)
node.input[idx] = cast_out
type_map[cast_out] = fp16
            # Input is FP16 but the target is FP32 -> insert a Cast to FP32
elif inp_type == fp16 and target == fp32:
cast_out = f"_cast_{cast_id[0]}"
cast_id[0] += 1
cast_node = helper.make_node(
"Cast", inputs=[inp_name], outputs=[cast_out], to=fp32
)
new_nodes.append(cast_node)
node.input[idx] = cast_out
type_map[cast_out] = fp32
new_nodes.append(node)
        # ---- Update the output types ----
for o in node.output:
type_map[o] = target
    # ========== 5. Swap in the rewritten node list ==========
del graph.node[:]
graph.node.extend(new_nodes)
    # ========== 6. Fix the graph output type declarations ==========
    if keep_io_types:
        # Keep the original IO dtypes as FP32,
        # so each FP16 output needs a Cast back to FP32
for out in graph.output:
if out.name in type_map and type_map[out.name] == fp16:
cast_out = f"_cast_out_{out.name}"
cast_node = helper.make_node(
"Cast", inputs=[cast_out], outputs=[out.name], to=fp32
)
                # Rename the original output: find the node that produces it
                # and redirect that node's output to the new name
for node in graph.node:
for i, o in enumerate(node.output):
if o == out.name:
node.output[i] = cast_out
break
graph.node.append(cast_node)
type_map[out.name] = fp32
else:
        # Emit the outputs as FP16 as well
for out in graph.output:
if out.name in type_map:
out.type.tensor_type.elem_type = type_map[out.name]
    # ========== 7. Validate and save ==========
onnx.checker.check_model(model)
onnx.save(model, output_path)
print(f"✅ 转换完成 -> {output_path}")
print(f" 节点数: {len(graph.node)}")
print(f" Cast 插入数: {cast_id[0]}")
# ========== Run ==========
convert_fp16_manual(
"weights/ground.onnx",
"weights/ground_fp16.onnx",
keep_io_types=True,
)
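# Parity check between the FP32 and FP16 graphs (a sketch: random inputs only
# bound numerical drift, not task accuracy; the token ids are illustrative,
# in practice reuse model.tokenizer as in the export scripts above):
import onnxruntime as ort
feed = {
    "img": np.random.randn(1, 3, 800, 1200).astype(np.float32),
    "input_ids": np.array([[101, 2482, 1012, 102]], dtype=np.int64),  # assumed ids for "[CLS] car . [SEP]"
    "attention_mask": np.ones((1, 4), dtype=bool),
    "position_ids": np.array([[0, 0, 1, 0]], dtype=np.int64),
    "token_type_ids": np.zeros((1, 4), dtype=np.int64),
    "text_token_mask": np.ones((1, 4, 4), dtype=bool),
}
sess32 = ort.InferenceSession("weights/ground.onnx", providers=["CPUExecutionProvider"])
sess16 = ort.InferenceSession("weights/ground_fp16.onnx", providers=["CPUExecutionProvider"])
for ref, out in zip(sess32.run(None, feed), sess16.run(None, feed)):
    print("max abs diff:", np.abs(ref - out.astype(np.float32)).max())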
import onnx
from onnxconverter_common import float16
# 1. Load the model
model = onnx.load("weights/ground.onnx")
# 2. Convert to FP16
model_fp16 = float16.convert_float_to_float16(
    model,
    keep_io_types=True,
    # op_block_list=["Cast"]
)
# 3. Validate the model
onnx.checker.check_model(model_fp16)
# 4. Save
onnx.save(model_fp16, "weights/ground_fp16.onnx")
print("FP16 model saved!")
import onnx
from onnxruntime.transformers.float16 import convert_float_to_float16
# ===== 1. Paths =====
input_model = "weights/ground.onnx"
output_model = "weights/ground_fp16.onnx"
# ===== 2. Load =====
model = onnx.load(input_model)
# ===== 3. Convert =====
model_fp16 = convert_float_to_float16(
    model,
    keep_io_types=True,  # strongly recommended
)
# ===== 4. Save =====
onnx.save(model_fp16, output_model)
print("ONNX Runtime FP16 conversion done")
from onnxruntime.quantization import quantize_dynamic, QuantType
quantize_dynamic(
model_input="weights/ground.onnx",
model_output="weights/ground_int8.onnx",
weight_type=QuantType.QInt8,
)
print("int8 quantization done!")
import onnx
from onnx import TensorProto
# Load the FP16 model that failed the type check
model = onnx.load("weights/ground_fp16.onnx")
# Targeted fix: force the offending intermediate tensor's declared type to FP16
target_arg = "/backbone/backbone.0/Cast_output_0"
# Walk every tensor type declaration in the graph and patch the conflicting entry
for vi in model.graph.value_info:
    if vi.name == target_arg:
        vi.type.tensor_type.elem_type = TensorProto.FLOAT16
        print(f"Patched: {target_arg} -> FLOAT16")
# Re-validate and save the patched model
onnx.checker.check_model(model)
onnx.save(model, "weights/ground_fp16_fixed.onnx")
print("\nModel repair complete! Load: ground_fp16_fixed.onnx")