# export_onnx_deform.py

import torch
import onnx
from onnxsim import simplify

from groundingdino.models import build_model
from groundingdino.util.slconfig import SLConfig
from groundingdino.util.utils import clean_state_dict

from torch.onnx import register_custom_op_symbolic
from groundingdino.models.GroundingDINO.ms_deform_attn import MultiScaleDeformableAttnFunction

def ms_deform_attn_symbolic(g, value, spatial_shapes, level_start_index, sampling_locations, attention_weights, im2col_step):
    """Emit a single ``custom::ms_deform_attn`` node for the ONNX graph.

    Args:
        g: Graph-building object; only its ``op`` method is used.
        value, spatial_shapes, level_start_index, sampling_locations,
        attention_weights: Forwarded, in this order, as the node's inputs.
        im2col_step: Forwarded as the ``im2col_step_i`` keyword, which the
            exporter records as an integer attribute named ``im2col_step``.

    Returns:
        Whatever ``g.op`` returns for the created node.
    """
    node_inputs = (
        value,
        spatial_shapes,
        level_start_index,
        sampling_locations,
        attention_weights,
    )
    return g.op("custom::ms_deform_attn", *node_inputs, im2col_step_i=im2col_step)

# Attach the symbolic function to the autograd Function class so that, during
# ONNX export, calls to it are emitted as one custom node instead of failing
# on an unknown Python op.
MultiScaleDeformableAttnFunction.symbolic = ms_deform_attn_symbolic

# Relative paths to the model config and the pretrained checkpoint consumed
# by load_model().
config_file = '../groundingdino/config/GroundingDINO_SwinB_cfg.py'
checkpoint_path = '../weights/groundingdino_swinb_cogcoor.pth'

def load_model(model_config_path, model_checkpoint_path, cpu_only=False):
    """Build the model described by a config file and load checkpoint weights.

    Args:
        model_config_path: Path of the config file read by
            ``SLConfig.fromfile``.
        model_checkpoint_path: Path of the checkpoint file; its ``"model"``
            entry is used as the state dict.
        cpu_only: When true, ``args.device`` is set to ``"cpu"``; otherwise
            it is set to ``"cuda"``.

    Returns:
        The built model, switched to eval mode.

    Raises:
        KeyError: If the checkpoint has no ``"model"`` entry.

    Warns:
        UserWarning: If the checkpoint keys do not match the model exactly.
            Loading is non-strict, so without this warning a mismatched
            checkpoint would silently leave parameters uninitialised.
    """
    import warnings

    args = SLConfig.fromfile(model_config_path)
    args.device = "cpu" if cpu_only else "cuda"
    # Presumably activation-checkpointing switches; turned off before export.
    args.use_checkpoint = False
    args.use_transformer_ckpt = False
    model = build_model(args)

    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # from a trusted source.
    checkpoint = torch.load(model_checkpoint_path, map_location="cpu")
    load_result = model.load_state_dict(
        clean_state_dict(checkpoint["model"]), strict=False
    )

    missing = list(load_result.missing_keys)
    unexpected = list(load_result.unexpected_keys)
    if missing or unexpected:
        # Show only the first few keys to keep the message readable.
        warnings.warn(
            f"Checkpoint {model_checkpoint_path!r} does not match the model "
            f"exactly: {len(missing)} missing key(s) {missing[:10]}, "
            f"{len(unexpected)} unexpected key(s) {unexpected[:10]}.",
            stacklevel=2,
        )

    model.eval()
    return model

# Load the model. It must run on CUDA; otherwise the custom operator is not
# triggered.
device = "cuda"
model = load_model(config_file, checkpoint_path, cpu_only=False).to(device)

# Dummy text inputs. The hand-written masks below are all sized for a
# 4-token sequence, so they assume the caption tokenizes to 4 tokens.
caption = "car ."
input_ids = model.tokenizer([caption], return_tensors="pt")["input_ids"].to(device)
position_ids = torch.tensor([[0, 0, 1, 0]], device=device)
token_type_ids = torch.tensor([[0, 0, 0, 0]], device=device)
attention_mask = torch.tensor([[True, True, True, True]], device=device)
text_token_mask = torch.tensor(
    [
        [
            [True, False, False, False],
            [False, True, True, False],
            [False, True, True, False],
            [False, False, False, True],
        ]
    ],
    device=device,
)

# Dummy image batch: 1 x 3 x 800 x 1200.
img = torch.randn(1, 3, 800, 1200).to(device)

# Export to ONNX.
onnx_output_path = "../weights/ground_deform.onnx"

torch.onnx.export(
    model,
    args=(img, input_ids, attention_mask, position_ids, token_type_ids, text_token_mask),
    f=onnx_output_path,
    input_names=["img", "input_ids", "attention_mask", "position_ids", "token_type_ids", "text_token_mask"],
    output_names=["logits", "boxes"],
    opset_version=17,
    # Declares version 1 of the "custom" domain used by the
    # custom::ms_deform_attn node. The former `export_custom_ops=True`
    # argument was dropped: torch.onnx.export has no such parameter.
    custom_opsets={"custom": 1},
    do_constant_folding=True,
)

print("✅ ONNX 导出成功！")