Commit 7bc21d37 authored by zk
Browse files

Update-MIGraphX-optimization-workflow

parent 3191f720
...@@ -202,18 +202,67 @@ python onnx_inference_deform_optim.py ...@@ -202,18 +202,67 @@ python onnx_inference_deform_optim.py
## 7\. migraphx推理 ## 7\. migraphx推理
1. 进入migraphx_infer文件夹 1. 环境准备(http://42.228.13.241:10068/wangwf/groundingdino)
* 下载 DTK-26.04-txpl-temp-0312-ubuntu20.04-x86_64(http://112.11.77.146:65182/jenkins/rocm/26.04/intel/ubuntu20.04/DTK-26.04-txpl-temp-0312-ubuntu20.04-x86_64.tar.gz),解压后替换掉原 /opt/dtk 目录。
```bash
tar -zxvf DTK-26.04-txpl-temp-0312-ubuntu20.04-x86_64.tar.gz -C /opt/
rm /opt/dtk # 删除原来的软链接
ln -s /opt/dtk-26.04-txpl-temp-0312 /opt/dtk # 创建新的软链接
```
* 替换hipdnn:
```bash
tar -zxvf package_resize.tar.gz
cd package_resize
cp -r install/lib/hipdnn_plugins /opt/dtk/lib/
cp -r install/lib/libhipdnn_backend.so /opt/dtk/lib/
rm -rf /opt/dtk/include/hipdnn
cp -r install/include/hipdnn /opt/dtk/include/hipdnn
rm -rf /opt/dtk/lib/cmake/hipdnn*
cp -r install/lib/cmake/* /opt/dtk/lib/cmake/
rm -f /opt/dtk/lib/hipdnn_plugins/engines/libmiopen_legacy_plugin.so
```
* 激活dtk
```bash
source /opt/dtk/env.sh
export LD_LIBRARY_PATH=/usr/local/lib/python3.10/dist-packages/torch/lib:${LD_LIBRARY_PATH}
```
* 安装migraphx
```bash ```bash
chmod +x ./migraphx-5.1.2+das.opt1.ab9210b.dtk2604-cp310-cp310-manylinux_2_35_x86_64.run
./migraphx-5.1.2+das.opt1.ab9210b.dtk2604-cp310-cp310-manylinux_2_35_x86_64.run
```
* 模型优化
```bash
cd weights
onnxsim ground.onnx ground_sim.onnx
cd migraphx_infer cd migraphx_infer
python modify_onnx_0601.py ../weights/ground_sim.onnx ../weights/ground_opt.onnx
``` ```
2. 运行转换onnx脚本
将简化后的onnx转换为要用migraphx推理的onnx(ground_sim.onnx->ground_opt.onnx)
2. 性能测试(编译加运行)
```bash ```bash
export MIGRAPHX_ENABLE_GRAPHAPI_REDUCTION=1
export MIGRAPHX_ENABLE_LAYERNORM_FUSION=1
migraphx-driver perf --onnx ground_opt.onnx --fp16 --output ground_opt.mxr
```
或者进入migraphx_infer文件夹,运行
```bash
cd migraphx_infer
bash migraphx_export.bash bash migraphx_export.bash
``` ```
3. 如果已经得到了mxr文件,直接测试 3. 如果已经得到了mxr文件,也可以直接测试
```bash ```bash
bash migraphx_perf.bash bash migraphx_perf.bash
``` ```
......
...@@ -45,12 +45,11 @@ text_token_mask = torch.tensor([[[True, False, False, False], ...@@ -45,12 +45,11 @@ text_token_mask = torch.tensor([[[True, False, False, False],
[False, True, True, False], [False, True, True, False],
[False, False, False, True]]]).to(device) [False, False, False, True]]]).to(device)
# img = torch.randn(1, 3, 800, 1200).to(device) img = torch.randn(1, 3, 800, 1200).to(device)
img = torch.randn(1, 3, 400, 600).to(device) # img = torch.randn(1, 3, 400, 600).to(device)
# 导出 ONNX # 导出 ONNX
# onnx_output_path = "../weights/ground_deform.onnx" onnx_output_path = "../weights/ground_deform.onnx"
onnx_output_path = "../weights_400x600/ground_deform.onnx"
torch.onnx.export( torch.onnx.export(
model, model,
......
...@@ -17,7 +17,7 @@ from PIL import Image ...@@ -17,7 +17,7 @@ from PIL import Image
""" """
so_options = ort.SessionOptions() so_options = ort.SessionOptions()
custom_op_lib_path = "../ort_plugin_fp16_C/build/libms_deform_attn_ort.so" custom_op_lib_path = "../ort_plugin/build/libms_deform_attn_ort.so"
so_options.register_custom_ops_library(custom_op_lib_path) so_options.register_custom_ops_library(custom_op_lib_path)
# 开启ort优化 # 开启ort优化
so_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL so_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
...@@ -195,7 +195,7 @@ def benchmark_performance( ...@@ -195,7 +195,7 @@ def benchmark_performance(
if __name__ == '__main__': if __name__ == '__main__':
# 配置参数 # 配置参数
model_path = '../weights/ground_deform_fp16_all.onnx' model_path = '../weights/ground_deform_sim_fp16.onnx'
""" """
../weights/ground_deform.onnx 普通版本 ../weights/ground_deform.onnx 普通版本
../weights/ground_deform_sim.onnx 简化版本 ../weights/ground_deform_sim.onnx 简化版本
......
...@@ -3,25 +3,25 @@ from onnxsim import simplify ...@@ -3,25 +3,25 @@ from onnxsim import simplify
from onnxconverter_common import float16 from onnxconverter_common import float16
onnx_model_path = "../weights/ground_deform.onnx" onnx_model_path = "../weights/ground_deform.onnx"
sim_model_path = "../weights_opt/ground_deform_opt.onnx" sim_model_path = "../weights/ground_deform_sim.onnx"
fp16_model_path = "../weights_opt/ground_deform_opt_fp16.onnx" fp16_model_path = "../weights/ground_deform_sim_fp16.onnx"
fp16_all_model_path = "../weights_opt/ground_deform_opt_fp16_all.onnx" # fp16_all_model_path = "../weights/ground_deform_opt_fp16_all.onnx"
custom_op_lib_path = "../ort_plugin_fp16/build/libms_deform_attn_ort.so" custom_op_lib_path = "../ort_plugin/build/libms_deform_attn_ort.so"
# # ========================================== # # ==========================================
# # 第一步:ONNX Simplify (附带自定义算子库) # # 第一步:ONNX Simplify (附带自定义算子库)
# # ========================================== # # ==========================================
# print("1️⃣ 正在进行 ONNX Simplify...") print("1️⃣ 正在进行 ONNX Simplify...")
# model = onnx.load(onnx_model_path) model = onnx.load(onnx_model_path)
# model_simp, check = simplify(model, custom_lib=custom_op_lib_path) model_simp, check = simplify(model, custom_lib=custom_op_lib_path)
# if check: if check:
# onnx.save(model_simp, sim_model_path) onnx.save(model_simp, sim_model_path)
# print(f"✅ Simplify 完成!已保存至 {sim_model_path}") print(f"✅ Simplify 完成!已保存至 {sim_model_path}")
# else: else:
# print("❌ Simplify 验证失败!") print("❌ Simplify 验证失败!")
# exit() exit()
...@@ -47,12 +47,12 @@ print(f"✅ FP16 转换完成(避开自定义算子)!已保存至 {fp16_model_ ...@@ -47,12 +47,12 @@ print(f"✅ FP16 转换完成(避开自定义算子)!已保存至 {fp16_model_
print("\n2️⃣ 正在进行纯 FP16 精度转换...") # print("\n2️⃣ 正在进行纯 FP16 精度转换...")
model_fp16_all = float16.convert_float_to_float16( # model_fp16_all = float16.convert_float_to_float16(
model_to_fp16, # model_to_fp16,
node_block_list=original_cast_nodes, # 保护所有原生的 Cast 节点 # node_block_list=original_cast_nodes, # 保护所有原生的 Cast 节点
keep_io_types=True # 保持整个模型的总输入/输出还是 FP32 # keep_io_types=True # 保持整个模型的总输入/输出还是 FP32
) # )
onnx.save(model_fp16_all, fp16_all_model_path) # onnx.save(model_fp16_all, fp16_all_model_path)
print(f"✅ 纯 FP16 转换完成!已保存至 {fp16_all_model_path}") # print(f"✅ 纯 FP16 转换完成!已保存至 {fp16_all_model_path}")
...@@ -82,25 +82,6 @@ at::Tensor ms_deform_attn_cuda_forward( ...@@ -82,25 +82,6 @@ at::Tensor ms_deform_attn_cuda_forward(
return output; return output;
} }
// at::Tensor ms_deform_attn_forward_wrapper(
// const at::Tensor &value,
// const at::Tensor &spatial_shapes,
// const at::Tensor &level_start_index,
// const at::Tensor &sampling_loc,
// const at::Tensor &attn_weight,
// int64_t im2col_step // ✅ 注意这里
// )
// {
// return groundingdino::ms_deform_attn_cuda_forward(
// value,
// spatial_shapes,
// level_start_index,
// sampling_loc,
// attn_weight,
// im2col_step
// );
// }
std::vector<at::Tensor> ms_deform_attn_cuda_backward( std::vector<at::Tensor> ms_deform_attn_cuda_backward(
const at::Tensor &value, const at::Tensor &value,
const at::Tensor &spatial_shapes, const at::Tensor &spatial_shapes,
...@@ -174,15 +155,3 @@ std::vector<at::Tensor> ms_deform_attn_cuda_backward( ...@@ -174,15 +155,3 @@ std::vector<at::Tensor> ms_deform_attn_cuda_backward(
} }
} // namespace groundingdino } // namespace groundingdino
\ No newline at end of file
// #include <torch/library.h>
// // 注册 schema
// TORCH_LIBRARY(my_ops, m) {
// m.def("ms_deform_attn(Tensor value, Tensor spatial_shapes, Tensor level_start_index, Tensor sampling_loc, Tensor attn_weight, int im2col_step) -> Tensor");
// }
// // CUDA实现
// TORCH_LIBRARY_IMPL(my_ops, CUDA, m) {
// m.impl("ms_deform_attn", groundingdino::ms_deform_attn_forward_wrapper);
// }
\ No newline at end of file
==================================================
Grounding DINO 性能测试报告
==================================================
测试时间: 2026-06-01 13:39:35
测试设备: GPU
GPU型号: K100_AI
预热次数: 5
测试次数: 10
平均推理时延: 186.29 ms
时延标准差: 0.69 ms
最大时延: 188.28 ms
最小时延: 185.77 ms
平均FPS: 5.37 帧/秒
单次推理时延(最后一次): 186.12 ms
images/out/result.jpg

1.35 MB | W: | H:

images/out/result.jpg

1.35 MB | W: | H:

images/out/result.jpg
images/out/result.jpg
images/out/result.jpg
images/out/result.jpg
  • 2-up
  • Swipe
  • Onion skin
CUDA_VISIBLE_DEVICES=1 python demo/infer_torch.py \ HIP_VISIBLE_DEVICES=4 python demo/infer_torch.py \
-c groundingdino/config/GroundingDINO_SwinB_cfg.py \ -c groundingdino/config/GroundingDINO_SwinB_cfg.py \
-p weights/groundingdino_swinb_cogcoor.pth \ -p weights/groundingdino_swinb_cogcoor.pth \
-i images/in/car_1.jpg \ -i images/in/car_1.jpg \
......
...@@ -3,9 +3,9 @@ MIGRAPHX_ENABLE_GRAPHAPI_REDUCTION=1 ...@@ -3,9 +3,9 @@ MIGRAPHX_ENABLE_GRAPHAPI_REDUCTION=1
MIGRAPHX_ENABLE_LAYERNORM_FUSION=1 MIGRAPHX_ENABLE_LAYERNORM_FUSION=1
migraphx-driver perf --onnx \ migraphx-driver perf --onnx \
../test0525/ground_opt_0509.onnx \ ../weights/ground_opt_0601.onnx \
--fp16 \ --fp16 \
--output \ --output \
../test0525/ground_opt_0515.mxr ../weights/ground_opt_0601.mxr
# ../weights/ground_opt_0430.mxr > migraphx_log.log 2>&1 # ../weights/ground_opt_0430.mxr > migraphx_log.log 2>&1
\ No newline at end of file
...@@ -275,8 +275,8 @@ def benchmark_performance( ...@@ -275,8 +275,8 @@ def benchmark_performance(
# ========================= # =========================
if __name__ == "__main__": if __name__ == "__main__":
model_path = "../weights/ground_opt_0430.onnx" model_path = "../weights/ground_opt_0601.onnx"
cache_path = "../weights/ground_opt_0515_1.mxr" cache_path = "../weights/ground_opt_0601.mxr"
img_path = "../images/in/car_1.jpg" img_path = "../images/in/car_1.jpg"
BOX_TRESHOLD = 0.35 BOX_TRESHOLD = 0.35
......
migraphx-driver perf --batch 1 \ migraphx-driver perf --batch 1 \
-n 10 \ -n 10 \
--fp16 \ --fp16 \
--migraphx ../weights/ground_opt_0515_1.mxr --migraphx ../weights/ground_opt_0601.mxr
\ No newline at end of file \ No newline at end of file
This diff is collapsed.
...@@ -188,7 +188,9 @@ def optimize_transpose_nodes(om: ONNXModifier): ...@@ -188,7 +188,9 @@ def optimize_transpose_nodes(om: ONNXModifier):
om.get_node("/transformer/enc_out_class_embed/Transpose").set_attribute("perm", [0, 2, 1]) om.get_node("/transformer/enc_out_class_embed/Transpose").set_attribute("perm", [0, 2, 1])
# modify /transformer/decoder/Reshape_* # modify /transformer/decoder/Reshape_*
om.set_initializer_value("_v_5525", np.array([1, 900, -1], np.int64)) dst_shape_name = om.get_node("/transformer/decoder/Reshape").inputs[1]
# om.set_initializer_value("_v_5525", np.array([1, 900, -1], np.int64))
om.set_initializer_value(dst_shape_name, np.array([1, 900, -1], np.int64))
# modify /transformer/decoder/layers.*/self_attn/Reshape_4 # modify /transformer/decoder/layers.*/self_attn/Reshape_4
# modify /transformer/decoder/layers.*/ca_text/Reshape_6 # modify /transformer/decoder/layers.*/ca_text/Reshape_6
...@@ -313,7 +315,8 @@ def optmize_sin_cos_block(om: ONNXModifier): ...@@ -313,7 +315,8 @@ def optmize_sin_cos_block(om: ONNXModifier):
om.infer_shape() om.infer_shape()
def fuse_one_attention(om: ONNXModifier, softmax_name: str, new_mask: bool = None, num_heads: int = 12): def fuse_one_attention(om: ONNXModifier, softmax_name: str, padding_mask: bool = None,
attn_mask: bool = None, num_heads: int = 12, block_type: str = "bert"):
softmax_node = om.get_node(softmax_name) softmax_node = om.get_node(softmax_name)
tmp_node = om.get_prev_nodes(softmax_node)[0] tmp_node = om.get_prev_nodes(softmax_node)[0]
assert tmp_node.op_type in ["MatMul", "Add"] assert tmp_node.op_type in ["MatMul", "Add"]
...@@ -326,16 +329,33 @@ def fuse_one_attention(om: ONNXModifier, softmax_name: str, new_mask: bool = Non ...@@ -326,16 +329,33 @@ def fuse_one_attention(om: ONNXModifier, softmax_name: str, new_mask: bool = Non
tmp_node = om.get_from_node(tmp_node.inputs[0]) tmp_node = om.get_from_node(tmp_node.inputs[0])
assert tmp_node.op_type == "MatMul" assert tmp_node.op_type == "MatMul"
mask = mask_node.inputs[1] mask = mask_node.inputs[1]
assert new_mask is not None if padding_mask is not None and attn_mask is not None:
raise ValueError("padding_mask and attn_mask cannot be provided at the same time")
if padding_mask is None and attn_mask is None:
raise ValueError("padding_mask or attn_mask must be provided")
tmp_node1 = om.get_from_node(tmp_node.inputs[0]) tmp_node1 = om.get_from_node(tmp_node.inputs[0])
if tmp_node1.op_type == "Mul": if tmp_node1.op_type == "Mul":
tmp_node1 = om.get_prev_nodes(tmp_node1)[0] tmp_node1 = om.get_prev_nodes(tmp_node1)[0]
tmp_node2 = om.get_from_node(tmp_node.inputs[1]) tmp_node2 = om.get_from_node(tmp_node.inputs[1])
if tmp_node2.op_type == "Mul":
tmp_node2 = om.get_prev_nodes(tmp_node2)[0]
assert tmp_node1.op_type == tmp_node2.op_type == "Transpose" assert tmp_node1.op_type == tmp_node2.op_type == "Transpose"
tmp_node1 = om.get_prev_nodes(tmp_node1)[0] tmp_node1 = om.get_prev_nodes(tmp_node1)[0]
tmp_node2 = om.get_prev_nodes(tmp_node2)[0] tmp_node2 = om.get_prev_nodes(tmp_node2)[0]
assert tmp_node1.op_type == tmp_node2.op_type == "Reshape" assert tmp_node1.op_type == tmp_node2.op_type == "Reshape"
if attn_mask is not None:
q, k = tmp_node1.outputs[0], tmp_node2.outputs[0]
q_dst_shape_value = om.get_initializer_value(tmp_node1.inputs[1])
if q_dst_shape_value.size == 3:
q_dst_shape_value_new = np.array([1, *q_dst_shape_value.tolist()], np.int64)
om.set_initializer_value(tmp_node1.inputs[1], q_dst_shape_value_new)
k_dst_shape_value = om.get_initializer_value(tmp_node2.inputs[1])
if k_dst_shape_value.size == 3:
k_dst_shape_value_new = np.array([1, *k_dst_shape_value.tolist()], np.int64)
om.set_initializer_value(tmp_node2.inputs[1], k_dst_shape_value_new)
else:
q, k = tmp_node1.inputs[0], tmp_node2.inputs[0] q, k = tmp_node1.inputs[0], tmp_node2.inputs[0]
tmp_node = om.get_next_nodes(softmax_node)[0] tmp_node = om.get_next_nodes(softmax_node)[0]
...@@ -345,6 +365,13 @@ def fuse_one_attention(om: ONNXModifier, softmax_name: str, new_mask: bool = Non ...@@ -345,6 +365,13 @@ def fuse_one_attention(om: ONNXModifier, softmax_name: str, new_mask: bool = Non
assert tmp_node3.op_type == "Transpose" assert tmp_node3.op_type == "Transpose"
tmp_node3 = om.get_prev_nodes(tmp_node3)[0] tmp_node3 = om.get_prev_nodes(tmp_node3)[0]
assert tmp_node3.op_type == "Reshape" assert tmp_node3.op_type == "Reshape"
if attn_mask is not None:
v = tmp_node3.outputs[0]
v_dst_shape_value = om.get_initializer_value(tmp_node3.inputs[1])
if v_dst_shape_value.size == 3:
v_dst_shape_value_new = np.array([1, *v_dst_shape_value.tolist()], np.int64)
om.set_initializer_value(tmp_node3.inputs[1], v_dst_shape_value_new)
else:
v = tmp_node3.inputs[0] v = tmp_node3.inputs[0]
else: else:
v_init = om.get_initializer(tmp_node.inputs[1]) v_init = om.get_initializer(tmp_node.inputs[1])
...@@ -359,54 +386,79 @@ def fuse_one_attention(om: ONNXModifier, softmax_name: str, new_mask: bool = Non ...@@ -359,54 +386,79 @@ def fuse_one_attention(om: ONNXModifier, softmax_name: str, new_mask: bool = Non
assert tmp_node.op_type == "Transpose" assert tmp_node.op_type == "Transpose"
tmp_node = om.get_next_nodes(tmp_node)[0] tmp_node = om.get_next_nodes(tmp_node)[0]
assert tmp_node.op_type == "Reshape" assert tmp_node.op_type == "Reshape"
# if softmax_name == "/transformer/encoder/text_layers.0/self_attn/Softmax":
# breakpoint()
if attn_mask is not None:
mha_next_node = tmp_node
mha_out_shape_value = om.get_initializer_value(tmp_node.inputs[1])
if mha_out_shape_value.size == 2:
mha_out_shape_value_new = np.array([1, -1, mha_out_shape_value[-1].item()], np.int64)
om.set_initializer_value(tmp_node.inputs[1], mha_out_shape_value_new)
else:
mha_next_node = om.get_next_nodes(tmp_node)[0] mha_next_node = om.get_next_nodes(tmp_node)[0]
if mha_next_node.op_type == "Gemm": assert mha_next_node.op_type in ["Gemm", "MatMul"]
gemm_next_node = om.get_next_nodes(mha_next_node)[0]
gemm_node = None
if om.get_next_nodes(tmp_node)[0].op_type == "Gemm":
gemm_node = om.get_next_nodes(tmp_node)[0]
gemm_next_node = om.get_next_nodes(gemm_node)[0]
assert gemm_next_node.op_type == "Reshape" assert gemm_next_node.op_type == "Reshape"
reshape_next_node = om.get_next_nodes(gemm_next_node)[0] reshape_next_node = om.get_next_nodes(gemm_next_node)[0]
assert reshape_next_node.op_type == "Add" assert reshape_next_node.op_type == "Add"
else:
assert mha_next_node.op_type == "MatMul"
name_prefix = '/'.join(softmax_name.split('/')[:-1]) name_prefix = '/'.join(softmax_name.split('/')[:-1])
mha_name = f"{name_prefix}/MultiHeadAttention" node_inputs = [q, k, v]
mha_node = om.create_node("MultiHeadAttention", if mask is None:
mha_type = "MultiHeadAttention"
else:
if padding_mask is not None:
mha_type = "MultiHeadAttention"
node_inputs.append(padding_mask)
elif attn_mask is not None:
mha_type = "MultiHeadAttentionWithAttnMask"
node_inputs.append(attn_mask)
else:
raise ValueError("padding_mask or attn_mask must be provided")
mha_name = f"{name_prefix}/{mha_type}"
mha_node = om.create_node(mha_type,
mha_name, mha_name,
[q, k, v] if mask is None else [q, k, v, new_mask], node_inputs,
[mha_name+'_output_0'], [mha_name+'_output_0'],
num_heads=num_heads, num_heads=num_heads,
domain="com.microsoft", domain="com.microsoft",
index=mha_next_node.index-1) index=mha_next_node.index-1)
mha_next_node.replace_input(mha_next_node.inputs[0], mha_node.outputs[0]) mha_next_node.replace_input(mha_next_node.inputs[0], mha_node.outputs[0])
if mha_next_node.op_type == "Gemm": if gemm_node is not None:
weights = om.get_initializer_value(mha_next_node.inputs[1]) weights = om.get_initializer_value(gemm_node.inputs[1])
transB = mha_next_node.attrs["transB"] transB = gemm_node.attrs["transB"]
assert transB == 1 assert transB == 1
weights = np.ascontiguousarray(weights.transpose(1, 0)) weights = np.ascontiguousarray(weights.transpose(1, 0))
om.set_initializer_value(mha_next_node.inputs[1], weights) om.set_initializer_value(gemm_node.inputs[1], weights)
new_matmul_name = mha_next_node.name.replace("Gemm", "MatMul(Gemm)") new_matmul_name = gemm_node.name.replace("Gemm", "MatMul(Gemm)")
new_matmul_node = om.create_node("MatMul", new_matmul_node = om.create_node("MatMul",
new_matmul_name, new_matmul_name,
[mha_node.outputs[0], mha_next_node.inputs[1]], [mha_next_node.outputs[0] if attn_mask is not None else mha_node.outputs[0],
gemm_node.inputs[1]],
[new_matmul_name + "_output_0"], [new_matmul_name + "_output_0"],
index=mha_next_node.index) index=gemm_node.index)
new_bias_name = mha_next_node.name.replace("Gemm", "Add(Gemm)") new_bias_name = gemm_node.name.replace("Gemm", "Add(Gemm)")
new_add_node = om.create_node("Add", new_add_node = om.create_node("Add",
new_bias_name, new_bias_name,
[new_matmul_node.outputs[0], mha_next_node.inputs[2]], [new_matmul_node.outputs[0], gemm_node.inputs[2]],
[new_bias_name + "_output_0"], [new_bias_name + "_output_0"],
index=new_matmul_node.index+1) index=new_matmul_node.index+1)
reshape_next_node.replace_input(gemm_next_node.outputs[0], new_add_node.outputs[0]) reshape_next_node.replace_input(gemm_next_node.outputs[0], new_add_node.outputs[0])
def optimize_normal_attention(om: ONNXModifier): def optimize_normal_attention(om: ONNXModifier):
def create_new_attention_mask(): def _create_new_padding_mask():
mask_next_node = om.get_to_nodes("attention_mask")[0] mask_next_node = om.get_to_nodes("attention_mask")[0]
cast_node = om.create_node("Cast", cast_node = om.create_node("Cast",
"Cast_for_attention_mask", "Cast_for_padding_mask",
["attention_mask"], ["attention_mask"],
["Cast_for_attention_mask_output_0"], ["Cast_for_padding_mask_output_0"],
# to=1, # float32 # to=1, # float32
to=6, # int32 to=6, # int32
index=mask_next_node.index) index=mask_next_node.index)
...@@ -419,17 +471,32 @@ def optimize_normal_attention(om: ONNXModifier): ...@@ -419,17 +471,32 @@ def optimize_normal_attention(om: ONNXModifier):
index=cast_node.index+1) index=cast_node.index+1)
return reducesum_node.outputs[0] return reducesum_node.outputs[0]
def _create_new_attn_mask(_num_heads: int):
cast_node = om.get_node("/bert/Cast")
tile_node = om.create_node("Tile",
f"Tile_for_attn_mask_{_num_heads}heads",
["/bert/Unsqueeze_output_0",
om.create_initializer(f"{_num_heads}heads_repeats",
np.array([1, _num_heads, 1, 1], np.int64)).name],
[f"Tile_for_attn_mask_{_num_heads}heads_output_0"],
index=cast_node.index)
return tile_node.outputs[0]
padding_mask = _create_new_padding_mask()
attn_mask1 = _create_new_attn_mask(12)
attn_mask2 = _create_new_attn_mask(4)
# bert # bert
# for i in range(12): for i in range(12):
# fuse_one_attention(om, f"/bert/encoder/layer.{i}/attention/self/Softmax", "text_token_mask", num_heads=12) fuse_one_attention(om, f"/bert/encoder/layer.{i}/attention/self/Softmax", attn_mask=attn_mask1, num_heads=12)
new_mask = create_new_attention_mask()
for i in range(6): for i in range(6):
# /transformer/encoder # /transformer/encoder
# fuse_one_attention(om, f"/transformer/encoder/text_layers.{i}/self_attn/Softmax", "text_token_mask", num_heads=4) fuse_one_attention(om, f"/transformer/encoder/text_layers.{i}/self_attn/Softmax", attn_mask=attn_mask2, num_heads=4)
# /transformer/decoder # /transformer/decoder
fuse_one_attention(om, f"/transformer/decoder/layers.{i}/self_attn/Softmax", new_mask, num_heads=8) fuse_one_attention(om, f"/transformer/decoder/layers.{i}/self_attn/Softmax", padding_mask=padding_mask, num_heads=8)
fuse_one_attention(om, f"/transformer/decoder/layers.{i}/ca_text/Softmax", new_mask, num_heads=8) fuse_one_attention(om, f"/transformer/decoder/layers.{i}/ca_text/Softmax", padding_mask=padding_mask, num_heads=8)
om.update_map() om.update_map()
...@@ -437,25 +504,32 @@ def optimize_normal_attention(om: ONNXModifier): ...@@ -437,25 +504,32 @@ def optimize_normal_attention(om: ONNXModifier):
def optimize_backbone_attention(om: ONNXModifier): def optimize_backbone_attention(om: ONNXModifier):
def get_original_mask(mask_name, name_prefix): def get_original_mask(mask_name, name_prefix):
mask_value = om.get_initializer_value(mask_name) mask_value = om.get_initializer_value(mask_name)
orig_mask = np.where(mask_value==0, 1, 0).astype(np.bool_) # mask_value = mask_value.astype(np.int64)
orig_mask_init = om.create_initializer(f"{name_prefix}/mask", orig_mask) assert mask_value.ndim == 5 and mask_value.shape[2] == 1
return orig_mask_init.name mask_value = mask_value.reshape(mask_value.shape[0] * mask_value.shape[1], mask_value.shape[3], mask_value.shape[4])
# = np.where(mask_value==0, 1, 0).astype(np.bool_)
def _fuse_one_attention(softmax_name: str): new_mask_init = om.create_initializer(f"{name_prefix}/mask", mask_value)
name_prefix = '/'.join(softmax_name.split('/')[:-1]) return new_mask_init.name
def _fuse_one_attention_with_bias(softmax_name: str):
name_prefix = '/'.join(softmax_name.split('/')[:-2])
softmax_node = om.get_node(softmax_name) softmax_node = om.get_node(softmax_name)
tmp_node = om.get_prev_nodes(softmax_node)[0] tmp_node = om.get_prev_nodes(softmax_node)[0]
pos_bias_init = None mask = None
if tmp_node.op_type == "Reshape": if tmp_node.op_type == "Reshape":
tmp_node = om.get_prev_nodes(tmp_node)[0] tmp_node = om.get_prev_nodes(tmp_node)[0]
assert tmp_node.op_type == "Add" assert tmp_node.op_type == "Add"
pos_bias_init = om.get_initializer(tmp_node.inputs[1]) mask = get_original_mask(tmp_node.inputs[1], name_prefix)
tmp_node = om.get_prev_nodes(tmp_node)[0] tmp_node = om.get_prev_nodes(tmp_node)[0]
assert tmp_node.op_type == "Reshape" assert tmp_node.op_type == "Reshape"
tmp_node = om.get_prev_nodes(tmp_node)[0] tmp_node = om.get_prev_nodes(tmp_node)[0]
assert tmp_node.op_type == "Add" assert tmp_node.op_type == "Add"
mask = get_original_mask(tmp_node.inputs[1], name_prefix) pos_bias_init_value = om.get_initializer_value(tmp_node.inputs[1])
assert pos_bias_init_value.shape[0] == 1
om.set_initializer_value(tmp_node.inputs[1], pos_bias_init_value.squeeze(axis=0))
pos_bias_init = tmp_node.inputs[1]
tmp_node = om.get_prev_nodes(tmp_node)[0] tmp_node = om.get_prev_nodes(tmp_node)[0]
assert tmp_node.op_type == "MatMul" assert tmp_node.op_type == "MatMul"
...@@ -486,26 +560,27 @@ def optimize_backbone_attention(om: ONNXModifier): ...@@ -486,26 +560,27 @@ def optimize_backbone_attention(om: ONNXModifier):
assert tmp_node.op_type == "Transpose" assert tmp_node.op_type == "Transpose"
tmp_node = om.get_next_nodes(tmp_node)[0] tmp_node = om.get_next_nodes(tmp_node)[0]
assert tmp_node.op_type == "Reshape" assert tmp_node.op_type == "Reshape"
mha_out = tmp_node.outputs[0] # mha_out = tmp_node.outputs[0]
mha_out = tmp_node.inputs[0]
old_dst_shape = om.get_initializer_value(reshape_node.inputs[1]) old_dst_shape = om.get_initializer_value(reshape_node.inputs[1])
b, s, _, h, d = old_dst_shape b, s, _, h, d = old_dst_shape
new_dst_shape = [b, s, _, h*d] # new_dst_shape = [b, s, _, h*d]
new_dst_shape_init = om.create_initializer(f"{name_prefix}/qkv_hidden_states_shape", # new_dst_shape_init = om.create_initializer(f"{name_prefix}/qkv_hidden_states_shape",
np.array(new_dst_shape, np.int64)) # np.array(new_dst_shape, np.int64))
reshape_node.set_input(1, new_dst_shape_init.name) # reshape_node.set_input(1, new_dst_shape_init.name)
for node in [q_gather_node, k_gather_node, v_gather_node]: for node in [q_gather_node, k_gather_node, v_gather_node]:
node.set_input(0, reshape_node.outputs[0]) node.set_input(0, reshape_node.outputs[0])
node.set_attribute("axis", 2) node.set_attribute("axis", 2)
mha_name = f"{name_prefix}/MultiHeadAttention" mha_name = f"{name_prefix}/MultiHeadAttentionWithAttnMask"
inputs = [q_gather_node.outputs[0], inputs = [q_gather_node.outputs[0],
k_gather_node.outputs[0], k_gather_node.outputs[0],
v_gather_node.outputs[0], v_gather_node.outputs[0],
mask] pos_bias_init]
if pos_bias_init is not None: if mask is not None:
inputs.append(pos_bias_init.name) inputs.append(mask)
mha_node = om.create_node("MultiHeadAttention", mha_node = om.create_node("MultiHeadAttentionWithAttnMask",
mha_name, mha_name,
inputs, inputs,
[mha_name+'_output_0'], [mha_name+'_output_0'],
...@@ -519,7 +594,7 @@ def optimize_backbone_attention(om: ONNXModifier): ...@@ -519,7 +594,7 @@ def optimize_backbone_attention(om: ONNXModifier):
for l in range(num_layers): for l in range(num_layers):
num_blocks = 18 if l == 2 else 2 num_blocks = 18 if l == 2 else 2
for b in range(num_blocks): for b in range(num_blocks):
_fuse_one_attention(f"/backbone/backbone.0/layers.{l}/blocks.{b}/attn/softmax/Softmax") _fuse_one_attention_with_bias(f"/backbone/backbone.0/layers.{l}/blocks.{b}/attn/softmax/Softmax")
def optimize_ms_deform_attn(om: ONNXModifier): def optimize_ms_deform_attn(om: ONNXModifier):
...@@ -614,7 +689,7 @@ def main(): ...@@ -614,7 +689,7 @@ def main():
input_onnx_path = sys.argv[1] input_onnx_path = sys.argv[1]
output_onnx_path = sys.argv[2] output_onnx_path = sys.argv[2]
# input_onnx_path = "ground_sim.onnx" # input_onnx_path = "ground_sim.onnx"
# output_onnx_path = "ground_sim_0520_new.onnx" # output_onnx_path = "ground_sim_0529.onnx"
om = ONNXModifier(input_onnx_path) om = ONNXModifier(input_onnx_path)
optimize_where_ndoes(om) # 1. 替换where节点 optimize_where_ndoes(om) # 1. 替换where节点
...@@ -622,9 +697,9 @@ def main(): ...@@ -622,9 +697,9 @@ def main():
optmize_sin_cos_block(om) # 3. 优化位置编码 optmize_sin_cos_block(om) # 3. 优化位置编码
om.add_opset_import("com.microsoft", 1) om.add_opset_import("com.microsoft", 1)
optimize_normal_attention(om) # 4. 融合bert、transformer中的mha optimize_normal_attention(om) # 4. 融合bert、transformer中的mha
# optimize_backbone_attention(om) # 5. 融合backbone中的注意力 optimize_backbone_attention(om) # 5. 融合backbone中的注意力
optimize_ms_deform_attn(om) # 6. 融合多尺度可变形注意力 optimize_ms_deform_attn(om) # 6. 融合多尺度可变形注意力
optimize_bidirect_attention(om) # 7. 优化双向注意力 # optimize_bidirect_attention(om) # 7. 优化双向注意力
om.save(output_onnx_path, save_as_external_data=False) om.save(output_onnx_path, save_as_external_data=False)
......
...@@ -240,7 +240,7 @@ def benchmark_performance( ...@@ -240,7 +240,7 @@ def benchmark_performance(
if __name__ == '__main__': if __name__ == '__main__':
# 配置参数 # 配置参数
model_path = 'weights_400x600/ground.onnx' model_path = 'weights/ground.onnx'
img_path = 'images/in/car_1.jpg' img_path = 'images/in/car_1.jpg'
TEXT_PROMPT = "car ." TEXT_PROMPT = "car ."
BOX_TRESHOLD = 0.35 BOX_TRESHOLD = 0.35
......
...@@ -38,7 +38,8 @@ install_torch() ...@@ -38,7 +38,8 @@ install_torch()
import torch import torch
from setuptools import find_packages, setup from setuptools import find_packages, setup
from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension # from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension
from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME, CppExtension, CUDAExtension
# groundingdino version info # groundingdino version info
version = "0.1.0" version = "0.1.0"
...@@ -82,7 +83,26 @@ def get_extensions(): ...@@ -82,7 +83,26 @@ def get_extensions():
extra_compile_args = {"cxx": []} extra_compile_args = {"cxx": []}
define_macros = [] define_macros = []
if CUDA_HOME is not None and (torch.cuda.is_available() or "TORCH_CUDA_ARCH_LIST" in os.environ): # if CUDA_HOME is not None and (torch.cuda.is_available() or "TORCH_CUDA_ARCH_LIST" in os.environ):
cuda_home = CUDA_HOME or os.environ.get("CUDA_HOME") or ROCM_HOME or os.environ.get("ROCM_HOME") or os.environ.get("HIP_HOME")
print("DEBUG imported CUDA_HOME:", CUDA_HOME)
print("DEBUG imported ROCM_HOME:", ROCM_HOME)
print("DEBUG env CUDA_HOME:", os.environ.get("CUDA_HOME"))
print("DEBUG env ROCM_HOME:", os.environ.get("ROCM_HOME"))
print("DEBUG env HIP_HOME:", os.environ.get("HIP_HOME"))
print("DEBUG cuda_home used:", cuda_home)
print("DEBUG torch.cuda.is_available:", torch.cuda.is_available())
print("DEBUG TORCH_CUDA_ARCH_LIST:", os.environ.get("TORCH_CUDA_ARCH_LIST"))
print("DEBUG FORCE_CUDA:", os.environ.get("FORCE_CUDA"))
if cuda_home is not None and (
torch.cuda.is_available()
or "TORCH_CUDA_ARCH_LIST" in os.environ
or os.getenv("FORCE_CUDA", "0") == "1"
):
print("Compiling with CUDA") print("Compiling with CUDA")
extension = CUDAExtension extension = CUDAExtension
sources += source_cuda sources += source_cuda
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment