Initial

ca34d4d2 · yanjl1 · ca34d4d2 · ca34d4d2 · ca34d4d2 · ca34d4d2
Commit ca34d4d2 authored Jun 02, 2026 by yanjl1
20 changed files
--- a/python/conv_fusion/conv_bias_prelu_add.py
+++ b/python/conv_fusion/conv_bias_prelu_add.py
+import hipdnn
+import torch
+
+
+def build_conv_bias_prelu_add_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    torch_tensor_add,
+    padding,
+    stride,
+    dilation,
+    negative_slope,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph computing ``prelu(conv(x, w) + bias) + add``.
+
+    Graph tensors are created with ``graph.tensor_like`` from the given torch
+    tensors. ``padding``, ``stride`` and ``dilation`` are forwarded unchanged to
+    ``conv_fprop`` and ``negative_slope`` to ``prelu``. Graph I/O uses
+    ``hipdnn_data_type``; intermediate and compute data types are FLOAT.
+
+    Returns:
+        Tuple ``(graph, x, w, bias, add, y)``: the built graph followed by the
+        hipdnn tensors for the four inputs and for the output.
+    """
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="conv_bias_prelu_add",
+    )
+
+    # Graph-side tensors mirroring the torch inputs
+    x_in = graph.tensor_like(torch_tensor_x)
+    w_in = graph.tensor_like(torch_tensor_w)
+    bias_in = graph.tensor_like(torch_tensor_bias)
+    residual_in = graph.tensor_like(torch_tensor_add)
+
+    # conv -> bias add -> prelu -> residual add
+    conv_out = graph.conv_fprop(
+        image=x_in,
+        weight=w_in,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+    biased = graph.add(a=conv_out, b=bias_in, name="bias")
+    activated = graph.prelu(input=biased, negative_slope=negative_slope, name="prelu")
+    y_out = graph.add(a=activated, b=residual_in, name="add")
+
+    # Only the final tensor is marked as a graph output.
+    y_out.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (graph, x_in, w_in, bias_in, residual_in, y_out)
+
+
+if __name__ == "__main__":
+    # Example driver: builds the conv + bias + prelu + add graph and executes it once
+    # on random half-precision data.
+
+    # Input dimensions
+    n = 1  # Batch size
+    c = 16  # Number of input channels
+    h = 16  # Height
+    w = 16  # Width
+
+    # Filter dimensions
+    k = 16  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 1  # Height padding
+    pad_w = 1  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    # Activation parameters
+    negative_slope = 0.01  # Negative slope
+
+    # Spatial size of the convolution output. The residual "add" operand is added to
+    # the activation of the conv output, so it must be (n, k, out_h, out_w) rather than
+    # the input shape (n, c, h, w); the two only coincide for the values chosen above.
+    out_h = (h + 2 * pad_h - (dil_h * (r - 1) + 1)) // stride_h + 1
+    out_w = (w + 2 * pad_w - (dil_w * (s - 1) + 1)) // stride_w + 1
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_w = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_add = torch.rand(n, k, out_h, out_w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_y,
+    ) = build_conv_bias_prelu_add_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_bias,
+        torch_tensor_add,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        negative_slope,
+        hipdnn_data_type,
+    )
+
+    # Output buffer sized from the dimensions reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    # Maps each graph tensor to the device pointer of its backing torch tensor.
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_add: torch_tensor_add.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_bias_prelu_add graph execution complete.")
--- a/python/conv_fusion/conv_bias_relu.py
+++ b/python/conv_fusion/conv_bias_relu.py
+import hipdnn
+import torch
+
+
+def build_conv_bias_relu_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph computing ``relu(conv(x, w) + bias)``.
+
+    Graph tensors are created with ``graph.tensor_like`` from the given torch
+    tensors; ``padding``, ``stride`` and ``dilation`` are forwarded unchanged to
+    ``conv_fprop``. Graph I/O uses ``hipdnn_data_type``; intermediate and
+    compute data types are FLOAT.
+
+    Returns:
+        Tuple ``(graph, x, w, bias, y)``: the built graph followed by the
+        hipdnn tensors for the three inputs and for the output.
+    """
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="conv_bias_relu",
+    )
+
+    # Graph-side tensors mirroring the torch inputs
+    x_in = graph.tensor_like(torch_tensor_x)
+    w_in = graph.tensor_like(torch_tensor_w)
+    bias_in = graph.tensor_like(torch_tensor_bias)
+
+    # conv -> bias add -> relu
+    conv_out = graph.conv_fprop(
+        image=x_in,
+        weight=w_in,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+    biased = graph.add(a=conv_out, b=bias_in, name="bias")
+    y_out = graph.relu(input=biased, name="relu")
+
+    # Only the final tensor is marked as a graph output.
+    y_out.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (graph, x_in, w_in, bias_in, y_out)
+
+
+if __name__ == "__main__":
+    # Example driver: builds the conv + bias + relu graph and executes it once on
+    # random half-precision data.
+
+    # Input is (n, c, h, w): batch size, input channels, height, width.
+    n, c, h, w = 1, 16, 16, 16
+    # Filter is (k, c, r, s): output channels, input channels, filter height, filter width.
+    k, r, s = 16, 3, 3
+
+    # Convolution parameters, each given as [height, width].
+    padding = [1, 1]
+    stride = [1, 1]
+    dilation = [1, 1]
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    def _rand_channels_last(*shape):
+        # Random tensor on "cuda", converted to the channels_last memory format.
+        return torch.rand(*shape, dtype=torch_data_type, device="cuda").to(
+            memory_format=torch.channels_last
+        )
+
+    torch_tensor_x = _rand_channels_last(n, c, h, w)
+    torch_tensor_w = _rand_channels_last(k, c, r, s)
+    torch_tensor_bias = _rand_channels_last(1, k, 1, 1)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    built = build_conv_bias_relu_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_bias,
+        padding,
+        stride,
+        dilation,
+        hipdnn_data_type,
+    )
+    graph, hipdnn_tensor_x, hipdnn_tensor_w, hipdnn_tensor_bias, hipdnn_tensor_y = built
+
+    # Output buffer sized from the dimensions reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+
+    # Maps each graph tensor to the device pointer of its backing torch tensor.
+    variant_pack = {
+        graph_tensor: torch_tensor.data_ptr()
+        for graph_tensor, torch_tensor in (
+            (hipdnn_tensor_x, torch_tensor_x),
+            (hipdnn_tensor_w, torch_tensor_w),
+            (hipdnn_tensor_bias, torch_tensor_bias),
+            (hipdnn_tensor_y, torch_tensor_y),
+        )
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_bias_relu graph execution complete.")
--- a/python/conv_fusion/conv_bias_swish.py
+++ b/python/conv_fusion/conv_bias_swish.py
+import hipdnn
+import torch
+
+
+def build_conv_bias_swish_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph computing ``swish(conv(x, w) + bias)``.
+
+    Graph tensors are created with ``graph.tensor_like`` from the given torch
+    tensors; ``padding``, ``stride`` and ``dilation`` are forwarded unchanged to
+    ``conv_fprop``. Graph I/O uses ``hipdnn_data_type``; intermediate and
+    compute data types are FLOAT.
+
+    Returns:
+        Tuple ``(graph, x, w, bias, y)``: the built graph followed by the
+        hipdnn tensors for the three inputs and for the output.
+    """
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="conv_bias_swish",
+    )
+
+    # Graph-side tensors mirroring the torch inputs
+    x_in = graph.tensor_like(torch_tensor_x)
+    w_in = graph.tensor_like(torch_tensor_w)
+    bias_in = graph.tensor_like(torch_tensor_bias)
+
+    # conv -> bias add -> swish
+    conv_out = graph.conv_fprop(
+        image=x_in,
+        weight=w_in,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+    biased = graph.add(a=conv_out, b=bias_in, name="bias")
+    y_out = graph.swish(input=biased, name="swish")
+
+    # Only the final tensor is marked as a graph output.
+    y_out.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (graph, x_in, w_in, bias_in, y_out)
+
+
+if __name__ == "__main__":
+    # Example driver: builds the conv + bias + swish graph and executes it once on
+    # random half-precision data.
+
+    # Input is (n, c, h, w): batch size, input channels, height, width.
+    n, c, h, w = 1, 16, 16, 16
+    # Filter is (k, c, r, s): output channels, input channels, filter height, filter width.
+    k, r, s = 16, 3, 3
+
+    # Convolution parameters, each given as [height, width].
+    padding = [1, 1]
+    stride = [1, 1]
+    dilation = [1, 1]
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    def _rand_channels_last(*shape):
+        # Random tensor on "cuda", converted to the channels_last memory format.
+        return torch.rand(*shape, dtype=torch_data_type, device="cuda").to(
+            memory_format=torch.channels_last
+        )
+
+    torch_tensor_x = _rand_channels_last(n, c, h, w)
+    torch_tensor_w = _rand_channels_last(k, c, r, s)
+    torch_tensor_bias = _rand_channels_last(1, k, 1, 1)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    built = build_conv_bias_swish_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_bias,
+        padding,
+        stride,
+        dilation,
+        hipdnn_data_type,
+    )
+    graph, hipdnn_tensor_x, hipdnn_tensor_w, hipdnn_tensor_bias, hipdnn_tensor_y = built
+
+    # Output buffer sized from the dimensions reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+
+    # Maps each graph tensor to the device pointer of its backing torch tensor.
+    variant_pack = {
+        graph_tensor: torch_tensor.data_ptr()
+        for graph_tensor, torch_tensor in (
+            (hipdnn_tensor_x, torch_tensor_x),
+            (hipdnn_tensor_w, torch_tensor_w),
+            (hipdnn_tensor_bias, torch_tensor_bias),
+            (hipdnn_tensor_y, torch_tensor_y),
+        )
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_bias_swish graph execution complete.")
--- a/python/conv_fusion/conv_bias_swish_add.py
+++ b/python/conv_fusion/conv_bias_swish_add.py
+import hipdnn
+import torch
+
+
+def build_conv_bias_swish_add_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    torch_tensor_add,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph computing ``swish(conv(x, w) + bias) + add``.
+
+    Graph tensors are created with ``graph.tensor_like`` from the given torch
+    tensors; ``padding``, ``stride`` and ``dilation`` are forwarded unchanged to
+    ``conv_fprop``. Graph I/O uses ``hipdnn_data_type``; intermediate and
+    compute data types are FLOAT.
+
+    Returns:
+        Tuple ``(graph, x, w, bias, add, y)``: the built graph followed by the
+        hipdnn tensors for the four inputs and for the output.
+    """
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="conv_bias_swish_add",
+    )
+
+    # Graph-side tensors mirroring the torch inputs
+    x_in = graph.tensor_like(torch_tensor_x)
+    w_in = graph.tensor_like(torch_tensor_w)
+    bias_in = graph.tensor_like(torch_tensor_bias)
+    residual_in = graph.tensor_like(torch_tensor_add)
+
+    # conv -> bias add -> swish -> residual add
+    conv_out = graph.conv_fprop(
+        image=x_in,
+        weight=w_in,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+    biased = graph.add(a=conv_out, b=bias_in, name="bias")
+    activated = graph.swish(input=biased, name="swish")
+    y_out = graph.add(a=activated, b=residual_in, name="add")
+
+    # Only the final tensor is marked as a graph output.
+    y_out.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (graph, x_in, w_in, bias_in, residual_in, y_out)
+
+
+if __name__ == "__main__":
+    # Example driver: builds the conv + bias + swish + add graph and executes it once
+    # on random half-precision data.
+
+    # Input dimensions
+    n = 1  # Batch size
+    c = 16  # Number of input channels
+    h = 16  # Height
+    w = 16  # Width
+
+    # Filter dimensions
+    k = 16  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 1  # Height padding
+    pad_w = 1  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    # Spatial size of the convolution output. The residual "add" operand is added to
+    # the activation of the conv output, so it must be (n, k, out_h, out_w) rather than
+    # the input shape (n, c, h, w); the two only coincide for the values chosen above.
+    out_h = (h + 2 * pad_h - (dil_h * (r - 1) + 1)) // stride_h + 1
+    out_w = (w + 2 * pad_w - (dil_w * (s - 1) + 1)) // stride_w + 1
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_w = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_add = torch.rand(n, k, out_h, out_w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_y,
+    ) = build_conv_bias_swish_add_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_bias,
+        torch_tensor_add,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+
+    # Output buffer sized from the dimensions reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    # Maps each graph tensor to the device pointer of its backing torch tensor.
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_add: torch_tensor_add.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_bias_swish_add graph execution complete.")
--- a/python/conv_fusion/convbwd_bias_relu.py
+++ b/python/conv_fusion/convbwd_bias_relu.py
+import hipdnn
+import torch
+
+
+def build_convBwd_bias_relu_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    padding,
+    stride,
+    dilation,
+    output_padding,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph computing ``relu(conv_dgrad(x, w) + bias)``.
+
+    Graph tensors are created with ``graph.tensor_like`` from the given torch
+    tensors. ``torch_tensor_x`` feeds the ``loss`` operand and ``torch_tensor_w``
+    the ``filter`` operand of ``conv_dgrad``; ``padding``, ``stride``,
+    ``dilation`` and ``output_padding`` are forwarded unchanged. Graph I/O uses
+    ``hipdnn_data_type``; intermediate and compute data types are FLOAT.
+
+    Returns:
+        Tuple ``(graph, x, w, bias, y)``: the built graph followed by the
+        hipdnn tensors for the three inputs and for the output.
+    """
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="convBwd_bias_relu",
+    )
+
+    # Graph-side tensors mirroring the torch inputs
+    x_in = graph.tensor_like(torch_tensor_x)
+    w_in = graph.tensor_like(torch_tensor_w)
+    bias_in = graph.tensor_like(torch_tensor_bias)
+
+    # conv_dgrad -> bias add -> relu
+    dgrad_out = graph.conv_dgrad(
+        loss=x_in,
+        filter=w_in,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        output_padding=output_padding,
+        name="conv2d",
+    )
+    biased = graph.add(a=dgrad_out, b=bias_in, name="bias")
+    y_out = graph.relu(input=biased, name="relu")
+
+    # Only the final tensor is marked as a graph output.
+    y_out.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (graph, x_in, w_in, bias_in, y_out)
+
+
+if __name__ == "__main__":
+    # Example driver: builds the conv_dgrad + bias + relu graph and executes it once
+    # on random half-precision data.
+
+    # Input is (n, c, h, w): batch size, input channels, height, width.
+    n, c, h, w = 1, 32, 270, 480
+    # Filter is (k, c, r, s): output channels, input channels, filter height, filter width.
+    k, r, s = 32, 3, 3
+
+    # Convolution parameters, each given as [height, width].
+    padding = [0, 0]
+    stride = [2, 2]
+    dilation = [1, 1]
+    output_padding = [1, 1]
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    def _rand_channels_last(*shape):
+        # Random tensor on "cuda", converted to the channels_last memory format.
+        return torch.rand(*shape, dtype=torch_data_type, device="cuda").to(
+            memory_format=torch.channels_last
+        )
+
+    torch_tensor_x = _rand_channels_last(n, c, h, w)
+    torch_tensor_w = _rand_channels_last(k, c, r, s)
+    torch_tensor_bias = _rand_channels_last(1, k, 1, 1)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    built = build_convBwd_bias_relu_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_bias,
+        padding,
+        stride,
+        dilation,
+        output_padding,
+        hipdnn_data_type,
+    )
+    graph, hipdnn_tensor_x, hipdnn_tensor_w, hipdnn_tensor_bias, hipdnn_tensor_y = built
+
+    # Output buffer sized from the dimensions reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+
+    # Maps each graph tensor to the device pointer of its backing torch tensor.
+    variant_pack = {
+        graph_tensor: torch_tensor.data_ptr()
+        for graph_tensor, torch_tensor in (
+            (hipdnn_tensor_x, torch_tensor_x),
+            (hipdnn_tensor_w, torch_tensor_w),
+            (hipdnn_tensor_bias, torch_tensor_bias),
+            (hipdnn_tensor_y, torch_tensor_y),
+        )
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("convBwd_bias_relu graph execution complete.")
--- a/python/conv_fusion/convint8_bias.py
+++ b/python/conv_fusion/convint8_bias.py
+import hipdnn
+import torch
+
+
+def build_convint8_bias_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_zero_point_dq,
+    torch_tensor_scale_dq,
+    torch_tensor_bias,
+    torch_tensor_zero_point_q,
+    torch_tensor_scale_q,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph: conv -> dequantize -> bias add -> quantize.
+
+    The node chain is ``conv_fprop``, ``sub`` (zero_point_dq), ``mul``
+    (scale_dq), ``add`` (bias), ``div`` (scale_q), ``add`` (zero_point_q).
+    The zero-point and scale graph tensors are shaped like the corresponding
+    torch tensors but receive fixed values through ``set_value`` (0.0 for the
+    zero points, 1.0 for the scales). Graph I/O and intermediate data types
+    are ``hipdnn_data_type``; the compute data type is FLOAT.
+
+    Returns:
+        Tuple ``(graph, x, w, bias, output)``: the built graph followed by the
+        hipdnn tensors for input, filter, bias and output.
+    """
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn_data_type,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="convint8_bias",
+    )
+
+    # Conv input and filter use a vector count of 32 on dimension 1
+    # (NCHWc32 layout, per the original author).
+    x_in = graph.tensor_like(torch_tensor_x).set_vector_count_and_dimension(32, 1)
+    w_in = graph.tensor_like(torch_tensor_w).set_vector_count_and_dimension(32, 1)
+
+    conv_out = graph.conv_fprop(
+        image=x_in,
+        weight=w_in,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv_fprop_node",
+    )
+
+    # Dequantize, step 1: subtract zero_point_dq
+    zero_point_dq = graph.tensor_like(torch_tensor_zero_point_dq)
+    zero_point_dq.set_value(0.0)
+    deq_shifted = graph.sub(a=conv_out, b=zero_point_dq, name="conv_deq_sub_node")
+
+    # Dequantize, step 2: multiply by scale_dq
+    scale_dq = graph.tensor_like(torch_tensor_scale_dq)
+    scale_dq.set_value(1.0)
+    deq_scaled = graph.mul(a=deq_shifted, b=scale_dq, name="conv_deq_mul_node")
+
+    # Bias add
+    bias_in = graph.tensor_like(torch_tensor_bias)
+    biased = graph.add(a=deq_scaled, b=bias_in, name="bias_node")
+
+    # Quantize, step 1: divide by scale_q
+    scale_q = graph.tensor_like(torch_tensor_scale_q)
+    scale_q.set_value(1.0)
+    q_scaled = graph.div(a=biased, b=scale_q, name="quantize_div_node")
+
+    # Quantize, step 2: add zero_point_q
+    zero_point_q = graph.tensor_like(torch_tensor_zero_point_q)
+    zero_point_q.set_value(0.0)
+    y_out = graph.add(a=q_scaled, b=zero_point_q, name="quantize_add_node")
+
+    # The output is marked as a graph output and given the same vector settings as the inputs.
+    y_out.set_output(True).set_vector_count_and_dimension(32, 1)
+
+    graph.build(hipdnn_handle)
+
+    return (graph, x_in, w_in, bias_in, y_out)
+
+
+if __name__ == "__main__":
+    # Example driver: builds the int8 conv + bias graph and executes it once on
+    # random int8 data.
+
+    # Input dimensions
+    n = 2  # Batch size
+    c = 32  # Number of input channels
+    h = 16  # Height
+    w = 8  # Width
+
+    # Filter dimensions
+    k = 128  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 0  # Height padding
+    pad_w = 0  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    hipdnn_data_type = hipdnn.data_type.INT8
+    torch_data_type = torch.int8
+    bias_data_type = torch.float32
+    quantize_data_type = torch.float32
+
+    torch_tensor_x = torch.randint(
+        low=-128,
+        high=128,
+        size=(n, c, h, w),
+        dtype=torch_data_type,
+        device="cuda",
+    )
+    torch_tensor_w = torch.randint(
+        low=-128, high=128, size=(k, c, r, s), dtype=torch_data_type, device="cuda"
+    )
+
+    # Quantization parameters now use quantize_data_type explicitly (it was previously
+    # assigned but unused). These tensors are only handed to the graph builder; they are
+    # not part of the variant pack below.
+    torch_tensor_zero_point_dq = torch.rand(1, 1, 1, 1, dtype=quantize_data_type, device="cuda")
+    torch_tensor_scale_dq = torch.rand(1, 1, 1, 1, dtype=quantize_data_type, device="cuda")
+
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=bias_data_type, device="cuda")
+
+    torch_tensor_zero_point_q = torch.rand(1, 1, 1, 1, dtype=quantize_data_type, device="cuda")
+    torch_tensor_scale_q = torch.rand(1, 1, 1, 1, dtype=quantize_data_type, device="cuda")
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_y,
+    ) = build_convint8_bias_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_zero_point_dq,
+        torch_tensor_scale_dq,
+        torch_tensor_bias,
+        torch_tensor_zero_point_q,
+        torch_tensor_scale_q,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+
+    # Output buffer sized from the dimensions reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        device="cuda",
+    )
+    # Maps each graph tensor to the device pointer of its backing torch tensor.
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("convint8_bias graph execution complete.")
--- a/python/conv_fusion/convint8_bias_add.py
+++ b/python/conv_fusion/convint8_bias_add.py
+import hipdnn
+import torch
+
+
+def build_convint8_bias_add_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_zero_point_dq,
+    torch_tensor_scale_dq,
+    torch_tensor_bias,
+    torch_tensor_add,
+    torch_tensor_zero_point_dq_add,
+    torch_tensor_scale_dq_add,
+    torch_tensor_zero_point_q,
+    torch_tensor_scale_q,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph: conv -> dequantize -> bias add -> residual add -> quantize.
+
+    The convolution branch is ``conv_fprop``, ``sub`` (zero_point_dq), ``mul``
+    (scale_dq), ``add`` (bias). The residual branch dequantizes ``add`` with its
+    own ``sub`` (zero_point_dq_add) and ``mul`` (scale_dq_add). Both branches are
+    summed, then quantized with ``div`` (scale_q) and ``add`` (zero_point_q).
+    The zero-point and scale graph tensors are shaped like the corresponding
+    torch tensors but receive fixed values through ``set_value`` (0.0 for the
+    zero points, 1.0 for the scales). Graph I/O and intermediate data types
+    are ``hipdnn_data_type``; the compute data type is FLOAT.
+
+    Returns:
+        Tuple ``(graph, x, w, bias, add, output)``: the built graph followed by
+        the hipdnn tensors for input, filter, bias, residual and output.
+    """
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn_data_type,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="convint8_bias_add",
+    )
+
+    # Create hipdnn conv input and filter tensor with NCHWc32 layout
+    hipdnn_tensor_x = graph.tensor_like(torch_tensor_x).set_vector_count_and_dimension(32, 1)
+    hipdnn_tensor_w = graph.tensor_like(torch_tensor_w).set_vector_count_and_dimension(32, 1)
+
+    # Create conv_fprop op
+    hipdnn_tensor_conv_output = graph.conv_fprop(
+        image=hipdnn_tensor_x,
+        weight=hipdnn_tensor_w,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv_fprop_node",
+    )
+
+    # Create sub node for dequantize:zero_point_dq
+    hipdnn_tensor_zero_point_dq = graph.tensor_like(torch_tensor_zero_point_dq)
+    hipdnn_tensor_zero_point_dq.set_value(0.0)
+    hipdnn_tensor_conv_deq_sub_output = graph.sub(
+        a=hipdnn_tensor_conv_output, b=hipdnn_tensor_zero_point_dq, name="conv_deq_sub_node"
+    )
+
+    # Create mul node for dequantize:scale_dq
+    hipdnn_tensor_scale_dq = graph.tensor_like(torch_tensor_scale_dq)
+    hipdnn_tensor_scale_dq.set_value(1.0)
+    hipdnn_tensor_conv_deq_mul_output = graph.mul(
+        a=hipdnn_tensor_conv_deq_sub_output, b=hipdnn_tensor_scale_dq, name="conv_deq_mul_node"
+    )
+
+    # Create bias node
+    hipdnn_tensor_bias = graph.tensor_like(torch_tensor_bias)
+    hipdnn_tensor_bias_output = graph.add(
+        a=hipdnn_tensor_conv_deq_mul_output, b=hipdnn_tensor_bias, name="bias_node"
+    )
+
+    # Create the residual "add" input (still quantized at this point)
+    hipdnn_tensor_add = graph.tensor_like(torch_tensor_add)
+    hipdnn_tensor_add.set_vector_count_and_dimension(32, 1)
+
+    # Create sub node for dequantize:zero_point_dq_add
+    hipdnn_tensor_zero_point_dq_add = graph.tensor_like(torch_tensor_zero_point_dq_add)
+    hipdnn_tensor_zero_point_dq_add.set_value(0.0)
+    hipdnn_tensor_add_deq_sub_output = graph.sub(
+        a=hipdnn_tensor_add, b=hipdnn_tensor_zero_point_dq_add, name="add_deq_sub_node"
+    )
+
+    # Create mul node for dequantize:scale_dq_add
+    hipdnn_tensor_scale_dq_add = graph.tensor_like(torch_tensor_scale_dq_add)
+    hipdnn_tensor_scale_dq_add.set_value(1.0)
+    hipdnn_tensor_add_deq_mul_output = graph.mul(
+        a=hipdnn_tensor_add_deq_sub_output, b=hipdnn_tensor_scale_dq_add, name="add_deq_mul_node"
+    )
+
+    # Create add op joining the convolution branch and the residual branch
+    hipdnn_tensor_add_output = graph.add(
+        a=hipdnn_tensor_bias_output, b=hipdnn_tensor_add_deq_mul_output, name="add_node"
+    )
+
+    # Create div node for quantize:scale_q
+    hipdnn_tensor_scale_q = graph.tensor_like(torch_tensor_scale_q)
+    hipdnn_tensor_scale_q.set_value(1.0)
+    hipdnn_tensor_quantize_div_output = graph.div(
+        a=hipdnn_tensor_add_output, b=hipdnn_tensor_scale_q, name="quantize_div_node"
+    )
+
+    # Create add node for quantize:zero_point_q
+    hipdnn_tensor_zero_point_q = graph.tensor_like(torch_tensor_zero_point_q)
+    hipdnn_tensor_zero_point_q.set_value(0.0)
+    hipdnn_tensor_output = graph.add(
+        a=hipdnn_tensor_quantize_div_output, b=hipdnn_tensor_zero_point_q, name="quantize_add_node"
+    )
+
+    # The output is marked as a graph output and given the same vector settings as the inputs.
+    hipdnn_tensor_output.set_output(True).set_vector_count_and_dimension(32, 1)
+
+    graph.build(hipdnn_handle)
+
+    return (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_output,
+    )
+
+
+if __name__ == "__main__":
+    # Example driver: builds the int8 conv + bias + add graph and executes it once on
+    # random int8 data.
+
+    # Input dimensions
+    n = 2  # Batch size
+    c = 32  # Number of input channels
+    h = 16  # Height
+    w = 8  # Width
+
+    # Filter dimensions
+    k = 128  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 0  # Height padding
+    pad_w = 0  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    # Spatial size of the convolution output, used to shape the residual "add" tensor.
+    # Integer floor division avoids the float round-trip of int(x / stride + 1).
+    out_h = (h + 2 * pad_h - (dil_h * (r - 1) + 1)) // stride_h + 1
+    out_w = (w + 2 * pad_w - (dil_w * (s - 1) + 1)) // stride_w + 1
+
+    hipdnn_data_type = hipdnn.data_type.INT8
+    torch_data_type = torch.int8
+    bias_data_type = torch.float32
+    quantize_data_type = torch.float32
+
+    torch_tensor_x = torch.randint(
+        low=-128,
+        high=128,
+        size=(n, c, h, w),
+        dtype=torch_data_type,
+        device="cuda",
+    )
+    torch_tensor_w = torch.randint(
+        low=-128, high=128, size=(k, c, r, s), dtype=torch_data_type, device="cuda"
+    )
+
+    # Quantization parameters now use quantize_data_type explicitly (it was previously
+    # assigned but unused). These tensors are only handed to the graph builder; they are
+    # not part of the variant pack below.
+    torch_tensor_zero_point_dq = torch.rand(1, 1, 1, 1, dtype=quantize_data_type, device="cuda")
+    torch_tensor_scale_dq = torch.rand(1, 1, 1, 1, dtype=quantize_data_type, device="cuda")
+
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=bias_data_type, device="cuda")
+    torch_tensor_add = torch.randint(
+        low=-128, high=128, size=(n, k, out_h, out_w), dtype=torch_data_type, device="cuda"
+    )
+
+    torch_tensor_zero_point_dq_add = torch.rand(
+        1, 1, 1, 1, dtype=quantize_data_type, device="cuda"
+    )
+    torch_tensor_scale_dq_add = torch.rand(1, 1, 1, 1, dtype=quantize_data_type, device="cuda")
+
+    torch_tensor_zero_point_q = torch.rand(1, 1, 1, 1, dtype=quantize_data_type, device="cuda")
+    torch_tensor_scale_q = torch.rand(1, 1, 1, 1, dtype=quantize_data_type, device="cuda")
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_y,
+    ) = build_convint8_bias_add_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_zero_point_dq,
+        torch_tensor_scale_dq,
+        torch_tensor_bias,
+        torch_tensor_add,
+        torch_tensor_zero_point_dq_add,
+        torch_tensor_scale_dq_add,
+        torch_tensor_zero_point_q,
+        torch_tensor_scale_q,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+
+    # Output buffer sized from the dimensions reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        device="cuda",
+    )
+
+    # Maps each graph tensor to the device pointer of its backing torch tensor.
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_add: torch_tensor_add.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("convint8_bias_add graph execution complete.")
--- a/python/conv_fusion/convint8_bias_add_relu.py
+++ b/python/conv_fusion/convint8_bias_add_relu.py
+import hipdnn
+import torch
+
+
+def build_convint8_bias_add_relu_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_zero_point_dq,
+    torch_tensor_scale_dq,
+    torch_tensor_bias,
+    torch_tensor_add,
+    torch_tensor_zero_point_dq_add,
+    torch_tensor_scale_dq_add,
+    torch_tensor_zero_point_q,
+    torch_tensor_scale_q,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn_data_type,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="convint8_bias_add_relu",
+    )
+
+    # Create hipdnn conv input and filter tensor with NCHWc32 layout
+    hipdnn_tensor_x = graph.tensor_like(torch_tensor_x).set_vector_count_and_dimension(32, 1)
+    hipdnn_tensor_w = graph.tensor_like(torch_tensor_w).set_vector_count_and_dimension(32, 1)
+
+    # Create conv_fprop op
+    hipdnn_tensor_conv_output = graph.conv_fprop(
+        image=hipdnn_tensor_x,
+        weight=hipdnn_tensor_w,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv_fprop_node",
+    )
+
+    # Create sub node for dequantize:zero_point_dq
+    hipdnn_tensor_zero_point_dq = graph.tensor_like(torch_tensor_zero_point_dq)
+    hipdnn_tensor_zero_point_dq.set_value(0.0)
+    hipdnn_tensor_conv_deq_sub_output = graph.sub(
+        hipdnn_tensor_conv_output, hipdnn_tensor_zero_point_dq, name="conv_deq_sub_node"
+    )
+
+    # Create mul node for dequantize:scale_dq
+    hipdnn_tensor_scale_dq = graph.tensor_like(torch_tensor_scale_dq)
+    hipdnn_tensor_scale_dq.set_value(1.0)
+    hipdnn_tensor_conv_deq_mul_output = graph.mul(
+        hipdnn_tensor_conv_deq_sub_output, hipdnn_tensor_scale_dq, name="conv_deq_mul_node"
+    )
+
+    # Create bias node
+    hipdnn_tensor_bias = graph.tensor_like(torch_tensor_bias)
+    hipdnn_tensor_bias_output = graph.add(
+        hipdnn_tensor_conv_deq_mul_output, hipdnn_tensor_bias, name="bias_node"
+    )
+
+    # Cretae add original input(without dequantize)
+    hipdnn_tensor_add = graph.tensor_like(torch_tensor_add)
+    hipdnn_tensor_add.set_vector_count_and_dimension(32, 1)
+
+    # Create sub node for dequantize:zero_point_dq_add
+    hipdnn_tensor_zero_point_dq_add = graph.tensor_like(torch_tensor_zero_point_dq_add)
+    hipdnn_tensor_zero_point_dq_add.set_value(0.0)
+    hipdnn_tensor_add_deq_sub_output = graph.sub(
+        hipdnn_tensor_add, hipdnn_tensor_zero_point_dq_add, name="add_deq_sub_node"
+    )
+
+    # Create mul node for dequantize:scale_dq_add
+    hipdnn_tensor_scale_dq_add = graph.tensor_like(torch_tensor_scale_dq_add)
+    hipdnn_tensor_scale_dq_add.set_value(1.0)
+    hipdnn_tensor_add_deq_mul_output = graph.mul(
+        hipdnn_tensor_add_deq_sub_output, hipdnn_tensor_scale_dq_add, name="add_deq_mul_node"
+    )
+    hipdnn_tensor_add_deq_mul_output
+
+    # Create add op
+    hipdnn_tensor_add_output = graph.add(
+        a=hipdnn_tensor_bias_output, b=hipdnn_tensor_add_deq_mul_output, name="add_node"
+    )
+
+    # Create relu node
+    hipdnn_tensor_relu_output = graph.relu(input=hipdnn_tensor_add_output, name="relu_node")
+
+    # Create div node for quantize:scale_q
+    hipdnn_tensor_scale_q = graph.tensor_like(torch_tensor_scale_q)
+    hipdnn_tensor_scale_q.set_value(1.0)
+    hipdnn_tensor_quantize_div_output = graph.div(
+        a=hipdnn_tensor_relu_output, b=hipdnn_tensor_scale_q, name="quantize_div_node"
+    )
+
+    # Create add node for quantize:zero_point_q
+    hipdnn_tensor_zero_point_q = graph.tensor_like(torch_tensor_zero_point_q)
+    hipdnn_tensor_zero_point_q.set_value(0.0)
+    hipdnn_tensor_output = graph.add(
+        hipdnn_tensor_quantize_div_output, hipdnn_tensor_zero_point_q, name="quantize_add_node"
+    )
+
+    hipdnn_tensor_output.set_output(True).set_vector_count_and_dimension(32, 1)
+
+    graph.build(hipdnn_handle)
+
+    return (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_output,
+    )
+
+
+if __name__ == "__main__":
+    # Input dimensions
+    n = 2  # Batch size
+    c = 32  # Number of input channels
+    h = 16  # Height
+    w = 8  # Width
+
+    # Filter dimensions
+    k = 128  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 0  # Height padding
+    pad_w = 0  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    out_h = int(((h + 2 * pad_h - (dil_h * (r - 1) + 1)) / stride_h) + 1)
+    out_w = int(((w + 2 * pad_w - (dil_w * (s - 1) + 1)) / stride_w) + 1)
+
+    hipdnn_data_type = hipdnn.data_type.INT8
+    torch_data_type = torch.int8
+    bias_data_type = torch.float32
+    quantize_data_type = torch.float32
+
+    torch_tensor_x = torch.randint(
+        low=-128,
+        high=128,
+        size=(n, c, h, w),
+        dtype=torch_data_type,
+        device="cuda",
+    )
+    torch_tensor_w = torch.randint(
+        low=-128, high=128, size=(k, c, r, s), dtype=torch_data_type, device="cuda"
+    )
+
+    torch_tensor_zero_point_dq = torch.rand(1, 1, 1, 1, device="cuda")
+    torch_tensor_scale_dq = torch.rand(1, 1, 1, 1, device="cuda")
+
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=bias_data_type, device="cuda")
+    torch_tensor_add = torch.randint(
+        low=-128, high=128, size=(n, k, out_h, out_w), dtype=torch_data_type, device="cuda"
+    )
+
+    torch_tensor_zero_point_dq_add = torch.rand(1, 1, 1, 1, device="cuda")
+    torch_tensor_scale_dq_add = torch.rand(1, 1, 1, 1, device="cuda")
+
+    torch_tensor_zero_point_q = torch.rand(1, 1, 1, 1, device="cuda")
+    torch_tensor_scale_q = torch.rand(1, 1, 1, 1, device="cuda")
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_y,
+    ) = build_convint8_bias_add_relu_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_zero_point_dq,
+        torch_tensor_scale_dq,
+        torch_tensor_bias,
+        torch_tensor_add,
+        torch_tensor_zero_point_dq_add,
+        torch_tensor_scale_dq_add,
+        torch_tensor_zero_point_q,
+        torch_tensor_scale_q,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        device="cuda",
+    )
+
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_add: torch_tensor_add.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("convint8_bias_add_relu graph execution complete.")
--- a/python/conv_fusion/convint8_bias_relu.py
+++ b/python/conv_fusion/convint8_bias_relu.py
+import hipdnn
+import torch
+
+
+def build_convint8_bias_relu_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_zero_point_dq,
+    torch_tensor_scale_dq,
+    torch_tensor_bias,
+    torch_tensor_zero_point_q,
+    torch_tensor_scale_q,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn_data_type,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="convint8_bias_relu",
+    )
+
+    # Create hipdnn conv input and filter tensor with NCHWc32 layout
+    hipdnn_tensor_x = graph.tensor_like(torch_tensor_x).set_vector_count_and_dimension(32, 1)
+    hipdnn_tensor_w = graph.tensor_like(torch_tensor_w).set_vector_count_and_dimension(32, 1)
+
+    # Create conv_fprop op
+    hipdnn_tensor_conv_output = graph.conv_fprop(
+        image=hipdnn_tensor_x,
+        weight=hipdnn_tensor_w,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv_fprop_node",
+    )
+
+    # Create sub node for dequantize:zero_point_dq
+    hipdnn_tensor_zero_point_dq = graph.tensor_like(torch_tensor_zero_point_dq)
+    hipdnn_tensor_zero_point_dq.set_value(0.0)
+
+    hipdnn_tensor_conv_deq_sub_output = graph.sub(
+        a=hipdnn_tensor_conv_output, b=hipdnn_tensor_zero_point_dq, name="conv_deq_sub_node"
+    )
+
+    # Create mul node for dequantize:scale_dq
+    hipdnn_tensor_scale_dq = graph.tensor_like(torch_tensor_scale_dq)
+    hipdnn_tensor_scale_dq.set_value(1.0)
+
+    hipdnn_tensor_conv_deq_mul_output = graph.mul(
+        a=hipdnn_tensor_conv_deq_sub_output, b=hipdnn_tensor_scale_dq, name="conv_deq_mul_node"
+    )
+
+    # Create bias node
+    hipdnn_tensor_bias = graph.tensor_like(torch_tensor_bias)
+    hipdnn_tensor_bias_output = graph.add(
+        a=hipdnn_tensor_conv_deq_mul_output, b=hipdnn_tensor_bias, name="bias_node"
+    )
+
+    # Create relu node
+    hipdnn_tensor_relu_output = graph.relu(input=hipdnn_tensor_bias_output, name="relu_node")
+
+    # Create div node for quantize:scale_q
+    hipdnn_tensor_scale_q = graph.tensor_like(torch_tensor_scale_q)
+    hipdnn_tensor_scale_q.set_value(1.0)
+    hipdnn_tensor_quantize_div_output = graph.div(
+        a=hipdnn_tensor_relu_output, b=hipdnn_tensor_scale_q, name="quantize_div_node"
+    )
+
+    # Create add node for quantize:zero_point_q
+    hipdnn_tensor_zero_point_q = graph.tensor_like(torch_tensor_zero_point_q)
+    hipdnn_tensor_zero_point_q.set_value(0.0)
+    hipdnn_tensor_output = graph.add(
+        a=hipdnn_tensor_quantize_div_output, b=hipdnn_tensor_zero_point_q, name="quantize_add_node"
+    )
+
+    hipdnn_tensor_output.set_output(True).set_vector_count_and_dimension(32, 1)
+
+    graph.build(hipdnn_handle)
+
+    return (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_output,
+    )
+
+
+if __name__ == "__main__":
+    # Input dimensions
+    n = 2  # Batch size
+    c = 32  # Number of input channels
+    h = 16  # Height
+    w = 8  # Width
+
+    # Filter dimensions
+    k = 128  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 0  # Height padding
+    pad_w = 0  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    hipdnn_data_type = hipdnn.data_type.INT8
+    torch_data_type = torch.int8
+    bias_data_type = torch.float32
+    quantize_data_type = torch.float32
+
+    torch_tensor_x = torch.randint(
+        low=-128,
+        high=128,
+        size=(n, c, h, w),
+        dtype=torch_data_type,
+        device="cuda",
+    )
+    torch_tensor_w = torch.randint(
+        low=-128, high=128, size=(k, c, r, s), dtype=torch_data_type, device="cuda"
+    )
+
+    torch_tensor_zero_point_dq = torch.rand(1, 1, 1, 1, device="cuda")
+    torch_tensor_scale_dq = torch.rand(1, 1, 1, 1, device="cuda")
+
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=bias_data_type, device="cuda")
+
+    torch_tensor_zero_point_q = torch.rand(1, 1, 1, 1, device="cuda")
+    torch_tensor_scale_q = torch.rand(1, 1, 1, 1, device="cuda")
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_y,
+    ) = build_convint8_bias_relu_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_zero_point_dq,
+        torch_tensor_scale_dq,
+        torch_tensor_bias,
+        torch_tensor_zero_point_q,
+        torch_tensor_scale_q,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        device="cuda",
+    )
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("convint8_bias_relu graph execution complete.")
--- a/python/conv_fusion/convint8_bias_relu_add.py
+++ b/python/conv_fusion/convint8_bias_relu_add.py
+import hipdnn
+import torch
+
+
+def build_convint8_bias_relu_add_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_zero_point_dq,
+    torch_tensor_scale_dq,
+    torch_tensor_bias,
+    torch_tensor_add,
+    torch_tensor_zero_point_dq_add,
+    torch_tensor_scale_dq_add,
+    torch_tensor_zero_point_q,
+    torch_tensor_scale_q,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn_data_type,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="convint8_bias_relu_add",
+    )
+
+    # Create hipdnn conv input and filter tensor with NCHWc32 layout
+    hipdnn_tensor_x = graph.tensor_like(torch_tensor_x).set_vector_count_and_dimension(32, 1)
+    hipdnn_tensor_w = graph.tensor_like(torch_tensor_w).set_vector_count_and_dimension(32, 1)
+
+    # Create conv_fprop op
+    hipdnn_tensor_conv_output = graph.conv_fprop(
+        image=hipdnn_tensor_x,
+        weight=hipdnn_tensor_w,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv_fprop_node",
+    )
+
+    # Create sub node for dequantize:zero_point_dq
+    hipdnn_tensor_zero_point_dq = graph.tensor_like(torch_tensor_zero_point_dq)
+    hipdnn_tensor_zero_point_dq.set_value(0.0)
+    hipdnn_tensor_conv_deq_sub_output = graph.sub(
+        hipdnn_tensor_conv_output, hipdnn_tensor_zero_point_dq, name="conv_deq_sub_node"
+    )
+
+    # Create mul node for dequantize:scale_dq
+    hipdnn_tensor_scale_dq = graph.tensor_like(torch_tensor_scale_dq)
+    hipdnn_tensor_scale_dq.set_value(1.0)
+    hipdnn_tensor_conv_deq_mul_output = graph.mul(
+        hipdnn_tensor_conv_deq_sub_output, hipdnn_tensor_scale_dq, name="conv_deq_mul_node"
+    )
+
+    # Create bias node
+    hipdnn_tensor_bias = graph.tensor_like(torch_tensor_bias)
+    hipdnn_tensor_bias_output = graph.add(
+        hipdnn_tensor_conv_deq_mul_output, hipdnn_tensor_bias, name="bias_node"
+    )
+
+    # Create relu node
+    hipdnn_tensor_relu_output = graph.relu(input=hipdnn_tensor_bias_output, name="relu_node")
+
+    # Cretae add original input(without dequantize)
+    hipdnn_tensor_add = graph.tensor_like(torch_tensor_add)
+    hipdnn_tensor_add.set_vector_count_and_dimension(32, 1)
+
+    # Create sub node for dequantize:zero_point_dq_add
+    hipdnn_tensor_zero_point_dq_add = graph.tensor_like(torch_tensor_zero_point_dq_add)
+    hipdnn_tensor_zero_point_dq_add.set_value(0.0)
+    hipdnn_tensor_add_deq_sub_output = graph.sub(
+        hipdnn_tensor_add, hipdnn_tensor_zero_point_dq_add, name="add_deq_sub_node"
+    )
+
+    # Create mul node for dequantize:scale_dq_add
+    hipdnn_tensor_scale_dq_add = graph.tensor_like(torch_tensor_scale_dq_add)
+    hipdnn_tensor_scale_dq_add.set_value(1.0)
+    hipdnn_tensor_add_deq_mul_output = graph.mul(
+        hipdnn_tensor_add_deq_sub_output, hipdnn_tensor_scale_dq_add, name="add_deq_mul_node"
+    )
+    hipdnn_tensor_add_deq_mul_output
+
+    # Create add op
+    hipdnn_tensor_add_output = graph.add(
+        a=hipdnn_tensor_relu_output, b=hipdnn_tensor_add_deq_mul_output, name="add_node"
+    )
+
+    # Create div node for quantize:scale_q
+    hipdnn_tensor_scale_q = graph.tensor_like(torch_tensor_scale_q)
+    hipdnn_tensor_scale_q.set_value(1.0)
+    hipdnn_tensor_quantize_div_output = graph.div(
+        a=hipdnn_tensor_add_output, b=hipdnn_tensor_scale_q, name="quantize_div_node"
+    )
+
+    # Create add node for quantize:zero_point_q
+    hipdnn_tensor_zero_point_q = graph.tensor_like(torch_tensor_zero_point_q)
+    hipdnn_tensor_zero_point_q.set_value(0.0)
+    hipdnn_tensor_output = graph.add(
+        hipdnn_tensor_quantize_div_output, hipdnn_tensor_zero_point_q, name="quantize_add_node"
+    )
+
+    hipdnn_tensor_output.set_output(True).set_vector_count_and_dimension(32, 1)
+
+    graph.build(hipdnn_handle)
+
+    return (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_output,
+    )
+
+
+if __name__ == "__main__":
+    # Input dimensions
+    n = 2  # Batch size
+    c = 32  # Number of input channels
+    h = 16  # Height
+    w = 8  # Width
+
+    # Filter dimensions
+    k = 128  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 0  # Height padding
+    pad_w = 0  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    out_h = int(((h + 2 * pad_h - (dil_h * (r - 1) + 1)) / stride_h) + 1)
+    out_w = int(((w + 2 * pad_w - (dil_w * (s - 1) + 1)) / stride_w) + 1)
+
+    hipdnn_data_type = hipdnn.data_type.INT8
+    torch_data_type = torch.int8
+    bias_data_type = torch.float32
+    quantize_data_type = torch.float32
+
+    torch_tensor_x = torch.randint(
+        low=-128,
+        high=128,
+        size=(n, c, h, w),
+        dtype=torch_data_type,
+        device="cuda",
+    )
+    torch_tensor_w = torch.randint(
+        low=-128, high=128, size=(k, c, r, s), dtype=torch_data_type, device="cuda"
+    )
+
+    torch_tensor_zero_point_dq = torch.rand(1, 1, 1, 1, device="cuda")
+    torch_tensor_scale_dq = torch.rand(1, 1, 1, 1, device="cuda")
+
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=bias_data_type, device="cuda")
+    torch_tensor_add = torch.randint(
+        low=-128, high=128, size=(n, k, out_h, out_w), dtype=torch_data_type, device="cuda"
+    )
+
+    torch_tensor_zero_point_dq_add = torch.rand(1, 1, 1, 1, device="cuda")
+    torch_tensor_scale_dq_add = torch.rand(1, 1, 1, 1, device="cuda")
+
+    torch_tensor_zero_point_q = torch.rand(1, 1, 1, 1, device="cuda")
+    torch_tensor_scale_q = torch.rand(1, 1, 1, 1, device="cuda")
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_y,
+    ) = build_convint8_bias_relu_add_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_zero_point_dq,
+        torch_tensor_scale_dq,
+        torch_tensor_bias,
+        torch_tensor_add,
+        torch_tensor_zero_point_dq_add,
+        torch_tensor_scale_dq_add,
+        torch_tensor_zero_point_q,
+        torch_tensor_scale_q,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        device="cuda",
+    )
+
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_add: torch_tensor_add.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("convint8_bias_relu_add graph execution complete.")
--- a/python/convolution/convolution_bwd.py
+++ b/python/convolution/convolution_bwd.py
+import hipdnn
+import torch
+
+
+def build_conv_backward_graph(
+    hipdnn_handle, torch_tensor_dy, torch_tensor_w, padding, stride, dilation, hipdnn_data_type
+):
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="convolution_backward",
+    )
+
+    # Create hipdnn tensors
+    hipdnn_tensor_dy = graph.tensor_like(torch_tensor_dy)
+    hipdnn_tensor_w = graph.tensor_like(torch_tensor_w)
+
+    # Create conv op
+    hipdnn_tensor_dx = graph.conv_dgrad(
+        loss=hipdnn_tensor_dy,
+        filter=hipdnn_tensor_w,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d_backward",
+    )
+    hipdnn_tensor_dx.set_output(True)
+    graph.build(hipdnn_handle)
+
+    return (graph, hipdnn_tensor_dy, hipdnn_tensor_w, hipdnn_tensor_dx)
+
+
+if __name__ == "__main__":
+    # Input dimensions
+    n = 4  # Batch size
+    c = 32  # Number of input channels
+    h = 16  # Height
+    w = 16  # Width
+
+    # Filter dimensions
+    k = 64  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 1  # Height padding
+    pad_w = 1  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    torch_tensor_dy = torch.rand(n, k, h, w, dtype=torch_data_type, device="cuda")
+    torch_tensor_w = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda")
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, hipdnn_tensor_dy, hipdnn_tensor_w, hipdnn_tensor_dx = build_conv_backward_graph(
+        hipdnn_handle,
+        torch_tensor_dy,
+        torch_tensor_w,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+    torch_tensor_dx = torch.empty(hipdnn_tensor_dx.get_dim(), dtype=torch_data_type, device="cuda")
+    variant_pack = {
+        hipdnn_tensor_dy: torch_tensor_dy.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_dx: torch_tensor_dx.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("Convolution backward graph execution complete.")
--- a/python/convolution/convolution_fwd.py
+++ b/python/convolution/convolution_fwd.py
+import hipdnn
+import torch
+
+
+def build_conv_forward_graph(
+    hipdnn_handle, torch_tensor_x, torch_tensor_w, padding, stride, dilation, hipdnn_data_type
+):
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="convolution_forward",
+    )
+
+    # Create hipdnn tensors
+    hipdnn_tensor_x = graph.tensor_like(torch_tensor_x)
+    hipdnn_tensor_w = graph.tensor_like(torch_tensor_w)
+
+    # Create conv op
+    hipdnn_tensor_y = graph.conv_fprop(
+        image=hipdnn_tensor_x,
+        weight=hipdnn_tensor_w,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d_forward",
+    )
+    hipdnn_tensor_y.set_output(True)
+    graph.build(hipdnn_handle)
+
+    return (graph, hipdnn_tensor_x, hipdnn_tensor_w, hipdnn_tensor_y)
+
+
+if __name__ == "__main__":
+    # Input dimensions
+    n = 4  # Batch size
+    c = 16  # Number of input channels
+    h = 56  # Height
+    w = 56  # Width
+
+    # Filter dimensions
+    k = 4  # Number of output channels
+    r = 1  # Filter height
+    s = 1  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 1  # Height padding
+    pad_w = 1  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda")
+    torch_tensor_w = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda")
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, hipdnn_tensor_x, hipdnn_tensor_w, hipdnn_tensor_y = build_conv_forward_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+
+    torch_tensor_y = torch.empty(hipdnn_tensor_y.get_dim(), dtype=torch_data_type, device="cuda")
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("Convolution forward graph execution complete.")
--- a/python/convolution/convolution_wrw.py
+++ b/python/convolution/convolution_wrw.py
+import hipdnn
+import torch
+
+
+def build_conv_wrw_graph(
+    hipdnn_handle, torch_tensor_x, torch_tensor_dy, padding, stride, dilation, hipdnn_data_type
+):
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="convolution_wrw",
+    )
+
+    # Create hipdnn tensors
+    hipdnn_tensor_x = graph.tensor_like(torch_tensor_x)
+    hipdnn_tensor_dy = graph.tensor_like(torch_tensor_dy)
+
+    # Create conv op
+    hipdnn_tensor_dw = graph.conv_wgrad(
+        image=hipdnn_tensor_x,
+        loss=hipdnn_tensor_dy,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d_wrw",
+    )
+    hipdnn_tensor_dw.set_output(True)
+    graph.build(hipdnn_handle)
+
+    return (graph, hipdnn_tensor_x, hipdnn_tensor_dy, hipdnn_tensor_dw)
+
+
+if __name__ == "__main__":
+    # Input dimensions
+    n = 4  # Batch size
+    c = 32  # Number of input channels
+    h = 16  # Height
+    w = 16  # Width
+
+    # Filter dimensions
+    k = 64  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 1  # Height padding
+    pad_w = 1  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda")
+    torch_tensor_dy = torch.rand(n, k, h, w, dtype=torch_data_type, device="cuda")
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, hipdnn_tensor_x, hipdnn_tensor_dy, hipdnn_tensor_dw = build_conv_wrw_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_dy,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+    torch_tensor_dw = torch.empty(hipdnn_tensor_dw.get_dim(), dtype=torch_data_type, device="cuda")
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_dy: torch_tensor_dy.data_ptr(),
+        hipdnn_tensor_dw: torch_tensor_dw.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("Convolution wrw graph execution complete.")
--- a/python/ctc_loss/ctc_loss.py
+++ b/python/ctc_loss/ctc_loss.py
+import hipdnn
+import torch
+
+
+def build_ctc_loss_graph(hipdnn_handle, torch_tensor_probs, hipdnn_data_type):
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="ctc_loss_inference",
+    )
+    hipdnn_tensor_probs = graph.tensor_like(torch_tensor_probs)
+    losses, gradients = graph.ctc_loss(
+        probs=hipdnn_tensor_probs,
+        blank_label_id=0,
+        apply_softmax=False,
+        algo=0,
+        labels=[1, 2, 3, 4, 2, 3, 2],
+        label_lengths=[1, 2, 1, 3],
+        input_lengths=[4, 100, 100, 200],
+        name="ctc_loss",
+    )
+    losses.set_output(True)
+    gradients.set_output(True)
+    graph.build(hipdnn_handle)
+    return (graph, hipdnn_tensor_probs, losses, gradients)
+
+
+if __name__ == "__main__":
+    batch, max_time, num_classes = 4, 500, 5
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+    torch_tensor_probs = torch.rand(
+        max_time, batch, num_classes, dtype=torch_data_type, device="cuda"
+    )
+    hipdnn_handle = hipdnn.create_handle()
+    graph, hipdnn_tensor_probs, hipdnn_tensor_losses, hipdnn_tensor_gradients = (
+        build_ctc_loss_graph(hipdnn_handle, torch_tensor_probs, hipdnn_data_type)
+    )
+    torch_tensor_losses = torch.empty(batch, dtype=torch_data_type, device="cuda")
+    torch_tensor_gradients = torch.empty(
+        batch, max_time, num_classes, dtype=torch_data_type, device="cuda"
+    )
+    variant_pack = {
+        hipdnn_tensor_probs: torch_tensor_probs.data_ptr(),
+        hipdnn_tensor_losses: torch_tensor_losses.data_ptr(),
+        hipdnn_tensor_gradients: torch_tensor_gradients.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("ctc_loss graph execution complete.")
--- a/python/deformattention/deform_attention.py
+++ b/python/deformattention/deform_attention.py
+import hipdnn
+import torch
+
+
+def build_deform_attention_graph(
+    hipdnn_handle,
+    torch_tensor_value,
+    torch_tensor_spatial_shapes,
+    torch_tensor_level_start_index,
+    torch_tensor_sampling_locations,
+    torch_tensor_attention_weights,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph containing one ``deform_attn_fprop`` op.
+
+    Each torch tensor argument is handed to ``graph.tensor_like`` to create the
+    matching graph tensor; this function itself only forwards them.
+
+    Args:
+        hipdnn_handle: Handle given to ``hipdnn.pygraph`` and ``graph.build``.
+        torch_tensor_value: Source for the op's ``value`` input.
+        torch_tensor_spatial_shapes: Source for the op's ``spatial_shapes`` input.
+        torch_tensor_level_start_index: Source for the op's ``level_start_index`` input.
+        torch_tensor_sampling_locations: Source for the op's ``sampling_locations`` input.
+        torch_tensor_attention_weights: Source for the op's ``attention_weights`` input.
+        hipdnn_data_type: ``io_data_type`` of the graph; the intermediate and compute
+            types are fixed to ``hipdnn.data_type.FLOAT``.
+
+    Returns:
+        A 7-tuple ``(graph, value, spatial_shapes, level_start_index,
+        sampling_locations, attention_weights, y)``: the built graph, the five input
+        graph tensors in parameter order, and the output graph tensor ``y``.
+    """
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="deform_attention",
+    )
+
+    # Graph-side inputs, created in the same order as the parameters.
+    inputs = tuple(
+        graph.tensor_like(torch_tensor)
+        for torch_tensor in (
+            torch_tensor_value,
+            torch_tensor_spatial_shapes,
+            torch_tensor_level_start_index,
+            torch_tensor_sampling_locations,
+            torch_tensor_attention_weights,
+        )
+    )
+    value, spatial_shapes, level_start_index, sampling_locations, attention_weights = inputs
+
+    # The single forward op of this graph.
+    y = graph.deform_attn_fprop(
+        value=value,
+        spatial_shapes=spatial_shapes,
+        level_start_index=level_start_index,
+        sampling_locations=sampling_locations,
+        attention_weights=attention_weights,
+        name="deform_attn_fprop",
+    )
+    y.set_output(True)
+    graph.build(hipdnn_handle)
+
+    return (graph, *inputs, y)
+
+
+if __name__ == "__main__":
+    # Example driver: builds the deformable-attention forward graph on random inputs
+    # and executes it once on the "cuda" device.
+
+    # Input dimensions
+    n = 2  # batch size
+    n_heads = 2
+    embed_dims_per_head = 32
+    n_levels = 2
+    n_points = 2
+    n_queries = 32
+
+    # Two random extents per level, each drawn from [1, 16).
+    spatial_shapes_cpu = torch.randint(low=1, high=16, size=(n_levels, 2), dtype=torch.int64)
+    # Number of keys contributed by each level (product of its two extents).
+    count_per_level = spatial_shapes_cpu.prod(dim=1)
+    # Total key count as a plain Python int, so it is a regular tensor dimension below.
+    n_keys = count_per_level.sum().item()
+    # Start offset of each level: exclusive prefix sum of count_per_level.
+    level_start_index_cpu = torch.zeros_like(count_per_level)
+    level_start_index_cpu[1:] = torch.cumsum(count_per_level[:-1], dim=0)
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    torch_tensor_value = torch.rand(
+        n, n_keys, n_heads, embed_dims_per_head, dtype=torch_data_type, device="cuda"
+    )
+    torch_tensor_spatial_shapes = spatial_shapes_cpu.to("cuda")
+    torch_tensor_level_start_index = level_start_index_cpu.to("cuda")
+    torch_tensor_sampling_locations = torch.rand(
+        n, n_queries, n_heads, n_levels, n_points, 2, dtype=torch_data_type, device="cuda"
+    )
+    torch_tensor_attention_weights = torch.rand(
+        n, n_queries, n_heads, n_levels, n_points, dtype=torch_data_type, device="cuda"
+    )
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_value,
+        hipdnn_tensor_spatial_shapes,
+        hipdnn_tensor_level_start_index,
+        hipdnn_tensor_sampling_locations,
+        hipdnn_tensor_attention_weights,
+        hipdnn_tensor_y,
+    ) = build_deform_attention_graph(
+        hipdnn_handle,
+        torch_tensor_value,
+        torch_tensor_spatial_shapes,
+        torch_tensor_level_start_index,
+        torch_tensor_sampling_locations,
+        torch_tensor_attention_weights,
+        hipdnn_data_type,
+    )
+
+    # Output buffer sized from the dimensions reported by the built graph.
+    torch_tensor_y = torch.empty(hipdnn_tensor_y.get_dim(), dtype=torch_data_type, device="cuda")
+    # Map every graph tensor to the address of its backing torch buffer.
+    variant_pack = {
+        hipdnn_tensor_value: torch_tensor_value.data_ptr(),
+        hipdnn_tensor_spatial_shapes: torch_tensor_spatial_shapes.data_ptr(),
+        hipdnn_tensor_level_start_index: torch_tensor_level_start_index.data_ptr(),
+        hipdnn_tensor_sampling_locations: torch_tensor_sampling_locations.data_ptr(),
+        hipdnn_tensor_attention_weights: torch_tensor_attention_weights.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("Deform attention graph execution complete.")
--- a/python/deformattention/deform_attention_bwd.py
+++ b/python/deformattention/deform_attention_bwd.py
+import hipdnn
+import torch
+
+
+def build_deform_attention_bwd_graph(
+    hipdnn_handle,
+    torch_tensor_value,
+    torch_tensor_spatial_shapes,
+    torch_tensor_level_start_index,
+    torch_tensor_sampling_locations,
+    torch_tensor_attention_weights,
+    torch_tensor_grad_output,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph containing one ``deform_attn_dgrad`` op.
+
+    Each torch tensor argument is handed to ``graph.tensor_like`` to create the
+    matching graph tensor; this function itself only forwards them.
+
+    Args:
+        hipdnn_handle: Handle given to ``hipdnn.pygraph`` and ``graph.build``.
+        torch_tensor_value: Source for the op's ``value`` input.
+        torch_tensor_spatial_shapes: Source for the op's ``spatial_shapes`` input.
+        torch_tensor_level_start_index: Source for the op's ``level_start_index`` input.
+        torch_tensor_sampling_locations: Source for the op's ``sampling_locations`` input.
+        torch_tensor_attention_weights: Source for the op's ``attention_weights`` input.
+        torch_tensor_grad_output: Source for the op's ``grad_output`` input.
+        hipdnn_data_type: ``io_data_type`` of the graph; the intermediate and compute
+            types are fixed to ``hipdnn.data_type.FLOAT``.
+
+    Returns:
+        A 10-tuple: the built graph, the six input graph tensors in parameter order,
+        then the three outputs ``grad_value``, ``grad_sampling_loc`` and
+        ``grad_attn_weight``.
+    """
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="deform_attention_bwd",
+    )
+
+    # Graph-side inputs, created in the same order as the parameters.
+    inputs = tuple(
+        graph.tensor_like(torch_tensor)
+        for torch_tensor in (
+            torch_tensor_value,
+            torch_tensor_spatial_shapes,
+            torch_tensor_level_start_index,
+            torch_tensor_sampling_locations,
+            torch_tensor_attention_weights,
+            torch_tensor_grad_output,
+        )
+    )
+    (
+        value,
+        spatial_shapes,
+        level_start_index,
+        sampling_locations,
+        attention_weights,
+        grad_output,
+    ) = inputs
+
+    # The single backward op of this graph; it yields three gradient tensors.
+    grad_value, grad_sampling_loc, grad_attn_weight = graph.deform_attn_dgrad(
+        value=value,
+        spatial_shapes=spatial_shapes,
+        level_start_index=level_start_index,
+        sampling_locations=sampling_locations,
+        attention_weights=attention_weights,
+        grad_output=grad_output,
+        name="deform_attn_dgrad",
+    )
+    for grad in (grad_value, grad_sampling_loc, grad_attn_weight):
+        grad.set_output(True)
+    graph.build(hipdnn_handle)
+
+    return (graph, *inputs, grad_value, grad_sampling_loc, grad_attn_weight)
+
+
+if __name__ == "__main__":
+    # Example driver: builds the deformable-attention backward graph on random inputs
+    # and executes it once on the "cuda" device.
+
+    # Input dimensions
+    n = 2  # batch size
+    n_heads = 2
+    embed_dims_per_head = 32
+    embed_dims = n_heads * embed_dims_per_head  # last dimension of grad_output below
+    n_levels = 2
+    n_points = 2
+    n_queries = 32
+
+    # Two random extents per level, each drawn from [1, 16).
+    spatial_shapes_cpu = torch.randint(low=1, high=16, size=(n_levels, 2), dtype=torch.int64)
+    # Number of keys contributed by each level (product of its two extents).
+    count_per_level = spatial_shapes_cpu.prod(dim=1)
+    # Total key count as a plain Python int, so it is a regular tensor dimension below.
+    n_keys = count_per_level.sum().item()
+    # Start offset of each level: exclusive prefix sum of count_per_level.
+    level_start_index_cpu = torch.zeros_like(count_per_level)
+    level_start_index_cpu[1:] = torch.cumsum(count_per_level[:-1], dim=0)
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    torch_tensor_value = torch.rand(
+        n, n_keys, n_heads, embed_dims_per_head, dtype=torch_data_type, device="cuda"
+    )
+    torch_tensor_spatial_shapes = spatial_shapes_cpu.to("cuda")
+    torch_tensor_level_start_index = level_start_index_cpu.to("cuda")
+    torch_tensor_sampling_locations = torch.rand(
+        n, n_queries, n_heads, n_levels, n_points, 2, dtype=torch_data_type, device="cuda"
+    )
+    torch_tensor_attention_weights = torch.rand(
+        n, n_queries, n_heads, n_levels, n_points, dtype=torch_data_type, device="cuda"
+    )
+    torch_tensor_grad_output = torch.rand(
+        n, n_queries, embed_dims, dtype=torch_data_type, device="cuda"
+    )
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_value,
+        hipdnn_tensor_spatial_shapes,
+        hipdnn_tensor_level_start_index,
+        hipdnn_tensor_sampling_locations,
+        hipdnn_tensor_attention_weights,
+        hipdnn_tensor_grad_output,
+        hipdnn_tensor_grad_value,
+        hipdnn_tensor_grad_sampling_loc,
+        hipdnn_tensor_grad_attn_weight,
+    ) = build_deform_attention_bwd_graph(
+        hipdnn_handle,
+        torch_tensor_value,
+        torch_tensor_spatial_shapes,
+        torch_tensor_level_start_index,
+        torch_tensor_sampling_locations,
+        torch_tensor_attention_weights,
+        torch_tensor_grad_output,
+        hipdnn_data_type,
+    )
+
+    # Gradient buffers sized from the dimensions reported by the built graph.
+    torch_tensor_grad_value = torch.empty(
+        hipdnn_tensor_grad_value.get_dim(), dtype=torch_data_type, device="cuda"
+    )
+    torch_tensor_grad_sampling_loc = torch.empty(
+        hipdnn_tensor_grad_sampling_loc.get_dim(), dtype=torch_data_type, device="cuda"
+    )
+    torch_tensor_grad_attn_weight = torch.empty(
+        hipdnn_tensor_grad_attn_weight.get_dim(), dtype=torch_data_type, device="cuda"
+    )
+    # Map every graph tensor to the address of its backing torch buffer.
+    variant_pack = {
+        hipdnn_tensor_value: torch_tensor_value.data_ptr(),
+        hipdnn_tensor_spatial_shapes: torch_tensor_spatial_shapes.data_ptr(),
+        hipdnn_tensor_level_start_index: torch_tensor_level_start_index.data_ptr(),
+        hipdnn_tensor_sampling_locations: torch_tensor_sampling_locations.data_ptr(),
+        hipdnn_tensor_attention_weights: torch_tensor_attention_weights.data_ptr(),
+        hipdnn_tensor_grad_output: torch_tensor_grad_output.data_ptr(),
+        hipdnn_tensor_grad_value: torch_tensor_grad_value.data_ptr(),
+        hipdnn_tensor_grad_sampling_loc: torch_tensor_grad_sampling_loc.data_ptr(),
+        hipdnn_tensor_grad_attn_weight: torch_tensor_grad_attn_weight.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("Deform attention bwd graph execution complete.")
--- a/python/deformconvolution/deform_convolution.py
+++ b/python/deformconvolution/deform_convolution.py
+import hipdnn
+import torch
+
+
+def build_deform_convolution_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_offset,
+    torch_tensor_w,
+    torch_tensor_bias,
+    torch_tensor_mask,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph containing one ``deform_conv_fprop`` op.
+
+    Each torch tensor argument is handed to ``graph.tensor_like`` to create the
+    matching graph tensor; this function itself only forwards them.
+
+    Args:
+        hipdnn_handle: Handle given to ``hipdnn.pygraph`` and ``graph.build``.
+        torch_tensor_x: Source for the op's ``image`` input.
+        torch_tensor_offset: Source for the op's ``offset`` input.
+        torch_tensor_w: Source for the op's ``weight`` input.
+        torch_tensor_bias: Source for the op's ``bias`` input.
+        torch_tensor_mask: Source for the op's ``mask`` input.
+        padding: Forwarded unchanged as the op's ``padding``.
+        stride: Forwarded unchanged as the op's ``stride``.
+        dilation: Forwarded unchanged as the op's ``dilation``.
+        hipdnn_data_type: ``io_data_type`` of the graph; the intermediate and compute
+            types are fixed to ``hipdnn.data_type.FLOAT``.
+
+    Returns:
+        A 7-tuple ``(graph, x, offset, w, bias, mask, y)``: the built graph, the five
+        input graph tensors in parameter order, and the output graph tensor ``y``.
+    """
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="deform_convolution",
+    )
+
+    # Graph-side inputs, created in the same order as the parameters.
+    inputs = tuple(
+        graph.tensor_like(torch_tensor)
+        for torch_tensor in (
+            torch_tensor_x,
+            torch_tensor_offset,
+            torch_tensor_w,
+            torch_tensor_bias,
+            torch_tensor_mask,
+        )
+    )
+    x, offset, w, bias, mask = inputs
+
+    # The single forward op of this graph.
+    y = graph.deform_conv_fprop(
+        image=x,
+        offset=offset,
+        weight=w,
+        bias=bias,
+        mask=mask,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="deform_conv_fprop",
+    )
+    y.set_output(True)
+    graph.build(hipdnn_handle)
+
+    return (graph, *inputs, y)
+
+
+if __name__ == "__main__":
+    # Example driver: builds the deformable-convolution forward graph on random
+    # channels_last inputs and executes it once on the "cuda" device.
+
+    # Input dimensions
+    n = 1  # Batch size
+    c = 16  # Number of input channels
+    h = 16  # Height
+    w = 16  # Width
+
+    # Filter dimensions
+    k = 1  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 0  # Height padding
+    pad_w = 0  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    # Output spatial size. Integer floor division keeps the computation exact instead
+    # of round-tripping through float as int(a / b + 1) did.
+    h_out = (h + 2 * pad_h - (dil_h * (r - 1) + 1)) // stride_h + 1
+    w_out = (w + 2 * pad_w - (dil_w * (s - 1) + 1)) // stride_w + 1
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    # Offset carries 2 * r * s channels and mask r * s channels per output position.
+    torch_tensor_offset = torch.rand(
+        n, 2 * r * s, h_out, w_out, dtype=torch_data_type, device="cuda"
+    ).to(memory_format=torch.channels_last)
+    torch_tensor_w = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_bias = torch.rand(k, dtype=torch_data_type, device="cuda")
+    torch_tensor_mask = torch.rand(n, r * s, h_out, w_out, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_offset,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_mask,
+        hipdnn_tensor_y,
+    ) = build_deform_convolution_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_offset,
+        torch_tensor_w,
+        torch_tensor_bias,
+        torch_tensor_mask,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+
+    # Output buffer sized from the dimensions reported by the built graph.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    # Map every graph tensor to the address of its backing torch buffer.
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_offset: torch_tensor_offset.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_mask: torch_tensor_mask.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("deform conv fprop graph execution complete.")
--- a/python/deformconvolution/deform_convolution_bwd.py
+++ b/python/deformconvolution/deform_convolution_bwd.py
+import hipdnn
+import torch
+
+
+def build_deform_convolution_graph(
+    hipdnn_handle,
+    torch_tensor_dy,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_offset,
+    torch_tensor_mask,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph containing one ``deform_conv_dgrad`` op.
+
+    Each torch tensor argument is handed to ``graph.tensor_like`` to create the
+    matching graph tensor; this function itself only forwards them.
+
+    NOTE(review): the function name matches the forward sample's builder; it is kept
+    because the script below calls it by this name.
+
+    Args:
+        hipdnn_handle: Handle given to ``hipdnn.pygraph`` and ``graph.build``.
+        torch_tensor_dy: Source for the op's ``loss`` input.
+        torch_tensor_x: Source for the op's ``image`` input.
+        torch_tensor_w: Source for the op's ``filter`` input.
+        torch_tensor_offset: Source for the op's ``offset`` input.
+        torch_tensor_mask: Source for the op's ``mask`` input.
+        padding: Forwarded unchanged as the op's ``padding``.
+        stride: Forwarded unchanged as the op's ``stride``.
+        dilation: Forwarded unchanged as the op's ``dilation``.
+        hipdnn_data_type: ``io_data_type`` of the graph; the intermediate and compute
+            types are fixed to ``hipdnn.data_type.FLOAT``.
+
+    Returns:
+        A 9-tuple ``(graph, dy, w, offset, x, mask, dx, doffset, dmask)``. The input
+        order here differs from the parameter order (``w`` and ``offset`` come before
+        ``x``); callers must unpack in this order.
+    """
+    # Create graph. Named "..._bwd" so it is distinguishable from the forward sample's
+    # graph, matching the "deform_attention_bwd" / "deform_convolution_wrw" convention.
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="deform_convolution_bwd",
+    )
+
+    # Create hipdnn tensors
+    hipdnn_tensor_dy = graph.tensor_like(torch_tensor_dy)
+    hipdnn_tensor_x = graph.tensor_like(torch_tensor_x)
+    hipdnn_tensor_w = graph.tensor_like(torch_tensor_w)
+    hipdnn_tensor_offset = graph.tensor_like(torch_tensor_offset)
+    hipdnn_tensor_mask = graph.tensor_like(torch_tensor_mask)
+
+    # Create op; it yields the gradients for image, offset and mask.
+    hipdnn_tensor_dx, hipdnn_tensor_doffset, hipdnn_tensor_dmask = graph.deform_conv_dgrad(
+        loss=hipdnn_tensor_dy,
+        filter=hipdnn_tensor_w,
+        offset=hipdnn_tensor_offset,
+        image=hipdnn_tensor_x,
+        mask=hipdnn_tensor_mask,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="deform_conv_bwd",
+    )
+    hipdnn_tensor_dx.set_output(True)
+    hipdnn_tensor_doffset.set_output(True)
+    hipdnn_tensor_dmask.set_output(True)
+    graph.build(hipdnn_handle)
+
+    return (
+        graph,
+        hipdnn_tensor_dy,
+        hipdnn_tensor_w,
+        hipdnn_tensor_offset,
+        hipdnn_tensor_x,
+        hipdnn_tensor_mask,
+        hipdnn_tensor_dx,
+        hipdnn_tensor_doffset,
+        hipdnn_tensor_dmask,
+    )
+
+
+if __name__ == "__main__":
+    # Example driver: builds the deformable-convolution backward (data gradient) graph
+    # on random channels_last inputs and executes it once on the "cuda" device.
+
+    # Input dimensions
+    n = 1  # Batch size
+    c = 16  # Number of input channels
+    h = 16  # Height
+    w = 16  # Width
+
+    # Filter dimensions
+    k = 1  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 0  # Height padding
+    pad_w = 0  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    # Output spatial size. Integer floor division keeps the computation exact instead
+    # of round-tripping through float as int(a / b + 1) did.
+    h_out = (h + 2 * pad_h - (dil_h * (r - 1) + 1)) // stride_h + 1
+    w_out = (w + 2 * pad_w - (dil_w * (s - 1) + 1)) // stride_w + 1
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    torch_tensor_dy = torch.rand(n, k, h_out, w_out, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_w = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    # Offset carries 2 * r * s channels and mask r * s channels per output position.
+    torch_tensor_offset = torch.rand(
+        n, 2 * r * s, h_out, w_out, dtype=torch_data_type, device="cuda"
+    ).to(memory_format=torch.channels_last)
+    torch_tensor_mask = torch.rand(n, r * s, h_out, w_out, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    # The builder returns its inputs as (dy, w, offset, x, mask), not in argument order.
+    (
+        graph,
+        hipdnn_tensor_dy,
+        hipdnn_tensor_w,
+        hipdnn_tensor_offset,
+        hipdnn_tensor_x,
+        hipdnn_tensor_mask,
+        hipdnn_tensor_dx,
+        hipdnn_tensor_doffset,
+        hipdnn_tensor_dmask,
+    ) = build_deform_convolution_graph(
+        hipdnn_handle,
+        torch_tensor_dy,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_offset,
+        torch_tensor_mask,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+
+    # Gradient buffers sized from the dimensions reported by the built graph.
+    torch_tensor_dx = torch.empty(
+        hipdnn_tensor_dx.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    torch_tensor_doffset = torch.empty(
+        hipdnn_tensor_doffset.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    torch_tensor_dmask = torch.empty(
+        hipdnn_tensor_dmask.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    # Map every graph tensor to the address of its backing torch buffer.
+    variant_pack = {
+        hipdnn_tensor_dy: torch_tensor_dy.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_offset: torch_tensor_offset.data_ptr(),
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_mask: torch_tensor_mask.data_ptr(),
+        hipdnn_tensor_dx: torch_tensor_dx.data_ptr(),
+        hipdnn_tensor_doffset: torch_tensor_doffset.data_ptr(),
+        hipdnn_tensor_dmask: torch_tensor_dmask.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("deform conv bwd graph execution complete.")
--- a/python/deformconvolution/deform_convolution_wrw.py
+++ b/python/deformconvolution/deform_convolution_wrw.py
+import hipdnn
+import torch
+
+
+def build_deform_convolution_wrw_graph(
+    hipdnn_handle,
+    torch_tensor_dy,
+    torch_tensor_x,
+    torch_tensor_offset,
+    torch_tensor_mask,
+    padding,
+    stride,
+    dilation,
+    dw_dims,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph containing one ``deform_conv_wgrad`` op.
+
+    Each torch tensor argument is handed to ``graph.tensor_like`` to create the
+    matching graph tensor; this function itself only forwards them.
+
+    Args:
+        hipdnn_handle: Handle given to ``hipdnn.pygraph`` and ``graph.build``.
+        torch_tensor_dy: Source for the op's ``loss`` input.
+        torch_tensor_x: Source for the op's ``image`` input.
+        torch_tensor_offset: Source for the op's ``offset`` input.
+        torch_tensor_mask: Source for the op's ``mask`` input.
+        padding: Forwarded unchanged as the op's ``padding``.
+        stride: Forwarded unchanged as the op's ``stride``.
+        dilation: Forwarded unchanged as the op's ``dilation``.
+        dw_dims: Dimensions set explicitly on the weight-gradient output via ``set_dim``.
+        hipdnn_data_type: ``io_data_type`` of the graph; the intermediate and compute
+            types are fixed to ``hipdnn.data_type.FLOAT``.
+
+    Returns:
+        A 6-tuple ``(graph, dy, offset, x, mask, dw)``. Note that ``offset`` precedes
+        ``x`` here, unlike in the parameter list.
+    """
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="deform_convolution_wrw",
+    )
+
+    # Graph-side inputs, created in the same order as the parameters.
+    dy, x, offset, mask = (
+        graph.tensor_like(torch_tensor)
+        for torch_tensor in (
+            torch_tensor_dy,
+            torch_tensor_x,
+            torch_tensor_offset,
+            torch_tensor_mask,
+        )
+    )
+
+    # The single weight-gradient op of this graph.
+    dw = graph.deform_conv_wgrad(
+        image=x,
+        offset=offset,
+        loss=dy,
+        mask=mask,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="deform_conv2d_wrw",
+    )
+    # Give the output its dimensions explicitly, then mark it as a graph output.
+    dw.set_dim(dw_dims).set_output(True)
+    graph.build(hipdnn_handle)
+
+    return (graph, dy, offset, x, mask, dw)
+
+
+if __name__ == "__main__":
+    # Example driver: builds the deformable-convolution weight-gradient graph on random
+    # channels_last inputs and executes it once on the "cuda" device.
+
+    # Input dimensions
+    n = 4  # Batch size
+    c = 64  # Number of input channels
+    h = 16  # Height
+    w = 16  # Width
+
+    # Filter dimensions
+    k = 64  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 0  # Height padding
+    pad_w = 0  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    # Output spatial size. Integer floor division keeps the computation exact instead
+    # of round-tripping through float as int(a / b + 1) did.
+    h_out = (h + 2 * pad_h - (dil_h * (r - 1) + 1)) // stride_h + 1
+    w_out = (w + 2 * pad_w - (dil_w * (s - 1) + 1)) // stride_w + 1
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    torch_tensor_dy = torch.rand(n, k, h_out, w_out, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    # Offset carries 2 * r * s channels and mask r * s channels per output position.
+    torch_tensor_offset = torch.rand(
+        n, 2 * r * s, h_out, w_out, dtype=torch_data_type, device="cuda"
+    ).to(memory_format=torch.channels_last)
+    torch_tensor_mask = torch.rand(n, r * s, h_out, w_out, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    # The builder returns its inputs as (dy, offset, x, mask), not in argument order.
+    (
+        graph,
+        hipdnn_tensor_dy,
+        hipdnn_tensor_offset,
+        hipdnn_tensor_x,
+        hipdnn_tensor_mask,
+        hipdnn_tensor_dw,
+    ) = build_deform_convolution_wrw_graph(
+        hipdnn_handle,
+        torch_tensor_dy,
+        torch_tensor_x,
+        torch_tensor_offset,
+        torch_tensor_mask,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        [k, c, r, s],
+        hipdnn_data_type,
+    )
+
+    # Weight-gradient buffer sized from the dimensions reported by the built graph.
+    torch_tensor_dw = torch.empty(
+        hipdnn_tensor_dw.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    # Map every graph tensor to the address of its backing torch buffer.
+    variant_pack = {
+        hipdnn_tensor_dy: torch_tensor_dy.data_ptr(),
+        hipdnn_tensor_offset: torch_tensor_offset.data_ptr(),
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_mask: torch_tensor_mask.data_ptr(),
+        hipdnn_tensor_dw: torch_tensor_dw.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("deform conv wrw graph execution complete.")
--- a/python/fusion/add_layernorm.py
+++ b/python/fusion/add_layernorm.py
+import hipdnn
+import torch
+
+
+def build_layernorm_fusion_graph(
+    hipdnn_handle,
+    torch_tensor_x1,
+    torch_tensor_x2,
+    torch_tensor_scale,
+    torch_tensor_bias,
+    torch_tensor_epsilon,
+    mode,
+    eps,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph that adds two tensors and applies layernorm to the sum.
+
+    Each torch tensor argument is handed to ``graph.tensor_like`` to create the
+    matching graph tensor. Both the sum and the layernorm result are marked as
+    graph outputs.
+
+    Args:
+        hipdnn_handle: Handle given to ``hipdnn.pygraph`` and ``graph.build``.
+        torch_tensor_x1: Source for the ``a`` operand of the add.
+        torch_tensor_x2: Source for the ``b`` operand of the add.
+        torch_tensor_scale: Source for the layernorm scale tensor.
+        torch_tensor_bias: Source for the layernorm bias tensor.
+        torch_tensor_epsilon: Source for the epsilon graph tensor; its value is then
+            set from ``eps`` via ``set_value``.
+        mode: Passed as the first argument to ``graph.layernorm``.
+        eps: Value assigned to the epsilon graph tensor.
+        hipdnn_data_type: ``io_data_type`` of the graph; the intermediate and compute
+            types are fixed to ``hipdnn.data_type.FLOAT``.
+
+    Returns:
+        A 7-tuple ``(graph, x1, x2, scale, bias, add_output, y)``. The epsilon tensor
+        and the mean / inverse-variance results of ``graph.layernorm`` are not returned.
+    """
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="layernorm_fusion_inference",
+    )
+
+    # Graph-side inputs, created in the same order as the parameters.
+    x1, x2, scale, bias, epsilon = (
+        graph.tensor_like(torch_tensor)
+        for torch_tensor in (
+            torch_tensor_x1,
+            torch_tensor_x2,
+            torch_tensor_scale,
+            torch_tensor_bias,
+            torch_tensor_epsilon,
+        )
+    )
+    # Epsilon gets its value here, so callers do not bind a buffer for it.
+    epsilon.set_value(eps)
+
+    # Element-wise add; its result is also exposed as an output.
+    add_output = graph.add(a=x1, b=x2, name="add")
+    add_output.set_output(True)
+
+    # Layernorm over the sum; only y is marked as an output.
+    y, _mean, _inv_var = graph.layernorm(
+        mode,
+        add_output,
+        scale,
+        bias,
+        epsilon,
+        float_type,
+        name="layernorm",
+    )
+    y.set_output(True)
+    graph.build(hipdnn_handle)
+
+    return (graph, x1, x2, scale, bias, add_output, y)
+
+
+if __name__ == "__main__":
+    # Example driver: builds the add + layernorm graph on random inputs and executes
+    # it once on the "cuda" device.
+
+    # Problem size
+    batch = 16  # Batch size
+    seq_len = 32  # Sequence length
+    embedding_dim = 64  # Feature size (also the size of scale and bias)
+    mode = hipdnn.norm_forward_phase.INFERENCE  # Mode
+    eps = 1e-5
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    # Both addends share one shape.
+    x_shape = (batch, seq_len, embedding_dim)
+    torch_tensor_x1 = torch.rand(x_shape, dtype=torch_data_type, device="cuda")
+    torch_tensor_x2 = torch.rand(x_shape, dtype=torch_data_type, device="cuda")
+    torch_tensor_scale = torch.rand(embedding_dim, dtype=torch_data_type, device="cuda")
+    torch_tensor_bias = torch.rand(embedding_dim, dtype=torch_data_type, device="cuda")
+    # Epsilon is created on the CPU; it is not bound in the variant pack below.
+    torch_tensor_epsilon = torch.full(
+        (1, 1, 1, 1), eps, dtype=torch.float32, requires_grad=False, device="cpu"
+    )
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x1,
+        hipdnn_tensor_x2,
+        hipdnn_tensor_scale,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add_output,
+        hipdnn_tensor_y,
+    ) = build_layernorm_fusion_graph(
+        hipdnn_handle,
+        torch_tensor_x1,
+        torch_tensor_x2,
+        torch_tensor_scale,
+        torch_tensor_bias,
+        torch_tensor_epsilon,
+        mode,
+        eps,
+        hipdnn_data_type,
+    )
+
+    # Output buffers sized from the dimensions reported by the built graph.
+    torch_tensor_add_output = torch.empty(
+        hipdnn_tensor_add_output.get_dim(), dtype=torch_data_type, device="cuda"
+    )
+    torch_tensor_y = torch.empty(hipdnn_tensor_y.get_dim(), dtype=torch_data_type, device="cuda")
+
+    # Pair every graph tensor with its backing torch buffer, then bind by address.
+    bindings = (
+        (hipdnn_tensor_x1, torch_tensor_x1),
+        (hipdnn_tensor_x2, torch_tensor_x2),
+        (hipdnn_tensor_scale, torch_tensor_scale),
+        (hipdnn_tensor_bias, torch_tensor_bias),
+        (hipdnn_tensor_add_output, torch_tensor_add_output),
+        (hipdnn_tensor_y, torch_tensor_y),
+    )
+    variant_pack = {graph_tensor: buffer.data_ptr() for graph_tensor, buffer in bindings}
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("add_layernorm graph execution complete.")