Initial

ca34d4d2 · yanjl1 · ca34d4d2 · ca34d4d2 · ca34d4d2 · ca34d4d2
Commit ca34d4d2 authored Jun 02, 2026 by yanjl1
20 changed files
--- a/python/concat_conv_fusion/concat_conv_bias_add.py
+++ b/python/concat_conv_fusion/concat_conv_bias_add.py
+import hipdnn
+import torch
+
+
+def build_concat_conv_bias_add_graph(
+    hipdnn_handle,
+    torch_tensor_x1,
+    torch_tensor_x2,
+    torch_tensor_w,
+    torch_tensor_bias,
+    torch_tensor_add,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+    concat_axis,
+):
+    """Build the fused concatenate -> conv_fprop -> bias add -> add graph.
+
+    Args:
+        hipdnn_handle: hipdnn handle the graph is created and built with.
+        torch_tensor_x1: Tensor given to ``tensor_like`` for the first concatenate input.
+        torch_tensor_x2: Tensor given to ``tensor_like`` for the second concatenate input.
+        torch_tensor_w: Tensor given to ``tensor_like`` for the convolution weight.
+        torch_tensor_bias: Tensor given to ``tensor_like`` for the bias operand.
+        torch_tensor_add: Tensor given to ``tensor_like`` for the final add operand.
+        padding: Forwarded to ``conv_fprop`` as ``padding``.
+        stride: Forwarded to ``conv_fprop`` as ``stride``.
+        dilation: Forwarded to ``conv_fprop`` as ``dilation``.
+        hipdnn_data_type: Used as the graph's ``io_data_type``.
+        concat_axis: Axis forwarded to ``concatenate``.
+
+    Returns:
+        The built graph followed by the hipdnn tensors for x1, x2, w, bias,
+        add and the output y, in that order.
+    """
+    # Intermediate results and computation both use FLOAT, whatever the I/O type is.
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="concat_conv_bias_add",
+    )
+
+    # Graph inputs, declared in argument order.
+    x1, x2, weight, bias, residual = [
+        graph.tensor_like(source)
+        for source in (
+            torch_tensor_x1,
+            torch_tensor_x2,
+            torch_tensor_w,
+            torch_tensor_bias,
+            torch_tensor_add,
+        )
+    ]
+
+    # concatenate -> conv -> + bias -> + add
+    concatenated = graph.concatenate(x=[x1, x2], axis=concat_axis, name="concatenate")
+    convolved = graph.conv_fprop(
+        image=concatenated,
+        weight=weight,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+    biased = graph.add(a=convolved, b=bias, name="bias")
+    y = graph.add(a=biased, b=residual, name="add")
+
+    # Only the final sum is marked as a graph output.
+    y.set_output(True)
+    graph.build(hipdnn_handle)
+
+    return (graph, x1, x2, weight, bias, residual, y)
+
+
+if __name__ == "__main__":
+    # Dimensions of each concatenate input (n, c, h, w) and of the filter (k, r, s).
+    n, c, h, w = 1, 32, 128, 128
+    k, r, s = 32, 3, 3
+
+    # Convolution parameters as [height, width] pairs.
+    padding = [1, 1]
+    stride = [1, 1]
+    dilation = [1, 1]
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+    concat_axis = 1
+
+    def rand_channels_last(*shape):
+        """Return a random CUDA tensor of ``shape`` converted to channels_last."""
+        sample = torch.rand(*shape, dtype=torch_data_type, device="cuda")
+        return sample.to(memory_format=torch.channels_last)
+
+    # The weight takes 2 * c input channels because both c-channel inputs are
+    # concatenated along axis 1 before the convolution.
+    torch_tensor_x1 = rand_channels_last(n, c, h, w)
+    torch_tensor_x2 = rand_channels_last(n, c, h, w)
+    torch_tensor_w = rand_channels_last(k, 2 * c, r, s)
+    torch_tensor_bias = rand_channels_last(1, k, 1, 1)
+    torch_tensor_add = rand_channels_last(n, k, h, w)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, x1_node, x2_node, w_node, bias_node, add_node, y_node = (
+        build_concat_conv_bias_add_graph(
+            hipdnn_handle,
+            torch_tensor_x1,
+            torch_tensor_x2,
+            torch_tensor_w,
+            torch_tensor_bias,
+            torch_tensor_add,
+            padding,
+            stride,
+            dilation,
+            hipdnn_data_type,
+            concat_axis,
+        )
+    )
+
+    # NOTE(review): the output buffer uses default strides (no memory_format),
+    # unlike the channels_last inputs - confirm this matches the layout hipdnn
+    # assigns to the output tensor.
+    torch_tensor_y = torch.empty(y_node.get_dim(), dtype=torch_data_type, device="cuda")
+
+    # Map every graph tensor to the device pointer of its backing torch buffer.
+    bindings = (
+        (x1_node, torch_tensor_x1),
+        (x2_node, torch_tensor_x2),
+        (w_node, torch_tensor_w),
+        (bias_node, torch_tensor_bias),
+        (add_node, torch_tensor_add),
+        (y_node, torch_tensor_y),
+    )
+    variant_pack = {node: buffer.data_ptr() for node, buffer in bindings}
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("Concat_conv_bias_add graph execution complete.")
--- a/python/concat_conv_fusion/concat_conv_bias_relu.py
+++ b/python/concat_conv_fusion/concat_conv_bias_relu.py
+import hipdnn
+import torch
+
+
+def build_concat_conv_bias_relu_graph(
+    hipdnn_handle,
+    torch_tensor_x1,
+    torch_tensor_x2,
+    torch_tensor_w,
+    torch_tensor_bias,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+    concat_axis,
+):
+    """Build the fused concatenate -> conv_fprop -> bias add -> relu graph.
+
+    Args:
+        hipdnn_handle: hipdnn handle the graph is created and built with.
+        torch_tensor_x1: Tensor given to ``tensor_like`` for the first concatenate input.
+        torch_tensor_x2: Tensor given to ``tensor_like`` for the second concatenate input.
+        torch_tensor_w: Tensor given to ``tensor_like`` for the convolution weight.
+        torch_tensor_bias: Tensor given to ``tensor_like`` for the bias operand.
+        padding: Forwarded to ``conv_fprop`` as ``padding``.
+        stride: Forwarded to ``conv_fprop`` as ``stride``.
+        dilation: Forwarded to ``conv_fprop`` as ``dilation``.
+        hipdnn_data_type: Used as the graph's ``io_data_type``.
+        concat_axis: Axis forwarded to ``concatenate``.
+
+    Returns:
+        The built graph followed by the hipdnn tensors for x1, x2, w, bias
+        and the output y, in that order.
+    """
+    # Intermediate results and computation both use FLOAT, whatever the I/O type is.
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="concat_conv_bias_relu",
+    )
+
+    # Graph inputs, declared in argument order.
+    x1, x2, weight, bias = [
+        graph.tensor_like(source)
+        for source in (torch_tensor_x1, torch_tensor_x2, torch_tensor_w, torch_tensor_bias)
+    ]
+
+    # concatenate -> conv -> + bias -> relu
+    concatenated = graph.concatenate(x=[x1, x2], axis=concat_axis, name="concatenate")
+    convolved = graph.conv_fprop(
+        image=concatenated,
+        weight=weight,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+    biased = graph.add(a=convolved, b=bias, name="bias")
+    y = graph.relu(input=biased, name="relu")
+
+    # Only the relu result is marked as a graph output.
+    y.set_output(True)
+    graph.build(hipdnn_handle)
+
+    return (graph, x1, x2, weight, bias, y)
+
+
+if __name__ == "__main__":
+    # Dimensions of each concatenate input (n, c, h, w) and of the filter (k, r, s).
+    n, c, h, w = 1, 32, 128, 128
+    k, r, s = 32, 2, 2
+
+    # Convolution parameters as [height, width] pairs.
+    padding = [1, 1]
+    stride = [1, 1]
+    dilation = [1, 1]
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+    concat_axis = 1
+
+    def rand_channels_last(*shape):
+        """Return a random CUDA tensor of ``shape`` converted to channels_last."""
+        sample = torch.rand(*shape, dtype=torch_data_type, device="cuda")
+        return sample.to(memory_format=torch.channels_last)
+
+    # The weight takes 2 * c input channels because both c-channel inputs are
+    # concatenated along axis 1 before the convolution.
+    torch_tensor_x1 = rand_channels_last(n, c, h, w)
+    torch_tensor_x2 = rand_channels_last(n, c, h, w)
+    torch_tensor_w = rand_channels_last(k, 2 * c, r, s)
+    torch_tensor_bias = rand_channels_last(1, k, 1, 1)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, x1_node, x2_node, w_node, bias_node, y_node = build_concat_conv_bias_relu_graph(
+        hipdnn_handle,
+        torch_tensor_x1,
+        torch_tensor_x2,
+        torch_tensor_w,
+        torch_tensor_bias,
+        padding,
+        stride,
+        dilation,
+        hipdnn_data_type,
+        concat_axis,
+    )
+
+    # NOTE(review): the output buffer uses default strides (no memory_format),
+    # unlike the channels_last inputs - confirm this matches the layout hipdnn
+    # assigns to the output tensor.
+    torch_tensor_y = torch.empty(y_node.get_dim(), dtype=torch_data_type, device="cuda")
+
+    # Map every graph tensor to the device pointer of its backing torch buffer.
+    bindings = (
+        (x1_node, torch_tensor_x1),
+        (x2_node, torch_tensor_x2),
+        (w_node, torch_tensor_w),
+        (bias_node, torch_tensor_bias),
+        (y_node, torch_tensor_y),
+    )
+    variant_pack = {node: buffer.data_ptr() for node, buffer in bindings}
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("Concat_conv_bias_relu graph execution complete.")
--- a/python/concat_conv_fusion/concat_conv_bias_relu_add.py
+++ b/python/concat_conv_fusion/concat_conv_bias_relu_add.py
+import hipdnn
+import torch
+
+
+def build_concat_conv_bias_relu_add_graph(
+    hipdnn_handle,
+    torch_tensor_x1,
+    torch_tensor_x2,
+    torch_tensor_w,
+    torch_tensor_bias,
+    torch_tensor_add,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+    concat_axis,
+):
+    """Build the fused concatenate -> conv_fprop -> bias add -> relu -> add graph.
+
+    Args:
+        hipdnn_handle: hipdnn handle the graph is created and built with.
+        torch_tensor_x1: Tensor given to ``tensor_like`` for the first concatenate input.
+        torch_tensor_x2: Tensor given to ``tensor_like`` for the second concatenate input.
+        torch_tensor_w: Tensor given to ``tensor_like`` for the convolution weight.
+        torch_tensor_bias: Tensor given to ``tensor_like`` for the bias operand.
+        torch_tensor_add: Tensor given to ``tensor_like`` for the final add operand.
+        padding: Forwarded to ``conv_fprop`` as ``padding``.
+        stride: Forwarded to ``conv_fprop`` as ``stride``.
+        dilation: Forwarded to ``conv_fprop`` as ``dilation``.
+        hipdnn_data_type: Used as the graph's ``io_data_type``.
+        concat_axis: Axis forwarded to ``concatenate``.
+
+    Returns:
+        The built graph followed by the hipdnn tensors for x1, x2, w, bias,
+        add and the output y, in that order.
+    """
+    # Intermediate results and computation both use FLOAT, whatever the I/O type is.
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="concat_conv_bias_relu_add",
+    )
+
+    # Graph inputs, declared in argument order.
+    x1, x2, weight, bias, residual = [
+        graph.tensor_like(source)
+        for source in (
+            torch_tensor_x1,
+            torch_tensor_x2,
+            torch_tensor_w,
+            torch_tensor_bias,
+            torch_tensor_add,
+        )
+    ]
+
+    # concatenate -> conv -> + bias -> relu -> + add
+    concatenated = graph.concatenate(x=[x1, x2], axis=concat_axis, name="concatenate")
+    convolved = graph.conv_fprop(
+        image=concatenated,
+        weight=weight,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+    biased = graph.add(a=convolved, b=bias, name="bias")
+    activated = graph.relu(input=biased, name="relu")
+    y = graph.add(a=activated, b=residual, name="add")
+
+    # Only the final sum is marked as a graph output.
+    y.set_output(True)
+    graph.build(hipdnn_handle)
+
+    return (graph, x1, x2, weight, bias, residual, y)
+
+
+if __name__ == "__main__":
+    # Dimensions of each concatenate input (n, c, h, w) and of the filter (k, r, s).
+    n, c, h, w = 1, 32, 128, 128
+    k, r, s = 32, 3, 3
+
+    # Convolution parameters as [height, width] pairs.
+    padding = [1, 1]
+    stride = [1, 1]
+    dilation = [1, 1]
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+    concat_axis = 1
+
+    def rand_channels_last(*shape):
+        """Return a random CUDA tensor of ``shape`` converted to channels_last."""
+        sample = torch.rand(*shape, dtype=torch_data_type, device="cuda")
+        return sample.to(memory_format=torch.channels_last)
+
+    # The weight takes 2 * c input channels because both c-channel inputs are
+    # concatenated along axis 1 before the convolution.
+    torch_tensor_x1 = rand_channels_last(n, c, h, w)
+    torch_tensor_x2 = rand_channels_last(n, c, h, w)
+    torch_tensor_w = rand_channels_last(k, 2 * c, r, s)
+    torch_tensor_bias = rand_channels_last(1, k, 1, 1)
+    torch_tensor_add = rand_channels_last(n, k, h, w)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, x1_node, x2_node, w_node, bias_node, add_node, y_node = (
+        build_concat_conv_bias_relu_add_graph(
+            hipdnn_handle,
+            torch_tensor_x1,
+            torch_tensor_x2,
+            torch_tensor_w,
+            torch_tensor_bias,
+            torch_tensor_add,
+            padding,
+            stride,
+            dilation,
+            hipdnn_data_type,
+            concat_axis,
+        )
+    )
+
+    # NOTE(review): the output buffer uses default strides (no memory_format),
+    # unlike the channels_last inputs - confirm this matches the layout hipdnn
+    # assigns to the output tensor.
+    torch_tensor_y = torch.empty(y_node.get_dim(), dtype=torch_data_type, device="cuda")
+
+    # Map every graph tensor to the device pointer of its backing torch buffer.
+    bindings = (
+        (x1_node, torch_tensor_x1),
+        (x2_node, torch_tensor_x2),
+        (w_node, torch_tensor_w),
+        (bias_node, torch_tensor_bias),
+        (add_node, torch_tensor_add),
+        (y_node, torch_tensor_y),
+    )
+    variant_pack = {node: buffer.data_ptr() for node, buffer in bindings}
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("Concat_conv_bias_relu_add graph execution complete.")
--- a/python/concatenate/concatenate.py
+++ b/python/concatenate/concatenate.py
+import hipdnn
+import torch
+
+
+def build_concatenate_graph(
+    hipdnn_handle, torch_tensor_x1, torch_tensor_x2, hipdnn_data_type, concat_axis=0
+):
+    """Build a hipdnn graph that concatenates two tensors along one axis.
+
+    Args:
+        hipdnn_handle: hipdnn handle the graph is created and built with.
+        torch_tensor_x1: Tensor given to ``tensor_like`` for the first input.
+        torch_tensor_x2: Tensor given to ``tensor_like`` for the second input.
+        hipdnn_data_type: Used as the graph's ``io_data_type``.
+        concat_axis: Axis forwarded to ``concatenate``. Defaults to 0, the
+            axis this builder previously hard-coded, so existing callers are
+            unaffected.
+
+    Returns:
+        Tuple ``(graph, hipdnn_tensor_x1, hipdnn_tensor_x2, hipdnn_tensor_y)``
+        holding the built graph, its two inputs and its output.
+    """
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="concatenate",
+    )
+
+    # Create hipdnn tensors
+    hipdnn_tensor_x1 = graph.tensor_like(torch_tensor_x1)
+    hipdnn_tensor_x2 = graph.tensor_like(torch_tensor_x2)
+
+    # Create concatenate op along the caller-selected axis
+    hipdnn_tensor_y = graph.concatenate(
+        x=[hipdnn_tensor_x1, hipdnn_tensor_x2], axis=concat_axis, name="concatenate"
+    )
+    hipdnn_tensor_y.set_output(True)
+    graph.build(hipdnn_handle)
+
+    return (graph, hipdnn_tensor_x1, hipdnn_tensor_x2, hipdnn_tensor_y)
+
+
+if __name__ == "__main__":
+    # Both inputs share the same (batch, seq_len, embedding_dim) shape.
+    input_shape = (2, 1024, 768)
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    torch_tensor_x1 = torch.rand(*input_shape, dtype=torch_data_type, device="cuda")
+    torch_tensor_x2 = torch.rand(*input_shape, dtype=torch_data_type, device="cuda")
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, x1_node, x2_node, y_node = build_concatenate_graph(
+        hipdnn_handle, torch_tensor_x1, torch_tensor_x2, hipdnn_data_type
+    )
+
+    # Output buffer sized from the dimensions reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(y_node.get_dim(), dtype=torch_data_type, device="cuda")
+
+    # Map every graph tensor to the device pointer of its backing torch buffer.
+    bindings = (
+        (x1_node, torch_tensor_x1),
+        (x2_node, torch_tensor_x2),
+        (y_node, torch_tensor_y),
+    )
+    variant_pack = {node: buffer.data_ptr() for node, buffer in bindings}
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("Concatenate graph execution complete.")
--- a/python/conv_bn_fusion/conv_genstats.py
+++ b/python/conv_bn_fusion/conv_genstats.py
+import hipdnn
+import torch
+
+
+def build_conv_genstats_graph(
+    hipdnn_handle, torch_tensor_x, torch_tensor_w, padding, stride, dilation, hipdnn_data_type
+):
+    """Build a hipdnn graph running conv_fprop followed by genstats on its result.
+
+    Args:
+        hipdnn_handle: hipdnn handle the graph is created and built with.
+        torch_tensor_x: Tensor given to ``tensor_like`` for the convolution image.
+        torch_tensor_w: Tensor given to ``tensor_like`` for the convolution weight.
+        padding: Forwarded to ``conv_fprop`` as ``padding``.
+        stride: Forwarded to ``conv_fprop`` as ``stride``.
+        dilation: Forwarded to ``conv_fprop`` as ``dilation``.
+        hipdnn_data_type: Used as the graph's ``io_data_type``.
+
+    Returns:
+        The built graph followed by the hipdnn tensors for x, w, the
+        convolution output y and the two genstats outputs (sum, sq_sum).
+    """
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="conv_genstats",
+    )
+
+    x = graph.tensor_like(torch_tensor_x)
+    weight = graph.tensor_like(torch_tensor_w)
+
+    # The convolution result is itself a graph output, not just an intermediate.
+    y = graph.conv_fprop(
+        image=x,
+        weight=weight,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv",
+    )
+    y.set_output(True)
+
+    # genstats yields two tensors; both are exposed as outputs.
+    stats = graph.genstats(y, float_type, name="genstats")
+    total, sq_total = stats
+    total.set_output(True)
+    sq_total.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (graph, x, weight, y, total, sq_total)
+
+
+if __name__ == "__main__":
+    # Input dimensions (n, c, h, w) and filter dimensions (k, r, s).
+    n, c, h, w = 4, 64, 16, 16
+    k, r, s = 32, 3, 3
+
+    # Convolution parameters as [height, width] pairs.
+    padding = [1, 1]
+    stride = [1, 1]
+    dilation = [1, 1]
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    def rand_channels_last(*shape):
+        """Return a random CUDA tensor of ``shape`` converted to channels_last."""
+        sample = torch.rand(*shape, dtype=torch_data_type, device="cuda")
+        return sample.to(memory_format=torch.channels_last)
+
+    torch_tensor_x = rand_channels_last(n, c, h, w)
+    torch_tensor_w = rand_channels_last(k, c, r, s)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, x_node, w_node, y_node, sum_node, sq_sum_node = build_conv_genstats_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        padding,
+        stride,
+        dilation,
+        hipdnn_data_type,
+    )
+
+    # The convolution output is allocated channels_last; the two statistics
+    # buffers use default strides.
+    torch_tensor_y = torch.empty(
+        y_node.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    torch_tensor_sum = torch.empty(sum_node.get_dim(), dtype=torch_data_type, device="cuda")
+    torch_tensor_sq_sum = torch.empty(
+        sq_sum_node.get_dim(), dtype=torch_data_type, device="cuda"
+    )
+
+    # Map every graph tensor to the device pointer of its backing torch buffer.
+    bindings = (
+        (x_node, torch_tensor_x),
+        (w_node, torch_tensor_w),
+        (y_node, torch_tensor_y),
+        (sum_node, torch_tensor_sum),
+        (sq_sum_node, torch_tensor_sq_sum),
+    )
+    variant_pack = {node: buffer.data_ptr() for node, buffer in bindings}
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_genstats graph execution complete.")
--- a/python/conv_bn_fusion/mul_mul_add_add.py
+++ b/python/conv_bn_fusion/mul_mul_add_add.py
+import hipdnn
+import torch
+
+
+def build_mul_mul_add_add_graph(
+    hipdnn_handle,
+    torch_tensor_a,
+    torch_tensor_x,
+    torch_tensor_b,
+    torch_tensor_y,
+    torch_tensor_bias,
+    hipdnn_data_type,
+):
+    """Build a hipdnn graph computing ``(x * a + y * b) + bias``.
+
+    Args:
+        hipdnn_handle: hipdnn handle the graph is created and built with.
+        torch_tensor_a: Tensor given to ``tensor_like`` for the multiplier of x.
+        torch_tensor_x: Tensor given to ``tensor_like`` for the first multiplicand.
+        torch_tensor_b: Tensor given to ``tensor_like`` for the multiplier of y.
+        torch_tensor_y: Tensor given to ``tensor_like`` for the second multiplicand.
+        torch_tensor_bias: Tensor given to ``tensor_like`` for the bias operand.
+        hipdnn_data_type: Used as the graph's ``io_data_type``.
+
+    Returns:
+        The built graph followed by the hipdnn tensors for a, x, b, y, bias
+        and the output z, in that order.
+    """
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="mul_mul_add_add",
+    )
+
+    # Graph inputs, declared in argument order.
+    a, x, b, y, bias = [
+        graph.tensor_like(source)
+        for source in (
+            torch_tensor_a,
+            torch_tensor_x,
+            torch_tensor_b,
+            torch_tensor_y,
+            torch_tensor_bias,
+        )
+    ]
+
+    # Two products, summed, then offset by the bias.
+    x_times_a = graph.mul(a=x, b=a, name="mul0")
+    y_times_b = graph.mul(a=y, b=b, name="mul1")
+    products_sum = graph.add(a=x_times_a, b=y_times_b, name="add0")
+    z = graph.add(a=products_sum, b=bias, name="add1")
+    z.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (graph, a, x, b, y, bias, z)
+
+
+if __name__ == "__main__":
+    # Dimensions of the full-size operands (n, c, h, w).
+    n, c, h, w = 1, 4, 32, 32
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    def rand_channels_last(*shape):
+        """Return a random CUDA tensor of ``shape`` converted to channels_last."""
+        sample = torch.rand(*shape, dtype=torch_data_type, device="cuda")
+        return sample.to(memory_format=torch.channels_last)
+
+    # a, b and bias are (1, c, 1, 1); x and y are (n, c, h, w).
+    torch_tensor_a = rand_channels_last(1, c, 1, 1)
+    torch_tensor_x = rand_channels_last(n, c, h, w)
+    torch_tensor_b = rand_channels_last(1, c, 1, 1)
+    torch_tensor_y = rand_channels_last(n, c, h, w)
+    torch_tensor_bias = rand_channels_last(1, c, 1, 1)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, a_node, x_node, b_node, y_node, bias_node, z_node = build_mul_mul_add_add_graph(
+        hipdnn_handle,
+        torch_tensor_a,
+        torch_tensor_x,
+        torch_tensor_b,
+        torch_tensor_y,
+        torch_tensor_bias,
+        hipdnn_data_type,
+    )
+
+    # Output buffer sized from the graph's output tensor, converted to channels_last.
+    z_buffer = torch.empty(z_node.get_dim(), dtype=torch_data_type, device="cuda")
+    torch_tensor_z = z_buffer.to(memory_format=torch.channels_last)
+
+    # Map every graph tensor to the device pointer of its backing torch buffer.
+    bindings = (
+        (a_node, torch_tensor_a),
+        (x_node, torch_tensor_x),
+        (b_node, torch_tensor_b),
+        (y_node, torch_tensor_y),
+        (bias_node, torch_tensor_bias),
+        (z_node, torch_tensor_z),
+    )
+    variant_pack = {node: buffer.data_ptr() for node, buffer in bindings}
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("mul_mul_add_add graph execution complete.")
--- a/python/conv_bn_fusion/scale_bias.py
+++ b/python/conv_bn_fusion/scale_bias.py
+import hipdnn
+import torch
+
+
+def build_scale_bias_graph(
+    hipdnn_handle, torch_tensor_x, torch_tensor_scale, torch_tensor_bias, hipdnn_data_type
+):
+    """Build a hipdnn graph computing ``x * scale + bias``.
+
+    Args:
+        hipdnn_handle: hipdnn handle the graph is created and built with.
+        torch_tensor_x: Tensor given to ``tensor_like`` for the input.
+        torch_tensor_scale: Tensor given to ``tensor_like`` for the scale operand.
+        torch_tensor_bias: Tensor given to ``tensor_like`` for the bias operand.
+        hipdnn_data_type: Used as the graph's ``io_data_type``.
+
+    Returns:
+        The built graph followed by the hipdnn tensors for x, scale, bias
+        and the output y, in that order.
+    """
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="scale_bias",
+    )
+
+    # Graph inputs, declared in argument order.
+    x, scale, bias = [
+        graph.tensor_like(source)
+        for source in (torch_tensor_x, torch_tensor_scale, torch_tensor_bias)
+    ]
+
+    # Multiply first, then add the bias; only the sum is a graph output.
+    scaled = graph.mul(a=x, b=scale, name="scale")
+    y = graph.add(a=scaled, b=bias, name="bias")
+    y.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (graph, x, scale, bias, y)
+
+
+if __name__ == "__main__":
+    # Dimensions of the input (n, c, h, w).
+    n, c, h, w = 1, 4, 32, 32
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    def rand_channels_last(*shape):
+        """Return a random CUDA tensor of ``shape`` converted to channels_last."""
+        sample = torch.rand(*shape, dtype=torch_data_type, device="cuda")
+        return sample.to(memory_format=torch.channels_last)
+
+    # scale and bias are (1, c, 1, 1); x is (n, c, h, w).
+    torch_tensor_x = rand_channels_last(n, c, h, w)
+    torch_tensor_scale = rand_channels_last(1, c, 1, 1)
+    torch_tensor_bias = rand_channels_last(1, c, 1, 1)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, x_node, scale_node, bias_node, y_node = build_scale_bias_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_scale,
+        torch_tensor_bias,
+        hipdnn_data_type,
+    )
+
+    # Output buffer sized from the graph's output tensor, converted to channels_last.
+    y_buffer = torch.empty(y_node.get_dim(), dtype=torch_data_type, device="cuda")
+    torch_tensor_y = y_buffer.to(memory_format=torch.channels_last)
+
+    # Map every graph tensor to the device pointer of its backing torch buffer.
+    bindings = (
+        (x_node, torch_tensor_x),
+        (scale_node, torch_tensor_scale),
+        (bias_node, torch_tensor_bias),
+        (y_node, torch_tensor_y),
+    )
+    variant_pack = {node: buffer.data_ptr() for node, buffer in bindings}
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("scale_bias graph execution complete.")
--- a/python/conv_bn_fusion/scale_bias_relu_conv_genstats.py
+++ b/python/conv_bn_fusion/scale_bias_relu_conv_genstats.py
+import hipdnn
+import torch
+
+
+def build_scale_bias_relu_conv_genstats_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_scale,
+    torch_tensor_bias,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    """Build the fused scale -> bias -> relu -> conv_fprop -> genstats graph.
+
+    Args:
+        hipdnn_handle: hipdnn handle the graph is created and built with.
+        torch_tensor_x: Tensor given to ``tensor_like`` for the input.
+        torch_tensor_w: Tensor given to ``tensor_like`` for the convolution weight.
+        torch_tensor_scale: Tensor given to ``tensor_like`` for the scale operand.
+        torch_tensor_bias: Tensor given to ``tensor_like`` for the bias operand.
+        padding: Forwarded to ``conv_fprop`` as ``padding``.
+        stride: Forwarded to ``conv_fprop`` as ``stride``.
+        dilation: Forwarded to ``conv_fprop`` as ``dilation``.
+        hipdnn_data_type: Used as the graph's ``io_data_type``.
+
+    Returns:
+        The built graph followed by the hipdnn tensors for x, w, scale, bias,
+        the convolution output and the two genstats outputs (sum, sq_sum).
+    """
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="scale_bias_relu_conv_genstats",
+    )
+
+    # Graph inputs, declared in argument order.
+    x, weight, scale, bias = [
+        graph.tensor_like(source)
+        for source in (torch_tensor_x, torch_tensor_w, torch_tensor_scale, torch_tensor_bias)
+    ]
+
+    # x * scale + bias, then relu, feeds the convolution.
+    scaled = graph.mul(a=x, b=scale, name="scale")
+    biased = graph.add(a=scaled, b=bias, name="bias")
+    activated = graph.relu(input=biased, name="relu")
+    conv_out = graph.conv_fprop(
+        image=activated,
+        weight=weight,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv",
+    )
+    # The convolution result is itself a graph output, not just an intermediate.
+    conv_out.set_output(True)
+
+    # genstats yields two tensors; both are exposed as outputs.
+    stats = graph.genstats(conv_out, float_type, name="genstats")
+    total, sq_total = stats
+    total.set_output(True)
+    sq_total.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (graph, x, weight, scale, bias, conv_out, total, sq_total)
+
+
+if __name__ == "__main__":
+    # Input dimensions (n, c, h, w) and filter dimensions (k, r, s).
+    n, c, h, w = 4, 64, 16, 16
+    k, r, s = 32, 3, 3
+
+    # Convolution parameters as [height, width] pairs.
+    padding = [1, 1]
+    stride = [1, 1]
+    dilation = [1, 1]
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    def rand_channels_last(*shape):
+        """Return a random CUDA tensor of ``shape`` converted to channels_last."""
+        sample = torch.rand(*shape, dtype=torch_data_type, device="cuda")
+        return sample.to(memory_format=torch.channels_last)
+
+    # scale and bias are (1, c, 1, 1), matching the channel count of x.
+    torch_tensor_x = rand_channels_last(n, c, h, w)
+    torch_tensor_w = rand_channels_last(k, c, r, s)
+    torch_tensor_scale = rand_channels_last(1, c, 1, 1)
+    torch_tensor_bias = rand_channels_last(1, c, 1, 1)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        x_node,
+        w_node,
+        scale_node,
+        bias_node,
+        conv_out_node,
+        sum_out_node,
+        sq_sum_out_node,
+    ) = build_scale_bias_relu_conv_genstats_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_scale,
+        torch_tensor_bias,
+        padding,
+        stride,
+        dilation,
+        hipdnn_data_type,
+    )
+
+    # The convolution output is allocated channels_last; the two statistics
+    # buffers use default strides.
+    torch_tensor_conv_out = torch.empty(
+        conv_out_node.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    torch_tensor_sum_out = torch.empty(
+        sum_out_node.get_dim(), dtype=torch_data_type, device="cuda"
+    )
+    torch_tensor_sq_sum_out = torch.empty(
+        sq_sum_out_node.get_dim(), dtype=torch_data_type, device="cuda"
+    )
+
+    # Map every graph tensor to the device pointer of its backing torch buffer.
+    bindings = (
+        (x_node, torch_tensor_x),
+        (w_node, torch_tensor_w),
+        (scale_node, torch_tensor_scale),
+        (bias_node, torch_tensor_bias),
+        (conv_out_node, torch_tensor_conv_out),
+        (sum_out_node, torch_tensor_sum_out),
+        (sq_sum_out_node, torch_tensor_sq_sum_out),
+    )
+    variant_pack = {node: buffer.data_ptr() for node, buffer in bindings}
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("scale_bias_relu_conv_genstats graph execution complete.")
--- a/python/conv_bn_fusion/scale_bias_relu_convwrw.py
+++ b/python/conv_bn_fusion/scale_bias_relu_convwrw.py
+import hipdnn
+import torch
+
+
+def build_scale_bias_relu_convwrw_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_dy,
+    torch_tensor_scale,
+    torch_tensor_bias,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    """Build the fused scale -> bias -> relu -> conv_wgrad graph.
+
+    Args:
+        hipdnn_handle: hipdnn handle the graph is created and built with.
+        torch_tensor_x: Tensor given to ``tensor_like`` for the input.
+        torch_tensor_dy: Tensor given to ``tensor_like`` for the ``loss`` operand
+            of ``conv_wgrad``.
+        torch_tensor_scale: Tensor given to ``tensor_like`` for the scale operand.
+        torch_tensor_bias: Tensor given to ``tensor_like`` for the bias operand.
+        padding: Forwarded to ``conv_wgrad`` as ``padding``.
+        stride: Forwarded to ``conv_wgrad`` as ``stride``.
+        dilation: Forwarded to ``conv_wgrad`` as ``dilation``.
+        hipdnn_data_type: Used as the graph's ``io_data_type``.
+
+    Returns:
+        The built graph followed by the hipdnn tensors for x, dy, scale, bias
+        and the output dw, in that order.
+    """
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="scale_bias_relu_convwrw",
+    )
+
+    # Graph inputs, declared in argument order.
+    x, dy, scale, bias = [
+        graph.tensor_like(source)
+        for source in (torch_tensor_x, torch_tensor_dy, torch_tensor_scale, torch_tensor_bias)
+    ]
+
+    # x * scale + bias, then relu, is the image operand of conv_wgrad.
+    scaled = graph.mul(a=x, b=scale, name="scale")
+    biased = graph.add(a=scaled, b=bias, name="bias")
+    activated = graph.relu(input=biased, name="relu")
+    dw = graph.conv_wgrad(
+        image=activated,
+        loss=dy,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="convwrw",
+    )
+    dw.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (graph, x, dy, scale, bias, dw)
+
+
+if __name__ == "__main__":
+    # Input dimensions (n, c, h, w) and the channel count k of dy.
+    n, c, h, w = 1, 32, 128, 128
+    k = 32
+
+    # Convolution parameters as [height, width] pairs.
+    padding = [1, 1]
+    stride = [1, 1]
+    dilation = [1, 1]
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    def rand_channels_last(*shape):
+        """Return a random CUDA tensor of ``shape`` converted to channels_last."""
+        sample = torch.rand(*shape, dtype=torch_data_type, device="cuda")
+        return sample.to(memory_format=torch.channels_last)
+
+    # scale and bias are (1, c, 1, 1), matching the channel count of x.
+    torch_tensor_x = rand_channels_last(n, c, h, w)
+    torch_tensor_dy = rand_channels_last(n, k, h, w)
+    torch_tensor_scale = rand_channels_last(1, c, 1, 1)
+    torch_tensor_bias = rand_channels_last(1, c, 1, 1)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, x_node, dy_node, scale_node, bias_node, dw_node = (
+        build_scale_bias_relu_convwrw_graph(
+            hipdnn_handle,
+            torch_tensor_x,
+            torch_tensor_dy,
+            torch_tensor_scale,
+            torch_tensor_bias,
+            padding,
+            stride,
+            dilation,
+            hipdnn_data_type,
+        )
+    )
+
+    # NOTE(review): the dw buffer uses default strides (no memory_format),
+    # unlike the channels_last inputs - confirm this matches the layout hipdnn
+    # assigns to the output tensor.
+    torch_tensor_dw = torch.empty(dw_node.get_dim(), dtype=torch_data_type, device="cuda")
+
+    # Map every graph tensor to the device pointer of its backing torch buffer.
+    bindings = (
+        (x_node, torch_tensor_x),
+        (dy_node, torch_tensor_dy),
+        (scale_node, torch_tensor_scale),
+        (bias_node, torch_tensor_bias),
+        (dw_node, torch_tensor_dw),
+    )
+    variant_pack = {node: buffer.data_ptr() for node, buffer in bindings}
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("scale_bias_relu_convwrw graph execution complete.")
--- a/python/conv_bn_fusion/sub_mul_mul_add_convbwd_relubwd_bnwrw.py
+++ b/python/conv_bn_fusion/sub_mul_mul_add_convbwd_relubwd_bnwrw.py
+import hipdnn
+import torch
+
+
+def build_sub_mul_mul_add_convbwd_relubwd_bnwrw_graph(
+    hipdnn_handle,
+    torch_tensor_x_bn,
+    torch_tensor_mean_bn,
+    torch_tensor_inv_std_bn,
+    torch_tensor_scale_bn,
+    torch_tensor_bias_bn,
+    torch_tensor_dy,
+    torch_tensor_filter,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="sub_mul_mul_add_convbwd_relubwd_bnwrw",
+    )
+
+    hipdnn_tensor_x_bn = graph.tensor_like(torch_tensor_x_bn)
+    hipdnn_tensor_mean_bn = graph.tensor_like(torch_tensor_mean_bn)
+    hipdnn_tensor_inv_std_bn = graph.tensor_like(torch_tensor_inv_std_bn)
+    hipdnn_tensor_scale_bn = graph.tensor_like(torch_tensor_scale_bn)
+    hipdnn_tensor_bias_bn = graph.tensor_like(torch_tensor_bias_bn)
+    hipdnn_tensor_dy = graph.tensor_like(torch_tensor_dy)
+    hipdnn_tensor_filter = graph.tensor_like(torch_tensor_filter)
+
+    hipdnn_tensor_sub_out = graph.sub(a=hipdnn_tensor_x_bn, b=hipdnn_tensor_mean_bn, name="sub")
+    hipdnn_tensor_mul_out0 = graph.mul(
+        a=hipdnn_tensor_sub_out, b=hipdnn_tensor_inv_std_bn, name="mul0"
+    )
+    hipdnn_tensor_mul_out1 = graph.mul(
+        a=hipdnn_tensor_mul_out0, b=hipdnn_tensor_scale_bn, name="mul1"
+    )
+    hipdnn_tensor_add_out = graph.add(a=hipdnn_tensor_mul_out1, b=hipdnn_tensor_bias_bn, name="add")
+    hipdnn_tensor_dx = graph.conv_dgrad(
+        loss=hipdnn_tensor_dy,
+        filter=hipdnn_tensor_filter,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv_dgrad",
+    )
+    hipdnn_tensor_drelu = graph.relu_backward(
+        loss=hipdnn_tensor_dx, input=hipdnn_tensor_add_out, name="relu_backward"
+    )
+    hipdnn_tensor_drelu.set_output(True)
+    (
+        hipdnn_tensor_dscale,
+        hipdnn_tensor_dbias,
+        hipdnn_tensor_eq_scale_dy,
+        hipdnn_tensor_eq_scale_x,
+        hipdnn_tensor_eq_bias,
+    ) = graph.dbn_weight(
+        dy=hipdnn_tensor_drelu,
+        input=hipdnn_tensor_x_bn,
+        mean=hipdnn_tensor_mean_bn,
+        inv_variance=hipdnn_tensor_inv_std_bn,
+        scale=hipdnn_tensor_scale_bn,
+        name="bn_backward_weight",
+    )
+    hipdnn_tensor_dscale.set_output(True)
+    hipdnn_tensor_dbias.set_output(True)
+    hipdnn_tensor_eq_scale_dy.set_output(True)
+    hipdnn_tensor_eq_scale_x.set_output(True)
+    hipdnn_tensor_eq_bias.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (
+        graph,
+        hipdnn_tensor_x_bn,
+        hipdnn_tensor_mean_bn,
+        hipdnn_tensor_inv_std_bn,
+        hipdnn_tensor_scale_bn,
+        hipdnn_tensor_bias_bn,
+        hipdnn_tensor_dy,
+        hipdnn_tensor_filter,
+        hipdnn_tensor_drelu,
+        hipdnn_tensor_dscale,
+        hipdnn_tensor_dbias,
+        hipdnn_tensor_eq_scale_dy,
+        hipdnn_tensor_eq_scale_x,
+        hipdnn_tensor_eq_bias,
+    )
+
+
+if __name__ == "__main__":
+    n = 4
+    c = 64
+    h = 16
+    w = 16
+    k = 32
+    r = 3
+    s = 3
+
+    stride_h = 1
+    stride_w = 1
+    pad_h = 1
+    pad_w = 1
+    dil_h = 1
+    dil_w = 1
+
+    hipdnn_data_type = hipdnn.data_type.FLOAT
+    torch_data_type = torch.float32
+
+    torch_tensor_x_bn = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_mean_bn = torch.rand(1, c, 1, 1, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_inv_std_bn = torch.rand(1, c, 1, 1, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_scale_bn = torch.rand(1, c, 1, 1, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_bias_bn = torch.rand(1, c, 1, 1, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_dy = torch.rand(n, k, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_filter = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x_bn,
+        hipdnn_tensor_mean_bn,
+        hipdnn_tensor_inv_std_bn,
+        hipdnn_tensor_scale_bn,
+        hipdnn_tensor_bias_bn,
+        hipdnn_tensor_dy,
+        hipdnn_tensor_filter,
+        hipdnn_tensor_drelu,
+        hipdnn_tensor_dscale,
+        hipdnn_tensor_dbias,
+        hipdnn_tensor_eq_scale_dy,
+        hipdnn_tensor_eq_scale_x,
+        hipdnn_tensor_eq_bias,
+    ) = build_sub_mul_mul_add_convbwd_relubwd_bnwrw_graph(
+        hipdnn_handle,
+        torch_tensor_x_bn,
+        torch_tensor_mean_bn,
+        torch_tensor_inv_std_bn,
+        torch_tensor_scale_bn,
+        torch_tensor_bias_bn,
+        torch_tensor_dy,
+        torch_tensor_filter,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+
+    torch_tensor_drelu = torch.empty(
+        hipdnn_tensor_drelu.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    torch_tensor_dscale = torch.empty(
+        hipdnn_tensor_dscale.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    torch_tensor_dbias = torch.empty(
+        hipdnn_tensor_dbias.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    torch_tensor_eq_scale_dy = torch.empty(
+        hipdnn_tensor_eq_scale_dy.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    torch_tensor_eq_scale_x = torch.empty(
+        hipdnn_tensor_eq_scale_x.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    torch_tensor_eq_bias = torch.empty(
+        hipdnn_tensor_eq_bias.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+
+    variant_pack = {
+        hipdnn_tensor_x_bn: torch_tensor_x_bn.data_ptr(),
+        hipdnn_tensor_mean_bn: torch_tensor_mean_bn.data_ptr(),
+        hipdnn_tensor_inv_std_bn: torch_tensor_inv_std_bn.data_ptr(),
+        hipdnn_tensor_scale_bn: torch_tensor_scale_bn.data_ptr(),
+        hipdnn_tensor_bias_bn: torch_tensor_bias_bn.data_ptr(),
+        hipdnn_tensor_dy: torch_tensor_dy.data_ptr(),
+        hipdnn_tensor_filter: torch_tensor_filter.data_ptr(),
+        hipdnn_tensor_drelu: torch_tensor_drelu.data_ptr(),
+        hipdnn_tensor_dscale: torch_tensor_dscale.data_ptr(),
+        hipdnn_tensor_dbias: torch_tensor_dbias.data_ptr(),
+        hipdnn_tensor_eq_scale_dy: torch_tensor_eq_scale_dy.data_ptr(),
+        hipdnn_tensor_eq_scale_x: torch_tensor_eq_scale_x.data_ptr(),
+        hipdnn_tensor_eq_bias: torch_tensor_eq_bias.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("sub_mul_mul_add_convbwd_relubwd_bnwrw graph execution complete.")
--- a/python/conv_depthtospace_fusion/conv_bias_add_dts.py
+++ b/python/conv_depthtospace_fusion/conv_bias_add_dts.py
+import hipdnn
+import torch
+
+
+def build_conv_bias_add_dts_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    torch_tensor_add,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+    depth_to_space_mode,
+    block_size,
+):
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="conv_bias_add_dts",
+    )
+
+    # Create hipdnn tensors
+    hipdnn_tensor_x = graph.tensor_like(torch_tensor_x)
+    hipdnn_tensor_w = graph.tensor_like(torch_tensor_w)
+    hipdnn_tensor_bias = graph.tensor_like(torch_tensor_bias)
+    hipdnn_tensor_add = graph.tensor_like(torch_tensor_add)
+
+    # Create conv
+    hipdnn_tensor_conv_output = graph.conv_fprop(
+        image=hipdnn_tensor_x,
+        weight=hipdnn_tensor_w,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+
+    # Create bias
+    hipdnn_tensor_bias_output = graph.add(
+        a=hipdnn_tensor_conv_output, b=hipdnn_tensor_bias, name="bias"
+    )
+
+    # Create add
+    hipdnn_tensor_add_output = graph.add(
+        a=hipdnn_tensor_bias_output, b=hipdnn_tensor_add, name="add"
+    )
+
+    n = torch_tensor_x.shape[0]
+    H = torch_tensor_x.shape[2]
+    W = torch_tensor_x.shape[3]
+
+    k = torch_tensor_w.shape[0]
+    r = torch_tensor_w.shape[2]
+    s = torch_tensor_w.shape[3]
+
+    outH = int((H + 2 * padding[0] - (dilation[0] * (r - 1) + 1)) / stride[0]) + 1
+    outW = int((W + 2 * padding[1] - (dilation[1] * (s - 1) + 1)) / stride[1]) + 1
+
+    if depth_to_space_mode == "CRD":
+        first_reshape_dim = [
+            n,
+            int(k // (block_size * block_size)),
+            block_size,
+            block_size,
+            outH,
+            outW,
+        ]
+        permutation = [0, 1, 4, 2, 5, 3]
+    else:
+        first_reshape_dim = [n, block_size, block_size, k // (block_size * block_size), outH, outW]
+        permutation = [0, 3, 4, 1, 5, 2]
+    second_reshape_dim = [
+        n,
+        int(k // (block_size * block_size)),
+        block_size * outH,
+        block_size * outW,
+    ]
+
+    # Create first reshape
+    hipdnn_tensor_first_reshape_output = graph.reshape(
+        input=hipdnn_tensor_add_output, name="first_reshape"
+    )
+    hipdnn_tensor_first_reshape_output.set_dim(first_reshape_dim)
+
+    # Create transpose
+    hipdnn_tensor_transpose_output = graph.transpose(
+        input=hipdnn_tensor_first_reshape_output,
+        permutation=permutation,
+        name="transpose",
+    )
+
+    # Create second reshape
+    hipdnn_tensor_second_reshape_output = graph.reshape(
+        input=hipdnn_tensor_transpose_output, name="second_reshape"
+    )
+    hipdnn_tensor_second_reshape_output.set_dim(second_reshape_dim).set_stride(
+        [k * outH * outW, 1, k // block_size * outW, k // (block_size * block_size)]
+    )
+    hipdnn_tensor_second_reshape_output.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_second_reshape_output,
+    )
+
+
+if __name__ == "__main__":
+    # Input dimensions
+    n = 1  # Batch size
+    c = 128  # Number of input channels
+    h = 270  # Height
+    w = 480  # Width
+
+    # Filter dimensions
+    k = 128  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 1  # Height padding
+    pad_w = 1  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+    block_size = 2
+    depth_to_sacpe_mode = "CRD"
+
+    outH = int((h + 2 * pad_h - (dil_h * (r - 1) + 1)) / stride_h) + 1
+    outW = int((w + 2 * pad_w - (dil_w * (s - 1) + 1)) / stride_w) + 1
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_w = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_add = torch.rand(
+        n,
+        k,
+        outH,
+        outW,
+        dtype=torch_data_type,
+        device="cuda",
+    ).to(memory_format=torch.channels_last)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_y,
+    ) = build_conv_bias_add_dts_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_bias,
+        torch_tensor_add,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+        depth_to_sacpe_mode,
+        block_size,
+    )
+
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_add: torch_tensor_add.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_bias_add_dts graph execution complete.")
--- a/python/conv_depthtospace_fusion/conv_bias_dts.py
+++ b/python/conv_depthtospace_fusion/conv_bias_dts.py
+import hipdnn
+import torch
+
+
+def build_conv_bias_dts_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+    depth_to_space_mode,
+    block_size,
+):
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="conv_bias_dts",
+    )
+
+    # Create hipdnn tensors
+    hipdnn_tensor_x = graph.tensor_like(torch_tensor_x)
+    hipdnn_tensor_w = graph.tensor_like(torch_tensor_w)
+    hipdnn_tensor_bias = graph.tensor_like(torch_tensor_bias)
+
+    # Create conv
+    hipdnn_tensor_conv_output = graph.conv_fprop(
+        image=hipdnn_tensor_x,
+        weight=hipdnn_tensor_w,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+
+    # Create bias
+    hipdnn_tensor_bias_output = graph.add(
+        a=hipdnn_tensor_conv_output, b=hipdnn_tensor_bias, name="bias"
+    )
+
+    n = torch_tensor_x.shape[0]
+    H = torch_tensor_x.shape[2]
+    W = torch_tensor_x.shape[3]
+
+    k = torch_tensor_w.shape[0]
+    r = torch_tensor_w.shape[2]
+    s = torch_tensor_w.shape[3]
+
+    outH = int((H + 2 * padding[0] - (dilation[0] * (r - 1) + 1)) / stride[0]) + 1
+    outW = int((W + 2 * padding[1] - (dilation[1] * (s - 1) + 1)) / stride[1]) + 1
+
+    if depth_to_space_mode == "CRD":
+        first_reshape_dim = [
+            n,
+            int(k // (block_size * block_size)),
+            block_size,
+            block_size,
+            outH,
+            outW,
+        ]
+        permutation = [0, 1, 4, 2, 5, 3]
+    else:
+        first_reshape_dim = [n, block_size, block_size, k // (block_size * block_size), outH, outW]
+        permutation = [0, 3, 4, 1, 5, 2]
+    second_reshape_dim = [
+        n,
+        int(k // (block_size * block_size)),
+        block_size * outH,
+        block_size * outW,
+    ]
+
+    # Create first reshape
+    hipdnn_tensor_first_reshape_output = graph.reshape(
+        input=hipdnn_tensor_bias_output, name="first_reshape"
+    )
+    hipdnn_tensor_first_reshape_output.set_dim(first_reshape_dim)
+
+    # Create transpose
+    hipdnn_tensor_transpose_output = graph.transpose(
+        input=hipdnn_tensor_first_reshape_output,
+        permutation=permutation,
+        name="transpose",
+    )
+
+    # Create second reshape
+    hipdnn_tensor_second_reshape_output = graph.reshape(
+        input=hipdnn_tensor_transpose_output, name="second_reshape"
+    )
+    hipdnn_tensor_second_reshape_output.set_dim(second_reshape_dim).set_stride(
+        [k * outH * outW, 1, k // block_size * outW, k // (block_size * block_size)]
+    ).set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_second_reshape_output,
+    )
+
+
+if __name__ == "__main__":
+    # Input dimensions
+    n = 1  # Batch size
+    c = 8  # Number of input channels
+    h = 128  # Height
+    w = 128  # Width
+
+    # Filter dimensions
+    k = 16  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 1  # Height padding
+    pad_w = 1  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+    block_size = 2
+    depth_to_sacpe_mode = "DCR"
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_w = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, hipdnn_tensor_x, hipdnn_tensor_w, hipdnn_tensor_bias, hipdnn_tensor_y = (
+        build_conv_bias_dts_graph(
+            hipdnn_handle,
+            torch_tensor_x,
+            torch_tensor_w,
+            torch_tensor_bias,
+            [pad_h, pad_w],
+            [stride_h, stride_w],
+            [dil_h, dil_w],
+            hipdnn_data_type,
+            depth_to_sacpe_mode,
+            block_size,
+        )
+    )
+
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_bias_dts graph execution complete.")
--- a/python/conv_depthtospace_fusion/conv_bias_dts_add.py
+++ b/python/conv_depthtospace_fusion/conv_bias_dts_add.py
+import hipdnn
+import torch
+
+
+def build_conv_bias_dts_add_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    torch_tensor_add,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+    depth_to_space_mode,
+    block_size,
+):
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="conv_bias_dts_add",
+    )
+
+    # Create hipdnn tensors
+    hipdnn_tensor_x = graph.tensor_like(torch_tensor_x)
+    hipdnn_tensor_w = graph.tensor_like(torch_tensor_w)
+    hipdnn_tensor_bias = graph.tensor_like(torch_tensor_bias)
+    hipdnn_tensor_add = graph.tensor_like(torch_tensor_add)
+
+    # Create conv
+    hipdnn_tensor_conv_output = graph.conv_fprop(
+        image=hipdnn_tensor_x,
+        weight=hipdnn_tensor_w,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+
+    # Create bias
+    hipdnn_tensor_bias_output = graph.add(
+        a=hipdnn_tensor_conv_output, b=hipdnn_tensor_bias, name="bias"
+    )
+
+    n = torch_tensor_x.shape[0]
+    H = torch_tensor_x.shape[2]
+    W = torch_tensor_x.shape[3]
+
+    k = torch_tensor_w.shape[0]
+    r = torch_tensor_w.shape[2]
+    s = torch_tensor_w.shape[3]
+
+    outH = int((H + 2 * padding[0] - (dilation[0] * (r - 1) + 1)) / stride[0]) + 1
+    outW = int((W + 2 * padding[1] - (dilation[1] * (s - 1) + 1)) / stride[1]) + 1
+
+    if depth_to_space_mode == "CRD":
+        first_reshape_dim = [
+            n,
+            int(k // (block_size * block_size)),
+            block_size,
+            block_size,
+            outH,
+            outW,
+        ]
+        permutation = [0, 1, 4, 2, 5, 3]
+    else:
+        first_reshape_dim = [n, block_size, block_size, k // (block_size * block_size), outH, outW]
+        permutation = [0, 3, 4, 1, 5, 2]
+    second_reshape_dim = [
+        n,
+        int(k // (block_size * block_size)),
+        block_size * outH,
+        block_size * outW,
+    ]
+
+    # Create first reshape
+    hipdnn_tensor_first_reshape_output = graph.reshape(
+        input=hipdnn_tensor_bias_output, name="first_reshape"
+    )
+    hipdnn_tensor_first_reshape_output.set_dim(first_reshape_dim)
+
+    # Create transpose
+    hipdnn_tensor_transpose_output = graph.transpose(
+        input=hipdnn_tensor_first_reshape_output,
+        permutation=permutation,
+        name="transpose",
+    )
+
+    # Create second reshape
+    hipdnn_tensor_second_reshape_output = graph.reshape(
+        input=hipdnn_tensor_transpose_output, name="second_reshape"
+    )
+    hipdnn_tensor_second_reshape_output.set_dim(second_reshape_dim).set_stride(
+        [k * outH * outW, 1, k // block_size * outW, k // (block_size * block_size)]
+    )
+
+    # Create add
+    hipdnn_tensor_y = graph.add(
+        a=hipdnn_tensor_second_reshape_output, b=hipdnn_tensor_add, name="add"
+    )
+    hipdnn_tensor_y.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_y,
+    )
+
+
+if __name__ == "__main__":
+    # Input dimensions
+    n = 1  # Batch size
+    c = 8  # Number of input channels
+    h = 128  # Height
+    w = 128  # Width
+
+    # Filter dimensions
+    k = 16  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 1  # Height padding
+    pad_w = 1  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+    block_size = 2
+    depth_to_sacpe_mode = "CRD"
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_w = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_add = torch.rand(
+        n,
+        k // (block_size * block_size),
+        h * block_size,
+        w * block_size,
+        dtype=torch_data_type,
+        device="cuda",
+    ).to(memory_format=torch.channels_last)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_y,
+    ) = build_conv_bias_dts_add_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_bias,
+        torch_tensor_add,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+        depth_to_sacpe_mode,
+        block_size,
+    )
+
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_add: torch_tensor_add.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_bias_dts_add graph execution complete.")
--- a/python/conv_depthtospace_fusion/conv_bias_dts_leakyrelu.py
+++ b/python/conv_depthtospace_fusion/conv_bias_dts_leakyrelu.py
+import hipdnn
+import torch
+
+
+def build_conv_bias_dts_leakyrelu_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+    depth_to_space_mode,
+    block_size,
+):
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="conv_bias_dts_leakyrelu",
+    )
+
+    # Create hipdnn tensors
+    hipdnn_tensor_x = graph.tensor_like(torch_tensor_x)
+    hipdnn_tensor_w = graph.tensor_like(torch_tensor_w)
+    hipdnn_tensor_bias = graph.tensor_like(torch_tensor_bias)
+
+    # Create conv
+    hipdnn_tensor_conv_output = graph.conv_fprop(
+        image=hipdnn_tensor_x,
+        weight=hipdnn_tensor_w,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+
+    # Create bias
+    hipdnn_tensor_bias_output = graph.add(
+        a=hipdnn_tensor_conv_output, b=hipdnn_tensor_bias, name="bias"
+    )
+
+    n = torch_tensor_x.shape[0]
+    H = torch_tensor_x.shape[2]
+    W = torch_tensor_x.shape[3]
+
+    k = torch_tensor_w.shape[0]
+    r = torch_tensor_w.shape[2]
+    s = torch_tensor_w.shape[3]
+
+    outH = int((H + 2 * padding[0] - (dilation[0] * (r - 1) + 1)) / stride[0]) + 1
+    outW = int((W + 2 * padding[1] - (dilation[1] * (s - 1) + 1)) / stride[1]) + 1
+
+    if depth_to_space_mode == "CRD":
+        first_reshape_dim = [
+            n,
+            int(k // (block_size * block_size)),
+            block_size,
+            block_size,
+            outH,
+            outW,
+        ]
+        permutation = [0, 1, 4, 2, 5, 3]
+    else:
+        first_reshape_dim = [n, block_size, block_size, k // (block_size * block_size), outH, outW]
+        permutation = [0, 3, 4, 1, 5, 2]
+    second_reshape_dim = [
+        n,
+        int(k // (block_size * block_size)),
+        block_size * outH,
+        block_size * outW,
+    ]
+
+    # Create first reshape
+    hipdnn_tensor_first_reshape_output = graph.reshape(
+        input=hipdnn_tensor_bias_output, name="first_reshape"
+    )
+    hipdnn_tensor_first_reshape_output.set_dim(first_reshape_dim)
+
+    # Create transpose
+    hipdnn_tensor_transpose_output = graph.transpose(
+        input=hipdnn_tensor_first_reshape_output,
+        permutation=permutation,
+        name="transpose",
+    )
+
+    # Create second reshape
+    hipdnn_tensor_second_reshape_output = graph.reshape(
+        input=hipdnn_tensor_transpose_output, name="second_reshape"
+    )
+    hipdnn_tensor_second_reshape_output.set_dim(second_reshape_dim).set_stride(
+        [k * outH * outW, 1, k // block_size * outW, k // (block_size * block_size)]
+    )
+
+    # Create leakyrelu
+    hipdnn_tensor_y = graph.leaky_relu(
+        input=hipdnn_tensor_second_reshape_output, negative_slope=-1.0, name="leaky_relu"
+    )
+    hipdnn_tensor_y.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (graph, hipdnn_tensor_x, hipdnn_tensor_w, hipdnn_tensor_bias, hipdnn_tensor_y)
+
+
+if __name__ == "__main__":
+    # Input dimensions
+    n = 1  # Batch size
+    c = 8  # Number of input channels
+    h = 128  # Height
+    w = 128  # Width
+
+    # Filter dimensions
+    k = 16  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 1  # Height padding
+    pad_w = 1  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+    block_size = 2
+    depth_to_sacpe_mode = "DCR"
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_w = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, hipdnn_tensor_x, hipdnn_tensor_w, hipdnn_tensor_bias, hipdnn_tensor_y = (
+        build_conv_bias_dts_leakyrelu_graph(
+            hipdnn_handle,
+            torch_tensor_x,
+            torch_tensor_w,
+            torch_tensor_bias,
+            [pad_h, pad_w],
+            [stride_h, stride_w],
+            [dil_h, dil_w],
+            hipdnn_data_type,
+            depth_to_sacpe_mode,
+            block_size,
+        )
+    )
+
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_bias_dts_leakyrelu graph execution complete.")
--- a/python/conv_depthtospace_fusion/conv_bias_dts_leakyrelu_add.py
+++ b/python/conv_depthtospace_fusion/conv_bias_dts_leakyrelu_add.py
+import hipdnn
+import torch
+
+
+def build_conv_bias_dts_leakyrelu_add_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    torch_tensor_add,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+    depth_to_space_mode,
+    block_size,
+    negative_slope=-1.0,
+):
+    """Build a conv -> bias -> depth-to-space -> leaky ReLU -> add graph.
+
+    Depth-to-space is expressed as reshape -> transpose -> reshape applied to
+    the biased convolution output.
+
+    Args:
+        hipdnn_handle: Handle used to create and build the graph.
+        torch_tensor_x: Input tensor; dims 0, 2 and 3 are read as n, H, W.
+        torch_tensor_w: Filter tensor; dims 0, 2 and 3 are read as k, r, s.
+        torch_tensor_bias: Tensor added to the convolution output.
+        torch_tensor_add: Tensor added to the leaky ReLU output.
+        padding: Two-element [height, width] convolution padding.
+        stride: Two-element [height, width] convolution stride.
+        dilation: Two-element [height, width] convolution dilation.
+        hipdnn_data_type: I/O data type of the graph.
+        depth_to_space_mode: "CRD" selects the CRD reshape/permutation; any
+            other value (callers in this repo pass "DCR") selects the other one.
+        block_size: Depth-to-space block size. k is assumed to be divisible
+            by block_size ** 2 (floor division is used, nothing is validated).
+        negative_slope: Slope passed to the leaky ReLU node. The default keeps
+            the value that used to be hard-coded here.
+            NOTE(review): -1.0 is an unusual leaky ReLU slope -- confirm intent.
+
+    Returns:
+        Tuple (graph, x, w, bias, add, y): the built graph followed by the
+        hipdnn tensors created for the four inputs and the output tensor.
+    """
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="conv_bias_dts_leakyrelu_add",
+    )
+
+    # Create hipdnn tensors
+    hipdnn_tensor_x = graph.tensor_like(torch_tensor_x)
+    hipdnn_tensor_w = graph.tensor_like(torch_tensor_w)
+    hipdnn_tensor_bias = graph.tensor_like(torch_tensor_bias)
+    hipdnn_tensor_add = graph.tensor_like(torch_tensor_add)
+
+    # Create conv
+    hipdnn_tensor_conv_output = graph.conv_fprop(
+        image=hipdnn_tensor_x,
+        weight=hipdnn_tensor_w,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+
+    # Create bias
+    hipdnn_tensor_bias_output = graph.add(
+        a=hipdnn_tensor_conv_output, b=hipdnn_tensor_bias, name="bias"
+    )
+
+    n = torch_tensor_x.shape[0]
+    H = torch_tensor_x.shape[2]
+    W = torch_tensor_x.shape[3]
+
+    k = torch_tensor_w.shape[0]
+    r = torch_tensor_w.shape[2]
+    s = torch_tensor_w.shape[3]
+
+    # Convolution output extent. Integer floor division avoids the float
+    # round-trip of int(a / b); results are identical for valid configurations.
+    outH = (H + 2 * padding[0] - (dilation[0] * (r - 1) + 1)) // stride[0] + 1
+    outW = (W + 2 * padding[1] - (dilation[1] * (s - 1) + 1)) // stride[1] + 1
+
+    # Channels remaining after block_size x block_size are moved into space.
+    out_channels = k // (block_size * block_size)
+
+    if depth_to_space_mode == "CRD":
+        first_reshape_dim = [n, out_channels, block_size, block_size, outH, outW]
+        permutation = [0, 1, 4, 2, 5, 3]
+    else:
+        first_reshape_dim = [n, block_size, block_size, out_channels, outH, outW]
+        permutation = [0, 3, 4, 1, 5, 2]
+    second_reshape_dim = [n, out_channels, block_size * outH, block_size * outW]
+
+    # Create first reshape
+    hipdnn_tensor_first_reshape_output = graph.reshape(
+        input=hipdnn_tensor_bias_output, name="first_reshape"
+    )
+    hipdnn_tensor_first_reshape_output.set_dim(first_reshape_dim)
+
+    # Create transpose
+    hipdnn_tensor_transpose_output = graph.transpose(
+        input=hipdnn_tensor_first_reshape_output,
+        permutation=permutation,
+        name="transpose",
+    )
+
+    # Create second reshape; the strides keep the channel dimension innermost
+    # (stride 1) for the 4-D depth-to-space result.
+    hipdnn_tensor_second_reshape_output = graph.reshape(
+        input=hipdnn_tensor_transpose_output, name="second_reshape"
+    )
+    hipdnn_tensor_second_reshape_output.set_dim(second_reshape_dim).set_stride(
+        [k * outH * outW, 1, k // block_size * outW, k // (block_size * block_size)]
+    )
+
+    # Create leakyRelu
+    hipdnn_tensor_leaky_relu_output = graph.leaky_relu(
+        input=hipdnn_tensor_second_reshape_output,
+        negative_slope=negative_slope,
+        name="leaky_relu",
+    )
+
+    # Create add
+    hipdnn_tensor_y = graph.add(a=hipdnn_tensor_leaky_relu_output, b=hipdnn_tensor_add, name="add")
+    hipdnn_tensor_y.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_y,
+    )
+
+
+if __name__ == "__main__":
+    # Example driver: builds the conv + bias + depth-to-space + leaky ReLU + add
+    # graph for a small FP16 problem and executes it once on the GPU.
+
+    # Input dimensions
+    n = 1  # Batch size
+    c = 8  # Number of input channels
+    h = 128  # Height
+    w = 128  # Width
+
+    # Filter dimensions
+    k = 16  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 1  # Height padding
+    pad_w = 1  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+    block_size = 2
+    depth_to_space_mode = "CRD"
+
+    # Spatial extent of the convolution output. The residual tensor below is
+    # sized from this rather than from h/w, so it stays consistent when the
+    # padding/stride/dilation values above are changed.
+    out_h = (h + 2 * pad_h - (dil_h * (r - 1) + 1)) // stride_h + 1
+    out_w = (w + 2 * pad_w - (dil_w * (s - 1) + 1)) // stride_w + 1
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_w = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    # Shaped like the depth-to-space output: channels shrink by block_size ** 2
+    # and each spatial dimension grows by block_size.
+    torch_tensor_add = torch.rand(
+        n,
+        k // (block_size * block_size),
+        out_h * block_size,
+        out_w * block_size,
+        dtype=torch_data_type,
+        device="cuda",
+    ).to(memory_format=torch.channels_last)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_y,
+    ) = build_conv_bias_dts_leakyrelu_add_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_bias,
+        torch_tensor_add,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+        depth_to_space_mode,
+        block_size,
+    )
+
+    # Output buffer sized from the dims reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    # Maps each graph tensor to the device pointer of its backing torch tensor.
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_add: torch_tensor_add.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_bias_dts_leakyrelu_add graph execution complete.")
--- a/python/conv_depthtospace_fusion/conv_dts.py
+++ b/python/conv_depthtospace_fusion/conv_dts.py
+import hipdnn
+import torch
+
+
+def build_conv_dts_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+    depth_to_space_mode,
+    block_size,
+):
+    """Build a conv -> depth-to-space graph.
+
+    Depth-to-space is expressed as reshape -> transpose -> reshape applied to
+    the convolution output; the second reshape is the graph output.
+
+    Args:
+        hipdnn_handle: Handle used to create and build the graph.
+        torch_tensor_x: Input tensor; dims 0, 2 and 3 are read as n, H, W.
+        torch_tensor_w: Filter tensor; dims 0, 2 and 3 are read as k, r, s.
+        padding: Two-element [height, width] convolution padding.
+        stride: Two-element [height, width] convolution stride.
+        dilation: Two-element [height, width] convolution dilation.
+        hipdnn_data_type: I/O data type of the graph.
+        depth_to_space_mode: "CRD" selects the CRD reshape/permutation; any
+            other value (the example driver passes "DCR") selects the other one.
+        block_size: Depth-to-space block size. k is assumed to be divisible
+            by block_size ** 2 (floor division is used, nothing is validated).
+
+    Returns:
+        Tuple (graph, x, w, y): the built graph, the hipdnn tensors created
+        for the two inputs, and the output tensor.
+    """
+    # Create graph
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=hipdnn.data_type.FLOAT,
+        compute_data_type=hipdnn.data_type.FLOAT,
+        name="conv_dts",
+    )
+
+    # Create hipdnn tensors
+    hipdnn_tensor_x = graph.tensor_like(torch_tensor_x)
+    hipdnn_tensor_w = graph.tensor_like(torch_tensor_w)
+
+    # Create conv
+    hipdnn_tensor_conv_output = graph.conv_fprop(
+        image=hipdnn_tensor_x,
+        weight=hipdnn_tensor_w,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+
+    n = torch_tensor_x.shape[0]
+    H = torch_tensor_x.shape[2]
+    W = torch_tensor_x.shape[3]
+
+    k = torch_tensor_w.shape[0]
+    r = torch_tensor_w.shape[2]
+    s = torch_tensor_w.shape[3]
+
+    # Convolution output extent. Integer floor division avoids the float
+    # round-trip of int(a / b); results are identical for valid configurations.
+    outH = (H + 2 * padding[0] - (dilation[0] * (r - 1) + 1)) // stride[0] + 1
+    outW = (W + 2 * padding[1] - (dilation[1] * (s - 1) + 1)) // stride[1] + 1
+
+    # Channels remaining after block_size x block_size are moved into space.
+    out_channels = k // (block_size * block_size)
+
+    if depth_to_space_mode == "CRD":
+        first_reshape_dim = [n, out_channels, block_size, block_size, outH, outW]
+        permutation = [0, 1, 4, 2, 5, 3]
+    else:
+        first_reshape_dim = [n, block_size, block_size, out_channels, outH, outW]
+        permutation = [0, 3, 4, 1, 5, 2]
+    second_reshape_dim = [n, out_channels, block_size * outH, block_size * outW]
+
+    # Create first reshape
+    hipdnn_tensor_first_reshape_output = graph.reshape(
+        input=hipdnn_tensor_conv_output, name="first_reshape"
+    )
+    hipdnn_tensor_first_reshape_output.set_dim(first_reshape_dim)
+
+    # Create transpose
+    hipdnn_tensor_transpose_output = graph.transpose(
+        input=hipdnn_tensor_first_reshape_output,
+        permutation=permutation,
+        name="transpose",
+    )
+
+    # Create second reshape and mark it as the graph output; the strides keep
+    # the channel dimension innermost (stride 1).
+    hipdnn_tensor_second_reshape_output = graph.reshape(
+        input=hipdnn_tensor_transpose_output, name="second_reshape"
+    )
+    hipdnn_tensor_second_reshape_output.set_dim(second_reshape_dim).set_stride(
+        [k * outH * outW, 1, k // block_size * outW, k // (block_size * block_size)]
+    ).set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return (graph, hipdnn_tensor_x, hipdnn_tensor_w, hipdnn_tensor_second_reshape_output)
+
+
+if __name__ == "__main__":
+    # Example driver: builds the conv + depth-to-space graph for a small FP16
+    # problem and executes it once on the GPU.
+
+    # Input (n, c, h, w) and filter (k, c, r, s) dimensions.
+    n, c, h, w = 1, 8, 128, 128
+    k, r, s = 16, 3, 3
+
+    # Convolution parameters as [height, width] pairs.
+    padding = [1, 1]
+    stride = [1, 1]
+    dilation = [1, 1]
+
+    # Depth-to-space parameters.
+    block_size = 2
+    depth_to_space_mode = "DCR"
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    def rand_channels_last(*shape):
+        """Return a random CUDA tensor of `shape` in channels-last format."""
+        return torch.rand(*shape, dtype=torch_data_type, device="cuda").to(
+            memory_format=torch.channels_last
+        )
+
+    torch_tensor_x = rand_channels_last(n, c, h, w)
+    torch_tensor_w = rand_channels_last(k, c, r, s)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, hipdnn_tensor_x, hipdnn_tensor_w, hipdnn_tensor_y = build_conv_dts_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        padding,
+        stride,
+        dilation,
+        hipdnn_data_type,
+        depth_to_space_mode,
+        block_size,
+    )
+
+    # Output buffer sized from the dims reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    # Maps each graph tensor to the device pointer of its backing torch tensor.
+    variant_pack = {
+        graph_tensor: torch_tensor.data_ptr()
+        for graph_tensor, torch_tensor in (
+            (hipdnn_tensor_x, torch_tensor_x),
+            (hipdnn_tensor_w, torch_tensor_w),
+            (hipdnn_tensor_y, torch_tensor_y),
+        )
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_dts graph execution complete.")
--- a/python/conv_fusion/conv_bias.py
+++ b/python/conv_fusion/conv_bias.py
+import hipdnn
+import torch
+
+
+def build_conv_bias_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    """Build a graph computing a forward convolution followed by a bias add.
+
+    Args:
+        hipdnn_handle: Handle used to create and build the graph.
+        torch_tensor_x: Torch tensor the graph's image tensor is modelled on.
+        torch_tensor_w: Torch tensor the graph's weight tensor is modelled on.
+        torch_tensor_bias: Torch tensor the graph's bias tensor is modelled on.
+        padding: Convolution padding, forwarded to conv_fprop.
+        stride: Convolution stride, forwarded to conv_fprop.
+        dilation: Convolution dilation, forwarded to conv_fprop.
+        hipdnn_data_type: I/O data type of the graph.
+
+    Returns:
+        Tuple (graph, x, w, bias, y): the built graph, the hipdnn tensors
+        created for the three inputs, and the output tensor.
+    """
+    # Intermediate results and compute both use FLOAT regardless of I/O type.
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="conv_bias",
+    )
+
+    # Graph-side tensors mirroring the torch inputs, created in x, w, bias order.
+    x_desc, w_desc, bias_desc = (
+        graph.tensor_like(source)
+        for source in (torch_tensor_x, torch_tensor_w, torch_tensor_bias)
+    )
+
+    # conv2d -> bias
+    conv_out = graph.conv_fprop(
+        image=x_desc,
+        weight=w_desc,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+    y_desc = graph.add(a=conv_out, b=bias_desc, name="bias")
+    y_desc.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return graph, x_desc, w_desc, bias_desc, y_desc
+
+
+if __name__ == "__main__":
+    # Example driver: builds the conv + bias graph for a small FP16 problem
+    # and executes it once on the GPU.
+
+    # Input (n, c, h, w) and filter (k, c, r, s) dimensions.
+    n, c, h, w = 1, 16, 16, 16
+    k, r, s = 16, 3, 3
+
+    # Convolution parameters as [height, width] pairs.
+    padding = [1, 1]
+    stride = [1, 1]
+    dilation = [1, 1]
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    def rand_channels_last(*shape):
+        """Return a random CUDA tensor of `shape` in channels-last format."""
+        return torch.rand(*shape, dtype=torch_data_type, device="cuda").to(
+            memory_format=torch.channels_last
+        )
+
+    torch_tensor_x = rand_channels_last(n, c, h, w)
+    torch_tensor_w = rand_channels_last(k, c, r, s)
+    torch_tensor_bias = rand_channels_last(1, k, 1, 1)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, hipdnn_tensor_x, hipdnn_tensor_w, hipdnn_tensor_bias, hipdnn_tensor_y = (
+        build_conv_bias_graph(
+            hipdnn_handle,
+            torch_tensor_x,
+            torch_tensor_w,
+            torch_tensor_bias,
+            padding,
+            stride,
+            dilation,
+            hipdnn_data_type,
+        )
+    )
+
+    # Output buffer sized from the dims reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    # Maps each graph tensor to the device pointer of its backing torch tensor.
+    variant_pack = {
+        graph_tensor: torch_tensor.data_ptr()
+        for graph_tensor, torch_tensor in (
+            (hipdnn_tensor_x, torch_tensor_x),
+            (hipdnn_tensor_w, torch_tensor_w),
+            (hipdnn_tensor_bias, torch_tensor_bias),
+            (hipdnn_tensor_y, torch_tensor_y),
+        )
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_bias graph execution complete.")
--- a/python/conv_fusion/conv_bias_add.py
+++ b/python/conv_fusion/conv_bias_add.py
+import hipdnn
+import torch
+
+
+def build_conv_bias_add_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    torch_tensor_add,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    """Build a graph computing a forward convolution, a bias add, then an add.
+
+    Args:
+        hipdnn_handle: Handle used to create and build the graph.
+        torch_tensor_x: Torch tensor the graph's image tensor is modelled on.
+        torch_tensor_w: Torch tensor the graph's weight tensor is modelled on.
+        torch_tensor_bias: Torch tensor the graph's bias tensor is modelled on.
+        torch_tensor_add: Torch tensor the second addend is modelled on.
+        padding: Convolution padding, forwarded to conv_fprop.
+        stride: Convolution stride, forwarded to conv_fprop.
+        dilation: Convolution dilation, forwarded to conv_fprop.
+        hipdnn_data_type: I/O data type of the graph.
+
+    Returns:
+        Tuple (graph, x, w, bias, add, y): the built graph, the hipdnn tensors
+        created for the four inputs, and the output tensor.
+    """
+    # Intermediate results and compute both use FLOAT regardless of I/O type.
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="conv_bias_add",
+    )
+
+    # Graph-side tensors mirroring the torch inputs, created in x, w, bias, add order.
+    x_desc, w_desc, bias_desc, add_desc = (
+        graph.tensor_like(source)
+        for source in (torch_tensor_x, torch_tensor_w, torch_tensor_bias, torch_tensor_add)
+    )
+
+    # conv2d -> bias -> add
+    conv_out = graph.conv_fprop(
+        image=x_desc,
+        weight=w_desc,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+    biased = graph.add(a=conv_out, b=bias_desc, name="bias")
+    y_desc = graph.add(a=biased, b=add_desc, name="add")
+    y_desc.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return graph, x_desc, w_desc, bias_desc, add_desc, y_desc
+
+
+if __name__ == "__main__":
+    # Example driver: builds the conv + bias + add graph for a small FP16
+    # problem and executes it once on the GPU.
+
+    # Input dimensions
+    n = 1  # Batch size
+    c = 16  # Number of input channels
+    h = 16  # Height
+    w = 16  # Width
+
+    # Filter dimensions
+    k = 16  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 1  # Height padding
+    pad_w = 1  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    # Spatial extent of the convolution output.
+    out_h = (h + 2 * pad_h - (dil_h * (r - 1) + 1)) // stride_h + 1
+    out_w = (w + 2 * pad_w - (dil_w * (s - 1) + 1)) // stride_w + 1
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_w = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    # The addend is combined with the convolution output, so it is sized from
+    # the output (k channels, out_h x out_w). The input dims (c, h, w) only
+    # happen to match for the values chosen above.
+    torch_tensor_add = torch.rand(n, k, out_h, out_w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_y,
+    ) = build_conv_bias_add_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_bias,
+        torch_tensor_add,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+
+    # Output buffer sized from the dims reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    # Maps each graph tensor to the device pointer of its backing torch tensor.
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_add: torch_tensor_add.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_bias_add graph execution complete.")
--- a/python/conv_fusion/conv_bias_add_relu.py
+++ b/python/conv_fusion/conv_bias_add_relu.py
+import hipdnn
+import torch
+
+
+def build_conv_bias_add_relu_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    torch_tensor_add,
+    padding,
+    stride,
+    dilation,
+    hipdnn_data_type,
+):
+    """Build a graph computing conv -> bias add -> add -> ReLU.
+
+    Args:
+        hipdnn_handle: Handle used to create and build the graph.
+        torch_tensor_x: Torch tensor the graph's image tensor is modelled on.
+        torch_tensor_w: Torch tensor the graph's weight tensor is modelled on.
+        torch_tensor_bias: Torch tensor the graph's bias tensor is modelled on.
+        torch_tensor_add: Torch tensor the second addend is modelled on.
+        padding: Convolution padding, forwarded to conv_fprop.
+        stride: Convolution stride, forwarded to conv_fprop.
+        dilation: Convolution dilation, forwarded to conv_fprop.
+        hipdnn_data_type: I/O data type of the graph.
+
+    Returns:
+        Tuple (graph, x, w, bias, add, y): the built graph, the hipdnn tensors
+        created for the four inputs, and the output tensor.
+    """
+    # Intermediate results and compute both use FLOAT regardless of I/O type.
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="convolution_bias_add_relu",
+    )
+
+    # Graph-side tensors mirroring the torch inputs, created in x, w, bias, add order.
+    x_desc, w_desc, bias_desc, add_desc = (
+        graph.tensor_like(source)
+        for source in (torch_tensor_x, torch_tensor_w, torch_tensor_bias, torch_tensor_add)
+    )
+
+    # conv2d -> bias -> add -> relu
+    conv_out = graph.conv_fprop(
+        image=x_desc,
+        weight=w_desc,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+    biased = graph.add(a=conv_out, b=bias_desc, name="bias")
+    summed = graph.add(a=biased, b=add_desc, name="add")
+    y_desc = graph.relu(input=summed, name="relu")
+    y_desc.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return graph, x_desc, w_desc, bias_desc, add_desc, y_desc
+
+
+if __name__ == "__main__":
+    # Example driver: builds the conv + bias + add + ReLU graph for a small
+    # FP16 problem and executes it once on the GPU.
+
+    # Input dimensions
+    n = 1  # Batch size
+    c = 16  # Number of input channels
+    h = 16  # Height
+    w = 16  # Width
+
+    # Filter dimensions
+    k = 16  # Number of output channels
+    r = 3  # Filter height
+    s = 3  # Filter width
+
+    # Convolution parameters
+    stride_h = 1  # Height stride
+    stride_w = 1  # Width stride
+    pad_h = 1  # Height padding
+    pad_w = 1  # Width padding
+    dil_h = 1  # Height dilation
+    dil_w = 1  # Width dilation
+
+    # Spatial extent of the convolution output.
+    out_h = (h + 2 * pad_h - (dil_h * (r - 1) + 1)) // stride_h + 1
+    out_w = (w + 2 * pad_w - (dil_w * (s - 1) + 1)) // stride_w + 1
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    torch_tensor_x = torch.rand(n, c, h, w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_w = torch.rand(k, c, r, s, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    torch_tensor_bias = torch.rand(1, k, 1, 1, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+    # The addend is combined with the convolution output, so it is sized from
+    # the output (k channels, out_h x out_w). The input dims (c, h, w) only
+    # happen to match for the values chosen above.
+    torch_tensor_add = torch.rand(n, k, out_h, out_w, dtype=torch_data_type, device="cuda").to(
+        memory_format=torch.channels_last
+    )
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    (
+        graph,
+        hipdnn_tensor_x,
+        hipdnn_tensor_w,
+        hipdnn_tensor_bias,
+        hipdnn_tensor_add,
+        hipdnn_tensor_y,
+    ) = build_conv_bias_add_relu_graph(
+        hipdnn_handle,
+        torch_tensor_x,
+        torch_tensor_w,
+        torch_tensor_bias,
+        torch_tensor_add,
+        [pad_h, pad_w],
+        [stride_h, stride_w],
+        [dil_h, dil_w],
+        hipdnn_data_type,
+    )
+
+    # Output buffer sized from the dims reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    # Maps each graph tensor to the device pointer of its backing torch tensor.
+    variant_pack = {
+        hipdnn_tensor_x: torch_tensor_x.data_ptr(),
+        hipdnn_tensor_w: torch_tensor_w.data_ptr(),
+        hipdnn_tensor_bias: torch_tensor_bias.data_ptr(),
+        hipdnn_tensor_add: torch_tensor_add.data_ptr(),
+        hipdnn_tensor_y: torch_tensor_y.data_ptr(),
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("convolution_bias_add_relu graph execution complete.")
--- a/python/conv_fusion/conv_bias_prelu.py
+++ b/python/conv_fusion/conv_bias_prelu.py
+import hipdnn
+import torch
+
+
+def build_conv_bias_prelu_graph(
+    hipdnn_handle,
+    torch_tensor_x,
+    torch_tensor_w,
+    torch_tensor_bias,
+    padding,
+    stride,
+    dilation,
+    negative_slope,
+    hipdnn_data_type,
+):
+    """Build a graph computing conv -> bias add -> PReLU.
+
+    Args:
+        hipdnn_handle: Handle used to create and build the graph.
+        torch_tensor_x: Torch tensor the graph's image tensor is modelled on.
+        torch_tensor_w: Torch tensor the graph's weight tensor is modelled on.
+        torch_tensor_bias: Torch tensor the graph's bias tensor is modelled on.
+        padding: Convolution padding, forwarded to conv_fprop.
+        stride: Convolution stride, forwarded to conv_fprop.
+        dilation: Convolution dilation, forwarded to conv_fprop.
+        negative_slope: Slope value forwarded to the prelu node.
+        hipdnn_data_type: I/O data type of the graph.
+
+    Returns:
+        Tuple (graph, x, w, bias, y): the built graph, the hipdnn tensors
+        created for the three inputs, and the output tensor.
+    """
+    # Intermediate results and compute both use FLOAT regardless of I/O type.
+    float_type = hipdnn.data_type.FLOAT
+    graph = hipdnn.pygraph(
+        handle=hipdnn_handle,
+        io_data_type=hipdnn_data_type,
+        intermediate_data_type=float_type,
+        compute_data_type=float_type,
+        name="conv_bias_prelu",
+    )
+
+    # Graph-side tensors mirroring the torch inputs, created in x, w, bias order.
+    x_desc, w_desc, bias_desc = (
+        graph.tensor_like(source)
+        for source in (torch_tensor_x, torch_tensor_w, torch_tensor_bias)
+    )
+
+    # conv2d -> bias -> prelu
+    conv_out = graph.conv_fprop(
+        image=x_desc,
+        weight=w_desc,
+        padding=padding,
+        stride=stride,
+        dilation=dilation,
+        name="conv2d",
+    )
+    biased = graph.add(a=conv_out, b=bias_desc, name="bias")
+    y_desc = graph.prelu(input=biased, negative_slope=negative_slope, name="prelu")
+    y_desc.set_output(True)
+
+    graph.build(hipdnn_handle)
+
+    return graph, x_desc, w_desc, bias_desc, y_desc
+
+
+if __name__ == "__main__":
+    # Example driver: builds the conv + bias + PReLU graph for a small FP16
+    # problem and executes it once on the GPU.
+
+    # Input (n, c, h, w) and filter (k, c, r, s) dimensions.
+    n, c, h, w = 1, 16, 16, 16
+    k, r, s = 16, 3, 3
+
+    # Convolution parameters as [height, width] pairs.
+    padding = [1, 1]
+    stride = [1, 1]
+    dilation = [1, 1]
+
+    # Activation parameter.
+    negative_slope = 0.01
+
+    hipdnn_data_type = hipdnn.data_type.HALF
+    torch_data_type = torch.float16
+
+    def rand_channels_last(*shape):
+        """Return a random CUDA tensor of `shape` in channels-last format."""
+        return torch.rand(*shape, dtype=torch_data_type, device="cuda").to(
+            memory_format=torch.channels_last
+        )
+
+    torch_tensor_x = rand_channels_last(n, c, h, w)
+    torch_tensor_w = rand_channels_last(k, c, r, s)
+    torch_tensor_bias = rand_channels_last(1, k, 1, 1)
+
+    hipdnn_handle = hipdnn.create_handle()
+
+    graph, hipdnn_tensor_x, hipdnn_tensor_w, hipdnn_tensor_bias, hipdnn_tensor_y = (
+        build_conv_bias_prelu_graph(
+            hipdnn_handle,
+            torch_tensor_x,
+            torch_tensor_w,
+            torch_tensor_bias,
+            padding,
+            stride,
+            dilation,
+            negative_slope,
+            hipdnn_data_type,
+        )
+    )
+
+    # Output buffer sized from the dims reported by the graph's output tensor.
+    torch_tensor_y = torch.empty(
+        hipdnn_tensor_y.get_dim(),
+        dtype=torch_data_type,
+        memory_format=torch.channels_last,
+        device="cuda",
+    )
+    # Maps each graph tensor to the device pointer of its backing torch tensor.
+    variant_pack = {
+        graph_tensor: torch_tensor.data_ptr()
+        for graph_tensor, torch_tensor in (
+            (hipdnn_tensor_x, torch_tensor_x),
+            (hipdnn_tensor_w, torch_tensor_w),
+            (hipdnn_tensor_bias, torch_tensor_bias),
+            (hipdnn_tensor_y, torch_tensor_y),
+        )
+    }
+    workspace = torch.empty(graph.get_workspace_size(), dtype=torch.uint8, device="cuda")
+
+    graph.exec(variant_pack=variant_pack, workspace=workspace.data_ptr())
+    print("conv_bias_prelu graph execution complete.")