Merge branch 'develop' into ck_conv_bww_fp16

c2976d7a · ltqin · e46ea9fd · 0619ebf7 · c2976d7a · c2976d7a
Commit c2976d7a authored Mar 04, 2022 by ltqin
20 changed files
--- a/composable_kernel/include/config.hpp
+++ b/composable_kernel/include/config.hpp
@@ -151,6 +151,12 @@
 #define CK_WORKAROUND_SWDEV_XXXXXX_THREAD_WISE_COPY_V1R5_TYPE_CONVERT_ISSUE 1
 #endif

+// workaround for verifaction failure, due to compiler regression, for conv bwd-data fp16 using some
+// tuning parameter
+#ifndef CK_WORKAROUND_SWDEV_325164
+#define CK_WORKAROUND_SWDEV_325164 1
+#endif
+
 namespace ck {

 enum InMemoryDataOperationEnum_t

--- a/composable_kernel/include/tensor_operation/element_wise_operation.hpp
+++ b/composable_kernel/include/tensor_operation/element_wise_operation.hpp
 #ifndef CK_ELEMENT_WISE_OPERATION_HPP
 #define CK_ELEMENT_WISE_OPERATION_HPP
+#include "data_type.hpp"

 #include "data_type.hpp"


--- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp
--- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -5,6 +5,7 @@
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
 #include "static_tensor.hpp"
+#include "tensor_space_filling_curve.hpp"

 namespace ck {

@@ -123,73 +124,16 @@ struct ThreadwiseTensorSliceTransfer_v3r1
        constexpr auto src_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

-        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
-
-        constexpr auto src_dim_access_order = SrcDimAccessOrder{};
-
-        constexpr auto ordered_src_access_lengths =
-            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
-
-        // make forward steps
-        const auto src_forward_steps = generate_tuple(
-            [&](auto i) {
-                Index forward_step_idx;
-
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
-                });
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    SrcDimAccessOrder,
+                                                    remove_cv_t<decltype(src_scalar_per_access)>>;

-                return make_tensor_coordinate_step(src_desc, forward_step_idx);
-            },
-            Number<nDim>{});
-
-        // make backward steps
-        const auto src_backward_steps = generate_tuple(
-            [&](auto i) {
-                Index backward_step_idx;
-
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
-                });
-
-                return make_tensor_coordinate_step(src_desc, backward_step_idx);
-            },
-            Number<nDim>{});
+        // loop over space-filling curve
+        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();

        // loop over tensor and copy
-        static_ford<decltype(ordered_src_access_lengths)>{}([&](auto ordered_src_access_idx) {
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-                forward_sweep_(I0) = true;
-
-                static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_src_access_idx[I0];
-
-                    static_for<1, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
-                    });
-
-                    forward_sweep_(i) = tmp % 2 == 0;
-                });
-
-                return forward_sweep_;
-            }();
-
-            // calculate src data index
-            constexpr auto src_data_idx = [&]() {
-                Index ordered_idx;
-
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
-                                                      : ordered_src_access_lengths[i] - 1 -
-                                                            ordered_src_access_idx[i];
-                });
-
-                return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
-                       src_scalar_per_access;
-            }();
+        static_for<0, num_accesses, 1>{}([&](auto idx_1d) {
+            constexpr auto src_data_idx = SpaceFillingCurve::GetIndex(idx_1d);

            constexpr auto src_data_idx_seq = generate_sequence_v2(
                [&](auto i) { return Number<src_data_idx[i]>{}; }, Number<src_data_idx.Size()>{});
@@ -218,39 +162,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                .template SetAsType<src_vector_t>(
                    src_data_idx_seq, src_vector_container.template AsType<src_vector_t>()[I0]);

-            constexpr auto move_on_dim = [&]() constexpr
+            // move coordinate
+            if constexpr(idx_1d.value != num_accesses - 1)
            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
-
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
-
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim_(i) &=
-                            ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
-                    });
-                });
-
-                return move_on_dim_;
+                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
+                move_tensor_coordinate(
+                    src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
            }
-            ();
-
-            // move src coord
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
-                    }
-                }
-            });
        });

        // move src coordinate back to slice origin (or not)
@@ -374,73 +292,15 @@ struct ThreadwiseTensorSliceTransfer_v3r1
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

-        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
-
-        constexpr auto dst_dim_access_order = DstDimAccessOrder{};
-
-        constexpr auto ordered_dst_access_lengths =
-            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
-
-        // make forward steps
-        const auto dst_forward_steps = generate_tuple(
-            [&](auto i) {
-                Index forward_step_idx;
-
-                static_for<0, nDim, 1>{}([&](auto j) {
-                    forward_step_idx(j) = (i.value == j.value) ? dst_scalar_per_access[i] : 0;
-                });
-
-                return make_tensor_coordinate_step(dst_desc, forward_step_idx);
-            },
-            Number<nDim>{});
-
-        // make backward steps
-        const auto dst_backward_steps = generate_tuple(
-            [&](auto i) {
-                Index backward_step_idx;
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DstDimAccessOrder,
+                                                    remove_cv_t<decltype(dst_scalar_per_access)>>;

-                static_for<0, nDim, 1>{}([&](auto j) {
-                    backward_step_idx(j) = (i.value == j.value) ? -dst_scalar_per_access[i] : 0;
-                });
-
-                return make_tensor_coordinate_step(dst_desc, backward_step_idx);
-            },
-            Number<nDim>{});
+        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();

        // loop over tensor and copy
-        static_ford<decltype(ordered_dst_access_lengths)>{}([&](auto ordered_dst_access_idx) {
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-                forward_sweep_(I0) = true;
-
-                static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_dst_access_idx[I0];
-
-                    static_for<1, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
-                    });
-
-                    forward_sweep_(i) = tmp % 2 == 0;
-                });
-
-                return forward_sweep_;
-            }();
-
-            // calculate dst data index
-            constexpr auto dst_data_idx = [&]() {
-                Index ordered_idx;
-
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
-                                                      : ordered_dst_access_lengths[i] - 1 -
-                                                            ordered_dst_access_idx[i];
-                });
-
-                return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
-                       dst_scalar_per_access;
-            }();
+        static_for<0, num_accesses, 1>{}([&](auto idx_1d) {
+            constexpr auto dst_data_idx = SpaceFillingCurve::GetIndex(idx_1d);

            constexpr auto dst_data_idx_seq = generate_sequence_v2(
                [&](auto i) { return Number<dst_data_idx[i]>{}; }, Number<dst_data_idx.Size()>{});
@@ -470,39 +330,13 @@ struct ThreadwiseTensorSliceTransfer_v3r1
                is_dst_valid,
                dst_vector_container.template AsType<dst_vector_t>()[I0]);

-            constexpr auto move_on_dim = [&]() constexpr
+            // move coordinate
+            if constexpr(idx_1d.value != num_accesses - 1)
            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
-
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
-
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim_(i) &=
-                            ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
-                    });
-                });
-
-                return move_on_dim_;
+                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
+                move_tensor_coordinate(
+                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
            }
-            ();
-
-            // move dst coord
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
-                    }
-                }
-            });
        });

        // move dst coordinate back to slice origin (or not)
@@ -522,55 +356,15 @@ struct ThreadwiseTensorSliceTransfer_v3r1
        constexpr auto src_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});

-        constexpr auto src_access_lengths = SliceLengths{} / src_scalar_per_access;
-
-        constexpr auto src_dim_access_order = SrcDimAccessOrder{};
-
-        constexpr auto ordered_src_access_lengths =
-            container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
-
-        // judge move forward or move backward during the last iteration
-        constexpr auto forward_sweep = [&]() {
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-            forward_sweep_(I0) = true;
-
-            static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_src_access_lengths[I0] - 1;
-
-                static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
-                });
-
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-
-            return forward_sweep_;
-        }();
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    SrcDimAccessOrder,
+                                                    remove_cv_t<decltype(src_scalar_per_access)>>;

-        // calculate src data index after last iteration in RunRead(), if it has not being reset by
-        // RunRead()
-        constexpr auto src_data_idx = [&]() {
-            Index ordered_idx;
+        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
+        constexpr auto reset_step =
+            SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{});

-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
-            });
-
-            return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
-                   src_scalar_per_access;
-        }();
-
-        //
-        constexpr auto reset_src_data_step = [&]() {
-            Index reset_src_data_step_;
-
-            static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; });
-
-            return reset_src_data_step_;
-        }();
-
-        return reset_src_data_step;
+        return reset_step;
    }

    __device__ static constexpr auto GetDstCoordinateResetStep()
@@ -580,55 +374,15 @@ struct ThreadwiseTensorSliceTransfer_v3r1
        constexpr auto dst_scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<DstVectorDim, DstScalarPerVector>{}, Number<nDim>{});

-        constexpr auto dst_access_lengths = SliceLengths{} / dst_scalar_per_access;
-
-        constexpr auto dst_dim_access_order = DstDimAccessOrder{};
-
-        constexpr auto ordered_dst_access_lengths =
-            container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
-
-        // judge move forward or move backward during the last iteration
-        constexpr auto forward_sweep = [&]() {
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-            forward_sweep_(I0) = true;
-
-            static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_dst_access_lengths[I0] - 1;
-
-                static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
-                });
-
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-
-            return forward_sweep_;
-        }();
-
-        // calculate dst data index after last iteration in RunWrite(), if it has not being reset by
-        // RunWrite()
-        constexpr auto dst_data_idx = [&]() {
-            Index ordered_idx;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
-            });
-
-            return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
-                   dst_scalar_per_access;
-        }();
-
-        //
-        constexpr auto reset_dst_data_step = [&]() {
-            Index reset_dst_data_step_;
-
-            static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DstDimAccessOrder,
+                                                    remove_cv_t<decltype(dst_scalar_per_access)>>;

-            return reset_dst_data_step_;
-        }();
+        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
+        constexpr auto reset_step =
+            SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{});

-        return reset_dst_data_step;
+        return reset_step;
    }

    // src_slice_origin_step_idx need to be known at compile-time, for performance reason

--- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r1.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r1.hpp
@@ -4,6 +4,7 @@
 #include "common_header.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
+#include "tensor_space_filling_curve.hpp"

 namespace ck {

@@ -40,9 +41,6 @@ struct ThreadwiseTensorSliceTransfer_v6r1
    using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
    using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));

-    using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
-    using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
-
    static constexpr auto I0 = Number<0>{};

    __device__ constexpr ThreadwiseTensorSliceTransfer_v6r1(const SrcDesc& src_desc,
@@ -79,70 +77,14 @@ struct ThreadwiseTensorSliceTransfer_v6r1
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

-        constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
-
-        constexpr auto dim_access_order = DimAccessOrder{};
-
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-
-        auto make_forward_steps = [&](auto desc) {
-            return generate_tuple(
-                [&](auto i) {
-                    Index forward_step_idx;
-
-                    static_for<0, nDim, 1>{}([&](auto j) {
-                        forward_step_idx(j) = (i.value == j.value) ? scalar_per_access[i] : 0;
-                    });
-
-                    return make_tensor_coordinate_step(desc, forward_step_idx);
-                },
-                Number<nDim>{});
-        };
-
-        auto make_backward_steps = [&](auto desc) {
-            return generate_tuple(
-                [&](auto i) {
-                    Index backward_step_idx;
-
-                    static_for<0, nDim, 1>{}([&](auto j) {
-                        backward_step_idx(j) = (i.value == j.value) ? -scalar_per_access[i] : 0;
-                    });
-
-                    return make_tensor_coordinate_step(desc, backward_step_idx);
-                },
-                Number<nDim>{});
-        };
-
-        // make forward steps
-        const auto src_forward_steps = make_forward_steps(src_desc);
-        const auto dst_forward_steps = make_forward_steps(dst_desc);
-
-        // make backward steps
-        const auto src_backward_steps = make_backward_steps(src_desc);
-        const auto dst_backward_steps = make_backward_steps(dst_desc);
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
+                                                    remove_cv_t<decltype(scalar_per_access)>>;

-        // loop over slice window
-        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-                forward_sweep_(I0) = true;
-
-                static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_access_idx[I0];
-
-                    static_for<1, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
-                    });
-
-                    forward_sweep_(i) = tmp % 2 == 0;
-                });
-
-                return forward_sweep_;
-            }();
+        // loop over space-filling curve
+        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();

+        static_for<0, num_accesses, 1>{}([&](auto idx_1d) {
            using src_vector_type = vector_type_maker_t<SrcData, ScalarPerVector>;
            using src_vector_t    = typename src_vector_type::type;

@@ -168,59 +110,20 @@ struct ThreadwiseTensorSliceTransfer_v6r1
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);

            // copy data from dst_vector into dst_buf
-            if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set)
-            {
-                dst_buf.template Set<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector_container.template AsType<dst_vector_t>()[I0]);
-            }
-            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd)
-            {
-                dst_buf.template AtomicAdd<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector_container.template AsType<dst_vector_t>()[I0]);
-            }
+            dst_buf.template Update<DstInMemOp, dst_vector_t>(
+                dst_coord_.GetOffset(),
+                is_dst_valid,
+                dst_vector_container.template AsType<dst_vector_t>()[I0]);

-            constexpr auto move_on_dim = [&]() constexpr
+            // move coordinate
+            if constexpr(idx_1d.value != num_accesses - 1)
            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
-
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
-
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
-                    });
-                });
-
-                return move_on_dim_;
+                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
+                move_tensor_coordinate(
+                    src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
+                move_tensor_coordinate(
+                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
            }
-            ();
-
-            // move coordinate
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            src_desc, src_coord_, src_forward_steps[dim_access_order[i]]);
-
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            src_desc, src_coord_, src_backward_steps[dim_access_order[i]]);
-
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);
-                    }
-                }
-            });
        });

        // move coordinate back to slice origin (or not)
@@ -243,59 +146,18 @@ struct ThreadwiseTensorSliceTransfer_v6r1

    __device__ static constexpr auto GetCoordinateResetStep()
    {
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

-        constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
-
-        constexpr auto dim_access_order = DimAccessOrder{};
-
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-
-        // judge move forward or move backward during the last iteration
-        constexpr auto forward_sweep = [&]() {
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-            forward_sweep_(I0) = true;
-
-            static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_access_lengths[I0] - 1;
-
-                static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
-                });
-
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-
-            return forward_sweep_;
-        }();
-
-        // calculate data index after last iteration in Run(), if it has not being reset
-        constexpr auto data_idx = [&]() {
-            Index ordered_idx;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
-            });
-
-            return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                   scalar_per_access;
-        }();
-
-        //
-        constexpr auto reset_data_step = [&]() {
-            Index reset_data_step_;
-
-            static_for<0, nDim, 1>{}([&](auto i) { reset_data_step_(i) = -data_idx[i]; });
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
+                                                    remove_cv_t<decltype(scalar_per_access)>>;

-            return reset_data_step_;
-        }();
+        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
+        constexpr auto reset_step =
+            SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{});

-        return reset_data_step;
+        return reset_step;
    }

    // src_slice_origin_step_idx need to be known at compile-time, for performance reason
@@ -332,7 +194,7 @@ struct ThreadwiseTensorSliceTransfer_v6r1
    SrcCoord src_coord_;
    DstCoord dst_coord_;
    const ElementwiseOperation element_op_;
-};
+}; // namespace ck

 } // namespace ck
 #endif
--- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r2.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r2.hpp
@@ -4,6 +4,7 @@
 #include "common_header.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
+#include "tensor_space_filling_curve.hpp"

 namespace ck {

@@ -44,10 +45,6 @@ struct ThreadwiseTensorSliceTransfer_v6r2
    using Src1Coord = decltype(make_tensor_coordinate(Src1Desc{}, Index{}));
    using DstCoord  = decltype(make_tensor_coordinate(DstDesc{}, Index{}));

-    using Src0CoordStep = decltype(make_tensor_coordinate_step(Src0Desc{}, Index{}));
-    using Src1CoordStep = decltype(make_tensor_coordinate_step(Src1Desc{}, Index{}));
-    using DstCoordStep  = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
-
    static constexpr auto I0 = Number<0>{};

    __device__ constexpr ThreadwiseTensorSliceTransfer_v6r2(const Src0Desc& src0_desc,
@@ -96,72 +93,14 @@ struct ThreadwiseTensorSliceTransfer_v6r2
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

-        constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
-
-        constexpr auto dim_access_order = DimAccessOrder{};
-
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-
-        auto make_forward_steps = [&](auto desc) {
-            return generate_tuple(
-                [&](auto i) {
-                    Index forward_step_idx;
-
-                    static_for<0, nDim, 1>{}([&](auto j) {
-                        forward_step_idx(j) = (i.value == j.value) ? scalar_per_access[i] : 0;
-                    });
-
-                    return make_tensor_coordinate_step(desc, forward_step_idx);
-                },
-                Number<nDim>{});
-        };
-
-        auto make_backward_steps = [&](auto desc) {
-            return generate_tuple(
-                [&](auto i) {
-                    Index backward_step_idx;
-
-                    static_for<0, nDim, 1>{}([&](auto j) {
-                        backward_step_idx(j) = (i.value == j.value) ? -scalar_per_access[i] : 0;
-                    });
-
-                    return make_tensor_coordinate_step(desc, backward_step_idx);
-                },
-                Number<nDim>{});
-        };
-
-        // make forward steps
-        const auto src0_forward_steps = make_forward_steps(src0_desc);
-        const auto src1_forward_steps = make_forward_steps(src1_desc);
-        const auto dst_forward_steps  = make_forward_steps(dst_desc);
-
-        // make backward steps
-        const auto src0_backward_steps = make_backward_steps(src0_desc);
-        const auto src1_backward_steps = make_backward_steps(src1_desc);
-        const auto dst_backward_steps  = make_backward_steps(dst_desc);
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
+                                                    remove_cv_t<decltype(scalar_per_access)>>;

-        // loop over slice window
-        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-                forward_sweep_(I0) = true;
-
-                static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_access_idx[I0];
-
-                    static_for<1, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
-                    });
-
-                    forward_sweep_(i) = tmp % 2 == 0;
-                });
-
-                return forward_sweep_;
-            }();
+        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();

+        // loop over space-filling curve
+        static_for<0, num_accesses, 1>{}([&](auto idx_1d) {
            using src0_vector_type = vector_type_maker_t<Src0Data, ScalarPerVector>;
            using src0_vector_t    = typename src0_vector_type::type;

@@ -197,65 +136,22 @@ struct ThreadwiseTensorSliceTransfer_v6r2
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);

            // copy data from dst_vector into dst_buf
-            if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set)
-            {
-                dst_buf.template Set<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector_container.template AsType<dst_vector_t>()[I0]);
-            }
-            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd)
-            {
-                dst_buf.template AtomicAdd<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector_container.template AsType<dst_vector_t>()[I0]);
-            }
+            dst_buf.template Update<DstInMemOp, dst_vector_t>(
+                dst_coord_.GetOffset(),
+                is_dst_valid,
+                dst_vector_container.template AsType<dst_vector_t>()[I0]);

-            constexpr auto move_on_dim = [&]() constexpr
+            // move coordinate
+            if constexpr(idx_1d.value != num_accesses - 1)
            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
-
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
-
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
-                    });
-                });
-
-                return move_on_dim_;
+                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
+                move_tensor_coordinate(
+                    src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step));
+                move_tensor_coordinate(
+                    src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step));
+                move_tensor_coordinate(
+                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
            }
-            ();
-
-            // move coordinate
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            src0_desc, src0_coord_, src0_forward_steps[dim_access_order[i]]);
-
-                        move_tensor_coordinate(
-                            src1_desc, src1_coord_, src1_forward_steps[dim_access_order[i]]);
-
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            src0_desc, src0_coord_, src0_backward_steps[dim_access_order[i]]);
-
-                        move_tensor_coordinate(
-                            src1_desc, src1_coord_, src1_backward_steps[dim_access_order[i]]);
-
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);
-                    }
-                }
-            });
        });

        // move coordinate back to slice origin (or not)
@@ -286,59 +182,18 @@ struct ThreadwiseTensorSliceTransfer_v6r2

    __device__ static constexpr auto GetCoordinateResetStep()
    {
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

-        constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
-
-        constexpr auto dim_access_order = DimAccessOrder{};
-
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-
-        // judge move forward or move backward during the last iteration
-        constexpr auto forward_sweep = [&]() {
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-            forward_sweep_(I0) = true;
-
-            static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_access_lengths[I0] - 1;
-
-                static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
-                });
-
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-
-            return forward_sweep_;
-        }();
-
-        // calculate data index after last iteration in Run(), if it has not being reset
-        constexpr auto data_idx = [&]() {
-            Index ordered_idx;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
-            });
-
-            return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                   scalar_per_access;
-        }();
-
-        //
-        constexpr auto reset_data_step = [&]() {
-            Index reset_data_step_;
-
-            static_for<0, nDim, 1>{}([&](auto i) { reset_data_step_(i) = -data_idx[i]; });
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
+                                                    remove_cv_t<decltype(scalar_per_access)>>;

-            return reset_data_step_;
-        }();
+        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
+        constexpr auto reset_step =
+            SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{});

-        return reset_data_step;
+        return reset_step;
    }

    // src_slice_origin_step_idx need to be known at compile-time, for performance reason

--- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r3.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v6r3.hpp
@@ -4,6 +4,7 @@
 #include "common_header.hpp"
 #include "tensor_descriptor.hpp"
 #include "tensor_descriptor_helper.hpp"
+#include "tensor_space_filling_curve.hpp"

 namespace ck {

@@ -48,11 +49,6 @@ struct ThreadwiseTensorSliceTransfer_v6r3
    using Src2Coord = decltype(make_tensor_coordinate(Src2Desc{}, Index{}));
    using DstCoord  = decltype(make_tensor_coordinate(DstDesc{}, Index{}));

-    using Src0CoordStep = decltype(make_tensor_coordinate_step(Src0Desc{}, Index{}));
-    using Src1CoordStep = decltype(make_tensor_coordinate_step(Src1Desc{}, Index{}));
-    using Src2CoordStep = decltype(make_tensor_coordinate_step(Src2Desc{}, Index{}));
-    using DstCoordStep  = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
-
    static constexpr auto I0 = Number<0>{};

    __device__ constexpr ThreadwiseTensorSliceTransfer_v6r3(const Src0Desc& src0_desc,
@@ -112,74 +108,14 @@ struct ThreadwiseTensorSliceTransfer_v6r3
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

-        constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
-
-        constexpr auto dim_access_order = DimAccessOrder{};
-
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-
-        auto make_forward_steps = [&](auto desc) {
-            return generate_tuple(
-                [&](auto i) {
-                    Index forward_step_idx;
-
-                    static_for<0, nDim, 1>{}([&](auto j) {
-                        forward_step_idx(j) = (i.value == j.value) ? scalar_per_access[i] : 0;
-                    });
-
-                    return make_tensor_coordinate_step(desc, forward_step_idx);
-                },
-                Number<nDim>{});
-        };
-
-        auto make_backward_steps = [&](auto desc) {
-            return generate_tuple(
-                [&](auto i) {
-                    Index backward_step_idx;
-
-                    static_for<0, nDim, 1>{}([&](auto j) {
-                        backward_step_idx(j) = (i.value == j.value) ? -scalar_per_access[i] : 0;
-                    });
-
-                    return make_tensor_coordinate_step(desc, backward_step_idx);
-                },
-                Number<nDim>{});
-        };
-
-        // make forward steps
-        const auto src0_forward_steps = make_forward_steps(src0_desc);
-        const auto src1_forward_steps = make_forward_steps(src1_desc);
-        const auto src2_forward_steps = make_forward_steps(src2_desc);
-        const auto dst_forward_steps  = make_forward_steps(dst_desc);
-
-        // make backward steps
-        const auto src0_backward_steps = make_backward_steps(src0_desc);
-        const auto src1_backward_steps = make_backward_steps(src1_desc);
-        const auto src2_backward_steps = make_backward_steps(src2_desc);
-        const auto dst_backward_steps  = make_backward_steps(dst_desc);
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
+                                                    remove_cv_t<decltype(scalar_per_access)>>;

-        // loop over slice window
-        static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
-            // judge move forward or move backward
-            constexpr auto forward_sweep = [&]() {
-                StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-                forward_sweep_(I0) = true;
-
-                static_for<1, nDim, 1>{}([&](auto i) {
-                    index_t tmp = ordered_access_idx[I0];
-
-                    static_for<1, i, 1>{}([&](auto j) {
-                        tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
-                    });
-
-                    forward_sweep_(i) = tmp % 2 == 0;
-                });
-
-                return forward_sweep_;
-            }();
+        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();

+        // loop over space-filling curve
+        static_for<0, num_accesses, 1>{}([&](auto idx_1d) {
            using src0_vector_type = vector_type_maker_t<Src0Data, ScalarPerVector>;
            using src0_vector_t    = typename src0_vector_type::type;

@@ -224,72 +160,24 @@ struct ThreadwiseTensorSliceTransfer_v6r3
            const bool is_dst_valid =
                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);

-            // copy data from dst_vector into dst_buf
-            if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::Set)
-            {
-                dst_buf.template Set<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector_container.template AsType<dst_vector_t>()[I0]);
-            }
-            else if constexpr(DstInMemOp == InMemoryDataOperationEnum_t::AtomicAdd)
-            {
-                dst_buf.template AtomicAdd<dst_vector_t>(
-                    dst_coord_.GetOffset(),
-                    is_dst_valid,
-                    dst_vector_container.template AsType<dst_vector_t>()[I0]);
-            }
+            dst_buf.template Update<DstInMemOp, dst_vector_t>(
+                dst_coord_.GetOffset(),
+                is_dst_valid,
+                dst_vector_container.template AsType<dst_vector_t>()[I0]);

-            constexpr auto move_on_dim = [&]() constexpr
+            // move coordinate
+            if constexpr(idx_1d.value != num_accesses - 1)
            {
-                StaticallyIndexedArray<bool, nDim> move_on_dim_;
-
-                static_for<0, nDim, 1>{}([&](auto i) {
-                    move_on_dim_(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
-
-                    static_for<i + 1, nDim, 1>{}([&](auto j) {
-                        move_on_dim_(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
-                    });
-                });
-
-                return move_on_dim_;
+                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
+                move_tensor_coordinate(
+                    src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step));
+                move_tensor_coordinate(
+                    src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step));
+                move_tensor_coordinate(
+                    src2_desc, src2_coord_, make_tensor_coordinate_step(src2_desc, forward_step));
+                move_tensor_coordinate(
+                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
            }
-            ();
-
-            // move coordinate
-            static_for<0, nDim, 1>{}([&](auto i) {
-                if constexpr(move_on_dim[i])
-                {
-                    if constexpr(forward_sweep[i])
-                    {
-                        move_tensor_coordinate(
-                            src0_desc, src0_coord_, src0_forward_steps[dim_access_order[i]]);
-
-                        move_tensor_coordinate(
-                            src1_desc, src1_coord_, src1_forward_steps[dim_access_order[i]]);
-
-                        move_tensor_coordinate(
-                            src2_desc, src2_coord_, src2_forward_steps[dim_access_order[i]]);
-
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_forward_steps[dim_access_order[i]]);
-                    }
-                    else
-                    {
-                        move_tensor_coordinate(
-                            src0_desc, src0_coord_, src0_backward_steps[dim_access_order[i]]);
-
-                        move_tensor_coordinate(
-                            src1_desc, src1_coord_, src1_backward_steps[dim_access_order[i]]);
-
-                        move_tensor_coordinate(
-                            src2_desc, src2_coord_, src2_backward_steps[dim_access_order[i]]);
-
-                        move_tensor_coordinate(
-                            dst_desc, dst_coord_, dst_backward_steps[dim_access_order[i]]);
-                    }
-                }
-            });
        });

        // move coordinate back to slice origin (or not)
@@ -328,59 +216,18 @@ struct ThreadwiseTensorSliceTransfer_v6r3

    __device__ static constexpr auto GetCoordinateResetStep()
    {
-        // scalar per access on each dim
-        // TODO: don't use lambda_scalar_per_access
        constexpr auto scalar_per_access = generate_sequence(
            detail::lambda_scalar_per_access<VectorDim, ScalarPerVector>{}, Number<nDim>{});

-        constexpr auto access_lengths = SliceLengths{} / scalar_per_access;
-
-        constexpr auto dim_access_order = DimAccessOrder{};
-
-        constexpr auto ordered_access_lengths =
-            container_reorder_given_new2old(access_lengths, dim_access_order);
-
-        // judge move forward or move backward during the last iteration
-        constexpr auto forward_sweep = [&]() {
-            StaticallyIndexedArray<bool, nDim> forward_sweep_;
-
-            forward_sweep_(I0) = true;
-
-            static_for<1, nDim, 1>{}([&](auto i) {
-                index_t tmp = ordered_access_lengths[I0] - 1;
-
-                static_for<1, i, 1>{}([&](auto j) {
-                    tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
-                });
-
-                forward_sweep_(i) = tmp % 2 == 0;
-            });
-
-            return forward_sweep_;
-        }();
-
-        // calculate data index after last iteration in Run(), if it has not being reset
-        constexpr auto data_idx = [&]() {
-            Index ordered_idx;
-
-            static_for<0, nDim, 1>{}([&](auto i) {
-                ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
-            });
-
-            return container_reorder_given_old2new(ordered_idx, dim_access_order) *
-                   scalar_per_access;
-        }();
-
-        //
-        constexpr auto reset_data_step = [&]() {
-            Index reset_data_step_;
-
-            static_for<0, nDim, 1>{}([&](auto i) { reset_data_step_(i) = -data_idx[i]; });
+        using SpaceFillingCurve = SpaceFillingCurve<SliceLengths,
+                                                    DimAccessOrder,
+                                                    remove_cv_t<decltype(scalar_per_access)>>;

-            return reset_data_step_;
-        }();
+        constexpr auto num_accesses = SpaceFillingCurve::GetNumOfAccess();
+        constexpr auto reset_step =
+            SpaceFillingCurve::GetStepBetween(Number<num_accesses - 1>{}, Number<0>{});

-        return reset_data_step;
+        return reset_step;
    }

    // src_slice_origin_step_idx need to be known at compile-time, for performance reason

--- a/composable_kernel/include/utility/dynamic_buffer.hpp
+++ b/composable_kernel/include/utility/dynamic_buffer.hpp
@@ -3,6 +3,7 @@

 #include "amd_buffer_addressing.hpp"
 #include "c_style_pointer_cast.hpp"
+#include "config.hpp"
 #include "enable_if.hpp"

 namespace ck {
@@ -108,6 +109,30 @@ struct DynamicBuffer
        }
    }

+    template <InMemoryDataOperationEnum_t Op,
+              typename X,
+              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
+                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
+                                 bool>::type = false>
+    __host__ __device__ void Update(index_t i, bool is_valid_element, const X& x)
+    {
+        if constexpr(Op == InMemoryDataOperationEnum_t::Set)
+        {
+            this->template Set<X>(i, is_valid_element, x);
+        }
+        else if constexpr(Op == InMemoryDataOperationEnum_t::AtomicAdd)
+        {
+            this->template AtomicAdd<X>(i, is_valid_element, x);
+        }
+        else if constexpr(Op == InMemoryDataOperationEnum_t::Add)
+        {
+            auto tmp = this->template Get<X>(i, is_valid_element);
+            this->template Set<X>(i, is_valid_element, x + tmp);
+            // tmp += x;
+            // this->template Set<X>(i, is_valid_element, tmp);
+        }
+    }
+
    template <typename X,
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                         typename scalar_type<remove_cvref_t<T>>::type>::value,

--- a/composable_kernel/include/utility/magic_division.hpp
+++ b/composable_kernel/include/utility/magic_division.hpp
@@ -25,21 +25,30 @@ struct MagicDivision
    // uint32_t
    __host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor)
    {
-        // assert(divisior >= 1 && divisior <= INT32_MAX);
-        uint32_t shift = 0;
-        for(shift = 0; shift < 32; ++shift)
+        // WARNING: magic division is only applicable for division inside this range.
+        // You should use the return value of CalculateMagicNumbers, if division is not inside this
+        // range. The "else" logic below is to quiet down run-time error.
+        if(divisor >= 1 && divisor <= INT32_MAX)
        {
-            if((1U << shift) >= divisor)
+            uint32_t shift = 0;
+            for(shift = 0; shift < 32; ++shift)
            {
-                break;
+                if((1U << shift) >= divisor)
+                {
+                    break;
+                }
            }
-        }

-        uint64_t one        = 1;
-        uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
-        // assert(multiplier <= 0xffffffffUL);
+            uint64_t one        = 1;
+            uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
+            // assert(multiplier <= 0xffffffffUL);

-        return make_tuple(uint32_t(multiplier), shift);
+            return make_tuple(uint32_t(multiplier), shift);
+        }
+        else
+        {
+            return make_tuple(uint32_t(0), uint32_t(0));
+        }
    }

    __host__ __device__ static constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor)

--- a/composable_kernel/include/utility/tensor_space_filling_curve.hpp
+++ b/composable_kernel/include/utility/tensor_space_filling_curve.hpp
+#ifndef TENSOR_SPACE_FILLING_CURVE_HPP
+#define TENSOR_SPACE_FILLING_CURVE_HPP
+
 #include "math.hpp"
 #include "sequence.hpp"
+#include "sequence_helper.hpp"
 #include "tensor_adaptor.hpp"
 #include "statically_indexed_array_multi_index.hpp"
 #include "tuple_helper.hpp"
@@ -37,13 +41,25 @@ struct SpaceFillingCurve
               ScalarPerVector;
    }

+    template <index_t AccessIdx1dBegin, index_t AccessIdx1dEnd>
+    static __device__ __host__ constexpr auto GetStepBetween(Number<AccessIdx1dBegin>,
+                                                             Number<AccessIdx1dEnd>)
+    {
+        static_assert(AccessIdx1dBegin >= 0, "1D index should be non-negative");
+        static_assert(AccessIdx1dBegin < GetNumOfAccess(), "1D index should be larger than 0");
+        static_assert(AccessIdx1dEnd >= 0, "1D index should be non-negative");
+        static_assert(AccessIdx1dEnd < GetNumOfAccess(), "1D index should be larger than 0");
+
+        constexpr auto idx_begin = GetIndex(Number<AccessIdx1dBegin>{});
+        constexpr auto idx_end   = GetIndex(Number<AccessIdx1dEnd>{});
+        return idx_end - idx_begin;
+    }
+
    template <index_t AccessIdx1d>
    static __device__ __host__ constexpr auto GetForwardStep(Number<AccessIdx1d>)
    {
-
-        constexpr auto idx_curr = GetIndex(Number<AccessIdx1d>{});
-        constexpr auto idx_next = GetIndex(Number<AccessIdx1d + 1>{});
-        return idx_next - idx_curr;
+        static_assert(AccessIdx1d < GetNumOfAccess(), "1D index should be larger than 0");
+        return GetStepBetween(Number<AccessIdx1d>{}, Number<AccessIdx1d + 1>{});
    }

    template <index_t AccessIdx1d>
@@ -51,9 +67,7 @@ struct SpaceFillingCurve
    {
        static_assert(AccessIdx1d > 0, "1D index should be larger than 0");

-        constexpr auto idx_curr = GetIndex(Number<AccessIdx1d>{});
-        constexpr auto idx_prev = GetIndex(Number<AccessIdx1d - 1>{});
-        return idx_prev - idx_curr;
+        return GetStepBetween(Number<AccessIdx1d>{}, Number<AccessIdx1d - 1>{});
    }

    template <index_t AccessIdx1d>
@@ -129,3 +143,4 @@ struct SpaceFillingCurve
 };

 } // namespace ck
+#endif
--- a/device_operation/CMakeLists.txt
+++ b/device_operation/CMakeLists.txt
@@ -101,16 +101,25 @@ set(DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE
   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp;
 )

-add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE})
-add_library(device_gemm_bias_2d_instance SHARED ${DEVICE_GEMM_BIAS_2D_INSTANCE_SOURCE})
-add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE})
-add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE})
+# device_conv2d_bwd_data_instance
+set(DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE 
+   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp;
+   ${PROJECT_SOURCE_DIR}/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp;
+) 
+
+add_library(device_gemm_instance SHARED ${DEVICE_GEMM_INSTANCE_SOURCE}) 
+add_library(device_gemm_bias_relu_instance SHARED ${DEVICE_GEMM_BIAS_RELU_INSTANCE_SOURCE}) 
+add_library(device_gemm_bias_relu_add_instance SHARED ${DEVICE_GEMM_BIAS_RELU_ADD_INSTANCE_SOURCE}) 
 add_library(device_batched_gemm_instance SHARED ${DEVICE_BATCHED_GEMM_INSTANCE_SOURCE})
 add_library(device_conv1d_fwd_instance SHARED ${DEVICE_CONV1D_FWD_INSTANCE_SOURCE}) 
 add_library(device_conv2d_fwd_instance SHARED ${DEVICE_CONV2D_FWD_INSTANCE_SOURCE}) 
 add_library(device_conv2d_fwd_bias_relu_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_INSTANCE_SOURCE}) 
 add_library(device_conv2d_fwd_bias_relu_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ADD_INSTANCE_SOURCE}) 
 add_library(device_conv2d_fwd_bias_relu_atomic_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE}) 
+add_library(device_gemm_bias_2d_instance SHARED ${DEVICE_GEMM_BIAS_2D_INSTANCE_SOURCE})
+add_library(device_conv2d_bwd_data_instance SHARED ${DEVICE_CONV2D_BWD_DATA_INSTANCE_SOURCE})

 target_include_directories(device_gemm_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
 target_include_directories(device_gemm_bias_2d_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
@@ -122,6 +131,7 @@ target_include_directories(device_conv2d_fwd_instance SYSTEM PUBLIC $<BUILD_INTE
 target_include_directories(device_conv2d_fwd_bias_relu_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
 target_include_directories(device_conv2d_fwd_bias_relu_add_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
 target_include_directories(device_conv2d_fwd_bias_relu_atomic_add_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
+target_include_directories(device_conv2d_bwd_data_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)

 target_compile_features(device_gemm_instance PUBLIC)
 target_compile_features(device_gemm_bias_2d_instance PUBLIC)
@@ -133,6 +143,7 @@ target_compile_features(device_conv2d_fwd_instance PUBLIC)
 target_compile_features(device_conv2d_fwd_bias_relu_instance PUBLIC)
 target_compile_features(device_conv2d_fwd_bias_relu_add_instance PUBLIC)
 target_compile_features(device_conv2d_fwd_bias_relu_atomic_add_instance PUBLIC)
+target_compile_features(device_conv2d_bwd_data_instance PUBLIC)

 set_target_properties(device_gemm_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(device_gemm_bias_2d_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -144,6 +155,7 @@ set_target_properties(device_conv2d_fwd_instance PROPERTIES POSITION_INDEPENDENT
 set_target_properties(device_conv2d_fwd_bias_relu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(device_conv2d_fwd_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(device_conv2d_fwd_bias_relu_atomic_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
+set_target_properties(device_conv2d_bwd_data_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)

 install(TARGETS device_gemm_instance LIBRARY DESTINATION lib)
 install(TARGETS device_gemm_bias_2d_instance LIBRARY DESTINATION lib)
@@ -155,3 +167,4 @@ install(TARGETS device_conv2d_fwd_instance LIBRARY DESTINATION lib)
 install(TARGETS device_conv2d_fwd_bias_relu_instance LIBRARY DESTINATION lib) 
 install(TARGETS device_conv2d_fwd_bias_relu_add_instance LIBRARY DESTINATION lib) 
 install(TARGETS device_conv2d_fwd_bias_relu_atomic_add_instance LIBRARY DESTINATION lib) 
+install(TARGETS device_conv2d_bwd_data_instance LIBRARY DESTINATION lib) 
--- a/device_operation/include/convolution_backward_data_specialization.hpp
+++ b/device_operation/include/convolution_backward_data_specialization.hpp
+#ifndef CONVOLUTION_BACKWARD_DATA_SPECIALIZATION
+#define CONVOLUTION_BACKWARD_DATA_SPECIALIZATION
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+enum ConvolutionBackwardDataSpecialization_t
+{
+    Default,
+    Filter1x1Stride1Pad0,
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+#endif
--- a/device_operation/include/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
+++ b/device_operation/include/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp
--- a/device_operation/include/device_conv_bwd_data.hpp
+++ b/device_operation/include/device_conv_bwd_data.hpp
+#ifndef DEVICE_CONV_BWD_DATA_HPP
+#define DEVICE_CONV_BWD_DATA_HPP
+
+#include <iostream>
+#include "device_base.hpp"
+#include "element_wise_operation.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation>
+struct DeviceConvBwdData : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(void* p_in,
+                        const void* p_wei,
+                        const void* p_out,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t C,
+                        std::vector<ck::index_t> input_spatial_lengths,
+                        std::vector<ck::index_t> filter_spatial_lengths,
+                        std::vector<ck::index_t> output_spatial_lengths,
+                        std::vector<ck::index_t> conv_filter_strides,
+                        std::vector<ck::index_t> conv_filter_dilations,
+                        std::vector<ck::index_t> input_left_pads,
+                        std::vector<ck::index_t> input_right_pads,
+                        InElementwiseOperation in_element_op,
+                        WeiElementwiseOperation wei_element_op,
+                        OutElementwiseOperation out_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+template <typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation>
+using DeviceConvBwdDataPtr = std::unique_ptr<
+    DeviceConvBwdData<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>>;
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
+#endif
--- a/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
+++ b/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
+#include <stdlib.h>
+#include "config.hpp"
+#include "device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp"
+#include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_conv2d_bwd_data_instance {
+
+using BF16 = ushort;
+using F32  = float;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvBwdDataDefault =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default;
+
+static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0;
+
+// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k]
+using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances = std::tuple<
+    // clang-format off
+        //####################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|        ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+        //####################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                Data|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+        //####################################################################|       |        |        |        |   Operation|   Operation|   Operation|      Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+        //####################################################################|       |        |        |        |            |            |            |                    |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,   128,    64,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,    64,   128,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,    64,    64,    64,     4,  8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,   128,    32,     4,  8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,    32,   128,     4,  8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,    64,    64,    32,     4,  8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>
+    // clang-format on
+    >;
+
+using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances =
+    std::tuple<
+        // clang-format off
+        //####################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|                     ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+        //####################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                             Data|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+        //####################################################################|       |        |        |        |   Operation|   Operation|   Operation|                   Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+        //####################################################################|       |        |        |        |            |            |            |                                 |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,   128,    64,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,    64,   128,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,    64,    64,    64,     4,  8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,   128,    32,     4,  8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,    32,   128,     4,  8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,    64,    64,    32,     4,  8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>
+        // clang-format on
+        >;
+
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(
+    std::vector<DeviceConvBwdDataPtr<PassThrough, PassThrough, PassThrough>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances{});
+    add_device_operation_instances(
+        instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_bf16_instances{});
+}
+
+} // namespace device_conv2d_bwd_data_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp
+#include <stdlib.h>
+#include "config.hpp"
+#include "device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp"
+#include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_conv2d_bwd_data_instance {
+
+using F16 = ck::half_t;
+using F32 = float;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvBwdDataDefault =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default;
+
+static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0;
+
+// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k]
+using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances = std::tuple<
+    // clang-format off
+        //####################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|       ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+        //####################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|               Data|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+        //####################################################################|       |        |        |        |   Operation|   Operation|   Operation|     Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+        //####################################################################|       |        |        |        |            |            |            |                   |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault,   256,   256,   128,     4,   8,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault,   256,   128,   256,     4,   8,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault,   128,   128,   128,     4,   8,  32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+#if !CK_WORKAROUND_SWDEV_325164
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault,   256,   128,   128,     4,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault,   256,    64,   128,     4,   8,  32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault,   128,    32,   128,     4,   8,  32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+#endif
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault,   128,   128,    64,     4,   8,  32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault,   128,    64,   128,     4,   8,  32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault,    64,    64,    64,     4,   8,  32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault,   256,   128,    64,     4,   8,  32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault,   128,   128,    32,     4,   8,  32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault,    64,    64,    32,     4,   8,  32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvBwdDataDefault,    64,    32,    64,     4,   8,  32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>
+    // clang-format on
+    >;
+
+using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances =
+    std::tuple<
+        // clang-format off
+        //####################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|                     ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+        //####################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                             Data|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+        //####################################################################|       |        |        |        |   Operation|   Operation|   Operation|                   Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+        //####################################################################|       |        |        |        |            |            |            |                                 |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,   128,    64,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,    64,   128,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,    64,    64,    64,     4,  8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,   128,    32,     4,  8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,    32,   128,     4,  8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,    64,    64,    32,     4,  8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1>
+        // clang-format on
+        >;
+
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<DeviceConvBwdDataPtr<PassThrough, PassThrough, PassThrough>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances{});
+    add_device_operation_instances(
+        instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances{});
+}
+
+} // namespace device_conv2d_bwd_data_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
+++ b/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp
+#include <stdlib.h>
+#include "config.hpp"
+#include "device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp"
+#include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_conv2d_bwd_data_instance {
+
+using F32 = float;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvBwdDataDefault =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default;
+
+static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0;
+
+// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k]
+using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances = std::tuple<
+    // clang-format off
+        //####################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|        ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+        //####################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                Data|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+        //####################################################################|       |        |        |        |   Operation|   Operation|   Operation|      Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+        //####################################################################|       |        |        |        |            |            |            |                    |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,   256,   128,     4,  4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,   128,   256,     4,  4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,   128,   128,     4,  4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,   128,   128,     4,  4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,   128,    64,     4,  4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,    64,   128,     4,  4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,    64,    64,    64,     4,  4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,   128,    64,     4,  4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,    64,   128,     4,  4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,   128,    32,     4,  4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,    32,   128,     4,  4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,    64,    64,    32,     4,  4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,    64,    32,    64,     4,  4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               7,               1>
+    // clang-format on
+    >;
+
+using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances =
+    std::tuple<
+        // clang-format off
+        //####################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|                     ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+        //####################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                             Data|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+        //####################################################################|       |        |        |        |   Operation|   Operation|   Operation|                   Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+        //####################################################################|       |        |        |        |            |            |            |                                 |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   256,   128,     4,  4,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   128,   256,     4,  4,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,   128,   128,     4,  4,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   128,   128,     4,  4,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,   128,    64,     4,  4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,    64,   128,     4,  4,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,    64,    64,    64,     4,  4,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   128,    64,     4,  4,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,    64,   128,     4,  4,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,   128,    32,     4,  4,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,    32,   128,     4,  4,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,    64,    64,    32,     4,  4,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,              4,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,    64,    32,    64,     4,  4,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,              4,      true,               7,               1>
+        // clang-format on
+        >;
+
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(
+    std::vector<DeviceConvBwdDataPtr<PassThrough, PassThrough, PassThrough>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances{});
+    add_device_operation_instances(
+        instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f32_instances{});
+}
+
+} // namespace device_conv2d_bwd_data_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
+++ b/device_operation/src/device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
+#include <stdlib.h>
+#include "config.hpp"
+#include "device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk.hpp"
+#include "element_wise_operation.hpp"
+#include "device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_conv2d_bwd_data_instance {
+
+using DataType = int8_t;
+using AccType  = int32_t;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+static constexpr auto ConvBwdDataDefault =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Default;
+
+static constexpr auto ConvBwdDataFilter1x1Stride1Pad0 =
+    ck::tensor_operation::device::ConvolutionBackwardDataSpecialization_t::Filter1x1Stride1Pad0;
+
+// Compilation parameters for in[n, hi, wi, c] * wei[k, y, x, c] = out[n, ho, wo, k]
+using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances = std::tuple<
+    // clang-format off
+        //####################################################################|    InData|   WeiData|   OutData|    AccData|          In|         Wei|         Out|        ConvBackward| Block|  MPer|  NPer| K0Per|  K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+        //####################################################################|      Type|      Type|      Type|       Type| Elementwise| Elementwise| Elementwise|                Data|  Size| Block| Block| Block|    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+        //####################################################################|          |          |          |           |   Operation|   Operation|   Operation|      Specialization|      |      |      |      |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+        //####################################################################|          |          |          |           |            |            |            |                    |      |      |      |      |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,   256,   128,     4,  16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,   128,   256,     4,  16,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,   128,   128,     4,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,   128,   128,     4,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,   128,    64,     4,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,    64,   128,     4,  16,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,    64,    64,    64,     4,  16,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,   128,    64,     4,  16,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   256,    64,   128,     4,  16,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,   128,    32,     4,  16,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,   128,    32,   128,     4,  16,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,    64,    64,    32,     4,  16,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<  DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataDefault,    64,    32,    64,     4,  16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,             16,      true,               7,               1>
+    // clang-format on
+    >;
+
+using device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances =
+    std::tuple<
+        // clang-format off
+        //#####################################################################|    InData|   WeiData|   OutData|    AccData|          In|         Wei|         Out|                     ConvBackward| Block|  MPer|  NPer| K0Per|  K1| MPer| NPer| MXdl| NXdl|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
+        //#####################################################################|      Type|      Type|      Type|       Type| Elementwise| Elementwise| Elementwise|                             Data|  Size| Block| Block| Block|    |  XDL|  XDL|  Per|  Per|    ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|
+        //#####################################################################|          |          |          |           |   Operation|   Operation|   Operation|                   Specialization|      |      |      |      |    |     |     | Wave| Wave|  Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|
+        //#####################################################################|          |          |          |           |            |            |            |                                 |      |      |      |      |    |     |     |     |     |                 |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   256,   128,     4,  16,   32,   32,    4,    2,      S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   128,   256,     4,  16,   32,   32,    2,    4,      S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,   128,   128,     4,  16,   32,   32,    4,    2,      S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   128,   128,     4,  16,   32,   32,    2,    2,      S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,   128,    64,     4,  16,   32,   32,    2,    2,      S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,    64,   128,     4,  16,   32,   32,    2,    2,      S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,    64,    64,    64,     4,  16,   32,   32,    2,    2,      S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,   128,    64,     4,  16,   32,   32,    2,    1,      S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   256,    64,   128,     4,  16,   32,   32,    1,    2,      S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 64, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,   128,    32,     4,  16,   32,   32,    2,    1,      S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              1,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,   128,    32,   128,     4,  16,   32,   32,    1,    2,      S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 32, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,    64,    64,    32,     4,  16,   32,   32,    2,    1,      S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              2,             16,      true,               7,               1>,
+        DeviceConv2dBwdDataXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<   DataType,  DataType,  DataType,    AccType, PassThrough, PassThrough, PassThrough,  ConvBwdDataFilter1x1Stride1Pad0,    64,    32,    64,     4,  16,   32,   32,    1,    2,      S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,      true,     S<4, 16, 1>,     S<2, 0, 1>,     S<0, 2, 1>,             1,              4,             16,      true,               7,               1>
+        // clang-format on
+        >;
+
+void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(
+    std::vector<DeviceConvBwdDataPtr<PassThrough, PassThrough, PassThrough>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances{});
+    add_device_operation_instances(
+        instances, device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_int8_instances{});
+}
+
+} // namespace device_conv2d_bwd_data_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/device_operation/src/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
+++ b/device_operation/src/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp
--- a/device_operation/src/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp
+++ b/device_operation/src/device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp