gaoqiong/composable_kernel

Commit d3405258, authored Jul 29, 2020 by Chao Liu

prototype dynamic descriptor

parent 834eb24c

Showing 12 changed files with 1172 additions and 51 deletions
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp             +502 -0
composable_kernel/include/kernel_algorithm/dummy_static_transform.hpp              +19  -24
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp     +223 -0
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp         +237 -0
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp  +35  -0
composable_kernel/include/utility/amd_llvm_intrinsic.hpp                           +11  -0
composable_kernel/include/utility/common_header.hpp                                +1   -0
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp            +2   -2
driver/include/device_dummy_dynamic_transform.hpp                                  +105 -0
driver/include/device_dummy_static_transform.hpp                                   +11  -11
driver/src/conv_driver.cpp                                                         +26  -13
driver/src/conv_driver.cu                                                          +0   -1
composable_kernel/include/kernel_algorithm/dummy_dynamic_transform.hpp
0 → 100644 (new file; diff collapsed, contents not shown)
composable_kernel/include/kernel_algorithm/dummy_static_transform.hpp

...
@@ -96,31 +96,26 @@ struct DummyStaticTransform
         auto coord =
             typename TensorCoordinate<decltype(in_gemmk_gemmn_global_desc)>::type(k0, n0);

-        if(get_block_1d_id() < coord.GetOffset())
+#pragma unroll 1
+        for(index_t k = 0; k < 100; ++k)
         {
-            for(index_t k = 0; k < 1; ++k)
-            {
-                for(index_t n = 0; n < 4; ++n)
-                {
-                    auto tmp = coord + Array<index_t, 2>{k, n};
+            coord += Array<index_t, 2>{8, 0};

-                    Float value = 1;
+            Float value = 1;

-                    transfer_data<Float,
-                                  1,
-                                  AddressSpace::Vgpr,
-                                  AddressSpace::Global,
-                                  InMemoryDataOperation::Set,
-                                  1,
-                                  1>(&value,
-                                     0,
-                                     true,
-                                     1,
-                                     p_in_global,
-                                     tmp.GetOffset(),
-                                     tmp.IsOffsetValidAssumingUpperIndexIsValid(),
-                                     in_gemmk_gemmn_global_desc.GetElementSpace());
-                }
-            }
+            transfer_data<Float,
+                          1,
+                          AddressSpace::Vgpr,
+                          AddressSpace::Global,
+                          InMemoryDataOperation::Set,
+                          1,
+                          1>(&value,
+                             0,
+                             true,
+                             1,
+                             p_in_global,
+                             coord.GetOffset(),
+                             coord.IsOffsetValidAssumingUpperIndexIsValid(),
+                             in_gemmk_gemmn_global_desc.GetElementSpace());
         }
     }
 };
...
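The hunk above replaces rebuilding a fresh coordinate per access (tmp = coord + {k, n}) with stepping one coordinate in place (coord += {8, 0}). A minimal standalone sketch (plain C++ with hypothetical strides, not CK code) of why the incremental form is cheaper for a native strided layout: the offset delta of a constant step is itself constant, so per-iteration multiplications disappear.

#include <cstdio>

int main()
{
    const int strides[2] = {256, 1}; // hypothetical row-major strides
    int idx[2]           = {0, 0};
    int offset           = 0;

    // precompute the offset delta of a {8, 0} step once
    const int step[2]  = {8, 0};
    const int step_off = step[0] * strides[0] + step[1] * strides[1];

    for(int k = 0; k < 100; ++k)
    {
        idx[0] += step[0];
        idx[1] += step[1];
        offset += step_off; // no re-multiplication per iteration

        if(k % 25 == 0)
            std::printf("k=%d offset=%d\n", k, offset);
    }
    return 0;
}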
composable_kernel/include/tensor_description/dynamic_multi_index_transform.hpp
0 → 100644 (new file)
#ifndef CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP
#define CK_DYNAMIC_MULTI_INDEX_TRANSFORM_HPP

#include "common_header.hpp"

namespace ck {

struct DynamicPassThrough
{
    using LowerIndex = MultiIndex<1>;
    using UpperIndex = MultiIndex<1>;

    index_t low_length_;

    __host__ __device__ constexpr DynamicPassThrough(index_t low_length) : low_length_(low_length)
    {
    }

    __host__ __device__ static constexpr auto GetNumOfLowerDimension() { return Number<1>{}; }

    __host__ __device__ static constexpr auto GetNumOfUpperDimension() { return Number<1>{}; }

    // the upper length is the same as the (runtime) lower length
    __host__ __device__ constexpr auto GetUpperLengths() const
    {
        return Array<index_t, 1>{low_length_};
    }

    __host__ __device__ static constexpr auto CalculateLowerIndex(const UpperIndex& idx_up)
    {
        return idx_up;
    }

    __host__ __device__ static constexpr auto
    CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
                            const UpperIndex& /* idx_up_old */,
                            const LowerIndex& /* idx_low_old */)
    {
        return idx_up_diff;
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return true; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
    {
        return true;
    }
};

template <index_t NDimLow>
struct DynamicMerge
{
    static constexpr index_t ndim_low_ = NDimLow;
    static constexpr index_t ndim_up_  = 1;

    using LowerIndex = MultiIndex<ndim_low_>;
    using UpperIndex = MultiIndex<ndim_up_>;

    Array<index_t, NDimLow> low_lengths_;
    index_t up_length_;

    __host__ __device__ static constexpr auto GetNumOfLowerDimension()
    {
        return Number<ndim_low_>{};
    }

    __host__ __device__ static constexpr auto GetNumOfUpperDimension()
    {
        return Number<ndim_up_>{};
    }

    __host__ __device__ constexpr auto GetUpperLengths() const
    {
        return Array<index_t, 1>{up_length_};
    }

    // pseudo strides: suffix products of the lower lengths, innermost stride is 1
    __host__ __device__ constexpr auto CalculatePseudoLowStrides() const
    {
        Array<index_t, NDimLow> pseudo_low_strides;

        pseudo_low_strides(NDimLow - 1) = 1;
        for(index_t i = NDimLow - 1; i > 0; --i)
        {
            pseudo_low_strides(i - 1) = pseudo_low_strides[i] * low_lengths_[i];
        }

        return pseudo_low_strides;
    }

    __host__ __device__ constexpr auto CalculateLowerIndex(const UpperIndex& idx_up) const
    {
        LowerIndex idx_low;

        index_t itmp = idx_up[0];

        const auto pseudo_low_strides = CalculatePseudoLowStrides();

        for(index_t i = 0; i < NDimLow - 1; ++i)
        {
            idx_low(i) = itmp / pseudo_low_strides[i];
            itmp -= idx_low[i] * pseudo_low_strides[i];
        }

        idx_low(NDimLow - 1) = itmp / pseudo_low_strides[NDimLow - 1];

        return idx_low;
    }

    // idx_low_diff depends on idx_low_old, so idx_low needs to be up-to-date.
    // If idx_up_diff is known at compile-time, many calculations can be optimized
    // away by the compiler.
    // This function assumes idx_low_old is not out-of-bound.
    __host__ __device__ constexpr auto CalculateLowerIndexDiff(const UpperIndex& idx_up_diff,
                                                               const UpperIndex& /* idx_up_old */,
                                                               const LowerIndex& idx_low_old) const
    {
        if(idx_up_diff[0] == 0)
        {
            return make_zero_array<index_t, NDimLow>();
        }
        else
        {
            // CalculateLowerIndex(idx_up_diff) has multiple integer divisions.
            // If idx_up_diff is known at compile-time, the calculation can
            // be done at compile-time. However, if idx_up_diff is only known
            // at run-time, then the calculation will also be computed at
            // run-time, and can be very expensive.
            LowerIndex idx_low_diff_tmp = CalculateLowerIndex(idx_up_diff);

            // find out the last low dimension that changed
            index_t last_changed_low_dim = 0;
            for(index_t i = 0; i < NDimLow; ++i)
            {
                if(idx_low_diff_tmp[i] != 0)
                {
                    last_changed_low_dim = i;
                }
            }

            LowerIndex idx_low_new = idx_low_old + idx_low_diff_tmp;

            if(idx_up_diff[0] > 0)
            {
                // do carry check on each low dimension in reversed order,
                // starting from the last digit that changed;
                // don't check the highest dimension
                bool carry = false;

                for(index_t i = NDimLow - 1; i > 0; --i)
                {
                    if(i <= last_changed_low_dim)
                    {
                        if(carry)
                        {
                            ++idx_low_new(i);
                        }

                        carry = false;

                        if(idx_low_new[i] >= low_lengths_[i])
                        {
                            idx_low_new(i) -= low_lengths_[i];
                            carry = true;
                        }
                    }
                }

                // highest dimension, no out-of-bound check
                if(carry)
                {
                    ++idx_low_new(0);
                }
            }
            else
            {
                // do borrow check on each low dimension in reversed order,
                // starting from the last digit that changed;
                // don't check the highest dimension
                bool borrow = false;

                for(index_t i = NDimLow - 1; i > 0; --i)
                {
                    if(i <= last_changed_low_dim)
                    {
                        if(borrow)
                        {
                            --idx_low_new(i);
                        }

                        borrow = false;

                        if(idx_low_new[i] < 0)
                        {
                            idx_low_new(i) += low_lengths_[i];
                            borrow = true;
                        }
                    }
                }

                // highest dimension, no out-of-bound check
                if(borrow)
                {
                    --idx_low_new(0);
                }
            }

            return idx_low_new - idx_low_old;
        }
    }

    __host__ __device__ static constexpr bool IsLinearTransform() { return false; }

    __host__ __device__ static constexpr bool IsValidUpperIndexAlwaysMappedToValidLowerIndex()
    {
        return true;
    }
};

} // namespace ck
#endif
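DynamicMerge above folds several lower dimensions into a single upper index, i.e. it treats the lower multi-index as mixed-radix digits of the upper index. A standalone sketch (plain C++ with made-up lengths, not CK code) showing the two costs that CalculateLowerIndexDiff trades between: a from-scratch decode needs one integer division per dimension, while a known positive diff only needs a digit-wise add with carry propagation.

#include <cassert>

int main()
{
    const int lengths[3] = {4, 3, 5};     // lower lengths (made up)
    const int strides[3] = {3 * 5, 5, 1}; // pseudo strides: suffix products

    // from-scratch decode: one integer division per lower dimension
    auto decode = [&](int up, int low[3]) {
        for(int i = 0; i < 3; ++i)
        {
            low[i] = up / strides[i];
            up -= low[i] * strides[i];
        }
    };

    int low_old[3];
    decode(13, low_old); // {0, 2, 3}

    // incremental update for diff = +3: decode the diff once, then
    // propagate carries from the innermost digit outward
    int diff[3];
    decode(3, diff);

    int low_new[3];
    bool carry = false;
    for(int i = 2; i >= 0; --i)
    {
        low_new[i] = low_old[i] + diff[i] + (carry ? 1 : 0);
        carry      = low_new[i] >= lengths[i];
        if(carry)
        {
            low_new[i] -= lengths[i];
        }
    }

    // matches decoding 13 + 3 = 16 directly
    int check[3];
    decode(16, check);
    for(int i = 0; i < 3; ++i)
    {
        assert(low_new[i] == check[i]);
    }

    return 0;
}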
composable_kernel/include/tensor_description/dynamic_tensor_descriptor.hpp
0 → 100644 (new file)
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HPP

#include "common_header.hpp"

namespace ck {

template <index_t NDim>
struct DynamicNativeTensorDescriptor
{
    using Index = MultiIndex<NDim>;

    Array<index_t, NDim> lengths_;
    Array<index_t, NDim> strides_;

    index_t element_size_;
    index_t element_space_;

    template <typename Lengths, typename Strides>
    __host__ __device__ constexpr DynamicNativeTensorDescriptor(const Lengths& lengths,
                                                                const Strides& strides)
        : lengths_(lengths), strides_(strides)
    {
        element_size_ = 1;
        for(index_t i = 0; i < NDim; ++i)
        {
            element_size_ *= lengths_[i];
        }

        element_space_ = 1;
        for(index_t i = 0; i < NDim; ++i)
        {
            element_space_ += (lengths_[i] - 1) * strides_[i];
        }
    }

    __host__ __device__ static constexpr auto GetNumOfDimension() { return NDim; }

    __host__ __device__ constexpr auto GetLength(const index_t& i) const { return lengths_[i]; }

    __host__ __device__ constexpr auto GetStride(const index_t& i) const { return strides_[i]; }

    __host__ __device__ constexpr auto GetLengths() const { return lengths_; }

    __host__ __device__ constexpr auto GetStrides() const { return strides_; }

    __host__ __device__ constexpr auto GetElementSize() const { return element_size_; }

    __host__ __device__ constexpr auto GetElementSpace() const { return element_space_; }

    __host__ __device__ constexpr auto CalculateOffset(const Index& idx) const
    {
        index_t offset = 0;

#pragma unroll
        for(index_t i = 0; i < NDim; ++i)
        {
            offset += idx[i] * strides_[i];
        }

        return offset;
    }

    __host__ __device__ constexpr auto CalculateOffsetDiff(const Index& idx_diff) const
    {
        index_t offset_diff = 0;

#pragma unroll
        for(index_t i = 0; i < NDim; ++i)
        {
            offset_diff += idx_diff[i] * strides_[i];
        }

        return offset_diff;
    }

    __host__ __device__ constexpr bool IsUpperIndexValid(const Index& idx) const
    {
        bool flag = true;

#pragma unroll
        for(index_t i = 0; i < NDim; ++i)
        {
            flag = flag && idx[i] >= 0 && idx[i] < lengths_[i];
        }

        return flag;
    }
};
#if 0
// Tensor descriptor for "transformed tensor"
template <typename LowTensorDescriptor,
          typename Transforms,     // Tuple<DynamicMultiIndexTransforms,...>
          typename LowDimensions,  // Tuple<Sequence<...>,...>
          typename UpDimensions>   // Tuple<Sequence<...>,...>
struct DynamicTransformedTensorDescriptor
{
    using Type = DynamicTransformedTensorDescriptor;

    __host__ __device__ static constexpr auto GetNumOfLowerDimension()
    {
        // Here, we assume all lower-dimensions are active
        // TODO: sanity-check all lower-dimensions are indeed active
        using duplicated_low_active_dims =
            decltype(unpack(lambda_merge_sequences{}, LowDimensions{}));

        using low_active_dims = typename sequence_unique_sort<duplicated_low_active_dims,
                                                              math::less<index_t>,
                                                              math::equal<index_t>>::type;

        return low_active_dims::Size();
    }

    __host__ __device__ static constexpr auto GetNumOfUpperDimension()
    {
        using duplicated_up_active_dims =
            decltype(unpack(lambda_merge_sequences{}, UpDimensions{}));

        using up_active_dims = typename sequence_unique_sort<duplicated_up_active_dims,
                                                             math::less<index_t>,
                                                             math::equal<index_t>>::type;

        return up_active_dims::Size();
    }

    static constexpr index_t ndim_up_       = GetNumOfUpperDimension();
    static constexpr index_t ndim_low_      = GetNumOfLowerDimension();
    static constexpr index_t num_transform_ = Transforms::Size();

    using UpperIndex = MultiIndex<ndim_up_>;
    using LowerIndex = MultiIndex<ndim_low_>;

    const LowTensorDescriptor low_tensor_desc_;
    const Transforms transforms_;
    const LowDimensions low_dims_;
    const UpDimensions up_dims_;

    __host__ __device__ constexpr
    DynamicTransformedTensorDescriptor(const LowTensorDescriptor& low_tensor_desc,
                                       const Transforms& transforms)
        : low_tensor_desc_(low_tensor_desc), transforms_(transforms)
    {
    }

    __host__ __device__ static constexpr auto GetNumOfDimension()
    {
        return GetNumOfUpperDimension();
    }

    __host__ __device__ constexpr auto GetLowerTensorDescriptor() const
    {
        return low_tensor_desc_;
    }

    __host__ __device__ constexpr auto GetUpperLengths() const
    {
        // TODO: not implemented yet
    }

    __host__ __device__ constexpr auto GetLengths() const { return GetUpperLengths(); }

    __host__ __device__ constexpr auto GetLength(index_t i) const { return GetLengths()[i]; }

    __host__ __device__ constexpr auto GetElementSize() const
    {
        index_t element_size = 1;

        for(index_t i = 0; i < ndim_up_; ++i)
        {
            element_size *= GetLength(i);
        }

        return element_size;
    }

    __host__ __device__ constexpr auto GetElementSpace() const
    {
        return low_tensor_desc_.GetElementSpace();
    }

    // TODO: right now return value is not constexpr because of use of non-constexpr lambda
    __host__ __device__ constexpr LowerIndex CalculateLowerIndex(const UpperIndex& idx_up) const
    {
        LowerIndex idx_low;

        static_for<0, num_transform_, 1>{}([&](auto itran) {
            constexpr auto tran = Transforms{}.At(itran);

            const auto idx_up_part = pick_array_element(idx_up, UpDimensions{}.At(itran));
            auto idx_low_part      = pick_array_element(idx_low, LowDimensions{}.At(itran));

            // this assumes each lower (single) index is associated with only one
            // transformation, which is required for index transformation, and has been
            // checked during the constructor of DynamicTransformedTensorDescriptor
            idx_low_part = tran.CalculateLowerIndex(to_array(idx_up_part));
        });

        return idx_low;
    }

    // TODO: right now return value is not constexpr because of use of non-constexpr lambda
    __host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
        const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
    {
        LowerIndex idx_low_diff;

        static_for<0, num_transform_, 1>{}([&](auto itran) {
            constexpr auto tran = Transforms{}.At(itran);

            const auto idx_up_diff_part =
                pick_array_element(idx_up_diff, UpDimensions{}.At(itran));

            const auto idx_up_old_part = pick_array_element(idx_up_old, UpDimensions{}.At(itran));

            const auto idx_low_old_part =
                pick_array_element(idx_low_old, LowDimensions{}.At(itran));

            auto idx_low_diff_part = pick_array_element(idx_low_diff, LowDimensions{}.At(itran));

            // this assumes each lower (single) index is associated with only one
            // transformation, which is required for index transformation, and has been
            // checked during the constructor of DynamicTransformedTensorDescriptor
            idx_low_diff_part = tran.CalculateLowerIndexDiff(
                to_array(idx_up_diff_part), to_array(idx_up_old_part), to_array(idx_low_old_part));
        });

        return idx_low_diff;
    }

    __host__ __device__ constexpr index_t CalculateOffset(const UpperIndex& idx_up) const
    {
        return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
    }
};
#endif

} // namespace ck
#endif
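A worked example of the two size notions in DynamicNativeTensorDescriptor above: element_size_ is the count of logical elements (product of lengths), while element_space_ = 1 + sum over i of (length_i - 1) * stride_i is the linear extent the tensor actually spans; the two differ as soon as a stride is padded. A standalone sketch with made-up lengths and strides (not CK code):

#include <cstdio>

int main()
{
    const int lengths[4] = {2, 3, 4, 5};
    const int strides[4] = {120, 40, 10, 1}; // stride 10 leaves a gap after the innermost extent 5

    int element_size  = 1; // number of logical elements
    int element_space = 1; // linear extent actually spanned
    for(int i = 0; i < 4; ++i)
    {
        element_size *= lengths[i];
        element_space += (lengths[i] - 1) * strides[i];
    }

    // element_size = 120, element_space = 235: the padded stride leaves holes
    std::printf("element_size=%d element_space=%d\n", element_size, element_space);
    return 0;
}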
composable_kernel/include/tensor_description/dynamic_tensor_descriptor_helper.hpp
0 → 100644 (new file)
#ifndef CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP
#define CK_DYNAMIC_TENSOR_DESCRIPTOR_HELPER_HPP

#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"

namespace ck {

template <typename Lengths, typename Strides>
__host__ __device__ constexpr auto make_dynamic_native_tensor_descriptor(const Lengths& lengths,
                                                                         const Strides& strides)
{
    static_assert(Lengths::GetSize() == Strides::GetSize(), "wrong! Size not the same");

    return DynamicNativeTensorDescriptor<Lengths::GetSize()>(lengths, strides);
}

template <typename LowTensorDescriptor,
          typename Transforms,
          typename LowDimensions,
          typename UpDimensions>
__host__ __device__ constexpr auto
transform_dynamic_tensor_descriptor(const LowTensorDescriptor& low_tensor_desc,
                                    const Transforms& transforms,
                                    LowDimensions,
                                    UpDimensions)
{
    return DynamicTransformedTensorDescriptor<LowTensorDescriptor,
                                              Transforms,
                                              LowDimensions,
                                              UpDimensions>(low_tensor_desc, transforms);
}

} // namespace ck
#endif
composable_kernel/include/utility/amd_llvm_intrinsic.hpp
0 → 100644 (new file)
#ifndef CK_AMD_LLVM_INTRINSIC_HPP
#define CK_AMD_LLVM_INTRINSIC_HPP

#include "float_type.hpp"

namespace ck {

__device__ int32_t __llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.readfirstlane");

} // namespace ck
#endif
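A hedged usage sketch for the intrinsic above (HIP device code, illustrative only; assumes a ROCm target and that the header is included): llvm.amdgcn.readfirstlane broadcasts lane 0's value across the wavefront and yields a scalar (SGPR) value, which is useful when every thread is known to share the same dynamically computed offset.

#include <hip/hip_runtime.h>
#include "amd_llvm_intrinsic.hpp"

// hypothetical kernel, not part of this commit
__global__ void broadcast_offset(const int32_t* p_offset, int32_t* p_out)
{
    // all lanes load the same value; the intrinsic lets the compiler keep it
    // in one scalar register instead of one vector register per lane
    int32_t offset = ck::__llvm_amdgcn_readfirstlane_i32(p_offset[0]);

    p_out[threadIdx.x] = offset;
}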
composable_kernel/include/utility/common_header.hpp
...
@@ -20,6 +20,7 @@
 #if CK_USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hpp"
+#include "amd_llvm_intrinsic.hpp"
 #endif

 #if CK_USE_AMD_XDLOPS
...
driver/include/device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp
...
@@ -172,7 +172,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     constexpr index_t WeiBlockCopySrcDataPerRead_E  = 4;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
-#elif 0
+#elif 1
     // cdata = 64, BlockSize = 256, 128x128x16
     constexpr index_t BlockSize = 256;
...
@@ -290,7 +290,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     constexpr index_t WeiBlockCopySrcDataPerRead_E  = 2;
     constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1;
-#elif 1
+#elif 0
     // cdata = 64, BlockSize = 128, 64x128x8
     constexpr index_t BlockSize = 128;
...
driver/include/device_dummy_dynamic_transform.hpp
0 → 100644 (new file)
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dummy_dynamic_transform.hpp"

template <class T,
          class InDesc,
          class WeiDesc,
          class OutDesc,
          class ConvStrides,
          class ConvDilations,
          class InLeftPads,
          class InRightPads>
void device_dummy_dynamic_transform(InDesc,
                                    const Tensor<T>& in_nchw,
                                    WeiDesc,
                                    const Tensor<T>& wei_kcyx,
                                    OutDesc,
                                    Tensor<T>& out_nkhw,
                                    ConvStrides,
                                    ConvDilations,
                                    InLeftPads,
                                    InRightPads,
                                    ck::index_t nrepeat)
{
    using namespace ck;

    using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;

    const auto in_nchw_desc = make_dynamic_native_tensor_descriptor(
        to_array(InDesc::GetLengths()), to_array(InDesc::GetStrides()));

    const auto wei_kcyx_desc = make_dynamic_native_tensor_descriptor(
        to_array(WeiDesc::GetLengths()), to_array(WeiDesc::GetStrides()));

    const auto out_nkhw_desc = make_dynamic_native_tensor_descriptor(
        to_array(OutDesc::GetLengths()), to_array(OutDesc::GetStrides()));

    const auto conv_strides   = to_array(ConvStrides{});
    const auto conv_dilations = to_array(ConvDilations{});
    const auto in_left_pads   = to_array(InLeftPads{});
    const auto in_right_pads  = to_array(InRightPads{});

    std::size_t data_sz = sizeof(T);

    DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
    DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
    DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());

    in_nchw_device_buf.ToDevice(in_nchw.mData.data());
    wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
    out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());

    constexpr index_t BlockSize = 256;
    constexpr index_t GridSize  = 1;

    printf("%s: BlockSize %u, GridSize %u\n", __func__, BlockSize, GridSize);

    using dummy_transform = DummyDynamicTransform<BlockSize>;

    for(index_t i = 0; i < 5; ++i)
    {
        std::cout << "Start running " << nrepeat << " times..." << std::endl;

        KernelTimer timer;
        timer.Start();

        for(index_t j = 0; j < nrepeat; ++j)
        {
            launch_kernel(run_gridwise_operation<dummy_transform,
                                                 index_t* const,
                                                 index_t* const,
                                                 float* const,
                                                 const DynamicNativeTensorDescriptor<4>,
                                                 const DynamicNativeTensorDescriptor<4>,
                                                 const DynamicNativeTensorDescriptor<4>,
                                                 const Array<index_t, 2>,
                                                 const Array<index_t, 2>,
                                                 const Array<index_t, 2>,
                                                 const Array<index_t, 2>,
                                                 index_t,
                                                 index_t,
                                                 index_t,
                                                 index_t>,
                          dim3(GridSize),
                          dim3(BlockSize),
                          0,
                          0,
                          static_cast<index_t*>(in_nchw_device_buf.GetDeviceBuffer()),
                          static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
                          static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
                          wei_kcyx_desc,
                          in_nchw_desc,
                          out_nkhw_desc,
                          conv_strides,
                          conv_dilations,
                          in_left_pads,
                          in_right_pads,
                          10,
                          10,
                          10,
                          10);
        }
    }

    out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
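The launch above passes the three DynamicNativeTensorDescriptor<4> objects and the stride/pad arrays to the kernel by value. A standalone sketch (HIP, with a hypothetical Desc2d stand-in; not CK code) of the underlying pattern: runtime descriptors are plain aggregates of integers, so they travel as ordinary kernel arguments.

#include <hip/hip_runtime.h>
#include <cstdio>

struct Desc2d // hypothetical stand-in for a dynamic 2-D descriptor
{
    int lengths[2];
    int strides[2];
};

__global__ void use_desc(Desc2d desc, int* p_out)
{
    // offset of element (1, 2) computed from the runtime descriptor
    p_out[0] = 1 * desc.strides[0] + 2 * desc.strides[1];
}

int main()
{
    Desc2d desc{{4, 8}, {8, 1}};

    int* p_out = nullptr;
    hipMalloc(&p_out, sizeof(int));

    hipLaunchKernelGGL(use_desc, dim3(1), dim3(1), 0, 0, desc, p_out);

    int result = 0;
    hipMemcpy(&result, p_out, sizeof(int), hipMemcpyDeviceToHost);
    std::printf("offset = %d\n", result); // 10

    hipFree(p_out);
    return 0;
}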
driver/include/device_dummy_transform.hpp → driver/include/device_dummy_static_transform.hpp
...
@@ -12,17 +12,17 @@ template <class T,
           class ConvDilations,
           class InLeftPads,
           class InRightPads>
-void device_dummy_transform(InDesc,
-                            const Tensor<T>& in_nchw,
-                            WeiDesc,
-                            const Tensor<T>& wei_kcyx,
-                            OutDesc,
-                            Tensor<T>& out_nkhw,
-                            ConvStrides,
-                            ConvDilations,
-                            InLeftPads,
-                            InRightPads,
-                            ck::index_t nrepeat)
+void device_dummy_static_transform(InDesc,
+                                   const Tensor<T>& in_nchw,
+                                   WeiDesc,
+                                   const Tensor<T>& wei_kcyx,
+                                   OutDesc,
+                                   Tensor<T>& out_nkhw,
+                                   ConvStrides,
+                                   ConvDilations,
+                                   InLeftPads,
+                                   InRightPads,
+                                   ck::index_t nrepeat)
 {
     using namespace ck;
...
driver/src/conv_driver.cpp
...
@@ -14,7 +14,8 @@
 #include "device_tensor.hpp"
 #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
 #include "device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
-#include "device_dummy_transform.hpp"
+#include "device_dummy_static_transform.hpp"
+#include "device_dummy_dynamic_transform.hpp"

 int main(int argc, char* argv[])
 {
...
@@ -200,7 +201,7 @@ int main(int argc, char* argv[])
     using LeftPads  = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 0
+#elif 1
     // 3x3, 35x35, stride 2
     constexpr index_t N = 128;
     constexpr index_t C = 288;
...
@@ -572,18 +573,30 @@ int main(int argc, char* argv[])
                           LeftPads{},
                           RightPads{},
                           nrepeat);
+#elif 0
+    device_dummy_static_transform(in_nchw_desc,
+                                  in_nchw,
+                                  wei_kcyx_desc,
+                                  wei_kcyx,
+                                  out_nkhw_desc,
+                                  out_nkhw_device,
+                                  ConvStrides{},
+                                  ConvDilations{},
+                                  LeftPads{},
+                                  RightPads{},
+                                  nrepeat);
 #elif 1
-    device_dummy_transform(in_nchw_desc,
-                           in_nchw,
-                           wei_kcyx_desc,
-                           wei_kcyx,
-                           out_nkhw_desc,
-                           out_nkhw_device,
-                           ConvStrides{},
-                           ConvDilations{},
-                           LeftPads{},
-                           RightPads{},
-                           nrepeat);
+    device_dummy_dynamic_transform(in_nchw_desc,
+                                   in_nchw,
+                                   wei_kcyx_desc,
+                                   wei_kcyx,
+                                   out_nkhw_desc,
+                                   out_nkhw_device,
+                                   ConvStrides{},
+                                   ConvDilations{},
+                                   LeftPads{},
+                                   RightPads{},
+                                   nrepeat);
 #endif

     if(do_verification)
...
driver/src/conv_driver.cu
deleted 120000 → 0 (symlink)
-conv_driver.cpp
\ No newline at end of file