Add an example demonstrating how to bundle multiple elements in a tensor

5f50ed89 · Po-Yen, Chen · 82cc8731 · 5f50ed89 · 5f50ed89 · 5f50ed89
Commit 5f50ed89 authored Sep 12, 2022 by Po-Yen, Chen
4 changed files
--- a/example/36_permute/CMakeLists.txt
+++ b/example/36_permute/CMakeLists.txt
@@ -2,6 +2,8 @@ add_custom_target(example_permute)
 add_example_executable(example_permute_1xHxW_fp32 permute_1xHxW_fp32.cpp)
 add_example_executable(example_permute_NxHxW_fp32 permute_NxHxW_fp32.cpp)
+add_example_executable(example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp) # example built from permute_HxWx4_fp16.cpp
 add_dependencies(example_permute example_permute_1xHxW_fp32)
 add_dependencies(example_permute example_permute_NxHxW_fp32)
+add_dependencies(example_permute example_permute_HxWx4_fp16) # make the example_permute target depend on the new example too
--- a/example/36_permute/common.hpp
+++ b/example/36_permute/common.hpp
@@ -23,6 +23,7 @@
 using F16 = ck::half_t;
 using F32 = float;
+using F64 = double; // also serves as a bundle type for detail::get_bundled (split into F32 or F16 elements)
 struct ExecutionConfig final
 {
@@ -53,6 +54,36 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 namespace detail {
+/// Maps a "bundle" type to the element type obtained when one Bundle value is
+/// treated as `Divisor` equally sized elements (e.g. one F64 holds four F16).
+///
+/// Only the combinations specialized below are defined. Any other
+/// (Bundle, Divisor) pair leaves this primary template incomplete, so using
+/// it is rejected at compile time.
+///
+/// @tparam Bundle  type whose storage carries the bundled elements
+/// @tparam Divisor number of elements packed into one Bundle
+template <typename Bundle, std::size_t Divisor>
+struct get_bundled;
+/// A bundle of exactly one element is the element type itself.
+template <typename Bundle>
+struct get_bundled<Bundle, 1>
+{
+    using type = Bundle;
+};
+/// One F64 bundles two F32 values.
+template <>
+struct get_bundled<F64, 2>
+{
+    using type = F32;
+    // Code that views bundle storage as `Divisor` elements stays in bounds only if the sizes agree.
+    static_assert(sizeof(F64) == 2 * sizeof(type), "F64 must be exactly as large as two F32");
+};
+/// One F64 bundles four F16 values.
+template <>
+struct get_bundled<F64, 4>
+{
+    using type = F16;
+    // Code that views bundle storage as `Divisor` elements stays in bounds only if the sizes agree.
+    static_assert(sizeof(F64) == 4 * sizeof(type), "F64 must be exactly as large as four F16");
+};
+/// One F32 bundles two F16 values.
+template <>
+struct get_bundled<F32, 2>
+{
+    using type = F16;
+    // Code that views bundle storage as `Divisor` elements stays in bounds only if the sizes agree.
+    static_assert(sizeof(F32) == 2 * sizeof(type), "F32 must be exactly as large as two F16");
+};
+/// Shorthand for `typename get_bundled<Bundle, Divisor>::type`.
+template <typename Bundle, std::size_t Divisor>
+using get_bundled_t = typename get_bundled<Bundle, Divisor>::type;
 template <typename T, typename = void>
 struct is_iterator : std::false_type
 {

--- a/example/36_permute/permute_HxWx4_fp16.cpp
+++ b/example/36_permute/permute_HxWx4_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include "common.hpp"
+// Input and output tensors both use F64 elements. This file sets
+// NUM_ELEMS_IN_BUNDLE to 4 further down, so each F64 presumably stands for a
+// bundle of four fp16 values (cf. the file name) -- confirm against the runner.
+using ADataType = F64;
+using BDataType = F64;
+// Device permute instance used by this example; one template argument per line,
+// each labelled with the parameter it fills.
+// clang-format off
+using DevicePermuteInstance = ck::tensor_operation::device::DevicePermute<
+    ADataType,    // InDataType
+    BDataType,    // OutDataType
+    PassThrough,  // ElementwiseOperation
+    3,            // NumDim
+    256,          // BlockSize
+    128,          // HPerBlock
+    128,          // WPerBlock
+    0,            // InBlockLdsExtraW
+    S<1, 16, 16>, // InBlockTransferThreadClusterLengths
+    S<0, 1, 2>,   // InBlockTransferThreadClusterArrangeOrder
+    2,            // SrcVectorDim
+    1,            // DstVectorDim
+    1,            // SrcScalarPerVector
+    1>;           // DstScalarPerVector
+// clang-format on
+#define NUM_ELEMS_IN_BUNDLE 4
+#include "run_permute_example.inc"
+// Entry point: forwards the command line to run_permute_example together with
+// {1, 160, 80} and {0, 2, 1} (presumably the tensor shape and the axis order --
+// confirm against run_permute_example), then turns the outcome into an exit
+// status: 0 when the result is truthy, 1 otherwise.
+int main(int argc, char* argv[])
+{
+    const auto passed = run_permute_example(argc, argv, {1, 160, 80}, {0, 2, 1});
+    return passed ? 0 : 1;
+}
--- a/example/36_permute/run_permute_example.inc
+++ b/example/36_permute/run_permute_example.inc
@@ -3,6 +3,10 @@
 #pragma once
+#ifndef NUM_ELEMS_IN_BUNDLE // the including file may define its own bundle size first
+#define NUM_ELEMS_IN_BUNDLE 1 // fallback: one element per tensor entry
+#endif // NUM_ELEMS_IN_BUNDLE
 bool run_permute(const ExecutionConfig& config, const Problem& problem)
 {
    using std::begin, std::end;
@@ -14,12 +18,17 @@ bool run_permute(const ExecutionConfig& config, const Problem& problem)
    Tensor<ADataType> a(shape);
    Tensor<BDataType> b(transposed_shape);
-    std::iota(begin(a.mData), end(a.mData), 1);
+    using std::data, std::size;
+    {
+        auto* const elems =
+            reinterpret_cast<detail::get_bundled_t<ADataType, NUM_ELEMS_IN_BUNDLE>*>(data(a.mData));
+        std::iota(elems, elems + (size(a.mData) * NUM_ELEMS_IN_BUNDLE), 1);
+    }
    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
-    a_device_buf.ToDevice(a.mData.data());
+    a_device_buf.ToDevice(data(a.mData));
    std::array<ck::index_t, 3> a_lengths, b_lengths;
    std::array<ck::index_t, 3> a_strides, b_strides;
@@ -55,7 +64,7 @@ bool run_permute(const ExecutionConfig& config, const Problem& problem)
        Tensor<BDataType> host_b(transposed_shape);
        host_permute(a, problem.axes, PassThrough{}, host_b);
-        b_device_buf.FromDevice(b.mData.data());
+        b_device_buf.FromDevice(data(b.mData));
        return ck::utils::check_err(
            b.mData, host_b.mData, "Error: incorrect results in output tensor", 1e-10, 1e-10);