Merge with (not the latest) upstream CK (#32)

* fix build for old ck examples
* fix build for old ck

Merge with (not the latest) upstream CK (#32)
* fix build for old ck examples
* fix build for old ck
0a7174ad · Chao Liu · GitHub · 496be40e · 0a7174ad · 0a7174ad
Unverified Commit 0a7174ad authored Nov 21, 2023 by Chao Liu Committed by GitHub Nov 21, 2023
11 changed files
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1r2.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r1r2.hpp
@@ -94,7 +94,7 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
            using dst_vector_t    = typename dst_vector_type::type;
            const bool is_src_valid =
-                coordinate_has_valid_offset_assuming_visible_index_is_valid(src_desc, src_coord_);
+                coordinate_has_valid_offset_assuming_top_index_is_valid(src_desc, src_coord_);
            // copy data from src_buf into src_vector_container
            auto src_vector_container = src_vector_type{
@@ -114,7 +114,7 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
            });
            const bool is_dst_valid =
-                coordinate_has_valid_offset_assuming_visible_index_is_valid(dst_desc, dst_coord_);
+                coordinate_has_valid_offset_assuming_top_index_is_valid(dst_desc, dst_coord_);
            // copy data from dst_vector into dst_buf
            dst_buf.template Update<DstInMemOp, dst_vector_t>(
@@ -126,28 +126,20 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
            if constexpr(idx_1d.value != num_access - 1)
            {
                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
-                move_tensor_coordinate(
+                move_tensor_coordinate(src_desc, src_coord_, forward_step);
-                    src_desc, src_coord_, make_tensor_coordinate_step(src_desc, forward_step));
+                move_tensor_coordinate(dst_desc, dst_coord_, forward_step);
-                move_tensor_coordinate(
-                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
            }
        });
        // move coordinate back to slice origin (or not)
        if constexpr(SrcResetCoordinateAfterRun)
        {
-            const auto src_reset_step =
+            move_tensor_coordinate(src_desc, src_coord_, GetCoordinateResetStep());
-                make_tensor_coordinate_step(src_desc, GetCoordinateResetStep());
-            move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
        }
        if constexpr(DstResetCoordinateAfterRun)
        {
-            const auto dst_reset_step =
+            move_tensor_coordinate(dst_desc, dst_coord_, GetCoordinateResetStep());
-                make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep());
-            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
        }
    }
@@ -179,13 +171,10 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
                                       const Index& src_slice_origin_step_idx)
    {
        // if src coord was not reset by RunRead(), then need to adjust the step here
-        const auto adjusted_step_idx = SrcResetCoordinateAfterRun
+        const auto adjusted_step = SrcResetCoordinateAfterRun
                                       ? src_slice_origin_step_idx
                                       : src_slice_origin_step_idx + GetCoordinateResetStep();
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
        move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
    }
@@ -194,13 +183,10 @@ struct ThreadwiseTensorSliceTransfer_v6r1r2
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by Run(), then need to adjust the step here
-        const auto adjusted_step_idx = DstResetCoordinateAfterRun
+        const auto adjusted_step = DstResetCoordinateAfterRun
                                       ? dst_slice_origin_step_idx
                                       : dst_slice_origin_step_idx + GetCoordinateResetStep();
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
    }

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r2.hpp
@@ -147,38 +147,26 @@ struct ThreadwiseTensorSliceTransfer_v6r2
            if constexpr(idx_1d.value != num_access - 1)
            {
                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
-                move_tensor_coordinate(
+                move_tensor_coordinate(src0_desc, src0_coord_, forward_step);
-                    src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step));
+                move_tensor_coordinate(src1_desc, src1_coord_, forward_step);
-                move_tensor_coordinate(
+                move_tensor_coordinate(dst_desc, dst_coord_, forward_step);
-                    src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step));
-                move_tensor_coordinate(
-                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
            }
        });
        // move coordinate back to slice origin (or not)
        if constexpr(Src0ResetCoordinateAfterRun)
        {
-            const auto src0_reset_step =
+            move_tensor_coordinate(src0_desc, src0_coord_, GetCoordinateResetStep());
-                make_tensor_coordinate_step(src0_desc, GetCoordinateResetStep());
-            move_tensor_coordinate(src0_desc, src0_coord_, src0_reset_step);
        }
        if constexpr(Src1ResetCoordinateAfterRun)
        {
-            const auto src1_reset_step =
+            move_tensor_coordinate(src1_desc, src1_coord_, GetCoordinateResetStep());
-                make_tensor_coordinate_step(src1_desc, GetCoordinateResetStep());
-            move_tensor_coordinate(src1_desc, src1_coord_, src1_reset_step);
        }
        if constexpr(DstResetCoordinateAfterRun)
        {
-            const auto dst_reset_step =
+            move_tensor_coordinate(dst_desc, dst_coord_, GetCoordinateResetStep());
-                make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep());
-            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
        }
    }
@@ -210,13 +198,10 @@ struct ThreadwiseTensorSliceTransfer_v6r2
                                        const Index& src0_slice_origin_step_idx)
    {
        // if src coord was not reset by RunRead(), then need to adjust the step here
-        const auto adjusted_step_idx = Src0ResetCoordinateAfterRun
+        const auto adjusted_step = Src0ResetCoordinateAfterRun
                                       ? src0_slice_origin_step_idx
                                       : src0_slice_origin_step_idx + GetCoordinateResetStep();
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(src0_desc, adjusted_step_idx);
        move_tensor_coordinate(src0_desc, src0_coord_, adjusted_step);
    }
@@ -225,13 +210,10 @@ struct ThreadwiseTensorSliceTransfer_v6r2
                                        const Index& src1_slice_origin_step_idx)
    {
        // if src coord was not reset by RunRead(), then need to adjust the step here
-        const auto adjusted_step_idx = Src1ResetCoordinateAfterRun
+        const auto adjusted_step = Src1ResetCoordinateAfterRun
                                       ? src1_slice_origin_step_idx
                                       : src1_slice_origin_step_idx + GetCoordinateResetStep();
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(src1_desc, adjusted_step_idx);
        move_tensor_coordinate(src1_desc, src1_coord_, adjusted_step);
    }
@@ -240,13 +222,10 @@ struct ThreadwiseTensorSliceTransfer_v6r2
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by Run(), then need to adjust the step here
-        const auto adjusted_step_idx = DstResetCoordinateAfterRun
+        const auto adjusted_step = DstResetCoordinateAfterRun
                                       ? dst_slice_origin_step_idx
                                       : dst_slice_origin_step_idx + GetCoordinateResetStep();
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
    }

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v6r3.hpp
@@ -171,48 +171,32 @@ struct ThreadwiseTensorSliceTransfer_v6r3
            if constexpr(idx_1d.value != num_access - 1)
            {
                constexpr auto forward_step = SpaceFillingCurve::GetForwardStep(idx_1d);
-                move_tensor_coordinate(
+                move_tensor_coordinate(src0_desc, src0_coord_, forward_step);
-                    src0_desc, src0_coord_, make_tensor_coordinate_step(src0_desc, forward_step));
+                move_tensor_coordinate(src1_desc, src1_coord_, forward_step);
-                move_tensor_coordinate(
+                move_tensor_coordinate(src2_desc, src2_coord_, forward_step);
-                    src1_desc, src1_coord_, make_tensor_coordinate_step(src1_desc, forward_step));
+                move_tensor_coordinate(dst_desc, dst_coord_, forward_step);
-                move_tensor_coordinate(
-                    src2_desc, src2_coord_, make_tensor_coordinate_step(src2_desc, forward_step));
-                move_tensor_coordinate(
-                    dst_desc, dst_coord_, make_tensor_coordinate_step(dst_desc, forward_step));
            }
        });
        // move coordinate back to slice origin (or not)
        if constexpr(Src0ResetCoordinateAfterRun)
        {
-            const auto src0_reset_step =
+            move_tensor_coordinate(src0_desc, src0_coord_, GetCoordinateResetStep());
-                make_tensor_coordinate_step(src0_desc, GetCoordinateResetStep());
-            move_tensor_coordinate(src0_desc, src0_coord_, src0_reset_step);
        }
        if constexpr(Src1ResetCoordinateAfterRun)
        {
-            const auto src1_reset_step =
+            move_tensor_coordinate(src1_desc, src1_coord_, GetCoordinateResetStep());
-                make_tensor_coordinate_step(src1_desc, GetCoordinateResetStep());
-            move_tensor_coordinate(src1_desc, src1_coord_, src1_reset_step);
        }
        if constexpr(Src2ResetCoordinateAfterRun)
        {
-            const auto src2_reset_step =
+            move_tensor_coordinate(src2_desc, src2_coord_, GetCoordinateResetStep());
-                make_tensor_coordinate_step(src2_desc, GetCoordinateResetStep());
-            move_tensor_coordinate(src2_desc, src2_coord_, src2_reset_step);
        }
        if constexpr(DstResetCoordinateAfterRun)
        {
-            const auto dst_reset_step =
+            move_tensor_coordinate(dst_desc, dst_coord_, GetCoordinateResetStep());
-                make_tensor_coordinate_step(dst_desc, GetCoordinateResetStep());
-            move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
        }
    }
@@ -244,13 +228,10 @@ struct ThreadwiseTensorSliceTransfer_v6r3
                                        const Index& src0_slice_origin_step_idx)
    {
        // if src coord was not reset by RunRead(), then need to adjust the step here
-        const auto adjusted_step_idx = Src0ResetCoordinateAfterRun
+        const auto adjusted_step = Src0ResetCoordinateAfterRun
                                       ? src0_slice_origin_step_idx
                                       : src0_slice_origin_step_idx + GetCoordinateResetStep();
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(src0_desc, adjusted_step_idx);
        move_tensor_coordinate(src0_desc, src0_coord_, adjusted_step);
    }
@@ -259,13 +240,10 @@ struct ThreadwiseTensorSliceTransfer_v6r3
                                        const Index& src1_slice_origin_step_idx)
    {
        // if src coord was not reset by RunRead(), then need to adjust the step here
-        const auto adjusted_step_idx = Src1ResetCoordinateAfterRun
+        const auto adjusted_step = Src1ResetCoordinateAfterRun
                                       ? src1_slice_origin_step_idx
                                       : src1_slice_origin_step_idx + GetCoordinateResetStep();
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(src1_desc, adjusted_step_idx);
        move_tensor_coordinate(src1_desc, src1_coord_, adjusted_step);
    }
@@ -274,13 +252,10 @@ struct ThreadwiseTensorSliceTransfer_v6r3
                                        const Index& src2_slice_origin_step_idx)
    {
        // if src coord was not reset by RunRead(), then need to adjust the step here
-        const auto adjusted_step_idx = Src2ResetCoordinateAfterRun
+        const auto adjusted_step = Src2ResetCoordinateAfterRun
                                       ? src2_slice_origin_step_idx
                                       : src2_slice_origin_step_idx + GetCoordinateResetStep();
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(src2_desc, adjusted_step_idx);
        move_tensor_coordinate(src2_desc, src2_coord_, adjusted_step);
    }
@@ -289,13 +264,10 @@ struct ThreadwiseTensorSliceTransfer_v6r3
                                       const Index& dst_slice_origin_step_idx)
    {
        // if dst coord was not reset by Run(), then need to adjust the step here
-        const auto adjusted_step_idx = DstResetCoordinateAfterRun
+        const auto adjusted_step = DstResetCoordinateAfterRun
                                       ? dst_slice_origin_step_idx
                                       : dst_slice_origin_step_idx + GetCoordinateResetStep();
-        // is it OK to construct a new step every time?
-        const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
        move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
    }

--- a/include/ck/tile_program/tile/tile_window_impl_static_distribution.hpp
+++ b/include/ck/tile_program/tile/tile_window_impl_static_distribution.hpp
@@ -6,6 +6,7 @@
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
 #include "ck/tensor_description/tensor_adaptor_coordinate.hpp"
+#include "ck/tensor_description/tensor_space_filling_curve.hpp"
 #include "ck/tile_program/tile/tile_distribution.hpp"
 #include "ck/tile_program/tile/static_tile_distribution_helper.hpp"

--- a/include/ck/utility/array_multi_index.hpp
+++ b/include/ck/utility/array_multi_index.hpp
@@ -75,4 +75,20 @@ __host__ __device__ constexpr auto operator*(const MultiIndex<NSize>& a, const T
    return r;
 }
+// MultiIndex = index_t * MultiIndex
+template <index_t NSize>
+__host__ __device__ constexpr auto operator*(index_t a, const MultiIndex<NSize>& x)
+{
+    MultiIndex<NSize> r;
+    static_for<0, NSize, 1>{}([&](auto i) { r(i) = a * x[i]; });
+    return r;
+}
+// MultiIndex = MultiIndex * index_t
+template <index_t NSize>
+__host__ __device__ constexpr auto operator*(const MultiIndex<NSize>& x, index_t a)
+{
+    return a * x;
+}
 } // namespace ck
--- a/include/ck/utility/buffer_view_impl_global.hpp
+++ b/include/ck/utility/buffer_view_impl_global.hpp
@@ -239,7 +239,7 @@ struct BufferView<AddressSpaceEnum::Global,
        {
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
-            amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x, Coherence>(
+            amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
                x, p_data_, i, is_valid_element, buffer_size_);
        }
        else

--- a/include/ck/utility/container_helper.hpp
+++ b/include/ck/utility/container_helper.hpp
@@ -471,7 +471,7 @@ __host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence<Is...>)
            constexpr index_t tmp = Seq::At(i);
            return Number<tmp>{};
        },
-        Seq::Size());
+        Number<Seq::Size()>{});
 }
 } // namespace ck
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -9,6 +9,7 @@
 namespace ck {
+using int64_t = long;
 using bhalf_t = ushort;
 using half_t  = _Float16;
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
@@ -122,6 +123,13 @@ struct scalar_type<bhalf_t>
    static constexpr index_t vector_size = 1;
 };
+template <>
+struct scalar_type<int64_t>
+{
+    using type                           = int64_t;
+    static constexpr index_t vector_size = 1;
+};
 template <>
 struct scalar_type<int32_t>
 {
@@ -908,8 +916,6 @@ struct vector_type<T, 256>
    }
 };
-using int64_t = long;
 // fp64
 using double2_t = typename vector_type<double, 2>::type;
 using double4_t = typename vector_type<double, 4>::type;

--- a/include/ck/utility/magic_division.hpp
+++ b/include/ck/utility/magic_division.hpp
@@ -4,10 +4,11 @@
 #pragma once
 #include "ck/ck.hpp"
-#include "integral_constant.hpp"
+#include "ck/utility/integral_constant.hpp"
-#include "number.hpp"
+#include "ck/utility/number.hpp"
-#include "type.hpp"
+#include "ck/utility/type.hpp"
-#include "tuple.hpp"
+#include "ck/utility/tuple.hpp"
+#include "ck/utility/bit_cast.hpp"
 namespace ck {

--- a/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/add_device_operation_instance.hpp
@@ -7,6 +7,7 @@
 #include <type_traits>
 #include "ck/utility/functional2.hpp"
+#include "ck/utility/remove_cvref.hpp"
 namespace ck {
 namespace tensor_operation {

--- a/test/block_to_ctile_map/test_block_to_ctile_map.cpp
+++ b/test/block_to_ctile_map/test_block_to_ctile_map.cpp
@@ -6,6 +6,7 @@
 #include <gtest/gtest.h>
 #include "ck/ck.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp"
 using namespace ck;