add another test

9a232a29 · carlushuang · 00a0a07f · 9a232a29 · 9a232a29 · 9a232a29
Commit 9a232a29 authored Sep 03, 2024 by carlushuang
4 changed files
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -219,4 +219,5 @@ endif()
 add_subdirectory(position_embedding)
 add_subdirectory(scatter_gather)
 add_subdirectory(topk)
+add_subdirectory(tile_reduce)

--- a/test/tile_reduce/CMakeLists.txt
+++ b/test/tile_reduce/CMakeLists.txt
+add_test_executable(test_tile_reduce tile_reduce.cpp)
+target_compile_options(test_tile_reduce PRIVATE -v --save-temps -Wno-gnu-line-marker)
\ No newline at end of file
--- a/test/tile_reduce/tile_reduce.cpp
+++ b/test/tile_reduce/tile_reduce.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <vector>
+#include <iostream>
+#include <numeric>
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <time.h>
+#include <unordered_set>
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/reduce.hpp"
+
+#ifndef TEST_TILE_REDUCE_VERBOSE
+#define TEST_TILE_REDUCE_VERBOSE 0
+#endif
+
+#define HIP_CALL(call)                                                              \
+    do                                                                              \
+    {                                                                               \
+        hipError_t err = call;                                                      \
+        if(err != hipSuccess)                                                       \
+        {                                                                           \
+            printf("[hiperror](%d) fail to call %s", static_cast<int>(err), #call); \
+            exit(0);                                                                \
+        }                                                                           \
+    } while(0)
+
+#define BLOCK_SIZE 256
+
+template <int Rows, int Cols, typename DataType>
+__global__ void reduce_row(DataType* p_src, DataType* p_dst)
+{
+    using namespace ck_tile;
+
+    // some constexpr vars
+    constexpr index_t vec = 16 / sizeof(DataType);
+    static_assert(Cols % vec == 0);
+    constexpr index_t col_lanes = Cols / vec;
+    constexpr index_t warp_size = ck_tile::get_warp_size();
+    static_assert(warp_size % col_lanes == 0);
+    constexpr index_t row_lanes = warp_size / col_lanes;
+    constexpr index_t num_warps = BLOCK_SIZE / warp_size;
+    static_assert(Rows % (num_warps * row_lanes) == 0);
+    constexpr index_t row_repeat = Rows / (num_warps * row_lanes);
+
+    auto src_tile = [&]() {
+        constexpr auto src_dist = make_static_tile_distribution(
+            tile_distribution_encoding<
+                sequence<1>,
+                tuple<sequence<row_repeat, num_warps, row_lanes>, sequence<col_lanes, vec>>,
+                tuple<sequence<1>, sequence<1, 2>>,
+                tuple<sequence<1>, sequence<2, 0>>,
+                sequence<1, 2>,
+                sequence<0, 1>>{});
+
+        auto src_view =
+            make_naive_tensor_view<address_space_enum::global>(p_src,
+                                                               make_tuple(Rows, Cols),
+                                                               make_tuple(Cols, 1),
+                                                               number<vec>{}, // alignement
+                                                               number<1>{});
+        return make_tile_window(
+            src_view, make_tuple(number<Rows>{}, number<Cols>{}), {0, 0}, src_dist);
+    }();
+
+    constexpr auto dst_dist = make_static_tile_distribution(
+        tile_distribution_encoding<
+            sequence<col_lanes>, // -> replicate here, hence we can figure out the offset
+            tuple<sequence<row_repeat, num_warps, row_lanes>, sequence<1> /* only 1 per row*/>,
+            tuple<sequence<1>, sequence<1, 0>>,
+            tuple<sequence<1>, sequence<2, 0>>,
+            sequence<1, 2>,
+            sequence<0, 0>>{});
+
+    auto dst_tile = [&]() {
+        auto dst_view =
+            make_naive_tensor_view<address_space_enum::global>(p_dst,
+                                                               make_tuple(Rows, 1),
+                                                               make_tuple(1, 1),
+                                                               number<1>{}, // alignement
+                                                               number<1>{});
+        return make_tile_window(
+            dst_view, make_tuple(number<Rows>{}, number<1>{}), {0, 0}, dst_dist);
+    }();
+
+    auto data = load_tile(src_tile);
+
+    const auto f_max = [](auto e0, auto e1) { return max(e0, e1); };
+
+    // Note: the return type will fill the replicate dim
+    //       usually is 2d. This is for the next block_tile_reduce_sync()
+    //       in order to do further reduce.
+    auto r =
+        block_tile_reduce<DataType>(data, sequence<1>{}, f_max, -numeric<DataType>::infinity());
+
+    // r.foo();
+
+    // further reduce cross thread
+    block_tile_reduce_sync(r, f_max, bool_constant<false>{});
+
+    if(threadIdx.x % col_lanes == 0)
+    {
+        auto o                = make_static_distributed_tensor<DataType>(dst_dist);
+        o.get_thread_buffer() = r.get_thread_buffer();
+        store_tile(dst_tile, o);
+    }
+}
+
+template <int Rows, int Cols, typename DataType>
+int test_tile_reduce()
+{
+    std::srand(std::time(nullptr));
+    DataType* src = reinterpret_cast<DataType*>(malloc(Rows * Cols * sizeof(DataType)));
+    DataType* dst = reinterpret_cast<DataType*>(malloc(Rows * sizeof(DataType)));
+
+    // const auto f_max = [](auto e0, auto e1) { return max(e0, e1); };
+
+    for(auto i = 0; i < Rows * Cols; i++)
+    {
+        float v = static_cast<float>(std::rand() % 2000 - 1000) / 1000.f;
+        src[i]  = ck_tile::type_convert<DataType>(v);
+    }
+
+    void* dev_src;
+    void* dev_dst;
+    HIP_CALL(hipMalloc(&dev_src, Rows * Cols * sizeof(DataType)));
+    HIP_CALL(hipMalloc(&dev_dst, Rows * sizeof(DataType)));
+
+    HIP_CALL(hipMemcpy(dev_src, src, Rows * Cols * sizeof(DataType), hipMemcpyHostToDevice));
+
+    constexpr int bdim = BLOCK_SIZE;
+    int gdim           = 1;
+    reduce_row<Rows, Cols, DataType><<<gdim, bdim>>>(reinterpret_cast<DataType*>(dev_src),
+                                                     reinterpret_cast<DataType*>(dev_dst));
+
+    HIP_CALL(hipMemcpy(dst, dev_dst, Rows * sizeof(DataType), hipMemcpyDeviceToHost));
+
+    int err_cnt = 0;
+
+    for(int i_r = 0; i_r < Rows; i_r++)
+    {
+        auto row_max = -ck_tile::numeric<float>::infinity();
+        for(int i_c = 0; i_c < Cols; i_c++)
+        {
+            int idx = i_r * Cols + i_c;
+            float v = ck_tile::type_convert<float>(src[idx]);
+            row_max = row_max > v ? row_max : v;
+#if TEST_TILE_REDUCE_VERBOSE
+            printf("%.3f ", v);
+#endif
+        }
+        {
+            uint32_t ref = ck_tile::bit_cast<uint32_t>(row_max);
+            uint32_t out = ck_tile::bit_cast<uint32_t>(dst[i_r]);
+            if(ref != out)
+                err_cnt++;
+        }
+#if TEST_TILE_REDUCE_VERBOSE
+        printf(" -> %.3f (%.3f)\n", dst[i_r], row_max);
+#endif
+    }
+#if TEST_TILE_REDUCE_VERBOSE
+    printf("\n");
+#endif
+
+    free(src);
+    free(dst);
+    return err_cnt == 0 ? 0 : -1;
+}
+
+int main()
+{
+    int r = test_tile_reduce<32, 64, float>();
+    return r;
+}
--- a/test/topk/topk.cpp
+++ b/test/topk/topk.cpp
@@ -11,7 +11,7 @@
 #include "ck_tile/host.hpp"

 #ifndef TEST_TOPK_VERBOSE
-#define TEST_TOPK_VERBOSE 1
+#define TEST_TOPK_VERBOSE 0
 #endif

 int test_reference_topk()