Merge branch 'develop' into gemm_bf16_sk_muozturk

10e8be48 · M.Emin Ozturk · GitHub · b416c877 · 11b7a4db · 10e8be48
Unverified Commit 10e8be48 authored Oct 01, 2024 by M.Emin Ozturk Committed by GitHub Oct 01, 2024
8 changed files
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -102,12 +102,14 @@ function(add_instance_library INSTANCE_NAME)
                set(FMHA_FWD_FAST_EXP2 true)
            endif()
            if(FMHA_FWD_FAST_EXP2)
-	            list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
+                list(APPEND FMHA_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=1 -fgpu-flush-denormals-to-zero)
            else()
-        	    list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
+                list(APPEND FMHA_COMPILE_OPTIONS -Wno-undefined-func-template -DCK_TILE_FMHA_FWD_FAST_EXP2=0)
            endif()
-            list(APPEND EXAMPLE_FMHA_FWD_COMPILE_OPTIONS -Wno-float-equal)
-            target_compile_options(device_mha_instance PRIVATE ${EXAMPLE_FMHA_FWD_COMPILE_OPTIONS})
+            list(APPEND FMHA_COMPILE_OPTIONS -Wno-float-equal)
+            list(APPEND FMHA_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_SPLITKV_API=1)
+            list(APPEND FMHA_COMPILE_OPTIONS -DCK_TILE_FMHA_FWD_APPENDKV_API=1)
+            target_compile_options(device_mha_instance PRIVATE ${FMHA_COMPILE_OPTIONS})
        endif()
        
        target_compile_features(${INSTANCE_NAME} PUBLIC)

--- a/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/mha/CMakeLists.txt
@@ -32,23 +32,33 @@ if(EXISTS ${FMHA_CPP_FOLDER}/blob_list.txt)
    file(REMOVE ${FMHA_CPP_FOLDER}/blob_list.txt)
 endif()

+set(FMHA_KNOWN_APIS "fwd,fwd_splitkv,fwd_appendkv,bwd")
+
 # generate a list of kernels, but not actually emit files at config stage
+# Note: The receipt 3 arg filters the generated backwards instances to reduce compilation time.
+# With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad.
 execute_process(
-  COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py
+  COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
  --list_blobs ${FMHA_CPP_FOLDER}/blob_list.txt
+  --api ${FMHA_KNOWN_APIS}
+  --receipt 3
  RESULT_VARIABLE ret
 )
 if(ret AND NOT ret EQUAL 0)
  message( FATAL_ERROR "CK Tile MHA FAILED to genrate a list of kernels via Python.")
 else()
-  file(STRINGS ${FMHA_CPP_FOLDER}/blob_list.txt FMHA_FWD_GEN_BLOBS)
+  file(STRINGS ${FMHA_CPP_FOLDER}/blob_list.txt FMHA_GEN_BLOBS)
 endif()

 # actually generate the kernel content now
+# Note: The receipt 3 arg filters the generated backwards instances to reduce compilation time.
+# With receipt 3 set, we are generating instances for datatype == {fp16 || bfp16}, bias == {no || alibi}, deterministic == off, and dpad == dvpad.
 add_custom_command(
-  OUTPUT ${FMHA_FWD_GEN_BLOBS}
-  COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/example/ck_tile/01_fmha/generate.py
+  OUTPUT ${FMHA_GEN_BLOBS}
+  COMMAND ${PYTHON_EXECUTABLE} ${FMHA_SRC_FOLDER}/generate.py
  --output_dir ${FMHA_CPP_FOLDER}
+  --api ${FMHA_KNOWN_APIS}
+  --receipt 3
  COMMENT "Generating mha kernel (cpp) files now ..."
  VERBATIM
 )
@@ -57,12 +67,12 @@ add_custom_command(
 # have filename. Since, it was cauing the cmake
 # to throw "File name too long"
 set(device_files)
-foreach(filepath IN LISTS FMHA_FWD_GEN_BLOBS)
+foreach(filepath IN LISTS FMHA_GEN_BLOBS)
    get_filename_component(filename ${filepath} NAME)
    # Append the filename to the device_files list
    list(APPEND device_files ${filename})
 endforeach()
-add_custom_target(generate_cpp_files DEPENDS ${FMHA_FWD_GEN_BLOBS})
+add_custom_target(generate_cpp_files DEPENDS ${FMHA_GEN_BLOBS})

 add_instance_library(device_mha_instance ${device_files})


--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -7,8 +7,10 @@ MY_PROJECT_SOURCE=$1

 if [ $# -ge 2 ] ; then
    GPU_TARGETS=$2
+    REST_ARGS=${@:3}
 else
    GPU_TARGETS="gfx908;gfx90a;gfx940"
+    REST_ARGS=
 fi

 cmake                                                                                             \
@@ -20,4 +22,5 @@ cmake
 -D GPU_TARGETS=$GPU_TARGETS                                                                       \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
+$REST_ARGS                                                                                        \
 ${MY_PROJECT_SOURCE}
--- a/script/cmake-ck-release.sh
+++ b/script/cmake-ck-release.sh
@@ -7,8 +7,10 @@ MY_PROJECT_SOURCE=$1

 if [ $# -ge 2 ] ; then
    GPU_TARGETS=$2
+    REST_ARGS=${@:3}
 else
    GPU_TARGETS="gfx908;gfx90a;gfx940"
+    REST_ARGS=
 fi

 cmake                                                                                             \
@@ -20,5 +22,6 @@ cmake
 -D GPU_TARGETS=$GPU_TARGETS                                                                       \
 -D CMAKE_VERBOSE_MAKEFILE:BOOL=ON                                                                 \
 -D USE_BITINT_EXTENSION_INT4=OFF                                                                  \
+$REST_ARGS                                                                                        \
 ${MY_PROJECT_SOURCE}

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -173,6 +173,7 @@ function(add_gtest_executable TEST_NAME)
 endfunction()

 add_compile_options(-Wno-c++20-extensions)
+add_subdirectory(ck_tile)
 add_subdirectory(magic_number_division)
 add_subdirectory(space_filling_curve)
 add_subdirectory(conv_util)

--- a/test/ck_tile/CMakeLists.txt
+++ b/test/ck_tile/CMakeLists.txt
+add_subdirectory(image_to_column)
--- a/test/ck_tile/image_to_column/CMakeLists.txt
+++ b/test/ck_tile/image_to_column/CMakeLists.txt
+# Currently ck_tile is only built on gfx9
+if(GPU_TARGETS MATCHES "gfx9")
+    add_gtest_executable(test_tile_image_to_column test_tile_image_to_column.cpp)
+endif()
--- a/test/ck_tile/image_to_column/test_tile_image_to_column.cpp
+++ b/test/ck_tile/image_to_column/test_tile_image_to_column.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <algorithm>
+#include <gtest/gtest.h>
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include "ck_tile/ops/image_to_column.hpp"
+
+// Host API implementation
+template <typename DataType>
+class TestCkTileImageToColumn : public ::testing::Test
+{
+    static constexpr ck_tile::index_t VectorSize  = 1;
+    static constexpr ck_tile::index_t NDimSpatial = 2;
+
+    protected:
+    void Run(const ck_tile::conv::ConvParam conv_params)
+    {
+
+        using ImLayout = ck_tile::tensor_layout::convolution::NHWGC;
+
+        const auto G = conv_params.G_;
+        const auto N = conv_params.N_;
+        const auto C = conv_params.C_;
+
+        const ck_tile::long_index_t NDoHoWo =
+            N * std::accumulate(conv_params.output_spatial_lengths_.begin(),
+                                std::next(conv_params.output_spatial_lengths_.begin(), NDimSpatial),
+                                1,
+                                std::multiplies<>());
+
+        const ck_tile::long_index_t CZYX =
+            C * std::accumulate(conv_params.filter_spatial_lengths_.begin(),
+                                std::next(conv_params.filter_spatial_lengths_.begin(), NDimSpatial),
+                                1,
+                                std::multiplies<>());
+
+        const auto in_desc =
+            ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<ImLayout>(
+                conv_params);
+        const auto out_desc = ck_tile::HostTensorDescriptor({G, NDoHoWo, CZYX});
+
+        // host verify
+        ck_tile::HostTensor<DataType> in(in_desc);
+        ck_tile::HostTensor<DataType> out_device(out_desc);
+        ck_tile::HostTensor<DataType> out_host(out_desc);
+
+        std::cout << "input: " << in.mDesc << std::endl;
+        std::cout << "output: " << out_device.mDesc << std::endl;
+
+        ck_tile::FillUniformDistributionIntegerValue<DataType>{-5.f, 5.f}(in);
+
+        ck_tile::DeviceMem in_device_buf(in.get_element_space_size_in_bytes());
+        ck_tile::DeviceMem out_device_buf(out_device.get_element_space_size_in_bytes());
+
+        in_device_buf.ToDevice(in.data());
+
+        using thread_tile = ck_tile::sequence<4, 4>;
+        using warp_tile   = ck_tile::sequence<8, 128>;
+        using block_tile  = ck_tile::sequence<32, 128>;
+
+        using Shape = ck_tile::TileImageToColumnShape<thread_tile, warp_tile, block_tile>;
+
+        using PipelineProblem = ck_tile::BlockImageToColumnProblem<DataType,
+                                                                   DataType,
+                                                                   Shape,
+                                                                   NDimSpatial,
+                                                                   VectorSize,
+                                                                   VectorSize>;
+
+        using Kernel = ck_tile::ImageToColumn<PipelineProblem>;
+
+        auto kargs = Kernel::MakeKargs(
+            in_device_buf.GetDeviceBuffer(),
+            out_device_buf.GetDeviceBuffer(),
+            G,
+            N,
+            C,
+            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(
+                conv_params.input_spatial_lengths_),
+            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(
+                conv_params.filter_spatial_lengths_),
+            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(
+                conv_params.output_spatial_lengths_),
+            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial + 3>(in_desc.get_strides()),
+            ck_tile::to_array<ck_tile::long_index_t, 3>(out_desc.get_strides()),
+            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.conv_filter_strides_),
+            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(
+                conv_params.conv_filter_dilations_),
+            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.input_left_pads_),
+            ck_tile::to_array<ck_tile::long_index_t, NDimSpatial>(conv_params.input_right_pads_));
+
+        const dim3 grids = Kernel::GridSize(
+            kargs.N * kargs.output_spatial_lengths[0] * kargs.output_spatial_lengths[1],
+            kargs.filter_spatial_lengths[0] * kargs.filter_spatial_lengths[1] * kargs.C,
+            kargs.G);
+        constexpr dim3 blocks = Kernel::BlockSize();
+
+        constexpr ck_tile::index_t kBlockPerCu = 2;
+
+        ck_tile::launch_kernel(
+            ck_tile::stream_config{},
+            ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+        // reference
+        ck_tile::reference_im2col<DataType, DataType, NDimSpatial>(in, out_host, conv_params);
+
+        out_device_buf.FromDevice(out_device.data());
+        bool pass = ck_tile::check_err(out_device, out_host);
+
+        EXPECT_TRUE(pass);
+    }
+};
+
+class TestCkTileImageToColumnFloat : public TestCkTileImageToColumn<float>
+{
+};
+
+class TestCkTileImageToColumnHalf : public TestCkTileImageToColumn<ck_tile::half_t>
+{
+};
+
+TEST_F(TestCkTileImageToColumnFloat, TestCorrectness)
+{
+    this->Run({2, 2, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->Run({2, 2, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->Run({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {3, 3}, {1, 1}, {0, 0}, {0, 0}});
+    this->Run({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
+    this->Run({2, 2, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}});
+}
+
+TEST_F(TestCkTileImageToColumnHalf, TestCorrectness)
+{
+    this->Run({2, 2, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->Run({2, 2, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}});
+    this->Run({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {3, 3}, {1, 1}, {0, 0}, {0, 0}});
+    this->Run({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}});
+    this->Run({2, 2, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}});
+}