Merge branch 'develop' into bwroblew/contraction_mixed_dt

b4e77c8c · Bartlomiej Wroblewski · GitHub · ac56377a · 271ef645 · b4e77c8c
Unverified Commit b4e77c8c authored Sep 28, 2023 by Bartlomiej Wroblewski Committed by GitHub Sep 28, 2023
6 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
-# Change Log for Composable Kernel
+# Changelog for Composable Kernel
 Full documentation for Composable Kernel is not yet available.
 ## (Unreleased) CK for ROCm 6.0.0
-### Fixed
+### Fixes
+ - Fixed a hazard associated with inline v_dot (#808)
+ - Fixed two bugs in grouped convolution backward data without K padding (#848 #876)
 ### Optimizations
+None
-### Added
+### Additions
- Added image to column (#867) and column to image kernels (#930).
+- Added an image to a column kernel (#867)
+- Added a column to an image kernel (#930)
+- Support for 3D grouped convolution forward on RDNA 3 GPUs (#935)
+- Grouped convolution support for small K and C (#822 #879 #897)
+- Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
+- Support for bf16/f32/f16 and NHWGC (2D and 3D) grouped convolution backward data (#757 #799)
+- Support for Batched Gemm DL (#732)
-### Changed
+### Changes
+ - Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
+## CK 0.2.0 for ROCm 5.7.0
-## CK 0.2.0 for ROCm 5.5.0
+### Fixes
+- Fixed a bug in 6-dimensional kernels (#555)
-### Fixed
+- Fixed a test case failure with grouped convolution backward weight (#524)
- Fixed a bug in 6-dimensional kernels (#555).
- Fixed grouped ConvBwdWeight test case failure (#524).
 ### Optimizations
- Improve proformance of normalization kernel
+- Improved the performance of the normalization kernel
-### Added
+### Additions
- Added new cmake flag "DL_KERNELS" must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances.
+- New CMake flags:
- Added new cmake flag "DTYPES" which could be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instance of select data types.
+  - "DL_KERNELS" -- Must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances
- Added new cmake flag "INSTANCES_ONLY" which will only build CK library and instances without the tests, examples, or profiler.
+  - "DTYPES" -- Can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build an instance of the specified data types
- Added new feature: if GPU_TARGETS is not set on cmake command line, CK will be built for all targets supported by compiler.
+  - "INSTANCES_ONLY" -- Only builds CK library and instances without tests, examples, or profiler
- Added support on MI300A/MI300X.
+- New feature: if GPU_TARGETS is not set on the CMake command line, CK will be built for all targets supported by the compiler
- Added support on NAVI3x.
+- Support for MI300A/MI300X
- Added user tutorial (#563).
+- Support for AMD RDNA 3
- Added more instances for irregular GEMM sizes (#560).
+- New user tutorial (#563)
- Added inter-wave consumer-producer programming model for GEMM kernels (#310).
+- Additional instances for irregular GEMM sizes (#560)
- Added multi-D GEMM client APIs (#534).
+- New inter-wave consumer-producer programming model for GEMM kernels (#310)
- Added multi-embeddings support (#542).
+- GEMM with support for multiple elementwise fusions (multi-D) (#534)
- Added Navi3x blockwise GEMM and real GEMM support (#541).
+- Multi-embeddings support (#542)
- Added Navi grouped ConvBwdWeight support (#505).
+- AMD RDNA 3 blockwise GEMM and real GEMM support (#541)
- Added MaxPool, AvgPool forward (#815).
+- AMD RDNA grouped convolution backward weight support (#505)
- Added MaxPool backward (#750).
+- MaxPool and AvgPool forward (#815); MaxPool backward (#750)
-### Changed
+### Changes
+None
--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -34,6 +34,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
 #endif
        // warm up
        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        hip_check_error(hipGetLastError());
        const int nrepeat = 10;
 #if DEBUG_LOG
@@ -50,6 +51,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
        for(int i = 0; i < nrepeat; ++i)
        {
            kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+            hip_check_error(hipGetLastError());
        }
        hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
@@ -64,11 +66,13 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
    else
    {
        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        hip_check_error(hipGetLastError());
        return 0;
    }
 #else
    kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+    hip_check_error(hipGetLastError());
    return 0;
 #endif
@@ -101,6 +105,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
        // warm up
        preprocess();
        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        hip_check_error(hipGetLastError());
        const int nrepeat = 10;
 #if DEBUG_LOG
@@ -118,6 +123,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
        {
            preprocess();
            kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+            hip_check_error(hipGetLastError());
        }
        hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
@@ -133,11 +139,13 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
    {
        preprocess();
        kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+        hip_check_error(hipGetLastError());
        return 0;
    }
 #else
    kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
+    hip_check_error(hipGetLastError());
    return 0;
 #endif

--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -9,8 +9,10 @@
 namespace ck {
-// Convert X to Y
+// Convert X to Y; both X and Y are non-const data types.
-template <typename Y, typename X>
+template <typename Y,
+          typename X,
+          std::enable_if_t<!(std::is_const_v<Y> || std::is_const_v<X>), bool> = false>
 __host__ __device__ constexpr Y type_convert(X x)
 {
    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
@@ -18,6 +20,19 @@ __host__ __device__ constexpr Y type_convert(X x)
    return static_cast<Y>(x);
 }
+// Convert X to Y when X or Y is const: strip const, then reuse the non-const conversion.
+template <typename Y,
+          typename X,
+          std::enable_if_t<std::is_const_v<Y> || std::is_const_v<X>, bool> = false>
+__host__ __device__ constexpr Y type_convert(X x)
+{
+    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
+    using NonConstY = std::remove_const_t<Y>;
+    using NonConstX = std::remove_const_t<X>;
+    return static_cast<Y>(type_convert<NonConstY, NonConstX>(x));
+}
 // convert bfp16 to fp32
 template <>
 inline __host__ __device__ constexpr float type_convert<float, bhalf_t>(bhalf_t x)

--- a/test/data_type/CMakeLists.txt
+++ b/test/data_type/CMakeLists.txt
@@ -13,3 +13,5 @@ add_gtest_executable(test_bf8 bf8.cpp)
 if(result EQUAL 0)
  target_link_libraries(test_bf8 PRIVATE utility)
 endif()
+add_gtest_executable(test_type_convert_const type_convert_const.cpp)
--- a/test/data_type/type_convert_const.cpp
+++ b/test/data_type/type_convert_const.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
+#include "gtest/gtest.h"
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/type_convert.hpp"
+using ck::bhalf_t;
+using ck::type_convert;
+TEST(TypeConvertConst, ConvertToConst)
+{
+    constexpr float bf16_epsilon = 0.0078125;
+    constexpr float rel_tol      = 2 * bf16_epsilon;
+    const std::vector<float> cases = {0.0, -123.f, 3.981323f, 0.2429f};
+    for(float x : cases)
+    {
+        const float abs_tol = std::abs(rel_tol * x);
+        {
+            bhalf_t y = type_convert<bhalf_t>(x);
+            // Convert a non-const bhalf_t to a const float.
+            const float y_float = type_convert<const float>(y);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+        {
+            // Convert a non-const float to a const bhalf_t.
+            const bhalf_t y = type_convert<const bhalf_t>(x);
+            // Strip constness manually so the second conversion involves no const types;
+            // otherwise an issue could stay hidden behind two const conversions.
+            bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
+            float y_float       = type_convert<float>(y_nonconst);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+    }
+}
+TEST(TypeConvertConst, ConvertFromConst)
+{
+    constexpr float bf16_epsilon = 0.0078125;
+    constexpr float rel_tol      = 2 * bf16_epsilon;
+    const std::vector<float> cases = {0.0, -123.f, 3.981323f, 0.2429f};
+    for(const float x : cases)
+    {
+        const float abs_tol = std::abs(rel_tol * x);
+        {
+            // Convert a const float to a const bhalf_t (X is deduced from the argument).
+            const bhalf_t y = type_convert<const bhalf_t>(x);
+            // Strip constness manually so the second conversion involves no const types;
+            // otherwise an issue could stay hidden behind two const conversions.
+            bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
+            float y_float       = type_convert<float>(y_nonconst);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+        {
+            // Convert a const float to a non-const bhalf_t.
+            bhalf_t y     = type_convert<bhalf_t>(x);
+            float y_float = type_convert<float>(y);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+        {
+            const bhalf_t y = type_convert<const bhalf_t>(x);
+            // Convert a const bhalf_t to a non-const float.
+            float y_float = type_convert<float>(y);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+        // Same conversions, with X passed as an explicit template argument.
+        {
+            // Convert a const float to a const bhalf_t.
+            const bhalf_t y = type_convert<const bhalf_t, const float>(x);
+            // Strip constness manually so the second conversion involves no const types;
+            // otherwise an issue could stay hidden behind two const conversions.
+            bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
+            float y_float       = type_convert<float>(y_nonconst);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+        {
+            // Convert a const float to a non-const bhalf_t.
+            bhalf_t y     = type_convert<bhalf_t, const float>(x);
+            float y_float = type_convert<float>(y);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+        {
+            const bhalf_t y = type_convert<const bhalf_t, const float>(x);
+            // Convert a const bhalf_t to a non-const float.
+            float y_float = type_convert<float, const bhalf_t>(y);
+            ASSERT_NEAR(y_float, x, abs_tol);
+        }
+    }
+}
--- a/test/gemm_split_k/test_gemm_splitk_ut_cases.inc
+++ b/test/gemm_split_k/test_gemm_splitk_ut_cases.inc
@@ -2,7 +2,7 @@
 TYPED_TEST(TestGemmSplitK_MK_KN, SmallM)
 {
-    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6};
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
    constexpr int N = 512;
    constexpr int K = 320;
@@ -16,7 +16,7 @@ TYPED_TEST(TestGemmSplitK_MK_KN, SmallM)
 TYPED_TEST(TestGemmSplitK_MK_NK, SmallM)
 {
-    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6};
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
    constexpr int N = 512;
    constexpr int K = 320;
@@ -30,7 +30,7 @@ TYPED_TEST(TestGemmSplitK_MK_NK, SmallM)
 TYPED_TEST(TestGemmSplitK_KM_KN, SmallM)
 {
-    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6};
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
    constexpr int N = 512;
    constexpr int K = 320;
@@ -43,7 +43,7 @@ TYPED_TEST(TestGemmSplitK_KM_KN, SmallM)
 TYPED_TEST(TestGemmSplitK_KM_NK, SmallM)
 {
-    std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6};
+    std::vector<int> Ms{1, 2, 3, 4, 5, 6};
    constexpr int N = 512;
    constexpr int K = 320;