Unverified Commit b4e77c8c authored by Bartlomiej Wroblewski's avatar Bartlomiej Wroblewski Committed by GitHub
Browse files

Merge branch 'develop' into bwroblew/contraction_mixed_dt

parents ac56377a 271ef645
# Change Log for Composable Kernel # Changelog for Composable Kernel
Full documentation for Composable Kernel is not yet available. Full documentation for Composable Kernel is not yet available.
## (Unreleased) CK for ROCm 6.0.0 ## (Unreleased) CK for ROCm 6.0.0
### Fixed ### Fixes
- Fixed a hazard associated with inline v_dot (#808)
- Fixed two bugs in grouped convolution backward data without K padding (#848 #876)
### Optimizations ### Optimizations
None
### Added ### Additions
- Added image to column (#867) and column to image kernels (#930). - Added an image to a column kernel (#867)
- Added a column to an image kernel (#930)
- Support for 3D grouped convolution forward on RDNA 3 GPUs (#935)
- Grouped convolution support for small K and C (#822 #879 #897)
- Support for NHWGC (2D and 3D) grouped convolution backward weight (#769 #804)
- Support for bf16/f32/f16 and NHWGC (2D and 3d) grouped convolution backward data (#757 #799)
- Support for Batched Gemm DL (#732)
### Changed ### Changes
- Changed the grouped convolution API to maintain consistency with other convolution kernels (#817)
## CK 0.2.0 for ROCm 5.7.0
## CK 0.2.0 for ROCm 5.5.0 ### Fixes
- Fixed a bug in 6-dimensional kernels (#555)
### Fixed - Fixed a test case failure with grouped convolution backward weight (#524)
- Fixed a bug in 6-dimensional kernels (#555).
- Fixed grouped ConvBwdWeight test case failure (#524).
### Optimizations ### Optimizations
- Improve proformance of normalization kernel - Improved the performance of the normalization kernel
### Added ### Additions
- Added new cmake flag "DL_KERNELS" must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances. - New CMake flags:
- Added new cmake flag "DTYPES" which could be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instance of select data types. - "DL_KERNELS"-- Must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances
- Added new cmake flag "INSTANCES_ONLY" which will only build CK library and instances without the tests, examples, or profiler. - "DTYPES" -- Can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build an instance of the specified data types
- Added new feature: if GPU_TARGETS is not set on cmake command line, CK will be built for all targets supported by compiler. - "INSTANCES_ONLY" -- Only builds CK library and instances without tests, examples, or profiler
- Added support on MI300A/MI300X. - New feature: if GPU_TARGETS is not set in the CMake command line, CK will be built for all targets supported by the compiler
- Added support on NAVI3x. - Support for MI300A/MI300X
- Added user tutorial (#563). - Support for AMD RDNA 3
- Added more instances for irregular GEMM sizes (#560). - New user tutorial (#563)
- Added inter-wave consumer-producer programming model for GEMM kernels (#310). - Additional instances for irregular GEMM sizes (#560)
- Added multi-D GEMM client APIs (#534). - New inter-wave consumer-producer programming model for GEMM kernels (#310)
- Added multi-embeddings support (#542). - GEMM with support multiple elementwise fusions (multi-D) (#534)
- Added Navi3x blockwise GEMM and real GEMM support (#541). - Multi-embeddings support (#542)
- Added Navi grouped ConvBwdWeight support (#505). - AMD RDNA 3 blockwise GEMM and real GEMM support (#541)
- Added MaxPool, AvgPool forward (#815). - AMD RDNA grouped convolution backward weight support (#505)
- Added MaxPool backward (#750). - MaxPool and AvgPool forward (#815); MaxPool backward (#750)
### Changed ### Changes
None
...@@ -34,6 +34,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config, ...@@ -34,6 +34,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
#endif #endif
// warm up // warm up
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...); kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
const int nrepeat = 10; const int nrepeat = 10;
#if DEBUG_LOG #if DEBUG_LOG
...@@ -50,6 +51,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config, ...@@ -50,6 +51,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
for(int i = 0; i < nrepeat; ++i) for(int i = 0; i < nrepeat; ++i)
{ {
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...); kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
} }
hip_check_error(hipEventRecord(stop, stream_config.stream_id_)); hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
...@@ -64,11 +66,13 @@ float launch_and_time_kernel(const StreamConfig& stream_config, ...@@ -64,11 +66,13 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
else else
{ {
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...); kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
return 0; return 0;
} }
#else #else
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...); kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
return 0; return 0;
#endif #endif
...@@ -101,6 +105,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, ...@@ -101,6 +105,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
// warm up // warm up
preprocess(); preprocess();
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...); kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
const int nrepeat = 10; const int nrepeat = 10;
#if DEBUG_LOG #if DEBUG_LOG
...@@ -118,6 +123,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, ...@@ -118,6 +123,7 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
{ {
preprocess(); preprocess();
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...); kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
} }
hip_check_error(hipEventRecord(stop, stream_config.stream_id_)); hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
...@@ -133,11 +139,13 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, ...@@ -133,11 +139,13 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
{ {
preprocess(); preprocess();
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...); kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
return 0; return 0;
} }
#else #else
kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...); kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
hip_check_error(hipGetLastError());
return 0; return 0;
#endif #endif
......
...@@ -9,8 +9,10 @@ ...@@ -9,8 +9,10 @@
namespace ck { namespace ck {
// Convert X to Y // Convert X to Y, both X and Y are non-const data types.
template <typename Y, typename X> template <typename Y,
typename X,
std::enable_if_t<!(std::is_const_v<Y> || std::is_const_v<X>), bool> = false>
__host__ __device__ constexpr Y type_convert(X x) __host__ __device__ constexpr Y type_convert(X x)
{ {
static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>); static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
...@@ -18,6 +20,19 @@ __host__ __device__ constexpr Y type_convert(X x) ...@@ -18,6 +20,19 @@ __host__ __device__ constexpr Y type_convert(X x)
return static_cast<Y>(x); return static_cast<Y>(x);
} }
// Convert X to Y, either X or Y is a const data type.
template <typename Y,
typename X,
std::enable_if_t<std::is_const_v<Y> || std::is_const_v<X>, bool> = false>
__host__ __device__ constexpr Y type_convert(X x)
{
static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
using NonConstY = std::remove_const_t<Y>;
using NonConstX = std::remove_const_t<X>;
return static_cast<Y>(type_convert<NonConstY, NonConstX>(x));
}
// convert bfp16 to fp32 // convert bfp16 to fp32
template <> template <>
inline __host__ __device__ constexpr float type_convert<float, bhalf_t>(bhalf_t x) inline __host__ __device__ constexpr float type_convert<float, bhalf_t>(bhalf_t x)
......
...@@ -13,3 +13,5 @@ add_gtest_executable(test_bf8 bf8.cpp) ...@@ -13,3 +13,5 @@ add_gtest_executable(test_bf8 bf8.cpp)
if(result EQUAL 0) if(result EQUAL 0)
target_link_libraries(test_bf8 PRIVATE utility) target_link_libraries(test_bf8 PRIVATE utility)
endif() endif()
add_gtest_executable(test_type_convert_const type_convert_const.cpp)
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#include "gtest/gtest.h"
#include "ck/utility/data_type.hpp"
#include "ck/utility/type_convert.hpp"
using ck::bhalf_t;
using ck::type_convert;
TEST(TypeConvertConst, ConvertToConst)
{
constexpr float bf16_epsilon = 0.0078125;
constexpr float rel_tol = 2 * bf16_epsilon;
const std::vector<float> cases = {0.0, -123.f, 3.981323f, 0.2429f};
for(float x : cases)
{
const float abs_tol = std::abs(rel_tol * x);
{
bhalf_t y = type_convert<bhalf_t>(x);
// Test non-const bhalf to const float.
const float y_float = type_convert<const float>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
// Test non-const float to const bhalf.
const bhalf_t y = type_convert<const bhalf_t>(x);
// Remove the constness manually to not rely on const casts anymore since the
// possible issue could hide after two casts.
bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
float y_float = type_convert<float>(y_nonconst);
ASSERT_NEAR(y_float, x, abs_tol);
}
}
}
TEST(TypeConvertConst, ConvertFromConst)
{
constexpr float bf16_epsilon = 0.0078125;
constexpr float rel_tol = 2 * bf16_epsilon;
const std::vector<float> cases = {0.0, -123.f, 3.981323f, 0.2429f};
for(const float x : cases)
{
const float abs_tol = std::abs(rel_tol * x);
{
// Test const float to const bhalf_t.
const bhalf_t y = type_convert<const bhalf_t>(x);
// Remove the constness manually to not rely on const casts anymore since the
// possible issue could hide after two casts.
bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
float y_float = type_convert<float>(y_nonconst);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
// Test const float to non-const bhalf.
bhalf_t y = type_convert<bhalf_t>(x);
float y_float = type_convert<float>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
const bhalf_t y = type_convert<const bhalf_t>(x);
// Test const bhalf to non-const float.
float y_float = type_convert<float>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
// Tests with full type specializations for X.
{
// Test const float to const bhalf_t.
const bhalf_t y = type_convert<const bhalf_t, const float>(x);
// Remove the constness manually to not rely on const casts anymore since the
// possible issue could hide after two casts.
bhalf_t& y_nonconst = const_cast<bhalf_t&>(y);
float y_float = type_convert<float>(y_nonconst);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
// Test const float to non-const bhalf.
bhalf_t y = type_convert<bhalf_t, const float>(x);
float y_float = type_convert<float>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
{
const bhalf_t y = type_convert<const bhalf_t, const float>(x);
// Test const bhalf to non-const float.
float y_float = type_convert<float, const bhalf_t>(y);
ASSERT_NEAR(y_float, x, abs_tol);
}
}
}
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
TYPED_TEST(TestGemmSplitK_MK_KN, SmallM) TYPED_TEST(TestGemmSplitK_MK_KN, SmallM)
{ {
std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512; constexpr int N = 512;
constexpr int K = 320; constexpr int K = 320;
...@@ -16,7 +16,7 @@ TYPED_TEST(TestGemmSplitK_MK_KN, SmallM) ...@@ -16,7 +16,7 @@ TYPED_TEST(TestGemmSplitK_MK_KN, SmallM)
TYPED_TEST(TestGemmSplitK_MK_NK, SmallM) TYPED_TEST(TestGemmSplitK_MK_NK, SmallM)
{ {
std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512; constexpr int N = 512;
constexpr int K = 320; constexpr int K = 320;
...@@ -30,7 +30,7 @@ TYPED_TEST(TestGemmSplitK_MK_NK, SmallM) ...@@ -30,7 +30,7 @@ TYPED_TEST(TestGemmSplitK_MK_NK, SmallM)
TYPED_TEST(TestGemmSplitK_KM_KN, SmallM) TYPED_TEST(TestGemmSplitK_KM_KN, SmallM)
{ {
std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512; constexpr int N = 512;
constexpr int K = 320; constexpr int K = 320;
...@@ -43,7 +43,7 @@ TYPED_TEST(TestGemmSplitK_KM_KN, SmallM) ...@@ -43,7 +43,7 @@ TYPED_TEST(TestGemmSplitK_KM_KN, SmallM)
TYPED_TEST(TestGemmSplitK_KM_NK, SmallM) TYPED_TEST(TestGemmSplitK_KM_NK, SmallM)
{ {
std::vector<int> Ms{0, 1, 2, 3, 4, 5, 6}; std::vector<int> Ms{1, 2, 3, 4, 5, 6};
constexpr int N = 512; constexpr int N = 512;
constexpr int K = 320; constexpr int K = 320;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment