Commit a7ae4f8e authored by Astha Rai

Merge branch 'codegen_hiprtc' of github.com:ROCm/composable_kernel into codegen_hiprtc

parents a6055c3c 781005a5
@@ -7,6 +7,7 @@
#include "ck_tile/core/algorithm/coordinate_transform.hpp"
#include "ck_tile/core/algorithm/indexing_adaptor.hpp"
#include "ck_tile/core/algorithm/space_filling_curve.hpp"
#include "ck_tile/core/algorithm/static_encoding_pattern.hpp"
#include "ck_tile/core/arch/amd_buffer_addressing.hpp"
#include "ck_tile/core/arch/arch.hpp"
#include "ck_tile/core/arch/generic_memory_space_atomic.hpp"
@@ -53,8 +54,8 @@
#include "ck_tile/core/tensor/tile_window.hpp"
#include "ck_tile/core/tensor/tile_window_linear.hpp"
#include "ck_tile/core/tensor/tile_window_utils.hpp"
#include "ck_tile/core/tensor/transpose_tile.hpp"
#include "ck_tile/core/tensor/update_tile.hpp"
#include "ck_tile/core/utility/amd_address_space.hpp"
#include "ck_tile/core/utility/bit_cast.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/utility/functional_with_tuple.hpp"
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core/arch/arch.hpp"
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/container/sequence.hpp"
#include "ck_tile/core/container/tuple.hpp"
#include "ck_tile/core/numeric/integer.hpp"
#include "ck_tile/core/tensor/tile_distribution.hpp"
#include "ck_tile/core/tensor/tile_distribution_encoding.hpp"
namespace ck_tile {
/**
* @brief Enumeration describing static tile distribution patterns.
*
*/
enum struct tile_distribution_pattern
{
/**
* @brief Thread raked pattern.
*
*/
thread_raked,
/**
* @brief Warp raked pattern.
*
*/
warp_raked,
/**
* @brief Block raked pattern - aka linear.
*
*/
block_raked,
};
struct TileDistributionEncodingPattern
{
};
/**
* @brief Class creating a 2D static tile distribution with different load/store patterns.
*
* @note We always assume the tile is YPerTile x XPerTile, where the X (rightmost)
* dimension is contiguous and supports vector loads.
*
* @tparam BlockSize Number of threads in a workgroup.
* @tparam YPerTile The tile size of the outer/leftmost dimension.
* @tparam XPerTile The tile size of the inner/rightmost (contiguous) dimension.
* @tparam VecSize The vector access size.
* @tparam DistributionPattern The access pattern to use.
*/
template <index_t BlockSize,
index_t YPerTile,
index_t XPerTile,
index_t VecSize,
tile_distribution_pattern DistributionPattern>
struct TileDistributionEncodingPattern2D : public TileDistributionEncodingPattern
{
};
// Thread raked
template <index_t BlockSize, index_t YPerTile, index_t XPerTile, index_t VecSize>
struct TileDistributionEncodingPattern2D<BlockSize,
YPerTile,
XPerTile,
VecSize,
tile_distribution_pattern::thread_raked>
: public TileDistributionEncodingPattern
{
// TODO: make a pattern where the condition below does not need to hold - GGemmMultiDSplitk!
static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
static constexpr index_t warp_size = get_warp_size();
static constexpr index_t num_warps = BlockSize / get_warp_size();
static constexpr index_t X1 = VecSize;
static constexpr index_t X0 = XPerTile / X1; // # of threads in X dim
// # of rows in Y dim accessed by single wavefront in one iteration
static constexpr index_t Y1 = warp_size / X0;
static_assert(X0 * Y1 == warp_size, "X0 * Y1 must cover whole wavefront!");
static constexpr index_t Y0 = num_warps;
// YPerWarp = YPerTile / Y0;
// Y2 = YPerWarp / Y1;
static constexpr index_t Y2 = YPerTile / (Y1 * Y0); // # of iters within wavefront
static_assert(X0 * Y1 * Y0 == BlockSize, "X0 * Y1 * Y0 must cover whole workgroup!");
static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover whole YPerTile");
CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
{
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
tuple<sequence<1>, sequence<1, 2>>,
tuple<sequence<0>, sequence<1, 0>>,
sequence<1, 2>,
sequence<2, 1>>{});
}
CK_TILE_HOST_DEVICE static constexpr auto MakeShuffled2DStaticTileDistribution()
{
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<X0, X1>, sequence<Y0, Y1, Y2>>,
tuple<sequence<2>, sequence<2, 1>>,
tuple<sequence<0>, sequence<1, 0>>,
sequence<1, 2>,
sequence<1, 2>>{});
}
};
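// Illustrative decomposition for the thread-raked pattern (hypothetical sizes, assuming
// warp_size == 64): BlockSize = 256, YPerTile = 32, XPerTile = 64, VecSize = 8 gives
// X1 = 8 (vector), X0 = 64 / 8 = 8 threads along X, Y1 = 64 / 8 = 8 rows per wavefront
// iteration, Y0 = 4 warps, and Y2 = 32 / (8 * 4) = 1 iteration per thread.
// using ExamplePattern =
//     TileDistributionEncodingPattern2D<256, 32, 64, 8,
//                                       tile_distribution_pattern::thread_raked>;
// constexpr auto example_dstr = ExamplePattern::Make2DStaticTileDistribution();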
// Warp raked
template <index_t BlockSize, index_t YPerTile, index_t XPerTile, index_t VecSize>
struct TileDistributionEncodingPattern2D<BlockSize,
YPerTile,
XPerTile,
VecSize,
tile_distribution_pattern::warp_raked>
: public TileDistributionEncodingPattern
{
static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
static constexpr index_t warp_size = get_warp_size();
static constexpr index_t num_warps = BlockSize / get_warp_size();
static constexpr index_t X1 = VecSize;
static constexpr index_t X0 = XPerTile / X1; // # of threads in X dim
static constexpr index_t Y2 = warp_size / X0; // # of rows in Y dim to cover whole wavefront
static_assert(X0 * Y2 == warp_size, "X0 * Y2 must cover whole wavefront!");
static constexpr index_t Y0 = num_warps;
static_assert(X0 * Y2 * Y0 == BlockSize, "X0 * Y2 * Y0 must cover whole workgroup!");
static constexpr index_t Y1 = YPerTile / (Y2 * Y0); // # of iters within wavefront
static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover whole YPerTile");
CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
{
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
tuple<sequence<1>, sequence<1, 2>>,
tuple<sequence<0>, sequence<2, 0>>,
sequence<1, 2>,
sequence<1, 1>>{});
}
CK_TILE_HOST_DEVICE static constexpr auto MakeShuffled2DStaticTileDistribution()
{
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<X0, X1>, sequence<Y0, Y1, Y2>>,
tuple<sequence<2>, sequence<2, 1>>,
tuple<sequence<0>, sequence<2, 0>>,
sequence<1, 2>,
sequence<1, 1>>{});
}
};
// Block raked
template <index_t BlockSize, index_t YPerTile, index_t XPerTile, index_t VecSize>
struct TileDistributionEncodingPattern2D<BlockSize,
YPerTile,
XPerTile,
VecSize,
tile_distribution_pattern::block_raked>
: public TileDistributionEncodingPattern
{
// TODO: make a pattern where the condition below does not need to hold - GGemmMultiDSplitk!
static_assert(XPerTile % VecSize == 0, "XPerTile must be a multiple of VecSize!");
static constexpr index_t warp_size = get_warp_size();
static constexpr index_t num_warps = BlockSize / get_warp_size();
static constexpr index_t X1 = VecSize;
static constexpr index_t X0 = XPerTile / X1; // # of threads in X dim
static constexpr index_t Y2 = warp_size / X0; // # of rows in Y dim to cover whole wavefront
static_assert(X0 * Y2 == warp_size, "X0 * Y2 must cover whole wavefront!");
static constexpr index_t Y1 = num_warps;
static_assert(X0 * Y2 * Y1 == BlockSize, "X0 * Y2 * Y1 must cover whole workgroup!");
static constexpr index_t Y0 = YPerTile / (Y2 * Y1); // # of iters
static_assert(Y0 * Y1 * Y2 == YPerTile, "Y0, Y1, Y2 must cover whole YPerTile");
CK_TILE_HOST_DEVICE static constexpr auto Make2DStaticTileDistribution()
{
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<Y0, Y1, Y2>, sequence<X0, X1>>,
tuple<sequence<1>, sequence<1, 2>>,
tuple<sequence<1>, sequence<2, 0>>,
sequence<1, 2>,
sequence<0, 1>>{});
}
CK_TILE_HOST_DEVICE static constexpr auto MakeShuffled2DStaticTileDistribution()
{
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,
tuple<sequence<X0, X1>, sequence<Y0, Y1, Y2>>,
tuple<sequence<2>, sequence<2, 1>>,
tuple<sequence<1>, sequence<2, 0>>,
sequence<1, 2>,
sequence<1, 0>>{});
}
};
} // namespace ck_tile
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -12,18 +12,37 @@
namespace ck_tile {
template <typename, bool>
struct safe_underlying_type;
template <typename T>
struct safe_underlying_type<T, true>
{
using type = std::underlying_type_t<T>;
};
template <typename T>
struct safe_underlying_type<T, false>
{
using type = void;
};
template <typename T>
using safe_underlying_type_t = typename safe_underlying_type<T, std::is_enum<T>::value>::type;
enum struct address_space_enum : std::uint16_t
{
generic = 0,
global,
lds,
sgpr,
constant,
vgpr
};
enum struct memory_operation_enum : std::uint16_t
{
set = 0,
atomic_add,
atomic_max,
add
@@ -109,4 +128,30 @@ CK_TILE_DEVICE void s_nop(index_t cnt = 0)
#endif
}
#define CK_CONSTANT_ADDRESS_SPACE \
__attribute__((address_space( \
static_cast<safe_underlying_type_t<address_space_enum>>(address_space_enum::constant))))
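// Illustrative checks (exposition only, not part of this commit): the trait yields the
// enum's underlying type for enums and void otherwise, and address_space_enum::constant
// equals 4, the LLVM AMDGPU constant address space the macro above expands to.
// static_assert(std::is_same_v<safe_underlying_type_t<address_space_enum>, std::uint16_t>);
// static_assert(std::is_same_v<safe_underlying_type_t<int>, void>);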
template <typename T>
__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p)
{
// cast a pointer in "Constant" address space (4) to "Generic" address space (0)
// only a C-style pointer cast seems to compile
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
return (T*)(p); // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}
template <typename T>
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p)
{
// cast a pointer in "Generic" address space (0) to "Constant" address space (4)
// only a C-style pointer cast seems to compile
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}
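// Hypothetical usage sketch (the pointer name p_arg is illustrative): round-trip a
// kernel-argument pointer through the "Constant" address space and back to "Generic".
// const float CK_CONSTANT_ADDRESS_SPACE* p_const = cast_pointer_to_constant_address_space(p_arg);
// const float* p_generic = cast_pointer_to_generic_address_space(p_const);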
} // namespace ck_tile
@@ -546,7 +546,7 @@ CK_TILE_HOST_DEVICE constexpr auto tuple_reverse(const tuple<Ts...>& t)
using Idx = number<tuple<Ts...>::size() - i - 1>;
return t.at(Idx{});
},
number<tuple<Ts...>::size()>{});
}
// Reduce tuple values in specific range using Function
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/utility/bit_cast.hpp"
@@ -376,6 +376,16 @@ struct numeric<bfloat16_t>
}
};
template <typename T>
struct numeric_traits;
template <>
struct numeric_traits<bfloat16_t>
{
static constexpr int exp = 8;
static constexpr int mant = 7;
};
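// Sketch of how these traits are consumed by the error-threshold helpers added later in
// this commit (assuming half-ULP error bounds): bf16 is a 1s/8e/7m format, so its
// relative half-ULP error is 2^-7 * 0.5 = 2^-8.
// double bf16_half_ulp = std::pow(2, -numeric_traits<bfloat16_t>::mant) * 0.5;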
#if CK_TILE_USE_CUSTOM_DATA_TYPE
CK_TILE_ARITHMETIC_USING_FLOAT(CK_TILE_HOST_DEVICE, bfloat16_t)
#endif
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -18,8 +18,17 @@
namespace ck_tile {
/**
* @brief This class provides a tile (windowed) view of, and access to, device memory.
*
* @note This tile window does not support single issue; use the tile_window_linear
* structure for that purpose.
*
* @tparam BottomTensorView_ Class describing & holding device tensor memory.
* @tparam WindowLengths_ Spatial sizes of the windowed view on the tensor.
* @tparam StaticTileDistribution_ Thread distribution (mapping) into tile dimensions.
* @tparam NumCoord TBD
*/
template <typename BottomTensorView_,
typename WindowLengths_,
typename StaticTileDistribution_,
@@ -1009,6 +1018,14 @@ CK_TILE_DEVICE void move_tile_window(
window.move(step);
}
/**
* @brief This class provides a description of a tile windowed view of device memory.
*
* @note This class does not provide any functions to read or modify device memory.
*
* @tparam BottomTensorView_ Class describing & holding device tensor memory.
* @tparam WindowLengths_ Spatial sizes of the windowed view on the tensor.
*/
template <typename BottomTensorView_, typename WindowLengths_>
struct tile_window_with_static_lengths
{
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core/config.hpp"
#include "ck_tile/core/numeric/integer.hpp"
#include "ck_tile/core/numeric/integral_constant.hpp"
#include "ck_tile/core/utility/functional.hpp"
#include "ck_tile/core/algorithm/coordinate_transform.hpp"
#include "ck_tile/core/algorithm/space_filling_curve.hpp"
#include "ck_tile/core/container/container_helper.hpp"
#include "ck_tile/core/container/thread_buffer.hpp"
#include "ck_tile/core/container/statically_indexed_array.hpp"
#include "ck_tile/core/numeric/math.hpp"
#include "ck_tile/core/utility/type_traits.hpp"
#include "ck_tile/core/tensor/tile_elementwise.hpp"
#include "ck_tile/core/utility/transpose_vectors.hpp"
namespace ck_tile {
namespace detail {
template <typename OutTensor, typename InTensor>
CK_TILE_DEVICE void transpose_tile2d_impl_in_thread(OutTensor& out_tensor,
const InTensor& in_tensor)
{
constexpr auto I0 = number<0>{};
static_assert(std::is_same_v<typename InTensor::DataType, typename OutTensor::DataType>,
"Data type for InTensor and OutTensor must be the same!");
using DataType = typename InTensor::DataType;
constexpr auto y_in_desc = InTensor::get_tile_distribution().get_ys_to_d_descriptor();
constexpr auto y_out_desc = OutTensor::get_tile_distribution().get_ys_to_d_descriptor();
// y_dim_out_to_in
// For the swapped-Hs tile case we only need get_rh_minor_to_y,
// since rh_major is already swapped due to the swapped Hs.
constexpr auto get_rh_minor_to_y = [](auto dstr_tensor) {
using DstrEncode = typename decltype(dstr_tensor.get_tile_distribution())::DstrEncode;
map<index_t, index_t> rh_minor_to_y_;
static_for<0, DstrEncode::NDimY, 1>{}([&](auto i) {
constexpr index_t rh_minor = DstrEncode::ys_to_rhs_minor_[i];
rh_minor_to_y_(rh_minor) = i;
});
return rh_minor_to_y_;
};
// In swapped Hs case <Y,X> -> <X,Y> tile
// we have same rh_major, but reversed rh_minor!
constexpr auto rh_minor_to_y_in = get_rh_minor_to_y(InTensor{});
constexpr auto rh_minor_to_y_out = get_rh_minor_to_y(OutTensor{});
// Is this really needed? Could a simple reverse suffice here?
constexpr auto y_dim_out_to_in = [&] {
map<index_t, index_t> y_dim_out_to_in_;
for(const auto& [rh_minor, y_out] : rh_minor_to_y_out)
{
y_dim_out_to_in_(y_out) = rh_minor_to_y_in[rh_minor];
}
return y_dim_out_to_in_;
}();
constexpr index_t NDimY = InTensor::get_tile_distribution().get_num_of_dimension_y();
constexpr auto y_lengths = to_sequence(y_in_desc.get_lengths());
// input and output vector dim in the order of input Y dims
constexpr index_t y_dim_vec_in = NDimY - 1;
constexpr index_t y_dim_vec_out = y_dim_out_to_in[NDimY - 1];
// vector lengths
constexpr index_t vec_length_in = y_lengths[y_dim_vec_in];
constexpr index_t vec_length_out = y_lengths[y_dim_vec_out];
// # of vectors
constexpr index_t num_vec_in = vec_length_out;
constexpr index_t num_vec_out = vec_length_in;
using InVec = array<DataType, vec_length_in>;
using OutVec = array<DataType, vec_length_out>;
// SFC
constexpr auto scalars_per_access_arr = generate_array(
[&](auto i) { return (i == y_dim_vec_in or i == y_dim_vec_out) ? y_lengths[i] : 1; },
number<NDimY>{});
constexpr auto scalars_per_access = TO_SEQUENCE(scalars_per_access_arr, NDimY);
using SFC_Y = space_filling_curve<decltype(y_lengths),
typename arithmetic_sequence_gen<0, NDimY, 1>::type,
decltype(scalars_per_access)>;
constexpr index_t num_access = SFC_Y::get_num_of_access();
static_assert(num_access > 0, "wrong! num_access should be larger than 0");
// in/out vectors to be transposed
thread_buffer<InVec, num_vec_in> in_vectors;
thread_buffer<OutVec, num_vec_out> out_vectors;
// loop over SFC and do transpose
static_for<0, num_access, 1>{}([&](auto iAccess) {
// data index [y0, y1, ...] in the order of input tensor
constexpr auto idx_y_start = SFC_Y::get_index(iAccess);
// get input vectors
static_for<0, num_vec_in, 1>{}([&](auto i) {
constexpr auto idx_y_in = generate_tuple(
[&](auto ii) {
return ii == y_dim_vec_out ? idx_y_start[ii] + i : idx_y_start[ii];
},
number<NDimY>{});
constexpr index_t in_offset = y_in_desc.calculate_offset(idx_y_in);
static_assert(in_offset % vec_length_in == 0);
in_vectors(i).template get_as<InVec>()(I0) =
in_tensor.get_thread_buffer()
.template get_as<InVec>()[number<in_offset / vec_length_in>{}];
});
// transpose
transpose_vectors<DataType, num_vec_in, num_vec_out>{}(in_vectors, out_vectors);
// set output vectors
static_for<0, num_vec_out, 1>{}([&](auto i) {
constexpr auto idx_y_out_tmp = generate_array(
[&](auto ii) { return ii == y_dim_vec_in ? idx_y_start[ii] + i : idx_y_start[ii]; },
number<NDimY>{});
constexpr auto idx_y_out =
container_reorder_given_new2old(idx_y_out_tmp, y_dim_out_to_in);
constexpr index_t out_offset = y_out_desc.calculate_offset(idx_y_out);
static_assert(out_offset % vec_length_out == 0);
out_tensor.get_thread_buffer().template set_as<OutVec>(
number<out_offset / vec_length_out>{},
out_vectors[i].template get_as<OutVec>()[I0]);
});
});
}
} // namespace detail
template <typename OutTensor, typename InTensor>
CK_TILE_DEVICE void transpose_tile2d(OutTensor& out, const InTensor& in)
{
using InDataType = typename InTensor::DataType;
using OutDataType = typename OutTensor::DataType;
using InTileDistr = typename InTensor::StaticTileDistribution;
using OutTileDistr = typename OutTensor::StaticTileDistribution;
using InDstrEncode = typename InTileDistr::DstrEncode;
using OutDstrEncode = typename OutTileDistr::DstrEncode;
using InThreadTensorDesc = typename InTensor::ThreadTensorDesc;
using OutThreadTensorDesc = typename OutTensor::ThreadTensorDesc;
// Ys:
constexpr auto in_thread_desc_lengths = InThreadTensorDesc{}.get_lengths();
constexpr auto out_thread_desc_lengths = OutThreadTensorDesc{}.get_lengths();
// type convert
const auto in_tmp = [&]() {
if constexpr(std::is_same_v<OutDataType, InDataType>)
{
return in;
}
else
{
return tile_elementwise_in(type_convert<OutDataType, InDataType>, in);
}
}();
// Scenario where we switch from tile <Y, X> -> <X, Y> - only 2D tiles!
// we preserve Ps but swap Ys: <Y1, Y0> -> <Y0, Y1>
if constexpr(InDstrEncode::rs_lengths_ == OutDstrEncode::rs_lengths_ &&
InDstrEncode::hs_lengthss_ == tuple_reverse(OutDstrEncode::hs_lengthss_) &&
InDstrEncode::NDimY == OutDstrEncode::NDimY && InDstrEncode::NDimY == 2 &&
in_thread_desc_lengths == tuple_reverse(out_thread_desc_lengths))
// Any condition on Ps ??
// InDstrEncode::ps_to_rhss_major_ == OutDstrEncode::ps_to_rhss_major_ &&
// InDstrEncode::ps_to_rhss_minor_ == OutDstrEncode::ps_to_rhss_minor_ &&
{
detail::transpose_tile2d_impl_in_thread(out, in_tmp);
}
else
{
static_assert(false, "Provided tensors could not be transposed!");
}
}
} // namespace ck_tile
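// Hypothetical call-site sketch (names illustrative; assumes the usual ck_tile helper
// make_static_distributed_tensor): both distributions must satisfy the swapped-Hs
// conditions checked by the if-constexpr above, and element types may differ (they are
// converted element-wise first).
// auto out = make_static_distributed_tensor<OutDataType>(out_dstr);
// transpose_tile2d(out, in);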
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core/config.hpp"
// Address Space for AMDGCN
// https://llvm.org/docs/AMDGPUUsage.html#address-space
namespace ck_tile {
#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
template <typename T>
__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p)
{
// cast a pointer in "Constant" address space (4) to "Generic" address space (0)
// only a C-style pointer cast seems to compile
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
return (T*)p; // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}
template <typename T>
__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p)
{
// cast a pointer in "Generic" address space (0) to "Constant" address space (4)
// only a C-style pointer cast seems to compile
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wold-style-cast"
return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
#pragma clang diagnostic pop
}
} // namespace ck_tile
@@ -109,4 +109,22 @@ CK_TILE_HOST_DEVICE PY c_style_pointer_cast(PX p_x)
#pragma clang diagnostic pop
}
template <typename CompareTo, typename... Rest>
struct is_any_of : std::false_type
{
};
template <typename CompareTo, typename FirstType>
struct is_any_of<CompareTo, FirstType> : std::is_same<CompareTo, FirstType>
{
};
template <typename CompareTo, typename FirstType, typename... Rest>
struct is_any_of<CompareTo, FirstType, Rest...>
: std::integral_constant<bool,
std::is_same<CompareTo, FirstType>::value ||
is_any_of<CompareTo, Rest...>::value>
{
};
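// Illustrative checks (added for exposition, not part of the original commit):
static_assert(is_any_of<float, int, float, double>::value, "float is among {int, float, double}");
static_assert(!is_any_of<char, int, float>::value, "char is not among {int, float}");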
} // namespace ck_tile
@@ -51,16 +51,18 @@ struct composes<F>
template <typename... Ts>
__host__ __device__ composes(Ts&&...)->composes<remove_cvref_t<Ts>...>;
template <typename SaturateType>
struct saturates
{
// NOTE: this function does not return a SaturateType value;
// it is the user's responsibility to perform (or skip) the further cast
template <typename AccType>
CK_TILE_HOST_DEVICE constexpr auto operator()(const AccType& a_) const
-> std::enable_if_t<std::is_arithmetic_v<AccType>, AccType>
{
return clamp(a_,
type_convert<AccType>(numeric<SaturateType>::lowest()),
type_convert<AccType>(numeric<SaturateType>::max()));
}
};
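// Usage sketch (values illustrative): clamp a float accumulator into the int8 range;
// the result stays float and still needs an explicit cast, e.g. via type_convert.
// float clamped = saturates<int8_t>{}(300.0f); // 127.0f
// int8_t q = type_convert<int8_t>(clamped);    // 127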
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -18,6 +18,112 @@
namespace ck_tile {
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
double get_relative_threshold(const int number_of_accumulations = 1)
{
using F8 = ck_tile::fp8_t;
using F16 = ck_tile::half_t;
using BF16 = ck_tile::bf16_t;
using F32 = float;
using I8 = int8_t;
using I32 = int32_t;
static_assert(is_any_of<ComputeDataType, F8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled ComputeDataType for setting up the relative threshold!");
double compute_error = 0;
if constexpr(is_any_of<ComputeDataType, I8, I32, int>::value)
{
return 0;
}
else
{
compute_error = std::pow(2, -numeric_traits<ComputeDataType>::mant) * 0.5;
}
static_assert(is_any_of<OutDataType, F8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled OutDataType for setting up the relative threshold!");
double output_error = 0;
if constexpr(is_any_of<OutDataType, I8, I32, int>::value)
{
return 0;
}
else
{
output_error = std::pow(2, -numeric_traits<OutDataType>::mant) * 0.5;
}
double midway_error = std::max(compute_error, output_error);
static_assert(is_any_of<AccDataType, F8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled AccDataType for setting up the relative threshold!");
double acc_error = 0;
if constexpr(is_any_of<AccDataType, I8, I32, int>::value)
{
return 0;
}
else
{
acc_error = std::pow(2, -numeric_traits<AccDataType>::mant) * 0.5 * number_of_accumulations;
}
return std::max(acc_error, midway_error);
}
template <typename ComputeDataType, typename OutDataType, typename AccDataType = ComputeDataType>
double get_absolute_threshold(const double max_possible_num, const int number_of_accumulations = 1)
{
using F8 = ck_tile::fp8_t;
using F16 = ck_tile::half_t;
using BF16 = ck_tile::bf16_t;
using F32 = float;
using I8 = int8_t;
using I32 = int32_t;
static_assert(is_any_of<ComputeDataType, F8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
auto expo = std::log2(std::abs(max_possible_num));
double compute_error = 0;
if constexpr(is_any_of<ComputeDataType, I8, I32, int>::value)
{
return 0;
}
else
{
compute_error = std::pow(2, expo - numeric_traits<ComputeDataType>::mant) * 0.5;
}
static_assert(is_any_of<OutDataType, F8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled OutDataType for setting up the absolute threshold!");
double output_error = 0;
if constexpr(is_any_of<OutDataType, I8, I32, int>::value)
{
return 0;
}
else
{
output_error = std::pow(2, expo - numeric_traits<OutDataType>::mant) * 0.5;
}
double midway_error = std::max(compute_error, output_error);
static_assert(is_any_of<AccDataType, F8, F16, BF16, F32, I8, I32, int>::value,
"Warning: Unhandled AccDataType for setting up the absolute threshold!");
double acc_error = 0;
if constexpr(is_any_of<AccDataType, I8, I32, int>::value)
{
return 0;
}
else
{
acc_error =
std::pow(2, expo - numeric_traits<AccDataType>::mant) * 0.5 * number_of_accumulations;
}
return std::max(acc_error, midway_error);
}
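// Hypothetical usage (illustrative numbers): tolerances for a bf16 GEMM accumulated in
// fp32 over K = 512.
// double rtol = get_relative_threshold<ck_tile::bf16_t, ck_tile::bf16_t, float>(512);
// double atol = get_absolute_threshold<ck_tile::bf16_t, ck_tile::bf16_t, float>(
//     /*max_possible_num=*/16.0, /*number_of_accumulations=*/512);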
template <typename T>
std::ostream& operator<<(std::ostream& os, const std::vector<T>& v)
{
@@ -337,7 +443,11 @@ std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_val
}
if(!res)
{
const float error_percent =
static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
std::cerr << "max err: " << max_err;
std::cerr << ", number of errors: " << err_count;
std::cerr << ", " << error_percent << "% wrong values" << std::endl;
}
return res;
}
...
@@ -14,57 +14,41 @@ namespace detail {
template <typename OldLayout>
CK_TILE_HOST std::vector<std::size_t> get_layout_transpose_gnchw_to_old()
{
using namespace ck_tile::tensor_layout::convolution;
if constexpr(is_any_of<OldLayout, GNCW, GKCX, GNKW>::value)
{
return {0, 1, 2, 3};
}
else if constexpr(is_any_of<OldLayout, GNCHW, GKCYX, GNKHW>::value)
{
return {0, 1, 2, 3, 4};
}
else if constexpr(is_any_of<OldLayout, GNCDHW, GKCZYX, GNKDHW>::value)
{
return {0, 1, 2, 3, 4, 5};
}
if constexpr(is_any_of<OldLayout, GNWC, GKXC, GNWK>::value)
{
return {0, 1, 3, 2};
}
else if constexpr(is_any_of<OldLayout, GNHWC, GKYXC, GNHWK>::value)
{
return {0, 1, 4, 2, 3};
}
else if constexpr(is_any_of<OldLayout, GNDHWC, GKZYXC, GNDHWK>::value)
{
return {0, 1, 5, 2, 3, 4};
}
else if constexpr(is_any_of<OldLayout, NWGC, KXGC, NWGK>::value)
{
return {2, 0, 3, 1};
}
else if constexpr(is_any_of<OldLayout, NHWGC, KYXGC, NHWGK>::value)
{
return {3, 0, 4, 1, 2};
}
else if constexpr(is_any_of<OldLayout, NDHWGC, KZYXGC, NDHWGK>::value)
{
return {4, 0, 5, 1, 2, 3};
}
@@ -83,11 +67,11 @@ template <typename InLayout>
CK_TILE_HOST HostTensorDescriptor
make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvParam& param)
{
using namespace ck_tile::tensor_layout::convolution;
std::vector<std::size_t> physical_lengths;
if constexpr(is_any_of<InLayout, GNCW, GNCHW, GNCDHW>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
@@ -97,9 +81,7 @@ make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvPara
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<InLayout, GNWC, GNHWC, GNDHWC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
@@ -109,9 +91,7 @@ make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvPara
param.input_spatial_lengths_.begin(),
param.input_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<InLayout, NWGC, NHWGC, NDHWGC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.G_),
@@ -139,11 +119,11 @@ template <typename WeiLayout>
CK_TILE_HOST HostTensorDescriptor
make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvParam& param)
{
using namespace ck_tile::tensor_layout::convolution;
std::vector<std::size_t> physical_lengths;
if constexpr(is_any_of<WeiLayout, KXC, KYXC, KZYXC>::value)
{
if(param.G_ != 1)
{
@@ -157,9 +137,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<WeiLayout, GKCX, GKCYX, GKCZYX>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_),
@@ -169,9 +147,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<WeiLayout, GKXC, GKYXC, GKZYXC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.K_),
@@ -181,9 +157,7 @@ make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvPara
param.filter_spatial_lengths_.begin(),
param.filter_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<WeiLayout, KXGC, KYXGC, KZYXGC>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.K_),
static_cast<std::size_t>(param.G_),
@@ -211,11 +185,11 @@ template <typename OutLayout>
CK_TILE_HOST HostTensorDescriptor
make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvParam& param)
{
using namespace ck_tile::tensor_layout::convolution;
std::vector<std::size_t> physical_lengths;
if constexpr(is_any_of<OutLayout, GNKW, GNKHW, GNKDHW>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
@@ -226,9 +200,7 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvPar
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
// separate from legacy code above
else if constexpr(is_any_of<OutLayout, GNWK, GNHWK, GNDHWK>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.G_),
static_cast<std::size_t>(param.N_),
@@ -238,9 +210,7 @@ make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvPar
param.output_spatial_lengths_.begin(),
param.output_spatial_lengths_.begin() + param.num_dim_spatial_);
}
else if constexpr(is_any_of<OutLayout, NWGK, NHWGK, NDHWGK>::value)
{
physical_lengths = std::vector<std::size_t>{static_cast<std::size_t>(param.N_),
static_cast<std::size_t>(param.G_),
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -678,4 +678,43 @@ struct HostTensor
Descriptor mDesc;
Data mData;
};
template <bool is_row_major>
auto host_tensor_descriptor(std::size_t row,
std::size_t col,
std::size_t stride,
bool_constant<is_row_major>)
{
using namespace ck_tile::literals;
if constexpr(is_row_major)
{
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
}
template <bool is_row_major>
auto get_default_stride(std::size_t row,
std::size_t col,
std::size_t stride,
bool_constant<is_row_major>)
{
if(stride == 0)
{
if constexpr(is_row_major)
{
return col;
}
else
{
return row;
}
}
else
return stride;
}
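// Illustrative usage (sizes hypothetical): a 128x64 row-major matrix with the packed
// default stride.
// auto stride = get_default_stride(128, 64, /*stride=*/0, bool_constant<true>{}); // 64
// auto desc   = host_tensor_descriptor(128, 64, stride, bool_constant<true>{});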
} // namespace ck_tile
@@ -73,7 +73,7 @@ void reference_fused_moe(
ck_tile::index_t tokens,
ck_tile::index_t experts,
ck_tile::index_t hidden_size,
ck_tile::index_t intermediate_size, // this size is for gate/up/down
ck_tile::index_t topk,
ck_tile::index_t gate_only)
{
@@ -82,19 +82,8 @@
assert(sorted_expert_ids_host.get_num_of_dimension() == 1);
assert(num_sorted_tiles_host.get_element_size() == 1);
ck_tile::index_t num_sorted_tiles = num_sorted_tiles_host.mData[0] / block_m;
ck_tile::index_t intermediate_size_0 = intermediate_size * (gate_only ? 1 : 2);
ck_tile::index_t intermediate_size_1 = intermediate_size;
ck_tile::HostTensor<AccDataType> out_topk_tokens({tokens, topk, hidden_size});
@@ -105,11 +94,31 @@
if(i_tile >= num_sorted_tiles)
return;
ck_tile::index_t i_expert = sorted_expert_ids_host.mData[i_tile];
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
ck_tile::index_t i_topk = i_token >> 24;
i_token &= 0xffffff;
if(i_token >= tokens)
return;
(void)token_ids_host;
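// Sketch of the mock-id packing decoded above (inferred from the shift/mask, kept as a
// comment for exposition): topk index in the top 8 bits, token index in the low 24 bits,
// i.e. packed = (i_topk << 24) | (i_token & 0xffffff).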
#else
// TODO: better remove this in the future, or modify the token_id value
auto get_topk_id = [&](ck_tile::index_t token_id_, ck_tile::index_t expert_id_) {
for(ck_tile::index_t i_ = 0; i_ < topk; i_++)
{
if(token_ids_host(token_id_, i_) == expert_id_)
return i_;
}
throw std::runtime_error("not correct token/expert pair\n");
return -1; // TODO: not correct!!
};
ck_tile::index_t i_token = sorted_token_ids_host.mData[i_flatten];
if(i_token >= tokens)
return;
ck_tile::index_t i_topk = get_topk_id(i_token, i_expert); // TODO: ugly
#endif
auto weight = sorted_weight_host.mData[i_flatten];
ck_tile::HostTensor<AccDataType> acc_0({1, intermediate_size_0});
// first gemm
...
@@ -8,16 +8,40 @@
namespace ck_tile {
// Note: for simplicity, each functor only cares about a single M
struct reference_rmsnorm2d_default_epilogue
{
template <typename OutDataType, typename AccDataType>
void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
{
const int N = acc.mDesc.get_lengths()[1];
for(int n = 0; n < N; ++n)
{
o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n));
}
}
template <typename OutDataType, typename AccDataType>
auto operator()(int m, const HostTensor<AccDataType>& acc)
{
HostTensor<OutDataType> o(acc.get_lengths(), acc.get_strides());
operator()(m, o, acc);
return o;
}
};
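// Hypothetical custom epilogue (illustrative only) fusing a constant output scale; any
// functor with the same (m, out, acc) call shape can be plugged in as Epilogue below.
// struct reference_rmsnorm2d_scaled_epilogue
// {
//     float scale = 0.5f;
//     template <typename OutDataType, typename AccDataType>
//     void operator()(int m, HostTensor<OutDataType>& o, const HostTensor<AccDataType>& acc)
//     {
//         const int N = acc.mDesc.get_lengths()[1];
//         for(int n = 0; n < N; ++n)
//             o(m, n) = ck_tile::type_convert<OutDataType>(acc(m, n) * scale);
//     }
// };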
template <typename XDataType,
typename GammaDataType,
typename ComputeDataType,
typename YDataType,
typename InvRmsDataType,
typename Epilogue = reference_rmsnorm2d_default_epilogue>
void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
const HostTensor<GammaDataType>& gamma_n,
HostTensor<YDataType>& y_m_n,
HostTensor<InvRmsDataType>& invRms_m,
ComputeDataType epsilon,
Epilogue epilogue_functor = {})
{
auto rmsnorm2d_fwd_func = [&](auto m) {
const int N = x_m_n.mDesc.get_lengths()[1];
@@ -37,13 +61,15 @@ void reference_rmsnorm2d_fwd(const HostTensor<XDataType>& x_m_n,
if constexpr(!std::is_same_v<InvRmsDataType, ck_tile::null_type>)
invRms_m(m) = ck_tile::type_convert<InvRmsDataType>(divisor);
HostTensor<ComputeDataType> acc(x_m_n.get_lengths(), x_m_n.get_strides());
for(int n = 0; n < N; ++n)
{
ComputeDataType x = ck_tile::type_convert<ComputeDataType>(x_m_n(m, n));
ComputeDataType gamma = ck_tile::type_convert<ComputeDataType>(gamma_n(n));
acc(m, n) = x * divisor * gamma;
}
epilogue_functor(m, y_m_n, acc);
};
make_ParallelTensorFunctor(rmsnorm2d_fwd_func, invRms_m.mDesc.get_lengths()[0])(
...
@@ -22,7 +22,7 @@ CK_TILE_HOST void reference_rowwise_quantization2d(const HostTensor<XDataType>&
// scale = amax / 127 for int8
auto v_scale = type_convert<XDataType>(scale_m(m));
auto v_qx = v_x / v_scale;
qx_m_n(m, n) = type_convert<QXDataType>(saturates<QXDataType>{}(v_qx));
}
};
...
@@ -719,7 +719,82 @@ struct Silu
constexpr T one = type_convert<T>(1);
y = x * (one / (one + ck_tile::exp(-x)));
};
template <>
CK_TILE_HOST_DEVICE void operator()<fp32x2_t>(fp32x2_t& y, const fp32x2_t& x) const
{
constexpr auto one = type_convert<float>(1);
y[0] = x[0] * __builtin_amdgcn_rcpf(one + ck_tile::exp(-x[0]));
y[1] = x[1] * __builtin_amdgcn_rcpf(one + ck_tile::exp(-x[1]));
};
};
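// Usage sketch (values illustrative): computes y[i] = x[i] / (1 + exp(-x[i])); the
// fp32x2_t overload uses the fast __builtin_amdgcn_rcpf reciprocal on device.
// fp32x2_t x{1.0f, -2.0f}, y;
// Silu{}(y, x);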
#if 0
// Silu: the formula is not well suited to inline asm (dependency chain).
// We keep the code here on purpose in case people want to try it in the future.
struct SiluAsm
{
template <typename T>
CK_TILE_HOST void operator()(T& y, T& x) const
{
static_assert(std::is_same_v<T, float>, "Data type is not supported by this operation!");
constexpr T one = type_convert<T>(1);
y = x * (one / (one + ck_tile::exp(-x)));
};
template <typename T>
CK_TILE_DEVICE void operator()(T& y, T& x) const
{
static_assert(std::is_same_v<T, float>, "Data type is not supported by this operation!");
const uint32_t log2e_neg_ = 0x3fb8aa3b | 0x80000000; // log2e_v<float> * -1;
// NOTE: x/y can't be in the same register before the inline asm;
// "+v" for y and "v" for x is not enough, x/y may still be put in the same register
T tmp = x;
asm volatile("v_mul_f32 %[v_y], %[s_log2e], %[v_x]\n"
"v_exp_f32 %[v_y], %[v_y]\n"
"s_nop 0 ; hazard for exp\n"
"v_add_f32 %[v_y], %[v_y], 1.0\n"
"v_rcp_f32 %[v_y], %[v_y]\n"
"s_nop 0 ; hazard for rcp\n"
"v_mul_f32 %[v_y], %[v_x], %[v_y]\n"
: [v_y] "+v"(y), [v_x] "+v"(tmp)
: [s_log2e] "s"(log2e_neg_)
:);
};
template <>
CK_TILE_HOST void operator()<fp32x2_t>(fp32x2_t& y, fp32x2_t& x) const
{
constexpr auto one = type_convert<float>(1);
y[0] = x[0] * (one / (one + ck_tile::exp(-x[0])));
y[1] = x[1] * (one / (one + ck_tile::exp(-x[1])));
};
template <>
CK_TILE_DEVICE void operator()<fp32x2_t>(fp32x2_t& y, fp32x2_t& x) const
{
const uint32_t log2e_neg_ = 0x3fb8aa3b | 0x80000000; // log2e_v<float> * -1;
// NOTE: x/y can't be same register before inline asm
// float tmp0 = x[0], tmp1 = x[1];
asm volatile("v_mul_f32 %[v_y0], %[s_log2e], %[v_x0]\n"
"v_mul_f32 %[v_y1], %[s_log2e], %[v_x1]\n"
"v_exp_f32 %[v_y0], %[v_y0]\n"
"v_exp_f32 %[v_y1], %[v_y1]\n"
"v_add_f32 %[v_y0], %[v_y0], 1.0\n"
"v_add_f32 %[v_y1], %[v_y1], 1.0\n"
"v_rcp_f32 %[v_y0], %[v_y0]\n"
"v_rcp_f32 %[v_y1], %[v_y1]\n"
"v_mul_f32 %[v_y0], %[v_x0], %[v_y0]\n"
"v_mul_f32 %[v_y1], %[v_x1], %[v_y1]\n"
: [v_y0] "+v"(y[0]), [v_y1] "+v"(y[1]), [v_x0] "+v"(x[0]), [v_x1] "+v"(x[1])
: [s_log2e] "s"(log2e_neg_)
:);
};
};
#endif
struct TanH
{
...
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
@@ -24,19 +24,19 @@ struct DynamicQuantEpilogueTraits
// this epilogue just stores out an M*N matrix, row major
template <typename AccDataType_,
typename SmoothScaleDataType_,
typename YScaleDataType_,
typename ODataType_,
typename BlockShape_,
typename Traits_>
struct DynamicQuantEpilogueProblem
{
using AccDataType = remove_cvref_t<AccDataType_>;
using SmoothScaleDataType = remove_cvref_t<SmoothScaleDataType_>;
using YScaleDataType = remove_cvref_t<YScaleDataType_>;
using ODataType = remove_cvref_t<ODataType_>;
using BlockShape = remove_cvref_t<BlockShape_>; // can consume a generic 2D shape
using Traits = remove_cvref_t<Traits_>;
};
// TODO: we should put descriptor creation function into policy
@@ -45,7 +45,7 @@ struct DynamicQuantEpilogue
{
using Problem = remove_cvref_t<Problem_>;
using AccDataType = remove_cvref_t<typename Problem::AccDataType>;
using SmoothScaleDataType = remove_cvref_t<typename Problem::SmoothScaleDataType>;
using YScaleDataType = remove_cvref_t<typename Problem::YScaleDataType>;
using ODataType = remove_cvref_t<typename Problem::ODataType>;
using BlockShape = remove_cvref_t<typename Problem::BlockShape>;
@@ -78,7 +78,7 @@ struct DynamicQuantEpilogue
#if 0
// don't remove this
// Note that if we set the encoding purposely like this, it will result in a compile failure
// TODO: sm_scale create local-scratch to accept arbitrary acc input (with same length)
return make_static_tile_distribution(
tile_distribution_encoding<
sequence<S::Repeat_M, S::WarpPerBlock_M, S::ThreadPerWarp_M>,
@@ -105,34 +105,18 @@ struct DynamicQuantEpilogue
return reduce_crosswarp_sync.GetSmemSize();
}
template <typename ODramWindowTmp, typename YScaleWindow, typename OAccTile>
CK_TILE_DEVICE auto Impl(ODramWindowTmp& o_dram_window_tmp,
YScaleWindow& y_scale_window,
const OAccTile& o_acc_tile,
void* smem)
{
auto reduce = GetBlockReduce2d();
auto reduce_sync = GetBlockReduce2dSync();
auto reduce_crosswarp_sync = GetBlockReduce2dCrossWarpSync();
auto o_acc_tmp = o_acc_tile;
const auto f_absmax = [](auto acc_, auto v_0_) { return max(acc_, abs(v_0_)); };
auto row_absmax = [&]() {
@@ -184,5 +168,45 @@ struct DynamicQuantEpilogue
store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_acc_tmp));
}
}
// TODO: this function assumes the store-out vector size is the same as the OAccTile
// last dimension size - how do we fix this?
// Smooth Dynamic Quant
template <typename ODramWindowTmp,
typename SmoothScaleWindow,
typename YScaleWindow,
typename OAccTile>
CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp,
const SmoothScaleWindow& sm_scale_window_,
YScaleWindow& y_scale_window,
const OAccTile& o_acc_tile,
void* smem)
{
const auto sm_scale_window =
make_tile_window(sm_scale_window_, MakeSmoothInputScaleTileDistribution());
auto sm_scale = load_tile(sm_scale_window);
auto o_acc_tmp = o_acc_tile;
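        // scale each column j of the accumulator by its per-column smooth-quant input scale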
sweep_tile(o_acc_tmp, [&](auto idx) {
constexpr auto j_idx = make_tuple(idx[number<1>{}]);
const auto xs_ = type_convert<AccDataType>(sm_scale[j_idx]);
o_acc_tmp(idx) = o_acc_tmp(idx) * xs_;
});
Impl(o_dram_window_tmp, y_scale_window, o_acc_tmp, smem);
}
// Dynamic Quant
template <typename ODramWindowTmp, typename YScaleWindow, typename OAccTile>
CK_TILE_DEVICE auto operator()(ODramWindowTmp& o_dram_window_tmp,
YScaleWindow& y_scale_window,
const OAccTile& o_acc_tile,
void* smem)
{
Impl(o_dram_window_tmp, y_scale_window, o_acc_tile, smem);
}
};
} // namespace ck_tile
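For reference, a minimal sketch of how the two entry points above are intended to be called from a kernel epilogue stage (the call-site names — epilogue, o_window, sm_window, ys_window, acc — are illustrative assumptions, not part of this file):

    // smooth dynamic quant: per-column smooth scale applied before row-wise quantization
    epilogue(o_window, sm_window, ys_window, acc, smem);
    // plain dynamic quant: row-wise absmax quantization only
    epilogue(o_window, ys_window, acc, smem);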
...@@ -234,10 +234,153 @@ struct Flatmm_32x512x128_1x4x1_16x16x32_Base // for f16/bf16
    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetSmemSize()
    {
        // return 32 * (128 + 8) * sizeof(bf16_t);
        return MakeLdsLoadDesc_A().get_element_space_size() * sizeof(bf16_t) * 2; // 2 lds buffers
    }
};
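As a sanity check — assuming MakeLdsLoadDesc_A() keeps the 32 x (128 + 8) padded bf16 element layout that the commented-out line encodes — the double-buffered size works out to (a sketch, not part of the source):

    // per buffer: 32 rows * (128 + 8 pad) elements * 2 bytes = 8704 bytes
    // two LDS buffers for ping-pong prefetch: 2 * 8704 = 17408 bytes
    static_assert(32 * (128 + 8) * sizeof(bf16_t) * 2 == 17408);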
// clang-format off
#define _EXPAND_ASM_ARGS_OUT_ONE_ACC \
[s_loop_cnt]"+s"(loop_cnt), \
[v_acc_0]"+v"(v_acc[0]), \
[v_acc_1]"+v"(v_acc[1]), \
[v_acc_2]"+v"(v_acc[2]), \
[v_acc_3]"+v"(v_acc[3]), \
[v_acc_4]"+v"(v_acc[4]), \
[v_acc_5]"+v"(v_acc[5]), \
[v_acc_6]"+v"(v_acc[6]), \
[v_acc_7]"+v"(v_acc[7]), \
[v_acc_8]"+v"(v_acc[8]), \
[v_acc_9]"+v"(v_acc[9]), \
[v_acc_10]"+v"(v_acc[10]), \
[v_acc_11]"+v"(v_acc[11]), \
[v_acc_12]"+v"(v_acc[12]), \
[v_acc_13]"+v"(v_acc[13]), \
[v_acc_14]"+v"(v_acc[14]), \
[v_acc_15]"+v"(v_acc[15]), \
[s_mem_]"+r"(smem)
#define _EXPAND_ASM_ARGS_OUT_TWO_ACC \
[s_loop_cnt]"+s"(loop_cnt), \
[v_acc_0]"+v"(v_acc[0]), \
[v_acc_1]"+v"(v_acc[1]), \
[v_acc_2]"+v"(v_acc[2]), \
[v_acc_3]"+v"(v_acc[3]), \
[v_acc_4]"+v"(v_acc[4]), \
[v_acc_5]"+v"(v_acc[5]), \
[v_acc_6]"+v"(v_acc[6]), \
[v_acc_7]"+v"(v_acc[7]), \
[v_acc_8]"+v"(v_acc[8]), \
[v_acc_9]"+v"(v_acc[9]), \
[v_acc_10]"+v"(v_acc[10]), \
[v_acc_11]"+v"(v_acc[11]), \
[v_acc_12]"+v"(v_acc[12]), \
[v_acc_13]"+v"(v_acc[13]), \
[v_acc_14]"+v"(v_acc[14]), \
[v_acc_15]"+v"(v_acc[15]), \
[v_acc_16]"+v"(v_acc[16]), \
[v_acc_17]"+v"(v_acc[17]), \
[v_acc_18]"+v"(v_acc[18]), \
[v_acc_19]"+v"(v_acc[19]), \
[v_acc_20]"+v"(v_acc[20]), \
[v_acc_21]"+v"(v_acc[21]), \
[v_acc_22]"+v"(v_acc[22]), \
[v_acc_23]"+v"(v_acc[23]), \
[v_acc_24]"+v"(v_acc[24]), \
[v_acc_25]"+v"(v_acc[25]), \
[v_acc_26]"+v"(v_acc[26]), \
[v_acc_27]"+v"(v_acc[27]), \
[v_acc_28]"+v"(v_acc[28]), \
[v_acc_29]"+v"(v_acc[29]), \
[v_acc_30]"+v"(v_acc[30]), \
[v_acc_31]"+v"(v_acc[31]), \
[s_mem_]"+r"(smem)
#define _EXPAND_ASM_ARGS_IN \
[s_res_a0]"s"(res_a[0]), \
[s_res_a1]"s"(res_a[1]), \
[s_res_a2]"s"(res_a[2]), \
[s_res_a3]"s"(res_a[3]), \
[s_res_b0]"s"(res_b[0]), \
[s_res_b1]"s"(res_b[1]), \
[s_res_b2]"s"(res_b[2]), \
[s_res_b3]"s"(res_b[3]), \
[v_os_a0]"v"(static_cast<index_t>(cached_coords_a[number<0>{}] * sizeof(ADataType))), \
[v_os_a1]"v"(static_cast<index_t>(cached_coords_a[number<1>{}] * sizeof(ADataType))), \
[v_os_a2]"v"(static_cast<index_t>(cached_coords_a[number<2>{}] * sizeof(ADataType))), \
[v_os_a3]"v"(static_cast<index_t>(cached_coords_a[number<3>{}] * sizeof(ADataType))), \
[v_os_a4]"v"(static_cast<index_t>(cached_coords_a[number<4>{}] * sizeof(ADataType))), \
[v_os_a5]"v"(static_cast<index_t>(cached_coords_a[number<5>{}] * sizeof(ADataType))), \
[v_os_a6]"v"(static_cast<index_t>(cached_coords_a[number<6>{}] * sizeof(ADataType))), \
[v_os_a7]"v"(static_cast<index_t>(cached_coords_a[number<7>{}] * sizeof(ADataType))), \
\
[v_os_b0]"v"(static_cast<index_t>(cached_coords_b[number<0>{}] * sizeof(BDataType))), \
[v_os_b1]"v"(static_cast<index_t>(cached_coords_b[number<1>{}] * sizeof(BDataType))), \
[v_os_b2]"v"(static_cast<index_t>(cached_coords_b[number<2>{}] * sizeof(BDataType))), \
[v_os_b3]"v"(static_cast<index_t>(cached_coords_b[number<3>{}] * sizeof(BDataType))), \
[v_os_b4]"v"(static_cast<index_t>(cached_coords_b[number<4>{}] * sizeof(BDataType))), \
[v_os_b5]"v"(static_cast<index_t>(cached_coords_b[number<5>{}] * sizeof(BDataType))), \
[v_os_b6]"v"(static_cast<index_t>(cached_coords_b[number<6>{}] * sizeof(BDataType))), \
[v_os_b7]"v"(static_cast<index_t>(cached_coords_b[number<7>{}] * sizeof(BDataType))), \
\
[v_os_slda]"v"(static_cast<index_t>(a_sld.cached_coords_[number<0>{}].get_offset() * sizeof(ADataType))),\
[s_m0_init]"s"(m0_init_value), \
[s_size_per_issue]"s"(size_per_issue), \
[smem_sz]"n"(smem_buf_size), \
[sld_os_0]"n"(sld_os[number<0>{}].value), \
[sld_os_1]"n"(sld_os[number<1>{}].value), \
[sld_os_2]"n"(sld_os[number<2>{}].value), \
[sld_os_3]"n"(sld_os[number<3>{}].value), \
[sld_os_4]"n"(sld_os[number<4>{}].value), \
[sld_os_5]"n"(sld_os[number<5>{}].value), \
[sld_os_6]"n"(sld_os[number<6>{}].value), \
[sld_os_7]"n"(sld_os[number<7>{}].value), \
[s_tile_os_a]"s"(tile_offset_a_bytes), \
[s_tile_os_b]"s"(tile_offset_b_bytes)
#define _EXPAND_ASM_ARGS_CLOBBER \
"memory", "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", "a8", "a9", \
"a10", "a11", "a12", "a13", "a14", "a15", "a16", "a17", "a18", "a19", \
"a20", "a21", "a22", "a23", "a24", "a25", "a26", "a27", "a28", "a29", \
"a30", "a31", "a32", "a33", "a34", "a35", "a36", "a37", "a38", "a39", \
"a40", "a41", "a42", "a43", "a44", "a45", "a46", "a47", "a48", "a49", \
"a50", "a51", "a52", "a53", "a54", "a55", "a56", "a57", "a58", "a59", \
"a60", "a61", "a62", "a63", "a64", "a65", "a66", "a67", "a68", "a69", \
"a70", "a71", "a72", "a73", "a74", "a75", "a76", "a77", "a78", "a79", \
"a80", "a81", "a82", "a83", "a84", "a85", "a86", "a87", "a88", "a89", \
"a90", "a91", "a92", "a93", "a94", "a95", "a96", "a97", "a98", "a99", \
"a100", "a101", "a102", "a103", "a104", "a105", "a106", "a107", \
"a108", "a109", "a110", "a111", "a112", "a113", "a114", "a115", \
"a116", "a117", "a118", "a119", "a120", "a121", "a122", "a123", \
"a124", "a125", "a126", "a127", "a128", "a129", "a130", "a131", \
"a132", "a133", "a134", "a135", "a136", "a137", "a138", "a139", \
"a140", "a141", "a142", "a143", "a144", "a145", "a146", "a147", \
"a148", "a149", "a150", "a151", "a152", "a153", "a154", "a155", \
"a156", "a157", "a158", "a159", "a160", "a161", "a162", "a163", \
"a164", "a165", "a166", "a167", "a168", "a169", "a170", "a171", \
"a172", "a173", "a174", "a175", "a176", "a177", "a178", "a179", \
"a180", "a181", "a182", "a183", "a184", "a185", "a186", "a187", \
"a188", "a189", "a190", "a191", "a192", "a193", "a194", "a195", \
"a196", "a197", "a198", "a199", "a200", "a201", "a202", "a203", \
"a204", "a205", "a206", "a207", "a208", "a209", "a210", "a211", \
"a212", "a213", "a214", "a215", "a216", "a217", "a218", "a219", \
"a220", "a221", "a222", "a223", "a224", "a225", "a226", "a227", \
"a228", "a229", "a230", "a231", "a232", "a233", "a234", "a235", \
"a236", "a237", "a238", "a239", "a240", "a241", "a242", "a243", \
"a244", "a245", "a246", "a247", "a248", "a249", "a250", "a251", \
"a252", "a253", "a254", "a255", \
"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", \
"s86", \
"v64", "v65", "v66", "v67", "v68", "v69", \
"v70", "v71", "v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79", \
"v80", "v81", "v82", "v83", "v84", "v85", "v86", "v87", "v88", "v89", \
"v90", "v91", "v92", "v93", "v94", "v95", "v96", "v97", "v98", "v99", \
"v100", "v101", "v102", "v103", "v104", "v105", "v106", "v107", \
"v108", "v109", "v110", "v111", "v112", "v113", "v114", "v115", \
"v116", "v117", "v118", "v119", "v120", "v121", "v122", "v123", \
"v124", "v125", "v126", "v127"
// clang-format on
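The three macros above only factor out the operand and clobber lists that the BF16 and FP16 micro-kernels below would otherwise repeat verbatim; the intended expansion shape is (a sketch of the pattern used below):

    asm volatile(
        // ... micro-kernel body included from uk/*.inc ...
        : _EXPAND_ASM_ARGS_OUT_ONE_ACC // outputs: loop counter, 16 acc VGPR vectors, smem pointer
        : _EXPAND_ASM_ARGS_IN          // inputs: buffer resources, per-thread offsets, LDS constants
        : _EXPAND_ASM_ARGS_CLOBBER);   // clobbers: all 256 AGPRs plus scratch SGPRs/VGPRs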
struct Flatmm_32x512x128_1x4x1_16x16x32_BF16 : public Flatmm_32x512x128_1x4x1_16x16x32_Base
{
    using ADataType = bf16_t;
...@@ -245,7 +388,9 @@ struct Flatmm_32x512x128_1x4x1_16x16x32_BF16 : public Flatmm_32x512x128_1x4x1_16
    // TODO: needs to be paired with tile_window_linear!
    // TODO: init_raw() must be called before calling this function!
    // Is2B: originally the B matrix has 2 prefetch buffers. If this is set to true,
    // one A matrix can serve two B matrices, B0/B1; each of B0/B1 keeps the same tile size
    template <typename ARes, typename ACoords, typename BRes, typename BCoords, bool Is2B = false>
    CK_TILE_DEVICE auto
    operator()(const ARes& res_a,
               const ACoords& cached_coords_a,
...@@ -254,7 +399,8 @@ struct Flatmm_32x512x128_1x4x1_16x16x32_BF16 : public Flatmm_32x512x128_1x4x1_16
               CK_TILE_LDS_ADDR void* smem,
               index_t k,
               index_t tile_offset_a, // for each tile, the offset to move for each unroll
               index_t tile_offset_b, // for each tile, the offset to move for each unroll
               bool_constant<Is2B> = {})
    {
        static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 2 /*2x per dword*/); // 8
        static_assert(BCoords::size() == Repeat_N);
...@@ -299,129 +445,78 @@ struct Flatmm_32x512x128_1x4x1_16x16x32_BF16 : public Flatmm_32x512x128_1x4x1_16
        index_t loop_cnt = k / Block_K;
        if constexpr(Is2B)
        {
            // this is the acc thread buffer
            fp32x4_t v_acc[32]{.0f};

            // B nr->kr
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
            // clang-format off
            asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#define CK_TILE_FLATMM_UK_2B 1
#include "uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc"
                : _EXPAND_ASM_ARGS_OUT_TWO_ACC
                : _EXPAND_ASM_ARGS_IN,
                  [s_res_b4]"s"(res_b[4]),
                  [s_res_b5]"s"(res_b[5]),
                  [s_res_b6]"s"(res_b[6]),
                  [s_res_b7]"s"(res_b[7])
                : _EXPAND_ASM_ARGS_CLOBBER, "s24", "s25", "s26", "s27"
            );
            // clang-format on
#pragma clang diagnostic pop

            // return local scratch
            auto c = make_tuple(MakeCBlockTile(), MakeCBlockTile());
            for(auto i = 0; i < 16; i++)
            {
                c.at(number<0>{}).get_thread_buffer()[4 * i + 0] = v_acc[i].x;
                c.at(number<0>{}).get_thread_buffer()[4 * i + 1] = v_acc[i].y;
                c.at(number<0>{}).get_thread_buffer()[4 * i + 2] = v_acc[i].z;
                c.at(number<0>{}).get_thread_buffer()[4 * i + 3] = v_acc[i].w;
            }
            for(auto i = 0; i < 16; i++)
            {
                c.at(number<1>{}).get_thread_buffer()[4 * i + 0] = v_acc[16 + i].x;
                c.at(number<1>{}).get_thread_buffer()[4 * i + 1] = v_acc[16 + i].y;
                c.at(number<1>{}).get_thread_buffer()[4 * i + 2] = v_acc[16 + i].z;
                c.at(number<1>{}).get_thread_buffer()[4 * i + 3] = v_acc[16 + i].w;
            }
            return c;
        }
        else
        {
            // this is the acc thread buffer
            fp32x4_t v_acc[16]{.0f};

            // B nr->kr
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
            // clang-format off
            asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#include "uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc"
                : _EXPAND_ASM_ARGS_OUT_ONE_ACC
                : _EXPAND_ASM_ARGS_IN
                : _EXPAND_ASM_ARGS_CLOBBER
            );
            // clang-format on
#pragma clang diagnostic pop

            // return local scratch
            auto c = MakeCBlockTile();
            for(auto i = 0; i < 16; i++)
            {
                c.get_thread_buffer()[4 * i + 0] = v_acc[i].x;
                c.get_thread_buffer()[4 * i + 1] = v_acc[i].y;
                c.get_thread_buffer()[4 * i + 2] = v_acc[i].z;
                c.get_thread_buffer()[4 * i + 3] = v_acc[i].w;
            }
            return c;
        }
    }
};
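A minimal sketch of the two calling modes (a hypothetical call site; uk, res_a, coords_a, res_b, res_b0b1, coords_b, tile_os_a, tile_os_b are illustrative names, not from this file):

    Flatmm_32x512x128_1x4x1_16x16x32_BF16 uk{};
    // default path: one B tile stream, returns a single C block tile (16 acc registers)
    auto c  = uk(res_a, coords_a, res_b, coords_b, smem, k, tile_os_a, tile_os_b);
    // Is2B path: the B resource carries B0/B1 (res_b[0..7]); returns a tuple of two C block tiles
    auto cc = uk(res_a, coords_a, res_b0b1, coords_b, smem, k, tile_os_a, tile_os_b,
                 bool_constant<true>{});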
...@@ -432,7 +527,7 @@ struct Flatmm_32x512x128_1x4x1_16x16x32_FP16 : public Flatmm_32x512x128_1x4x1_16
    // TODO: needs to be paired with tile_window_linear!
    // TODO: init_raw() must be called before calling this function!
    template <typename ARes, typename ACoords, typename BRes, typename BCoords, bool Is2B = false>
    CK_TILE_DEVICE auto
    operator()(const ARes& res_a,
               const ACoords& cached_coords_a,
...@@ -441,7 +536,8 @@ struct Flatmm_32x512x128_1x4x1_16x16x32_FP16 : public Flatmm_32x512x128_1x4x1_16
               CK_TILE_LDS_ADDR void* smem,
               index_t k,
               index_t tile_offset_a, // for each tile, the offset to move for each unroll
               index_t tile_offset_b, // for each tile, the offset to move for each unroll
               bool_constant<Is2B> = {})
    {
        static_assert(ACoords::size() == Block_M * Block_K / BlockSize / 2 /*2x per dword*/); // 8
        static_assert(BCoords::size() == Repeat_N);
...@@ -486,130 +582,82 @@ struct Flatmm_32x512x128_1x4x1_16x16x32_FP16 : public Flatmm_32x512x128_1x4x1_16
        index_t loop_cnt = k / Block_K;
        if constexpr(Is2B)
        {
            // this is the acc thread buffer
            fp32x4_t v_acc[32]{.0f};

            // B nr->kr
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
            // clang-format off
            asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
#define CK_TILE_FLATMM_UK_2B 1
#include "uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc"
                : _EXPAND_ASM_ARGS_OUT_TWO_ACC
                : _EXPAND_ASM_ARGS_IN,
                  [s_res_b4]"s"(res_b[4]),
                  [s_res_b5]"s"(res_b[5]),
                  [s_res_b6]"s"(res_b[6]),
                  [s_res_b7]"s"(res_b[7])
                : _EXPAND_ASM_ARGS_CLOBBER, "s24", "s25", "s26", "s27"
            );
            // clang-format on
#pragma clang diagnostic pop

            // return local scratch
            auto c = make_tuple(MakeCBlockTile(), MakeCBlockTile());
            for(auto i = 0; i < 16; i++)
            {
                c.at(number<0>{}).get_thread_buffer()[4 * i + 0] = v_acc[i].x;
                c.at(number<0>{}).get_thread_buffer()[4 * i + 1] = v_acc[i].y;
                c.at(number<0>{}).get_thread_buffer()[4 * i + 2] = v_acc[i].z;
                c.at(number<0>{}).get_thread_buffer()[4 * i + 3] = v_acc[i].w;
            }
            for(auto i = 0; i < 16; i++)
            {
                c.at(number<1>{}).get_thread_buffer()[4 * i + 0] = v_acc[16 + i].x;
                c.at(number<1>{}).get_thread_buffer()[4 * i + 1] = v_acc[16 + i].y;
                c.at(number<1>{}).get_thread_buffer()[4 * i + 2] = v_acc[16 + i].z;
                c.at(number<1>{}).get_thread_buffer()[4 * i + 3] = v_acc[16 + i].w;
            }
            return c;
        }
        else
        {
            // this is the acc thread buffer
            fp32x4_t v_acc[16]{.0f};

            // B nr->kr
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Winline-asm"
            // clang-format off
            asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
#include "uk/flatmm_uk_gfx9_32x512x128_1x1x1_16x16x16.inc"
                : _EXPAND_ASM_ARGS_OUT_ONE_ACC
                : _EXPAND_ASM_ARGS_IN
                : _EXPAND_ASM_ARGS_CLOBBER
            );
            // clang-format on
#pragma clang diagnostic pop

            // return local scratch
            auto c = MakeCBlockTile();
            for(auto i = 0; i < 16; i++)
            {
                c.get_thread_buffer()[4 * i + 0] = v_acc[i].x;
                c.get_thread_buffer()[4 * i + 1] = v_acc[i].y;
                c.get_thread_buffer()[4 * i + 2] = v_acc[i].z;
                c.get_thread_buffer()[4 * i + 3] = v_acc[i].w;
            }
            return c;
        }
    }
};
#undef _EXPAND_ASM_ARGS_OUT_ONE_ACC
#undef _EXPAND_ASM_ARGS_OUT_TWO_ACC
#undef _EXPAND_ASM_ARGS_IN
#undef _EXPAND_ASM_ARGS_CLOBBER
} // namespace ck_tile
...@@ -65,7 +65,8 @@ struct FlatmmSn_32x128x512_1x4x1_16x16x32_Base
        // in LDS we need to store as
        // M0(2)* N0(2) * Nl(4) * Nw(4) * (Mw(16)*Nv(4) + 4)
        //  y      y      wave-id  lid/16   lid%16   v
        constexpr index_t nbufs = 2;
        return 2 * 2 * 4 * 4 * (16 * 4 + 4) * sizeof(bf16_t) * nbufs;
    }
};
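For concreteness, with bf16_t being 2 bytes this evaluates to (a worked check, not part of the source):

    // 2 * 2 * 4 * 4 = 64 sub-tiles, each (16*4 + 4) = 68 bf16 elements (4 elements of padding)
    // 64 * 68 * 2 bytes = 8704 bytes per buffer; * 2 buffers = 17408 bytes of LDS
    static_assert(2 * 2 * 4 * 4 * (16 * 4 + 4) * sizeof(bf16_t) * 2 == 17408);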
...@@ -173,7 +174,6 @@ struct FlatmmSn_32x128x512_1x4x1_16x16x32_BF16 : public FlatmmSn_32x128x512_1x4x
        asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_BF16
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc"
            :[smem_]"+r"(smem),
             [s_loop_cnt]"+s"(loop_cnt),
             [c0]"+v" (v_c0),
...@@ -418,7 +418,6 @@ struct FlatmmSn_32x128x512_1x4x1_16x16x32_FP16 : public FlatmmSn_32x128x512_1x4x
        asm volatile(
#define CK_TILE_FLATMM_UK_MFMA CK_TILE_FLATMM_UK_MFMA_FP16
#include "uk/flatmm_sn_uk_gfx9_32x128x512_1x4x1_16x16x16.inc"
            :[smem_]"+r"(smem),
             [s_loop_cnt]"+s"(loop_cnt),
             [c0]"+v" (v_c0),
...