Commit f6ec737c authored by Chao Liu's avatar Chao Liu

changed blockwise and threadwise gemm to use new tensor descriptor

parent a7c587ee
#ifndef CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#define CK_DUMMY_DYNAMIC_TRANSFORM_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
namespace ck {
template <typename... Wei, typename... In, typename... Out>
__host__ __device__ constexpr auto
map_convolution_into_gemm_fwd_v4r4(const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(I1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(I2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(I3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
// input tensor
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(I2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(I3);
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(
DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{make_multi_index(Y, Ho), make_multi_index(ConvDilationH, ConvStrideH)},
DynamicEmbed<2>{make_multi_index(X, Wo), make_multi_index(ConvDilationW, ConvStrideW)}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmktotal_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
const index_t gemmktotal = in_gemmktotal_gemmn_global_desc.GetLength(I0);
const index_t gemmn = in_gemmktotal_gemmn_global_desc.GetLength(I1);
constexpr index_t GemmKPack = 8;
const index_t gemmk = gemmktotal / GemmKPack;
const auto in_gemmk_gemmn_gemmkpack_global_desc = transform_dynamic_tensor_descriptor(
in_gemmktotal_gemmn_global_desc,
make_tuple(DynamicUnMerge<2>{make_multi_index(gemmk, GemmKPack)},
DynamicPassThrough{gemmn}),
make_tuple(Sequence<0>{}, Sequence<1>{}),
make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
static_assert(decltype(in_gemmk_gemmn_gemmkpack_global_desc)::GetNumOfDimension() == 3,
"wrong!");
return make_tuple(in_gemmk_gemmn_gemmkpack_global_desc);
}
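// Worked example (hypothetical sizes, for illustration only): with N = 1, C = 16,
// Hi = Wi = 28, K = 32, Y = X = 3, unit strides/dilations and 1-element pads on each side,
// the padded input is 30 x 30 and Ho = Wo = 28. The transforms above then give
// GemmKTotal = C * Y * X = 144 and GemmN = N * Ho * Wo = 784; with GemmKPack = 8 the final
// descriptor has lengths (GemmK, GemmN, GemmKPack) = (18, 784, 8), i.e. the 3 dimensions
// checked by the static_assert.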
#if 0
template <typename... Wei, typename... In, typename... Out>
__host__ __device__ constexpr auto map_convolution_into_gemm_bwd_v4r1(
const DynamicTensorDescriptor<Wei...>& wei_k_c_y_x_global_desc,
const DynamicTensorDescriptor<In...>& in_n_c_hi_wi_global_desc,
const DynamicTensorDescriptor<Out...>& out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(I1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(I2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(I3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
#if !CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
constexpr bool out_skip_out_of_bound_check = false;
#else
constexpr bool out_skip_out_of_bound_check = true;
#endif
constexpr auto out_n_k_ydot_htilda_xdot_wtilda_global_desc = transform_tensor_descriptor(
out_n_k_ho_wo_global_desc,
make_tuple(PassThrough{N},
PassThrough{K},
Embed<2>{make_multi_index(YDot, HTilda), make_multi_index(-ConvDilationH / GcdStrideDilationH, 1)},
Embed<2>{make_multi_index(XDot, WTilda), make_multi_index(-ConvDilationW / GcdStrideDilationW, 1)}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
constexpr auto out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc =
transform_tensor_descriptor(
out_n_k_ydot_htilda_xdot_wtilda_global_desc,
make_tuple(PassThrough{N},
PassThrough{K},
PassThrough{YDot},
PassThrough{XDot},
Slice<Sequence<HTilda, WTilda>,
Sequence<iHTildaLeft, iWTildaLeft>,
Sequence<iHTildaRight, iWTildaRight>>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}, Sequence<3, 5>{}));
constexpr auto out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc =
transform_tensor_descriptor(
out_n_k_ydot_htildaslice_xdot_wtildaslice_global_desc,
make_tuple(
PassThrough<N>{},
PassThrough<K>{},
PassThrough<HTildaSlice>{},
PassThrough<WTildaSlice>{},
Slice<Sequence<YDot, XDot>, Sequence<0, 0>, Sequence<YDotSlice, XDotSlice>>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{}),
make_tuple(
Sequence<0>{}, Sequence<1>{}, Sequence<3>{}, Sequence<5>{}, Sequence<2, 4>{}));
constexpr auto out_gemmk_gemmn_global_desc = transform_tensor_descriptor(
out_n_k_ydotslice_htildaslice_xdotslice_wtildaslice_global_desc,
make_tuple(Merge<Sequence<K, YDotSlice, XDotSlice>>{},
Merge<Sequence<N, HTildaSlice, WTildaSlice>>{}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return make_tuple(out_gemmk_gemmn_global_desc);
}
#endif
template <index_t BlockSize>
struct DummyDynamicTransform_1
{
template <typename WeiDesc, typename InDesc, typename OutDesc>
__device__ void Run_1(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
const auto transformed_tensor_descs =
map_convolution_into_gemm(move(wei_k_c_y_x_global_desc),
move(in_n_c_hi_wi_global_desc),
move(out_n_k_ho_wo_global_desc),
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto in_gemmk_gemmn_global_desc = transformed_tensor_descs.At(Number<0>{});
MultiIndex<2> idx;
// initialize idx
static_for<0, 2, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
const index_t niter = p_wei_global[10];
auto in_gemmk_gemmn_coord = make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, idx);
const auto in_gemmk_gemmn_coord_iterator = make_dynamic_tensor_coordinate_iterator(
in_gemmk_gemmn_global_desc, make_multi_index(1, 0));
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate(
in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord, in_gemmk_gemmn_coord_iterator);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
in_gemmk_gemmn_coord.GetOffset(),
#if 1
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_gemmk_gemmn_global_desc, in_gemmk_gemmn_coord),
#else
true,
#endif
in_gemmk_gemmn_global_desc.GetElementSpaceSize());
}
}
template <typename WeiDesc, typename InDesc, typename OutDesc>
__device__ void Run_2(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
constexpr auto i0 = Number<0>{};
constexpr auto i1 = Number<1>{};
constexpr auto i2 = Number<2>{};
constexpr auto i3 = Number<3>{};
const index_t N = in_n_c_hi_wi_global_desc.GetLength(i0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(i1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(i1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(i2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(i3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(i2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(i3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(i2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(i3);
const index_t ConvStrideH = conv_strides[i0];
const index_t ConvStrideW = conv_strides[i1];
const index_t ConvDilationH = conv_dilations[i0];
const index_t ConvDilationW = conv_dilations[i1];
const index_t InLeftPadH = in_left_pads[i0];
const index_t InLeftPadW = in_left_pads[i1];
const index_t InRightPadH = in_right_pads[i0];
const index_t InRightPadW = in_right_pads[i1];
#if 0
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
move(in_n_c_hi_wi_global_desc),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicPassThrough{Hi},
DynamicPassThrough{Wi}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
#elif 0
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
move(in_n_c_hi_wi_global_desc),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
#else
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
move(in_n_c_hi_wi_global_desc),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
#endif
MultiIndex<4> idx;
// initialize idx
static_for<0, 4, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
#if 1
const index_t niter = p_wei_global[10];
auto in_coord = make_dynamic_tensor_coordinate(in_n_c_hip_wip_global_desc, idx);
const auto in_coord_iterator = make_dynamic_tensor_coordinate_iterator(
in_n_c_hip_wip_global_desc, make_multi_index(1, 0, 0, 0));
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate(in_n_c_hip_wip_global_desc, in_coord, in_coord_iterator);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
in_coord.GetOffset(),
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_n_c_hip_wip_global_desc, in_coord),
in_n_c_hip_wip_global_desc.GetElementSpaceSize());
}
#else
// write
// auto in_coord = make_dynamic_tensor_coordinate(in_n_c_hi_wi_global_desc, idx);
p_out_global[in_n_c_hip_wip_global_desc.CalculateOffset(idx)] = 1;
#endif
}
template <typename WeiDesc, typename InDesc, typename OutDesc>
__device__ void Run(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
Run_2(p_wei_global,
p_in_global,
p_out_global,
wei_k_c_y_x_global_desc,
in_n_c_hi_wi_global_desc,
out_n_k_ho_wo_global_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
}
};
template <index_t BlockSize>
struct DummyDynamicTransform_fwd_v4r4
{
template <typename TransformInDesc>
__device__ void Run(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const TransformInDesc in_gemmk_gemmn_gemmkpack_global_desc) const
{
MultiIndex<3> idx;
// initialize idx
static_for<0, 3, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
const index_t niter = p_wei_global[10];
auto in_gemmk_gemmn_gemmkpack_coord =
make_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc, idx);
const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1 =
make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
make_multi_index(0, 0, 1));
const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0 =
make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
make_multi_index(0, 1, 0));
const auto in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0 =
make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
make_multi_index(1, 0, 0));
// move (0, 0, 1)
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
in_gemmk_gemmn_gemmkpack_coord,
in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(
&value,
0,
true,
1,
p_out_global,
in_gemmk_gemmn_gemmkpack_coord.GetOffset(),
#if 1
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_gemmk_gemmn_gemmkpack_global_desc, in_gemmk_gemmn_gemmkpack_coord),
#else
true,
#endif
in_gemmk_gemmn_gemmkpack_global_desc.GetElementSpaceSize());
}
// move (0, 1, 0)
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
in_gemmk_gemmn_gemmkpack_coord,
in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(
&value,
0,
true,
1,
p_out_global,
in_gemmk_gemmn_gemmkpack_coord.GetOffset(),
#if 1
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_gemmk_gemmn_gemmkpack_global_desc, in_gemmk_gemmn_gemmkpack_coord),
#else
true,
#endif
in_gemmk_gemmn_gemmkpack_global_desc.GetElementSpaceSize());
}
// move (1, 0, 0)
for(index_t iter = 0; iter < niter; ++iter)
{
move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
in_gemmk_gemmn_gemmkpack_coord,
in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0);
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(
&value,
0,
true,
1,
p_out_global,
in_gemmk_gemmn_gemmkpack_coord.GetOffset(),
#if 1
coordinate_has_valid_offset_assuming_visible_index_is_valid(
in_gemmk_gemmn_gemmkpack_global_desc, in_gemmk_gemmn_gemmkpack_coord),
#else
true,
#endif
in_gemmk_gemmn_gemmkpack_global_desc.GetElementSpaceSize());
}
}
};
} // namespace ck
#endif
#ifndef CK_DUMMY_DYNAMIC_TRANSFORM_V1_HPP
#define CK_DUMMY_DYNAMIC_TRANSFORM_V1_HPP
#include "common_header.hpp"
#include "dynamic_tensor_descriptor_v1.hpp"
#include "dynamic_tensor_descriptor_helper_v1.hpp"
#include "dynamic_tensor_coordinate_v1.hpp"
namespace ck {
template <typename WeiDesc, typename InDesc, typename OutDesc>
__host__ __device__ constexpr auto
map_convolution_into_gemm_v1(const WeiDesc& wei_k_c_y_x_global_desc,
const InDesc& in_n_c_hi_wi_global_desc,
const OutDesc& out_n_k_ho_wo_global_desc,
const MultiIndex<2>& conv_strides,
const MultiIndex<2>& conv_dilations,
const MultiIndex<2>& in_left_pads,
const MultiIndex<2>& in_right_pads)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(I1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(I2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(I3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
// input tensor
const auto in_n_c_hip_wip_global_desc = transform_dynamic_tensor_descriptor(
transform_dynamic_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicLeftPad{Hi, InLeftPadH},
DynamicLeftPad{Wi, InLeftPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})),
make_tuple(DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicRightPad{Hi + InLeftPadH, InRightPadH},
DynamicRightPad{Wi + InLeftPadW, InRightPadW}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
const index_t Hip = in_n_c_hip_wip_global_desc.GetLength(I2);
const index_t Wip = in_n_c_hip_wip_global_desc.GetLength(I3);
const auto in_n_c_y_ho_x_wo_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(
DynamicPassThrough{N},
DynamicPassThrough{C},
DynamicEmbed<2>{make_multi_index(Y, Ho), make_multi_index(ConvDilationH, ConvStrideH)},
DynamicEmbed<2>{make_multi_index(X, Wo), make_multi_index(ConvDilationW, ConvStrideW)}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
const auto in_gemmk_gemmn_global_desc = transform_dynamic_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(DynamicMerge<3>{make_multi_index(C, Y, X)},
DynamicMerge<3>{make_multi_index(N, Ho, Wo)}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
return make_tuple(in_gemmk_gemmn_global_desc);
}
template <index_t BlockSize>
struct DummyDynamicTransform_v1
{
template <typename WeiDesc, typename InDesc, typename OutDesc>
__device__ void Run_1(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
#if 1
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t K = out_n_k_ho_wo_global_desc.GetLength(I1);
const index_t Y = wei_k_c_y_x_global_desc.GetLength(I2);
const index_t X = wei_k_c_y_x_global_desc.GetLength(I3);
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t Ho = out_n_k_ho_wo_global_desc.GetLength(I2);
const index_t Wo = out_n_k_ho_wo_global_desc.GetLength(I3);
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
#else
const index_t N = in_n_c_hi_wi_global_desc.GetLength(I0);
const index_t C = in_n_c_hi_wi_global_desc.GetLength(I1);
const index_t Y = 3;
const index_t X = 3;
const index_t Hi = in_n_c_hi_wi_global_desc.GetLength(I2);
const index_t Wi = in_n_c_hi_wi_global_desc.GetLength(I3);
const index_t ConvStrideH = conv_strides[I0];
const index_t ConvStrideW = conv_strides[I1];
const index_t ConvDilationH = conv_dilations[I0];
const index_t ConvDilationW = conv_dilations[I1];
const index_t InLeftPadH = in_left_pads[I0];
const index_t InLeftPadW = in_left_pads[I1];
const index_t InRightPadH = in_right_pads[I0];
const index_t InRightPadW = in_right_pads[I1];
#endif
// define transform
// pass through
auto f_lower_idx_diff_passthrough = [](index_t& idx_low_diff, const index_t& idx_up_diff) {
idx_low_diff = idx_up_diff;
};
// pad
auto f_lower_idx_diff_pad = [](index_t& idx_low_diff, const index_t& idx_up_diff) {
idx_low_diff = idx_up_diff;
};
// embed
auto f_lower_idx_diff_embed = [](index_t& idx_low_diff,
const index_t& idx_up_diff_0,
const index_t& idx_up_diff_1,
const index_t coeff0,
const index_t coeff1) {
idx_low_diff = coeff0 * idx_up_diff_0 + coeff1 * idx_up_diff_1;
};
// unmerge
auto f_lower_idx_diff_unmerge = [](index_t& idx_low_diff,
const index_t& idx_up_diff_0,
const index_t& idx_up_diff_1,
const index_t up_length_1) {
idx_low_diff = up_length_1 * idx_up_diff_0 + idx_up_diff_1;
};
// merge
auto f_lower_idx_diff_merge_v1 = [](index_t& idx_low_diff_0,
index_t& idx_low_diff_1,
index_t& idx_low_diff_2,
const index_t& idx_up_diff,
const index_t& idx_low_old_0,
const index_t& idx_low_old_1,
const index_t& idx_low_old_2,
const index_t& idx_low_diff_const_0,
const index_t& idx_low_diff_const_1,
const index_t& idx_low_diff_const_2,
const index_t& idx_low_bound_0,
const index_t& idx_low_bound_1,
const index_t& idx_low_bound_2) {
auto f_carry_arithmetic = [](index_t& idx_low_diff,
index_t& carry,
const index_t& idx_low_old,
const index_t& idx_low_diff_const,
const index_t& idx_low_bound) {
index_t idx_low_tmp = idx_low_old + carry + idx_low_diff_const;
#if 1 // positive
bool do_carry = idx_low_tmp >= idx_low_bound;
index_t idx_low_new = do_carry ? idx_low_tmp - idx_low_bound : idx_low_tmp;
carry = do_carry ? 1 : 0;
#else // negative
bool do_borrow = idx_low_tmp < 0;
index_t idx_low_new = do_borrow ? idx_low_tmp + idx_low_bound : idx_low_tmp;
carry = do_borrow ? -1 : 0;
#endif
idx_low_diff = idx_low_new - idx_low_old;
};
index_t carry = 0;
f_carry_arithmetic(
idx_low_diff_2, carry, idx_low_old_2, idx_low_diff_const_2, idx_low_bound_2);
f_carry_arithmetic(
idx_low_diff_1, carry, idx_low_old_1, idx_low_diff_const_1, idx_low_bound_1);
idx_low_diff_0 = idx_low_diff_const_0 + carry;
};
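// Worked example of the carry arithmetic above (hypothetical numbers, assuming each
// per-dimension constant diff is smaller than its bound so a single carry suffices):
// lower lengths (C, Y, X) = (16, 3, 3), old lower index (c, y, x) = (5, 2, 1), and an
// upper diff of 8 decomposed into constants (0, 2, 2).
//   X: 1 + 0 + 2 = 3 >= 3  -> carry = 1, new x = 0, diff_x = -1
//   Y: 2 + 1 + 2 = 5 >= 3  -> carry = 1, new y = 2, diff_y =  0
//   C: diff_c = 0 + 1 = 1
// New lower index (6, 2, 0); its linearized value 6*9 + 2*3 + 0 = 60 is indeed the old
// value 5*9 + 2*3 + 1 = 52 plus the upper diff of 8.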
auto f_lower_idx_diff_merge_v2 = [](index_t& idx_low_diff_0,
index_t& idx_low_diff_1,
index_t& idx_low_diff_2,
const index_t& idx_up_diff,
const index_t& idx_low_old_0,
const index_t& idx_low_old_1,
const index_t& idx_low_old_2,
const index_t& idx_low_diff_const_0,
const index_t& idx_low_diff_const_1,
const index_t& idx_low_diff_const_2,
const index_t& idx_low_bound_0,
const index_t& idx_low_bound_1,
const index_t& idx_low_bound_2) {
auto f_carry_arithmetic = [](index_t& idx_low_diff,
index_t& carry,
const index_t& idx_low_old,
const index_t& idx_low_diff_const,
const index_t& idx_low_bound) {
index_t idx_low_tmp = idx_low_old + carry;
index_t idx_low_bound_minus_idx_low_diff_const = idx_low_bound - idx_low_diff_const;
#if 1 // positive
bool do_carry = idx_low_tmp >= idx_low_bound_minus_idx_low_diff_const;
idx_low_diff =
do_carry ? -idx_low_bound_minus_idx_low_diff_const : idx_low_diff_const;
idx_low_diff += carry;
carry = do_carry ? 1 : 0;
#else // negative
bool do_borrow = idx_low_tmp < -idx_low_diff_const;
idx_low_diff = do_borrow ? idx_low_diff_const + idx_low_bound : idx_low_diff_const;
idx_low_diff -= carry;
carry = do_borrow ? 1 : carry;
#endif
};
index_t carry = 0;
f_carry_arithmetic(
idx_low_diff_2, carry, idx_low_old_2, idx_low_diff_const_2, idx_low_bound_2);
f_carry_arithmetic(
idx_low_diff_1, carry, idx_low_old_1, idx_low_diff_const_1, idx_low_bound_1);
idx_low_diff_0 = idx_low_diff_const_0 + carry;
};
auto f_lower_idx_diff_merge_v3 = [](index_t& idx_low_diff_0,
index_t& idx_low_diff_1,
index_t& idx_low_diff_2,
const index_t& idx_up_diff,
const index_t& idx_low_old_0,
const index_t& idx_low_old_1,
const index_t& idx_low_old_2,
const index_t& idx_low_diff_const_0,
const index_t& idx_low_diff_const_1,
const index_t& idx_low_diff_const_2,
const index_t& idx_low_bound_0,
const index_t& idx_low_bound_1,
const index_t& idx_low_bound_2) {
auto f_carry_arithmetic = [](index_t& idx_low_diff,
index_t& negative_carry,
const index_t& idx_low_old,
const index_t& idx_low_diff_const,
const index_t& idx_low_bound) {
index_t neg_idx_low_tmp = negative_carry - idx_low_old;
index_t idx_low_diff_const_minus_idx_low_bound = idx_low_diff_const - idx_low_bound;
#if 1
bool do_carry = neg_idx_low_tmp <= idx_low_diff_const_minus_idx_low_bound;
idx_low_diff =
do_carry ? idx_low_diff_const_minus_idx_low_bound : idx_low_diff_const;
idx_low_diff -= negative_carry;
negative_carry = do_carry ? -1 : 0;
#else
bool do_borrow = neg_idx_low_tmp > idx_low_diff_const;
idx_low_diff = do_borrow ? idx_low_diff_const + idx_low_bound : idx_low_diff_const;
idx_low_diff -= negative_carry;
negative_carry = do_borrow ? 1 : negative_carry;
#endif
};
index_t negative_carry = 0;
f_carry_arithmetic(idx_low_diff_2,
negative_carry,
idx_low_old_2,
idx_low_diff_const_2,
idx_low_bound_2);
f_carry_arithmetic(idx_low_diff_1,
negative_carry,
idx_low_old_1,
idx_low_diff_const_1,
idx_low_bound_1);
idx_low_diff_0 = idx_low_diff_const_0 - negative_carry;
};
index_t idx[20];
index_t idx_diff[20];
index_t const_tmp[6];
// populate const
const index_t GemmKPack = p_wei_global[0];
#if 1
for(index_t i = 0; i < 6; ++i)
{
const_tmp[i] = p_wei_global[i + 1];
}
#else
const_tmp[0] = 0;
const_tmp[1] = 2;
const_tmp[2] = 2;
#endif
// initialize idx
for(index_t i = 0; i < 20; ++i)
{
idx[i] = p_wei_global[get_thread_local_1d_id() + i];
}
// offset
idx[0] = idx[1] * in_n_c_hi_wi_global_desc.GetStride(0) +
idx[2] * in_n_c_hi_wi_global_desc.GetStride(1) +
idx[3] * in_n_c_hi_wi_global_desc.GetStride(2) +
idx[4] * in_n_c_hi_wi_global_desc.GetStride(3);
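// Layout of idx[] / idx_diff[] used by the stages below (derived from the transform
// chain; the slot numbering itself is an implementation choice of this dummy kernel):
//   idx[0]      : linear offset into the N-C-Hi-Wi tensor
//   idx[1..4]   : (N, C, Hi, Wi)
//   idx[5..8]   : (N, C, Hip, Wip), i.e. after padding
//   idx[9..14]  : (N, C, Y, Ho, X, Wo)
//   idx[15..16] : (GemmKTotal, GemmN)
//   idx[17..19] : (GemmK, GemmKPack, GemmN)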
// start lowering diff
#pragma unroll 1
for(index_t i = 0; i < 100; ++i)
{
for(index_t j = 0; j < 20; ++j)
{
idx_diff[j] = 0;
}
idx_diff[17] = 8;
// stage 4
// Unmerge(GemmKTotal) => GemmK, GemmKPack
f_lower_idx_diff_unmerge(idx_diff[15], idx_diff[17], idx_diff[18], GemmKPack);
// PassThrough GemmN => GemmN
f_lower_idx_diff_passthrough(idx_diff[16], idx_diff[19]);
// stage 3
// Merge(C, Y, X) => GemmKTotal
#if 0
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
idx_diff[15],
idx[10],
idx[11],
idx[13],
const_tmp[0],
const_tmp[1],
const_tmp[2],
C,
Y,
X);
#elif 1
index_t tmp = idx_diff[15];
const index_t const_tmp_0 = tmp / (Y * X);
tmp -= const_tmp_0 * (Y * X);
const index_t const_tmp_1 = tmp / X;
const index_t const_tmp_2 = tmp - const_tmp_1 * X;
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
idx_diff[15],
idx[10],
idx[11],
idx[13],
const_tmp_0,
const_tmp_1,
const_tmp_2,
C,
Y,
X);
#elif 0
index_t tmp = idx_diff[15];
const index_t const_tmp_0 = __llvm_amdgcn_readfirstlane_i32(tmp / (Y * X));
tmp -= const_tmp_0 * (Y * X);
const index_t const_tmp_1 = __llvm_amdgcn_readfirstlane_i32(tmp / X);
const index_t const_tmp_2 = __llvm_amdgcn_readfirstlane_i32(tmp - const_tmp_1 * X);
f_lower_idx_diff_merge_v2(idx_diff[10],
idx_diff[11],
idx_diff[13],
idx_diff[15],
idx[10],
idx[11],
idx[13],
const_tmp_0,
const_tmp_1,
const_tmp_2,
C,
Y,
X);
#endif
// stage 2
// PassThrough(N) => N
f_lower_idx_diff_passthrough(idx_diff[5], idx_diff[9]);
// PassThrough(C) => C
f_lower_idx_diff_passthrough(idx_diff[6], idx_diff[10]);
// Embed(Hip) => Y, Ho
f_lower_idx_diff_embed(
idx_diff[7], idx_diff[11], idx_diff[12], ConvDilationH, ConvStrideH);
// Embed(Wip) => X, Wo
f_lower_idx_diff_embed(
idx_diff[8], idx_diff[13], idx_diff[14], ConvDilationW, ConvStrideW);
// stage 1
// PassThrough(N) => N
f_lower_idx_diff_passthrough(idx_diff[1], idx_diff[5]);
// PassThrough(C) => C
f_lower_idx_diff_passthrough(idx_diff[2], idx_diff[6]);
// Pad(Hi) => Hip
f_lower_idx_diff_pad(idx_diff[3], idx_diff[7]);
// Pad(Wi) => Wip
f_lower_idx_diff_pad(idx_diff[4], idx_diff[8]);
// stage 0
// offset_diff
idx_diff[0] = idx_diff[1] * in_n_c_hi_wi_global_desc.GetStride(0) +
idx_diff[2] * in_n_c_hi_wi_global_desc.GetStride(1) +
idx_diff[3] * in_n_c_hi_wi_global_desc.GetStride(2) +
idx_diff[4] * in_n_c_hi_wi_global_desc.GetStride(3);
#if 0
// update idx
for(index_t i = 0; i < 20; ++ i)
{
idx[i] += idx_diff[i];
}
// padding check
bool is_in_bound = idx[3] >= 0 && idx[3] < Hi && idx[4] >= 0 && idx[4] < Wi;
#elif 0 // no pad
// offset
idx[0] += idx_diff[0];
// C, Y, X
idx[10] += idx_diff[10];
idx[11] += idx_diff[11];
idx[13] += idx_diff[13];
// padding check
bool is_in_bound = true;
#else // pad
// offset
idx[0] += idx_diff[0];
// C, Y, X
idx[10] += idx_diff[10];
idx[11] += idx_diff[11];
idx[13] += idx_diff[13];
// Hi, Wi
idx[3] += idx_diff[3];
idx[4] += idx_diff[4];
// padding check
bool is_in_bound = idx[3] >= 0 && idx[3] < Hi && idx[4] >= 0 && idx[4] < Wi;
#endif
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
idx[0],
is_in_bound,
out_n_k_ho_wo_global_desc.GetElementSpace());
}
}
template <typename WeiDesc, typename InDesc, typename OutDesc>
__device__ void Run_2(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
const auto transformed_tensor_descs =
map_convolution_into_gemm_v1(wei_k_c_y_x_global_desc,
in_n_c_hi_wi_global_desc,
out_n_k_ho_wo_global_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto in_gemmk_gemmn_global_desc = transformed_tensor_descs.At(Number<0>{});
MultiIndex<2> idx;
static_for<0, 2, 1>{}([&](auto i) { idx(i) = p_wei_global[get_thread_local_1d_id() + i]; });
const index_t niter = p_wei_global[10];
auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate_v1(in_gemmk_gemmn_global_desc, idx);
for(index_t iter = 0; iter < niter; ++iter)
{
constexpr auto gemmk1_gemmn0 = make_multi_index(1, 0);
in_gemmk_gemmn_coord += gemmk1_gemmn0;
// write
float value = 1;
transfer_data<float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_out_global,
in_gemmk_gemmn_coord.GetOffset(),
#if 1
in_gemmk_gemmn_coord.IsOffsetValidAssumingUpperIndexIsValid(),
#else
true,
#endif
in_gemmk_gemmn_global_desc.GetElementSpace());
}
}
template <typename WeiDesc, typename InDesc, typename OutDesc>
__device__ void Run(index_t* const __restrict__ p_wei_global,
float* const __restrict__ p_in_global,
float* const __restrict__ p_out_global,
const WeiDesc wei_k_c_y_x_global_desc,
const InDesc in_n_c_hi_wi_global_desc,
const OutDesc out_n_k_ho_wo_global_desc,
const MultiIndex<2> conv_strides,
const MultiIndex<2> conv_dilations,
const MultiIndex<2> in_left_pads,
const MultiIndex<2> in_right_pads) const
{
Run_2(p_wei_global,
p_in_global,
p_out_global,
wei_k_c_y_x_global_desc,
in_n_c_hi_wi_global_desc,
out_n_k_ho_wo_global_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
}
};
} // namespace ck
#endif
#ifndef CK_DUMMY_STATIC_TRANSFORM_HPP
#define CK_DUMMY_STATIC_TRANSFORM_HPP
#include "common_header.hpp"
#include "tensor_descriptor.hpp"
#include "tensor_descriptor_helper.hpp"
namespace ck {
// GemmM = K
// GemmN = N * Ho * Wo
// GemmK = C * Y * X
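// For example (hypothetical sizes, for illustration only): N = 1, C = 16, K = 32,
// Ho = Wo = 28, Y = X = 3 gives GemmM = 32, GemmN = 784, GemmK = 144, so the implicit
// GEMM treats the weight as a 144 x 32 (GemmK x GemmM) matrix A, the input as a
// 144 x 784 (GemmK x GemmN) matrix B, and computes the 32 x 784 output as transpose(A) * B.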
template <index_t GridSize,
index_t BlockSize,
typename Float,
typename InGlobalDesc,
typename WeiGlobalDesc,
typename OutGlobalDesc,
typename ConvStrides,
typename ConvDilations,
typename InLeftPads,
typename InRightPads>
struct DummyStaticTransform
{
__device__ void Run(Float* const __restrict__ p_in_global,
Float* const __restrict__ p_wei_global,
Float* const __restrict__ p_out_global) const
{
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_n_c_hi_wi_global_desc = InGlobalDesc{};
constexpr auto wei_k_c_y_x_global_desc = WeiGlobalDesc{};
constexpr auto out_n_k_ho_wo_global_desc = OutGlobalDesc{};
constexpr index_t N = in_n_c_hi_wi_global_desc.GetLengths()[0];
constexpr index_t C = in_n_c_hi_wi_global_desc.GetLengths()[1];
constexpr index_t Hi = in_n_c_hi_wi_global_desc.GetLengths()[2];
constexpr index_t Wi = in_n_c_hi_wi_global_desc.GetLengths()[3];
constexpr index_t K = out_n_k_ho_wo_global_desc.GetLengths()[1];
constexpr index_t Ho = out_n_k_ho_wo_global_desc.GetLengths()[2];
constexpr index_t Wo = out_n_k_ho_wo_global_desc.GetLengths()[3];
constexpr index_t Y = wei_k_c_y_x_global_desc.GetLengths()[2];
constexpr index_t X = wei_k_c_y_x_global_desc.GetLengths()[3];
constexpr index_t ConvStrideH = ConvStrides{}[0];
constexpr index_t ConvStrideW = ConvStrides{}[1];
constexpr index_t ConvDilationH = ConvDilations{}[0];
constexpr index_t ConvDilationW = ConvDilations{}[1];
// weight tensor
constexpr auto wei_gemmk_gemmm_global_desc = reorder_tensor_descriptor_given_upper2lower(
unfold_tensor_descriptor(wei_k_c_y_x_global_desc, I1, I3), Sequence<1, 0>{});
// input tensor
constexpr auto in_n_c_hip_wip_global_desc = transform_tensor_descriptor(
in_n_c_hi_wi_global_desc,
make_tuple(PassThrough<N>{},
PassThrough<C>{},
Pad<Sequence<Hi, Wi>, InLeftPads, InRightPads>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}));
constexpr index_t Hip = in_n_c_hip_wip_global_desc.GetLengths()[2];
constexpr index_t Wip = in_n_c_hip_wip_global_desc.GetLengths()[3];
constexpr auto in_n_c_y_ho_x_wo_global_desc = transform_tensor_descriptor(
in_n_c_hip_wip_global_desc,
make_tuple(PassThrough<N>{},
PassThrough<C>{},
Embed<Hip, Sequence<Y, Ho>, Sequence<ConvDilationH, ConvStrideH, 0>>{},
Embed<Wip, Sequence<X, Wo>, Sequence<ConvDilationW, ConvStrideW, 0>>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{}));
constexpr auto in_gemmk_gemmn_global_desc = transform_tensor_descriptor(
in_n_c_y_ho_x_wo_global_desc,
make_tuple(Merge<Sequence<C, Y, X>>{}, Merge<Sequence<N, Ho, Wo>>{}),
make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// output tensor
constexpr auto out_gemmm_gemmn_global_desc =
transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3),
make_tuple(PassThrough<K>{}, Merge<Sequence<N, Ho * Wo>>{}),
make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
make_tuple(Sequence<0>{}, Sequence<1>{}));
// input
const index_t k0 = p_in_global[get_thread_local_1d_id()];
const index_t n0 = p_in_global[get_thread_local_1d_id()];
auto coord = typename TensorCoordinate<decltype(in_gemmk_gemmn_global_desc)>::type(k0, n0);
#pragma unroll 1
for(index_t k = 0; k < 100; ++k)
{
coord += make_multi_index(8, 0);
Float value = 1;
transfer_data<Float,
1,
AddressSpace::Vgpr,
AddressSpace::Global,
InMemoryDataOperation::Set,
1,
1>(&value,
0,
true,
1,
p_in_global,
coord.GetOffset(),
coord.IsOffsetValidAssumingUpperIndexIsValid(),
in_gemmk_gemmn_global_desc.GetElementSpace());
}
}
};
} // namespace ck
#endif
@@ -95,28 +95,6 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
level1_n_id * NPerLevel0Cluster + level0_n_id * NPerThreadSubC};
}
#if 0
__device__ static MatrixIndex GetDistanceFromBeginOfThreadMatrixC(index_t m_in_c,
index_t n_in_c)
{
constexpr auto c_thread_mtx = ThreadMatrixC{};
constexpr index_t MPerLevel1Cluster =
MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster;
constexpr index_t NPerLevel1Cluster =
NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster;
index_t m_repeat = m_in_c / MPerThreadSubC;
index_t n_repeat = n_in_c / NPerThreadSubC;
index_t m_in_sub_c = m_in_c % MPerThreadSubC;
index_t n_in_sub_c = n_in_c % NPerThreadSubC;
return MatrixIndex{m_repeat * MPerLevel1Cluster + m_in_sub_c,
n_repeat * NPerLevel1Cluster + n_in_sub_c};
}
#endif
template <typename FloatA, typename FloatB, typename FloatC>
__device__ void
Run_naive(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const
@@ -352,5 +330,366 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
}
};
// blockwise GEMM: C += transpose(A) * B
// A and B are visible to the whole block, C is distributed among the threads
// If the following numbers are powers of 2, the index calculation cost is greatly reduced:
// MPerThreadSubC, NPerThreadSubC, MLevel0ThreadCluster, NLevel0ThreadCluster,
// MLevel1ThreadCluster, NLevel1ThreadCluster
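// Example of the thread-to-C-tile mapping computed by GetBeginOfThreadMatrixC below
// (hypothetical cluster sizes, for illustration only): with MPerThreadSubC =
// NPerThreadSubC = 4 and MLevel0ThreadCluster = NLevel0ThreadCluster =
// MLevel1ThreadCluster = NLevel1ThreadCluster = 4 (BlockSize = 256), thread 100 gets
// level1_id = 100 / 16 = 6 -> (level1_m_id, level1_n_id) = (1, 2) and
// level0_id = 100 % 16 = 4 -> (level0_m_id, level0_n_id) = (1, 0), so its 4x4 C sub-tiles
// start at row 1 * 16 + 1 * 4 = 20 and column 2 * 16 + 0 * 4 = 32 of the block tile.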
template <index_t BlockSize,
typename BlockMatrixA,
typename BlockMatrixB,
typename ThreadMatrixC,
index_t MPerThreadSubC,
index_t NPerThreadSubC,
index_t KPerThreadLoop,
index_t MLevel0ThreadCluster,
index_t NLevel0ThreadCluster,
index_t MLevel1ThreadCluster,
index_t NLevel1ThreadCluster,
index_t ThreadGemmADataPerRead_M,
index_t ThreadGemmBDataPerRead_N>
struct BlockwiseGemm_km_kn_m0m1n0n1_v1
{
struct MatrixIndex
{
index_t row;
index_t col;
};
index_t mMyThreadOffsetA;
index_t mMyThreadOffsetB;
__device__ BlockwiseGemm_km_kn_m0m1n0n1_v1()
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr index_t ThreadPerLevel1Cluster = MLevel0ThreadCluster * NLevel0ThreadCluster *
MLevel1ThreadCluster * NLevel1ThreadCluster;
static_assert(BlockSize == ThreadPerLevel1Cluster, "wrong! wrong blocksize\n");
static_assert(BlockMatrixA{}.GetLength(I0) == BlockMatrixB{}.GetLength(I0),
"wrong! K dimension not consistent\n");
constexpr index_t M = BlockMatrixA{}.GetLength(I1); // A is transposed
constexpr index_t N = BlockMatrixB{}.GetLength(I1);
static_assert(M % (MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster) == 0 &&
N % (NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster) == 0,
"wrong! Cannot evenly divide work among\n");
static_assert(ThreadMatrixC{}.GetLength(I0) == GetThreadMatrixCLengths()[I0] &&
ThreadMatrixC{}.GetLength(I1) == GetThreadMatrixCLengths()[I1],
"wrong! ThreadMatrixC lengths is wrong");
auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
mMyThreadOffsetA = BlockMatrixA{}.CalculateOffset(make_tuple(0, c_thread_mtx_index.row));
mMyThreadOffsetB = BlockMatrixB{}.CalculateOffset(make_tuple(0, c_thread_mtx_index.col));
}
__device__ static constexpr auto GetThreadMatrixCLengths()
{
constexpr auto I1 = Number<1>{};
constexpr index_t M = BlockMatrixA{}.GetLength(I1); // A is transposed
constexpr index_t N = BlockMatrixB{}.GetLength(I1);
constexpr index_t MRepeat =
M / (MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster);
constexpr index_t NRepeat =
N / (NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster);
static_assert(M == 128, "wrong!");
static_assert(MPerThreadSubC == 4, "wrong!");
static_assert(MRepeat == 2, "wrong!");
static_assert(NRepeat == 2, "wrong!");
static_assert(NPerThreadSubC == 4, "wrong!");
return Sequence<MRepeat * MPerThreadSubC, NRepeat * NPerThreadSubC>{};
}
__device__ static MatrixIndex GetBeginOfThreadMatrixC(index_t thread_id)
{
constexpr index_t ThreadPerLevel0Cluster = MLevel0ThreadCluster * NLevel0ThreadCluster;
index_t level1_id = thread_id / ThreadPerLevel0Cluster;
index_t level1_m_id = level1_id / NLevel1ThreadCluster;
index_t level1_n_id = level1_id % NLevel1ThreadCluster;
index_t level0_id = thread_id % ThreadPerLevel0Cluster;
index_t level0_m_id = level0_id / NLevel0ThreadCluster;
index_t level0_n_id = level0_id % NLevel0ThreadCluster;
constexpr index_t MPerLevel0Cluster = MPerThreadSubC * MLevel0ThreadCluster;
constexpr index_t NPerLevel0Cluster = NPerThreadSubC * NLevel0ThreadCluster;
return MatrixIndex{level1_m_id * MPerLevel0Cluster + level0_m_id * MPerThreadSubC,
level1_n_id * NPerLevel0Cluster + level0_n_id * NPerThreadSubC};
}
template <typename FloatA, typename FloatB, typename FloatC>
__device__ void
Run_naive(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto b_block_mtx = BlockMatrixB{};
constexpr auto c_thread_mtx = ThreadMatrixC{};
constexpr auto K = a_block_mtx.GetLength(I0);
constexpr auto MPerThread = c_thread_mtx.GetLength(I0);
constexpr auto NPerThread = c_thread_mtx.GetLength(I1);
constexpr index_t MPerLevel1Cluster =
MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster;
constexpr index_t NPerLevel1Cluster =
NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster;
constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
// thread A, B for GEMM
constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<KPerThreadLoop>{}, Number<MPerThread>{}));
constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<KPerThreadLoop>{}, Number<NPerThread>{}));
FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
FloatB p_b_thread[b_thread_mtx.GetElementSpaceSize()];
constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixA,
decltype(a_thread_mtx),
KPerThreadLoop,
MPerThreadSubC,
ThreadGemmADataPerRead_M>{};
constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixB,
decltype(b_thread_mtx),
KPerThreadLoop,
NPerThreadSubC,
ThreadGemmBDataPerRead_N>{};
constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v1<decltype(a_thread_mtx),
decltype(b_thread_mtx),
decltype(c_thread_mtx)>{};
#pragma unroll
// loop over k
for(index_t k_begin = 0; k_begin < K; k_begin += KPerThreadLoop)
{
#pragma unroll
// read A
for(index_t m_repeat = 0; m_repeat < MRepeat; ++m_repeat)
{
a_thread_copy.Run(p_a_block +
a_block_mtx.CalculateOffset(
make_tuple(k_begin, m_repeat * MPerLevel1Cluster)) +
mMyThreadOffsetA,
p_a_thread + a_thread_mtx.CalculateOffset(
make_tuple(0, m_repeat * MPerThreadSubC)));
}
#pragma unroll
// read B
for(index_t n_repeat = 0; n_repeat < NRepeat; ++n_repeat)
{
b_thread_copy.Run(p_b_block +
b_block_mtx.CalculateOffset(
make_tuple(k_begin, n_repeat * NPerLevel1Cluster)) +
mMyThreadOffsetB,
p_b_thread + b_thread_mtx.CalculateOffset(
make_tuple(0, n_repeat * NPerThreadSubC)));
}
// C += A * B
threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
}
}
template <typename FloatA, typename FloatB, typename FloatC>
__device__ void
Run_pipelined_2x2(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto a_block_mtx = BlockMatrixA{};
constexpr auto b_block_mtx = BlockMatrixB{};
constexpr auto c_thread_mtx = ThreadMatrixC{};
constexpr auto K = a_block_mtx.GetLength(I0);
constexpr auto MPerThread = c_thread_mtx.GetLength(I0);
constexpr auto NPerThread = c_thread_mtx.GetLength(I1);
constexpr index_t MPerLevel1Cluster =
MPerThreadSubC * MLevel0ThreadCluster * MLevel1ThreadCluster;
constexpr index_t NPerLevel1Cluster =
NPerThreadSubC * NLevel0ThreadCluster * NLevel1ThreadCluster;
constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
static_assert(MRepeat == 2 && NRepeat == 2,
"wrong! inline asm cannot deal with this GEMM config yet");
// thread A, B
constexpr auto a_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<KPerThreadLoop>{}, Number<MPerThread>{}));
constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<KPerThreadLoop>{}, Number<NPerThread>{}));
// thread A-sub, B-sub
constexpr auto a_thread_sub_mtx = make_dynamic_naive_tensor_descriptor_v2(
make_tuple(Number<KPerThreadLoop>{}, Number<MPerThreadSubC>{}),
make_tuple(Number<MPerThread>{}, Number<1>{}));
constexpr auto b_thread_sub_mtx = make_dynamic_naive_tensor_descriptor_v2(
make_tuple(Number<KPerThreadLoop>{}, Number<NPerThreadSubC>{}),
make_tuple(Number<NPerThread>{}, Number<1>{}));
constexpr auto c_thread_sub_mtx = make_dynamic_naive_tensor_descriptor_v2(
make_tuple(Number<MPerThreadSubC>{}, Number<NPerThreadSubC>{}),
make_tuple(Number<NPerThread>{}, Number<1>{}));
FloatA p_a_thread[a_thread_mtx.GetElementSpaceSize()];
FloatB p_b_thread[b_thread_mtx.GetElementSpaceSize()];
constexpr auto a_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixA,
decltype(a_thread_mtx),
KPerThreadLoop,
MPerThreadSubC,
ThreadGemmADataPerRead_M>{};
constexpr auto b_thread_copy = ThreadwiseMatrixSliceCopy_v2<BlockMatrixB,
decltype(b_thread_mtx),
KPerThreadLoop,
NPerThreadSubC,
ThreadGemmBDataPerRead_N>{};
constexpr auto threadwise_gemm = ThreadwiseGemm_km_kn_mn_v1<decltype(a_thread_sub_mtx),
decltype(b_thread_sub_mtx),
decltype(c_thread_sub_mtx)>{};
const FloatA* p_a_block_off = p_a_block + mMyThreadOffsetA;
const FloatB* p_b_block_off = p_b_block + mMyThreadOffsetB;
// read A_sub_0
a_thread_copy.Run(p_a_block_off, p_a_thread);
// read B_sub_0
b_thread_copy.Run(p_b_block_off, p_b_thread);
// read B_sub_1
b_thread_copy.Run(p_b_block_off +
b_block_mtx.CalculateOffset(make_tuple(0, NPerLevel1Cluster)),
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
// read A_sub_1
a_thread_copy.Run(p_a_block_off +
a_block_mtx.CalculateOffset(make_tuple(0, MPerLevel1Cluster)),
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)));
// C_sub_00 += transpose(A_sub_0) * B_sub_0
threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
// C_sub_01 += transpose(A_sub_0) * B_sub_1
threadwise_gemm.Run(
p_a_thread,
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
#pragma unroll
// loop over rest of k
for(index_t k = KPerThreadLoop; k < K; k += KPerThreadLoop)
{
// read A_sub_0
a_thread_copy.Run(p_a_block_off + a_block_mtx.CalculateOffset(make_tuple(k, 0)),
p_a_thread);
// C_sub_10 += transpose(A_sub_1) * B_sub_0
threadwise_gemm.Run(
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
p_b_thread,
p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, 0)));
// read B_sub_0
b_thread_copy.Run(p_b_block_off + b_block_mtx.CalculateOffset(make_tuple(k, 0)),
p_b_thread);
// C_sub_11 += transpose(A_sub_1) * B_sub_1
threadwise_gemm.Run(
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
p_c_thread +
c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, NPerThreadSubC)));
// read B_sub_1
b_thread_copy.Run(
p_b_block_off + b_block_mtx.CalculateOffset(make_tuple(k, NPerLevel1Cluster)),
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
// read A_sub_1
a_thread_copy.Run(
p_a_block_off + a_block_mtx.CalculateOffset(make_tuple(k, MPerLevel1Cluster)),
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)));
// C_sub_00 += transpose(A_sub_0) * B_sub_0
threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
// C_sub_01 += transpose(A_sub_0) * B_sub_1
threadwise_gemm.Run(
p_a_thread,
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)));
}
// C_sub_10 += transpose(A_sub_1) * B_sub_0
threadwise_gemm.Run(
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
p_b_thread,
p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, 0)));
// C_sub_11 += transpose(A_sub_1) * B_sub_1
threadwise_gemm.Run(
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, MPerThreadSubC)),
p_b_thread + b_thread_mtx.CalculateOffset(make_tuple(0, NPerThreadSubC)),
p_c_thread + c_thread_mtx.CalculateOffset(make_tuple(MPerThreadSubC, NPerThreadSubC)));
}
template <typename FloatA, typename FloatB, typename FloatC>
__device__ void Run(const FloatA* p_a_block, const FloatB* p_b_block, FloatC* p_c_thread) const
{
#if CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr index_t MPerThread = ThreadMatrixC{}.GetLength(I0);
constexpr index_t NPerThread = ThreadMatrixC{}.GetLength(I1);
constexpr index_t MRepeat = MPerThread / MPerThreadSubC;
constexpr index_t NRepeat = NPerThread / NPerThreadSubC;
if constexpr(MRepeat == 2 && NRepeat == 2)
{
Run_pipelined_2x2(p_a_block, p_b_block, p_c_thread);
}
else
{
Run_naive(p_a_block, p_b_block, p_c_thread);
}
#else
Run_naive(p_a_block, p_b_block, p_c_thread);
#endif
}
};
} // namespace ck
#endif
@@ -130,12 +130,12 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_k_m_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
make_multi_index(KPerBlock, MPerBlock), max_lds_align);
make_tuple(Number<KPerBlock>{}, Number<MPerBlock>{}), Number<max_lds_align>{});
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_k_n_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
make_multi_index(KPerBlock, NPerBlock), max_lds_align);
make_tuple(Number<KPerBlock>{}, Number<NPerBlock>{}), Number<max_lds_align>{});
// A matrix blockwise copy
auto a_blockwise_copy =
@@ -201,18 +201,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
// b_mtx[KPerBlock, NPerBlock] is in LDS
// c_mtx[MPerBlock, NPerBlock] is distributed among threads, and saved in
// register
constexpr index_t a_k_m_block_mtx_stride =
a_k_m_block_desc.CalculateOffset(make_multi_index(1, 0)) -
a_k_m_block_desc.CalculateOffset(make_multi_index(0, 0));
constexpr index_t b_k_n_block_mtx_stride =
b_k_n_block_desc.CalculateOffset(make_multi_index(1, 0)) -
b_k_n_block_desc.CalculateOffset(make_multi_index(0, 0));
constexpr auto a_k_m_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerBlock>{}, Number<MPerBlock>{}, Number<a_k_m_block_mtx_stride>{});
constexpr auto b_k_n_block_mtx_desc = make_ConstantMatrixDescriptor(
Number<KPerBlock>{}, Number<NPerBlock>{}, Number<b_k_n_block_mtx_stride>{});
// sanity check
static_assert(MPerBlock % (MPerThread * MLevel0Cluster * MLevel1Cluster) == 0 &&
NPerBlock % (NPerThread * NLevel0Cluster * NLevel1Cluster) == 0,
@@ -223,14 +211,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
// c_thread_mtx definition: this is a mess
// TODO: find a more elegant way of defining c_thread_mtx
constexpr auto c_m0m1_n0n1_thread_mtx_desc = make_ConstantMatrixDescriptor_packed(
Number<MRepeat * MPerThread>{}, Number<NRepeat * NPerThread>{});
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<
BlockSize,
decltype(a_k_m_block_mtx_desc),
decltype(b_k_n_block_mtx_desc),
decltype(c_m0m1_n0n1_thread_mtx_desc),
constexpr auto c_m0m1_n0n1_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<MRepeat * MPerThread>{}, Number<NRepeat * NPerThread>{}));
const auto blockwise_gemm =
BlockwiseGemm_km_kn_m0m1n0n1_v1<BlockSize,
decltype(a_k_m_block_desc),
decltype(b_k_n_block_desc),
decltype(c_m0m1_n0n1_thread_desc),
MPerThread,
NPerThread,
KPerThread,
@@ -252,10 +240,10 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
Float* p_b_block_double = p_shared_block + 2 * a_block_space_size;
// register allocation for output
AccFloat p_c_thread[c_m0m1_n0n1_thread_mtx_desc.GetElementSpace()];
AccFloat p_c_thread[c_m0m1_n0n1_thread_desc.GetElementSpaceSize()];
// zero out threadwise output
threadwise_matrix_set_zero(c_m0m1_n0n1_thread_mtx_desc, p_c_thread);
threadwise_matrix_set_zero_v2(c_m0m1_n0n1_thread_desc, p_c_thread);
constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock, 0);
constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock, 0);
@@ -422,7 +410,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
AddressSpace::Global,
CGlobalMemoryDataOperation,
1,
true,
true>(c_m0_m1_n0_n1_global_desc,
make_multi_index(m_thread_data_on_global / M1,
m_thread_data_on_global % M1,
@@ -44,11 +44,11 @@ template <typename SrcData,
AddressSpace DstAddressSpace,
InMemoryDataOperation DstInMemOp,
index_t DstScalarStrideInVector,
bool SrcResetCoordinateAfterRun,
bool DstResetCoordinateAfterRun>
struct ThreadwiseDynamicTensorSliceTransfer_v1r3
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using DstCoord = decltype(make_dynamic_tensor_coordinate(DstDesc{}, Index{}));
@@ -61,11 +61,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
{
}
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v1r3()
: ThreadwiseDynamicTensorSliceTransfer_v1r3(DstDesc{}, make_zero_multi_index<nDim>())
{
}
__device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
{
dst_slice_origin_coord_ = make_dynamic_tensor_coordinate(dst_desc, dst_slice_origin_idx);
@@ -297,7 +292,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
return forward_sweep;
}();
// calculate dst data index after last iteration in RunWrite(), if it has not been reset by
// calculate dst data index after last iteration in Run(), if it has not been reset by
// RunWrite()
constexpr auto dst_data_idx = [&]() {
Index ordered_idx;
@@ -328,7 +323,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
__device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
const Index& dst_slice_origin_step_idx)
{
// if dst coord was not reset by RunWrite(), then need to adjust the step here
// if dst coord was not reset by Run(), then need to adjust the step here
const auto adjusted_step_idx =
DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
: dst_slice_origin_step_idx + GetDstCoordinateResetStep();
@@ -344,6 +339,319 @@ struct ThreadwiseDynamicTensorSliceTransfer_v1r3
DstCoord dst_slice_origin_coord_;
}; // namespace ck
// this version is less likely to have scratch memory issues, because:
// 1. It does not keep a reference to the tensor descriptor
// 2. It does not construct a new tensor coordinate for this->Run()
// Assume dst_slice_origin_idx is 0
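// A minimal usage sketch (illustrative only; the names "src_desc", "thread_src_origin",
// "p_src_global", "p_dst_thread_buf" and "src_iterator_hacks" are hypothetical, and the
// template arguments are elided):
//
//   auto threadwise_load = ThreadwiseDynamicTensorSliceTransfer_v2<...>(
//       src_desc, thread_src_origin);
//   threadwise_load.Run(src_desc, p_src_global, p_dst_thread_buf, src_iterator_hacks);
//
// The descriptor is passed into Run() on every call rather than stored by reference,
// which is what makes the coordinate state less likely to spill to scratch memory.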
template <typename SrcData,
typename DstData,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename DimAccessOrder,
index_t SrcVectorDim,
index_t SrcScalarPerVector,
AddressSpace SrcAddressSpace,
AddressSpace DstAddressSpace,
index_t SrcScalarStrideInVector,
bool SrcResetCoordinateAfterRun>
struct ThreadwiseDynamicTensorSliceTransfer_v2
{
static constexpr index_t nDim = SliceLengths::Size();
using Index = MultiIndex<nDim>;
using SrcCoord = decltype(make_dynamic_tensor_coordinate(SrcDesc{}, Index{}));
using SrcCoordIterator = decltype(make_dynamic_tensor_coordinate_iterator(SrcDesc{}, Index{}));
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v2(const SrcDesc& src_desc,
const Index& src_slice_origin_idx)
: src_slice_origin_coord_(make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx))
{
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_slice_origin_coord_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
}
template <typename SrcIteratorHacks>
__device__ void Run(const SrcDesc& src_desc,
const SrcData* p_src,
DstData* p_dst,
const SrcIteratorHacks& src_iterator_hacks)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
// note: DstDesc is known at compile time, so dst_desc is constexpr
constexpr auto dst_desc = remove_cv_t<remove_reference_t<DstDesc>>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto src_scalar_per_access = generate_sequence(
lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
constexpr auto src_scalar_step_in_vector =
generate_sequence(lambda_scalar_step_in_vector<SrcVectorDim>{}, Number<nDim>{});
constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto dim_access_order = DimAccessOrder{};
constexpr auto ordered_access_lengths =
container_reorder_given_new2old(access_lengths, dim_access_order);
// make forward iterators
const auto src_forward_iterators = generate_tuple(
[&](auto i) {
Index forward_step;
static_for<0, nDim, 1>{}([&](auto j) {
forward_step(j) = (i.value == j.value) ? src_scalar_per_access[i] : 0;
});
return make_dynamic_tensor_coordinate_iterator(
src_desc, forward_step, src_iterator_hacks[I0][i]);
},
Number<nDim>{});
// make backward iterators
const auto src_backward_iterators = generate_tuple(
[&](auto i) {
Index backward_step;
static_for<0, nDim, 1>{}([&](auto j) {
backward_step(j) = (i.value == j.value) ? -src_scalar_per_access[i] : 0;
});
return make_dynamic_tensor_coordinate_iterator(
src_desc, backward_step, src_iterator_hacks[I1][i]);
},
Number<nDim>{});
// loop over tensor and copy
static_ford<decltype(ordered_access_lengths)>{}([&](auto ordered_access_idx) {
// decide whether to move forward or backward on each dimension
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep;
forward_sweep(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_idx[I0];
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j];
});
forward_sweep(i) = tmp % 2 == 0;
});
return forward_sweep;
}();
// calculate src data index
constexpr auto src_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i]
? ordered_access_idx[i]
: ordered_access_lengths[i] - 1 - ordered_access_idx[i];
});
auto src_data_idx = container_reorder_given_old2new(ordered_idx, dim_access_order) *
src_scalar_per_access;
return src_data_idx;
}();
// copy data
// hardcoding for buffer_store
// TODO refactor transfer_data() to encapsulate this
static_assert(DstAddressSpace == AddressSpace::Vgpr, "wrong! hardcode for ds_read");
vector_type<SrcData, SrcScalarPerVector> src_vector;
using src_vector_t = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
if constexpr(SrcAddressSpace == AddressSpace::Global)
{
src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
p_src,
src_slice_origin_coord_.GetOffset(),
true,
src_desc.GetElementSpaceSize());
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_coord_);
src_vector.Vector() = is_valid ? src_vector.Vector() : src_vector_t{0};
}
else
{
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_coord_);
src_vector.Vector() = is_valid ? *reinterpret_cast<const src_vector_t*>(
&p_src[src_slice_origin_coord_.GetOffset()])
: src_vector_t{0};
}
// this is hardcoded for a dst that has a compile-time tensor descriptor
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
// assume dst_slice_origin_idx is 0
// TODO: support non-zero dst_slice_origin_idx
constexpr index_t dst_offset =
dst_desc.CalculateOffset(src_data_idx + i * src_scalar_step_in_vector);
p_dst[Number<dst_offset>{}] = src_vector[i];
});
constexpr auto move_on_dim = [&]() constexpr
{
StaticallyIndexedArray<bool, nDim> move_on_dim;
static_for<0, nDim, 1>{}([&](auto i) {
move_on_dim(i) = ordered_access_idx[i] < ordered_access_lengths[i] - 1;
static_for<i + 1, nDim, 1>{}([&](auto j) {
move_on_dim(i) &= ordered_access_idx[j] == ordered_access_lengths[j] - 1;
});
});
return move_on_dim;
}
();
// move
static_for<0, nDim, 1>{}([&](auto i) {
if constexpr(move_on_dim[i])
{
if constexpr(forward_sweep[i])
{
move_dynamic_tensor_coordinate(src_desc,
src_slice_origin_coord_,
src_forward_iterators[dim_access_order[i]]);
}
else
{
move_dynamic_tensor_coordinate(src_desc,
src_slice_origin_coord_,
src_backward_iterators[dim_access_order[i]]);
}
}
});
});
// move src coordinate back to slice origin (or not)
if constexpr(SrcResetCoordinateAfterRun)
{
const auto src_reset_iterator =
make_dynamic_tensor_coordinate_iterator(src_desc, GetSrcCoordinateResetStep());
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_coord_, src_reset_iterator);
}
}
__device__ void Run(const SrcDesc& src_desc, const SrcData* p_src, DstData* p_dst)
{
constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform();
constexpr auto zeros = typename uniform_sequence_gen<ntransform_src, 0>::type{};
constexpr auto src_iterator_hacks =
make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
Run(src_desc, p_src, p_dst, src_iterator_hacks);
}
__device__ static constexpr auto GetSrcCoordinateResetStep()
{
constexpr auto I0 = Number<0>{};
// scalar per access on each dim
// TODO: don't use lambda_scalar_per_access
constexpr auto src_scalar_per_access = generate_sequence(
lambda_scalar_per_access<SrcVectorDim, SrcScalarPerVector>{}, Number<nDim>{});
constexpr auto access_lengths = SliceLengths{} / src_scalar_per_access;
constexpr auto dim_access_order = DimAccessOrder{};
constexpr auto ordered_access_lengths =
container_reorder_given_new2old(access_lengths, dim_access_order);
// decide whether the last iteration moved forward or backward on each dimension
constexpr auto forward_sweep = [&]() {
StaticallyIndexedArray<bool, nDim> forward_sweep;
forward_sweep(I0) = true;
static_for<1, nDim, 1>{}([&](auto i) {
index_t tmp = ordered_access_lengths[I0] - 1;
static_for<0, i, 1>{}([&](auto j) {
tmp = tmp * ordered_access_lengths[j] + ordered_access_lengths[j] - 1;
});
forward_sweep(i) = tmp % 2 == 0;
});
return forward_sweep;
}();
// calculate src data index after the last iteration in Run(), if it has not been reset
// by Run()
constexpr auto src_data_idx = [&]() {
Index ordered_idx;
static_for<0, nDim, 1>{}([&](auto i) {
ordered_idx(i) = forward_sweep[i] ? ordered_access_lengths[i] - 1 : 0;
});
auto src_data_idx = container_reorder_given_old2new(ordered_idx, dim_access_order) *
src_scalar_per_access;
return src_data_idx;
}();
//
constexpr auto reset_src_data_step = [&]() {
Index reset_src_data_step;
static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step(i) = -src_data_idx[i]; });
return reset_src_data_step;
}();
return reset_src_data_step;
}
// dst_slice_origin_step_idx need to be known at compile-time, for performance reason
__device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
const Index& src_slice_origin_step_idx)
{
// if src coord was not reset by Run(), then need to adjust the step here
const auto adjusted_step_idx =
SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
: src_slice_origin_step_idx + GetSrcCoordinateResetStep();
// is it OK to construct a new step every time?
const auto adjusted_step =
make_dynamic_tensor_coordinate_iterator(src_desc, adjusted_step_idx);
move_dynamic_tensor_coordinate(src_desc, src_slice_origin_coord_, adjusted_step);
}
private:
SrcCoord src_slice_origin_coord_;
}; // struct ThreadwiseDynamicTensorSliceTransfer_v2
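// Standalone illustrative sketch (not part of this commit or the CK API): the
// forward_sweep / move_on_dim logic in Run() above walks the ordered access
// space in zig-zag ("snake") order, so each access is reached by moving along
// exactly one dimension, i.e. one coordinate-iterator move per step. The 3x4
// access lengths below are an arbitrary assumption. The index reached after the
// last access is also what GetSrcCoordinateResetStep() negates to return the
// coordinate to the slice origin.
#include <array>
#include <cstdio>

int main()
{
    constexpr int nDim = 2;
    const std::array<int, nDim> lengths = {3, 4}; // ordered access lengths (assumed)

    std::array<int, nDim> idx = {0, 0};
    bool forward1 = true; // sweep direction of the inner (last) dimension

    for(int step = 0; step < lengths[0] * lengths[1]; ++step)
    {
        std::printf("access (%d, %d)\n", idx[0], idx[1]);

        if(forward1 && idx[1] + 1 < lengths[1])
        {
            ++idx[1]; // move forward on the inner dimension
        }
        else if(!forward1 && idx[1] > 0)
        {
            --idx[1]; // move backward on the inner dimension
        }
        else
        {
            ++idx[0];             // inner dimension exhausted: move the outer dimension once
            forward1 = !forward1; // and reverse the inner sweep direction
        }
    }
    return 0;
}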
// this version does the following to avoid "alloca" in LLVM-IR, which would cause scratch memory
// and sometimes useless instructions
// 1. It does not keep a reference to the tensor descriptor
......@@ -398,12 +706,6 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
"wrong!");
}
__device__ constexpr ThreadwiseDynamicTensorSliceTransfer_v3()
: ThreadwiseDynamicTensorSliceTransfer_v3(
SrcDesc{}, make_zero_multi_index<nDim>(), DstDesc{}, make_zero_multi_index<nDim>())
{
}
__device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
{
src_slice_origin_coord_ = make_dynamic_tensor_coordinate(src_desc, src_slice_origin_idx);
......@@ -512,7 +814,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
vector_type<SrcData, SrcScalarPerVector> src_vector;
using SrcVectorType = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
using src_vector_t = typename vector_type<SrcData, SrcScalarPerVector>::MemoryType;
#if 1
src_vector.Vector() = amd_buffer_load<SrcData, SrcScalarPerVector>(
......@@ -521,7 +823,7 @@ struct ThreadwiseDynamicTensorSliceTransfer_v3
const bool is_valid = coordinate_has_valid_offset_assuming_visible_index_is_valid(
src_desc, src_slice_origin_coord_);
src_vector.Vector() = is_valid ? src_vector.Vector() : SrcVectorType{0};
src_vector.Vector() = is_valid ? src_vector.Vector() : src_vector_t{0};
static_for<0, SrcScalarPerVector, 1>{}([&](auto i) {
constexpr index_t buffer_offset =
......
......@@ -161,5 +161,160 @@ struct ThreadwiseGemmTransANormalBNormalC
}
};
template <typename Float, class Matrix>
__device__ void threadwise_matrix_set_zero_v2(Matrix, Float* __restrict__ p_thread)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto M = Matrix{}.GetLength(I0);
constexpr auto N = Matrix{}.GetLength(I1);
static_for<0, M, 1>{}([&](auto i) {
static_for<0, N, 1>{}([&](auto j) {
constexpr auto offset = Matrix{}.CalculateOffset(make_tuple(i, j));
p_thread[offset] = Float(0);
});
});
}
template <typename SrcMatrix,
typename DstMatrix,
index_t NSliceRow,
index_t NSliceCol,
index_t DataPerAccess>
struct ThreadwiseMatrixSliceCopy_v2
{
template <typename Data>
__device__ static void Run(const Data* p_src, Data* p_dst)
{
using vector_t = typename vector_type<Data, DataPerAccess>::MemoryType;
static_for<0, NSliceRow, 1>{}([&](auto i) {
static_for<0, NSliceCol, DataPerAccess>{}([&](auto j) {
constexpr auto src_offset = SrcMatrix{}.CalculateOffset(make_tuple(i, j));
constexpr auto dst_offset = DstMatrix{}.CalculateOffset(make_tuple(i, j));
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
});
});
}
};
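// Standalone illustrative sketch (not part of this commit or the CK API) of the
// same idea as ThreadwiseMatrixSliceCopy_v2: copy an NSliceRow x NSliceCol tile
// between two row-major matrices, DataPerAccess contiguous elements per access.
// std::memcpy stands in for the reinterpret_cast'ed vector load/store; all
// names and sizes here are illustrative assumptions.
#include <cstdio>
#include <cstring>

template <int SrcLeadingDim, int DstLeadingDim, int NSliceRow, int NSliceCol, int DataPerAccess>
void copy_slice(const float* p_src, float* p_dst)
{
    static_assert(NSliceCol % DataPerAccess == 0, "column count must be a multiple of the vector width");

    for(int i = 0; i < NSliceRow; ++i)
        for(int j = 0; j < NSliceCol; j += DataPerAccess)
        {
            const int src_offset = i * SrcLeadingDim + j; // row-major offset
            const int dst_offset = i * DstLeadingDim + j;
            std::memcpy(&p_dst[dst_offset], &p_src[src_offset], DataPerAccess * sizeof(float));
        }
}

int main()
{
    float src[4 * 8], dst[4 * 8] = {};
    for(int i = 0; i < 4 * 8; ++i)
        src[i] = float(i);
    copy_slice<8, 8, 4, 8, 4>(src, dst); // copy the whole 4x8 tile, 4 floats per access
    std::printf("dst[10] = %f\n", dst[10]);
    return 0;
}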
// C += transpose(A) * B
// Elements of the matrices can be vectorized data
template <typename MatrixA, typename MatrixB, typename MatrixC>
struct ThreadwiseGemm_km_kn_mn_v1
{
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run_source(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr index_t M = MatrixC{}[I0];
constexpr index_t N = MatrixC{}[I1];
constexpr index_t K = MatrixA{}[I0];
static_for<0, K, 1>{}([&](auto k) {
static_for<0, M, 1>{}([&](auto m) {
static_for<0, N, 1>{}([&](auto n) {
const index_t a_offset =
MatrixA{}.CalculateOffset(make_tuple(k, m)); // A is transposed
const index_t b_offset = MatrixB{}.CalculateOffset(make_tuple(k, n));
const index_t c_offset = MatrixC{}.CalculateOffset(make_tuple(m, n));
p_c[c_offset] +=
inner_product_with_conversion<FloatC>{}(p_a[a_offset], p_b[b_offset]);
});
});
});
}
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run_amd_asm(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr index_t M = MatrixC{}.GetLength(I0);
constexpr index_t N = MatrixC{}.GetLength(I1);
constexpr index_t K = MatrixA{}.GetLength(I0);
static_assert(N == 4 || N == 2, "wrong! this config not supported by asm yet");
static_for<0, K, 1>{}([&](auto k) {
static_for<0, M, 1>{}([&](auto m) {
constexpr auto a_offset = MatrixA{}.CalculateOffset(make_tuple(k, m));
if constexpr(N == 2)
{
constexpr auto b_offset_0 = MatrixB{}.CalculateOffset(make_tuple(k, I0));
constexpr auto b_offset_1 = MatrixB{}.CalculateOffset(make_tuple(k, I1));
constexpr auto c_offset_0 = MatrixC{}.CalculateOffset(make_tuple(m, I0));
constexpr auto c_offset_1 = MatrixC{}.CalculateOffset(make_tuple(m, I1));
amd_assembly_outer_product_1x2(p_a[a_offset],
p_b[b_offset_0],
p_b[b_offset_1],
p_c[c_offset_0],
p_c[c_offset_1]);
}
else if constexpr(N == 4)
{
constexpr auto b_offset_0 = MatrixB{}.CalculateOffset(make_tuple(k, I0));
constexpr auto b_offset_1 = MatrixB{}.CalculateOffset(make_tuple(k, I1));
constexpr auto b_offset_2 = MatrixB{}.CalculateOffset(make_tuple(k, I2));
constexpr auto b_offset_3 = MatrixB{}.CalculateOffset(make_tuple(k, I3));
constexpr auto c_offset_0 = MatrixC{}.CalculateOffset(make_tuple(m, I0));
constexpr auto c_offset_1 = MatrixC{}.CalculateOffset(make_tuple(m, I1));
constexpr auto c_offset_2 = MatrixC{}.CalculateOffset(make_tuple(m, I2));
constexpr auto c_offset_3 = MatrixC{}.CalculateOffset(make_tuple(m, I3));
amd_assembly_outer_product_1x4(p_a[a_offset],
p_b[b_offset_0],
p_b[b_offset_1],
p_b[b_offset_2],
p_b[b_offset_3],
p_c[c_offset_0],
p_c[c_offset_1],
p_c[c_offset_2],
p_c[c_offset_3]);
}
});
});
}
#endif
template <typename FloatA, typename FloatB, typename FloatC>
__device__ static void Run(const FloatA* p_a, const FloatB* p_b, FloatC* p_c)
{
#if CK_THREADWISE_GEMM_USE_AMD_INLINE_ASM
constexpr bool has_amd_asm = is_same<FloatC, float>{} &&
((is_same<FloatA, float>{} && is_same<FloatB, float>{}) ||
(is_same<FloatA, half2_t>{} && is_same<FloatB, half2_t>{}) ||
(is_same<FloatA, half4_t>{} && is_same<FloatB, half4_t>{}));
if constexpr(has_amd_asm)
{
Run_amd_asm(p_a, p_b, p_c);
}
else
{
Run_source(p_a, p_b, p_c);
}
#else
Run_source(p_a, p_b, p_c);
#endif
}
};
} // namespace ck
#endif
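// Standalone illustrative sketch (not part of this commit or the CK API) of the
// "C += transpose(A) * B" loop structure in ThreadwiseGemm_km_kn_mn_v1::Run_source
// above: A is stored K x M (i.e. already transposed), B is K x N, C is M x N,
// all row-major. Sizes and values are arbitrary assumptions.
#include <cstdio>

int main()
{
    constexpr int K = 2, M = 3, N = 4;
    float a[K][M] = {{1, 2, 3}, {4, 5, 6}};         // A^T layout: a[k][m]
    float b[K][N] = {{1, 0, 0, 1}, {0, 1, 1, 0}};
    float c[M][N] = {};                             // accumulator starts at zero

    for(int k = 0; k < K; ++k)
        for(int m = 0; m < M; ++m)
            for(int n = 0; n < N; ++n)
                c[m][n] += a[k][m] * b[k][n];       // same k-m-n loop order as Run_source

    std::printf("c[0][0] = %f\n", c[0][0]);         // expect 1*1 + 4*0 = 1
    return 0;
}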
......@@ -114,8 +114,8 @@ __host__ __device__ constexpr T min(T x, Ts... xs)
}
// greatest common divisor, aka highest common factor
template <typename X, typename Y>
__host__ __device__ constexpr auto gcd(X x, Y y)
template <typename T>
__host__ __device__ constexpr T gcd(T x, T y)
{
if(x == y || x == 0)
{
......@@ -135,6 +135,14 @@ __host__ __device__ constexpr auto gcd(X x, Y y)
}
}
template <index_t X, index_t Y>
__host__ __device__ constexpr auto gcd(Number<X>, Number<Y>)
{
constexpr auto r = gcd(X, Y);
return Number<r>{};
}
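// Standalone illustrative sketch (not part of this commit or the CK API) of the
// pattern used by the Number<X>/Number<Y> overload above: a constexpr scalar
// gcd is reused to define an overload on a compile-time integer wrapper, so the
// result is itself a compile-time constant. "Num" is a stand-in here, not
// ck::Number.
#include <cstdio>

template <int V>
struct Num
{
    static constexpr int value = V;
};

constexpr int gcd_scalar(int x, int y) { return y == 0 ? x : gcd_scalar(y, x % y); }

template <int X, int Y>
constexpr auto gcd_num(Num<X>, Num<Y>)
{
    return Num<gcd_scalar(X, Y)>{};
}

int main()
{
    constexpr auto g = gcd_num(Num<12>{}, Num<18>{});
    static_assert(decltype(g)::value == 6, "compile-time gcd");
    std::printf("gcd(12, 18) = %d\n", decltype(g)::value);
    return 0;
}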
template <typename X, typename... Ys>
__host__ __device__ constexpr auto gcd(X x, Ys... ys)
{
......
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dummy_dynamic_transform.hpp"
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations,
class InLeftPads,
class InRightPads>
void device_dummy_dynamic_transform(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
{
using namespace ck;
using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
const auto in_nchw_desc = make_dynamic_naive_tensor_descriptor<4>(
to_multi_index(InDesc::GetLengths()), to_multi_index(InDesc::GetStrides()));
const auto wei_kcyx_desc = make_dynamic_naive_tensor_descriptor<4>(
to_multi_index(WeiDesc::GetLengths()), to_multi_index(WeiDesc::GetStrides()));
const auto out_nkhw_desc = make_dynamic_naive_tensor_descriptor<4>(
to_multi_index(OutDesc::GetLengths()), to_multi_index(OutDesc::GetStrides()));
const auto conv_strides = to_multi_index(ConvStrides{});
const auto conv_dilations = to_multi_index(ConvDilations{});
const auto in_left_pads = to_multi_index(InLeftPads{});
const auto in_right_pads = to_multi_index(InRightPads{});
const auto tensor_descs = map_convolution_into_gemm_fwd_v4r4(wei_kcyx_desc,
in_nchw_desc,
out_nkhw_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto in_gemmk_gemmn_gemmkpack_global_desc = tensor_descs.At(Number<0>{});
// test on cpu
{
auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate(
in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));
const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1 =
make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
make_multi_index(0, 0, 1));
print_array_v2("do_tansforms 0 0 1: ",
in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1.do_transforms_);
for(index_t iter = 0; iter < 10; ++iter)
{
printf("iter %d\n", iter);
print_array_v2("idx: ", in_gemmk_gemmn_gemmkpack_coord.GetIndex());
print_array_v2("hidden idx: ", in_gemmk_gemmn_gemmkpack_coord.GetHiddenIndex());
printf("offset: %d\n", in_gemmk_gemmn_gemmkpack_coord.GetOffset());
printf("\n");
move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
in_gemmk_gemmn_gemmkpack_coord,
in_gemmk_gemmn_gemmkpack_coord_iterator_0_0_1);
}
}
{
auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate(
in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));
const auto in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0 =
make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
make_multi_index(0, 1, 0));
print_array_v2("do_tansforms 0 1 0: ",
in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0.do_transforms_);
for(index_t iter = 0; iter < 10; ++iter)
{
printf("iter %d\n", iter);
print_array_v2("idx: ", in_gemmk_gemmn_gemmkpack_coord.GetIndex());
print_array_v2("hidden idx: ", in_gemmk_gemmn_gemmkpack_coord.GetHiddenIndex());
printf("offset: %d\n", in_gemmk_gemmn_gemmkpack_coord.GetOffset());
printf("\n");
move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
in_gemmk_gemmn_gemmkpack_coord,
in_gemmk_gemmn_gemmkpack_coord_iterator_0_1_0);
}
}
{
auto in_gemmk_gemmn_gemmkpack_coord = make_dynamic_tensor_coordinate(
in_gemmk_gemmn_gemmkpack_global_desc, make_multi_index(0, 0, 0));
const auto in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0 =
make_dynamic_tensor_coordinate_iterator(in_gemmk_gemmn_gemmkpack_global_desc,
make_multi_index(1, 0, 0));
print_array_v2("do_tansforms 1 0 0: ",
in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0.do_transforms_);
for(index_t iter = 0; iter < 10; ++iter)
{
printf("iter %d\n", iter);
print_array_v2("idx: ", in_gemmk_gemmn_gemmkpack_coord.GetIndex());
print_array_v2("hidden idx: ", in_gemmk_gemmn_gemmkpack_coord.GetHiddenIndex());
printf("offset: %d\n", in_gemmk_gemmn_gemmkpack_coord.GetOffset());
printf("\n");
move_dynamic_tensor_coordinate(in_gemmk_gemmn_gemmkpack_global_desc,
in_gemmk_gemmn_gemmkpack_coord,
in_gemmk_gemmn_gemmkpack_coord_iterator_1_0_0);
}
}
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
constexpr index_t BlockSize = 256;
constexpr index_t GridSize = 1;
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
#if 0
launch_kernel(run_gridwise_operation<DummyDynamicTransform_1<BlockSize>,
index_t* const,
float* const,
float* const,
const decltype(wei_kcyx_desc),
const decltype(in_nchw_desc),
const decltype(out_nkhw_desc),
const MultiIndex<2>,
const MultiIndex<2>,
const MultiIndex<2>,
const MultiIndex<2>>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
wei_kcyx_desc,
in_nchw_desc,
out_nkhw_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
#else
launch_kernel(
run_gridwise_operation<DummyDynamicTransform_fwd_v4r4<BlockSize>,
index_t* const,
float* const,
float* const,
const decltype(in_gemmk_gemmn_gemmkpack_global_desc)>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
in_gemmk_gemmn_gemmkpack_global_desc);
#endif
}
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
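// Standalone illustrative sketch (not part of this commit or the CK API) of
// what the CPU test above prints, for a plain strided layout only (the real
// descriptor here is a transformed one): a coordinate holds an index, its
// offset is the dot product with the strides, and moving by a step adds the
// step to the index. Lengths, strides and the step below are made-up
// assumptions.
#include <array>
#include <cstdio>

int main()
{
    constexpr int nDim = 3;
    const std::array<int, nDim> strides = {64, 8, 1}; // row-major 3-D strides (assumed)

    std::array<int, nDim> idx        = {0, 0, 0};
    const std::array<int, nDim> step = {0, 0, 1};     // like the (0, 0, 1) iterator

    for(int iter = 0; iter < 10; ++iter)
    {
        int offset = 0;
        for(int d = 0; d < nDim; ++d)
            offset += idx[d] * strides[d];            // offset = sum(idx[d] * stride[d])

        std::printf("iter %d idx (%d, %d, %d) offset %d\n", iter, idx[0], idx[1], idx[2], offset);

        for(int d = 0; d < nDim; ++d)
            idx[d] += step[d];                        // move the coordinate by the step
    }
    return 0;
}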
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dummy_dynamic_transform_v1.hpp"
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations,
class InLeftPads,
class InRightPads>
void device_dummy_dynamic_transform_v1(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
{
using namespace ck;
using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
const auto in_nchw_desc = make_dynamic_naive_tensor_descriptor_v1(
to_multi_index(InDesc::GetLengths()), to_multi_index(InDesc::GetStrides()));
const auto wei_kcyx_desc = make_dynamic_naive_tensor_descriptor_v1(
to_multi_index(WeiDesc::GetLengths()), to_multi_index(WeiDesc::GetStrides()));
const auto out_nkhw_desc = make_dynamic_naive_tensor_descriptor_v1(
to_multi_index(OutDesc::GetLengths()), to_multi_index(OutDesc::GetStrides()));
const auto conv_strides = to_multi_index(ConvStrides{});
const auto conv_dilations = to_multi_index(ConvDilations{});
const auto in_left_pads = to_multi_index(InLeftPads{});
const auto in_right_pads = to_multi_index(InRightPads{});
{
const auto tensor_descs = map_convolution_into_gemm_v1(wei_kcyx_desc,
in_nchw_desc,
out_nkhw_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
const auto in_gemmk_gemmn_global_desc = tensor_descs.At(Number<0>{});
auto in_gemmk_gemmn_coord =
make_dynamic_tensor_coordinate(in_gemmk_gemmn_global_desc, make_multi_index(0, 0));
for(index_t iter = 0; iter < 10; ++iter)
{
constexpr auto gemmk1_gemmn0 = make_multi_index(1, 0);
printf("iter %d\n", iter);
print_array("idx0: ", in_gemmk_gemmn_coord.GetIndex());
print_array("idx1: ", in_gemmk_gemmn_coord.GetLowerCoordinate().GetIndex());
print_array("idx2: ",
in_gemmk_gemmn_coord.GetLowerCoordinate().GetLowerCoordinate().GetIndex());
print_array("idx3: ",
in_gemmk_gemmn_coord.GetLowerCoordinate()
.GetLowerCoordinate()
.GetLowerCoordinate()
.GetIndex());
print_array("idx4: ",
in_gemmk_gemmn_coord.GetLowerCoordinate()
.GetLowerCoordinate()
.GetLowerCoordinate()
.GetLowerCoordinate()
.GetIndex());
printf("offset: %d\n", in_gemmk_gemmn_coord.GetOffset());
printf("\n");
in_gemmk_gemmn_coord += gemmk1_gemmn0;
}
}
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
constexpr index_t BlockSize = 256;
constexpr index_t GridSize = 1;
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
using dummy_transform = DummyDynamicTransform_v1<BlockSize>;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
launch_kernel(run_gridwise_operation<dummy_transform,
index_t* const,
float* const,
float* const,
const DynamicNativeTensorDescriptor_v1<4>,
const DynamicNativeTensorDescriptor_v1<4>,
const DynamicNativeTensorDescriptor_v1<4>,
const MultiIndex<2>,
const MultiIndex<2>,
const MultiIndex<2>,
const MultiIndex<2>>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<index_t*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()),
wei_kcyx_desc,
in_nchw_desc,
out_nkhw_desc,
conv_strides,
conv_dilations,
in_left_pads,
in_right_pads);
}
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
#include <unistd.h>
#include "device.hpp"
#include "host_tensor.hpp"
#include "gridwise_operation_wrapper.hpp"
#include "dummy_static_transform.hpp"
template <class T,
class InDesc,
class WeiDesc,
class OutDesc,
class ConvStrides,
class ConvDilations,
class InLeftPads,
class InRightPads>
void device_dummy_static_transform(InDesc,
const Tensor<T>& in_nchw,
WeiDesc,
const Tensor<T>& wei_kcyx,
OutDesc,
Tensor<T>& out_nkhw,
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads,
ck::index_t nrepeat)
{
using namespace ck;
using TDevice = typename conditional<is_same<half_float::half, T>::value, half_t, T>::type;
constexpr auto I0 = Number<0>{};
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto in_nchw_desc =
make_native_tensor_descriptor(InDesc::GetLengths(), InDesc::GetStrides());
constexpr auto wei_kcyx_desc =
make_native_tensor_descriptor(WeiDesc::GetLengths(), WeiDesc::GetStrides());
constexpr auto out_nkhw_desc =
make_native_tensor_descriptor(OutDesc::GetLengths(), OutDesc::GetStrides());
constexpr index_t N = out_nkhw_desc.GetLength(I0);
constexpr index_t K = out_nkhw_desc.GetLength(I1);
constexpr index_t Ho = out_nkhw_desc.GetLength(I2);
constexpr index_t Wo = out_nkhw_desc.GetLength(I3);
std::size_t data_sz = sizeof(T);
DeviceMem in_nchw_device_buf(data_sz * in_nchw.mDesc.GetElementSpace());
DeviceMem wei_kcyx_device_buf(data_sz * wei_kcyx.mDesc.GetElementSpace());
DeviceMem out_nkhw_device_buf(data_sz * out_nkhw.mDesc.GetElementSpace());
in_nchw_device_buf.ToDevice(in_nchw.mData.data());
wei_kcyx_device_buf.ToDevice(wei_kcyx.mData.data());
out_nkhw_device_buf.ToDevice(out_nkhw.mData.data());
constexpr index_t BlockSize = 256;
constexpr index_t GridSize = 1;
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
using dummy_transform = DummyStaticTransform<GridSize,
BlockSize,
float,
decltype(in_nchw_desc),
decltype(wei_kcyx_desc),
decltype(out_nkhw_desc),
ConvStrides,
ConvDilations,
InLeftPads,
InRightPads>;
for(index_t i = 0; i < 5; ++i)
{
std::cout << "Start running " << nrepeat << " times..." << std::endl;
KernelTimer timer;
timer.Start();
for(index_t j = 0; j < nrepeat; ++j)
{
launch_kernel(run_gridwise_operation<dummy_transform,
float* const __restrict__,
float* const __restrict__,
float* const __restrict__>,
dim3(GridSize),
dim3(BlockSize),
0,
0,
static_cast<float*>(in_nchw_device_buf.GetDeviceBuffer()),
static_cast<float*>(wei_kcyx_device_buf.GetDeviceBuffer()),
static_cast<float*>(out_nkhw_device_buf.GetDeviceBuffer()));
}
}
out_nkhw_device_buf.FromDevice(out_nkhw.mData.data());
}
......@@ -14,9 +14,6 @@
#include "device_convolution_forward_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp"
#include "device_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp"
#include "device_dummy_static_transform.hpp"
#include "device_dummy_dynamic_transform_v1.hpp"
#include "device_dummy_dynamic_transform.hpp"
int main(int argc, char* argv[])
{
......@@ -616,42 +613,6 @@ int main(int argc, char* argv[])
LeftPads{},
RightPads{},
nrepeat);
#elif 0
device_dummy_static_transform(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 0
device_dummy_dynamic_transform_v1(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#elif 1
device_dummy_dynamic_transform(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
wei_kcyx,
out_nkhw_desc,
out_nkhw_device,
ConvStrides{},
ConvDilations{},
LeftPads{},
RightPads{},
nrepeat);
#endif
if(do_verification)
......