Commit 68ea43b1 authored by Chao Liu

add vector load and access order to threadwise copy v1r3

parent 5fdccfce
......@@ -492,7 +492,7 @@ struct DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
0,
GemmABlockTransferSrcScalarPerVector_GemmK,
GemmABlockTransferDstScalarPerVector_GemmM,
true, // move back src coordinate after threadwise copy
false, // don't move back src coordinate after threadwise copy
GemmBBlockTransferThreadSliceLengths_GemmK_GemmN,
GemmBBlockTransferThreadClusterLengths_GemmK_GemmN,
Sequence<0, 1>,
......
......@@ -502,7 +502,7 @@ struct DynamicMerge
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ void UpdateLowerIndex_1(LowIdxDiff& idx_diff_low,
__host__ __device__ void UpdateLowerIndex_1a(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
LowIdx& idx_low,
const UpIdx& /* idx_up_new */,
......@@ -640,6 +640,148 @@ struct DynamicMerge
}
}
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
typename UpIdx,
index_t Hack>
__host__ __device__ void UpdateLowerIndex_1b(LowIdxDiff& idx_diff_low,
const UpIdxDiff& idx_diff_up,
LowIdx& idx_low,
const UpIdx& /* idx_up_new */,
Number<Hack>) const
{
static_assert(LowIdxDiff::Size() == NDimLow && UpIdxDiff::Size() == 1 &&
LowIdx::Size() == NDimLow && UpIdx::Size() == 1,
"wrong! inconsistent # of dimension");
// CalculateLowerIndex(idx_diff_low_const) has multiple integer divisions.
// However,
// 1) If idx_diff_up is known at compile-time, then idx_diff_low_const
//    can be calculated at compile-time.
// 2) If idx_diff_up is not known at compile-time, but its value doesn't
//    change during the whole kernel execution, then idx_diff_low_const
//    doesn't change either. The compiler-generated ISA should then
//    calculate idx_diff_low_const only once and keep it for the whole
//    kernel execution.
// If neither 1) nor 2) is satisfied, the calculation is done at run-time
// each time this function is called, which can be very expensive.
LowerIndex idx_diff_low_const;
LowerIndex idx_low_length_minus_idx_diff_low_const;
LowerIndex idx_low_length_plus_idx_diff_low_const;
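// The branches below decompose the 1-d upper-index step into per-dimension lower-index
// steps using the precomputed low_lengths_scan_ (quotient = step in dimension i,
// remainder carried to the next dimension); the second branch additionally pins the
// results into SGPRs via readfirstlane.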
#if !CK_HACK_DYNAMIC_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = tmp / low_lengths_scan_[i];
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = tmp;
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) = low_lengths_[i] - idx_diff_low_const[i];
idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
});
#else
// Hack: this forces the result into an SGPR. The caller must make sure the result is thread-invariant
index_t tmp = idx_diff_up[Number<0>{}];
static_for<0, NDimLow - 1, 1>{}([&](auto i) {
idx_diff_low_const(i) = __builtin_amdgcn_readfirstlane(tmp / low_lengths_scan_[i]);
tmp -= idx_diff_low_const[i] * low_lengths_scan_[i];
});
idx_diff_low_const(Number<NDimLow - 1>{}) = __builtin_amdgcn_readfirstlane(tmp);
static_for<0, NDimLow, 1>{}([&](auto i) {
idx_low_length_minus_idx_diff_low_const(i) =
__builtin_amdgcn_readfirstlane(low_lengths_[i] - idx_diff_low_const[i]);
idx_low_length_plus_idx_diff_low_const(i) = low_lengths_[i] + idx_diff_low_const[i];
});
#endif
if constexpr(Hack == 1)
{
// do carry check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
idx_low += idx_diff_low;
}
else if constexpr(Hack == 2)
{
// do borrow check on each low dimension in reversed order
// do not need to check the first dimension
index_t borrow = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t negative_idx_low_tmp = borrow - idx_low[i];
bool do_borrow = negative_idx_low_tmp > idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) -= borrow;
borrow = do_borrow ? 1 : 0;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] - borrow;
idx_low += idx_diff_low;
}
else
{
// do carry/borrow check on each low dimension in reversed order
// do not need to check the first dimension
index_t carry = 0;
static_for<NDimLow - 1, 0, -1>{}([&](auto i) {
index_t idx_low_tmp = idx_low[i] + carry;
bool do_carry = idx_low_tmp >= idx_low_length_minus_idx_diff_low_const[i];
bool do_borrow = idx_low_tmp < -idx_diff_low_const[i];
idx_diff_low(i) =
do_carry ? -idx_low_length_minus_idx_diff_low_const[i] : idx_diff_low_const[i];
idx_diff_low(i) =
do_borrow ? idx_low_length_plus_idx_diff_low_const[i] : idx_diff_low[i];
idx_diff_low(i) += carry;
carry = do_carry ? 1 : 0;
carry = do_borrow ? -1 : carry;
});
idx_diff_low(Number<0>{}) = idx_diff_low_const[Number<0>{}] + carry;
idx_low += idx_diff_low;
}
}
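// Illustration only (not part of this commit): the Hack == 1 branch above is a
// carry-propagating add over the merged lower dimensions, scanned from the last
// dimension to the first. A plain-array sketch of the same update (hypothetical
// names; it omits recording idx_diff_low):
//
//   void update_lower_index_with_carry(int* idx_low,
//                                      const int* idx_diff_low_const,
//                                      const int* low_lengths,
//                                      int ndim_low)
//   {
//       int carry = 0;
//       for(int i = ndim_low - 1; i > 0; --i)
//       {
//           bool do_carry = idx_low[i] + carry >= low_lengths[i] - idx_diff_low_const[i];
//           int diff      = do_carry ? idx_diff_low_const[i] - low_lengths[i]
//                                    : idx_diff_low_const[i];
//           idx_low[i] += diff + carry;
//           carry = do_carry ? 1 : 0;
//       }
//       idx_low[0] += idx_diff_low_const[0] + carry;
//   }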
template <typename LowIdxDiff,
typename UpIdxDiff,
typename LowIdx,
......@@ -705,11 +847,15 @@ struct DynamicMerge
do_carry = idx_low_tmp >= low_lengths_[i];
#if 0
// TODO: use exec-mask inline asm
if(do_carry)
{
idx_diff_low(i) -= low_lengths_[i];
}
#else
idx_diff_low(i) = do_carry ? idx_diff_low[i] - low_lengths_[i] : idx_diff_low[i];
#endif
idx_low(i) += idx_diff_low[i];
});
......@@ -733,11 +879,15 @@ struct DynamicMerge
do_borrow = idx_low_tmp < 0;
#if 0
// TODO: use exec-mask inline asm
if(do_borrow)
{
idx_diff_low(i) += low_lengths_[i];
}
#else
idx_diff_low(i) = do_borrow ? idx_diff_low[i] + low_lengths_[i] : idx_diff_low[i];
#endif
idx_low(i) += idx_diff_low[i];
});
......@@ -765,8 +915,10 @@ struct DynamicMerge
const UpIdx& idx_up_new,
Number<Hack>) const
{
#if 1
UpdateLowerIndex_1(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#if 0
UpdateLowerIndex_1a(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#elif 0
UpdateLowerIndex_1b(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#else
UpdateLowerIndex_2(idx_diff_low, idx_diff_up, idx_low, idx_up_new, Number<Hack>{});
#endif
......
......@@ -4,6 +4,7 @@
#include "common_header.hpp"
#include "dynamic_tensor_descriptor.hpp"
#include "dynamic_tensor_descriptor_helper.hpp"
#include "tensor_descriptor_helper.hpp"
#include "blockwise_dynamic_tensor_slice_transfer.hpp"
#include "threadwise_dynamic_tensor_slice_transfer.hpp"
#include "ConstantMatrixDescriptor.hpp"
......@@ -364,9 +365,14 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
// define input tensor descriptor for threadwise copy
// thread input tensor, src of threadwise copy
#if 0 // debug
constexpr auto c_m0_m1_n0_n1_thread_desc =
make_dynamic_naive_tensor_descriptor_packed<4>(
make_multi_index(MRepeat, MPerThread, NRepeat, NPerThread));
#else
constexpr auto c_m0_m1_n0_n1_thread_desc = make_native_tensor_descriptor_packed(
Sequence<MRepeat, MPerThread, NRepeat, NPerThread>{});
#endif
// calculate origin of thread input tensor on global memory
// blockwise GEMM c matrix starting index
......@@ -379,6 +385,7 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
const index_t n_thread_data_on_global =
n_block_data_on_global + c_thread_mtx_on_block.col;
#if 0
ThreadwiseDynamicTensorSliceTransfer_v1r2<
AccFloat,
Float,
......@@ -404,6 +411,28 @@ struct GridwiseDynamicGemm_km_kn_mn_v1
n_thread_data_on_global % N1))
.Run_hack(
c_m0_m1_n0_n1_thread_desc, p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
#else
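// Per the commit title, v1r3 additionally takes the src/dst access order, the dimension
// to vectorize, and the number of scalars per vector for the destination store.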
ThreadwiseDynamicTensorSliceTransfer_v1r3<
AccFloat,
Float,
decltype(c_m0_m1_n0_n1_thread_desc),
decltype(c_m0_m1_n0_n1_global_desc),
Sequence<MRepeat, MPerThread, NRepeat, NPerThread>,
CThreadTransferSrcDstAccessOrder,
CThreadTransferSrcDstVectorDim,
CThreadTransferDstScalarPerVector,
AddressSpace::Vgpr,
AddressSpace::Global,
CGlobalMemoryDataOperation,
1,
true,
true>(c_m0_m1_n0_n1_global_desc,
make_multi_index(m_thread_data_on_global / M1,
m_thread_data_on_global % M1,
n_thread_data_on_global / N1,
n_thread_data_on_global % N1))
.Run_hack(p_c_thread, c_m0_m1_n0_n1_global_desc, p_c_global);
#endif
}
}
......
......@@ -97,8 +97,8 @@ __llvm_amdgcn_buffer_atomic_add_f32(float vdata,
#endif
// buffer_load requires:
// 1) p_src_thread must be in global memory space, p_dst_thread must be vgpr
// 2) p_src_thread to be a wavewise pointer.
// 1) p_src_wave must be in global memory space
// 2) p_src_wave must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ typename vector_type<T, VectorSize>::MemoryType
......@@ -118,6 +118,18 @@ __device__ void amd_buffer_store(const T* p_src_thread,
bool dst_thread_data_valid,
index_t dst_data_range);
// buffer_store requires:
// 1) p_dst_wave must be in global memory space
// 2) p_dst_wave must be a wavewise pointer.
// It is the user's responsibility to make sure that is true.
template <typename T, index_t VectorSize>
__device__ void
amd_buffer_store_v2(const typename vector_type<T, VectorSize>::MemoryType src_thread_data,
T* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range);
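// Usage sketch for amd_buffer_store_v2 (illustrative names, not from this file):
//
//   float4_t val = ...;  // four floats held by this thread
//   amd_buffer_store_v2<float, 4>(val,                    // thread data
//                                 p_dst_wave,             // wavewise base pointer
//                                 thread_element_offset,  // offset in elements
//                                 is_valid,               // out-of-bounds predicate
//                                 total_element_count);   // buffer range in elements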
// buffer_atomic requires:
// 1) p_src_thread must be in vgpr space, p_dst_thread must be in global memory space
// 2) p_dst_thread must be a wavewise pointer.
......@@ -926,6 +938,126 @@ __device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 1>(const float src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
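// With the OOB check enabled, invalid lanes shift their byte offset by 0x7fffffff, past
// the range recorded in the buffer resource, so the hardware drops the store and no
// branch is needed.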
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 2>(const float2_t src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x2(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x2(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
template <>
__device__ void amd_buffer_store_v2<float, 4>(const float4_t src_thread_data,
float* p_dst_wave,
const index_t dst_thread_data_offset,
const bool dst_thread_data_valid,
const index_t dst_data_range)
{
BufferResourceConstant<float> dst_wave_buffer_resource;
// wavewise base address (64 bit)
dst_wave_buffer_resource.address[0] = p_dst_wave;
// wavewise range (32 bit)
dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
// wavewise setting (32 bit)
dst_wave_buffer_resource.config[3] = 0x00027000;
index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK
uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
__llvm_amdgcn_buffer_store_f32x4(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_addr_shift + dst_thread_addr_offset,
false,
false);
#else
if(dst_thread_data_valid)
{
__llvm_amdgcn_buffer_store_f32x4(src_thread_data,
dst_wave_buffer_resource.data,
0,
dst_thread_addr_offset,
false,
false);
}
#endif
}
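// Illustration only (not part of this commit): a hypothetical layout for the 128-bit
// buffer resource used above, consistent with the member accesses in this file
// (address[0] = wavewise base, range[2] = size in bytes, config[3] = setting word):
//
//   template <typename T>
//   union BufferResourceConstantSketch
//   {
//       int32x4_t data;    // 128-bit descriptor handed to the buffer intrinsics
//       T* address[2];     // words 0-1: 64-bit base address
//       int32_t range[4];  // word  2: addressable size in bytes
//       int32_t config[4]; // word  3: hardware setting (0x00027000 here)
//   };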
#if CK_USE_AMD_BUFFER_ATOMIC_FADD
template <>
__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
......
......@@ -91,7 +91,7 @@
#endif
// workaround: put all workaround here
// workaround for unnecessary VGPA <--> AGRP data movement when using mfma LLVM intrinsic
// workaround for unnecessary VGPR <--> AGPR data movement when using mfma LLVM intrinsic
#ifndef CK_WORKAROUND_SWDEV_229564
#define CK_WORKAROUND_SWDEV_229564 1
#endif
......@@ -123,6 +123,8 @@ using index_t = uint32_t;
using index_t = int32_t;
#endif
typedef int32_t int32x2_t __attribute__((ext_vector_type(2)));
// int32x4_t is used by the buffer_load and buffer_store llvm intrinsics
typedef int32_t int32x4_t __attribute__((ext_vector_type(4)));
......
......@@ -32,7 +32,8 @@ struct static_for
static_assert(Increment != 0 && (NEnd - NBegin) % Increment == 0,
"Wrong! should satisfy (NEnd - NBegin) % Increment == 0");
static_assert((Increment > 0 && NBegin <= NEnd) || (Increment < 0 && NBegin >= NEnd),
"wrongs! should have NBegin <= NEnd");
"wrongs! should (Increment > 0 && NBegin <= NEnd) || (Increment < 0 && "
"NBegin >= NEnd)");
}
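// Usage sketch (the pattern used throughout this commit): run the functor at compile
// time for i = NBegin, NBegin + Increment, ..., passing each index as a Number<i>:
//
//   static_for<0, 4, 1>{}([&](auto i) { foo(i); });  // foo is a placeholder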
template <class F>
......
......@@ -145,7 +145,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
constexpr index_t GemmCThreadTransferDstScalarPerVector_GemmN1 = 1;
#elif 0
#elif 1
// cdata = 64, BlockSize = 256, 128x128x8
// b thread copy 2x2
constexpr index_t BlockSize = 256;
......@@ -166,7 +166,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<4, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<2, 128>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 2;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<2, 2>;
......@@ -201,7 +201,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc
printf("%s: BlockSize %u, GridSize %u \n", __func__, BlockSize, GridSize);
constexpr auto conv_driver =
#if 0
#if 1
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_pad
#else
DriverDynamicConvolutionForwardImplicitGemm_v4r4_nchw_kcyx_nkhw_no_pad
......
......@@ -22,7 +22,22 @@ int main(int argc, char* argv[])
{
using namespace ck;
#if 1
#if 0
// 3x3, 36x36, stride 2
constexpr index_t N = 128;
constexpr index_t C = 192;
constexpr index_t HI = 37;
constexpr index_t WI = 37;
constexpr index_t K = 384;
constexpr index_t Y = 3;
constexpr index_t X = 3;
using ConvStrides = Sequence<2, 2>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 3x3, 35x35, stride 2
constexpr index_t N = 128;
constexpr index_t C = 192;
......