Commit 6e59255a authored by root

debug

parent 7d0a5412
@@ -33,7 +33,6 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
     };

     index_t mMyThreadOffsetA;
-    index_t mMyThreadOffsetB;

     __device__ BlockwiseGemm_km_kn_m0m1n0n1_v3()
     {
@@ -68,8 +67,6 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
         auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());

         mMyThreadOffsetA = BlockMatrixA{}.CalculateOffset(make_tuple(0, c_thread_mtx_index.k));
-        mMyThreadOffsetB = BlockMatrixB{}.CalculateOffset(
-            make_tuple(0, 0, c_thread_mtx_index.h, c_thread_mtx_index.w));
     }

     __device__ static constexpr auto GetThreadMatrixCLengths()
@@ -109,38 +106,6 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
         }
     };

-    template <typename SrcDesc,
-              typename DstDesc,
-              index_t NSliceCYX,
-              index_t NSliceH,
-              index_t NSliceW,
-              index_t DataPerAccess>
-    struct ThreadwiseSliceCopy_b
-    {
-        template <typename Data>
-        __device__ static void Run(const Data* p_src, Data* p_dst)
-        {
-            static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
-                          "wrong! Desc should be known at compile-time");
-
-            using vector_t = typename vector_type<Data, DataPerAccess>::type;
-
-            static_for<0, NSliceCYX, 1>{}([&](auto i) {
-                static_for<0, NSliceH, 1>{}([&](auto j) {
-                    static_for<0, NSliceW, 1>{}([&](auto k) {
-                        constexpr auto src_offset =
-                            SrcDesc{}.CalculateOffset(make_tuple(i, 0, j, k));
-
-                        constexpr auto dst_offset =
-                            DstDesc{}.CalculateOffset(make_tuple(i, 0, j, k));
-
-                        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-                            *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
-                    });
-                });
-            });
-        }
-    };
-
     template <typename FloatA, typename FloatB, typename FloatC>
     __device__ void
     Run_naive(const FloatA* p_a_block, const FloatB* p_b_thread, FloatC* p_c_thread) const
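The deleted ThreadwiseSliceCopy_b was a fully unrolled, vectorized slice copy for the B operand; once B is read straight into registers by ThreadwiseDynamicTensorSliceTransfer_v2 (see the gridwise hunks below) it has no remaining callers. For reference, a minimal host-side sketch of the pattern it implemented, with plain nested loops and hand-written packed offsets standing in for the project's static_for and compile-time descriptors (all names here are illustrative):

    #include <cstddef>

    // Packed row-major offset for a (cyx, h, w) slice whose extents are
    // compile-time constants, mimicking SrcDesc/DstDesc::CalculateOffset.
    template <std::size_t NSliceH, std::size_t NSliceW>
    constexpr std::size_t packed_offset(std::size_t i, std::size_t j, std::size_t k)
    {
        return (i * NSliceH + j) * NSliceW + k;
    }

    // Copy an NSliceCYX x NSliceH x NSliceW slice; because every bound is a
    // template parameter the loops can be fully unrolled, which is what the
    // static_for<...> nesting in the deleted helper guaranteed.
    template <std::size_t NSliceCYX, std::size_t NSliceH, std::size_t NSliceW, typename Data>
    void threadwise_slice_copy(const Data* p_src, Data* p_dst)
    {
        for(std::size_t i = 0; i < NSliceCYX; ++i)
            for(std::size_t j = 0; j < NSliceH; ++j)
                for(std::size_t k = 0; k < NSliceW; ++k)
                    p_dst[packed_offset<NSliceH, NSliceW>(i, j, k)] =
                        p_src[packed_offset<NSliceH, NSliceW>(i, j, k)];
    }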
@@ -160,7 +125,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
             make_tuple(Number<CYXPerThreadLoop>{}, Number<KPerThread>{}));

         constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<1>{}, Number<1>{}));
+            // make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<1>{}, Number<1>{}));
+            make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}));

         constexpr auto c_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
             make_tuple(Number<KPerThread>{}, Number<1>{}));
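Dropping the two trailing singleton dimensions from b_thread_mtx changes no addresses: in a packed descriptor, size-1 dimensions contribute nothing to the offset. A compile-time check of that equivalence, with the packed offsets written out by hand:

    // Packed (cyx, n, h, w) vs packed (cyx, n) offsets with n = h = w = 1.
    constexpr int offset4(int i, int n, int h, int w) { return ((i * 1 + n) * 1 + h) * 1 + w; }
    constexpr int offset2(int i, int n) { return i * 1 + n; }

    static_assert(offset4(5, 0, 0, 0) == offset2(5, 0),
                  "size-1 dimensions do not change packed offsets");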
@@ -183,7 +149,7 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
                 mMyThreadOffsetA,
                 p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, 0)));

-            // threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
+            threadwise_gemm.Run(p_a_thread, p_b_thread + CYXPerThreadLoop * cyx_begin, p_c_thread);
         }
     }
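The re-enabled call advances through the thread-private B buffer by CYXPerThreadLoop elements per outer iteration, while the matching A tile is refetched from LDS each pass. A toy model of that loop structure, assuming cyx_begin counts iterations (the surrounding loop is not shown in this hunk) and using a plain dot product in place of threadwise_gemm:

    constexpr int CYXPerThread     = 36; // assumed: 4 channels * 3 * 3 filter taps
    constexpr int CYXPerThreadLoop = 4;
    constexpr int KPerThread       = 16;

    void run_naive(const float* a_tiles, const float* b_thread, float* c_thread)
    {
        for(int cyx_begin = 0; cyx_begin < CYXPerThread / CYXPerThreadLoop; ++cyx_begin)
        {
            // Fresh CYXPerThreadLoop x KPerThread tile of A per iteration
            // (copied from LDS at mMyThreadOffsetA in the real kernel).
            const float* a = a_tiles + cyx_begin * CYXPerThreadLoop * KPerThread;
            // The new offset: walk B in CYXPerThreadLoop-sized chunks.
            const float* b = b_thread + CYXPerThreadLoop * cyx_begin;

            for(int k = 0; k < KPerThread; ++k)
                for(int c = 0; c < CYXPerThreadLoop; ++c)
                    c_thread[k] += a[c * KPerThread + k] * b[c];
        }
    }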
...
@@ -56,32 +56,19 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
 {
     __host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
     {
-        constexpr auto max_lds_align = math::lcm(Number<ABlockTransferDstScalarPerVector_M>{},
-                                                 Number<BBlockTransferDstScalarPerVector_N>{},
-                                                 Number<KPerThread>{},
-                                                 Number<HWPerThread>{});
+        constexpr auto max_lds_align =
+            math::lcm(Number<ABlockTransferDstScalarPerVector_M>{}, Number<KPerThread>{});
+
+        static_assert(CYXPerBlock == 4 && HWPerBlock == 64 && KPerBlock == 16, "");

         // A matrix in LDS memory, dst of blockwise copy
         // be careful of LDS alignment
         constexpr auto a_cyx_k_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
             make_tuple(Number<CYXPerBlock>{}, Number<KPerBlock>{}), max_lds_align);

-        // B matrix in LDS memory, dst of blockwise copy
-        // be careful of LDS alignment
-        constexpr auto b_cyx_n_h_w_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
-            make_tuple(Number<CYXPerBlock>{}, Number<1>{}, Number<8>{}, Number<8>{}),
-            max_lds_align);
-
         // LDS allocation for A and B: be careful of alignment
         constexpr auto a_block_space_size =
             math::integer_least_multiple(a_cyx_k_block_desc.GetElementSpaceSize(), max_lds_align);
-        constexpr auto b_block_space_size = math::integer_least_multiple(
-            b_cyx_n_h_w_block_desc.GetElementSpaceSize(), max_lds_align);

-        return 2 * (a_block_space_size + b_block_space_size) * sizeof(Float);
+        return 2 * (a_block_space_size) * sizeof(Float);
     }

     template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
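With B no longer staged through LDS, only the double-buffered A tile counts toward shared memory. Under the values pinned by the new static_assert (CYXPerBlock = 4, KPerBlock = 16) and 32-bit floats, that comes to 2 * 4 * 16 * 4 = 512 bytes, assuming the aligned A descriptor stays packed at these sizes. A host-side restatement of the arithmetic, with the lcm arguments assumed to be 1 (DstScalarPerVector_M, as in the launcher below) and 16 (KPerThread):

    #include <cstddef>
    #include <numeric> // std::lcm

    // Round n up to a multiple of align, like math::integer_least_multiple.
    constexpr std::size_t integer_least_multiple(std::size_t n, std::size_t align)
    {
        return ((n + align - 1) / align) * align;
    }

    constexpr std::size_t CYXPerBlock = 4, KPerBlock = 16;
    constexpr std::size_t max_lds_align = std::lcm(std::size_t{1}, std::size_t{16});

    constexpr std::size_t a_block_space_size =
        integer_least_multiple(CYXPerBlock * KPerBlock, max_lds_align);

    // Double buffer for A only; B now lives in registers.
    static_assert(2 * a_block_space_size * sizeof(float) == 512, "");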
@@ -180,17 +167,17 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
 #if 1
         constexpr auto b_cyx_n_h_w_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
-            make_tuple(Number<CYXPerThread>{}, Number<1>{}, Number<1>{}, Number<1>{}));
+            make_tuple(Number<CYXPerBlock>{}, Number<1>{}, Number<1>{}, Number<1>{}));

         const index_t h_thread_id = get_thread_local_1d_id() / 8;
         const index_t w_thread_id = get_thread_local_1d_id() % 8;

-        auto b_threadwise_transfer = ThreadwiseDynamicTensorSliceTransfer_v2<
+        using ThreadwiseTensorSliceTransferB = ThreadwiseDynamicTensorSliceTransfer_v2<
             Float,
             Float,
             decltype(b_cyx_n_h_w_global_desc),
             decltype(b_cyx_n_h_w_thread_desc),
-            Sequence<CYXPerThread, 1, 1, 1>,
+            Sequence<CYXPerBlock, 1, 1, 1>,
             Sequence<3, 2, 0, 1>, // BBlockTransferSrcAccessOrder,
             3,                    // BBlockTransferSrcVectorDim,
             1,                    // BBlockTransferSrcScalarPerVector,
@@ -198,7 +185,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             AddressSpace::Vgpr,
             InMemoryDataOperation::Set,
             1,
-            true>(
+            true>;
+
+        ThreadwiseTensorSliceTransferB b_threadwise_transfer(
             b_cyx_n_h_w_global_desc,
             make_multi_index(
                 0, 0, h_block_data_on_global + h_thread_id, w_block_data_on_global + w_thread_id));
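Introducing ThreadwiseTensorSliceTransferB via a using alias is a pure readability refactor: the thirteen template arguments get a name, and the constructor call reads as an ordinary declaration. The same pattern in miniature (SliceTransfer and its parameters are illustrative):

    #include <cstddef>

    template <typename Src, typename Dst, std::size_t ScalarPerVector, bool MoveBackSrcCoord>
    struct SliceTransfer
    {
        SliceTransfer(const Src* src, std::size_t origin) : src(src), origin(origin) {}
        const Src* src;
        std::size_t origin;
    };

    void example(const float* p)
    {
        // Before: instantiation and construction fused into one expression,
        // so any change to the argument list reindents the constructor call.
        auto t_before = SliceTransfer<float, float, 1, true>(p, 0);

        // After: name the instantiation once, then construct it.
        using TransferB = SliceTransfer<float, float, 1, true>;
        TransferB t_after(p, 0);

        (void)t_before;
        (void)t_after;
    }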
@@ -234,7 +223,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             b_cyx_n_h_w_block_desc.GetElementSpaceSize(), max_lds_align);

         Float* p_a_block_double = p_shared_block;
-        Float* p_b_block_double = p_shared_block + 2 * a_block_space_size;

         // register allocation for output
         AccFloat p_c_thread[c_k_n_h_w_thread_desc.GetElementSpaceSize()];
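p_b_block_double used to sit after both copies of the A tile (base + 2 * a_block_space_size); with it gone, the shared block is just the even and odd A buffers back to back. A sketch of the remaining layout, with the 64-float tile size from the arithmetic above:

    #include <cstddef>

    constexpr std::size_t a_block_space_size = 64; // CYXPerBlock * KPerBlock floats

    struct ABlockDoubleBuffer
    {
        float* even; // written on iterations 0, 2, 4, ...
        float* odd;  // written on iterations 1, 3, 5, ...
    };

    ABlockDoubleBuffer carve(float* p_shared_block)
    {
        return {p_shared_block, p_shared_block + a_block_space_size};
    }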
@@ -279,16 +267,16 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
             a_blockwise_copy.RunWrite(a_cyx_k_block_desc, p_a_block_double);

             __syncthreads();
+
+            blockwise_gemm.Run(p_a_block_double, p_b_thread, p_c_thread);
         }

 #if 0
         if constexpr(HasMainKBlockLoop)
         {
             Float* p_a_block_even = p_a_block_double;
-            Float* p_b_block_even = p_b_block_double;

             Float* p_a_block_odd = p_a_block_double + a_block_space_size;
-            Float* p_b_block_odd = p_b_block_double + b_block_space_size;

             index_t b_block_data_begin = 0;
...
@@ -72,7 +72,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
     constexpr index_t GemmMPerBlock = 16;
     constexpr index_t GemmNPerBlock = 64;
-    constexpr index_t GemmKPerBlock = 4;
+    constexpr index_t GemmKPerBlock = 4 * 3 * 3;

     constexpr index_t GemmMPerThread = 16;
     constexpr index_t GemmNPerThread = 1;
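Writing the constant as 4 * 3 * 3 documents what the enlarged K dimension folds in: 4 input channels per block times the full filter window, so one block-level pass covers the whole reduction instead of stepping through it in 4-wide chunks (the 3 x 3 filter size is an assumption consistent with the factors). Restated as a compile-time check:

    constexpr int CPerBlock = 4; // input channels reduced per block
    constexpr int Y = 3, X = 3;  // assumed filter height and width

    constexpr int GemmKPerBlock = CPerBlock * Y * X;
    static_assert(GemmKPerBlock == 36, "whole filter window folded into GemmK");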
@@ -83,14 +83,14 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
     constexpr index_t GemmMLevel1Cluster = 1;
     constexpr index_t GemmNLevel1Cluster = 64;

-    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<1, 1>;
+    using GemmABlockTransferThreadSliceLengths_GemmK_GemmM   = Sequence<9, 1>;
     using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>;

     constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
     constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;

     using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN   = Sequence<4, 1>;
-    using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 64>;
+    using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<9, 64>;

     constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
     constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
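The transfer parameters must tile the enlarged block exactly: per-thread slice lengths times thread-cluster lengths have to reproduce the block tile in each dimension, which is why the A slice grows to 9 and the B cluster to 9 alongside GemmKPerBlock = 36. The arithmetic as compile-time checks:

    constexpr int GemmKPerBlock = 36, GemmMPerBlock = 16, GemmNPerBlock = 64;

    // A: Sequence<9, 1> slices x Sequence<4, 16> cluster.
    static_assert(9 * 4 == GemmKPerBlock && 1 * 16 == GemmMPerBlock, "A tiles the block");

    // B: Sequence<4, 1> slices x Sequence<9, 64> cluster.
    static_assert(4 * 9 == GemmKPerBlock && 1 * 64 == GemmNPerBlock, "B tiles the block");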
...
@@ -779,7 +779,7 @@ int main(int argc, char* argv[])
 #if 1
     // LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
     // LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
-    // LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
+    LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
     LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
 #endif
     }
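Uncommenting the host-side dump prints the reference result next to the device result for manual inspection. Past toy sizes, an element-wise check is less error-prone than eyeballing two printed ranges; a minimal sketch (LogRange and the mData layout are the project's, everything else here is illustrative):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    // Report the first element where host (reference) and device outputs
    // disagree beyond a tolerance.
    bool check_outputs(const std::vector<float>& host,
                       const std::vector<float>& device,
                       float tol = 1e-5f)
    {
        if(host.size() != device.size())
            return false;

        for(std::size_t i = 0; i < host.size(); ++i)
            if(std::fabs(host[i] - device[i]) > tol)
            {
                std::printf("mismatch at %zu: host %f vs device %f\n",
                            i, double(host[i]), double(device[i]));
                return false;
            }

        return true;
    }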
......