Commit 6e59255a authored by root

debug

parent 7d0a5412
@@ -33,7 +33,6 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
};
index_t mMyThreadOffsetA;
index_t mMyThreadOffsetB;
__device__ BlockwiseGemm_km_kn_m0m1n0n1_v3()
{
@@ -68,8 +67,6 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
mMyThreadOffsetA = BlockMatrixA{}.CalculateOffset(make_tuple(0, c_thread_mtx_index.k));
mMyThreadOffsetB = BlockMatrixB{}.CalculateOffset(
make_tuple(0, 0, c_thread_mtx_index.h, c_thread_mtx_index.w));
}
__device__ static constexpr auto GetThreadMatrixCLengths()
@@ -109,38 +106,6 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
}
};
template <typename SrcDesc,
typename DstDesc,
index_t NSliceCYX,
index_t NSliceH,
index_t NSliceW,
index_t DataPerAccess>
struct ThreadwiseSliceCopy_b
{
template <typename Data>
__device__ static void Run(const Data* p_src, Data* p_dst)
{
static_assert(SrcDesc::IsKnownAtCompileTime() && DstDesc::IsKnownAtCompileTime(),
"wrong! Desc should be known at compile-time");
using vector_t = typename vector_type<Data, DataPerAccess>::type;
static_for<0, NSliceCYX, 1>{}([&](auto i) {
static_for<0, NSliceH, 1>{}([&](auto j) {
static_for<0, NSliceW, 1>{}([&](auto k) {
constexpr auto src_offset =
SrcDesc{}.CalculateOffset(make_tuple(i, 0, j, k));
constexpr auto dst_offset =
DstDesc{}.CalculateOffset(make_tuple(i, 0, j, k));
*reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
*reinterpret_cast<const vector_t*>(&p_src[src_offset]);
});
});
});
}
};
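The ThreadwiseSliceCopy_b shown above is a fully unrolled, vectorized register copy over a (CYX, 1, H, W) slice. Below is a minimal host-side sketch of the same access pattern; the packed offsets and the memcpy stand-in for the vector_t reinterpret_cast are illustrative assumptions, not the library's descriptor machinery.

// Sketch of ThreadwiseSliceCopy_b::Run with plain loops and a packed layout.
#include <array>
#include <cstdio>
#include <cstring>

template <int NSliceCYX, int NSliceH, int NSliceW, int DataPerAccess>
void threadwise_slice_copy_sketch(const float* p_src, float* p_dst)
{
    for(int i = 0; i < NSliceCYX; ++i)
        for(int j = 0; j < NSliceH; ++j)
            for(int k = 0; k < NSliceW; ++k)
            {
                // packed offset of element (i, 0, j, k), scaled by the vector width
                const int offset = ((i * NSliceH + j) * NSliceW + k) * DataPerAccess;
                // stand-in for the vector_t load/store in the original
                std::memcpy(p_dst + offset, p_src + offset, DataPerAccess * sizeof(float));
            }
}

int main()
{
    std::array<float, 2 * 1 * 2 * 2> src{}, dst{};
    for(std::size_t n = 0; n < src.size(); ++n)
        src[n] = static_cast<float>(n);
    threadwise_slice_copy_sketch<2, 2, 2, 1>(src.data(), dst.data());
    std::printf("dst[7] = %.1f\n", dst[7]); // expect 7.0
}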
template <typename FloatA, typename FloatB, typename FloatC>
__device__ void
Run_naive(const FloatA* p_a_block, const FloatB* p_b_thread, FloatC* p_c_thread) const
@@ -160,7 +125,8 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
make_tuple(Number<CYXPerThreadLoop>{}, Number<KPerThread>{}));
constexpr auto b_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<1>{}, Number<1>{}));
// make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}, Number<1>{}, Number<1>{}));
make_tuple(Number<CYXPerThreadLoop>{}, Number<1>{}));
constexpr auto c_thread_mtx = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<KPerThread>{}, Number<1>{}));
@@ -183,7 +149,7 @@ struct BlockwiseGemm_km_kn_m0m1n0n1_v3
mMyThreadOffsetA,
p_a_thread + a_thread_mtx.CalculateOffset(make_tuple(0, 0)));
// threadwise_gemm.Run(p_a_thread, p_b_thread, p_c_thread);
threadwise_gemm.Run(p_a_thread, p_b_thread + CYXPerThreadLoop * cyx_begin, p_c_thread);
}
}
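In the changed Run_naive call above, B stays in per-thread registers and is walked in CYXPerThreadLoop chunks (p_b_thread + CYXPerThreadLoop * cyx_begin). The host-side sketch below shows the accumulation one chunk performs, assuming a packed (CYX, K) layout for the A thread tile and a per-thread B/C column; the loop structure is illustrative, not the threadwise_gemm implementation.

#include <cstdio>

// c[k] += sum over cyx of a[cyx][k] * b[cyx], for one CYXPerThreadLoop chunk
template <int CYXPerThreadLoop, int KPerThread>
void threadwise_gemm_chunk_sketch(const float* a_thread, // (CYXPerThreadLoop, KPerThread), packed
                                  const float* b_thread, // (CYXPerThreadLoop)
                                  float* c_thread)       // (KPerThread)
{
    for(int cyx = 0; cyx < CYXPerThreadLoop; ++cyx)
        for(int k = 0; k < KPerThread; ++k)
            c_thread[k] += a_thread[cyx * KPerThread + k] * b_thread[cyx];
}

int main()
{
    constexpr int CYXPerThreadLoop = 2, KPerThread = 4;
    float a[CYXPerThreadLoop * KPerThread], b[CYXPerThreadLoop], c[KPerThread] = {};
    for(int i = 0; i < CYXPerThreadLoop * KPerThread; ++i) a[i] = 1.0f;
    for(int i = 0; i < CYXPerThreadLoop; ++i) b[i] = 2.0f;
    threadwise_gemm_chunk_sketch<CYXPerThreadLoop, KPerThread>(a, b, c);
    std::printf("c[0] = %.1f\n", c[0]); // expect 4.0
}

In the actual loop, each iteration of cyx_begin advances the B pointer by CYXPerThreadLoop, which is exactly the offset added in the Run call above.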
......
@@ -56,32 +56,19 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
{
__host__ __device__ static constexpr index_t GetSharedMemoryNumberOfByte()
{
constexpr auto max_lds_align = math::lcm(Number<ABlockTransferDstScalarPerVector_M>{},
Number<BBlockTransferDstScalarPerVector_N>{},
Number<KPerThread>{},
Number<HWPerThread>{});
static_assert(CYXPerBlock == 4 && HWPerBlock == 64 && KPerBlock == 16, "");
constexpr auto max_lds_align =
math::lcm(Number<ABlockTransferDstScalarPerVector_M>{}, Number<KPerThread>{});
// A matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto a_cyx_k_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
make_tuple(Number<CYXPerBlock>{}, Number<KPerBlock>{}), max_lds_align);
// B matrix in LDS memory, dst of blockwise copy
// be careful of LDS alignment
constexpr auto b_cyx_n_h_w_block_desc = make_dynamic_naive_tensor_descriptor_aligned_v2(
make_tuple(Number<CYXPerBlock>{}, Number<1>{}, Number<8>{}, Number<8>{}),
max_lds_align);
// LDS allocation for A and B: be careful of alignment
constexpr auto a_block_space_size =
math::integer_least_multiple(a_cyx_k_block_desc.GetElementSpaceSize(), max_lds_align);
constexpr auto b_block_space_size = math::integer_least_multiple(
b_cyx_n_h_w_block_desc.GetElementSpaceSize(), max_lds_align);
return 2 * (a_block_space_size + b_block_space_size) * sizeof(Float);
return 2 * (a_block_space_size) * sizeof(Float);
}
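With B now staged through VGPRs, only the double-buffered A tile is charged to LDS. A quick constexpr check of the returned byte count under the asserted tile sizes, assuming Float = float, KPerThread = 16 and ABlockTransferDstScalarPerVector_M = 1; the helpers below are simplified stand-ins for math::lcm and math::integer_least_multiple, and the padding behaviour of the aligned descriptor is an assumption.

constexpr int gcd_(int a, int b) { return b == 0 ? a : gcd_(b, a % b); }
constexpr int lcm_(int a, int b) { return a / gcd_(a, b) * b; }
constexpr int least_multiple_(int x, int m) { return (x + m - 1) / m * m; }

constexpr int CYXPerBlock = 4, KPerBlock = 16;           // asserted above
constexpr int KPerThread = 16, DstScalarPerVector_M = 1; // assumed values
constexpr int max_lds_align = lcm_(DstScalarPerVector_M, KPerThread);                  // 16
constexpr int a_block_elems = CYXPerBlock * least_multiple_(KPerBlock, max_lds_align); // 4 * 16 = 64
constexpr int a_block_space = least_multiple_(a_block_elems, max_lds_align);           // 64
static_assert(2 * a_block_space * sizeof(float) == 512, "expect 512 bytes of LDS per workgroup");

int main() {}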
template <bool HasMainKBlockLoop, bool HasDoubleTailKBlockLoop>
@@ -180,17 +167,17 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
#if 1
constexpr auto b_cyx_n_h_w_thread_desc = make_dynamic_naive_tensor_descriptor_packed_v2(
make_tuple(Number<CYXPerThread>{}, Number<1>{}, Number<1>{}, Number<1>{}));
make_tuple(Number<CYXPerBlock>{}, Number<1>{}, Number<1>{}, Number<1>{}));
const index_t h_thread_id = get_thread_local_1d_id() / 8;
const index_t w_thread_id = get_thread_local_1d_id() % 8;
auto b_threadwise_transfer = ThreadwiseDynamicTensorSliceTransfer_v2<
using ThreadwiseTensorSliceTransferB = ThreadwiseDynamicTensorSliceTransfer_v2<
Float,
Float,
decltype(b_cyx_n_h_w_global_desc),
decltype(b_cyx_n_h_w_thread_desc),
Sequence<CYXPerThread, 1, 1, 1>,
Sequence<CYXPerBlock, 1, 1, 1>,
Sequence<3, 2, 0, 1>, // BBlockTransferSrcAccessOrder,
3, // BBlockTransferSrcVectorDim,
1, // BBlockTransferSrcScalarPerVector,
@@ -198,7 +185,9 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
AddressSpace::Vgpr,
InMemoryDataOperation::Set,
1,
true>(
true>;
ThreadwiseTensorSliceTransferB b_threadwise_transfer(
b_cyx_n_h_w_global_desc,
make_multi_index(
0, 0, h_block_data_on_global + h_thread_id, w_block_data_on_global + w_thread_id));
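The B transfer is now built per thread from a (cyx, n, h, w) start coordinate derived from the 1-D thread id. A small host sketch of that mapping for the 64 threads of a block onto an 8x8 output tile; the block origins below are example values, not taken from the kernel.

#include <cstdio>

int main()
{
    const int h_block_data_on_global = 0; // example block origin in H
    const int w_block_data_on_global = 8; // example block origin in W
    for(int tid = 0; tid < 64; ++tid)
    {
        const int h_thread_id = tid / 8;
        const int w_thread_id = tid % 8;
        if(tid < 3 || tid == 63) // print a few representative threads
            std::printf("tid %2d -> start (cyx=0, n=0, h=%d, w=%d)\n",
                        tid,
                        h_block_data_on_global + h_thread_id,
                        w_block_data_on_global + w_thread_id);
    }
}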
@@ -234,7 +223,6 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
b_cyx_n_h_w_block_desc.GetElementSpaceSize(), max_lds_align);
Float* p_a_block_double = p_shared_block;
Float* p_b_block_double = p_shared_block + 2 * a_block_space_size;
// register allocation for output
AccFloat p_c_thread[c_k_n_h_w_thread_desc.GetElementSpaceSize()];
@@ -279,16 +267,16 @@ struct GridwiseDynamicGemm_km_kn_mn_v2
a_blockwise_copy.RunWrite(a_cyx_k_block_desc, p_a_block_double);
__syncthreads();
blockwise_gemm.Run(p_a_block_double, p_b_thread, p_c_thread);
}
#if 0
if constexpr(HasMainKBlockLoop)
{
Float* p_a_block_even = p_a_block_double;
Float* p_b_block_even = p_b_block_double;
Float* p_a_block_odd = p_a_block_double + a_block_space_size;
Float* p_b_block_odd = p_b_block_double + b_block_space_size;
index_t b_block_data_begin = 0;
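The main loop guarded by #if 0 above is the usual LDS double-buffering scheme: while the GEMM consumes the "even" tile, the next tile is fetched into the "odd" one, and the pointers swap each iteration. Below is a schematic host version under that assumption; load_tile and compute_tile are hypothetical stand-ins for the blockwise copy and blockwise GEMM, and the host code is sequential rather than overlapped.

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

void load_tile(const float* src, float* dst, int n) { std::copy(src, src + n, dst); }
float compute_tile(const float* tile, int n)
{
    float acc = 0.f;
    for(int i = 0; i < n; ++i) acc += tile[i];
    return acc;
}

int main()
{
    constexpr int tile_size = 64, num_tiles = 4;
    std::vector<float> global(tile_size * num_tiles, 1.0f);
    std::vector<float> lds(2 * tile_size); // "even" and "odd" halves, like p_a_block_double
    float* p_even = lds.data();
    float* p_odd  = lds.data() + tile_size;

    float result = 0.f;
    load_tile(global.data(), p_even, tile_size); // prologue: fill the even buffer
    for(int t = 0; t < num_tiles; ++t)
    {
        if(t + 1 < num_tiles) // prefetch the next tile into the other buffer ...
            load_tile(global.data() + (t + 1) * tile_size, p_odd, tile_size);
        result += compute_tile(p_even, tile_size); // ... while consuming the current one
        std::swap(p_even, p_odd);                  // ping-pong for the next iteration
    }
    std::printf("result = %.1f\n", result); // expect 256.0
}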
......
@@ -72,7 +72,7 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmMPerBlock = 16;
constexpr index_t GemmNPerBlock = 64;
constexpr index_t GemmKPerBlock = 4;
constexpr index_t GemmKPerBlock = 4 * 3 * 3;
constexpr index_t GemmMPerThread = 16;
constexpr index_t GemmNPerThread = 1;
@@ -83,14 +83,14 @@ void device_dynamic_convolution_forward_implicit_gemm_v5r1_nchw_kcyx_nkhw(InDesc
constexpr index_t GemmMLevel1Cluster = 1;
constexpr index_t GemmNLevel1Cluster = 64;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<1, 1>;
using GemmABlockTransferThreadSliceLengths_GemmK_GemmM = Sequence<9, 1>;
using GemmABlockTransferThreadClusterLengths_GemmK_GemmM = Sequence<4, 16>;
constexpr index_t GemmABlockTransferSrcScalarPerVector_GemmK = 1;
constexpr index_t GemmABlockTransferDstScalarPerVector_GemmM = 1;
using GemmBBlockTransferThreadSliceLengths_GemmK_GemmN = Sequence<4, 1>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<1, 64>;
using GemmBBlockTransferThreadClusterLengths_GemmK_GemmN = Sequence<9, 64>;
constexpr index_t GemmBBlockTransferSrcScalarPerVector_GemmN = 1;
constexpr index_t GemmBBlockTransferDstScalarPerVector_GemmN = 1;
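The retuned parameters keep the block tile consistent: per-thread slice lengths times thread-cluster lengths must still cover GemmK x GemmM for A and GemmK x GemmN for B. A compile-time check of the new numbers, as a sketch only; the kernel enforces equivalent constraints through its descriptor types.

constexpr int GemmMPerBlock = 16, GemmNPerBlock = 64, GemmKPerBlock = 4 * 3 * 3;
// A: ThreadSliceLengths<9, 1> x ThreadClusterLengths<4, 16>
static_assert(9 * 4 == GemmKPerBlock && 1 * 16 == GemmMPerBlock, "A block tile mismatch");
// B: ThreadSliceLengths<4, 1> x ThreadClusterLengths<9, 64>
static_assert(4 * 9 == GemmKPerBlock && 1 * 64 == GemmNPerBlock, "B block tile mismatch");
int main() {}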
......
@@ -779,7 +779,7 @@ int main(int argc, char* argv[])
#if 1
// LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
// LogRange(std::cout << "wei_kcyx: ", wei_kcyx.mData, ",") << std::endl;
// LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_host : ", out_nkhw_host.mData, ",") << std::endl;
LogRange(std::cout << "out_nkhw_device: ", out_nkhw_device.mData, ",") << std::endl;
#endif
}
......