Commit 701b7341 authored by Chao Liu

clean up

parent bc9ea646
@@ -155,13 +155,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
static_assert(in_e_n1_b_n2_block_desc.GetStride(I1) % GemmDataPerReadB == 0,
"GemmDataPerReadB alignment requirement is not satisfied");
#if 0
#if 1
// input blockwise copy
// slice a merged tensor, reorder and copy to a normal tensor
// this copy operator already has blockwise offset built-in
auto blockwise_in_copy =
BlockwiseGenericTensorSliceCopy_v1<BlockSize,
Float,
decltype(in_e_n1_b_n2_global_merged_desc),
decltype(in_e_n1_b_n2_block_desc),
decltype(in_e_n1_b_n2_block_desc.GetLengths()),
@@ -199,13 +198,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Sequence<EPerBlock, KPerBlock>{},
Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
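The lcm above chooses the smallest alignment that serves both the copy's vectorized LDS write (WeiBlockCopyDstDataPerWrite_K) and the GEMM's vectorized LDS read (GemmDataPerReadA). A minimal standalone sketch of the same idea, with made-up widths:

#include <numeric> // std::lcm (C++17)

// Writing 2 elements per store and reading 4 per load forces the block
// descriptor's padded stride to a multiple of lcm(2, 4) = 4, so both access
// patterns remain aligned. (Widths are hypothetical.)
constexpr int dst_data_per_write_k = 2;
constexpr int gemm_data_per_read_a = 4;
static_assert(std::lcm(dst_data_per_write_k, gemm_data_per_read_a) == 4, "");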
#if 0
#if 1
// operator for blockwise copy of weight into LDS
// slice a tensor, and copy it into another tensor
// this copy operator already has blockwise offset built-in
auto blockwise_wei_copy =
BlockwiseGenericTensorSliceCopy_v1<BlockSize,
Float,
decltype(wei_e_k_global_desc),
decltype(wei_e_k_block_desc),
decltype(wei_e_k_block_desc.GetLengths()),
@@ -324,7 +322,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
#if 0
#if 1
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
// blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{},
// True);
@@ -356,7 +354,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
Float p_in_register_buffer[blockwise_in_copy.GetRegisterBufferSize()];
Float p_wei_register_buffer[blockwise_wei_copy.GetRegisterBufferSize()];
#if 0
#if 1
blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
// blockwise_wei_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
p_wei_block_on_global += EPerBlock * wei_e_k_global_desc.GetStride(I0);
@@ -436,19 +434,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
out_k_n1_b_n2_global_merged_desc.GetOffsetFromMultiIndex(
k_thread_data_on_global, 0, b_thread_data_on_global, 0);
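GetOffsetFromMultiIndex is the usual strided linearization, offset = sum over i of idx[i] * stride[i]. A minimal standalone sketch, independent of the real descriptor classes (array size and names are illustrative):

#include <array>
#include <cstddef>

// Linearize a 4D multi-index (mirroring (k, n1, b, n2) above) against strides.
constexpr std::size_t offset_from_multi_index(const std::array<std::size_t, 4>& idx,
                                              const std::array<std::size_t, 4>& stride)
{
    std::size_t offset = 0;
    for(std::size_t i = 0; i < 4; ++i)
        offset += idx[i] * stride[i];
    return offset;
}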
#if 0
ThreadwiseGenericTensorSliceCopy_v1r1<
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc.GetLengths()),
arithmetic_sequence_gen<0, 8, 1>::type,
arithmetic_sequence_gen<0, 8, 1>::type,
7,
7,
1,
1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
.Run(p_out_thread, p_out_thread_on_global);
#elif 0
#if 1
ThreadwiseGenericTensorSliceCopy_v1r2<
decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
decltype(out_n0_n1_n2_k0_k1_k2_h_w_global_mem_desc),
......
@@ -237,37 +237,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
thread_buffer_desc.GetOffsetFromMultiIndex(buffer_data_id_begin);
#endif
// Position the origin of the per-thread window at the point where the
// multi-index of the SrcDesc (which might be a merged tensor) is all-zero.
// This threadwise slice copy assumes each thread copies a normal (not merged)
// tensor. To satisfy this assumption, the user needs to make sure that, on a
// merged dimension that contains multiple original dimensions, the length of
// the last original dimension is evenly divisible by its sub-length. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are
// performed in the constructor of BlockwiseGenericTensorSliceCopy_v1.
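A concrete (made-up) instance of that constraint:

// Suppose merged dimension B folds the original dimensions (N, Ho, Wo) and
// Wo == 6 is the last of them. A per-thread sub-length of 3 on B is legal
// (6 % 3 == 0); a sub-length of 4 would straddle the Wo boundary and the
// constructor's sanity check would reject it. (Numbers are hypothetical.)
constexpr int last_original_dim_length = 6; // Wo
constexpr int sub_length_on_merged_dim = 3;
static_assert(last_original_dim_length % sub_length_on_merged_dim == 0,
              "sub-slice straddles an original-dimension boundary");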
#if 0
threadwise_generic_tensor_slice_copy_v1(SrcDesc{},
p_src + src_offset + mThreadSrcOffset,
make_zero_array<index_t, nDim>(),
thread_buffer_desc,
p_buffer + buffer_offset,
make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths,
SrcDimAccessOrder{},
Number<SrcDataPerAccess>{});
#elif 1
ThreadwiseGenericTensorSliceCopy_v1r1<
SrcDesc,
decltype(thread_buffer_desc),
SubLengths,
SrcDimAccessOrder,
typename arithmetic_sequence_gen<0, nDim, 1>::type,
SrcVectorAccessDim,
0,
SrcDataPerAccess,
1>(make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
.Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
#elif 1
// Position the origin of the per-thread window at the point where the
// multi-index of the SrcDesc (which might be a merged tensor) is all-zero.
// This threadwise slice copy assumes each thread copies a normal (not merged)
// tensor. To satisfy this assumption, the user needs to make sure that, on a
// merged dimension that contains multiple original dimensions, the length of
// the last original dimension is evenly divisible by its sub-length. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are
// performed in the constructor of BlockwiseGenericTensorSliceCopy_v1.
ThreadwiseGenericTensorSliceCopy_v1r2<SrcDesc,
decltype(thread_buffer_desc),
SubLengths,
@@ -277,7 +254,6 @@ struct BlockwiseGenericTensorSliceCopy_v1
1>(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
.Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
#endif
});
}
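For context, the enclosing "lds_double_buffer" kernel drives this blockwise copy in a software-pipelined loop. A schematic sketch of that pattern follows; the RunLoadRegisterBuffer/RunStoreRegisterBuffer names are assumptions consistent with the GetRegisterBufferSize naming above, and the buffer/GEMM variables are placeholders, not verbatim kernel code:

// Double-buffer pipeline: prefetch slice i+1 into registers, compute on the
// slice already resident in LDS, then commit the registers to the other LDS
// buffer before swapping.
for(index_t e = 0; e < E; e += 2 * EPerBlock)
{
    // prefetch the next input slice from global memory into registers
    blockwise_in_copy.MoveSlicingWindowOnSourceTensor(I0, Number<EPerBlock>{}, True);
    blockwise_in_copy.RunLoadRegisterBuffer(p_in_global, p_in_register_buffer);

    blockwise_gemm.Run(p_wei_block_0, p_in_block_0, p_out_thread); // compute current

    // make the prefetched slice visible in LDS for the next iteration
    blockwise_in_copy.RunStoreRegisterBuffer(p_in_register_buffer, p_in_block_1);
    __syncthreads();
    // ... then the same three steps with the two LDS buffers swapped ...
}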
@@ -316,48 +292,23 @@ struct BlockwiseGenericTensorSliceCopy_v1
const index_t dst_offset = DstDesc::GetOffsetFromMultiIndex(dst_data_id_begin);
#endif
// Position the origin of the per-thread window at the point where the
// multi-index of the DstDesc (which might be a merged tensor) is all-zero.
// This threadwise slice copy assumes each thread copies a normal (not merged)
// tensor. To satisfy this assumption, the user needs to make sure that, on a
// merged dimension that contains multiple original dimensions, the length of
// the last original dimension is evenly divisible by its sub-length. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are
// performed in the constructor of BlockwiseGenericTensorSliceCopy_v1.
#if 0
threadwise_generic_tensor_slice_copy_v1(thread_buffer_desc,
p_buffer + buffer_offset,
make_zero_array<index_t, nDim>(),
DstDesc{},
p_dst + dst_offset + mThreadDstOffset,
make_zero_array<index_t, nDim>(),
thread_sub_tensor_lengths,
DstDimAccessOrder{},
Number<DstDataPerAccess>{});
#elif 1
ThreadwiseGenericTensorSliceCopy_v1r1<
decltype(thread_buffer_desc),
DstDesc,
SubLengths,
typename arithmetic_sequence_gen<0, nDim, 1>::type,
DstDimAccessOrder,
0,
DstVectorAccessDim,
1,
DstDataPerAccess>(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
.Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
#elif 1
// Position the origin of the per-thread window at the point where the
// multi-index of the DstDesc (which might be a merged tensor) is all-zero.
// This threadwise slice copy assumes each thread copies a normal (not merged)
// tensor. To satisfy this assumption, the user needs to make sure that, on a
// merged dimension that contains multiple original dimensions, the length of
// the last original dimension is evenly divisible by its sub-length. Also, the
// repeat-length on the merged dimension needs to be 1. These sanity checks are
// performed in the constructor of BlockwiseGenericTensorSliceCopy_v1.
ThreadwiseGenericTensorSliceCopy_v1r2<decltype(thread_buffer_desc),
DstDesc,
SubLengths,
DstDimAccessOrder,
DstVectorAccessDim,
1,
DstDataPerAccess>(make_zero_array<index_t, nDim>(),
make_zero_array<index_t, nDim>())
DstDataPerAccess>(
make_zero_array<index_t, nDim>(), make_zero_array<index_t, nDim>())
.Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
#endif
});
}
......
@@ -6,10 +6,6 @@
#include "ConstantMergedTensorDescriptor.hpp"
#include "tensor_coordinate.hpp"
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
#endif
#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0
#endif
@@ -24,100 +20,6 @@
namespace ck {
// the user needs to make sure the alignment requirement is satisfied when setting DataPerAccess > 1
template <class Float,
class SrcDesc,
class DstDesc,
class SliceLengths,
class DimAccessOrder,
index_t DataPerAccess>
__device__ void threadwise_generic_tensor_slice_copy_v1(
SrcDesc,
const Float* __restrict__ p_src,
Array<index_t, SrcDesc::GetNumOfDimension()> src_multi_id_begin,
DstDesc,
Float* __restrict__ p_dst,
Array<index_t, DstDesc::GetNumOfDimension()> dst_multi_id_begin,
SliceLengths,
DimAccessOrder,
Number<DataPerAccess>)
{
constexpr index_t nDim = SrcDesc::GetNumOfDimension();
static_assert(nDim == SrcDesc::GetNumOfDimension() && nDim == DstDesc::GetNumOfDimension() &&
nDim == SliceLengths::GetSize() && nDim == DimAccessOrder::GetSize(),
"wrong! # of dimensions not the same");
static_assert(is_valid_sequence_map<DimAccessOrder>::value, "wrong! map is not valid");
// TODO: do more sanity checks here, something like:
// constexpr auto src_strides_in_access_order =
//     SrcDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
// constexpr auto dst_strides_in_access_order =
//     DstDesc::ReorderGivenNew2Old(DimAccessOrder{}).GetStride(Number<nDim-1>{});
// // check src/dst stride on the lowest access dimension
// static_assert((DataPerAccess == 1 || src_strides_in_access_order.Back() == 1) &&
//                   (DataPerAccess == 1 || dst_strides_in_access_order.Back() == 1),
//               "wrong! src/dst stride on the lowest access dimension needs to be 1 for "
//               "vectorized read/write");
constexpr auto slice_lengths_in_access_order =
SliceLengths::ReorderGivenNew2Old(DimAccessOrder{});
// check slice length on the lowest access dimension
static_assert(slice_lengths_in_access_order.Back() % DataPerAccess == 0,
              "wrong! slice length on the lowest access dimension must be evenly "
              "divisible by DataPerAccess");
constexpr index_t num_access_on_lowest_access_dimension =
slice_lengths_in_access_order.Back() / DataPerAccess;
constexpr auto access_lengths = slice_lengths_in_access_order.Modify(
Number<nDim - 1>{}, Number<num_access_on_lowest_access_dimension>{});
using vector_t = typename vector_type<Float, DataPerAccess>::MemoryType;
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1
static_ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
constexpr index_t itmp = access_multi_id.Back() * DataPerAccess;
constexpr auto data_multi_id_in_access_order =
access_multi_id.Modify(Number<nDim - 1>{}, Number<itmp>{});
constexpr auto data_multi_id =
data_multi_id_in_access_order.ReorderGivenOld2New(DimAccessOrder{});
const index_t src_index =
SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
const index_t dst_index =
DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
});
#else
ford<decltype(access_lengths)>{}([&](auto access_multi_id) {
auto data_multi_id_in_access_order = access_multi_id;
data_multi_id_in_access_order(nDim - 1) = access_multi_id[nDim - 1] * DataPerAccess;
const auto data_multi_id =
reorder_array_given_old2new(data_multi_id_in_access_order, DimAccessOrder{});
const index_t src_index =
SrcDesc::GetOffsetFromMultiIndex(src_multi_id_begin + data_multi_id);
const index_t dst_index =
DstDesc::GetOffsetFromMultiIndex(dst_multi_id_begin + data_multi_id);
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
});
#endif
}
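A hedged sketch of a call to this helper, using the same argument shape as the (removed) blockwise caller above; the 2D descriptors and pointers are placeholders:

// Copy a 4x8 slice starting at the origin of both tensors, iterating
// dimension 1 innermost and moving 4 elements per access (e.g. one float4).
threadwise_generic_tensor_slice_copy_v1(src_desc, // some SrcDesc instance
                                        p_src,
                                        make_zero_array<index_t, 2>(),
                                        dst_desc, // some DstDesc instance
                                        p_dst,
                                        make_zero_array<index_t, 2>(),
                                        Sequence<4, 8>{}, // SliceLengths
                                        Sequence<0, 1>{}, // DimAccessOrder
                                        Number<4>{});     // DataPerAccess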
// This threadwise copy allows vector access of both src and dst.
// The dimension used for vector access can differ between src and dst,
// and so can the vector size.
......
@@ -379,7 +379,7 @@ int main(int argc, char* argv[])
#elif 0
device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw(
in_nchw_desc, in_nchw, wei_kcyx_desc, wei_kcyx, out_nkhw_desc, out_nkhw_device, nrepeat);
#elif 0
#elif 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......