"git@developer.sourcefind.cn:gaoqiong/composable_kernel.git" did not exist on "18ffbd680273c5970ff1d105c1ee6fca99e0df88"
Commit 4904281f authored by root's avatar root
Browse files

try vector-load for 13x13 image

parent 7d09790a
......@@ -97,6 +97,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
p_src_long_vector[i] = 0;
}
#if 0 //original code
// load data from src to the long-vector buffer
for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i)
{
......@@ -120,6 +121,53 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
}
}
#else //try vector load
// load data from src to the long-vector buffer
index_t i = 0;
while(i<long_vector_size){
auto scalar_id = make_zero_array<index_t, nDim>();
scalar_id(vector_access_dim) = i;
const index_t buffer_offset = i;
const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
scalar_id(vector_access_dim) = i + 3;
const auto src_coord3 = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
scalar_id(vector_access_dim) = i + 1;
const auto src_coord1 = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id);
if(((long_vector_size-i)>=4)&&src_coord.IsOffsetValidAssumingUpperIndexIsValid()&&src_coord3.IsOffsetValidAssumingUpperIndexIsValid()&&(src_coord.GetOffset()+3==src_coord3.GetOffset())){
transfer_data<SrcData,
4,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set>(
p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
i=i+4;
}else if(((long_vector_size-i)>=2)&&src_coord.IsOffsetValidAssumingUpperIndexIsValid()&&src_coord1.IsOffsetValidAssumingUpperIndexIsValid()&&((src_coord.GetOffset()+1)==src_coord1.GetOffset())){
transfer_data<SrcData,
2,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set>(
p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
i=i+2;
}else {
// Check src data's valid mapping situation, only check the first data in this src
// vector. It's user's responsiblity to make sure all data in the src vector
// has the valid/invalid mapping situation
if(src_coord.IsOffsetValidAssumingUpperIndexIsValid())
{
transfer_data<SrcData,
1,
SrcAddressSpace,
AddressSpace::Vgpr,
InMemoryDataOperation::Set>(
p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset);
}
i++;
}
}
#endif
// SrcData to DstData conversion
DstData p_dst_long_vector[long_vector_size];
......
......@@ -147,7 +147,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 4;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 4;
#elif 1
#elif 0
// BlockSize = 256, GemmKPerBlock = 16
// for 1x1 filter, vector-read-b = 4
constexpr index_t BlockSize = 256;
......@@ -179,7 +179,7 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 4;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 4;
#elif 1
#elif 0
// 1x1 filter, 14x14 image
constexpr index_t BlockSize = 256;
......@@ -210,6 +210,81 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 2;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 = 2;
#elif 0
// 1x1 filter, 7x7 image
constexpr index_t BlockSize = 128;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock = 64;
constexpr index_t GemmKPerBlock = 16;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t ThreadGemmDataPerReadM = 4;
constexpr index_t ThreadGemmDataPerReadN = 4;
using GemmABlockCopyThreadSliceLengths_GemmK_GemmM = Sequence<4, 4>;//4>;
using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<4, 32>;//32>;
/*
constexpr auto long_vector_access_lengths = SliceLengths::Modify(
vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); //jane:该维度需要vector-access多少次
*/
//using slicelength = Sequence<4, 32>;
//auto long_vector_access_lengths =
constexpr index_t GemmABlockCopySrcDataPerRead_GemmK = 4;
constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 4;
using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<4, 2>;//;Sequence<8, 1>;
using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<4, 32>;//Sequence<2, 64>;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 2;
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 =1;
#elif 1
// 1x1 filter, 13x13 image
constexpr index_t BlockSize = 128;
constexpr index_t GemmMPerBlock = 128;
constexpr index_t GemmNPerBlock =64;
constexpr index_t GemmKPerBlock = 16;
constexpr index_t GemmMPerThreadSubC = 4;
constexpr index_t GemmNPerThreadSubC = 4;
constexpr index_t GemmMLevel0Cluster = 4;
constexpr index_t GemmNLevel0Cluster = 4;
constexpr index_t GemmMLevel1Cluster = 4;
constexpr index_t GemmNLevel1Cluster = 2;
constexpr index_t GemmKPerThreadLoop = 1;
constexpr index_t ThreadGemmDataPerReadM = 4; //这个不起作用
constexpr index_t ThreadGemmDataPerReadN = 4; //这个不起作用
using GemmABlockCopyThreadSliceLengths_GemmK_GemmM = Sequence<4, 4>;//4>;
using GemmABlockCopyThreadClusterLengths_GemmK_GemmM = Sequence<4, 32>;//32>;
constexpr index_t GemmABlockCopySrcDataPerRead_GemmK = 4;
constexpr index_t GemmABlockCopyDstDataPerWrite_GemmM = 4;
#if 0 //vector load x1
using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<8, 1>;//;Sequence<8, 1>;
using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<2, 64>;//Sequence<2, 64>;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 1;
#elif 0 //vector load x2
using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<4, 2>;//;Sequence<8, 1>;
using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<4, 32>;//Sequence<2, 64>;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 2;
#elif 1 //vector load x4
using GemmBBlockCopyThreadSliceLengths_GemmK_GemmN = Sequence<2, 4>;//;Sequence<8, 1>;
using GemmBBlockCopyThreadClusterLengths_GemmK_GemmN = Sequence<8, 16>;//Sequence<2, 64>;
constexpr index_t GemmBBlockCopySrcDataPerRead_GemmN = 4;
#endif
constexpr index_t GemmBBlockCopyDstDataPerWrite_GemmN = 1;
constexpr index_t GemmCThreadCopyDstDataPerWrite_GemmN1 =1;
#endif
constexpr index_t GemmM = K;
......@@ -237,11 +312,11 @@ void device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw(InDesc,
GemmKPerBlock,
GemmMPerThreadSubC,
GemmNPerThreadSubC,
GemmKPerThreadLoop,
GemmMLevel0Cluster,
GemmNLevel0Cluster,
GemmMLevel1Cluster,
GemmNLevel1Cluster,
GemmKPerThreadLoop,
ThreadGemmDataPerReadM,
ThreadGemmDataPerReadN,
GemmABlockCopyThreadSliceLengths_GemmK_GemmM,
......
......@@ -29,7 +29,38 @@ int main(int argc, char* argv[])
{
using namespace ck;
#if 1
#if 0
// 1x1 for vector memory access , 7x7 image size
constexpr index_t N = 128;
constexpr index_t C = 256;
constexpr index_t HI = 7;
constexpr index_t WI = 7;
constexpr index_t K = 256;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 1
// 1x1 for vector memory access, 13x13 image size
constexpr index_t N = 128;
constexpr index_t C = 256;//1024;
constexpr index_t HI = 13;
constexpr index_t WI = 13;
constexpr index_t K = 256;//2048;
constexpr index_t Y = 1;
constexpr index_t X = 1;
using ConvStrides = Sequence<1, 1>;
using ConvDilations = Sequence<1, 1>;
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 0
// 1x1
constexpr index_t N = 64;
constexpr index_t C = 64;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment