Commit ebe38f3d authored by Chao Liu

debugging

parent 9b280cc5
@@ -11,13 +11,6 @@ struct NativeDimension
__host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }
__host__ __device__ static constexpr auto GetStride() { return Number<Stride>{}; }
__host__ __device__ static constexpr index_t CalculateOffset(index_t i) { return i * Stride; }
__host__ __device__ static constexpr index_t CalculateOffsetDiff(index_t i_diff)
{
return i_diff * Stride;
}
};
} // namespace ck
......
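The removed CalculateOffsetDiff duplicated what CalculateOffset already does: a native dimension maps an index to an offset as i * Stride, so the offset change for an index step is CalculateOffset applied to that step. A minimal sketch of the equivalence, using a toy type rather than the ck class:

```cpp
// Toy strided dimension (assumption: offset is linear in the index, as in the
// NativeDimension above). A dedicated CalculateOffsetDiff helper adds nothing,
// because offset(i + di) - offset(i) == offset(di).
template <int Stride>
struct ToyStridedDim
{
    static constexpr int CalculateOffset(int i) { return i * Stride; }
};

int main()
{
    using Dim = ToyStridedDim<4>;

    constexpr int i  = 7;
    constexpr int di = 3;

    static_assert(Dim::CalculateOffset(i + di) - Dim::CalculateOffset(i) ==
                      Dim::CalculateOffset(di),
                  "offset difference of a step equals the offset of the step");
    return 0;
}
```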
@@ -77,8 +77,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
mThreadwiseLoad.template Run<BlockSrcData,
ThreadBufferData,
BlockSrcAddressSpace,
ThreadBufferAddressSpace>(p_block_src,
p_thread_buffer);
ThreadBufferAddressSpace>(p_block_src, p_thread_buffer);
#else // tweaking
mThreadwiseLoad.template Run_optimized_src_address_calculation<BlockSrcData,
ThreadBufferData,
......
@@ -483,8 +483,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
address_space_t ThreadBufferAddressSpace = address_space_t::generic>
__device__ void RunLoadThreadBuffer(const TData* p_block_src, TData* p_thread_buffer) const
{
mThreadwiseLoad.Run<TData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(p_block_src,
p_thread_buffer);
mThreadwiseLoad.template Run<TData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(
p_block_src, p_thread_buffer);
}
template <typename TData,
@@ -492,8 +492,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
address_space_t BlockDstAddressSpace = address_space_t::generic>
__device__ void RunStoreThreadBuffer(const TData* p_thread_buffer, TData* p_block_dst) const
{
mThreadwiseStore.Run<TData, ThreadBufferAddressSpace, BlockDstAddressSpace>(p_thread_buffer,
p_block_dst);
mThreadwiseStore.template Run<TData, ThreadBufferAddressSpace, BlockDstAddressSpace>(
p_thread_buffer, p_block_dst);
}
template <typename TData,
......
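Both RunLoadThreadBuffer and RunStoreThreadBuffer now spell the call as `.template Run<...>`. The members mThreadwiseLoad and mThreadwiseStore have types that depend on the enclosing class's template parameters, so without the `template` disambiguator the `<` after Run is parsed as a less-than operator. A minimal sketch of the rule with toy types (not the ck classes):

```cpp
// Toy loader with a member template, standing in for the threadwise copy.
struct Loader
{
    template <typename TData>
    void Run(const TData* /*src*/, TData* /*dst*/) const {}
};

template <typename LoaderType>
struct Copier
{
    LoaderType mThreadwiseLoad;

    template <typename TData>
    void RunLoad(const TData* src, TData* dst) const
    {
        // mThreadwiseLoad has a dependent type, so the explicit template
        // argument list must be introduced with the `template` keyword.
        mThreadwiseLoad.template Run<TData>(src, dst);
    }
};

int main()
{
    int src[4] = {0, 1, 2, 3};
    int dst[4] = {};
    Copier<Loader>{}.RunLoad(src, dst);
    return 0;
}
```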
@@ -130,7 +130,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
__buffer_load<SrcData, SrcDataPerAccess>(
p_src, 0, src_coord.GetOffset());
p_src, src_coord.GetOffset(), 0);
#else
*reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
*reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
@@ -172,8 +172,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
__buffer_store<DstData, DstDataPerAccess>(
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
p_dst,
0,
dst_coord.GetOffset());
dst_coord.GetOffset(),
0);
#else
*reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
*reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
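In both call sites the last two arguments of __buffer_load and __buffer_store were swapped, so the coordinate's offset is now passed before the constant 0. The intrinsic wrappers themselves are not shown in this diff, so their exact parameter meaning is an assumption here; the sketch below only illustrates, with a hypothetical wrapper, why two offsets of the same index type are easy to transpose without any compiler diagnostic:

```cpp
#include <cstdint>

using index_t = std::int32_t;

// Hypothetical stand-in for __buffer_load (NOT the real ck intrinsic): two
// index_t parameters accept either order, so a transposed call compiles
// cleanly and is only caught at runtime.
template <typename T>
T buffer_load_sketch(const T* p_src, index_t thread_offset, index_t const_offset)
{
    // In this toy the first offset is scaled per "lane" while the second is a
    // plain element offset, so swapping the arguments changes the address read.
    constexpr index_t lane_stride = 4;
    return p_src[thread_offset * lane_stride + const_offset];
}

int main()
{
    float data[16] = {};
    data[4] = 1.0f; // the value the intended call should read

    float intended   = buffer_load_sketch(data, 1, 0); // reads data[4]
    float transposed = buffer_load_sketch(data, 0, 1); // reads data[1] instead
    return (intended == 1.0f && transposed != 1.0f) ? 0 : 1;
}
```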
@@ -287,10 +287,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
const auto src_coord =
src_nonlinear_coord + (linear_dim_data_steps + scalar_id);
// this is src compile-time offset
// this is src compile-time offset
#if 0
// TODO: is this good implementation?
const index_t src_linear_offset =
src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
#else
const index_t src_linear_offset =
SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
#endif
// Check src vector's padding situation, only check the first data in
// this src vector. It's user's responsibility to make sure all data in
@@ -471,10 +476,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
const auto dst_coord =
dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);
// this is dst compile-time offset
// this is dst compile-time offset
#if 1
// TODO: is this good implementation?
const index_t dst_linear_offset =
dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
#else
const index_t dst_linear_offset =
DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
#endif
// Check dst vector's padding situation, only check the first data in
// this dst vector. It's user's responsibility to make sure all data in
......
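Both hunks keep two candidate ways of computing the linear part of the offset behind an #if: the difference of two coordinate offsets, or the descriptor's CalculateOffset applied directly to the step (the src path currently selects the direct form, the dst path the difference form). For dimensions whose index-to-offset map is linear the two agree, and the direct form can fold to a compile-time constant when the step is known at compile time. A minimal sketch of that equivalence with a toy two-dimensional descriptor (assumed plain strides, not the ck descriptor types):

```cpp
// Toy 2-D descriptor with fixed strides, standing in for SrcDesc/DstDesc.
struct ToyDesc2D
{
    static constexpr int stride0 = 16;
    static constexpr int stride1 = 1;

    static constexpr int CalculateOffset(int i0, int i1)
    {
        return i0 * stride0 + i1 * stride1;
    }
};

int main()
{
    // base plays the role of the non-linear coordinate, step the role of
    // linear_dim_data_steps + scalar_id.
    constexpr int base0 = 3, base1 = 5;
    constexpr int step0 = 1, step1 = 2;

    static_assert(ToyDesc2D::CalculateOffset(base0 + step0, base1 + step1) -
                          ToyDesc2D::CalculateOffset(base0, base1) ==
                      ToyDesc2D::CalculateOffset(step0, step1),
                  "difference form and direct form agree for a linear mapping");
    return 0;
}
```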
@@ -6,7 +6,7 @@
#define CK_UNSIGNED_INDEX_TYPE 0
#define CK_DEVICE_BACKEND_AMD 1
#define CK_USE_AMD_INTRINSIC 1
#define CK_USE_AMD_INTRINSIC 0
#define CK_USE_AMD_INLINE_ASM 1
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
......
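This flips CK_USE_AMD_INTRINSIC to 0 while CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE stays 1. Since the guard shown earlier is `#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE`, flipping either macro to 0 is enough to reroute the threadwise copy to the plain reinterpret_cast path. A minimal sketch of the selection (macro names taken from the diff; the surrounding function is a toy, not the ck kernel):

```cpp
#define CK_USE_AMD_INTRINSIC 0
#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1

// Mirrors the guard around the buffer_load/buffer_store calls: a logical AND,
// so a single 0 selects the fallback branch.
const char* selected_copy_path()
{
#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
    return "buffer_load/buffer_store intrinsics";
#else
    return "plain reinterpret_cast vector load/store";
#endif
}

int main() { return selected_copy_path()[0] == 'p' ? 0 : 1; }
```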
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<0, 0>;
using RightPads = Sequence<0, 0>;
#elif 1
#elif 0
// 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
// cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
constexpr index_t N = 128;
@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
using LeftPads = Sequence<3, 0>;
using RightPads = Sequence<3, 0>;
#elif 0
#elif 1
// 1x7 filter, 0x3 pad, 17x17 input
constexpr index_t N = 128;
constexpr index_t C = 128;
@@ -448,7 +448,7 @@ int main(int argc, char* argv[])
ConvStrides{},
ConvDilations{},
nrepeat);
#elif 0
#elif 1
device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc,
in_nchw,
wei_kcyx_desc,
......