Commit ebe38f3d authored by Chao Liu

debugging

parent 9b280cc5
@@ -11,13 +11,6 @@ struct NativeDimension
     __host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }
     __host__ __device__ static constexpr auto GetStride() { return Number<Stride>{}; }
-
-    __host__ __device__ static constexpr index_t CalculateOffset(index_t i) { return i * Stride; }
-
-    __host__ __device__ static constexpr index_t CalculateOffsetDiff(index_t i_diff)
-    {
-        return i_diff * Stride;
-    }
 };
 } // namespace ck
...
@@ -193,7 +193,7 @@ struct TensorCoordinate
     private:
     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
     {
         return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
@@ -201,7 +201,7 @@ struct TensorCoordinate
     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
     {
         return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
...
@@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated
     private:
     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
     {
         return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
     }

     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
     {
         return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
     }
...
@@ -75,10 +75,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
     {
 #if 0
         mThreadwiseLoad.template Run<BlockSrcData,
                                      ThreadBufferData,
                                      BlockSrcAddressSpace,
-                                     ThreadBufferAddressSpace>(p_block_src,
-                                                               p_thread_buffer);
+                                     ThreadBufferAddressSpace>(p_block_src, p_thread_buffer);
 #else // tweaking
         mThreadwiseLoad.template Run_optimized_src_address_calculation<BlockSrcData,
                                                                        ThreadBufferData,
...
@@ -483,8 +483,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
               address_space_t ThreadBufferAddressSpace = address_space_t::generic>
     __device__ void RunLoadThreadBuffer(const TData* p_block_src, TData* p_thread_buffer) const
     {
-        mThreadwiseLoad.Run<TData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(p_block_src,
-                                                                                   p_thread_buffer);
+        mThreadwiseLoad.template Run<TData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(
+            p_block_src, p_thread_buffer);
     }

     template <typename TData,
@@ -492,8 +492,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
               address_space_t BlockDstAddressSpace = address_space_t::generic>
     __device__ void RunStoreThreadBuffer(const TData* p_thread_buffer, TData* p_block_dst) const
     {
-        mThreadwiseStore.Run<TData, ThreadBufferAddressSpace, BlockDstAddressSpace>(p_thread_buffer,
-                                                                                    p_block_dst);
+        mThreadwiseStore.template Run<TData, ThreadBufferAddressSpace, BlockDstAddressSpace>(
+            p_thread_buffer, p_block_dst);
     }

     template <typename TData,
...
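Note on the two hunks above: adding the `template` keyword before `Run<...>` is required by C++, not a style choice. Inside `BlockwiseGenericTensorSliceCopy_v2`, `mThreadwiseLoad` and `mThreadwiseStore` have types that depend on the class template parameters, so while parsing the compiler cannot know that `Run` names a member template; without `.template`, the `<` is read as a less-than operator and the call fails to compile. A minimal self-contained sketch of the rule (names here are illustrative, not from this repo):

struct Copier
{
    template <typename T>
    void Run(const T* src, T* dst) const { *dst = *src; }
};

template <typename CopierType>
struct Wrapper
{
    CopierType mCopier;

    template <typename T>
    void Go(const T* src, T* dst) const
    {
        // CopierType is a dependent type, so ".template" is mandatory here;
        // "mCopier.Run<T>(src, dst)" would be parsed as comparisons and rejected.
        mCopier.template Run<T>(src, dst);
    }
};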
@@ -130,7 +130,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                 *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                     __buffer_load<SrcData, SrcDataPerAccess>(
-                        p_src, 0, src_coord.GetOffset());
+                        p_src, src_coord.GetOffset(), 0);
 #else
                 *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                     *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
@@ -172,8 +172,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 __buffer_store<DstData, DstDataPerAccess>(
                     *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                     p_dst,
-                    0,
-                    dst_coord.GetOffset());
+                    dst_coord.GetOffset(),
+                    0);
 #else
                 *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
                     *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
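The two hunks above swap the last two arguments of `__buffer_load` / `__buffer_store`: the runtime `GetOffset()` value moves into the other offset slot and the vacated slot becomes 0. The precise semantics of the two slots live in the intrinsic wrappers, which this diff does not show; a hedged reading, assuming the two trailing operands are simply summed into the effective address as in typical buffer addressing, is sketched below (this is not the real intrinsic):

// Sketch only. If both offsets just add into the address, the swap preserves
// the element read but changes which operand the compiler may treat as a
// foldable constant versus a per-thread runtime value.
template <typename T>
T buffer_load_sketch(const T* p_base, int thread_offset, int const_offset)
{
    // effective address = base + thread_offset + const_offset, so
    // (p, off, 0) and (p, 0, off) load the same element
    return p_base[thread_offset + const_offset];
}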
@@ -287,10 +287,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     const auto src_coord =
                         src_nonlinear_coord + (linear_dim_data_steps + scalar_id);

                     // this is src compile-time offset
+#if 0
                     // TODO: is this good implementation?
                     const index_t src_linear_offset =
                         src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
+#else
+                    const index_t src_linear_offset =
+                        SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
+#endif

                     // Check src vector's padding situation, only check the first data in
                     // this src vector. It's user's responsiblity to make sure all data in
@@ -471,10 +476,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     const auto dst_coord =
                         dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);

                     // this is dst compile-time offset
+#if 1
                     // TODO: is this good implementation?
                     const index_t dst_linear_offset =
                         dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
+#else
+                    const index_t dst_linear_offset =
+                        DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
+#endif

                     // Check dst vector's padding situation, only check the first data in
                     // this dst vector. It's user's responsiblity to make sure all data in
...
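In both hunks above the two branches should yield the same value whenever the index-to-offset map is linear in the stepped dimensions: the offset of `coord + step` minus the offset of `coord` equals the offset of `step` alone. The commit keeps one branch live on the src side (`#if 0` selects the new form) and the other on the dst side (`#if 1` keeps the old form), which matches the "debugging" commit message. A scalar sketch of the identity (the `Desc` below is a stand-in with one compile-time stride, not the repo's descriptor):

#include <cstdint>
using index_t = std::int32_t;

struct Desc
{
    static constexpr index_t stride = 4;
    static constexpr index_t CalculateOffset(index_t i) { return i * stride; }
};

// "difference of absolute offsets" style, as in the GetOffset() branches
constexpr index_t offset_by_difference(index_t base, index_t step)
{
    return Desc::CalculateOffset(base + step) - Desc::CalculateOffset(base);
}

// "offset of the step itself" style, as in the CalculateOffset branches;
// equal to the above exactly because the map is linear in i
constexpr index_t offset_direct(index_t step) { return Desc::CalculateOffset(step); }

static_assert(offset_by_difference(10, 3) == offset_direct(3), "linear map");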
@@ -6,7 +6,7 @@
 #define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_AMD 1
-#define CK_USE_AMD_INTRINSIC 1
+#define CK_USE_AMD_INTRINSIC 0
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
...
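Flipping `CK_USE_AMD_INTRINSIC` to 0 compiles out every path guarded by it, including the `__buffer_load` / `__buffer_store` branches shown earlier, which are gated on both flags:

#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
    // buffer intrinsic path: now dead code, since CK_USE_AMD_INTRINSIC == 0
#else
    // plain reinterpret_cast load/store path: now the one that is compiled
#endif

So even though `CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE` stays 1, the fallback pointer path is what runs, consistent with a debugging commit that wants to rule the intrinsics in or out.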
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;
@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<3, 0>;
     using RightPads = Sequence<3, 0>;
-#elif 0
+#elif 1
     // 1x7 filter, 0x3 pad, 17x17 input
     constexpr index_t N = 128;
     constexpr index_t C = 128;
@@ -448,7 +448,7 @@ int main(int argc, char* argv[])
                                                         ConvStrides{},
                                                         ConvDilations{},
                                                         nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,
...
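The three driver flips above deselect the 3x3 filter / 2x2 stride case, select the 1x7 filter / 0x3 pad case, and switch the dispatch to `device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded`. Since exactly one branch of a `#if`/`#elif` chain survives preprocessing, toggling the 0/1 literals is how this driver picks the benchmark problem; a stripped-down sketch of the pattern (values illustrative only):

using index_t = int; // stand-in for ck::index_t

#if 0
// 3x3 filter case: deselected by this commit
constexpr index_t Y = 3, X = 3;
#elif 1
// 1x7 filter case: the branch now active
constexpr index_t Y = 1, X = 7;
#endif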