"driver/conv_bwd_data_driver.cpp" did not exist on "c5da0377fbf4c2ab92a9d62f711bbc832481a92d"
Commit 5636576f authored by Chao Liu

bug fix in ford, forgot to reorder lengths

parent 9d99a580
@@ -470,7 +470,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_lds_double_buffer
                 7,
                 1,
                 1>(make_zero_array<index_t, 8>(), make_zero_array<index_t, 8>())
-                .Run_non_static(p_out_thread, p_out_thread_on_global);
+                .Run(p_out_thread, p_out_thread_on_global);
 #elif 0
             ThreadwiseGenericTensorSliceCopy_v2<
                 decltype(out_n0_n1_n2_k0_k1_k2_h_w_thread_desc),
...
@@ -276,7 +276,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                     SrcDataPerAccess,
                     1>(make_zero_array<index_t, nDim>(),
                        make_zero_array<index_t, nDim>())
-                    .Run_non_static(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
+                    .Run(p_src + src_offset + mThreadSrcOffset, p_buffer + buffer_offset);
 #endif
             });
     }
@@ -318,10 +318,11 @@ struct BlockwiseGenericTensorSliceCopy_v1
         // Position the origin of the per-thread window at the point where the multi-index
         // of the SrcDesc (which might be a merged tensor) is all-zero. This threadwise slice
         // copy assumes each thread copies a normal (not merged) tensor.
-        // User need to guarantee this is true.
-        // By setting SubLengths = 1 at the merged dimension, this is always true;
-        // If in the future, you want to enable SubLengths > 1 at the merged dimension,
-        // special care in implementation is needed
+        // To satisfy this assumption, the user needs to make sure that, on a merged
+        // dimension that contains multiple original dimensions, the length of the last
+        // original dimension is evenly divisible by its sub-length. Also, the
+        // repeat-length on the merged dimension needs to be 1. These sanity checks are
+        // performed in the constructor of BlockwiseGenericTensorSliceCopy_v1.
 #if 0
         threadwise_generic_tensor_slice_copy_v1(thread_buffer_desc,
                                                 p_buffer + buffer_offset,
@@ -354,7 +355,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
                     1,
                     DstDataPerAccess>(make_zero_array<index_t, nDim>(),
                                       make_zero_array<index_t, nDim>())
-                    .Run_non_static(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
+                    .Run(p_buffer + buffer_offset, p_dst + dst_offset + mThreadDstOffset);
 #endif
             });
     }
...
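The rewritten comment in the hunk above encodes a concrete divisibility rule. A tiny self-contained illustration with made-up lengths (E0, E1, and SubLength are assumptions, not values from the diff):

// Toy numbers illustrating the constraint from the comment above: a merged
// dimension that flattens original dimensions of lengths {E0 = 4, E1 = 8}
// has merged length 32, and each thread copies SubLength elements of it.
constexpr int E0        = 4; // first original dimension
constexpr int E1        = 8; // LAST original dimension
constexpr int SubLength = 2; // per-thread slice length on the merged dim

// When E1 % SubLength == 0, a thread's window never straddles an E1
// boundary, so the thread really does copy a normal (non-merged) tensor.
// SubLength = 3 would straddle boundaries and break the assumption.
static_assert(E1 % SubLength == 0, "last original dim must be divisible by sub-length");

// Second condition stated in the comment: the repeat-length on the merged
// dimension must be 1 (only one window per thread along that dimension).
constexpr int RepeatLengthOnMergedDim = 1;
static_assert(RepeatLengthOnMergedDim == 1, "repeat-length on merged dim must be 1");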
@@ -10,6 +10,10 @@
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
 #endif
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
+#endif
 namespace ck {
 // user needs to make sure the alignment requirement is satisfied when setting DataPerAccess > 1
@@ -369,8 +373,10 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
         constexpr auto long_vector_access_lengths = SliceLengths::Modify(
             vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
+#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
         static_ford<decltype(long_vector_access_lengths), DimAccessOrder>{}([&](
             auto long_vector_access_id) {
             // data id w.r.t. slicing-window
             constexpr auto long_vector_data_begin_id = long_vector_access_id.Modify(
                 vector_access_dim, long_vector_access_id[vector_access_dim] * long_vector_size);
@@ -406,26 +412,10 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
                     *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
             });
         });
-    }
-
-    template <class TData>
-    __device__ void Run_non_static(const TData* p_src, TData* p_dst) const
-    {
-        using src_vector_t = typename vector_type<TData, SrcDataPerAccess>::MemoryType;
-        using dst_vector_t = typename vector_type<TData, DstDataPerAccess>::MemoryType;
-
-        constexpr auto vector_access_dim = Number<VectorAccessDim>{};
-
-        constexpr auto src_data_per_access = Number<SrcDataPerAccess>{};
-        constexpr auto dst_data_per_access = Number<DstDataPerAccess>{};
-
-        constexpr auto long_vector_size = Number<math::lcm(SrcDataPerAccess, DstDataPerAccess)>{};
-
-        constexpr auto long_vector_access_lengths = SliceLengths::Modify(
-            vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size);
+#else
         ford<decltype(long_vector_access_lengths), DimAccessOrder>{}(
             [&](auto long_vector_access_id) {
                 // data id w.r.t. slicing-window
                 auto long_vector_data_begin_id = long_vector_access_id;
                 long_vector_data_begin_id(vector_access_dim) =
@@ -464,6 +454,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2
                         *reinterpret_cast<dst_vector_t*>(&p_long_vector[buffer_offset]);
                 }
             });
+#endif
     }

     private:
...
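The hunks above fold the old Run_non_static body into Run behind the new config macro, so a single entry point chooses between the compile-time static_ford walk and the runtime ford walk. A minimal sketch of that dispatch pattern, assuming C++17 (only the macro name comes from the diff; ToyCopy and its loop bodies are made up):

#ifndef CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
#endif

// Toy stand-in for ThreadwiseGenericTensorSliceCopy_v1r2: one Run() whose
// loop strategy is fixed at compile time by the config macro.
template <int N>
struct ToyCopy
{
#if CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2
    // "more compile-static" path: template recursion unrolls the copy so
    // every index is a compile-time constant (the role static_ford plays).
    template <int I = 0>
    void Run(const float* src, float* dst) const
    {
        if constexpr(I < N)
        {
            dst[I] = src[I];
            Run<I + 1>(src, dst);
        }
    }
#else
    // runtime path: an ordinary loop (the role ford plays).
    void Run(const float* src, float* dst) const
    {
        for(int i = 0; i < N; ++i)
            dst[i] = src[i];
    }
#endif
};

Calling ToyCopy<8>{}.Run(src, dst) copies eight floats either way; only the generated code differs.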
@@ -8,6 +8,7 @@
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 namespace ck {
...
@@ -10,6 +10,7 @@
 #define CK_USE_AMD_INLINE_ASM 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
+#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
 namespace ck {
...
@@ -135,9 +135,11 @@ struct ford
     template <class F>
     __host__ __device__ constexpr void operator()(F f) const
     {
-        for(index_t i = 0; i < Lengths::Front(); ++i)
+        constexpr auto ordered_lengths = Lengths::ReorderGivenNew2Old(Orders{});
+
+        for(index_t i = 0; i < ordered_lengths.Front(); ++i)
         {
-            ford_impl<decltype(Lengths::PopFront()), Orders>{}(f, Array<index_t, 1>{i});
+            ford_impl<decltype(ordered_lengths.PopFront()), Orders>{}(f, Array<index_t, 1>{i});
         }
     }
 };
...
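The hunk above is the fix named in the commit message: ford iterates the index space in the order given by Orders, so the loop bounds must come from the reordered lengths rather than from Lengths directly. A standalone sketch of the same idea (ford_2d is a made-up 2-D stand-in, not the library's ford):

#include <array>
#include <cstdio>

// Made-up 2-D stand-in for ck::ford<Lengths, Orders>: visit every index of a
// 2-D space, looping dim orders[0] outermost and dim orders[1] innermost.
template <class F>
void ford_2d(std::array<int, 2> lengths, std::array<int, 2> orders, F f)
{
    // The fix above: loop bounds must be the REORDERED lengths. Bounding the
    // outer loop by lengths[0] (the old code's Lengths::Front()) runs it over
    // the wrong extent whenever orders != {0, 1}.
    const std::array<int, 2> ordered_lengths{lengths[orders[0]], lengths[orders[1]]};

    for(int i = 0; i < ordered_lengths[0]; ++i)
        for(int j = 0; j < ordered_lengths[1]; ++j)
        {
            std::array<int, 2> id{}; // multi-index in the ORIGINAL dim order
            id[orders[0]] = i;
            id[orders[1]] = j;
            f(id);
        }
}

int main()
{
    // lengths {2, 3}, dim 1 outermost: with the bug, the outer loop would run
    // 2 times instead of 3 and part of the index space would be skipped.
    ford_2d({2, 3}, {1, 0}, [](std::array<int, 2> id) {
        std::printf("(%d, %d)\n", id[0], id[1]);
    });
}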
@@ -59,7 +59,7 @@ void device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw(InDesc,
     constexpr index_t B = (N * Ho * Wo) / (N1 * N2);
-#if 0
+#if 1
     // each thread holds 64 data
     constexpr index_t BlockSize = 256;
...
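The block enabled by the #if 1 switch sizes the implicit-GEMM grid; B is the batch-and-output-pixel extent after splitting off the N1 and N2 factors. A worked evaluation with hypothetical sizes (none of these values appear in the diff):

#include <cstdio>

int main()
{
    // Hypothetical convolution sizes, chosen only to make the formula concrete.
    constexpr int N = 128, Ho = 28, Wo = 28; // batch size and output H x W
    constexpr int N1 = 2, N2 = 4;            // assumed batch-split factors
    constexpr int B = (N * Ho * Wo) / (N1 * N2);
    std::printf("B = %d\n", B); // 128 * 28 * 28 / 8 = 12544
}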