Commit ebe38f3d authored by Chao Liu

debugging

parent 9b280cc5
@@ -11,13 +11,6 @@ struct NativeDimension
     __host__ __device__ static constexpr auto GetLength() { return Number<Length>{}; }
     __host__ __device__ static constexpr auto GetStride() { return Number<Stride>{}; }
-
-    __host__ __device__ static constexpr index_t CalculateOffset(index_t i) { return i * Stride; }
-
-    __host__ __device__ static constexpr index_t CalculateOffsetDiff(index_t i_diff)
-    {
-        return i_diff * Stride;
-    }
 };
 } // namespace ck
...
@@ -193,7 +193,7 @@ struct TensorCoordinate
     private:
     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
     {
         return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
@@ -201,7 +201,7 @@ struct TensorCoordinate
     template <typename... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
     {
         return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
             make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
...
@@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated
     private:
     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
     {
         return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
     }

     template <class... Ts>
     __host__ __device__ static constexpr auto
     MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
     {
         return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
     }
...
@@ -75,10 +75,9 @@ struct BlockwiseGenericTensorSliceCopy_v4
     {
 #if 0
         mThreadwiseLoad.template Run<BlockSrcData,
                                      ThreadBufferData,
                                      BlockSrcAddressSpace,
-                                     ThreadBufferAddressSpace>(p_block_src,
-                                                               p_thread_buffer);
+                                     ThreadBufferAddressSpace>(p_block_src, p_thread_buffer);
 #else // tweaking
         mThreadwiseLoad.template Run_optimized_src_address_calculation<BlockSrcData,
                                                                        ThreadBufferData,
...
@@ -483,8 +483,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
               address_space_t ThreadBufferAddressSpace = address_space_t::generic>
     __device__ void RunLoadThreadBuffer(const TData* p_block_src, TData* p_thread_buffer) const
     {
-        mThreadwiseLoad.Run<TData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(p_block_src,
-                                                                                   p_thread_buffer);
+        mThreadwiseLoad.template Run<TData, BlockSrcAddressSpace, ThreadBufferAddressSpace>(
+            p_block_src, p_thread_buffer);
     }

     template <typename TData,
@@ -492,8 +492,8 @@ struct BlockwiseGenericTensorSliceCopy_v2
               address_space_t BlockDstAddressSpace = address_space_t::generic>
     __device__ void RunStoreThreadBuffer(const TData* p_thread_buffer, TData* p_block_dst) const
     {
-        mThreadwiseStore.Run<TData, ThreadBufferAddressSpace, BlockDstAddressSpace>(p_thread_buffer,
-                                                                                    p_block_dst);
+        mThreadwiseStore.template Run<TData, ThreadBufferAddressSpace, BlockDstAddressSpace>(
+            p_thread_buffer, p_block_dst);
     }

     template <typename TData,
...
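Note on the two hunks above: adding the `template` keyword before `Run<...>` is required by C++, not a style choice. Inside `BlockwiseGenericTensorSliceCopy_v2`, `mThreadwiseLoad` and `mThreadwiseStore` have types that depend on the class template parameters, so while parsing the compiler cannot know that `Run` names a member template; without `.template`, the `<` is read as a less-than operator and the call fails to compile. A minimal self-contained sketch of the rule (names here are illustrative, not from this repo):

struct Copier
{
    template <typename T>
    void Run(const T* src, T* dst) const { *dst = *src; }
};

template <typename CopierType>
struct Wrapper
{
    CopierType mCopier;

    template <typename T>
    void Go(const T* src, T* dst) const
    {
        // CopierType is a dependent type, so ".template" is mandatory here;
        // "mCopier.Run<T>(src, dst)" would be parsed as comparisons and rejected.
        mCopier.template Run<T>(src, dst);
    }
};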
@@ -130,7 +130,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
 #if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
                 *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                     __buffer_load<SrcData, SrcDataPerAccess>(
-                        p_src, 0, src_coord.GetOffset());
+                        p_src, src_coord.GetOffset(), 0);
 #else
                 *reinterpret_cast<src_vector_t*>(&p_src_long_vector[buffer_offset]) =
                     *reinterpret_cast<const src_vector_t*>(&p_src[src_coord.GetOffset()]);
@@ -172,8 +172,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                 __buffer_store<DstData, DstDataPerAccess>(
                     *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]),
                     p_dst,
-                    0,
-                    dst_coord.GetOffset());
+                    dst_coord.GetOffset(),
+                    0);
 #else
                 *reinterpret_cast<dst_vector_t*>(&p_dst[dst_coord.GetOffset()]) =
                     *reinterpret_cast<dst_vector_t*>(&p_dst_long_vector[buffer_offset]);
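The two hunks above swap the last two arguments of `__buffer_load` / `__buffer_store`: the runtime `GetOffset()` value moves into the other offset slot and the vacated slot becomes 0. The precise semantics of the two slots live in the intrinsic wrappers, which this diff does not show; a hedged reading, assuming the two trailing operands are simply summed into the effective address as in typical buffer addressing, is sketched below (this is not the real intrinsic):

// Sketch only. If both offsets just add into the address, the swap preserves
// the element read but changes which operand the compiler may treat as a
// foldable constant versus a per-thread runtime value.
template <typename T>
T buffer_load_sketch(const T* p_base, int thread_offset, int const_offset)
{
    // effective address = base + thread_offset + const_offset, so
    // (p, off, 0) and (p, 0, off) load the same element
    return p_base[thread_offset + const_offset];
}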
@@ -287,10 +287,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     const auto src_coord =
                         src_nonlinear_coord + (linear_dim_data_steps + scalar_id);

                     // this is src compile-time offset
+#if 0
                     // TODO: is this good implementation?
                     const index_t src_linear_offset =
                         src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
+#else
+                    const index_t src_linear_offset =
+                        SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
+#endif

                     // Check src vector's padding situation, only check the first data in
                     // this src vector. It's user's responsiblity to make sure all data in
@@ -471,10 +476,15 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                     const auto dst_coord =
                         dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);

                     // this is dst compile-time offset
+#if 1
                     // TODO: is this good implementation?
                     const index_t dst_linear_offset =
                         dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
+#else
+                    const index_t dst_linear_offset =
+                        DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id);
+#endif

                     // Check dst vector's padding situation, only check the first data in
                     // this dst vector. It's user's responsiblity to make sure all data in
...
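In both hunks above the two branches should yield the same value whenever the index-to-offset map is linear in the stepped dimensions: the offset of `coord + step` minus the offset of `coord` equals the offset of `step` alone. The commit keeps one branch live on the src side (`#if 0` selects the new form) and the other on the dst side (`#if 1` keeps the old form), which matches the "debugging" commit message. A scalar sketch of the identity (the `Desc` below is a stand-in with one compile-time stride, not the repo's descriptor):

#include <cstdint>
using index_t = std::int32_t;

struct Desc
{
    static constexpr index_t stride = 4;
    static constexpr index_t CalculateOffset(index_t i) { return i * stride; }
};

// "difference of absolute offsets" style, as in the GetOffset() branches
constexpr index_t offset_by_difference(index_t base, index_t step)
{
    return Desc::CalculateOffset(base + step) - Desc::CalculateOffset(base);
}

// "offset of the step itself" style, as in the CalculateOffset branches;
// equal to the above exactly because the map is linear in i
constexpr index_t offset_direct(index_t step) { return Desc::CalculateOffset(step); }

static_assert(offset_by_difference(10, 3) == offset_direct(3), "linear map");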
@@ -6,7 +6,7 @@
 #define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_AMD 1
-#define CK_USE_AMD_INTRINSIC 1
+#define CK_USE_AMD_INTRINSIC 0
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
...
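Flipping `CK_USE_AMD_INTRINSIC` to 0 compiles out every path guarded by it, including the `__buffer_load` / `__buffer_store` branches shown earlier, which are gated on both flags:

#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE
    // buffer intrinsic path: now dead code, since CK_USE_AMD_INTRINSIC == 0
#else
    // plain reinterpret_cast load/store path: now the one that is compiled
#endif

So even though `CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE` stays 1, the fallback pointer path is what runs, consistent with a debugging commit that wants to rule the intrinsics in or out.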
@@ -295,7 +295,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<0, 0>;
     using RightPads = Sequence<0, 0>;
-#elif 1
+#elif 0
     // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output
     // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81%
     constexpr index_t N = 128;
@@ -341,7 +341,7 @@ int main(int argc, char* argv[])
     using LeftPads = Sequence<3, 0>;
     using RightPads = Sequence<3, 0>;
-#elif 0
+#elif 1
     // 1x7 filter, 0x3 pad, 17x17 input
     constexpr index_t N = 128;
     constexpr index_t C = 128;
@@ -448,7 +448,7 @@ int main(int argc, char* argv[])
                                                         ConvStrides{},
                                                         ConvDilations{},
                                                         nrepeat);
-#elif 0
+#elif 1
     device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded(in_nchw_desc,
                                                                 in_nchw,
                                                                 wei_kcyx_desc,
...
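The three driver flips above deselect the 3x3 filter / 2x2 stride case, select the 1x7 filter / 0x3 pad case, and switch the dispatch to `device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded`. Since exactly one branch of a `#if`/`#elif` chain survives preprocessing, toggling the 0/1 literals is how this driver picks the benchmark problem; a stripped-down sketch of the pattern (values illustrative only):

using index_t = int; // stand-in for ck::index_t

#if 0
// 3x3 filter case: deselected by this commit
constexpr index_t Y = 3, X = 3;
#elif 1
// 1x7 filter case: the branch now active
constexpr index_t Y = 1, X = 7;
#endif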