bug fix

126cae0c · Chao Liu · e1a67b69 · 126cae0c · 126cae0c · 126cae0c
Commit 126cae0c authored Sep 17, 2019 by Chao Liu
6 changed files
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
@@ -185,6 +185,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
                                               InBlockCopyDstDataPerWrite_N2>(
                {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#if 0
        // weight tensor
        //     tensor descriptor in device memory, src of blockwise copy
        constexpr auto wei_e_k_global_desc =
@@ -192,6 +193,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
                                        make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
                                        make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+#else // hack
+        constexpr auto wei_e_k_global_desc_old =
+            WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
+        constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor(
+            wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides());
+#endif
        //     tensor descriptor in LDS, dst of blockwise copy
        //     be careful of LDS alignment

--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -184,23 +184,27 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                                               InBlockCopyDstDataPerWrite_N2>(
                {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0});
+#if 0
        // weight tensor
        //     tensor descriptor in device memory, src of blockwise copy
        constexpr auto wei_e_k_global_desc =
-#if 0
            transform_tensor_descriptor(wei_k_c_y_x_global_desc,
                                        make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
                                        make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
 #else // hack
-            make_native_tensor_descriptor_packed(Sequence<K, C * Y * X>{});
+        constexpr auto wei_e_k_global_desc_old =
+            WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{});
+        constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor(
+            wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides());
 #endif
-            //     tensor descriptor in LDS, dst of blockwise copy
+        //     tensor descriptor in LDS, dst of blockwise copy
-            //     be careful of LDS alignment
+        //     be careful of LDS alignment
-            constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned(
+        constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned(
-                Sequence<EPerBlock, KPerBlock>{},
+            Sequence<EPerBlock, KPerBlock>{},
-                Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
+            Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
        // operator for blockwise copy of weight into LDS
        //     slice a tensor, and copy it into another tensor

--- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -313,14 +313,14 @@ struct TensorCoordinate
    private:
    template <class... Ts>
    __host__ __device__ static constexpr auto
-        MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
    {
        return NormalTensorCoordinate<ConstantTensorDescriptor<Ts...>>();
    }
    template <class... Ts>
    __host__ __device__ static constexpr auto
-        MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
    {
        return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
    }

--- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
@@ -188,7 +188,7 @@ struct TensorCoordinate_v2
    private:
    template <typename... Ts>
    __host__ __device__ static constexpr auto
-        MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
    {
        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
@@ -196,7 +196,7 @@ struct TensorCoordinate_v2
    template <typename... Ts>
    __host__ __device__ static constexpr auto
-        MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
+    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
    {
        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());

--- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
@@ -85,12 +85,6 @@ struct NativeTensorDescriptor
        return offset;
    }
-    // TODO: remove this
-    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const Index& idx)
-    {
-        return CalculateOffset(idx);
-    }
    __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
    {
        index_t offset_diff = 0;
@@ -227,13 +221,6 @@ struct TransformedTensorDescriptor
        return LowTensorDescriptor{};
    }
-#if 0
-    __host__ __device__ static constexpr auto GetLowerLengths()
-    {
-        return GetLowerTensorDescriptor().GetLengths();
-    }
-#endif
    struct lambda_GetUpperLengths
    {
        template <typename Transform>
@@ -359,12 +346,6 @@ struct TransformedTensorDescriptor
        return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up));
    }
-    // TODO: remove this
-    __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const UpperIndex& idx_up)
-    {
-        return CalculateOffset(idx_up);
-    }
 #if 0
    template <index_t IDim>
    __host__ __device__ static constexpr bool IsLinearDimension(Number<IDim>)

--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -49,7 +49,7 @@ struct GeneratorTensor_3
    {
        std::array<index_t, sizeof...(Is)> dims = {{static_cast<index_t>(is)...}};
-        auto f_acc = [](auto a, auto b) { return 100 * a + b; };
+        auto f_acc = [](auto a, auto b) { return 10 * a + b; };
        return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc);
    }
@@ -75,19 +75,19 @@ int main(int argc, char* argv[])
    using namespace ck;
 #if 0
-    constexpr index_t N  = 256;
+    constexpr index_t N  = 8;
-    constexpr index_t C  = 64;
+    constexpr index_t C  = 8;
-    constexpr index_t HI = 17;
+    constexpr index_t HI = 2;
-    constexpr index_t WI = 17;
+    constexpr index_t WI = 8;
-    constexpr index_t K  = 256;
+    constexpr index_t K  = 128;
-    constexpr index_t Y  = 17;
+    constexpr index_t Y  = 1;
-    constexpr index_t X  = 17;
+    constexpr index_t X  = 1;
    using ConvStrides   = Sequence<1, 1>;
    using ConvDilations = Sequence<1, 1>;
-    using LeftPads  = Sequence<0, 3>;
+    using LeftPads  = Sequence<0, 0>;
-    using RightPads = Sequence<0, 3>;
+    using RightPads = Sequence<0, 0>;
 #elif 0
    // 3x3, 34x34
    constexpr index_t N  = 64;
@@ -347,7 +347,7 @@ int main(int argc, char* argv[])
        wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
 #elif 0
        in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread);
-        wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread);
+        wei_kcyx.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
 #elif 0
        in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread);
        wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);