Commit 0cb8ba92 authored by Chao Liu
Browse files

update reference conv

parent ba816e69
......@@ -26,7 +26,6 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
AccDataType, //
InElementOp, // Input Elementwise Operation
WeiElementOp, // Weights Elementwise Operation =
// ck::tensor_layout::convolution::NKHW,
OutElementOp, // Output Elementwise Operation
ConvFwdDefault, // ConvForwardSpecialization
NumDimSpatial, // NumDimSpatial
......@@ -57,10 +56,11 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device::DeviceConvNdFwdNwc
1>; // CThreadTransferDstScalarPerVector
template <ck::index_t NumDimSpatial>
using ReferenceConvNDFwdInstance = ck::tensor_operation::host::ReferenceConvFwd<NumDimSpatial,
InLayout,
WeiLayout,
OutLayout,
using ReferenceConvNDFwdInstance =
ck::tensor_operation::host::ReferenceConvFwd<NumDimSpatial,
ck::tensor_layout::convolution::NHWC,
ck::tensor_layout::convolution::KYXC,
ck::tensor_layout::convolution::NHWK,
InDataType,
WeiDataType,
OutDataType,
......
......@@ -260,16 +260,16 @@ int main(int argc, char* argv[])
e_ms_ns_lengths = {M0, M1, N0, N1};
e_ms_ns_strides = {
std::stoi(argv[22]), std::stoi(argv[23]), std::stoi(argv[24]), std::stoi(argv[25])};
std::stoi(argv[18]), std::stoi(argv[19]), std::stoi(argv[20]), std::stoi(argv[21])};
scale = std::stof(argv[26]);
scale = std::stof(argv[22]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
printf("arg4 to 7: M0, M1, N0, N1, K0, K1\n");
printf("arg4 to 9: M0, M1, N0, N1, K0, K1\n");
printf("arg10 to 13: Stride_A_M0, Stride_A_M1, Stride_A_K0, Stride_A_K1\n");
printf("arg14 to 17: Stride_B_N0, Stride_B_N1, Stride_B_K0, Stride_B_K1\n");
printf("arg18 to 21: Stride_E_M0, Stride_E_M1, Stride_E_N0, Stride_E_N1\n");
......
......@@ -87,65 +87,63 @@ struct ReferenceConvFwd : public device::BaseOperator
{
using Argument = ReferenceConvFwd::Argument;
// FIXME: properly implement "TensorView" for doing transpose or refer to dimension by name
float Run(const Argument& arg)
{
// tensor descriptor in NCHW/KXYC/NKHW dimensional order
HostTensorDescriptor in_desc = arg.input_.mDesc;
HostTensorDescriptor wei_desc = arg.weight_.mDesc;
HostTensorDescriptor oout_desc = arg.output_.mDesc;
HostTensorDescriptor out_desc = arg.output_.mDesc;
// input
if constexpr(is_same_v<InLayout,ck::tensor_layout::convolution::NWC>)
if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NWC>)
{
in_desc = transpose_host_tensor_descriptor_given_new2old(
input_.mDesc, std::vector<std::size_t>{0, 2, 1});
arg.input_.mDesc, std::vector<std::size_t>{0, 2, 1});
}
else if constexpr(is_same_v<InLayout,ck::tensor_layout::convolution::NHWC>)
else if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NHWC>)
{
in_desc = transpose_host_tensor_descriptor_given_new2old(
input_.mDesc, std::vector<std::size_t>{0, 3, 1, 2});
arg.input_.mDesc, std::vector<std::size_t>{0, 3, 1, 2});
}
else if constexpr(is_same_v<InLayout,ck::tensor_layout::convolution::NDHWC>)
else if constexpr(is_same_v<InLayout, ck::tensor_layout::convolution::NDHWC>)
{
in_desc = transpose_host_tensor_descriptor_given_new2old(
input_.mDesc, std::vector<std::size_t>{0, 4, 1, 2, 3});
arg.input_.mDesc, std::vector<std::size_t>{0, 4, 1, 2, 3});
}
// weight
if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC>)
{
wei_desc = transpose_host_tensor_descriptor_given_new2old(
weight_.mDesc, std::vector<std::size_t>{0, 2, 1});
arg.weight_.mDesc, std::vector<std::size_t>{0, 2, 1});
}
if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KXC>)
else if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KYXC>)
{
wei_desc = transpose_host_tensor_descriptor_given_new2old(
weight_.mDesc, std::vector<std::size_t>{0, 3, 1, 2});
arg.weight_.mDesc, std::vector<std::size_t>{0, 3, 1, 2});
}
else if constexpr(NumDimSpatial == 2 &&
WeiLayout == ck::tensor_layout::convolution::KYXC)
else if constexpr(is_same_v<WeiLayout, ck::tensor_layout::convolution::KZYXC>)
{
wei_desc = transpose_host_tensor_descriptor_given_new2old(
weight_.mDesc, std::vector<std::size_t>{0, 3, 1, 2});
arg.weight_.mDesc, std::vector<std::size_t>{0, 4, 1, 2, 3});
}
// output
if constexpr(NumDimSpatial == 1 && OutLayout == ck::tensor_layout::convolution::NWK)
if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NWK>)
{
out_desc = transpose_host_tensor_descriptor_given_new2old(
output_.mDesc, std::vector<std::size_t>{0, 2, 1});
arg.output_.mDesc, std::vector<std::size_t>{0, 2, 1});
}
else if constexpr(NumDimSpatial == 2 &&
OutLayout == ck::tensor_layout::convolution::NHWK)
else if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NHWK>)
{
out_desc = transpose_host_tensor_descriptor_given_new2old(
output_.mDesc, std::vector<std::size_t>{0, 3, 1, 2});
arg.output_.mDesc, std::vector<std::size_t>{0, 3, 1, 2});
}
else if constexpr(NumDimSpatial == 3 &&
OutLayout == ck::tensor_layout::convolution::NDHWK)
else if constexpr(is_same_v<OutLayout, ck::tensor_layout::convolution::NDHWK>)
{
out_desc = transpose_host_tensor_descriptor_given_new2old(
output_.mDesc, std::vector<std::size_t>{0, 4, 1, 2, 3});
arg.output_.mDesc, std::vector<std::size_t>{0, 4, 1, 2, 3});
}
if constexpr(NumDimSpatial == 1)
......@@ -161,16 +159,26 @@ struct ReferenceConvFwd : public device::BaseOperator
ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[0]) +
ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[0]) -
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
if(wi >= 0 &&
ck::type_convert<std::size_t>(wi) < in_desc.GetLengths()[2])
{
float v_in;
float v_wei;
arg.in_element_op_(v_in,
ck::type_convert<float>(arg.input_(n, c, wi)));
arg.wei_element_op_(v_wei,
ck::type_convert<float>(arg.weight_(k, c, x)));
// FIXME hacky
arg.in_element_op_(
v_in,
ck::type_convert<float>(
arg.input_
.mData[in_desc.GetOffsetFromMultiIndex(n, c, wi)]));
// FIXME hacky
arg.wei_element_op_(
v_wei,
ck::type_convert<float>(
arg.weight_
.mData[wei_desc.GetOffsetFromMultiIndex(k, c, x)]));
v_acc += v_in * v_wei;
}
......@@ -180,7 +188,10 @@ struct ReferenceConvFwd : public device::BaseOperator
float v_out;
arg.out_element_op_(v_out, v_acc);
arg.output_(n, k, wo) = ck::type_convert<OutDataType>(v_out);
// FIXME hacky
arg.output_.mData[out_desc.GetOffsetFromMultiIndex({n, k, wo})] =
ck::type_convert<OutDataType>(v_out);
};
make_ParallelTensorFunctor(f_ncw,
......@@ -204,12 +215,14 @@ struct ReferenceConvFwd : public device::BaseOperator
ck::type_convert<ck::long_index_t>(ho * arg.conv_strides_[0]) +
ck::type_convert<ck::long_index_t>(y * arg.conv_dilations_[0]) -
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[0]);
for(std::size_t x = 0; x < wei_desc.GetLengths()[3]; ++x)
{
auto wi =
ck::type_convert<ck::long_index_t>(wo * arg.conv_strides_[1]) +
ck::type_convert<ck::long_index_t>(x * arg.conv_dilations_[1]) -
ck::type_convert<ck::long_index_t>(arg.in_left_pads_[1]);
if(hi >= 0 &&
ck::type_convert<std::size_t>(hi) < in_desc.GetLengths()[2] &&
wi >= 0 &&
......@@ -218,10 +231,20 @@ struct ReferenceConvFwd : public device::BaseOperator
float v_in;
float v_wei;
// FIXME hacky
arg.in_element_op_(
v_in, ck::type_convert<float>(arg.input_(n, c, hi, wi)));
v_in,
ck::type_convert<float>(
arg.input_.mData[in_desc.GetOffsetFromMultiIndex(
n, c, hi, wi)]));
// FIXME hacky
arg.wei_element_op_(
v_wei, ck::type_convert<float>(arg.weight_(k, c, y, x)));
v_wei,
ck::type_convert<float>(
arg.weight_.mData[wei_desc.GetOffsetFromMultiIndex(
k, c, y, x)]));
v_acc += v_in * v_wei;
}
}
......@@ -231,7 +254,10 @@ struct ReferenceConvFwd : public device::BaseOperator
float v_out;
arg.out_element_op_(v_out, v_acc);
arg.output_(n, k, ho, wo) = ck::type_convert<OutDataType>(v_out);
// FIXME hacky
arg.output_.mData[out_desc.GetOffsetFromMultiIndex({n, k, ho, wo})] =
ck::type_convert<OutDataType>(v_out);
};
make_ParallelTensorFunctor(f_nchw,
......@@ -282,12 +308,20 @@ struct ReferenceConvFwd : public device::BaseOperator
float v_in;
float v_wei;
// FIXME hacky
arg.in_element_op_(
v_in,
ck::type_convert<float>(arg.input_(n, c, di, hi, wi)));
ck::type_convert<float>(
arg.input_.mData[in_desc.GetOffsetFromMultiIndex(
n, c, di, hi, wi)]));
// FIXME hacky
arg.wei_element_op_(
v_wei,
ck::type_convert<float>(arg.weight_(k, c, z, y, x)));
ck::type_convert<float>(
arg.weight_.mData[wei_desc.GetOffsetFromMultiIndex(
k, c, z, y, x)]));
v_acc += v_in * v_wei;
}
}
......@@ -298,7 +332,10 @@ struct ReferenceConvFwd : public device::BaseOperator
float v_out;
arg.out_element_op_(v_out, v_acc);
arg.output_(n, k, d_o, ho, wo) = ck::type_convert<OutDataType>(v_out);
// FIXME hacky
arg.output_.mData[out_desc.GetOffsetFromMultiIndex({n, k, d_o, ho, wo})] =
ck::type_convert<OutDataType>(v_out);
};
make_ParallelTensorFunctor(f_nchw,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment