Commit 5771a040 authored by carlushuang

fix a bug in general index calculation

parent 5e6cca6f
#include <stdlib.h>
#include "convolution_forward_specialization_cpu.hpp"
#include "config.hpp"
#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"
#include "device_operation_instance.hpp"

namespace ck {
namespace tensor_operation {
namespace cpu {
namespace device {
namespace device_conv2d_fwd_avx2_instance {

using InType  = float;
using WeiType = float;
using OutType = float;
using AccType = float;

using InLayout  = ck::tensor_layout::gemm::RowMajor;    // NHWC
using WeiLayout = ck::tensor_layout::gemm::ColumnMajor; // KYXC

static constexpr bool NonTemporalStore = false;

using PT = ck::tensor_operation::cpu::element_wise::PassThrough;

using ThreadwiseGemmAvx2_MxN_4x24_Dispatch =
    ck::cpu::ThreadwiseGemmAvx2_MxN_4x24_Dispatch<InType,
                                                  WeiType,
                                                  OutType,
                                                  InLayout,
                                                  WeiLayout,
                                                  NonTemporalStore>;

static constexpr auto ConvFwdDefault =
    ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Default;

static constexpr auto ConvFwd1x1P0 =
    ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Pad0;

static constexpr auto ConvFwd1x1S1P0 =
    ck::tensor_operation::cpu::device::ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0;

static constexpr auto DefaultGemmKLoop =
    ck::tensor_operation::cpu::device::ConvolutionForwardGemmKSpecialization_t::DefaultGemmKLoop;

static constexpr auto GemmKLoopOverC =
    ck::tensor_operation::cpu::device::ConvolutionForwardGemmKSpecialization_t::NHWC_GemmKLoopOverC;

static constexpr auto LoopOver_MNK = ck::tensor_operation::cpu::device::LoopOver_MNK;
static constexpr auto LoopOver_MKN = ck::tensor_operation::cpu::device::LoopOver_MKN;

// clang-format off
#define DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(a_elem_op, b_elem_op, c_elem_op, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf) \
    DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float, float, float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, GemmKLoopOverC  , LoopOver_MNK, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>, \
    DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float, float, float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1P0  , GemmKLoopOverC  , LoopOver_MNK, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>, \
    DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float, float, float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, DefaultGemmKLoop, LoopOver_MNK, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>, \
    DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float, float, float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1P0  , DefaultGemmKLoop, LoopOver_MNK, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>, \
    DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float, float, float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, GemmKLoopOverC  , LoopOver_MKN, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>, \
    DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float, float, float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1P0  , GemmKLoopOverC  , LoopOver_MKN, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>, \
    DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float, float, float, a_elem_op, b_elem_op, c_elem_op, ConvFwdDefault, DefaultGemmKLoop, LoopOver_MKN, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>, \
    DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float, float, float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1P0  , DefaultGemmKLoop, LoopOver_MKN, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>
// clang-format on

using device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances = std::tuple<
    // clang-format off
    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 256, 120,  64, 4, 24, true, true, false),
    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 144, 128, 4, 24, true, true, false),
    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 512, 240, 128, 4, 24, true, true, false),
    // DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 768, 192, 128, 4, 24, true, true, false),
    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT, 768, 288, 128, 4, 24, true, true, false)>;
    // clang-format on

void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances)
{
    ck::tensor_operation::device::add_device_operation_instances(
        instances, device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances{});
}

} // namespace device_conv2d_fwd_avx2_instance
} // namespace device
} // namespace cpu
} // namespace tensor_operation
} // namespace ck
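
For context, a minimal driver sketch (not part of this commit) of how the instance list above is typically consumed: the free function fills a vector of DeviceConvFwdPtr handles, one per specialization/loop-order/tile combination expanded by the macro, and each handle can describe itself via GetTypeString() (used later in this diff). Header names and the exact namespace of DeviceConvFwdPtr are assumptions; only the function signature and GetTypeString() are taken from the diff itself.

// Hypothetical driver; headers and namespace qualification are assumptions.
#include <iostream>
#include <vector>

#include "device_convnd_fwd_avx2_nhwc_kyxc_nhwk.hpp"
#include "element_wise_operation_cpu.hpp"

using PT = ck::tensor_operation::cpu::element_wise::PassThrough;

namespace ck { namespace tensor_operation { namespace cpu { namespace device {
namespace device_conv2d_fwd_avx2_instance {
// mirrors the definition in the translation unit above
void add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(std::vector<DeviceConvFwdPtr<PT, PT, PT>>& instances);
} // namespace device_conv2d_fwd_avx2_instance
}}}} // namespace ck::tensor_operation::cpu::device

int main()
{
    using namespace ck::tensor_operation::cpu::device;

    std::vector<DeviceConvFwdPtr<PT, PT, PT>> instances;
    device_conv2d_fwd_avx2_instance::add_device_conv2d_fwd_avx2_nhwc_kyxc_nhwk(instances);

    // One entry per (specialization, GemmK loop, loop order, tile size) combination
    // generated by DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32 above.
    for(const auto& conv_ptr : instances)
        std::cout << conv_ptr->GetTypeString() << std::endl;
    return 0;
}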
@@ -37,26 +37,53 @@ using WeiElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
 
 template <typename T>
-static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
+static bool
+check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
 {
     int error_count = 0;
-    float max_diff = 1e-6;
+    float max_diff = 1e-5;
+    double square_difference = .0;
+    double mag1 = .0;
+    double mag2 = .0;
+
     for(int i = 0; i < ref.mData.size(); ++i)
     {
-        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        if(max_diff < diff)
+        double ri = (double)ref.mData[i];
+        double pi = (double)result.mData[i];
+        double d  = ri - pi;
+
+        if(per_pixel_check)
         {
-            error_count++;
-            printf("idx:%3d, ref:%f, res:%f (diff:%f)\n",
-                   i,
-                   double(ref.mData[i]),
-                   double(result.mData[i]),
-                   diff);
+            if(max_diff < std::abs(d))
+            {
+                error_count++;
+                printf("idx:%3d, ref:%f, res:%f (diff:%f)\n",
+                       i,
+                       double(ref.mData[i]),
+                       double(result.mData[i]),
+                       d);
+            }
         }
+
+        square_difference += d * d;
+        if(std::abs(mag1) < std::abs(ri))
+            mag1 = ri;
+        if(std::abs(mag2) < std::abs(pi))
+            mag2 = pi;
     }
-    return error_count == 0;
+
+    double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
+    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
+
+    if(computed_nrms >= nrms)
+        printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %1f\n",
+               computed_nrms,
+               mag1,
+               mag2,
+               nrms);
+
+    return computed_nrms < nrms && error_count == 0;
 }
 
 float calculate_gflops() {}
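
The new check replaces the pure per-element tolerance with a normalized RMS error, nrms = sqrt(sum_i (ref_i - res_i)^2) / (sqrt(N) * max(|mag1|, |mag2|)), where mag1 and mag2 are the largest-magnitude reference and result values. A self-contained sketch of the same bound, using only the standard library and collapsing mag1/mag2 into a single running maximum, for illustration only:

// Standalone illustration of the NRMS bound used by the new check_out().
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

static bool nrms_check(const std::vector<double>& ref, const std::vector<double>& res, double tol)
{
    double square_difference = 0.0;
    double mag               = std::numeric_limits<double>::min(); // guards against divide-by-zero
    for(std::size_t i = 0; i < ref.size(); ++i)
    {
        double d = ref[i] - res[i];
        square_difference += d * d;
        mag = std::max({mag, std::fabs(ref[i]), std::fabs(res[i])});
    }
    double nrms = std::sqrt(square_difference) / (std::sqrt((double)ref.size()) * mag);
    return nrms < tol;
}

int main()
{
    std::vector<double> ref{1.0, 2.0, 3.0};
    std::vector<double> res{1.0, 2.0000001, 3.0};
    std::printf("pass: %d\n", nrms_check(ref, res, 1e-6) ? 1 : 0); // prints "pass: 1"
    return 0;
}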
@@ -171,20 +198,28 @@ int main(int argc, char* argv[])
               << ", Dilation(H, W):" << conv_dilation_h << ", " << conv_dilation_w
               << ", Threads:" << omp_get_max_threads() << std::endl;
 
+    int per_pixel_check = 0;
+
     switch(init_method)
     {
-    case 0: break;
+    case 0:
+        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{});
+        per_pixel_check = 1;
+        break;
     case 1:
         in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
         // in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
         wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
         // wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{});
+        per_pixel_check = 1;
         break;
     case 2:
-        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
-        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{});
+        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
        break;
     case 3:
 #define PACK_32(v24, v16, v8, v0) \
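
The intent behind per_pixel_check appears to be numerical: for the constant and small-integer generators (cases 0 and 1) the reference and the AVX2 kernel can agree exactly, so a per-element comparison is enabled, whereas case 2 now fills the tensors with arbitrary floats, where the kernel's summation order differs from the naive host reference and only the NRMS bound is meaningful. A standalone illustration (not from this commit) of order-dependent float accumulation:

// Same float values, two accumulation orders, two different sums.
#include <cstdio>
#include <vector>

int main()
{
    std::vector<float> v{1e8f, 1.0f, -1e8f, 0.5f, 0.25f};

    float forward = 0.0f;
    for(std::size_t i = 0; i < v.size(); ++i)
        forward += v[i];

    float backward = 0.0f;
    for(std::size_t i = v.size(); i-- > 0;)
        backward += v[i];

    // A blocked/vectorized convolution sums its products in yet another order,
    // so bit-exact agreement with the host reference cannot be expected here.
    std::printf("forward:%.6f backward:%.6f equal:%d\n", forward, backward, forward == backward ? 1 : 0);
    return 0;
}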
@@ -310,7 +345,10 @@ int main(int argc, char* argv[])
 
         out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
 
-        if(!check_out(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result))
+        if(!check_out(out_n_k_ho_wo_host_result,
+                      out_n_k_ho_wo_device_result,
+                      1e-6,
+                      per_pixel_check))
         {
             std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
             success = false;
...