Fix a bug in general index calculation: rewind i_wi_itr_k by Dx * Fx when i_x_itr_k is reset to 0

5771a040 · carlushuang · 5e6cca6f · 5771a040 · 5771a040 · 5771a040
Commit 5771a040 authored Apr 27, 2022 by carlushuang
3 changed files
--- a/include/ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp
+++ b/include/ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp
@@ -567,6 +567,7 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
                            {
                                i_x_itr_k = 0;
                                i_y_itr_k++;
+                                i_wi_itr_k -= Dx * Fx;
                                i_hi_itr_k += Dy;
                                p_src_k += input_offset_ovf_x_acc_y;
                            }

--- a/library/src/tensor_operation_instance/cpu/conv2d_fwd/device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_instance.cpp
+++ b/library/src/tensor_operation_instance/cpu/conv2d_fwd/device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_instance.cpp
@@ -57,7 +57,6 @@ static constexpr auto LoopOver_MKN = ck::tensor_operation::cpu::device::LoopOver
    DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<float , float , float, a_elem_op, b_elem_op, c_elem_op, ConvFwd1x1P0,   DefaultGemmKLoop, LoopOver_MKN, 2, m_per_block, n_per_block, k_per_block, m_per_thread, n_per_thread, a_local_buf, b_local_buf, c_local_buf>

 // clang-format on
-
 using device_conv2d_fwd_avx2_nhwc_kyxc_nhwk_f32_instances = std::tuple<
    // clang-format off
    DEVICE_CONV2D_FWD_AVX2_NHWC_KYXC_NHWK_F32(PT, PT, PT,  256, 120,  64,  4, 24, true, true, false),

--- a/test/convnd_fwd_cpu/conv2d_fwd_cpu.cpp
+++ b/test/convnd_fwd_cpu/conv2d_fwd_cpu.cpp
@@ -37,26 +37,53 @@ using WeiElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::cpu::element_wise::PassThrough;

 template <typename T>
-static bool check_out(const Tensor<T>& ref, const Tensor<T>& result)
+static bool
+check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
 {
    int error_count = 0;
-    float max_diff  = 1e-6;
+    float max_diff  = 1e-5;
+
+    double square_difference = .0;
+    double mag1              = .0;
+    double mag2              = .0;

    for(int i = 0; i < ref.mData.size(); ++i)
    {
-        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        if(max_diff < diff)
+        double ri = (double)ref.mData[i];
+        double pi = (double)result.mData[i];
+        double d  = ri - pi;
+
+        if(per_pixel_check)
+        {
+            if(max_diff < std::abs(d))
            {
                error_count++;
                printf("idx:%3d, ref:%f, res:%f (diff:%f)\n",
                       i,
                       double(ref.mData[i]),
                       double(result.mData[i]),
-                   diff);
+                       d);
            }
        }

-    return error_count == 0;
+        square_difference += d * d;
+        if(std::abs(mag1) < std::abs(ri))
+            mag1 = ri;
+        if(std::abs(mag2) < std::abs(pi))
+            mag2 = pi;
+    }
+
+    double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
+    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
+
+    if(computed_nrms >= nrms)
+        printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %1f\n",
+               computed_nrms,
+               mag1,
+               mag2,
+               nrms);
+
+    return computed_nrms < nrms && error_count == 0;
 }

 float calculate_gflops() {}
@@ -171,20 +198,28 @@ int main(int argc, char* argv[])
                  << ", Dilation(H, W):" << conv_dilation_h << ", " << conv_dilation_w
                  << ", Threads:" << omp_get_max_threads() << std::endl;

+        int per_pixel_check = 0;
        switch(init_method)
        {
-        case 0: break;
+        case 0:
+            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
+            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{});
+            per_pixel_check = 1;
+            break;
        case 1:

            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
            // in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
            // wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{});
+            per_pixel_check = 1;
            break;
+
        case 2:
-            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
-            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{});
+            in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+            wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
            break;
+
        case 3:

 #define PACK_32(v24, v16, v8, v0) \
@@ -310,7 +345,10 @@ int main(int argc, char* argv[])

                out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());

-                if(!check_out(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result))
+                if(!check_out(out_n_k_ho_wo_host_result,
+                              out_n_k_ho_wo_device_result,
+                              1e-6,
+                              per_pixel_check))
                {
                    std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
                    success = false;