"docs/git@developer.sourcefind.cn:change/sglang.git" did not exist on "9d02bb3e2a61adb31558400b838c49d502656b12"
Commit e8f639d2 authored by carlushuang

fix a bug when buffer is larger than 4G

parent 3e2a530f
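
Note on the change below: in both example programs, check_out now takes raw pointers plus an explicit std::size_t length instead of Tensor<T> references, and its loop index and error counter change from int to std::size_t. With an int counter, a buffer holding more than INT_MAX elements overflows the index. A minimal standalone sketch of the difference (illustrative only, not part of this commit; assumes a 64-bit std::size_t):

#include <climits>
#include <cstddef>
#include <cstdio>

int main()
{
    // Illustrative element count exceeding INT_MAX (a buffer well past 4G entries).
    const std::size_t len = 5'000'000'000ULL;

    // Broken pattern from the old check_out: an int counter cannot reach `len`;
    // incrementing past INT_MAX is signed overflow (undefined behaviour), so in
    // practice the loop stops early and the upper part of the buffer is never checked.
    //   for(int i = 0; i < len; ++i) { ... }

    // Fixed pattern: a std::size_t counter covers the whole range.
    std::size_t visited = 0;
    for(std::size_t i = 0; i < len; i += 1'000'000'000ULL) // large stride only to keep the demo fast
        ++visited;

    std::printf("INT_MAX = %d, len = %zu, strided visits = %zu\n", INT_MAX, len, visited);
    return 0;
}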
@@ -116,19 +116,19 @@ using OutElementOp = ck::tensor_operation::cpu::element_wise::Relu;
 template <typename T>
 static bool
-check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
+check_out(const T* ref, const T* result, std::size_t len, double nrms, int per_pixel_check = 0)
 {
-    int error_count = 0;
+    std::size_t error_count = 0;
     float max_diff = 1e-5;
     double square_difference = .0;
     double mag1 = .0;
     double mag2 = .0;
-    for(int i = 0; i < ref.mData.size(); ++i)
+    for(std::size_t i = 0; i < len; ++i)
     {
-        double ri = (double)ref.mData[i];
-        double pi = (double)result.mData[i];
+        double ri = (double)ref[i];
+        double pi = (double)result[i];
         double d = ri - pi;
         if(per_pixel_check)
@@ -136,11 +136,8 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
             if(max_diff < std::abs(d))
             {
                 error_count++;
-                printf("idx:%3d, ref:%f, res:%f (diff:%f)\n",
-                       i,
-                       double(ref.mData[i]),
-                       double(result.mData[i]),
-                       d);
+                printf(
+                    "idx:%3d, ref:%f, res:%f (diff:%f)\n", i, double(ref[i]), double(result[i]), d);
             }
         }
@@ -152,7 +149,7 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
     }
     double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
-    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
+    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(len) * mag);
     if(computed_nrms >= nrms)
         printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %1f\n",
@@ -360,7 +357,6 @@ int main(int argc, char* argv[])
                         f_host_tensor_descriptor(K, C, Y, X)); // TODO: This is only to hold data
 #endif
     Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
-    Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));
     std::cout << "in (N, C, Hi, Wi): " << in_n_c_hi_wi.mDesc << std::endl;
     std::cout << "wei(K, C, Y, X): " << wei_k_c_y_x.mDesc << std::endl;
@@ -651,12 +647,12 @@ int main(int argc, char* argv[])
         double gflops = (total_flop * 1e-6) / time;
-        out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-
-        if(cpu_validation && !check_out(out_n_k_ho_wo_host_result,
-                                        out_n_k_ho_wo_device_result,
+        if(cpu_validation &&
+           !check_out(out_n_k_ho_wo_host_result.mData.data(),
+                      reinterpret_cast<OutDataType*>(out_device_buf.mpDeviceBuf),
+                      out_n_k_ho_wo_host_result.mData.size(),
                       1e-6,
                       per_pixel_check))
         {
             std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
             success = false;
...
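At the call site, the intermediate out_n_k_ho_wo_device_result tensor and the out_device_buf.FromDevice(...) copy are dropped: the host reference is compared directly against out_device_buf.mpDeviceBuf, which for this CPU backend is ordinary host memory, so a second multi-gigabyte host copy of the output is no longer needed. A self-contained sketch of that pointer-plus-length validation pattern (check_close here is a simplified, hypothetical stand-in for check_out, which additionally accumulates an NRMS):

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <vector>

// Same interface shape as the new check_out: raw pointers plus an explicit
// length, so one routine can validate a Tensor's storage or a raw device
// buffer that is directly addressable on the CPU backend.
template <typename T>
static bool check_close(const T* ref, const T* res, std::size_t len, double tol = 1e-5)
{
    for(std::size_t i = 0; i < len; ++i)
        if(std::fabs(double(ref[i]) - double(res[i])) > tol)
            return false;
    return true;
}

int main()
{
    const std::size_t n = 1024;

    std::vector<float> host_ref(n, 1.0f);                            // host-side reference result
    float* dev_buf = static_cast<float*>(malloc(n * sizeof(float))); // stand-in for mpDeviceBuf
    for(std::size_t i = 0; i < n; ++i)
        dev_buf[i] = 1.0f;

    // Same call shape as the examples after this commit:
    //   check_out(host_result.mData.data(),
    //             reinterpret_cast<OutDataType*>(out_device_buf.mpDeviceBuf),
    //             host_result.mData.size(), ...)
    bool ok = check_close(host_ref.data(), dev_buf, n);
    std::printf("validation %s\n", ok ? "passed" : "failed");

    free(dev_buf);
    return 0;
}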
@@ -152,19 +152,19 @@ using OutElementOp = ck::tensor_operation::cpu::element_wise::Add;
 template <typename T>
 static bool
-check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
+check_out(const T* ref, const T* result, std::size_t len, double nrms, int per_pixel_check = 0)
 {
-    int error_count = 0;
+    std::size_t error_count = 0;
     float max_diff = 1e-5;
     double square_difference = .0;
     double mag1 = .0;
     double mag2 = .0;
-    for(int i = 0; i < ref.mData.size(); ++i)
+    for(std::size_t i = 0; i < len; ++i)
     {
-        double ri = (double)ref.mData[i];
-        double pi = (double)result.mData[i];
+        double ri = (double)ref[i];
+        double pi = (double)result[i];
         double d = ri - pi;
         if(per_pixel_check)
@@ -172,11 +172,8 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
             if(max_diff < std::abs(d))
             {
                 error_count++;
-                printf("idx:%3d, ref:%f, res:%f (diff:%f)\n",
-                       i,
-                       double(ref.mData[i]),
-                       double(result.mData[i]),
-                       d);
+                printf(
+                    "idx:%3d, ref:%f, res:%f (diff:%f)\n", i, double(ref[i]), double(result[i]), d);
             }
         }
@@ -188,7 +185,7 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
     }
     double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
-    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
+    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(len) * mag);
     if(computed_nrms >= nrms)
         printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %1f\n",
@@ -407,7 +404,6 @@ int main(int argc, char* argv[])
                         f_host_tensor_descriptor(K, C, Y, X)); // TODO: This is only to hold data
 #endif
     Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
-    Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));
     // bias: assume contiguous 1d vector
     Tensor<OutDataType> bias(
@@ -788,12 +784,12 @@ int main(int argc, char* argv[])
         double gflops = (total_flop * 1e-6) / time;
-        out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-
-        if(cpu_validation && !check_out(out_n_k_ho_wo_host_result,
-                                        out_n_k_ho_wo_device_result,
+        if(cpu_validation &&
+           !check_out(out_n_k_ho_wo_host_result.mData.data(),
+                      reinterpret_cast<OutDataType*>(out_device_buf.mpDeviceBuf),
+                      out_n_k_ho_wo_host_result.mData.size(),
                       1e-6,
                       per_pixel_check))
         {
             std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
             success = false;
...
@@ -693,7 +693,7 @@ struct DeviceConvNDDirectFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_W
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_direct_conv_nhwc_avx_mxn<GridwiseGemm,
                                                                      InDataType,
@@ -734,7 +734,7 @@ struct DeviceConvNDDirectFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_W
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0xfe, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0xfe, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
@@ -712,7 +712,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
                                                          InDataType,
@@ -743,7 +743,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
@@ -688,7 +688,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
                                                          InDataType,
@@ -719,7 +719,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
@@ -681,7 +681,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Output_N_Ho_Wo_K
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
                                                          InDataType,
@@ -712,7 +712,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Output_N_Ho_Wo_K
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
@@ -785,7 +785,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel =
             ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
@@ -825,7 +825,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
@@ -762,7 +762,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel =
             ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
@@ -802,7 +802,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
        // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
@@ -758,7 +758,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel =
             ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
@@ -798,7 +798,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
 #include <chrono>
 #include <assert.h>
 #include <string.h>
+#include <stdlib.h>
 #include "device.hpp"
 #ifndef CK_NOGPU
@@ -85,15 +86,10 @@ DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t align
     {
         assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2
-        void* p1;
-        void** p2;
-        int offset = alignment - 1 + sizeof(void*);
-        p1 = malloc(mem_size + offset);
-        assert(p1 != nullptr);
-        p2 = reinterpret_cast<void**>((reinterpret_cast<size_t>(p1) + offset) & ~(alignment - 1));
-        p2[-1] = p1;
-        mpDeviceBuf = reinterpret_cast<void*>(p2);
+        // TODO: posix only
+        int rtn = posix_memalign(&mpDeviceBuf, alignment, mem_size);
+        assert(rtn == 0);
     }
 }
@@ -110,7 +106,7 @@ void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }
 DeviceAlignedMemCPU::~DeviceAlignedMemCPU()
 {
     if(mpDeviceBuf != nullptr)
-        free((reinterpret_cast<void**>(mpDeviceBuf))[-1]);
+        free(mpDeviceBuf);
 }
 struct WallTimerImpl
...
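In the last hunks above, DeviceAlignedMemCPU stops hand-rolling alignment (over-allocate with malloc, round the pointer up, and stash the original pointer at p2[-1] so the destructor can recover it) and instead calls posix_memalign, freeing the returned pointer directly; <stdlib.h> is added for its declaration. A minimal sketch of that allocation pattern, assuming a POSIX system (size and alignment values here are illustrative):

#include <stdio.h>
#include <stdlib.h> // posix_memalign, free (POSIX)

int main()
{
    void* buf = nullptr;
    // Alignment must be a power of two and a multiple of sizeof(void*).
    size_t alignment = 64;
    // The size parameter is a size_t, so requests above 4 GiB are representable;
    // a small size is used here so the sketch runs anywhere.
    size_t bytes = size_t(1) << 20;

    int rtn = posix_memalign(&buf, alignment, bytes);
    if(rtn != 0)
    {
        fprintf(stderr, "posix_memalign failed: %d\n", rtn);
        return 1;
    }

    // ... use buf ...

    // Unlike the removed manual scheme (which freed the pointer stored at p2[-1]),
    // the pointer returned by posix_memalign is freed directly.
    free(buf);
    return 0;
}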