Commit e8f639d2 authored by carlushuang
Browse files

Fix a bug when the buffer is larger than 4G

parent 3e2a530f
......@@ -116,19 +116,19 @@ using OutElementOp = ck::tensor_operation::cpu::element_wise::Relu;
template <typename T>
static bool
check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
check_out(const T* ref, const T* result, std::size_t len, double nrms, int per_pixel_check = 0)
{
int error_count = 0;
std::size_t error_count = 0;
float max_diff = 1e-5;
double square_difference = .0;
double mag1 = .0;
double mag2 = .0;
for(int i = 0; i < ref.mData.size(); ++i)
for(std::size_t i = 0; i < len; ++i)
{
double ri = (double)ref.mData[i];
double pi = (double)result.mData[i];
double ri = (double)ref[i];
double pi = (double)result[i];
double d = ri - pi;
if(per_pixel_check)
......@@ -136,11 +136,8 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
if(max_diff < std::abs(d))
{
error_count++;
printf("idx:%3d, ref:%f, res:%f (diff:%f)\n",
i,
double(ref.mData[i]),
double(result.mData[i]),
d);
printf(
"idx:%3d, ref:%f, res:%f (diff:%f)\n", i, double(ref[i]), double(result[i]), d);
}
}
......@@ -152,7 +149,7 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
}
double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
double computed_nrms = std::sqrt(square_difference) / (std::sqrt(len) * mag);
if(computed_nrms >= nrms)
printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %1f\n",
......@@ -360,7 +357,6 @@ int main(int argc, char* argv[])
f_host_tensor_descriptor(K, C, Y, X)); // TODO: This is only to hold data
#endif
Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));
std::cout << "in (N, C, Hi, Wi): " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "wei(K, C, Y, X): " << wei_k_c_y_x.mDesc << std::endl;
......@@ -651,10 +647,10 @@ int main(int argc, char* argv[])
double gflops = (total_flop * 1e-6) / time;
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
if(cpu_validation && !check_out(out_n_k_ho_wo_host_result,
out_n_k_ho_wo_device_result,
if(cpu_validation &&
!check_out(out_n_k_ho_wo_host_result.mData.data(),
reinterpret_cast<OutDataType*>(out_device_buf.mpDeviceBuf),
out_n_k_ho_wo_host_result.mData.size(),
1e-6,
per_pixel_check))
{
......
......@@ -152,19 +152,19 @@ using OutElementOp = ck::tensor_operation::cpu::element_wise::Add;
template <typename T>
static bool
check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
check_out(const T* ref, const T* result, std::size_t len, double nrms, int per_pixel_check = 0)
{
int error_count = 0;
std::size_t error_count = 0;
float max_diff = 1e-5;
double square_difference = .0;
double mag1 = .0;
double mag2 = .0;
for(int i = 0; i < ref.mData.size(); ++i)
for(std::size_t i = 0; i < len; ++i)
{
double ri = (double)ref.mData[i];
double pi = (double)result.mData[i];
double ri = (double)ref[i];
double pi = (double)result[i];
double d = ri - pi;
if(per_pixel_check)
......@@ -172,11 +172,8 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
if(max_diff < std::abs(d))
{
error_count++;
printf("idx:%3d, ref:%f, res:%f (diff:%f)\n",
i,
double(ref.mData[i]),
double(result.mData[i]),
d);
printf(
"idx:%3d, ref:%f, res:%f (diff:%f)\n", i, double(ref[i]), double(result[i]), d);
}
}
......@@ -188,7 +185,7 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
}
double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
double computed_nrms = std::sqrt(square_difference) / (std::sqrt(len) * mag);
if(computed_nrms >= nrms)
printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %1f\n",
......@@ -407,7 +404,6 @@ int main(int argc, char* argv[])
f_host_tensor_descriptor(K, C, Y, X)); // TODO: This is only to hold data
#endif
Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));
// bias: assume contiguous 1d vector
Tensor<OutDataType> bias(
......@@ -788,10 +784,10 @@ int main(int argc, char* argv[])
double gflops = (total_flop * 1e-6) / time;
out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
if(cpu_validation && !check_out(out_n_k_ho_wo_host_result,
out_n_k_ho_wo_device_result,
if(cpu_validation &&
!check_out(out_n_k_ho_wo_host_result.mData.data(),
reinterpret_cast<OutDataType*>(out_device_buf.mpDeviceBuf),
out_n_k_ho_wo_host_result.mData.size(),
1e-6,
per_pixel_check))
{
......
......@@ -693,7 +693,7 @@ struct DeviceConvNDDirectFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_W
throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
}
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
const auto kernel = ck::cpu::kernel_direct_conv_nhwc_avx_mxn<GridwiseGemm,
InDataType,
......@@ -734,7 +734,7 @@ struct DeviceConvNDDirectFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_W
// TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
// result
memset(arg.p_c_grid_, 0xfe, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0xfe, arg.c_grid_desc_.GetElementSpaceSize());
launch_cpu_kernel(kernel,
gridwise_gemm,
......
......@@ -712,7 +712,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
}
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
InDataType,
......@@ -743,7 +743,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
// TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
// result
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
launch_cpu_kernel(kernel,
gridwise_gemm,
......
......@@ -688,7 +688,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K
throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
}
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
InDataType,
......@@ -719,7 +719,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K
// TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
// result
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
launch_cpu_kernel(kernel,
gridwise_gemm,
......
......@@ -681,7 +681,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Output_N_Ho_Wo_K
throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
}
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
InDataType,
......@@ -712,7 +712,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Output_N_Ho_Wo_K
// TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
// result
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
launch_cpu_kernel(kernel,
gridwise_gemm,
......
......@@ -785,7 +785,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu
throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
}
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
const auto kernel =
ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
......@@ -825,7 +825,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu
// TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
// result
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
launch_cpu_kernel(kernel,
gridwise_gemm,
......
......@@ -762,7 +762,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou
throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
}
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
const auto kernel =
ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
......@@ -802,7 +802,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou
// TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
// result
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
launch_cpu_kernel(kernel,
gridwise_gemm,
......
......@@ -758,7 +758,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu
throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
}
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
const auto kernel =
ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
......@@ -798,7 +798,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu
// TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
// result
memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
launch_cpu_kernel(kernel,
gridwise_gemm,
......
#include <chrono>
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include "device.hpp"
#ifndef CK_NOGPU
......@@ -85,15 +86,10 @@ DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t align
{
assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2
void* p1;
void** p2;
int offset = alignment - 1 + sizeof(void*);
p1 = malloc(mem_size + offset);
assert(p1 != nullptr);
// TODO: posix only
int rtn = posix_memalign(&mpDeviceBuf, alignment, mem_size);
p2 = reinterpret_cast<void**>((reinterpret_cast<size_t>(p1) + offset) & ~(alignment - 1));
p2[-1] = p1;
mpDeviceBuf = reinterpret_cast<void*>(p2);
assert(rtn == 0);
}
}
......@@ -110,7 +106,7 @@ void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }
DeviceAlignedMemCPU::~DeviceAlignedMemCPU()
{
if(mpDeviceBuf != nullptr)
free((reinterpret_cast<void**>(mpDeviceBuf))[-1]);
free(mpDeviceBuf);
}
struct WallTimerImpl
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment