"docs/git@developer.sourcefind.cn:change/sglang.git" did not exist on "9d02bb3e2a61adb31558400b838c49d502656b12"
Commit e8f639d2 authored by carlushuang

fix a bug when buffer is larger than 4G

parent 3e2a530f
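
Note on the change below: in both example programs, check_out now takes raw pointers plus an explicit std::size_t length instead of Tensor<T> references, and its loop index and error counter change from int to std::size_t. With an int counter, a buffer holding more than INT_MAX elements overflows the index. A minimal standalone sketch of the difference (illustrative only, not part of this commit; assumes a 64-bit std::size_t):

#include <climits>
#include <cstddef>
#include <cstdio>

int main()
{
    // Illustrative element count exceeding INT_MAX (a buffer well past 4G entries).
    const std::size_t len = 5'000'000'000ULL;

    // Broken pattern from the old check_out: an int counter cannot reach `len`;
    // incrementing past INT_MAX is signed overflow (undefined behaviour), so in
    // practice the loop stops early and the upper part of the buffer is never checked.
    //   for(int i = 0; i < len; ++i) { ... }

    // Fixed pattern: a std::size_t counter covers the whole range.
    std::size_t visited = 0;
    for(std::size_t i = 0; i < len; i += 1'000'000'000ULL) // large stride only to keep the demo fast
        ++visited;

    std::printf("INT_MAX = %d, len = %zu, strided visits = %zu\n", INT_MAX, len, visited);
    return 0;
}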
@@ -116,19 +116,19 @@ using OutElementOp = ck::tensor_operation::cpu::element_wise::Relu;
 template <typename T>
 static bool
-check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
+check_out(const T* ref, const T* result, std::size_t len, double nrms, int per_pixel_check = 0)
 {
-    int error_count = 0;
+    std::size_t error_count = 0;
     float max_diff = 1e-5;
     double square_difference = .0;
     double mag1 = .0;
     double mag2 = .0;
-    for(int i = 0; i < ref.mData.size(); ++i)
+    for(std::size_t i = 0; i < len; ++i)
     {
-        double ri = (double)ref.mData[i];
-        double pi = (double)result.mData[i];
+        double ri = (double)ref[i];
+        double pi = (double)result[i];
         double d = ri - pi;
         if(per_pixel_check)
@@ -136,11 +136,8 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
             if(max_diff < std::abs(d))
             {
                 error_count++;
-                printf("idx:%3d, ref:%f, res:%f (diff:%f)\n",
-                       i,
-                       double(ref.mData[i]),
-                       double(result.mData[i]),
-                       d);
+                printf(
+                    "idx:%3d, ref:%f, res:%f (diff:%f)\n", i, double(ref[i]), double(result[i]), d);
             }
         }
@@ -152,7 +149,7 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
     }
     double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
-    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
+    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(len) * mag);
     if(computed_nrms >= nrms)
         printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %1f\n",
@@ -360,7 +357,6 @@ int main(int argc, char* argv[])
                         f_host_tensor_descriptor(K, C, Y, X)); // TODO: This is only to hold data
 #endif
     Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
-    Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));
     std::cout << "in (N, C, Hi, Wi): " << in_n_c_hi_wi.mDesc << std::endl;
     std::cout << "wei(K, C, Y, X): " << wei_k_c_y_x.mDesc << std::endl;
@@ -651,12 +647,12 @@ int main(int argc, char* argv[])
         double gflops = (total_flop * 1e-6) / time;
-        out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-
-        if(cpu_validation && !check_out(out_n_k_ho_wo_host_result,
-                                        out_n_k_ho_wo_device_result,
+        if(cpu_validation &&
+           !check_out(out_n_k_ho_wo_host_result.mData.data(),
+                      reinterpret_cast<OutDataType*>(out_device_buf.mpDeviceBuf),
+                      out_n_k_ho_wo_host_result.mData.size(),
                       1e-6,
                       per_pixel_check))
         {
             std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
             success = false;
...
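At the call site, the intermediate out_n_k_ho_wo_device_result tensor and the out_device_buf.FromDevice(...) copy are dropped: the host reference is compared directly against out_device_buf.mpDeviceBuf, which for this CPU backend is ordinary host memory, so a second multi-gigabyte host copy of the output is no longer needed. A self-contained sketch of that pointer-plus-length validation pattern (check_close here is a simplified, hypothetical stand-in for check_out, which additionally accumulates an NRMS):

#include <cmath>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <vector>

// Same interface shape as the new check_out: raw pointers plus an explicit
// length, so one routine can validate a Tensor's storage or a raw device
// buffer that is directly addressable on the CPU backend.
template <typename T>
static bool check_close(const T* ref, const T* res, std::size_t len, double tol = 1e-5)
{
    for(std::size_t i = 0; i < len; ++i)
        if(std::fabs(double(ref[i]) - double(res[i])) > tol)
            return false;
    return true;
}

int main()
{
    const std::size_t n = 1024;

    std::vector<float> host_ref(n, 1.0f);                            // host-side reference result
    float* dev_buf = static_cast<float*>(malloc(n * sizeof(float))); // stand-in for mpDeviceBuf
    for(std::size_t i = 0; i < n; ++i)
        dev_buf[i] = 1.0f;

    // Same call shape as the examples after this commit:
    //   check_out(host_result.mData.data(),
    //             reinterpret_cast<OutDataType*>(out_device_buf.mpDeviceBuf),
    //             host_result.mData.size(), ...)
    bool ok = check_close(host_ref.data(), dev_buf, n);
    std::printf("validation %s\n", ok ? "passed" : "failed");

    free(dev_buf);
    return 0;
}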
@@ -152,19 +152,19 @@ using OutElementOp = ck::tensor_operation::cpu::element_wise::Add;
 template <typename T>
 static bool
-check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
+check_out(const T* ref, const T* result, std::size_t len, double nrms, int per_pixel_check = 0)
 {
-    int error_count = 0;
+    std::size_t error_count = 0;
     float max_diff = 1e-5;
     double square_difference = .0;
     double mag1 = .0;
     double mag2 = .0;
-    for(int i = 0; i < ref.mData.size(); ++i)
+    for(std::size_t i = 0; i < len; ++i)
     {
-        double ri = (double)ref.mData[i];
-        double pi = (double)result.mData[i];
+        double ri = (double)ref[i];
+        double pi = (double)result[i];
         double d = ri - pi;
         if(per_pixel_check)
@@ -172,11 +172,8 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
             if(max_diff < std::abs(d))
             {
                 error_count++;
-                printf("idx:%3d, ref:%f, res:%f (diff:%f)\n",
-                       i,
-                       double(ref.mData[i]),
-                       double(result.mData[i]),
-                       d);
+                printf(
+                    "idx:%3d, ref:%f, res:%f (diff:%f)\n", i, double(ref[i]), double(result[i]), d);
             }
         }
@@ -188,7 +185,7 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
     }
     double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
-    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
+    double computed_nrms = std::sqrt(square_difference) / (std::sqrt(len) * mag);
     if(computed_nrms >= nrms)
         printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %1f\n",
@@ -407,7 +404,6 @@ int main(int argc, char* argv[])
                         f_host_tensor_descriptor(K, C, Y, X)); // TODO: This is only to hold data
 #endif
     Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
-    Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));
     // bias: assume contiguous 1d vector
     Tensor<OutDataType> bias(
@@ -788,12 +784,12 @@ int main(int argc, char* argv[])
         double gflops = (total_flop * 1e-6) / time;
-        out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-
-        if(cpu_validation && !check_out(out_n_k_ho_wo_host_result,
-                                        out_n_k_ho_wo_device_result,
+        if(cpu_validation &&
+           !check_out(out_n_k_ho_wo_host_result.mData.data(),
+                      reinterpret_cast<OutDataType*>(out_device_buf.mpDeviceBuf),
+                      out_n_k_ho_wo_host_result.mData.size(),
                       1e-6,
                       per_pixel_check))
         {
             std::cout << "Fail Info: " << conv_ptr->GetTypeString() << std::endl;
             success = false;
...
@@ -693,7 +693,7 @@ struct DeviceConvNDDirectFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_W
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_direct_conv_nhwc_avx_mxn<GridwiseGemm,
                                                                      InDataType,
@@ -734,7 +734,7 @@ struct DeviceConvNDDirectFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_W
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0xfe, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0xfe, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
@@ -712,7 +712,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
                                                          InDataType,
@@ -743,7 +743,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
@@ -688,7 +688,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
                                                          InDataType,
@@ -719,7 +719,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
@@ -681,7 +681,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Output_N_Ho_Wo_K
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
                                                          InDataType,
@@ -712,7 +712,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Output_N_Ho_Wo_K
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
@@ -785,7 +785,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel =
             ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
@@ -825,7 +825,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
@@ -762,7 +762,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel =
             ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
@@ -802,7 +802,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
        // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
@@ -758,7 +758,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu
             throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
         }
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         const auto kernel =
             ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
@@ -798,7 +798,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu
         // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
         // result
-        memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+        // memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
         launch_cpu_kernel(kernel,
                           gridwise_gemm,
...
 #include <chrono>
 #include <assert.h>
 #include <string.h>
+#include <stdlib.h>
 #include "device.hpp"
 #ifndef CK_NOGPU
@@ -85,15 +86,10 @@ DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t align
     {
         assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2
-        void* p1;
-        void** p2;
-        int offset = alignment - 1 + sizeof(void*);
-        p1 = malloc(mem_size + offset);
-        assert(p1 != nullptr);
-        p2 = reinterpret_cast<void**>((reinterpret_cast<size_t>(p1) + offset) & ~(alignment - 1));
-        p2[-1] = p1;
-        mpDeviceBuf = reinterpret_cast<void*>(p2);
+        // TODO: posix only
+        int rtn = posix_memalign(&mpDeviceBuf, alignment, mem_size);
+        assert(rtn == 0);
     }
 }
@@ -110,7 +106,7 @@ void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }
 DeviceAlignedMemCPU::~DeviceAlignedMemCPU()
 {
     if(mpDeviceBuf != nullptr)
-        free((reinterpret_cast<void**>(mpDeviceBuf))[-1]);
+        free(mpDeviceBuf);
 }
 struct WallTimerImpl
...
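In the last hunks above, DeviceAlignedMemCPU stops hand-rolling alignment (over-allocate with malloc, round the pointer up, and stash the original pointer at p2[-1] so the destructor can recover it) and instead calls posix_memalign, freeing the returned pointer directly; <stdlib.h> is added for its declaration. A minimal sketch of that allocation pattern, assuming a POSIX system (size and alignment values here are illustrative):

#include <stdio.h>
#include <stdlib.h> // posix_memalign, free (POSIX)

int main()
{
    void* buf = nullptr;
    // Alignment must be a power of two and a multiple of sizeof(void*).
    size_t alignment = 64;
    // The size parameter is a size_t, so requests above 4 GiB are representable;
    // a small size is used here so the sketch runs anywhere.
    size_t bytes = size_t(1) << 20;

    int rtn = posix_memalign(&buf, alignment, bytes);
    if(rtn != 0)
    {
        fprintf(stderr, "posix_memalign failed: %d\n", rtn);
        return 1;
    }

    // ... use buf ...

    // Unlike the removed manual scheme (which freed the pointer stored at p2[-1]),
    // the pointer returned by posix_memalign is freed directly.
    free(buf);
    return 0;
}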