"git@developer.sourcefind.cn:zhaoyu6/sglang.git" did not exist on "ac05310098aa525ca526c04539d5549cb4fc00a7"
Commit e8f639d2 authored by carlushuang

fix a bug when buffer is larger than 4G

parent 3e2a530f
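The change set below replaces the 32-bit `ck::index_t` with 64-bit types (`ck::long_index_t` / `intptr_t`) in the AVX2 CPU paths, switches the host-side `check_out` helper to raw pointers plus a `std::size_t` length, and validates directly against the device buffer. A minimal sketch of the underlying failure mode, assuming `ck::index_t` is a 32-bit integer (which is what the replacements in the diff suggest) and using a hypothetical tensor shape:

```cpp
// Illustration only, not code from this commit: a flat NHWC offset computed in
// 32-bit arithmetic overflows once the buffer grows past a few GiB, so pointer
// arithmetic lands in the wrong place. Widening to a 64-bit type (as the diff
// does with intptr_t / ck::long_index_t) keeps the product exact on a 64-bit host.
#include <cstdint>
#include <cstdio>

int main()
{
    std::int32_t N = 4, Hi = 1080, Wi = 1920, C = 512; // hypothetical shape, ~16 GiB of floats

    // Old-style 32-bit math: the product 4'246'732'800 does not fit in int32
    // (signed overflow is undefined behaviour; in practice it wraps to a negative value).
    std::int32_t bad_offset = N * Hi * Wi * C;

    // Fixed-style math: widen before multiplying.
    std::intptr_t good_offset = static_cast<std::intptr_t>(N) * Hi * Wi * C;

    std::printf("32-bit offset: %d\n", bad_offset);
    std::printf("64-bit offset: %lld\n", static_cast<long long>(good_offset));
    return 0;
}
```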
@@ -116,19 +116,19 @@ using OutElementOp = ck::tensor_operation::cpu::element_wise::Relu;
 template <typename T>
 static bool
-check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
+check_out(const T* ref, const T* result, std::size_t len, double nrms, int per_pixel_check = 0)
 {
-int error_count = 0;
+std::size_t error_count = 0;
 float max_diff = 1e-5;
 double square_difference = .0;
 double mag1 = .0;
 double mag2 = .0;
-for(int i = 0; i < ref.mData.size(); ++i)
+for(std::size_t i = 0; i < len; ++i)
 {
-double ri = (double)ref.mData[i];
-double pi = (double)result.mData[i];
+double ri = (double)ref[i];
+double pi = (double)result[i];
 double d = ri - pi;
 if(per_pixel_check)
@@ -136,11 +136,8 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
 if(max_diff < std::abs(d))
 {
 error_count++;
-printf("idx:%3d, ref:%f, res:%f (diff:%f)\n",
-i,
-double(ref.mData[i]),
-double(result.mData[i]),
-d);
+printf(
+"idx:%3d, ref:%f, res:%f (diff:%f)\n", i, double(ref[i]), double(result[i]), d);
 }
 }
@@ -152,7 +149,7 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
 }
 double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
-double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
+double computed_nrms = std::sqrt(square_difference) / (std::sqrt(len) * mag);
 if(computed_nrms >= nrms)
 printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %1f\n",
@@ -360,7 +357,6 @@ int main(int argc, char* argv[])
 f_host_tensor_descriptor(K, C, Y, X)); // TODO: This is only to hold data
 #endif
 Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
-Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));
 std::cout << "in (N, C, Hi, Wi): " << in_n_c_hi_wi.mDesc << std::endl;
 std::cout << "wei(K, C, Y, X): " << wei_k_c_y_x.mDesc << std::endl;
@@ -651,10 +647,10 @@ int main(int argc, char* argv[])
 double gflops = (total_flop * 1e-6) / time;
-out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-if(cpu_validation && !check_out(out_n_k_ho_wo_host_result,
-out_n_k_ho_wo_device_result,
+if(cpu_validation &&
+!check_out(out_n_k_ho_wo_host_result.mData.data(),
+reinterpret_cast<OutDataType*>(out_device_buf.mpDeviceBuf),
+out_n_k_ho_wo_host_result.mData.size(),
 1e-6,
 per_pixel_check))
 {
......
@@ -152,19 +152,19 @@ using OutElementOp = ck::tensor_operation::cpu::element_wise::Add;
 template <typename T>
 static bool
-check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pixel_check = 0)
+check_out(const T* ref, const T* result, std::size_t len, double nrms, int per_pixel_check = 0)
 {
-int error_count = 0;
+std::size_t error_count = 0;
 float max_diff = 1e-5;
 double square_difference = .0;
 double mag1 = .0;
 double mag2 = .0;
-for(int i = 0; i < ref.mData.size(); ++i)
+for(std::size_t i = 0; i < len; ++i)
 {
-double ri = (double)ref.mData[i];
-double pi = (double)result.mData[i];
+double ri = (double)ref[i];
+double pi = (double)result[i];
 double d = ri - pi;
 if(per_pixel_check)
@@ -172,11 +172,8 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
 if(max_diff < std::abs(d))
 {
 error_count++;
-printf("idx:%3d, ref:%f, res:%f (diff:%f)\n",
-i,
-double(ref.mData[i]),
-double(result.mData[i]),
-d);
+printf(
+"idx:%3d, ref:%f, res:%f (diff:%f)\n", i, double(ref[i]), double(result[i]), d);
 }
 }
@@ -188,7 +185,7 @@ check_out(const Tensor<T>& ref, const Tensor<T>& result, double nrms, int per_pi
 }
 double mag = std::max({std::fabs(mag1), std::fabs(mag2), std::numeric_limits<double>::min()});
-double computed_nrms = std::sqrt(square_difference) / (std::sqrt(ref.mData.size()) * mag);
+double computed_nrms = std::sqrt(square_difference) / (std::sqrt(len) * mag);
 if(computed_nrms >= nrms)
 printf("nrms:%lf, mag1:%lf, mag2:%lf, expected_nrms is %1f\n",
@@ -407,7 +404,6 @@ int main(int argc, char* argv[])
 f_host_tensor_descriptor(K, C, Y, X)); // TODO: This is only to hold data
 #endif
 Tensor<OutDataType> out_n_k_ho_wo_host_result(f_host_tensor_descriptor(N, K, Ho, Wo));
-Tensor<OutDataType> out_n_k_ho_wo_device_result(f_host_tensor_descriptor(N, K, Ho, Wo));
 // bias: assume contiguous 1d vector
 Tensor<OutDataType> bias(
@@ -788,10 +784,10 @@ int main(int argc, char* argv[])
 double gflops = (total_flop * 1e-6) / time;
-out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
-if(cpu_validation && !check_out(out_n_k_ho_wo_host_result,
-out_n_k_ho_wo_device_result,
+if(cpu_validation &&
+!check_out(out_n_k_ho_wo_host_result.mData.data(),
+reinterpret_cast<OutDataType*>(out_device_buf.mpDeviceBuf),
+out_n_k_ho_wo_host_result.mData.size(),
 1e-6,
 per_pixel_check))
 {
......
@@ -693,7 +693,7 @@ struct DeviceConvNDDirectFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_W
 throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
 }
-memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
 const auto kernel = ck::cpu::kernel_direct_conv_nhwc_avx_mxn<GridwiseGemm,
 InDataType,
@@ -734,7 +734,7 @@ struct DeviceConvNDDirectFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_W
 // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
 // result
-memset(arg.p_c_grid_, 0xfe, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0xfe, arg.c_grid_desc_.GetElementSpaceSize());
 launch_cpu_kernel(kernel,
 gridwise_gemm,
......
@@ -712,7 +712,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
 throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
 }
-memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
 const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
 InDataType,
@@ -743,7 +743,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
 // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
 // result
-memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
 launch_cpu_kernel(kernel,
 gridwise_gemm,
......
@@ -688,7 +688,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K
 throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
 }
-memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
 const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
 InDataType,
@@ -719,7 +719,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Output_N_Ho_Wo_K
 // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
 // result
-memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
 launch_cpu_kernel(kernel,
 gridwise_gemm,
......
@@ -681,7 +681,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Output_N_Ho_Wo_K
 throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
 }
-memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
 const auto kernel = ck::cpu::kernel_gemm_avx_mxn<GridwiseGemm,
 InDataType,
@@ -712,7 +712,7 @@ struct DeviceConvNDFwdAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Output_N_Ho_Wo_K
 // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
 // result
-memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
 launch_cpu_kernel(kernel,
 gridwise_gemm,
......
@@ -785,7 +785,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu
 throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
 }
-memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
 const auto kernel =
 ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
@@ -825,7 +825,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Outpu
 // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
 // result
-memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
 launch_cpu_kernel(kernel,
 gridwise_gemm,
......
@@ -762,7 +762,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou
 throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
 }
-memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
 const auto kernel =
 ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
@@ -802,7 +802,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_K_Y_X_C_K8_Ou
 // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
 // result
-memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
 launch_cpu_kernel(kernel,
 gridwise_gemm,
......
@@ -758,7 +758,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu
 throw std::runtime_error("wrong! GridwiseGemmAvx2_MxN has invalid setting");
 }
-memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
 const auto kernel =
 ck::cpu::kernel_gemm_bias_activation_add_avx_mxn<GridwiseGemm,
@@ -798,7 +798,7 @@ struct DeviceConvNDFwdBiasActivationAddAvx2_Input_N_Hi_Wi_C_Weight_Y_X_C_K_Outpu
 // TODO: this is for benchmark purpose, so last time we clear c buffer and calculate the
 // result
-memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
+// memset(arg.p_c_grid_, 0, arg.c_grid_desc_.GetElementSpaceSize());
 launch_cpu_kernel(kernel,
 gridwise_gemm,
......
@@ -18,10 +18,13 @@ namespace cpu {
 namespace avx2_util {
 template <typename ElementwiseOp>
-void memcpy32_avx2(void* dst, const void* src, const ck::index_t n, const ElementwiseOp& element_op)
+void memcpy32_avx2(void* dst,
+const void* src,
+const ck::long_index_t n,
+const ElementwiseOp& element_op)
 {
 // 16-8-4-2-1 pattern
-ck::index_t i_n = n;
+ck::long_index_t i_n = n;
 float* p_dst = reinterpret_cast<float*>(dst);
 const float* p_src = reinterpret_cast<const float*>(src);
 while(i_n >= 16)
@@ -67,11 +70,11 @@ void memcpy32_avx2_with_extra_2src(void* dst,
 const void* src,
 const void* src1,
 const void* src2,
-const ck::index_t n,
+const ck::long_index_t n,
 const ElementwiseOp& element_op)
 {
 // 16-8-4-2-1 pattern
-ck::index_t i_n = n;
+ck::long_index_t i_n = n;
 float* p_dst = reinterpret_cast<float*>(dst);
 const float* p_src = reinterpret_cast<const float*>(src);
 const float* p_src1 = reinterpret_cast<const float*>(src1);
@@ -146,11 +149,11 @@ void memcpy32_avx2_with_extra_2src(void* dst,
 const void* src,
 float v_src1,
 const void* src2,
-const ck::index_t n,
+const ck::long_index_t n,
 const ElementwiseOp& element_op)
 {
 // 16-8-4-2-1 pattern
-ck::index_t i_n = n;
+ck::long_index_t i_n = n;
 float* p_dst = reinterpret_cast<float*>(dst);
 const float* p_src = reinterpret_cast<const float*>(src);
 const float* p_src2 = reinterpret_cast<const float*>(src2);
@@ -214,11 +217,11 @@ template <typename ElementwiseOp>
 void memcpy32_avx2_with_extra_1src(void* dst,
 const void* src,
 const void* src_aux,
-const ck::index_t n,
+const ck::long_index_t n,
 const ElementwiseOp& element_op)
 {
 // 16-8-4-2-1 pattern
-ck::index_t i_n = n;
+ck::long_index_t i_n = n;
 float* p_dst = reinterpret_cast<float*>(dst);
 const float* p_src = reinterpret_cast<const float*>(src);
 const float* p_src_aux = reinterpret_cast<const float*>(src_aux);
@@ -277,11 +280,11 @@ template <typename ElementwiseOp>
 void memcpy32_avx2_with_extra_1src(void* dst,
 const void* src,
 const float v_src_aux,
-const ck::index_t n,
+const ck::long_index_t n,
 const ElementwiseOp& element_op)
 {
 // 16-8-4-2-1 pattern
-ck::index_t i_n = n;
+ck::long_index_t i_n = n;
 float* p_dst = reinterpret_cast<float*>(dst);
 const float* p_src = reinterpret_cast<const float*>(src);
@@ -320,10 +323,10 @@ void memcpy32_avx2_with_extra_1src(void* dst,
 }
 }
-inline void memset32_avx2(void* dst, const int32_t value, const ck::index_t n)
+inline void memset32_avx2(void* dst, const int32_t value, const ck::long_index_t n)
 {
 // 16-8-4-2-1 pattern
-ck::index_t i_n = n;
+ck::long_index_t i_n = n;
 float* p_dst = reinterpret_cast<float*>(dst);
 __m256 ymm = _mm256_set1_ps(*reinterpret_cast<const float*>(&value));
 __m128 xmm = _mm_set1_ps(*reinterpret_cast<const float*>(&value));
@@ -361,9 +364,9 @@ inline void memset32_avx2(void* dst, const int32_t value, const ck::index_t n)
 template <typename ElementwiseOp>
 void transpose8x8_avx2(void* dst,
-ck::index_t stride_dst,
+ck::long_index_t stride_dst,
 const void* src,
-ck::index_t stride_src,
+ck::long_index_t stride_src,
 const ElementwiseOp& element_op)
 {
 // TODO: use vinsertf128 for better port usage. vpermf128 is slow
@@ -560,8 +563,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
 void SetSrcSliceOrigin(const SrcDesc&, const Index& src_slice_origin_idx)
 {
-ck::index_t idx_m = src_slice_origin_idx[Number<0>{}];
-ck::index_t idx_k = src_slice_origin_idx[Number<1>{}];
+intptr_t idx_m = src_slice_origin_idx[Number<0>{}];
+intptr_t idx_k = src_slice_origin_idx[Number<1>{}];
 if constexpr(ConvForwardSpecialization ==
 ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
@@ -640,19 +643,19 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
 }
 else
 {
-const ck::index_t m_per_block = slice_length[Number<0>{}];
-const ck::index_t k_per_block = slice_length[Number<1>{}];
+const intptr_t m_per_block = slice_length[Number<0>{}];
+const intptr_t k_per_block = slice_length[Number<1>{}];
 const float* p_src = reinterpret_cast<const float*>(src_buf.p_data_) + src_offset;
 float* p_dst = reinterpret_cast<float*>(dst_buf.p_data_);
-// printf("src offset:%d, k_per_block:%d, m_per_block:%d\n", src_offset, k_per_block,
-// m_per_block);
+// printf("src offset:%llu, k_per_block:%d, m_per_block:%d\n", src_offset, k_per_block,
+// m_per_block); fflush(stdout);
 if constexpr(ConvForwardSpecialization ==
 ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
 {
-ck::index_t i_m_itr = m_per_block;
+intptr_t i_m_itr = m_per_block;
 // standard 8-4-2-1 pattern
 while(i_m_itr >= 8)
 {
@@ -712,9 +715,9 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
 else if constexpr(ConvForwardSpecialization ==
 ConvolutionForwardSpecialization_t::Filter1x1Pad0)
 {
-ck::index_t i_m_itr = m_per_block;
-ck::index_t i_wo_itr = i_wo;
-ck::index_t i_ho_itr = i_ho;
+intptr_t i_m_itr = m_per_block;
+intptr_t i_wo_itr = i_wo;
+intptr_t i_ho_itr = i_ho;
 while(i_m_itr > 0)
 {
 avx2_util::memcpy32_avx2(p_dst, p_src, k_per_block, element_op_);
@@ -743,11 +746,11 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
 if(gemm_k_spec_ == ConvolutionForwardGemmKSpecialization_t::NHWC_GemmKLoopOverC)
 {
 // c % k_per_block == 0, so every time k_per_block here is the same
-ck::index_t i_m_itr = m_per_block;
-ck::index_t i_wo_itr = i_wo;
-ck::index_t i_ho_itr = i_ho;
-ck::index_t i_wi_itr = i_wi;
-ck::index_t i_hi_itr = i_hi;
+intptr_t i_m_itr = m_per_block;
+intptr_t i_wo_itr = i_wo;
+intptr_t i_ho_itr = i_ho;
+intptr_t i_wi_itr = i_wi;
+intptr_t i_hi_itr = i_hi;
 while(i_m_itr > 0)
 {
@@ -785,11 +788,11 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
 }
 else
 {
-ck::index_t i_m_itr = m_per_block;
-ck::index_t i_wo_itr = i_wo;
-ck::index_t i_ho_itr = i_ho;
-ck::index_t i_wi_itr = i_wi;
-ck::index_t i_hi_itr = i_hi;
+intptr_t i_m_itr = m_per_block;
+intptr_t i_wo_itr = i_wo;
+intptr_t i_ho_itr = i_ho;
+intptr_t i_wi_itr = i_wi;
+intptr_t i_hi_itr = i_hi;
 // ihi = iho * s_stride_h + iy * s_dilation_h - s_pad_h
 // iwi = iwo * s_stride_w + ix * s_dilation_w - s_pad_w
 while(i_m_itr > 0)
@@ -797,16 +800,16 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
 /*** go along Gemm K ***/
 const float* p_src_k = p_src;
 float* p_dst_k = p_dst;
-ck::index_t i_wi_itr_k = i_wi_itr;
-ck::index_t i_hi_itr_k = i_hi_itr;
-ck::index_t i_c_itr_k = i_c;
-// ck::index_t i_y_itr_k = i_y;
-ck::index_t i_x_itr_k = i_x;
-ck::index_t i_k_itr = k_per_block;
+intptr_t i_wi_itr_k = i_wi_itr;
+intptr_t i_hi_itr_k = i_hi_itr;
+intptr_t i_c_itr_k = i_c;
+// intptr_t i_y_itr_k = i_y;
+intptr_t i_x_itr_k = i_x;
+intptr_t i_k_itr = k_per_block;
 while(i_k_itr > 0)
 {
-ck::index_t current_k_block_along_c =
+intptr_t current_k_block_along_c =
 ck::math::min(C - i_c_itr_k, i_k_itr);
 // printf("current_k_block_along_c:%d, i_c_itr_k:%d, k_per_block:%d\n",
@@ -875,7 +878,7 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
 void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& src_slice_origin_step_idx)
 {
-ck::index_t move_k = src_slice_origin_step_idx[Number<1>{}];
+intptr_t move_k = src_slice_origin_step_idx[Number<1>{}];
 if constexpr(ConvForwardSpecialization ==
 ConvolutionForwardSpecialization_t::Filter1x1Stride1Pad0)
 {
@@ -937,35 +940,35 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_In_NHWC
 const ElementwiseOperation element_op_;
 const ConvolutionForwardGemmKSpecialization_t gemm_k_spec_;
-ck::index_t i_n;
-ck::index_t i_c;
-ck::index_t i_hi;
-ck::index_t i_wi;
-ck::index_t i_ho;
-ck::index_t i_wo;
-ck::index_t i_y;
-ck::index_t i_x;
-ck::index_t i_gemm_k;
-ck::index_t N;
-// ck::index_t K;
-ck::index_t C;
-ck::index_t Hi;
-ck::index_t Wi;
-ck::index_t Ho;
-ck::index_t Wo;
-ck::index_t Sy;
-ck::index_t Sx;
-ck::index_t Dy;
-ck::index_t Dx;
-ck::index_t Py;
-ck::index_t Px;
-ck::index_t Fy;
-ck::index_t Fx;
+intptr_t i_n;
+intptr_t i_c;
+intptr_t i_hi;
+intptr_t i_wi;
+intptr_t i_ho;
+intptr_t i_wo;
+intptr_t i_y;
+intptr_t i_x;
+intptr_t i_gemm_k;
+intptr_t N;
+// intptr_t K;
+intptr_t C;
+intptr_t Hi;
+intptr_t Wi;
+intptr_t Ho;
+intptr_t Wo;
+intptr_t Sy;
+intptr_t Sx;
+intptr_t Dy;
+intptr_t Dx;
+intptr_t Py;
+intptr_t Px;
+intptr_t Fy;
+intptr_t Fx;
 intptr_t input_offset_acc_wi;
 intptr_t input_offset_ovf_wi_acc_hi;
@@ -1008,9 +1011,9 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXC
 void SetSrcSliceOrigin(const SrcDesc&, const Index& src_slice_origin_idx)
 {
-ck::index_t idx_n0 = src_slice_origin_idx[Number<0>{}];
-ck::index_t idx_k = src_slice_origin_idx[Number<1>{}];
-ck::index_t idx_n1 = src_slice_origin_idx[Number<2>{}];
+intptr_t idx_n0 = src_slice_origin_idx[Number<0>{}];
+intptr_t idx_k = src_slice_origin_idx[Number<1>{}];
+intptr_t idx_n1 = src_slice_origin_idx[Number<2>{}];
 i_gemm_n = idx_n0 * GemmN1 + idx_n1;
 // i_gemm_k = idx_k;
@@ -1037,8 +1040,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXC
 }
 else
 {
-const ck::index_t n_per_block = slice_length[Number<0>{}] * slice_length[Number<2>{}];
-const ck::index_t k_per_block = slice_length[Number<1>{}];
+const intptr_t n_per_block = slice_length[Number<0>{}] * slice_length[Number<2>{}];
+const intptr_t k_per_block = slice_length[Number<1>{}];
 // printf(" >>>> %d, %d, %d -> %d(%dx%d), %d\n", GemmN, GemmK, GemmN1, n_per_block,
 // dst_desc.GetTransforms()[Number<0>{}]
@@ -1053,8 +1056,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXC
 // n * k -> n0 * k * n1, n1 = 8, n0 = n/8
 for(index_t i_n_itr = 0; i_n_itr < n_per_block; i_n_itr += 8)
 {
-ck::index_t current_n_8 = ck::math::min(GemmN - (i_n_itr + i_gemm_n), 8);
-ck::index_t i_k_itr = k_per_block;
+intptr_t current_n_8 = ck::math::min(GemmN - (i_n_itr + i_gemm_n), (intptr_t)8);
+intptr_t i_k_itr = k_per_block;
 if(current_n_8 == 8)
 {
 const float* p_src_k = p_src;
@@ -1151,7 +1154,7 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXC
 {
 for(index_t i_sub_k = 0; i_sub_k < k_per_block; i_sub_k++)
 {
-ck::index_t i_current_n_itr = i_n_itr + i_sub_n + i_gemm_n;
+intptr_t i_current_n_itr = i_n_itr + i_sub_n + i_gemm_n;
 float v = i_current_n_itr < GemmN
 ? element_op_.Apply(p_src_k[i_sub_n * GemmK + i_sub_k])
@@ -1171,8 +1174,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXC
 // src_slice_origin_step_idx need to be known at compile-time, for performance reason
 void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& src_slice_origin_step_idx)
 {
-ck::index_t move_k = src_slice_origin_step_idx[Number<1>{}];
-ck::index_t move_n0 = src_slice_origin_step_idx[Number<0>{}];
+intptr_t move_k = src_slice_origin_step_idx[Number<1>{}];
+intptr_t move_n0 = src_slice_origin_step_idx[Number<0>{}];
 // i_gemm_k += move_k;
@@ -1187,13 +1190,13 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXC
 private:
 const ElementwiseOperation element_op_;
-ck::index_t i_gemm_n;
-// ck::index_t i_gemm_k;
-// ck::index_t GemmN0;
-ck::index_t GemmN1;
-ck::index_t GemmN;
-ck::index_t GemmK;
+intptr_t i_gemm_n;
+// intptr_t i_gemm_k;
+// intptr_t GemmN0;
+intptr_t GemmN1;
+intptr_t GemmN;
+intptr_t GemmK;
 intptr_t src_offset;
 };
@@ -1226,9 +1229,9 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXCK8
 void SetSrcSliceOrigin(const SrcDesc&, const Index& src_slice_origin_idx)
 {
-ck::index_t idx_n0 = src_slice_origin_idx[Number<0>{}];
-ck::index_t idx_k = src_slice_origin_idx[Number<1>{}];
-ck::index_t idx_n1 = src_slice_origin_idx[Number<2>{}];
+intptr_t idx_n0 = src_slice_origin_idx[Number<0>{}];
+intptr_t idx_k = src_slice_origin_idx[Number<1>{}];
+intptr_t idx_n1 = src_slice_origin_idx[Number<2>{}];
 src_offset = idx_n0 * GemmK * GemmN1 + idx_k * GemmN1 + idx_n1;
@@ -1251,10 +1254,9 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXCK8
 }
 else
 {
-const ck::index_t n0_per_block = slice_length[Number<0>{}];
-const ck::index_t k_n1_per_block =
-slice_length[Number<1>{}] * slice_length[Number<2>{}];
-const ck::index_t SrcStride_K_N1 = GemmK * slice_length[Number<2>{}];
+const intptr_t n0_per_block = slice_length[Number<0>{}];
+const intptr_t k_n1_per_block = slice_length[Number<1>{}] * slice_length[Number<2>{}];
+const intptr_t SrcStride_K_N1 = GemmK * slice_length[Number<2>{}];
 // printf(" >>>> %d, %d, %d -> %d(%dx%d), %d\n", GemmN, GemmK, GemmN1, n_per_block,
 // dst_desc.GetTransforms()[Number<0>{}]
@@ -1356,9 +1358,9 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXCK8
 // src_slice_origin_step_idx need to be known at compile-time, for performance reason
 void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& src_slice_origin_step_idx)
 {
-ck::index_t move_n0 = src_slice_origin_step_idx[Number<0>{}];
-ck::index_t move_k = src_slice_origin_step_idx[Number<1>{}];
-ck::index_t move_n1 = src_slice_origin_step_idx[Number<2>{}];
+intptr_t move_n0 = src_slice_origin_step_idx[Number<0>{}];
+intptr_t move_k = src_slice_origin_step_idx[Number<1>{}];
+intptr_t move_n1 = src_slice_origin_step_idx[Number<2>{}];
 // i_gemm_k += move_k;
@@ -1373,13 +1375,13 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_KYXCK8
 private:
 const ElementwiseOperation element_op_;
-ck::index_t i_gemm_n;
-// ck::index_t i_gemm_k;
-// ck::index_t GemmN0;
-ck::index_t GemmN1;
-ck::index_t GemmN;
-ck::index_t GemmK;
+intptr_t i_gemm_n;
+// intptr_t i_gemm_k;
+// intptr_t GemmN0;
+intptr_t GemmN1;
+intptr_t GemmN;
+intptr_t GemmK;
 intptr_t src_offset;
 };
@@ -1410,8 +1412,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_YXCK
 void SetSrcSliceOrigin(const SrcDesc&, const Index& src_slice_origin_idx)
 {
-ck::index_t idx_k = src_slice_origin_idx[Number<0>{}];
-ck::index_t idx_n = src_slice_origin_idx[Number<1>{}];
+intptr_t idx_k = src_slice_origin_idx[Number<0>{}];
+intptr_t idx_n = src_slice_origin_idx[Number<1>{}];
 src_offset = idx_k * GemmN + idx_n;
 }
@@ -1431,8 +1433,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_YXCK
 }
 else
 {
-const ck::index_t k_per_block = slice_length[Number<0>{}];
-const ck::index_t n_per_block = slice_length[Number<1>{}];
+const intptr_t k_per_block = slice_length[Number<0>{}];
+const intptr_t n_per_block = slice_length[Number<1>{}];
 const float* p_src = reinterpret_cast<const float*>(src_buf.p_data_) + src_offset;
 float* p_dst = reinterpret_cast<float*>(dst_buf.p_data_);
@@ -1497,8 +1499,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_YXCK
 // src_slice_origin_step_idx need to be known at compile-time, for performance reason
 void MoveSrcSliceWindow(const SrcDesc& src_desc, const Index& src_slice_origin_step_idx)
 {
-ck::index_t move_k = src_slice_origin_step_idx[Number<0>{}];
-ck::index_t move_n = src_slice_origin_step_idx[Number<1>{}];
+intptr_t move_k = src_slice_origin_step_idx[Number<0>{}];
+intptr_t move_n = src_slice_origin_step_idx[Number<1>{}];
 src_offset += move_k * GemmN + move_n;
 }
@@ -1509,8 +1511,8 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_ConvFwd_Wei_YXCK
 private:
 const ElementwiseOperation element_op_;
-ck::index_t GemmN;
-ck::index_t GemmK;
+intptr_t GemmN;
+intptr_t GemmK;
 intptr_t src_offset;
 };
@@ -1587,14 +1589,14 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_MxN
 if constexpr(!std::is_same<ElementwiseOperation,
 ck::tensor_operation::cpu::element_wise::PassThrough>::value)
 {
-const ck::index_t m_per_block = slice_length[Number<0>{}];
-const ck::index_t n_per_block = slice_length[Number<1>{}];
-const ck::index_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
+const intptr_t m_per_block = slice_length[Number<0>{}];
+const intptr_t n_per_block = slice_length[Number<1>{}];
+const intptr_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
 float* p_dst = reinterpret_cast<float*>(dst_buf.p_data_) + dst_offset;
-ck::index_t i_m_itr = m_per_block;
+intptr_t i_m_itr = m_per_block;
 // printf("xxxx %d, current_n:%d, DstGemmN:%d, n_per_block:%d,
 // dst_offset:%d\n",__LINE__, current_n,
@@ -1657,15 +1659,15 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_MxN
 }
 else
 {
-const ck::index_t m_per_block = slice_length[Number<0>{}];
-const ck::index_t n_per_block = slice_length[Number<1>{}];
-const ck::index_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
+const intptr_t m_per_block = slice_length[Number<0>{}];
+const intptr_t n_per_block = slice_length[Number<1>{}];
+const intptr_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
 const float* p_src = reinterpret_cast<float*>(src_buf.p_data_) + src_offset;
 float* p_dst = reinterpret_cast<float*>(dst_buf.p_data_) + dst_offset;
-ck::index_t i_m_itr = m_per_block;
+intptr_t i_m_itr = m_per_block;
 // printf("xxxx %d, current_n:%d, DstGemmN:%d, n_per_block:%d\n",__LINE__, current_n,
 // DstGemmN, n_per_block);fflush(stdout);
@@ -1740,11 +1742,11 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_MxN
 private:
 const ElementwiseOperation element_op_;
-ck::index_t i_dst_gemm_m;
-ck::index_t i_dst_gemm_n;
-ck::index_t DstGemmM;
-ck::index_t DstGemmN;
+intptr_t i_dst_gemm_m;
+intptr_t i_dst_gemm_n;
+intptr_t DstGemmM;
+intptr_t DstGemmN;
 intptr_t src_offset;
 intptr_t dst_offset;
@@ -1868,10 +1870,10 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_
 if constexpr(!std::is_same<ElementwiseOperation,
 ck::tensor_operation::cpu::element_wise::PassThrough>::value)
 {
-const ck::index_t m_per_block = slice_length[Number<0>{}];
-const ck::index_t n_per_block = slice_length[Number<1>{}];
-const ck::index_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
+const intptr_t m_per_block = slice_length[Number<0>{}];
+const intptr_t n_per_block = slice_length[Number<1>{}];
+const intptr_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
 float* p_dst = reinterpret_cast<float*>(dst_buf.p_data_) + dst_offset;
 const float* p_src1 =
@@ -1879,7 +1881,7 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_
 const float* p_src2 =
 reinterpret_cast<const float*>(src2_buf.p_data_) + src2_offset;
-ck::index_t i_m_itr = m_per_block;
+intptr_t i_m_itr = m_per_block;
 // printf("xxxx %d, current_n:%d, DstGemmN:%d, n_per_block:%d,
 // dst_offset:%d\n",__LINE__, current_n,
@@ -2129,17 +2131,17 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_
 }
 else
 {
-const ck::index_t m_per_block = slice_length[Number<0>{}];
-const ck::index_t n_per_block = slice_length[Number<1>{}];
-const ck::index_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
+const intptr_t m_per_block = slice_length[Number<0>{}];
+const intptr_t n_per_block = slice_length[Number<1>{}];
+const intptr_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
 const float* p_src = reinterpret_cast<const float*>(src_buf.p_data_) + src_offset;
 float* p_dst = reinterpret_cast<float*>(dst_buf.p_data_) + dst_offset;
 const float* p_src1 = reinterpret_cast<const float*>(src1_buf.p_data_) + src1_offset;
 const float* p_src2 = reinterpret_cast<const float*>(src2_buf.p_data_) + src2_offset;
-ck::index_t i_m_itr = m_per_block;
+intptr_t i_m_itr = m_per_block;
 // printf("xxxx %d, current_n:%d, DstGemmN:%d, n_per_block:%d\n",__LINE__, current_n,
 // DstGemmN, n_per_block);fflush(stdout);
@@ -2404,11 +2406,11 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_Residual_
 private:
 const ElementwiseOperation element_op_;
-ck::index_t i_dst_gemm_m;
-ck::index_t i_dst_gemm_n;
-ck::index_t DstGemmM;
-ck::index_t DstGemmN;
+intptr_t i_dst_gemm_m;
+intptr_t i_dst_gemm_n;
+intptr_t DstGemmM;
+intptr_t DstGemmN;
 intptr_t src_offset;
 intptr_t src1_offset;
@@ -2526,16 +2528,16 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_MxN
 if constexpr(!std::is_same<ElementwiseOperation,
 ck::tensor_operation::cpu::element_wise::PassThrough>::value)
 {
-const ck::index_t m_per_block = slice_length[Number<0>{}];
-const ck::index_t n_per_block = slice_length[Number<1>{}];
-const ck::index_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
+const intptr_t m_per_block = slice_length[Number<0>{}];
+const intptr_t n_per_block = slice_length[Number<1>{}];
+const intptr_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
 float* p_dst = reinterpret_cast<float*>(dst_buf.p_data_) + dst_offset;
 const float* p_src1 =
 reinterpret_cast<const float*>(src1_buf.p_data_) + src1_offset;
-ck::index_t i_m_itr = m_per_block;
+intptr_t i_m_itr = m_per_block;
 // standard 8-4-2-1 pattern
 if constexpr(Src1AlongDim0)
@@ -2745,16 +2747,16 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_MxN
 }
 else
 {
-const ck::index_t m_per_block = slice_length[Number<0>{}];
-const ck::index_t n_per_block = slice_length[Number<1>{}];
-const ck::index_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
+const intptr_t m_per_block = slice_length[Number<0>{}];
+const intptr_t n_per_block = slice_length[Number<1>{}];
+const intptr_t current_n = ck::math::min(DstGemmN - i_dst_gemm_n, n_per_block);
 const float* p_src = reinterpret_cast<const float*>(src_buf.p_data_) + src_offset;
 float* p_dst = reinterpret_cast<float*>(dst_buf.p_data_) + dst_offset;
 const float* p_src1 = reinterpret_cast<const float*>(src1_buf.p_data_) + src1_offset;
-ck::index_t i_m_itr = m_per_block;
+intptr_t i_m_itr = m_per_block;
 // printf("xxxx %d, current_n:%d, DstGemmN:%d, n_per_block:%d\n",__LINE__, current_n,
 // DstGemmN, n_per_block);fflush(stdout);
@@ -2981,11 +2983,11 @@ struct ThreadwiseTensorSliceTransferAvx2Specialization_MatC_Store_Bias_MxN
 private:
 const ElementwiseOperation element_op_;
-ck::index_t i_dst_gemm_m;
-ck::index_t i_dst_gemm_n;
-ck::index_t DstGemmM;
-ck::index_t DstGemmN;
+intptr_t i_dst_gemm_m;
+intptr_t i_dst_gemm_n;
+intptr_t DstGemmM;
+intptr_t DstGemmN;
 intptr_t src_offset;
 intptr_t src1_offset;
......
 #include <chrono>
 #include <assert.h>
 #include <string.h>
+#include <stdlib.h>
 #include "device.hpp"
 #ifndef CK_NOGPU
@@ -85,15 +86,10 @@ DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t align
 {
 assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2
-void* p1;
-void** p2;
-int offset = alignment - 1 + sizeof(void*);
-p1 = malloc(mem_size + offset);
-assert(p1 != nullptr);
-p2 = reinterpret_cast<void**>((reinterpret_cast<size_t>(p1) + offset) & ~(alignment - 1));
-p2[-1] = p1;
-mpDeviceBuf = reinterpret_cast<void*>(p2);
+// TODO: posix only
+int rtn = posix_memalign(&mpDeviceBuf, alignment, mem_size);
+assert(rtn == 0);
 }
 }
@@ -110,7 +106,7 @@ void DeviceAlignedMemCPU::SetZero() { memset(mpDeviceBuf, 0, mMemSize); }
 DeviceAlignedMemCPU::~DeviceAlignedMemCPU()
 {
 if(mpDeviceBuf != nullptr)
-free((reinterpret_cast<void**>(mpDeviceBuf))[-1]);
+free(mpDeviceBuf);
 }
 struct WallTimerImpl
......
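The allocator change in device.cpp is easiest to read side by side. A hedged sketch of the two schemes, with illustrative names (not the repository's API): the old path over-allocates with `malloc`, rounds the pointer up, and stashes the original pointer just before the aligned block, which is why the old destructor freed `p[-1]`; the new path delegates to `posix_memalign`, whose result can be released with a plain `free`.

```cpp
// Sketch only, assuming a POSIX host (hence the "TODO: posix only" note in the diff).
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <stdlib.h> // posix_memalign

// Old scheme: manual over-allocate-and-align; the matching free must use the stashed pointer.
void* aligned_alloc_manual(std::size_t size, std::size_t alignment)
{
    std::size_t offset = alignment - 1 + sizeof(void*);
    void* p1 = std::malloc(size + offset);
    assert(p1 != nullptr);
    void** p2 = reinterpret_cast<void**>(
        (reinterpret_cast<std::uintptr_t>(p1) + offset) & ~(alignment - 1));
    p2[-1] = p1; // remember the block malloc returned
    return p2;
}
void aligned_free_manual(void* p) { std::free(reinterpret_cast<void**>(p)[-1]); }

// New scheme: the C library aligns for us; free(p) is the matching release.
void* aligned_alloc_posix(std::size_t size, std::size_t alignment)
{
    void* p = nullptr;
    int rtn = posix_memalign(&p, alignment, size);
    assert(rtn == 0);
    return p;
}
```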