Commit eb57178d authored by Astha Rai's avatar Astha Rai
Browse files

working version: fixed error in stride for A, still a bit inefficient

parent 17b72bd6
...@@ -25,11 +25,11 @@ using DeviceElementwisePermuteInstance = ...@@ -25,11 +25,11 @@ using DeviceElementwisePermuteInstance =
2, // NumDim_m, {N, C} 2, // NumDim_m, {N, C}
2, // NumDim_n, {H, W} 2, // NumDim_n, {H, W}
1, // NumDim_k, {D} 1, // NumDim_k, {D}
1, 8,
1, 8,
1, 8,
ck::Sequence<1>, ck::Sequence<8>,
ck::Sequence<1>>; ck::Sequence<8>>;
template <typename HostTensorA, typename HostTensorB, typename Functor> template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor) void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
...@@ -50,21 +50,22 @@ int main() ...@@ -50,21 +50,22 @@ int main()
bool do_verification = true; bool do_verification = true;
bool time_kernel = true; bool time_kernel = true;
const int N = 1; const int N = 4;
const int C = 2; const int C = 16;
const int H = 3; const int H = 32;
const int W = 4; const int W = 5;
const int D = 16; const int D = 16;
//
std::vector<std::size_t> ncdhw = {N, C, D, H, W}; std::vector<std::size_t> ncdhw = {N, C, D, H, W};
std::vector<std::size_t> nchwd = {N, C, H, W, D}; std::vector<std::size_t> nchwd = {N, C, H, W, D};
Tensor<ADataType> a(ncdhw); Tensor<ADataType> a(ncdhw);
Tensor<BDataType> b(nchwd); Tensor<BDataType> b(nchwd);
//a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}); a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
for(std::size_t i = 0; i < a.mData.size(); i++){ //for(std::size_t i = 0; i < a.mData.size(); i++){
a.mData[i] = i; // a.mData[i] = i;
} // }
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
...@@ -75,7 +76,7 @@ int main() ...@@ -75,7 +76,7 @@ int main()
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()}; std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D}; std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
std::array<ck::index_t, 5> a_strides = {C * D * H * W, D * H * W, H, 1, H * W}; // N, C, D, H, W std::array<ck::index_t, 5> a_strides = {C * D * H * W, D * H * W, W, 1, H * W}; // N, C, D, H, W
std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, C, H, W, D std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, C, H, W, D
auto broadcastPermute = DeviceElementwisePermuteInstance{}; auto broadcastPermute = DeviceElementwisePermuteInstance{};
...@@ -104,10 +105,6 @@ int main() ...@@ -104,10 +105,6 @@ int main()
float gb_per_sec = num_btype / 1.E6 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time;
// LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
// std::cout << "A: " << a.mData.data() << std::endl;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl; << std::endl;
...@@ -117,11 +114,11 @@ int main() ...@@ -117,11 +114,11 @@ int main()
{ {
b_device_buf.FromDevice(b.mData.data()); b_device_buf.FromDevice(b.mData.data());
// LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl; //LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl; //LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
Tensor<BDataType> host_b(nchwd); Tensor<BDataType> host_b(nchwd);
host_elementwise4D(host_b, a, PassThrough{}); host_elementwise4D(host_b, a, PassThrough{});
LogRangeAsType<float>(std::cout << "Host B : ", host_b.mData, ",") << std::endl; //LogRangeAsType<float>(std::cout << "Host B : ", host_b.mData, ",") << std::endl;
pass &= pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
......
...@@ -88,9 +88,18 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple, ...@@ -88,9 +88,18 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple,
const auto m = desc_mnk.GetLength(I0); const auto m = desc_mnk.GetLength(I0);
const auto n = desc_mnk.GetLength(I1); const auto n = desc_mnk.GetLength(I1);
const auto k = desc_mnk.GetLength(I2); const auto k = desc_mnk.GetLength(I2);
// std::cout << "m: " << m << std::endl;
// std::cout << "n: " << n << std::endl;
// std::cout << "k: " << k << std::endl;
//std::cout << "m: " << num_threads_m << std::endl;
//std::cout << "n: " << num_threads_n << std::endl;
//std::cout << "k: " << num_threads_k << std::endl;
const index_t loop_step_m = num_threads_m * MPerThread; const index_t loop_step_m = num_threads_m * MPerThread;
const index_t loop_step_n = num_threads_n * NPerThread; const index_t loop_step_n = num_threads_n * NPerThread;
const index_t loop_step_k = num_threads_k * KPerThread; const index_t loop_step_k = num_threads_k * KPerThread;
//std::cout << "loop_step_m: " << loop_step_m << std::endl;
//std::cout << "loop_step_n: " << loop_step_n << std::endl;
//std::cout << "loop_step_k: " << loop_step_k << std::endl;
const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m; const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m;
const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n; const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n;
const auto pad_k = math::integer_least_multiple(k, loop_step_k) - k; const auto pad_k = math::integer_least_multiple(k, loop_step_k) - k;
...@@ -302,22 +311,22 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple, ...@@ -302,22 +311,22 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple,
const std::array<index_t, NumDim>& strides, const std::array<index_t, NumDim>& strides,
index_t scalarPerVector, index_t scalarPerVector,
index_t vectorDim) { index_t vectorDim) {
ignore = lengths; //ignore = lengths;
ignore = strides; //ignore = strides;
ignore = scalarPerVector; //ignore = scalarPerVector;
ignore = vectorDim; //ignore = vectorDim;
// if(strides[vectorDim] == 1 && if(strides[vectorDim] == 1 &&
//(lengths[vectorDim] % scalarPerVector == 0)) (lengths[vectorDim] % scalarPerVector == 0 ||
////lengths[vectorDim] % scalarPerVector == lengths[vectorDim])) lengths[vectorDim] % scalarPerVector == lengths[vectorDim]))
//{ {
// return true; return true;
//} }
// if(strides[vectorDim] >= scalarPerVector) if(strides[vectorDim] >= scalarPerVector)
//{ {
// return true; return true;
//} }
return true; return false;
}; };
bool valid = true; bool valid = true;
......
...@@ -230,13 +230,13 @@ struct GridwiseElementwise_3D ...@@ -230,13 +230,13 @@ struct GridwiseElementwise_3D
static_for<0, NumInput, 1>{}([&](auto I) { static_for<0, NumInput, 1>{}([&](auto I) {
in_global_load_tuple(I).MoveSrcSliceWindow( in_global_load_tuple(I).MoveSrcSliceWindow(
in_grid_3d_desc_tuple[I], in_grid_3d_desc_tuple[I],
make_multi_index(0, loop_step_n, -(K / loop_step_k) * loop_step_k)); make_multi_index(0, loop_step_n /**-math::integer_divide_ceil(K, loop_step_k) * loop_step_k**/,-(K / loop_step_k) * loop_step_k));
}); });
static_for<0, NumOutput, 1>{}([&](auto I) { static_for<0, NumOutput, 1>{}([&](auto I) {
out_global_store_tuple(I).MoveDstSliceWindow( out_global_store_tuple(I).MoveDstSliceWindow(
out_grid_3d_desc_tuple[I], out_grid_3d_desc_tuple[I],
make_multi_index(0, loop_step_n, -(K / loop_step_k) * loop_step_k)); make_multi_index(0, loop_step_n /**-math::integer_divide_ceil(K, loop_step_k) * loop_step_k**/, -(K / loop_step_k) * loop_step_k));
}); });
} while(--num_iter_n); } while(--num_iter_n);
...@@ -245,16 +245,16 @@ struct GridwiseElementwise_3D ...@@ -245,16 +245,16 @@ struct GridwiseElementwise_3D
in_global_load_tuple(I).MoveSrcSliceWindow( in_global_load_tuple(I).MoveSrcSliceWindow(
in_grid_3d_desc_tuple[I], in_grid_3d_desc_tuple[I],
make_multi_index(loop_step_m, make_multi_index(loop_step_m,
-(N / loop_step_n) * loop_step_n, /**-math::integer_divide_ceil(N, loop_step_n) * loop_step_n**/-(N / loop_step_n) * loop_step_n,
-(K / loop_step_k) * loop_step_k)); /**-math::integer_divide_ceil(K, loop_step_k) * loop_step_k**/-(K / loop_step_k) * loop_step_k));
}); });
static_for<0, NumOutput, 1>{}([&](auto I) { static_for<0, NumOutput, 1>{}([&](auto I) {
out_global_store_tuple(I).MoveDstSliceWindow( out_global_store_tuple(I).MoveDstSliceWindow(
out_grid_3d_desc_tuple[I], out_grid_3d_desc_tuple[I],
make_multi_index(loop_step_m, make_multi_index(loop_step_m,
-(N / loop_step_n) * loop_step_n, /**-math::integer_divide_ceil(N, loop_step_n) * loop_step_n**/-(N / loop_step_n) * loop_step_n,
-(K / loop_step_k) * loop_step_k)); /**-math::integer_divide_ceil(K, loop_step_k) * loop_step_k**/-(K / loop_step_k) * loop_step_k));
}); });
} while(--num_iter_m); } while(--num_iter_m);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment