Commit eb57178d authored by Astha Rai

working version: fixed error in stride for A, still a bit inefficient

parent 17b72bd6
@@ -25,11 +25,11 @@ using DeviceElementwisePermuteInstance =
2, // NumDim_m, {N, C}
2, // NumDim_n, {H, W}
1, // NumDim_k, {D}
-1,
-1,
-1,
-ck::Sequence<1>,
-ck::Sequence<1>>;
+8,
+8,
+8,
+ck::Sequence<8>,
+ck::Sequence<8>>;
template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
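Note on the 1 → 8 change above: reading the three scalars and the two ck::Sequence parameters against the MPerThread/NPerThread/KPerThread and scalarPerVector names that appear later in this diff (an assumption; the visible hunks do not confirm the mapping), each thread now covers an 8×8×8 slice and issues 8-wide vector accesses instead of scalar ones. A back-of-envelope sketch of what that buys, in plain C++ rather than the ck API:

#include <cstddef>
#include <iostream>

// Rough illustration only: with a vector width of 8, the same number of
// elements is moved with one eighth of the memory transactions, provided the
// vector dimension's length/stride satisfy the check re-enabled further down.
int main()
{
    const std::size_t per_thread = 8 * 8 * 8;               // elements per thread per loop step (assumed mapping)
    const std::size_t total      = 4ull * 16 * 16 * 32 * 5; // N*C*D*H*W from the example below
    std::cout << "elements per thread per step : " << per_thread << '\n';
    std::cout << "scalar accesses (width 1)    : " << total << '\n';
    std::cout << "vector accesses (width 8)    : " << total / 8 << '\n';
}

The divisibility and stride conditions this relies on are exactly what the re-enabled IsScalarPerVector check in the device impl verifies.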
@@ -50,21 +50,22 @@ int main()
bool do_verification = true;
bool time_kernel = true;
-const int N = 1;
-const int C = 2;
-const int H = 3;
-const int W = 4;
+const int N = 4;
+const int C = 16;
+const int H = 32;
+const int W = 5;
const int D = 16;
//
std::vector<std::size_t> ncdhw = {N, C, D, H, W};
std::vector<std::size_t> nchwd = {N, C, H, W, D};
Tensor<ADataType> a(ncdhw);
Tensor<BDataType> b(nchwd);
-//a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
-for(std::size_t i = 0; i < a.mData.size(); i++){
-a.mData[i] = i;
-}
+a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+//for(std::size_t i = 0; i < a.mData.size(); i++){
+// a.mData[i] = i;
+// }
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
@@ -75,7 +76,7 @@ int main()
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
-std::array<ck::index_t, 5> a_strides = {C * D * H * W, D * H * W, H, 1, H * W}; // N, C, D, H, W
+std::array<ck::index_t, 5> a_strides = {C * D * H * W, D * H * W, W, 1, H * W}; // N, C, D, H, W
std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; // N, C, H, W, D
auto broadcastPermute = DeviceElementwisePermuteInstance{};
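The one-line stride fix above is the point of the commit: ab_lengths is given in the permuted {N, C, H, W, D} order while A is stored contiguously as NCDHW, so the stride paired with the H entry has to be W, not H (the trailing "// N, C, D, H, W" comment appears to describe A's storage order rather than the order of the array entries). A standalone check of the corrected strides, assuming row-major NCDHW storage for A; this is plain C++, not the ck API:

#include <array>
#include <cassert>
#include <cstddef>

int main()
{
    const std::size_t N = 4, C = 16, H = 32, W = 5, D = 16;

    // Offset of element (n, c, d, h, w) in contiguous NCDHW storage.
    auto ncdhw_offset = [&](std::size_t n, std::size_t c, std::size_t d,
                            std::size_t h, std::size_t w) {
        return (((n * C + c) * D + d) * H + h) * W + w;
    };

    // Strides listed in the ab_lengths order {N, C, H, W, D}, as in the fixed
    // example: the H entry is W (it was H before this commit).
    const std::array<std::size_t, 5> a_strides = {C * D * H * W, D * H * W, W, 1, H * W};

    for(std::size_t n = 0; n < N; ++n)
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t d = 0; d < D; ++d)
                for(std::size_t h = 0; h < H; ++h)
                    for(std::size_t w = 0; w < W; ++w)
                    {
                        const std::size_t via_strides = n * a_strides[0] + c * a_strides[1] +
                                                        h * a_strides[2] + w * a_strides[3] +
                                                        d * a_strides[4];
                        assert(via_strides == ncdhw_offset(n, c, d, h, w));
                    }
    return 0;
}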
@@ -104,10 +105,6 @@ int main()
float gb_per_sec = num_btype / 1.E6 / ave_time;
-// LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
-// LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
-// std::cout << "A: " << a.mData.data() << std::endl;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
@@ -117,11 +114,11 @@ int main()
{
b_device_buf.FromDevice(b.mData.data());
-// LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
-LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
+//LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
+//LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
Tensor<BDataType> host_b(nchwd);
host_elementwise4D(host_b, a, PassThrough{});
-LogRangeAsType<float>(std::cout << "Host B : ", host_b.mData, ",") << std::endl;
+//LogRangeAsType<float>(std::cout << "Host B : ", host_b.mData, ",") << std::endl;
pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
......
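The body of host_elementwise4D is not visible in this diff, only its signature; for orientation, a sketch of what such an NCDHW → NCHWD pass-through reference typically looks like, written here against plain std::vector rather than the example's Tensor class (the function name and interface below are illustrative, not the actual helper):

#include <cstddef>
#include <vector>

// Sketch of a host-side reference permute (NCDHW -> NCHWD), applying a
// PassThrough-style functor element by element.
template <typename T, typename Functor>
void host_permute_ncdhw_to_nchwd(std::vector<T>& b, const std::vector<T>& a,
                                 std::size_t N, std::size_t C, std::size_t D,
                                 std::size_t H, std::size_t W, Functor functor)
{
    for(std::size_t n = 0; n < N; ++n)
        for(std::size_t c = 0; c < C; ++c)
            for(std::size_t d = 0; d < D; ++d)
                for(std::size_t h = 0; h < H; ++h)
                    for(std::size_t w = 0; w < W; ++w)
                    {
                        const std::size_t src = (((n * C + c) * D + d) * H + h) * W + w; // NCDHW
                        const std::size_t dst = (((n * C + c) * H + h) * W + w) * D + d; // NCHWD
                        functor(b[dst], a[src]); // pass-through copies a[src] into b[dst]
                    }
}

int main()
{
    const std::size_t N = 4, C = 16, D = 16, H = 32, W = 5;
    std::vector<float> a(N * C * D * H * W, 1.0f), b(a.size(), 0.0f);
    host_permute_ncdhw_to_nchwd(b, a, N, C, D, H, W,
                                [](float& y, float x) { y = x; });
}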
@@ -88,9 +88,18 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple,
const auto m = desc_mnk.GetLength(I0);
const auto n = desc_mnk.GetLength(I1);
const auto k = desc_mnk.GetLength(I2);
// std::cout << "m: " << m << std::endl;
// std::cout << "n: " << n << std::endl;
// std::cout << "k: " << k << std::endl;
+//std::cout << "m: " << num_threads_m << std::endl;
+//std::cout << "n: " << num_threads_n << std::endl;
+//std::cout << "k: " << num_threads_k << std::endl;
const index_t loop_step_m = num_threads_m * MPerThread;
const index_t loop_step_n = num_threads_n * NPerThread;
const index_t loop_step_k = num_threads_k * KPerThread;
+//std::cout << "loop_step_m: " << loop_step_m << std::endl;
+//std::cout << "loop_step_n: " << loop_step_n << std::endl;
+//std::cout << "loop_step_k: " << loop_step_k << std::endl;
const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m;
const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n;
const auto pad_k = math::integer_least_multiple(k, loop_step_k) - k;
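The pad_m/pad_n/pad_k lines round each problem dimension up to a multiple of its loop step (threads × per-thread elements) and keep the difference. A standalone rendering of that arithmetic, assuming math::integer_least_multiple(x, y) returns the smallest multiple of y that is >= x (the m and num_threads_m values below are made up for illustration):

#include <iostream>

// Smallest multiple of y that is >= x.
int integer_least_multiple(int x, int y) { return ((x + y - 1) / y) * y; }

int main()
{
    const int m = 64, num_threads_m = 6, MPerThread = 8;      // hypothetical values
    const int loop_step_m = num_threads_m * MPerThread;                 // 48
    const int pad_m       = integer_least_multiple(m, loop_step_m) - m; // 96 - 64 = 32
    std::cout << "loop_step_m = " << loop_step_m << ", pad_m = " << pad_m << '\n';
}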
@@ -302,22 +311,22 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple,
const std::array<index_t, NumDim>& strides,
index_t scalarPerVector,
index_t vectorDim) {
-ignore = lengths;
-ignore = strides;
-ignore = scalarPerVector;
-ignore = vectorDim;
-// if(strides[vectorDim] == 1 &&
-//(lengths[vectorDim] % scalarPerVector == 0))
-////lengths[vectorDim] % scalarPerVector == lengths[vectorDim]))
-//{
-// return true;
-//}
-// if(strides[vectorDim] >= scalarPerVector)
-//{
-// return true;
-//}
+//ignore = lengths;
+//ignore = strides;
+//ignore = scalarPerVector;
+//ignore = vectorDim;
+if(strides[vectorDim] == 1 &&
+(lengths[vectorDim] % scalarPerVector == 0 ||
+lengths[vectorDim] % scalarPerVector == lengths[vectorDim]))
+{
+return true;
+}
+if(strides[vectorDim] >= scalarPerVector)
+{
+return true;
+}
return false;
};
bool valid = true;
......
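The re-enabled predicate accepts vectorized access along a dimension either when it is the contiguous one (stride 1) and its length is a multiple of scalarPerVector or shorter than it (which is what lengths[vectorDim] % scalarPerVector == lengths[vectorDim] amounts to), or when the dimension's stride is at least scalarPerVector. A standalone transliteration for trying the example's A/B descriptors by hand (not the ck member function itself):

#include <array>
#include <cstdint>
#include <iostream>

using index_t = std::int32_t;

template <std::size_t NumDim>
bool is_scalar_per_vector_valid(const std::array<index_t, NumDim>& lengths,
                                const std::array<index_t, NumDim>& strides,
                                index_t scalarPerVector,
                                index_t vectorDim)
{
    // Contiguous dimension: length is a multiple of the vector width, or
    // (length % spv == length) when the length is shorter than the width.
    if(strides[vectorDim] == 1 &&
       (lengths[vectorDim] % scalarPerVector == 0 ||
        lengths[vectorDim] % scalarPerVector == lengths[vectorDim]))
        return true;

    // Non-contiguous dimension: also accepted when its stride is at least
    // the vector width.
    if(strides[vectorDim] >= scalarPerVector)
        return true;

    return false;
}

int main()
{
    const index_t N = 4, C = 16, H = 32, W = 5, D = 16;
    std::array<index_t, 5> lengths{N, C, H, W, D};
    std::array<index_t, 5> a_strides{C * D * H * W, D * H * W, W, 1, H * W}; // NCDHW read in {N,C,H,W,D} order
    std::array<index_t, 5> b_strides{C * H * W * D, H * W * D, W * D, D, 1}; // contiguous NCHWD

    // Vectorizing along the last entry (D) with width 8:
    std::cout << "A ok: " << is_scalar_per_vector_valid(lengths, a_strides, 8, 4) << '\n';
    std::cout << "B ok: " << is_scalar_per_vector_valid(lengths, b_strides, 8, 4) << '\n';
}

With the example's shapes, B vectorizes its contiguous D dimension (16 % 8 == 0) and A passes through the stride branch (H * W >= 8), which matches the 8s chosen in the instance at the top of this diff.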
@@ -230,13 +230,13 @@ struct GridwiseElementwise_3D
static_for<0, NumInput, 1>{}([&](auto I) {
in_global_load_tuple(I).MoveSrcSliceWindow(
in_grid_3d_desc_tuple[I],
-make_multi_index(0, loop_step_n, -(K / loop_step_k) * loop_step_k));
+make_multi_index(0, loop_step_n /**-math::integer_divide_ceil(K, loop_step_k) * loop_step_k**/,-(K / loop_step_k) * loop_step_k));
});
static_for<0, NumOutput, 1>{}([&](auto I) {
out_global_store_tuple(I).MoveDstSliceWindow(
out_grid_3d_desc_tuple[I],
-make_multi_index(0, loop_step_n, -(K / loop_step_k) * loop_step_k));
+make_multi_index(0, loop_step_n /**-math::integer_divide_ceil(K, loop_step_k) * loop_step_k**/, -(K / loop_step_k) * loop_step_k));
});
} while(--num_iter_n);
@@ -245,16 +245,16 @@ struct GridwiseElementwise_3D
in_global_load_tuple(I).MoveSrcSliceWindow(
in_grid_3d_desc_tuple[I],
make_multi_index(loop_step_m,
-    -(N / loop_step_n) * loop_step_n,
-    -(K / loop_step_k) * loop_step_k));
+    /**-math::integer_divide_ceil(N, loop_step_n) * loop_step_n**/-(N / loop_step_n) * loop_step_n,
+    /**-math::integer_divide_ceil(K, loop_step_k) * loop_step_k**/-(K / loop_step_k) * loop_step_k));
});
static_for<0, NumOutput, 1>{}([&](auto I) {
out_global_store_tuple(I).MoveDstSliceWindow(
out_grid_3d_desc_tuple[I],
make_multi_index(loop_step_m,
-    -(N / loop_step_n) * loop_step_n,
-    -(K / loop_step_k) * loop_step_k));
+    /**-math::integer_divide_ceil(N, loop_step_n) * loop_step_n**/-(N / loop_step_n) * loop_step_n,
+    /**-math::integer_divide_ceil(K, loop_step_k) * loop_step_k**/-(K / loop_step_k) * loop_step_k));
});
} while(--num_iter_m);
}
......
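The slice-window updates in the last two hunks advance one coordinate while rewinding the faster-moving ones back to their start: -(K / loop_step_k) * loop_step_k undoes exactly K / loop_step_k forward steps of loop_step_k, and the commented-out integer_divide_ceil variants suggest a ceil-based rewind is still being weighed for the case where K is padded rather than evenly divisible. A toy model of the bookkeeping, with made-up K and loop_step_k and assuming the k loop runs K / loop_step_k iterations (plain C++, no ck types):

#include <cassert>

int main()
{
    const int K = 128, loop_step_k = 32;    // hypothetical sizes, K divisible here
    const int num_iter_k = K / loop_step_k; // 4 iterations

    int k_offset = 0;
    for(int i = 0; i < num_iter_k; ++i)
        k_offset += loop_step_k;            // forward moves done inside the k loop

    k_offset += -(K / loop_step_k) * loop_step_k; // the rewind from the diff
    assert(k_offset == 0);                  // back at the start of k, ready for the next n step

    // If K were padded and the loop ran ceil(K / loop_step_k) iterations instead,
    // a ceil-based rewind (the commented-out variant) would be needed to land on 0.
    return 0;
}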