Commit e73a2cb7 authored by Astha Rai's avatar Astha Rai
Browse files

cleaned up formatting/comments

parent e1a5137e
...@@ -2,6 +2,5 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) ...@@ -2,6 +2,5 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp) add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp) add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
add_example_executable(example_elementwise_permute_5D elementwise_permute_5D.cpp) add_example_executable(example_elementwise_permute_5D elementwise_permute_5D.cpp)
add_example_executable(example_elementwise_permute_5D_2d elementwise_permute_5D_2d.cpp)
add_example_executable(example_elementwise_permute_5D_3d elementwise_permute_5D_3d.cpp) add_example_executable(example_elementwise_permute_5D_3d elementwise_permute_5D_3d.cpp)
endif() endif()
#include <iostream>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_2d_impl.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
// Element-wise permute example: both the input (A) and output (B) tensors are fp16.
using F16 = ck::half_t;
using ADataType = F16;
using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// 2D-tiled device elementwise instance: the 5 tensor dimensions are split into
// an M group of 3 dims and an N group of 2 dims (see NumDim_M / NumDim_N below).
using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
PassThrough,
3, // NumDim_M
2, // NumDim_N
8, // NOTE(review): presumably MPerThread — confirm against DeviceElementwise2dImpl
8, // NOTE(review): presumably NPerThread — confirm
ck::Sequence<8>, // NOTE(review): likely InScalarPerVectorSeq — confirm
ck::Sequence<8>>; // NOTE(review): likely OutScalarPerVectorSeq — confirm
// Reference (host) implementation of the (n, c, d, h, w) -> (n, c, h, w, d)
// permutation: applies `functor` to every element of A, writing the result into
// the permuted position of B.
//
// B_nchwd      output tensor, indexed as (n, c, h, w, d)
// A_ncdhw      input tensor, indexed as (n, c, d, h, w)
// shape_ncdhw  the five extents {N, C, D, H, W} of A
// functor      binary op invoked as functor(dst, src)
template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nchwd,
                        const HostTensorA& A_ncdhw,
                        const std::vector<std::size_t>& shape_ncdhw,
                        Functor functor)
{
    for(std::size_t n = 0; n < shape_ncdhw[0]; ++n)
        for(std::size_t c = 0; c < shape_ncdhw[1]; ++c)
            for(std::size_t d = 0; d < shape_ncdhw[2]; ++d)
                for(std::size_t h = 0; h < shape_ncdhw[3]; ++h)
                    // Fixed: the w loop previously ran to shape_ncdhw[0] (N)
                    // instead of shape_ncdhw[4] (W), so the reference result
                    // only covered part of the W extent whenever W != N.
                    for(std::size_t w = 0; w < shape_ncdhw[4]; ++w)
                    {
                        auto a_val = A_ncdhw(n, c, d, h, w);
                        functor(B_nchwd(n, c, h, w, d), a_val);
                    }
}
// Runs the 5D (ncdhw -> nchwd) elementwise permute on the device, reports
// perf numbers, and verifies the result against the host reference.
int main()
{
    bool do_verification = true;
    bool time_kernel     = true;

    // Problem size: A is (N, C, D, H, W); B is the permuted (N, C, H, W, D).
    const int N = 8;
    const int C = 8;
    const int D = 8;
    const int H = 8;
    const int W = 8;

    std::vector<std::size_t> ncdhw = {N, C, D, H, W};
    std::vector<std::size_t> nchwd = {N, C, H, W, D};
    Tensor<ADataType> a(ncdhw);
    Tensor<BDataType> b(nchwd);
    a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});

    DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
    a_device_buf.ToDevice(a.mData.data());

    std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
    std::array<void*, 1> output      = {b_device_buf.GetDeviceBuffer()};

    // Lengths are given in the input (ncdhw) index order; the stride arrays
    // describe how a logical (n, c, d, h, w) index maps into each layout.
    std::array<ck::index_t, 5> ab_lengths{N, C, D, H, W};
    std::array<ck::index_t, 5> a_strides = {C * D * H * W, D * H * W, H * W, W, 1};
    // B is stored as (n, c, h, w, d), so the d index carries the unit stride.
    std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, 1, W * D, D};

    auto broadcastPermute = DeviceElementwisePermuteInstance{};
    auto argument         = broadcastPermute.MakeArgumentPointer(
        ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});

    if(!broadcastPermute.IsSupportedArgument(argument.get()))
    {
        throw std::runtime_error(
            "The runtime parameters seems not supported by the device instance, exiting!");
    }

    std::cout << "A (ncdhw): " << a.mDesc << std::endl;
    std::cout << "B (nchwd): " << b.mDesc << std::endl;

    auto broadcastPermute_invoker_ptr = broadcastPermute.MakeInvokerPointer();
    float ave_time =
        broadcastPermute_invoker_ptr->Run(argument.get(), StreamConfig{nullptr, time_kernel});

    // One pass reads every A element and writes every B element exactly once.
    std::size_t num_elements = ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4];
    std::size_t flop         = std::size_t(2) * num_elements;
    // Fixed: the byte count previously computed n*c*d*h + w (added the last
    // extent instead of multiplying), grossly under-reporting the bytes moved
    // and therefore inflating the reported GB/s.
    std::size_t num_btype =
        sizeof(ADataType) * num_elements + sizeof(BDataType) * num_elements;

    // ave_time is in ms: flop / 1e9 / ms == TFlops; bytes / 1e6 / ms == GB/s.
    float tflops     = static_cast<float>(flop) / 1.E9 / ave_time;
    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    bool pass = true;
    if(do_verification)
    {
        b_device_buf.FromDevice(b.mData.data());
        Tensor<BDataType> host_b(nchwd);
        host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
            host_b, a, ncdhw, PassThrough{});
        pass &=
            ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
    }

    return pass ? 0 : 1;
}
...@@ -55,7 +55,6 @@ int main() ...@@ -55,7 +55,6 @@ int main()
const int H = 32; const int H = 32;
const int W = 5; const int W = 5;
const int D = 16; const int D = 16;
//
std::vector<std::size_t> ncdhw = {N, C, D, H, W}; std::vector<std::size_t> ncdhw = {N, C, D, H, W};
std::vector<std::size_t> nchwd = {N, C, H, W, D}; std::vector<std::size_t> nchwd = {N, C, H, W, D};
...@@ -63,9 +62,6 @@ int main() ...@@ -63,9 +62,6 @@ int main()
Tensor<BDataType> b(nchwd); Tensor<BDataType> b(nchwd);
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}); a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
//for(std::size_t i = 0; i < a.mData.size(); i++){
// a.mData[i] = i;
// }
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize()); DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
...@@ -114,11 +110,11 @@ int main() ...@@ -114,11 +110,11 @@ int main()
{ {
b_device_buf.FromDevice(b.mData.data()); b_device_buf.FromDevice(b.mData.data());
//LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl; // LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
//LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl; // LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
Tensor<BDataType> host_b(nchwd); Tensor<BDataType> host_b(nchwd);
host_elementwise4D(host_b, a, PassThrough{}); host_elementwise4D(host_b, a, PassThrough{});
//LogRangeAsType<float>(std::cout << "Host B : ", host_b.mData, ",") << std::endl; // LogRangeAsType<float>(std::cout << "Host B : ", host_b.mData, ",") << std::endl;
pass &= pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
......
...@@ -88,18 +88,11 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple, ...@@ -88,18 +88,11 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple,
const auto m = desc_mnk.GetLength(I0); const auto m = desc_mnk.GetLength(I0);
const auto n = desc_mnk.GetLength(I1); const auto n = desc_mnk.GetLength(I1);
const auto k = desc_mnk.GetLength(I2); const auto k = desc_mnk.GetLength(I2);
// std::cout << "m: " << m << std::endl;
// std::cout << "n: " << n << std::endl;
// std::cout << "k: " << k << std::endl;
//std::cout << "m: " << num_threads_m << std::endl;
//std::cout << "n: " << num_threads_n << std::endl;
//std::cout << "k: " << num_threads_k << std::endl;
const index_t loop_step_m = num_threads_m * MPerThread; const index_t loop_step_m = num_threads_m * MPerThread;
const index_t loop_step_n = num_threads_n * NPerThread; const index_t loop_step_n = num_threads_n * NPerThread;
const index_t loop_step_k = num_threads_k * KPerThread; const index_t loop_step_k = num_threads_k * KPerThread;
//std::cout << "loop_step_m: " << loop_step_m << std::endl;
//std::cout << "loop_step_n: " << loop_step_n << std::endl;
//std::cout << "loop_step_k: " << loop_step_k << std::endl;
const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m; const auto pad_m = math::integer_least_multiple(m, loop_step_m) - m;
const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n; const auto pad_n = math::integer_least_multiple(n, loop_step_n) - n;
const auto pad_k = math::integer_least_multiple(k, loop_step_k) - k; const auto pad_k = math::integer_least_multiple(k, loop_step_k) - k;
...@@ -311,10 +304,6 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple, ...@@ -311,10 +304,6 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple,
const std::array<index_t, NumDim>& strides, const std::array<index_t, NumDim>& strides,
index_t scalarPerVector, index_t scalarPerVector,
index_t vectorDim) { index_t vectorDim) {
//ignore = lengths;
//ignore = strides;
//ignore = scalarPerVector;
//ignore = vectorDim;
if(strides[vectorDim] == 1 && if(strides[vectorDim] == 1 &&
(lengths[vectorDim] % scalarPerVector == 0 || (lengths[vectorDim] % scalarPerVector == 0 ||
lengths[vectorDim] % scalarPerVector == lengths[vectorDim])) lengths[vectorDim] % scalarPerVector == lengths[vectorDim]))
...@@ -335,9 +324,6 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple, ...@@ -335,9 +324,6 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple,
pArg->inStridesArray_[I.value], pArg->inStridesArray_[I.value],
InScalarPerVectorSeq::At(I), InScalarPerVectorSeq::At(I),
NumDim_m - 1); NumDim_m - 1);
// LogRangeAsType<float>(std::cout << "in scalarperveq : ",
// InScalarPerVectorSeq::At(I), ",") << std::endl; LogRangeAsType<float>(std::cout <<
// "vecdim : ", NumDim_m - 1, ",") << std::endl;
}); });
static_for<0, NumOutput, 1>{}([&](auto I) { static_for<0, NumOutput, 1>{}([&](auto I) {
...@@ -345,9 +331,6 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple, ...@@ -345,9 +331,6 @@ struct DeviceElementwise3dImpl : public DeviceElementwise<InDataTypeTuple,
pArg->outStridesArray_[I.value], pArg->outStridesArray_[I.value],
OutScalarPerVectorSeq::At(I), OutScalarPerVectorSeq::At(I),
NumDim - 1); NumDim - 1);
// LogRangeAsType<float>(std::cout << "out scalarperveq : ",
// OutScalarPerVectorSeq::At(I), ",") << std::endl; LogRangeAsType<float>(std::cout
// << "vecdim : ", NumDim - 1, ",") << std::endl;
}); });
return valid; return valid;
......
...@@ -230,13 +230,13 @@ struct GridwiseElementwise_3D ...@@ -230,13 +230,13 @@ struct GridwiseElementwise_3D
static_for<0, NumInput, 1>{}([&](auto I) { static_for<0, NumInput, 1>{}([&](auto I) {
in_global_load_tuple(I).MoveSrcSliceWindow( in_global_load_tuple(I).MoveSrcSliceWindow(
in_grid_3d_desc_tuple[I], in_grid_3d_desc_tuple[I],
make_multi_index(0, loop_step_n /**-math::integer_divide_ceil(K, loop_step_k) * loop_step_k**/,-(K / loop_step_k) * loop_step_k)); make_multi_index(0, loop_step_n, -(K / loop_step_k) * loop_step_k));
}); });
static_for<0, NumOutput, 1>{}([&](auto I) { static_for<0, NumOutput, 1>{}([&](auto I) {
out_global_store_tuple(I).MoveDstSliceWindow( out_global_store_tuple(I).MoveDstSliceWindow(
out_grid_3d_desc_tuple[I], out_grid_3d_desc_tuple[I],
make_multi_index(0, loop_step_n /**-math::integer_divide_ceil(K, loop_step_k) * loop_step_k**/, -(K / loop_step_k) * loop_step_k)); make_multi_index(0, loop_step_n, -(K / loop_step_k) * loop_step_k));
}); });
} while(--num_iter_n); } while(--num_iter_n);
...@@ -245,16 +245,16 @@ struct GridwiseElementwise_3D ...@@ -245,16 +245,16 @@ struct GridwiseElementwise_3D
in_global_load_tuple(I).MoveSrcSliceWindow( in_global_load_tuple(I).MoveSrcSliceWindow(
in_grid_3d_desc_tuple[I], in_grid_3d_desc_tuple[I],
make_multi_index(loop_step_m, make_multi_index(loop_step_m,
/**-math::integer_divide_ceil(N, loop_step_n) * loop_step_n**/-(N / loop_step_n) * loop_step_n, -(N / loop_step_n) * loop_step_n,
/**-math::integer_divide_ceil(K, loop_step_k) * loop_step_k**/-(K / loop_step_k) * loop_step_k)); -(K / loop_step_k) * loop_step_k));
}); });
static_for<0, NumOutput, 1>{}([&](auto I) { static_for<0, NumOutput, 1>{}([&](auto I) {
out_global_store_tuple(I).MoveDstSliceWindow( out_global_store_tuple(I).MoveDstSliceWindow(
out_grid_3d_desc_tuple[I], out_grid_3d_desc_tuple[I],
make_multi_index(loop_step_m, make_multi_index(loop_step_m,
/**-math::integer_divide_ceil(N, loop_step_n) * loop_step_n**/-(N / loop_step_n) * loop_step_n, -(N / loop_step_n) * loop_step_n,
/**-math::integer_divide_ceil(K, loop_step_k) * loop_step_k**/-(K / loop_step_k) * loop_step_k)); -(K / loop_step_k) * loop_step_k));
}); });
} while(--num_iter_m); } while(--num_iter_m);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment