Commit 57b9cf69 authored by Astha Rai's avatar Astha Rai
Browse files

removed unneccesary comments, renamed files

parent a3115568
......@@ -45,12 +45,7 @@ int main()
std::vector<std::size_t> ncdhw = {N, C, D, H, W};
std::vector<std::size_t> nchwd = {N, C, H, W, D};
// Tensor<ADataType> a(ncdhw);
// Tensor<BDataType> b(nchwd);
auto size = N * C * D * H * W;
// a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
auto size = N * C * D * H * W;
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
std::array<ck::index_t, 5> a_strides = {C * D * H * W, D * H * W, 1, D * H, D};
......
add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
add_example_executable(example_elementwise_permute_5D elementwise_permute_5D.cpp)
add_example_executable(example_elementwise_permute_5D_3d elementwise_permute_5D_3d.cpp)
add_example_executable(example_elementwise_permute elementwise_permute.cpp)
add_example_executable(example_elementwise_permute_3d elementwise_permute_3d.cpp)
......@@ -19,13 +19,13 @@ using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
PassThrough,
5,
8,
ck::Sequence<8>,
ck::Sequence<1>>;
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
ck::Tuple<BDataType>, // OutDataTypeTuple
PassThrough, // ElementwiseOp
5, // NumDim
8, // MPerThread
ck::Sequence<8>, // InScalarPerVectorSeq
ck::Sequence<1>>; // OutScalarPerVectorSeq
template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
......@@ -103,7 +103,6 @@ int main()
float gb_per_sec = num_btype / 1.E6 / ave_time;
// LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
......@@ -115,9 +114,6 @@ int main()
Tensor<BDataType> host_b(nchwd);
host_elementwise4D(host_b, a, PassThrough{});
// LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "Host B : ", host_b.mData, ",") << std::endl;
pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
}
......
......@@ -19,17 +19,17 @@ using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
PassThrough,
2, // NumDim_m, {N, C}
2, // NumDim_n, {H, W}
1, // NumDim_k, {D}
8,
8,
8,
ck::Sequence<8>,
ck::Sequence<8>>;
ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
ck::Tuple<BDataType>, // OutDataTypeTuple
PassThrough, // ElementwiseOp
2, // NumDim_m, {N, C}
2, // NumDim_n, {H, W}
1, // NumDim_k, {D}
8, // MPerThread
8, // NPerThread
8, // KPerThread
ck::Sequence<1>, // InScalarPerVectorSeq
ck::Sequence<1>>; // OutScalarPerVectorSeq
template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
......@@ -109,12 +109,8 @@ int main()
if(do_verification)
{
b_device_buf.FromDevice(b.mData.data());
// LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
Tensor<BDataType> host_b(nchwd);
host_elementwise4D(host_b, a, PassThrough{});
// LogRangeAsType<float>(std::cout << "Host B : ", host_b.mData, ",") << std::endl;
pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
......
......@@ -19,13 +19,13 @@ using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
PassThrough,
4,
8,
ck::Sequence<8>,
ck::Sequence<1>>;
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
ck::Tuple<BDataType>, // OutDataTypeTuple
PassThrough, // Elementwise op
4, // NumDim
8, // MPerThread
ck::Sequence<8>, // InScalarPerVectorSeq
ck::Sequence<1>>; // OutScalarPerVectorSeq
template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
......@@ -106,8 +106,6 @@ int main()
b_device_buf.FromDevice(b.mData.data());
Tensor<BDataType> host_b(nhwc);
host_elementwise4D(host_b, a, PassThrough{});
// LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "B : ", host_b.mData, ",") << std::endl;
pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
......
......@@ -17,15 +17,15 @@ using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
PassThrough,
3, // NumDim_M
1, // NumDim_N
1,
1,
ck::Sequence<1>,
ck::Sequence<1>>;
ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
ck::Tuple<BDataType>, // OutDataTypeTuple
PassThrough, // Elementwise op
3, // NumDim_M
1, // NumDim_N
1, // MPerThread
1, // NPerThread
ck::Sequence<1>, // InScalarPerVectorSeq
ck::Sequence<1>>; // OutScalarPerVectorSeq
template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nhwc,
......@@ -48,19 +48,10 @@ int main()
bool do_verification = true;
bool time_kernel = true;
// const int N = 120;
// const int C = 128;
// const int H = 32;
// const int W = 1024;
const int N = 16;
const int C = 8;
const int N = 120;
const int C = 128;
const int H = 32;
const int W = 64;
/**const int N = 120;
const int H = 32;
const int W = 64;
const int C = 128;**/
const int W = 1024;
std::vector<std::size_t> nchw = {N, C, H, W};
std::vector<std::size_t> nhwc = {N, H, W, C};
......@@ -74,7 +65,6 @@ int main()
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a.mData.data());
// LogRangeAsType<float>(std::cout << "Tensor a : ", a.mData, ",") << std::endl;
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
......@@ -110,25 +100,18 @@ int main()
float gb_per_sec = num_btype / 1.E6 / ave_time;
// LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
bool pass = true;
// LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
if(do_verification)
{
b_device_buf.FromDevice(b.mData.data());
// LogRangeAsType<float>(std::cout << "Tensor b : ", b.mData, ",") << std::endl;
Tensor<BDataType> host_b(nhwc);
host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
host_b, a, nchw, PassThrough{});
// LogRangeAsType<float>(std::cout << "Host_b : ", host_b.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "Host b : ", host_b.mData, ",") << std::endl;
pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
}
......
......@@ -143,8 +143,9 @@ struct GridwiseElementwise_3D
Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths
Sequence<0, 1, 2>, // DimAccessOrder
0, // SrcVectorDim
1, // InScalarPerVectorSeq::At(I), // ScalarPerVector
1, // SrcScalarStrideInVector
InScalarPerVectorSeq::At(I), // InScalarPerVectorSeq::At(I), //
// ScalarPerVector
1, // SrcScalarStrideInVector
true>{in_grid_3d_desc_tuple[I], thread_global_offset};
},
Number<NumInput>{});
......@@ -163,7 +164,7 @@ struct GridwiseElementwise_3D
Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths
Sequence<0, 1, 2>, // DimAccessOrder
1, // SrcVectorDim
1, // OutScalarPerVectorSeq::At(I),
OutScalarPerVectorSeq::At(I), // OutScalarPerVectorSeq::At(I),
InMemoryDataOperationEnum::Set,
1,
true>(out_grid_3d_desc_tuple[I], thread_global_offset, PassThroughOp{});
......
......@@ -42,16 +42,18 @@ struct DeviceOperationInstanceFactory<
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>)
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
{
add_device_transpose_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>)
else if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
{
add_device_transpose_f16_instances(op_ptrs);
}
return op_ptrs;
}
return op_ptrs;
};
} // namespace instance
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment