Commit 57b9cf69 authored by Astha Rai's avatar Astha Rai
Browse files

removed unnecessary comments, renamed files

parent a3115568
...@@ -45,13 +45,8 @@ int main() ...@@ -45,13 +45,8 @@ int main()
std::vector<std::size_t> ncdhw = {N, C, D, H, W}; std::vector<std::size_t> ncdhw = {N, C, D, H, W};
std::vector<std::size_t> nchwd = {N, C, H, W, D}; std::vector<std::size_t> nchwd = {N, C, H, W, D};
// Tensor<ADataType> a(ncdhw);
// Tensor<BDataType> b(nchwd);
auto size = N * C * D * H * W; auto size = N * C * D * H * W;
// a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D}; std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
std::array<ck::index_t, 5> a_strides = {C * D * H * W, D * H * W, 1, D * H, D}; std::array<ck::index_t, 5> a_strides = {C * D * H * W, D * H * W, 1, D * H, D};
std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1}; std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1};
......
add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp) add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp)
add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp) add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp)
add_example_executable(example_elementwise_permute_5D elementwise_permute_5D.cpp) add_example_executable(example_elementwise_permute elementwise_permute.cpp)
add_example_executable(example_elementwise_permute_5D_3d elementwise_permute_5D_3d.cpp) add_example_executable(example_elementwise_permute_3d elementwise_permute_3d.cpp)
...@@ -19,13 +19,13 @@ using BDataType = F16; ...@@ -19,13 +19,13 @@ using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance = using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
ck::Tuple<BDataType>, ck::Tuple<BDataType>, // OutDataTypeTuple
PassThrough, PassThrough, // ElementwiseOp
5, 5, // NumDim
8, 8, // MPerThread
ck::Sequence<8>, ck::Sequence<8>, // InScalarPerVectorSeq
ck::Sequence<1>>; ck::Sequence<1>>; // OutScalarPerVectorSeq
template <typename HostTensorA, typename HostTensorB, typename Functor> template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor) void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
...@@ -103,7 +103,6 @@ int main() ...@@ -103,7 +103,6 @@ int main()
float gb_per_sec = num_btype / 1.E6 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time;
// LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl; << std::endl;
...@@ -115,9 +114,6 @@ int main() ...@@ -115,9 +114,6 @@ int main()
Tensor<BDataType> host_b(nchwd); Tensor<BDataType> host_b(nchwd);
host_elementwise4D(host_b, a, PassThrough{}); host_elementwise4D(host_b, a, PassThrough{});
// LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "Host B : ", host_b.mData, ",") << std::endl;
pass &= pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
} }
......
...@@ -19,17 +19,17 @@ using BDataType = F16; ...@@ -19,17 +19,17 @@ using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance = using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>, ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
ck::Tuple<BDataType>, ck::Tuple<BDataType>, // OutDataTypeTuple
PassThrough, PassThrough, // ElementwiseOp
2, // NumDim_m, {N, C} 2, // NumDim_m, {N, C}
2, // NumDim_n, {H, W} 2, // NumDim_n, {H, W}
1, // NumDim_k, {D} 1, // NumDim_k, {D}
8, 8, // MPerThread
8, 8, // NPerThread
8, 8, // KPerThread
ck::Sequence<8>, ck::Sequence<1>, // InScalarPerVectorSeq
ck::Sequence<8>>; ck::Sequence<1>>; // OutScalarPerVectorSeq
template <typename HostTensorA, typename HostTensorB, typename Functor> template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor) void host_elementwise4D(HostTensorB& B_nchwd, const HostTensorA& A_ncdhw, Functor functor)
...@@ -109,12 +109,8 @@ int main() ...@@ -109,12 +109,8 @@ int main()
if(do_verification) if(do_verification)
{ {
b_device_buf.FromDevice(b.mData.data()); b_device_buf.FromDevice(b.mData.data());
// LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
Tensor<BDataType> host_b(nchwd); Tensor<BDataType> host_b(nchwd);
host_elementwise4D(host_b, a, PassThrough{}); host_elementwise4D(host_b, a, PassThrough{});
// LogRangeAsType<float>(std::cout << "Host B : ", host_b.mData, ",") << std::endl;
pass &= pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
......
...@@ -19,13 +19,13 @@ using BDataType = F16; ...@@ -19,13 +19,13 @@ using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance = using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, ck::tensor_operation::device::DeviceElementwiseImpl<ck::Tuple<ADataType>, // InDataTypeTuple
ck::Tuple<BDataType>, ck::Tuple<BDataType>, // OutDataTypeTuple
PassThrough, PassThrough, // Elementwise op
4, 4, // NumDim
8, 8, // MPerThread
ck::Sequence<8>, ck::Sequence<8>, // InScalarPerVectorSeq
ck::Sequence<1>>; ck::Sequence<1>>; // OutScalarPerVectorSeq
template <typename HostTensorA, typename HostTensorB, typename Functor> template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor) void host_elementwise4D(HostTensorB& B_nhwc, const HostTensorA& A_nchw, Functor functor)
...@@ -106,8 +106,6 @@ int main() ...@@ -106,8 +106,6 @@ int main()
b_device_buf.FromDevice(b.mData.data()); b_device_buf.FromDevice(b.mData.data());
Tensor<BDataType> host_b(nhwc); Tensor<BDataType> host_b(nhwc);
host_elementwise4D(host_b, a, PassThrough{}); host_elementwise4D(host_b, a, PassThrough{});
// LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "B : ", host_b.mData, ",") << std::endl;
pass &= pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
......
...@@ -17,15 +17,15 @@ using BDataType = F16; ...@@ -17,15 +17,15 @@ using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceElementwisePermuteInstance = using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>, ck::tensor_operation::device::DeviceElementwise2dImpl<ck::Tuple<ADataType>, // InDataTypeTuple
ck::Tuple<BDataType>, ck::Tuple<BDataType>, // OutDataTypeTuple
PassThrough, PassThrough, // Elementwise op
3, // NumDim_M 3, // NumDim_M
1, // NumDim_N 1, // NumDim_N
1, 1, // MPerThread
1, 1, // NPerThread
ck::Sequence<1>, ck::Sequence<1>, // InScalarPerVectorSeq
ck::Sequence<1>>; ck::Sequence<1>>; // OutScalarPerVectorSeq
template <typename HostTensorA, typename HostTensorB, typename Functor> template <typename HostTensorA, typename HostTensorB, typename Functor>
void host_elementwise4D(HostTensorB& B_nhwc, void host_elementwise4D(HostTensorB& B_nhwc,
...@@ -48,19 +48,10 @@ int main() ...@@ -48,19 +48,10 @@ int main()
bool do_verification = true; bool do_verification = true;
bool time_kernel = true; bool time_kernel = true;
// const int N = 120; const int N = 120;
// const int C = 128; const int C = 128;
// const int H = 32;
// const int W = 1024;
const int N = 16;
const int C = 8;
const int H = 32; const int H = 32;
const int W = 64; const int W = 1024;
/**const int N = 120;
const int H = 32;
const int W = 64;
const int C = 128;**/
std::vector<std::size_t> nchw = {N, C, H, W}; std::vector<std::size_t> nchw = {N, C, H, W};
std::vector<std::size_t> nhwc = {N, H, W, C}; std::vector<std::size_t> nhwc = {N, H, W, C};
...@@ -74,7 +65,6 @@ int main() ...@@ -74,7 +65,6 @@ int main()
DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize()); DeviceMem b_device_buf(sizeof(BDataType) * b.mDesc.GetElementSpaceSize());
a_device_buf.ToDevice(a.mData.data()); a_device_buf.ToDevice(a.mData.data());
// LogRangeAsType<float>(std::cout << "Tensor a : ", a.mData, ",") << std::endl;
std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()}; std::array<const void*, 1> input = {a_device_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()}; std::array<void*, 1> output = {b_device_buf.GetDeviceBuffer()};
...@@ -110,25 +100,18 @@ int main() ...@@ -110,25 +100,18 @@ int main()
float gb_per_sec = num_btype / 1.E6 / ave_time; float gb_per_sec = num_btype / 1.E6 / ave_time;
// LogRangeAsType<float>(std::cout << "A : ", a.mData, ",") << std::endl;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s" std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl; << std::endl;
bool pass = true; bool pass = true;
// LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
if(do_verification) if(do_verification)
{ {
b_device_buf.FromDevice(b.mData.data()); b_device_buf.FromDevice(b.mData.data());
// LogRangeAsType<float>(std::cout << "Tensor b : ", b.mData, ",") << std::endl;
Tensor<BDataType> host_b(nhwc); Tensor<BDataType> host_b(nhwc);
host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>( host_elementwise4D<Tensor<ADataType>, Tensor<BDataType>, PassThrough>(
host_b, a, nchw, PassThrough{}); host_b, a, nchw, PassThrough{});
// LogRangeAsType<float>(std::cout << "Host_b : ", host_b.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "B : ", b.mData, ",") << std::endl;
// LogRangeAsType<float>(std::cout << "Host b : ", host_b.mData, ",") << std::endl;
pass &= pass &=
ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3); ck::utils::check_err(b.mData, host_b.mData, "Error: Incorrect results b", 1e-3, 1e-3);
} }
......
...@@ -143,7 +143,8 @@ struct GridwiseElementwise_3D ...@@ -143,7 +143,8 @@ struct GridwiseElementwise_3D
Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths
Sequence<0, 1, 2>, // DimAccessOrder Sequence<0, 1, 2>, // DimAccessOrder
0, // SrcVectorDim 0, // SrcVectorDim
1, // InScalarPerVectorSeq::At(I), // ScalarPerVector InScalarPerVectorSeq::At(I), // InScalarPerVectorSeq::At(I), //
// ScalarPerVector
1, // SrcScalarStrideInVector 1, // SrcScalarStrideInVector
true>{in_grid_3d_desc_tuple[I], thread_global_offset}; true>{in_grid_3d_desc_tuple[I], thread_global_offset};
}, },
...@@ -163,7 +164,7 @@ struct GridwiseElementwise_3D ...@@ -163,7 +164,7 @@ struct GridwiseElementwise_3D
Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths Sequence<MPerThread, NPerThread, KPerThread>, // SliceLengths
Sequence<0, 1, 2>, // DimAccessOrder Sequence<0, 1, 2>, // DimAccessOrder
1, // SrcVectorDim 1, // SrcVectorDim
1, // OutScalarPerVectorSeq::At(I), OutScalarPerVectorSeq::At(I), // OutScalarPerVectorSeq::At(I),
InMemoryDataOperationEnum::Set, InMemoryDataOperationEnum::Set,
1, 1,
true>(out_grid_3d_desc_tuple[I], thread_global_offset, PassThroughOp{}); true>(out_grid_3d_desc_tuple[I], thread_global_offset, PassThroughOp{});
......
...@@ -42,16 +42,18 @@ struct DeviceOperationInstanceFactory< ...@@ -42,16 +42,18 @@ struct DeviceOperationInstanceFactory<
static auto GetInstances() static auto GetInstances()
{ {
std::vector<std::unique_ptr<DeviceOp>> op_ptrs; std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<InDataType, float> && is_same_v<OutDataType, float>) if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F32>> &&
is_same_v<OutDataTypeTuple, ck::Tuple<F32>>)
{ {
add_device_transpose_f32_instances(op_ptrs); add_device_transpose_f32_instances(op_ptrs);
} }
else if constexpr(is_same_v<InDataType, half_t> && is_same_v<OutDataType, half_t>) else if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<F16>> &&
is_same_v<OutDataTypeTuple, ck::Tuple<F16>>)
{ {
add_device_transpose_f16_instances(op_ptrs); add_device_transpose_f16_instances(op_ptrs);
} }
}
return op_ptrs; return op_ptrs;
}
}; };
} // namespace instance } // namespace instance
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment