"vscode:/vscode.git/clone" did not exist on "cc0b9a7a134ff562c964a584f18dec8044444e0f"
Commit adce1006 authored by Astha Rai's avatar Astha Rai
Browse files

cleaned up errors, randomized input tensor, added more instances

parent d68df255
#include <iostream>
#include <cstdlib>
#include <random>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
......@@ -48,10 +49,8 @@ void host_elementwise4D(HostTensorB& B_nhwc,
for(std::size_t n = 0; n < N; ++n)
{
ADataType tmp_val;
// auto a_val = A_nchw(n, c, h, w);
auto a_val = A_nchw.mData[(n) + (c * N) + (h * C * N) + (w * H * C * N)];
functor_b(tmp_val, a_val);
// functor_a(B_nhwc(n, h, w, c), scale * tmp_val);
functor_a(B_nhwc.mData[(n) + (c * W * H * N) + (h * N) + (w * H * N)],
scale * tmp_val);
}
......@@ -62,12 +61,14 @@ int main()
bool do_verification = true;
bool time_kernel = true;
std::vector<std::size_t> nchw = {4, 2, 1, 8};
std::vector<std::size_t> nhwc = {4, 1, 8, 2};
std::vector<std::size_t> nchw = {16, 8, 32, 64};
std::vector<std::size_t> nhwc = {16, 32, 64, 8};
Tensor<ADataType> a(nchw);
Tensor<BDataType> b(nhwc);
float scale = 1.f;
auto i = 0;
std::mt19937 gen(11939);
std::uniform_int_distribution<int> dis(0, 1);
for(std::size_t w = 0; w < a.mDesc.GetLengths()[3]; ++w)
for(std::size_t h = 0; h < a.mDesc.GetLengths()[2]; ++h)
for(std::size_t c = 0; c < a.mDesc.GetLengths()[1]; ++c)
......@@ -75,7 +76,7 @@ int main()
{
a.mData[(n * nchw[1] * nchw[2] * nchw[3]) + (c * nchw[2] * nchw[3]) +
(h * nchw[3]) + w] = i;
i++;
i = dis(gen);
}
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
......
......@@ -67,6 +67,8 @@ int main()
float scale = 1.f;
auto i = 0;
std::mt19937 gen(11939);
std::uniform_int_distribution<int> dis(0, 1);
for(std::size_t w = 0; w < a.mDesc.GetLengths()[3]; ++w)
for(std::size_t h = 0; h < a.mDesc.GetLengths()[2]; ++h)
for(std::size_t c = 0; c < a.mDesc.GetLengths()[1]; ++c)
......@@ -74,7 +76,7 @@ int main()
{
a.mData[(n * nchw[1] * nchw[2] * nchw[3]) + (c * nchw[2] * nchw[3]) +
(h * nchw[3]) + w] = i;
i++;
i = dis(gen);
}
DeviceMem a_device_buf(sizeof(ADataType) * a.mDesc.GetElementSpaceSize());
......
......@@ -24,14 +24,14 @@ using device_permute_scale_f16_instances =
std::tuple <
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 1, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 8, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 8, ck::Sequence<8>, ck::Sequence<1>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 8, ck::Sequence<1>, ck::Sequence<8>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 4, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, Pass, UnaryOp, Scale, 4, 2, ck::Sequence<1>, ck::Sequence<1>>
>;
using device_permute_scale_f32_instances = std::tuple<
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4, 1, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4, 8, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4, 4, ck::Sequence<1>, ck::Sequence<1>>,
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, Pass, UnaryOp, Scale, 4, 2, ck::Sequence<1>, ck::Sequence<1>>
>;
// clang-format on
......
......@@ -4,6 +4,7 @@
#pragma once
#include <iomanip>
#include <random>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
......@@ -18,7 +19,6 @@
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
namespace ck {
namespace profiler {
......@@ -94,6 +94,8 @@ bool profile_permute_scale_impl(int do_verification,
case 0: break;
case 1: a.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2}); break;
default: // a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}
std::mt19937 gen(11939);
std::uniform_int_distribution<int> dis(0, 1);
auto i = 0;
for(std::size_t w = 0; w < a.mDesc.GetLengths()[3]; ++w)
for(std::size_t h = 0; h < a.mDesc.GetLengths()[2]; ++h)
......@@ -102,7 +104,7 @@ bool profile_permute_scale_impl(int do_verification,
{
a.mData[(n * nchw[1] * nchw[2] * nchw[3]) + (c * nchw[2] * nchw[3]) +
(h * nchw[3]) + w] = i;
i++;
i = dis(gen);
}
}
......@@ -136,8 +138,6 @@ bool profile_permute_scale_impl(int do_verification,
host_elementwise4D(host_b, a, ElementOp{}, UnaryOp{}, scale);
}
int num_kernel = 0;
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(ab_lengths,
......@@ -207,14 +207,8 @@ bool profile_permute_scale_impl(int do_verification,
if(time_kernel)
{
LogRange(std::cout << "length = ", lengths, ",") << ", ";
std::cout << "num_kernel = " << num_kernel << ", best perf = " << best_ave_time << " ms, "
<< best_gb_per_sec << " GB/s, " << best_instance_name << std::endl;
}
if(num_kernel == 1)
{
std::cout << "Error: No kernel is tested" << std::endl;
return false;
std::cout << "best perf = " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_instance_name << std::endl;
}
return true;
......
......@@ -18,12 +18,12 @@ class TestPermute : public ::testing::Test
void Run()
{
std::vector<std::vector<ck::index_t>> lengths = {
{4, 2, 1, 8}, {1, 1, 1, 1}, {16, 8, 32, 8}};
{4, 2, 1, 8}, {1, 1, 1, 1}, {16, 8, 32, 64}, {32, 64, 128, 128}};
for(auto length : lengths)
{
bool success = ck::profiler::profile_permute_scale_impl<ADataType, BDataType, 4>(
true, 2, false, false, length);
true, 2, false, true, length);
EXPECT_TRUE(success);
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment