Commit 574fd35a authored by Astha Rai

adding client example

parent f157518d
add_executable(client_elementwise_transpose3d elementwise_transpose_3d.cpp)
target_link_libraries(client_elementwise_transpose3d PRIVATE composable_kernel::device_operations)
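# Note: a minimal sketch of how a standalone client project might locate the library before the
# target above can link against it; the package/component names below are assumptions based on the
# composable_kernel::device_operations target used here:
#   find_package(composable_kernel COMPONENTS device_operations)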
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <array>
#include <iomanip>
#include <iostream>
#include <limits>
#include <vector>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/transpose_3d.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
using F16 = ck::half_t;
using F32 = float;
using ADataType = F16;
using BDataType = F16;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
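// Minimal RAII holder for a device allocation: hipMalloc on construction, hipFree on destruction.
// Error codes are ignored to keep the example short.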
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main()
{
bool time_kernel = true;
const int N = 16;
const int C = 8;
const int D = 8;
const int H = 8;
const int W = 8;
std::vector<std::size_t> ncdhw = {N, C, D, H, W};
std::vector<std::size_t> nchwd = {N, C, H, W, D};
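// The example permutes an NCDHW input into NCHWD order, i.e. the D dimension is moved to the
// innermost position.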
Tensor<ADataType> a(ncdhw);
Tensor<BDataType> b(nchwd);
auto size = N * C * D * H * W;
a.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
std::array<ck::index_t, 5> ab_lengths{N, C, H, W, D};
std::array<ck::index_t, 5> a_strides = {C * D * H * W, D * H * W, 1, D * H, D};
std::array<ck::index_t, 5> b_strides = {C * H * W * D, H * W * D, W * D, D, 1};
SimpleDeviceMem a_dev_buf(sizeof(ADataType) * size);
SimpleDeviceMem b_dev_buf(sizeof(BDataType) * size);
// copy the generated input to the device; the output buffer is left uninitialized
(void)hipMemcpy(a_dev_buf.GetDeviceBuffer(), a.mData.data(), sizeof(ADataType) * size, hipMemcpyHostToDevice);
std::array<const void*, 1> input = {a_dev_buf.GetDeviceBuffer()};
std::array<void*, 1> output = {b_dev_buf.GetDeviceBuffer()};
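// One concrete tiling of the elementwise 3D kernel. Going by the parameter names in the instance
// factory header (NumDim_m/NumDim_n/NumDim_k, M/N/KPerThread, In/OutScalarPerVectorSeq), the
// arguments below group the five dimensions as 2+2+1, use an 8x8x8 per-thread tile, and set the
// vector widths for input reads and output writes.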
using DeviceElementwisePermuteInstance =
ck::tensor_operation::device::DeviceElementwise3dImpl<ck::Tuple<ADataType>,
ck::Tuple<BDataType>,
PassThrough,
2,
2,
1,
8,
8,
8,
ck::Sequence<8>,
ck::Sequence<1>>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceElementwisePermuteInstance>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel});
// bytes moved per run: one read of A plus one write of B; ave_time is in ms, so GB/s = bytes / 1e6 / ms
std::size_t num_bytes =
sizeof(ADataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]) +
sizeof(BDataType) * (ncdhw[0] * ncdhw[1] * ncdhw[2] * ncdhw[3] * ncdhw[4]);
float gb_per_sec = num_bytes / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(ave_time < best_ave_time)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
// run the best instance
if(found)
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(
ab_lengths, {a_strides}, {b_strides}, input, output, PassThrough{});
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_3d_impl.hpp"
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using namespace ck::tensor_layout::convolution;
using F16 = ck::half_t;
using F32 = float;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
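// Each instance below fixes one combination of per-thread tile sizes and scalar-per-vector
// widths; the factory in transpose_3d.hpp returns all of them so a client can time each one and
// pick the fastest for its problem size.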
using device_transpose_f16_instances = std::tuple<
// clang-format off
// instances for a 16, 32, 16, 32, 16 problem
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
8,
8,
ck::Sequence<8>,
ck::Sequence<8>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
8,
8,
ck::Sequence<8>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
8,
8,
ck::Sequence<1>,
ck::Sequence<8>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
8,
8,
ck::Sequence<1>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
1,
1,
ck::Sequence<1>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
1,
1,
ck::Sequence<8>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
4,
4,
ck::Sequence<1>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F16>,
ck::Tuple<F16>,
PassThrough,
2,
2,
1,
8,
4,
4,
ck::Sequence<8>,
ck::Sequence<8>>
// clang-format on
>;
using device_transpose_f32_instances = std::tuple<
// clang-format off
// instances for a 16, 8, 16, 32, 8 problem -> also test with the fp16 instance configurations
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
4,
4,
ck::Sequence<1>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
4,
4,
ck::Sequence<8>,
ck::Sequence<1>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
4,
4,
ck::Sequence<8>,
ck::Sequence<8>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
8,
4,
ck::Sequence<8>,
ck::Sequence<8>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
8,
8,
ck::Sequence<8>,
ck::Sequence<8>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
8,
8,
ck::Sequence<4>,
ck::Sequence<8>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
8,
8,
ck::Sequence<4>,
ck::Sequence<4>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
8,
8,
ck::Sequence<8>,
ck::Sequence<4>>,
DeviceElementwise3dImpl<ck::Tuple<F32>,
ck::Tuple<F32>,
PassThrough,
2,
2,
1,
4,
4,
8,
ck::Sequence<8>,
ck::Sequence<8>>
// clang-format on
>;
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
// SPDX-License-Identifier: MIT
// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <vector>
#include <memory>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_elementwise_3d_impl.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
using namespace ck::tensor_layout::convolution;
using F16 = ck::half_t;
using F32 = float;
void add_device_transpose_f16_instances(
std::vector<std::unique_ptr<DeviceElementwise3dImpl<F16, F16, NCDHW, 3>>>& instances);
void add_device_transpose_f32_instances(
std::vector<std::unique_ptr<DeviceElementwise3dImpl<F32, F32, NCDHW, 3>>>& instances);
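// These functions are expected to be defined in the instance translation unit that declares the
// tuples above, typically by forwarding each tuple to add_device_operation_instances.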
template <typename InDataTypeTuple,
typename OutDataTypeTuple,
typename ElementwiseOperation,
index_t NumDim>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::
DeviceElementwise3dImpl<InDataTypeTuple, OutDataTypeTuple, ElementwiseOperation, NumDim>>
{
using DeviceOp = DeviceElementwise3dImpl<InDataTypeTuple,
OutDataTypeTuple,
ElementwiseOperation,
NumDim_m, // choose how to set dims
NumDim_n,
NumDim_k,
MPerThread,
NPerThread,
KPerThread,
InScalarPerVectorSeq,
OutScalarPerVectorSeq>;
static auto GetInstances()
{
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<float>> && is_same_v<OutDataTypeTuple, ck::Tuple<float>>)
{
add_device_transpose_f32_instances(op_ptrs);
}
else if constexpr(is_same_v<InDataTypeTuple, ck::Tuple<half_t>> && is_same_v<OutDataTypeTuple, ck::Tuple<half_t>>)
{
add_device_transpose_f16_instances(op_ptrs);
}
return op_ptrs;
}
};
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck