Commit 32806d5f authored by Jun Liu's avatar Jun Liu
Browse files

Merge branch 'amd-develop' into amd-master

parents e70a4d19 d0f355a3
...@@ -16,7 +16,7 @@ using XDataType = ck::half_t; ...@@ -16,7 +16,7 @@ using XDataType = ck::half_t;
using GammaDataType = ck::half_t; using GammaDataType = ck::half_t;
using BetaDataType = ck::half_t; using BetaDataType = ck::half_t;
using YDataType = ck::half_t; using YDataType = ck::half_t;
using SaveMeanInvStdDataType = float; using SaveMeanInvStdDataType = ck::half_t;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
#define SAVE_MEAN_INV_STD #define SAVE_MEAN_INV_STD
...@@ -155,6 +155,7 @@ int main(int argc, char* argv[]) ...@@ -155,6 +155,7 @@ int main(int argc, char* argv[])
<< best_op_name << std::endl; << best_op_name << std::endl;
// run the best intance // run the best intance
if(found)
{ {
auto& op_ptr = op_ptrs[best_op_id]; auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
......
...@@ -140,6 +140,7 @@ int main(int argc, char* argv[]) ...@@ -140,6 +140,7 @@ int main(int argc, char* argv[])
<< best_op_name << std::endl; << best_op_name << std::endl;
// run the best intance // run the best intance
if(found)
{ {
auto& op_ptr = op_ptrs[best_op_id]; auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
......
...@@ -142,6 +142,7 @@ int main() ...@@ -142,6 +142,7 @@ int main()
<< best_op_name << std::endl; << best_op_name << std::endl;
// run the best intance // run the best intance
if(found)
{ {
auto& op_ptr = op_ptrs[best_op_id]; auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
......
...@@ -204,6 +204,7 @@ int main(int argc, char* argv[]) ...@@ -204,6 +204,7 @@ int main(int argc, char* argv[])
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance // run the best intance
if(found)
{ {
auto& op_ptr = op_ptrs[best_op_id]; auto& op_ptr = op_ptrs[best_op_id];
......
add_executable(client_groupnorm_swish groupnorm_swish.cpp) add_executable(client_groupnorm_bwd_data groupnorm_bwd_data.cpp)
target_link_libraries(client_groupnorm_swish PRIVATE composable_kernel::device_other_operations) target_link_libraries(client_groupnorm_bwd_data PRIVATE composable_kernel::device_other_operations)
add_executable(client_groupnorm_swish_fwd groupnorm_swish_fwd.cpp)
target_link_libraries(client_groupnorm_swish_fwd PRIVATE composable_kernel::device_other_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_normalization_bwd_data.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/groupnorm_bwd_data.hpp"
using DYDataType = float;
using XDataType = float;
using GammaDataType = float;
using MeanInvStdDataType = float;
using DXDataType = float;
constexpr int Rank = 5;
constexpr int NumReduceDim = 3;
struct SimpleDeviceMem
{
SimpleDeviceMem() = delete;
SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
{
(void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
}
void* GetDeviceBuffer() { return p_mem_; }
~SimpleDeviceMem() { (void)hipFree(p_mem_); }
void* p_mem_;
};
int main(int argc, char* argv[])
{
ck::index_t N = 32;
ck::index_t H = 16;
ck::index_t W = 16;
ck::index_t G = 64;
ck::index_t C = 128;
std::size_t length = N * H * W * G * C;
std::vector<ck::index_t> strideDy = {H * W * G * C, W * G * C, G * C, C, 1};
std::vector<ck::index_t> strideX = strideDy;
std::vector<ck::index_t> strideDx = strideDy;
std::vector<ck::index_t> strideGamma = {0, 0, 0, C, 1};
std::vector<ck::index_t> strideMeanInvStd = {G, 0, 0, 1, 0};
SimpleDeviceMem dy_dev(sizeof(DYDataType) * length);
SimpleDeviceMem x_dev(sizeof(XDataType) * length);
SimpleDeviceMem gamma_dev(sizeof(GammaDataType) * G * C);
SimpleDeviceMem mean_dev(sizeof(MeanInvStdDataType) * N * G);
SimpleDeviceMem inv_std_dev(sizeof(MeanInvStdDataType) * N * G);
SimpleDeviceMem dx_dev(sizeof(DXDataType) * length);
using DeviceOp = ck::tensor_operation::device::DeviceNormalizationBwdData<DYDataType,
XDataType,
GammaDataType,
MeanInvStdDataType,
DXDataType,
Rank,
NumReduceDim>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},
strideDy,
strideX,
strideGamma,
strideMeanInvStd,
strideMeanInvStd,
strideDx,
{1, 2, 4}, // reduceDims
dy_dev.GetDeviceBuffer(),
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
mean_dev.GetDeviceBuffer(),
inv_std_dev.GetDeviceBuffer(),
dx_dev.GetDeviceBuffer());
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_byte = sizeof(DYDataType) * length + sizeof(XDataType) * length +
sizeof(GammaDataType) * G * C +
sizeof(MeanInvStdDataType) * N * G * 2 +
sizeof(DXDataType) * length;
float gb_per_sec = num_byte / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(ave_time < best_ave_time)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
// run the best intance
if(found)
{
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer({N, H, W, G, C},
strideDy,
strideX,
strideGamma,
strideMeanInvStd,
strideMeanInvStd,
strideDx,
{1, 2, 4}, // reduceDims
dy_dev.GetDeviceBuffer(),
x_dev.GetDeviceBuffer(),
gamma_dev.GetDeviceBuffer(),
mean_dev.GetDeviceBuffer(),
inv_std_dev.GetDeviceBuffer(),
dx_dev.GetDeviceBuffer());
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
...@@ -94,7 +94,6 @@ int main(int argc, char* argv[]) ...@@ -94,7 +94,6 @@ int main(int argc, char* argv[])
SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size); SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size); SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
SimpleDeviceMem out_indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank, using DeviceOp = ck::tensor_operation::device::DevicePoolFwd<InOutRank,
WindowRank, WindowRank,
...@@ -123,22 +122,22 @@ int main(int argc, char* argv[]) ...@@ -123,22 +122,22 @@ int main(int argc, char* argv[])
for(int i = 0; i < op_ptrs.size(); ++i) for(int i = 0; i < op_ptrs.size(); ++i)
{ {
auto& op_ptr = op_ptrs[i]; auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer( auto argument_ptr =
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()), op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()), static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()), nullptr,
in_length, in_length,
window_spatial_lengths, window_spatial_lengths,
out_length, out_length,
in_tensor_stride, in_tensor_stride,
out_tensor_stride, out_tensor_stride,
out_tensor_stride, out_tensor_stride,
window_strides, window_strides,
window_dilations, window_dilations,
input_left_pads, input_left_pads,
input_right_pads, input_right_pads,
{2, 3, 4}); {2, 3, 4});
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
...@@ -184,21 +183,21 @@ int main(int argc, char* argv[]) ...@@ -184,21 +183,21 @@ int main(int argc, char* argv[])
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl; << std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer( auto argument_ptr =
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()), op_ptr->MakeArgumentPointer(static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()), static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()), nullptr,
in_length, in_length,
window_spatial_lengths, window_spatial_lengths,
out_length, out_length,
in_tensor_stride, in_tensor_stride,
out_tensor_stride, out_tensor_stride,
out_tensor_stride, out_tensor_stride,
window_strides, window_strides,
window_dilations, window_dilations,
input_left_pads, input_left_pads,
input_right_pads, input_right_pads,
{2, 3, 4}); {2, 3, 4});
auto invoker_ptr = op_ptr->MakeInvokerPointer(); auto invoker_ptr = op_ptr->MakeInvokerPointer();
......
...@@ -191,6 +191,7 @@ int main(int argc, char* argv[]) ...@@ -191,6 +191,7 @@ int main(int argc, char* argv[])
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl; << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
// run the best intance // run the best intance
if(found)
{ {
auto& op_ptr = op_ptrs[best_op_id]; auto& op_ptr = op_ptrs[best_op_id];
......
...@@ -117,6 +117,7 @@ int main() ...@@ -117,6 +117,7 @@ int main()
<< best_op_name << std::endl; << best_op_name << std::endl;
// run the best intance // run the best intance
if(found)
{ {
auto& op_ptr = op_ptrs[best_op_id]; auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
using InLayout = ck::tensor_layout::convolution::NDHWGC; using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC; using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK; using OutLayout = ck::tensor_layout::convolution::NDHWGK;
using BiasLayout = ck::tensor_layout::convolution::G_K;
using PassThrough = ck::tensor_operation::element_wise::PassThrough; using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu; using ScaleAddScaleAddRelu = ck::tensor_operation::element_wise::ScaleAddScaleAddRelu;
...@@ -64,6 +65,9 @@ int execute_conv_fwd_scaleadd_scaleadd_relu() ...@@ -64,6 +65,9 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo}; std::array<ck::index_t, 6> out_lengths{G, N, K, Do, Ho, Wo};
std::array<ck::index_t, 6> out_strides{ std::array<ck::index_t, 6> out_strides{
K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K}; K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K};
// Logical broadcast bias (we have to pass bias lengths in the same format as output - GNKDHW)
std::array<ck::index_t, 6> bias_lengths{G, 1, K, 1, 1, 1};
std::array<ck::index_t, 6> bias_strides{K, 0, 1, 0, 0, 0};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1}; std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1}; std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1, 1};
...@@ -74,13 +78,13 @@ int execute_conv_fwd_scaleadd_scaleadd_relu() ...@@ -74,13 +78,13 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C); SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Z * Y * X * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K); SimpleDeviceMem out(sizeof(OutDataType) * N * Do * Ho * Wo * G * K);
SimpleDeviceMem d0(sizeof(std::tuple_element_t<0, DDataTypes>) * N * Do * Ho * Wo * G * K); SimpleDeviceMem d0(sizeof(std::tuple_element_t<0, DDataTypes>) * N * Do * Ho * Wo * G * K);
SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * N * Do * Ho * Wo * G * K); SimpleDeviceMem d1(sizeof(std::tuple_element_t<1, DDataTypes>) * G * K);
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD< using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
NumDimSpatial, NumDimSpatial,
InLayout, InLayout,
WeiLayout, WeiLayout,
ck::Tuple<OutLayout, OutLayout>, ck::Tuple<OutLayout, BiasLayout>,
OutLayout, OutLayout,
InDataType, InDataType,
WeiDataType, WeiDataType,
...@@ -117,8 +121,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu() ...@@ -117,8 +121,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
in_strides, in_strides,
wei_lengths, wei_lengths,
wei_strides, wei_strides,
{out_lengths, out_lengths}, {out_lengths, bias_lengths},
{out_strides, out_strides}, {out_strides, bias_strides},
out_lengths, out_lengths,
out_strides, out_strides,
filter_strides, filter_strides,
...@@ -187,8 +191,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu() ...@@ -187,8 +191,8 @@ int execute_conv_fwd_scaleadd_scaleadd_relu()
in_strides, in_strides,
wei_lengths, wei_lengths,
wei_strides, wei_strides,
{out_lengths, out_lengths}, {out_lengths, bias_lengths},
{out_strides, out_strides}, {out_strides, bias_strides},
out_lengths, out_lengths,
out_strides, out_strides,
filter_strides, filter_strides,
......
add_executable(client_tensor_transform tensor_transform.cpp)
target_link_libraries(client_tensor_transform PRIVATE composable_kernel::device_other_operations)
add_executable(client_tensor_transform_using_wrapper tensor_transform_using_wrapper.cpp)
target_link_libraries(client_tensor_transform_using_wrapper PRIVATE composable_kernel::device_other_operations)
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
#include "ck/utility/tuple.hpp" #include "ck/utility/tuple.hpp"
#include "ck/utility/sequence.hpp" #include "ck/utility/sequence.hpp"
#include "tensor_transform_wrapper.hpp" #include "ck/wrapper/layout.hpp"
using DataType = int; using DataType = int;
...@@ -17,7 +17,7 @@ template <typename Layout> ...@@ -17,7 +17,7 @@ template <typename Layout>
void Print1d(const Layout& layout) void Print1d(const Layout& layout)
{ {
std::cout << "Print1d" << std::endl; std::cout << "Print1d" << std::endl;
for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size(layout); w++) for(ck::index_t w = 0; w < ck::wrapper::size(layout); w++)
{ {
std::cout << layout(ck::make_tuple(w)) << " "; std::cout << layout(ck::make_tuple(w)) << " ";
} }
...@@ -28,9 +28,9 @@ template <typename Layout> ...@@ -28,9 +28,9 @@ template <typename Layout>
void Print2d(const Layout& layout) void Print2d(const Layout& layout)
{ {
std::cout << "Print2d" << std::endl; std::cout << "Print2d" << std::endl;
for(ck::index_t h = 0; h < ck::tensor_transform_wrapper::size<0>(layout); h++) for(ck::index_t h = 0; h < ck::wrapper::size<0>(layout); h++)
{ {
for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size<1>(layout); w++) for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
{ {
std::cout << layout(ck::make_tuple(h, w)) << " "; std::cout << layout(ck::make_tuple(h, w)) << " ";
} }
...@@ -43,15 +43,11 @@ template <typename Layout> ...@@ -43,15 +43,11 @@ template <typename Layout>
void Print3dCustom(const Layout& layout) void Print3dCustom(const Layout& layout)
{ {
std::cout << "Print3dCustom" << std::endl; std::cout << "Print3dCustom" << std::endl;
for(ck::index_t d = 0; for(ck::index_t d = 0; d < ck::wrapper::size<0>(ck::wrapper::get<0>(layout)); d++)
d < ck::tensor_transform_wrapper::size<0>(ck::tensor_transform_wrapper::get<0>(layout));
d++)
{ {
for(ck::index_t h = 0; for(ck::index_t h = 0; h < ck::wrapper::size<1>(ck::wrapper::get<0>(layout)); h++)
h < ck::tensor_transform_wrapper::size<1>(ck::tensor_transform_wrapper::get<0>(layout));
h++)
{ {
for(ck::index_t w = 0; w < ck::tensor_transform_wrapper::size<1>(layout); w++) for(ck::index_t w = 0; w < ck::wrapper::size<1>(layout); w++)
{ {
std::cout << layout(ck::make_tuple(ck::make_tuple(d, h), w)) << " "; std::cout << layout(ck::make_tuple(ck::make_tuple(d, h), w)) << " ";
} }
...@@ -68,7 +64,7 @@ int main() ...@@ -68,7 +64,7 @@ int main()
// Basic descriptor 0, 1, 2, ... 30, 31 (compile-time descriptor) // Basic descriptor 0, 1, 2, ... 30, 31 (compile-time descriptor)
// (dims:4,8 strides:1,4) // (dims:4,8 strides:1,4)
const auto shape_4x8 = ck::make_tuple(ck::Number<4>{}, ck::Number<8>{}); const auto shape_4x8 = ck::make_tuple(ck::Number<4>{}, ck::Number<8>{});
const auto layout_4x8_s1x4 = ck::tensor_transform_wrapper::make_layout(shape_4x8); const auto layout_4x8_s1x4 = ck::wrapper::make_layout(shape_4x8);
std::cout << "dims:4,8 strides:1,4" << std::endl; std::cout << "dims:4,8 strides:1,4" << std::endl;
Print2d(layout_4x8_s1x4); Print2d(layout_4x8_s1x4);
using Cord1x1Type = ck::Tuple<ck::Number<1>, ck::Number<1>>; using Cord1x1Type = ck::Tuple<ck::Number<1>, ck::Number<1>>;
...@@ -77,10 +73,9 @@ int main() ...@@ -77,10 +73,9 @@ int main()
// Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (runtime descriptor) // Basic descriptor 0, 1, 8, 9, 16, 17, ... 30, 31 (runtime descriptor)
// dims:4,(2,4) strides:2,(1,8) // dims:4,(2,4) strides:2,(1,8)
const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4)); const auto shape_4x2x4 = ck::make_tuple(4, ck::make_tuple(2, 4));
const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8)); const auto strides_s2x1x8 = ck::make_tuple(2, ck::make_tuple(1, 8));
const auto layout_4x2x4_s2x1x8 = const auto layout_4x2x4_s2x1x8 = ck::wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
ck::tensor_transform_wrapper::make_layout(shape_4x2x4, strides_s2x1x8);
std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl; std::cout << "dims:4,(2,4) strides:2,(1,8)" << std::endl;
Print2d(layout_4x2x4_s2x1x8); Print2d(layout_4x2x4_s2x1x8);
...@@ -92,7 +87,7 @@ int main() ...@@ -92,7 +87,7 @@ int main()
const auto strides_s1x4x2x8 = ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}), const auto strides_s1x4x2x8 = ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}),
ck::make_tuple(ck::Number<2>{}, ck::Number<8>{})); ck::make_tuple(ck::Number<2>{}, ck::Number<8>{}));
static const auto layout_2x2x2x4_s1x4x2x8 = static const auto layout_2x2x2x4_s1x4x2x8 =
ck::tensor_transform_wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8); ck::wrapper::make_layout(shape_2x2x2x4, strides_s1x4x2x8);
std::cout << "dims:(2,2),(2,4) strides:(1,4),(2,8)" << std::endl; std::cout << "dims:(2,2),(2,4) strides:(1,4),(2,8)" << std::endl;
Print2d(layout_2x2x2x4_s1x4x2x8); Print2d(layout_2x2x2x4_s1x4x2x8);
...@@ -108,7 +103,7 @@ int main() ...@@ -108,7 +103,7 @@ int main()
ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}), ck::Number<2>{}), ck::make_tuple(ck::make_tuple(ck::Number<1>{}, ck::Number<4>{}), ck::Number<2>{}),
ck::Number<8>{}); ck::Number<8>{});
static const auto layout_2x2x2x4_s1x4x2x8_nested = static const auto layout_2x2x2x4_s1x4x2x8_nested =
ck::tensor_transform_wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested); ck::wrapper::make_layout(shape_2x2x2x4_nested, strides_s1x4x2x8_nested);
std::cout << "dims:((2,2),2),4 strides:((1,4),2),8" << std::endl; std::cout << "dims:((2,2),2),4 strides:((1,4),2),8" << std::endl;
Print1d(layout_2x2x2x4_s1x4x2x8_nested); Print1d(layout_2x2x2x4_s1x4x2x8_nested);
......
...@@ -149,7 +149,7 @@ function(clang_tidy_check TARGET) ...@@ -149,7 +149,7 @@ function(clang_tidy_check TARGET)
add_custom_target(${tidy_target} add_custom_target(${tidy_target}
# for some targets clang-tidy not able to get information from .clang-tidy # for some targets clang-tidy not able to get information from .clang-tidy
DEPENDS ${SOURCE} DEPENDS ${SOURCE}
COMMAND ${CLANG_TIDY_COMMAND} "-config=\{CheckOptions: \[\{key: bugprone-reserved-identifier.AllowedIdentifiers,value: __HIP_PLATFORM_HCC__\; __HIP_ROCclr__\}\]\}" ${SOURCE} "-export-fixes=${CLANG_TIDY_FIXIT_DIR}/${TARGET}-${tidy_file}.yaml" COMMAND ${CLANG_TIDY_COMMAND} "-config=\{CheckOptions: \[\{key: bugprone-reserved-identifier.AllowedIdentifiers,value: __HIP_PLATFORM_HCC__\; __HIP_PLATFORM_AMD__\; __HIP_ROCclr__\}\]\}" ${SOURCE} "-export-fixes=${CLANG_TIDY_FIXIT_DIR}/${TARGET}-${tidy_file}.yaml"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "clang-tidy: Running clang-tidy on target ${SOURCE}..." COMMENT "clang-tidy: Running clang-tidy on target ${SOURCE}..."
) )
......
...@@ -4,23 +4,34 @@ ...@@ -4,23 +4,34 @@
# list see the documentation: # list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html # https://www.sphinx-doc.org/en/master/usage/configuration.html
import subprocess import re
from rocm_docs import ROCmDocs from rocm_docs import ROCmDocs
html_theme_options = {"flavor": "list"}
name = "Composable Kernel" with open('../CMakeLists.txt', encoding='utf-8') as f:
get_version = r'sed -n -e "s/^rocm_setup_version(.* \([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt' match = re.search(r'.*set\(version ([0-9.]+)[^0-9.]+', f.read())
version = subprocess.getoutput(get_version) if not match:
if len(version) > 0: raise ValueError("VERSION not found!")
name = f"{name} {version}" version_number = match[1]
left_nav_title = f"Composable Kernel {version_number} Documentation"
# for PDF output on Read the Docs
project = "Composable Kernel Documentation"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved."
version = version_number
release = version_number
external_toc_path = "./sphinx/_toc.yml" external_toc_path = "./sphinx/_toc.yml"
docs_core = ROCmDocs(f"{name} Documentation") docs_core = ROCmDocs(left_nav_title)
docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/docBin/xml") docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
docs_core.setup() docs_core.setup()
external_projects_current_project = "composable_kernel"
mathjax3_config = { mathjax3_config = {
'tex': { 'tex': {
'macros': { 'macros': {
......
...@@ -58,7 +58,7 @@ PROJECT_LOGO = ...@@ -58,7 +58,7 @@ PROJECT_LOGO =
# entered, it will be relative to the location where doxygen was started. If # entered, it will be relative to the location where doxygen was started. If
# left blank the current directory will be used. # left blank the current directory will be used.
OUTPUT_DIRECTORY = docBin OUTPUT_DIRECTORY = .
# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
# directories (in 2 levels) under the output directory of each output format and # directories (in 2 levels) under the output directory of each output format and
...@@ -778,7 +778,9 @@ WARN_LOGFILE = ...@@ -778,7 +778,9 @@ WARN_LOGFILE =
INPUT = ../../include/ck/tensor_operation/gpu/grid \ INPUT = ../../include/ck/tensor_operation/gpu/grid \
../../include/ck/tensor_operation/gpu/block \ ../../include/ck/tensor_operation/gpu/block \
../../include/ck/tensor_operation/gpu/thread \ ../../include/ck/tensor_operation/gpu/thread \
../../library/include/ck/library/utility ../../library/include/ck/library/utility \
../../include/ck/wrapper
# This tag can be used to specify the character encoding of the source files # This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
......
...@@ -34,6 +34,7 @@ Current CK library are structured into 4 layers: ...@@ -34,6 +34,7 @@ Current CK library are structured into 4 layers:
* "Templated Tile Operators" layer * "Templated Tile Operators" layer
* "Templated Kernel and Invoker" layer * "Templated Kernel and Invoker" layer
* "Instantiated Kernel and Invoker" layer * "Instantiated Kernel and Invoker" layer
* "Wrapper for tensor transform operations"
* "Client API" layer * "Client API" layer
.. image:: data/ck_layer.png .. image:: data/ck_layer.png
...@@ -50,6 +51,7 @@ The following is a list of CK documents in the suggested reading order: ...@@ -50,6 +51,7 @@ The following is a list of CK documents in the suggested reading order:
tutorial_hello_world tutorial_hello_world
dockerhub dockerhub
wrapper
Supported_Primitives_Guide Supported_Primitives_Guide
API_Reference_Guide API_Reference_Guide
Contributors_Guide Contributors_Guide
...@@ -5,6 +5,6 @@ defaults: ...@@ -5,6 +5,6 @@ defaults:
maxdepth: 6 maxdepth: 6
root: index root: index
subtrees: subtrees:
- caption: About - caption: About
entries: entries:
- file: license - file: license
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment