Commit 114c2646 authored by Jun Liu's avatar Jun Liu
Browse files

Merge branch 'amd-develop' into amd-master

parents 0629870d 705d5a08
...@@ -449,7 +449,7 @@ rocm_install(FILES ...@@ -449,7 +449,7 @@ rocm_install(FILES
) )
# Install CK version and configuration files # Install CK version and configuration files
install(FILES rocm_install(FILES
${PROJECT_BINARY_DIR}/include/ck/version.h ${PROJECT_BINARY_DIR}/include/ck/version.h
${PROJECT_BINARY_DIR}/include/ck/config.h ${PROJECT_BINARY_DIR}/include/ck/config.h
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck/
......
...@@ -100,6 +100,10 @@ int main(int argc, char* argv[]) ...@@ -100,6 +100,10 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N + std::size_t num_byte = sizeof(XDataType) * M * N + sizeof(GammaDataType) * N +
...@@ -153,6 +157,10 @@ int main(int argc, char* argv[]) ...@@ -153,6 +157,10 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
} }
......
...@@ -129,6 +129,10 @@ int main(int argc, char* argv[]) ...@@ -129,6 +129,10 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_byte = std::size_t num_byte =
...@@ -184,6 +188,10 @@ int main(int argc, char* argv[]) ...@@ -184,6 +188,10 @@ int main(int argc, char* argv[])
if(op_ptr->IsSupportedArgument(argument_ptr.get())) if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{ {
size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
SimpleDeviceMem workspace(workspace_sz);
op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
} }
......
add_executable(client_max_pool2d_fwd max_pool2d_fwd.cpp) add_executable(client_max_pool2d_fwd max_pool2d_fwd.cpp)
target_link_libraries(client_max_pool2d_fwd PRIVATE composable_kernel::device_operations) target_link_libraries(client_max_pool2d_fwd PRIVATE composable_kernel::device_operations)
add_executable(client_max_pool2d_bwd max_pool2d_bwd.cpp)
target_link_libraries(client_max_pool2d_bwd PRIVATE composable_kernel::device_operations)
add_executable(client_avg_pool3d_fwd avg_pool3d_fwd.cpp) add_executable(client_avg_pool3d_fwd avg_pool3d_fwd.cpp)
target_link_libraries(client_avg_pool3d_fwd PRIVATE composable_kernel::device_operations) target_link_libraries(client_avg_pool3d_fwd PRIVATE composable_kernel::device_operations)
\ No newline at end of file
add_executable(client_avg_pool3d_bwd avg_pool3d_bwd.cpp)
target_link_libraries(client_avg_pool3d_bwd PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/avg_pool3d_bwd.hpp"
// Element types of the two device tensors handed to the pooling-backward op in main()
// (dout is the first buffer argument, din the second).
using DOutDataType = ck::half_t;
using DInDataType = ck::half_t;
// Layout tags used as template arguments of DeviceAvgPoolBwd in main().
using DOutLayout = ck::tensor_layout::convolution::NDHWC;
using DInLayout = ck::tensor_layout::convolution::NDHWC;
// Minimal owner of a single HIP device allocation.
//
// The buffer is allocated in the constructor and released in the destructor.
// The type is move-only: an implicit copy would create two owners that both
// call hipFree() on the same pointer.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    // Allocates mem_size bytes of device memory. The HIP status is deliberately
    // ignored (best effort); p_mem_ is value-initialised to null before the call.
    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}, mMemSize_(mem_size)
    {
        (void)hipMalloc(&p_mem_, mem_size);
    }

    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    // Transfers ownership; the source is left empty so its destructor frees nothing.
    SimpleDeviceMem(SimpleDeviceMem&& other) noexcept
        : p_mem_(other.p_mem_), mMemSize_(other.mMemSize_)
    {
        other.p_mem_    = nullptr;
        other.mMemSize_ = 0;
    }

    // Releases the currently held buffer (if any) before taking over other's buffer.
    SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
    {
        if(this != &other)
        {
            if(p_mem_ != nullptr)
            {
                (void)hipFree(p_mem_);
            }
            p_mem_          = other.p_mem_;
            mMemSize_       = other.mMemSize_;
            other.p_mem_    = nullptr;
            other.mMemSize_ = 0;
        }
        return *this;
    }

    // Returns the raw device pointer (null if nothing is owned).
    void* GetDeviceBuffer() { return p_mem_; }

    // Fills the whole buffer with zero bytes; the HIP status is ignored.
    void SetZero() const { (void)hipMemset(p_mem_, 0, mMemSize_); }

    ~SimpleDeviceMem()
    {
        if(p_mem_ != nullptr)
        {
            (void)hipFree(p_mem_);
        }
    }

    void* p_mem_;          // owned device pointer
    std::size_t mMemSize_; // size of the allocation in bytes
};
int main(int argc, char* argv[])
{
ck::index_t N = 2;
ck::index_t C = 32;
ck::index_t Z = 2;
ck::index_t Y = 2;
ck::index_t X = 2;
ck::index_t Di = 30;
ck::index_t Hi = 30;
ck::index_t Wi = 30;
ck::index_t window_stride_d = 2;
ck::index_t window_stride_h = 2;
ck::index_t window_stride_w = 2;
ck::index_t window_dilation_d = 1;
ck::index_t window_dilation_h = 1;
ck::index_t window_dilation_w = 1;
ck::index_t in_left_pad_d = 1;
ck::index_t in_left_pad_h = 1;
ck::index_t in_left_pad_w = 1;
ck::index_t in_right_pad_d = 1;
ck::index_t in_right_pad_h = 1;
ck::index_t in_right_pad_w = 1;
const ck::index_t Zs = (Z - 1) * window_dilation_d + 1;
const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
ck::index_t Do = (Di + in_left_pad_d + in_right_pad_d - Zs) / window_stride_d + 1;
ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;
// Pool API only support the order of NCDHW
std::vector<ck::index_t> in_length = {N, C, Di, Hi, Wi};
std::vector<ck::index_t> out_length = {N, C, Do, Ho, Wo};
std::vector<ck::index_t> window_spatial_lengths = {Z, Y, X};
std::vector<ck::index_t> window_strides = {window_stride_d, window_stride_h, window_stride_w};
std::vector<ck::index_t> window_dilations{
window_dilation_d, window_dilation_h, window_dilation_w};
std::vector<ck::index_t> input_left_pads = {in_left_pad_d, in_left_pad_h, in_left_pad_w};
std::vector<ck::index_t> input_right_pads = {in_right_pad_d, in_right_pad_h, in_right_pad_w};
std::size_t in_tensor_size = N * C * Di * Hi * Wi;
std::size_t out_tensor_size = N * C * Do * Ho * Wo;
// tensor layout = NDHWC
std::vector<ck::index_t> in_tensor_stride = {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C};
std::vector<ck::index_t> out_tensor_stride = {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C};
SimpleDeviceMem dout_device_buf(sizeof(DOutDataType) * out_tensor_size);
SimpleDeviceMem din_device_buf(sizeof(DInDataType) * in_tensor_size);
using DeviceOp = ck::tensor_operation::device::
DeviceAvgPoolBwd<3, DOutDataType, DInDataType, DOutLayout, DInLayout>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
out_length,
in_length,
out_tensor_stride,
in_tensor_stride,
window_spatial_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
din_device_buf.SetZero();
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_bytes =
in_tensor_size * sizeof(DInDataType) + out_tensor_size * sizeof(DOutDataType);
float gb_per_sec = num_bytes / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(ave_time < best_ave_time)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
// run the best intance
if(found)
{
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
<< best_op_name << std::endl;
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(
static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
out_length,
in_length,
out_tensor_stride,
in_tensor_stride,
window_spatial_lengths,
window_strides,
window_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
din_device_buf.SetZero();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
return 0;
}
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <vector>
#include <iostream>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_pool_fwd.hpp"
#include "ck/tensor_operation/gpu/device/device_max_pool_bwd.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp"
#include "ck/library/tensor_operation_instance/gpu/max_pool_bwd.hpp"
// Element types of the device buffers used in main(): forward input/output, the two
// backward tensors (dout/din), and the index buffer written by the forward pool op.
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using DOutDataType = ck::half_t;
using DInDataType = ck::half_t;
using IndexDataType = int32_t;
// We use pool3d to implement pool2d in this example
using InLayout = ck::tensor_layout::convolution::NDHWC;
using OutLayout = ck::tensor_layout::convolution::NDHWC;
// Template arguments of DevicePoolFwd in main(): tensor rank and pooling-window rank
// after the 2D problem has been expanded to 3D.
constexpr ck::index_t InOutRank = 5;
constexpr ck::index_t WindowRank = 3;
// Minimal owner of a single HIP device allocation.
//
// The buffer is allocated in the constructor and released in the destructor.
// The type is move-only: an implicit copy would create two owners that both
// call hipFree() on the same pointer.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    // Allocates mem_size bytes of device memory. The HIP status is deliberately
    // ignored (best effort); p_mem_ is value-initialised to null before the call.
    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(&p_mem_, mem_size);
    }

    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    // Transfers ownership; the source is left empty so its destructor frees nothing.
    SimpleDeviceMem(SimpleDeviceMem&& other) noexcept : p_mem_(other.p_mem_)
    {
        other.p_mem_ = nullptr;
    }

    // Releases the currently held buffer (if any) before taking over other's buffer.
    SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
    {
        if(this != &other)
        {
            if(p_mem_ != nullptr)
            {
                (void)hipFree(p_mem_);
            }
            p_mem_       = other.p_mem_;
            other.p_mem_ = nullptr;
        }
        return *this;
    }

    // Returns the raw device pointer (null if nothing is owned).
    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem()
    {
        if(p_mem_ != nullptr)
        {
            (void)hipFree(p_mem_);
        }
    }

    void* p_mem_; // owned device pointer
};
// Rewrites a 2D pooling problem description, in place, as an equivalent 3D one.
//
// Tensor descriptors (given in N, C, H, W order) receive an extra axis at position 2
// with length 1 and stride 0; window descriptors (Y, X) receive a leading entry with
// length 1, stride 0, dilation 0 and no padding. pooling_dims is overwritten with
// {2, 3, 4}.
//
// output_stride and indices_stride may refer to the same vector; in that case the
// extra axis is inserted only once so the vector keeps one entry per tensor dimension.
void TransformPool2dparamToPool3d(std::vector<ck::index_t>& input_lengths,
                                  std::vector<ck::index_t>& window_lengths,
                                  std::vector<ck::index_t>& output_lengths,
                                  std::vector<ck::index_t>& input_stride,
                                  std::vector<ck::index_t>& output_stride,
                                  std::vector<ck::index_t>& indices_stride,
                                  std::vector<ck::index_t>& window_strides,
                                  std::vector<ck::index_t>& window_dilations,
                                  std::vector<ck::index_t>& input_left_pads,
                                  std::vector<ck::index_t>& input_right_pads,
                                  std::vector<ck::index_t>& pooling_dims)
{
    // NCHW to NCDHW
    input_lengths.insert(input_lengths.begin() + 2, 1);
    output_lengths.insert(output_lengths.begin() + 2, 1);
    input_stride.insert(input_stride.begin() + 2, 0);
    output_stride.insert(output_stride.begin() + 2, 0);
    // Guard against aliasing: expanding the same vector twice would leave it with one
    // entry too many and shift the H/W strides.
    if(&indices_stride != &output_stride)
    {
        indices_stride.insert(indices_stride.begin() + 2, 0);
    }

    // YX to ZYX
    window_lengths.insert(window_lengths.begin(), 1);
    window_strides.insert(window_strides.begin(), 0);
    window_dilations.insert(window_dilations.begin(), 0);
    input_left_pads.insert(input_left_pads.begin(), 0);
    input_right_pads.insert(input_right_pads.begin(), 0);

    pooling_dims = {2, 3, 4};
}
// Client example for the 2D max-pooling backward op (implemented through the 3D pool ops).
//
// Steps: describe a fixed NHWC problem, expand it to 3D, run one max-pool forward
// instance to produce the index buffer, then time every registered DeviceMaxPoolBwd
// instance that supports the problem, report the fastest one and run it once more
// without timing. Always returns 0; argc/argv are not used.
int main(int argc, char* argv[])
{
    ck::index_t N                 = 2;
    ck::index_t C                 = 32;
    ck::index_t Y                 = 2;
    ck::index_t X                 = 2;
    ck::index_t Hi                = 30;
    ck::index_t Wi                = 30;
    ck::index_t window_stride_h   = 2;
    ck::index_t window_stride_w   = 2;
    ck::index_t window_dilation_h = 1;
    ck::index_t window_dilation_w = 1;
    ck::index_t in_left_pad_h     = 1;
    ck::index_t in_left_pad_w     = 1;
    ck::index_t in_right_pad_h    = 1;
    ck::index_t in_right_pad_w    = 1;

    // Effective (dilated) window extents and the resulting output spatial extents.
    const ck::index_t Ys = (Y - 1) * window_dilation_h + 1;
    const ck::index_t Xs = (X - 1) * window_dilation_w + 1;
    ck::index_t Ho       = (Hi + in_left_pad_h + in_right_pad_h - Ys) / window_stride_h + 1;
    ck::index_t Wo       = (Wi + in_left_pad_w + in_right_pad_w - Xs) / window_stride_w + 1;

    // Pool API only support the order of NCHW
    std::vector<ck::index_t> in_length              = {N, C, Hi, Wi};
    std::vector<ck::index_t> out_length             = {N, C, Ho, Wo};
    std::vector<ck::index_t> window_spatial_lengths = {Y, X};
    std::vector<ck::index_t> window_strides         = {window_stride_h, window_stride_w};
    std::vector<ck::index_t> window_dilations       = {window_dilation_h, window_dilation_w};
    std::vector<ck::index_t> input_left_pads        = {in_left_pad_h, in_left_pad_w};
    std::vector<ck::index_t> input_right_pads       = {in_right_pad_h, in_right_pad_w};
    std::vector<ck::index_t> pooling_dims           = {2, 3};

    // Element counts are multiplied in std::size_t so that larger shapes cannot overflow
    // the ck::index_t arithmetic before the result is widened.
    const std::size_t in_tensor_size  = static_cast<std::size_t>(N) * C * Hi * Wi;
    const std::size_t out_tensor_size = static_cast<std::size_t>(N) * C * Ho * Wo;

    // tensor layout = NHWC (strides are still listed in N, C, H, W order)
    std::vector<ck::index_t> in_tensor_stride  = {C * Hi * Wi, 1, Wi * C, C};
    std::vector<ck::index_t> out_tensor_stride = {C * Ho * Wo, 1, Wo * C, C};
    // The index tensor has the same strides as the output, but it gets its own vector:
    // the transform below modifies every stride argument in place, so passing
    // out_tensor_stride twice would expand it twice.
    std::vector<ck::index_t> indices_tensor_stride = out_tensor_stride;

    TransformPool2dparamToPool3d(in_length,
                                 window_spatial_lengths,
                                 out_length,
                                 in_tensor_stride,
                                 out_tensor_stride,
                                 indices_tensor_stride,
                                 window_strides,
                                 window_dilations,
                                 input_left_pads,
                                 input_right_pads,
                                 pooling_dims);

    SimpleDeviceMem in_device_buf(sizeof(InDataType) * in_tensor_size);
    SimpleDeviceMem out_device_buf(sizeof(OutDataType) * out_tensor_size);
    SimpleDeviceMem indices_device_buf(sizeof(IndexDataType) * out_tensor_size);
    SimpleDeviceMem dout_device_buf(sizeof(DOutDataType) * out_tensor_size);
    SimpleDeviceMem din_device_buf(sizeof(DInDataType) * in_tensor_size);

    // Generate index data from max pool forward
    {
        using MaxPoolFwdDeviceOp =
            ck::tensor_operation::device::DevicePoolFwd<InOutRank,
                                                        WindowRank,
                                                        InDataType,
                                                        OutDataType,
                                                        IndexDataType,
                                                        InLayout,
                                                        OutLayout,
                                                        ck::ReduceTensorOp::MAX,
                                                        true>;

        const auto fwd_op_ptrs =
            ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
                MaxPoolFwdDeviceOp>::GetInstances();

        // Indexing an empty instance list would be undefined behaviour, and without a
        // forward pass there is no index data for the backward op to consume.
        if(fwd_op_ptrs.empty())
        {
            std::cout << "no max pool forward instance found" << std::endl;
            return 0;
        }

        const auto& fwd_op_ptr = fwd_op_ptrs[0];

        auto argument_ptr = fwd_op_ptr->MakeArgumentPointer(
            static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
            static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
            in_length,
            window_spatial_lengths,
            out_length,
            in_tensor_stride,
            out_tensor_stride,
            indices_tensor_stride,
            window_strides,
            window_dilations,
            input_left_pads,
            input_right_pads,
            pooling_dims);

        auto invoker_ptr = fwd_op_ptr->MakeInvokerPointer();

        if(fwd_op_ptr->IsSupportedArgument(argument_ptr.get()))
            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
    }

    // Run MaxPool bwd
    using MaxPoolBwdDeviceOp =
        ck::tensor_operation::device::DeviceMaxPoolBwd<DOutDataType, IndexDataType, DInDataType>;

    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
        MaxPoolBwdDeviceOp>::GetInstances();
    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;

    // Builds the backward argument object for one instance; shared by the profiling loop
    // and the final run so both use exactly the same problem description.
    const auto make_bwd_argument = [&](const auto& op) {
        return op->MakeArgumentPointer(
            static_cast<DOutDataType*>(dout_device_buf.GetDeviceBuffer()),
            static_cast<IndexDataType*>(indices_device_buf.GetDeviceBuffer()),
            static_cast<DInDataType*>(din_device_buf.GetDeviceBuffer()),
            out_tensor_size,
            in_tensor_size,
            window_spatial_lengths,
            window_strides,
            window_dilations);
    };

    std::string best_op_name;
    bool found            = false;
    int best_op_id        = -1;
    float best_ave_time   = std::numeric_limits<float>::max();
    float best_gb_per_sec = 0;

    // profile device operation instances
    std::cout << "Run all instances and do timing" << std::endl;
    for(std::size_t i = 0; i < op_ptrs.size(); ++i)
    {
        const auto& op_ptr  = op_ptrs[i];
        auto argument_ptr   = make_bwd_argument(op_ptr);
        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
        std::string op_name = op_ptr->GetTypeString();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            // The instance reports how much scratch memory it needs; the buffer must
            // stay alive until Run() has returned.
            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
            SimpleDeviceMem workspace(workspace_sz);
            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());

            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});

            std::size_t num_bytes = in_tensor_size * sizeof(DInDataType) +
                                    out_tensor_size * sizeof(IndexDataType) +
                                    out_tensor_size * sizeof(DOutDataType);
            float gb_per_sec = num_bytes / 1.E6 / ave_time;

            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
                      << op_name << std::endl;

            if(ave_time < best_ave_time)
            {
                found           = true;
                best_op_id      = static_cast<int>(i);
                best_op_name    = op_name;
                best_ave_time   = ave_time;
                best_gb_per_sec = gb_per_sec;
            }
        }
        else
        {
            std::cout << op_name << " does not support this problem" << std::endl;
        }
    }

    // run the best instance
    if(found)
    {
        std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
                  << best_op_name << std::endl;

        const auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;

        auto argument_ptr = make_bwd_argument(op_ptr);
        auto invoker_ptr  = op_ptr->MakeInvokerPointer();

        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
        {
            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
            SimpleDeviceMem workspace(workspace_sz);
            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace.GetDeviceBuffer());

            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
        }

        std::cout << "Done" << std::endl;
    }

    return 0;
}
# Client example executable built from image_to_column.cpp and linked against the
# composable_kernel::device_operations target.
add_executable(client_image_to_column image_to_column.cpp)
target_link_libraries(client_image_to_column PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
// Element types of the input image buffer and the output (column) buffer, and the
// layout tag passed to DeviceImageToColumn in main().
using InDataType = ck::half_t;
using OutDataType = ck::half_t;
using InLayout = ck::tensor_layout::convolution::GNHWC;
// Fixed problem size used by this example; all values are compile-time constants.
static constexpr ck::index_t NumDimSpatial = 2;
static constexpr ck::index_t G = 1;
static constexpr ck::index_t N = 32; // batch size
static constexpr ck::index_t C = 32; // input channel (per group)
static constexpr ck::index_t Y = 3; // filter H
static constexpr ck::index_t X = 3; // filter W
static constexpr ck::index_t Hi = 28; // input H
static constexpr ck::index_t Wi = 28; // input W
static constexpr ck::index_t Ho = 28; // output H
static constexpr ck::index_t Wo = 28; // output W
// Minimal owner of a single HIP device allocation.
//
// The buffer is allocated in the constructor and released in the destructor.
// The type is move-only: an implicit copy would create two owners that both
// call hipFree() on the same pointer.
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    // Allocates mem_size bytes of device memory. The HIP status is deliberately
    // ignored (best effort); p_mem_ is value-initialised to null before the call.
    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(&p_mem_, mem_size);
    }

    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    // Transfers ownership; the source is left empty so its destructor frees nothing.
    SimpleDeviceMem(SimpleDeviceMem&& other) noexcept : p_mem_(other.p_mem_)
    {
        other.p_mem_ = nullptr;
    }

    // Releases the currently held buffer (if any) before taking over other's buffer.
    SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
    {
        if(this != &other)
        {
            if(p_mem_ != nullptr)
            {
                (void)hipFree(p_mem_);
            }
            p_mem_       = other.p_mem_;
            other.p_mem_ = nullptr;
        }
        return *this;
    }

    // Returns the raw device pointer (null if nothing is owned).
    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem()
    {
        if(p_mem_ != nullptr)
        {
            (void)hipFree(p_mem_);
        }
    }

    void* p_mem_; // owned device pointer
};
int main()
{
std::array<ck::index_t, 2> in_spatial_lengths{Hi, Wi};
std::array<ck::index_t, 2> wei_spatial_lengths{Y, X};
std::array<ck::index_t, 2> out_spatial_lengths{Ho, Wo};
// We have NHWGC in memory space (G is dummy)
// However, CK's API only accept length and stride with order of GNCHW
// Hence, we need to adjust the order of stride
std::array<ck::index_t, 5> in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C};
std::array<ck::index_t, 2> out_strides{Y * X * C, 1};
std::array<ck::index_t, NumDimSpatial> filter_strides{1, 1};
std::array<ck::index_t, NumDimSpatial> filter_dilations{1, 1};
std::array<ck::index_t, NumDimSpatial> input_left_pads{1, 1};
std::array<ck::index_t, NumDimSpatial> input_right_pads{1, 1};
SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C);
SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * Y * X * C);
using DeviceOp = ck::tensor_operation::device::
DeviceImageToColumn<NumDimSpatial, InLayout, InDataType, OutDataType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
int best_op_id = -1;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
N,
C,
in_spatial_lengths,
out_spatial_lengths,
wei_spatial_lengths,
in_strides,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
std::string op_name = op_ptr->GetTypeString();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C +
sizeof(OutDataType) * N * Ho * Wo * Y * X * C;
float gb_per_sec = num_bytes / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, "
<< op_name << std::endl;
if(avg_time < best_avg_time)
{
best_op_id = i;
best_op_name = op_name;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cerr << op_name << " does not support this problem" << std::endl;
}
}
if(best_op_id < 0)
{
std::cerr << "no suitable instance" << std::endl;
return EXIT_FAILURE;
}
std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_gb_per_sec
<< " GB/s, " << best_op_name << std::endl;
// run the best intance
{
auto& op_ptr = op_ptrs[best_op_id];
std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
<< std::endl;
auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(),
out.GetDeviceBuffer(),
N,
C,
in_spatial_lengths,
out_spatial_lengths,
wei_spatial_lengths,
in_strides,
out_strides,
filter_strides,
filter_dilations,
input_left_pads,
input_right_pads);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
}
std::cout << "Done" << std::endl;
}
}
# Client example executable built from grouped_gemm_fixed_nk_bias_fp16.cpp and linked
# against the composable_kernel::device_operations target.
add_executable(client_grouped_gemm_fixed_nk_bias_fp16 grouped_gemm_fixed_nk_bias_fp16.cpp)
target_link_libraries(client_grouped_gemm_fixed_nk_bias_fp16 PRIVATE composable_kernel::device_operations)
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include <iomanip>
#include <iostream>
#include <vector>
#include <random>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_fixed_nk.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_gemm_bias.hpp"
// Short names for the scalar types, matrix layouts and element-wise ops used below.
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
// Element types of the A, B, D0 and E matrices; DsDataType wraps the single D0 type
// in a tuple for the DeviceGroupedGemmFixedNK template.
using ADataType = F16;
using BDataType = F16;
using D0DataType = F32;
using DsDataType = ck::Tuple<D0DataType>;
using EDataType = F32;
// Layouts of the same matrices; all are row-major in this example.
using ALayout = Row;
using BLayout = Row;
using D0Layout = Row;
using DsLayout = ck::Tuple<D0Layout>;
using ELayout = Row;
// Element-wise operators passed to the device op: A and B use PassThrough, and the
// CDE stage uses Add.
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Add;
// Minimal owner of a single HIP device allocation.
//
// The buffer is allocated in the constructor and released in the destructor.
// The type is move-only: an implicit copy would create two owners that both
// call hipFree() on the same pointer. The noexcept move constructor keeps the type
// usable as a std::vector element (objects are moved, never copied, on growth).
struct SimpleDeviceMem
{
    SimpleDeviceMem() = delete;

    // Allocates mem_size bytes of device memory. The HIP status is deliberately
    // ignored (best effort); p_mem_ is value-initialised to null before the call.
    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
    {
        (void)hipMalloc(&p_mem_, mem_size);
    }

    SimpleDeviceMem(const SimpleDeviceMem&)            = delete;
    SimpleDeviceMem& operator=(const SimpleDeviceMem&) = delete;

    // Transfers ownership; the source is left empty so its destructor frees nothing.
    SimpleDeviceMem(SimpleDeviceMem&& other) noexcept : p_mem_(other.p_mem_)
    {
        other.p_mem_ = nullptr;
    }

    // Releases the currently held buffer (if any) before taking over other's buffer.
    SimpleDeviceMem& operator=(SimpleDeviceMem&& other) noexcept
    {
        if(this != &other)
        {
            if(p_mem_ != nullptr)
            {
                (void)hipFree(p_mem_);
            }
            p_mem_       = other.p_mem_;
            other.p_mem_ = nullptr;
        }
        return *this;
    }

    // Returns the raw device pointer (null if nothing is owned).
    void* GetDeviceBuffer() { return p_mem_; }

    ~SimpleDeviceMem()
    {
        if(p_mem_ != nullptr)
        {
            (void)hipFree(p_mem_);
        }
    }

    void* p_mem_; // owned device pointer
};
int main()
{
std::vector<int> Ms, Ns, Ks, StrideAs, StrideBs, StrideEs;
int sum_of_m = 0;
Ms = {167, 183, 177, 181, 153, 139, 156, 173, 163, 150, 204, 184, 168, 156, 168, 148};
int group_count = Ms.size();
for(int i = 0; i < group_count; ++i)
{
Ns.push_back(768);
Ks.push_back(4608);
StrideAs.push_back(std::is_same<Row, ALayout>::value ? Ks[i] : Ms[i]);
StrideBs.push_back(std::is_same<Row, BLayout>::value ? Ns[i] : Ks[i]);
StrideEs.push_back(std::is_same<Row, ELayout>::value ? Ns[i] : Ms[i]);
sum_of_m += Ms[i];
}
auto f_matrix_space_size =
[](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
using Layout = decltype(layout);
if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
{
return (nRow - 1) * stride + nCol;
}
else
{
return (nCol - 1) * stride + nRow;
}
};
std::vector<SimpleDeviceMem> a_dev_bufs, b_dev_bufs, d0_dev_bufs, e_dev_bufs;
a_dev_bufs.reserve(group_count);
b_dev_bufs.reserve(group_count);
d0_dev_bufs.reserve(group_count);
e_dev_bufs.reserve(group_count);
std::vector<void*> p_e;
p_e.reserve(group_count);
std::vector<ck::tensor_operation::device::GemmDesc> gemm_descs;
gemm_descs.reserve(group_count);
std::vector<ck::tensor_operation::device::GroupedGemmKernelArgument<1>>
grouped_gemm_kernel_args_;
grouped_gemm_kernel_args_.reserve(group_count);
for(int i = 0; i < group_count; ++i)
{
a_dev_bufs.emplace_back(sizeof(ADataType) *
f_matrix_space_size(Ms[i], Ks[i], StrideAs[i], ALayout{}));
b_dev_bufs.emplace_back(sizeof(BDataType) *
f_matrix_space_size(Ks[i], Ns[i], StrideBs[i], BLayout{}));
d0_dev_bufs.emplace_back(sizeof(D0DataType) *
f_matrix_space_size(Ms[i], Ns[i], 0, D0Layout{}));
e_dev_bufs.emplace_back(sizeof(EDataType) *
f_matrix_space_size(Ms[i], Ns[i], StrideEs[i], ELayout{}));
gemm_descs.push_back({sum_of_m, Ns[i], Ks[i], 1, StrideBs[i], 1, {0}});
p_e.push_back(e_dev_bufs[i].GetDeviceBuffer());
grouped_gemm_kernel_args_.push_back(
{a_dev_bufs[i].GetDeviceBuffer(),
b_dev_bufs[i].GetDeviceBuffer(),
std::array<const void*, 1>{d0_dev_bufs[i].GetDeviceBuffer()},
e_dev_bufs[i].GetDeviceBuffer(),
Ms[i],
Ns[i],
Ks[i],
StrideAs[i],
StrideBs[i],
std::array<ck::index_t, 1>{0},
StrideEs[i]});
}
using DeviceOp = ck::tensor_operation::device::DeviceGroupedGemmFixedNK<ALayout,
BLayout,
DsLayout,
ELayout,
ADataType,
BDataType,
DsDataType,
EDataType,
AElementOp,
BElementOp,
CDEElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
const auto a_element_op = AElementOp{};
const auto b_element_op = BElementOp{};
const auto cde_element_op = CDEElementOp{};
std::string best_op_name;
bool found = false;
int best_op_id = -1;
float best_ave_time = 0;
float best_tflops = 0;
float best_gb_per_sec = 0;
// profile device operation instances
std::cout << "Run all instances and do timing" << std::endl;
std::vector<const void*> p_a = {}, p_b = {};
std::vector<std::array<const void*, 1>> p_ds = {};
for(int i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];
auto argument_ptr = op_ptr->MakeArgumentPointer(
p_a, p_b, p_ds, p_e, gemm_descs, a_element_op, b_element_op, cde_element_op);
auto invoker_ptr = op_ptr->MakeInvokerPointer();
SimpleDeviceMem grouped_gemm_kernel_args_dev(
op_ptr->GetDeviceKernelArgSize(argument_ptr.get()));
SimpleDeviceMem grouped_gemm_workspace_dev(op_ptr->GetWorkSpaceSize(argument_ptr.get()));
std::string op_name = op_ptr->GetTypeString();
hipGetErrorString(hipMemcpy(grouped_gemm_kernel_args_dev.GetDeviceBuffer(),
grouped_gemm_kernel_args_.data(),
op_ptr->GetDeviceKernelArgSize(argument_ptr.get()),
hipMemcpyHostToDevice));
op_ptr->SetWorkSpacePointer(argument_ptr.get(),
grouped_gemm_workspace_dev.GetDeviceBuffer());
op_ptr->SetDeviceKernelArgs(argument_ptr.get(),
grouped_gemm_kernel_args_dev.GetDeviceBuffer());
op_ptr->SetKBatch(argument_ptr.get(), 2);
if(op_ptr->IsSupportedArgument(argument_ptr.get()))
{
float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
std::size_t flop = 0, num_btype = 0;
for(std::size_t j = 0; j < gemm_descs.size(); ++j)
{
flop += std::size_t(2) * Ms[j] * Ns[j] * Ks[j];
num_btype += sizeof(ADataType) * Ms[j] * Ks[j] + sizeof(BDataType) * Ks[j] * Ns[j] +
sizeof(EDataType) * Ms[j] * Ns[j];
}
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
<< gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
found = true;
best_op_id = i;
best_op_name = op_name;
best_tflops = tflops;
best_ave_time = ave_time;
best_gb_per_sec = gb_per_sec;
}
}
else
{
std::cout << op_name << " does not support this problem" << std::endl;
}
}
std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
<< best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
return 0;
}
...@@ -309,6 +309,8 @@ XML_OUTPUT ...@@ -309,6 +309,8 @@ XML_OUTPUT
XML_PROGRAMLISTING XML_PROGRAMLISTING
) )
set(WARN_AS_ERROR YES)
set(DOXYGEN_CONFIG_FILE "${CMAKE_CURRENT_BINARY_DIR}/doxygen/doxygen.conf" CACHE PATH "Path to generated doxygen configuration file") set(DOXYGEN_CONFIG_FILE "${CMAKE_CURRENT_BINARY_DIR}/doxygen/doxygen.conf" CACHE PATH "Path to generated doxygen configuration file")
function(add_doxygen_doc) function(add_doxygen_doc)
......
...@@ -2,7 +2,101 @@ ...@@ -2,7 +2,101 @@
Contributor's Guide Contributor's Guide
=================== ===================
Pull-request guidelines This chapter explains how to get started contributing to the Composable Kernel project and what are
======================= the contributing rules.
[TODO] Getting started
===============
#. **Documentation:** Before contributing to the library, familiarize yourself with the
`Composable Kernel User Guide <https://rocm.docs.amd.com/projects/composable_kernel/en/latest/>`_.
It provides insight into the core concepts, environment configuration, and steps to obtain or
build the library. You can also find some of this information in the
`README file <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/README.md>`_
on the project's GitHub page.
#. **Additional reading:** We also recommend reading a `blog post
<https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_
from the AMD Community portal. It offers a deeper understanding of the library's objectives and
showcases its performance capabilities.
#. **General information:** For broader information about AMD products, consider exploring the
`AMD Developer Central portal <https://www.amd.com/en/developer.html>`_.
How do I contribute
===================
We deeply value contributions from our users. You can make an impact by reporting issues or
proposing code enhancements through pull requests.
Reporting issues
----------------
We use `GitHub issues <https://github.com/ROCmSoftwarePlatform/composable_kernel/issues>`_
to track public bugs and enhancement requests.
If you encounter an issue with the library, please check if the problem has already been
reported by searching existing issues on GitHub. If your issue seems unique, please submit a new
issue. All reported issues must include:
* A comprehensive description of the problem, including:
* What did you observe?
* Why do you think it is a bug (if it seems like one)?
* What did you expect to happen? What would indicate the resolution of the problem?
* Are there any known workarounds?
* Your configuration details, including:
* Which GPU are you using?
* Which OS version are you on?
* Which ROCm version are you using?
* Are you using a Docker image? If so, which one?
* Steps to reproduce the issue, including:
* What actions trigger the issue? What are the reproduction steps?
* If you build the library from scratch, what CMake command did you use?
* How frequently does this issue happen? Does it reproduce every time? Or is it a sporadic issue?
Before submitting any issue, ensure you have addressed all relevant questions from the checklist.
Creating Pull Requests
----------------------
You can submit `Pull Requests (PR) on GitHub
<https://github.com/ROCmSoftwarePlatform/composable_kernel/pulls>`_.
All contributors are required to develop their changes on a separate branch and then create a
pull request to merge their changes into the `develop` branch, which is the default
development branch in the Composable Kernel project. All external contributors must use their own
forks of the project to develop their changes.
When submitting a Pull Request you should:
* Describe the change providing information about the motivation for the change and a general
description of all code modifications.
* Verify and test the change:
* Run any relevant existing tests.
* Write new tests if added functionality is not covered by current tests.
* Ensure your changes align with the coding style defined in the ``.clang-format`` file located in
the project's root directory. We leverage `pre-commit` to run `clang-format` automatically. We
highly recommend contributors utilize this method to maintain consistent code formatting.
Instructions on setting up `pre-commit` can be found in the project's
`README file <https://github.com/ROCmSoftwarePlatform/composable_kernel/blob/develop/README.md>`_.
* Link your PR to any related issues:
* If there is an issue that is resolved by your change, please provide a link to the issue in
the description of your pull request.
* For larger contributions, structure your change into a sequence of smaller, focused commits, each
addressing a particular aspect or fix.
Following the above guidelines ensures a seamless review process and faster assistance from our
end.
Thank you for your commitment to enhancing the Composable Kernel project! We look forward to collaborating with you.
...@@ -6,8 +6,7 @@ if(DL_KERNELS) ...@@ -6,8 +6,7 @@ if(DL_KERNELS)
if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_example_executable(example_gemm_dl_fp16 gemm_dl_fp16.cpp) add_example_executable(example_gemm_dl_fp16 gemm_dl_fp16.cpp)
add_dependencies(example_gemm_dl example_gemm_dl_fp16) add_dependencies(example_gemm_dl example_gemm_dl_fp16)
add_example_executable(example_gemm_dl_dpp8_fp16 gemm_dl_dpp8_fp16.cpp) add_example_executable(example_gemm_dpp_fp16 gemm_dpp_fp16.cpp)
add_dependencies(example_gemm_dl example_gemm_dl_dpp8_fp16)
endif() endif()
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp) add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp)
...@@ -40,6 +39,9 @@ endif() ...@@ -40,6 +39,9 @@ endif()
if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp) add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp)
add_dependencies(example_gemm_xdl example_gemm_xdl_bf16) add_dependencies(example_gemm_xdl example_gemm_xdl_bf16)
add_example_executable(example_gemm_xdl_bf16_rtn gemm_xdl_bf16_rtn.cpp)
add_dependencies(example_gemm_xdl example_gemm_xdl_bf16_rtn)
endif() endif()
if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES)
......
...@@ -3,31 +3,33 @@ ...@@ -3,31 +3,33 @@
#include "common.hpp" #include "common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_dl_dpp8.hpp" #include "ck/tensor_operation/gpu/device/impl/device_gemm_dpp.hpp"
using ADataType = ck::half_t; using ADataType = ck::half_t;
using BDataType = ck::half_t; using BDataType = ck::half_t;
using CDataType = ck::half_t;
using AccDataType = float; using AccDataType = float;
using CDataType = ck::half_t;
using F16 = ck::half_t;
using ALayout = Col; using ALayout = Row;
using BLayout = Row; using BLayout = Col;
using CLayout = Row; using CLayout = Row;
using AElementOp = PassThrough; using AElementOp = PassThrough;
using BElementOp = PassThrough; using BElementOp = PassThrough;
using CElementOp = PassThrough; using CElementOp = PassThrough;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default; static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNPadding;
// clang-format off // clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDlDpp8 using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDpp
// ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| K0Per| K1| M1Per| N1Per| KPer| M11N11Thread| M11N11Thread| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| CThreadTransfer| // ######| AData| BData| CData| AccData| ALayout| BLayout| CLayout| A| B| C| GEMM| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MDpp| NDpp| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer|
// ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise| Spacialization| Size| Block| Block| Block| | ThreadM111| ThreadN111| Thread| ClusterM110Xs| ClusterN110Xs| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| ThreadSliceLengths| ThreadClusterLengths| ThreadCluster| SrcAccess| SrcVectorTensor| SrcVectorTensor| DstVectorTensor| SrcDstAccess| SrcDstVectorDim| DstScalarPerVector| // ######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | | Dpp| Dpp| PerWave| PerWave| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
// ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | K0_M0_M1_K1| K0_M0_M1_K1| ArrangeOrder| Order| Lengths_K0_M0_M1_K1| ContiguousDimOrder| Lengths_K0_M0_M1_K1| K0_N0_N1_K1| K0_N0_N1_K1| ArrangeOrder| Order| Lengths_K0_N0_N1_K1| ContiguousDimOrder| Lengths_K0_N0_N1_K1| Order| | | // ######| | | | | | | | Operation| Operation| Operation| | | | | | | | | | | | Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | // ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 256, 128, 128, 16, 2, 1, 8, 8, S<8, 8>, S<4, 1>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<2, 1, 4, 2>, S<8, 1, 32, 1>, S<0, 3, 1, 2>, S<0, 3, 1, 2>, S<1, 1, 4, 1>, S<0, 3, 1, 2>, S<1, 1, 4, 2>, S<0, 1, 2, 3, 4, 5>, 5, 4>; < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 128, 64, 64, 64, 8, 2, 32, 8, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 2, 2, true, 5, 1>;
// clang-format on // // clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host:: using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>; ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/utility/type_convert.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle.hpp"
// Element types: A, B and the stored C are all bf16, while accumulation and
// the C-shuffle staging type are fp32.
using ADataType = ck::bhalf_t;
using BDataType = ck::bhalf_t;
using CDataType = ck::bhalf_t;
using AccDataType = float;
using CShuffleDataType = float;
// Memory layouts of the three matrices (Row / Col come from common.hpp).
using ALayout = Row;
using BLayout = Col;
using CLayout = Row;
// Element-wise operations applied to A, B and C. A and B are passed through
// unchanged.
// NOTE(review): ConvertBF16RTN presumably converts the fp32 result to bf16
// with round-to-nearest — confirm against the element-wise operation header.
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CElementOp = ck::tensor_operation::element_wise::ConvertBF16RTN;
// GEMM specialization handed to the device instance below.
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// Device-side GEMM instance used by this example. The comment table below
// labels each positional template argument of DeviceGemm_Xdl_CShuffle; the
// final row holds the values actually used.
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
// ######| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle| A| B| C| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
// ######| | | | Type| Type| Type| Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
// ######| | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
// ######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType, AElementOp, BElementOp, CElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8>;
// clang-format on
// Host-side reference GEMM instantiated with the same data types and
// element-wise operations as the device instance.
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;
#include "run_gemm_example.inc"
// Entry point: forwards the command line to run_gemm_example() and maps its
// result onto the process exit status (truthy result -> 0, falsy result -> 1).
int main(int argc, char* argv[])
{
    const bool example_passed = run_gemm_example(argc, argv);
    return example_passed ? 0 : 1;
}
...@@ -5,6 +5,9 @@ set(target 0) ...@@ -5,6 +5,9 @@ set(target 0)
foreach(gpu IN LISTS GPU_TARGETS) foreach(gpu IN LISTS GPU_TARGETS)
if(gpu IN_LIST gpu_list1 AND target EQUAL 0) if(gpu IN_LIST gpu_list1 AND target EQUAL 0)
add_example_executable(example_gemm_bilinear_wmma_fp16 gemm_bilinear_wmma_fp16.cpp) add_example_executable(example_gemm_bilinear_wmma_fp16 gemm_bilinear_wmma_fp16.cpp)
add_example_executable(example_gemm_bilinear_wmma_int8 gemm_bilinear_wmma_int8.cpp)
endif()
if(GPU_TARGETS MATCHES "gfx908" OR GPU_TARGETS MATCHES "gfx90a" OR GPU_TARGETS MATCHES "gfx940")
set(target 1) set(target 1)
endif() endif()
endforeach() endforeach()
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_wmma_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
// Element-wise functor that combines a GEMM accumulator value `c` with an
// auxiliary value `d` as  e = alpha * c + beta * d.
struct AlphaBetaAdd
{
    // Stores the two integer scaling factors used by operator().
    AlphaBetaAdd(int alpha, int beta) : alpha_(alpha), beta_(beta) {}

    // Generic form is only declared; the int8/int32/int8 specialization below
    // is the only definition provided in this file.
    template <typename E, typename C, typename D>
    __host__ __device__ constexpr void operator()(E& e, const C& c, const D& d) const;

    // e (int8) = convert(alpha * c (int32) + beta * widen(d (int8))).
    template <>
    __host__ __device__ constexpr void operator()<std::int8_t, std::int32_t, std::int8_t>(
        std::int8_t& e, const std::int32_t& c, const std::int8_t& d) const
    {
        // Widen d to 32 bits before scaling so both terms are summed as int32.
        const std::int32_t scaled_c = alpha_ * c;
        const std::int32_t scaled_d = beta_ * ck::type_convert<std::int32_t>(d);
        e = ck::type_convert<std::int8_t>(scaled_c + scaled_d);
    }

    int alpha_; // multiplier applied to c
    int beta_;  // multiplier applied to d
};
// Shorthand for a compile-time integer sequence.
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
// Short names for the integer element types used in this example.
using I8 = std::int8_t;
using I32 = std::int32_t;
// Short names for the matrix layouts and the identity element-wise operation.
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// Element types: int8 inputs (A, B, D) and output (E), int32 accumulation and
// C-shuffle staging.
using ADataType = I8;
using BDataType = I8;
using AccDataType = I32;
using CShuffleDataType = I32;
using DDataType = I8;
using EDataType = I8;
// All four matrices are row-major.
using ALayout = Row;
using BLayout = Row;
using DLayout = Row;
using ELayout = Row;
// A and B are passed through unchanged; AlphaBetaAdd combines the
// accumulator with D to produce E.
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = AlphaBetaAdd;
// GEMM specialization handed to the device instance.
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
// Device operation used by this example: a WMMA C-shuffle GEMM with one extra
// D tensor, instantiated with the layouts, data types and element-wise
// operations declared above.
// NOTE(review): the positional integer and S<...> arguments after GemmSpec are
// tuning parameters whose meanings are defined by the
// DeviceGemmMultipleD_Wmma_CShuffle template declaration — confirm there
// before changing any of them.
using DeviceOpInstance =
ck::tensor_operation::device::DeviceGemmMultipleD_Wmma_CShuffle<ALayout,
BLayout,
ck::Tuple<DLayout>,
ELayout,
ADataType,
BDataType,
ck::Tuple<DDataType>,
EDataType,
AccDataType,
CShuffleDataType,
AElementOp,
BElementOp,
CDEElementOp,
GemmSpec,
32,
16,
16,
4,
16,
16,
16,
1,
1,
S<2, 16, 1>,
S<1, 0, 2>,
S<1, 0, 2>,
2,
16,
16,
1,
S<4, 1, 8>,
S<0, 2, 1>,
S<0, 2, 1>,
1,
16,
2,
1,
1,
1,
S<1, 16, 1, 2>,
8>;
// Example driver for the int8 bilinear GEMM:  E = alpha * (A x B) + beta * D.
//
// Runs DeviceOpInstance on the device, prints timing figures, and optionally
// checks the device result against a host reference.
//
// Accepted command lines (argc includes the program name):
//   argc == 1  : use the built-in defaults
//   argc == 4  : verification, init_method, time_kernel
//   argc == 6  : the three above, then alpha, beta
//   argc == 13 : the three above, then M, N, K, StrideA, StrideB, StrideD,
//                StrideE, alpha, beta
// Any other argument count prints the usage text and exits with status 0.
//
// Returns 0 when verification passes or is disabled, 1 when it fails.
// Throws std::runtime_error when the device op rejects the problem.
int main(int argc, char* argv[])
{
    bool do_verification = true;
    int init_method      = 1;
    bool time_kernel     = true;

    // GEMM shape
    ck::index_t M = 3840;
    ck::index_t N = 4096;
    ck::index_t K = 4096;

    ck::index_t StrideA = 4096;
    ck::index_t StrideB = 4096;
    ck::index_t StrideD = 4096;
    ck::index_t StrideE = 4096;

    int alpha = 1;
    int beta  = 1;

    if(argc == 1)
    {
        // use default case
    }
    else if(argc == 4)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);
    }
    else if(argc == 6)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);

        // alpha and beta are integers here, so parse them as integers rather
        // than going through a float and narrowing implicitly.
        alpha = std::stoi(argv[4]);
        beta  = std::stoi(argv[5]);
    }
    else if(argc == 13)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        time_kernel     = std::stoi(argv[3]);

        M = std::stoi(argv[4]);
        N = std::stoi(argv[5]);
        K = std::stoi(argv[6]);

        StrideA = std::stoi(argv[7]);
        StrideB = std::stoi(argv[8]);
        StrideD = std::stoi(argv[9]);
        StrideE = std::stoi(argv[10]);

        alpha = std::stoi(argv[11]);
        beta  = std::stoi(argv[12]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: time kernel (0=no, 1=yes)\n");
        printf("arg4 to 12: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, alpha, "
               "beta\n");
        exit(0);
    }

    // Builds a 2-D host tensor descriptor whose strides follow the layout tag:
    // {stride, 1} for row-major, {1, stride} otherwise.
    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            using namespace ck::literals;

            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };

    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
    Tensor<DDataType> d_m_n(f_host_tensor_descriptor(M, N, StrideD, DLayout{}));
    Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
    Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));

    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
        d_m_n.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
        break;
    default:
        // NOTE(review): these generators are given floating-point ranges while
        // the tensors hold int8 values — confirm the values they produce are
        // the ones intended for this initialization mode.
        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
        d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
    }

    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());

    a_device_buf.ToDevice(a_m_k.mData.data());
    b_device_buf.ToDevice(b_k_n.mData.data());
    d_device_buf.ToDevice(d_m_n.mData.data());
    e_device_buf.ToDevice(e_m_n_device_result.mData.data());

    auto a_element_op   = AElementOp{};
    auto b_element_op   = BElementOp{};
    auto cde_element_op = CDEElementOp{alpha, beta};

    // do GEMM
    auto device_op = DeviceOpInstance{};
    auto invoker   = device_op.MakeInvoker();
    auto argument =
        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
                               b_device_buf.GetDeviceBuffer(),
                               std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
                               e_device_buf.GetDeviceBuffer(),
                               M,
                               N,
                               K,
                               StrideA,
                               StrideB,
                               std::array<ck::index_t, 1>{StrideD},
                               StrideE,
                               a_element_op,
                               b_element_op,
                               cde_element_op);

    if(!device_op.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
            "wrong! device_gemm with the specified compilation parameters does "
            "not support this GEMM problem");
    }

    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

    std::size_t flop = std::size_t(2) * M * N * K;
    // NOTE(review): the byte count covers A, B and E only; traffic for reading
    // D is not included in the reported bandwidth figure.
    std::size_t num_btype =
        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(EDataType) * M * N;

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    // Copy the device result back once; the verification branch below reuses
    // this host copy instead of transferring the same buffer a second time.
    e_device_buf.FromDevice(e_m_n_device_result.mData.data());

    if(do_verification)
    {
        // Host reference: compute C = A x B into an int32 tensor first, then
        // apply the same element-wise op as the device to produce E.
        Tensor<CShuffleDataType> c_m_n({M, N});

        using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                BDataType,
                                                                                CShuffleDataType,
                                                                                AccDataType,
                                                                                AElementOp,
                                                                                BElementOp,
                                                                                PassThrough>;
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();

        auto ref_argument =
            ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});

        ref_invoker.Run(ref_argument);

        for(int m = 0; m < M; ++m)
        {
            for(int n = 0; n < N; ++n)
            {
                cde_element_op(e_m_n_host_result(m, n), c_m_n(m, n), d_m_n(m, n));
            }
        }

        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
    }

    return 0;
}
add_custom_target(example_grouped_gemm_xdl) add_custom_target(example_grouped_gemm_xdl)
if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES)
add_example_executable(example_grouped_gemm_xdl_fp32 grouped_gemm_xdl_fp32.cpp) add_example_executable(example_grouped_gemm_xdl_fp32 grouped_gemm_xdl_fp32.cpp)
add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fp32) add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fp32)
...@@ -7,10 +8,14 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) ...@@ -7,10 +8,14 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp) add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp)
add_example_executable(example_grouped_gemm_multiple_d_dl_fp16 grouped_gemm_multiple_d_dl_fp16.cpp) add_example_executable(example_grouped_gemm_multiple_d_dl_fp16 grouped_gemm_multiple_d_dl_fp16.cpp)
add_example_executable(example_grouped_gemm_xdl_splitk_fp16 grouped_gemm_xdl_splitk_fp16.cpp) add_example_executable(example_grouped_gemm_xdl_splitk_fp16 grouped_gemm_xdl_splitk_fp16.cpp)
add_example_executable(example_grouped_gemm_xdl_fixed_nk_fp16 grouped_gemm_xdl_fixed_nk_fp16.cpp)
add_example_executable(example_grouped_gemm_xdl_fixed_nk_bias_fp16 grouped_gemm_xdl_fixed_nk_bias_fp16.cpp)
add_dependencies(example_grouped_gemm_xdl add_dependencies(example_grouped_gemm_xdl
example_grouped_gemm_xdl_fp16 example_grouped_gemm_xdl_fp16
example_grouped_gemm_multiple_d_dl_fp16 example_grouped_gemm_multiple_d_dl_fp16
example_grouped_gemm_xdl_splitk_fp16) example_grouped_gemm_xdl_splitk_fp16
example_grouped_gemm_xdl_fixed_nk_fp16
example_grouped_gemm_xdl_fixed_nk_bias_fp16)
endif() endif()
if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES)
add_example_executable(example_grouped_gemm_xdl_bfp16 grouped_gemm_xdl_bfp16.cpp) add_example_executable(example_grouped_gemm_xdl_bfp16 grouped_gemm_xdl_bfp16.cpp)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment