"...csrc/git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "45002089de1c040637864904a22758e280ff6cab"
Unverified commit 5b9b083d, authored by carlushuang, committed by GitHub

[CK_TILE] Stop using structures from ck_tile/ops in ck_tile/host (#1834)



* Stop using structures from ck_tile/ops in ck_tile/host

* Update as a constexpr function

* Rename the function

* Update other examples.

---------
Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com>
Co-authored-by: Adam Osewski <Adam.Osewski@amd.com>
parent 052a7265
@@ -2,6 +2,13 @@
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
 auto calculate_rtol_atol(const ck_tile::index_t K,
                          const ck_tile::index_t kbatch,
                          const float max_accumulated_value)
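The new helper folds the layout check into a single compile-time value: it returns a ck_tile::bool_constant rather than a runtime bool, so call sites can hand the result to anything expecting a bool_constant parameter. A minimal standalone sketch of the same pattern, using std::bool_constant and made-up layout tags in place of ck_tile's types:

#include <type_traits>

// Stand-in layout tags; ck_tile::tensor_layout::gemm defines the real ones.
struct RowMajor {};
struct ColumnMajor {};

// Same pattern as the helper above, built only on the standard library:
// map a layout tag type to a compile-time boolean constant.
template <typename Layout>
constexpr auto is_row_major(Layout)
{
    return std::bool_constant<std::is_same_v<std::decay_t<Layout>, RowMajor>>{};
}

static_assert(decltype(is_row_major(RowMajor{}))::value, "RowMajor maps to true");
static_assert(!decltype(is_row_major(ColumnMajor{}))::value, "ColumnMajor maps to false");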
@@ -88,48 +95,16 @@ int run_gemm_example_with_layouts(int argc,
     int n_warmup = arg_parser.get_int("warmup");
     int n_repeat = arg_parser.get_int("repeat");

-    using namespace ck_tile::literals;
-
-    auto f_host_tensor_descriptor =
-        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
-            {
-                return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz});
-            }
-            else
-            {
-                return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride});
-            }
-        };
-
-    auto f_get_default_stride = [](std::size_t row,
-                                   std::size_t col,
-                                   std::size_t stride,
-                                   auto layout) {
-        if(stride == 0)
-        {
-            // give a chance if stride is zero, return a default packed stride
-            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
-            {
-                return col;
-            }
-            else
-            {
-                return row;
-            }
-        }
-        else
-            return stride;
-    };
-
-    stride_A = f_get_default_stride(M, K, stride_A, a_layout);
-    stride_B = f_get_default_stride(K, N, stride_B, b_layout);
-    stride_C = f_get_default_stride(M, N, stride_C, CLayout{});
+    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
+    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
+    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(CLayout{}));

-    ck_tile::HostTensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, stride_A, a_layout));
-    ck_tile::HostTensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, stride_B, b_layout));
+    ck_tile::HostTensor<ADataType> a_m_k(
+        ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
+    ck_tile::HostTensor<BDataType> b_k_n(
+        ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
     ck_tile::HostTensor<CDataType> c_m_n_dev_result(
-        f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));

     // TODO: add different init types
     ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
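The stride-defaulting rule is unchanged by the move into the library: a stride of 0 means "packed", i.e. the leading dimension is the column count for row-major data and the row count for column-major data. A standalone sketch of that rule, with a hypothetical default_stride stand-in rather than ck_tile's actual signature:

#include <cassert>
#include <cstddef>
#include <type_traits>

// Simplified stand-in for ck_tile::get_default_stride: a zero stride is
// replaced by the packed leading dimension for the given layout.
template <bool IsRowMajor>
std::size_t default_stride(std::size_t row, std::size_t col, std::size_t stride,
                           std::bool_constant<IsRowMajor>)
{
    if(stride == 0)
        return IsRowMajor ? col : row; // packed: row-major rows are 'col' elements long
    return stride;                     // a caller-provided stride wins
}

int main()
{
    assert(default_stride(128, 64, 0, std::bool_constant<true>{}) == 64);    // packed row-major
    assert(default_stride(128, 64, 0, std::bool_constant<false>{}) == 128);  // packed column-major
    assert(default_stride(128, 64, 256, std::bool_constant<true>{}) == 256); // explicit stride kept
}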
@@ -163,7 +138,7 @@ int run_gemm_example_with_layouts(int argc,
     if(arg_parser.get_int("v") == 1)
     {
         ck_tile::HostTensor<CDataType> c_m_n_host_ref(
-            f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
         c_m_n_host_ref.SetZero();

         ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
@@ -185,7 +160,7 @@ int run_gemm_example_with_layouts(int argc,
     else if(arg_parser.get_int("v") == 2)
     {
         ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
-            f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
         ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
         c_m_n_gpu_ref.SetZero();
         c_m_n_gpu_buf_ref.SetZero();
@@ -3,6 +3,13 @@

 #pragma once

+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
 auto calculate_rtol_atol(const ck_tile::index_t K,
                          const ck_tile::index_t kbatch,
                          const float max_accumulated_value)
@@ -106,56 +113,16 @@ int run_batched_gemm_example_with_layouts(int argc,
     int n_warmup = arg_parser.get_int("warmup");
     int n_repeat = arg_parser.get_int("repeat");

-    using namespace ck_tile::literals;
-
-    auto f_host_tensor_descriptor = [](std::size_t batch_count_,
-                                       std::size_t row,
-                                       std::size_t col,
-                                       std::size_t stride,
-                                       std::size_t batch_stride,
-                                       auto layout) {
-        if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
-        {
-            return ck_tile::HostTensorDescriptor({batch_count_, row, col},
-                                                 {batch_stride, stride, 1_uz});
-        }
-        else
-        {
-            return ck_tile::HostTensorDescriptor({batch_count_, row, col},
-                                                 {batch_stride, 1_uz, stride});
-        }
-    };
-
-    auto f_get_default_stride = [](std::size_t row,
-                                   std::size_t col,
-                                   std::size_t stride,
-                                   auto layout) {
-        if(stride == 0)
-        {
-            // give a chance if stride is zero, return a default packed stride
-            if constexpr(std::is_same_v<decltype(layout), ck_tile::tensor_layout::gemm::RowMajor>)
-            {
-                return col;
-            }
-            else
-            {
-                return row;
-            }
-        }
-        else
-            return stride;
-    };
-
-    stride_A = f_get_default_stride(M, K, stride_A, a_layout);
-    stride_B = f_get_default_stride(K, N, stride_B, b_layout);
-    stride_C = f_get_default_stride(M, N, stride_C, c_layout);
+    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
+    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
+    stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(c_layout));

-    ck_tile::HostTensor<ADataType> a_m_k(
-        f_host_tensor_descriptor(batch_count, M, K, stride_A, batch_stride_A, a_layout));
-    ck_tile::HostTensor<BDataType> b_k_n(
-        f_host_tensor_descriptor(batch_count, K, N, stride_B, batch_stride_B, b_layout));
-    ck_tile::HostTensor<CDataType> c_m_n_dev_result(
-        f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, c_layout));
+    ck_tile::HostTensor<ADataType> a_m_k(ck_tile::host_tensor_descriptor(
+        batch_count, M, K, stride_A, batch_stride_A, is_row_major(a_layout)));
+    ck_tile::HostTensor<BDataType> b_k_n(ck_tile::host_tensor_descriptor(
+        batch_count, K, N, stride_B, batch_stride_B, is_row_major(b_layout)));
+    ck_tile::HostTensor<CDataType> c_m_n_dev_result(ck_tile::host_tensor_descriptor(
+        batch_count, M, N, stride_C, batch_stride_C, is_row_major(c_layout)));

     ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
     ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
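In the batched variant the same compile-time flag picks between the stride triples {batch_stride, stride, 1} (row-major) and {batch_stride, 1, stride} (column-major). A small sketch of the element-offset arithmetic those triples imply (illustrative code, not ck_tile's implementation):

#include <array>
#include <cassert>
#include <cstddef>

// Flattened offset of element (b, r, c) in a batched 2D tensor, given the
// stride triple the descriptor would choose for each layout.
std::size_t offset(std::array<std::size_t, 3> strides,
                   std::size_t b, std::size_t r, std::size_t c)
{
    return b * strides[0] + r * strides[1] + c * strides[2];
}

int main()
{
    const std::size_t M = 4, N = 8, batch_stride = M * N;
    // Row-major: {batch_stride, N, 1}; column-major: {batch_stride, 1, M}.
    assert(offset({batch_stride, N, 1}, 1, 2, 3) == 32 + 16 + 3); // row-major
    assert(offset({batch_stride, 1, M}, 1, 2, 3) == 32 + 2 + 12); // column-major
}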
@@ -191,8 +158,8 @@ int run_batched_gemm_example_with_layouts(int argc,
     if(arg_parser.get_int("v") == 1)
     {
-        ck_tile::HostTensor<CDataType> c_m_n_host_ref(
-            f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, CLayout{}));
+        ck_tile::HostTensor<CDataType> c_m_n_host_ref(ck_tile::host_tensor_descriptor(
+            batch_count, M, N, stride_C, batch_stride_C, is_row_major(CLayout{})));
         c_m_n_host_ref.SetZero();

         const auto b_n_k = b_k_n.transpose({0, 2, 1});
@@ -216,8 +183,8 @@ int run_batched_gemm_example_with_layouts(int argc,
     }
     else if(arg_parser.get_int("v") == 2)
     {
-        ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
-            f_host_tensor_descriptor(batch_count, M, N, stride_C, batch_stride_C, CLayout{}));
+        ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(ck_tile::host_tensor_descriptor(
+            batch_count, M, N, stride_C, batch_stride_C, is_row_major(CLayout{})));
         ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
         c_m_n_gpu_ref.SetZero();
         c_m_n_gpu_buf_ref.SetZero();
@@ -3,6 +3,13 @@

 #pragma once

+template <typename Layout>
+static constexpr inline auto is_row_major(Layout layout_)
+{
+    return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
+                                                 ck_tile::tensor_layout::gemm::RowMajor>>{};
+}
+
 auto calculate_rtol_atol(const ck_tile::index_t K,
                          const ck_tile::index_t kbatch,
                          const float max_accumulated_value)
@@ -128,16 +135,19 @@ int run_grouped_gemm_example_with_layouts(int argc,
         const ck_tile::index_t N = Ns[i];
         const ck_tile::index_t K = Ks[i];

-        stride_As[i] = ck_tile::get_default_stride(M, N, stride_As[i], a_layout);
-        stride_Bs[i] = ck_tile::get_default_stride(K, N, stride_Bs[i], b_layout);
-        stride_Cs[i] = ck_tile::get_default_stride(M, N, stride_Cs[i], CLayout{});
+        stride_As[i] =
+            ck_tile::get_default_stride(M, N, stride_As[i], is_row_major(a_layout));
+        stride_Bs[i] =
+            ck_tile::get_default_stride(K, N, stride_Bs[i], is_row_major(b_layout));
+        stride_Cs[i] =
+            ck_tile::get_default_stride(M, N, stride_Cs[i], is_row_major(CLayout{}));

         a_m_k_tensors.push_back(ck_tile::HostTensor<ADataType>(
-            ck_tile::host_tensor_descriptor(M, K, stride_As[i], a_layout)));
+            ck_tile::host_tensor_descriptor(M, K, stride_As[i], is_row_major(a_layout))));
         b_k_n_tensors.push_back(ck_tile::HostTensor<BDataType>(
-            ck_tile::host_tensor_descriptor(K, N, stride_Bs[i], b_layout)));
+            ck_tile::host_tensor_descriptor(K, N, stride_Bs[i], is_row_major(b_layout))));
         c_m_n_tensors.push_back(ck_tile::HostTensor<CDataType>(
-            ck_tile::host_tensor_descriptor(M, N, stride_Cs[i], CLayout{})));
+            ck_tile::host_tensor_descriptor(M, N, stride_Cs[i], is_row_major(CLayout{}))));

         std::cout << "gemm[" << i << "]"
                   << " a_m_k: " << a_m_k_tensors[i].mDesc << " b_k_n: " << b_k_n_tensors[i].mDesc
@@ -177,8 +187,8 @@ int run_grouped_gemm_example_with_layouts(int argc,
     {
         for(int i = 0; i < group_count; ++i)
         {
-            ck_tile::HostTensor<CDataType> c_m_n_host_ref(
-                ck_tile::host_tensor_descriptor(Ms[i], Ns[i], stride_Cs[i], CLayout{}));
+            ck_tile::HostTensor<CDataType> c_m_n_host_ref(ck_tile::host_tensor_descriptor(
+                Ms[i], Ns[i], stride_Cs[i], is_row_major(CLayout{})));
             c_m_n_host_ref.SetZero();

             ck_tile::reference_gemm<ADataType, BDataType, AccDataType, CDataType>(
                 a_m_k_tensors[i], b_k_n_tensors[i], c_m_n_host_ref);
@@ -679,12 +679,15 @@ struct HostTensor
     Data mData;
 };

-template <typename TLayout>
-auto host_tensor_descriptor(std::size_t row, std::size_t col, std::size_t stride, TLayout layout)
+template <bool is_row_major>
+auto host_tensor_descriptor(std::size_t row,
+                            std::size_t col,
+                            std::size_t stride,
+                            bool_constant<is_row_major>)
 {
     using namespace ck_tile::literals;

-    if constexpr(std::is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
+    if constexpr(is_row_major)
     {
         return HostTensorDescriptor({row, col}, {stride, 1_uz});
     }
@@ -693,12 +696,15 @@ auto host_tensor_descriptor(std::size_t row, std::size_t col, std::size_t stride
         return HostTensorDescriptor({row, col}, {1_uz, stride});
     }
 }

-template <typename TLayout>
-auto get_default_stride(std::size_t row, std::size_t col, std::size_t stride, TLayout layout)
+template <bool is_row_major>
+auto get_default_stride(std::size_t row,
+                        std::size_t col,
+                        std::size_t stride,
+                        bool_constant<is_row_major>)
 {
     if(stride == 0)
     {
-        if constexpr(std::is_same_v<decltype(layout), tensor_layout::gemm::RowMajor>)
+        if constexpr(is_row_major)
         {
             return col;
         }
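Taken together, the change moves the layout dispatch out of per-example lambdas and into these two shared helpers, keyed on a bool_constant instead of a ck_tile/ops layout type. A compilable distillation of the new host_tensor_descriptor shape, with a reduced stand-in Descriptor in place of the real HostTensorDescriptor:

#include <array>
#include <cstddef>
#include <iostream>
#include <type_traits>

// Reduced stand-in for ck_tile::HostTensorDescriptor: lengths plus strides.
struct Descriptor
{
    std::array<std::size_t, 2> lengths;
    std::array<std::size_t, 2> strides;
};

// Mirrors the committed host_tensor_descriptor: the layout arrives as a
// compile-time bool, so no ck_tile/ops layout tag is needed at this boundary.
template <bool is_row_major>
Descriptor host_tensor_descriptor(std::size_t row, std::size_t col, std::size_t stride,
                                  std::bool_constant<is_row_major>)
{
    if constexpr(is_row_major)
        return {{row, col}, {stride, 1}}; // row-major: contiguous columns
    else
        return {{row, col}, {1, stride}}; // column-major: contiguous rows
}

int main()
{
    const auto d = host_tensor_descriptor(4, 8, 8, std::bool_constant<true>{});
    std::cout << d.strides[0] << ", " << d.strides[1] << '\n'; // prints "8, 1"
}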