Commit e4e99a49 authored by Po-Yen, Chen

Use new utilities to shorten code

parent 7acbf104
@@ -63,10 +63,10 @@ template <typename DataType>
 std::ostream& show_2d_matrix(std::ostream& os, Tensor<DataType>& matrix)
 {
     os << "[" << std::endl;
-    for(size_t x = 0; x < matrix.mDesc.GetLengths()[0]; x++)
+    for(size_t x = 0; x < matrix.GetLengths()[0]; x++)
     {
         os << "[";
-        for(size_t y = 0; y < matrix.mDesc.GetLengths()[1]; y++)
+        for(size_t y = 0; y < matrix.GetLengths()[1]; y++)
         {
             os << std::setw(5) << static_cast<float>(matrix(x, y));
         }
@@ -133,17 +133,17 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
+    using namespace ck::literals;
+
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -152,9 +152,9 @@ int main(int argc, char* argv[])
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
 
-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+    std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
+    std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.GetDesc() << std::endl;
 
     switch(init_method)
     {
@@ -173,12 +173,12 @@ int main(int argc, char* argv[])
         b_k_n.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
     }
 
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_m_k_device_buf(a_m_k.GetMemorySize());
+    DeviceMem b_k_n_device_buf(b_k_n.GetMemorySize());
+    DeviceMem c_m_n_device_buf(c_m_n_device_result.GetMemorySize());
 
-    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
-    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+    a_m_k_device_buf.ToDevice(a_m_k.data());
+    b_k_n_device_buf.ToDevice(b_k_n.data());
 
     auto a_element_op = AElementOp{};
     auto b_element_op = BElementOp{};
@@ -187,9 +187,9 @@ int main(int argc, char* argv[])
     // do GEMM
     auto gemm     = DeviceGemmInstance{};
     auto invoker  = gemm.MakeInvoker();
-    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
-                                      static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
-                                      static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
+    auto argument = gemm.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
+                                      b_k_n_device_buf.GetDeviceBuffer(),
+                                      c_m_n_device_buf.GetDeviceBuffer(),
                                       M,
                                       N,
                                       K,
@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
               << gemm.GetTypeString() << std::endl;
 
-    c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+    c_m_n_device_buf.FromDevice(c_m_n_device_result.data());
 
     if(do_verification)
     {
@@ -240,7 +240,7 @@ int main(int argc, char* argv[])
             show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl;
         }
 #endif
-        ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+        ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
     }
 
     return 0;
...
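
Note: the 1_uz spelling above comes from ck::literals, pulled in by the new using-directive. As a rough sketch of the idea (the actual definition lives in ck/library/utility/literals.hpp and may differ), a user-defined literal yielding std::size_t looks like this:

#include <cstddef>

namespace ck::literals {

// Sketch only: a literal suffix that yields std::size_t, so a braced
// list like {stride, 1_uz} stays homogeneous and can initialize the
// descriptor's size_t dimensions without an explicit std::vector.
constexpr std::size_t operator""_uz(unsigned long long value)
{
    return static_cast<std::size_t>(value);
}

} // namespace ck::literals

With the literal in scope, HostTensorDescriptor({row, col}, {stride, 1_uz}) deduces both initializer lists as std::size_t, which is why the std::vector<std::size_t> wrappers could be dropped.
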
@@ -9,10 +9,10 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
     static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
 #endif
 
+    using namespace ck::literals;
+
     auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
 
-    using namespace ck::literals;
-
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
             if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
@@ -32,41 +32,38 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
     {
     case 0: break;
     case 1:
-        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k.begin(),
-                                                                             a_m_k.end());
-        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n.begin(),
-                                                                             b_k_n.end());
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
         break;
     default:
-        ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k.begin(), a_m_k.end());
-        ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n.begin(), b_k_n.end());
+        ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
+        ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
     }
 
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
 
-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+    std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
+    std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
+    std::cout << "c_m_n: " << c_m_n_host_result.GetDesc() << std::endl;
 
 #ifdef BUILD_INT4_EXAMPLE
-    DeviceMem a_m_k_device_buf(sizeof(KernelADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_k_n_device_buf(sizeof(KernelBDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem c_m_n_device_buf(sizeof(KernelCDataType) *
-                               c_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_m_k_device_buf(a_m_k.GetMemorySize());
+    DeviceMem b_k_n_device_buf(b_k_n.GetMemorySize());
+    DeviceMem c_m_n_device_buf(c_m_n_device_result.GetMemorySize());
 
     const Tensor<KernelADataType> a_m_k_converted(a_m_k);
     const Tensor<KernelBDataType> b_k_n_converted(b_k_n);
 
-    a_m_k_device_buf.ToDevice(a_m_k_converted.mData.data());
-    b_k_n_device_buf.ToDevice(b_k_n_converted.mData.data());
+    a_m_k_device_buf.ToDevice(a_m_k_converted.data());
+    b_k_n_device_buf.ToDevice(b_k_n_converted.data());
 #else
-    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_m_k_device_buf(a_m_k.GetMemorySize());
+    DeviceMem b_k_n_device_buf(b_k_n.GetMemorySize());
+    DeviceMem c_m_n_device_buf(c_m_n_device_result.GetMemorySize());
 
-    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
-    b_k_n_device_buf.ToDevice(b_k_n.mData.data());
+    a_m_k_device_buf.ToDevice(a_m_k.data());
+    b_k_n_device_buf.ToDevice(b_k_n.data());
 #endif
 
     auto a_element_op = AElementOp{};
     auto b_element_op = BElementOp{};
@@ -76,16 +73,9 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
     // do GEMM
     auto gemm    = DeviceGemmInstance{};
     auto invoker = gemm.MakeInvoker();
-    auto argument = gemm.MakeArgument(
-#ifdef BUILD_INT4_EXAMPLE
-        static_cast<KernelADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
-        static_cast<KernelBDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
-        static_cast<KernelCDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
-#else
-        static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
-        static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
-        static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
-#endif
+    auto argument = gemm.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
+                                      b_k_n_device_buf.GetDeviceBuffer(),
+                                      c_m_n_device_buf.GetDeviceBuffer(),
                                       M,
                                       N,
                                       K,
@@ -127,17 +117,17 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
     ref_invoker.Run(ref_argument);
 
 #ifdef BUILD_INT4_EXAMPLE
-    Tensor<CDataType> c_m_n_device_result_converted(c_m_n_host_result.mDesc);
+    Tensor<CDataType> c_m_n_device_result_converted(c_m_n_host_result.GetDesc());
 
-    c_m_n_device_buf.FromDevice(c_m_n_device_result_converted.mData.data());
+    c_m_n_device_buf.FromDevice(c_m_n_device_result_converted.data());
 
     c_m_n_device_result = c_m_n_device_result_converted.CopyAsType<CDataType>();
 
-    return ck::utils::check_err(c_m_n_device_result_converted.mData, c_m_n_host_result.mData);
+    return ck::utils::check_err(c_m_n_device_result_converted, c_m_n_host_result);
 #else
-    c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
+    c_m_n_device_buf.FromDevice(c_m_n_device_result.data());
 
-    return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
+    return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
 #endif
 }
...
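
Note: the fill functors are now invoked with the tensor itself instead of a begin/end pair. A plausible shape for such a functor (a sketch, not the actual ck/library/utility/fill.hpp definition; names and the seed are assumptions) keeps the iterator overload and layers a range overload on top:

#include <algorithm>
#include <iterator>
#include <random>

// Sketch of a range-aware fill functor in the spirit of
// ck::utils::FillUniformDistribution.
template <typename T>
struct FillUniformDistribution
{
    float min_;
    float max_;

    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::mt19937 gen(11939);
        std::uniform_real_distribution<float> dis(min_, max_);
        std::generate(first, last, [&] { return static_cast<T>(dis(gen)); });
    }

    // The overload the shortened call sites rely on: accept anything
    // with begin()/end(), e.g. a whole Tensor.
    template <typename Range>
    void operator()(Range&& range) const
    {
        (*this)(std::begin(range), std::end(range));
    }
};
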
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
+#include <cstdlib>
+#include <initializer_list>
 #include <iostream>
 #include <numeric>
-#include <initializer_list>
-#include <cstdlib>
 
 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/array.hpp"
+#include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/literals.hpp"
 
 struct AlphaBetaAdd
 {
@@ -175,17 +177,17 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
+    using namespace ck::literals;
+
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
             }
             else
             {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
             }
         };
@@ -195,10 +197,10 @@ int main(int argc, char* argv[])
     Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
     Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
 
-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
-    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+    std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
+    std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
+    std::cout << "d_m_n: " << d_m_n.GetDesc() << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.GetDesc() << std::endl;
 
     switch(init_method)
     {
@@ -214,15 +216,15 @@ int main(int argc, char* argv[])
         d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
     }
 
-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_device_buf(a_m_k.GetMemorySize());
+    DeviceMem b_device_buf(b_k_n.GetMemorySize());
+    DeviceMem d_device_buf(d_m_n.GetMemorySize());
+    DeviceMem e_device_buf(e_m_n_device_result.GetMemorySize());
 
-    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
-    d_device_buf.ToDevice(d_m_n.mData.data());
-    e_device_buf.ToDevice(e_m_n_device_result.mData.data());
+    a_device_buf.ToDevice(a_m_k.data());
+    b_device_buf.ToDevice(b_k_n.data());
+    d_device_buf.ToDevice(d_m_n.data());
+    e_device_buf.ToDevice(e_m_n_device_result.data());
 
     auto a_element_op = AElementOp{};
     auto b_element_op = BElementOp{};
@@ -231,17 +233,16 @@ int main(int argc, char* argv[])
     // do GEMM
     auto device_op = DeviceOpInstance{};
     auto invoker = device_op.MakeInvoker();
-    auto argument =
-        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+    auto argument = device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
                                b_device_buf.GetDeviceBuffer(),
-                               std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                               ck::utils::to_array({d_device_buf.GetDeviceBuffer()}),
                                e_device_buf.GetDeviceBuffer(),
                                M,
                                N,
                                K,
                                StrideA,
                                StrideB,
-                               std::array<ck::index_t, 1>{StrideD},
+                               ck::utils::to_array({StrideD}),
                                StrideE,
                                a_element_op,
                                b_element_op,
@@ -267,12 +268,11 @@ int main(int argc, char* argv[])
     std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
               << std::endl;
 
-    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+    e_device_buf.FromDevice(e_m_n_device_result.data());
 
     if(do_verification)
     {
-        Tensor<CShuffleDataType> c_m_n(HostTensorDescriptor(
-            std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
+        Tensor<CShuffleDataType> c_m_n(HostTensorDescriptor({M, N}));
 
         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
                                                                                 BDataType,
@@ -297,9 +297,9 @@ int main(int argc, char* argv[])
            }
        }
 
-        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+        e_device_buf.FromDevice(e_m_n_device_result.data());
 
-        return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
     }
 
     return 0;
...
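
Note: ck::utils::to_array (from the newly included ck/library/utility/array.hpp) replaces hand-written std::array<T, N>{...} temporaries. For braced-list call sites it behaves like C++20's std::to_array; a minimal sketch of that case follows (the in-tree utility evidently also accepts runtime ranges such as descriptor lengths, which this sketch does not cover):

#include <array>
#include <cstddef>
#include <utility>

// Sketch in the spirit of std::to_array: build a std::array from a
// built-in array, deducing element type and length.
template <typename T, std::size_t N, std::size_t... Is>
constexpr std::array<T, N> to_array_impl(const T (&src)[N], std::index_sequence<Is...>)
{
    return {{src[Is]...}};
}

template <typename T, std::size_t N>
constexpr std::array<T, N> to_array(const T (&src)[N])
{
    return to_array_impl(src, std::make_index_sequence<N>{});
}

// Usage mirroring a call site above (element type deduced from the list):
//   auto stride_ds = to_array({StrideD}); // one-element std::array
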
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
 
+#include <cstdlib>
+#include <initializer_list>
 #include <iostream>
 #include <numeric>
-#include <initializer_list>
-#include <cstdlib>
 
 #include "ck/ck.hpp"
-#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
+#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/utility/array.hpp"
+#include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
-#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/literals.hpp"
 
 template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;
@@ -153,17 +155,17 @@ int main(int argc, char* argv[])
         exit(0);
     }
 
+    using namespace ck::literals;
+
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({stride, 1}));
+                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }
            else
            {
-                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
-                                            std::vector<std::size_t>({1, stride}));
+                return HostTensorDescriptor({row, col}, {1_uz, stride});
            }
        };
@@ -173,10 +175,10 @@ int main(int argc, char* argv[])
     Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
     Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
 
-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
-    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+    std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
+    std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
+    std::cout << "d_m_n: " << d_m_n.GetDesc() << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.GetDesc() << std::endl;
 
     switch(init_method)
     {
@@ -192,14 +194,14 @@ int main(int argc, char* argv[])
         d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
     }
 
-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_device_buf(a_m_k.GetMemorySize());
+    DeviceMem b_device_buf(b_k_n.GetMemorySize());
+    DeviceMem d_device_buf(d_m_n.GetMemorySize());
+    DeviceMem e_device_buf(e_m_n_device_result.GetMemorySize());
 
-    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
-    d_device_buf.ToDevice(d_m_n.mData.data());
+    a_device_buf.ToDevice(a_m_k.data());
+    b_device_buf.ToDevice(b_k_n.data());
+    d_device_buf.ToDevice(d_m_n.data());
 
     auto a_element_op = AElementOp{};
     auto b_element_op = BElementOp{};
@@ -210,17 +212,16 @@ int main(int argc, char* argv[])
     auto invoker = device_op.MakeInvoker();
 
-    auto argument =
-        device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
+    auto argument = device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
                                b_device_buf.GetDeviceBuffer(),
-                               std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
+                               ck::utils::to_array({d_device_buf.GetDeviceBuffer()}),
                                e_device_buf.GetDeviceBuffer(),
                                M,
                                N,
                                K,
                                StrideA,
                                StrideB,
-                               std::array<ck::index_t, 1>{0},
+                               ck::utils::to_array({0}),
                                StrideE,
                                a_element_op,
                                b_element_op,
@@ -247,7 +248,7 @@ int main(int argc, char* argv[])
     if(do_verification)
     {
-        e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+        e_device_buf.FromDevice(e_m_n_device_result.data());
 
         Tensor<AccDataType> c_m_n(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
@@ -275,7 +276,7 @@ int main(int argc, char* argv[])
            }
        }
 
-        return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
+        return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
     }
 
     return 0;
...
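
Note: most of the mechanical churn in these hunks comes from Tensor's accessors: GetDesc(), GetLengths(), data(), size(), and GetMemorySize() replace direct pokes at the public mDesc/mData members. A stand-in sketch of what those forwarding members plausibly look like (the real class is in ck/library/utility/host_tensor.hpp; the packed-layout descriptor here is a simplification):

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Stand-ins for illustration only.
struct HostTensorDescriptor
{
    std::vector<std::size_t> lengths;

    const std::vector<std::size_t>& GetLengths() const { return lengths; }

    std::size_t GetElementSpaceSize() const
    {
        // Packed layout assumed in this sketch; the real descriptor
        // also accounts for strides.
        return std::accumulate(
            lengths.begin(), lengths.end(), std::size_t{1}, std::multiplies<>{});
    }
};

template <typename T>
struct Tensor
{
    HostTensorDescriptor mDesc;
    std::vector<T> mData;

    const HostTensorDescriptor& GetDesc() const { return mDesc; }
    const std::vector<std::size_t>& GetLengths() const { return mDesc.GetLengths(); }

    T* data() { return mData.data(); }
    const T* data() const { return mData.data(); }
    std::size_t size() const { return mData.size(); }

    auto begin() { return mData.begin(); }
    auto end() { return mData.end(); }

    // What DeviceMem now consumes directly: the buffer size in bytes.
    std::size_t GetMemorySize() const { return sizeof(T) * mDesc.GetElementSpaceSize(); }
};
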
@@ -35,11 +35,11 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
         >
         e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
 
-    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
-    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
-    std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
-    std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
-    std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
+    std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
+    std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
+    std::cout << "d0_m_n: " << d0_m_n.GetDesc() << std::endl;
+    std::cout << "d1_m_n: " << d1_m_n.GetDesc() << std::endl;
+    std::cout << "e_m_n: " << e_m_n_host_result.GetDesc() << std::endl;
 
     switch(config.init_method)
     {
@@ -57,11 +57,11 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
         d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
     }
 
-    DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
-    DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
-    DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
-    DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
+    DeviceMem a_device_buf(a_m_k.GetMemorySize());
+    DeviceMem b_device_buf(b_k_n.GetMemorySize());
+    DeviceMem d0_device_buf(d0_m_n.GetMemorySize());
+    DeviceMem d1_device_buf(d1_m_n.GetMemorySize());
+    DeviceMem e_device_buf(e_m_n_device_result.GetMemorySize());
 
 #ifdef BUILD_INT4_EXAMPLE
     const Tensor<KernelADataType> a_m_k_converted(a_m_k);
@@ -69,15 +69,15 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
     const Tensor<KernelD0DataType> d0_m_n_converted(d0_m_n);
     const Tensor<KernelD1DataType> d1_m_n_converted(d1_m_n);
 
-    a_device_buf.ToDevice(a_m_k_converted.mData.data());
-    b_device_buf.ToDevice(b_k_n_converted.mData.data());
-    d0_device_buf.ToDevice(d0_m_n_converted.mData.data());
-    d1_device_buf.ToDevice(d1_m_n_converted.mData.data());
+    a_device_buf.ToDevice(a_m_k_converted.data());
+    b_device_buf.ToDevice(b_k_n_converted.data());
+    d0_device_buf.ToDevice(d0_m_n_converted.data());
+    d1_device_buf.ToDevice(d1_m_n_converted.data());
 #else
-    a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
-    d0_device_buf.ToDevice(d0_m_n.mData.data());
-    d1_device_buf.ToDevice(d1_m_n.mData.data());
+    a_device_buf.ToDevice(a_m_k.data());
+    b_device_buf.ToDevice(b_k_n.data());
+    d0_device_buf.ToDevice(d0_m_n.data());
+    d1_device_buf.ToDevice(d1_m_n.data());
 #endif
 
     auto a_element_op = AElementOp{};
@@ -142,14 +142,14 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
        }
    }
 
-    e_device_buf.FromDevice(e_m_n_device_result.mData.data());
+    e_device_buf.FromDevice(e_m_n_device_result.data());
 
 #ifdef BUILD_INT4_EXAMPLE
     const Tensor<EDataType> e_m_n_device_result_converted(e_m_n_device_result);
 
-    return ck::utils::check_err(e_m_n_device_result_converted.mData, e_m_n_host_result.mData);
+    return ck::utils::check_err(e_m_n_device_result_converted, e_m_n_host_result);
 #else
-    return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
+    return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
 #endif
 }
...
@@ -10,13 +10,14 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/utility/array.hpp"
 #include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+#include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/utility/convolution_parameter.hpp"
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
 
 void print_helper_msg()
 {
@@ -50,9 +51,9 @@ bool run_grouped_conv_fwd(bool do_verification,
     Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
     Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
 
-    std::cout << "in: " << in.mDesc << std::endl;
-    std::cout << "wei: " << wei.mDesc << std::endl;
-    std::cout << "out: " << out_host.mDesc << std::endl;
+    std::cout << "in: " << in.GetDesc() << std::endl;
+    std::cout << "wei: " << wei.GetDesc() << std::endl;
+    std::cout << "out: " << out_host.GetDesc() << std::endl;
 
     switch(init_method)
     {
@@ -66,56 +67,34 @@ bool run_grouped_conv_fwd(bool do_verification,
         wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
     }
 
-    DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
-    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
-    DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
+    DeviceMem in_device_buf(in.GetMemorySize());
+    DeviceMem wei_device_buf(wei.GetMemorySize());
+    DeviceMem out_device_buf(out_device.GetMemorySize());
 
-    in_device_buf.ToDevice(in.mData.data());
-    wei_device_buf.ToDevice(wei.mData.data());
+    in_device_buf.ToDevice(in.data());
+    wei_device_buf.ToDevice(wei.data());
 
-    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
-    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
-    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
-    std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
-    std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
-    std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
-    std::array<ck::index_t, NDimSpatial> input_left_pads{};
-    std::array<ck::index_t, NDimSpatial> input_right_pads{};
-
-    auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
-
-    copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
-    copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
-    copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
-    copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
-    copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
-    copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
-    copy(conv_param.conv_filter_strides_, conv_filter_strides);
-    copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
-    copy(conv_param.input_left_pads_, input_left_pads);
-    copy(conv_param.input_right_pads_, input_right_pads);
+    using ck::utils::empty_array, ck::utils::to_array;
 
     // do Conv
     auto conv     = DeviceConvNDFwdInstance{};
     auto invoker  = conv.MakeInvoker();
     auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
                                       wei_device_buf.GetDeviceBuffer(),
-                                      std::array<const void*, 0>{},
+                                      empty_array(),
                                       out_device_buf.GetDeviceBuffer(),
-                                      a_g_n_c_wis_lengths,
-                                      a_g_n_c_wis_strides,
-                                      b_g_k_c_xs_lengths,
-                                      b_g_k_c_xs_strides,
-                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
-                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
-                                      e_g_n_k_wos_lengths,
-                                      e_g_n_k_wos_strides,
-                                      conv_filter_strides,
-                                      conv_filter_dilations,
-                                      input_left_pads,
-                                      input_right_pads,
+                                      to_array(in_g_n_c_wis_desc.GetLengths()),
+                                      to_array(in_g_n_c_wis_desc.GetStrides()),
+                                      to_array(wei_g_k_c_xs_desc.GetLengths()),
+                                      to_array(wei_g_k_c_xs_desc.GetStrides()),
+                                      empty_array(),
+                                      empty_array(),
+                                      to_array(out_g_n_k_wos_desc.GetLengths()),
+                                      to_array(out_g_n_k_wos_desc.GetStrides()),
+                                      to_array(conv_param.conv_filter_strides_),
+                                      to_array(conv_param.conv_filter_dilations_),
+                                      to_array(conv_param.input_left_pads_),
+                                      to_array(conv_param.input_right_pads_),
                                       in_element_op,
                                       wei_element_op,
                                       out_element_op);
@@ -161,10 +140,10 @@ bool run_grouped_conv_fwd(bool do_verification,
         ref_invoker.Run(ref_argument);
 
-        out_device_buf.FromDevice(out_device.mData.data());
+        out_device_buf.FromDevice(out_device.data());
 
         return ck::utils::check_err(
-            out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
+            out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
     }
 
     return true;
...
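
Note: empty_array() stands in for both std::array<const void*, 0>{} and std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}} at the same call site, so it presumably returns a value convertible to a zero-length std::array of any element type. A sketch under that assumption (names hypothetical; the real helper is in ck/library/utility/array.hpp):

#include <array>

namespace ck::utils {

// Sketch: a tag convertible to std::array<T, 0> for any T, letting one
// empty_array() expression match parameters of different element types.
struct EmptyArrayTag
{
    template <typename T>
    constexpr operator std::array<T, 0>() const
    {
        return {};
    }
};

constexpr EmptyArrayTag empty_array() { return {}; }

} // namespace ck::utils
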
@@ -16,6 +16,9 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/array.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/convolution_parameter.hpp"
 #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
@@ -23,7 +26,6 @@
 #include "ck/library/utility/fill.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
 
 using BF16 = ck::bhalf_t;
 using FP16 = ck::half_t;
@@ -140,9 +142,7 @@ make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
 {
     std::vector<ck::index_t> dimensions{problem_size.G_, problem_size.N_};
 
-    std::copy(begin(problem_size.output_spatial_lengths_),
-              end(problem_size.output_spatial_lengths_),
-              std::back_inserter(dimensions));
+    ck::ranges::copy(problem_size.output_spatial_lengths_, std::back_inserter(dimensions));
 
     return HostTensorDescriptor(dimensions);
 }
@@ -158,10 +158,3 @@ void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
     assert(size(descriptor.GetStrides()) == size(strides));
     std::copy_n(begin(descriptor.GetStrides()), size(descriptor.GetStrides()), begin(strides));
 }
-
-template <typename Range, typename OutputIterator>
-auto copy(const Range& range, OutputIterator iter)
-    -> decltype(std::copy(std::begin(range), std::end(range), iter))
-{
-    return std::copy(std::begin(range), std::end(range), iter);
-}
...
@@ -77,32 +77,28 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
     {
     case 0: break;
     case 1:
-        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input.begin(),
-                                                                         conv_input.end());
-        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight.begin(),
-                                                                         conv_weight.end());
+        ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input);
+        ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight);
         break;
     default:
-        ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input.begin(), conv_input.end());
-        ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight.begin(),
-                                                             conv_weight.end());
+        ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input);
+        ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight);
     }
 
-    DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize());
-    DeviceMem conv_weight_device_buf(sizeof(BDataType) * conv_weight.mDesc.GetElementSpaceSize());
-    DeviceMem conv_output_device_buf(sizeof(EDataType) *
-                                     conv_output_device.mDesc.GetElementSpaceSize());
-    DeviceMem r0_device_buf(sizeof(R0DataType) * r0_device.mDesc.GetElementSpaceSize());
+    DeviceMem conv_input_device_buf(conv_input.GetMemorySize());
+    DeviceMem conv_weight_device_buf(conv_weight.GetMemorySize());
+    DeviceMem conv_output_device_buf(conv_output_device.GetMemorySize());
+    DeviceMem r0_device_buf(r0_device.GetMemorySize());
 
 #ifdef BUILD_INT4_EXAMPLE
     const Tensor<KernelADataType> conv_input_converted(conv_input);
     const Tensor<KernelBDataType> conv_weight_converted(conv_weight);
 
-    conv_input_device_buf.ToDevice(conv_input_converted.mData.data());
-    conv_weight_device_buf.ToDevice(conv_weight_converted.mData.data());
+    conv_input_device_buf.ToDevice(conv_input_converted.data());
+    conv_weight_device_buf.ToDevice(conv_weight_converted.data());
 #else
-    conv_input_device_buf.ToDevice(conv_input.mData.data());
-    conv_weight_device_buf.ToDevice(conv_weight.mData.data());
+    conv_input_device_buf.ToDevice(conv_input.data());
+    conv_weight_device_buf.ToDevice(conv_weight.data());
 #endif
 
     std::array<ck::index_t, NDimSpatial + 3> conv_input_g_n_c_wis_lengths{},
@@ -112,8 +108,6 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
     std::array<ck::index_t, NDimSpatial + 3> conv_output_g_n_k_wos_lengths{},
         conv_output_g_n_k_wos_strides{};
     std::array<ck::index_t, NDimSpatial + 2> r0_lengths{}, r0_strides{};
-    std::array<ck::index_t, NDimSpatial> conv_filter_strides{}, conv_filter_dilations{};
-    std::array<ck::index_t, NDimSpatial> input_left_pads{}, input_right_pads{};
 
     unpack_host_tensor_descriptor(
         conv_input_g_n_c_wis_desc, conv_input_g_n_c_wis_lengths, conv_input_g_n_c_wis_strides);
@@ -123,33 +117,30 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
         conv_output_g_n_k_wos_desc, conv_output_g_n_k_wos_lengths, conv_output_g_n_k_wos_strides);
     unpack_host_tensor_descriptor(r0_desc, r0_lengths, r0_strides);
 
-    copy(problem_size.conv_filter_strides_, begin(conv_filter_strides));
-    copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations));
-    copy(problem_size.input_left_pads_, begin(input_left_pads));
-    copy(problem_size.input_right_pads_, begin(input_right_pads));
+    using ck::utils::empty_array, ck::utils::to_array;
 
     // run Conv + Reduction on device
     auto conv     = DeviceInstance<NDimSpatial>{};
     auto invoker  = conv.MakeInvoker();
     auto argument = conv.MakeArgument(conv_input_device_buf.GetDeviceBuffer(),
                                       conv_weight_device_buf.GetDeviceBuffer(),
-                                      std::array<const void*, 0>{},
+                                      empty_array(),
                                       conv_output_device_buf.GetDeviceBuffer(),
                                       {r0_device_buf.GetDeviceBuffer()},
                                       conv_input_g_n_c_wis_lengths,
                                       conv_input_g_n_c_wis_strides,
                                       conv_weight_g_k_c_xs_lengths,
                                       conv_weight_g_k_c_xs_strides,
-                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
-                                      std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
+                                      empty_array(),
+                                      empty_array(),
                                       conv_output_g_n_k_wos_lengths,
                                       conv_output_g_n_k_wos_strides,
                                       r0_lengths,
                                       r0_strides,
-                                      conv_filter_strides,
-                                      conv_filter_dilations,
-                                      input_left_pads,
-                                      input_right_pads,
+                                      to_array(problem_size.conv_filter_strides_),
+                                      to_array(problem_size.conv_filter_dilations_),
+                                      to_array(problem_size.input_left_pads_),
+                                      to_array(problem_size.input_right_pads_),
                                       AElementOp{},
                                       BElementOp{},
                                       CDEElementOp{},
@@ -194,11 +185,11 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
         ref_invoker.Run(ref_argument);
 
-        Tensor<R0DataType> r0_host(r0_device.mDesc);
+        Tensor<R0DataType> r0_host(r0_device.GetDesc());
 
         auto reduce0_op = RsThreadReduceOp{}[ck::Number<0>{}];
 
-        auto& output_dims = conv_output_g_n_k_wos_desc.GetLengths();
+        auto output_dims = conv_output_g_n_k_wos_desc.GetLengths();
 
         if constexpr(NDimSpatial == 1)
         {
@@ -273,19 +264,16 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
            }
        }
 
-        conv_output_device_buf.FromDevice(conv_output_device.mData.data());
-        r0_device_buf.FromDevice(r0_device.mData.data());
+        conv_output_device_buf.FromDevice(conv_output_device.data());
+        r0_device_buf.FromDevice(r0_device.data());
 
-        return ck::utils::check_err(conv_output_device.mData,
-                                    conv_output_host.mData,
+        return ck::utils::check_err(conv_output_device,
+                                    conv_output_host,
                                     "Error: incorrect results! (Matrix E)",
                                     1e-5f,
                                     1e-4f) &&
-               ck::utils::check_err(r0_device.mData,
-                                    r0_host.mData,
-                                    "Error: incorrect results! (Matrix R0)",
-                                    1e-5f,
-                                    1e-4f);
+               ck::utils::check_err(
+                   r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-5f, 1e-4f);
     }
 
     return true;
...
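
Note: ck::ranges::copy takes over from the file-local copy helper deleted above; that removed template was essentially the same forwarding wrapper, now shared through the new utility headers (algorithm.hpp/ranges.hpp). A sketch matching the removed helper:

#include <algorithm>
#include <iterator>

namespace ck::ranges {

// Sketch mirroring the deleted file-local helper: forward a whole range
// to std::copy so call sites stop spelling begin()/end() by hand.
template <typename Range, typename OutputIterator>
auto copy(Range&& range, OutputIterator iter)
    -> decltype(std::copy(std::begin(range), std::end(range), iter))
{
    return std::copy(std::begin(range), std::end(range), iter);
}

} // namespace ck::ranges
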
@@ -7,15 +7,17 @@
 #include "ck/ck.hpp"
 #include "ck/utility/reduction_enums.hpp"
-#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
+#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 
+#include "ck/library/utility/algorithm.hpp"
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
 #include "ck/library/utility/host_common_util.hpp"
 #include "ck/library/utility/host_reduction.hpp"
+#include "ck/library/utility/ranges.hpp"
 
 #include "reduce_example_common.hpp"
 
@@ -156,11 +158,11 @@ int reduce_blockwise_impl(bool do_verification,
     Tensor<int> out_indices_ref(outLengths);
     Tensor<int> out_indices(outLengths);
 
-    auto inStrides  = in.mDesc.GetStrides();
-    auto outStrides = out.mDesc.GetStrides();
+    auto inStrides  = in.GetStrides();
+    auto outStrides = out.GetStrides();
 
-    size_t invariant_total_length = out.mDesc.GetElementSize();
-    size_t reduce_total_length    = in.mDesc.GetElementSize() / invariant_total_length;
+    size_t invariant_total_length = out.GetElementSize();
+    size_t reduce_total_length    = in.GetElementSize() / invariant_total_length;
 
     std::size_t num_thread = 1;
 
@@ -187,42 +189,43 @@ int reduce_blockwise_impl(bool do_verification,
        }
 
        if(beta != 0.0f)
-            for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
-                out.mData[i] = out_ref.mData[i];
+        {
+            ck::ranges::copy(out_ref, out.begin());
+        }
    };
 
     // these buffers are usually provided by the user application
-    DeviceMem in_dev(sizeof(InOutDataTypeInDevice) * in.mDesc.GetElementSpaceSize());
-    DeviceMem out_dev(sizeof(InOutDataTypeInDevice) * out.mDesc.GetElementSpaceSize());
+    DeviceMem in_dev(in.GetMemorySize());
+    DeviceMem out_dev(out.GetMemorySize());
 
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
     if(std::is_same<InOutDataType, int4_t>::value)
     {
-        std::vector<InOutDataTypeInDevice> tmp_buf(in.mData.size());
+        std::vector<InOutDataTypeInDevice> tmp_buf(in.size());
 
-        std::copy_n(in.mData.data(), in.mData.size(), tmp_buf.data());
+        std::copy_n(in.data(), in.size(), tmp_buf.data());
 
         in_dev.ToDevice(tmp_buf.data());
     }
     else
 #endif
-        in_dev.ToDevice(in.mData.data());
+        in_dev.ToDevice(in.data());
 
     if(beta != 0.0f)
     {
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
        if(std::is_same<InOutDataType, int4_t>::value)
        {
-            std::vector<InOutDataTypeInDevice> tmp_buf(in.mData.size());
+            std::vector<InOutDataTypeInDevice> tmp_buf(in.size());
 
-            std::copy_n(out.mData.data(), out.mData.size(), tmp_buf.data());
+            std::copy_n(out.data(), out.size(), tmp_buf.data());
 
            out_dev.ToDevice(tmp_buf.data());
        }
        else
 #endif
-            out_dev.ToDevice(out.mData.data());
+            out_dev.ToDevice(out.data());
     };
 
-    size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;
+    size_t indicesSizeInBytes = OutputIndex ? out.GetElementSize() * sizeof(int32_t) : 0;
 
     DeviceMem out_index_dev(indicesSizeInBytes);
 
@@ -245,33 +248,25 @@ int reduce_blockwise_impl(bool do_verification,
                    NumReduceDim,
                    PropagateNan,
                    OutputIndex>
-            hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
+            hostReduce(in.GetDesc(), out_ref.GetDesc(), invariantDims, reduceDims);
 
        hostReduce.Run(alpha,
-                       in.mData.data(),
+                       in.data(),
                       beta,
-                       out_ref.mData.data(),
-                       out_indices_ref.mData.data(),
+                       out_ref.data(),
+                       out_indices_ref.data(),
                       in_elementwise_op,
                       acc_elementwise_op);
    };
 
-    std::vector<ck::index_t> i_inLengths;
-    std::vector<ck::index_t> i_inStrides;
-    std::vector<ck::index_t> i_outLengths;
-    std::vector<ck::index_t> i_outStrides;
-
-    i_inLengths.assign(inLengths.begin(), inLengths.end());
-    i_inStrides.assign(inStrides.begin(), inStrides.end());
-    i_outLengths.assign(outLengths.begin(), outLengths.end());
-    i_outStrides.assign(outStrides.begin(), outStrides.end());
+    using Indices = std::vector<ck::index_t>;
 
     auto reduce = DeviceReduceInstance{};
 
-    auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
-                                                   i_inStrides,
-                                                   i_outLengths,
-                                                   i_outStrides,
+    auto argument_ptr = reduce.MakeArgumentPointer(ck::ranges::to<Indices>(inLengths),
+                                                   ck::ranges::to<Indices>(inStrides),
+                                                   ck::ranges::to<Indices>(outLengths),
+                                                   ck::ranges::to<Indices>(outStrides),
                                                    reduceDims,
                                                    alpha,
                                                    beta,
@@ -312,22 +307,22 @@ int reduce_blockwise_impl(bool do_verification,
 #ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
        if(std::is_same<InOutDataType, int4_t>::value)
        {
-            std::vector<InOutDataTypeInDevice> tmp_buf(out.mData.size());
+            std::vector<InOutDataTypeInDevice> tmp_buf(out.size());
 
            out_dev.FromDevice(tmp_buf.data());
 
-            std::copy_n(tmp_buf.data(), out.mData.size(), out.mData.data());
+            std::copy_n(tmp_buf.data(), out.size(), out.data());
        }
        else
 #endif
-            out_dev.FromDevice(out.mData.data());
+            out_dev.FromDevice(out.data());
 
-        pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
+        pass = pass && ck::utils::check_err(out, out_ref);
 
        if(OutputIndex)
        {
-            out_index_dev.FromDevice(out_indices.mData.data());
+            out_index_dev.FromDevice(out_indices.data());
 
-            pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
+            pass = pass && ck::utils::check_err(out_indices, out_indices_ref);
        };
    };
...
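
Note: ck::ranges::to<Indices>(...) collapses the declare-then-assign boilerplate into one expression, converting the std::size_t lengths/strides to ck::index_t on the way. It reads like C++23's std::ranges::to; a minimal sketch under that assumption:

#include <iterator>
#include <vector>

namespace ck::ranges {

// Sketch modeled on C++23 std::ranges::to: materialize a range into the
// requested container via its iterator-pair constructor, converting
// element types (e.g. std::size_t -> ck::index_t) along the way.
template <typename Container, typename Range>
Container to(Range&& range)
{
    return Container(std::begin(range), std::end(range));
}

} // namespace ck::ranges

With Indices = std::vector<ck::index_t>, ck::ranges::to<Indices>(inLengths) replaces a vector declaration plus an assign(begin, end) pair.
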
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iostream> #include <iostream>
#include <numeric> #include <numeric>
#include <sstream> #include <sstream>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h> #include <getopt.h>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp" #include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" #include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp" #include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_reduction.hpp" #include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/ranges.hpp"
using namespace ck; using namespace ck;
using namespace ck::tensor_operation::device; using namespace ck::tensor_operation::device;
...@@ -139,12 +142,12 @@ int main(int argc, char* argv[]) ...@@ -139,12 +142,12 @@ int main(int argc, char* argv[])
Tensor<InOutDataType> in_2(inLengths_2); // also the output tensor of the first reduction Tensor<InOutDataType> in_2(inLengths_2); // also the output tensor of the first reduction
Tensor<InOutDataType> out(outLengths); Tensor<InOutDataType> out(outLengths);
auto inStrides_1 = in_1.mDesc.GetStrides(); auto inStrides_1 = in_1.GetStrides();
auto inStrides_2 = in_2.mDesc.GetStrides(); auto inStrides_2 = in_2.GetStrides();
auto outStrides = out.mDesc.GetStrides(); auto outStrides = out.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize(); size_t invariant_total_length = out.GetElementSize();
size_t reduce_total_length = in_1.mDesc.GetElementSize() / invariant_total_length; size_t reduce_total_length = in_1.GetElementSize() / invariant_total_length;
std::size_t num_thread = 1; std::size_t num_thread = 1;
...@@ -171,18 +174,19 @@ int main(int argc, char* argv[]) ...@@ -171,18 +174,19 @@ int main(int argc, char* argv[])
} }
if(beta != 0.0f) if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++) {
out.mData[i] = out_ref.mData[i]; ck::ranges::copy(out_ref, out.begin());
}
}; };
DeviceMem in_1_dev(sizeof(InOutDataType) * in_1.mDesc.GetElementSpaceSize()); DeviceMem in_1_dev(in_1.GetMemorySize());
DeviceMem in_2_dev(sizeof(InOutDataType) * in_2.mDesc.GetElementSpaceSize()); DeviceMem in_2_dev(in_2.GetMemorySize());
DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize()); DeviceMem out_dev(out.GetMemorySize());
in_1_dev.ToDevice(in_1.mData.data()); in_1_dev.ToDevice(in_1.data());
if(beta != 0.0f) if(beta != 0.0f)
out_dev.ToDevice(out.mData.data()); out_dev.ToDevice(out.data());
InElementwiseOperation in_elementwise_op; InElementwiseOperation in_elementwise_op;
AccElementwiseOperation acc_elementwise_op; AccElementwiseOperation acc_elementwise_op;
...@@ -203,37 +207,25 @@ int main(int argc, char* argv[]) ...@@ -203,37 +207,25 @@ int main(int argc, char* argv[])
2, // NumReduceDim 2, // NumReduceDim
PropagateNan, PropagateNan,
OutputIndex> OutputIndex>
hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims); hostReduce(in_1.GetDesc(), out_ref.GetDesc(), invariantDims, reduceDims);
hostReduce.Run(alpha, hostReduce.Run(alpha,
in_1.mData.data(), in_1.data(),
beta, beta,
out_ref.mData.data(), out_ref.data(),
nullptr, nullptr,
in_elementwise_op, in_elementwise_op,
acc_elementwise_op); acc_elementwise_op);
}; };
std::vector<ck::index_t> i_inLengths_1; using Indices = std::vector<ck::index_t>;
std::vector<ck::index_t> i_inStrides_1;
std::vector<ck::index_t> i_inLengths_2;
std::vector<ck::index_t> i_inStrides_2;
std::vector<ck::index_t> i_outLengths;
std::vector<ck::index_t> i_outStrides;
i_inLengths_1.assign(inLengths_1.begin(), inLengths_1.end());
i_inStrides_1.assign(inStrides_1.begin(), inStrides_1.end());
i_inLengths_2.assign(inLengths_2.begin(), inLengths_2.end());
i_inStrides_2.assign(inStrides_2.begin(), inStrides_2.end());
i_outLengths.assign(outLengths.begin(), outLengths.end());
i_outStrides.assign(outStrides.begin(), outStrides.end());
auto reduce_1 = DeviceReduceInstance_1{}; auto reduce_1 = DeviceReduceInstance_1{};
auto argument_ptr_1 = reduce_1.MakeArgumentPointer(i_inLengths_1, auto argument_ptr_1 = reduce_1.MakeArgumentPointer(ck::ranges::to<Indices>(inLengths_1),
i_inStrides_1, ck::ranges::to<Indices>(inStrides_1),
i_inLengths_2, ck::ranges::to<Indices>(inLengths_2),
i_inStrides_2, ck::ranges::to<Indices>(inStrides_2),
reduceDims_1, reduceDims_1,
1.0f, 1.0f,
0.0f, 0.0f,
...@@ -255,10 +247,10 @@ int main(int argc, char* argv[]) ...@@ -255,10 +247,10 @@ int main(int argc, char* argv[])
auto reduce_2 = DeviceReduceInstance_2{}; auto reduce_2 = DeviceReduceInstance_2{};
auto argument_ptr_2 = reduce_2.MakeArgumentPointer(i_inLengths_2, auto argument_ptr_2 = reduce_2.MakeArgumentPointer(ck::ranges::to<Indices>(inLengths_2),
i_inStrides_2, ck::ranges::to<Indices>(inStrides_2),
i_outLengths, ck::ranges::to<Indices>(outLengths),
i_outStrides, ck::ranges::to<Indices>(outStrides),
reduceDims_2, reduceDims_2,
alpha, alpha,
beta, beta,
...@@ -293,8 +285,8 @@ int main(int argc, char* argv[]) ...@@ -293,8 +285,8 @@ int main(int argc, char* argv[])
if(do_verify) if(do_verify)
{ {
out_dev.FromDevice(out.mData.data()); out_dev.FromDevice(out.data());
pass = pass && ck::utils::check_err(out.mData, out_ref.mData); pass = pass && ck::utils::check_err(out, out_ref);
}; };
return (pass ? 0 : 1); return (pass ? 0 : 1);
......
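The reduction example above replaces six hand-rolled i_* index vectors (declare, then assign(begin, end)) with ck::ranges::to<Indices>(...) conversions at the call site. Below is a minimal sketch of such a conversion helper, assuming only the iterator-pair constructor of the destination container; the names sketch::to and the use of int as the index type are illustrative, and the real ck::ranges::to may be constrained or implemented differently.

#include <cstddef>
#include <iterator>
#include <vector>

namespace sketch {
// Convert any iterable range into the destination container Dst, narrowing
// element types (e.g. std::size_t lengths/strides to an index type) on the way.
template <typename Dst, typename Src>
Dst to(const Src& src)
{
    return Dst(std::begin(src), std::end(src));
}
} // namespace sketch

int main()
{
    using Indices = std::vector<int>; // stands in for std::vector<ck::index_t>
    std::vector<std::size_t> inLengths{16, 64, 32};
    Indices i = sketch::to<Indices>(inLengths); // replaces the assign(begin, end) boilerplate
    return i.size() == 3 ? 0 : 1;
}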
...@@ -7,15 +7,17 @@ ...@@ -7,15 +7,17 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp" #include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp" #include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp" #include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_reduction.hpp" #include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/ranges.hpp"
#include "reduce_example_common.hpp" #include "reduce_example_common.hpp"
...@@ -95,11 +97,11 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, ...@@ -95,11 +97,11 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
Tensor<InOutDataType> out_ref(outLengths); Tensor<InOutDataType> out_ref(outLengths);
Tensor<InOutDataType> out(outLengths); Tensor<InOutDataType> out(outLengths);
auto inStrides = in.mDesc.GetStrides(); auto inStrides = in.GetStrides();
auto outStrides = out.mDesc.GetStrides(); auto outStrides = out.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize(); size_t invariant_total_length = out.GetElementSize();
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length; size_t reduce_total_length = in.GetElementSize() / invariant_total_length;
std::size_t num_thread = 1; std::size_t num_thread = 1;
...@@ -126,18 +128,19 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, ...@@ -126,18 +128,19 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
} }
if(beta != 0.0f) if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++) {
out.mData[i] = out_ref.mData[i]; ck::ranges::copy(out_ref, out.begin());
}
}; };
// these buffers are usually provided by the user application // these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InOutDataType) * in.mDesc.GetElementSpaceSize()); DeviceMem in_dev(in.GetMemorySize());
DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize()); DeviceMem out_dev(out.GetMemorySize());
in_dev.ToDevice(in.mData.data()); in_dev.ToDevice(in.data());
if(beta != 0.0f) if(beta != 0.0f)
out_dev.ToDevice(out.mData.data()); out_dev.ToDevice(out.data());
InElementwiseOperation in_elementwise_op; InElementwiseOperation in_elementwise_op;
AccElementwiseOperation acc_elementwise_op; AccElementwiseOperation acc_elementwise_op;
...@@ -158,33 +161,20 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, ...@@ -158,33 +161,20 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
NumReduceDim, NumReduceDim,
PropagateNan, PropagateNan,
false> false>
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); hostReduce(in.GetDesc(), out_ref.GetDesc(), invariantDims, reduceDims);
hostReduce.Run(alpha, hostReduce.Run(
in.mData.data(), alpha, in.data(), beta, out_ref.data(), nullptr, in_elementwise_op, acc_elementwise_op);
beta,
out_ref.mData.data(),
nullptr,
in_elementwise_op,
acc_elementwise_op);
}; };
std::vector<ck::index_t> i_inLengths; using Indices = std::vector<ck::index_t>;
std::vector<ck::index_t> i_inStrides;
std::vector<ck::index_t> i_outLengths;
std::vector<ck::index_t> i_outStrides;
i_inLengths.assign(inLengths.begin(), inLengths.end());
i_inStrides.assign(inStrides.begin(), inStrides.end());
i_outLengths.assign(outLengths.begin(), outLengths.end());
i_outStrides.assign(outStrides.begin(), outStrides.end());
auto reduce = DeviceReduceInstance{}; auto reduce = DeviceReduceInstance{};
auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths, auto argument_ptr = reduce.MakeArgumentPointer(ck::ranges::to<Indices>(inLengths),
i_inStrides, ck::ranges::to<Indices>(inStrides),
i_outLengths, ck::ranges::to<Indices>(outLengths),
i_outStrides, ck::ranges::to<Indices>(outStrides),
reduceDims, reduceDims,
alpha, alpha,
beta, beta,
...@@ -222,8 +212,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, ...@@ -222,8 +212,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
if(do_verification) if(do_verification)
{ {
out_dev.FromDevice(out.mData.data()); out_dev.FromDevice(out.data());
pass = pass && ck::utils::check_err(out.mData, out_ref.mData); pass = pass && ck::utils::check_err(out, out_ref);
}; };
return (pass ? 0 : 1); return (pass ? 0 : 1);
......
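Both reduction examples also swap the element-by-element beta copy loop for ck::ranges::copy(out_ref, out.begin()). A plausible one-line wrapper is sketched below, assuming it simply forwards to std::copy over the source range; the actual ck utility may differ.

#include <algorithm>
#include <iterator>
#include <vector>

namespace sketch {
// Range-in, iterator-out copy: spares the caller the begin()/end() pair.
template <typename SrcRange, typename OutIt>
OutIt copy(const SrcRange& src, OutIt out)
{
    return std::copy(std::begin(src), std::end(src), out);
}
} // namespace sketch

int main()
{
    std::vector<float> out_ref{1.f, 2.f, 3.f};
    std::vector<float> out(out_ref.size());
    sketch::copy(out_ref, out.begin()); // replaces the indexed mData loop
    return out == out_ref ? 0 : 1;
}

Besides being shorter, the range form cannot get the loop bound wrong, since it copies exactly the source range.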
...@@ -8,14 +8,16 @@ ...@@ -8,14 +8,16 @@
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp" #include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp" #include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp" #include "ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp" #include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
template <typename InDataType, template <typename InDataType,
typename OutDataType, typename OutDataType,
...@@ -56,8 +58,8 @@ static void pool_host_verify(const Tensor<InDataType>& in, ...@@ -56,8 +58,8 @@ static void pool_host_verify(const Tensor<InDataType>& in,
for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x) for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x)
{ {
ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1]; ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1];
if(hi >= 0 && hi < static_cast<ck::index_t>(in.mDesc.GetLengths()[2]) && if(hi >= 0 && hi < static_cast<ck::index_t>(in.GetLengths()[2]) && wi >= 0 &&
wi >= 0 && wi < static_cast<ck::index_t>(in.mDesc.GetLengths()[3])) wi < static_cast<ck::index_t>(in.GetLengths()[3]))
{ {
AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi)); AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
...@@ -74,10 +76,10 @@ static void pool_host_verify(const Tensor<InDataType>& in, ...@@ -74,10 +76,10 @@ static void pool_host_verify(const Tensor<InDataType>& in,
}; };
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0], out.GetLengths()[0],
out.mDesc.GetLengths()[1], out.GetLengths()[1],
out.mDesc.GetLengths()[2], out.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); out.GetLengths()[3])(std::thread::hardware_concurrency());
} }
else else
{ {
...@@ -95,8 +97,7 @@ static void pool_host_verify(const Tensor<InDataType>& in, ...@@ -95,8 +97,7 @@ static void pool_host_verify(const Tensor<InDataType>& in,
for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x) for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x)
{ {
ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1]; ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 && if(hi >= 0 && hi < in.GetLengths()[2] && wi >= 0 && wi < in.GetLengths()[3])
wi < in.mDesc.GetLengths()[3])
{ {
AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi)); AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
IndexDataType currIndex = y * window_spatial_lengths[1] + x; IndexDataType currIndex = y * window_spatial_lengths[1] + x;
...@@ -115,10 +116,10 @@ static void pool_host_verify(const Tensor<InDataType>& in, ...@@ -115,10 +116,10 @@ static void pool_host_verify(const Tensor<InDataType>& in,
}; };
make_ParallelTensorFunctor(f_nchw, make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0], out.GetLengths()[0],
out.mDesc.GetLengths()[1], out.GetLengths()[1],
out.mDesc.GetLengths()[2], out.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency()); out.GetLengths()[3])(std::thread::hardware_concurrency());
}; };
} }
...@@ -169,19 +170,18 @@ bool pool_test(bool do_verification, ...@@ -169,19 +170,18 @@ bool pool_test(bool do_verification,
const std::array<ck::index_t, 2> input_left_pads{{in_left_pad_h, in_left_pad_w}}; const std::array<ck::index_t, 2> input_left_pads{{in_left_pad_h, in_left_pad_w}};
const std::array<ck::index_t, 2> input_right_pads{{in_right_pad_h, in_right_pad_w}}; const std::array<ck::index_t, 2> input_right_pads{{in_right_pad_h, in_right_pad_w}};
using namespace ck::literals;
// tensor layout // tensor layout
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) { [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value) if constexpr(ck::is_same_v<decltype(layout), ck::tensor_layout::convolution::NCHW>)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}), return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
} }
else if constexpr(ck::is_same<decltype(layout), else if constexpr(ck::is_same_v<decltype(layout), ck::tensor_layout::convolution::NHWC>)
ck::tensor_layout::convolution::NHWC>::value)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}), return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
} }
}; };
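The descriptor lambda just above now dispatches on layout with if constexpr and the is_same_v trait shorthand instead of a plain runtime if. A reduced stand-alone sketch of the same dispatch pattern follows; the NCHW/NHWC stand-in types and strides_for are illustrative, not the ck API.

#include <cstddef>
#include <type_traits>
#include <vector>

struct NCHW {}; // stand-ins for ck::tensor_layout::convolution::{NCHW, NHWC}
struct NHWC {};

template <typename Layout>
std::vector<std::size_t> strides_for(std::size_t C, std::size_t H, std::size_t W)
{
    // if constexpr discards the untaken branch at compile time, so only the
    // stride formula for the instantiated layout is ever emitted.
    if constexpr(std::is_same_v<Layout, NCHW>)
        return {C * H * W, H * W, W, 1};
    else
        return {C * H * W, 1, W * C, C};
}

int main()
{
    auto s = strides_for<NHWC>(3, 4, 5); // channel-last: stride 1 along C
    return s[1] == 1 ? 0 : 1;
}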
...@@ -193,8 +193,8 @@ bool pool_test(bool do_verification, ...@@ -193,8 +193,8 @@ bool pool_test(bool do_verification,
Tensor<IndexDataType> out_indices_n_c_ho_wo_device( Tensor<IndexDataType> out_indices_n_c_ho_wo_device(
f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{})); f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.GetDesc() << std::endl;
std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl; std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.GetDesc() << std::endl;
switch(init_method) switch(init_method)
{ {
...@@ -204,25 +204,24 @@ bool pool_test(bool do_verification, ...@@ -204,25 +204,24 @@ bool pool_test(bool do_verification,
default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0}); default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
} }
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize()); DeviceMem in_device_buf(in_n_c_hi_wi.GetMemorySize());
DeviceMem out_device_buf(sizeof(OutDataType) * DeviceMem out_device_buf(out_n_c_ho_wo_device.GetMemorySize());
out_n_c_ho_wo_device.mDesc.GetElementSpaceSize()); DeviceMem out_indices_device_buf(out_indices_n_c_ho_wo_device.GetMemorySize());
DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
out_indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize()); in_device_buf.ToDevice(in_n_c_hi_wi.data());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); using ck::utils::to_array;
auto pool = DevicePoolFwdInstance{}; auto pool = DevicePoolFwdInstance{};
auto invoker_ptr = pool.MakeInvokerPointer(); auto invoker_ptr = pool.MakeInvokerPointer();
auto argument_ptr = pool.MakeArgumentPointer( auto argument_ptr = pool.MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()), out_device_buf.GetDeviceBuffer(),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()), out_indices_device_buf.GetDeviceBuffer(),
static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
N, N,
C, C,
std::array<ck::index_t, 2>{{Hi, Wi}}, to_array({Hi, Wi}),
std::array<ck::index_t, 2>{{Y, X}}, to_array({Y, X}),
std::array<ck::index_t, 2>{{Ho, Wo}}, to_array({Ho, Wo}),
window_strides, window_strides,
input_left_pads, input_left_pads,
input_right_pads); input_right_pads);
...@@ -265,16 +264,16 @@ bool pool_test(bool do_verification, ...@@ -265,16 +264,16 @@ bool pool_test(bool do_verification,
input_left_pads, input_left_pads,
input_right_pads); input_right_pads);
out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data()); out_device_buf.FromDevice(out_n_c_ho_wo_device.data());
pass = pass && ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData); pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host);
if constexpr(OutputIndex) if constexpr(OutputIndex)
{ {
out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data()); out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.data());
pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device.mData, pass = pass &&
out_indices_n_c_ho_wo_host.mData); ck::utils::check_err(out_indices_n_c_ho_wo_device, out_indices_n_c_ho_wo_host);
}; };
} }
......
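The pool argument list above packs the spatial pairs with to_array({Hi, Wi}) instead of spelling out std::array<ck::index_t, 2>{{Hi, Wi}}. Assuming ck::utils::to_array mirrors C++20 std::to_array (deducing the element type and extent from a braced list bound to a built-in array), a sketch:

#include <array>
#include <cstddef>

namespace sketch {
// Deduce both T and N from a braced initializer bound to a built-in array.
template <typename T, std::size_t N>
constexpr std::array<T, N> to_array(const T (&src)[N])
{
    std::array<T, N> result{};
    for(std::size_t i = 0; i < N; ++i)
        result[i] = src[i];
    return result;
}
} // namespace sketch

int main()
{
    const int Hi = 32, Wi = 64;
    auto spatial = sketch::to_array({Hi, Wi}); // std::array<int, 2>{32, 64}
    return spatial[1] == 64 ? 0 : 1;
}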
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream> #include <iostream>
#include <numeric> #include <numeric>
#include <initializer_list> #include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp" #include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
...@@ -12,11 +12,12 @@ ...@@ -12,11 +12,12 @@
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp" #include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" #include "ck/library/utility/literals.hpp"
#include "ck/library/utility/check_err.hpp"
struct RequantReluRequant struct RequantReluRequant
{ {
...@@ -155,17 +156,17 @@ int main(int argc, char* argv[]) ...@@ -155,17 +156,17 @@ int main(int argc, char* argv[])
exit(0); exit(0);
} }
using namespace ck::literals;
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value) if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -174,9 +175,9 @@ int main(int argc, char* argv[]) ...@@ -174,9 +175,9 @@ int main(int argc, char* argv[])
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl; std::cout << "c_m_n: " << c_m_n_host_result.GetDesc() << std::endl;
switch(init_method) switch(init_method)
{ {
...@@ -190,12 +191,12 @@ int main(int argc, char* argv[]) ...@@ -190,12 +191,12 @@ int main(int argc, char* argv[])
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}); b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
} }
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); DeviceMem a_m_k_device_buf(a_m_k.GetMemorySize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); DeviceMem b_k_n_device_buf(b_k_n.GetMemorySize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); DeviceMem c_m_n_device_buf(c_m_n_device_result.GetMemorySize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data()); a_m_k_device_buf.ToDevice(a_m_k.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data()); b_k_n_device_buf.ToDevice(b_k_n.data());
auto a_element_op = PassThrough{}; auto a_element_op = PassThrough{};
auto b_element_op = PassThrough{}; auto b_element_op = PassThrough{};
...@@ -204,9 +205,9 @@ int main(int argc, char* argv[]) ...@@ -204,9 +205,9 @@ int main(int argc, char* argv[])
// do GEMM // do GEMM
auto gemm = DeviceGemmInstance{}; auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker(); auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()), auto argument = gemm.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()), b_k_n_device_buf.GetDeviceBuffer(),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()), c_m_n_device_buf.GetDeviceBuffer(),
M, M,
N, N,
K, K,
...@@ -237,7 +238,7 @@ int main(int argc, char* argv[]) ...@@ -237,7 +238,7 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl; << gemm.GetTypeString() << std::endl;
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); c_m_n_device_buf.FromDevice(c_m_n_device_result.data());
if(do_verification) if(do_verification)
{ {
...@@ -249,7 +250,7 @@ int main(int argc, char* argv[]) ...@@ -249,7 +250,7 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument); ref_invoker.Run(ref_argument);
return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1; return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result) ? 0 : 1;
} }
return 0; return 0;
......
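This example, like the descriptor lambdas throughout the commit, pulls in ck::literals for the 1_uz suffix, so initializer lists such as {stride, 1_uz} stay homogeneously std::size_t instead of mixing std::size_t and int. A sketch of such a literal, assuming it is a plain size_t suffix; the sketch_literals namespace is illustrative.

#include <cstddef>
#include <initializer_list>

namespace sketch_literals {
constexpr std::size_t operator""_uz(unsigned long long v)
{
    return static_cast<std::size_t>(v);
}
} // namespace sketch_literals

int main()
{
    using namespace sketch_literals;
    std::size_t stride = 128;
    // Both elements are std::size_t, so brace deduction of the
    // initializer list has a single element type to work with.
    auto dims = {stride, 1_uz};
    return dims.size() == 2 ? 0 : 1;
}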
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream> #include "common.hpp"
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = BF16; using ADataType = BF16;
using BDataType = BF16; using BDataType = BF16;
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream> #include "common.hpp"
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = F16; using ADataType = F16;
using BDataType = F16; using BDataType = F16;
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream> #include "common.hpp"
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = F32; using ADataType = F32;
using BDataType = F32; using BDataType = F32;
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream> #include "common.hpp"
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = ck::int4_t; using ADataType = ck::int4_t;
using BDataType = ck::int4_t; using BDataType = ck::int4_t;
......
// SPDX-License-Identifier: MIT // SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream> #include "common.hpp"
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ADataType = int8_t; using ADataType = int8_t;
using BDataType = int8_t; using BDataType = int8_t;
......
...@@ -50,17 +50,17 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co ...@@ -50,17 +50,17 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
gemm_descs.push_back({M, N, K, stride_A, stride_B, stride_C, {}}); gemm_descs.push_back({M, N, K, stride_A, stride_B, stride_C, {}});
} }
using namespace ck::literals;
auto f_host_tensor_descriptor = auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) { [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value) if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {stride, 1_uz});
std::vector<std::size_t>({stride, 1}));
} }
else else
{ {
return HostTensorDescriptor(std::vector<std::size_t>({row, col}), return HostTensorDescriptor({row, col}, {1_uz, stride});
std::vector<std::size_t>({1, stride}));
} }
}; };
...@@ -90,27 +90,27 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co ...@@ -90,27 +90,27 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
for(std::size_t i = 0; i < gemm_descs.size(); i++) for(std::size_t i = 0; i < gemm_descs.size(); i++)
{ {
a_tensors.push_back(Tensor<ADataType>(f_host_tensor_descriptor( a_tensors.emplace_back(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].K_, gemm_descs[i].stride_A_, ALayout{}))); gemm_descs[i].M_, gemm_descs[i].K_, gemm_descs[i].stride_A_, ALayout{}));
b_tensors.push_back(Tensor<BDataType>(f_host_tensor_descriptor( b_tensors.emplace_back(f_host_tensor_descriptor(
gemm_descs[i].K_, gemm_descs[i].N_, gemm_descs[i].stride_B_, BLayout{}))); gemm_descs[i].K_, gemm_descs[i].N_, gemm_descs[i].stride_B_, BLayout{}));
c_host_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor( c_host_tensors.emplace_back(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{}))); gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{}));
#ifdef BUILD_INT4_EXAMPLE #ifdef BUILD_INT4_EXAMPLE
c_device_tensors.push_back(Tensor<KernelEDataType>(f_host_tensor_descriptor( c_device_tensors.emplace_back(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{}))); gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{}));
#else #else
c_device_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor( c_device_tensors.emplace_back(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{}))); gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{}));
#endif #endif
std::cout << "gemm[" << i << "] a_m_k: " << a_tensors[i].mDesc std::cout << "gemm[" << i << "] a_m_k: " << a_tensors[i].GetDesc()
<< " b_k_n: " << b_tensors[i].mDesc << " c_m_n: " << c_device_tensors[i].mDesc << " b_k_n: " << b_tensors[i].GetDesc()
<< std::endl; << " c_m_n: " << c_device_tensors[i].GetDesc() << std::endl;
flop += std::size_t(2) * gemm_descs[i].M_ * gemm_descs[i].K_ * gemm_descs[i].N_; flop += std::size_t(2) * gemm_descs[i].M_ * gemm_descs[i].K_ * gemm_descs[i].N_;
num_btype += sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize() + num_btype += sizeof(ADataType) * a_tensors[i].GetElementSize() +
sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize() + sizeof(BDataType) * b_tensors[i].GetElementSize() +
sizeof(EDataType) * c_device_tensors[i].mDesc.GetElementSize(); sizeof(EDataType) * c_device_tensors[i].GetElementSize();
switch(config.init_method) switch(config.init_method)
{ {
...@@ -131,22 +131,20 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co ...@@ -131,22 +131,20 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
for(std::size_t i = 0; i < gemm_descs.size(); i++) for(std::size_t i = 0; i < gemm_descs.size(); i++)
{ {
a_tensors_device.emplace_back(std::make_unique<DeviceMem>( a_tensors_device.emplace_back(std::make_unique<DeviceMem>(a_tensors[i].GetMemorySize()));
sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpaceSize())); b_tensors_device.emplace_back(std::make_unique<DeviceMem>(b_tensors[i].GetMemorySize()));
b_tensors_device.emplace_back(std::make_unique<DeviceMem>( c_tensors_device.emplace_back(
sizeof(BDataType) * b_tensors[i].mDesc.GetElementSpaceSize())); std::make_unique<DeviceMem>(c_device_tensors[i].GetMemorySize()));
c_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(EDataType) * c_device_tensors[i].mDesc.GetElementSpaceSize()));
#ifdef BUILD_INT4_EXAMPLE #ifdef BUILD_INT4_EXAMPLE
const Tensor<KernelADataType> a_converted(a_tensors[i]); const Tensor<KernelADataType> a_converted(a_tensors[i]);
const Tensor<KernelBDataType> b_converted(b_tensors[i]); const Tensor<KernelBDataType> b_converted(b_tensors[i]);
a_tensors_device[i]->ToDevice(a_converted.mData.data()); a_tensors_device[i]->ToDevice(a_converted.data());
b_tensors_device[i]->ToDevice(b_converted.mData.data()); b_tensors_device[i]->ToDevice(b_converted.data());
#else #else
a_tensors_device[i]->ToDevice(a_tensors[i].mData.data()); a_tensors_device[i]->ToDevice(a_tensors[i].data());
b_tensors_device[i]->ToDevice(b_tensors[i].mData.data()); b_tensors_device[i]->ToDevice(b_tensors[i].data());
#endif #endif
p_a.push_back(a_tensors_device[i]->GetDeviceBuffer()); p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
...@@ -193,7 +191,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co ...@@ -193,7 +191,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
for(std::size_t i = 0; i < gemm_descs.size(); i++) for(std::size_t i = 0; i < gemm_descs.size(); i++)
{ {
c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data()); c_tensors_device[i]->FromDevice(c_device_tensors[i].data());
auto ref_gemm = ReferenceGemmInstance{}; auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker(); auto ref_invoker = ref_gemm.MakeInvoker();
...@@ -208,10 +206,10 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co ...@@ -208,10 +206,10 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
#ifdef BUILD_INT4_EXAMPLE #ifdef BUILD_INT4_EXAMPLE
const Tensor<EDataType> c_device_result_converted(c_device_tensors[i]); const Tensor<EDataType> c_device_result_converted(c_device_tensors[i]);
pass &= ck::utils::check_err(c_device_result_converted.mData, c_host_tensors[i].mData); pass &= ck::utils::check_err(c_device_result_converted, c_host_tensors[i]);
#else #else
pass &= ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData); pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]);
#endif #endif
} }
} }
......
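Across every file in the commit, sizeof(T) * tensor.mDesc.GetElementSpaceSize() collapses to tensor.GetMemorySize(), and tensor.mData.data() to tensor.data(). A sketch of what such members could look like on a simplified tensor type; TensorSketch is illustrative, and the real Tensor also wraps a descriptor.

#include <cstddef>
#include <vector>

template <typename T>
struct TensorSketch
{
    std::vector<T> mData;

    // Number of elements the buffer must hold (the real class asks its descriptor).
    std::size_t GetElementSpaceSize() const { return mData.size(); }

    // Folds the sizeof(T) factor in, so call sites cannot pair the
    // wrong data type with the wrong tensor.
    std::size_t GetMemorySize() const { return sizeof(T) * GetElementSpaceSize(); }

    T* data() { return mData.data(); } // shorthand for mData.data()
    const T* data() const { return mData.data(); }
};

int main()
{
    TensorSketch<float> t{std::vector<float>(64)};
    return t.GetMemorySize() == 64 * sizeof(float) ? 0 : 1;
}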