Commit e4e99a49 authored by Po-Yen, Chen

Use new utilities to shorten code

parent 7acbf104
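The hunks below consistently replace direct access to `Tensor` members (`mDesc`, `mData`) with shorthand accessors such as `GetDesc()`, `GetLengths()`, `data()`, and `GetMemorySize()`. As a reading aid, here is a self-contained toy sketch of that pattern, assuming each shortcut simply forwards to the member the old call sites touched; `ToyDescriptor`/`ToyTensor` are hypothetical stand-ins, not the CK classes from host_tensor.hpp.

#include <cstddef>
#include <vector>

struct ToyDescriptor
{
    std::vector<std::size_t> lengths;
    std::vector<std::size_t> strides;

    const std::vector<std::size_t>& GetLengths() const { return lengths; }
    const std::vector<std::size_t>& GetStrides() const { return strides; }

    // Number of elements in the tensor (product of lengths).
    std::size_t GetElementSize() const
    {
        std::size_t n = 1;
        for(auto len : lengths)
            n *= len;
        return n;
    }

    // Number of element slots spanned by the strided layout.
    std::size_t GetElementSpaceSize() const
    {
        std::size_t space = 1;
        for(std::size_t i = 0; i < lengths.size(); ++i)
            space += (lengths[i] - 1) * strides[i];
        return space;
    }
};

template <typename T>
struct ToyTensor
{
    ToyDescriptor mDesc;
    std::vector<T> mData;

    // Shortcuts used throughout this commit, expressed via the members the
    // old code accessed directly.
    const ToyDescriptor& GetDesc() const { return mDesc; }
    const std::vector<std::size_t>& GetLengths() const { return mDesc.GetLengths(); }
    const std::vector<std::size_t>& GetStrides() const { return mDesc.GetStrides(); }
    std::size_t GetElementSize() const { return mDesc.GetElementSize(); }
    std::size_t GetMemorySize() const { return sizeof(T) * mDesc.GetElementSpaceSize(); }
    T* data() { return mData.data(); }
    const T* data() const { return mData.data(); }
    auto begin() { return mData.begin(); }
    auto end() { return mData.end(); }
    std::size_t size() const { return mData.size(); }
};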
......@@ -63,10 +63,10 @@ template <typename DataType>
std::ostream& show_2d_matrix(std::ostream& os, Tensor<DataType>& matrix)
{
os << "[" << std::endl;
for(size_t x = 0; x < matrix.mDesc.GetLengths()[0]; x++)
for(size_t x = 0; x < matrix.GetLengths()[0]; x++)
{
os << "[";
for(size_t y = 0; y < matrix.mDesc.GetLengths()[1]; y++)
for(size_t y = 0; y < matrix.GetLengths()[1]; y++)
{
os << std::setw(5) << static_cast<float>(matrix(x, y));
}
......@@ -133,17 +133,17 @@ int main(int argc, char* argv[])
exit(0);
}
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -152,9 +152,9 @@ int main(int argc, char* argv[])
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.GetDesc() << std::endl;
switch(init_method)
{
......@@ -173,12 +173,12 @@ int main(int argc, char* argv[])
b_k_n.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
}
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem a_m_k_device_buf(a_m_k.GetMemorySize());
DeviceMem b_k_n_device_buf(b_k_n.GetMemorySize());
DeviceMem c_m_n_device_buf(c_m_n_device_result.GetMemorySize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
a_m_k_device_buf.ToDevice(a_m_k.data());
b_k_n_device_buf.ToDevice(b_k_n.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
......@@ -187,9 +187,9 @@ int main(int argc, char* argv[])
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
auto argument = gemm.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
b_k_n_device_buf.GetDeviceBuffer(),
c_m_n_device_buf.GetDeviceBuffer(),
M,
N,
K,
......@@ -220,7 +220,7 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
c_m_n_device_buf.FromDevice(c_m_n_device_result.data());
if(do_verification)
{
......@@ -240,7 +240,7 @@ int main(int argc, char* argv[])
show_2d_matrix(std::cout << "c_host :", c_m_n_host_result) << std::endl;
}
#endif
ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
}
return 0;
......
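In the hunk above, the two-vector HostTensorDescriptor constructor calls collapse into brace-initializer lists such as `{stride, 1_uz}`. The `_uz` suffix comes from `using namespace ck::literals;`; a minimal sketch of such a literal, assuming it simply yields a `std::size_t` so every element of the braced list has the same type (the actual definition in ck/library/utility/literals.hpp may differ):

#include <cstddef>

constexpr std::size_t operator""_uz(unsigned long long value)
{
    // Keep all elements of e.g. {stride, 1_uz} as std::size_t so the
    // initializer-list constructor sees a single element type.
    return static_cast<std::size_t>(value);
}

static_assert(1_uz == static_cast<std::size_t>(1), "1_uz is a size_t one");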
......@@ -9,10 +9,10 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
static_assert(sizeof(ck::int4_t) == sizeof(int8_t));
#endif
using namespace ck::literals;
auto& [M, N, K, StrideA, StrideB, StrideC] = problem_size;
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
......@@ -32,41 +32,38 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k.begin(),
a_m_k.end());
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n.begin(),
b_k_n.end());
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_m_k);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_k_n);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k.begin(), a_m_k.end());
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n.begin(), b_k_n.end());
ck::utils::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
ck::utils::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
}
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.GetDesc() << std::endl;
#ifdef BUILD_INT4_EXAMPLE
DeviceMem a_m_k_device_buf(sizeof(KernelADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(KernelBDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(KernelCDataType) *
c_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem a_m_k_device_buf(a_m_k.GetMemorySize());
DeviceMem b_k_n_device_buf(b_k_n.GetMemorySize());
DeviceMem c_m_n_device_buf(c_m_n_device_result.GetMemorySize());
const Tensor<KernelADataType> a_m_k_converted(a_m_k);
const Tensor<KernelBDataType> b_k_n_converted(b_k_n);
a_m_k_device_buf.ToDevice(a_m_k_converted.mData.data());
b_k_n_device_buf.ToDevice(b_k_n_converted.mData.data());
a_m_k_device_buf.ToDevice(a_m_k_converted.data());
b_k_n_device_buf.ToDevice(b_k_n_converted.data());
#else
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem a_m_k_device_buf(a_m_k.GetMemorySize());
DeviceMem b_k_n_device_buf(b_k_n.GetMemorySize());
DeviceMem c_m_n_device_buf(c_m_n_device_result.GetMemorySize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
a_m_k_device_buf.ToDevice(a_m_k.data());
b_k_n_device_buf.ToDevice(b_k_n.data());
#endif
auto a_element_op = AElementOp{};
......@@ -76,25 +73,18 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(
#ifdef BUILD_INT4_EXAMPLE
static_cast<KernelADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<KernelBDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<KernelCDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
#else
static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
#endif
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
auto argument = gemm.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
b_k_n_device_buf.GetDeviceBuffer(),
c_m_n_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op);
if(!gemm.IsSupportedArgument(argument))
{
......@@ -127,17 +117,17 @@ bool run_gemm(const ProblemSize& problem_size, const ExecutionConfig& config)
ref_invoker.Run(ref_argument);
#ifdef BUILD_INT4_EXAMPLE
Tensor<CDataType> c_m_n_device_result_converted(c_m_n_host_result.mDesc);
Tensor<CDataType> c_m_n_device_result_converted(c_m_n_host_result.GetDesc());
c_m_n_device_buf.FromDevice(c_m_n_device_result_converted.mData.data());
c_m_n_device_buf.FromDevice(c_m_n_device_result_converted.data());
c_m_n_device_result = c_m_n_device_result_converted.CopyAsType<CDataType>();
return ck::utils::check_err(c_m_n_device_result_converted.mData, c_m_n_host_result.mData);
return ck::utils::check_err(c_m_n_device_result_converted, c_m_n_host_result);
#else
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
c_m_n_device_buf.FromDevice(c_m_n_device_result.data());
return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData);
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
#endif
}
......
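`ck::utils::check_err` is now called with the Tensor objects themselves (e.g. `check_err(c_m_n_device_result, c_m_n_host_result)`) instead of their `.mData` vectors, which implies overloads accepting whole ranges. A toy sketch of a range-based checker with the same call shape, assuming element-wise comparison against absolute/relative tolerances; CK's real check_err, its defaults, and its message handling are not reproduced here:

#include <cmath>
#include <iostream>
#include <iterator>
#include <string>

template <typename Range, typename RefRange>
bool toy_check_err(const Range& result,
                   const RefRange& reference,
                   const std::string& msg = "Error: incorrect results!",
                   double rtol            = 1e-5,
                   double atol            = 1e-8)
{
    auto r = std::begin(result);
    auto e = std::begin(reference);
    for(; r != std::end(result) && e != std::end(reference); ++r, ++e)
    {
        const double got      = static_cast<double>(*r);
        const double expected = static_cast<double>(*e);
        if(std::abs(got - expected) > atol + rtol * std::abs(expected))
        {
            std::cerr << msg << std::endl;
            return false;
        }
    }
    // Both ranges must also have the same length.
    return r == std::end(result) && e == std::end(reference);
}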
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/literals.hpp"
struct AlphaBetaAdd
{
......@@ -175,17 +177,17 @@ int main(int argc, char* argv[])
exit(0);
}
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -195,10 +197,10 @@ int main(int argc, char* argv[])
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "d_m_n: " << d_m_n.GetDesc() << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.GetDesc() << std::endl;
switch(init_method)
{
......@@ -214,15 +216,15 @@ int main(int argc, char* argv[])
d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{-0.5, 0.5});
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem a_device_buf(a_m_k.GetMemorySize());
DeviceMem b_device_buf(b_k_n.GetMemorySize());
DeviceMem d_device_buf(d_m_n.GetMemorySize());
DeviceMem e_device_buf(e_m_n_device_result.GetMemorySize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d_device_buf.ToDevice(d_m_n.mData.data());
e_device_buf.ToDevice(e_m_n_device_result.mData.data());
a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.data());
d_device_buf.ToDevice(d_m_n.data());
e_device_buf.ToDevice(e_m_n_device_result.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
......@@ -231,21 +233,20 @@ int main(int argc, char* argv[])
// do GEMM
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument =
device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
std::array<ck::index_t, 1>{StrideD},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
auto argument = device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
ck::utils::to_array({d_device_buf.GetDeviceBuffer()}),
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
ck::utils::to_array({StrideD}),
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!device_op.IsSupportedArgument(argument))
{
......@@ -267,12 +268,11 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
<< std::endl;
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
e_device_buf.FromDevice(e_m_n_device_result.data());
if(do_verification)
{
Tensor<CShuffleDataType> c_m_n(HostTensorDescriptor(
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
Tensor<CShuffleDataType> c_m_n(HostTensorDescriptor({M, N}));
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
......@@ -297,9 +297,9 @@ int main(int argc, char* argv[])
}
}
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
e_device_buf.FromDevice(e_m_n_device_result.data());
return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
}
return 0;
......
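The MakeArgument call above now wraps the single D-tensor pointer and stride in `ck::utils::to_array({...})` rather than spelling out `std::array<const void*, 1>{...}` and `std::array<ck::index_t, 1>{...}`. That reads like a deduction helper in the spirit of C++20 `std::to_array`; below is a minimal sketch of the braced-list case only, as an assumption (the library's to_array in ck/library/utility/array.hpp evidently has further overloads, e.g. for descriptor lengths later in this commit):

#include <array>
#include <cstddef>
#include <type_traits>
#include <utility>

template <typename T, std::size_t N, std::size_t... Is>
constexpr std::array<std::remove_cv_t<T>, N> to_array_impl(T (&&a)[N],
                                                           std::index_sequence<Is...>)
{
    return {{std::move(a[Is])...}};
}

// to_array({x}) deduces both the element type and the extent, so
// to_array({StrideD}) stands in for std::array<ck::index_t, 1>{StrideD}.
template <typename T, std::size_t N>
constexpr std::array<std::remove_cv_t<T>, N> to_array(T (&&a)[N])
{
    return to_array_impl(std::move(a), std::make_index_sequence<N>{});
}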
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
......@@ -153,17 +155,17 @@ int main(int argc, char* argv[])
exit(0);
}
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -173,10 +175,10 @@ int main(int argc, char* argv[])
Tensor<EDataType> e_m_n_host_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
Tensor<EDataType> e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d_m_n: " << d_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "d_m_n: " << d_m_n.GetDesc() << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.GetDesc() << std::endl;
switch(init_method)
{
......@@ -192,14 +194,14 @@ int main(int argc, char* argv[])
d_m_n.GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d_device_buf(sizeof(DDataType) * d_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem a_device_buf(a_m_k.GetMemorySize());
DeviceMem b_device_buf(b_k_n.GetMemorySize());
DeviceMem d_device_buf(d_m_n.GetMemorySize());
DeviceMem e_device_buf(e_m_n_device_result.GetMemorySize());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d_device_buf.ToDevice(d_m_n.mData.data());
a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.data());
d_device_buf.ToDevice(d_m_n.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
......@@ -210,21 +212,20 @@ int main(int argc, char* argv[])
auto invoker = device_op.MakeInvoker();
auto argument =
device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
std::array<const void*, 1>{d_device_buf.GetDeviceBuffer()},
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
std::array<ck::index_t, 1>{0},
StrideE,
a_element_op,
b_element_op,
cde_element_op);
auto argument = device_op.MakeArgument(a_device_buf.GetDeviceBuffer(),
b_device_buf.GetDeviceBuffer(),
ck::utils::to_array({d_device_buf.GetDeviceBuffer()}),
e_device_buf.GetDeviceBuffer(),
M,
N,
K,
StrideA,
StrideB,
ck::utils::to_array({0}),
StrideE,
a_element_op,
b_element_op,
cde_element_op);
if(!device_op.IsSupportedArgument(argument))
{
......@@ -247,7 +248,7 @@ int main(int argc, char* argv[])
if(do_verification)
{
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
e_device_buf.FromDevice(e_m_n_device_result.data());
Tensor<AccDataType> c_m_n(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
......@@ -275,7 +276,7 @@ int main(int argc, char* argv[])
}
}
return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData) ? 0 : 1;
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result) ? 0 : 1;
}
return 0;
......
......@@ -35,11 +35,11 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
>
e_m_n_device_result(f_host_tensor_descriptor(M, N, StrideE, ELayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d0_m_n: " << d0_m_n.mDesc << std::endl;
std::cout << "d1_m_n: " << d1_m_n.mDesc << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.mDesc << std::endl;
std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "d0_m_n: " << d0_m_n.GetDesc() << std::endl;
std::cout << "d1_m_n: " << d1_m_n.GetDesc() << std::endl;
std::cout << "e_m_n: " << e_m_n_host_result.GetDesc() << std::endl;
switch(config.init_method)
{
......@@ -57,11 +57,11 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
d1_m_n.GenerateTensorValue(GeneratorTensor_3<D1DataType>{0.0, 1.0});
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem d0_device_buf(sizeof(D0DataType) * d0_m_n.mDesc.GetElementSpaceSize());
DeviceMem d1_device_buf(sizeof(D1DataType) * d1_m_n.mDesc.GetElementSpaceSize());
DeviceMem e_device_buf(sizeof(EDataType) * e_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem a_device_buf(a_m_k.GetMemorySize());
DeviceMem b_device_buf(b_k_n.GetMemorySize());
DeviceMem d0_device_buf(d0_m_n.GetMemorySize());
DeviceMem d1_device_buf(d1_m_n.GetMemorySize());
DeviceMem e_device_buf(e_m_n_device_result.GetMemorySize());
#ifdef BUILD_INT4_EXAMPLE
const Tensor<KernelADataType> a_m_k_converted(a_m_k);
......@@ -69,15 +69,15 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
const Tensor<KernelD0DataType> d0_m_n_converted(d0_m_n);
const Tensor<KernelD1DataType> d1_m_n_converted(d1_m_n);
a_device_buf.ToDevice(a_m_k_converted.mData.data());
b_device_buf.ToDevice(b_k_n_converted.mData.data());
d0_device_buf.ToDevice(d0_m_n_converted.mData.data());
d1_device_buf.ToDevice(d1_m_n_converted.mData.data());
a_device_buf.ToDevice(a_m_k_converted.data());
b_device_buf.ToDevice(b_k_n_converted.data());
d0_device_buf.ToDevice(d0_m_n_converted.data());
d1_device_buf.ToDevice(d1_m_n_converted.data());
#else
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
d0_device_buf.ToDevice(d0_m_n.mData.data());
d1_device_buf.ToDevice(d1_m_n.mData.data());
a_device_buf.ToDevice(a_m_k.data());
b_device_buf.ToDevice(b_k_n.data());
d0_device_buf.ToDevice(d0_m_n.data());
d1_device_buf.ToDevice(d1_m_n.data());
#endif
auto a_element_op = AElementOp{};
......@@ -142,14 +142,14 @@ bool run_gemm_add_add_fastgelu(const ProblemSize& problem_size, const ExecutionC
}
}
e_device_buf.FromDevice(e_m_n_device_result.mData.data());
e_device_buf.FromDevice(e_m_n_device_result.data());
#ifdef BUILD_INT4_EXAMPLE
const Tensor<EDataType> e_m_n_device_result_converted(e_m_n_device_result);
return ck::utils::check_err(e_m_n_device_result_converted.mData, e_m_n_host_result.mData);
return ck::utils::check_err(e_m_n_device_result_converted, e_m_n_host_result);
#else
return ck::utils::check_err(e_m_n_device_result.mData, e_m_n_host_result.mData);
return ck::utils::check_err(e_m_n_device_result, e_m_n_host_result);
#endif
}
......
......@@ -10,13 +10,14 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
void print_helper_msg()
{
......@@ -50,9 +51,9 @@ bool run_grouped_conv_fwd(bool do_verification,
Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
std::cout << "in: " << in.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "out: " << out_host.mDesc << std::endl;
std::cout << "in: " << in.GetDesc() << std::endl;
std::cout << "wei: " << wei.GetDesc() << std::endl;
std::cout << "out: " << out_host.GetDesc() << std::endl;
switch(init_method)
{
......@@ -66,56 +67,34 @@ bool run_grouped_conv_fwd(bool do_verification,
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
}
DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
DeviceMem in_device_buf(in.GetMemorySize());
DeviceMem wei_device_buf(wei.GetMemorySize());
DeviceMem out_device_buf(out_device.GetMemorySize());
in_device_buf.ToDevice(in.data());
wei_device_buf.ToDevice(wei.data());
using ck::utils::empty_array, ck::utils::to_array;
// do Conv
auto conv = DeviceConvNDFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
std::array<const void*, 0>{},
empty_array(),
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
to_array(in_g_n_c_wis_desc.GetLengths()),
to_array(in_g_n_c_wis_desc.GetStrides()),
to_array(wei_g_k_c_xs_desc.GetLengths()),
to_array(wei_g_k_c_xs_desc.GetStrides()),
empty_array(),
empty_array(),
to_array(out_g_n_k_wos_desc.GetLengths()),
to_array(out_g_n_k_wos_desc.GetStrides()),
to_array(conv_param.conv_filter_strides_),
to_array(conv_param.conv_filter_dilations_),
to_array(conv_param.input_left_pads_),
to_array(conv_param.input_right_pads_),
in_element_op,
wei_element_op,
out_element_op);
......@@ -161,10 +140,10 @@ bool run_grouped_conv_fwd(bool do_verification,
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(out_device.mData.data());
out_device_buf.FromDevice(out_device.data());
return ck::utils::check_err(
out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f);
out_device, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
}
return true;
......
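In the grouped convolution hunk above, the zero-sized `std::array<const void*, 0>{}` and `std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}}` placeholders become `ck::utils::empty_array()`. One plausible shape for such a helper, purely as a guess, is a tag type convertible to any zero-length std::array; the actual definition in ck/library/utility/array.hpp may well differ:

#include <array>

// Hypothetical: a value convertible to std::array<T, 0> for any T, so a
// single empty_array() call can fill both the Ds-pointer slot and the
// Ds-lengths/strides slots of MakeArgument.
struct ToyEmptyArray
{
    template <typename T>
    constexpr operator std::array<T, 0>() const
    {
        return {};
    }
};

constexpr ToyEmptyArray empty_array() { return {}; }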
......@@ -16,6 +16,9 @@
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
......@@ -23,7 +26,6 @@
#include "ck/library/utility/fill.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
using BF16 = ck::bhalf_t;
using FP16 = ck::half_t;
......@@ -140,9 +142,7 @@ make_r0_host_tensor_descriptor(const ck::utils::conv::ConvParam& problem_size)
{
std::vector<ck::index_t> dimensions{problem_size.G_, problem_size.N_};
std::copy(begin(problem_size.output_spatial_lengths_),
end(problem_size.output_spatial_lengths_),
std::back_inserter(dimensions));
ck::ranges::copy(problem_size.output_spatial_lengths_, std::back_inserter(dimensions));
return HostTensorDescriptor(dimensions);
}
......@@ -158,10 +158,3 @@ void unpack_host_tensor_descriptor(const HostTensorDescriptor& descriptor,
assert(size(descriptor.GetStrides()) == size(strides));
std::copy_n(begin(descriptor.GetStrides()), size(descriptor.GetStrides()), begin(strides));
}
template <typename Range, typename OutputIterator>
auto copy(const Range& range, OutputIterator iter)
-> decltype(std::copy(std::begin(range), std::end(range), iter))
{
return std::copy(std::begin(range), std::end(range), iter);
}
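The file-local `copy` helper deleted just above is superseded by `ck::ranges::copy` (used earlier in `make_r0_host_tensor_descriptor`, presumably provided by the ck/library/utility/algorithm.hpp include added above). A minimal sketch with the same shape as the removed helper, assuming it simply forwards to std::copy:

#include <algorithm>
#include <iterator>

namespace toy_ranges {

// Range-in, output-iterator-out convenience, mirroring the helper that this
// commit removes from the example itself.
template <typename Range, typename OutputIterator>
OutputIterator copy(const Range& range, OutputIterator out)
{
    return std::copy(std::begin(range), std::end(range), out);
}

} // namespace toy_ranges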
......@@ -77,32 +77,28 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
{
case 0: break;
case 1:
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input.begin(),
conv_input.end());
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight.begin(),
conv_weight.end());
ck::utils::FillUniformDistributionIntegerValue<ADataType>{-8, 7}(conv_input);
ck::utils::FillUniformDistributionIntegerValue<BDataType>{-8, 7}(conv_weight);
break;
default:
ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input.begin(), conv_input.end());
ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight.begin(),
conv_weight.end());
ck::utils::FillUniformDistribution<ADataType>{-5, 5}(conv_input);
ck::utils::FillUniformDistribution<BDataType>{-5, 5}(conv_weight);
}
DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize());
DeviceMem conv_weight_device_buf(sizeof(BDataType) * conv_weight.mDesc.GetElementSpaceSize());
DeviceMem conv_output_device_buf(sizeof(EDataType) *
conv_output_device.mDesc.GetElementSpaceSize());
DeviceMem r0_device_buf(sizeof(R0DataType) * r0_device.mDesc.GetElementSpaceSize());
DeviceMem conv_input_device_buf(conv_input.GetMemorySize());
DeviceMem conv_weight_device_buf(conv_weight.GetMemorySize());
DeviceMem conv_output_device_buf(conv_output_device.GetMemorySize());
DeviceMem r0_device_buf(r0_device.GetMemorySize());
#ifdef BUILD_INT4_EXAMPLE
const Tensor<KernelADataType> conv_input_converted(conv_input);
const Tensor<KernelBDataType> conv_weight_converted(conv_weight);
conv_input_device_buf.ToDevice(conv_input_converted.mData.data());
conv_weight_device_buf.ToDevice(conv_weight_converted.mData.data());
conv_input_device_buf.ToDevice(conv_input_converted.data());
conv_weight_device_buf.ToDevice(conv_weight_converted.data());
#else
conv_input_device_buf.ToDevice(conv_input.mData.data());
conv_weight_device_buf.ToDevice(conv_weight.mData.data());
conv_input_device_buf.ToDevice(conv_input.data());
conv_weight_device_buf.ToDevice(conv_weight.data());
#endif
std::array<ck::index_t, NDimSpatial + 3> conv_input_g_n_c_wis_lengths{},
......@@ -112,8 +108,6 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
std::array<ck::index_t, NDimSpatial + 3> conv_output_g_n_k_wos_lengths{},
conv_output_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial + 2> r0_lengths{}, r0_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{}, conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{}, input_right_pads{};
unpack_host_tensor_descriptor(
conv_input_g_n_c_wis_desc, conv_input_g_n_c_wis_lengths, conv_input_g_n_c_wis_strides);
......@@ -123,33 +117,30 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
conv_output_g_n_k_wos_desc, conv_output_g_n_k_wos_lengths, conv_output_g_n_k_wos_strides);
unpack_host_tensor_descriptor(r0_desc, r0_lengths, r0_strides);
copy(problem_size.conv_filter_strides_, begin(conv_filter_strides));
copy(problem_size.conv_filter_dilations_, begin(conv_filter_dilations));
copy(problem_size.input_left_pads_, begin(input_left_pads));
copy(problem_size.input_right_pads_, begin(input_right_pads));
using ck::utils::empty_array, ck::utils::to_array;
// run Conv + Reduction on device
auto conv = DeviceInstance<NDimSpatial>{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(conv_input_device_buf.GetDeviceBuffer(),
conv_weight_device_buf.GetDeviceBuffer(),
std::array<const void*, 0>{},
empty_array(),
conv_output_device_buf.GetDeviceBuffer(),
{r0_device_buf.GetDeviceBuffer()},
conv_input_g_n_c_wis_lengths,
conv_input_g_n_c_wis_strides,
conv_weight_g_k_c_xs_lengths,
conv_weight_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
empty_array(),
empty_array(),
conv_output_g_n_k_wos_lengths,
conv_output_g_n_k_wos_strides,
r0_lengths,
r0_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
to_array(problem_size.conv_filter_strides_),
to_array(problem_size.conv_filter_dilations_),
to_array(problem_size.input_left_pads_),
to_array(problem_size.input_right_pads_),
AElementOp{},
BElementOp{},
CDEElementOp{},
......@@ -194,11 +185,11 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
ref_invoker.Run(ref_argument);
Tensor<R0DataType> r0_host(r0_device.mDesc);
Tensor<R0DataType> r0_host(r0_device.GetDesc());
auto reduce0_op = RsThreadReduceOp{}[ck::Number<0>{}];
auto& output_dims = conv_output_g_n_k_wos_desc.GetLengths();
auto output_dims = conv_output_g_n_k_wos_desc.GetLengths();
if constexpr(NDimSpatial == 1)
{
......@@ -273,19 +264,16 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size,
}
}
conv_output_device_buf.FromDevice(conv_output_device.mData.data());
r0_device_buf.FromDevice(r0_device.mData.data());
conv_output_device_buf.FromDevice(conv_output_device.data());
r0_device_buf.FromDevice(r0_device.data());
return ck::utils::check_err(conv_output_device.mData,
conv_output_host.mData,
return ck::utils::check_err(conv_output_device,
conv_output_host,
"Error: incorrect results! (Matrix E)",
1e-5f,
1e-4f) &&
ck::utils::check_err(r0_device.mData,
r0_host.mData,
"Error: incorrect results! (Matrix R0)",
1e-5f,
1e-4f);
ck::utils::check_err(
r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-5f, 1e-4f);
}
return true;
......
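In the init switch above, `FillUniformDistribution` and `FillUniformDistributionIntegerValue` are handed the tensors directly instead of `begin()`/`end()` iterator pairs, so the fill functors presumably gained a range overload. A toy sketch of that call shape only; the distribution details and the real functors in ck/library/utility/fill.hpp are not reproduced here:

#include <algorithm>
#include <iterator>
#include <random>

template <typename T>
struct ToyFillUniformDistribution
{
    float a_ = 0.f;
    float b_ = 1.f;

    // Existing iterator-pair form.
    template <typename ForwardIter>
    void operator()(ForwardIter first, ForwardIter last) const
    {
        std::mt19937 gen{11939};
        std::uniform_real_distribution<float> dis(a_, b_);
        std::generate(first, last, [&] { return static_cast<T>(dis(gen)); });
    }

    // New convenience: accept the whole container (e.g. a Tensor) and fall
    // back to the iterator-pair form.
    template <typename Range>
    void operator()(Range&& range) const
    {
        (*this)(std::begin(range), std::end(range));
    }
};

// usage mirroring the hunk above (hypothetical functor name):
// ToyFillUniformDistribution<float>{-5.f, 5.f}(conv_input);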
......@@ -7,15 +7,17 @@
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/ranges.hpp"
#include "reduce_example_common.hpp"
......@@ -156,11 +158,11 @@ int reduce_blockwise_impl(bool do_verification,
Tensor<int> out_indices_ref(outLengths);
Tensor<int> out_indices(outLengths);
auto inStrides = in.mDesc.GetStrides();
auto outStrides = out.mDesc.GetStrides();
auto inStrides = in.GetStrides();
auto outStrides = out.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
size_t invariant_total_length = out.GetElementSize();
size_t reduce_total_length = in.GetElementSize() / invariant_total_length;
std::size_t num_thread = 1;
......@@ -187,42 +189,43 @@ int reduce_blockwise_impl(bool do_verification,
}
if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
out.mData[i] = out_ref.mData[i];
{
ck::ranges::copy(out_ref, out.begin());
}
};
// these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InOutDataTypeInDevice) * in.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(InOutDataTypeInDevice) * out.mDesc.GetElementSpaceSize());
DeviceMem in_dev(in.GetMemorySize());
DeviceMem out_dev(out.GetMemorySize());
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if(std::is_same<InOutDataType, int4_t>::value)
{
std::vector<InOutDataTypeInDevice> tmp_buf(in.mData.size());
std::vector<InOutDataTypeInDevice> tmp_buf(in.size());
std::copy_n(in.mData.data(), in.mData.size(), tmp_buf.data());
std::copy_n(in.data(), in.size(), tmp_buf.data());
in_dev.ToDevice(tmp_buf.data());
}
else
#endif
in_dev.ToDevice(in.mData.data());
in_dev.ToDevice(in.data());
if(beta != 0.0f)
{
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if(std::is_same<InOutDataType, int4_t>::value)
{
std::vector<InOutDataTypeInDevice> tmp_buf(in.mData.size());
std::vector<InOutDataTypeInDevice> tmp_buf(in.size());
std::copy_n(out.mData.data(), out.mData.size(), tmp_buf.data());
std::copy_n(out.data(), out.size(), tmp_buf.data());
out_dev.ToDevice(tmp_buf.data());
}
else
#endif
out_dev.ToDevice(out.mData.data());
out_dev.ToDevice(out.data());
};
size_t indicesSizeInBytes = OutputIndex ? out.mDesc.GetElementSize() * sizeof(int32_t) : 0;
size_t indicesSizeInBytes = OutputIndex ? out.GetElementSize() * sizeof(int32_t) : 0;
DeviceMem out_index_dev(indicesSizeInBytes);
......@@ -245,33 +248,25 @@ int reduce_blockwise_impl(bool do_verification,
NumReduceDim,
PropagateNan,
OutputIndex>
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
hostReduce(in.GetDesc(), out_ref.GetDesc(), invariantDims, reduceDims);
hostReduce.Run(alpha,
in.mData.data(),
in.data(),
beta,
out_ref.mData.data(),
out_indices_ref.mData.data(),
out_ref.data(),
out_indices_ref.data(),
in_elementwise_op,
acc_elementwise_op);
};
std::vector<ck::index_t> i_inLengths;
std::vector<ck::index_t> i_inStrides;
std::vector<ck::index_t> i_outLengths;
std::vector<ck::index_t> i_outStrides;
i_inLengths.assign(inLengths.begin(), inLengths.end());
i_inStrides.assign(inStrides.begin(), inStrides.end());
i_outLengths.assign(outLengths.begin(), outLengths.end());
i_outStrides.assign(outStrides.begin(), outStrides.end());
using Indices = std::vector<ck::index_t>;
auto reduce = DeviceReduceInstance{};
auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
i_inStrides,
i_outLengths,
i_outStrides,
auto argument_ptr = reduce.MakeArgumentPointer(ck::ranges::to<Indices>(inLengths),
ck::ranges::to<Indices>(inStrides),
ck::ranges::to<Indices>(outLengths),
ck::ranges::to<Indices>(outStrides),
reduceDims,
alpha,
beta,
......@@ -312,22 +307,22 @@ int reduce_blockwise_impl(bool do_verification,
#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4
if(std::is_same<InOutDataType, int4_t>::value)
{
std::vector<InOutDataTypeInDevice> tmp_buf(out.mData.size());
std::vector<InOutDataTypeInDevice> tmp_buf(out.size());
out_dev.FromDevice(tmp_buf.data());
std::copy_n(tmp_buf.data(), out.mData.size(), out.mData.data());
std::copy_n(tmp_buf.data(), out.size(), out.data());
}
else
#endif
out_dev.FromDevice(out.mData.data());
out_dev.FromDevice(out.data());
pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
pass = pass && ck::utils::check_err(out, out_ref);
if(OutputIndex)
{
out_index_dev.FromDevice(out_indices.mData.data());
pass = pass && ck::utils::check_err(out_indices.mData, out_indices_ref.mData);
out_index_dev.FromDevice(out_indices.data());
pass = pass && ck::utils::check_err(out_indices, out_indices_ref);
};
};
......
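The hand-built `i_inLengths` / `i_inStrides` / `i_outLengths` / `i_outStrides` vectors (filled via `assign`) are replaced above by `ck::ranges::to<Indices>(...)` with `Indices = std::vector<ck::index_t>`. That looks like a range-conversion helper in the spirit of C++23 `std::ranges::to`; a minimal sketch, assuming the target container is constructible from an iterator pair (the real ck::ranges::to in ck/library/utility/ranges.hpp may differ):

#include <iterator>

template <typename Container, typename Range>
Container to(const Range& range)
{
    // Convert any input range into the requested container, mirroring the
    // old `i_inLengths.assign(inLengths.begin(), inLengths.end())` pattern.
    return Container(std::begin(range), std::end(range));
}

// usage mirroring the call sites above:
// using Indices = std::vector<ck::index_t>;
// auto lengths = to<Indices>(inLengths);   // std::size_t -> ck::index_t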
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include <sstream>
#include <initializer_list>
#include <cstdlib>
#include <getopt.h>
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/ranges.hpp"
using namespace ck;
using namespace ck::tensor_operation::device;
......@@ -139,12 +142,12 @@ int main(int argc, char* argv[])
Tensor<InOutDataType> in_2(inLengths_2); // also the output tensor of the first reduction
Tensor<InOutDataType> out(outLengths);
auto inStrides_1 = in_1.mDesc.GetStrides();
auto inStrides_2 = in_2.mDesc.GetStrides();
auto outStrides = out.mDesc.GetStrides();
auto inStrides_1 = in_1.GetStrides();
auto inStrides_2 = in_2.GetStrides();
auto outStrides = out.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in_1.mDesc.GetElementSize() / invariant_total_length;
size_t invariant_total_length = out.GetElementSize();
size_t reduce_total_length = in_1.GetElementSize() / invariant_total_length;
std::size_t num_thread = 1;
......@@ -171,18 +174,19 @@ int main(int argc, char* argv[])
}
if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
out.mData[i] = out_ref.mData[i];
{
ck::ranges::copy(out_ref, out.begin());
}
};
DeviceMem in_1_dev(sizeof(InOutDataType) * in_1.mDesc.GetElementSpaceSize());
DeviceMem in_2_dev(sizeof(InOutDataType) * in_2.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize());
DeviceMem in_1_dev(in_1.GetMemorySize());
DeviceMem in_2_dev(in_2.GetMemorySize());
DeviceMem out_dev(out.GetMemorySize());
in_1_dev.ToDevice(in_1.mData.data());
in_1_dev.ToDevice(in_1.data());
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());
out_dev.ToDevice(out.data());
InElementwiseOperation in_elementwise_op;
AccElementwiseOperation acc_elementwise_op;
......@@ -203,37 +207,25 @@ int main(int argc, char* argv[])
2, // NumReduceDim
PropagateNan,
OutputIndex>
hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims);
hostReduce(in_1.GetDesc(), out_ref.GetDesc(), invariantDims, reduceDims);
hostReduce.Run(alpha,
in_1.mData.data(),
in_1.data(),
beta,
out_ref.mData.data(),
out_ref.data(),
nullptr,
in_elementwise_op,
acc_elementwise_op);
};
std::vector<ck::index_t> i_inLengths_1;
std::vector<ck::index_t> i_inStrides_1;
std::vector<ck::index_t> i_inLengths_2;
std::vector<ck::index_t> i_inStrides_2;
std::vector<ck::index_t> i_outLengths;
std::vector<ck::index_t> i_outStrides;
i_inLengths_1.assign(inLengths_1.begin(), inLengths_1.end());
i_inStrides_1.assign(inStrides_1.begin(), inStrides_1.end());
i_inLengths_2.assign(inLengths_2.begin(), inLengths_2.end());
i_inStrides_2.assign(inStrides_2.begin(), inStrides_2.end());
i_outLengths.assign(outLengths.begin(), outLengths.end());
i_outStrides.assign(outStrides.begin(), outStrides.end());
using Indices = std::vector<ck::index_t>;
auto reduce_1 = DeviceReduceInstance_1{};
auto argument_ptr_1 = reduce_1.MakeArgumentPointer(i_inLengths_1,
i_inStrides_1,
i_inLengths_2,
i_inStrides_2,
auto argument_ptr_1 = reduce_1.MakeArgumentPointer(ck::ranges::to<Indices>(inLengths_1),
ck::ranges::to<Indices>(inStrides_1),
ck::ranges::to<Indices>(inLengths_2),
ck::ranges::to<Indices>(inStrides_2),
reduceDims_1,
1.0f,
0.0f,
......@@ -255,10 +247,10 @@ int main(int argc, char* argv[])
auto reduce_2 = DeviceReduceInstance_2{};
auto argument_ptr_2 = reduce_2.MakeArgumentPointer(i_inLengths_2,
i_inStrides_2,
i_outLengths,
i_outStrides,
auto argument_ptr_2 = reduce_2.MakeArgumentPointer(ck::ranges::to<Indices>(inLengths_2),
ck::ranges::to<Indices>(inStrides_2),
ck::ranges::to<Indices>(outLengths),
ck::ranges::to<Indices>(outStrides),
reduceDims_2,
alpha,
beta,
......@@ -293,8 +285,8 @@ int main(int argc, char* argv[])
if(do_verify)
{
out_dev.FromDevice(out.mData.data());
pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
out_dev.FromDevice(out.data());
pass = pass && ck::utils::check_err(out, out_ref);
};
return (pass ? 0 : 1);
......
......@@ -7,15 +7,17 @@
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/host_common_util.hpp"
#include "ck/library/utility/host_reduction.hpp"
#include "ck/library/utility/ranges.hpp"
#include "reduce_example_common.hpp"
......@@ -95,11 +97,11 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
Tensor<InOutDataType> out_ref(outLengths);
Tensor<InOutDataType> out(outLengths);
auto inStrides = in.mDesc.GetStrides();
auto outStrides = out.mDesc.GetStrides();
auto inStrides = in.GetStrides();
auto outStrides = out.GetStrides();
size_t invariant_total_length = out.mDesc.GetElementSize();
size_t reduce_total_length = in.mDesc.GetElementSize() / invariant_total_length;
size_t invariant_total_length = out.GetElementSize();
size_t reduce_total_length = in.GetElementSize() / invariant_total_length;
std::size_t num_thread = 1;
......@@ -126,18 +128,19 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
}
if(beta != 0.0f)
for(size_t i = 0; i < out_ref.mDesc.GetElementSpaceSize(); i++)
out.mData[i] = out_ref.mData[i];
{
ck::ranges::copy(out_ref, out.begin());
}
};
// these buffers are usually provided by the user application
DeviceMem in_dev(sizeof(InOutDataType) * in.mDesc.GetElementSpaceSize());
DeviceMem out_dev(sizeof(InOutDataType) * out.mDesc.GetElementSpaceSize());
DeviceMem in_dev(in.GetMemorySize());
DeviceMem out_dev(out.GetMemorySize());
in_dev.ToDevice(in.mData.data());
in_dev.ToDevice(in.data());
if(beta != 0.0f)
out_dev.ToDevice(out.mData.data());
out_dev.ToDevice(out.data());
InElementwiseOperation in_elementwise_op;
AccElementwiseOperation acc_elementwise_op;
......@@ -158,33 +161,20 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
NumReduceDim,
PropagateNan,
false>
hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
hostReduce.Run(alpha,
in.mData.data(),
beta,
out_ref.mData.data(),
nullptr,
in_elementwise_op,
acc_elementwise_op);
};
hostReduce(in.GetDesc(), out_ref.GetDesc(), invariantDims, reduceDims);
std::vector<ck::index_t> i_inLengths;
std::vector<ck::index_t> i_inStrides;
std::vector<ck::index_t> i_outLengths;
std::vector<ck::index_t> i_outStrides;
hostReduce.Run(
alpha, in.data(), beta, out_ref.data(), nullptr, in_elementwise_op, acc_elementwise_op);
};
i_inLengths.assign(inLengths.begin(), inLengths.end());
i_inStrides.assign(inStrides.begin(), inStrides.end());
i_outLengths.assign(outLengths.begin(), outLengths.end());
i_outStrides.assign(outStrides.begin(), outStrides.end());
using Indices = std::vector<ck::index_t>;
auto reduce = DeviceReduceInstance{};
auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths,
i_inStrides,
i_outLengths,
i_outStrides,
auto argument_ptr = reduce.MakeArgumentPointer(ck::ranges::to<Indices>(inLengths),
ck::ranges::to<Indices>(inStrides),
ck::ranges::to<Indices>(outLengths),
ck::ranges::to<Indices>(outStrides),
reduceDims,
alpha,
beta,
......@@ -222,8 +212,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification,
if(do_verification)
{
out_dev.FromDevice(out.mData.data());
pass = pass && ck::utils::check_err(out.mData, out_ref.mData);
out_dev.FromDevice(out.data());
pass = pass && ck::utils::check_err(out, out_ref);
};
return (pass ? 0 : 1);
......
......@@ -8,14 +8,16 @@
#include "ck/ck.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/utility/reduction_functions_accumulate.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/device/device_pool2d_fwd_nhwc_nhwc.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/array.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
template <typename InDataType,
typename OutDataType,
......@@ -56,8 +58,8 @@ static void pool_host_verify(const Tensor<InDataType>& in,
for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x)
{
ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1];
if(hi >= 0 && hi < static_cast<ck::index_t>(in.mDesc.GetLengths()[2]) &&
wi >= 0 && wi < static_cast<ck::index_t>(in.mDesc.GetLengths()[3]))
if(hi >= 0 && hi < static_cast<ck::index_t>(in.GetLengths()[2]) && wi >= 0 &&
wi < static_cast<ck::index_t>(in.GetLengths()[3]))
{
AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
......@@ -74,10 +76,10 @@ static void pool_host_verify(const Tensor<InDataType>& in,
};
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
out.GetLengths()[0],
out.GetLengths()[1],
out.GetLengths()[2],
out.GetLengths()[3])(std::thread::hardware_concurrency());
}
else
{
......@@ -95,8 +97,7 @@ static void pool_host_verify(const Tensor<InDataType>& in,
for(ck::index_t x = 0; x < window_spatial_lengths[1]; ++x)
{
ck::index_t wi = wo * window_strides[1] + x - in_left_pads[1];
if(hi >= 0 && hi < in.mDesc.GetLengths()[2] && wi >= 0 &&
wi < in.mDesc.GetLengths()[3])
if(hi >= 0 && hi < in.GetLengths()[2] && wi >= 0 && wi < in.GetLengths()[3])
{
AccDataType currVal = static_cast<AccDataType>(in(n, c, hi, wi));
IndexDataType currIndex = y * window_spatial_lengths[1] + x;
......@@ -115,10 +116,10 @@ static void pool_host_verify(const Tensor<InDataType>& in,
};
make_ParallelTensorFunctor(f_nchw,
out.mDesc.GetLengths()[0],
out.mDesc.GetLengths()[1],
out.mDesc.GetLengths()[2],
out.mDesc.GetLengths()[3])(std::thread::hardware_concurrency());
out.GetLengths()[0],
out.GetLengths()[1],
out.GetLengths()[2],
out.GetLengths()[3])(std::thread::hardware_concurrency());
};
}
......@@ -169,19 +170,18 @@ bool pool_test(bool do_verification,
const std::array<ck::index_t, 2> input_left_pads{{in_left_pad_h, in_left_pad_w}};
const std::array<ck::index_t, 2> input_right_pads{{in_right_pad_h, in_right_pad_w}};
using namespace ck::literals;
// tensor layout
auto f_host_tensor_descriptor =
[](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value)
if constexpr(ck::is_same_v<decltype(layout), ck::tensor_layout::convolution::NCHW>)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, H * W, W, 1_uz});
}
else if constexpr(ck::is_same<decltype(layout),
ck::tensor_layout::convolution::NHWC>::value)
else if constexpr(ck::is_same_v<decltype(layout), ck::tensor_layout::convolution::NHWC>)
{
return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
return HostTensorDescriptor({N_, C_, H, W}, {C_ * H * W, 1_uz, W * C_, C_});
}
};
......@@ -193,8 +193,8 @@ bool pool_test(bool do_verification,
Tensor<IndexDataType> out_indices_n_c_ho_wo_device(
f_host_tensor_descriptor(N, C, Ho, Wo, OutLayout{}));
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.mDesc << std::endl;
std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.GetDesc() << std::endl;
std::cout << "out_n_c_ho_wo: " << out_n_c_ho_wo_host.GetDesc() << std::endl;
switch(init_method)
{
......@@ -204,28 +204,27 @@ bool pool_test(bool do_verification,
default: in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{-5.0, 5.0});
}
DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpaceSize());
DeviceMem out_device_buf(sizeof(OutDataType) *
out_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
DeviceMem out_indices_device_buf(sizeof(IndexDataType) *
out_indices_n_c_ho_wo_device.mDesc.GetElementSpaceSize());
DeviceMem in_device_buf(in_n_c_hi_wi.GetMemorySize());
DeviceMem out_device_buf(out_n_c_ho_wo_device.GetMemorySize());
DeviceMem out_indices_device_buf(out_indices_n_c_ho_wo_device.GetMemorySize());
in_device_buf.ToDevice(in_n_c_hi_wi.data());
in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
using ck::utils::to_array;
auto pool = DevicePoolFwdInstance{};
auto invoker_ptr = pool.MakeInvokerPointer();
auto argument_ptr = pool.MakeArgumentPointer(
static_cast<InDataType*>(in_device_buf.GetDeviceBuffer()),
static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
static_cast<IndexDataType*>(out_indices_device_buf.GetDeviceBuffer()),
N,
C,
std::array<ck::index_t, 2>{{Hi, Wi}},
std::array<ck::index_t, 2>{{Y, X}},
std::array<ck::index_t, 2>{{Ho, Wo}},
window_strides,
input_left_pads,
input_right_pads);
auto argument_ptr = pool.MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),
out_device_buf.GetDeviceBuffer(),
out_indices_device_buf.GetDeviceBuffer(),
N,
C,
to_array({Hi, Wi}),
to_array({Y, X}),
to_array({Ho, Wo}),
window_strides,
input_left_pads,
input_right_pads);
if(!pool.IsSupportedArgument(argument_ptr.get()))
{
......@@ -265,16 +264,16 @@ bool pool_test(bool do_verification,
input_left_pads,
input_right_pads);
out_device_buf.FromDevice(out_n_c_ho_wo_device.mData.data());
out_device_buf.FromDevice(out_n_c_ho_wo_device.data());
pass = pass && ck::utils::check_err(out_n_c_ho_wo_device.mData, out_n_c_ho_wo_host.mData);
pass = pass && ck::utils::check_err(out_n_c_ho_wo_device, out_n_c_ho_wo_host);
if constexpr(OutputIndex)
{
out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.mData.data());
out_indices_device_buf.FromDevice(out_indices_n_c_ho_wo_device.data());
pass = pass && ck::utils::check_err(out_indices_n_c_ho_wo_device.mData,
out_indices_n_c_ho_wo_host.mData);
pass = pass &&
ck::utils::check_err(out_indices_n_c_ho_wo_device, out_indices_n_c_ho_wo_host);
};
}
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
......@@ -12,11 +12,12 @@
#include "ck/tensor_operation/gpu/device/device_gemm_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/literals.hpp"
struct RequantReluRequant
{
......@@ -155,17 +156,17 @@ int main(int argc, char* argv[])
exit(0);
}
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -174,9 +175,9 @@ int main(int argc, char* argv[])
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
std::cout << "a_m_k: " << a_m_k.GetDesc() << std::endl;
std::cout << "b_k_n: " << b_k_n.GetDesc() << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.GetDesc() << std::endl;
switch(init_method)
{
......@@ -190,12 +191,12 @@ int main(int argc, char* argv[])
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
}
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
DeviceMem a_m_k_device_buf(a_m_k.GetMemorySize());
DeviceMem b_k_n_device_buf(b_k_n.GetMemorySize());
DeviceMem c_m_n_device_buf(c_m_n_device_result.GetMemorySize());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
a_m_k_device_buf.ToDevice(a_m_k.data());
b_k_n_device_buf.ToDevice(b_k_n.data());
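// Judging from the lines replaced above, GetMemorySize() stands in for
// sizeof(DataType) * mDesc.GetElementSpaceSize() and data() for mData.data().
// Free-function sketch of that presumed equivalence (illustrative only):
#include <cstddef>
#include "ck/library/utility/host_tensor.hpp"

template <typename DataType>
std::size_t tensor_memory_size_sketch(const Tensor<DataType>& tensor)
{
    return sizeof(DataType) * tensor.mDesc.GetElementSpaceSize();
}

template <typename DataType>
DataType* tensor_data_sketch(Tensor<DataType>& tensor)
{
    return tensor.mData.data();
}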
auto a_element_op = PassThrough{};
auto b_element_op = PassThrough{};
......@@ -204,9 +205,9 @@ int main(int argc, char* argv[])
// do GEMM
auto gemm = DeviceGemmInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
auto argument = gemm.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
b_k_n_device_buf.GetDeviceBuffer(),
c_m_n_device_buf.GetDeviceBuffer(),
M,
N,
K,
......@@ -237,7 +238,7 @@ int main(int argc, char* argv[])
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());
c_m_n_device_buf.FromDevice(c_m_n_device_result.data());
if(do_verification)
{
......@@ -249,7 +250,7 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
return ck::utils::check_err(c_m_n_device_result.mData, c_m_n_host_result.mData) ? 0 : 1;
return ck::utils::check_err(c_m_n_device_result, c_m_n_host_result) ? 0 : 1;
}
return 0;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <initializer_list>
#include <iostream>
#include <numeric>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/literals.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using BF16 = ck::bhalf_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
#include "common.hpp"
using ADataType = BF16;
using BDataType = BF16;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
#include "common.hpp"
using ADataType = F16;
using BDataType = F16;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
#include "common.hpp"
using ADataType = F32;
using BDataType = F32;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
#include "common.hpp"
using ADataType = ck::int4_t;
using BDataType = ck::int4_t;
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_grouped_gemm_xdl.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
#include "common.hpp"
using ADataType = int8_t;
using BDataType = int8_t;
......
......@@ -50,17 +50,17 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
gemm_descs.push_back({M, N, K, stride_A, stride_B, stride_C, {}});
}
using namespace ck::literals;
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
return HostTensorDescriptor({row, col}, {stride, 1_uz});
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
return HostTensorDescriptor({row, col}, {1_uz, stride});
}
};
......@@ -90,27 +90,27 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
a_tensors.push_back(Tensor<ADataType>(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].K_, gemm_descs[i].stride_A_, ALayout{})));
b_tensors.push_back(Tensor<BDataType>(f_host_tensor_descriptor(
gemm_descs[i].K_, gemm_descs[i].N_, gemm_descs[i].stride_B_, BLayout{})));
c_host_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{})));
a_tensors.emplace_back(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].K_, gemm_descs[i].stride_A_, ALayout{}));
b_tensors.emplace_back(f_host_tensor_descriptor(
gemm_descs[i].K_, gemm_descs[i].N_, gemm_descs[i].stride_B_, BLayout{}));
c_host_tensors.emplace_back(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{}));
#ifdef BUILD_INT4_EXAMPLE
c_device_tensors.push_back(Tensor<KernelEDataType>(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{})));
c_device_tensors.emplace_back(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{}));
#else
c_device_tensors.push_back(Tensor<EDataType>(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{})));
c_device_tensors.emplace_back(f_host_tensor_descriptor(
gemm_descs[i].M_, gemm_descs[i].N_, gemm_descs[i].stride_C_, ELayout{}));
#endif
std::cout << "gemm[" << i << "] a_m_k: " << a_tensors[i].mDesc
<< " b_k_n: " << b_tensors[i].mDesc << " c_m_n: " << c_device_tensors[i].mDesc
<< std::endl;
std::cout << "gemm[" << i << "] a_m_k: " << a_tensors[i].GetDesc()
<< " b_k_n: " << b_tensors[i].GetDesc()
<< " c_m_n: " << c_device_tensors[i].GetDesc() << std::endl;
flop += std::size_t(2) * gemm_descs[i].M_ * gemm_descs[i].K_ * gemm_descs[i].N_;
num_btype += sizeof(ADataType) * a_tensors[i].mDesc.GetElementSize() +
sizeof(BDataType) * b_tensors[i].mDesc.GetElementSize() +
sizeof(EDataType) * c_device_tensors[i].mDesc.GetElementSize();
num_btype += sizeof(ADataType) * a_tensors[i].GetElementSize() +
sizeof(BDataType) * b_tensors[i].GetElementSize() +
sizeof(EDataType) * c_device_tensors[i].GetElementSize();
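// Hedged sketch (not this file's verbatim code): the flop and num_btype
// counters accumulated above typically become the "TFlops" and "GB/s" figures
// in the Perf printout, assuming ave_time is reported in milliseconds.
#include <cstddef>

inline float tflops_from_sketch(std::size_t flop, float ave_time_ms)
{
    return static_cast<float>(flop) / 1.e9f / ave_time_ms; // FLOP / (1e9 * ms) = TFLOP/s
}

inline float gb_per_sec_from_sketch(std::size_t num_btype, float ave_time_ms)
{
    return static_cast<float>(num_btype) / 1.e6f / ave_time_ms; // bytes / (1e6 * ms) = GB/s
}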
switch(config.init_method)
{
......@@ -131,22 +131,20 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
a_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(ADataType) * a_tensors[i].mDesc.GetElementSpaceSize()));
b_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(BDataType) * b_tensors[i].mDesc.GetElementSpaceSize()));
c_tensors_device.emplace_back(std::make_unique<DeviceMem>(
sizeof(EDataType) * c_device_tensors[i].mDesc.GetElementSpaceSize()));
a_tensors_device.emplace_back(std::make_unique<DeviceMem>(a_tensors[i].GetMemorySize()));
b_tensors_device.emplace_back(std::make_unique<DeviceMem>(b_tensors[i].GetMemorySize()));
c_tensors_device.emplace_back(
std::make_unique<DeviceMem>(c_device_tensors[i].GetMemorySize()));
#ifdef BUILD_INT4_EXAMPLE
const Tensor<KernelADataType> a_converted(a_tensors[i]);
const Tensor<KernelBDataType> b_converted(b_tensors[i]);
a_tensors_device[i]->ToDevice(a_converted.mData.data());
b_tensors_device[i]->ToDevice(b_converted.mData.data());
a_tensors_device[i]->ToDevice(a_converted.data());
b_tensors_device[i]->ToDevice(b_converted.data());
#else
a_tensors_device[i]->ToDevice(a_tensors[i].mData.data());
b_tensors_device[i]->ToDevice(b_tensors[i].mData.data());
a_tensors_device[i]->ToDevice(a_tensors[i].data());
b_tensors_device[i]->ToDevice(b_tensors[i].data());
#endif
p_a.push_back(a_tensors_device[i]->GetDeviceBuffer());
......@@ -193,7 +191,7 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
for(std::size_t i = 0; i < gemm_descs.size(); i++)
{
c_tensors_device[i]->FromDevice(c_device_tensors[i].mData.data());
c_tensors_device[i]->FromDevice(c_device_tensors[i].data());
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
......@@ -208,10 +206,10 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
#ifdef BUILD_INT4_EXAMPLE
const Tensor<EDataType> c_device_result_converted(c_device_tensors[i]);
pass &= ck::utils::check_err(c_device_result_converted.mData, c_host_tensors[i].mData);
pass &= ck::utils::check_err(c_device_result_converted, c_host_tensors[i]);
#else
pass &= ck::utils::check_err(c_device_tensors[i].mData, c_host_tensors[i].mData);
pass &= ck::utils::check_err(c_device_tensors[i], c_host_tensors[i]);
#endif
}
}
......