Commit ef326c73 authored by Alan Turner's avatar Alan Turner
Browse files

Merge remote-tracking branch 'origin/develop' into migraphx-update

parents b7775add e4dfe4d8
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
#include "ck/host/headers.hpp"
#include "ck/host/stringutils.hpp"
#include "ck/host/utils.hpp"
#include "common.hpp"
#include "ck/tensor_operation/gpu/device/helper.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include <test.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/hip.hpp>
#include <fstream>
// need this for validation
/**struct Epilogue
{
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename D>
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
}
float alpha_;
float beta_;
};**/
// Source template for the runtime-compiled translation unit. The test below substitutes
// ${include} with the problem's include header and ${template} with the solution's template
// string via ck::host::InterpolateString.
const std::string conv_compile_check = R"__ck__(
#include <${include}>
${template};
)__ck__";
// Runtime-compiles every solution generated for one grouped 2D forward-convolution problem,
// launches each kernel on the same device buffers, and checks that all instances agree with the
// output of the first instance.
TEST_CASE(test_problem_kernel)
{
// set up problem specification
ck::host::conv::Problem_Conv_Fwd prob;
prob.NumDim = 2;
prob.G = 32;
prob.N = 256;
prob.C = 32;
prob.K = 64;
prob.Y = 3;
prob.X = 3;
prob.Hi = 28;
prob.Wi = 28;
prob.Ho = 28;
prob.Wo = 28;
// NOTE(review): with the 3x3 filter, stride 1 and zero padding configured below, a 28x28 input
// would normally produce a 26x26 output, yet Ho/Wo are 28 -- confirm this is intended.
// Comparator that keeps the first output it is given and compares every later output to it.
check_all<ck::half_t> check;
// user provided fusion operations
std::string epilogue = R"(
struct Epilogue
{
__host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename D>
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
}
float alpha_;
float beta_;
};
)";
// no prologue operation is fused in this test
std::string prologue = "";
// length+stride arrays
// Lengths are ordered (G, N, C, Hi, Wi) for the input, (G, N, K, Ho, Wo) for the output and
// (G, K, C, Y, X) for the weights. The strides give the channel dimension (C, or K for the
// output) unit stride; for input/output G is the next-fastest dimension, then W, H and N,
// while for the weights the order is C, X, Y, K, G.
ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
static_cast<int>(prob.N),
static_cast<int>(prob.C),
static_cast<int>(prob.Hi),
static_cast<int>(prob.Wi)};
ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
static_cast<int>(prob.N),
static_cast<int>(prob.K),
static_cast<int>(prob.Ho),
static_cast<int>(prob.Wo)};
ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
static_cast<int>(prob.K),
static_cast<int>(prob.C),
static_cast<int>(prob.Y),
static_cast<int>(prob.X)};
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
1,
static_cast<int>(prob.Wi * prob.G * prob.C),
static_cast<int>(prob.G * prob.C)};
ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
1,
static_cast<int>(prob.Wo * prob.G * prob.K),
static_cast<int>(prob.G * prob.K)};
ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
static_cast<int>(prob.Y * prob.X * prob.C),
1,
static_cast<int>(prob.X * prob.C),
static_cast<int>(prob.C)};
// convolution parameters for the two spatial dimensions: unit stride, unit dilation, no padding
ck::Array<ck::index_t, 2> conv_filter_strides = {1, 1};
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
ck::Array<ck::index_t, 2> input_left_pads = {0, 0};
ck::Array<ck::index_t, 2> input_right_pads = {0, 0};
// move the data onto the device
// (the last argument of generate_buffer is the random seed, so each tensor gets different data)
auto in_dev =
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
auto wei_dev =
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
auto out_dev =
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));
// CK Verification: Reference Kernel (currently disabled)
/**bool pass = true;
Tensor<ck::half_t> in_host(in_lengths, in_strides);
in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
Tensor<ck::half_t> out_host(out_lengths, out_strides);
std::vector<ck::index_t> conv_filter_strides_ = {1, 1};
std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
std::vector<ck::index_t> input_left_pads_ = {0, 0};
std::vector<ck::index_t> input_right_pads_ = {0, 0};
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
Epilogue>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_host,
wei_host,
out_host,
conv_filter_strides_,
conv_filter_dilations_,
input_left_pads_,
input_right_pads_,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
Epilogue{1.0f, 1.0f});
out_host.SetZero();
ref_invoker.Run(ref_argument);**/
// iterate over every solution generated for the "gfx908" target string
for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
{
// substitute instance values into the template
auto src = ck::host::InterpolateString(
conv_compile_check,
{{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});
// compile the generated source together with the CK headers
auto srcs = get_headers_for_test();
srcs.push_back({"main.cpp", src});
rtc::compile_options options;
// the kernel entry point is named "run_" followed by the solution's "name" parameter
auto name = solution.GetTemplateParameter<std::string>("name");
options.kernel_name = "run_" + name;
auto k = rtc::compile_kernel(srcs, options);
// Grid size calculation
// the value returned by get_launch_params is scaled by in_lengths[1], i.e. the batch size N
auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
auto tmp = get_launch_params(solution, out_lengths, out_strides);
auto grid_size = tmp * in_lengths[1];
// launch the kernel with arguments needed for the argument pointer
// NOTE(review): the launch arguments look like (stream, global size, local size) -- confirm
// against rtc::kernel::launch.
k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
wei_dev.data(),
out_dev.data(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
out_lengths,
out_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
// auto res = rtc::from_gpu(out_dev);
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
// assert(pass);
// Simple check: this checks that the output from each instance matches the output from the
// first instance
CHECK(report(solution, check(rtc::from_gpu(out_dev))));
}
}
// Test-binary entry point: forwards the command line to the test driver.
int main(int argc, const char* argv[]) { test::run(argc, argv); }
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
#include "ck/host/headers.hpp"
#include "ck/host/stringutils.hpp"
#include "ck/host/utils.hpp"
#include "ck/tensor_operation/gpu/device/helper.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "common.hpp"
#include <test.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/hip.hpp>
#include <fstream>
// need this for verification
/**struct Epilogue
{
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename D>
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
}
float alpha_;
float beta_;
};**/
// Source template for the runtime-compiled translation unit. The test below substitutes
// ${include} with the problem's include header and ${template} with the solution's template
// string via ck::host::InterpolateString.
const std::string conv_compile_check = R"__ck__(
#include <${include}>
${template};
)__ck__";
// Runtime-compiles every solution generated for one grouped 2D forward-convolution problem
// (filter stride 2 variant), launches each kernel on the same device buffers, and checks that
// all instances agree with the output of the first instance.
TEST_CASE(test_problem_kernel)
{
// set up problem specification
ck::host::conv::Problem_Conv_Fwd prob;
prob.NumDim = 2;
prob.G = 32;
prob.N = 256;
prob.C = 32;
prob.K = 64;
prob.Y = 3;
prob.X = 3;
prob.Hi = 28;
prob.Wi = 28;
prob.Ho = 28;
prob.Wo = 28;
// NOTE(review): with the 3x3 filter, stride 2 and zero padding configured below, a 28x28 input
// would normally produce a 13x13 output, yet Ho/Wo are 28 -- confirm this is intended.
// Comparator that keeps the first output it is given and compares every later output to it.
check_all<ck::half_t> check;
// user provided fusion operations
std::string epilogue = R"(
struct Epilogue
{
__host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename D>
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
}
float alpha_;
float beta_;
};
)";
// no prologue operation is fused in this test
std::string prologue = "";
// length+stride arrays
// Lengths are ordered (G, N, C, Hi, Wi) for the input, (G, N, K, Ho, Wo) for the output and
// (G, K, C, Y, X) for the weights. The strides give the channel dimension (C, or K for the
// output) unit stride; for input/output G is the next-fastest dimension, then W, H and N,
// while for the weights the order is C, X, Y, K, G.
ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
static_cast<int>(prob.N),
static_cast<int>(prob.C),
static_cast<int>(prob.Hi),
static_cast<int>(prob.Wi)};
ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
static_cast<int>(prob.N),
static_cast<int>(prob.K),
static_cast<int>(prob.Ho),
static_cast<int>(prob.Wo)};
ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
static_cast<int>(prob.K),
static_cast<int>(prob.C),
static_cast<int>(prob.Y),
static_cast<int>(prob.X)};
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
1,
static_cast<int>(prob.Wi * prob.G * prob.C),
static_cast<int>(prob.G * prob.C)};
ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
1,
static_cast<int>(prob.Wo * prob.G * prob.K),
static_cast<int>(prob.G * prob.K)};
ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
static_cast<int>(prob.Y * prob.X * prob.C),
1,
static_cast<int>(prob.X * prob.C),
static_cast<int>(prob.C)};
// convolution parameters for the two spatial dimensions: stride 2, unit dilation, no padding
ck::Array<ck::index_t, 2> conv_filter_strides = {2, 2};
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
ck::Array<ck::index_t, 2> input_left_pads = {0, 0};
ck::Array<ck::index_t, 2> input_right_pads = {0, 0};
// move the data onto the device
// (the last argument of generate_buffer is the random seed, so each tensor gets different data)
auto in_dev =
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
auto wei_dev =
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
auto out_dev =
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));
// CK Verification: Reference Kernel (currently disabled)
/**bool pass = true;
Tensor<ck::half_t> in_host(in_lengths, in_strides);
in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
Tensor<ck::half_t> out_host(out_lengths, out_strides);
std::vector<ck::index_t> conv_filter_strides_ = {2, 2};
std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
std::vector<ck::index_t> input_left_pads_ = {0, 0};
std::vector<ck::index_t> input_right_pads_ = {0, 0};
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
Epilogue>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_host,
wei_host,
out_host,
conv_filter_strides_,
conv_filter_dilations_,
input_left_pads_,
input_right_pads_,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
Epilogue{1.0f, 1.0f});
out_host.SetZero();
ref_invoker.Run(ref_argument);**/
// iterate over every solution generated for the "gfx908" target string
for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
{
// substitute instance values into the template
auto src = ck::host::InterpolateString(
conv_compile_check,
{{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});
// compile the generated source together with the CK headers
auto srcs = get_headers_for_test();
srcs.push_back({"main.cpp", src});
rtc::compile_options options;
// the kernel entry point is named "run_" followed by the solution's "name" parameter
auto name = solution.GetTemplateParameter<std::string>("name");
options.kernel_name = "run_" + name;
auto k = rtc::compile_kernel(srcs, options);
// Grid size calculation
// the value returned by get_launch_params is scaled by in_lengths[1], i.e. the batch size N
auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
auto tmp = get_launch_params(solution, out_lengths, out_strides);
auto grid_size = tmp * in_lengths[1];
// launch the kernel with arguments needed for the argument pointer
// NOTE(review): the launch arguments look like (stream, global size, local size) -- confirm
// against rtc::kernel::launch.
k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
wei_dev.data(),
out_dev.data(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
out_lengths,
out_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
// auto res = rtc::from_gpu(out_dev);
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
// assert(pass);
// Simple check: this checks that the output from each instance matches the output from the
// first instance
CHECK(report(solution, check(rtc::from_gpu(out_dev))));
}
}
// Test-binary entry point: forwards the command line to the test driver.
int main(int argc, const char* argv[]) { test::run(argc, argv); }
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
#include "ck/host/headers.hpp"
#include "ck/host/stringutils.hpp"
#include "ck/host/utils.hpp"
#include "ck/tensor_operation/gpu/device/helper.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "common.hpp"
#include <test.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/hip.hpp>
#include <fstream>
// need this for verification
/**struct Epilogue
{
Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename D>
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
}
float alpha_;
float beta_;
};**/
// Source template for the runtime-compiled translation unit. The test below substitutes
// ${include} with the problem's include header and ${template} with the solution's template
// string via ck::host::InterpolateString.
const std::string conv_compile_check = R"__ck__(
#include <${include}>
${template};
)__ck__";
// Runtime-compiles every solution generated for one grouped 2D forward-convolution problem
// (padding 1 variant), launches each kernel on the same device buffers, and checks that all
// instances agree with the output of the first instance.
TEST_CASE(test_problem_kernel)
{
// set up problem specification
ck::host::conv::Problem_Conv_Fwd prob;
prob.NumDim = 2;
prob.G = 32;
prob.N = 256;
prob.C = 32;
prob.K = 64;
prob.Y = 3;
prob.X = 3;
prob.Hi = 28;
prob.Wi = 28;
prob.Ho = 28;
prob.Wo = 28;
// Comparator that keeps the first output it is given and compares every later output to it.
check_all<ck::half_t> check;
// user provided fusion operations
std::string epilogue = R"(
struct Epilogue
{
__host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
template <typename E, typename D>
__host__ __device__ constexpr void operator()(E& e, const D& d) const;
template <>
__host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
const ck::half_t& d) const
{
e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
}
float alpha_;
float beta_;
};
)";
// no prologue operation is fused in this test
std::string prologue = "";
// length+stride arrays
// Lengths are ordered (G, N, C, Hi, Wi) for the input, (G, N, K, Ho, Wo) for the output and
// (G, K, C, Y, X) for the weights. The strides give the channel dimension (C, or K for the
// output) unit stride; for input/output G is the next-fastest dimension, then W, H and N,
// while for the weights the order is C, X, Y, K, G.
ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
static_cast<int>(prob.N),
static_cast<int>(prob.C),
static_cast<int>(prob.Hi),
static_cast<int>(prob.Wi)};
ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
static_cast<int>(prob.N),
static_cast<int>(prob.K),
static_cast<int>(prob.Ho),
static_cast<int>(prob.Wo)};
ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
static_cast<int>(prob.K),
static_cast<int>(prob.C),
static_cast<int>(prob.Y),
static_cast<int>(prob.X)};
ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
1,
static_cast<int>(prob.Wi * prob.G * prob.C),
static_cast<int>(prob.G * prob.C)};
ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
1,
static_cast<int>(prob.Wo * prob.G * prob.K),
static_cast<int>(prob.G * prob.K)};
ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
static_cast<int>(prob.Y * prob.X * prob.C),
1,
static_cast<int>(prob.X * prob.C),
static_cast<int>(prob.C)};
// convolution parameters for the two spatial dimensions: unit stride, unit dilation, and a
// padding of 1 on both sides
ck::Array<ck::index_t, 2> conv_filter_strides = {1, 1};
ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
ck::Array<ck::index_t, 2> input_left_pads = {1, 1};
ck::Array<ck::index_t, 2> input_right_pads = {1, 1};
// move the data onto the device
// (the last argument of generate_buffer is the random seed, so each tensor gets different data)
auto in_dev =
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
auto wei_dev =
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
auto out_dev =
to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));
// CK Verification: Reference Kernel (currently disabled)
/**bool pass = true;
Tensor<ck::half_t> in_host(in_lengths, in_strides);
in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
Tensor<ck::half_t> out_host(out_lengths, out_strides);
std::vector<ck::index_t> conv_filter_strides_ = {1, 1};
std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
std::vector<ck::index_t> input_left_pads_ = {1, 1};
std::vector<ck::index_t> input_right_pads_ = {1, 1};
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
2,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
Epilogue>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in_host,
wei_host,
out_host,
conv_filter_strides_,
conv_filter_dilations_,
input_left_pads_,
input_right_pads_,
ck::tensor_operation::element_wise::PassThrough{},
ck::tensor_operation::element_wise::PassThrough{},
Epilogue{1.0f, 1.0f});
out_host.SetZero();
ref_invoker.Run(ref_argument);**/
// iterate over every solution generated for the "gfx908" target string
for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
{
// substitute instance values into the template
auto src = ck::host::InterpolateString(
conv_compile_check,
{{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});
// compile the generated source together with the CK headers
auto srcs = get_headers_for_test();
srcs.push_back({"main.cpp", src});
rtc::compile_options options;
// the kernel entry point is named "run_" followed by the solution's "name" parameter
auto name = solution.GetTemplateParameter<std::string>("name");
options.kernel_name = "run_" + name;
auto k = rtc::compile_kernel(srcs, options);
// Grid size calculation
// the value returned by get_launch_params is scaled by in_lengths[1], i.e. the batch size N
auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
auto tmp = get_launch_params(solution, out_lengths, out_strides);
auto grid_size = tmp * in_lengths[1];
// launch the kernel with arguments needed for the argument pointer
// NOTE(review): the launch arguments look like (stream, global size, local size) -- confirm
// against rtc::kernel::launch.
k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
wei_dev.data(),
out_dev.data(),
in_lengths,
in_strides,
wei_lengths,
wei_strides,
out_lengths,
out_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads);
// auto res = rtc::from_gpu(out_dev);
// pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
// assert(pass);
// Simple check: this checks that the output from each instance matches the output from the
// first instance
CHECK(report(solution, check(rtc::from_gpu(out_dev))));
}
}
// Test-binary entry point: forwards the command line to the test driver.
int main(int argc, const char* argv[]) { test::run(argc, argv); }
#pragma once
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <numeric>
#include <random>
#include <string>
#include <unordered_set>
#include <vector>
#include <test.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/hip.hpp>
std::vector<rtc::src_file> get_headers_for_test()
{
std::vector<rtc::src_file> result;
auto hs = ck::host::GetHeaders();
std::transform(
hs.begin(), hs.end(), std::back_inserter(result), [&](const auto& p) -> rtc::src_file {
return {p.first, p.second};
});
return result;
}
// Computes the number of elements a buffer must hold to back a strided tensor: one plus, for
// every dimension, (length - 1) * stride. Dimensions of length 0 are skipped.
//
// mLens    - per-dimension lengths; must provide Size() and operator[]
// mStrides - per-dimension strides, indexed the same way as mLens
//
// The product is evaluated in std::size_t rather than in the element type, so large tensors
// no longer overflow a 32-bit index type before the value is widened.
template <typename V>
std::size_t GetSize(V mLens, V mStrides)
{
    std::size_t space = 1;
    for(std::size_t i = 0; i < mLens.Size(); ++i)
    {
        if(mLens[i] == 0)
            continue;
        space += static_cast<std::size_t>(mLens[i] - 1) * static_cast<std::size_t>(mStrides[i]);
    }
    return space;
}
// Allocates a buffer large enough for a tensor with the given lengths and strides (as computed
// by GetSize) and fills every element with a pseudo-random value.
//
// Values are drawn from std::uniform_real_distribution<double>(-1.0) driven by a std::mt19937
// seeded with `seed`, so identical seeds give identical buffers.
template <class T, typename V>
rtc::buffer<T> generate_buffer(V mLens, V mStrides, std::size_t seed = 0)
{
    rtc::buffer<T> result(GetSize(mLens, mStrides));
    std::mt19937 engine(seed);
    std::uniform_real_distribution<double> uniform(-1.0);
    for(auto& element : result)
    {
        element = uniform(engine);
    }
    return result;
}
// Element-wise approximate comparison of two ranges.
//
// Returns true when both ranges have the same number of elements and every pair (x from a,
// y from b) satisfies |x - y| < atol + rtol * |y|. Elements are converted to double first.
template <class T, class U>
bool allclose(const T& a, const U& b, double atol = 0.01, double rtol = 0.01)
{
    auto lhs = a.begin();
    auto rhs = b.begin();
    for(; lhs != a.end() and rhs != b.end(); ++lhs, ++rhs)
    {
        const double x = *lhs;
        const double y = *rhs;
        const bool close = fabs(x - y) < atol + rtol * fabs(y);
        if(not close)
            return false;
    }
    // a length mismatch leaves exactly one of the iterators short of its end
    return lhs == a.end() and rhs == b.end();
}
// Maps a floating-point value to the name of its std::fpclassify category:
// "inf", "nan", "normal", "subnormal", "zero", or "unknown" for any other category.
std::string classify(double x)
{
    const int category = std::fpclassify(x);
    if(category == FP_INFINITE)
        return "inf";
    if(category == FP_NAN)
        return "nan";
    if(category == FP_NORMAL)
        return "normal";
    if(category == FP_SUBNORMAL)
        return "subnormal";
    if(category == FP_ZERO)
        return "zero";
    return "unknown";
}
// Prints, on one line of std::cout, the distinct floating-point categories (as named by
// classify) that occur among the elements of x, each followed by ", ".
template <class Buffer>
void print_classification(const Buffer& x)
{
    std::unordered_set<std::string> categories;
    std::for_each(
        x.begin(), x.end(), [&](const auto& element) { categories.insert(classify(element)); });
    for(const std::string& label : categories)
    {
        std::cout << label << ", ";
    }
    std::cout << std::endl;
}
// Prints the minimum, maximum, mean and (population) standard deviation of the elements of x
// to std::cout on a single line.
//
// An empty buffer has no statistics; instead of dereferencing end() and dividing by zero, the
// same line is printed with "n/a" in place of every value.
template <class Buffer>
void print_statistics(const Buffer& x)
{
    if(x.begin() == x.end())
    {
        std::cout << "Min value: n/a, Max value: n/a, Mean: n/a, StdDev: n/a\n";
        return;
    }
    std::cout << "Min value: " << *std::min_element(x.begin(), x.end()) << ", ";
    std::cout << "Max value: " << *std::max_element(x.begin(), x.end()) << ", ";
    const double num_elements = x.size();
    const double mean =
        std::accumulate(x.begin(), x.end(), double{0.0}, std::plus<double>{}) / num_elements;
    // sum of squared deviations; a plain multiplication avoids the general-purpose std::pow
    const double squared_deviation =
        std::accumulate(x.begin(), x.end(), double{0.0}, [&](double r, double v) {
            const double delta = v - mean;
            return r + delta * delta;
        });
    const double stddev = std::sqrt(squared_deviation / num_elements);
    std::cout << "Mean: " << mean << ", ";
    std::cout << "StdDev: " << stddev << "\n";
}
// Prints a short preview of x on one line of std::cout: every element when there are at most
// ten, otherwise the first five, the marker "..., ", and the last five. Each element is
// converted to double and followed by ", ".
template <class Buffer>
void print_preview(const Buffer& x)
{
    const auto emit = [](auto first, auto last) {
        for(; first != last; ++first)
        {
            const double value = *first;
            std::cout << value << ", ";
        }
    };
    if(x.size() > 10)
    {
        emit(x.begin(), x.begin() + 5);
        std::cout << "..., ";
        emit(x.end() - 5, x.end());
    }
    else
    {
        emit(x.begin(), x.end());
    }
    std::cout << std::endl;
}
// Stateful comparator used to check that several results agree with each other.
//
// While `data` is empty, the buffer passed in is stored as the reference and the call
// succeeds. Otherwise the buffer is compared against the stored reference with allclose
// (default tolerances).
template <class T>
struct check_all
{
    rtc::buffer<T> data{};
    bool operator()(const rtc::buffer<T>& x)
    {
        if(not data.empty())
            return allclose(data, x);
        // nothing recorded yet: remember this buffer as the reference
        data = x;
        return true;
    }
};
// Wraps a pass/fail flag in a test predicate whose message is the solution's template string,
// so a failing CHECK shows which kernel instance was involved.
template <class Solution>
auto report(const Solution& solution, bool pass)
{
    const auto outcome = [pass] { return pass; };
    return test::make_predicate(solution.ToTemplateString(), outcome);
}
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <atomic>
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <chrono>
#include <functional>
#include <iostream>
#include <sstream>
#include <type_traits>
#include <unordered_map>
#include <vector>
#ifdef __linux__
#include <unistd.h>
#endif
#ifndef MIGRAPHX_GUARD_TEST_TEST_HPP
#define MIGRAPHX_GUARD_TEST_TEST_HPP
namespace test {
// X-macro listing every binary operator supported by the expression templates: m(op, name) is
// invoked once per operator with the operator token and the name of its function object.
// clang-format off
// NOLINTNEXTLINE
#define TEST_FOREACH_BINARY_OPERATORS(m) \
m(==, equal) \
m(!=, not_equal) \
m(<=, less_than_equal) \
m(>=, greater_than_equal) \
m(<, less_than) \
m(>, greater_than) \
m(and, and_op) \
m(or, or_op)
// clang-format on
// Same idea for unary operators; only logical negation is listed.
// clang-format off
// NOLINTNEXTLINE
#define TEST_FOREACH_UNARY_OPERATORS(m) \
m(not, not_op)
// clang-format on
// Defines a struct `name` for a binary operator: as_string() returns the operator's spelling
// and call(x, y) evaluates `x op y`.
// NOLINTNEXTLINE
#define TEST_EACH_BINARY_OPERATOR_OBJECT(op, name) \
struct name \
{ \
static std::string as_string() { return #op; } \
template <class T, class U> \
static decltype(auto) call(T&& x, U&& y) \
{ \
return x op y; \
} \
};
// Defines a struct `name` for a unary operator: as_string() returns the operator's spelling
// and call(x) evaluates `op x`.
// NOLINTNEXTLINE
#define TEST_EACH_UNARY_OPERATOR_OBJECT(op, name) \
struct name \
{ \
static std::string as_string() { return #op; } \
template <class T> \
static decltype(auto) call(T&& x) \
{ \
return op x; \
} \
};
// Instantiate one function object per listed operator (equal, not_equal, ..., not_op).
TEST_FOREACH_BINARY_OPERATORS(TEST_EACH_BINARY_OPERATOR_OBJECT)
TEST_FOREACH_UNARY_OPERATORS(TEST_EACH_UNARY_OPERATOR_OBJECT)
// Identity operator object: it has no textual representation and call(x) hands back its
// argument (returned by value, because the return type is plain `auto`).
struct nop
{
    static std::string as_string() { return std::string{}; }
    template <class T>
    static auto call(T&& x)
    {
        return std::forward<T>(x);
    }
};
// Operator object that invokes its operand: call(x) returns x(), preserving the exact return
// type of the callable. It has no textual representation.
struct function
{
    static std::string as_string() { return std::string{}; }
    template <class T>
    static decltype(auto) call(T&& x)
    {
        return x();
    }
};
// Writes the elements of [start, last) to s, separated by ", ". Declared up front because the
// range inserter below names it in its return type.
template <class Stream, class Iterator>
Stream& stream_range(Stream& s, Iterator start, Iterator last);

// Streams a null pointer literal as the text "nullptr".
template <class Stream>
inline Stream& operator<<(Stream& s, std::nullptr_t)
{
    s << "nullptr";
    return s;
}

// Streams any range exposing begin()/end() as "{ a, b, c}". Types convertible to std::string
// are excluded so that ordinary strings keep their usual formatting.
template <class Stream,
          class Range,
          class = typename std::enable_if<!std::is_convertible<Range, std::string>::value>::type>
inline auto operator<<(Stream& s, const Range& v) -> decltype(stream_range(s, v.begin(), v.end()))
{
    s << "{ ";
    stream_range(s, v.begin(), v.end()) << "}";
    return s;
}

template <class Stream, class Iterator>
inline Stream& stream_range(Stream& s, Iterator start, Iterator last)
{
    if(start == last)
        return s;
    // the first element is written without a separator, all later ones are prefixed by ", "
    s << *start;
    for(auto it = std::next(start); it != last; ++it)
    {
        s << ", " << *it;
    }
    return s;
}
// Base case of value extraction: a plain value evaluates to itself. The expression types
// provide their own get_value overloads that evaluate the wrapped expression instead.
template <class T>
auto get_value(const T& x) -> const T&
{
    return x;
}
template <class T, class Operator = nop>
struct lhs_expression;
template <class T>
lhs_expression<T> make_lhs_expression(T&& lhs);
template <class T, class Operator>
lhs_expression<T, Operator> make_lhs_expression(T&& lhs, Operator);
// Generates a member `operator op` that combines *this with a right-hand operand into a new
// expression tagged with the operator object `name`.
// NOLINTNEXTLINE
#define TEST_EXPR_BINARY_OPERATOR(op, name) \
template <class V> \
auto operator op(const V& rhs2) const \
{ \
return make_expression(*this, rhs2, name{}); /* NOLINT */ \
}
// Generates a member unary `operator op` that re-wraps the member `lhs` in an lhs_expression
// tagged with the operator object `name`.
// NOLINTNEXTLINE
#define TEST_EXPR_UNARY_OPERATOR(op, name) \
auto operator op() const { return make_lhs_expression(lhs, name{}); /* NOLINT */ }
// Binary expression node: holds a left and a right operand plus the operator object that
// combines them. The operands can be printed for diagnostics and evaluated on demand.
template <class T, class U, class Operator>
struct expression
{
T lhs;
U rhs;
// Prints "<lhs> <operator> <rhs>".
friend std::ostream& operator<<(std::ostream& s, const expression& self)
{
s << self.lhs << " " << Operator::as_string() << " " << self.rhs;
return s;
}
// Lets get_value() evaluate nested expressions.
friend decltype(auto) get_value(const expression& e) { return e.value(); }
// Evaluates both operands via get_value and applies the operator to the results.
decltype(auto) value() const { return Operator::call(get_value(lhs), get_value(rhs)); };
// Member operators generated from the operator lists, so expressions can be chained.
TEST_FOREACH_UNARY_OPERATORS(TEST_EXPR_UNARY_OPERATOR)
TEST_FOREACH_BINARY_OPERATORS(TEST_EXPR_BINARY_OPERATOR)
};
// TODO: Remove rvalue references
// Builds an expression node from two operands and an operator tag. The operand types are
// deduced as forwarding references, so lvalue arguments produce reference members.
// Note: the first parameter (named rhs) initialises the node's `lhs` member and the second
// (named lhs) initialises `rhs`; the names are swapped but the order of operands is kept.
template <class T, class U, class Operator>
expression<T, U, Operator> make_expression(T&& rhs, U&& lhs, Operator)
{
    return expression<T, U, Operator>{std::forward<T>(rhs), std::forward<U>(lhs)};
}
// TODO: Remove rvalue reference
// Wraps a value in an lhs_expression that uses the default operator object. T is deduced as a
// forwarding reference, so an lvalue argument yields a reference member.
template <class T>
lhs_expression<T> make_lhs_expression(T&& lhs)
{
    return lhs_expression<T>(std::forward<T>(lhs));
}

// Same as above, but tags the wrapper with the given unary operator object.
template <class T, class Operator>
lhs_expression<T, Operator> make_lhs_expression(T&& lhs, Operator)
{
    return lhs_expression<T, Operator>(std::forward<T>(lhs));
}
// Unary expression node: wraps a single operand together with a unary operator object. It is
// the starting point from which binary expressions are built through the generated operators.
template <class T, class Operator>
struct lhs_expression
{
T lhs;
explicit lhs_expression(T e) : lhs(e) {}
// Prints the operator's spelling (when it has one) followed by the operand.
friend std::ostream& operator<<(std::ostream& s, const lhs_expression& self)
{
std::string op = Operator::as_string();
if(not op.empty())
s << Operator::as_string() << " ";
s << self.lhs;
return s;
}
// Lets get_value() evaluate nested expressions.
friend decltype(auto) get_value(const lhs_expression& e) { return e.value(); }
// Evaluates the operand via get_value and applies the unary operator to it.
decltype(auto) value() const { return Operator::call(get_value(lhs)); }
// Member operators generated from the operator lists.
TEST_FOREACH_BINARY_OPERATORS(TEST_EXPR_BINARY_OPERATOR)
TEST_FOREACH_UNARY_OPERATORS(TEST_EXPR_UNARY_OPERATOR)
// Arithmetic and bitwise operators are not recorded as expression nodes: they are applied to
// the stored operand right away and the result is wrapped in a fresh lhs_expression.
// NOLINTNEXTLINE
#define TEST_LHS_REOPERATOR(op) \
template <class U> \
auto operator op(const U& rhs) const \
{ \
return make_lhs_expression(lhs op rhs); \
}
TEST_LHS_REOPERATOR(+)
TEST_LHS_REOPERATOR(-)
TEST_LHS_REOPERATOR(*)
TEST_LHS_REOPERATOR(/)
TEST_LHS_REOPERATOR(%)
TEST_LHS_REOPERATOR(&)
TEST_LHS_REOPERATOR(|)
TEST_LHS_REOPERATOR(^)
};
// A callable paired with a descriptive message. Streaming a predicate prints the message;
// calling it, or converting it, evaluates the stored callable and yields its result.
template <class F>
struct predicate
{
    std::string msg;
    F f;
    friend std::ostream& operator<<(std::ostream& s, const predicate& self)
    {
        return s << self.msg;
    }
    decltype(auto) operator()() const { return f(); }
    operator decltype(auto)() const { return f(); }
};
// Creates a printable, lazily evaluated check: the callable f is stored in a predicate that
// prints as msg, and that predicate is wrapped in an lhs_expression whose operator object
// (`function`) invokes it when the expression is evaluated.
// The predicate is passed as a temporary on purpose: make_lhs_expression deduces its operand
// type from the value category, so a temporary is stored by value inside the wrapper.
template <class F>
auto make_predicate(const std::string& msg, F f)
{
return make_lhs_expression(predicate<F>{msg, f}, function{});
}
// Renders a boolean as the word "true" or "false".
inline std::string as_string(bool x)
{
    return x ? "true" : "false";
}
// Renders any streamable value as text by inserting it into a string stream.
template <class T>
std::string as_string(const T& x)
{
    std::ostringstream rendered;
    rendered << x;
    return rendered.str();
}
// Renders the elements of [start, last) as text, using stream_range for the formatting.
template <class Iterator>
std::string as_string(Iterator start, Iterator last)
{
    std::stringstream rendered;
    stream_range(rendered, start, last);
    const std::string text = rendered.str();
    return text;
}
// Turns f into a named check. The returned callable accepts any arguments and produces a
// predicate that prints as "name(arg1, arg2, ...)" and evaluates f on copies of those
// arguments when the predicate is invoked.
template <class F>
auto make_function(const std::string& name, F f)
{
    return [=](auto&&... xs) {
        // textual form of every argument, used only for the diagnostic message
        const std::vector<std::string> rendered = {as_string(xs)...};
        const std::string label =
            name + "(" + as_string(rendered.begin(), rendered.end()) + ")";
        return make_predicate(label, [=] { return f(xs...); });
    };
}
// Entry point for turning an ordinary expression into an expression template:
// `capture{}->*value` wraps `value` in an lhs_expression, after which the operators defined on
// lhs_expression take over.
// NOTE(review): presumably used by the assertion macros, which are not visible in this part of
// the file.
struct capture
{
// Wraps an arbitrary value.
template <class T>
auto operator->*(const T& x) const
{
return make_lhs_expression(x);
}
// A value that already is an lhs_expression is passed through without another wrapper.
template <class T, class Operator>
auto operator->*(const lhs_expression<T, Operator>& x) const
{
return x;
}
};
// Text attributes and colours for terminal output. Each enumerator's value is the number that
// the stream inserter for `color` writes between "\033[" and "m".
enum class color
{
reset = 0,
bold = 1,
underlined = 4,
// foreground colours
fg_red = 31,
fg_green = 32,
fg_yellow = 33,
fg_blue = 34,
fg_default = 39,
// background colours
bg_red = 41,
bg_green = 42,
bg_yellow = 43,
bg_blue = 44,
bg_default = 49
};
// Writes the escape sequence "\033[<code>m" for the given colour, but only when standard
// output is a terminal; otherwise (and always on Windows) nothing is written.
// The terminal check is made once, on first use, and always inspects STDOUT_FILENO regardless
// of which stream `os` refers to.
// NOTE(review): <unistd.h> is included only under __linux__ at the top of this file, while
// isatty/STDOUT_FILENO are used here for every non-Windows platform -- confirm that other
// platforms get the declaration from elsewhere.
inline std::ostream& operator<<(std::ostream& os, const color& c)
{
#ifndef _WIN32
static const bool use_color = isatty(STDOUT_FILENO) != 0;
if(use_color)
return os << "\033[" << static_cast<std::size_t>(c) << "m";
#else
(void)c;
#endif
return os;
}
// Returns the process-wide counter of failed checks. It is a function-local static so that it
// is initialised to zero on first use.
inline std::atomic<int>& failures()
{
    // NOLINTNEXTLINE
    static std::atomic<int> failure_count{0};
    return failure_count;
}
// Evaluates the captured expression x and, when it is false, records and reports a failure.
//
// x    - expression object providing value() and a stream inserter
// msg  - source text of the check
// func - name of the enclosing function
// file - source file of the check
// line - source line of the check
// f    - action invoked after the failure has been reported
//
// Nothing happens when the expression evaluates to true.
template <class T, class F>
void failed(T x, const char* msg, const char* func, const char* file, int line, F f)
{
    if(bool(x.value()))
        return;
    failures()++;
    std::cout << func << std::endl;
    std::cout << file << ":" << line << ":" << std::endl;
    std::cout << color::bold << color::fg_red << " FAILED: " << color::reset << msg << " "
              << "[ " << x << " ]" << std::endl;
    f();
}
// Runs f and reports whether it exited by throwing; any exception type counts.
template <class F>
bool throws(F f)
{
    bool raised = false;
    try
    {
        f();
    }
    catch(...)
    {
        raised = true;
    }
    return raised;
}
// Runs f and reports whether it threw an exception of type Exception whose what() text
// contains msg (an empty msg matches any text). Returns false when f completes normally.
// Exceptions of other types are not caught and propagate to the caller.
template <class Exception, class F>
bool throws(F f, const std::string& msg = "")
{
    bool matched = false;
    try
    {
        f();
    }
    catch(const Exception& ex)
    {
        const std::string description(ex.what());
        matched = description.find(msg) != std::string::npos;
    }
    return matched;
}
// Builds a check that passes when |px - py| is strictly smaller than ptol. In diagnostics the
// check is printed under the name "near" together with its three arguments.
template <class T, class U>
auto within_abs(T px, U py, double ptol = 1e-6f)
{
    const auto is_near =
        make_function("near", [](auto x, auto y, auto tol) { return std::abs(x - y) < tol; });
    return is_near(px, py, ptol);
}
// Basic glob matching: `*` matches any number of characters (including none) and `?` matches
// exactly one character. Character classes are not supported.
//
// The text and the pattern are scanned together for as long as they agree. When a `*` is
// reached, the remainder of the pattern is matched recursively against the remainder of the
// text; whenever that attempt fails, one more character of the text is consumed by the star
// and the attempt is repeated (the "star-loop").
//
// This simple recursive approach works well for short strings and patterns with few stars,
// which is the expected case when globbing test names, and it is still significantly faster
// than the equivalent std::regex.
template <class Iterator1, class Iterator2>
bool glob_match(Iterator1 start, Iterator1 last, Iterator2 pattern_start, Iterator2 pattern_last)
{
    // Advance both ranges past the prefix that matches without needing a star.
    const auto diverged =
        std::mismatch(start, last, pattern_start, pattern_last, [](auto c, auto m) {
            // a star always stops the scan so that it can be handled by the loop below
            return m != '*' and (m == '?' or c == m);
        });
    start         = diverged.first;
    pattern_start = diverged.second;
    // Pattern exhausted: it is a match only when the text is exhausted as well.
    if(pattern_start == pattern_last)
        return start == last;
    // Stopped on something other than a star: genuine mismatch.
    if(*pattern_start != '*')
        return false;
    // Consecutive stars are equivalent to a single one.
    while(pattern_start != pattern_last and *pattern_start == '*')
        ++pattern_start;
    // A trailing star matches whatever text is left.
    if(pattern_start == pattern_last)
        return true;
    // star-loop: try the rest of the pattern at every remaining position of the text
    for(;; ++start)
    {
        if(glob_match(start, last, pattern_start, pattern_last))
            return start != last;
        // text exhausted without finding a match
        if(start == last)
            return false;
    }
}
// Maps an option name (or "" for positional arguments) to its values.
using string_map = std::unordered_map<std::string, std::vector<std::string>>;

// Groups the tokens in `as` by the option they belong to.
//
// `keyword(token)` returns an empty container for an ordinary value, which is
// appended to the currently active key (initially ""). For a recognised option
// it returns a non-empty container: the front element is the key recorded in
// the result, and the back element becomes the active key for later values.
template <class Keyword>
string_map generic_parse(std::vector<std::string> as, Keyword keyword)
{
    string_map parsed;
    std::string active;
    for(auto&& token : as)
    {
        auto names = keyword(token);
        if(not names.empty())
        {
            parsed[names.front()]; // Ensure the flag exists
            active = names.back();
            continue;
        }
        parsed[active].push_back(token);
    }
    return parsed;
}
// A registered test body.
using test_case = std::function<void()>;

// Returns the process-wide list of (name, body) pairs. The list is a
// function-local static, so it is created on first use.
inline auto& get_test_cases()
{
    using registry = std::vector<std::pair<std::string, test_case>>;
    // NOLINTNEXTLINE
    static registry cases;
    return cases;
}
// Appends a named test body to the list returned by get_test_cases().
inline void add_test_case(std::string name, test_case f)
{
    auto& registered = get_test_cases();
    registered.emplace_back(std::move(name), std::move(f));
}
// Helper whose constructor registers a test case. TEST_CASE_REGISTER defines
// a static instance of it for each test.
struct auto_register_test_case
{
    // Forwards `name` and the callable `f` to add_test_case().
    template <class F>
    auto_register_test_case(const char* name, F f) noexcept
    {
        add_test_case(std::string(name), test_case(f));
    }
};
// Tag exception thrown by fail().
struct failure_error
{
};

// Throws failure_error; never returns.
[[noreturn]] inline void fail()
{
    throw failure_error();
}
struct driver
{
driver()
{
add_flag({"--help", "-h"}, "Show help");
add_flag({"--list", "-l"}, "List all test cases");
add_flag({"--continue", "-c"}, "Continue after failure");
add_flag({"--quiet", "-q"}, "Don't print out extra output");
}
struct argument
{
std::vector<std::string> flags = {};
std::string help = "";
int nargs = 1;
};
void add_arg(const std::vector<std::string>& flags, const std::string& help = "")
{
arguments.push_back(argument{flags, help, 1});
}
void add_flag(const std::vector<std::string>& flags, const std::string& help = "")
{
arguments.push_back(argument{flags, help, 0});
}
static void wrap(std::ostream& os,
const std::string& text,
const std::string& prefix = "",
unsigned int line_length = 80)
{
std::istringstream iss(text);
std::string line = prefix;
do
{
std::string word;
iss >> word;
if(line.length() + word.length() > line_length)
{
os << line << std::endl;
line = prefix;
}
line += word + " ";
} while(iss);
if(not line.empty())
os << line << std::endl;
}
void show_help(const std::string& exe) const
{
const std::string prefix = " ";
std::cout << std::endl;
std::cout << color::fg_yellow << "USAGE:" << color::reset << std::endl;
std::cout << " ";
std::cout << exe << " <test-case>... <options>" << std::endl;
std::cout << std::endl;
std::cout << color::fg_yellow << "ARGS:" << color::reset << std::endl;
std::cout << " ";
std::cout << color::fg_green << "<test-case>..." << color::reset;
std::cout << std::endl;
wrap(std::cout,
"Test cases to run. A test case can be either the exact test case name or a glob. A "
"glob expression uses a '*' to select zero or more characters or a '?' to select any "
"single character.",
prefix + prefix);
std::cout << std::endl;
std::cout << color::fg_yellow << "OPTIONS:" << color::reset << std::endl;
for(auto&& arg : arguments)
{
std::cout << color::fg_green;
std::string arg_prefix = prefix;
for(const std::string& a : arg.flags)
{
std::cout << arg_prefix;
std::cout << a;
arg_prefix = ", ";
}
std::cout << color::reset << std::endl;
wrap(std::cout, arg.help, prefix + prefix);
}
}
std::ostream& out() const
{
struct null_buffer : std::streambuf
{
virtual int overflow(int c) override { return c; }
};
static null_buffer buffer;
static std::ostream null_stream(&buffer);
if(quiet)
return null_stream;
return std::cout;
}
string_map parse(int argc, const char* argv[]) const
{
std::vector<std::string> args(argv + 1, argv + argc);
string_map keys;
for(auto&& arg : arguments)
{
for(auto&& flag : arg.flags)
{
keys[flag] = {arg.flags.front()};
if(arg.nargs == 0)
keys[flag].push_back("");
}
}
auto result = generic_parse(args, [&](auto&& s) -> std::vector<std::string> {
if(keys.count(s) > 0)
return keys[s];
else
return {};
});
result["__exe__"].push_back(argv[0]);
return result;
}
static std::string create_command(const string_map& args)
{
std::stringstream ss;
ss << args.at("__exe__").front();
if(args.count("") > 0)
{
for(auto&& arg : args.at(""))
ss << " \"" << arg << "\"";
}
for(auto&& p : args)
{
if(p.first == "__exe__")
continue;
if(p.first.empty())
continue;
ss << " " << p.first;
for(auto&& arg : p.second)
ss << " \"" << arg << "\"";
}
return ss.str();
}
static std::string fork(const std::string& name, string_map args)
{
std::string msg;
args[""] = {name};
args.erase("--continue");
args["--quiet"];
auto cmd = create_command(args);
auto r = std::system(cmd.c_str()); // NOLINT
if(r != 0)
msg = "Exited with " + std::to_string(r);
return msg;
}
static std::vector<std::pair<std::string, test_case>> glob_tests(const std::string& pattern)
{
std::vector<std::pair<std::string, test_case>> result;
std::copy_if(get_test_cases().begin(),
get_test_cases().end(),
std::back_inserter(result),
[&](auto&& p) {
return glob_match(
p.first.begin(), p.first.end(), pattern.begin(), pattern.end());
});
return result;
}
void run_test_case(const std::string& name, const test_case& f, const string_map& args)
{
ran++;
out() << color::fg_green << "[ RUN ] " << color::reset << color::bold << name
<< color::reset << std::endl;
std::string msg;
auto start = std::chrono::steady_clock::now();
if(args.count("--continue") > 0)
{
msg = fork(name, args);
}
else
{
try
{
failures() = 0;
f();
}
// cppcheck-suppress migraphx-EmptyCatchStatement
catch(const failure_error&)
{
}
}
auto finish = std::chrono::steady_clock::now();
auto elapsed_ms =
std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(finish - start)
.count();
if(msg.empty() and failures() != 0)
{
if(failures() == 1)
msg = "Test failure";
else
msg = std::to_string(failures()) + " test failures";
}
if(msg.empty())
{
out() << color::fg_green << "[ COMPLETE ] " << color::reset;
}
else
{
failed.push_back(name);
out() << color::fg_red << "[ FAILED ] " << color::reset;
}
out() << color::bold << name << color::reset;
out() << color::fg_blue << " (" << elapsed_ms << "ms)" << color::reset;
if(not msg.empty())
out() << ": " << color::fg_yellow << msg << color::reset;
out() << std::endl;
}
void run(int argc, const char* argv[])
{
auto args = parse(argc, argv);
if(args.count("--help") > 0)
{
show_help(args.at("__exe__").front());
return;
}
if(args.count("--list") > 0)
{
for(auto&& tc : get_test_cases())
out() << tc.first << std::endl;
return;
}
if(args.count("--quiet") > 0)
quiet = true;
auto cases = args[""];
if(cases.empty())
{
for(auto&& tc : get_test_cases())
run_test_case(tc.first, tc.second, args);
}
else
{
std::unordered_map<std::string, test_case> m(get_test_cases().begin(),
get_test_cases().end());
for(auto&& iname : cases)
{
std::vector<std::pair<std::string, test_case>> found_cases;
for(auto&& pattern : get_case_names(iname))
{
auto f = m.find(pattern);
if(f == m.end())
{
found_cases = glob_tests(pattern);
}
else
{
found_cases.push_back(*f);
}
}
if(found_cases.empty())
{
out() << color::fg_red << "[ ERROR ] Test case '" << iname << "' not found."
<< color::reset << std::endl;
failed.push_back(iname);
}
for(auto&& p : found_cases)
run_test_case(p.first, p.second, args);
}
}
out() << color::fg_green << "[==========] " << color::fg_yellow << ran << " tests ran"
<< color::reset << std::endl;
if(not failed.empty())
{
out() << color::fg_red << "[ FAILED ] " << color::fg_yellow << failed.size()
<< " tests failed" << color::reset << std::endl;
for(auto&& name : failed)
out() << color::fg_red << "[ FAILED ] " << color::fg_yellow << name
<< color::reset << std::endl;
std::exit(1);
}
}
std::function<std::vector<std::string>(const std::string&)> get_case_names =
[](const std::string& name) -> std::vector<std::string> { return {name}; };
std::vector<argument> arguments = {};
std::vector<std::string> failed = {};
std::size_t ran = 0;
bool quiet = false;
};
inline void run(int argc, const char* argv[])
{
driver d{};
d.run(argc, argv);
}
} // namespace test
// Wraps an expression by applying `->*` to a test::capture object.
// NOLINTNEXTLINE
#define TEST_CAPTURE(...) test::capture{}->*__VA_ARGS__
// Non-fatal check: passes the captured expression, its source text and its
// location to test::failed together with an empty callback, so the test keeps
// running after a failure is reported.
// NOLINTNEXTLINE
#define CHECK(...) \
test::failed( \
TEST_CAPTURE(__VA_ARGS__), #__VA_ARGS__, __PRETTY_FUNCTION__, __FILE__, __LINE__, [] {})
// Fatal check: same as CHECK but passes test::fail as the callback, which
// throws test::failure_error.
// NOLINTNEXTLINE
#define EXPECT(...) \
test::failed(TEST_CAPTURE(__VA_ARGS__), \
#__VA_ARGS__, \
__PRETTY_FUNCTION__, \
__FILE__, \
__LINE__, \
&test::fail)
// Expects the expression to evaluate to 0.
// NOLINTNEXTLINE
#define STATUS(...) EXPECT((__VA_ARGS__) == 0)
// Token pasting with one level of indirection so arguments such as __LINE__
// are expanded before being concatenated.
// NOLINTNEXTLINE
#define TEST_CAT(x, ...) TEST_PRIMITIVE_CAT(x, __VA_ARGS__)
// NOLINTNEXTLINE
#define TEST_PRIMITIVE_CAT(x, ...) x##__VA_ARGS__
// Defines a static test::auto_register_test_case named after the current line
// that registers the given function under its own name.
// NOLINTNEXTLINE
#define TEST_CASE_REGISTER(...) \
static test::auto_register_test_case TEST_CAT(register_test_case_, __LINE__) = \
test::auto_register_test_case(#__VA_ARGS__, &__VA_ARGS__);
// Declares the test function, registers it and opens its definition; the
// caller supplies the body.
// NOLINTNEXTLINE
#define TEST_CASE(...) \
void __VA_ARGS__(); \
TEST_CASE_REGISTER(__VA_ARGS__) \
void __VA_ARGS__()
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wglobal-constructors"
#endif
#endif
# Locate HIP; the hip::host target linked below is expected to come from it.
find_package(hip)
# Build the ck_rtc library from every .cpp file under src/.
file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)
add_library(ck_rtc ${RTC_SOURCES})
# Expose the include/ directory (rtc/*.hpp) to consumers of ck_rtc.
target_include_directories(ck_rtc PUBLIC include)
target_link_libraries(ck_rtc PUBLIC hip::host)
# Link the separate filesystem library; presumably needed by toolchains where
# std::filesystem is not part of libstdc++ itself -- TODO confirm.
target_link_libraries(ck_rtc PUBLIC -lstdc++fs)
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
#include <rtc/kernel.hpp>
#include <rtc/filesystem.hpp>
#include <string>
namespace rtc {
// One source file handed to compile_kernel().
struct src_file
{
// Path of the file relative to the temporary build directory.
fs::path path;
// File contents. This is a non-owning view, so the referenced text must
// outlive the compile_kernel() call.
std::string_view content;
};
// Options for compile_kernel().
struct compile_options
{
// Extra compiler flags; compile_kernel() appends its own flags to these.
std::string flags = "";
// Name of the kernel function to look up in the compiled code object.
std::string kernel_name = "main";
};
// Compiles the given sources and returns the kernel named by
// options.kernel_name.
kernel compile_kernel(const std::vector<src_file>& src,
compile_options options = compile_options{});
} // namespace rtc
#endif
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#ifndef GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
#define GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
#include <string>
#include <string_view>
// clang-format off
#if defined(CPPCHECK)
#define RTC_HAS_FILESYSTEM 1
#define RTC_HAS_FILESYSTEM_TS 1
#elif defined(_WIN32)
#if _MSC_VER >= 1920
#define RTC_HAS_FILESYSTEM 1
#define RTC_HAS_FILESYSTEM_TS 0
#elif _MSC_VER >= 1900
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 1
#else
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 0
#endif
#elif defined(__has_include)
#if __has_include(<filesystem>) && __cplusplus >= 201703L
#define RTC_HAS_FILESYSTEM 1
#else
#define RTC_HAS_FILESYSTEM 0
#endif
#if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L
#define RTC_HAS_FILESYSTEM_TS 1
#else
#define RTC_HAS_FILESYSTEM_TS 0
#endif
#else
#define RTC_HAS_FILESYSTEM 0
#define RTC_HAS_FILESYSTEM_TS 0
#endif
// clang-format on
#if RTC_HAS_FILESYSTEM
#include <filesystem>
#elif RTC_HAS_FILESYSTEM_TS
#include <experimental/filesystem>
#else
#error "No filesystem include available"
#endif
namespace rtc {
// rtc::fs aliases whichever filesystem implementation was selected above:
// the standard <filesystem> if available, otherwise the experimental TS.
#if RTC_HAS_FILESYSTEM
namespace fs = ::std::filesystem;
#elif RTC_HAS_FILESYSTEM_TS
namespace fs = ::std::experimental::filesystem;
#endif
} // namespace rtc
#endif // GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP
#include <hip/hip_runtime_api.h>
#include <memory>
#include <string>
namespace rtc {
// Shared-ownership array of `n` elements of type T. Copies of a buffer share
// the same storage through the underlying std::shared_ptr.
template <class T>
struct buffer
{
    // Empty buffer: null storage, zero elements.
    buffer() : ptr(), n(0) {}
    // Adopts existing storage holding `sz` elements.
    buffer(std::shared_ptr<T> p, std::size_t sz) : ptr(p), n(sz) {}
    // Adopts untyped storage, reinterpreting it as `sz` elements of T. The
    // original deleter of `p` is preserved.
    buffer(std::shared_ptr<void> p, std::size_t sz)
        : ptr(std::reinterpret_pointer_cast<T>(p)), n(sz)
    {
    }
    // Allocates `sz` default-initialized elements. The storage comes from
    // new[], so it must be released with delete[]; the default shared_ptr
    // deleter would call plain delete, which is undefined behaviour.
    explicit buffer(std::size_t sz) : ptr(new T[sz], std::default_delete<T[]>()), n(sz) {}

    T* begin() { return data(); }
    T* end() { return data() + size(); }
    const T* begin() const { return data(); }
    const T* end() const { return data() + size(); }

    // Unchecked element access; the buffer must not be empty / `i` in range.
    T& front() { return data()[0]; }
    T& back() { return data()[size() - 1]; }
    T& operator[](std::size_t i) { return data()[i]; }
    // Checked element access; throws std::runtime_error when i >= size().
    T& at(std::size_t i)
    {
        if(i >= size())
            throw std::runtime_error("Out of bounds");
        return data()[i];
    }
    const T& front() const { return data()[0]; }
    const T& back() const { return data()[size() - 1]; }
    const T& operator[](std::size_t i) const { return data()[i]; }
    const T& at(std::size_t i) const
    {
        if(i >= size())
            throw std::runtime_error("Out of bounds");
        return data()[i];
    }

    const T* data() const { return ptr.get(); }
    T* data() { return ptr.get(); }
    // Number of elements.
    std::size_t size() const { return n; }
    // Size of the storage in bytes.
    std::size_t bytes() const { return size() * sizeof(T); }
    bool empty() const { return size() == 0; }

    private:
    std::shared_ptr<T> ptr;
    std::size_t n;
};
// Architecture name (gcnArchName) of the current HIP device.
std::string get_device_name();
// Human-readable text for a HIP error code.
std::string hip_error(int error);
// Allocates `sz` bytes; `host` selects a HIP host allocation instead of
// device memory.
std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host = false);
// Allocates `sz` bytes and copies them from the host pointer `x`.
std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host = false);
// Copies `sz` bytes from the device pointer `x` into new host memory.
std::shared_ptr<void> read_from_gpu(const void* x, std::size_t sz);
// Copies the contents of `input` with write_to_gpu() and returns a buffer of
// the same element count over the new allocation.
template <class T>
buffer<T> to_gpu(const buffer<T>& input)
{
    std::shared_ptr<void> copied = write_to_gpu(input.data(), input.bytes());
    return buffer<T>(copied, input.size());
}
// Copies the contents of `input` back with read_from_gpu() and returns a
// buffer of the same element count over the new allocation.
template <class T>
buffer<T> from_gpu(const buffer<T>& input)
{
    std::shared_ptr<void> copied = read_from_gpu(input.data(), input.bytes());
    return buffer<T>(copied, input.size());
}
} // namespace rtc
#endif
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL
#include <hip/hip_runtime_api.h>
#include <memory>
#include <string>
#include <vector>
namespace rtc {
// Type-erased reference to one kernel argument: the size and alignment of
// the value plus a pointer to it. The pointed-to object is not copied, so it
// must stay alive until the arguments have been packed.
struct kernel_argument
{
    // Captures any value that is not itself a kernel_argument.
    //
    // The constraint is checked on U (T with the reference removed). Checking
    // T directly is always false for lvalues, because T is then a reference
    // type, and this constructor would be preferred over the copy constructor
    // when copying a non-const kernel_argument, wrapping the wrapper.
    template <class T,
              class U = std::remove_reference_t<T>,
              class   = std::enable_if_t<not std::is_base_of<kernel_argument, U>{}>>
    kernel_argument(T&& x) : size(sizeof(U)), align(alignof(U)), data(&x) // NOLINT
    {
    }
    std::size_t size;  // sizeof the referenced value
    std::size_t align; // alignof the referenced value
    void* data;        // address of the referenced value
};
std::vector<char> pack_args(const std::vector<kernel_argument>& args);
struct kernel_impl;
// Handle to a kernel function loaded from a code object. Copies share the
// same underlying implementation object.
struct kernel
{
// Creates an empty handle with no kernel loaded.
kernel() = default;
// Loads the code object at `image` and looks up the function `name`.
kernel(const char* image, const std::string& name);
// Same as above for a byte container; only 1-byte element types are accepted.
template <class T>
kernel(const std::vector<T>& image, const std::string& name)
: kernel(reinterpret_cast<const char*>(image.data()), name)
{
static_assert(sizeof(T) == 1, "Only byte types");
}
// Launches with `global` and `local` sizes, packing `args` into a kernarg
// buffer that respects each argument's size and alignment.
void launch(hipStream_t stream,
std::size_t global,
std::size_t local,
const std::vector<kernel_argument>& args) const;
// Launches using the array of pointers in `args` as the raw kernarg buffer.
void launch(hipStream_t stream,
std::size_t global,
std::size_t local,
std::vector<void*> args) const;
// Returns a callable that performs the launch when invoked with the kernel
// arguments; the arguments are wrapped as kernel_argument values.
template <class... Ts>
auto launch(hipStream_t stream, std::size_t global, std::size_t local, Ts... zs) const
{
return [=](auto&&... xs) {
launch(stream, global, local, std::vector<kernel_argument>{xs...}, zs...);
};
}
private:
std::shared_ptr<kernel_impl> impl;
};
} // namespace rtc
#endif
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER
#include <type_traits>
#include <memory>
namespace rtc {
template <class F, F f>
struct manage_deleter
{
template <class T>
void operator()(T* x) const
{
if(x != nullptr)
{
(void)f(x);
}
}
};
// Deleter that does nothing, for pointers that are merely observed.
struct null_deleter
{
    template <class T>
    void operator()(T*) const {}
};
template <class T, class F, F f>
using manage_ptr = std::unique_ptr<T, manage_deleter<F, f>>;
// Exposes T::element_type as `type`, for smart-pointer-like classes.
template <class T>
struct element_type
{
    using type = typename T::element_type;
};

// Pointee type of T: the pointed-to type for raw pointers, otherwise
// T::element_type.
template <class T>
using remove_ptr = typename std::conditional_t<std::is_pointer<T>::value,
                                               std::remove_pointer<T>,
                                               element_type<T>>::type;

// std::shared_ptr to the pointee type of T.
template <class T>
using shared = std::shared_ptr<remove_ptr<T>>;

// Transfers ownership of `p` (a raw or smart pointer) into a shared_ptr.
template <class T>
shared<T> share(T p)
{
    shared<T> owner{std::move(p)};
    return owner;
}
#define RTC_MANAGE_PTR(T, F) rtc::manage_ptr<std::remove_pointer_t<T>, decltype(&F), &F>
} // namespace rtc
#endif
#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
#include <string>
#include <rtc/filesystem.hpp>
namespace rtc {
// Temporary directory that is created on construction and removed (with all
// its contents) on destruction. Not copyable.
struct tmp_dir
{
// Location of the directory.
fs::path path;
// Creates a uniquely named directory; `prefix` becomes part of the name.
tmp_dir(const std::string& prefix = "");
// Runs the shell command `cmd` with this directory as working directory.
void execute(const std::string& cmd) const;
tmp_dir(tmp_dir const&) = delete;
tmp_dir& operator=(tmp_dir const&) = delete;
~tmp_dir();
};
} // namespace rtc
#endif
#include <rtc/hip.hpp>
#include <rtc/compile_kernel.hpp>
#include <rtc/tmp_dir.hpp>
#include <stdexcept>
#include <iostream>
#include <fstream>
#include <cassert>
namespace rtc {
// Reads bytes from `filename` into a container T constructible as T(n, 0)
// (e.g. std::string or std::vector<char>).
//
// offset: position of the first byte to read.
// nbytes: number of bytes to read; 0 means "from offset to end of file".
//
// Throws std::runtime_error if the file cannot be opened, the offset lies
// beyond the end of the file, there is nothing to read, or the read fails.
template <class T>
T generic_read_file(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
{
    // Open positioned at the end so tellg() yields the file size.
    std::ifstream is(filename, std::ios::binary | std::ios::ate);
    // Without this check a missing file makes tellg() return -1, which would
    // turn into a huge unsigned size below.
    if(not is.is_open())
        throw std::runtime_error("Failed to open file: " + filename);
    if(nbytes == 0)
    {
        // if there is a non-zero offset and nbytes is not set,
        // calculate size of remaining bytes to read
        const std::streamoff file_size = is.tellg();
        if(file_size < 0)
            throw std::runtime_error("Failed to determine size of: " + filename);
        nbytes = static_cast<size_t>(file_size);
        if(offset > nbytes)
            throw std::runtime_error("offset is larger than file size");
        nbytes -= offset;
    }
    if(nbytes < 1)
        throw std::runtime_error("Invalid size for: " + filename);
    is.seekg(offset, std::ios::beg);
    T buffer(nbytes, 0);
    if(not is.read(&buffer[0], nbytes))
        throw std::runtime_error("Error reading file: " + filename);
    return buffer;
}
// Reads a file (or the requested slice of it) into a byte vector; see
// generic_read_file for the meaning of `offset` and `nbytes`.
std::vector<char> read_buffer(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
{
    using bytes = std::vector<char>;
    return generic_read_file<bytes>(filename, offset, nbytes);
}
// Reads a whole file into a std::string via generic_read_file.
std::string read_string(const std::string& filename)
{
    std::string text = generic_read_file<std::string>(filename);
    return text;
}
// Writes `size` bytes starting at `buffer` to `filename`, replacing any
// existing content.
//
// The file is opened in binary mode because callers pass raw bytes.
// Throws std::runtime_error if the file cannot be opened or written, instead
// of silently producing a missing or truncated file.
void write_buffer(const std::string& filename, const char* buffer, std::size_t size)
{
    std::ofstream os(filename, std::ios::binary);
    if(not os.is_open())
        throw std::runtime_error("Failed to open file for writing: " + filename);
    os.write(buffer, static_cast<std::streamsize>(size));
    // Flush so that a deferred write error is reported here.
    os.flush();
    if(not os)
        throw std::runtime_error("Error writing file: " + filename);
}
// Writes the whole byte vector to `filename`.
void write_buffer(const std::string& filename, const std::vector<char>& buffer)
{
    const char* first       = buffer.data();
    const std::size_t count = buffer.size();
    write_buffer(filename, first, count);
}
// Writes the characters viewed by `buffer` to `filename`.
void write_string(const std::string& filename, const std::string_view& buffer)
{
    const char* first       = buffer.data();
    const std::size_t count = buffer.size();
    write_buffer(filename, first, count);
}
// Command prefix used to compile HIP sources to a device-only object.
std::string compiler()
{
    const char* command = "/opt/rocm/llvm/bin/clang++ -x hip --cuda-device-only";
    return command;
}
// TODO: undo after extracting the codeobj
// std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip"; }
// Compiles `srcs` in a fresh temporary directory and returns the kernel named
// options.kernel_name loaded from the resulting object.
//
// Every source is written below the temporary directory at its relative path.
// Each file with a ".cpp" extension is added to the compile command, and the
// first such file determines the output name (<stem>.o).
//
// Throws std::runtime_error if the expected output file is missing after the
// compiler has run, or if reading it back fails.
kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options)
{
assert(not srcs.empty());
tmp_dir td{"compile"};
// Flags appended to whatever the caller supplied: include the build
// directory, optimise, C++17, and target the current device architecture.
options.flags += " -I. -O3";
options.flags += " -std=c++17";
options.flags += " --offload-arch=" + get_device_name();
std::string out;
for(const auto& src : srcs)
{
fs::path full_path = td.path / src.path;
fs::path parent_path = full_path.parent_path();
// The relative path may contain sub-directories; create them first.
fs::create_directories(parent_path);
write_string(full_path.string(), src.content);
if(src.path.extension().string() == ".cpp")
{
// NOTE(review): only the file name (not the relative directory) is passed to
// the compiler, which runs in the temporary directory root -- confirm this
// is intended for sources placed in sub-directories.
options.flags += " -c " + src.path.filename().string();
if(out.empty())
out = src.path.stem().string() + ".o";
}
}
options.flags += " -o " + out;
// Run the compiler with the temporary directory as working directory.
td.execute(compiler() + options.flags);
auto out_path = td.path / out;
if(not fs::exists(out_path))
throw std::runtime_error("Output file missing: " + out);
auto obj = read_buffer(out_path.string());
// Also dump a copy of the object to "obj.o" (a relative path, so it lands in
// the process's working directory); see the TODO above compile_kernel.
std::ofstream ofh("obj.o", std::ios::binary);
for(auto i : obj)
ofh << i;
ofh.close();
// int s = std::system(("/usr/bin/cp " + out_path.string() + " codeobj.bin").c_str());
// assert(s == 0);
return kernel{obj.data(), options.kernel_name};
}
} // namespace rtc
#include <rtc/hip.hpp>
#include <rtc/manage_ptr.hpp>
#include <stdexcept>
#include <cassert>
#include <iostream>
namespace rtc {
using hip_ptr = RTC_MANAGE_PTR(void, hipFree);
std::string hip_error(int error) { return hipGetErrorString(static_cast<hipError_t>(error)); }
// Returns the id of the current HIP device; throws std::runtime_error if the
// query fails.
int get_device_id()
{
    int device;
    if(hipGetDevice(&device) != hipSuccess)
        throw std::runtime_error("No device");
    return device;
}
// Returns the gcnArchName of the current device; throws std::runtime_error
// if the device properties cannot be queried.
std::string get_device_name()
{
    hipDeviceProp_t props{};
    if(hipGetDeviceProperties(&props, get_device_id()) != hipSuccess)
        throw std::runtime_error("Failed to get device properties");
    return props.gcnArchName;
}
// True when HIP reports `ptr` as device memory. A failed attribute query is
// treated as "not a device pointer".
bool is_device_ptr(const void* ptr)
{
    hipPointerAttribute_t attr;
    const bool queried = hipPointerGetAttributes(&attr, ptr) == hipSuccess;
    return queried and attr.type == hipMemoryTypeDevice;
}
// Calls hipDeviceSynchronize(); throws std::runtime_error with the HIP error
// text if it does not succeed.
void gpu_sync()
{
    const auto status = hipDeviceSynchronize();
    if(status == hipSuccess)
        return;
    throw std::runtime_error("hip device synchronization failed: " + hip_error(status));
}
// Returns the free memory reported by hipMemGetInfo(). If the query fails, a
// message is written to std::cerr and 8 GiB is returned as a fallback.
std::size_t get_available_gpu_memory()
{
    size_t free;
    size_t total;
    const auto status = hipMemGetInfo(&free, &total);
    if(status == hipSuccess)
        return free;
    std::cerr << "Failed getting available memory: " + hip_error(status) << std::endl;
    return (8ull * 1024ull * 1024ull * 1024ull);
}
std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host)
{
if(sz > get_available_gpu_memory())
throw std::runtime_error("Memory not available to allocate buffer: " + std::to_string(sz));
void* alloc_ptr = nullptr;
auto status = host ? hipHostMalloc(&alloc_ptr, sz) : hipMalloc(&alloc_ptr, sz);
if(status != hipSuccess)
{
if(host)
throw std::runtime_error("Gpu allocation failed: " + hip_error(status));
else
return allocate_gpu(sz, true);
}
assert(alloc_ptr != nullptr);
std::shared_ptr<void> result = share(hip_ptr{alloc_ptr});
return result;
}
// Synchronizes the device, allocates `sz` bytes via allocate_gpu() and copies
// `sz` bytes from the host pointer `x` into the allocation. Throws
// std::runtime_error if the copy fails.
std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host)
{
    gpu_sync();
    std::shared_ptr<void> dst = allocate_gpu(sz, host);
    assert(is_device_ptr(dst.get()));
    assert(not is_device_ptr(x));
    const auto status = hipMemcpy(dst.get(), x, sz, hipMemcpyHostToDevice);
    if(status == hipSuccess)
        return dst;
    throw std::runtime_error("Copy to gpu failed: " + hip_error(status));
}
// Synchronizes the device and copies `sz` bytes from the device pointer `x`
// into newly allocated host memory.
//
// Throws std::runtime_error if `x` is not a device pointer or the copy fails.
std::shared_ptr<void> read_from_gpu(const void* x, std::size_t sz)
{
    gpu_sync();
    // The storage comes from new[], so it needs an array deleter; the default
    // shared_ptr deleter would call plain delete (undefined behaviour). The
    // deleter is kept when the pointer is later cast to other types.
    std::shared_ptr<char> result(new char[sz], std::default_delete<char[]>());
    assert(not is_device_ptr(result.get()));
    if(not is_device_ptr(x))
    {
        throw std::runtime_error(
            "read_from_gpu() requires Src buffer to be on the GPU, Copy from gpu failed\n");
    }
    auto status = hipMemcpy(result.get(), x, sz, hipMemcpyDeviceToHost);
    if(status != hipSuccess)
        throw std::runtime_error("Copy from gpu failed: " + hip_error(status)); // NOLINT
    return std::static_pointer_cast<void>(result);
}
} // namespace rtc
#include <rtc/kernel.hpp>
#include <rtc/manage_ptr.hpp>
#include <rtc/hip.hpp>
#include <cassert>
// extern declare the function since hip/hip_ext.h header is broken
extern hipError_t hipExtModuleLaunchKernel(hipFunction_t, // NOLINT
uint32_t,
uint32_t,
uint32_t,
uint32_t,
uint32_t,
uint32_t,
size_t,
hipStream_t,
void**,
void**,
hipEvent_t = nullptr,
hipEvent_t = nullptr,
uint32_t = 0);
namespace rtc {
// Serializes the arguments into one contiguous byte buffer. Before each
// argument, zero bytes are added so that it starts at a multiple of its
// alignment; then its `size` bytes are copied from `data`.
std::vector<char> pack_args(const std::vector<kernel_argument>& args)
{
    std::vector<char> packed;
    for(const auto& arg : args)
    {
        // Insert padding
        const std::size_t misalignment = packed.size() % arg.align;
        const std::size_t padding      = (arg.align - misalignment) % arg.align;
        packed.resize(packed.size() + padding, 0);

        const auto* first = static_cast<const char*>(arg.data);
        packed.insert(packed.end(), first, first + arg.size);
    }
    return packed;
}
using hip_module_ptr = RTC_MANAGE_PTR(hipModule_t, hipModuleUnload);
// State shared by all copies of a kernel handle.
struct kernel_impl
{
// Owning handle to the loaded module; unloaded with hipModuleUnload.
hip_module_ptr module = nullptr;
// Function looked up in `module` by the kernel constructor.
hipFunction_t fun = nullptr;
};
// Loads the code object at `image` and returns an owning module handle.
// Throws std::runtime_error if HIP fails to load it.
hip_module_ptr load_module(const char* image)
{
    hipModule_t raw_m = nullptr;
    auto status       = hipModuleLoadData(&raw_m, image);
    // Check before taking ownership: on failure `raw_m` is not a valid module
    // and must not reach hipModuleUnload through the owning pointer.
    if(status != hipSuccess)
        throw std::runtime_error("Failed to load module: " + hip_error(status));
    return hip_module_ptr{raw_m};
}
// Loads the code object at `image` and resolves the function `name` in it.
// Throws std::runtime_error if the module or the function cannot be obtained.
kernel::kernel(const char* image, const std::string& name) : impl(std::make_shared<kernel_impl>())
{
    impl->module = load_module(image);
    const auto status = hipModuleGetFunction(&impl->fun, impl->module.get(), name.c_str());
    if(status == hipSuccess)
        return;
    throw std::runtime_error("Failed to get function: " + name + ": " + hip_error(status));
}
// Launches `fun` on `stream` through hipExtModuleLaunchKernel, passing
// `global` and `local` as the first component of the two size triples (the
// other components are 1). The arguments are supplied as a packed buffer of
// `size` bytes at `kernargs`. Throws std::runtime_error if the launch fails.
void launch_kernel(hipFunction_t fun,
                   hipStream_t stream,
                   std::size_t global,
                   std::size_t local,
                   void* kernargs,
                   std::size_t size)
{
    assert(global > 0);
    assert(local > 0);
    // Launch configuration list describing the packed argument buffer.
    void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER,
                      kernargs,
                      HIP_LAUNCH_PARAM_BUFFER_SIZE,
                      &size,
                      HIP_LAUNCH_PARAM_END};
    void** extra = reinterpret_cast<void**>(&config);

    const auto status = hipExtModuleLaunchKernel(
        fun, global, 1, 1, local, 1, 1, 0, stream, nullptr, extra, nullptr, nullptr);
    if(status == hipSuccess)
        return;
    throw std::runtime_error("Failed to launch kernel: " + hip_error(status));
}
// Launches the kernel using the pointer array `args` itself as the packed
// argument buffer (args.size() * sizeof(void*) bytes).
void kernel::launch(hipStream_t stream,
                    std::size_t global,
                    std::size_t local,
                    std::vector<void*> args) const
{
    assert(impl != nullptr);
    const std::size_t nbytes = args.size() * sizeof(void*);
    launch_kernel(impl->fun, stream, global, local, args.data(), nbytes);
}
// Packs `args` with pack_args() and launches the kernel with the resulting
// byte buffer.
void kernel::launch(hipStream_t stream,
                    std::size_t global,
                    std::size_t local,
                    const std::vector<kernel_argument>& args) const
{
    assert(impl != nullptr);
    std::vector<char> packed = pack_args(args);
    const std::size_t nbytes = packed.size();
    launch_kernel(impl->fun, stream, global, local, packed.data(), nbytes);
}
} // namespace rtc
#include <rtc/tmp_dir.hpp>
#include <algorithm>
#include <random>
#include <thread>
#include <unistd.h>
namespace rtc {
// Returns `length` characters drawn uniformly from [0-9a-zA-Z], using a
// Mersenne Twister seeded from std::random_device on every call.
std::string random_string(std::string::size_type length)
{
    static const std::string alphabet = "0123456789"
                                        "abcdefghijklmnopqrstuvwxyz"
                                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
    std::mt19937 engine{std::random_device{}()};
    std::uniform_int_distribution<std::string::size_type> index(0, alphabet.length() - 1);
    std::string result;
    result.reserve(length);
    for(std::string::size_type i = 0; i < length; ++i)
        result.push_back(alphabet[index(engine)]);
    return result;
}
// Builds "<prefix>-<pid>-<thread id>-<steady clock ticks>-<16 random chars>",
// with the numeric parts streamed in hexadecimal.
std::string unique_string(const std::string& prefix)
{
    std::stringstream name;
    const auto process = getpid();
    const auto thread  = std::this_thread::get_id();
    const auto ticks   = std::chrono::steady_clock::now().time_since_epoch().count();
    name << std::hex << prefix;
    name << "-" << process;
    name << "-" << thread;
    name << "-" << ticks;
    name << "-" << random_string(16);
    return name.str();
}
// Creates a uniquely named directory under the system temporary directory.
// The name starts with "ck-rtc", followed by "-<prefix>" when a prefix is
// given.
tmp_dir::tmp_dir(const std::string& prefix)
    : path(fs::temp_directory_path() /
           unique_string(prefix.empty() ? "ck-rtc" : "ck-rtc-" + prefix))
{
    fs::create_directories(path);
}
// Runs the shell command `cmd` with this directory as working directory.
// The exit status of the command is not reported.
void tmp_dir::execute(const std::string& cmd) const
{
    // Chain with && rather than ';' so that `cmd` is not run in whatever the
    // current directory happens to be when the cd fails.
    std::string s = "cd " + path.string() + " && " + cmd;
    std::system(s.c_str());
}
// Removes the directory and everything in it. Removal is best effort: the
// non-throwing overload is used because an exception escaping a destructor
// would terminate the program.
tmp_dir::~tmp_dir()
{
    std::error_code ec;
    fs::remove_all(this->path, ec);
}
} // namespace rtc
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _contributing-to:
********************************************************************
Contributor's guide
********************************************************************
This chapter explains the rules for contributing to the Composable Kernel project, and how to contribute.
Getting started
===============
#. **Documentation:** Before contributing to the library, familiarize yourself with the
`Composable Kernel User Guide <https://rocm.docs.amd.com/projects/composable_kernel/en/latest/>`_.
It provides insight into the core concepts, environment configuration, and steps to obtain or
build the library. You can also find some of this information in the
`README file <https://github.com/ROCm/composable_kernel/blob/develop/README.md>`_
on the project's GitHub page.
#. **Additional reading:** The blog post `AMD Composable Kernel library: efficient fused kernels for AI apps with just a few lines of code <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_
from the AMD Community portal offers a deeper understanding of the library's objectives and showcases its performance capabilities.
#. **General information:** For broader information about AMD products, consider exploring the
`AMD Developer Central portal <https://www.amd.com/en/developer.html>`_.
How to contribute
===================
Contributor's Guide
===================
Pull-request guidelines
=======================
You can make an impact by reporting issues or proposing code enhancements through pull requests.
Reporting issues
----------------
Use `GitHub issues <https://github.com/ROCm/composable_kernel/issues>`_
to track public bugs and enhancement requests.
If you encounter an issue with the library, please check if the problem has already been
reported by searching existing issues on GitHub. If your issue seems unique, please submit a new
issue. All reported issues must include:
* A comprehensive description of the problem, including:
* What did you observe?
* Why do you think it is a bug (if it seems like one)?
* What did you expect to happen? What would indicate the resolution of the problem?
* Are there any known workarounds?
* Your configuration details, including:
* Which GPU are you using?
* Which OS version are you on?
* Which ROCm version are you using?
* Are you using a Docker image? If so, which one?
* Steps to reproduce the issue, including:
* What actions trigger the issue? What are the reproduction steps?
* If you build the library from scratch, what CMake command did you use?
* How frequently does this issue happen? Does it reproduce every time? Or is it a sporadic issue?
Before submitting any issue, ensure you have addressed all relevant questions from the checklist.
Creating Pull Requests
----------------------
You can submit `Pull Requests (PR) on GitHub
<https://github.com/ROCm/composable_kernel/pulls>`_.
All contributors are required to develop their changes on a separate branch and then create a
pull request to merge their changes into the `develop` branch, which is the default
development branch in the Composable Kernel project. All external contributors must use their own
forks of the project to develop their changes.
When submitting a Pull Request you should:
* Describe the change providing information about the motivation for the change and a general
description of all code modifications.
* Verify and test the change:
* Run any relevant existing tests.
* Write new tests if added functionality is not covered by current tests.
* Ensure your changes align with the coding style defined in the ``.clang-format`` file located in
the project's root directory. We leverage `pre-commit` to run `clang-format` automatically. We
highly recommend contributors utilize this method to maintain consistent code formatting.
Instructions on setting up `pre-commit` can be found in the project's
`README file <https://github.com/ROCm/composable_kernel/blob/develop/README.md>`_.
* Link your PR to any related issues:
* If there is an issue that is resolved by your change, please provide a link to the issue in
the description of your pull request.
* For larger contributions, structure your change into a sequence of smaller, focused commits, each
addressing a particular aspect or fix.
Following the above guidelines ensures a seamless review process and faster assistance from our
end.
[TODO]
Thank you for your commitment to enhancing the Composable Kernel project!
.. meta::
:description: Composable Kernel documentation and API reference library
:keywords: composable kernel, CK, ROCm, API, documentation
.. _what-is-ck:
********************************************************************
What is the Composable Kernel library
********************************************************************
Methodology
===========
The Composable Kernel (CK) library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages like HIP C++.
CK utilizes two concepts to achieve performance portability and code maintainability:
* A tile-based programming model
* Algorithm complexity reduction for complex ML operators using an innovative technique called
"Tensor Coordinate Transformation".
.. image:: ../data/ck_component.png
:alt: CK Components
Code Structure
==============
The CK library is structured into 4 layers:
* "Templated Tile Operators" layer
* "Templated Kernel and Invoker" layer
* "Instantiated Kernel and Invoker" layer
* "Client API" layer
It also includes a simple wrapper component used to perform tensor transform operations more easily and with fewer lines of code.
.. image:: ../data/ck_layer.png
:alt: CK Layers
\ No newline at end of file
......@@ -4,23 +4,34 @@
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
import subprocess
import re
from rocm_docs import ROCmDocs
html_theme_options = {"flavor": "list"}
name = "Composable Kernel"
get_version = r'sed -n -e "s/^rocm_setup_version(.* \([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt'
version = subprocess.getoutput(get_version)
if len(version) > 0:
name = f"{name} {version}"
with open('../CMakeLists.txt', encoding='utf-8') as f:
match = re.search(r'.*set\(version ([0-9.]+)[^0-9.]+', f.read())
if not match:
raise ValueError("VERSION not found!")
version_number = match[1]
left_nav_title = f"Composable Kernel {version_number} Documentation"
# for PDF output on Read the Docs
project = "Composable Kernel Documentation"
author = "Advanced Micro Devices, Inc."
copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved."
version = version_number
release = version_number
external_toc_path = "./sphinx/_toc.yml"
docs_core = ROCmDocs(f"{name} Documentation")
docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/docBin/xml")
docs_core = ROCmDocs(left_nav_title)
docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
docs_core.setup()
external_projects_current_project = "composable_kernel"
mathjax3_config = {
'tex': {
'macros': {
......@@ -34,3 +45,5 @@ for sphinx_var in ROCmDocs.SPHINX_VARS:
extensions += ['sphinxcontrib.bibtex']
bibtex_bibfiles = ['refs.bib']
cpp_id_attributes = ["__global__", "__device__", "__host__"]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment