Merge remote-tracking branch 'origin/develop' into migraphx-update

ef326c73 · Alan Turner · b7775add · e4dfe4d8 · ef326c73 · ef326c73
Commit ef326c73 authored Nov 19, 2024 by Alan Turner
20 changed files
--- a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
+#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
+#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
+#include "ck/host/headers.hpp"
+#include "ck/host/stringutils.hpp"
+#include "ck/host/utils.hpp"
+#include "common.hpp"
+#include "ck/tensor_operation/gpu/device/helper.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include <test.hpp>
+#include <rtc/compile_kernel.hpp>
+#include <rtc/hip.hpp>
+#include <fstream>
+
+// need this for validation
+/**struct Epilogue
+{
+    Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
+
+    template <typename E, typename D>
+    __host__ __device__ constexpr void operator()(E& e, const D& d) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
+                                                                          const ck::half_t& d) const
+    {
+        e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
+    }
+
+    float alpha_;
+    float beta_;
+};**/
+// Source skeleton of the runtime-compiled translation unit. The ${include} and ${template}
+// placeholders are filled in per solution through ck::host::InterpolateString.
+const std::string conv_compile_check{R"__ck__(
+#include <${include}>
+
+${template};
+
+)__ck__"};
+
+TEST_CASE(test_problem_kernel)
+{
+    /* Set up the problem specification: a 2-D grouped forward convolution with a 3x3 filter
+     * (Y, X) over a 28x28 input (Hi, Wi).
+     * NOTE(review): with the unit stride and zero padding chosen below, a 3x3 filter over a
+     * 28x28 input would conventionally give a 26x26 output, yet Ho/Wo are 28 -- confirm this
+     * is intended. */
+    ck::host::conv::Problem_Conv_Fwd prob;
+    prob.NumDim = 2;
+    prob.G      = 32;
+    prob.N      = 256;
+    prob.C      = 32;
+    prob.K      = 64;
+    prob.Y      = 3;
+    prob.X      = 3;
+    prob.Hi     = 28;
+    prob.Wi     = 28;
+    prob.Ho     = 28;
+    prob.Wo     = 28;
+    check_all<ck::half_t> check; // stores the first output seen; later outputs are compared to it
+
+    /* user provided fusion operations: source text of the epilogue functor that is handed to
+     * GetSolutions below; the prologue is left empty */
+    std::string epilogue = R"(
+struct Epilogue
+{
+    __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
+
+    template <typename E, typename D>
+    __host__ __device__ constexpr void operator()(E& e, const D& d) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
+                                                                          const ck::half_t& d) const
+    {
+        e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
+    }
+
+    float alpha_;
+    float beta_;
+};
+)";
+    std::string prologue = "";
+
+    /* length+stride arrays: lengths are ordered (G, N, C or K, H, W) for input/output and
+     * (G, K, C, Y, X) for weights; the strides use the same ordering, with unit stride on the
+     * third entry (C for input/weights, K for output) */
+    ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
+                                         static_cast<int>(prob.N),
+                                         static_cast<int>(prob.C),
+                                         static_cast<int>(prob.Hi),
+                                         static_cast<int>(prob.Wi)};
+    ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
+                                          static_cast<int>(prob.N),
+                                          static_cast<int>(prob.K),
+                                          static_cast<int>(prob.Ho),
+                                          static_cast<int>(prob.Wo)};
+    ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
+                                          static_cast<int>(prob.K),
+                                          static_cast<int>(prob.C),
+                                          static_cast<int>(prob.Y),
+                                          static_cast<int>(prob.X)};
+
+    ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
+                                         static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
+                                         1,
+                                         static_cast<int>(prob.Wi * prob.G * prob.C),
+                                         static_cast<int>(prob.G * prob.C)};
+    ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
+                                          static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
+                                          1,
+                                          static_cast<int>(prob.Wo * prob.G * prob.K),
+                                          static_cast<int>(prob.G * prob.K)};
+    ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
+                                          static_cast<int>(prob.Y * prob.X * prob.C),
+                                          1,
+                                          static_cast<int>(prob.X * prob.C),
+                                          static_cast<int>(prob.C)};
+
+    ck::Array<ck::index_t, 2> conv_filter_strides   = {1, 1};
+    ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
+    ck::Array<ck::index_t, 2> input_left_pads       = {0, 0};
+    ck::Array<ck::index_t, 2> input_right_pads      = {0, 0};
+
+    // generate host buffers (seeds 0, 1, 2) and move the data onto the device
+    auto in_dev =
+        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
+    auto wei_dev =
+        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
+    auto out_dev =
+        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));
+
+    // CK Verification: Reference Kernel (currently commented out)
+    /**bool pass = true;
+    Tensor<ck::half_t> in_host(in_lengths, in_strides);
+    in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
+    Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
+    wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
+    Tensor<ck::half_t> out_host(out_lengths, out_strides);
+
+    std::vector<ck::index_t> conv_filter_strides_   = {1, 1};
+    std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
+    std::vector<ck::index_t> input_left_pads_       = {0, 0};
+    std::vector<ck::index_t> input_right_pads_      = {0, 0};
+
+    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
+        2,
+        ck::half_t,
+        ck::half_t,
+        ck::half_t,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        Epilogue>();
+
+    auto ref_invoker  = ref_conv.MakeInvoker();
+    auto ref_argument = ref_conv.MakeArgument(in_host,
+                                              wei_host,
+                                              out_host,
+                                              conv_filter_strides_,
+                                              conv_filter_dilations_,
+                                              input_left_pads_,
+                                              input_right_pads_,
+                                              ck::tensor_operation::element_wise::PassThrough{},
+                                              ck::tensor_operation::element_wise::PassThrough{},
+                                              Epilogue{1.0f, 1.0f});
+    out_host.SetZero();
+    ref_invoker.Run(ref_argument);**/
+
+    for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
+    {
+        // substitute instance values into the template
+        auto src = ck::host::InterpolateString(
+            conv_compile_check,
+            {{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});
+
+        auto srcs = get_headers_for_test();
+        srcs.push_back({"main.cpp", src});
+        rtc::compile_options options;
+        auto name           = solution.GetTemplateParameter<std::string>("name");
+        options.kernel_name = "run_" + name;
+        auto k              = rtc::compile_kernel(srcs, options);
+
+        // Grid size calculation: launch parameter for the output tensor times in_lengths[1] (N)
+        auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
+
+        auto tmp = get_launch_params(solution, out_lengths, out_strides);
+
+        auto grid_size = tmp * in_lengths[1];
+
+        // launch the kernel with arguments needed for the argument pointer
+        k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
+                                                              wei_dev.data(),
+                                                              out_dev.data(),
+                                                              in_lengths,
+                                                              in_strides,
+                                                              wei_lengths,
+                                                              wei_strides,
+                                                              out_lengths,
+                                                              out_strides,
+                                                              conv_filter_strides,
+                                                              conv_filter_dilations,
+                                                              input_left_pads,
+                                                              input_right_pads);
+
+        // auto res = rtc::from_gpu(out_dev);
+        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
+        // assert(pass);
+
+        // Simple check: this checks that the output from each instance matches the output from the
+        // first instance
+        CHECK(report(solution, check(rtc::from_gpu(out_dev))));
+    }
+}
+
+// Test executable entry point: forwards the command line to the test driver.
+int main(int argc, const char* argv[])
+{
+    test::run(argc, argv);
+}
--- a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
+#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
+#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
+#include "ck/host/headers.hpp"
+#include "ck/host/stringutils.hpp"
+#include "ck/host/utils.hpp"
+#include "ck/tensor_operation/gpu/device/helper.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "common.hpp"
+#include <test.hpp>
+#include <rtc/compile_kernel.hpp>
+#include <rtc/hip.hpp>
+#include <fstream>
+
+// need this for verification
+/**struct Epilogue
+{
+    Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
+
+    template <typename E, typename D>
+    __host__ __device__ constexpr void operator()(E& e, const D& d) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
+                                                                          const ck::half_t& d) const
+    {
+        e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
+    }
+
+    float alpha_;
+    float beta_;
+};**/
+// Source skeleton of the runtime-compiled translation unit. The ${include} and ${template}
+// placeholders are filled in per solution through ck::host::InterpolateString.
+const std::string conv_compile_check{R"__ck__(
+#include <${include}>
+
+${template};
+
+)__ck__"};
+
+TEST_CASE(test_problem_kernel)
+{
+    /* Set up the problem specification: a 2-D grouped forward convolution with a 3x3 filter
+     * (Y, X) over a 28x28 input (Hi, Wi).
+     * NOTE(review): with the filter stride of 2 and zero padding chosen below, a 3x3 filter
+     * over a 28x28 input would conventionally give a 13x13 output, yet Ho/Wo are 28 --
+     * confirm this is intended. */
+    ck::host::conv::Problem_Conv_Fwd prob;
+    prob.NumDim = 2;
+    prob.G      = 32;
+    prob.N      = 256;
+    prob.C      = 32;
+    prob.K      = 64;
+    prob.Y      = 3;
+    prob.X      = 3;
+    prob.Hi     = 28;
+    prob.Wi     = 28;
+    prob.Ho     = 28;
+    prob.Wo     = 28;
+    check_all<ck::half_t> check; // stores the first output seen; later outputs are compared to it
+
+    /* user provided fusion operations: source text of the epilogue functor that is handed to
+     * GetSolutions below; the prologue is left empty */
+    std::string epilogue = R"(
+struct Epilogue
+{
+    __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
+
+    template <typename E, typename D>
+    __host__ __device__ constexpr void operator()(E& e, const D& d) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
+                                                                          const ck::half_t& d) const
+    {
+        e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
+    }
+
+    float alpha_;
+    float beta_;
+};
+)";
+    std::string prologue = "";
+
+    /* length+stride arrays: lengths are ordered (G, N, C or K, H, W) for input/output and
+     * (G, K, C, Y, X) for weights; the strides use the same ordering, with unit stride on the
+     * third entry (C for input/weights, K for output) */
+    ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
+                                         static_cast<int>(prob.N),
+                                         static_cast<int>(prob.C),
+                                         static_cast<int>(prob.Hi),
+                                         static_cast<int>(prob.Wi)};
+    ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
+                                          static_cast<int>(prob.N),
+                                          static_cast<int>(prob.K),
+                                          static_cast<int>(prob.Ho),
+                                          static_cast<int>(prob.Wo)};
+    ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
+                                          static_cast<int>(prob.K),
+                                          static_cast<int>(prob.C),
+                                          static_cast<int>(prob.Y),
+                                          static_cast<int>(prob.X)};
+
+    ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
+                                         static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
+                                         1,
+                                         static_cast<int>(prob.Wi * prob.G * prob.C),
+                                         static_cast<int>(prob.G * prob.C)};
+    ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
+                                          static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
+                                          1,
+                                          static_cast<int>(prob.Wo * prob.G * prob.K),
+                                          static_cast<int>(prob.G * prob.K)};
+    ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
+                                          static_cast<int>(prob.Y * prob.X * prob.C),
+                                          1,
+                                          static_cast<int>(prob.X * prob.C),
+                                          static_cast<int>(prob.C)};
+
+    ck::Array<ck::index_t, 2> conv_filter_strides   = {2, 2};
+    ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
+    ck::Array<ck::index_t, 2> input_left_pads       = {0, 0};
+    ck::Array<ck::index_t, 2> input_right_pads      = {0, 0};
+
+    // generate host buffers (seeds 0, 1, 2) and move the data onto the device
+    auto in_dev =
+        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
+    auto wei_dev =
+        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
+    auto out_dev =
+        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));
+
+    // CK Verification: Reference Kernel (currently commented out)
+    /**bool pass = true;
+    Tensor<ck::half_t> in_host(in_lengths, in_strides);
+    in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
+    Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
+    wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
+    Tensor<ck::half_t> out_host(out_lengths, out_strides);
+
+    std::vector<ck::index_t> conv_filter_strides_   = {2, 2};
+    std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
+    std::vector<ck::index_t> input_left_pads_       = {0, 0};
+    std::vector<ck::index_t> input_right_pads_      = {0, 0};
+
+    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
+        2,
+        ck::half_t,
+        ck::half_t,
+        ck::half_t,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        Epilogue>();
+
+    auto ref_invoker  = ref_conv.MakeInvoker();
+    auto ref_argument = ref_conv.MakeArgument(in_host,
+                                              wei_host,
+                                              out_host,
+                                              conv_filter_strides_,
+                                              conv_filter_dilations_,
+                                              input_left_pads_,
+                                              input_right_pads_,
+                                              ck::tensor_operation::element_wise::PassThrough{},
+                                              ck::tensor_operation::element_wise::PassThrough{},
+                                              Epilogue{1.0f, 1.0f});
+    out_host.SetZero();
+    ref_invoker.Run(ref_argument);**/
+
+    for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
+    {
+        // substitute instance values into the template
+        auto src = ck::host::InterpolateString(
+            conv_compile_check,
+            {{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});
+
+        auto srcs = get_headers_for_test();
+        srcs.push_back({"main.cpp", src});
+        rtc::compile_options options;
+        auto name           = solution.GetTemplateParameter<std::string>("name");
+        options.kernel_name = "run_" + name;
+        auto k              = rtc::compile_kernel(srcs, options);
+
+        // Grid size calculation: launch parameter for the output tensor times in_lengths[1] (N)
+        auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
+
+        auto tmp = get_launch_params(solution, out_lengths, out_strides);
+
+        auto grid_size = tmp * in_lengths[1];
+
+        // launch the kernel with arguments needed for the argument pointer
+        k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
+                                                              wei_dev.data(),
+                                                              out_dev.data(),
+                                                              in_lengths,
+                                                              in_strides,
+                                                              wei_lengths,
+                                                              wei_strides,
+                                                              out_lengths,
+                                                              out_strides,
+                                                              conv_filter_strides,
+                                                              conv_filter_dilations,
+                                                              input_left_pads,
+                                                              input_right_pads);
+
+        // auto res = rtc::from_gpu(out_dev);
+        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
+        // assert(pass);
+
+        // Simple check: this checks that the output from each instance matches the output from the
+        // first instance
+        CHECK(report(solution, check(rtc::from_gpu(out_dev))));
+    }
+}
+
+// Test executable entry point: forwards the command line to the test driver.
+int main(int argc, const char* argv[])
+{
+    test::run(argc, argv);
+}
--- a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
+#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_op.hpp"
+#include "ck/host/device_grouped_conv_fwd_multiple_d/conv_fwd_problem.hpp"
+#include "ck/host/headers.hpp"
+#include "ck/host/stringutils.hpp"
+#include "ck/host/utils.hpp"
+#include "ck/tensor_operation/gpu/device/helper.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
+#include "common.hpp"
+#include <test.hpp>
+#include <rtc/compile_kernel.hpp>
+#include <rtc/hip.hpp>
+#include <fstream>
+
+// need this for verification
+/**struct Epilogue
+{
+    Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
+
+    template <typename E, typename D>
+    __host__ __device__ constexpr void operator()(E& e, const D& d) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
+                                                                          const ck::half_t& d) const
+    {
+        e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
+    }
+
+    float alpha_;
+    float beta_;
+};**/
+// Source skeleton of the runtime-compiled translation unit. The ${include} and ${template}
+// placeholders are filled in per solution through ck::host::InterpolateString.
+const std::string conv_compile_check{R"__ck__(
+#include <${include}>
+
+${template};
+
+)__ck__"};
+
+TEST_CASE(test_problem_kernel)
+{
+    /* Set up the problem specification: a 2-D grouped forward convolution with a 3x3 filter
+     * (Y, X) over a 28x28 input (Hi, Wi). With the unit stride and the padding of 1 on each
+     * side chosen below, this matches the 28x28 output (Ho, Wo). */
+    ck::host::conv::Problem_Conv_Fwd prob;
+    prob.NumDim = 2;
+    prob.G      = 32;
+    prob.N      = 256;
+    prob.C      = 32;
+    prob.K      = 64;
+    prob.Y      = 3;
+    prob.X      = 3;
+    prob.Hi     = 28;
+    prob.Wi     = 28;
+    prob.Ho     = 28;
+    prob.Wo     = 28;
+    check_all<ck::half_t> check; // stores the first output seen; later outputs are compared to it
+
+    /* user provided fusion operations: source text of the epilogue functor that is handed to
+     * GetSolutions below; the prologue is left empty */
+    std::string epilogue = R"(
+struct Epilogue
+{
+    __host__ __device__ Epilogue(float alpha, float beta) : alpha_(alpha), beta_(beta){};
+
+    template <typename E, typename D>
+    __host__ __device__ constexpr void operator()(E& e, const D& d) const;
+
+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, ck::half_t>(ck::half_t& e,
+                                                                          const ck::half_t& d) const
+    {
+        e = ck::type_convert<ck::half_t>(alpha_ * e + beta_ * ck::type_convert<float>(d));
+    }
+
+    float alpha_;
+    float beta_;
+};
+)";
+    std::string prologue = "";
+
+    /* length+stride arrays: lengths are ordered (G, N, C or K, H, W) for input/output and
+     * (G, K, C, Y, X) for weights; the strides use the same ordering, with unit stride on the
+     * third entry (C for input/weights, K for output) */
+    ck::Array<ck::index_t, 5> in_lengths{static_cast<int>(prob.G),
+                                         static_cast<int>(prob.N),
+                                         static_cast<int>(prob.C),
+                                         static_cast<int>(prob.Hi),
+                                         static_cast<int>(prob.Wi)};
+    ck::Array<ck::index_t, 5> out_lengths{static_cast<int>(prob.G),
+                                          static_cast<int>(prob.N),
+                                          static_cast<int>(prob.K),
+                                          static_cast<int>(prob.Ho),
+                                          static_cast<int>(prob.Wo)};
+    ck::Array<ck::index_t, 5> wei_lengths{static_cast<int>(prob.G),
+                                          static_cast<int>(prob.K),
+                                          static_cast<int>(prob.C),
+                                          static_cast<int>(prob.Y),
+                                          static_cast<int>(prob.X)};
+
+    ck::Array<ck::index_t, 5> in_strides{static_cast<int>(prob.C),
+                                         static_cast<int>(prob.Hi * prob.Wi * prob.G * prob.C),
+                                         1,
+                                         static_cast<int>(prob.Wi * prob.G * prob.C),
+                                         static_cast<int>(prob.G * prob.C)};
+    ck::Array<ck::index_t, 5> out_strides{static_cast<int>(prob.K),
+                                          static_cast<int>(prob.Ho * prob.Wo * prob.G * prob.K),
+                                          1,
+                                          static_cast<int>(prob.Wo * prob.G * prob.K),
+                                          static_cast<int>(prob.G * prob.K)};
+    ck::Array<ck::index_t, 5> wei_strides{static_cast<int>(prob.K * prob.Y * prob.X * prob.C),
+                                          static_cast<int>(prob.Y * prob.X * prob.C),
+                                          1,
+                                          static_cast<int>(prob.X * prob.C),
+                                          static_cast<int>(prob.C)};
+
+    ck::Array<ck::index_t, 2> conv_filter_strides   = {1, 1};
+    ck::Array<ck::index_t, 2> conv_filter_dilations = {1, 1};
+    ck::Array<ck::index_t, 2> input_left_pads       = {1, 1};
+    ck::Array<ck::index_t, 2> input_right_pads      = {1, 1};
+
+    // generate host buffers (seeds 0, 1, 2) and move the data onto the device
+    auto in_dev =
+        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(in_lengths, in_strides, 0));
+    auto wei_dev =
+        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(wei_lengths, wei_strides, 1));
+    auto out_dev =
+        to_gpu(generate_buffer<ck::half_t, ck::Array<ck::index_t, 5>>(out_lengths, out_strides, 2));
+
+    // CK Verification: Reference Kernel (currently commented out)
+    /**bool pass = true;
+    Tensor<ck::half_t> in_host(in_lengths, in_strides);
+    in_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
+    Tensor<ck::half_t> wei_host(wei_lengths, wei_strides);
+    wei_host.GenerateTensorValue(GeneratorTensor_1<ck::half_t>{1});
+    Tensor<ck::half_t> out_host(out_lengths, out_strides);
+
+    std::vector<ck::index_t> conv_filter_strides_   = {1, 1};
+    std::vector<ck::index_t> conv_filter_dilations_ = {1, 1};
+    std::vector<ck::index_t> input_left_pads_       = {1, 1};
+    std::vector<ck::index_t> input_right_pads_      = {1, 1};
+
+    auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<
+        2,
+        ck::half_t,
+        ck::half_t,
+        ck::half_t,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        Epilogue>();
+
+    auto ref_invoker  = ref_conv.MakeInvoker();
+    auto ref_argument = ref_conv.MakeArgument(in_host,
+                                              wei_host,
+                                              out_host,
+                                              conv_filter_strides_,
+                                              conv_filter_dilations_,
+                                              input_left_pads_,
+                                              input_right_pads_,
+                                              ck::tensor_operation::element_wise::PassThrough{},
+                                              ck::tensor_operation::element_wise::PassThrough{},
+                                              Epilogue{1.0f, 1.0f});
+    out_host.SetZero();
+    ref_invoker.Run(ref_argument);**/
+
+    for(auto solution : prob.GetSolutions("gfx908", prologue, epilogue))
+    {
+        // substitute instance values into the template
+        auto src = ck::host::InterpolateString(
+            conv_compile_check,
+            {{"include", prob.GetIncludeHeader()}, {"template", solution.ToTemplateString()}});
+
+        auto srcs = get_headers_for_test();
+        srcs.push_back({"main.cpp", src});
+        rtc::compile_options options;
+        auto name           = solution.GetTemplateParameter<std::string>("name");
+        options.kernel_name = "run_" + name;
+        auto k              = rtc::compile_kernel(srcs, options);
+
+        // Grid size calculation: launch parameter for the output tensor times in_lengths[1] (N)
+        auto block_size = solution.GetTemplateParameter<ck::index_t>("BlockSize");
+
+        auto tmp = get_launch_params(solution, out_lengths, out_strides);
+
+        auto grid_size = tmp * in_lengths[1];
+
+        // launch the kernel with arguments needed for the argument pointer
+        k.launch(nullptr, grid_size * block_size, block_size)(in_dev.data(),
+                                                              wei_dev.data(),
+                                                              out_dev.data(),
+                                                              in_lengths,
+                                                              in_strides,
+                                                              wei_lengths,
+                                                              wei_strides,
+                                                              out_lengths,
+                                                              out_strides,
+                                                              conv_filter_strides,
+                                                              conv_filter_dilations,
+                                                              input_left_pads,
+                                                              input_right_pads);
+
+        // auto res = rtc::from_gpu(out_dev);
+        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
+        // assert(pass);
+
+        // Simple check: this checks that the output from each instance matches the output from the
+        // first instance
+        CHECK(report(solution, check(rtc::from_gpu(out_dev))));
+    }
+}
+
+// Test executable entry point: forwards the command line to the test driver.
+int main(int argc, const char* argv[])
+{
+    test::run(argc, argv);
+}
--- a/codegen/test/include/common.hpp
+++ b/codegen/test/include/common.hpp
+#pragma once
+#include <algorithm>
+#include <cmath>
+#include <iterator>
+#include <numeric>
+#include <random>
+#include <test.hpp>
+#include <rtc/compile_kernel.hpp>
+#include <rtc/hip.hpp>
+#include <fstream>
+
+// Converts the entries returned by ck::host::GetHeaders() into rtc source files.
+//
+// Each entry's `first` and `second` members are used, in that order, to initialise one
+// rtc::src_file; the result keeps the iteration order of GetHeaders().
+//
+// Declared `inline` because the definition lives in a header: without it, including this
+// header from more than one translation unit of the same program violates the ODR.
+inline std::vector<rtc::src_file> get_headers_for_test()
+{
+    std::vector<rtc::src_file> result;
+    const auto hs = ck::host::GetHeaders();
+    // The lambda needs no captures; it only repackages the entry it is given.
+    std::transform(
+        hs.begin(), hs.end(), std::back_inserter(result), [](const auto& p) -> rtc::src_file {
+            return {p.first, p.second};
+        });
+    return result;
+}
+
+// Returns the number of elements a buffer needs in order to back a strided tensor:
+// 1 + sum over all dimensions of (length - 1) * stride. Dimensions of length 0 are skipped.
+//
+// V must provide Size() and operator[]; mStrides is indexed over the same range as mLens.
+// Every factor is widened to std::size_t before multiplying so that the product is not
+// evaluated (and possibly overflowed) in V's narrower element type.
+template <typename V>
+std::size_t GetSize(V mLens, V mStrides)
+{
+    std::size_t space = 1;
+    // Convert once so the loop condition does not mix signed and unsigned operands.
+    const auto rank = static_cast<std::size_t>(mLens.Size());
+    for(std::size_t i = 0; i < rank; ++i)
+    {
+        if(mLens[i] == 0)
+            continue;
+
+        space += static_cast<std::size_t>(mLens[i] - 1) * static_cast<std::size_t>(mStrides[i]);
+    }
+    return space;
+}
+
+// Creates a host buffer sized by GetSize(mLens, mStrides) and fills every element with a
+// value drawn from a uniform real distribution over [-1, 1), using a std::mt19937 engine
+// seeded with `seed`. The drawn doubles are converted to T on assignment.
+template <class T, typename V>
+rtc::buffer<T> generate_buffer(V mLens, V mStrides, std::size_t seed = 0)
+{
+    rtc::buffer<T> result(GetSize(mLens, mStrides));
+    std::mt19937 engine(seed);
+    // Upper bound spelled out; it equals the distribution's default of 1.0.
+    std::uniform_real_distribution<double> uniform(-1.0, 1.0);
+    const auto next_value = [&] { return uniform(engine); };
+    std::generate(result.begin(), result.end(), next_value);
+    return result;
+}
+
+// Approximate element-wise comparison of two ranges.
+// Returns true only if both ranges have the same number of elements and every pair
+// (x from a, y from b) satisfies |x - y| < atol + rtol * |y|. Elements are compared as double.
+template <class T, class U>
+bool allclose(const T& a, const U& b, double atol = 0.01, double rtol = 0.01)
+{
+    const auto within_tolerance = [atol, rtol](double lhs, double rhs) {
+        const double allowed = atol + rtol * std::fabs(rhs);
+        return std::fabs(lhs - rhs) < allowed;
+    };
+    return std::equal(a.begin(), a.end(), b.begin(), b.end(), within_tolerance);
+}
+
+// Maps the floating-point category reported by std::fpclassify(x) to a short label:
+// "inf", "nan", "normal", "subnormal" or "zero"; any other category yields "unknown".
+//
+// Declared `inline` because the definition lives in a header: without it, including this
+// header from more than one translation unit of the same program violates the ODR.
+inline std::string classify(double x)
+{
+    switch(std::fpclassify(x))
+    {
+    case FP_INFINITE: return "inf";
+    case FP_NAN: return "nan";
+    case FP_NORMAL: return "normal";
+    case FP_SUBNORMAL: return "subnormal";
+    case FP_ZERO: return "zero";
+    default: return "unknown";
+    }
+}
+
+// Prints, on one line of std::cout, the distinct classify() labels that occur in x, each
+// followed by ", ". The labels come out in the iteration order of the unordered set, so their
+// order is unspecified. The line is terminated (and flushed) with std::endl.
+template <class Buffer>
+void print_classification(const Buffer& x)
+{
+    std::unordered_set<std::string> labels;
+    for(auto it = x.begin(); it != x.end(); ++it)
+        labels.insert(classify(*it));
+    for(const auto& label : labels)
+    {
+        std::cout << label << ", ";
+    }
+    std::cout << std::endl;
+}
+
+// Prints minimum, maximum, mean and population standard deviation (the sum of squared
+// deviations is divided by the element count) of x to std::cout on a single line.
+//
+// An empty buffer prints "Empty buffer" instead: min_element/max_element would otherwise
+// return end(), which must not be dereferenced, and the mean would be 0 / 0.
+template <class Buffer>
+void print_statistics(const Buffer& x)
+{
+    if(x.size() == 0)
+    {
+        std::cout << "Empty buffer\n";
+        return;
+    }
+    std::cout << "Min value: " << *std::min_element(x.begin(), x.end()) << ", ";
+    std::cout << "Max value: " << *std::max_element(x.begin(), x.end()) << ", ";
+    const double num_elements = x.size();
+    const double mean =
+        std::accumulate(x.begin(), x.end(), double{0.0}, std::plus<double>{}) / num_elements;
+    // Square by multiplication; std::pow is a general-purpose routine and needless here.
+    const double sum_sq_dev =
+        std::accumulate(x.begin(), x.end(), double{0.0}, [&](double r, double v) {
+            const double dev = v - mean;
+            return r + dev * dev;
+        });
+    const double stddev = std::sqrt(sum_sq_dev / num_elements);
+    std::cout << "Mean: " << mean << ", ";
+    std::cout << "StdDev: " << stddev << "\n";
+}
+
+// Prints a one-line preview of x to std::cout. Buffers of up to 10 elements are printed in
+// full; longer ones show the first five elements, "..., ", then the last five. Each element is
+// converted to double and followed by ", "; the line ends with std::endl.
+template <class Buffer>
+void print_preview(const Buffer& x)
+{
+    const auto show = [](double value) { std::cout << value << ", "; };
+    if(x.size() > 10)
+    {
+        std::for_each(x.begin(), x.begin() + 5, show);
+        std::cout << "..., ";
+        std::for_each(x.end() - 5, x.end(), show);
+    }
+    else
+    {
+        std::for_each(x.begin(), x.end(), show);
+    }
+    std::cout << std::endl;
+}
+
+// Stateful comparator: the first non-empty buffer passed in is stored as the
+// reference and accepted; every later buffer is compared against that
+// reference with allclose (default tolerances).
+template <class T>
+struct check_all
+{
+    rtc::buffer<T> data{};
+    bool operator()(const rtc::buffer<T>& x)
+    {
+        const bool have_reference = not data.empty();
+        if(have_reference)
+            return allclose(data, x);
+        // Nothing recorded yet: remember this buffer as the reference
+        data = x;
+        return true;
+    }
+};
+
+// Builds a test predicate labelled with the solution's template string whose
+// value is the already-computed pass flag.
+template <class Solution>
+auto report(const Solution& solution, bool pass)
+{
+    return test::make_predicate(solution.ToTemplateString(), [pass] { return pass; });
+}
--- a/codegen/test/include/test.hpp
+++ b/codegen/test/include/test.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <atomic>
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <chrono>
+#include <functional>
+#include <iostream>
+#include <sstream>
+#include <type_traits>
+#include <unordered_map>
+#include <vector>
+
+#ifdef __linux__
+#include <unistd.h>
+#endif
+
+#ifndef MIGRAPHX_GUARD_TEST_TEST_HPP
+#define MIGRAPHX_GUARD_TEST_TEST_HPP
+
+namespace test {
+// X-macro listing the binary operators supported by the test expression
+// wrappers. `m` is invoked once per entry as m(<operator token>, <name>).
+// clang-format off
+// NOLINTNEXTLINE
+#define TEST_FOREACH_BINARY_OPERATORS(m) \
+    m(==, equal) \
+    m(!=, not_equal) \
+    m(<=, less_than_equal) \
+    m(>=, greater_than_equal) \
+    m(<, less_than) \
+    m(>, greater_than) \
+    m(and, and_op) \
+    m(or, or_op)
+// clang-format on
+
+// X-macro listing the unary operators supported by the test expression
+// wrappers; `m` is invoked as m(<operator token>, <name>).
+// clang-format off
+// NOLINTNEXTLINE
+#define TEST_FOREACH_UNARY_OPERATORS(m) \
+    m(not, not_op)
+// clang-format on
+
+// Defines a struct `name` for binary operator `op`: as_string() returns the
+// operator spelled as text and call(x, y) evaluates `x op y`.
+// NOLINTNEXTLINE
+#define TEST_EACH_BINARY_OPERATOR_OBJECT(op, name)     \
+    struct name                                        \
+    {                                                  \
+        static std::string as_string() { return #op; } \
+        template <class T, class U>                    \
+        static decltype(auto) call(T&& x, U&& y)       \
+        {                                              \
+            return x op y;                             \
+        }                                              \
+    };
+
+// Defines a struct `name` for unary operator `op`: as_string() returns the
+// operator spelled as text and call(x) evaluates `op x`.
+// NOLINTNEXTLINE
+#define TEST_EACH_UNARY_OPERATOR_OBJECT(op, name)      \
+    struct name                                        \
+    {                                                  \
+        static std::string as_string() { return #op; } \
+        template <class T>                             \
+        static decltype(auto) call(T&& x)              \
+        {                                              \
+            return op x;                               \
+        }                                              \
+    };
+
+// Instantiate one operator struct per entry of the two operator lists
+// (equal, not_equal, ..., or_op and not_op).
+TEST_FOREACH_BINARY_OPERATORS(TEST_EACH_BINARY_OPERATOR_OBJECT)
+TEST_FOREACH_UNARY_OPERATORS(TEST_EACH_UNARY_OPERATOR_OBJECT)
+
+// Identity operator (the default Operator of lhs_expression): it has no
+// textual form and call() hands back its argument by value.
+struct nop
+{
+    static std::string as_string() { return std::string{}; }
+    template <class T>
+    static auto call(T&& x)
+    {
+        // `auto` decays the result, so the caller always receives a copy/move
+        return std::forward<T>(x);
+    }
+};
+
+// Operator that evaluates a callable: it has no textual form and call()
+// invokes its argument with no parameters, returning exactly what that
+// invocation yields.
+struct function
+{
+    static std::string as_string() { return std::string{}; }
+    template <class T>
+    static decltype(auto) call(T&& callable)
+    {
+        return callable();
+    }
+};
+
+// Forward declaration so the range operator<< below can name stream_range in
+// its trailing return type before the definition appears.
+template <class Stream, class Iterator>
+Stream& stream_range(Stream& s, Iterator start, Iterator last);
+
+// Streams a null pointer literal as the text "nullptr".
+template <class Stream>
+inline Stream& operator<<(Stream& s, std::nullptr_t)
+{
+    s << "nullptr";
+    return s;
+}
+
+// Streams any range (a type with begin()/end()) as "{ a, b, c}".
+// Types convertible to std::string are excluded so that strings keep their
+// normal stream output; the trailing return type removes this overload when
+// stream_range(s, v.begin(), v.end()) is not a valid expression.
+template <class Stream,
+          class Range,
+          class = typename std::enable_if<not std::is_convertible<Range, std::string>{}>::type>
+inline auto operator<<(Stream& s, const Range& v) -> decltype(stream_range(s, v.begin(), v.end()))
+{
+    s << "{ ";
+    stream_range(s, v.begin(), v.end());
+    s << "}";
+    return s;
+}
+
+// Writes the elements of [start, last) to s separated by ", " (no leading or
+// trailing separator) and returns s. An empty range writes nothing.
+template <class Stream, class Iterator>
+inline Stream& stream_range(Stream& s, Iterator start, Iterator last)
+{
+    if(start == last)
+        return s;
+    s << *start;
+    for(auto it = std::next(start); it != last; ++it)
+        s << ", " << *it;
+    return s;
+}
+
+// Fallback get_value for plain operands: returns x unchanged. The expression
+// wrappers declare friend get_value overloads that evaluate themselves
+// instead.
+template <class T>
+const T& get_value(const T& x)
+{
+    return x;
+}
+
+// Forward declarations: lhs_expression and its factories are used by the
+// operator macros and by `expression` before their definitions appear.
+// With no Operator given, lhs_expression uses the identity operator `nop`.
+template <class T, class Operator = nop>
+struct lhs_expression;
+
+template <class T>
+lhs_expression<T> make_lhs_expression(T&& lhs);
+
+template <class T, class Operator>
+lhs_expression<T, Operator> make_lhs_expression(T&& lhs, Operator);
+
+// Member operator template for expression wrappers: `*this op rhs2` builds an
+// expression node tagged with the operator struct `name` instead of
+// evaluating the operator immediately.
+// NOLINTNEXTLINE
+#define TEST_EXPR_BINARY_OPERATOR(op, name)                       \
+    template <class V>                                            \
+    auto operator op(const V& rhs2) const                         \
+    {                                                             \
+        return make_expression(*this, rhs2, name{}); /* NOLINT */ \
+    }
+
+// Member unary operator for expression wrappers: wraps the member `lhs` in an
+// lhs_expression tagged with the operator struct `name`.
+// NOLINTNEXTLINE
+#define TEST_EXPR_UNARY_OPERATOR(op, name) \
+    auto operator op() const { return make_lhs_expression(lhs, name{}); /* NOLINT */ }
+
+// Binary expression node: stores both operands and the operator tag so a
+// failed check can print "lhs <op> rhs" as well as evaluate it.
+template <class T, class U, class Operator>
+struct expression
+{
+    T lhs;
+    U rhs;
+
+    // Prints the operands with the operator's text between them
+    friend std::ostream& operator<<(std::ostream& s, const expression& self)
+    {
+        s << self.lhs << " " << Operator::as_string() << " " << self.rhs;
+        return s;
+    }
+
+    // Lets a nested expression be evaluated when it is an operand of another
+    friend decltype(auto) get_value(const expression& e) { return e.value(); }
+
+    // Evaluates both operands (recursively via get_value) and applies Operator
+    decltype(auto) value() const { return Operator::call(get_value(lhs), get_value(rhs)); };
+
+    // Further operators applied to this node build larger expression nodes
+    TEST_FOREACH_UNARY_OPERATORS(TEST_EXPR_UNARY_OPERATOR)
+    TEST_FOREACH_BINARY_OPERATORS(TEST_EXPR_BINARY_OPERATOR)
+};
+
+// Builds an expression node from two operands and an operator tag.
+// NOTE(review): the parameter names are swapped relative to their role; the
+// first argument (named `rhs`) initialises expression::lhs and the second
+// (named `lhs`) initialises expression::rhs.
+// When an argument is an lvalue, T/U deduce to reference types, so the node
+// stores a reference rather than a copy.
+// TODO: Remove rvalue references
+template <class T, class U, class Operator>
+expression<T, U, Operator> make_expression(T&& rhs, U&& lhs, Operator)
+{
+    return {std::forward<T>(rhs), std::forward<U>(lhs)};
+}
+
+// Wraps `lhs` in an lhs_expression with the default (identity) operator.
+// For an lvalue argument T deduces to a reference type, so the wrapper holds
+// a reference rather than a copy.
+// TODO: Remove rvalue reference
+template <class T>
+lhs_expression<T> make_lhs_expression(T&& lhs)
+{
+    return lhs_expression<T>{std::forward<T>(lhs)};
+}
+
+// Wraps `lhs` in an lhs_expression tagged with the given operator type; the
+// operator argument is used only for deduction.
+template <class T, class Operator>
+lhs_expression<T, Operator> make_lhs_expression(T&& lhs, Operator)
+{
+    return lhs_expression<T, Operator>{std::forward<T>(lhs)};
+}
+
+// Unary expression node: one operand plus an operator tag (nop, not_op or
+// function). It is the starting point of a captured expression; applying a
+// comparison or logical operator to it yields an `expression` node.
+template <class T, class Operator>
+struct lhs_expression
+{
+    T lhs;
+    explicit lhs_expression(T e) : lhs(e) {}
+
+    // Prints the operator text (if any) followed by the operand
+    friend std::ostream& operator<<(std::ostream& s, const lhs_expression& self)
+    {
+        std::string op = Operator::as_string();
+        if(not op.empty())
+            s << Operator::as_string() << " ";
+        s << self.lhs;
+        return s;
+    }
+
+    // Lets this node be evaluated when it is an operand of another expression
+    friend decltype(auto) get_value(const lhs_expression& e) { return e.value(); }
+
+    // Evaluates the operand (recursively via get_value) and applies Operator
+    decltype(auto) value() const { return Operator::call(get_value(lhs)); }
+
+    TEST_FOREACH_BINARY_OPERATORS(TEST_EXPR_BINARY_OPERATOR)
+    TEST_FOREACH_UNARY_OPERATORS(TEST_EXPR_UNARY_OPERATOR)
+
+// Arithmetic and bitwise operators are applied eagerly: `lhs op rhs` is
+// computed right away and the result is re-wrapped as a new lhs_expression.
+// NOLINTNEXTLINE
+#define TEST_LHS_REOPERATOR(op)                 \
+    template <class U>                          \
+    auto operator op(const U& rhs) const        \
+    {                                           \
+        return make_lhs_expression(lhs op rhs); \
+    }
+    TEST_LHS_REOPERATOR(+)
+    TEST_LHS_REOPERATOR(-)
+    TEST_LHS_REOPERATOR(*)
+    TEST_LHS_REOPERATOR(/)
+    TEST_LHS_REOPERATOR(%)
+    TEST_LHS_REOPERATOR(&)
+    TEST_LHS_REOPERATOR(|)
+    TEST_LHS_REOPERATOR(^)
+};
+
+// Pairs a callable with a message: streaming a predicate prints the message,
+// while calling it (or converting it) evaluates the callable.
+template <class F>
+struct predicate
+{
+    std::string msg;
+    F f;
+
+    friend std::ostream& operator<<(std::ostream& s, const predicate& self)
+    {
+        s << self.msg;
+        return s;
+    }
+
+    decltype(auto) operator()() const { return f(); }
+
+    // Implicit conversion to whatever type f() returns
+    operator decltype(auto)() const { return f(); }
+};
+
+// Creates an lhs_expression that prints as `msg` and evaluates to f().
+// The predicate is passed as a temporary so the wrapper stores it by value.
+template <class F>
+auto make_predicate(const std::string& msg, F f)
+{
+    return make_lhs_expression(predicate<F>{msg, f}, function{});
+}
+
+// Spells a boolean as "true" or "false".
+inline std::string as_string(bool x)
+{
+    return x ? "true" : "false";
+}
+
+// Formats any streamable value by writing it to a string stream and
+// returning the accumulated text.
+template <class T>
+std::string as_string(const T& x)
+{
+    std::stringstream formatted;
+    formatted << x;
+    return formatted.str();
+}
+
+// Formats the range [start, last) with stream_range and returns the text.
+template <class Iterator>
+std::string as_string(Iterator start, Iterator last)
+{
+    std::stringstream formatted;
+    stream_range(formatted, start, last);
+    return formatted.str();
+}
+
+// Turns `f` into a named, printable test function. Calling the returned
+// lambda with arguments yields a predicate that prints as "name(arg1, arg2)"
+// and evaluates to f(args...). Both lambdas capture by copy, so the
+// arguments are stored inside the predicate.
+template <class F>
+auto make_function(const std::string& name, F f)
+{
+    return [=](auto&&... xs) {
+        // Stringify the arguments now so the message is fixed at creation
+        std::vector<std::string> args = {as_string(xs)...};
+        return make_predicate(name + "(" + as_string(args.begin(), args.end()) + ")",
+                              [=] { return f(xs...); });
+    };
+}
+
+// Entry point of expression capturing. `capture{}->*x` wraps x in an
+// lhs_expression; operators applied afterwards then build expression nodes.
+struct capture
+{
+    template <class T>
+    auto operator->*(const T& x) const
+    {
+        return make_lhs_expression(x);
+    }
+
+    // Operands that are already lhs_expressions are returned as they are
+    template <class T, class Operator>
+    auto operator->*(const lhs_expression<T, Operator>& x) const
+    {
+        return x;
+    }
+};
+
+// Terminal text attributes. Each enumerator's value is the number the stream
+// operator for `color` writes between "\033[" and "m".
+enum class color
+{
+    reset      = 0,
+    bold       = 1,
+    underlined = 4,
+    fg_red     = 31,
+    fg_green   = 32,
+    fg_yellow  = 33,
+    fg_blue    = 34,
+    fg_default = 39,
+    bg_red     = 41,
+    bg_green   = 42,
+    bg_yellow  = 43,
+    bg_blue    = 44,
+    bg_default = 49
+};
+// Writes the escape sequence "\033[<value>m" for c. Outside Windows the
+// sequence is emitted only when standard output is a terminal (checked once,
+// on first use); on Windows nothing is ever written.
+inline std::ostream& operator<<(std::ostream& os, const color& c)
+{
+#ifndef _WIN32
+    static const bool use_color = isatty(STDOUT_FILENO) != 0;
+    if(not use_color)
+        return os;
+    return os << "\033[" << static_cast<std::size_t>(c) << "m";
+#else
+    (void)c;
+    return os;
+#endif
+}
+
+// Returns the process-wide counter of failed checks, held in a function-local
+// static so every caller shares the same atomic.
+inline std::atomic<int>& failures()
+{
+    // NOLINTNEXTLINE
+    static std::atomic<int> counter{0};
+    return counter;
+}
+
+// Evaluates the captured expression x. When it is false the failure counter
+// is incremented, the location and the expression (source text `msg` plus
+// its printed form) are reported on std::cout, and the handler f is invoked.
+// A true expression does nothing.
+template <class T, class F>
+void failed(T x, const char* msg, const char* func, const char* file, int line, F f)
+{
+    if(bool(x.value()))
+        return;
+    failures()++;
+    std::cout << func << std::endl;
+    std::cout << file << ":" << line << ":" << std::endl;
+    std::cout << color::bold << color::fg_red << "    FAILED: " << color::reset << msg << " "
+              << "[ " << x << " ]" << std::endl;
+    f();
+}
+
+// Returns true when calling f throws anything at all, false when it returns
+// normally.
+template <class F>
+bool throws(F f)
+{
+    bool threw = false;
+    try
+    {
+        f();
+    }
+    catch(...)
+    {
+        threw = true;
+    }
+    return threw;
+}
+
+// Returns true when calling f throws an Exception whose what() contains msg
+// (any message matches the default empty msg). Returns false when f returns
+// normally or the message does not match; other exception types propagate.
+template <class Exception, class F>
+bool throws(F f, const std::string& msg = "")
+{
+    bool matched = false;
+    try
+    {
+        f();
+    }
+    catch(const Exception& ex)
+    {
+        const std::string what = ex.what();
+        matched                = what.find(msg) != std::string::npos;
+    }
+    return matched;
+}
+
+// Predicate that is true when |px - py| < ptol. It is built with
+// make_function, so a failing check prints as "near(px, py, ptol)".
+// The default tolerance is written as a float literal (1e-6f) that is
+// converted to double.
+template <class T, class U>
+auto within_abs(T px, U py, double ptol = 1e-6f)
+{
+    return make_function("near", [](auto x, auto y, auto tol) { return std::abs(x - y) < tol; })(
+        px, py, ptol);
+}
+
+// This implements the basic globbing algorithm where `*` matches any number
+// of characters (including none) and `?` matches any single character. It
+// doesn't support character classes.
+//
+// This is a simple recursive implementation that scans the string while the
+// string and pattern match. When a `*` is found in the pattern, the
+// `glob_match` function is called recursively to compare the rest of the
+// pattern to the rest of the string. If the recursive call returns true,
+// then we have a match. However, if it returns false, then we advance one
+// character and make the recursive call again. This is referred to as a
+// star-loop, which will consume zero or more characters.
+//
+// This simple recursive implementation works well for short strings and
+// patterns with few stars. First, it is unlikely that many stars are used to
+// glob test names. Secondly, using many stars is still significantly faster
+// than using the equivalent std::regex, which has a much slower time complexity.
+template <class Iterator1, class Iterator2>
+bool glob_match(Iterator1 start, Iterator1 last, Iterator2 pattern_start, Iterator2 pattern_last)
+{
+    // Walk text and pattern together: `?` accepts any character, while `*`
+    // always stops the scan so that the star-loop below can take over.
+    const auto same_char = [](auto c, auto m) { return m != '*' and (m == '?' or c == m); };
+    std::tie(start, pattern_start) =
+        std::mismatch(start, last, pattern_start, pattern_last, same_char);
+    // Pattern exhausted: it is a match only if the text is exhausted too
+    if(pattern_start == pattern_last)
+        return start == last;
+    // Stopped on something other than a star: plain mismatch
+    if(*pattern_start != '*')
+        return false;
+    // A run of stars behaves like a single star
+    pattern_start = std::find_if(pattern_start, pattern_last, [](auto c) { return c != '*'; });
+    // A trailing star swallows whatever text is left
+    if(pattern_start == pattern_last)
+        return true;
+    // star-loop: try the remaining pattern at every remaining text position.
+    // The remaining pattern starts with a non-star character, so it can never
+    // match once the text is used up.
+    for(; start != last; start++)
+    {
+        if(glob_match(start, last, pattern_start, pattern_last))
+            return true;
+    }
+    return false;
+}
+
+// Maps a flag name (or "" for positional arguments) to its collected values
+using string_map = std::unordered_map<std::string, std::vector<std::string>>;
+
+// Groups command-line words by flag.
+//
+// `keyword(word)` returns an empty container for ordinary words and a
+// non-empty one for flags. For a flag, the front element is the key that is
+// created in the result and the back element is the key under which the
+// following ordinary words are stored. Words seen before any flag are stored
+// under "".
+template <class Keyword>
+string_map generic_parse(std::vector<std::string> as, Keyword keyword)
+{
+    string_map parsed;
+
+    std::string current;
+    for(auto&& word : as)
+    {
+        auto match = keyword(word);
+        if(not match.empty())
+        {
+            current = match.front();
+            parsed[current]; // Ensure the flag exists
+            current = match.back();
+            continue;
+        }
+        parsed[current].push_back(word);
+    }
+    return parsed;
+}
+
+// A test case is any callable taking no arguments
+using test_case = std::function<void()>;
+
+// Returns the registry of (name, test case) pairs in registration order. It
+// lives in a function-local static, so it is created on first use.
+inline auto& get_test_cases()
+{
+    using registry = std::vector<std::pair<std::string, test_case>>;
+    // NOLINTNEXTLINE
+    static registry cases;
+    return cases;
+}
+
+// Appends a named test case to the registry returned by get_test_cases().
+inline void add_test_case(std::string name, test_case f)
+{
+    auto& registry = get_test_cases();
+    registry.emplace_back(std::move(name), std::move(f));
+}
+
+// Registers a test case as a side effect of construction; TEST_CASE_REGISTER
+// creates one static instance per test.
+// NOTE(review): the constructor is noexcept but add_test_case allocates, so
+// an allocation failure here would terminate the program.
+struct auto_register_test_case
+{
+    template <class F>
+    auto_register_test_case(const char* name, F f) noexcept
+    {
+        add_test_case(name, f);
+    }
+};
+
+// Exception thrown by fail() to abort the running test case; the driver
+// catches it in run_test_case.
+struct failure_error
+{
+};
+
+// Failure handler used by EXPECT: stops the current test by throwing
+[[noreturn]] inline void fail() { throw failure_error{}; }
+
+/// Command-line test runner.
+///
+/// Parses the arguments, selects registered test cases by exact name or glob
+/// pattern (all of them when none are named), runs them and prints a summary.
+/// With --continue each test runs in a child process started via
+/// std::system, so a crash in one test does not stop the rest.
+struct driver
+{
+    driver()
+    {
+        add_flag({"--help", "-h"}, "Show help");
+        add_flag({"--list", "-l"}, "List all test cases");
+        add_flag({"--continue", "-c"}, "Continue after failure");
+        add_flag({"--quiet", "-q"}, "Don't print out extra output");
+    }
+    /// Description of one command-line option
+    struct argument
+    {
+        std::vector<std::string> flags = {};
+        std::string help               = "";
+        // Number of values the option takes: add_arg uses 1, add_flag uses 0
+        int nargs = 1;
+    };
+
+    /// Registers an option that takes values
+    void add_arg(const std::vector<std::string>& flags, const std::string& help = "")
+    {
+        arguments.push_back(argument{flags, help, 1});
+    }
+
+    /// Registers an option that takes no values
+    void add_flag(const std::vector<std::string>& flags, const std::string& help = "")
+    {
+        arguments.push_back(argument{flags, help, 0});
+    }
+
+    /// Writes `text` to `os` word by word, starting every line with `prefix`
+    /// and breaking before a word that would push the line past
+    /// `line_length` characters.
+    static void wrap(std::ostream& os,
+                     const std::string& text,
+                     const std::string& prefix = "",
+                     unsigned int line_length  = 80)
+    {
+        std::istringstream iss(text);
+        std::string line = prefix;
+        std::string word;
+        // Test the extraction itself so that the failed read at end of input
+        // never appends an empty word (and a stray space) to the last line
+        while(iss >> word)
+        {
+            if(line.length() + word.length() > line_length)
+            {
+                os << line << std::endl;
+                line = prefix;
+            }
+            line += word + " ";
+        }
+        if(not line.empty())
+            os << line << std::endl;
+    }
+
+    /// Prints usage, the positional argument description and every option
+    void show_help(const std::string& exe) const
+    {
+        const std::string prefix = "    ";
+        std::cout << std::endl;
+        std::cout << color::fg_yellow << "USAGE:" << color::reset << std::endl;
+        std::cout << "    ";
+        std::cout << exe << " <test-case>... <options>" << std::endl;
+        std::cout << std::endl;
+
+        std::cout << color::fg_yellow << "ARGS:" << color::reset << std::endl;
+        std::cout << "    ";
+        std::cout << color::fg_green << "<test-case>..." << color::reset;
+        std::cout << std::endl;
+
+        wrap(std::cout,
+             "Test cases to run. A test case can be either the exact test case name or a glob. A "
+             "glob expression uses a '*' to select zero or more characters or a '?' to select any "
+             "single character.",
+             prefix + prefix);
+
+        std::cout << std::endl;
+        std::cout << color::fg_yellow << "OPTIONS:" << color::reset << std::endl;
+        for(auto&& arg : arguments)
+        {
+            std::cout << color::fg_green;
+            // First spelling is indented, later spellings are comma separated
+            std::string arg_prefix = prefix;
+            for(const std::string& a : arg.flags)
+            {
+                std::cout << arg_prefix;
+                std::cout << a;
+                arg_prefix = ", ";
+            }
+            std::cout << color::reset << std::endl;
+            wrap(std::cout, arg.help, prefix + prefix);
+        }
+    }
+
+    /// Stream for progress output: std::cout normally, a stream that discards
+    /// everything when `quiet` is set
+    std::ostream& out() const
+    {
+        struct null_buffer : std::streambuf
+        {
+            virtual int overflow(int c) override { return c; }
+        };
+        static null_buffer buffer;
+        static std::ostream null_stream(&buffer);
+        if(quiet)
+            return null_stream;
+        return std::cout;
+    }
+
+    /// Groups argv[1..] by option. Every spelling of an option maps to its
+    /// first spelling; positional words end up under "" and argv[0] under
+    /// "__exe__".
+    string_map parse(int argc, const char* argv[]) const
+    {
+        std::vector<std::string> args(argv + 1, argv + argc);
+        string_map keys;
+        for(auto&& arg : arguments)
+        {
+            for(auto&& flag : arg.flags)
+            {
+                keys[flag] = {arg.flags.front()};
+                // A flag takes no values: words after it are positional again
+                if(arg.nargs == 0)
+                    keys[flag].push_back("");
+            }
+        }
+        auto result = generic_parse(args, [&](auto&& s) -> std::vector<std::string> {
+            if(keys.count(s) > 0)
+                return keys[s];
+            else
+                return {};
+        });
+        result["__exe__"].push_back(argv[0]);
+        return result;
+    }
+
+    /// Rebuilds a command line from parsed arguments: the executable, the
+    /// positional words, then every option followed by its values.
+    /// NOTE(review): values are wrapped in double quotes but not escaped, so
+    /// a value containing '"' or shell metacharacters is not passed through
+    /// intact.
+    static std::string create_command(const string_map& args)
+    {
+        std::stringstream ss;
+        ss << args.at("__exe__").front();
+        if(args.count("") > 0)
+        {
+            for(auto&& arg : args.at(""))
+                ss << " \"" << arg << "\"";
+        }
+        for(auto&& p : args)
+        {
+            if(p.first == "__exe__")
+                continue;
+            if(p.first.empty())
+                continue;
+            ss << " " << p.first;
+            for(auto&& arg : p.second)
+                ss << " \"" << arg << "\"";
+        }
+        return ss.str();
+    }
+
+    /// Runs the single test `name` in a child process (quiet, without
+    /// --continue). Returns an empty string on success, otherwise a message
+    /// carrying the value returned by std::system.
+    static std::string fork(const std::string& name, string_map args)
+    {
+        std::string msg;
+        args[""] = {name};
+        args.erase("--continue");
+        args["--quiet"];
+        auto cmd = create_command(args);
+        auto r   = std::system(cmd.c_str()); // NOLINT
+        if(r != 0)
+            msg = "Exited with " + std::to_string(r);
+        return msg;
+    }
+
+    /// Returns every registered test case whose name matches the glob pattern
+    static std::vector<std::pair<std::string, test_case>> glob_tests(const std::string& pattern)
+    {
+        std::vector<std::pair<std::string, test_case>> result;
+        std::copy_if(get_test_cases().begin(),
+                     get_test_cases().end(),
+                     std::back_inserter(result),
+                     [&](auto&& p) {
+                         return glob_match(
+                             p.first.begin(), p.first.end(), pattern.begin(), pattern.end());
+                     });
+        return result;
+    }
+
+    /// Runs one test case, in-process or (with --continue) in a child
+    /// process, times it, prints its result and records its name in `failed`
+    /// when it did not pass.
+    void run_test_case(const std::string& name, const test_case& f, const string_map& args)
+    {
+        ran++;
+        out() << color::fg_green << "[   RUN    ] " << color::reset << color::bold << name
+              << color::reset << std::endl;
+        std::string msg;
+        auto start = std::chrono::steady_clock::now();
+        if(args.count("--continue") > 0)
+        {
+            msg = fork(name, args);
+        }
+        else
+        {
+            try
+            {
+                failures() = 0;
+                f();
+            }
+            // failure_error only aborts the test; the failure itself has
+            // already been counted and reported by test::failed
+            // cppcheck-suppress migraphx-EmptyCatchStatement
+            catch(const failure_error&)
+            {
+            }
+        }
+        auto finish = std::chrono::steady_clock::now();
+        auto elapsed_ms =
+            std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(finish - start)
+                .count();
+        if(msg.empty() and failures() != 0)
+        {
+            if(failures() == 1)
+                msg = "Test failure";
+            else
+                msg = std::to_string(failures()) + " test failures";
+        }
+        if(msg.empty())
+        {
+            out() << color::fg_green << "[ COMPLETE ] " << color::reset;
+        }
+        else
+        {
+            failed.push_back(name);
+            out() << color::fg_red << "[  FAILED  ] " << color::reset;
+        }
+        out() << color::bold << name << color::reset;
+        out() << color::fg_blue << " (" << elapsed_ms << "ms)" << color::reset;
+        if(not msg.empty())
+            out() << ": " << color::fg_yellow << msg << color::reset;
+        out() << std::endl;
+    }
+
+    /// Entry point: handles --help and --list, runs the selected test cases
+    /// and prints a summary. Calls std::exit(1) when any test failed or a
+    /// requested test case was not found.
+    void run(int argc, const char* argv[])
+    {
+        auto args = parse(argc, argv);
+        if(args.count("--help") > 0)
+        {
+            show_help(args.at("__exe__").front());
+            return;
+        }
+        if(args.count("--list") > 0)
+        {
+            for(auto&& tc : get_test_cases())
+                out() << tc.first << std::endl;
+            return;
+        }
+
+        if(args.count("--quiet") > 0)
+            quiet = true;
+
+        auto cases = args[""];
+        if(cases.empty())
+        {
+            for(auto&& tc : get_test_cases())
+                run_test_case(tc.first, tc.second, args);
+        }
+        else
+        {
+            std::unordered_map<std::string, test_case> m(get_test_cases().begin(),
+                                                         get_test_cases().end());
+
+            for(auto&& iname : cases)
+            {
+                std::vector<std::pair<std::string, test_case>> found_cases;
+                for(auto&& pattern : get_case_names(iname))
+                {
+                    auto f = m.find(pattern);
+                    if(f == m.end())
+                    {
+                        // Not an exact name, so treat it as a glob. Append the
+                        // matches: assigning here would throw away cases that
+                        // were already collected for earlier names.
+                        auto globbed = glob_tests(pattern);
+                        found_cases.insert(found_cases.end(), globbed.begin(), globbed.end());
+                    }
+                    else
+                    {
+                        found_cases.push_back(*f);
+                    }
+                }
+                if(found_cases.empty())
+                {
+                    out() << color::fg_red << "[  ERROR   ] Test case '" << iname << "' not found."
+                          << color::reset << std::endl;
+                    failed.push_back(iname);
+                }
+                for(auto&& p : found_cases)
+                    run_test_case(p.first, p.second, args);
+            }
+        }
+        out() << color::fg_green << "[==========] " << color::fg_yellow << ran << " tests ran"
+              << color::reset << std::endl;
+        if(not failed.empty())
+        {
+            out() << color::fg_red << "[  FAILED  ] " << color::fg_yellow << failed.size()
+                  << " tests failed" << color::reset << std::endl;
+            for(auto&& name : failed)
+                out() << color::fg_red << "[  FAILED  ] " << color::fg_yellow << name
+                      << color::reset << std::endl;
+            std::exit(1);
+        }
+    }
+
+    // Hook that expands one command-line name into the names/patterns to look
+    // up; by default a name stands for itself
+    std::function<std::vector<std::string>(const std::string&)> get_case_names =
+        [](const std::string& name) -> std::vector<std::string> { return {name}; };
+    std::vector<argument> arguments = {};
+    // Names of the test cases that failed or were not found
+    std::vector<std::string> failed = {};
+    // Number of test cases started
+    std::size_t ran = 0;
+    bool quiet      = false;
+};
+
+// Convenience entry point: runs the test cases selected on the command line
+// with a default-configured driver.
+inline void run(int argc, const char* argv[])
+{
+    driver runner{};
+    runner.run(argc, argv);
+}
+
+} // namespace test
+
+// Wraps an expression so that its operands and operators are recorded
+// (see test::capture) instead of being evaluated on the spot.
+// NOLINTNEXTLINE
+#define TEST_CAPTURE(...) test::capture{}->*__VA_ARGS__
+
+// Non-fatal check: reports a failure and lets the test case continue
+// (the failure handler is an empty lambda).
+// NOLINTNEXTLINE
+#define CHECK(...) \
+    test::failed(  \
+        TEST_CAPTURE(__VA_ARGS__), #__VA_ARGS__, __PRETTY_FUNCTION__, __FILE__, __LINE__, [] {})
+
+// Fatal check: reports a failure and aborts the test case through test::fail.
+// NOLINTNEXTLINE
+#define EXPECT(...)                         \
+    test::failed(TEST_CAPTURE(__VA_ARGS__), \
+                 #__VA_ARGS__,              \
+                 __PRETTY_FUNCTION__,       \
+                 __FILE__,                  \
+                 __LINE__,                  \
+                 &test::fail)
+// Fatal check that an expression evaluates to 0.
+// NOLINTNEXTLINE
+#define STATUS(...) EXPECT((__VA_ARGS__) == 0)
+
+// Token pasting in two steps so that arguments such as __LINE__ are expanded
+// before they are concatenated.
+// NOLINTNEXTLINE
+#define TEST_CAT(x, ...) TEST_PRIMITIVE_CAT(x, __VA_ARGS__)
+// NOLINTNEXTLINE
+#define TEST_PRIMITIVE_CAT(x, ...) x##__VA_ARGS__
+
+// Defines a static auto_register_test_case (its name is made unique with the
+// line number) that registers the function under its own name.
+// NOLINTNEXTLINE
+#define TEST_CASE_REGISTER(...)                                                    \
+    static test::auto_register_test_case TEST_CAT(register_test_case_, __LINE__) = \
+        test::auto_register_test_case(#__VA_ARGS__, &__VA_ARGS__);
+
+// Declares the test function, registers it, and opens its definition; the
+// body follows the macro invocation.
+// NOLINTNEXTLINE
+#define TEST_CASE(...)              \
+    void __VA_ARGS__();             \
+    TEST_CASE_REGISTER(__VA_ARGS__) \
+    void __VA_ARGS__()
+
+// The static registration objects created by TEST_CASE trigger clang's
+// -Wglobal-constructors, so the warning is disabled here.
+// NOTE(review): the push has no matching pop in this header, so the setting
+// also applies to the code that includes it.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wglobal-constructors"
+#endif
+
+#endif
--- a/codegen/test/rtc/CMakeLists.txt
+++ b/codegen/test/rtc/CMakeLists.txt
+# Builds the ck_rtc helper library from every .cpp file under src/.
+# NOTE(review): hip is not marked REQUIRED; if it is missing, the failure
+# only shows up later when hip::host cannot be resolved.
+find_package(hip)
+# CONFIGURE_DEPENDS re-checks the glob at build time so new sources are found
+file(GLOB RTC_SOURCES CONFIGURE_DEPENDS src/*.cpp)
+add_library(ck_rtc ${RTC_SOURCES})
+target_include_directories(ck_rtc PUBLIC include)
+target_link_libraries(ck_rtc PUBLIC hip::host)
+# Links libstdc++fs, the separate library that provided std::filesystem in
+# older libstdc++ releases
+target_link_libraries(ck_rtc PUBLIC -lstdc++fs)
--- a/codegen/test/rtc/include/rtc/compile_kernel.hpp
+++ b/codegen/test/rtc/include/rtc/compile_kernel.hpp
+#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
+#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_COMPILE_KERNEL
+
+#include <rtc/kernel.hpp>
+#include <rtc/filesystem.hpp>
+#include <string>
+
+namespace rtc {
+
+// One source file handed to compile_kernel: its path and its text.
+// `content` is a std::string_view, so it does not own the text; the caller
+// has to keep the underlying characters alive while the src_file is in use.
+struct src_file
+{
+    fs::path path;
+    std::string_view content;
+};
+
+// Options for compile_kernel: extra compiler flags (none by default) and the
+// name of the kernel to look up, which defaults to "main".
+struct compile_options
+{
+    std::string flags{};
+    std::string kernel_name{"main"};
+};
+
+// Compiles the given source files and returns the resulting kernel.
+// Declaration only; the definition is not part of this header.
+kernel compile_kernel(const std::vector<src_file>& src,
+                      compile_options options = compile_options{});
+
+} // namespace rtc
+
+#endif
--- a/codegen/test/rtc/include/rtc/filesystem.hpp
+++ b/codegen/test/rtc/include/rtc/filesystem.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#ifndef GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
+#define GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
+
+#include <string>
+#include <string_view>
+
+// Detects which filesystem library is available and defines
+//   RTC_HAS_FILESYSTEM     - 1 when <filesystem> can be used
+//   RTC_HAS_FILESYSTEM_TS  - 1 when <experimental/filesystem> can be used
+// Both are always defined (to 0 or 1) by the end of this block.
+// clang-format off
+#if defined(CPPCHECK)
+  #define RTC_HAS_FILESYSTEM 1
+  #define RTC_HAS_FILESYSTEM_TS 1
+#elif defined(_WIN32)
+  // On Windows the decision is based on the compiler version
+  #if _MSC_VER >= 1920
+    #define RTC_HAS_FILESYSTEM 1
+    #define RTC_HAS_FILESYSTEM_TS 0
+  #elif _MSC_VER >= 1900
+    #define RTC_HAS_FILESYSTEM 0
+    #define RTC_HAS_FILESYSTEM_TS 1
+  #else
+    #define RTC_HAS_FILESYSTEM 0
+    #define RTC_HAS_FILESYSTEM_TS 0
+  #endif
+#elif defined(__has_include)
+  // Elsewhere probe for the headers; <filesystem> additionally needs C++17
+  #if __has_include(<filesystem>) && __cplusplus >= 201703L
+    #define RTC_HAS_FILESYSTEM 1
+  #else
+    #define RTC_HAS_FILESYSTEM 0
+  #endif
+  #if __has_include(<experimental/filesystem>) && __cplusplus >= 201103L
+    #define RTC_HAS_FILESYSTEM_TS 1
+  #else
+    #define RTC_HAS_FILESYSTEM_TS 0
+  #endif
+#else
+  #define RTC_HAS_FILESYSTEM 0
+  #define RTC_HAS_FILESYSTEM_TS 0
+#endif
+// clang-format on
+
+#if RTC_HAS_FILESYSTEM
+#include <filesystem>
+#elif RTC_HAS_FILESYSTEM_TS
+#include <experimental/filesystem>
+#else
+#error "No filesystem include available"
+#endif
+
+namespace rtc {
+
+// rtc::fs names whichever filesystem implementation was detected above,
+// preferring std::filesystem over the experimental one.
+#if RTC_HAS_FILESYSTEM
+namespace fs = ::std::filesystem;
+#elif RTC_HAS_FILESYSTEM_TS
+namespace fs = ::std::experimental::filesystem;
+#endif
+
+} // namespace rtc
+
+#endif // GUARD_TEST_HOST_RTC_FILESYSTEM_HPP
--- a/codegen/test/rtc/include/rtc/hip.hpp
+++ b/codegen/test/rtc/include/rtc/hip.hpp
+#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP
+#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_HIP
+
+#include <hip/hip_runtime_api.h>
+#include <memory>
+#include <string>
+
+namespace rtc {
+
+/// Fixed-size array of T with shared ownership.
+///
+/// Copies of a buffer share the same storage, which is released when the
+/// last copy goes away. The storage is either adopted from a shared_ptr or
+/// allocated by the size constructor.
+template <class T>
+struct buffer
+{
+    /// Empty buffer with no storage
+    buffer() : ptr(), n(0) {}
+    /// Adopts existing storage holding `sz` elements
+    buffer(std::shared_ptr<T> p, std::size_t sz) : ptr(p), n(sz) {}
+    /// Adopts untyped storage and views it as `sz` elements of T
+    buffer(std::shared_ptr<void> p, std::size_t sz)
+        : ptr(std::reinterpret_pointer_cast<T>(p)), n(sz)
+    {
+    }
+    /// Allocates `sz` default-initialized elements.
+    // The array comes from new[], so it has to be released with delete[];
+    // without the array deleter shared_ptr<T> would use plain delete, which
+    // is undefined behaviour for arrays.
+    explicit buffer(std::size_t sz) : ptr(new T[sz], std::default_delete<T[]>()), n(sz) {}
+    T* begin() { return data(); }
+    T* end() { return data() + size(); }
+    const T* begin() const { return data(); }
+    const T* end() const { return data() + size(); }
+
+    // front(), back() and operator[] do no bounds checking
+    T& front() { return data()[0]; }
+    T& back() { return data()[size() - 1]; }
+    T& operator[](std::size_t i) { return data()[i]; }
+    /// Checked access: throws std::runtime_error when i >= size()
+    T& at(std::size_t i)
+    {
+        if(i >= size())
+            throw std::runtime_error("Out of bounds");
+        return data()[i];
+    }
+
+    const T& front() const { return data()[0]; }
+    const T& back() const { return data()[size() - 1]; }
+    const T& operator[](std::size_t i) const { return data()[i]; }
+    /// Checked access: throws std::runtime_error when i >= size()
+    const T& at(std::size_t i) const
+    {
+        if(i >= size())
+            throw std::runtime_error("Out of bounds");
+        return data()[i];
+    }
+    const T* data() const { return ptr.get(); }
+    T* data() { return ptr.get(); }
+
+    /// Number of elements
+    std::size_t size() const { return n; }
+    /// Size of the storage in bytes
+    std::size_t bytes() const { return size() * sizeof(T); }
+
+    bool empty() const { return size() == 0; }
+
+    private:
+    std::shared_ptr<T> ptr;
+    std::size_t n;
+};
+
+// Declarations only; the definitions are not part of this header.
+// NOTE(review): the descriptions below are inferred from the names and
+// signatures - confirm against the implementation.
+std::string get_device_name();
+std::string hip_error(int error);
+
+// `sz` is a size in bytes for all three functions below (to_gpu/from_gpu
+// pass buffer::bytes()). `host` presumably selects host-side memory.
+std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host = false);
+std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host = false);
+std::shared_ptr<void> read_from_gpu(const void* x, std::size_t sz);
+
+// Passes the bytes of `input` to write_to_gpu and wraps the returned storage
+// in a buffer with the same element count.
+template <class T>
+buffer<T> to_gpu(const buffer<T>& input)
+{
+    const std::size_t count = input.size();
+    return {write_to_gpu(input.data(), input.bytes()), count};
+}
+
+// Passes the bytes of `input` to read_from_gpu and wraps the returned storage
+// in a buffer with the same element count.
+template <class T>
+buffer<T> from_gpu(const buffer<T>& input)
+{
+    const std::size_t count = input.size();
+    return {read_from_gpu(input.data(), input.bytes()), count};
+}
+
+} // namespace rtc
+
+#endif
--- a/codegen/test/rtc/include/rtc/kernel.hpp
+++ b/codegen/test/rtc/include/rtc/kernel.hpp
+#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL
+#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_KERNEL
+
+#include <hip/hip_runtime_api.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace rtc {
+
+/// Type-erased reference to one kernel argument: its size, its alignment and
+/// a pointer to the object. The object is not copied, so it has to outlive
+/// the kernel_argument.
+struct kernel_argument
+{
+    // The guard must look at U (T without its reference). For an lvalue
+    // argument T is a reference type, for which is_base_of is always false;
+    // testing T therefore let this constructor take over copies from
+    // non-const kernel_argument lvalues and wrap the kernel_argument object
+    // itself instead of copying it.
+    template <class T,
+              class U = std::remove_reference_t<T>,
+              class   = std::enable_if_t<not std::is_base_of<kernel_argument, U>{}>>
+    kernel_argument(T&& x) : size(sizeof(U)), align(alignof(U)), data(&x) // NOLINT
+    {
+    }
+    std::size_t size;
+    std::size_t align;
+    void* data;
+};
+
+std::vector<char> pack_args(const std::vector<kernel_argument>& args);
+
+struct kernel_impl;
+
+// Handle to a single kernel function. The implementation state is held through
+// a shared_ptr, so copies of a kernel refer to the same kernel_impl.
+struct kernel
+{
+    // Default-constructed kernels have no implementation attached.
+    kernel() = default;
+    // Creates a kernel named `name` from the code object pointed to by `image`.
+    kernel(const char* image, const std::string& name);
+    // Convenience overload for a code object stored in a vector of byte-sized
+    // elements; forwards the vector's data pointer to the constructor above.
+    template <class T>
+    kernel(const std::vector<T>& image, const std::string& name)
+        : kernel(reinterpret_cast<const char*>(image.data()), name)
+    {
+        static_assert(sizeof(T) == 1, "Only byte types");
+    }
+
+    // Launches on `stream` with the arguments described by `args`.
+    // NOTE(review): `global` and `local` look like the x-dimension global and
+    // work-group sizes - confirm against the launch implementation.
+    void launch(hipStream_t stream,
+                std::size_t global,
+                std::size_t local,
+                const std::vector<kernel_argument>& args) const;
+
+    // Launches on `stream` with `args` given as a vector of raw pointers.
+    void launch(hipStream_t stream,
+                std::size_t global,
+                std::size_t local,
+                std::vector<void*> args) const;
+
+    // Returns a callable that captures the launch configuration by copy. Calling
+    // it with the kernel arguments wraps them in a std::vector<kernel_argument>
+    // and invokes launch() with that vector followed by `zs...`.
+    template <class... Ts>
+    auto launch(hipStream_t stream, std::size_t global, std::size_t local, Ts... zs) const
+    {
+        return [=](auto&&... xs) {
+            launch(stream, global, local, std::vector<kernel_argument>{xs...}, zs...);
+        };
+    }
+
+    private:
+    std::shared_ptr<kernel_impl> impl;
+};
+} // namespace rtc
+
+#endif
--- a/codegen/test/rtc/include/rtc/manage_ptr.hpp
+++ b/codegen/test/rtc/include/rtc/manage_ptr.hpp
+#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER
+#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_MANAGE_POINTER
+
+#include <type_traits>
+#include <memory>
+
+namespace rtc {
+// Deleter functor that releases a pointer by calling `f`, a callable of type `F`
+// supplied as a non-type template argument.
+template <class F, F f>
+struct manage_deleter
+{
+    // Calls f(x) and discards its result; a null pointer is ignored.
+    template <class T>
+    void operator()(T* x) const
+    {
+        if(x == nullptr)
+            return;
+        (void)f(x);
+    }
+};
+
+// Deleter functor that does nothing with the pointer it is given.
+struct null_deleter
+{
+    template <class T>
+    void operator()(T* /*unused*/) const {}
+};
+
+// unique_ptr over T whose deleter calls the function `f` (see manage_deleter).
+template <class T, class F, F f>
+using manage_ptr = std::unique_ptr<T, manage_deleter<F, f>>;
+
+// Exposes T::element_type as `type`; used for smart-pointer-like types.
+template <class T>
+struct element_type
+{
+    using type = typename T::element_type;
+};
+
+// Pointee type of T: std::remove_pointer for raw pointers, T::element_type
+// otherwise. Only the selected trait's `::type` is instantiated.
+template <class T>
+using remove_ptr = typename std::
+    conditional_t<std::is_pointer<T>{}, std::remove_pointer<T>, element_type<T>>::type;
+
+// shared_ptr to the pointee type of T.
+template <class T>
+using shared = std::shared_ptr<remove_ptr<T>>;
+
+// Moves the pointer-like object `p` into a shared_ptr to its pointee type.
+template <class T>
+shared<T> share(T p)
+{
+    shared<T> owner{std::move(p)};
+    return owner;
+}
+
+#define RTC_MANAGE_PTR(T, F) rtc::manage_ptr<std::remove_pointer_t<T>, decltype(&F), &F>
+
+} // namespace rtc
+
+#endif
--- a/codegen/test/rtc/include/rtc/tmp_dir.hpp
+++ b/codegen/test/rtc/include/rtc/tmp_dir.hpp
+#ifndef GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
+#define GUARD_HOST_TEST_RTC_INCLUDE_RTC_TMP_DIR
+
+#include <string>
+#include <rtc/filesystem.hpp>
+
+namespace rtc {
+
+// Owns a directory identified by `path`. Copying is disabled; the constructor
+// and destructor are defined out of line.
+struct tmp_dir
+{
+    // Location of the directory.
+    fs::path path;
+    // `prefix` is optional and defaults to an empty string.
+    tmp_dir(const std::string& prefix = "");
+
+    // Executes the command string `cmd`.
+    // NOTE(review): presumably runs with `path` as the working directory -
+    // confirm against the definition.
+    void execute(const std::string& cmd) const;
+
+    tmp_dir(tmp_dir const&) = delete;
+    tmp_dir& operator=(tmp_dir const&) = delete;
+
+    ~tmp_dir();
+};
+
+} // namespace rtc
+
+#endif
--- a/codegen/test/rtc/src/compile_kernel.cpp
+++ b/codegen/test/rtc/src/compile_kernel.cpp
+#include <rtc/hip.hpp>
+#include <rtc/compile_kernel.hpp>
+#include <rtc/tmp_dir.hpp>
+#include <stdexcept>
+#include <iostream>
+#include <fstream>
+#include <cassert>
+
+namespace rtc {
+
+// Reads bytes from `filename` into a container of type T (constructed as
+// T(nbytes, 0), e.g. std::string or std::vector<char>).
+//
+// offset: position of the first byte to read, counted from the start of the file.
+// nbytes: number of bytes to read; 0 means "everything from `offset` to the end".
+//
+// Throws std::runtime_error when the file cannot be opened, when `offset` lies
+// beyond the end of the file, when there is nothing to read, or when the read
+// itself fails.
+template <class T>
+T generic_read_file(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
+{
+    // Opened at the end so tellg() reports the file size.
+    std::ifstream is(filename, std::ios::binary | std::ios::ate);
+    if(not is.is_open())
+        throw std::runtime_error("Failed to open file: " + filename);
+    if(nbytes == 0)
+    {
+        // tellg() yields -1 on failure; check it before converting to an unsigned
+        // size, otherwise it would turn into an enormous byte count.
+        const std::streamoff file_size = is.tellg();
+        if(file_size < 0)
+            throw std::runtime_error("Failed to get size of file: " + filename);
+        nbytes = static_cast<size_t>(file_size);
+        if(offset > nbytes)
+            throw std::runtime_error("offset is larger than file size");
+        nbytes -= offset;
+    }
+    if(nbytes < 1)
+        throw std::runtime_error("Invalid size for: " + filename);
+    is.seekg(offset, std::ios::beg);
+
+    T buffer(nbytes, 0);
+    if(not is.read(&buffer[0], nbytes))
+        throw std::runtime_error("Error reading file: " + filename);
+    return buffer;
+}
+
+// Reads `filename` into a byte vector; `offset` and `nbytes` are passed straight
+// through to generic_read_file().
+std::vector<char> read_buffer(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
+{
+    using bytes = std::vector<char>;
+    return generic_read_file<bytes>(filename, offset, nbytes);
+}
+
+// Reads the whole of `filename` into a std::string via generic_read_file().
+std::string read_string(const std::string& filename)
+{
+    std::string contents = generic_read_file<std::string>(filename);
+    return contents;
+}
+
+// Writes `size` bytes starting at `buffer` to `filename`, replacing any existing
+// content. Throws std::runtime_error if the file cannot be opened or written.
+void write_buffer(const std::string& filename, const char* buffer, std::size_t size)
+{
+    // Binary mode: the payload may be arbitrary bytes and must not undergo any
+    // text-mode translation.
+    std::ofstream os(filename, std::ios::binary);
+    if(not os.is_open())
+        throw std::runtime_error("Failed to open file for writing: " + filename);
+    os.write(buffer, static_cast<std::streamsize>(size));
+    if(not os)
+        throw std::runtime_error("Error writing file: " + filename);
+}
+// Writes the full contents of `buffer` to `filename` via the pointer/size overload.
+void write_buffer(const std::string& filename, const std::vector<char>& buffer)
+{
+    const char* first = buffer.data();
+    write_buffer(filename, first, buffer.size());
+}
+// Writes the characters viewed by `buffer` to `filename` via write_buffer().
+void write_string(const std::string& filename, const std::string_view& buffer)
+{
+    const char* first = buffer.data();
+    write_buffer(filename, first, buffer.size());
+}
+
+// Command prefix (compiler executable plus fixed flags) used to build kernels.
+std::string compiler()
+{
+    return "/opt/rocm/llvm/bin/clang++ -x hip --cuda-device-only";
+}
+// TODO: undo after extracting the codeobj
+// std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip"; }
+
+// Compiles the given sources in a temporary directory and returns a kernel
+// created from the resulting object file.
+//
+// srcs:    files to write into the temporary directory. Every file with a ".cpp"
+//          extension is passed to the compiler; the first one names the output.
+// options: `flags` is extended with the fixed flags added below; `kernel_name`
+//          is the name handed to the kernel constructor.
+//
+// Throws std::runtime_error if no sources are given, if none of them is a ".cpp"
+// file, or if the expected output file is missing after the compiler ran.
+kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options)
+{
+    // A runtime check instead of an assert: with NDEBUG the assert vanishes and an
+    // empty input would only fail later with a misleading error.
+    if(srcs.empty())
+        throw std::runtime_error("No source files provided");
+    tmp_dir td{"compile"};
+    options.flags += " -I. -O3";
+    options.flags += " -std=c++17";
+    options.flags += " --offload-arch=" + get_device_name();
+    std::string out;
+
+    for(const auto& src : srcs)
+    {
+        fs::path full_path   = td.path / src.path;
+        fs::path parent_path = full_path.parent_path();
+        fs::create_directories(parent_path);
+        write_string(full_path.string(), src.content);
+        if(src.path.extension().string() == ".cpp")
+        {
+            options.flags += " -c " + src.path.filename().string();
+            if(out.empty())
+                out = src.path.stem().string() + ".o";
+        }
+    }
+
+    // Without a ".cpp" source there is nothing to compile and no output name; an
+    // empty name would make `td.path / out` refer to the directory itself.
+    if(out.empty())
+        throw std::runtime_error("No .cpp source file provided");
+
+    options.flags += " -o " + out;
+    td.execute(compiler() + options.flags);
+
+    auto out_path = td.path / out;
+    if(not fs::exists(out_path))
+        throw std::runtime_error("Output file missing: " + out);
+
+    auto obj = read_buffer(out_path.string());
+
+    // Copy of the object file written to "obj.o" in the current working directory.
+    // NOTE(review): looks like a debugging aid tied to the TODO on compiler() -
+    // confirm whether it should stay.
+    std::ofstream ofh("obj.o", std::ios::binary);
+    ofh.write(obj.data(), static_cast<std::streamsize>(obj.size()));
+    ofh.close();
+    return kernel{obj.data(), options.kernel_name};
+}
+
+} // namespace rtc
--- a/codegen/test/rtc/src/hip.cpp
+++ b/codegen/test/rtc/src/hip.cpp
+#include <rtc/hip.hpp>
+#include <rtc/manage_ptr.hpp>
+#include <stdexcept>
+#include <cassert>
+#include <iostream>
+
+namespace rtc {
+
+using hip_ptr = RTC_MANAGE_PTR(void, hipFree);
+
+std::string hip_error(int error) { return hipGetErrorString(static_cast<hipError_t>(error)); }
+
+// Returns the device index reported by hipGetDevice().
+// Throws std::runtime_error when the query fails.
+int get_device_id()
+{
+    int device_index;
+    if(hipGetDevice(&device_index) != hipSuccess)
+        throw std::runtime_error("No device");
+    return device_index;
+}
+
+// Returns the `gcnArchName` property of the device given by get_device_id().
+// Throws std::runtime_error when the properties cannot be queried.
+std::string get_device_name()
+{
+    hipDeviceProp_t device_props{};
+    if(hipGetDeviceProperties(&device_props, get_device_id()) != hipSuccess)
+        throw std::runtime_error("Failed to get device properties");
+    return device_props.gcnArchName;
+}
+
+// True only when hipPointerGetAttributes() succeeds for `ptr` and reports the
+// memory type hipMemoryTypeDevice; a failed query counts as "not a device pointer".
+bool is_device_ptr(const void* ptr)
+{
+    hipPointerAttribute_t attributes;
+    const auto status = hipPointerGetAttributes(&attributes, ptr);
+    return status == hipSuccess and attributes.type == hipMemoryTypeDevice;
+}
+
+// Calls hipDeviceSynchronize() and throws std::runtime_error if it reports failure.
+void gpu_sync()
+{
+    const auto status = hipDeviceSynchronize();
+    if(status == hipSuccess)
+        return;
+    throw std::runtime_error("hip device synchronization failed: " + hip_error(status));
+}
+
+// Returns the free-memory figure reported by hipMemGetInfo(). When the query
+// fails, a message is written to std::cerr and a fixed 8 GiB value is returned.
+std::size_t get_available_gpu_memory()
+{
+    size_t free_bytes;
+    size_t total_bytes;
+    const auto status = hipMemGetInfo(&free_bytes, &total_bytes);
+    if(status == hipSuccess)
+        return free_bytes;
+    std::cerr << "Failed getting available memory: " + hip_error(status) << std::endl;
+    return (8ull * 1024ull * 1024ull * 1024ull);
+}
+
+// Allocates `sz` bytes and returns them behind a shared_ptr that frees the
+// memory when the last owner goes away.
+//
+// host == false: allocate with hipMalloc; if that fails, retry once as a host
+//                allocation.
+// host == true:  allocate with hipHostMalloc.
+//
+// Throws std::runtime_error when `sz` exceeds get_available_gpu_memory() or when
+// the host allocation fails.
+std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host)
+{
+    if(sz > get_available_gpu_memory())
+        throw std::runtime_error("Memory not available to allocate buffer: " + std::to_string(sz));
+    void* alloc_ptr = nullptr;
+    auto status     = host ? hipHostMalloc(&alloc_ptr, sz) : hipMalloc(&alloc_ptr, sz);
+    if(status != hipSuccess)
+    {
+        if(host)
+            throw std::runtime_error("Gpu allocation failed: " + hip_error(status));
+        else
+            return allocate_gpu(sz, true);
+    }
+    assert(alloc_ptr != nullptr);
+    // The release function has to match the allocation function: memory obtained
+    // from hipHostMalloc is released with hipHostFree, not hipFree.
+    if(host)
+        return std::shared_ptr<void>(alloc_ptr, [](void* p) { (void)hipHostFree(p); });
+    return share(hip_ptr{alloc_ptr});
+}
+
+// Synchronizes, allocates `sz` bytes with allocate_gpu(sz, host) and copies `sz`
+// bytes from `x` into them with a host-to-device hipMemcpy.
+// Throws std::runtime_error when the copy fails.
+std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host)
+{
+    gpu_sync();
+    std::shared_ptr<void> device_mem = allocate_gpu(sz, host);
+    assert(is_device_ptr(device_mem.get()));
+    assert(not is_device_ptr(x));
+    const auto status = hipMemcpy(device_mem.get(), x, sz, hipMemcpyHostToDevice);
+    if(status == hipSuccess)
+        return device_mem;
+    throw std::runtime_error("Copy to gpu failed: " + hip_error(status));
+}
+
+// Synchronizes and copies `sz` bytes from the device pointer `x` into a newly
+// allocated host array, returned behind a shared_ptr<void>.
+// Throws std::runtime_error when `x` is not a device pointer or the copy fails.
+std::shared_ptr<void> read_from_gpu(const void* x, std::size_t sz)
+{
+    gpu_sync();
+    // Memory from new[] must be released with delete[]. shared_ptr<char>'s default
+    // deleter uses scalar delete, so an array deleter is supplied explicitly.
+    std::shared_ptr<char> result(new char[sz], std::default_delete<char[]>());
+    assert(not is_device_ptr(result.get()));
+    if(not is_device_ptr(x))
+    {
+        throw std::runtime_error(
+            "read_from_gpu() requires Src buffer to be on the GPU, Copy from gpu failed\n");
+    }
+    auto status = hipMemcpy(result.get(), x, sz, hipMemcpyDeviceToHost);
+    if(status != hipSuccess)
+        throw std::runtime_error("Copy from gpu failed: " + hip_error(status)); // NOLINT
+    return std::static_pointer_cast<void>(result);
+}
+
+} // namespace rtc
--- a/codegen/test/rtc/src/kernel.cpp
+++ b/codegen/test/rtc/src/kernel.cpp
+#include <rtc/kernel.hpp>
+#include <rtc/manage_ptr.hpp>
+#include <rtc/hip.hpp>
+#include <cassert>
+
+// extern declare the function since hip/hip_ext.h header is broken
+extern hipError_t hipExtModuleLaunchKernel(hipFunction_t, // NOLINT
+                                           uint32_t,
+                                           uint32_t,
+                                           uint32_t,
+                                           uint32_t,
+                                           uint32_t,
+                                           uint32_t,
+                                           size_t,
+                                           hipStream_t,
+                                           void**,
+                                           void**,
+                                           hipEvent_t = nullptr,
+                                           hipEvent_t = nullptr,
+                                           uint32_t   = 0);
+
+namespace rtc {
+
+// Serializes the arguments into one contiguous byte buffer. Before each
+// argument, zero bytes are appended until the buffer length is a multiple of
+// that argument's alignment; then `size` bytes are copied from `data`.
+std::vector<char> pack_args(const std::vector<kernel_argument>& args)
+{
+    std::vector<char> packed;
+    for(const auto& arg : args)
+    {
+        // Bytes by which the current end of the buffer overshoots an aligned offset
+        const std::size_t misalign = packed.size() % arg.align;
+        if(misalign != 0)
+            packed.insert(packed.end(), arg.align - misalign, 0);
+        const auto* first = static_cast<const char*>(arg.data);
+        packed.insert(packed.end(), first, first + arg.size);
+    }
+    return packed;
+}
+
+using hip_module_ptr = RTC_MANAGE_PTR(hipModule_t, hipModuleUnload);
+
+// State behind a kernel: the owning module handle and the function looked up in
+// it. Both start out null.
+struct kernel_impl
+{
+    hip_module_ptr module{nullptr};
+    hipFunction_t fun{nullptr};
+};
+
+// Loads the code object at `image` with hipModuleLoadData() and returns an
+// owning handle that unloads the module on destruction.
+// Throws std::runtime_error when loading fails.
+hip_module_ptr load_module(const char* image)
+{
+    hipModule_t raw_m = nullptr;
+    auto status       = hipModuleLoadData(&raw_m, image);
+    // Check the status before taking ownership: on failure the handle is not a
+    // loaded module, and wrapping it first would make the owner's destructor try
+    // to unload it while the exception propagates.
+    if(status != hipSuccess)
+        throw std::runtime_error("Failed to load module: " + hip_error(status));
+    return hip_module_ptr{raw_m};
+}
+
+// Loads the module from `image` and looks up the function called `name` in it.
+// Throws std::runtime_error when the function cannot be found.
+kernel::kernel(const char* image, const std::string& name) : impl(std::make_shared<kernel_impl>())
+{
+    impl->module = load_module(image);
+    const auto status = hipModuleGetFunction(&impl->fun, impl->module.get(), name.c_str());
+    if(status == hipSuccess)
+        return;
+    throw std::runtime_error("Failed to get function: " + name + ": " + hip_error(status));
+}
+
+// Launches `fun` on `stream` through hipExtModuleLaunchKernel(). `global` and
+// `local` are passed as the first of the two size triples (the other entries
+// are 1). The kernel arguments are supplied as a packed buffer of `size` bytes
+// at `kernargs`, described by the HIP_LAUNCH_PARAM_* configuration array.
+// Throws std::runtime_error when the launch call reports failure.
+void launch_kernel(hipFunction_t fun,
+                   hipStream_t stream,
+                   std::size_t global,
+                   std::size_t local,
+                   void* kernargs,
+                   std::size_t size)
+{
+    assert(global > 0);
+    assert(local > 0);
+    void* launch_config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER,
+                             kernargs,
+                             HIP_LAUNCH_PARAM_BUFFER_SIZE,
+                             &size,
+                             HIP_LAUNCH_PARAM_END};
+    void** extra = reinterpret_cast<void**>(&launch_config);
+
+    const auto status = hipExtModuleLaunchKernel(
+        fun, global, 1, 1, local, 1, 1, 0, stream, nullptr, extra, nullptr, nullptr);
+    if(status == hipSuccess)
+        return;
+    throw std::runtime_error("Failed to launch kernel: " + hip_error(status));
+}
+
+// Launches the kernel using the pointer array in `args` itself as the argument
+// buffer (args.size() * sizeof(void*) bytes).
+void kernel::launch(hipStream_t stream,
+                    std::size_t global,
+                    std::size_t local,
+                    std::vector<void*> args) const
+{
+    assert(impl != nullptr);
+    const std::size_t byte_count = args.size() * sizeof(void*);
+    launch_kernel(impl->fun, stream, global, local, args.data(), byte_count);
+}
+
+// Packs `args` with pack_args() and launches the kernel with the packed bytes
+// as the argument buffer.
+void kernel::launch(hipStream_t stream,
+                    std::size_t global,
+                    std::size_t local,
+                    const std::vector<kernel_argument>& args) const
+{
+    assert(impl != nullptr);
+    auto packed = pack_args(args);
+    launch_kernel(impl->fun, stream, global, local, packed.data(), packed.size());
+}
+
+} // namespace rtc
--- a/codegen/test/rtc/src/tmp_dir.cpp
+++ b/codegen/test/rtc/src/tmp_dir.cpp
+#include <rtc/tmp_dir.hpp>
+#include <algorithm>
+#include <chrono>
+#include <cstdlib>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <system_error>
+#include <thread>
+#include <unistd.h>
+
+namespace rtc {
+// Returns `length` characters, each drawn uniformly from the digits and the
+// lower- and upper-case ASCII letters, using a generator seeded from
+// std::random_device on every call.
+std::string random_string(std::string::size_type length)
+{
+    static const std::string alphabet = "0123456789"
+                                        "abcdefghijklmnopqrstuvwxyz"
+                                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+
+    std::mt19937 engine{std::random_device{}()};
+    std::uniform_int_distribution<std::string::size_type> index(0, alphabet.length() - 1);
+
+    std::string result;
+    result.reserve(length);
+    for(std::string::size_type i = 0; i < length; ++i)
+        result.push_back(alphabet[index(engine)]);
+
+    return result;
+}
+
+// Builds "<prefix>-<pid>-<thread id>-<steady clock ticks>-<16 random characters>".
+// The stream is put in hexadecimal mode before the fields are written.
+std::string unique_string(const std::string& prefix)
+{
+    const auto process_id = getpid();
+    const auto thread_id  = std::this_thread::get_id();
+    const auto ticks      = std::chrono::steady_clock::now().time_since_epoch().count();
+    std::stringstream name;
+    name << std::hex << prefix;
+    name << "-" << process_id;
+    name << "-" << thread_id;
+    name << "-" << ticks;
+    name << "-" << random_string(16);
+    return name.str();
+}
+
+// Picks a unique directory name under the system temporary directory, starting
+// with "ck-rtc" (followed by "-<prefix>" when `prefix` is not empty), and
+// creates the directory.
+tmp_dir::tmp_dir(const std::string& prefix)
+    : path(fs::temp_directory_path() /
+           unique_string(std::string{"ck-rtc"} + (prefix.empty() ? "" : "-" + prefix)))
+{
+    fs::create_directories(path);
+}
+
+// Runs `cmd` through std::system() with this directory as the working directory.
+// Throws std::runtime_error when the command line reports a non-zero status.
+void tmp_dir::execute(const std::string& cmd) const
+{
+    // "&&" rather than ";": if changing directory fails, the command must not run
+    // in whatever directory the process happens to be in.
+    const std::string s = "cd " + path.string() + " && " + cmd;
+    if(std::system(s.c_str()) != 0)
+        throw std::runtime_error("Failed to execute: " + s);
+}
+
+// Removes the directory and everything in it. Clean-up is best effort: the
+// non-throwing overload is used because an exception escaping a destructor
+// terminates the program.
+tmp_dir::~tmp_dir()
+{
+    std::error_code ec;
+    fs::remove_all(this->path, ec);
+}
+
+} // namespace rtc
--- a/dev-requirements.txt
+++ b/dev-requirements.txt
-ROCmSoftwarePlatform/rocm-recipes
-RadeonOpenCompute/rocm-cmake@04f694df2a8dc9d7e35fa4dee4ba5fa407ec04f8 --build
-danmar/cppcheck@2.9
\ No newline at end of file
+ROCm/rocm-recipes
+ROCm/rocm-cmake@04f694df2a8dc9d7e35fa4dee4ba5fa407ec04f8 --build
+danmar/cppcheck@2.9
--- a/docs/Contributors_Guide.rst
+++ b/docs/Contributors_Guide.rst
+.. meta::
+  :description: Composable Kernel documentation and API reference library
+  :keywords: composable kernel, CK, ROCm, API, documentation
+
+.. _contributing-to:
+
+********************************************************************
+Contributor's guide
+********************************************************************
+
+This chapter explains the rules for contributing to the Composable Kernel project, and how to contribute.
+
+Getting started
+===============
+
+#. **Documentation:** Before contributing to the library, familiarize yourself with the
+   `Composable Kernel User Guide <https://rocm.docs.amd.com/projects/composable_kernel/en/latest/>`_.
+   It provides insight into the core concepts, environment configuration, and steps to obtain or
+   build the library. You can also find some of this information in the
+   `README file <https://github.com/ROCm/composable_kernel/blob/develop/README.md>`_
+   on the project's GitHub page.
+#. **Additional reading:** The blog post `AMD Composable Kernel library: efficient fused kernels for AI apps with just a few lines of code
+   <https://community.amd.com/t5/instinct-accelerators/amd-composable-kernel-library-efficient-fused-kernels-for-ai/ba-p/553224>`_
+   from the AMD Community portal offers a deeper understanding of the library's objectives and showcases its performance capabilities.
+#. **General information:** For broader information about AMD products, consider exploring the
+   `AMD Developer Central portal <https://www.amd.com/en/developer.html>`_.
+
+How to contribute
 ===================
-Contributor's Guide
-===================

-Pull-request guidelines
-=======================
+You can make an impact by reporting issues or proposing code enhancements through pull requests.
+
+Reporting issues
+----------------
+
+Use `GitHub issues <https://github.com/ROCm/composable_kernel/issues>`_
+to track public bugs and enhancement requests.
+
+If you encounter an issue with the library, please check if the problem has already been
+reported by searching existing issues on GitHub. If your issue seems unique, please submit a new
+issue. All reported issues must include:
+
+* A comprehensive description of the problem, including:
+
+  * What did you observe?
+  * Why do you think it is a bug (if it seems like one)?
+  * What did you expect to happen? What would indicate the resolution of the problem?
+  * Are there any known workarounds?
+
+* Your configuration details, including:
+
+  * Which GPU are you using?
+  * Which OS version are you on?
+  * Which ROCm version are you using?
+  * Are you using a Docker image? If so, which one?
+
+* Steps to reproduce the issue, including:
+
+  * What actions trigger the issue? What are the reproduction steps?
+
+    * If you build the library from scratch, what CMake command did you use?
+
+  * How frequently does this issue happen? Does it reproduce every time? Or is it a sporadic issue?
+
+Before submitting any issue, ensure you have addressed all relevant questions from the checklist.
+
+Creating Pull Requests
+----------------------
+
+You can submit `Pull Requests (PR) on GitHub
+<https://github.com/ROCm/composable_kernel/pulls>`_.
+
+All contributors are required to develop their changes on a separate branch and then create a
+pull request to merge their changes into the ``develop`` branch, which is the default
+development branch in the Composable Kernel project. All external contributors must use their own
+forks of the project to develop their changes.
+
+When submitting a Pull Request you should:
+
+* Describe the change providing information about the motivation for the change and a general
+  description of all code modifications.
+
+* Verify and test the change:
+
+  * Run any relevant existing tests.
+  * Write new tests if added functionality is not covered by current tests.
+
+* Ensure your changes align with the coding style defined in the ``.clang-format`` file located in
+  the project's root directory. We leverage ``pre-commit`` to run ``clang-format`` automatically. We
+  highly recommend contributors utilize this method to maintain consistent code formatting.
+  Instructions on setting up ``pre-commit`` can be found in the project's
+  `README file <https://github.com/ROCm/composable_kernel/blob/develop/README.md>`_.
+
+* Link your PR to any related issues:
+
+  * If there is an issue that is resolved by your change, please provide a link to the issue in
+    the description of your pull request.
+
+* For larger contributions, structure your change into a sequence of smaller, focused commits, each
+  addressing a particular aspect or fix.
+
+Following the above guidelines ensures a seamless review process and faster assistance from our
+end.

-[TODO]
+Thank you for your commitment to enhancing the Composable Kernel project! 
--- a/docs/conceptual/what-is-ck.rst
+++ b/docs/conceptual/what-is-ck.rst
+.. meta::
+  :description: Composable Kernel documentation and API reference library
+  :keywords: composable kernel, CK, ROCm, API, documentation
+
+.. _what-is-ck:
+
+********************************************************************
+What is the Composable Kernel library
+********************************************************************
+
+
+Methodology
+===========
+
+The Composable Kernel (CK) library provides a programming model for writing performance critical kernels for machine learning workloads across multiple architectures including GPUs and CPUs, through general purpose kernel languages like HIP C++.
+
+CK utilizes two concepts to achieve performance portability and code maintainability:
+
+* A tile-based programming model
+* Algorithm complexity reduction for complex ML operators using an innovative technique called
+  "Tensor Coordinate Transformation".
+
+.. image:: ../data/ck_component.png
+   :alt: CK Components
+
+
+Code Structure
+==============
+
+The CK library is structured into 4 layers:
+
+* "Templated Tile Operators" layer
+* "Templated Kernel and Invoker" layer
+* "Instantiated Kernel and Invoker" layer
+* "Client API" layer
+
+It also includes a simple wrapper component used to perform tensor transform operations more easily and with fewer lines of code.
+
+.. image:: ../data/ck_layer.png
+   :alt: CK Layers
+   
\ No newline at end of file
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -4,23 +4,34 @@
 # list see the documentation:
 # https://www.sphinx-doc.org/en/master/usage/configuration.html

-import subprocess
+import re

 from rocm_docs import ROCmDocs

+html_theme_options = {"flavor": "list"}

-name = "Composable Kernel"
-get_version = r'sed -n -e "s/^rocm_setup_version(.* \([0-9\.]\{1,\}\).*/\1/p" ../CMakeLists.txt'
-version = subprocess.getoutput(get_version)
-if len(version) > 0:
-    name = f"{name} {version}"
+with open('../CMakeLists.txt', encoding='utf-8') as f:
+    match = re.search(r'.*set\(version ([0-9.]+)[^0-9.]+', f.read())
+    if not match:
+        raise ValueError("VERSION not found!")
+    version_number = match[1]
+left_nav_title = f"Composable Kernel {version_number} Documentation"
+
+# for PDF output on Read the Docs
+project = "Composable Kernel Documentation"
+author = "Advanced Micro Devices, Inc."
+copyright = "Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved."
+version = version_number
+release = version_number

 external_toc_path = "./sphinx/_toc.yml"

-docs_core = ROCmDocs(f"{name} Documentation")
-docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/docBin/xml")
+docs_core = ROCmDocs(left_nav_title)
+docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml")
 docs_core.setup()

+external_projects_current_project = "composable_kernel"
+
 mathjax3_config = {
 'tex': {
    'macros': {
@@ -34,3 +45,5 @@ for sphinx_var in ROCmDocs.SPHINX_VARS:

 extensions += ['sphinxcontrib.bibtex']
 bibtex_bibfiles = ['refs.bib']
+
+cpp_id_attributes = ["__global__", "__device__", "__host__"]