Introduce gemm_softmax_gemm to codegen.

d43cd4ad · Mirza Halilcevic · 3528a523 · d43cd4ad · d43cd4ad · d43cd4ad
Commit d43cd4ad authored Sep 25, 2024 by Mirza Halilcevic
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -128,6 +128,8 @@ list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/ll

 message("GPU_TARGETS= ${GPU_TARGETS}")

+option(CK_BUILD_HOST_LIB, "Only build the CK JIT Helper Library" OFF)
+
 find_package(hip)
 # No assumption that HIP kernels are launched with uniform block size for backward compatibility
 # SWDEV-413293 and https://reviews.llvm.org/D155213
@@ -254,6 +256,7 @@ elseif(CK_PARALLEL_COMPILE_JOBS)
  message(WARNING "Job pooling is only available with Ninja generators.")
 endif()

+if (NOT CK_BUILD_HOST_LIB)

 option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF)
 option(USE_OPT_GFX11 "Whether to enable LDS cumode and Wavefront32 mode for GFX11 silicons." OFF)
@@ -275,6 +278,8 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 link_libraries(Threads::Threads)

+endif() # NOT CK_BUILD_HOST_LIB
+
 ## C++
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -291,6 +296,8 @@ if(USE_GLIBCXX_ASSERTIONS)
  add_compile_options(-Wp,-D_GLIBCXX_ASSERTIONS)
 endif()

+if (NOT CK_BUILD_HOST_LIB)
+
 ## HIP
 set(CMAKE_HIP_PLATFORM amd)
 set(CMAKE_HIP_COMPILER ${CMAKE_CXX_COMPILER})
@@ -346,6 +353,8 @@ else()
    add_compile_definitions(__HIP_PLATFORM_HCC__=1)
 endif()

+endif() # NOT CK_BUILD_HOST_LIB
+
 ## tidy
 include(EnableCompilerWarnings)
 set(CK_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
@@ -499,6 +508,8 @@ include_directories(BEFORE
    ${HIP_INCLUDE_DIRS}
 )

+if (NOT CK_BUILD_HOST_LIB)
+
 SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
 if(BUILD_DEV)
    add_compile_options(-Werror)
@@ -506,6 +517,8 @@ if(BUILD_DEV)
 endif()
 message("CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")

+endif() # NOT CK_BUILD_HOST_LIB
+
 if("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
    add_compile_options(-fcolor-diagnostics)
 endif()
@@ -515,6 +528,8 @@ endif()

 add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure -C ${CMAKE_CFG_INTDIR})

+if (NOT CK_BUILD_HOST_LIB)
+
 file(GLOB_RECURSE INSTANCE_FILES "${PROJECT_SOURCE_DIR}/*/device_*_instance.cpp")
 file(GLOB dir_list RELATIVE ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu ${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/*)
 set(CK_DEVICE_INSTANCES)
@@ -590,6 +605,18 @@ if(NOT DEFINED PROFILER_ONLY AND (GPU_TARGETS MATCHES "gfx9" OR DEFINED INSTANCE
  add_subdirectory(codegen)
 endif()

+else() # NOT CK_BUILD_HOST_LIB
+
+if(GPU_TARGETS MATCHES "gfx9")
+  rocm_package_setup_component(ck_host
+    LIBRARY_NAME composablekernel
+    PACKAGE_NAME ck_host
+  )
+  add_subdirectory(codegen)
+endif()
+
+endif() # NOT CK_BUILD_HOST_LIB
+
 #Create an interface target for the include only files and call it "composablekernels"
 include(CMakePackageConfigHelpers)

@@ -627,4 +654,4 @@ rocm_create_package(
    MAINTAINER "MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
    LDCONFIG
    HEADER_ONLY
-)
+)
\ No newline at end of file
--- a/Config.cmake.in
+++ b/Config.cmake.in
 @PACKAGE_INIT@

-set(_composable_kernel_supported_components device_other_operations device_gemm_operations device_conv_operations device_mha_operations device_contraction_operations device_reduction_operations utility)
+set(_composable_kernel_supported_components device_other_operations device_gemm_operations device_conv_operations device_mha_operations device_contraction_operations device_reduction_operations utility ck_host)

 foreach(_comp ${composable_kernel_FIND_COMPONENTS})
 	if(NOT _comp IN_LIST _composable_kernel_supported_components)

--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -31,12 +31,21 @@ file(GLOB SOURCES CONFIGURE_DEPENDS src/*.cpp)
 ##message(STATUS "SOURCE_FILES: ${SOURCES}")
 # TODO: Use object library
 add_library(ck_host STATIC ${SOURCES})
-target_link_libraries(ck_host PRIVATE ck_headers)
+add_library(composable_kernel::ck_host ALIAS ck_host)

 set_target_properties(ck_host PROPERTIES
 	LINKER_LANGUAGE CXX
        POSITION_INDEPENDENT_CODE ON)

+target_include_directories(ck_host SYSTEM PRIVATE
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+    # $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/library/src/jit_library/solution_instances>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/solution_instances>
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/embed/ck_headers/include>
+)
+
+target_link_libraries(ck_host PRIVATE $<BUILD_INTERFACE:ck_headers>)
+
 target_include_directories(ck_host PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
 )
@@ -45,9 +54,18 @@ add_executable(ck-template-driver driver/main.cpp)
 target_link_libraries(ck-template-driver ck_host)

 rocm_install(
-    TARGETS ck_host ck_headers
+    TARGETS ck_host
    EXPORT ck_hostTargets
 )
 rocm_install(DIRECTORY include/ck DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})

-add_subdirectory(test)
+rocm_install(
+    EXPORT ck_hostTargets
+    FILE composable_kernelck_hostTargets.cmake
+    NAMESPACE composable_kernel::
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
+)
+
+if (NOT CK_BUILD_HOST_LIB)
+    add_subdirectory(test)
+endif()
--- a/codegen/include/ck/host/device_batched_gemm_softmax_gemm/operation.hpp
+++ b/codegen/include/ck/host/device_batched_gemm_softmax_gemm/operation.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+#include <vector>
+#include <string>
+#include "ck/host/types.hpp"
+#include "ck/host/operation/gemm.hpp"
+#include "ck/host/device_batched_gemm_softmax_gemm/problem.hpp"
+
+namespace ck {
+namespace host {
+namespace device_batched_gemm_softmax_gemm {
+
+// defines all values need for an instance of fwd conv
+struct Operation_Xdl_CShuffle
+{
+    // returns a vector of instances, only given fusion operators: will use default problem spec
+    static std::vector<std::vector<Operation_Xdl_CShuffle>>
+    CreateOperations(const std::string& prologue, const std::string& epilogue);
+    // returns a vector of instances, given a problem spec and fusion operators
+    static std::vector<Operation_Xdl_CShuffle>
+    CreateOperations(const Problem& prob, const std::string& prologue, const std::string& epilogue);
+    TensorDesc A{};
+    TensorDesc B{};
+    TensorDesc B1{};
+    TensorDesc C{};
+    std::string a_elem_op           = PassThrough;
+    std::string b_elem_op           = PassThrough;
+    std::string b1_elem_op          = PassThrough;
+    std::string c_elem_op           = PassThrough;
+    std::string acc_elem_op         = Scale;
+    std::string prologue            = "";
+    std::string epilogue            = "";
+    std::string gemm_specialization = "ck::tensor_operation::device::GemmSpecialization::Default";
+    // tuning parameters
+    operation::TileDescGemmSoftmaxGemm tile_desc{};
+    operation::BlockTransferDesc a_block_transfer{};
+    operation::BlockTransferDesc b0_block_transfer{};
+    operation::BlockTransferDesc b1_block_transfer{};
+    operation::CShuffleDesc cshuffle{};
+    operation::CBlockTransferDesc c_block_transfer{};
+
+    bool mask_out_upper_triangle = false;
+
+    // functions to update fusion operators if provided
+    void update_prologue(const std::string& prologue);
+    void update_epilogue(const std::string& epilogue);
+    /**constexpr**/ bool IsSupported(std::size_t MRaw_, std::size_t NRaw_, std::size_t KRaw_);
+    // returns a templated instance
+    Solution ToSolution() const;
+};
+
+} // namespace device_batched_gemm_softmax_gemm
+} // namespace host
+} // namespace ck
--- a/codegen/include/ck/host/device_batched_gemm_softmax_gemm/problem.hpp
+++ b/codegen/include/ck/host/device_batched_gemm_softmax_gemm/problem.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+#include <vector>
+#include <string>
+#include "ck/host/types.hpp"
+
+namespace ck {
+namespace host {
+namespace device_batched_gemm_softmax_gemm {
+
+// defines the problem specification for a GEMM operation
+struct Problem
+{
+    std::size_t M            = 0;
+    std::size_t N            = 0;
+    std::size_t K            = 0;
+    std::size_t O            = 0;
+    bool TransA              = false;
+    bool TransB              = false;
+    bool TransB1             = false;
+    bool TransC              = false;
+    DataType ADataType       = DataType::Half;
+    DataType BDataType       = DataType::Half;
+    DataType B1DataType      = DataType::Half;
+    DataType CDataType       = DataType::Half;
+    std::string AElementOp   = PassThrough;
+    std::string BElementOp   = PassThrough;
+    std::string B1ElementOp  = PassThrough;
+    std::string CElementOp   = PassThrough;
+    std::string AccElementOp = Scale;
+
+    // returns the correct device op file for the operation
+    std::string GetIncludeHeader() const;
+
+    // returns a list of instances based on the problem spec and provided fusion operations
+    std::vector<Solution> GetSolutions(const std::string& arch,
+                                       const std::string& prologue,
+                                       const std::string& epilogue) const;
+};
+
+} // namespace device_batched_gemm_softmax_gemm
+} // namespace host
+} // namespace ck
--- a/codegen/include/ck/host/device_gemm_multiple_d/operation.hpp
+++ b/codegen/include/ck/host/device_gemm_multiple_d/operation.hpp
@@ -41,6 +41,8 @@ struct Operation_Xdl_CShuffle
    operation::BlockTransferDesc b_block_transfer{};
    operation::CShuffleDesc cshuffle{};
    operation::CBlockTransferDesc c_block_transfer{};
+    LoopScheduler loop_scheduler{};
+    PipelineVersion pipeline_version{};

    // functions to update fusion operators if provided
    void update_prologue(const std::string& prologue);

--- a/codegen/include/ck/host/operation/gemm.hpp
+++ b/codegen/include/ck/host/operation/gemm.hpp
@@ -23,6 +23,26 @@ struct TileDesc
    int n_Xdl_per_wave           = 0;
    int num_gemmk_prefetch_stage = 0;
 };
+
+struct TileDescGemmSoftmaxGemm
+{
+    int block_size               = 0;
+    int gemm01_m_per_block       = 0;
+    int gemm0_n_per_block        = 0;
+    int gemm0_k_per_block        = 0;
+    int gemm1_n_per_block        = 0;
+    int gemm1_k_per_block        = 0;
+    int ak1                      = 0;
+    int bk1                      = 0;
+    int b1k1                     = 0;
+    int m_per_XDL                = 0;
+    int n_per_XDL                = 0;
+    int gemm0_m_Xdl_per_wave     = 0;
+    int gemm0_n_Xdl_per_wave     = 0;
+    int gemm1_n_Xdl_per_wave     = 0;
+    int num_gemmk_prefetch_stage = 0;
+};
+
 struct BlockTransferDesc
 {
    std::string thread_cluster_length        = "";

--- a/codegen/include/ck/host/types.hpp
+++ b/codegen/include/ck/host/types.hpp
@@ -66,6 +66,20 @@ enum class GemmType
 };
 std::string ToString(GemmType gt);

+enum class LoopScheduler
+{
+    Default,
+    Interwave,
+};
+std::string ToString(LoopScheduler ls);
+
+enum class PipelineVersion
+{
+    v1,
+    v2
+};
+std::string ToString(PipelineVersion pv);
+
 struct TensorDesc
 {
    DataType element;
@@ -84,6 +98,7 @@ const std::string S = SequenceStr({xs...});

 constexpr const char* PassThrough = "ck::tensor_operation::element_wise::PassThrough";
 constexpr const char* Bilinear    = "ck::tensor_operation::element_wise::Bilinear";
+constexpr const char* Scale       = "ck::tensor_operation::element_wise::Scale";

 } // namespace host
 } // namespace ck
--- a/codegen/src/device_batched_gemm_softmax_gemm.cpp
+++ b/codegen/src/device_batched_gemm_softmax_gemm.cpp
+
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/host/device_batched_gemm_softmax_gemm/problem.hpp"
+#include "ck/host/device_batched_gemm_softmax_gemm/operation.hpp"
+#include "ck/host/utils.hpp"
+#include <algorithm>
+
+namespace ck {
+namespace host {
+namespace device_batched_gemm_softmax_gemm {
+
+// return the relevant device op file based on the operation
+std::string Problem::GetIncludeHeader() const
+{
+    return "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp";
+}
+
+// returns templated instances when provided with a problem specification
+std::vector<Solution> Problem::GetSolutions(const std::string& arch,
+                                            const std::string& prologue,
+                                            const std::string& epilogue) const
+{
+    if(get_xdlop_archs().count(arch) == 0)
+        return {};
+    auto ops = ck::host::device_batched_gemm_softmax_gemm::Operation_Xdl_CShuffle::CreateOperations(
+        *this, prologue, epilogue); // obtains vector of instances
+    std::vector<Solution> result;
+    std::transform(ops.begin(), ops.end(), std::back_inserter(result), [&](const auto& op) {
+        return op.ToSolution(); // template instance with correct values
+    });
+    return result;
+}
+
+} // namespace device_batched_gemm_softmax_gemm
+} // namespace host
+} // namespace ck
--- a/codegen/src/device_batched_gemm_softmax_gemm_operation_xdl_cshuffle.cpp
+++ b/codegen/src/device_batched_gemm_softmax_gemm_operation_xdl_cshuffle.cpp
--- a/codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp
+++ b/codegen/src/device_gemm_multiple_d_operation_xdl_cshuffle.cpp
@@ -62,6 +62,13 @@ void Operation_Xdl_CShuffle::update_epilogue(const std::string& epi)
 // accounts for all possible combinations of Row/Col major
 static Layout ToLayout(bool Trans) { return Trans ? Layout::Column : Layout::Row; }

+
+
+// DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1,
+
+// DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
+
+
 // Hard-code tuning parameters in modularized fashion, string them together into a vector of
 // instances
 std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
@@ -83,6 +90,8 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
  {   128,    64,   128,    32,   8,   8,   32,   32,    2,    2,        1},
  {   256,   128,    64,    32,   8,   8,   32,   32,    2,    1,        1},
  {   256,    64,   128,    32,   8,   8,   32,   32,    1,    2,        1},
+//  Irregular tile
+  {    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,        1},
        // clang-format on
    };

@@ -100,6 +109,8 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
  {    S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1},
  {    S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1},
  {    S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1},
+//  Irregular tile
+  {    S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1},
        // clang-format on
    };

@@ -109,15 +120,17 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
 //   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|
 // Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |
 //                |               |               |               |               |               |          |
+  {    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1},
+  {    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1},
+  {    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1},
+  {    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1},
+  {    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1},
+  {    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1},
+  {    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1},
+  {    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1},
+//  Irregular tile
+  {    S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1},
        // clang-format on
-        {S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
-        {S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
-        {S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
-        {S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
-        {S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 4, 8, 1},
-        {S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
-        {S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, 1},
-        {S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 1, 8, 1},
    };

    std::vector<operation::BlockTransferDesc> b_block_descriptions_rowmajor = {
@@ -134,6 +147,8 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
  {    S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1},
  {    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1},
  {    S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1},
+//  Irregular tile
+  {    S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1},
        // clang-format on
    };

@@ -151,6 +166,8 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
  {    S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1},
  {    S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1},
  {    S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1},
+//  Irregular tile
+  {    S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1},
        // clang-format on
    };

@@ -167,6 +184,7 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
  {          1,           1},
  {          1,           1},
  {          1,           1},
+  {          1,           1},
  {          1,           1},
        // clang-format on
    };
@@ -185,6 +203,8 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
  {              S<1, 16, 1, 8>,               8},
  {              S<1, 32, 1, 8>,               8},
  {              S<1, 32, 1, 8>,               8},
+//  Irregular tile
+  {              S<1, 16, 1, 4>,               1},
        // clang-format on
    };

@@ -199,33 +219,44 @@ std::vector<Operation_Xdl_CShuffle> Operation_Xdl_CShuffle::CreateOperations(
    assert(tile_descriptions.size() == cshuffle_descriptions.size());
    assert(tile_descriptions.size() == c_block_descriptions.size());

-    // Put all values together into a single operation > store into the result vector
-    for(std::size_t i = 0; i < tile_descriptions.size(); i++)
+    const std::vector<std::tuple<LoopScheduler, PipelineVersion>> scheduler_pipeline_descriptions =
+        {
+            {LoopScheduler::Default, PipelineVersion::v1},
+            {LoopScheduler::Interwave, PipelineVersion::v1},
+            {LoopScheduler::Default, PipelineVersion::v2},
+        };
+    for(auto [loop_scheduler, pipeline_version] : scheduler_pipeline_descriptions)
    {
-        Operation_Xdl_CShuffle x;
-        x.tile_desc           = tile_descriptions[i];
-        x.a_block_transfer    = a_block_descriptions[i];
-        x.b_block_transfer    = b_block_descriptions[i];
-        x.cshuffle            = cshuffle_descriptions[i];
-        x.c_block_transfer    = c_block_descriptions[i];
-        x.A                   = TensorDesc{prob.ADataType, ToLayout(prob.TransA)};
-        x.B                   = TensorDesc{prob.BDataType, ToLayout(prob.TransB)};
-        x.E                   = TensorDesc{prob.EDataType, ToLayout(prob.TransE)};
-        x.Ds                  = Transform(prob.DsTrans, prob.DsDataType, [](auto trans, auto dt) {
-            return TensorDesc{dt, ToLayout(trans)};
-        });
-        x.a_elem_op           = prob.AElementOp;
-        x.b_elem_op           = prob.BElementOp;
-        x.cde_elem_op         = prob.CDEElementOp;
-        x.gemm_specialization = GetGemmSpec(prob.M,
-                                            prob.N,
-                                            prob.K,
-                                            x.tile_desc.m_per_block,
-                                            x.tile_desc.n_per_block,
-                                            x.tile_desc.k_per_block);
-        x.update_prologue(prologue);
-        x.update_epilogue(epilogue);
-        result.push_back(x);
+        // Put all values together into a single operation > store into the result vector
+        for(std::size_t i = 0; i < tile_descriptions.size(); i++)
+        {
+            Operation_Xdl_CShuffle x;
+            x.tile_desc        = tile_descriptions[i];
+            x.a_block_transfer = a_block_descriptions[i];
+            x.b_block_transfer = b_block_descriptions[i];
+            x.cshuffle         = cshuffle_descriptions[i];
+            x.c_block_transfer = c_block_descriptions[i];
+            x.A                = TensorDesc{prob.ADataType, ToLayout(prob.TransA)};
+            x.B                = TensorDesc{prob.BDataType, ToLayout(prob.TransB)};
+            x.E                = TensorDesc{prob.EDataType, ToLayout(prob.TransE)};
+            x.Ds               = Transform(prob.DsTrans, prob.DsDataType, [](auto trans, auto dt) {
+                return TensorDesc{dt, ToLayout(trans)};
+            });
+            x.a_elem_op        = prob.AElementOp;
+            x.b_elem_op        = prob.BElementOp;
+            x.cde_elem_op      = prob.CDEElementOp;
+            x.gemm_specialization = GetGemmSpec(prob.M,
+                                                prob.N,
+                                                prob.K,
+                                                x.tile_desc.m_per_block,
+                                                x.tile_desc.n_per_block,
+                                                x.tile_desc.k_per_block);
+            x.loop_scheduler = loop_scheduler;
+            x.pipeline_version = pipeline_version;
+            x.update_prologue(prologue);
+            x.update_epilogue(epilogue);
+            result.push_back(x);
+        }
    }
    return result;
 }
@@ -263,7 +294,7 @@ static const char* const DeviceGemmMultipleD_Xdl_CShuffleTemplate =
    "${BBlockTransferSrcScalarPerVector}, ${BBlockTransferDstScalarPerVector_BK1}, "
    "${BBlockLdsExtraN}, ${CShuffleMXdlPerWavePerShuffle}, ${CShuffleNXdlPerWavePerShuffle}, "
    "${CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock}, "
-    "${CDEBlockTransferScalarPerVector_NPerBlock}>";
+    "${CDEBlockTransferScalarPerVector_NPerBlock}, ${LoopScheduler}, ${PipelineVersion}>";

 // use hardcoded instances from vector of operations to substitute values into instance template
 Solution Operation_Xdl_CShuffle::ToSolution() const
@@ -336,6 +367,8 @@ Solution Operation_Xdl_CShuffle::ToSolution() const
         this->c_block_transfer.cluster_lengths_m_block_m_wave_m_per_Xdl_n_block_n_wave_n_per_Xdl},
        {"CDEBlockTransferScalarPerVector_NPerBlock",
         std::to_string(this->c_block_transfer.scalar_per_vector_n_wave_n_per_Xdl)},
+        {"LoopScheduler", ToString(this->loop_scheduler)},
+        {"PipelineVersion", ToString(this->pipeline_version)},
    };

    return Solution{InterpolateString(DeviceGemmMultipleD_Xdl_CShuffleTemplate, values),

--- a/codegen/src/types.cpp
+++ b/codegen/src/types.cpp
@@ -56,6 +56,26 @@ std::string ToString(GemmType gt)
    throw std::runtime_error("Incorrect gemm type");
 }

+std::string ToString(LoopScheduler ls)
+{
+    switch(ls)
+    {
+    case LoopScheduler::Default: return "ck::LoopScheduler::Default";
+    case LoopScheduler::Interwave: return "ck::LoopScheduler::Interwave";
+    }
+    throw std::runtime_error("Incorrect LoopScheduler type");
+}
+
+std::string ToString(PipelineVersion pv)
+{
+    switch(pv)
+    {
+    case PipelineVersion::v1: return "ck::PipelineVersion::v1";
+    case PipelineVersion::v2: return "ck::PipelineVersion::v2";
+    }
+    throw std::runtime_error("Incorrect PipelineVersion type");
+}
+
 std::string SequenceStr(const std::vector<int>& v)
 {
    return "ck::Sequence<" +

--- a/codegen/test/common.hpp
+++ b/codegen/test/common.hpp
@@ -15,7 +15,8 @@ std::vector<rtc::src_file> get_headers_for_test()
    auto hs = ck::host::GetHeaders();
    std::transform(
        hs.begin(), hs.end(), std::back_inserter(result), [&](const auto& p) -> rtc::src_file {
-            return {p.first, p.second};
+            std::string sec(p.second.begin(), p.second.end());
+            return {p.first, sec};
        });
    return result;
 }

--- a/codegen/test/gemm_multiple_d.cpp
+++ b/codegen/test/gemm_multiple_d.cpp
 #include "ck/host/device_gemm_multiple_d/problem.hpp"
 #include "ck/host/device_gemm_multiple_d/operation.hpp"
+#include "ck/host/device_batched_gemm_softmax_gemm/problem.hpp"
+#include "ck/host/device_batched_gemm_softmax_gemm/operation.hpp"
 #include "ck/host/headers.hpp"
 #include "ck/host/stringutils.hpp"
 #include "ck/host/utils.hpp"
@@ -15,13 +17,59 @@
 using half = _Float16;
 // using half = __fp16;

+// NOLINTNEXTLINE
+const char* const disable_warning_pragma = R"__migraphx__(
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Weverything"
+${content}
+#pragma clang diagnostic pop
+)__migraphx__";
+
+template <class P>
+std::string ck_disable_warnings(P p)
+{
+    return ck::host::InterpolateString(disable_warning_pragma,
+                                       {{"content", std::string{p.data(), p.size()}}});
+}
+
+static std::unordered_map<std::string, std::string> create_ck_header_strings()
+{
+    std::unordered_map<std::string, std::string> result;
+    auto ck_headers = ck::host::GetHeaders();
+
+    std::transform(
+        ck_headers.begin(), ck_headers.end(), std::inserter(result, result.begin()), [&](auto& p) {
+            return std::pair<std::string, std::string>(p.first, ck_disable_warnings(p.second));
+        });
+    return result;
+}
+
+static std::vector<rtc::src_file> create_ck_headers()
+{
+    static const auto& header_strings = create_ck_header_strings();
+    std::vector<rtc::src_file> srcs;
+    std::transform(
+        header_strings.begin(), header_strings.end(), std::back_inserter(srcs), [&](auto& p) -> rtc::src_file {
+            std::string sec(p.second.begin(), p.second.end());
+            return {p.first, sec};
+        });
+    return srcs;
+}
+
+static inline const std::vector<rtc::src_file>& ck_headers()
+{
+    static const auto& headers = create_ck_headers();
+    return headers;
+}
+
 std::vector<rtc::src_file> get_headers_for_test()
 {
    std::vector<rtc::src_file> result;
    auto hs = ck::host::GetHeaders();
    std::transform(
        hs.begin(), hs.end(), std::back_inserter(result), [&](const auto& p) -> rtc::src_file {
-            return {p.first, p.second};
+            std::string sec(p.second.begin(), p.second.end());
+            return {p.first, sec};
        });
    return result;
 }
@@ -130,10 +178,13 @@ const std::string gemm_compile_check = R"__ck__(

 extern "C" __global__ void f(const ck::half_t* a, const ck::half_t* b, ck::half_t* c) {
    using G = ${template};
-    constexpr auto desc = ${template}::make_descriptor(ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m}, ${k})),
-                                             ck::make_naive_tensor_descriptor(ck::make_tuple(${n}, ${k}), ck::make_tuple(1, ${n})),
-                                             ck::make_tuple(),
-                                             ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m}, ${n})));
+    constexpr auto desc =
+    G::make_descriptor(ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m},
+    ${k})),
+                                             ck::make_naive_tensor_descriptor(ck::make_tuple(${n},
+                                             ${k}), ck::make_tuple(1, ${n})), ck::make_tuple(),
+                                             ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m},
+                                             ${n})));

    static_assert(desc.IsValid(), "Invalid ck gemm.");

@@ -163,23 +214,32 @@ TEST_CASE(test_problem_kernel)
    std::string epilogue = "";
    std::string prologue = "";

-    for(auto solution : prob.GetSolutions("gfx90a", prologue, epilogue))
+    auto solutions = prob.GetSolutions("gfx90a", prologue, epilogue);
+    std::cout << "Num solutions: " << solutions.size() << std::endl;
+    for(auto i = 0; i < solutions.size(); ++i)
    {
-        auto src  = ck::host::InterpolateString(gemm_compile_check,
-                                               {{"include", prob.GetIncludeHeader()},
-                                                {"template", solution.ToTemplateString()},
-                                                {"m", std::to_string(prob.M)},
-                                                {"n", std::to_string(prob.N)},
-                                                {"k", std::to_string(prob.K)}});
-        auto srcs = get_headers_for_test();
-        srcs.push_back({"main.cpp", src});
-        rtc::compile_options options;
+        std::cout << "Testing solution " << std::to_string(i + 1) << std::endl;
+        auto&& solution = solutions[i];
+        auto src        = ck::host::InterpolateString(gemm_compile_check,
+                                                      {{"include", prob.GetIncludeHeader()},
+                                                       {"template", solution.ToTemplateString()},
+                                                       {"m", std::to_string(prob.M)},
+                                                       {"n", std::to_string(prob.N)},
+                                                       {"k", std::to_string(prob.K)}});
+        // auto srcs       = get_headers_for_test();
+        // srcs.push_back({"main.cpp", src});
+        // rtc::compile_options options;
+        // options.kernel_name = "f";
+        rtc::hip_compile_options options;
        options.kernel_name = "f";
-        auto k              = rtc::compile_kernel(srcs, options);
-        auto block_size     = solution.GetTemplateParameter<std::size_t>("BlockSize");
-        auto m_per_block    = solution.GetTemplateParameter<std::size_t>("MPerBlock");
-        auto n_per_block    = solution.GetTemplateParameter<std::size_t>("NPerBlock");
-        auto grid_size      = ck::host::integer_divide_ceil(prob.M, m_per_block) *
+        options.additional_src_files = ck_headers();
+        // auto k              = rtc::compile_kernel(srcs, options);
+        std::cout << src << std::endl;
+        auto k           = rtc::compile_hip_code_object(src, options);
+        auto block_size  = solution.GetTemplateParameter<std::size_t>("BlockSize");
+        auto m_per_block = solution.GetTemplateParameter<std::size_t>("MPerBlock");
+        auto n_per_block = solution.GetTemplateParameter<std::size_t>("NPerBlock");
+        auto grid_size   = ck::host::integer_divide_ceil(prob.M, m_per_block) *
                         ck::host::integer_divide_ceil(prob.N, n_per_block);
        k.launch(nullptr, grid_size * block_size, block_size)(a.data(), b.data(), c.data());

@@ -187,4 +247,34 @@ TEST_CASE(test_problem_kernel)
    }
 }

+TEST_CASE(test_gemm_softmax_gemm)
+{
+    ck::host::device_batched_gemm_softmax_gemm::Problem prob;
+    prob.TransA  = false;
+    prob.TransB  = true;
+    prob.TransB1 = false;
+    prob.TransC  = false;
+    prob.M = 1024;
+    prob.N = 1024;
+    prob.K = 1024;
+    prob.O = 1024;
+    check_all<half> check;
+    auto a  = to_gpu(generate_buffer<half>(1024 * 1024, 0));
+    auto b  = to_gpu(generate_buffer<half>(1024 * 1024, 1));
+    auto b1 = to_gpu(generate_buffer<half>(1024 * 1024, 2));
+    auto c  = to_gpu(generate_buffer<half>(1024 * 1024, 3));
+
+    std::string epilogue = "";
+    std::string prologue = "";
+
+    auto solutions = prob.GetSolutions("gfx90a", prologue, epilogue);
+    std::cout << "Num solutions: " << solutions.size() << std::endl;
+
+    for(auto i = 0; i < solutions.size(); ++i) {
+        std::cout << "Solution " << i << std::endl;
+        std::cout << solutions[i].ToTemplateString() << std::endl;
+        std::cout << std::endl;
+    }
+}
+
 int main(int argc, const char* argv[]) { test::run(argc, argv); }
--- a/codegen/test/rtc/include/rtc/compile_kernel.hpp
+++ b/codegen/test/rtc/include/rtc/compile_kernel.hpp
@@ -4,6 +4,7 @@
 #include <rtc/kernel.hpp>
 #include <ck/filesystem.hpp>
 #include <string>
+#include <functional>

 namespace rtc {

@@ -19,9 +20,36 @@ struct compile_options
    std::string kernel_name = "main";
 };

+struct hip_compile_options
+{
+    std::size_t global;
+    std::size_t local;
+    std::string kernel_name                    = "kernel";
+    std::string params                         = "";
+    std::vector<src_file> additional_src_files = {};
+
+    /**
+     * @brief Set the launch parameters but allow v to override the values
+     *
+     * @param v A value class which can have a "global" and/or "local" keys to override the default
+     * global and local
+     * @param compute_global A function used to compute the global based on the local
+     * @param default_local The defaul local to use if its missing from the v parameter
+     */
+    void set_launch_params(const std::function<std::size_t(std::size_t local)>& compute_global,
+                           std::size_t default_local = 1024);
+
+    void set_launch_params(std::size_t default_global, std::size_t default_local = 1024)
+    {
+        set_launch_params([=](auto) { return default_global; }, default_local);
+    }
+};
+
 kernel compile_kernel(const std::vector<src_file>& src,
                      compile_options options = compile_options{});

+kernel compile_hip_code_object(const std::string& content, hip_compile_options options);
+
 } // namespace rtc

 #endif
--- a/codegen/test/rtc/include/rtc/hip.hpp
+++ b/codegen/test/rtc/include/rtc/hip.hpp
@@ -4,6 +4,7 @@
 #include <hip/hip_runtime_api.h>
 #include <memory>
 #include <string>
+#include <stdexcept>

 namespace rtc {


--- a/codegen/test/rtc/src/compile_kernel.cpp
+++ b/codegen/test/rtc/src/compile_kernel.cpp
 #include "rtc/hip.hpp"
 #include <rtc/compile_kernel.hpp>
+#include <hip/hiprtc.h>
 #include <rtc/tmp_dir.hpp>
 #include <stdexcept>
 #include <iostream>
 #include <fstream>
 #include <cassert>
+#include <deque>
+#include <numeric>

 namespace rtc {

@@ -100,4 +103,345 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
    return kernel{obj.data(), options.kernel_name};
 }

+struct hiprtc_src_file
+{
+    hiprtc_src_file() = default;
+    hiprtc_src_file(const src_file& s) : path(s.path.string()), content(s.content) {}
+    std::string path;
+    std::string content;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.path, "path"), f(self.content, "content"));
+    }
+};
+
+std::string hiprtc_error(hiprtcResult err, const std::string& msg)
+{
+    return "hiprtc: " + (hiprtcGetErrorString(err) + (": " + msg));
+}
+
+void hiprtc_check_error(hiprtcResult err, const std::string& msg, const std::string& ctx)
+{
+    if(err != HIPRTC_SUCCESS)
+        throw std::runtime_error(hiprtc_error(err, msg));
+}
+
+// NOLINTNEXTLINE
+#define MIGRAPHX_HIPRTC(...) \
+    hiprtc_check_error(__VA_ARGS__, #__VA_ARGS__, "Lorem ipsum dolor sit amet")
+
+#define MIGRAPHX_HIPRTC_THROW(error, msg) throw std::runtime_error(hiprtc_error(error, msg))
+
+template <class F, F f> // NOLINT
+struct manage_deleter
+{
+    template <class T>
+    void operator()(T* x) const
+    {
+        if(x != nullptr)
+        {
+            (void)f(x);
+        }
+    }
+};
+
+template <class T, class F, F f> // NOLINT
+using manage_ptr = std::unique_ptr<T, manage_deleter<F, f>>;
+
+#define MIGRAPHX_MANAGE_PTR(T, F) manage_ptr<std::remove_pointer_t<T>, decltype(&F), &F> // NOLINT
+
+// Workaround hiprtc's broken API
+void hiprtc_program_destroy(hiprtcProgram prog) { hiprtcDestroyProgram(&prog); }
+using hiprtc_program_ptr = MIGRAPHX_MANAGE_PTR(hiprtcProgram, hiprtc_program_destroy);
+
+template <class... Ts>
+hiprtc_program_ptr hiprtc_program_create(Ts... xs)
+{
+    hiprtcProgram prog = nullptr;
+    auto result        = hiprtcCreateProgram(&prog, xs...);
+    hiprtc_program_ptr p{prog};
+    if(result != HIPRTC_SUCCESS)
+        MIGRAPHX_HIPRTC_THROW(result, "Create program failed.");
+    return p;
+}
+
+bool starts_with(const std::string& value, const std::string& prefix)
+{
+    if(prefix.size() > value.size())
+        return false;
+    else
+        return std::equal(prefix.begin(), prefix.end(), value.begin());
+}
+
+bool ends_with(const std::string& value, const std::string& suffix)
+{
+    if(suffix.size() > value.size())
+        return false;
+    else
+        return std::equal(suffix.rbegin(), suffix.rend(), value.rbegin());
+}
+
+std::vector<std::string> split_string(const std::string& s, char delim)
+{
+    std::vector<std::string> elems;
+    std::stringstream ss(s + delim);
+    std::string item;
+    while(std::getline(ss, item, delim))
+    {
+        elems.push_back(item);
+    }
+    return elems;
+}
+
+template <class Strings>
+inline std::string join_strings(Strings strings, const std::string& delim)
+{
+    auto it = strings.begin();
+    if(it == strings.end())
+        return "";
+
+    auto nit = std::next(it);
+    return std::accumulate(nit, strings.end(), *it, [&](std::string x, std::string y) {
+        return std::move(x) + delim + std::move(y);
+    });
+}
+
+struct hiprtc_program
+{
+    struct string_array
+    {
+        std::deque<std::string> strings{};
+        std::vector<const char*> c_strs{};
+
+        string_array() {}
+        string_array(const string_array&) = delete;
+
+        std::size_t size() const { return strings.size(); }
+
+        const char** data() { return c_strs.data(); }
+
+        void push_back(std::string s)
+        {
+            strings.push_back(std::move(s));
+            c_strs.push_back(strings.back().c_str());
+        }
+    };
+
+    hiprtc_program_ptr prog = nullptr;
+    string_array headers{};
+    string_array include_names{};
+    std::string cpp_src  = "";
+    std::string cpp_name = "";
+
+    hiprtc_program(const std::string& src, const std::string& name = "main.cpp")
+        : cpp_src(src), cpp_name(name)
+    {
+        create_program();
+    }
+
+    hiprtc_program(std::vector<src_file> srcs)
+    {
+        for(auto&& src : srcs)
+        {
+            if(ends_with(src.path, ".cpp"))
+            {
+                cpp_src  = std::move(src.content);
+                cpp_name = std::move(src.path);
+            }
+            else
+            {
+                headers.push_back(std::move(src.content));
+                include_names.push_back(std::move(src.path));
+            }
+        }
+        create_program();
+    }
+
+    void create_program()
+    {
+        assert(not cpp_src.empty());
+        assert(not cpp_name.empty());
+        assert(headers.size() == include_names.size());
+        prog = hiprtc_program_create(cpp_src.c_str(),
+                                     cpp_name.c_str(),
+                                     headers.size(),
+                                     headers.data(),
+                                     include_names.data());
+    }
+
+    void compile(const std::vector<std::string>& options, bool quiet = false) const
+    {
+        // if(enabled(MIGRAPHX_TRACE_HIPRTC{}))
+        //     std::cout << "hiprtc " << join_strings(options, " ") << " " << cpp_name << std::endl;
+        std::vector<const char*> c_options;
+        std::transform(options.begin(),
+                       options.end(),
+                       std::back_inserter(c_options),
+                       [](const std::string& s) { return s.c_str(); });
+        std::cout << "BEFORE HIPRTC COMPILE" << std::endl;
+        auto result   = hiprtcCompileProgram(prog.get(), c_options.size(), c_options.data());
+        auto prog_log = log();
+        if(not prog_log.empty() and not quiet)
+        {
+            std::cerr << prog_log << std::endl;
+        }
+        if(result != HIPRTC_SUCCESS)
+            throw std::runtime_error("Compilation failed.");
+    }
+
+    std::string log() const
+    {
+        std::size_t n = 0;
+        MIGRAPHX_HIPRTC(hiprtcGetProgramLogSize(prog.get(), &n));
+        if(n == 0)
+            return {};
+        std::string buffer(n, '\0');
+        MIGRAPHX_HIPRTC(hiprtcGetProgramLog(prog.get(), buffer.data()));
+        assert(buffer.back() != 0);
+        return buffer;
+    }
+
+    std::vector<char> get_code_obj() const
+    {
+        std::size_t n = 0;
+        MIGRAPHX_HIPRTC(hiprtcGetCodeSize(prog.get(), &n));
+        std::vector<char> buffer(n);
+        MIGRAPHX_HIPRTC(hiprtcGetCode(prog.get(), buffer.data()));
+        return buffer;
+    }
+};
+
+std::vector<std::vector<char>> compile_hip_src_with_hiprtc(std::vector<src_file> srcs,
+                                                           const std::string& params,
+                                                           const std::string& arch)
+{
+    hiprtc_program prog(std::move(srcs));
+    auto options = split_string(params, ' ');
+    options.push_back("-DMIGRAPHX_USE_HIPRTC=1");
+    if(true)
+    {
+        options.push_back("-DMIGRAPHX_HAS_DPP=0");
+        options.push_back("-DMIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS=1");
+        options.push_back("-Wno-reserved-identifier");
+        options.push_back("-Wno-unused-parameter");
+        options.push_back("-Wno-gnu-line-marker");
+        options.push_back("-Wno-old-style-cast");
+    }
+    if(true)
+        options.push_back("-DMIGRAPHX_DEBUG");
+    if(std::none_of(options.begin(), options.end(), [](const std::string& s) {
+           return starts_with(s, "--std=") or starts_with(s, "-std=");
+       }))
+        options.push_back("-std=c++17");
+    options.push_back("-fno-gpu-rdc");
+    options.push_back("-O3");
+    options.push_back("-Wno-cuda-compat");
+    options.push_back("--offload-arch=" + arch);
+    prog.compile(options);
+    return {prog.get_code_obj()};
+}
+
+bool hip_has_flags(const std::vector<std::string>& flags)
+{
+    hiprtc_program prog{" "};
+    try
+    {
+        prog.compile(flags, true);
+        return true;
+    }
+    catch(...)
+    {
+        return false;
+    }
+}
+
+bool hip_accept_non_uniform_wg()
+{
+    static bool non_uniform_wg = hip_has_flags({"-fno-offload-uniform-block"});
+    return non_uniform_wg;
+}
+
+static std::vector<std::string> get_compiler_warnings()
+{
+    std::vector<std::string> warnings = {
+        "-Weverything",
+        "-Wno-c++98-compat",
+        "-Wno-c++98-compat-pedantic",
+        "-Wno-conversion",
+        "-Wno-double-promotion",
+        "-Wno-exit-time-destructors",
+        "-Wno-extra-semi",
+        "-Wno-extra-semi-stmt",
+        "-Wno-float-conversion",
+        "-Wno-gnu-anonymous-struct",
+        "-Wno-gnu-zero-variadic-macro-arguments",
+        "-Wno-missing-prototypes",
+        "-Wno-nested-anon-types",
+        "-Wno-padded",
+        "-Wno-shorten-64-to-32",
+        "-Wno-sign-conversion",
+        "-Wno-sign-compare",
+        "-Wno-unused-command-line-argument",
+        "-Wno-weak-vtables",
+        "-Wno-c99-extensions",
+    };
+
+    if(hip_has_flags({"-Werror", "-Wunsafe-buffer-usage"}))
+        warnings.push_back("-Wno-unsafe-buffer-usage");
+    return warnings;
+}
+
+const std::vector<std::string>& compiler_warnings()
+{
+    static std::vector<std::string> warnings = get_compiler_warnings();
+    return warnings;
+}
+
+kernel compile_hip_code_object(const std::string& content, hip_compile_options options)
+{
+    assert(options.global > 0);
+    assert(options.local > 0);
+    // assert(not options.inputs.empty());
+    // assert(options.inputs.size() == options.virtual_inputs.size() or
+    //        options.virtual_inputs.empty());
+    std::vector<src_file> srcs = options.additional_src_files;
+    // Neko sranje
+    // static auto kernels{::migraphx_kernels()};
+    // std::transform(
+    //     kernels.begin(),
+    //     kernels.end(),
+    //     std::back_inserter(srcs),
+    //     [](const std::pair<std::string_view, std::string_view>& elem) { return src_file{elem};
+    //     });
+    srcs.emplace_back("main.cpp", content);
+
+    for (auto src : srcs) {
+        std::cout << src.path << std::endl;
+    }
+
+
+    // auto args_hpp =
+    //     generate_args_hpp(options.virtual_inputs.empty() ? options.inputs :
+    //     options.virtual_inputs);
+    // srcs.emplace_back("args.hpp", args_hpp);
+
+    if(options.global % options.local != 0 and hip_accept_non_uniform_wg())
+        options.params += " -fno-offload-uniform-block";
+    else
+        assert(options.global % options.local == 0);
+
+    options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
+    options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local);
+    options.params += " " + join_strings(compiler_warnings(), " ");
+    options.params += " -ftemplate-backtrace-limit=0";
+    options.params += " -Werror";
+    auto cos = compile_hip_src_with_hiprtc(srcs, options.params, get_device_name());
+    if(cos.size() != 1)
+        std::runtime_error("No code object");
+    auto& obj = cos.front(); 
+
+    return kernel{obj.data(), options.kernel_name};
+}
+
 } // namespace rtc
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -4,16 +4,12 @@
 #pragma once

 #include "ck/config.h"
-#include "ck/utility/env.hpp"
-
+#ifndef __HIPCC_RTC__
 #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
 #endif
-
-// environment variable to enable logging:
-// export CK_LOGGING=ON or CK_LOGGING=1 or CK_LOGGING=ENABLED
-CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING)
+#endif

 // to do: add various levels of logging with CK_LOG_LEVEL


--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
@@ -3,6 +3,7 @@

 #pragma once

+#ifndef __HIPCC_RTC__
 #include <string>
 #include <map>
 #include <hip/hip_runtime.h>
@@ -96,3 +97,4 @@ inline bool is_gfx12_supported()
 }

 } // namespace ck
+#endif
--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
-
+#ifndef __HIPCC_RTC__
 #include <hip/hip_runtime.h>

 #include "ck/ck.hpp"
@@ -160,3 +160,4 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
    return 0;
 #endif
 }
+#endif