Merge branch 'develop' into rocblas_fp8

c4cee345 · Umang Yadav · GitHub · c40a39c3 · eafd55de · c4cee345
Unverified Commit c4cee345 authored Dec 01, 2023 by Umang Yadav Committed by GitHub Dec 01, 2023
20 changed files
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -465,7 +465,7 @@ jobs:
    - name: Upload code coverage
      if: "matrix.configuration == 'codecov'"
      env:
-        CODECOV_TOKEN: "8545af1c-f90b-4345-92a5-0d075503ca56"
+        CODECOV_TOKEN: "f5d5a10b-3177-4c76-b25f-9b1c2f165e8b"
      run: |
        sudo apt-get install -y lcov
        cd build

--- a/.gitignore
+++ b/.gitignore
@@ -81,5 +81,7 @@ cmake-build*/
 build*/

 # Recommended location to install rbuild dependencies from README.md
-depend
+depend*/

+# local Python virtual environment
+.venv/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,9 +41,12 @@ if(NOT MIGRAPHX_GENERATOR_IS_MULTI_CONFIG)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES})
 endif()

-set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
+if(NOT WIN32)
+    set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
+    set(CMAKE_BUILD_RPATH "${CMAKE_BINARY_DIR}/lib")
+endif()

-set(CMAKE_BUILD_RPATH "${CMAKE_BINARY_DIR}/lib")
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm /opt/rocm/llvm $ENV{ROCM_PATH} $ENV{HIP_PATH})

 project(migraphx LANGUAGES C CXX)
 include(CTest)
@@ -57,6 +60,9 @@ else()
 option(MIGRAPHX_ENABLE_PYTHON "Enable python bindings" ON)
 endif()

+# By default build shared libraries
+option(BUILD_SHARED_LIBS "Create shared libraries" ON)
+
 if(WIN32) # CK is not yet ported to Windows
 option(MIGRAPHX_USE_COMPOSABLEKERNEL "Enable MIGraphX to use composable kernel JIT library" OFF)
 else()
@@ -102,13 +108,21 @@ set(MIGRAPHX_ENABLE_CPU Off CACHE BOOL "")
 # Disable fpga backend by default
 set(MIGRAPHX_ENABLE_FPGA Off CACHE BOOL "")

+if(WIN32)
+    add_compile_definitions("$<$<COMPILE_LANGUAGE:C,CXX>:_CRT_SECURE_NO_WARNINGS;_USE_MATH_DEFINES>")
+endif()
+
 set(CMAKE_CXX_STANDARD_DEFAULT "")
-add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-std=c++17>)
+if(MSVC)
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/std:c++17>)
+else()
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-std=c++17>)
+endif()

 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 include(EnableCompilerWarnings)
 include(ROCMClangTidy)
-if(CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")
+if(CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+.*")
    set(MIGRAPHX_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
 # Enable tidy on hip
 elseif(MIGRAPHX_ENABLE_GPU)

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -22,6 +22,8 @@ def rocmtestnode(Map conf) {
        def cmd = """
            ulimit -c unlimited
            echo "leak:dnnl::impl::malloc" > suppressions.txt
+            echo "leak:libtbb.so" >> suppressions.txt
+            cat suppressions.txt
            export LSAN_OPTIONS="suppressions=\$(pwd)/suppressions.txt"
            export MIGRAPHX_GPU_DEBUG=${gpu_debug}
            export CXX=${compiler}
@@ -134,12 +136,14 @@ rocmtest clang_debug: rocmnode('mi100+') { cmake_build ->
    }
 }, mlir_debug: rocmnode('mi100+') { cmake_build ->
    stage('MLIR Debug') {
-        withEnv(['MIGRAPHX_ENABLE_EXTRA_MLIR=1']) {
+        withEnv(['MIGRAPHX_ENABLE_EXTRA_MLIR=1', 'MIGRAPHX_MLIR_USE_SPECIFIC_OPS=fused,attention,convolution,dot']) {
            def sanitizers = "undefined"
            // Note: the -fno-sanitize= is copied from upstream LLVM_UBSAN_FLAGS.
            def debug_flags_cxx = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize=vptr,function -fno-sanitize-recover=${sanitizers}"
            def debug_flags = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize=vptr -fno-sanitize-recover=${sanitizers}"
            def gpu_targets = getgputargets()
+            // Since the purpose of this run is to verify everything MLIR supports,
+            // enable all possible types of offloads.
            cmake_build(flags: "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DMIGRAPHX_ENABLE_MLIR=On -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags_cxx}' -DCMAKE_C_FLAGS_DEBUG='${debug_flags}' -DGPU_TARGETS='${gpu_targets}'")
        }
    }

--- a/cmake/Embed.cmake
+++ b/cmake/Embed.cmake
@@ -21,17 +21,25 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 #####################################################################################
-find_program(EMBED_LD ld)
-find_program(EMBED_OBJCOPY objcopy)

-option(EMBED_USE_LD "Use ld to embed data files" OFF)
+if(WIN32)
+    set(EMBED_USE RC CACHE STRING "Use RC or CArrays to embed data files")
+    set_property(CACHE EMBED_USE PROPERTY STRINGS "RC;CArrays")
+else()
+    set(EMBED_USE CArrays CACHE STRING "Use LD or CArrays to embed data files")
+    set_property(CACHE EMBED_USE PROPERTY STRINGS "LD;CArrays")
+endif()
+
+if(EMBED_USE STREQUAL "LD")
+    find_program(EMBED_LD ld REQUIRED)
+    find_program(EMBED_OBJCOPY objcopy REQUIRED)
+endif()

 function(wrap_string)
    set(options)
    set(oneValueArgs VARIABLE AT_COLUMN)
    set(multiValueArgs)
    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cmake_parse_arguments(WRAP_STRING "${options}" "${oneValueArgs}" "" ${ARGN})

    string(LENGTH ${${PARSE_VARIABLE}} string_length)
    math(EXPR offset "0")
@@ -54,97 +62,108 @@ function(wrap_string)
    set(${PARSE_VARIABLE} "${lines}" PARENT_SCOPE)
 endfunction()

-function(generate_embed_source EMBED_NAME)
+function(generate_embed_source EMBED_NAME EMBED_DIR BASE_DIRECTORY)
    set(options)
-    set(oneValueArgs SRC HEADER RELATIVE)
-    set(multiValueArgs OBJECTS SYMBOLS FILES)
-
+    set(oneValueArgs)
+    set(multiValueArgs SYMBOLS FILES)
    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

-    set(EXTERNS)
-    set(INIT_KERNELS)
-
-    list(LENGTH PARSE_SYMBOLS SYMBOLS_LEN)
-    list(LENGTH PARSE_OBJECTS OBJECTS_LEN)
-    if(NOT ${SYMBOLS_LEN} EQUAL ${OBJECTS_LEN})
-        message(FATAL_ERROR "Symbols and objects dont match: ${SYMBOLS_LEN} != ${OBJECTS_LEN}")
-    endif()
-    math(EXPR LEN "${SYMBOLS_LEN} - 1")
-
-    foreach(idx RANGE ${LEN})
-        list(GET PARSE_SYMBOLS ${idx} SYMBOL)
-        list(GET PARSE_OBJECTS ${idx} OBJECT)
-        list(GET PARSE_FILES ${idx} FILE)
-
+    set(RESOURCE_ID 100)
+    foreach(SYMBOL FILE IN ZIP_LISTS PARSE_SYMBOLS PARSE_FILES)
+        cmake_path(RELATIVE_PATH FILE BASE_DIRECTORY ${BASE_DIRECTORY} OUTPUT_VARIABLE BASE_NAME)
+        if(EMBED_USE STREQUAL "RC")
+            string(TOUPPER "${SYMBOL}" SYMBOL)
+            string(APPEND FILE_IDS "#define IDR_${SYMBOL} ${RESOURCE_ID}\n")
+            cmake_path(NATIVE_PATH FILE NORMALIZE NATIVE_FILE)
+            string(REPLACE "\\" "\\\\" NATIVE_FILE "${NATIVE_FILE}")
+            string(APPEND RC_FILE_MAPPING "IDR_${SYMBOL} TEXTFILE \"${NATIVE_FILE}\"\n")
+            string(APPEND INIT_KERNELS "\n        {\"${BASE_NAME}\", resource::read(IDR_${SYMBOL})},")
+            math(EXPR RESOURCE_ID "${RESOURCE_ID} + 1" OUTPUT_FORMAT DECIMAL)
+        else()
            set(START_SYMBOL "_binary_${SYMBOL}_start")
            set(LENGTH_SYMBOL "_binary_${SYMBOL}_length")
-        if(EMBED_USE_LD)
+            if(EMBED_USE STREQUAL "LD")
                string(APPEND EXTERNS "
 extern const char ${START_SYMBOL}[];
 extern const size_t _binary_${SYMBOL}_size;
 const auto ${LENGTH_SYMBOL} = reinterpret_cast<size_t>(&_binary_${SYMBOL}_size);
-            ")
+")
            else()
                string(APPEND EXTERNS "
 extern const char ${START_SYMBOL}[];
 extern const size_t ${LENGTH_SYMBOL};
-            ")
-        endif()
-
-        if(PARSE_RELATIVE)
-            file(RELATIVE_PATH BASE_NAME ${PARSE_RELATIVE} "${FILE}")
-        else()
-            get_filename_component(BASE_NAME "${FILE}" NAME)
+")
            endif()
-
            string(APPEND INIT_KERNELS "
        { \"${BASE_NAME}\", { ${START_SYMBOL}, ${LENGTH_SYMBOL}} },")
+        endif()
    endforeach()
+    if(EMBED_USE STREQUAL "RC")
+       file(WRITE "${EMBED_DIR}/include/resource.h" "
+#define TEXTFILE 256
+
+${FILE_IDS}
+")
+        file(WRITE "${EMBED_DIR}/resource.rc" "
+#include \"resource.h\"
+
+${RC_FILE_MAPPING}
+")
+        set(EXTERNS "
+#include <Windows.h>
+#include \"resource.h\"

-    file(WRITE "${PARSE_HEADER}" "
+namespace resource {
+std::string_view read(int id)
+{
+    HMODULE handle = GetModuleHandle(nullptr);
+    HRSRC rc = FindResource(handle, MAKEINTRESOURCE(id), MAKEINTRESOURCE(TEXTFILE));
+    HGLOBAL data = LoadResource(handle, rc);
+    return {static_cast<const char*>(LockResource(data)), SizeofResource(handle, rc)};
+}
+}
+")
+        set(EMBED_FILES ${EMBED_DIR}/include/resource.h ${EMBED_DIR}/resource.rc)
+    endif()
+    file(WRITE "${EMBED_DIR}/include/${EMBED_NAME}.hpp" "
 #include <string_view>
 #include <unordered_map>
 #include <utility>
 std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}();
 ")

-    file(WRITE "${PARSE_SRC}" "
+    file(WRITE "${EMBED_DIR}/${EMBED_NAME}.cpp" "
 #include <${EMBED_NAME}.hpp>
 ${EXTERNS}
 std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}()
 {
-    static std::unordered_map<std::string_view, std::string_view> result = {${INIT_KERNELS}};
+    static std::unordered_map<std::string_view, std::string_view> result = {${INIT_KERNELS}
+    };
    return result;
 }
 ")
+    list(APPEND EMBED_FILES ${EMBED_DIR}/${EMBED_NAME}.cpp ${EMBED_DIR}/include/${EMBED_NAME}.hpp)
+    set(EMBED_FILES ${EMBED_FILES} PARENT_SCOPE)
 endfunction()

-function(embed_file OUTPUT_FILE OUTPUT_SYMBOL FILE)
-    set(WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-    # Glob is used to compute the relative path
-    file(GLOB FILES RELATIVE ${WORKING_DIRECTORY} ${FILE})
-    foreach(REL_FILE ${FILES})
-        string(MAKE_C_IDENTIFIER "${REL_FILE}" SYMBOL)
+function(embed_file FILE BASE_DIRECTORY)
+    message(STATUS "    ${FILE}")
+    cmake_path(RELATIVE_PATH FILE BASE_DIRECTORY "${BASE_DIRECTORY}" OUTPUT_VARIABLE REL_FILE)
+    string(MAKE_C_IDENTIFIER "${REL_FILE}" OUTPUT_SYMBOL)
    get_filename_component(OUTPUT_FILE_DIR "${REL_FILE}" DIRECTORY)
    file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_FILE_DIR}")
-        if(EMBED_USE_LD)
-            set(OUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.o")
-        else()
-            set(OUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.cpp")
-        endif()
-        set(${OUTPUT_SYMBOL} ${SYMBOL} PARENT_SCOPE)
-        set(${OUTPUT_FILE} "${OUT_FILE}" PARENT_SCOPE)
-        if(EMBED_USE_LD)
+    if(EMBED_USE STREQUAL "LD")
+        set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.o")
        add_custom_command(
-                OUTPUT "${OUT_FILE}"
-                COMMAND ${EMBED_LD} -r -o "${OUT_FILE}" -z noexecstack --format=binary "${REL_FILE}" 
-                COMMAND ${EMBED_OBJCOPY} --rename-section .data=.rodata,alloc,load,readonly,data,contents "${OUT_FILE}"
-                WORKING_DIRECTORY ${WORKING_DIRECTORY}
-                DEPENDS ${FILE}
-                VERBATIM
-            )
-        else()
-            set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${FILE})
+            OUTPUT "${OUTPUT_FILE}"
+            COMMAND ${EMBED_LD} -r -o "${OUTPUT_FILE}" -z noexecstack --format=binary "${REL_FILE}"
+            COMMAND ${EMBED_OBJCOPY} --rename-section .data=.rodata,alloc,load,readonly,data,contents "${OUTPUT_FILE}"
+            WORKING_DIRECTORY "${BASE_DIRECTORY}"
+            DEPENDS "${FILE}"
+            VERBATIM)
+        set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
+    elseif(EMBED_USE STREQUAL "CArrays")
+        set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.cpp")
        # reads source file contents as hex string
        file(READ ${FILE} HEX_STRING HEX)
        # wraps the hex string into multiple lines
@@ -153,13 +172,14 @@ function(embed_file OUTPUT_FILE OUTPUT_SYMBOL FILE)
        string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " ARRAY_VALUES ${HEX_STRING})
        # removes trailing comma
        string(REGEX REPLACE ", $" "" ARRAY_VALUES ${ARRAY_VALUES})
-            file(WRITE "${OUT_FILE}" "
+        file(WRITE "${OUTPUT_FILE}" "
 #include <cstddef>
-extern const char _binary_${SYMBOL}_start[] = { ${ARRAY_VALUES} };
-extern const size_t _binary_${SYMBOL}_length = sizeof(_binary_${SYMBOL}_start);
+extern const char _binary_${OUTPUT_SYMBOL}_start[] = { ${ARRAY_VALUES} };
+extern const size_t _binary_${OUTPUT_SYMBOL}_length = sizeof(_binary_${OUTPUT_SYMBOL}_start);
 ")
+        set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
    endif()
-    endforeach()
+    set(OUTPUT_SYMBOL ${OUTPUT_SYMBOL} PARENT_SCOPE)
 endfunction()

 function(add_embed_library EMBED_NAME)
@@ -168,35 +188,32 @@ function(add_embed_library EMBED_NAME)
    set(multiValueArgs)
    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

-    file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/embed)
-    file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/embed/${EMBED_NAME})
    set(EMBED_DIR ${CMAKE_CURRENT_BINARY_DIR}/embed/${EMBED_NAME})
-    set(SRC_FILE "${EMBED_DIR}/${EMBED_NAME}.cpp")
-    set(HEADER_FILE "${EMBED_DIR}/include/${EMBED_NAME}.hpp")
-    set(WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    set(OUTPUT_FILES)
-    set(SYMBOLS)
-    message(STATUS "Embedding files")
+    file(MAKE_DIRECTORY ${EMBED_DIR})
+    message(STATUS "Embedding kernel files:")
    foreach(FILE ${PARSE_UNPARSED_ARGUMENTS})
-        embed_file(OUTPUT_FILE OUTPUT_SYMBOL ${FILE})
+        embed_file(${FILE} ${PARSE_RELATIVE})
        list(APPEND OUTPUT_FILES ${OUTPUT_FILE})
        list(APPEND SYMBOLS ${OUTPUT_SYMBOL})
    endforeach()
-    message(STATUS "Generating embedding library ${EMBED_NAME}")
-    generate_embed_source(${EMBED_NAME} SRC ${SRC_FILE} HEADER ${HEADER_FILE} OBJECTS ${OUTPUT_FILES} SYMBOLS ${SYMBOLS} RELATIVE ${PARSE_RELATIVE} FILES ${PARSE_UNPARSED_ARGUMENTS})
-    
+    message(STATUS "Generating embedding library '${EMBED_NAME}'")
+    generate_embed_source(${EMBED_NAME} ${EMBED_DIR} "${PARSE_RELATIVE}" SYMBOLS ${SYMBOLS} FILES ${PARSE_UNPARSED_ARGUMENTS})
    set(INTERNAL_EMBED_LIB embed_lib_${EMBED_NAME})
-    add_library(${INTERNAL_EMBED_LIB} OBJECT "${SRC_FILE}")
+    add_library(${INTERNAL_EMBED_LIB} OBJECT ${EMBED_FILES})
+    if(EMBED_USE STREQUAL "CArrays")
+        target_sources(${INTERNAL_EMBED_LIB} PRIVATE ${OUTPUT_FILES})
+    endif()
    target_include_directories(${INTERNAL_EMBED_LIB} PRIVATE "${EMBED_DIR}/include")
    target_compile_options(${INTERNAL_EMBED_LIB} PRIVATE -Wno-reserved-identifier -Wno-extern-initializer -Wno-missing-variable-declarations)
    set_target_properties(${INTERNAL_EMBED_LIB} PROPERTIES POSITION_INDEPENDENT_CODE On)
-    
    add_library(${EMBED_NAME} INTERFACE)
-    if(EMBED_USE_LD)
+    if(EMBED_USE STREQUAL "LD")
        target_sources(${EMBED_NAME} INTERFACE ${OUTPUT_FILES})
-    else()
-        target_sources(${INTERNAL_EMBED_LIB} PRIVATE ${OUTPUT_FILES})
+    endif()
+    if(EMBED_USE STREQUAL "RC")
+        target_link_libraries(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
    endif()
    target_sources(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
    target_include_directories(${EMBED_NAME} INTERFACE "${EMBED_DIR}/include")
 endfunction()
+
--- a/docs/.sphinx/requirements.txt
+++ b/docs/.sphinx/requirements.txt
@@ -21,7 +21,7 @@ charset-normalizer==3.1.0
    # via requests
 click==8.1.3
    # via sphinx-external-toc
-cryptography==41.0.4
+cryptography==41.0.6
    # via pyjwt
 deprecated==1.2.13
    # via pygithub
@@ -89,7 +89,7 @@ requests==2.28.2
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==0.28.0
+rocm-docs-core==0.30.0
    # via -r requirements.in
 smmap==5.0.0
    # via gitdb

--- a/docs/dev/env_vars.rst
+++ b/docs/dev/env_vars.rst
@@ -4,13 +4,13 @@ Environment Variables
 For parsing
 ---------------

-**MIGRAPHX_TRACE_ONNX_PARSER**
+.. envvar:: MIGRAPHX_TRACE_ONNX_PARSER

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print debugging traces for the onnx parser.
 Prints: initializers (if used), ONNX node operators, added MIGraphX instructions

-**MIGRAPHX_DISABLE_FP16_INSTANCENORM_CONVERT**
+.. envvar:: MIGRAPHX_DISABLE_FP16_INSTANCENORM_CONVERT

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disables the conversion from fp16 to fp32 for the InstanceNormalization ONNX operator that MIGX does as a workaround for accuracy issues with reduce_mean/variance.
@@ -20,16 +20,16 @@ See ``parse_instancenorm.cpp`` for more details.
 Matchers
 ------------

-**MIGRAPHX_TRACE_MATCHES**
+.. envvar:: MIGRAPHX_TRACE_MATCHES

 Set to "1" to print the matcher that matches an instruction and the matched instruction.
 Set to "2" and use the ``MIGRAPHX_TRACE_MATHCES_FOR`` flag to filter out results.

-**MIGRAPHX_TRACE_MATCHES_FOR**
+.. envvar:: MIGRAPHX_TRACE_MATCHES_FOR

 Set to the name of any matcher and only traces for that matcher will be printed out.

-**MIGRAPHX_VALIDATE_MATCHES**
+.. envvar:: MIGRAPHX_VALIDATE_MATCHES

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Validate the module after finding the matches (runs ``module.validate()``).
@@ -37,7 +37,7 @@ Validate the module after finding the matches (runs ``module.validate()``).
 Program Execution 
 ---------------------

-**MIGRAPHX_TRACE_EVAL**
+.. envvar:: MIGRAPHX_TRACE_EVAL

 Set to "1", "2", or "3" to use.
 "1" prints the instruction run and the time taken.
@@ -48,7 +48,7 @@ Set to "1", "2", or "3" to use.
 Program Verification
 ------------------------

-**MIGRAPHX_VERIFY_ENABLE_ALLCLOSE**
+.. envvar:: MIGRAPHX_VERIFY_ENABLE_ALLCLOSE

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Uses ``allclose`` with the given ``atol`` and ``rtol`` for verifying ranges with ``driver verify`` or the tests that use ``migraphx/verify.hpp``.
@@ -57,76 +57,76 @@ Uses ``allclose`` with the given ``atol`` and ``rtol`` for verifying ranges with
 Pass debugging or Pass controls
 -----------------------------------

-**MIGRAPHX_TRACE_ELIMINATE_CONTIGUOUS**
+.. envvar:: MIGRAPHX_TRACE_ELIMINATE_CONTIGUOUS

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Debug print the instructions that have input ``contiguous`` instructions removed.

-**MIGRAPHX_DISABLE_POINTWISE_FUSION**
+.. envvar:: MIGRAPHX_DISABLE_POINTWISE_FUSION

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disables the ``fuse_pointwise`` compile pass.

-**MIGRAPHX_DEBUG_MEMORY_COLORING**
+.. envvar:: MIGRAPHX_DEBUG_MEMORY_COLORING

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print debug statements for the ``memory_coloring`` pass.

-**MIGRAPHX_TRACE_SCHEDULE**
+.. envvar:: MIGRAPHX_TRACE_SCHEDULE

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print debug statements for the ``schedule`` pass.

-**MIGRAPHX_TRACE_PROPAGATE_CONSTANT**
+.. envvar:: MIGRAPHX_TRACE_PROPAGATE_CONSTANT

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Traces instructions replaced with a constant.

-**MIGRAPHX_INT8_QUANTIZATION_PARAMS**
+.. envvar:: MIGRAPHX_INT8_QUANTIZATION_PARAMS

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print the quantization parameters in only the main module.

-**MIGRAPHX_DISABLE_DNNL_POST_OPS_WORKAROUND**
+.. envvar:: MIGRAPHX_DISABLE_DNNL_POST_OPS_WORKAROUND

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disable the DNNL post ops workaround.

-**MIGRAPHX_DISABLE_MIOPEN_FUSION**
+.. envvar:: MIGRAPHX_DISABLE_MIOPEN_FUSION

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disable MIOpen fusions.

-**MIGRAPHX_DISABLE_SCHEDULE_PASS**
+.. envvar:: MIGRAPHX_DISABLE_SCHEDULE_PASS

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disable the ``schedule`` pass.

-**MIGRAPHX_DISABLE_REDUCE_FUSION**
+.. envvar:: MIGRAPHX_DISABLE_REDUCE_FUSION

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disable the ``fuse_reduce`` pass.

-**MIGRAPHX_ENABLE_NHWC**
+.. envvar:: MIGRAPHX_ENABLE_NHWC

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Enable the ``layout_nhwc`` pass.

-**MIGRAPHX_ENABLE_CK**
+.. envvar:: MIGRAPHX_ENABLE_CK

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Enable using the Composable Kernels library.
 Should be used in conjunction with ``MIGRAPHX_DISABLE_MLIR=1``.

-**MIGRAPHX_DISABLE_MLIR** 
+.. envvar:: MIGRAPHX_DISABLE_MLIR
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Disable using the rocMLIR library.

-**MIGRAPHX_ENABLE_EXTRA_MLIR**
+.. envvar:: MIGRAPHX_ENABLE_EXTRA_MLIR
 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Enables additional opportunities to use MLIR that may improve performance.

-**MIGRAPHX_COPY_LITERALS**
+.. envvar:: MIGRAPHX_COPY_LITERALS

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Use ``hip_copy_to_gpu`` with a new ``literal`` instruction rather than use ``hip_copy_literal{}``.
@@ -134,22 +134,22 @@ Use ``hip_copy_to_gpu`` with a new ``literal`` instruction rather than use ``hip
 Compilation traces
 ----------------------

-**MIGRAPHX_TRACE_FINALIZE**
+.. envvar:: MIGRAPHX_TRACE_FINALIZE

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Debug print instructions during the ``module.finalize()`` step.

-**MIGRAPHX_TRACE_COMPILE**
+.. envvar:: MIGRAPHX_TRACE_COMPILE

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print trace information for the graph compilation process.

-**MIGRAPHX_TRACE_PASSES**
+.. envvar:: MIGRAPHX_TRACE_PASSES

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print the compile pass and the program after the pass.

-**MIGRAPHX_TIME_PASSES**
+.. envvar:: MIGRAPHX_TIME_PASSES

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Time the compile passes.
@@ -158,77 +158,77 @@ Time the compile passes.
 GPU Kernels JIT compilation debugging (applicable for both hiprtc and hipclang)
 -----------------------------------------

-**MIGRAPHX_TRACE_CMD_EXECUTE**
+.. envvar:: MIGRAPHX_TRACE_CMD_EXECUTE

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print commands executed by the MIGraphX ``process``.

-**MIGRAPHX_TRACE_HIPRTC**
+.. envvar:: MIGRAPHX_TRACE_HIPRTC

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print HIPRTC options and C++ file executed.

-**MIGRAPHX_DEBUG_SAVE_TEMP_DIR**
+.. envvar:: MIGRAPHX_DEBUG_SAVE_TEMP_DIR

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Make it so the created temporary directories are not deleted.

-**MIGRAPHX_GPU_DEBUG**
+.. envvar:: MIGRAPHX_GPU_DEBUG

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Internally, this adds the option ``-DMIGRAPHX_DEBUG`` when compiling GPU kernels. It enables assertions and capture of source locations for the errors. 

-**MIGRAPHX_GPU_DEBUG_SYM**
+.. envvar:: MIGRAPHX_GPU_DEBUG_SYM

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Adds the option ``-g`` when compiling HIPRTC.

-**MIGRAPHX_GPU_DUMP_SRC**
+.. envvar:: MIGRAPHX_GPU_DUMP_SRC

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Dump the HIPRTC source files compiled.

-**MIGRAPHX_GPU_DUMP_ASM**
+.. envvar:: MIGRAPHX_GPU_DUMP_ASM

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Dump the hip-clang assembly.

-**MIGRAPHX_GPU_OPTIMIZE**
+.. envvar:: MIGRAPHX_GPU_OPTIMIZE

 Set the optimization mode for GPU compile (``-O`` option).
 Defaults to ``-O3``.

-**MIGRAPHX_GPU_COMPILE_PARALLEL**
+.. envvar:: MIGRAPHX_GPU_COMPILE_PARALLEL

 Set to the number of threads to use.
 Compile GPU code in parallel with the given number of threads.

-**MIGRAPHX_TRACE_NARY**
+.. envvar:: MIGRAPHX_TRACE_NARY

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print the ``nary`` device functions used.

-**MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS**
+.. envvar:: MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Enable HIPRTC workarounds for bugs in HIPRTC.

-**MIGRAPHX_USE_FAST_SOFTMAX**
+.. envvar:: MIGRAPHX_USE_FAST_SOFTMAX

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Use the fast softmax optimization.

-**MIGRAPHX_ENABLE_NULL_STREAM**
+.. envvar:: MIGRAPHX_ENABLE_NULL_STREAM

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Allow using null stream for miopen and hipStream.

-**MIGRAPHX_NSTREAMS**
+.. envvar:: MIGRAPHX_NSTREAMS

 Set to the number of streams to use.
 Defaults to 1.

-**MIGRAPHX_TRACE_BENCHMARKING**
+.. envvar:: MIGRAPHX_TRACE_BENCHMARKING

 Set to "1" to print benchmarching trace.
 Set to "2" to print benchmarching trace with more detail.
@@ -236,45 +236,49 @@ Set to "2" to print benchmarching trace with more detail.
 MLIR vars
 -------------

-**MIGRAPHX_TRACE_MLIR**
+.. envvar:: MIGRAPHX_TRACE_MLIR

 Set to "1" to trace MLIR and print any failures.
 Set to "2" to additionally print all MLIR operations.

-**MIGRAPHX_MLIR_USE_SPECIFIC_OPS**
+.. envvar:: MIGRAPHX_MLIR_USE_SPECIFIC_OPS

 Set to the name of the operations you want to always use MLIR regardless of GPU architecture.
 Accepts a list of operators separated by commas (ex: "fused", "convolution", "dot").

-**MIGRAPHX_MLIR_TUNING_DB**
+.. envvar:: MIGRAPHX_MLIR_TUNING_DB

 Set to the path of the MLIR tuning database to load.

-**MIGRAPHX_MLIR_TUNING_CFG**
+.. envvar:: MIGRAPHX_MLIR_TUNING_CFG

 Set to the path of the tuning configuration.
 Appends to tuning cfg file that could be used with rocMLIR tuning scripts.

-**MIGRAPHX_MLIR_TUNE_EXHAUSTIVE**
+.. envvar:: MIGRAPHX_MLIR_TUNE_EXHAUSTIVE

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Do exhaustive tuning for MLIR.

+.. envvar:: MIGRAPHX_MLIR_TUNE_LIMIT
+
+Set to an integer greater than 1.
+Limits the number of solutions that MLIR will use for tuning.

 CK vars
 -----------

-**MIGRAPHX_LOG_CK_GEMM**
+.. envvar:: MIGRAPHX_LOG_CK_GEMM

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Print Composable Kernels GEMM traces.

-**MIGRAPHX_CK_DEBUG**
+.. envvar:: MIGRAPHX_CK_DEBUG

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Always add the ``-DMIGRAPHX_CK_CHECK=1`` for compiling Composable Kernels operators.

-**MIGRAPHX_TUNE_CK**
+.. envvar:: MIGRAPHX_TUNE_CK

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Use tuning for Composable Kernels.
@@ -282,19 +286,19 @@ Use tuning for Composable Kernels.
 Testing 
 ------------

-**MIGRAPHX_TRACE_TEST_COMPILE**
+.. envvar:: MIGRAPHX_TRACE_TEST_COMPILE

 Set to the target that you want to trace the compilation of (ex. "gpu", "cpu").
 Prints the compile trace for the given target for the verify tests.
 This flag shouldn't be used in conjunction with ``MIGRAPHX_TRACE_COMPILE``.
 For the verify tests only use ``MIGRAPHX_TRACE_TEST_COMPILE``.

-**MIGRAPHX_TRACE_TEST**
+.. envvar:: MIGRAPHX_TRACE_TEST

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Prints the reference and target programs even if the verify passed successfully.

-**MIGRAPHX_DUMP_TEST**
+.. envvar:: MIGRAPHX_DUMP_TEST

 Set to "1", "enable", "enabled", "yes", or "true" to use.
 Dumps verify tests to ``.mxr`` files.
--- a/examples/README.md
+++ b/examples/README.md
@@ -7,3 +7,4 @@ This directory contains examples of common use cases for MIGraphX.
 - [MIGraphX usage and utilities](./migraphx)
 - [Vision inference examples](./vision)
 - [Natural language inference examples](./nlp)
+- [Diffusion inference examples](./diffusion)
--- a/examples/diffusion/README.md
+++ b/examples/diffusion/README.md
+# Diffusion Inference Examples
+
+- [Python Stable Diffusion 2.1](./python_stable_diffusion_21)
--- a/examples/diffusion/python_stable_diffusion_21/README.md
+++ b/examples/diffusion/python_stable_diffusion_21/README.md
+# Stable Diffusion 2.1
+
+This example was tested with the [rocm 5.7](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/tree/rocm-5.7.0) revision of MIGraphX.
+
+## Jupyter notebook
+
+There is a dedicated step-by-step notebook. See [sd21.ipynb](./sd21.ipynb)
+
+## Console application
+
+To run the console application, follow the steps below.
+
+Setup python environment
+
+```bash
+# this requires the Python venv module to be installed (e.g. apt install python3.8-venv)
+python3 -m venv sd_venv
+. sd_venv/bin/activate
+```
+
+Install dependencies
+
+```bash
+pip install -r requirements.txt
+```
+
+Use MIGraphX Python Module
+
+```bash
+export PYTHONPATH=/opt/rocm/lib:$PYTHONPATH
+```
+
+Get models with optimum
+
+```bash
+optimum-cli export onnx --model stabilityai/stable-diffusion-2-1 models/sd21-onnx
+```
+*Note: `models/sd21-onnx` will be used in the scripts.*
+
+Run the text-to-image script with the following example prompt and seed:
+
+```bash
+python txt2img.py --prompt "a photograph of an astronaut riding a horse" --seed 13 --output astro_horse.jpg
+```
+*Note: The first run will compile the models and cache them to make subsequent runs faster.*
+
+The result should look like this:
+
+![example_output.jpg](./example_output.jpg)
+
+## Gradio application
+
+Note: This requires the setup steps from the `Console application` section to be completed first.
+
+Install gradio dependencies
+
+```bash
+pip install -r gradio_requirements.txt
+```
+
+Usage
+
+```bash
+python gradio_app.py
+```
+
+This loads the models (which can take several minutes) and, once the setup is ready, starts a server on `http://127.0.0.1:7860`.
--- a/examples/diffusion/python_stable_diffusion_21/example_output.jpg
+++ b/examples/diffusion/python_stable_diffusion_21/example_output.jpg
--- a/examples/diffusion/python_stable_diffusion_21/gradio_app.py
+++ b/examples/diffusion/python_stable_diffusion_21/gradio_app.py
+#####################################################################################
+# The MIT License (MIT)
+#
+# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#####################################################################################
+
+from txt2img import StableDiffusionMGX
+import gradio as gr
+
+
+def main():
+    # Note: This will load the models, which can take several minutes
+    sd = StableDiffusionMGX()
+
+    def gr_wrapper(prompt, negative_prompt, steps, seed, scale):
+        result = sd.run(str(prompt), str(negative_prompt), int(steps),
+                        int(seed), float(scale))
+        return StableDiffusionMGX.convert_to_rgb_image(result)
+
+    demo = gr.Interface(
+        gr_wrapper,
+        [
+            gr.Textbox(value="a photograph of an astronaut riding a horse",
+                       label="Prompt"),
+            gr.Textbox(value="", label="Negative prompt (Optional)"),
+            gr.Slider(1, 100, step=1, value=20, label="Number of steps"),
+            gr.Textbox(value=13, label="Random seed"),
+            gr.Slider(1, 20, step=0.1, value=7.0, label="Guidance scale"),
+        ],
+        "image",
+    )
+    demo.launch()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/diffusion/python_stable_diffusion_21/gradio_requirements.txt
+++ b/examples/diffusion/python_stable_diffusion_21/gradio_requirements.txt
+#####################################################################################
+# The MIT License (MIT)
+#
+# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#####################################################################################
+# Include the console application's dependencies (-r reads another
+# requirements file; -f would only add a --find-links location).
+-r requirements.txt
+gradio
\ No newline at end of file
--- a/examples/diffusion/python_stable_diffusion_21/requirements.txt
+++ b/examples/diffusion/python_stable_diffusion_21/requirements.txt
+#####################################################################################
+# The MIT License (MIT)
+#
+# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#####################################################################################
+accelerate
+diffusers
+optimum[onnxruntime]
+transformers
\ No newline at end of file
--- a/examples/diffusion/python_stable_diffusion_21/sd21.ipynb
+++ b/examples/diffusion/python_stable_diffusion_21/sd21.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#  The MIT License (MIT)\n",
+    "#\n",
+    "#  Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.\n",
+    "#\n",
+    "#  Permission is hereby granted, free of charge, to any person obtaining a copy\n",
+    "#  of this software and associated documentation files (the 'Software'), to deal\n",
+    "#  in the Software without restriction, including without limitation the rights\n",
+    "#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n",
+    "#  copies of the Software, and to permit persons to whom the Software is\n",
+    "#  furnished to do so, subject to the following conditions:\n",
+    "#\n",
+    "#  The above copyright notice and this permission notice shall be included in\n",
+    "#  all copies or substantial portions of the Software.\n",
+    "#\n",
+    "#  THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n",
+    "#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n",
+    "#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE\n",
+    "#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n",
+    "#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n",
+    "#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n",
+    "#  THE SOFTWARE."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Stable Diffusion 2.1\n",
+    "\n",
+    "The following example will show how to run `Stable Diffusion 2.1` with `MIGraphX`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install the required dependencies."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install dependencies\n",
+    "!pip install optimum[onnxruntime] transformers diffusers accelerate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We will use optimum to generate the onnx files."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# export models\n",
+    "!optimum-cli export onnx --model stabilityai/stable-diffusion-2-1 models/sd21-onnx"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now it is time to load these models with python.\n",
+    "\n",
+    "First, we make sure that MIGraphX module is found in the python path."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "mgx_lib_path = \"/opt/rocm/lib/\" # or \"/code/AMDMIGraphX/build/lib/\"\n",
+    "if mgx_lib_path not in sys.path:\n",
+    "    sys.path.append(mgx_lib_path)\n",
+    "import migraphx as mgx"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, a helper method to load and cache the models.\n",
+    "\n",
+    "This will use the `models/sd21-onnx` path. If you changed it, make sure to update here as well."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "# helper for model loading\n",
+    "def load_mgx_model(name, shapes):\n",
+    "    file = f\"models/sd21-onnx/{name}/model\"\n",
+    "    print(f\"Loading {name} model from {file}\")\n",
+    "    if os.path.isfile(f\"{file}.mxr\"):\n",
+    "        # a previously compiled model was cached, reuse it\n",
+    "        print(\"Found mxr, loading it...\")\n",
+    "        model = mgx.load(f\"{file}.mxr\", format=\"msgpack\")\n",
+    "    elif os.path.isfile(f\"{file}.onnx\"):\n",
+    "        print(\"Parsing from onnx file...\")\n",
+    "        model = mgx.parse_onnx(f\"{file}.onnx\", map_input_dims=shapes)\n",
+    "        model.compile(mgx.get_target(\"gpu\"))\n",
+    "        # cache the compiled model next to the onnx file for later runs\n",
+    "        print(f\"Saving {name} model to mxr file...\")\n",
+    "        mgx.save(model, f\"{file}.mxr\", format=\"msgpack\")\n",
+    "    else:\n",
+    "        print(f\"No {name} model found. Please verify the path is correct and re-try, or re-download model.\")\n",
+    "        # the os module has no exit(); raise SystemExit to stop with status 1\n",
+    "        raise SystemExit(1)\n",
+    "    return model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With that, we can load the models. This could take several minutes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text_encoder = load_mgx_model(\"text_encoder\", {\"input_ids\": [1, 77]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "unet = load_mgx_model(\n",
+    "        \"unet\", {\n",
+    "            \"sample\": [1, 4, 64, 64],\n",
+    "            \"encoder_hidden_states\": [1, 77, 1024],\n",
+    "            \"timestep\": [1],\n",
+    "        })"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vae = load_mgx_model(\"vae_decoder\", {\"latent_sample\": [1, 4, 64, 64]})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Import the remaining packages."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from diffusers import EulerDiscreteScheduler\n",
+    "from transformers import CLIPTokenizer\n",
+    "import torch\n",
+    "import numpy as np\n",
+    "from tqdm.auto import tqdm\n",
+    "from PIL import Image"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Time to load the scheduler and tokenizer from the original source."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_id = \"stabilityai/stable-diffusion-2-1\"\n",
+    "scheduler = EulerDiscreteScheduler.from_pretrained(model_id,\n",
+    "                                                   subfolder=\"scheduler\")\n",
+    "tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder=\"tokenizer\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, we will define all the steps one by one, to make the last step short and simple.\n",
+    "\n",
+    "The first step will be to tokenize the user prompt. It will make a `(1, 77)` shaped `input_ids`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenize(input):\n",
+    "    return tokenizer([input],\n",
+    "                     padding=\"max_length\",\n",
+    "                     max_length=tokenizer.model_max_length,\n",
+    "                     truncation=True,\n",
+    "                     return_tensors=\"np\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional\n",
+    "test_tk = tokenize(\"test tokenizer to see the tokens\")\n",
+    "test_tk.input_ids.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We run the tokenized prompt through the `Text Encoder` model. It expects the `(1, 77)` data as `int32`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional\n",
+    "text_encoder.get_parameter_shapes()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_embeddings(input):\n",
+    "    return np.array(\n",
+    "        text_encoder.run({\"input_ids\": input.input_ids.astype(np.int32)\n",
+    "                          })[0]).astype(np.float32)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional\n",
+    "test_emb = get_embeddings(tokenize(\"test tokenizer to see the tokens\"))\n",
+    "test_emb.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The other input of the model is latent representation (pure noise). It will be transformed into a 512x512 image later.\n",
+    "The last input will be the timestep."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_latents(seed):\n",
+    "    return torch.randn(\n",
+    "        (1, 4, 64, 64),\n",
+    "        generator=torch.manual_seed(seed),\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional\n",
+    "test_latents = generate_latents(42)\n",
+    "test_latents.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we add two helpers to access and convert from torch to numpy with the proper datatype."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_scaled_sample(latents, t):\n",
+    "    return scheduler.scale_model_input(latents, t).numpy().astype(np.float32)\n",
+    "\n",
+    "\n",
+    "def get_timestep(t):\n",
+    "    return np.atleast_1d(t.numpy().astype(np.int64))  # convert 0D -> 1D"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The UNet model will be run in a loop. It will predict the noise residual."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional\n",
+    "unet.get_parameter_shapes()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def denoise(sample, embeddings, timestep):\n",
+    "    return np.array(\n",
+    "        unet.run({\n",
+    "            \"sample\": sample,\n",
+    "            \"encoder_hidden_states\": embeddings,\n",
+    "            \"timestep\": timestep\n",
+    "        })[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Helpers to do the classifier-free guidance and computing the previous noisy sample."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def perform_guidance(noise_pred_uncond, noise_pred_text, scale):\n",
+    "    return noise_pred_uncond + scale * (noise_pred_text - noise_pred_uncond)\n",
+    "\n",
+    "def compute_previous(noise_pred, t, latents):\n",
+    "    # compute the previous noisy sample x_t -> x_t-1\n",
+    "    return scheduler.step(noise_pred, t, latents).prev_sample\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Scale and decode the image latents with VAE."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def scale_denoised(latents):\n",
+    "    return 1 / 0.18215 * latents\n",
+    "\n",
+    "\n",
+    "def decode(latents):\n",
+    "    return np.array(\n",
+    "        vae.run({\"latent_sample\": latents.numpy().astype(np.float32)})[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And lastly, we need to convert it to an image to display or save."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def convert_to_rgb_image(image):\n",
+    "    image = np.clip(image / 2 + 0.5, 0, 1)\n",
+    "    image = np.transpose(image, (0, 2, 3, 1))\n",
+    "    images = (image * 255).round().astype(\"uint8\")\n",
+    "    return Image.fromarray(images[0])\n",
+    "\n",
+    "def save_image(pil_image, filename=\"output.png\"):\n",
+    "    pil_image.save(filename, format=\"png\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Feel free to play around with these params."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = \"a photograph of an astronaut riding a horse\"\n",
+    "negative_prompt = \"\"\n",
+    "steps = 20\n",
+    "seed = 13\n",
+    "scale = 7.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And now, to put everything together and run the whole pipeline:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scheduler.set_timesteps(steps)\n",
+    "\n",
+    "text_input, uncond_input = tokenize(prompt), tokenize(negative_prompt)\n",
+    "text_embeddings, uncond_embeddings = get_embeddings(\n",
+    "    text_input), get_embeddings(uncond_input)\n",
+    "latents = generate_latents(seed) * scheduler.init_noise_sigma\n",
+    "\n",
+    "for t in tqdm(scheduler.timesteps):\n",
+    "    sample = get_scaled_sample(latents, t)\n",
+    "    timestep = get_timestep(t)\n",
+    "\n",
+    "    noise_pred_uncond = denoise(sample, uncond_embeddings, timestep)\n",
+    "    noise_pred_text = denoise(sample, text_embeddings, timestep)\n",
+    "\n",
+    "    noise_pred = perform_guidance(noise_pred_uncond, noise_pred_text, scale)\n",
+    "    latents = compute_previous(torch.from_numpy(noise_pred), t, latents)\n",
+    "\n",
+    "latents = scale_denoised(latents)\n",
+    "result = decode(latents)\n",
+    "image = convert_to_rgb_image(result)\n",
+    "\n",
+    "# show the image\n",
+    "image"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you like the generated image, save it with the following:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "save_image(image, \"output.png\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "sd_venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/examples/diffusion/python_stable_diffusion_21/txt2img.py
+++ b/examples/diffusion/python_stable_diffusion_21/txt2img.py
+#  The MIT License (MIT)
+#
+#  Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a copy
+#  of this software and associated documentation files (the 'Software'), to deal
+#  in the Software without restriction, including without limitation the rights
+#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#  copies of the Software, and to permit persons to whom the Software is
+#  furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included in
+#  all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#  THE SOFTWARE.
+
+from argparse import ArgumentParser
+from diffusers import EulerDiscreteScheduler
+from transformers import CLIPTokenizer
+from PIL import Image
+
+import migraphx as mgx
+import numpy as np
+import os
+import torch
+import time
+from functools import wraps
+
+
+# measurement helper
+def measure(fn):
+    @wraps(fn)
+    def measure_ms(*args, **kwargs):
+        start_time = time.perf_counter_ns()
+        result = fn(*args, **kwargs)
+        end_time = time.perf_counter_ns()
+        print(f"Elapsed time: {(end_time - start_time) * 1e-6:.4f} ms\n")
+        return result
+
+    return measure_ms
+
+
+def get_args():
+    parser = ArgumentParser()
+    parser.add_argument(
+        "-s",
+        "--seed",
+        type=int,
+        default=42,
+        help="Random seed",
+    )
+
+    parser.add_argument(
+        "-t",
+        "--steps",
+        type=int,
+        default=20,
+        help="Number of steps",
+    )
+
+    parser.add_argument(
+        "-p",
+        "--prompt",
+        type=str,
+        required=True,
+        help="Prompt",
+    )
+
+    parser.add_argument(
+        "-n",
+        "--negative-prompt",
+        type=str,
+        default="",
+        help="Negative prompt",
+    )
+
+    parser.add_argument(
+        "--scale",
+        type=float,
+        default=7.0,
+        help="Guidance scale",
+    )
+
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=str,
+        default=None,
+        help="Output name",
+    )
+    return parser.parse_args()
+
+
+class StableDiffusionMGX():
+    def __init__(self):
+        model_id = "stabilityai/stable-diffusion-2-1"
+        print(f"Using {model_id}")
+
+        print("Creating EulerDiscreteScheduler scheduler")
+        self.scheduler = EulerDiscreteScheduler.from_pretrained(
+            model_id, subfolder="scheduler")
+
+        print("Creating CLIPTokenizer tokenizer...")
+        self.tokenizer = CLIPTokenizer.from_pretrained(model_id,
+                                                       subfolder="tokenizer")
+
+        print("Load models...")
+        self.vae = StableDiffusionMGX.load_mgx_model(
+            "vae_decoder", {"latent_sample": [1, 4, 64, 64]})
+        self.text_encoder = StableDiffusionMGX.load_mgx_model(
+            "text_encoder", {"input_ids": [1, 77]})
+        self.unet = StableDiffusionMGX.load_mgx_model(
+            "unet", {
+                "sample": [1, 4, 64, 64],
+                "encoder_hidden_states": [1, 77, 1024],
+                "timestep": [1],
+            })
+
+    def run(self, prompt, negative_prompt, steps, seed, scale):
+        # need to set this for each run
+        self.scheduler.set_timesteps(steps)
+
+        print("Tokenizing prompt...")
+        text_input = self.tokenize(prompt)
+
+        print("Creating text embeddings for prompt...")
+        text_embeddings = self.get_embeddings(text_input)
+
+        print("Tokenizing negative prompt...")
+        uncond_input = self.tokenize(negative_prompt)
+
+        print("Creating text embeddings for negative prompt...")
+        uncond_embeddings = self.get_embeddings(uncond_input)
+
+        print(
+            f"Creating random input data ({1}x{4}x{64}x{64}) (latents) with seed={seed}..."
+        )
+        latents = torch.randn((1, 4, 64, 64),
+                              generator=torch.manual_seed(seed))
+
+        print("Apply initial noise sigma\n")
+        latents = latents * self.scheduler.init_noise_sigma
+
+        print("Running denoising loop...")
+        for step, t in enumerate(self.scheduler.timesteps):
+            print(f"#{step}/{len(self.scheduler.timesteps)} step")
+            latents = self.denoise_step(text_embeddings, uncond_embeddings,
+                                        latents, t, scale)
+
+        print("Scale denoised result...")
+        latents = 1 / 0.18215 * latents
+
+        print("Decode denoised result...")
+        image = self.decode(latents)
+
+        return image
+
+    @staticmethod
+    @measure
+    def load_mgx_model(name, shapes):
+        file = f"models/sd21-onnx/{name}/model"
+        print(f"Loading {name} model from {file}")
+        if os.path.isfile(f"{file}.mxr"):
+            print("Found mxr, loading it...")
+            model = mgx.load(f"{file}.mxr", format="msgpack")
+        elif os.path.isfile(f"{file}.onnx"):
+            print("Parsing from onnx file...")
+            model = mgx.parse_onnx(f"{file}.onnx", map_input_dims=shapes)
+            model.compile(mgx.get_target("gpu"))
+            print(f"Saving {name} model to mxr file...")
+            mgx.save(model, f"{file}.mxr", format="msgpack")
+        else:
+            print(f"No {name} model found. Please download it and re-try.")
+            os.exit(1)
+        return model
+
+    @measure
+    def tokenize(self, input):
+        return self.tokenizer([input],
+                              padding="max_length",
+                              max_length=self.tokenizer.model_max_length,
+                              truncation=True,
+                              return_tensors="np")
+
+    @measure
+    def get_embeddings(self, input):
+        return np.array(
+            self.text_encoder.run(
+                {"input_ids":
+                 input.input_ids.astype(np.int32)})[0]).astype(np.float32)
+
+    @staticmethod
+    def convert_to_rgb_image(image):
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = np.transpose(image, (0, 2, 3, 1))
+        images = (image * 255).round().astype("uint8")
+        return Image.fromarray(images[0])
+
+    @staticmethod
+    def save_image(pil_image, filename="output.png"):
+        pil_image.save(filename)
+
+    @measure
+    def denoise_step(self, text_embeddings, uncond_embeddings, latents, t,
+                     scale):
+        sample = self.scheduler.scale_model_input(latents,
+                                                  t).numpy().astype(np.float32)
+        timestep = np.atleast_1d(t.numpy().astype(
+            np.int64))  # convert 0D -> 1D
+
+        noise_pred_uncond = np.array(
+            self.unet.run({
+                "sample": sample,
+                "encoder_hidden_states": uncond_embeddings,
+                "timestep": timestep
+            })[0])
+
+        noise_pred_text = np.array(
+            self.unet.run({
+                "sample": sample,
+                "encoder_hidden_states": text_embeddings,
+                "timestep": timestep
+            })[0])
+
+        # perform guidance
+        noise_pred = noise_pred_uncond + scale * (noise_pred_text -
+                                                  noise_pred_uncond)
+
+        # compute the previous noisy sample x_t -> x_t-1
+        return self.scheduler.step(torch.from_numpy(noise_pred), t,
+                                   latents).prev_sample
+
+    @measure
+    def decode(self, latents):
+        return np.array(
+            self.vae.run({"latent_sample":
+                          latents.numpy().astype(np.float32)})[0])
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    sd = StableDiffusionMGX()
+    result = sd.run(args.prompt, args.negative_prompt, args.steps, args.seed,
+                    args.scale)
+
+    print("Convert result to rgb image...")
+    image = StableDiffusionMGX.convert_to_rgb_image(result)
+    filename = args.output if args.output else f"output_s{args.seed}_t{args.steps}.png"
+    StableDiffusionMGX.save_image(image, args.output)
+    print(f"Image saved to {filename}")
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,4 +29,4 @@ pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
 msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
 sqlite3@3.43.2 -DCMAKE_POSITION_INDEPENDENT_CODE=On
 ROCmSoftwarePlatform/composable_kernel@70eefcf4f263aa5c25f3c9ff0db8f6f199ef0fb9 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
-ROCmSoftwarePlatform/rocMLIR@13f6c2a69cfe80a575c6b241ec7353d1e953cb12 -DBUILD_FAT_LIBROCKCOMPILER=On
+ROCmSoftwarePlatform/rocMLIR@9e66e8050209f03349a41b6b497f0da2b285a53b -DBUILD_FAT_LIBROCKCOMPILER=On
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -28,7 +28,7 @@ include(ROCMInstallTargets)
 include(ROCMPackageConfigHelpers)
 include(RegisterOp)
 include(CheckCXXLinkerFlag)
- 
+include(CheckCXXSourceCompiles)

 add_library(migraphx
    adjust_allocation.cpp
@@ -105,6 +105,12 @@ add_library(migraphx
    value.cpp
    verify_args.cpp
 )
+
+if(WIN32)
+    # Due to compilation crashing, we need to use type-erased matchers on Windows.
+    # PUBLIC: the definition also applies to targets that link against migraphx.
+    target_compile_definitions(migraphx PUBLIC MIGRAPHX_USE_TYPE_ERASED_MATCHERS=1)
+endif()
+
 configure_file(version.h.in include/migraphx/version.h)
 rocm_set_soversion(migraphx ${MIGRAPHX_SO_VERSION})
 function(register_migraphx_ops)
@@ -216,6 +222,8 @@ register_migraphx_ops(
    scatternd_add
    scatternd_mul
    scatternd_none
+    scatternd_max
+    scatternd_min
    select_module
    sigmoid
    sign
@@ -234,6 +242,7 @@ register_migraphx_ops(
    transpose
    unary_not
    undefined
+    unique
    unknown
    unsqueeze
    where
@@ -248,17 +257,62 @@ rocm_install_targets(
    ${CMAKE_CURRENT_BINARY_DIR}/include
 )

-
-check_cxx_linker_flag(-lstdc++fs HAS_LIB_STD_FILESYSTEM)
-if(HAS_LIB_STD_FILESYSTEM)
-target_link_libraries(migraphx PRIVATE -lstdc++fs)
+if(NOT WIN32)
+    # Link libstdc++fs only when the linker accepts the flag.
+    check_cxx_linker_flag(-lstdc++fs HAS_LIB_STD_FILESYSTEM)
+    if(HAS_LIB_STD_FILESYSTEM)
+        target_link_libraries(migraphx PRIVATE -lstdc++fs)
+    endif()
+    # CMAKE_DL_LIBS names the platform's dynamic-loading library (it may be
+    # empty), so use it rather than a hard-coded -ldl.
+    target_link_libraries(migraphx PRIVATE ${CMAKE_DL_LIBS})
 endif()

-target_link_libraries(migraphx PRIVATE -ldl)
-
 target_include_directories(migraphx SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
 target_link_libraries(migraphx PUBLIC Threads::Threads)

+# check_execution_par(<RESULT> [libs...])
+# Sets <RESULT> in the caller's scope to whether a small program calling
+# std::sort with std::execution::par builds when linked against the optional
+# libraries given after <RESULT>.
+function(check_execution_par RESULT)
+    set(CMAKE_REQUIRED_LIBRARIES ${ARGN})
+    set(CMAKE_REQUIRED_FLAGS)
+    if(NOT MSVC)
+        set(CMAKE_REQUIRED_FLAGS "-std=c++17")
+    endif()
+    # check_cxx_source_compiles caches its result under the variable name, so
+    # key the name on the flags and libraries to keep each combination apart.
+    string(MD5 _flags_hash "${CMAKE_REQUIRED_FLAGS} ${CMAKE_REQUIRED_LIBRARIES}")
+    # std::sort is declared in <algorithm>; include it explicitly instead of
+    # relying on <execution> to pull it in transitively.
+    set(_source "
+#include <algorithm>
+#include <execution>
+
+int main() {
+    int* i = nullptr;
+    std::sort(std::execution::par, i, i);
+}
+")
+    check_cxx_source_compiles("${_source}" _has_execution_${_flags_hash})
+    set(${RESULT} ${_has_execution_${_flags_hash}} PARENT_SCOPE)
+endfunction()
+
+# The default stays Off unless one of the checks below passes.
+set(MIGRAPHX_HAS_EXECUTORS_DEFAULT Off)
+find_package(TBB QUIET)
+if(TBB_FOUND)
+    # TBB was found: run the check linked against TBB::tbb. If it fails, the
+    # default stays Off; the check without TBB is not attempted in this branch.
+    check_execution_par(TBB_HAS_EXECUTION_PAR TBB::tbb)
+    if(TBB_HAS_EXECUTION_PAR)
+        # Record TBB in PACKAGE_DEPENDS and link it PUBLIC so it propagates to
+        # targets that link migraphx.
+        list(APPEND PACKAGE_DEPENDS PACKAGE TBB)
+        target_link_libraries(migraphx PUBLIC TBB::tbb)
+        set(MIGRAPHX_HAS_EXECUTORS_DEFAULT On)
+        message(STATUS "Using TBB for parallel execution")
+    endif()
+else()
+    # TBB was not found: run the check without any extra libraries.
+    check_execution_par(HAS_EXECUTION_PAR)
+    if(HAS_EXECUTION_PAR)
+        set(MIGRAPHX_HAS_EXECUTORS_DEFAULT On)
+    endif()
+endif()
+
+# User-facing switch; its initial value is MIGRAPHX_HAS_EXECUTORS_DEFAULT.
+option(MIGRAPHX_HAS_EXECUTORS "C++ supports parallel executors" ${MIGRAPHX_HAS_EXECUTORS_DEFAULT})
+# The macro is always defined (as 1 or 0) and is PUBLIC, so targets linking
+# migraphx see the same value.
+if(MIGRAPHX_HAS_EXECUTORS)
+    message(STATUS "Parallel STL enabled")
+    target_compile_definitions(migraphx PUBLIC MIGRAPHX_HAS_EXECUTORS=1)
+else()
+    message(STATUS "Parallel STL disabled")
+    target_compile_definitions(migraphx PUBLIC MIGRAPHX_HAS_EXECUTORS=0)
+endif()
+
 find_package(nlohmann_json 3.8.0 REQUIRED)
 target_link_libraries(migraphx PRIVATE nlohmann_json::nlohmann_json)
 migraphx_generate_export_header(migraphx)
@@ -276,8 +330,6 @@ target_link_libraries(migraphx INTERFACE $<BUILD_INTERFACE:msgpackc-cxx>)

 add_library(migraphx_all_targets INTERFACE)

-set(PACKAGE_DEPENDS)
-
 add_subdirectory(api)
 add_subdirectory(driver)
 add_subdirectory(onnx)

--- a/src/driver/argument_parser.hpp
+++ b/src/driver/argument_parser.hpp
@@ -105,6 +105,8 @@ inline std::ostream& operator<<(std::ostream& os, const color& c)
    static const bool use_color = isatty(STDOUT_FILENO) != 0;
    if(use_color)
        return os << "\033[" << static_cast<std::size_t>(c) << "m";
+#else
+    (void)c;
 #endif
    return os;
 }

--- a/src/include/migraphx/bit_cast.hpp
+++ b/src/include/migraphx/bit_cast.hpp
@@ -21,10 +21,13 @@
 * ************************************************************************ */
 #ifndef MIGRAPHX_GUARD_RTGLIB_BITCAST_HPP
 #define MIGRAPHX_GUARD_RTGLIB_BITCAST_HPP
+#include <type_traits>
 #if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #endif
+
+#include <migraphx/requires.hpp>
 #include <migraphx/config.hpp>

 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
@@ -32,7 +35,10 @@

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-template <typename To, typename From>
+template <typename To,
+          typename From,
+          MIGRAPHX_REQUIRES(std::is_trivially_copyable<To>{} and
+                            std::is_trivially_copyable<From>{})>
 inline constexpr To bit_cast(From fr) noexcept
 {
    static_assert(sizeof(To) == sizeof(From));