Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into auto_contig_fix

c6ec6638 · Khalique Ahmed · b42c7b41 · a6d1540f · c6ec6638 · c6ec6638
Commit c6ec6638 authored Nov 29, 2023 by Khalique Ahmed
20 changed files
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -465,7 +465,7 @@ jobs:
    - name: Upload code coverage
      if: "matrix.configuration == 'codecov'"
      env:
-        CODECOV_TOKEN: "8545af1c-f90b-4345-92a5-0d075503ca56"
+        CODECOV_TOKEN: "f5d5a10b-3177-4c76-b25f-9b1c2f165e8b"
      run: |
        sudo apt-get install -y lcov
        cd build

--- a/.gitignore
+++ b/.gitignore
@@ -81,5 +81,7 @@ cmake-build*/
 build*/

 # Recommended location to install rbuild dependencies from README.md
-depend
+depend*/

+# local Python virtual environment
+.venv/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,9 +41,12 @@ if(NOT MIGRAPHX_GENERATOR_IS_MULTI_CONFIG)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES})
 endif()

-set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
+if(NOT WIN32)
+    set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
+    set(CMAKE_BUILD_RPATH "${CMAKE_BINARY_DIR}/lib")
+endif()

-set(CMAKE_BUILD_RPATH "${CMAKE_BINARY_DIR}/lib")
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm /opt/rocm/llvm $ENV{ROCM_PATH} $ENV{HIP_PATH})

 project(migraphx LANGUAGES C CXX)
 include(CTest)
@@ -57,6 +60,9 @@ else()
 option(MIGRAPHX_ENABLE_PYTHON "Enable python bindings" ON)
 endif()

+# By default build shared libraries
+option(BUILD_SHARED_LIBS "Create shared libraries" ON)
+
 if(WIN32) # CK is not yet ported to Windows
 option(MIGRAPHX_USE_COMPOSABLEKERNEL "Enable MIGraphX to use composable kernel JIT library" OFF)
 else()
@@ -67,7 +73,7 @@ find_path(HALF_INCLUDE_DIR half.hpp PATH_SUFFIXES half)
 if (NOT HALF_INCLUDE_DIR)
    message(FATAL_ERROR "Could not find half.hpp - Please check that the install path of half.hpp has been added to CMAKE_PREFIX_PATH")
 else()
-	message(STATUS "half.hpp is at ${HALF_INCLUDE_DIR}")
+    message(STATUS "half.hpp is at ${HALF_INCLUDE_DIR}")
 endif()

 include(CheckTypeSize)
@@ -102,13 +108,21 @@ set(MIGRAPHX_ENABLE_CPU Off CACHE BOOL "")
 # Disable fpga backend by default
 set(MIGRAPHX_ENABLE_FPGA Off CACHE BOOL "")

+if(WIN32)
+    add_compile_definitions("$<$<COMPILE_LANGUAGE:C,CXX>:_CRT_SECURE_NO_WARNINGS;_USE_MATH_DEFINES>")
+endif()
+
 set(CMAKE_CXX_STANDARD_DEFAULT "")
-add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-std=c++17>)
+if(MSVC)
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/std:c++17>)
+else()
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-std=c++17>)
+endif()

 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 include(EnableCompilerWarnings)
 include(ROCMClangTidy)
-if(CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")
+if(CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+.*")
    set(MIGRAPHX_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
 # Enable tidy on hip
 elseif(MIGRAPHX_ENABLE_GPU)

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -22,6 +22,8 @@ def rocmtestnode(Map conf) {
        def cmd = """
            ulimit -c unlimited
            echo "leak:dnnl::impl::malloc" > suppressions.txt
+            echo "leak:libtbb.so" >> suppressions.txt
+            cat suppressions.txt
            export LSAN_OPTIONS="suppressions=\$(pwd)/suppressions.txt"
            export MIGRAPHX_GPU_DEBUG=${gpu_debug}
            export CXX=${compiler}

--- a/cmake/Embed.cmake
+++ b/cmake/Embed.cmake
@@ -21,17 +21,25 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 #####################################################################################
-find_program(EMBED_LD ld)
-find_program(EMBED_OBJCOPY objcopy)

-option(EMBED_USE_LD "Use ld to embed data files" OFF)
+if(WIN32)
+    set(EMBED_USE RC CACHE STRING "Use RC or CArrays to embed data files")
+    set_property(CACHE EMBED_USE PROPERTY STRINGS "RC;CArrays")
+else()
+    set(EMBED_USE CArrays CACHE STRING "Use LD or CArrays to embed data files")
+    set_property(CACHE EMBED_USE PROPERTY STRINGS "LD;CArrays")
+endif()
+
+if(EMBED_USE STREQUAL "LD")
+    find_program(EMBED_LD ld REQUIRED)
+    find_program(EMBED_OBJCOPY objcopy REQUIRED)
+endif()

 function(wrap_string)
    set(options)
    set(oneValueArgs VARIABLE AT_COLUMN)
    set(multiValueArgs)
    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cmake_parse_arguments(WRAP_STRING "${options}" "${oneValueArgs}" "" ${ARGN})

    string(LENGTH ${${PARSE_VARIABLE}} string_length)
    math(EXPR offset "0")
@@ -54,112 +62,124 @@ function(wrap_string)
    set(${PARSE_VARIABLE} "${lines}" PARENT_SCOPE)
 endfunction()

-function(generate_embed_source EMBED_NAME)
+function(generate_embed_source EMBED_NAME EMBED_DIR BASE_DIRECTORY)
    set(options)
-    set(oneValueArgs SRC HEADER RELATIVE)
-    set(multiValueArgs OBJECTS SYMBOLS FILES)
-
+    set(oneValueArgs)
+    set(multiValueArgs SYMBOLS FILES)
    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

-    set(EXTERNS)
-    set(INIT_KERNELS)
-
-    list(LENGTH PARSE_SYMBOLS SYMBOLS_LEN)
-    list(LENGTH PARSE_OBJECTS OBJECTS_LEN)
-    if(NOT ${SYMBOLS_LEN} EQUAL ${OBJECTS_LEN})
-        message(FATAL_ERROR "Symbols and objects dont match: ${SYMBOLS_LEN} != ${OBJECTS_LEN}")
-    endif()
-    math(EXPR LEN "${SYMBOLS_LEN} - 1")
-
-    foreach(idx RANGE ${LEN})
-        list(GET PARSE_SYMBOLS ${idx} SYMBOL)
-        list(GET PARSE_OBJECTS ${idx} OBJECT)
-        list(GET PARSE_FILES ${idx} FILE)
-
-        set(START_SYMBOL "_binary_${SYMBOL}_start")
-        set(LENGTH_SYMBOL "_binary_${SYMBOL}_length")
-        if(EMBED_USE_LD)
-            string(APPEND EXTERNS "
+    set(RESOURCE_ID 100)
+    foreach(SYMBOL FILE IN ZIP_LISTS PARSE_SYMBOLS PARSE_FILES)
+        cmake_path(RELATIVE_PATH FILE BASE_DIRECTORY ${BASE_DIRECTORY} OUTPUT_VARIABLE BASE_NAME)
+        if(EMBED_USE STREQUAL "RC")
+            string(TOUPPER "${SYMBOL}" SYMBOL)
+            string(APPEND FILE_IDS "#define IDR_${SYMBOL} ${RESOURCE_ID}\n")
+            cmake_path(NATIVE_PATH FILE NORMALIZE NATIVE_FILE)
+            string(REPLACE "\\" "\\\\" NATIVE_FILE "${NATIVE_FILE}")
+            string(APPEND RC_FILE_MAPPING "IDR_${SYMBOL} TEXTFILE \"${NATIVE_FILE}\"\n")
+            string(APPEND INIT_KERNELS "\n        {\"${BASE_NAME}\", resource::read(IDR_${SYMBOL})},")
+            math(EXPR RESOURCE_ID "${RESOURCE_ID} + 1" OUTPUT_FORMAT DECIMAL)
+        else()
+            set(START_SYMBOL "_binary_${SYMBOL}_start")
+            set(LENGTH_SYMBOL "_binary_${SYMBOL}_length")
+            if(EMBED_USE STREQUAL "LD")
+                string(APPEND EXTERNS "
 extern const char ${START_SYMBOL}[];
 extern const size_t _binary_${SYMBOL}_size;
 const auto ${LENGTH_SYMBOL} = reinterpret_cast<size_t>(&_binary_${SYMBOL}_size);
-            ")
-        else()
-            string(APPEND EXTERNS "
+")
+            else()
+                string(APPEND EXTERNS "
 extern const char ${START_SYMBOL}[];
 extern const size_t ${LENGTH_SYMBOL};
-            ")
+")
+            endif()
+            string(APPEND INIT_KERNELS "
+        { \"${BASE_NAME}\", { ${START_SYMBOL}, ${LENGTH_SYMBOL}} },")
        endif()
+    endforeach()
+    if(EMBED_USE STREQUAL "RC")
+       file(WRITE "${EMBED_DIR}/include/resource.h" "
+#define TEXTFILE 256

-        if(PARSE_RELATIVE)
-            file(RELATIVE_PATH BASE_NAME ${PARSE_RELATIVE} "${FILE}")
-        else()
-            get_filename_component(BASE_NAME "${FILE}" NAME)
-        endif()
+${FILE_IDS}
+")
+        file(WRITE "${EMBED_DIR}/resource.rc" "
+#include \"resource.h\"

-        string(APPEND INIT_KERNELS "
-            { \"${BASE_NAME}\", { ${START_SYMBOL}, ${LENGTH_SYMBOL}} },")
-    endforeach()
+${RC_FILE_MAPPING}
+")
+        set(EXTERNS "
+#include <Windows.h>
+#include \"resource.h\"

-    file(WRITE "${PARSE_HEADER}" "
+namespace resource {
+std::string_view read(int id)
+{
+    HMODULE handle = GetModuleHandle(nullptr);
+    HRSRC rc = FindResource(handle, MAKEINTRESOURCE(id), MAKEINTRESOURCE(TEXTFILE));
+    HGLOBAL data = LoadResource(handle, rc);
+    return {static_cast<const char*>(LockResource(data)), SizeofResource(handle, rc)};
+}
+}
+")
+        set(EMBED_FILES ${EMBED_DIR}/include/resource.h ${EMBED_DIR}/resource.rc)
+    endif()
+    file(WRITE "${EMBED_DIR}/include/${EMBED_NAME}.hpp" "
 #include <string_view>
 #include <unordered_map>
 #include <utility>
 std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}();
 ")

-    file(WRITE "${PARSE_SRC}" "
+    file(WRITE "${EMBED_DIR}/${EMBED_NAME}.cpp" "
 #include <${EMBED_NAME}.hpp>
 ${EXTERNS}
 std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}()
 {
-    static std::unordered_map<std::string_view, std::string_view> result = {${INIT_KERNELS}};
+    static std::unordered_map<std::string_view, std::string_view> result = {${INIT_KERNELS}
+    };
    return result;
 }
 ")
+    list(APPEND EMBED_FILES ${EMBED_DIR}/${EMBED_NAME}.cpp ${EMBED_DIR}/include/${EMBED_NAME}.hpp)
+    set(EMBED_FILES ${EMBED_FILES} PARENT_SCOPE)
 endfunction()

-function(embed_file OUTPUT_FILE OUTPUT_SYMBOL FILE)
-    set(WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-    # Glob is used to compute the relative path
-    file(GLOB FILES RELATIVE ${WORKING_DIRECTORY} ${FILE})
-    foreach(REL_FILE ${FILES})
-        string(MAKE_C_IDENTIFIER "${REL_FILE}" SYMBOL)
-        get_filename_component(OUTPUT_FILE_DIR "${REL_FILE}" DIRECTORY)
-        file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_FILE_DIR}")
-        if(EMBED_USE_LD)
-            set(OUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.o")
-        else()
-            set(OUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.cpp")
-        endif()
-        set(${OUTPUT_SYMBOL} ${SYMBOL} PARENT_SCOPE)
-        set(${OUTPUT_FILE} "${OUT_FILE}" PARENT_SCOPE)
-        if(EMBED_USE_LD)
-            add_custom_command(
-                OUTPUT "${OUT_FILE}"
-                COMMAND ${EMBED_LD} -r -o "${OUT_FILE}" -z noexecstack --format=binary "${REL_FILE}" 
-                COMMAND ${EMBED_OBJCOPY} --rename-section .data=.rodata,alloc,load,readonly,data,contents "${OUT_FILE}"
-                WORKING_DIRECTORY ${WORKING_DIRECTORY}
-                DEPENDS ${FILE}
-                VERBATIM
-            )
-        else()
-            set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${FILE})
-            # reads source file contents as hex string
-            file(READ ${FILE} HEX_STRING HEX)
-            # wraps the hex string into multiple lines
-            wrap_string(VARIABLE HEX_STRING AT_COLUMN 80)
-            # adds '0x' prefix and comma suffix before and after every byte respectively
-            string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " ARRAY_VALUES ${HEX_STRING})
-            # removes trailing comma
-            string(REGEX REPLACE ", $" "" ARRAY_VALUES ${ARRAY_VALUES})
-            file(WRITE "${OUT_FILE}" "
+# embed_file(FILE BASE_DIRECTORY)
+#
+# Prepares a single data file for embedding according to EMBED_USE:
+#   LD      - adds a build rule that turns FILE into an object file with ld/objcopy.
+#   CArrays - writes, at configure time, a .cpp file holding FILE's bytes as a char array.
+# For any other value (e.g. RC) no per-file output is generated here.
+#
+# FILE           - path of the file to embed.
+# BASE_DIRECTORY - directory FILE is made relative to; the relative path determines
+#                  the output location and the generated symbol name.
+#
+# Results are returned to the caller's scope:
+#   OUTPUT_SYMBOL - C identifier derived from the relative path (always set).
+#   OUTPUT_FILE   - generated .o / .cpp path (only set for LD and CArrays).
+function(embed_file FILE BASE_DIRECTORY)
+    message(STATUS "    ${FILE}")
+    cmake_path(RELATIVE_PATH FILE BASE_DIRECTORY "${BASE_DIRECTORY}" OUTPUT_VARIABLE REL_FILE)
+    string(MAKE_C_IDENTIFIER "${REL_FILE}" OUTPUT_SYMBOL)
+    get_filename_component(OUTPUT_FILE_DIR "${REL_FILE}" DIRECTORY)
+    file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_FILE_DIR}")
+    if(EMBED_USE STREQUAL "LD")
+        set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.o")
+        # ld is run from BASE_DIRECTORY with the relative path as its input file.
+        add_custom_command(
+            OUTPUT "${OUTPUT_FILE}"
+            COMMAND ${EMBED_LD} -r -o "${OUTPUT_FILE}" -z noexecstack --format=binary "${REL_FILE}"
+            COMMAND ${EMBED_OBJCOPY} --rename-section .data=.rodata,alloc,load,readonly,data,contents "${OUTPUT_FILE}"
+            WORKING_DIRECTORY "${BASE_DIRECTORY}"
+            DEPENDS "${FILE}"
+            VERBATIM)
+        set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
+    elseif(EMBED_USE STREQUAL "CArrays")
+        set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.cpp")
+        # The array is generated at configure time, so re-run CMake whenever FILE changes;
+        # otherwise the generated source would keep the old contents.
+        set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${FILE}")
+        # reads source file contents as hex string
+        file(READ "${FILE}" HEX_STRING HEX)
+        # wraps the hex string into multiple lines
+        wrap_string(VARIABLE HEX_STRING AT_COLUMN 80)
+        # adds '0x' prefix and comma suffix before and after every byte respectively
+        string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " ARRAY_VALUES ${HEX_STRING})
+        # removes trailing comma
+        string(REGEX REPLACE ", $" "" ARRAY_VALUES ${ARRAY_VALUES})
+        file(WRITE "${OUTPUT_FILE}" "
 #include <cstddef>
-extern const char _binary_${SYMBOL}_start[] = { ${ARRAY_VALUES} };
-extern const size_t _binary_${SYMBOL}_length = sizeof(_binary_${SYMBOL}_start);
+extern const char _binary_${OUTPUT_SYMBOL}_start[] = { ${ARRAY_VALUES} };
+extern const size_t _binary_${OUTPUT_SYMBOL}_length = sizeof(_binary_${OUTPUT_SYMBOL}_start);
 ")
-        endif()
-    endforeach()
+        set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
+    endif()
+    set(OUTPUT_SYMBOL ${OUTPUT_SYMBOL} PARENT_SCOPE)
 endfunction()

 function(add_embed_library EMBED_NAME)
@@ -168,35 +188,32 @@ function(add_embed_library EMBED_NAME)
    set(multiValueArgs)
    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

-    file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/embed)
-    file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/embed/${EMBED_NAME})
    set(EMBED_DIR ${CMAKE_CURRENT_BINARY_DIR}/embed/${EMBED_NAME})
-    set(SRC_FILE "${EMBED_DIR}/${EMBED_NAME}.cpp")
-    set(HEADER_FILE "${EMBED_DIR}/include/${EMBED_NAME}.hpp")
-    set(WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    set(OUTPUT_FILES)
-    set(SYMBOLS)
-    message(STATUS "Embedding files")
+    file(MAKE_DIRECTORY ${EMBED_DIR})
+    message(STATUS "Embedding kernel files:")
    foreach(FILE ${PARSE_UNPARSED_ARGUMENTS})
-        embed_file(OUTPUT_FILE OUTPUT_SYMBOL ${FILE})
+        embed_file(${FILE} ${PARSE_RELATIVE})
        list(APPEND OUTPUT_FILES ${OUTPUT_FILE})
        list(APPEND SYMBOLS ${OUTPUT_SYMBOL})
    endforeach()
-    message(STATUS "Generating embedding library ${EMBED_NAME}")
-    generate_embed_source(${EMBED_NAME} SRC ${SRC_FILE} HEADER ${HEADER_FILE} OBJECTS ${OUTPUT_FILES} SYMBOLS ${SYMBOLS} RELATIVE ${PARSE_RELATIVE} FILES ${PARSE_UNPARSED_ARGUMENTS})
-    
+    message(STATUS "Generating embedding library '${EMBED_NAME}'")
+    generate_embed_source(${EMBED_NAME} ${EMBED_DIR} "${PARSE_RELATIVE}" SYMBOLS ${SYMBOLS} FILES ${PARSE_UNPARSED_ARGUMENTS})
    set(INTERNAL_EMBED_LIB embed_lib_${EMBED_NAME})
-    add_library(${INTERNAL_EMBED_LIB} OBJECT "${SRC_FILE}")
+    add_library(${INTERNAL_EMBED_LIB} OBJECT ${EMBED_FILES})
+    if(EMBED_USE STREQUAL "CArrays")
+        target_sources(${INTERNAL_EMBED_LIB} PRIVATE ${OUTPUT_FILES})
+    endif()
    target_include_directories(${INTERNAL_EMBED_LIB} PRIVATE "${EMBED_DIR}/include")
    target_compile_options(${INTERNAL_EMBED_LIB} PRIVATE -Wno-reserved-identifier -Wno-extern-initializer -Wno-missing-variable-declarations)
    set_target_properties(${INTERNAL_EMBED_LIB} PROPERTIES POSITION_INDEPENDENT_CODE On)
-    
    add_library(${EMBED_NAME} INTERFACE)
-    if(EMBED_USE_LD)
+    if(EMBED_USE STREQUAL "LD")
        target_sources(${EMBED_NAME} INTERFACE ${OUTPUT_FILES})
-    else()
-        target_sources(${INTERNAL_EMBED_LIB} PRIVATE ${OUTPUT_FILES})
+    endif()
+    if(EMBED_USE STREQUAL "RC")
+        target_link_libraries(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
    endif()
    target_sources(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
    target_include_directories(${EMBED_NAME} INTERFACE "${EMBED_DIR}/include")
 endfunction()
+
--- a/docs/.sphinx/requirements.txt
+++ b/docs/.sphinx/requirements.txt
@@ -21,7 +21,7 @@ charset-normalizer==3.1.0
    # via requests
 click==8.1.3
    # via sphinx-external-toc
-cryptography==41.0.4
+cryptography==41.0.6
    # via pyjwt
 deprecated==1.2.13
    # via pygithub
@@ -89,7 +89,7 @@ requests==2.28.2
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==0.27.0
+rocm-docs-core==0.29.0
    # via -r requirements.in
 smmap==5.0.0
    # via gitdb

--- a/examples/README.md
+++ b/examples/README.md
@@ -6,4 +6,5 @@ This directory contains examples of common use cases for MIGraphX.
 ## Examples:
 - [MIGraphX usage and utilities](./migraphx)
 - [Vision inference examples](./vision)
- [Natural language inference examples](./nlp)
\ No newline at end of file
+- [Natural language inference examples](./nlp)
+- [Diffusion inference examples](./diffusion)
--- a/examples/diffusion/README.md
+++ b/examples/diffusion/README.md
+# Diffusion Inference Examples
+
+- [Python Stable Diffusion 2.1](./python_stable_diffusion_21)
--- a/examples/diffusion/python_stable_diffusion_21/README.md
+++ b/examples/diffusion/python_stable_diffusion_21/README.md
+# Stable Diffusion 2.1
+
+This example was tested with the [ROCm 5.7](https://github.com/ROCmSoftwarePlatform/AMDMIGraphX/tree/rocm-5.7.0) release.
+
+## Jupyter notebook
+
+There is a dedicated step-by-step notebook. See [sd21.ipynb](./sd21.ipynb)
+
+## Console application
+
+To run the console application, follow the steps below.
+
+Setup python environment
+
+```bash
+# this requires the Python venv module to be installed (e.g. apt install python3.8-venv)
+python3 -m venv sd_venv
+. sd_venv/bin/activate
+```
+
+Install dependencies
+
+```bash
+pip install -r requirements.txt
+```
+
+Use MIGraphX Python Module
+
+```bash
+export PYTHONPATH=/opt/rocm/lib:$PYTHONPATH
+```
+
+Get models with optimum
+
+```bash
+optimum-cli export onnx --model stabilityai/stable-diffusion-2-1 models/sd21-onnx
+```
+*Note: `models/sd21-onnx` will be used in the scripts.*
+
+Run the text-to-image script with the following example prompt and seed:
+
+```bash
+python txt2img.py --prompt "a photograph of an astronaut riding a horse" --seed 13 --output astro_horse.jpg
+```
+*Note: The first run will compile the models and cache them to make subsequent runs faster.*
+
+The result should look like this:
+
+![example_output.jpg](./example_output.jpg)
+
+## Gradio application
+
+Note: this requires the setup from the `Console application` section above.
+
+Install gradio dependencies
+
+```bash
+pip install -r gradio_requirements.txt
+```
+
+Usage
+
+```bash
+python gradio_app.py
+```
+
+This will load the models (which can take several minutes) and, once the setup is ready, start a server on `http://127.0.0.1:7860`.
--- a/examples/diffusion/python_stable_diffusion_21/example_output.jpg
+++ b/examples/diffusion/python_stable_diffusion_21/example_output.jpg
--- a/examples/diffusion/python_stable_diffusion_21/gradio_app.py
+++ b/examples/diffusion/python_stable_diffusion_21/gradio_app.py
+#####################################################################################
+# The MIT License (MIT)
+#
+# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#####################################################################################
+
+from txt2img import StableDiffusionMGX
+import gradio as gr
+
+
+def main():
+    """Build the Gradio interface for the text-to-image demo and launch it."""
+    # Note: constructing the pipeline loads the models, which can take several minutes
+    pipeline = StableDiffusionMGX()
+
+    def gr_wrapper(prompt, negative_prompt, steps, seed, scale):
+        # Coerce the raw widget values to the types passed on to the pipeline.
+        output = pipeline.run(
+            str(prompt),
+            str(negative_prompt),
+            int(steps),
+            int(seed),
+            float(scale),
+        )
+        return StableDiffusionMGX.convert_to_rgb_image(output)
+
+    # Input widgets, listed in the same order as gr_wrapper's parameters.
+    input_widgets = [
+        gr.Textbox(value="a photograph of an astronaut riding a horse",
+                   label="Prompt"),
+        gr.Textbox(value="", label="Negative prompt (Optional)"),
+        gr.Slider(1, 100, step=1, value=20, label="Number of steps"),
+        gr.Textbox(value=13, label="Random seed"),
+        gr.Slider(1, 20, step=0.1, value=7.0, label="Guidance scale"),
+    ]
+
+    app = gr.Interface(gr_wrapper, input_widgets, "image")
+    app.launch()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/diffusion/python_stable_diffusion_21/gradio_requirements.txt
+++ b/examples/diffusion/python_stable_diffusion_21/gradio_requirements.txt
+#####################################################################################
+# The MIT License (MIT)
+#
+# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#####################################################################################
+# Also install the console application's dependencies
+# ("-r" includes another requirements file; "-f" would only add a find-links location).
+-r requirements.txt
+gradio
\ No newline at end of file
--- a/examples/diffusion/python_stable_diffusion_21/requirements.txt
+++ b/examples/diffusion/python_stable_diffusion_21/requirements.txt
+#####################################################################################
+# The MIT License (MIT)
+#
+# Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#####################################################################################
+accelerate
+diffusers
+optimum[onnxruntime]
+transformers
\ No newline at end of file
--- a/examples/diffusion/python_stable_diffusion_21/sd21.ipynb
+++ b/examples/diffusion/python_stable_diffusion_21/sd21.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#  The MIT License (MIT)\n",
+    "#\n",
+    "#  Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.\n",
+    "#\n",
+    "#  Permission is hereby granted, free of charge, to any person obtaining a copy\n",
+    "#  of this software and associated documentation files (the 'Software'), to deal\n",
+    "#  in the Software without restriction, including without limitation the rights\n",
+    "#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\n",
+    "#  copies of the Software, and to permit persons to whom the Software is\n",
+    "#  furnished to do so, subject to the following conditions:\n",
+    "#\n",
+    "#  The above copyright notice and this permission notice shall be included in\n",
+    "#  all copies or substantial portions of the Software.\n",
+    "#\n",
+    "#  THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n",
+    "#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n",
+    "#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE\n",
+    "#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n",
+    "#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\n",
+    "#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN\n",
+    "#  THE SOFTWARE."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Stable Diffusion 2.1\n",
+    "\n",
+    "The following example will show how to run `Stable Diffusion 2.1` with `MIGraphX`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install the required dependencies."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install dependencies\n",
+    "!pip install optimum[onnxruntime] transformers diffusers accelerate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We will use optimum to generate the onnx files."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# export models\n",
+    "!optimum-cli export onnx --model stabilityai/stable-diffusion-2-1 models/sd21-onnx"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now it is time to load these models with python.\n",
+    "\n",
+    "First, we make sure that MIGraphX module is found in the python path."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "mgx_lib_path = \"/opt/rocm/lib/\" # or \"/code/AMDMIGraphX/build/lib/\"\n",
+    "if mgx_lib_path not in sys.path:\n",
+    "    sys.path.append(mgx_lib_path)\n",
+    "import migraphx as mgx"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, a helper method to load and cache the models.\n",
+    "\n",
+    "This will use the `models/sd21-onnx` path. If you changed it, make sure to update here as well."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "\n",
+    "# Helper for model loading.\n",
+    "#   name:   sub-directory of models/sd21-onnx that holds the model files\n",
+    "#   shapes: input name -> dimensions, passed to the ONNX parser as map_input_dims\n",
+    "# Returns the loaded MIGraphX program. A cached model.mxr is preferred; otherwise\n",
+    "# model.onnx is parsed, compiled for the gpu target and saved as model.mxr.\n",
+    "def load_mgx_model(name, shapes):\n",
+    "    file = f\"models/sd21-onnx/{name}/model\"\n",
+    "    print(f\"Loading {name} model from {file}\")\n",
+    "    if os.path.isfile(f\"{file}.mxr\"):\n",
+    "        print(f\"Found mxr, loading it...\")\n",
+    "        model = mgx.load(f\"{file}.mxr\", format=\"msgpack\")\n",
+    "    elif os.path.isfile(f\"{file}.onnx\"):\n",
+    "        print(f\"Parsing from onnx file...\")\n",
+    "        model = mgx.parse_onnx(f\"{file}.onnx\", map_input_dims=shapes)\n",
+    "        model.compile(mgx.get_target(\"gpu\"))\n",
+    "        print(f\"Saving {name} model to mxr file...\")\n",
+    "        mgx.save(model, f\"{file}.mxr\", format=\"msgpack\")\n",
+    "    else:\n",
+    "        print(f\"No {name} model found. Please verify the path is correct and re-try, or re-download model.\")\n",
+    "        # the os module has no exit(); sys.exit() is the call that stops execution here\n",
+    "        sys.exit(1)\n",
+    "    return model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "With that, we can load the models. This could take several minutes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "text_encoder = load_mgx_model(\"text_encoder\", {\"input_ids\": [1, 77]})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "unet = load_mgx_model(\n",
+    "        \"unet\", {\n",
+    "            \"sample\": [1, 4, 64, 64],\n",
+    "            \"encoder_hidden_states\": [1, 77, 1024],\n",
+    "            \"timestep\": [1],\n",
+    "        })"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "vae = load_mgx_model(\"vae_decoder\", {\"latent_sample\": [1, 4, 64, 64]})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Import the remaining packages."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from diffusers import EulerDiscreteScheduler\n",
+    "from transformers import CLIPTokenizer\n",
+    "import torch\n",
+    "import numpy as np\n",
+    "from tqdm.auto import tqdm\n",
+    "from PIL import Image"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Time to load the scheduler and tokenizer from the original source."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_id = \"stabilityai/stable-diffusion-2-1\"\n",
+    "scheduler = EulerDiscreteScheduler.from_pretrained(model_id,\n",
+    "                                                   subfolder=\"scheduler\")\n",
+    "tokenizer = CLIPTokenizer.from_pretrained(model_id, subfolder=\"tokenizer\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, we will define all the steps one by one, to make the last step short and simple.\n",
+    "\n",
+    "The first step will be to tokenize the user prompt. It will make a `(1, 77)` shaped `input_ids`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def tokenize(input):\n",
+    "    return tokenizer([input],\n",
+    "                     padding=\"max_length\",\n",
+    "                     max_length=tokenizer.model_max_length,\n",
+    "                     truncation=True,\n",
+    "                     return_tensors=\"np\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional\n",
+    "test_tk = tokenize(\"test tokenizer to see the tokens\")\n",
+    "test_tk.input_ids.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We run the tokenized prompt through the `Text Encoder` model. It expects the `(1, 77)` data as `int32`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional\n",
+    "text_encoder.get_parameter_shapes()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_embeddings(input):\n",
+    "    return np.array(\n",
+    "        text_encoder.run({\"input_ids\": input.input_ids.astype(np.int32)\n",
+    "                          })[0]).astype(np.float32)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional\n",
+    "test_emb = get_embeddings(tokenize(\"test tokenizer to see the tokens\"))\n",
+    "test_emb.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The other input of the model is the latent representation (pure noise). It will be transformed into a 512x512 image later.\n",
+    "The last input will be the timestep."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def generate_latents(seed):\n",
+    "    return torch.randn(\n",
+    "        (1, 4, 64, 64),\n",
+    "        generator=torch.manual_seed(seed),\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional\n",
+    "test_latents = generate_latents(42)\n",
+    "test_latents.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we add two helpers to access and convert from torch to numpy with the proper datatype."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_scaled_sample(latents, t):\n",
+    "    return scheduler.scale_model_input(latents, t).numpy().astype(np.float32)\n",
+    "\n",
+    "\n",
+    "def get_timestep(t):\n",
+    "    return np.atleast_1d(t.numpy().astype(np.int64))  # convert 0D -> 1D"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The UNet model will be run in a loop. It will predict the noise residual."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional\n",
+    "unet.get_parameter_shapes()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def denoise(sample, embeddings, timestep):\n",
+    "    return np.array(\n",
+    "        unet.run({\n",
+    "            \"sample\": sample,\n",
+    "            \"encoder_hidden_states\": embeddings,\n",
+    "            \"timestep\": timestep\n",
+    "        })[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Helpers to perform classifier-free guidance and compute the previous noisy sample."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def perform_guidance(noise_pred_uncond, noise_pred_text, scale):\n",
+    "    return noise_pred_uncond + scale * (noise_pred_text - noise_pred_uncond)\n",
+    "\n",
+    "def compute_previous(noise_pred, t, latents):\n",
+    "    # compute the previous noisy sample x_t -> x_t-1\n",
+    "    return scheduler.step(noise_pred, t, latents).prev_sample\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Scale and decode the image latents with VAE."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def scale_denoised(latents):\n",
+    "    return 1 / 0.18215 * latents\n",
+    "\n",
+    "\n",
+    "def decode(latents):\n",
+    "    return np.array(\n",
+    "        vae.run({\"latent_sample\": latents.numpy().astype(np.float32)})[0])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And lastly, we need to convert it to an image to display or save."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def convert_to_rgb_image(image):\n",
+    "    image = np.clip(image / 2 + 0.5, 0, 1)\n",
+    "    image = np.transpose(image, (0, 2, 3, 1))\n",
+    "    images = (image * 255).round().astype(\"uint8\")\n",
+    "    return Image.fromarray(images[0])\n",
+    "\n",
+    "def save_image(pil_image, filename=\"output.png\"):\n",
+    "    pil_image.save(filename, format=\"png\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Feel free to play around with these params."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = \"a photograph of an astronaut riding a horse\"\n",
+    "negative_prompt = \"\"\n",
+    "steps = 20\n",
+    "seed = 13\n",
+    "scale = 7.0"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And now, to put everything together and run the whole pipeline:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scheduler.set_timesteps(steps)\n",
+    "\n",
+    "text_input, uncond_input = tokenize(prompt), tokenize(negative_prompt)\n",
+    "text_embeddings, uncond_embeddings = get_embeddings(\n",
+    "    text_input), get_embeddings(uncond_input)\n",
+    "latents = generate_latents(seed) * scheduler.init_noise_sigma\n",
+    "\n",
+    "for t in tqdm(scheduler.timesteps):\n",
+    "    sample = get_scaled_sample(latents, t)\n",
+    "    timestep = get_timestep(t)\n",
+    "\n",
+    "    noise_pred_uncond = denoise(sample, uncond_embeddings, timestep)\n",
+    "    noise_pred_text = denoise(sample, text_embeddings, timestep)\n",
+    "\n",
+    "    noise_pred = perform_guidance(noise_pred_uncond, noise_pred_text, scale)\n",
+    "    latents = compute_previous(torch.from_numpy(noise_pred), t, latents)\n",
+    "\n",
+    "latents = scale_denoised(latents)\n",
+    "result = decode(latents)\n",
+    "image = convert_to_rgb_image(result)\n",
+    "\n",
+    "# show the image\n",
+    "image"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you like the generated image, save it with the following:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "save_image(image, \"output.png\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "sd_venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/examples/diffusion/python_stable_diffusion_21/txt2img.py
+++ b/examples/diffusion/python_stable_diffusion_21/txt2img.py
+#  The MIT License (MIT)
+#
+#  Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a copy
+#  of this software and associated documentation files (the 'Software'), to deal
+#  in the Software without restriction, including without limitation the rights
+#  to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#  copies of the Software, and to permit persons to whom the Software is
+#  furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included in
+#  all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#  THE SOFTWARE.
+
+from argparse import ArgumentParser
+from diffusers import EulerDiscreteScheduler
+from transformers import CLIPTokenizer
+from PIL import Image
+
+import migraphx as mgx
+import numpy as np
+import os
+import torch
+import time
+from functools import wraps
+
+
+# measurement helper
+def measure(fn):
+    @wraps(fn)
+    def measure_ms(*args, **kwargs):
+        start_time = time.perf_counter_ns()
+        result = fn(*args, **kwargs)
+        end_time = time.perf_counter_ns()
+        print(f"Elapsed time: {(end_time - start_time) * 1e-6:.4f} ms\n")
+        return result
+
+    return measure_ms
+
+
+def get_args():
+    parser = ArgumentParser()
+    parser.add_argument(
+        "-s",
+        "--seed",
+        type=int,
+        default=42,
+        help="Random seed",
+    )
+
+    parser.add_argument(
+        "-t",
+        "--steps",
+        type=int,
+        default=20,
+        help="Number of steps",
+    )
+
+    parser.add_argument(
+        "-p",
+        "--prompt",
+        type=str,
+        required=True,
+        help="Prompt",
+    )
+
+    parser.add_argument(
+        "-n",
+        "--negative-prompt",
+        type=str,
+        default="",
+        help="Negative prompt",
+    )
+
+    parser.add_argument(
+        "--scale",
+        type=float,
+        default=7.0,
+        help="Guidance scale",
+    )
+
+    parser.add_argument(
+        "-o",
+        "--output",
+        type=str,
+        default=None,
+        help="Output name",
+    )
+    return parser.parse_args()
+
+
+class StableDiffusionMGX():
+    def __init__(self):
+        model_id = "stabilityai/stable-diffusion-2-1"
+        print(f"Using {model_id}")
+
+        print("Creating EulerDiscreteScheduler scheduler")
+        self.scheduler = EulerDiscreteScheduler.from_pretrained(
+            model_id, subfolder="scheduler")
+
+        print("Creating CLIPTokenizer tokenizer...")
+        self.tokenizer = CLIPTokenizer.from_pretrained(model_id,
+                                                       subfolder="tokenizer")
+
+        print("Load models...")
+        self.vae = StableDiffusionMGX.load_mgx_model(
+            "vae_decoder", {"latent_sample": [1, 4, 64, 64]})
+        self.text_encoder = StableDiffusionMGX.load_mgx_model(
+            "text_encoder", {"input_ids": [1, 77]})
+        self.unet = StableDiffusionMGX.load_mgx_model(
+            "unet", {
+                "sample": [1, 4, 64, 64],
+                "encoder_hidden_states": [1, 77, 1024],
+                "timestep": [1],
+            })
+
+    def run(self, prompt, negative_prompt, steps, seed, scale):
+        # need to set this for each run
+        self.scheduler.set_timesteps(steps)
+
+        print("Tokenizing prompt...")
+        text_input = self.tokenize(prompt)
+
+        print("Creating text embeddings for prompt...")
+        text_embeddings = self.get_embeddings(text_input)
+
+        print("Tokenizing negative prompt...")
+        uncond_input = self.tokenize(negative_prompt)
+
+        print("Creating text embeddings for negative prompt...")
+        uncond_embeddings = self.get_embeddings(uncond_input)
+
+        print(
+            f"Creating random input data ({1}x{4}x{64}x{64}) (latents) with seed={seed}..."
+        )
+        latents = torch.randn((1, 4, 64, 64),
+                              generator=torch.manual_seed(seed))
+
+        print("Apply initial noise sigma\n")
+        latents = latents * self.scheduler.init_noise_sigma
+
+        print("Running denoising loop...")
+        for step, t in enumerate(self.scheduler.timesteps):
+            print(f"#{step}/{len(self.scheduler.timesteps)} step")
+            latents = self.denoise_step(text_embeddings, uncond_embeddings,
+                                        latents, t, scale)
+
+        print("Scale denoised result...")
+        latents = 1 / 0.18215 * latents
+
+        print("Decode denoised result...")
+        image = self.decode(latents)
+
+        return image
+
+    @staticmethod
+    @measure
+    def load_mgx_model(name, shapes):
+        file = f"models/sd21-onnx/{name}/model"
+        print(f"Loading {name} model from {file}")
+        if os.path.isfile(f"{file}.mxr"):
+            print("Found mxr, loading it...")
+            model = mgx.load(f"{file}.mxr", format="msgpack")
+        elif os.path.isfile(f"{file}.onnx"):
+            print("Parsing from onnx file...")
+            model = mgx.parse_onnx(f"{file}.onnx", map_input_dims=shapes)
+            model.compile(mgx.get_target("gpu"))
+            print(f"Saving {name} model to mxr file...")
+            mgx.save(model, f"{file}.mxr", format="msgpack")
+        else:
+            print(f"No {name} model found. Please download it and re-try.")
+            os.exit(1)
+        return model
+
+    @measure
+    def tokenize(self, input):
+        return self.tokenizer([input],
+                              padding="max_length",
+                              max_length=self.tokenizer.model_max_length,
+                              truncation=True,
+                              return_tensors="np")
+
+    @measure
+    def get_embeddings(self, input):
+        return np.array(
+            self.text_encoder.run(
+                {"input_ids":
+                 input.input_ids.astype(np.int32)})[0]).astype(np.float32)
+
+    @staticmethod
+    def convert_to_rgb_image(image):
+        image = np.clip(image / 2 + 0.5, 0, 1)
+        image = np.transpose(image, (0, 2, 3, 1))
+        images = (image * 255).round().astype("uint8")
+        return Image.fromarray(images[0])
+
+    @staticmethod
+    def save_image(pil_image, filename="output.png"):
+        pil_image.save(filename)
+
+    @measure
+    def denoise_step(self, text_embeddings, uncond_embeddings, latents, t,
+                     scale):
+        sample = self.scheduler.scale_model_input(latents,
+                                                  t).numpy().astype(np.float32)
+        timestep = np.atleast_1d(t.numpy().astype(
+            np.int64))  # convert 0D -> 1D
+
+        noise_pred_uncond = np.array(
+            self.unet.run({
+                "sample": sample,
+                "encoder_hidden_states": uncond_embeddings,
+                "timestep": timestep
+            })[0])
+
+        noise_pred_text = np.array(
+            self.unet.run({
+                "sample": sample,
+                "encoder_hidden_states": text_embeddings,
+                "timestep": timestep
+            })[0])
+
+        # perform guidance
+        noise_pred = noise_pred_uncond + scale * (noise_pred_text -
+                                                  noise_pred_uncond)
+
+        # compute the previous noisy sample x_t -> x_t-1
+        return self.scheduler.step(torch.from_numpy(noise_pred), t,
+                                   latents).prev_sample
+
+    @measure
+    def decode(self, latents):
+        return np.array(
+            self.vae.run({"latent_sample":
+                          latents.numpy().astype(np.float32)})[0])
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    sd = StableDiffusionMGX()
+    result = sd.run(args.prompt, args.negative_prompt, args.steps, args.seed,
+                    args.scale)
+
+    print("Convert result to rgb image...")
+    image = StableDiffusionMGX.convert_to_rgb_image(result)
+    filename = args.output if args.output else f"output_s{args.seed}_t{args.steps}.png"
+    StableDiffusionMGX.save_image(image, args.output)
+    print(f"Image saved to {filename}")
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -28,9 +28,9 @@ include(ROCMInstallTargets)
 include(ROCMPackageConfigHelpers)
 include(RegisterOp)
 include(CheckCXXLinkerFlag)
- 
+include(CheckCXXSourceCompiles)

-add_library(migraphx 
+add_library(migraphx
    adjust_allocation.cpp
    analyze_streams.cpp
    apply_alpha_beta.cpp
@@ -104,6 +104,12 @@ add_library(migraphx
    value.cpp
    verify_args.cpp
 )
+
+if(WIN32)
+    # Due to compilation crashing, we need to use type-erased matchers on Windows.
+    target_compile_definitions(migraphx PUBLIC MIGRAPHX_USE_TYPE_ERASED_MATCHERS=1)
+endif()
+
 configure_file(version.h.in include/migraphx/version.h)
 rocm_set_soversion(migraphx ${MIGRAPHX_SO_VERSION})
 function(register_migraphx_ops)
@@ -215,6 +221,8 @@ register_migraphx_ops(
    scatternd_add
    scatternd_mul
    scatternd_none
+    scatternd_max
+    scatternd_min
    select_module
    sigmoid
    sign
@@ -247,17 +255,61 @@ rocm_install_targets(
    ${CMAKE_CURRENT_BINARY_DIR}/include
 )

-
-check_cxx_linker_flag(-lstdc++fs HAS_LIB_STD_FILESYSTEM)
-if(HAS_LIB_STD_FILESYSTEM)
-target_link_libraries(migraphx PRIVATE -lstdc++fs)
+if(NOT WIN32)
+    check_cxx_linker_flag(-lstdc++fs HAS_LIB_STD_FILESYSTEM)
+    if(HAS_LIB_STD_FILESYSTEM)
+        target_link_libraries(migraphx PRIVATE -lstdc++fs)
+    endif()
+    target_link_libraries(migraphx PRIVATE -ldl)
 endif()

-target_link_libraries(migraphx PRIVATE -ldl)
-
 target_include_directories(migraphx SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
 target_link_libraries(migraphx PUBLIC Threads::Threads)

+# Probe whether a program calling std::sort with std::execution::par builds.
+#
+#   RESULT - name of the variable set in the caller's scope to the probe result
+#   ARGN   - optional libraries the probe is linked against (e.g. TBB::tbb)
+#
+# The probe result variable is named after an MD5 hash of the required flags
+# and libraries, so each distinct combination gets its own result.
+function(check_execution_par RESULT)
+    set(CMAKE_REQUIRED_LIBRARIES ${ARGN})
+    set(CMAKE_REQUIRED_FLAGS)
+    if(NOT MSVC)
+        set(CMAKE_REQUIRED_FLAGS "-std=c++17")
+    endif()
+    string(MD5 _flags_hash "${CMAKE_REQUIRED_FLAGS} ${CMAKE_REQUIRED_LIBRARIES}")
+    # std::sort is declared in <algorithm>; <execution> only supplies the
+    # policies, so include both to keep the probe from failing spuriously.
+    set(_source "
+#include <algorithm>
+#include <execution>
+
+int main() {
+    int* i = nullptr;
+    std::sort(std::execution::par, i, i);
+}
+")
+    check_cxx_source_compiles("${_source}" _has_execution_${_flags_hash})
+    set(${RESULT} ${_has_execution_${_flags_hash}} PARENT_SCOPE)
+endfunction()
+
+set(MIGRAPHX_HAS_EXECUTORS_DEFAULT Off)
+find_package(TBB QUIET)
+if(TBB_FOUND)
+    check_execution_par(TBB_HAS_EXECUTION_PAR TBB::tbb)
+    if(TBB_HAS_EXECUTION_PAR)
+        target_link_libraries(migraphx PUBLIC TBB::tbb)
+        set(MIGRAPHX_HAS_EXECUTORS_DEFAULT On)
+        message(STATUS "Using TBB for parallel execution")
+    endif()
+else()
+    check_execution_par(HAS_EXECUTION_PAR)
+    if(HAS_EXECUTION_PAR)
+        set(MIGRAPHX_HAS_EXECUTORS_DEFAULT On)
+    endif()
+endif()
+
+# User-overridable switch; its default is MIGRAPHX_HAS_EXECUTORS_DEFAULT.
+# The choice is exported to consumers as the MIGRAPHX_HAS_EXECUTORS define.
+option(MIGRAPHX_HAS_EXECUTORS "C++ supports parallel executors" ${MIGRAPHX_HAS_EXECUTORS_DEFAULT})
+if(MIGRAPHX_HAS_EXECUTORS)
+    message(STATUS "Parallel STL enabled")
+    target_compile_definitions(migraphx PUBLIC MIGRAPHX_HAS_EXECUTORS=1)
+else()
+    message(STATUS "Parallel STL disabled")
+    target_compile_definitions(migraphx PUBLIC MIGRAPHX_HAS_EXECUTORS=0)
+endif()
+
 find_package(nlohmann_json 3.8.0 REQUIRED)
 target_link_libraries(migraphx PRIVATE nlohmann_json::nlohmann_json)
 migraphx_generate_export_header(migraphx)
@@ -275,8 +327,6 @@ target_link_libraries(migraphx INTERFACE $<BUILD_INTERFACE:msgpackc-cxx>)

 add_library(migraphx_all_targets INTERFACE)

-set(PACKAGE_DEPENDS)
-
 add_subdirectory(api)
 add_subdirectory(driver)
 add_subdirectory(onnx)

--- a/src/api/include/migraphx/migraphx.h
+++ b/src/api/include/migraphx/migraphx.h
@@ -44,7 +44,8 @@
    m(int32_type, int32_t) \
    m(int64_type, int64_t) \
    m(uint32_type, uint32_t) \
-    m(uint64_type, uint64_t)
+    m(uint64_type, uint64_t) \
+    m(fp8e4m3fnuz_type, migraphx::fp8::fp8e4m3fnuz)
 // clang-format on

 #ifdef __cplusplus

--- a/src/driver/argument_parser.hpp
+++ b/src/driver/argument_parser.hpp
@@ -105,6 +105,8 @@ inline std::ostream& operator<<(std::ostream& os, const color& c)
    static const bool use_color = isatty(STDOUT_FILENO) != 0;
    if(use_color)
        return os << "\033[" << static_cast<std::size_t>(c) << "m";
+#else
+    (void)c;
 #endif
    return os;
 }

--- a/src/dynamic_loader.cpp
+++ b/src/dynamic_loader.cpp
@@ -130,6 +130,30 @@ struct dynamic_loader_impl
    tmp_dir temp;
 };

+// Returns the file path of the loaded module that contains the given address.
+// Throws if the module handle or its file name cannot be obtained, or if the
+// path does not fit into a MAX_PATH-sized buffer.
+fs::path dynamic_loader::path(void* address)
+{
+    HMODULE module = nullptr;
+    // FROM_ADDRESS makes the API interpret the name argument as an address
+    // inside the module; UNCHANGED_REFCOUNT leaves the module's reference
+    // count untouched, so no FreeLibrary call is needed afterwards.
+    if(GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS |
+                             GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
+                         static_cast<LPCTSTR>(address),
+                         &module) == 0)
+    {
+        auto err = GetLastError();
+        MIGRAPHX_THROW("Unable to obtain module handle, error = " + std::to_string(err));
+    }
+    TCHAR buffer[MAX_PATH];
+    // The size argument is a count of TCHARs, not bytes.
+    const DWORD length = GetModuleFileName(module, buffer, MAX_PATH);
+    if(length == 0)
+    {
+        auto err = GetLastError();
+        MIGRAPHX_THROW("Unable to read module file path, error = " + std::to_string(err));
+    }
+    // A return value equal to the buffer size signals a truncated path.
+    if(length >= MAX_PATH)
+    {
+        MIGRAPHX_THROW("Buffer too small (" + std::to_string(MAX_PATH) + ") to hold the path");
+    }
+    return {buffer};
+}
+
 #endif

 optional<dynamic_loader> dynamic_loader::try_load(const fs::path& p)

--- a/src/fuse_pointwise.cpp
+++ b/src/fuse_pointwise.cpp
@@ -219,9 +219,8 @@ struct find_pointwise_reshape_pointwise

        auto reshape_input = [&](const auto& ins_to_insert) {
            return [&](auto input) {
-                auto c = m.insert_instruction(ins_to_insert, make_op("contiguous"), input);
                return m.insert_instruction(
-                    ins_to_insert, make_op("reshape", {{"dims", cd.dims}}), c);
+                    ins_to_insert, make_op("reshape", {{"dims", cd.dims}}), input);
            };
        };
        auto x_inputs = x_ins->inputs();