Merge branch 'develop' into navi-reduce

664b2f7c · Chris Austen · GitHub · 20cdddac · 9c46821c · 664b2f7c
Unverified Commit 664b2f7c authored Dec 04, 2023 by Chris Austen Committed by GitHub Dec 04, 2023
20 changed files
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -465,7 +465,7 @@ jobs:
    - name: Upload code coverage
      if: "matrix.configuration == 'codecov'"
      env:
-        CODECOV_TOKEN: "8545af1c-f90b-4345-92a5-0d075503ca56"
+        CODECOV_TOKEN: "f5d5a10b-3177-4c76-b25f-9b1c2f165e8b"
      run: |
        sudo apt-get install -y lcov
        cd build

--- a/.gitignore
+++ b/.gitignore
@@ -81,5 +81,7 @@ cmake-build*/
 build*/

 # Recommended location to install rbuild dependencies from README.md
-depend
+depend*/

+# local Python virtual environment
+.venv/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,9 +41,12 @@ if(NOT MIGRAPHX_GENERATOR_IS_MULTI_CONFIG)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES})
 endif()

-set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
+if(NOT WIN32)
+    set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "")
+    set(CMAKE_BUILD_RPATH "${CMAKE_BINARY_DIR}/lib")
+endif()

-set(CMAKE_BUILD_RPATH "${CMAKE_BINARY_DIR}/lib")
+list(APPEND CMAKE_PREFIX_PATH /opt/rocm /opt/rocm/llvm $ENV{ROCM_PATH} $ENV{HIP_PATH})

 project(migraphx LANGUAGES C CXX)
 include(CTest)
@@ -57,6 +60,9 @@ else()
 option(MIGRAPHX_ENABLE_PYTHON "Enable python bindings" ON)
 endif()

+# By default build shared libraries
+option(BUILD_SHARED_LIBS "Create shared libraries" ON)
+
 if(WIN32) # CK is not yet ported to Windows
 option(MIGRAPHX_USE_COMPOSABLEKERNEL "Enable MIGraphX to use composable kernel JIT library" OFF)
 else()
@@ -67,7 +73,7 @@ find_path(HALF_INCLUDE_DIR half.hpp PATH_SUFFIXES half)
 if (NOT HALF_INCLUDE_DIR)
    message(FATAL_ERROR "Could not find half.hpp - Please check that the install path of half.hpp has been added to CMAKE_PREFIX_PATH")
 else()
-	message(STATUS "half.hpp is at ${HALF_INCLUDE_DIR}")
+    message(STATUS "half.hpp is at ${HALF_INCLUDE_DIR}")
 endif()

 include(CheckTypeSize)
@@ -102,13 +108,21 @@ set(MIGRAPHX_ENABLE_CPU Off CACHE BOOL "")
 # Disable fpga backend by default
 set(MIGRAPHX_ENABLE_FPGA Off CACHE BOOL "")

+if(WIN32)
+    add_compile_definitions("$<$<COMPILE_LANGUAGE:C,CXX>:_CRT_SECURE_NO_WARNINGS;_USE_MATH_DEFINES>")
+endif()
+
 set(CMAKE_CXX_STANDARD_DEFAULT "")
-add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-std=c++17>)
+if(MSVC)
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/std:c++17>)
+else()
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:-std=c++17>)
+endif()

 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 include(EnableCompilerWarnings)
 include(ROCMClangTidy)
-if(CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")
+if(CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+.*")
    set(MIGRAPHX_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
 # Enable tidy on hip
 elseif(MIGRAPHX_ENABLE_GPU)

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -22,6 +22,8 @@ def rocmtestnode(Map conf) {
        def cmd = """
            ulimit -c unlimited
            echo "leak:dnnl::impl::malloc" > suppressions.txt
+            echo "leak:libtbb.so" >> suppressions.txt
+            cat suppressions.txt
            export LSAN_OPTIONS="suppressions=\$(pwd)/suppressions.txt"
            export MIGRAPHX_GPU_DEBUG=${gpu_debug}
            export CXX=${compiler}
@@ -134,12 +136,14 @@ rocmtest clang_debug: rocmnode('mi100+') { cmake_build ->
    }
 }, mlir_debug: rocmnode('mi100+') { cmake_build ->
    stage('MLIR Debug') {
-        withEnv(['MIGRAPHX_ENABLE_EXTRA_MLIR=1']) {
+        withEnv(['MIGRAPHX_ENABLE_EXTRA_MLIR=1', 'MIGRAPHX_MLIR_USE_SPECIFIC_OPS=fused,attention,convolution,dot']) {
            def sanitizers = "undefined"
            // Note: the -fno-sanitize= is copied from upstream LLVM_UBSAN_FLAGS.
            def debug_flags_cxx = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize=vptr,function -fno-sanitize-recover=${sanitizers}"
            def debug_flags = "-g -O2 -fsanitize=${sanitizers} -fno-sanitize=vptr -fno-sanitize-recover=${sanitizers}"
            def gpu_targets = getgputargets()
+            // Since the purpose of this run is to verify everything MLIR supports,
+            // enable all possible types of offloads.
            cmake_build(flags: "-DCMAKE_BUILD_TYPE=debug -DMIGRAPHX_ENABLE_PYTHON=Off -DMIGRAPHX_ENABLE_MLIR=On -DCMAKE_CXX_FLAGS_DEBUG='${debug_flags_cxx}' -DCMAKE_C_FLAGS_DEBUG='${debug_flags}' -DGPU_TARGETS='${gpu_targets}'")
        }
    }

--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ The following is a list of prerequisites for building MIGraphX.
 3. Build MIGraphX source code:

    ```bash
-    rbuild build -d depend -B build
+    rbuild build -d depend -B build -DGPU_TARGETS=$(/opt/rocm/bin/rocminfo | grep -o -m1 'gfx.*')
    ```

 Once completed, all prerequisites are in the `depend` folder and MIGraphX is in the `build` directory.
@@ -106,7 +106,7 @@ the folder to `PATH`, or add the option `--prefix /usr/local` in the pip3 comman
 3. Configure CMake. If the prerequisites are installed at the default location `/usr/local`, use:

    ```bash
-    CXX=/opt/rocm/llvm/bin/clang++ cmake ..
+    CXX=/opt/rocm/llvm/bin/clang++ cmake .. -DGPU_TARGETS=$(/opt/rocm/bin/rocminfo | grep -o -m1 'gfx.*')
    ```

    Otherwise, you need to set `-DCMAKE_PREFIX_PATH=$your_loc` to configure CMake.

--- a/cmake/Embed.cmake
+++ b/cmake/Embed.cmake
@@ -21,17 +21,25 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 #####################################################################################
-find_program(EMBED_LD ld)
-find_program(EMBED_OBJCOPY objcopy)

-option(EMBED_USE_LD "Use ld to embed data files" OFF)
+if(WIN32)
+    set(EMBED_USE RC CACHE STRING "Use RC or CArrays to embed data files")
+    set_property(CACHE EMBED_USE PROPERTY STRINGS "RC;CArrays")
+else()
+    set(EMBED_USE CArrays CACHE STRING "Use LD or CArrays to embed data files")
+    set_property(CACHE EMBED_USE PROPERTY STRINGS "LD;CArrays")
+endif()
+
+if(EMBED_USE STREQUAL "LD")
+    find_program(EMBED_LD ld REQUIRED)
+    find_program(EMBED_OBJCOPY objcopy REQUIRED)
+endif()

 function(wrap_string)
    set(options)
    set(oneValueArgs VARIABLE AT_COLUMN)
    set(multiValueArgs)
    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    cmake_parse_arguments(WRAP_STRING "${options}" "${oneValueArgs}" "" ${ARGN})

    string(LENGTH ${${PARSE_VARIABLE}} string_length)
    math(EXPR offset "0")
@@ -54,112 +62,124 @@ function(wrap_string)
    set(${PARSE_VARIABLE} "${lines}" PARENT_SCOPE)
 endfunction()

-function(generate_embed_source EMBED_NAME)
+function(generate_embed_source EMBED_NAME EMBED_DIR BASE_DIRECTORY)
    set(options)
-    set(oneValueArgs SRC HEADER RELATIVE)
-    set(multiValueArgs OBJECTS SYMBOLS FILES)
-
+    set(oneValueArgs)
+    set(multiValueArgs SYMBOLS FILES)
    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

-    set(EXTERNS)
-    set(INIT_KERNELS)
-
-    list(LENGTH PARSE_SYMBOLS SYMBOLS_LEN)
-    list(LENGTH PARSE_OBJECTS OBJECTS_LEN)
-    if(NOT ${SYMBOLS_LEN} EQUAL ${OBJECTS_LEN})
-        message(FATAL_ERROR "Symbols and objects dont match: ${SYMBOLS_LEN} != ${OBJECTS_LEN}")
-    endif()
-    math(EXPR LEN "${SYMBOLS_LEN} - 1")
-
-    foreach(idx RANGE ${LEN})
-        list(GET PARSE_SYMBOLS ${idx} SYMBOL)
-        list(GET PARSE_OBJECTS ${idx} OBJECT)
-        list(GET PARSE_FILES ${idx} FILE)
-
-        set(START_SYMBOL "_binary_${SYMBOL}_start")
-        set(LENGTH_SYMBOL "_binary_${SYMBOL}_length")
-        if(EMBED_USE_LD)
-            string(APPEND EXTERNS "
+    set(RESOURCE_ID 100)
+    foreach(SYMBOL FILE IN ZIP_LISTS PARSE_SYMBOLS PARSE_FILES)
+        cmake_path(RELATIVE_PATH FILE BASE_DIRECTORY ${BASE_DIRECTORY} OUTPUT_VARIABLE BASE_NAME)
+        if(EMBED_USE STREQUAL "RC")
+            string(TOUPPER "${SYMBOL}" SYMBOL)
+            string(APPEND FILE_IDS "#define IDR_${SYMBOL} ${RESOURCE_ID}\n")
+            cmake_path(NATIVE_PATH FILE NORMALIZE NATIVE_FILE)
+            string(REPLACE "\\" "\\\\" NATIVE_FILE "${NATIVE_FILE}")
+            string(APPEND RC_FILE_MAPPING "IDR_${SYMBOL} TEXTFILE \"${NATIVE_FILE}\"\n")
+            string(APPEND INIT_KERNELS "\n        {\"${BASE_NAME}\", resource::read(IDR_${SYMBOL})},")
+            math(EXPR RESOURCE_ID "${RESOURCE_ID} + 1" OUTPUT_FORMAT DECIMAL)
+        else()
+            set(START_SYMBOL "_binary_${SYMBOL}_start")
+            set(LENGTH_SYMBOL "_binary_${SYMBOL}_length")
+            if(EMBED_USE STREQUAL "LD")
+                string(APPEND EXTERNS "
 extern const char ${START_SYMBOL}[];
 extern const size_t _binary_${SYMBOL}_size;
 const auto ${LENGTH_SYMBOL} = reinterpret_cast<size_t>(&_binary_${SYMBOL}_size);
-            ")
-        else()
-            string(APPEND EXTERNS "
+")
+            else()
+                string(APPEND EXTERNS "
 extern const char ${START_SYMBOL}[];
 extern const size_t ${LENGTH_SYMBOL};
-            ")
+")
+            endif()
+            string(APPEND INIT_KERNELS "
+        { \"${BASE_NAME}\", { ${START_SYMBOL}, ${LENGTH_SYMBOL}} },")
        endif()
+    endforeach()
+    if(EMBED_USE STREQUAL "RC")
+       file(WRITE "${EMBED_DIR}/include/resource.h" "
+#define TEXTFILE 256

-        if(PARSE_RELATIVE)
-            file(RELATIVE_PATH BASE_NAME ${PARSE_RELATIVE} "${FILE}")
-        else()
-            get_filename_component(BASE_NAME "${FILE}" NAME)
-        endif()
+${FILE_IDS}
+")
+        file(WRITE "${EMBED_DIR}/resource.rc" "
+#include \"resource.h\"

-        string(APPEND INIT_KERNELS "
-            { \"${BASE_NAME}\", { ${START_SYMBOL}, ${LENGTH_SYMBOL}} },")
-    endforeach()
+${RC_FILE_MAPPING}
+")
+        set(EXTERNS "
+#include <Windows.h>
+#include \"resource.h\"

-    file(WRITE "${PARSE_HEADER}" "
+namespace resource {
+std::string_view read(int id)
+{
+    HMODULE handle = GetModuleHandle(nullptr);
+    HRSRC rc = FindResource(handle, MAKEINTRESOURCE(id), MAKEINTRESOURCE(TEXTFILE));
+    HGLOBAL data = LoadResource(handle, rc);
+    return {static_cast<const char*>(LockResource(data)), SizeofResource(handle, rc)};
+}
+}
+")
+        set(EMBED_FILES ${EMBED_DIR}/include/resource.h ${EMBED_DIR}/resource.rc)
+    endif()
+    file(WRITE "${EMBED_DIR}/include/${EMBED_NAME}.hpp" "
 #include <string_view>
 #include <unordered_map>
 #include <utility>
 std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}();
 ")

-    file(WRITE "${PARSE_SRC}" "
+    file(WRITE "${EMBED_DIR}/${EMBED_NAME}.cpp" "
 #include <${EMBED_NAME}.hpp>
 ${EXTERNS}
 std::unordered_map<std::string_view, std::string_view> ${EMBED_NAME}()
 {
-    static std::unordered_map<std::string_view, std::string_view> result = {${INIT_KERNELS}};
+    static std::unordered_map<std::string_view, std::string_view> result = {${INIT_KERNELS}
+    };
    return result;
 }
 ")
+    list(APPEND EMBED_FILES ${EMBED_DIR}/${EMBED_NAME}.cpp ${EMBED_DIR}/include/${EMBED_NAME}.hpp)
+    set(EMBED_FILES ${EMBED_FILES} PARENT_SCOPE)
 endfunction()

-function(embed_file OUTPUT_FILE OUTPUT_SYMBOL FILE)
-    set(WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-    # Glob is used to compute the relative path
-    file(GLOB FILES RELATIVE ${WORKING_DIRECTORY} ${FILE})
-    foreach(REL_FILE ${FILES})
-        string(MAKE_C_IDENTIFIER "${REL_FILE}" SYMBOL)
-        get_filename_component(OUTPUT_FILE_DIR "${REL_FILE}" DIRECTORY)
-        file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_FILE_DIR}")
-        if(EMBED_USE_LD)
-            set(OUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.o")
-        else()
-            set(OUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.cpp")
-        endif()
-        set(${OUTPUT_SYMBOL} ${SYMBOL} PARENT_SCOPE)
-        set(${OUTPUT_FILE} "${OUT_FILE}" PARENT_SCOPE)
-        if(EMBED_USE_LD)
-            add_custom_command(
-                OUTPUT "${OUT_FILE}"
-                COMMAND ${EMBED_LD} -r -o "${OUT_FILE}" -z noexecstack --format=binary "${REL_FILE}" 
-                COMMAND ${EMBED_OBJCOPY} --rename-section .data=.rodata,alloc,load,readonly,data,contents "${OUT_FILE}"
-                WORKING_DIRECTORY ${WORKING_DIRECTORY}
-                DEPENDS ${FILE}
-                VERBATIM
-            )
-        else()
-            set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${FILE})
-            # reads source file contents as hex string
-            file(READ ${FILE} HEX_STRING HEX)
-            # wraps the hex string into multiple lines
-            wrap_string(VARIABLE HEX_STRING AT_COLUMN 80)
-            # adds '0x' prefix and comma suffix before and after every byte respectively
-            string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " ARRAY_VALUES ${HEX_STRING})
-            # removes trailing comma
-            string(REGEX REPLACE ", $" "" ARRAY_VALUES ${ARRAY_VALUES})
-            file(WRITE "${OUT_FILE}" "
+function(embed_file FILE BASE_DIRECTORY)
+    message(STATUS "    ${FILE}")
+    cmake_path(RELATIVE_PATH FILE BASE_DIRECTORY "${BASE_DIRECTORY}" OUTPUT_VARIABLE REL_FILE)
+    string(MAKE_C_IDENTIFIER "${REL_FILE}" OUTPUT_SYMBOL)
+    get_filename_component(OUTPUT_FILE_DIR "${REL_FILE}" DIRECTORY)
+    file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${OUTPUT_FILE_DIR}")
+    if(EMBED_USE STREQUAL "LD")
+        set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.o")
+        add_custom_command(
+            OUTPUT "${OUTPUT_FILE}"
+            COMMAND ${EMBED_LD} -r -o "${OUTPUT_FILE}" -z noexecstack --format=binary "${REL_FILE}"
+            COMMAND ${EMBED_OBJCOPY} --rename-section .data=.rodata,alloc,load,readonly,data,contents "${OUTPUT_FILE}"
+            WORKING_DIRECTORY "${BASE_DIRECTORY}"
+            DEPENDS "${FILE}"
+            VERBATIM)
+        set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
+    elseif(EMBED_USE STREQUAL "CArrays")
+        set(OUTPUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${REL_FILE}.cpp")
+        # reads source file contents as hex string
+        file(READ ${FILE} HEX_STRING HEX)
+        # wraps the hex string into multiple lines
+        wrap_string(VARIABLE HEX_STRING AT_COLUMN 80)
+        # adds '0x' prefix and comma suffix before and after every byte respectively
+        string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1, " ARRAY_VALUES ${HEX_STRING})
+        # removes trailing comma
+        string(REGEX REPLACE ", $" "" ARRAY_VALUES ${ARRAY_VALUES})
+        file(WRITE "${OUTPUT_FILE}" "
 #include <cstddef>
-extern const char _binary_${SYMBOL}_start[] = { ${ARRAY_VALUES} };
-extern const size_t _binary_${SYMBOL}_length = sizeof(_binary_${SYMBOL}_start);
+extern const char _binary_${OUTPUT_SYMBOL}_start[] = { ${ARRAY_VALUES} };
+extern const size_t _binary_${OUTPUT_SYMBOL}_length = sizeof(_binary_${OUTPUT_SYMBOL}_start);
 ")
-        endif()
-    endforeach()
+        set(OUTPUT_FILE ${OUTPUT_FILE} PARENT_SCOPE)
+    endif()
+    set(OUTPUT_SYMBOL ${OUTPUT_SYMBOL} PARENT_SCOPE)
 endfunction()

 function(add_embed_library EMBED_NAME)
@@ -168,35 +188,32 @@ function(add_embed_library EMBED_NAME)
    set(multiValueArgs)
    cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

-    file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/embed)
-    file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/embed/${EMBED_NAME})
    set(EMBED_DIR ${CMAKE_CURRENT_BINARY_DIR}/embed/${EMBED_NAME})
-    set(SRC_FILE "${EMBED_DIR}/${EMBED_NAME}.cpp")
-    set(HEADER_FILE "${EMBED_DIR}/include/${EMBED_NAME}.hpp")
-    set(WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-    set(OUTPUT_FILES)
-    set(SYMBOLS)
-    message(STATUS "Embedding files")
+    file(MAKE_DIRECTORY ${EMBED_DIR})
+    message(STATUS "Embedding kernel files:")
    foreach(FILE ${PARSE_UNPARSED_ARGUMENTS})
-        embed_file(OUTPUT_FILE OUTPUT_SYMBOL ${FILE})
+        embed_file(${FILE} ${PARSE_RELATIVE})
        list(APPEND OUTPUT_FILES ${OUTPUT_FILE})
        list(APPEND SYMBOLS ${OUTPUT_SYMBOL})
    endforeach()
-    message(STATUS "Generating embedding library ${EMBED_NAME}")
-    generate_embed_source(${EMBED_NAME} SRC ${SRC_FILE} HEADER ${HEADER_FILE} OBJECTS ${OUTPUT_FILES} SYMBOLS ${SYMBOLS} RELATIVE ${PARSE_RELATIVE} FILES ${PARSE_UNPARSED_ARGUMENTS})
-    
+    message(STATUS "Generating embedding library '${EMBED_NAME}'")
+    generate_embed_source(${EMBED_NAME} ${EMBED_DIR} "${PARSE_RELATIVE}" SYMBOLS ${SYMBOLS} FILES ${PARSE_UNPARSED_ARGUMENTS})
    set(INTERNAL_EMBED_LIB embed_lib_${EMBED_NAME})
-    add_library(${INTERNAL_EMBED_LIB} OBJECT "${SRC_FILE}")
+    add_library(${INTERNAL_EMBED_LIB} OBJECT ${EMBED_FILES})
+    if(EMBED_USE STREQUAL "CArrays")
+        target_sources(${INTERNAL_EMBED_LIB} PRIVATE ${OUTPUT_FILES})
+    endif()
    target_include_directories(${INTERNAL_EMBED_LIB} PRIVATE "${EMBED_DIR}/include")
    target_compile_options(${INTERNAL_EMBED_LIB} PRIVATE -Wno-reserved-identifier -Wno-extern-initializer -Wno-missing-variable-declarations)
    set_target_properties(${INTERNAL_EMBED_LIB} PROPERTIES POSITION_INDEPENDENT_CODE On)
-    
    add_library(${EMBED_NAME} INTERFACE)
-    if(EMBED_USE_LD)
+    if(EMBED_USE STREQUAL "LD")
        target_sources(${EMBED_NAME} INTERFACE ${OUTPUT_FILES})
-    else()
-        target_sources(${INTERNAL_EMBED_LIB} PRIVATE ${OUTPUT_FILES})
+    endif()
+    if(EMBED_USE STREQUAL "RC")
+        target_link_libraries(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
    endif()
    target_sources(${EMBED_NAME} INTERFACE $<TARGET_OBJECTS:${INTERNAL_EMBED_LIB}>)
    target_include_directories(${EMBED_NAME} INTERFACE "${EMBED_DIR}/include")
 endfunction()
+
--- a/docs/.doxygen/Doxyfile
+++ b/docs/.doxygen/Doxyfile
@@ -28,7 +28,14 @@ MACRO_EXPANSION = YES

 OUTPUT_DIRECTORY = docBin

-PREDEFINED = DOXYGEN
+PREDEFINED = \
+    DOXYGEN \
+    MIGRAPHX_EXPORT= \
+    MIGRAPHX_API_EXPORT= \
+    MIGRAPHX_GPU_EXPORT= \
+    MIGRAPHX_CPU_EXPORT= \
+    MIGRAPHX_ONNX_EXPORT= \
+    MIGRAPHX_TF_EXPORT= \

 PROJECT_NAME = MIGraphX


--- a/docs/.sphinx/requirements.txt
+++ b/docs/.sphinx/requirements.txt
@@ -21,7 +21,7 @@ charset-normalizer==3.1.0
    # via requests
 click==8.1.3
    # via sphinx-external-toc
-cryptography==41.0.4
+cryptography==41.0.6
    # via pyjwt
 deprecated==1.2.13
    # via pygithub
@@ -75,7 +75,9 @@ pygments==2.15.0
    #   pydata-sphinx-theme
    #   sphinx
 pyjwt[crypto]==2.6.0
-    # via pygithub
+    # via
+    #   pygithub
+    #   pyjwt
 pynacl==1.5.0
    # via pygithub
 pyyaml==6.0
@@ -87,7 +89,7 @@ requests==2.28.2
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==0.26.0
+rocm-docs-core==0.30.0
    # via -r requirements.in
 smmap==5.0.0
    # via gitdb

--- a/docs/contributor_guide.rst
+++ b/docs/contributor_guide.rst
 Contributor Guide
-===============
+=================

 .. toctree::
   :maxdepth: 2
   :caption: Contents:

-   dev_intro
+   dev/dev_intro
   dev/data
   dev/operators
   dev/program
@@ -14,3 +14,4 @@ Contributor Guide
   dev/pass
   dev/matchers
   dev/tools
+   dev/env_vars
--- a/docs/dev/data.rst
+++ b/docs/dev/data.rst
@@ -5,26 +5,36 @@ shape
 -----

 .. doxygenstruct:: migraphx::internal::shape
+   :members:
+   :undoc-members:

 literal
 -------

 .. doxygenstruct:: migraphx::internal::literal
+   :members:
+   :undoc-members:

 argument
 --------

 .. doxygenstruct:: migraphx::internal::argument
+   :members:
+   :undoc-members:

 raw_data
 --------

 .. doxygenstruct:: migraphx::internal::raw_data
+   :members:
+   :undoc-members:

-.. doxygenfunction:: migraphx::internal::visit_all
+.. doxygenfunction:: template<class T, class ...Ts> auto migraphx::internal::visit_all(T &&x, Ts&&... xs)


 tensor_view
 -----------

 .. doxygenstruct:: migraphx::internal::tensor_view
+   :members:
+   :undoc-members:
--- a/docs/dev_intro.rst
+++ b/docs/dev_intro.rst
-MIGraphX Fundamentals
+Developer Introduction
 ======================

 MIGraphX provides an optimized execution engine for deep learning neural networks.
@@ -18,8 +18,8 @@ Directions for building MIGraphX from source can be found in the main README fil
 Adding Two Literals
 --------------------

-A program is a collection of modules, which are collections of instructions to be executed when calling `eval <migraphx::program::eval>`.
-Each instruction has an associated `operation <migraphx::operation>` which represents the computation to be performed by the instruction.
+A program is a collection of modules, which are collections of instructions to be executed when calling :cpp:any:`eval <migraphx::internal::program::eval>`.
+Each instruction has an associated :cpp:any:`operation <migraphx::internal::operation>` which represents the computation to be performed by the instruction.

 We start with a snippet of the simple ``add_two_literals()`` function::

@@ -41,14 +41,14 @@ We start with a snippet of the simple ``add_two_literals()`` function::
    auto result = p.eval({}).back();
    std::cout << "add_two_literals: 1 + 2 = " << result << "\n";

-We start by creating a simple ``migraphx::program`` object and then getting a pointer to the main module of it.
+We start by creating a simple :cpp:any:`migraphx::program <migraphx::internal::program>` object and then getting a pointer to the main module of it.
 The program is a collection of ``modules`` that start executing from the main module, so instructions are added to the modules rather than directly onto the program object.
-We then use the `add_literal <migraphx::program::add_literal>` function to add an instruction that stores the literal number ``1`` while returning an `instruction_ref <migraphx::instruction_ref>`.
-The returned `instruction_ref <migraphx::instruction_ref>` can be used in another instruction as an input.
-We use the same `add_literal <migraphx::program::add_literal>` function to add a ``2`` to the program.
+We then use the :cpp:any:`add_literal <migraphx::internal::program::add_literal>` function to add an instruction that stores the literal number ``1`` while returning an :cpp:any:`instruction_ref <migraphx::internal::instruction_ref>`.
+The returned :cpp:any:`instruction_ref <migraphx::internal::instruction_ref>` can be used in another instruction as an input.
+We use the same :cpp:any:`add_literal <migraphx::internal::program::add_literal>` function to add a ``2`` to the program.
 After creating the literals, we then create the instruction to add the numbers together.
-This is done by using the `add_instruction <migraphx::program::add_instruction>` function with the ``"add"`` `operation <migraphx::program::operation>` created by `make_op <migraphx::program::make_op>` along with the previous `add_literal` `instruction_ref <migraphx::instruction_ref>` for the input arguments of the instruction.
-Finally, we can run this `program <migraphx::program>` by compiling it for the reference target (CPU) and then running it with `eval <migraphx::program::eval>`
+This is done by using the :cpp:any:`add_instruction <migraphx::internal::program::add_instruction>` function with the ``"add"`` :cpp:any:`operation <migraphx::internal::program::operation>` created by :cpp:any:`make_op <migraphx::internal::program::make_op>` along with the previous `add_literal` :cpp:any:`instruction_ref <migraphx::internal::instruction_ref>` for the input arguments of the instruction.
+Finally, we can run this :cpp:any:`program <migraphx::internal::program>` by compiling it for the reference target (CPU) and then running it with :cpp:any:`eval <migraphx::internal::program::eval>`
 The result is then retreived and printed to the console.

 We can compile the program for the GPU as well, but the file will have to be moved to the ``test/gpu/`` directory and the correct target must be included::
@@ -76,8 +76,8 @@ We can modify the program to take an input parameter ``x``, as seen in the ``add
    p.compile(migraphx::ref::target{});

 This adds a parameter of type ``int32``, and compiles it for the CPU.
-To run the program, we need to pass the parameter as a ``parameter_map`` when we call `eval <migraphx::program::eval>`.
-We create the ``parameter_map`` by setting the ``x`` key to an `argument <migraphx::argument>` object with an ``int`` data type::
+To run the program, we need to pass the parameter as a ``parameter_map`` when we call :cpp:any:`eval <migraphx::internal::program::eval>`.
+We create the ``parameter_map`` by setting the ``x`` key to an :cpp:any:`argument <migraphx::internal::argument>` object with an ``int`` data type::

    // create a parameter_map object for passing a value to the "x" parameter
    std::vector<int> data = {4};
@@ -92,7 +92,7 @@ We create the ``parameter_map`` by setting the ``x`` key to an `argument <migrap
 Handling Tensor Data
 ---------------------

-In the previous examples we have only been dealing with scalars, but the `shape <migraphx::shape>` class can describe multi-dimensional tensors.
+In the previous examples we have only been dealing with scalars, but the :cpp:any:`shape <migraphx::internal::shape>` class can describe multi-dimensional tensors.
 For example, we can compute a simple convolution::

    migraphx::program p;
@@ -109,7 +109,7 @@ For example, we can compute a simple convolution::

 Here we create two parameters for both the ``input`` and ``weights``.
 In the previous examples, we created simple literals, however, most programs will take data from allocated buffers (usually on the GPU).
-In this case, we can create `argument <migraphx::argument>` objects directly from the pointers to the buffers::
+In this case, we can create :cpp:any:`argument <migraphx::internal::argument>` objects directly from the pointers to the buffers::

    // Compile the program
    p.compile(migraphx::ref::target{});
@@ -133,8 +133,8 @@ In this case, we can create `argument <migraphx::argument>` objects directly fro

    EXPECT(migraphx::verify::verify_rms_range(results_vector, sol));

-An `argument <migraphx::argument>` can handle memory buffers from either the GPU or the CPU.
-By default when running the `program <migraphx::program>`, buffers are allocated on the corresponding target.
+An :cpp:any:`argument <migraphx::internal::argument>` can handle memory buffers from either the GPU or the CPU.
+By default when running the :cpp:any:`program <migraphx::internal::program>`, buffers are allocated on the corresponding target.
 When compiling for the CPU, the buffers by default will be allocated on the CPU.
 When compiling for the GPU, the buffers by default will be allocated on the GPU.
 With the option ``offload_copy=true`` set while compiling for the GPU, the buffers will be located on the CPU.
@@ -143,7 +143,7 @@ With the option ``offload_copy=true`` set while compiling for the GPU, the buffe
 Importing From ONNX
 --------------------

-A `program <migraphx::program>` can be built directly from an onnx file using the MIGraphX ONNX parser.
+A :cpp:any:`program <migraphx::internal::program>` can be built directly from an onnx file using the MIGraphX ONNX parser.
 This makes it easier to use neural networks directly from other frameworks.
 In this case, there is a ``parse_onnx`` function::


--- a/docs/dev/env_vars.rst
+++ b/docs/dev/env_vars.rst
+Environment Variables
+=====================
+
+For parsing
+---------------
+
+.. envvar:: MIGRAPHX_TRACE_ONNX_PARSER
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Print debugging traces for the onnx parser.
+Prints: initializers (if used), ONNX node operators, added MIGraphX instructions
+
+.. envvar:: MIGRAPHX_DISABLE_FP16_INSTANCENORM_CONVERT
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Disables the conversion from fp16 to fp32 for the InstanceNormalization ONNX operator that MIGX does as a workaround for accuracy issues with reduce_mean/variance.
+See ``parse_instancenorm.cpp`` for more details.
+
+
+Matchers
+------------
+
+.. envvar:: MIGRAPHX_TRACE_MATCHES
+
+Set to "1" to print the matcher that matches an instruction and the matched instruction.
+Set to "2" and use the ``MIGRAPHX_TRACE_MATCHES_FOR`` flag to filter out results.
+
+.. envvar:: MIGRAPHX_TRACE_MATCHES_FOR
+
+Set to the name of any matcher and only traces for that matcher will be printed out.
+
+.. envvar:: MIGRAPHX_VALIDATE_MATCHES
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Validate the module after finding the matches (runs ``module.validate()``).
+
+Program Execution 
+---------------------
+
+.. envvar:: MIGRAPHX_TRACE_EVAL
+
+Set to "1", "2", or "3" to use.
+"1" prints the instruction run and the time taken.
+"2" prints everything in "1" and a snippet of the output argument and some statistics (ex. min, max, mean) of the output.
+"3" prints everything in "1" and the full output buffers.
+
+
+Program Verification
+------------------------
+
+.. envvar:: MIGRAPHX_VERIFY_ENABLE_ALLCLOSE
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Uses ``allclose`` with the given ``atol`` and ``rtol`` for verifying ranges with ``driver verify`` or the tests that use ``migraphx/verify.hpp``.
+
+
+Pass debugging or Pass controls
+-----------------------------------
+
+.. envvar:: MIGRAPHX_TRACE_ELIMINATE_CONTIGUOUS
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Debug print the instructions that have input ``contiguous`` instructions removed.
+
+.. envvar:: MIGRAPHX_DISABLE_POINTWISE_FUSION
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Disables the ``fuse_pointwise`` compile pass.
+
+.. envvar:: MIGRAPHX_DEBUG_MEMORY_COLORING
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Print debug statements for the ``memory_coloring`` pass.
+
+.. envvar:: MIGRAPHX_TRACE_SCHEDULE
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Print debug statements for the ``schedule`` pass.
+
+.. envvar:: MIGRAPHX_TRACE_PROPAGATE_CONSTANT
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Traces instructions replaced with a constant.
+
+.. envvar:: MIGRAPHX_INT8_QUANTIZATION_PARAMS
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Print the quantization parameters in only the main module.
+
+.. envvar:: MIGRAPHX_DISABLE_DNNL_POST_OPS_WORKAROUND
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Disable the DNNL post ops workaround.
+
+.. envvar:: MIGRAPHX_DISABLE_MIOPEN_FUSION
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Disable MIOpen fusions.
+
+.. envvar:: MIGRAPHX_DISABLE_SCHEDULE_PASS
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Disable the ``schedule`` pass.
+
+.. envvar:: MIGRAPHX_DISABLE_REDUCE_FUSION
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Disable the ``fuse_reduce`` pass.
+
+.. envvar:: MIGRAPHX_ENABLE_NHWC
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Enable the ``layout_nhwc`` pass.
+
+.. envvar:: MIGRAPHX_ENABLE_CK
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Enable using the Composable Kernels library.
+Should be used in conjunction with ``MIGRAPHX_DISABLE_MLIR=1``.
+
+.. envvar:: MIGRAPHX_DISABLE_MLIR
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Disable using the rocMLIR library.
+
+.. envvar:: MIGRAPHX_ENABLE_EXTRA_MLIR
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Enables additional opportunities to use MLIR that may improve performance.
+
+.. envvar:: MIGRAPHX_COPY_LITERALS
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Use ``hip_copy_to_gpu`` with a new ``literal`` instruction rather than use ``hip_copy_literal{}``.
+
+Compilation traces
+----------------------
+
+.. envvar:: MIGRAPHX_TRACE_FINALIZE
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Debug print instructions during the ``module.finalize()`` step.
+
+.. envvar:: MIGRAPHX_TRACE_COMPILE
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Print trace information for the graph compilation process.
+
+.. envvar:: MIGRAPHX_TRACE_PASSES
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Print the compile pass and the program after the pass.
+
+.. envvar:: MIGRAPHX_TIME_PASSES
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Time the compile passes.
+
+
+GPU Kernels JIT compilation debugging (applicable for both hiprtc and hipclang)
+--------------------------------------------------------------------------------
+
+.. envvar:: MIGRAPHX_TRACE_CMD_EXECUTE
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Print commands executed by the MIGraphX ``process``.
+
+.. envvar:: MIGRAPHX_TRACE_HIPRTC
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Print HIPRTC options and C++ file executed.
+
+.. envvar:: MIGRAPHX_DEBUG_SAVE_TEMP_DIR
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Make it so the created temporary directories are not deleted.
+
+.. envvar:: MIGRAPHX_GPU_DEBUG
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Internally, this adds the option ``-DMIGRAPHX_DEBUG`` when compiling GPU kernels. It enables assertions and capture of source locations for the errors. 
+
+.. envvar:: MIGRAPHX_GPU_DEBUG_SYM
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Adds the option ``-g`` when compiling HIPRTC.
+
+.. envvar:: MIGRAPHX_GPU_DUMP_SRC
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Dump the HIPRTC source files compiled.
+
+.. envvar:: MIGRAPHX_GPU_DUMP_ASM
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Dump the hip-clang assembly.
+
+.. envvar:: MIGRAPHX_GPU_OPTIMIZE
+
+Set the optimization mode for GPU compile (``-O`` option).
+Defaults to ``-O3``.
+
+.. envvar:: MIGRAPHX_GPU_COMPILE_PARALLEL
+
+Set to the number of threads to use.
+Compile GPU code in parallel with the given number of threads.
+
+.. envvar:: MIGRAPHX_TRACE_NARY
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Print the ``nary`` device functions used.
+
+.. envvar:: MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Enable HIPRTC workarounds for bugs in HIPRTC.
+
+.. envvar:: MIGRAPHX_USE_FAST_SOFTMAX
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Use the fast softmax optimization.
+
+.. envvar:: MIGRAPHX_ENABLE_NULL_STREAM
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Allow using null stream for miopen and hipStream.
+
+.. envvar:: MIGRAPHX_NSTREAMS
+
+Set to the number of streams to use.
+Defaults to 1.
+
+.. envvar:: MIGRAPHX_TRACE_BENCHMARKING
+
+Set to "1" to print benchmarking trace.
+Set to "2" to print benchmarking trace with more detail.
+
+MLIR vars
+-------------
+
+.. envvar:: MIGRAPHX_TRACE_MLIR
+
+Set to "1" to trace MLIR and print any failures.
+Set to "2" to additionally print all MLIR operations.
+
+.. envvar:: MIGRAPHX_MLIR_USE_SPECIFIC_OPS
+
+Set to the name of the operations you want to always use MLIR regardless of GPU architecture.
+Accepts a list of operators separated by commas (ex: "fused", "convolution", "dot").
+
+.. envvar:: MIGRAPHX_MLIR_TUNING_DB
+
+Set to the path of the MLIR tuning database to load.
+
+.. envvar:: MIGRAPHX_MLIR_TUNING_CFG
+
+Set to the path of the tuning configuration.
+Appends to tuning cfg file that could be used with rocMLIR tuning scripts.
+
+.. envvar:: MIGRAPHX_MLIR_TUNE_EXHAUSTIVE
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Do exhaustive tuning for MLIR.
+
+.. envvar:: MIGRAPHX_MLIR_TUNE_LIMIT
+
+Set to an integer greater than 1.
+Limits the number of solutions that MLIR will use for tuning.
+
+CK vars
+-----------
+
+.. envvar:: MIGRAPHX_LOG_CK_GEMM
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Print Composable Kernels GEMM traces.
+
+.. envvar:: MIGRAPHX_CK_DEBUG
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Always add the ``-DMIGRAPHX_CK_CHECK=1`` for compiling Composable Kernels operators.
+
+.. envvar:: MIGRAPHX_TUNE_CK
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Use tuning for Composable Kernels.
+
+Testing 
+------------
+
+.. envvar:: MIGRAPHX_TRACE_TEST_COMPILE
+
+Set to the target that you want to trace the compilation of (ex. "gpu", "cpu").
+Prints the compile trace for the given target for the verify tests.
+This flag shouldn't be used in conjunction with ``MIGRAPHX_TRACE_COMPILE``.
+For the verify tests only use ``MIGRAPHX_TRACE_TEST_COMPILE``.
+
+.. envvar:: MIGRAPHX_TRACE_TEST
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Prints the reference and target programs even if the verify passed successfully.
+
+.. envvar:: MIGRAPHX_DUMP_TEST
+
+Set to "1", "enable", "enabled", "yes", or "true" to use.
+Dumps verify tests to ``.mxr`` files.
--- a/docs/dev/operators.rst
+++ b/docs/dev/operators.rst
@@ -5,6 +5,8 @@ operation
 ---------

 .. doxygenstruct:: migraphx::internal::operation
+   :members:
+   :undoc-members:

 .. doxygenfunction:: migraphx::internal::is_context_free

@@ -14,3 +16,5 @@ operators
 ---------

 .. doxygennamespace:: migraphx::internal::op
+   :members:
+   :undoc-members:
--- a/docs/dev/pass.rst
+++ b/docs/dev/pass.rst
@@ -5,63 +5,82 @@ pass
 ----

 .. doxygenstruct:: migraphx::internal::pass
+   :members:
+   :undoc-members:

 dead_code_elimination
 ---------------------

 .. doxygenstruct:: migraphx::internal::dead_code_elimination
+   :members:
+   :undoc-members:

 eliminate_common_subexpression
 ------------------------------

 .. doxygenstruct:: migraphx::internal::eliminate_common_subexpression
+   :members:
+   :undoc-members:

 eliminate_concat
 ----------------

 .. doxygenstruct:: migraphx::internal::eliminate_concat
+   :members:
+   :undoc-members:

 eliminate_contiguous
 --------------------

 .. doxygenstruct:: migraphx::internal::eliminate_contiguous
+   :members:
+   :undoc-members:

 eliminate_identity
 ------------------

 .. doxygenstruct:: migraphx::internal::eliminate_identity
+   :members:
+   :undoc-members:

 eliminate_pad
 -------------

 .. doxygenstruct:: migraphx::internal::eliminate_pad
+   :members:
+   :undoc-members:

 propagate_constant
 ------------------

 .. doxygenstruct:: migraphx::internal::propagate_constant
-
-rewrite_batchnorm
-----------------
-
-.. doxygenstruct:: migraphx::internal::rewrite_batchnorm
+   :members:
+   :undoc-members:

 rewrite_rnn
 -----------

 .. doxygenstruct:: migraphx::internal::rewrite_rnn
+   :members:
+   :undoc-members:

 schedule
 --------

 .. doxygenstruct:: migraphx::internal::schedule
+   :members:
+   :undoc-members:

 simplify_algebra
 ----------------

 .. doxygenstruct:: migraphx::internal::simplify_algebra
+   :members:
+   :undoc-members:

 simplify_reshapes
 -----------------

 .. doxygenstruct:: migraphx::internal::simplify_reshapes
+   :members:
+   :undoc-members:
--- a/docs/dev/program.rst
+++ b/docs/dev/program.rst
@@ -5,6 +5,8 @@ instruction
 -----------

 .. doxygenstruct:: migraphx::internal::instruction
+   :members:
+   :undoc-members:

 instruction_ref
 ---------------
@@ -17,6 +19,8 @@ program
 -------

 .. doxygenstruct:: migraphx::internal::program
+   :members:
+   :undoc-members:

 parse_onnx
 ----------

--- a/docs/dev/targets.rst
+++ b/docs/dev/targets.rst
@@ -5,14 +5,20 @@ target
 ------

 .. doxygenstruct:: migraphx::internal::target
+   :members:
+   :undoc-members:

 gpu::target
 -----------

 .. doxygenstruct:: migraphx::internal::gpu::target
+   :members:
+   :undoc-members:

 cpu::target
 -----------

 .. doxygenstruct:: migraphx::internal::cpu::target
+   :members:
+   :undoc-members:

--- a/docs/driver.rst
+++ b/docs/driver.rst
 MIGraphX Driver
 ===============

+The MIGraphX driver is a tool that allows you to utilize many of the core functions of MIGraphX without having to write your own program. It can read, compile, run, and test the performance of a model with randomized data.
+
 read
 ----

@@ -17,6 +19,7 @@ compile

 Compiles and prints input graph.

+.. include:: ./driver/read.rst
 .. include:: ./driver/compile.rst

 run
@@ -26,6 +29,7 @@ run

 Loads and prints input graph.

+.. include:: ./driver/read.rst
 .. include:: ./driver/compile.rst

 perf
@@ -35,6 +39,7 @@ perf

 Compiles and runs input graph then prints performance report.

+.. include:: ./driver/read.rst
 .. include:: ./driver/compile.rst

 .. option::  --iterations, -n [unsigned int]
@@ -48,6 +53,7 @@ verify

 Runs reference and CPU or GPU implementations and checks outputs for consistency.

+.. include:: ./driver/read.rst
 .. include:: ./driver/compile.rst

 .. option::  --rms-tol [double]
@@ -71,7 +77,7 @@ Verify each instruction
 Reduce program and verify

 roctx
----
+-----

 .. program:: migraphx-driver roctx

@@ -86,4 +92,5 @@ An example command line combined with rocprof for tracing purposes is given belo
 After `rocprof` is run, the output directory will contain trace information for HIP, HCC and ROCTX in separate `.txt` files.
 To understand the interactions between API calls, it is recommended to utilize `roctx.py` helper script as described in :ref:`dev/tools:rocTX` section.

-.. include:: ./driver/compile.rst
\ No newline at end of file
+.. include:: ./driver/read.rst
+.. include:: ./driver/compile.rst
--- a/docs/driver/compile.rst
+++ b/docs/driver/compile.rst
-.. include:: ./driver/read.rst
-
 .. option::  --fill0 [std::vector<std::string>]

 Fill parameter with 0s

--- a/docs/driver/read.rst
+++ b/docs/driver/read.rst
@@ -46,11 +46,11 @@ Trim instructions from the end (Default: 0)

 Dim of a parameter (format: "@name d1 d2 dn")

-.. options:: --dyn-input-dim [std::vector<std::string>]
+.. option:: --dyn-input-dim [std::vector<std::string>]

 Set dynamic dimensions of a parameter using JSON formatting (format "@name" "dynamic_dimension_json")

-.. options:: --default-dyn-dim
+.. option:: --default-dyn-dim

 Set the default dynamic dimension (format {min:x, max:y, optimals:[o1,o2,...]})


--- a/docs/reference/cpp.rst
+++ b/docs/reference/cpp.rst
@@ -8,45 +8,65 @@ shape
 .. doxygenenum:: migraphx_shape_datatype_t

 .. doxygenstruct:: migraphx::shape
+   :members:
+   :undoc-members:

 argument
 --------

 .. doxygenstruct:: migraphx::argument
+   :members:
+   :undoc-members:

 target
 ------

 .. doxygenstruct:: migraphx::target
+   :members:
+   :undoc-members:

 program
 -------

 .. doxygenstruct:: migraphx::program_parameter_shapes
+   :members:
+   :undoc-members:

 .. doxygenstruct:: migraphx::program_parameters
+   :members:
+   :undoc-members:

 .. doxygenstruct:: migraphx_compile_options
+   :members:
+   :undoc-members:

 .. doxygenstruct:: migraphx::program
+   :members:
+   :undoc-members:

 quantize
 --------

 .. doxygenstruct:: migraphx::quantize_op_names
+   :members:
+   :undoc-members:

 .. doxygenfunction:: migraphx::quantize_fp16(const program&)

 .. doxygenfunction:: migraphx::quantize_fp16(const program&, const quantize_op_names&)

 .. doxygenstruct:: migraphx::quantize_int8_options
+   :members:
+   :undoc-members:

-.. doxygenfunction:: migraphx::quantize_int8
+.. doxygenfunction:: migraphx::quantize_int8

 parse_onnx
 ----------

 .. doxygenstruct:: migraphx::onnx_options
+   :members:
+   :undoc-members:

 .. doxygenfunction:: migraphx::parse_onnx(const char *)

@@ -63,16 +83,18 @@ parse_onnx
 load
 ----

-.. doxygenstruct:: migraphx_file_options
+.. doxygenstruct:: migraphx::file_options
+   :members:
+   :undoc-members:

 .. doxygenfunction:: migraphx::load(const char *)

-.. doxygenfunction:: migraphx::load(const char *, migraphx_file_options)
+.. doxygenfunction:: migraphx::load(const char *, const file_options&)

 save
 ----

 .. doxygenfunction:: migraphx::save(const program&, const char *)

-.. doxygenfunction:: migraphx::save(const program&, const char *, migraphx_file_options)
+.. doxygenfunction:: migraphx::save(const program&, const char *, const file_options&)