Update tuning method

78a300ff · Alan Turner · dea0555f · 78a300ff · 78a300ff · 78a300ff
Commit 78a300ff authored Oct 07, 2022 by Alan Turner
20 changed files
--- a/deps/bin/ccache
+++ b/deps/bin/ccache
+../cget/pkg/ccache/install/bin/ccache
\ No newline at end of file
--- a/deps/bin/cppcheck
+++ b/deps/bin/cppcheck
+../cget/pkg/danmar__cppcheck/install/bin/cppcheck
\ No newline at end of file
--- a/deps/bin/pcre-config
+++ b/deps/bin/pcre-config
+../cget/pkg/pcre/install/bin/pcre-config
\ No newline at end of file
--- a/deps/bin/pcre_scanner_unittest
+++ b/deps/bin/pcre_scanner_unittest
+../cget/pkg/pcre/install/bin/pcre_scanner_unittest
\ No newline at end of file
--- a/deps/bin/pcre_stringpiece_unittest
+++ b/deps/bin/pcre_stringpiece_unittest
+../cget/pkg/pcre/install/bin/pcre_stringpiece_unittest
\ No newline at end of file
--- a/deps/bin/pcrecpp_unittest
+++ b/deps/bin/pcrecpp_unittest
+../cget/pkg/pcre/install/bin/pcrecpp_unittest
\ No newline at end of file
--- a/deps/bin/pcregrep
+++ b/deps/bin/pcregrep
+../cget/pkg/pcre/install/bin/pcregrep
\ No newline at end of file
--- a/deps/bin/pcretest
+++ b/deps/bin/pcretest
+../cget/pkg/pcre/install/bin/pcretest
\ No newline at end of file
--- a/deps/bin/protoc
+++ b/deps/bin/protoc
+../cget/pkg/google__protobuf/install/bin/protoc
\ No newline at end of file
--- a/deps/bin/protoc-3.11.0.0
+++ b/deps/bin/protoc-3.11.0.0
+../cget/pkg/google__protobuf/install/bin/protoc-3.11.0.0
\ No newline at end of file
--- a/deps/bin/sqlite3
+++ b/deps/bin/sqlite3
+../cget/pkg/sqlite3/install/bin/sqlite3
\ No newline at end of file
--- a/deps/bin/unzstd
+++ b/deps/bin/unzstd
+../cget/pkg/facebook__zstd/install/bin/unzstd
\ No newline at end of file
--- a/deps/bin/zstd
+++ b/deps/bin/zstd
+../cget/pkg/facebook__zstd/install/bin/zstd
\ No newline at end of file
--- a/deps/bin/zstdcat
+++ b/deps/bin/zstdcat
+../cget/pkg/facebook__zstd/install/bin/zstdcat
\ No newline at end of file
--- a/deps/bin/zstdgrep
+++ b/deps/bin/zstdgrep
+../cget/pkg/facebook__zstd/install/bin/zstdgrep
\ No newline at end of file
--- a/deps/bin/zstdless
+++ b/deps/bin/zstdless
+../cget/pkg/facebook__zstd/install/bin/zstdless
\ No newline at end of file
--- a/deps/bin/zstdmt
+++ b/deps/bin/zstdmt
+../cget/pkg/facebook__zstd/install/bin/zstdmt
\ No newline at end of file
--- a/deps/cget/cget.cmake
+++ b/deps/cget/cget.cmake
+# Toolchain-style settings for the cget dependency prefix.
+# NOTE(review): every path below is an absolute path hard-coded into this file
+# (/code/AMDMIGraphX/... and /opt/rocm-5.3.0/...); these look machine-specific —
+# confirm this file is meant to be committed.
+set(CGET_PREFIX "/code/AMDMIGraphX/AMDMIGraphX/deps")
+set(CMAKE_PREFIX_PATH "/code/AMDMIGraphX/AMDMIGraphX/deps")
+# Make ${CGET_PREFIX}/include visible to the compilers: CMake older than 3.6.0 adds
+# it as a SYSTEM include directory, newer versions set the per-language
+# standard include directory variables instead.
+if (${CMAKE_VERSION} VERSION_LESS "3.6.0")
+    include_directories(SYSTEM ${CGET_PREFIX}/include)
+    else ()
+        set(CMAKE_CXX_STANDARD_INCLUDE_DIRECTORIES "${CGET_PREFIX}/include")
+        set(CMAKE_C_STANDARD_INCLUDE_DIRECTORIES "${CGET_PREFIX}/include")
+endif()
+# When cross-compiling, also add the prefix to the find root path.
+if (CMAKE_CROSSCOMPILING)
+    list(APPEND CMAKE_FIND_ROOT_PATH "/code/AMDMIGraphX/AMDMIGraphX/deps")
+endif()
+# Install into the prefix unless the user supplied an install prefix explicitly.
+if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+    set(CMAKE_INSTALL_PREFIX "/code/AMDMIGraphX/AMDMIGraphX/deps")
+endif()
+# Pin the C and C++ compilers to the clang binaries under /opt/rocm-5.3.0/llvm/bin.
+set(CMAKE_CXX_COMPILER "/opt/rocm-5.3.0/llvm/bin/clang++")
+set(CMAKE_C_COMPILER "/opt/rocm-5.3.0/llvm/bin/clang")
+# With an MSVC compiler ID, set the parallel-build flag variable to /MP.
+if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
+    set(CMAKE_CXX_ENABLE_PARALLEL_BUILD_FLAG "/MP")
+endif()
+# For shared-library builds, cache CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS as ON.
+if (BUILD_SHARED_LIBS)
+    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS "ON" CACHE BOOL "")
+endif()
+# Cache entries: framework search order and an install rpath pointing at the prefix lib dir.
+set(CMAKE_FIND_FRAMEWORK "LAST" CACHE STRING "")
+set(CMAKE_INSTALL_RPATH "${CGET_PREFIX}/lib" CACHE STRING "")
--- a/deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/ck.hpp
+++ b/deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/ck.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
+#include "hip/hip_runtime.h"
+#include "hip/hip_fp16.h"
+#endif
+// Compile-time configuration switches for this header's users.
+#define CK_TIME_KERNEL 1
+// constant address space for kernel parameter
+// https://llvm.org/docs/AMDGPUUsage.html#address-spaces
+#define CK_CONSTANT_ADDRESS_SPACE __attribute__((address_space(4)))
+// launch bounds
+// CK_USE_LAUNCH_BOUNDS is defined unconditionally on the next line, so the two
+// limits guarded by the #ifdef below are always defined as well.
+#define CK_USE_LAUNCH_BOUNDS 1
+#ifdef CK_USE_LAUNCH_BOUNDS
+#define CK_MAX_THREAD_PER_BLOCK 256
+#define CK_MIN_BLOCK_PER_CU 2
+#endif
+// check GPU target
+// Only evaluated when __HIP_DEVICE_COMPILE__ is defined: compilation stops with
+// #error unless one of the listed __gfx*__ target macros is defined.
+#ifdef __HIP_DEVICE_COMPILE__
+#if !(defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
+      defined(__gfx90a__) || defined(__gfx1030__))
+#error Not supported target
+#endif
+#endif
+// buffer resource
+// Value of CK_BUFFER_RESOURCE_3RD_DWORD depends on the compilation pass/target:
+// -1 when __HIP_DEVICE_COMPILE__ is not defined, otherwise a per-architecture constant.
+// No value is defined for device targets outside the two lists below.
+#ifndef __HIP_DEVICE_COMPILE__ // for host code
+#define CK_BUFFER_RESOURCE_3RD_DWORD -1
+#elif defined(__gfx803__) || defined(__gfx900__) || defined(__gfx906__) || defined(__gfx908__) || \
+    defined(__gfx90a__) // for GPU code
+#define CK_BUFFER_RESOURCE_3RD_DWORD 0x00020000
+#elif defined(__gfx1030__) // for GPU code
+#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
+#endif
+// FMA instruction
+// These macros are presence flags (defined without a value); none of them is
+// defined when __HIP_DEVICE_COMPILE__ is not defined.
+#ifndef __HIP_DEVICE_COMPILE__                   // for host code, define nothing
+#elif defined(__gfx803__) || defined(__gfx900__) // for GPU code
+#define CK_USE_AMD_V_MAC_F32
+#elif defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx1030__) // for GPU code
+#define CK_USE_AMD_V_FMAC_F32
+#define CK_USE_AMD_V_DOT2_F32_F16
+#define CK_USE_AMD_V_DOT4_I32_I8
+#endif
+// MFMA instruction
+// CK_USE_AMD_MFMA is a presence flag: defined when __HIP_DEVICE_COMPILE__ is not
+// defined, and for gfx908/gfx90a otherwise.
+#ifndef __HIP_DEVICE_COMPILE__ // for host code
+#define CK_USE_AMD_MFMA
+#elif defined(__gfx908__) || defined(__gfx90a__) // for GPU code
+#define CK_USE_AMD_MFMA
+#endif
+// Defined only when __gfx90a__ is defined (this check does not test __HIP_DEVICE_COMPILE__).
+#if defined(__gfx90a__)
+#define CK_USE_AMD_MFMA_BF16_1K_OP
+#endif
+// buffer load
+#define CK_USE_AMD_BUFFER_LOAD 1
+// buffer store
+#define CK_USE_AMD_BUFFER_STORE 1
+// buffer atomic add: integer
+#define CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER 1
+// buffer atomic add: floating point
+// Always defined: 1 when __HIP_DEVICE_COMPILE__ is not defined or for gfx908/gfx90a, 0 otherwise.
+#ifndef __HIP_DEVICE_COMPILE__ // for host code
+#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
+#elif defined(__gfx908__) || defined(__gfx90a__) // for GPU code
+#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 1
+#else // for GPU code
+#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
+#endif
+// buffer atomic max: 64-bit floating point
+// Always defined: 1 only when __gfx90a__ is defined, 0 in every other case.
+#if defined(__gfx90a__) // for GPU code
+#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
+#else
+#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
+#endif
+// inline asm
+#define CK_USE_AMD_INLINE_ASM 1
+// inner product (DLOP)
+#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1
+// block synchronization only s_wait lgkmcnt(0), not vmcnt(0)
+#define CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM 1
+// experimental feature: multi index implemented as array
+#define CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
+// experimental feature: static tensor descriptor
+#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0
+// experimental feature: buffer load/store/atomic-add/atomic-max OOB trick
+// This (ifndef) is a hack to use customized behavior for buffer load rather than using default
+// setting. Don't use this hack unless absolutely necessary!
+// The #ifndef lets a definition made before this header is included take precedence over
+// the default of 0.
+// FIXME: make the behavior of buffer load a configurable (template) parameter for each usage
+#ifndef CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
+#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
+#endif
+#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
+#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
+#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK 1
+// experimental feature: in-register sub-dword transpose
+#define CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE 1
+// experimental feature: merge transformation use magic number division
+#define CK_EXPERIMENTAL_MERGE_USE_MAGIC_DIVISION 1
+// experimental feature: use __builtin_memcpy instead of pointer cast to access a vector from
+// pointer of scalar
+#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS 0
+// experimental feature: use __builtin_memcpy instead of union to do bit_cast
+#define CK_EXPERIMENTAL_USE_MEMCPY_FOR_BIT_CAST 1
+// experimental feature: optimize for inter-wave scheduling policy
+#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING 0
+#define CK_EXPERIMENTAL_INTER_WAVE_SCHEDULING_MAC_CLUSTERS 1
+// hack: relies on an underlying assumption that must be satisfied, otherwise it's a bug
+// hack for forcing register to keep idx_diff_low_const in SGPR. idx_diff_low_const must be
+// thread-invariant, otherwise it's a bug
+// TODO: separate index calculation into "compile-time", "global", "block", "wave", "thread"
+#define CK_HACK_MERGE_CALCULATE_IDX_DIFF_LOW_CONST_USE_AMD_GCN_READ_FIRST_LANE 0
+// workaround: compiler crash when compiling recursive lambda
+#define CK_WORKAROUND_SWDEV_275126 1
+// workaround: compiler crash when using buffer load/store for i8
+#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_BUFFER_LOAD_STORE_ISSUE 1
+// workaround: compiler generating inefficient ds_write instructions
+#define CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE 1
+// (gfx908 only) workaround: compiler crash in fused kernels on mainline #9110; #10738 seems ok
+// error message was "fatal error: error in backend: Error while trying to spill VGPR0 from class
+// VGPR_32: Cannot scavenge register without an emergency spill slot!"
+// this falls back to a less ideal way of handling NPadding in the fused attention kernel
+#ifdef __gfx908__
+#define CK_WORKAROUND_SWDEV_XXXXXX_ATTN_KERNEL_CLANG_CANNOT_SCAVENGE_REGISTER 1
+#else
+// for __gfx90a__, ...
+#define CK_WORKAROUND_SWDEV_XXXXXX_ATTN_KERNEL_CLANG_CANNOT_SCAVENGE_REGISTER 0
+#endif // __gfx908__
+// workaround: verification failure, due to compiler regression, for conv bwd-data fp16 using some
+// tuning parameters
+#define CK_WORKAROUND_SWDEV_325164 0
+namespace ck {
+// Tag identifying which in-memory data operation is meant.
+enum struct InMemoryDataOperationEnum
+{
+    Set,
+    AtomicAdd,
+    AtomicMax,
+    Add
+};
+// Compile-time list of InMemoryDataOperationEnum values carried as a template pack.
+// FIXME: use regular Sequence and remove this
+template <InMemoryDataOperationEnum... Is>
+struct InMemoryDataOperationEnumSequence
+{
+    // Number of values in the pack.
+    static constexpr int mSize = sizeof...(Is);
+    // Returns the I-th value of the pack. I is not range-checked.
+    __host__ __device__ static constexpr InMemoryDataOperationEnum At(int I)
+    {
+        using Op = InMemoryDataOperationEnum;
+        // A trailing dummy Op::Set keeps the array non-empty when mSize = 0, which
+        // the compiler would otherwise complain about.
+        const Op padded[mSize + 1] = {Is..., Op::Set};
+        return padded[I];
+    }
+};
+// index types: 32-bit and 64-bit signed integers
+using index_t      = int32_t;
+using long_index_t = int64_t;
+} // namespace ck
--- a/deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/host_utility/device_prop.hpp
+++ b/deps/cget/pkg/ROCmSoftwarePlatform__composable_kernel/install/include/ck/host_utility/device_prop.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#pragma once
+#include <string>
+#include <map>
+#include <hip/hip_runtime.h>
+namespace ck {
+// Returns the architecture name for the device reported by hipGetDevice().
+//  - An empty string is returned if either HIP query does not return hipSuccess.
+//  - Everything from the first ':' onward in gcnArchName is dropped.
+//  - A name found in the alias table is replaced by its mapped gfx name; any other
+//    name is returned unchanged.
+inline std::string get_device_name()
+{
+    int device;
+    if(hipGetDevice(&device) != hipSuccess)
+    {
+        return std::string();
+    }
+    hipDeviceProp_t props{};
+    if(hipGetDeviceProperties(&props, device) != hipSuccess)
+    {
+        return std::string();
+    }
+    const std::string full_arch_name(props.gcnArchName);
+    // Alias table, see
+    // https://github.com/ROCmSoftwarePlatform/MIOpen/blob/8498875aef84878e04c1eabefdf6571514891086/src/target_properties.cpp#L40
+    static std::map<std::string, std::string> alias_to_gfx = {
+        {"Ellesmere", "gfx803"},
+        {"Baffin", "gfx803"},
+        {"RacerX", "gfx803"},
+        {"Polaris10", "gfx803"},
+        {"Polaris11", "gfx803"},
+        {"Tonga", "gfx803"},
+        {"Fiji", "gfx803"},
+        {"gfx800", "gfx803"},
+        {"gfx802", "gfx803"},
+        {"gfx804", "gfx803"},
+        {"Vega10", "gfx900"},
+        {"gfx901", "gfx900"},
+        {"10.3.0 Sienna_Cichlid 18", "gfx1030"},
+    };
+    // find(':') yields npos when there is no ':', and substr(0, npos) is the whole string.
+    const auto name  = full_arch_name.substr(0, full_arch_name.find(':'));
+    const auto alias = alias_to_gfx.find(name);
+    return alias != alias_to_gfx.end() ? alias->second : name;
+}
+} // namespace ck