Commit c6560f37 authored by Paul

Merge from master

parents 218e20fc 3124c7f7
@@ -47,7 +47,7 @@ add_compile_options(-std=c++14)
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 include(EnableCompilerWarnings)
 include(ROCMClangTidy)
-if(CMAKE_CXX_COMPILER MATCHES ".*hcc")
+if(CMAKE_CXX_COMPILER MATCHES ".*hcc" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")
     set(MIGRAPHX_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
     # Enable tidy on hip
 elseif(MIGRAPHX_ENABLE_GPU)
...
@@ -72,7 +72,7 @@ add_library(migraphx_device
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
 rocm_set_soversion(migraphx_device ${MIGRAPHX_SO_VERSION})
 rocm_clang_tidy_check(migraphx_device)
-target_compile_options(migraphx_device PRIVATE -std=c++17 -fno-gpu-rdc -Wno-unused-command-line-argument)
+target_compile_options(migraphx_device PRIVATE -std=c++17 -fno-gpu-rdc -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
 target_link_libraries(migraphx_device migraphx hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument)
 if(CMAKE_CXX_COMPILER MATCHES ".*hcc")
     set(AMDGPU_TARGETS "gfx803;gfx900;gfx906" CACHE STRING "")
@@ -81,6 +81,11 @@ if(CMAKE_CXX_COMPILER MATCHES ".*hcc")
         target_link_libraries(migraphx_device -amdgpu-target=${AMDGPU_TARGET})
     endforeach()
 endif()
+check_cxx_compiler_flag("--cuda-host-only -fhip-lambda-host-device -x hip" HAS_HIP_LAMBDA_HOST_DEVICE)
+if(HAS_HIP_LAMBDA_HOST_DEVICE)
+    message(STATUS "Enable -fhip-lambda-host-device")
+    target_compile_options(migraphx_device PRIVATE -fhip-lambda-host-device)
+endif()
 target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
 target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
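Context for the new compiler check: -fhip-lambda-host-device makes hip-clang treat lambdas as implicitly __host__ __device__, which generic device code relies on when host-defined lambdas are passed into kernels. A minimal sketch of the pattern the flag enables, with hypothetical names (apply_kernel is not from this repo), assuming a HIP toolchain:

#include <hip/hip_runtime.h>

// With -fhip-lambda-host-device, a lambda defined in host code can be
// invoked inside a kernel without an explicit __device__ annotation.
template <class F>
__global__ void apply_kernel(float* out, F f)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    out[i] = f(static_cast<float>(i));
}

// Host side (sketch):
//   auto square = [](float x) { return x * x; };
//   hipLaunchKernelGGL(apply_kernel, dim3(1), dim3(64), 0, 0, out, square);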
...
@@ -31,7 +31,7 @@ struct multi_index
 };

 template <class ForStride>
-auto deduce_for_stride(ForStride fs) -> decltype(fs(id{}));
+__device__ __host__ auto deduce_for_stride(ForStride fs) -> decltype(fs(id{}));

 MIGRAPHX_DEVICE_CONSTEXPR multi_index<1> make_multi_index(index_int i, index_int n)
 {
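deduce_for_stride has no definition: it exists only so decltype can name the type that a ForStride callable yields, and the diff adds __device__ __host__ so the declaration is usable from device code paths as well as host ones. A standalone, host-only sketch of the declaration-only deduction pattern (illustrative names, not from this repo):

#include <type_traits>
#include <utility>

// Never defined: used purely in unevaluated context to name the type
// that F yields when called with Arg.
template <class F, class Arg>
auto call_result(F f, Arg a) -> decltype(f(a));

template <class F, class Arg>
using call_result_t = decltype(call_result(std::declval<F>(), std::declval<Arg>()));

static_assert(std::is_same<call_result_t<int (*)(int), int>, int>::value,
              "return type deduced without ever defining call_result");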
...
@@ -69,7 +69,7 @@ struct min
 struct lowest
 {
     template <class T>
-    operator T() const
+    __device__ __host__ operator T() const
     {
         return device_cast(std::numeric_limits<host_type<T>>::lowest());
     }
@@ -78,7 +78,7 @@ struct lowest
 struct highest
 {
     template <class T>
-    operator T() const
+    __device__ __host__ operator T() const
     {
         return device_cast(std::numeric_limits<host_type<T>>::max());
     }
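lowest and highest are tag types whose templated conversion operator lets the consuming context choose T, so one object can initialize an accumulator of any element type; the added attributes make that work from kernels as well as host code. A self-contained, host-only illustration of the conversion-operator trick (hypothetical names; the real versions route through host_type and device_cast):

#include <limits>

struct lowest_t
{
    // The target type is picked at the conversion site, not by the caller.
    template <class T>
    operator T() const
    {
        return std::numeric_limits<T>::lowest();
    }
};

template <class T>
T max_reduce_init()
{
    T acc = lowest_t{}; // float -> -FLT_MAX, int -> INT_MIN, etc.
    return acc;
}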
@@ -140,7 +140,11 @@ __device__ T dpp_mov(T& x)
     input.data = x;
     for(index_int i = 0; i < n; i++)
     {
+#if defined(__HCC__)
         output.reg[i] = __llvm_amdgcn_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
+#else
+        output.reg[i] = __hip_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
+#endif
     }
     return output.data;
 }
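The guard keeps the hcc-only intrinsic behind __HCC__ and otherwise calls __hip_move_dpp, hip-clang's wrapper over the same amdgcn DPP move; either way, each 32-bit register backing T is shuffled across lanes according to DppCtrl. A hedged usage sketch, assuming dpp_mov's template parameters are <DppCtrl, RowMask, BankMask, BoundCtrl> with all-lanes-enabled defaults, and taking the row_shr:1 control encoding (0x111) from the GCN ISA manual:

// Read the value held one lane to the left within each row of 16 lanes;
// a typical building block for wavefront-level reductions and scans.
template <class T>
__device__ T read_prev_lane(T& x)
{
    return dpp_mov<0x111>(x); // row_shr:1
}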
...
@@ -103,19 +103,19 @@ host_type<T>* host_cast(T* x)
 }

 template <class T>
-device_type<T> device_cast(const T& x)
+__device__ __host__ device_type<T> device_cast(const T& x)
 {
     return reinterpret_cast<const device_type<T>&>(x);
 }

 template <class T>
-device_type<T>* device_cast(T* x)
+__device__ __host__ device_type<T>* device_cast(T* x)
 {
     return reinterpret_cast<device_type<T>*>(x);
 }

 template <class T>
-tensor_view<device_type<T>> device_cast(tensor_view<T> x)
+__device__ __host__ tensor_view<device_type<T>> device_cast(tensor_view<T> x)
 {
     return {x.get_shape(), reinterpret_cast<device_type<T>*>(x.data())};
 }
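The device_cast overloads reinterpret between mirrored host and device representations of the same value (layout-compatible types), which is why a reinterpret_cast suffices and nothing is copied; the added attributes let the same helpers compile in both contexts under hip-clang. A hedged sketch of the type-mapping idea they depend on (MIGraphX's concrete mapping table is assumed, not shown):

// Identity by default: most types have the same representation on both sides.
template <class T>
struct device_type_impl
{
    using type = T;
};

// Sketch of a specialization: a host-side half type mapped to the compiler's
// native device half (hypothetical names, assumed layout-compatible).
// template <>
// struct device_type_impl<half> { using type = _Float16; };

template <class T>
using device_type_t = typename device_type_impl<T>::type;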
...