Commit c6560f37 authored by Paul

Merge from master

parents 218e20fc 3124c7f7
@@ -47,7 +47,7 @@ add_compile_options(-std=c++14)
 list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 include(EnableCompilerWarnings)
 include(ROCMClangTidy)
-if(CMAKE_CXX_COMPILER MATCHES ".*hcc")
+if(CMAKE_CXX_COMPILER MATCHES ".*hcc" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")
     set(MIGRAPHX_TIDY_ERRORS ERRORS * -readability-inconsistent-declaration-parameter-name)
     # Enable tidy on hip
 elseif(MIGRAPHX_ENABLE_GPU)
...
@@ -72,7 +72,7 @@ add_library(migraphx_device
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
 rocm_set_soversion(migraphx_device ${MIGRAPHX_SO_VERSION})
 rocm_clang_tidy_check(migraphx_device)
-target_compile_options(migraphx_device PRIVATE -std=c++17 -fno-gpu-rdc -Wno-unused-command-line-argument)
+target_compile_options(migraphx_device PRIVATE -std=c++17 -fno-gpu-rdc -Wno-unused-command-line-argument -Xclang -fallow-half-arguments-and-returns)
 target_link_libraries(migraphx_device migraphx hip::device -fno-gpu-rdc -Wno-invalid-command-line-argument -Wno-unused-command-line-argument)
 if(CMAKE_CXX_COMPILER MATCHES ".*hcc")
     set(AMDGPU_TARGETS "gfx803;gfx900;gfx906" CACHE STRING "")
@@ -81,6 +81,11 @@ if(CMAKE_CXX_COMPILER MATCHES ".*hcc")
         target_link_libraries(migraphx_device -amdgpu-target=${AMDGPU_TARGET})
     endforeach()
 endif()
+check_cxx_compiler_flag("--cuda-host-only -fhip-lambda-host-device -x hip" HAS_HIP_LAMBDA_HOST_DEVICE)
+if(HAS_HIP_LAMBDA_HOST_DEVICE)
+    message(STATUS "Enable -fhip-lambda-host-device")
+    target_compile_options(migraphx_device PRIVATE -fhip-lambda-host-device)
+endif()
 target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
 target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)
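Context for the new compiler check: -fhip-lambda-host-device makes hip-clang treat lambdas as implicitly __host__ __device__, which generic device code relies on when host-defined lambdas are passed into kernels. A minimal sketch of the pattern the flag enables, with hypothetical names (apply_kernel is not from this repo), assuming a HIP toolchain:

#include <hip/hip_runtime.h>

// With -fhip-lambda-host-device, a lambda defined in host code can be
// invoked inside a kernel without an explicit __device__ annotation.
template <class F>
__global__ void apply_kernel(float* out, F f)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    out[i] = f(static_cast<float>(i));
}

// Host side (sketch):
//   auto square = [](float x) { return x * x; };
//   hipLaunchKernelGGL(apply_kernel, dim3(1), dim3(64), 0, 0, out, square);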
...
@@ -31,7 +31,7 @@ struct multi_index
 };

 template <class ForStride>
-auto deduce_for_stride(ForStride fs) -> decltype(fs(id{}));
+__device__ __host__ auto deduce_for_stride(ForStride fs) -> decltype(fs(id{}));

 MIGRAPHX_DEVICE_CONSTEXPR multi_index<1> make_multi_index(index_int i, index_int n)
 {
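deduce_for_stride has no definition: it exists only so decltype can name the type that a ForStride callable yields, and the diff adds __device__ __host__ so the declaration is usable from device code paths as well as host ones. A standalone, host-only sketch of the declaration-only deduction pattern (illustrative names, not from this repo):

#include <type_traits>
#include <utility>

// Never defined: used purely in unevaluated context to name the type
// that F yields when called with Arg.
template <class F, class Arg>
auto call_result(F f, Arg a) -> decltype(f(a));

template <class F, class Arg>
using call_result_t = decltype(call_result(std::declval<F>(), std::declval<Arg>()));

static_assert(std::is_same<call_result_t<int (*)(int), int>, int>::value,
              "return type deduced without ever defining call_result");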
...
@@ -69,7 +69,7 @@ struct min
 struct lowest
 {
     template <class T>
-    operator T() const
+    __device__ __host__ operator T() const
     {
         return device_cast(std::numeric_limits<host_type<T>>::lowest());
     }
@@ -78,7 +78,7 @@ struct lowest
 struct highest
 {
     template <class T>
-    operator T() const
+    __device__ __host__ operator T() const
     {
         return device_cast(std::numeric_limits<host_type<T>>::max());
     }
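lowest and highest are tag types whose templated conversion operator lets the consuming context choose T, so one object can initialize an accumulator of any element type; the added attributes make that work from kernels as well as host code. A self-contained, host-only illustration of the conversion-operator trick (hypothetical names; the real versions route through host_type and device_cast):

#include <limits>

struct lowest_t
{
    // The target type is picked at the conversion site, not by the caller.
    template <class T>
    operator T() const
    {
        return std::numeric_limits<T>::lowest();
    }
};

template <class T>
T max_reduce_init()
{
    T acc = lowest_t{}; // float -> -FLT_MAX, int -> INT_MIN, etc.
    return acc;
}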
@@ -140,7 +140,11 @@ __device__ T dpp_mov(T& x)
     input.data = x;
     for(index_int i = 0; i < n; i++)
     {
+#if defined(__HCC__)
         output.reg[i] = __llvm_amdgcn_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
+#else
+        output.reg[i] = __hip_move_dpp(input.reg[i], DppCtrl, RowMask, BankMask, BoundCtrl);
+#endif
     }
     return output.data;
 }
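The guard keeps the hcc-only intrinsic behind __HCC__ and otherwise calls __hip_move_dpp, hip-clang's wrapper over the same amdgcn DPP move; either way, each 32-bit register backing T is shuffled across lanes according to DppCtrl. A hedged usage sketch, assuming dpp_mov's template parameters are <DppCtrl, RowMask, BankMask, BoundCtrl> with all-lanes-enabled defaults, and taking the row_shr:1 control encoding (0x111) from the GCN ISA manual:

// Read the value held one lane to the left within each row of 16 lanes;
// a typical building block for wavefront-level reductions and scans.
template <class T>
__device__ T read_prev_lane(T& x)
{
    return dpp_mov<0x111>(x); // row_shr:1
}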
...
@@ -103,19 +103,19 @@ host_type<T>* host_cast(T* x)
 }

 template <class T>
-device_type<T> device_cast(const T& x)
+__device__ __host__ device_type<T> device_cast(const T& x)
 {
     return reinterpret_cast<const device_type<T>&>(x);
 }

 template <class T>
-device_type<T>* device_cast(T* x)
+__device__ __host__ device_type<T>* device_cast(T* x)
 {
     return reinterpret_cast<device_type<T>*>(x);
 }

 template <class T>
-tensor_view<device_type<T>> device_cast(tensor_view<T> x)
+__device__ __host__ tensor_view<device_type<T>> device_cast(tensor_view<T> x)
 {
     return {x.get_shape(), reinterpret_cast<device_type<T>*>(x.data())};
 }
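The device_cast overloads reinterpret between mirrored host and device representations of the same value (layout-compatible types), which is why a reinterpret_cast suffices and nothing is copied; the added attributes let the same helpers compile in both contexts under hip-clang. A hedged sketch of the type-mapping idea they depend on (MIGraphX's concrete mapping table is assumed, not shown):

// Identity by default: most types have the same representation on both sides.
template <class T>
struct device_type_impl
{
    using type = T;
};

// Sketch of a specialization: a host-side half type mapped to the compiler's
// native device half (hypothetical names, assumed layout-compatible).
// template <>
// struct device_type_impl<half> { using type = _Float16; };

template <class T>
using device_type_t = typename device_type_impl<T>::type;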
...