Commit c82b833d authored by Chao Liu

change build

parent f2b92ba9
@@ -32,13 +32,13 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
#GPU backend
- if(DEVICE_BACKEND STREQUAL "HIP")
+ if(DEVICE_BACKEND STREQUAL "AMD")
- set(DEVICE_BACKEND_HIP 1)
+ set(CK_DEVICE_BACKEND_AMD 1)
set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH})
find_package(HIP REQUIRED)
- elseif(DEVICE_BACKEND STREQUAL "CUDA")
+ elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
- set(DEVICE_BACKEND_CUDA 1)
+ set(CK_DEVICE_BACKEND_NVIDIA 1)
enable_language(CUDA)
include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
...
- if(DEVICE_BACKEND STREQUAL "HIP")
+ if(DEVICE_BACKEND STREQUAL "AMD")
set(DRIVER_SOURCE driver.cpp)
- elseif(DEVICE_BACKEND STREQUAL "CUDA")
+ elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set(DRIVER_SOURCE driver.cu)
endif()
...
#ifndef CK_CONFIG_HPP
#define CK_CONFIG_HPP
#cmakedefine01 CK_DEVICE_BACKEND_AMD
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#define CK_USE_AMD_INLINE_ASM 1
namespace ck {
// For some reason, the HIP compiler needs this definition to generate optimal load and store
// instructions
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
using index_t = uint32_t;
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
{
d += s0 * s1;
}
#if 0
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
{
d += s0.x * s1.x;
d += s0.y * s1.y;
}
__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
{
d += s0.x * s1.x + s0.y * s1.y;
}
#endif
} // namespace ck
#endif
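For reference, configure_file() resolves the #cmakedefine01 line above into a plain 0/1 macro when the header is generated; a minimal sketch of the corresponding line in the generated config.hpp, assuming the AMD backend is selected:

// Excerpt of the generated config.hpp (sketch, not part of the commit):
#define CK_DEVICE_BACKEND_AMD 1 // would be 0 if the CMake variable were false or unset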
- #ifndef CK_CONFIG_HPP
+ #ifndef CK_CONFIG_CUDA_HPP
- #define CK_CONFIG_HPP
+ #define CK_CONFIG_CUDA_HPP
- #cmakedefine01 DEVICE_BACKEND_HIP
+ #cmakedefine01 CK_DEVICE_BACKEND_NVIDIA
- #cmakedefine01 DEVICE_BACKEND_CUDA
- #if DEVICE_BACKEND_HIP
- #include "hip/hip_runtime.h"
- #include "hip/hip_fp16.h"
- #define CK_USE_AMD_INLINE_ASM 1
- #elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#define CK_USE_AMD_INLINE_ASM 0
- #endif
namespace ck {
- #if DEVICE_BACKEND_HIP
- // For some reason, HIP compiler need this definition to generate optimal load and store
- // instruction
- typedef float float2_t __attribute__((ext_vector_type(2)));
- typedef float float4_t __attribute__((ext_vector_type(4)));
- #else
// For some reason, CUDA needs this definition; otherwise the
// compiler won't generate optimal load and store instructions, and the
// kernel would produce wrong results, indicating that the compiler failed to generate
// correct instructions.
using float2_t = float2;
using float4_t = float4;
- #endif
using index_t = uint32_t;
@@ -60,7 +45,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s
// need to make a better interface
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
{
- #if DEVICE_BACKEND_CUDA
+ #if CK_DEVICE_BACKEND_NVIDIA
d = __dp4a(s0, s1, d);
#endif
}
...
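As a side note on the NVIDIA path above, __dp4a treats each 32-bit operand as four packed signed 8-bit lanes and accumulates their dot product into d; a minimal host-side sketch of the equivalent computation (illustration only, not part of the commit):

#include <cstdint>

// Scalar reference for what __dp4a(s0, s1, d) computes.
int32_t dp4a_reference(int32_t d, int32_t s0, int32_t s1)
{
    for(int i = 0; i < 4; ++i)
    {
        const int8_t a = static_cast<int8_t>((static_cast<uint32_t>(s0) >> (8 * i)) & 0xffu);
        const int8_t b = static_cast<int8_t>((static_cast<uint32_t>(s1) >> (8 * i)) & 0xffu);
        d += static_cast<int32_t>(a) * static_cast<int32_t>(b);
    }
    return d;
}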
@@ -37,7 +37,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
{
KernelTimer timer;
- #if DEVICE_BACKEND_HIP
+ #if CK_DEVICE_BACKEND_AMD
timer.Start();
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, 0, args...);
@@ -45,7 +45,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
timer.End();
hipGetErrorString(hipGetLastError());
- #elif DEVICE_BACKEND_CUDA
+ #elif CK_DEVICE_BACKEND_NVIDIA
const void* f = reinterpret_cast<const void*>(kernel);
void* p_args[] = {&args...};
...
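For orientation, launch_kernel wraps the backend-specific launch call and returns the measured kernel time via KernelTimer; a hedged usage sketch in which the kernel name and its pointer arguments are placeholders:

// Hypothetical call site; gridwise_convolution and p_in/p_wei/p_out are placeholders.
float elapsed_ms = launch_kernel(gridwise_convolution,
                                 dim3(64, 1, 1),  // grid dimensions
                                 dim3(256, 1, 1), // block dimensions
                                 0,               // dynamic LDS / shared memory in bytes
                                 p_in, p_wei, p_out);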
@@ -12,7 +12,7 @@ cmake
-D CMAKE_CXX_COMPILER=clang++ \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
- -D DEVICE_BACKEND=CUDA \
+ -D DEVICE_BACKEND=NVIDIA \
-D BOOST_ROOT="/package/install/boost_1.67.0" \
-D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc" \
-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \
...
@@ -4,16 +4,16 @@ rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles
- MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
+ MY_PROJECT_SOURCE=../../../
MY_PROJECT_INSTALL=../install.dir
cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Release \
- -D DEVICE_BACKEND="HIP" \
+ -D DEVICE_BACKEND="AMD" \
- -D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only" \
+ -D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only -v" \
- -D CMAKE_CXX_FLAGS="-gline-tables-only" \
+ -D CMAKE_CXX_FLAGS="-gline-tables-only --amdgpu-target=gfx906" \
- -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
+ -D CMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc \
- -D CMAKE_PREFIX_PATH="/opt/rocm;/home/package/build/mlopen_dep" \
+ -D CMAKE_PREFIX_PATH="/opt/rocm" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
${MY_PROJECT_SOURCE}
#!/bin/bash
/root/workspace/rocprofiler_pkg/bin/rpl_run.sh --timestamp on -i /root/workspace/rocprofiler_pkg/input.xml -d ./trace ./driver/driver 0 10
- configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
+ if(DEVICE_BACKEND STREQUAL "AMD")
+ configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_amd.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
+ elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
+ configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_nvidia.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
+ endif()
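Either way, downstream code keeps including a single generated header; a minimal sketch of the intended consumer side, assuming the binary include directory is on the include path:

// Hypothetical consumer of the generated header.
#include "composable_kernel/utility/config.hpp"

#if CK_DEVICE_BACKEND_AMD
// HIP-specific path
#elif CK_DEVICE_BACKEND_NVIDIA
// CUDA-specific path
#endif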
set(TENSOR_SOURCE
tensor.cpp;
@@ -9,7 +13,7 @@ add_library(tensor SHARED ${TENSOR_SOURCE})
target_compile_features(tensor PUBLIC)
set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
- if(DEVICE_BACKEND STREQUAL "CUDA")
+ if(DEVICE_BACKEND STREQUAL "NVIDIA")
target_link_libraries(tensor nvToolsExt cudart)
endif()
...
@@ -3,9 +3,9 @@
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
- #if DEVICE_BACKEND_HIP
+ #if CK_DEVICE_BACKEND_AMD
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
- #elif DEVICE_BACKEND_CUDA
+ #elif CK_DEVICE_BACKEND_NVIDIA
checkCudaErrors(cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
#endif
}
@@ -14,10 +14,10 @@ void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p)
{
- #if DEVICE_BACKEND_HIP
+ #if CK_DEVICE_BACKEND_AMD
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
- #elif DEVICE_BACKEND_CUDA
+ #elif CK_DEVICE_BACKEND_NVIDIA
checkCudaErrors(
cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice));
#endif
@@ -25,18 +25,18 @@ void DeviceMem::ToDevice(const void* p)
void DeviceMem::FromDevice(void* p)
{
- #if DEVICE_BACKEND_HIP
+ #if CK_DEVICE_BACKEND_AMD
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
- #elif DEVICE_BACKEND_CUDA
+ #elif CK_DEVICE_BACKEND_NVIDIA
checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
#endif
}
DeviceMem::~DeviceMem()
{
- #if DEVICE_BACKEND_HIP
+ #if CK_DEVICE_BACKEND_AMD
hipGetErrorString(hipFree(mpDeviceBuf));
- #elif DEVICE_BACKEND_CUDA
+ #elif CK_DEVICE_BACKEND_NVIDIA
checkCudaErrors(cudaFree(mpDeviceBuf));
#endif
}
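For context, DeviceMem is a small RAII wrapper around the backend allocation and copy calls shown above; a hedged usage sketch in which the host buffers and sizes are placeholders:

#include <vector>

// Hypothetical host-side round trip through DeviceMem.
void device_mem_example()
{
    std::vector<float> in_host(1024, 1.0f), out_host(1024, 0.0f);

    DeviceMem in_dev(in_host.size() * sizeof(float));
    DeviceMem out_dev(out_host.size() * sizeof(float));

    in_dev.ToDevice(in_host.data());     // copy input into the device buffer
    // ... launch a kernel reading in_dev.GetDeviceBuffer() and writing out_dev.GetDeviceBuffer() ...
    out_dev.FromDevice(out_host.data()); // copy the result back to the host
}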
@@ -45,10 +45,10 @@ struct KernelTimerImpl
{
KernelTimerImpl()
{
- #if DEVICE_BACKEND_HIP
+ #if CK_DEVICE_BACKEND_AMD
hipEventCreate(&mStart);
hipEventCreate(&mEnd);
- #elif DEVICE_BACKEND_CUDA
+ #elif CK_DEVICE_BACKEND_NVIDIA
cudaEventCreate(&mStart);
cudaEventCreate(&mEnd);
#endif
@@ -56,10 +56,10 @@ struct KernelTimerImpl
~KernelTimerImpl()
{
- #if DEVICE_BACKEND_HIP
+ #if CK_DEVICE_BACKEND_AMD
hipEventDestroy(mStart);
hipEventDestroy(mEnd);
- #elif DEVICE_BACKEND_CUDA
+ #elif CK_DEVICE_BACKEND_NVIDIA
cudaEventDestroy(mStart);
cudaEventDestroy(mEnd);
#endif
@@ -67,19 +67,19 @@ struct KernelTimerImpl
void Start()
{
- #if DEVICE_BACKEND_HIP
+ #if CK_DEVICE_BACKEND_AMD
hipEventRecord(mStart, 0);
- #elif DEVICE_BACKEND_CUDA
+ #elif CK_DEVICE_BACKEND_NVIDIA
cudaEventRecord(mStart, 0);
#endif
}
void End()
{
- #if DEVICE_BACKEND_HIP
+ #if CK_DEVICE_BACKEND_AMD
hipEventRecord(mEnd, 0);
hipEventSynchronize(mEnd);
- #elif DEVICE_BACKEND_CUDA
+ #elif CK_DEVICE_BACKEND_NVIDIA
cudaEventRecord(mEnd, 0);
cudaEventSynchronize(mEnd);
#endif
@@ -88,17 +88,17 @@ struct KernelTimerImpl
float GetElapsedTime() const
{
float time;
- #if DEVICE_BACKEND_HIP
+ #if CK_DEVICE_BACKEND_AMD
hipEventElapsedTime(&time, mStart, mEnd);
- #elif DEVICE_BACKEND_CUDA
+ #elif CK_DEVICE_BACKEND_NVIDIA
cudaEventElapsedTime(&time, mStart, mEnd);
#endif
return time;
}
- #if DEVICE_BACKEND_HIP
+ #if CK_DEVICE_BACKEND_AMD
hipEvent_t mStart, mEnd;
- #elif DEVICE_BACKEND_CUDA
+ #elif CK_DEVICE_BACKEND_NVIDIA
cudaEvent_t mStart, mEnd;
#endif
};
...
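For reference, the event-based timing above is driven through the KernelTimer front end seen in launch_kernel; a minimal sketch of the pattern, assuming KernelTimer forwards Start/End/GetElapsedTime to KernelTimerImpl:

// Hypothetical direct use of the timing pattern.
KernelTimer timer;
timer.Start();                     // record the start event on stream 0
// ... enqueue device work here ...
timer.End();                       // record the end event and synchronize on it
float ms = timer.GetElapsedTime(); // elapsed time between the two events, in milliseconds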