"vscode:/vscode.git/clone" did not exist on "3e30c75f3e2c34a9cb87835918825d00f3d78bff"
Commit c82b833d authored by Chao Liu

change build

parent f2b92ba9
@@ -32,13 +32,13 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
#GPU backend
if(DEVICE_BACKEND STREQUAL "HIP")
set(DEVICE_BACKEND_HIP 1)
if(DEVICE_BACKEND STREQUAL "AMD")
set(CK_DEVICE_BACKEND_AMD 1)
set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH})
find_package(HIP REQUIRED)
elseif(DEVICE_BACKEND STREQUAL "CUDA")
set(DEVICE_BACKEND_CUDA 1)
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set(CK_DEVICE_BACKEND_NVIDIA 1)
enable_language(CUDA)
include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR})
......
if(DEVICE_BACKEND STREQUAL "HIP")
if(DEVICE_BACKEND STREQUAL "AMD")
set(DRIVER_SOURCE driver.cpp)
elseif(DEVICE_BACKEND STREQUAL "CUDA")
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
set(DRIVER_SOURCE driver.cu)
endif()
......
#ifndef CK_CONFIG_HPP
#define CK_CONFIG_HPP
#cmakedefine01 CK_DEVICE_BACKEND_AMD
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#define CK_USE_AMD_INLINE_ASM 1
namespace ck {
// For some reason, the HIP compiler needs this definition to generate optimal load and store
// instructions
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
using index_t = uint32_t;
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
{
d += s0 * s1;
}
#if 0
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
{
d += s0.x * s1.x;
d += s0.y * s1.y;
}
__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
{
d += s0.x * s1.x + s0.y * s1.y;
}
#endif
} // namespace ck
#endif
#ifndef CK_CONFIG_HPP
#define CK_CONFIG_HPP
#ifndef CK_CONFIG_CUDA_HPP
#define CK_CONFIG_CUDA_HPP
#cmakedefine01 DEVICE_BACKEND_HIP
#cmakedefine01 DEVICE_BACKEND_CUDA
#cmakedefine01 CK_DEVICE_BACKEND_NVIDIA
#if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#define CK_USE_AMD_INLINE_ASM 1
#elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#define CK_USE_AMD_INLINE_ASM 0
#endif
namespace ck {
#if DEVICE_BACKEND_HIP
// For some reason, the HIP compiler needs this definition to generate optimal load and store
// instructions
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
#else
// For some reason, CUDA needs this definition; otherwise the compiler won't generate
// optimal load and store instructions, and the kernel produces wrong results, indicating
// that the compiler fails to generate correct instructions
using float2_t = float2;
using float4_t = float4;
#endif
using index_t = uint32_t;
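To illustrate why the vector typedefs above matter (this sketch is not part of the commit; copy_float4, p_src, p_dst, and n4 are hypothetical names), moving data through float4_t lets the compiler emit one 128-bit load and one 128-bit store per iteration instead of four scalar ones, on either backend:

// Hypothetical sketch, not from this commit: vectorized copy through float4_t.
// Assumes the float4_t alias defined above and 16-byte aligned pointers.
__device__ void copy_float4(const float* p_src, float* p_dst, index_t n4)
{
    const float4_t* p_src_v = reinterpret_cast<const float4_t*>(p_src);
    float4_t* p_dst_v       = reinterpret_cast<float4_t*>(p_dst);

    for(index_t i = 0; i < n4; ++i)
        p_dst_v[i] = p_src_v[i]; // one vectorized load and one vectorized store
}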
@@ -60,7 +45,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s
// need to make a better interface
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
{
#if DEVICE_BACKEND_CUDA
#if CK_DEVICE_BACKEND_NVIDIA
d = __dp4a(s0, s1, d);
#endif
}
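For readers unfamiliar with __dp4a: on the NVIDIA backend (sm_61 and newer) it treats each 32-bit operand as four packed signed 8-bit lanes and adds their dot product to the accumulator. The following scalar sketch is only an illustration of that arithmetic, not code from this commit:

// Hypothetical reference implementation of the __dp4a arithmetic used above.
#include <cstdint>

inline std::int32_t dp4a_reference(std::int32_t d, std::int32_t s0, std::int32_t s1)
{
    for(int lane = 0; lane < 4; ++lane)
    {
        auto a = static_cast<std::int8_t>(s0 >> (8 * lane)); // lane-th signed byte of s0
        auto b = static_cast<std::int8_t>(s1 >> (8 * lane)); // lane-th signed byte of s1
        d += static_cast<std::int32_t>(a) * static_cast<std::int32_t>(b);
    }
    return d;
}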
......
@@ -37,7 +37,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
{
KernelTimer timer;
#if DEVICE_BACKEND_HIP
#if CK_DEVICE_BACKEND_AMD
timer.Start();
hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, 0, args...);
@@ -45,7 +45,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
timer.End();
hipGetErrorString(hipGetLastError());
#elif DEVICE_BACKEND_CUDA
#elif CK_DEVICE_BACKEND_NVIDIA
const void* f = reinterpret_cast<const void*>(kernel);
void* p_args[] = {&args...};
......
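The NVIDIA branch of launch_kernel is cut off above, so the following is only an assumption about the usual pattern, not the code in this commit: once the kernel pointer has been type-erased and the argument addresses gathered into a void* array, the CUDA runtime launch typically looks like this (axpy, launch_axpy, and the argument names are hypothetical):

#include "cuda_runtime.h"

// Hypothetical example kernel and launcher, illustrating the cudaLaunchKernel pattern.
__global__ void axpy(float a, const float* p_x, float* p_y)
{
    p_y[threadIdx.x] += a * p_x[threadIdx.x];
}

inline void launch_axpy(float a, const float* p_x, float* p_y, dim3 grid_dim, dim3 block_dim)
{
    const void* f  = reinterpret_cast<const void*>(&axpy);
    void* p_args[] = {&a, &p_x, &p_y}; // addresses of the kernel arguments, in order
    cudaLaunchKernel(f, grid_dim, block_dim, p_args, 0 /* dynamic shared mem bytes */, 0 /* stream */);
}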
@@ -12,7 +12,7 @@ cmake
-D CMAKE_CXX_COMPILER=clang++ \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D DEVICE_BACKEND=CUDA \
-D DEVICE_BACKEND=NVIDIA \
-D BOOST_ROOT="/package/install/boost_1.67.0" \
-D CUDA_COMMON_INCLUDE_DIR="/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc" \
-D CMAKE_CUDA_FLAGS="-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128" \
......
@@ -4,16 +4,16 @@ rm -f CMakeCache.txt
rm -f *.cmake
rm -rf CMakeFiles
MY_PROJECT_SOURCE=/home/chao/code/modular_convolution
MY_PROJECT_SOURCE=../../../
MY_PROJECT_INSTALL=../install.dir
cmake \
-D CMAKE_INSTALL_PREFIX=${MY_PROJECT_INSTALL} \
-D CMAKE_BUILD_TYPE=Release \
-D DEVICE_BACKEND="HIP" \
-D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only" \
-D CMAKE_CXX_FLAGS="-gline-tables-only" \
-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \
-D CMAKE_PREFIX_PATH="/opt/rocm;/home/package/build/mlopen_dep" \
-D DEVICE_BACKEND="AMD" \
-D HIP_HIPCC_FLAGS="${HIP_HIPCC_FLAGS} -gline-tables-only -v" \
-D CMAKE_CXX_FLAGS="-gline-tables-only --amdgpu-target=gfx906" \
-D CMAKE_CXX_COMPILER=/opt/rocm/hip/bin/hipcc \
-D CMAKE_PREFIX_PATH="/opt/rocm" \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
${MY_PROJECT_SOURCE}
#!/bin/bash
/root/workspace/rocprofiler_pkg/bin/rpl_run.sh --timestamp on -i /root/workspace/rocprofiler_pkg/input.xml -d ./trace ./driver/driver 0 10
configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
if(DEVICE_BACKEND STREQUAL "AMD")
configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_amd.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
elseif(DEVICE_BACKEND STREQUAL "NVIDIA")
configure_file("${PROJECT_SOURCE_DIR}/include/composable_kernel/utility/config_nvidia.hpp.in" "${PROJECT_BINARY_DIR}/include/composable_kernel/utility/config.hpp")
endif()
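For reference (illustrative, not part of the commit), configure_file rewrites each #cmakedefine01 line into a plain 0/1 macro. With DEVICE_BACKEND=AMD, where CMakeLists.txt sets CK_DEVICE_BACKEND_AMD to 1, the relevant line of the generated config.hpp becomes:

#define CK_DEVICE_BACKEND_AMD 1 // expanded from "#cmakedefine01 CK_DEVICE_BACKEND_AMD"; would be 0 if the variable were unset or false

That 0/1 expansion is why the runtime sources test these macros with #if rather than #ifdef.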
set(TENSOR_SOURCE
tensor.cpp;
@@ -9,7 +13,7 @@ add_library(tensor SHARED ${TENSOR_SOURCE})
target_compile_features(tensor PUBLIC)
set_target_properties(tensor PROPERTIES POSITION_INDEPENDENT_CODE ON)
if(DEVICE_BACKEND STREQUAL "CUDA")
if(DEVICE_BACKEND STREQUAL "NVIDIA")
target_link_libraries(tensor nvToolsExt cudart)
endif()
......
@@ -3,9 +3,9 @@
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
#if DEVICE_BACKEND_HIP
#if CK_DEVICE_BACKEND_AMD
hipGetErrorString(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
#elif DEVICE_BACKEND_CUDA
#elif CK_DEVICE_BACKEND_NVIDIA
checkCudaErrors(cudaMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
#endif
}
@@ -14,10 +14,10 @@ void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
void DeviceMem::ToDevice(const void* p)
{
#if DEVICE_BACKEND_HIP
#if CK_DEVICE_BACKEND_AMD
hipGetErrorString(
hipMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, hipMemcpyHostToDevice));
#elif DEVICE_BACKEND_CUDA
#elif CK_DEVICE_BACKEND_NVIDIA
checkCudaErrors(
cudaMemcpy(mpDeviceBuf, const_cast<void*>(p), mMemSize, cudaMemcpyHostToDevice));
#endif
@@ -25,18 +25,18 @@ void DeviceMem::ToDevice(const void* p)
void DeviceMem::FromDevice(void* p)
{
#if DEVICE_BACKEND_HIP
#if CK_DEVICE_BACKEND_AMD
hipGetErrorString(hipMemcpy(p, mpDeviceBuf, mMemSize, hipMemcpyDeviceToHost));
#elif DEVICE_BACKEND_CUDA
#elif CK_DEVICE_BACKEND_NVIDIA
checkCudaErrors(cudaMemcpy(p, mpDeviceBuf, mMemSize, cudaMemcpyDeviceToHost));
#endif
}
DeviceMem::~DeviceMem()
{
#if DEVICE_BACKEND_HIP
#if CK_DEVICE_BACKEND_AMD
hipGetErrorString(hipFree(mpDeviceBuf));
#elif DEVICE_BACKEND_CUDA
#elif CK_DEVICE_BACKEND_NVIDIA
checkCudaErrors(cudaFree(mpDeviceBuf));
#endif
}
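A minimal usage sketch of DeviceMem (not part of this commit; the buffer size and names are arbitrary), showing the host-to-device round trip that the methods above implement, with the backend-specific allocation and copies hidden behind one interface:

#include <vector>

// Hypothetical example: move data to the device, run some kernel, copy it back.
void device_mem_round_trip()
{
    std::vector<float> host_in(1024, 1.0f);
    std::vector<float> host_out(1024, 0.0f);

    DeviceMem buf(host_in.size() * sizeof(float)); // hipMalloc or cudaMalloc, per backend
    buf.ToDevice(host_in.data());                  // host -> device copy
    // ... launch a kernel that reads/writes buf.GetDeviceBuffer() ...
    buf.FromDevice(host_out.data());               // device -> host copy
}   // ~DeviceMem releases the device buffer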
@@ -45,10 +45,10 @@ struct KernelTimerImpl
{
KernelTimerImpl()
{
#if DEVICE_BACKEND_HIP
#if CK_DEVICE_BACKEND_AMD
hipEventCreate(&mStart);
hipEventCreate(&mEnd);
#elif DEVICE_BACKEND_CUDA
#elif CK_DEVICE_BACKEND_NVIDIA
cudaEventCreate(&mStart);
cudaEventCreate(&mEnd);
#endif
@@ -56,10 +56,10 @@ struct KernelTimerImpl
~KernelTimerImpl()
{
#if DEVICE_BACKEND_HIP
#if CK_DEVICE_BACKEND_AMD
hipEventDestroy(mStart);
hipEventDestroy(mEnd);
#elif DEVICE_BACKEND_CUDA
#elif CK_DEVICE_BACKEND_NVIDIA
cudaEventDestroy(mStart);
cudaEventDestroy(mEnd);
#endif
@@ -67,19 +67,19 @@ struct KernelTimerImpl
void Start()
{
#if DEVICE_BACKEND_HIP
#if CK_DEVICE_BACKEND_AMD
hipEventRecord(mStart, 0);
#elif DEVICE_BACKEND_CUDA
#elif CK_DEVICE_BACKEND_NVIDIA
cudaEventRecord(mStart, 0);
#endif
}
void End()
{
#if DEVICE_BACKEND_HIP
#if CK_DEVICE_BACKEND_AMD
hipEventRecord(mEnd, 0);
hipEventSynchronize(mEnd);
#elif DEVICE_BACKEND_CUDA
#elif CK_DEVICE_BACKEND_NVIDIA
cudaEventRecord(mEnd, 0);
cudaEventSynchronize(mEnd);
#endif
@@ -88,17 +88,17 @@ struct KernelTimerImpl
float GetElapsedTime() const
{
float time;
#if DEVICE_BACKEND_HIP
#if CK_DEVICE_BACKEND_AMD
hipEventElapsedTime(&time, mStart, mEnd);
#elif DEVICE_BACKEND_CUDA
#elif CK_DEVICE_BACKEND_NVIDIA
cudaEventElapsedTime(&time, mStart, mEnd);
#endif
return time;
}
#if DEVICE_BACKEND_HIP
#if CK_DEVICE_BACKEND_AMD
hipEvent_t mStart, mEnd;
#elif DEVICE_BACKEND_CUDA
#elif CK_DEVICE_BACKEND_NVIDIA
cudaEvent_t mStart, mEnd;
#endif
};
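Putting the pieces together (a hypothetical sketch, not code from this commit): launch_kernel shown earlier brackets the backend-specific launch with this timer's Start() and End(), so a caller presumably gets back the elapsed milliseconds measured by the events; some_kernel and its arguments below are made up.

#include <cstdio>

__global__ void some_kernel(const float* p_in, float* p_out) { /* hypothetical kernel body */ }

void profile_some_kernel(const float* p_in, float* p_out)
{
    dim3 grid_dim(64, 1, 1);
    dim3 block_dim(256, 1, 1);

    // 0 bytes of dynamic LDS/shared memory; remaining arguments are forwarded to the kernel.
    float elapsed_ms = launch_kernel(some_kernel, grid_dim, block_dim, 0, p_in, p_out);
    std::printf("kernel time: %f ms\n", elapsed_ms);
}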
......