Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
composable_kernel
Commits
c82b833d
Commit
c82b833d
authored
Jun 12, 2019
by
Chao Liu
Browse files
change build
parent
f2b92ba9
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
89 additions
and
56 deletions
+89
-56
CMakeLists.txt
CMakeLists.txt
+4
-4
driver/CMakeLists.txt
driver/CMakeLists.txt
+2
-2
include/composable_kernel/utility/config_amd.hpp.in
include/composable_kernel/utility/config_amd.hpp.in
+41
-0
include/composable_kernel/utility/config_nvidia.hpp.in
include/composable_kernel/utility/config_nvidia.hpp.in
+4
-19
include/device.hpp
include/device.hpp
+2
-2
script/cmake-cuda.sh
script/cmake-cuda.sh
+1
-1
script/cmake-hip.sh
script/cmake-hip.sh
+6
-6
script/trace.sh
script/trace.sh
+3
-0
src/CMakeLists.txt
src/CMakeLists.txt
+6
-2
src/device.cpp
src/device.cpp
+20
-20
No files found.
CMakeLists.txt
View file @
c82b833d
...
@@ -32,13 +32,13 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
...
@@ -32,13 +32,13 @@ if( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") )
endif
(
NOT
(
${
CMAKE_CXX_COMPILER_ID
}
STREQUAL
"AppleClang"
)
)
endif
(
NOT
(
${
CMAKE_CXX_COMPILER_ID
}
STREQUAL
"AppleClang"
)
)
#GPU backend
#GPU backend
if
(
DEVICE_BACKEND STREQUAL
"
HIP
"
)
if
(
DEVICE_BACKEND STREQUAL
"
AMD
"
)
set
(
DEVICE_BACKEND_
HIP
1
)
set
(
CK_
DEVICE_BACKEND_
AMD
1
)
set
(
CMAKE_MODULE_PATH
"/opt/rocm/hip/cmake"
${
CMAKE_MODULE_PATH
}
)
set
(
CMAKE_MODULE_PATH
"/opt/rocm/hip/cmake"
${
CMAKE_MODULE_PATH
}
)
find_package
(
HIP REQUIRED
)
find_package
(
HIP REQUIRED
)
elseif
(
DEVICE_BACKEND STREQUAL
"
CUD
A"
)
elseif
(
DEVICE_BACKEND STREQUAL
"
NVIDI
A"
)
set
(
DEVICE_BACKEND_
CUD
A 1
)
set
(
CK_
DEVICE_BACKEND_
NVIDI
A 1
)
enable_language
(
CUDA
)
enable_language
(
CUDA
)
include_directories
(
BEFORE
${
CUDA_COMMON_INCLUDE_DIR
}
)
include_directories
(
BEFORE
${
CUDA_COMMON_INCLUDE_DIR
}
)
...
...
driver/CMakeLists.txt
View file @
c82b833d
if
(
DEVICE_BACKEND STREQUAL
"
HIP
"
)
if
(
DEVICE_BACKEND STREQUAL
"
AMD
"
)
set
(
DRIVER_SOURCE driver.cpp
)
set
(
DRIVER_SOURCE driver.cpp
)
elseif
(
DEVICE_BACKEND STREQUAL
"
CUD
A"
)
elseif
(
DEVICE_BACKEND STREQUAL
"
NVIDI
A"
)
set
(
DRIVER_SOURCE driver.cu
)
set
(
DRIVER_SOURCE driver.cu
)
endif
()
endif
()
...
...
include/composable_kernel/utility/config_amd.hpp.in
0 → 100644
View file @
c82b833d
#ifndef CK_CONFIG_HPP
#define CK_CONFIG_HPP
#cmakedefine01 CK_DEVICE_BACKEND_AMD
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#define CK_USE_AMD_INLINE_ASM 1
namespace ck {
// For some reason, HIP compiler need this definition to generate optimal load and store
// instruction
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
using index_t = uint32_t;
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
{
d += s0 * s1;
}
#if 0
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
{
d += s0.x * s1.x;
d += s0.y * s1.y;
}
__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
{
d += s0.x * s1.x + s0.y * s1.y;
}
#endif
} // namespace ck
#endif
include/composable_kernel/utility/config.hpp.in
→
include/composable_kernel/utility/config
_nvidia
.hpp.in
View file @
c82b833d
#ifndef CK_CONFIG_HPP
#ifndef CK_CONFIG_
CUDA_
HPP
#define CK_CONFIG_HPP
#define CK_CONFIG_
CUDA_
HPP
#cmakedefine01 DEVICE_BACKEND_HIP
#cmakedefine01 CK_DEVICE_BACKEND_NVIDIA
#cmakedefine01 DEVICE_BACKEND_CUDA
#if DEVICE_BACKEND_HIP
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#define CK_USE_AMD_INLINE_ASM 1
#elif DEVICE_BACKEND_CUDA
#include "cuda_runtime.h"
#include "cuda_runtime.h"
#include "cuda_fp16.h"
#include "cuda_fp16.h"
#include "nvToolsExt.h"
#include "nvToolsExt.h"
#include "helper_cuda.h"
#include "helper_cuda.h"
#define CK_USE_AMD_INLINE_ASM 0
#define CK_USE_AMD_INLINE_ASM 0
#endif
namespace ck {
namespace ck {
#if DEVICE_BACKEND_HIP
// For some reason, HIP compiler need this definition to generate optimal load and store
// instruction
typedef float float2_t __attribute__((ext_vector_type(2)));
typedef float float4_t __attribute__((ext_vector_type(4)));
#else
// For some reason, CUDA need this definition, otherwise
// For some reason, CUDA need this definition, otherwise
// compiler won't generate optimal load and store instruction, and
// compiler won't generate optimal load and store instruction, and
// kernel would produce wrong result, indicating the compiler fail to generate correct
// kernel would produce wrong result, indicating the compiler fail to generate correct
// instruction,
// instruction,
using float2_t = float2;
using float2_t = float2;
using float4_t = float4;
using float4_t = float4;
#endif
using index_t = uint32_t;
using index_t = uint32_t;
...
@@ -60,7 +45,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s
...
@@ -60,7 +45,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s
// need to make a better interface
// need to make a better interface
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
{
{
#if DEVICE_BACKEND_
CUD
A
#if
CK_
DEVICE_BACKEND_
NVIDI
A
d = __dp4a(s0, s1, d);
d = __dp4a(s0, s1, d);
#endif
#endif
}
}
...
...
include/device.hpp
View file @
c82b833d
...
@@ -37,7 +37,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
...
@@ -37,7 +37,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
{
{
KernelTimer
timer
;
KernelTimer
timer
;
#if DEVICE_BACKEND_
HIP
#if
CK_
DEVICE_BACKEND_
AMD
timer
.
Start
();
timer
.
Start
();
hipLaunchKernelGGL
(
kernel
,
grid_dim
,
block_dim
,
lds_byte
,
0
,
args
...);
hipLaunchKernelGGL
(
kernel
,
grid_dim
,
block_dim
,
lds_byte
,
0
,
args
...);
...
@@ -45,7 +45,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
...
@@ -45,7 +45,7 @@ float launch_kernel(F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byt
timer
.
End
();
timer
.
End
();
hipGetErrorString
(
hipGetLastError
());
hipGetErrorString
(
hipGetLastError
());
#elif DEVICE_BACKEND_
CUD
A
#elif
CK_
DEVICE_BACKEND_
NVIDI
A
const
void
*
f
=
reinterpret_cast
<
const
void
*>
(
kernel
);
const
void
*
f
=
reinterpret_cast
<
const
void
*>
(
kernel
);
void
*
p_args
[]
=
{
&
args
...};
void
*
p_args
[]
=
{
&
args
...};
...
...
script/cmake-cuda.sh
View file @
c82b833d
...
@@ -12,7 +12,7 @@ cmake
...
@@ -12,7 +12,7 @@ cmake
-D
CMAKE_CXX_COMPILER
=
clang++
\
-D
CMAKE_CXX_COMPILER
=
clang++
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
DEVICE_BACKEND
=
CUDA
\
-D
DEVICE_BACKEND
=
NVIDIA
\
-D
BOOST_ROOT
=
"/package/install/boost_1.67.0"
\
-D
BOOST_ROOT
=
"/package/install/boost_1.67.0"
\
-D
CUDA_COMMON_INCLUDE_DIR
=
"/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc"
\
-D
CUDA_COMMON_INCLUDE_DIR
=
"/home/chao/code/test_feature/cuda_common/cuda_10.0_common/inc"
\
-D
CMAKE_CUDA_FLAGS
=
"-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128"
\
-D
CMAKE_CUDA_FLAGS
=
"-ccbin clang++ -m64 -Xcompiler -fopenmp -lineinfo --source-in-ptx -keep -Xptxas -v -gencode=arch=compute_61,code=sm_61 -Xptxas -v -Xptxas -v -maxrregcount=128"
\
...
...
script/cmake-hip.sh
View file @
c82b833d
...
@@ -4,16 +4,16 @@ rm -f CMakeCache.txt
...
@@ -4,16 +4,16 @@ rm -f CMakeCache.txt
rm
-f
*
.cmake
rm
-f
*
.cmake
rm
-rf
CMakeFiles
rm
-rf
CMakeFiles
MY_PROJECT_SOURCE
=
/home/chao/code/modular_convolution
MY_PROJECT_SOURCE
=
../../../
MY_PROJECT_INSTALL
=
../install.dir
MY_PROJECT_INSTALL
=
../install.dir
cmake
\
cmake
\
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_INSTALL_PREFIX
=
${
MY_PROJECT_INSTALL
}
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
CMAKE_BUILD_TYPE
=
Release
\
-D
DEVICE_BACKEND
=
"
HIP
"
\
-D
DEVICE_BACKEND
=
"
AMD
"
\
-D
HIP_HIPCC_FLAGS
=
"
${
HIP_HIPCC_FLAGS
}
-gline-tables-only
"
\
-D
HIP_HIPCC_FLAGS
=
"
${
HIP_HIPCC_FLAGS
}
-gline-tables-only
-v"
\
-D
CMAKE_CXX_FLAGS
=
"-gline-tables-only
"
\
-D
CMAKE_CXX_FLAGS
=
"-gline-tables-only
--amdgpu-target=gfx906"
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/bin/hipcc
\
-D
CMAKE_CXX_COMPILER
=
/opt/rocm/
hip/
bin/hipcc
\
-D
CMAKE_PREFIX_PATH
=
"/opt/rocm
;/home/package/build/mlopen_dep"
\
-D
CMAKE_PREFIX_PATH
=
"/opt/rocm
"
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
-D
CMAKE_VERBOSE_MAKEFILE:BOOL
=
ON
\
${
MY_PROJECT_SOURCE
}
${
MY_PROJECT_SOURCE
}
script/trace.sh
0 → 100755
View file @
c82b833d
#!/bin/bash
/root/workspace/rocprofiler_pkg/bin/rpl_run.sh
--timestamp
on
-i
/root/workspace/rocprofiler_pkg/input.xml
-d
./trace ./driver/driver 0 10
src/CMakeLists.txt
View file @
c82b833d
configure_file
(
"
${
PROJECT_SOURCE_DIR
}
/include/composable_kernel/utility/config.hpp.in"
"
${
PROJECT_BINARY_DIR
}
/include/composable_kernel/utility/config.hpp"
)
if
(
DEVICE_BACKEND STREQUAL
"AMD"
)
configure_file
(
"
${
PROJECT_SOURCE_DIR
}
/include/composable_kernel/utility/config_amd.hpp.in"
"
${
PROJECT_BINARY_DIR
}
/include/composable_kernel/utility/config.hpp"
)
elseif
(
DEVICE_BACKEND STREQUAL
"NVIDIA"
)
configure_file
(
"
${
PROJECT_SOURCE_DIR
}
/include/composable_kernel/utility/config_nvidia.hpp.in"
"
${
PROJECT_BINARY_DIR
}
/include/composable_kernel/utility/config.hpp"
)
endif
()
set
(
TENSOR_SOURCE
set
(
TENSOR_SOURCE
tensor.cpp;
tensor.cpp;
...
@@ -9,7 +13,7 @@ add_library(tensor SHARED ${TENSOR_SOURCE})
...
@@ -9,7 +13,7 @@ add_library(tensor SHARED ${TENSOR_SOURCE})
target_compile_features
(
tensor PUBLIC
)
target_compile_features
(
tensor PUBLIC
)
set_target_properties
(
tensor PROPERTIES POSITION_INDEPENDENT_CODE ON
)
set_target_properties
(
tensor PROPERTIES POSITION_INDEPENDENT_CODE ON
)
if
(
DEVICE_BACKEND STREQUAL
"
CUD
A"
)
if
(
DEVICE_BACKEND STREQUAL
"
NVIDI
A"
)
target_link_libraries
(
tensor nvToolsExt cudart
)
target_link_libraries
(
tensor nvToolsExt cudart
)
endif
()
endif
()
...
...
src/device.cpp
View file @
c82b833d
...
@@ -3,9 +3,9 @@
...
@@ -3,9 +3,9 @@
DeviceMem
::
DeviceMem
(
std
::
size_t
mem_size
)
:
mMemSize
(
mem_size
)
DeviceMem
::
DeviceMem
(
std
::
size_t
mem_size
)
:
mMemSize
(
mem_size
)
{
{
#if DEVICE_BACKEND_
HIP
#if
CK_
DEVICE_BACKEND_
AMD
hipGetErrorString
(
hipMalloc
(
static_cast
<
void
**>
(
&
mpDeviceBuf
),
mMemSize
));
hipGetErrorString
(
hipMalloc
(
static_cast
<
void
**>
(
&
mpDeviceBuf
),
mMemSize
));
#elif DEVICE_BACKEND_
CUD
A
#elif
CK_
DEVICE_BACKEND_
NVIDI
A
checkCudaErrors
(
cudaMalloc
(
static_cast
<
void
**>
(
&
mpDeviceBuf
),
mMemSize
));
checkCudaErrors
(
cudaMalloc
(
static_cast
<
void
**>
(
&
mpDeviceBuf
),
mMemSize
));
#endif
#endif
}
}
...
@@ -14,10 +14,10 @@ void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
...
@@ -14,10 +14,10 @@ void* DeviceMem::GetDeviceBuffer() { return mpDeviceBuf; }
void
DeviceMem
::
ToDevice
(
const
void
*
p
)
void
DeviceMem
::
ToDevice
(
const
void
*
p
)
{
{
#if DEVICE_BACKEND_
HIP
#if
CK_
DEVICE_BACKEND_
AMD
hipGetErrorString
(
hipGetErrorString
(
hipMemcpy
(
mpDeviceBuf
,
const_cast
<
void
*>
(
p
),
mMemSize
,
hipMemcpyHostToDevice
));
hipMemcpy
(
mpDeviceBuf
,
const_cast
<
void
*>
(
p
),
mMemSize
,
hipMemcpyHostToDevice
));
#elif DEVICE_BACKEND_
CUD
A
#elif
CK_
DEVICE_BACKEND_
NVIDI
A
checkCudaErrors
(
checkCudaErrors
(
cudaMemcpy
(
mpDeviceBuf
,
const_cast
<
void
*>
(
p
),
mMemSize
,
cudaMemcpyHostToDevice
));
cudaMemcpy
(
mpDeviceBuf
,
const_cast
<
void
*>
(
p
),
mMemSize
,
cudaMemcpyHostToDevice
));
#endif
#endif
...
@@ -25,18 +25,18 @@ void DeviceMem::ToDevice(const void* p)
...
@@ -25,18 +25,18 @@ void DeviceMem::ToDevice(const void* p)
void
DeviceMem
::
FromDevice
(
void
*
p
)
void
DeviceMem
::
FromDevice
(
void
*
p
)
{
{
#if DEVICE_BACKEND_
HIP
#if
CK_
DEVICE_BACKEND_
AMD
hipGetErrorString
(
hipMemcpy
(
p
,
mpDeviceBuf
,
mMemSize
,
hipMemcpyDeviceToHost
));
hipGetErrorString
(
hipMemcpy
(
p
,
mpDeviceBuf
,
mMemSize
,
hipMemcpyDeviceToHost
));
#elif DEVICE_BACKEND_
CUD
A
#elif
CK_
DEVICE_BACKEND_
NVIDI
A
checkCudaErrors
(
cudaMemcpy
(
p
,
mpDeviceBuf
,
mMemSize
,
cudaMemcpyDeviceToHost
));
checkCudaErrors
(
cudaMemcpy
(
p
,
mpDeviceBuf
,
mMemSize
,
cudaMemcpyDeviceToHost
));
#endif
#endif
}
}
DeviceMem
::~
DeviceMem
()
DeviceMem
::~
DeviceMem
()
{
{
#if DEVICE_BACKEND_
HIP
#if
CK_
DEVICE_BACKEND_
AMD
hipGetErrorString
(
hipFree
(
mpDeviceBuf
));
hipGetErrorString
(
hipFree
(
mpDeviceBuf
));
#elif DEVICE_BACKEND_
CUD
A
#elif
CK_
DEVICE_BACKEND_
NVIDI
A
checkCudaErrors
(
cudaFree
(
mpDeviceBuf
));
checkCudaErrors
(
cudaFree
(
mpDeviceBuf
));
#endif
#endif
}
}
...
@@ -45,10 +45,10 @@ struct KernelTimerImpl
...
@@ -45,10 +45,10 @@ struct KernelTimerImpl
{
{
KernelTimerImpl
()
KernelTimerImpl
()
{
{
#if DEVICE_BACKEND_
HIP
#if
CK_
DEVICE_BACKEND_
AMD
hipEventCreate
(
&
mStart
);
hipEventCreate
(
&
mStart
);
hipEventCreate
(
&
mEnd
);
hipEventCreate
(
&
mEnd
);
#elif DEVICE_BACKEND_
CUD
A
#elif
CK_
DEVICE_BACKEND_
NVIDI
A
cudaEventCreate
(
&
mStart
);
cudaEventCreate
(
&
mStart
);
cudaEventCreate
(
&
mEnd
);
cudaEventCreate
(
&
mEnd
);
#endif
#endif
...
@@ -56,10 +56,10 @@ struct KernelTimerImpl
...
@@ -56,10 +56,10 @@ struct KernelTimerImpl
~
KernelTimerImpl
()
~
KernelTimerImpl
()
{
{
#if DEVICE_BACKEND_
HIP
#if
CK_
DEVICE_BACKEND_
AMD
hipEventDestroy
(
mStart
);
hipEventDestroy
(
mStart
);
hipEventDestroy
(
mEnd
);
hipEventDestroy
(
mEnd
);
#elif DEVICE_BACKEND_
CUD
A
#elif
CK_
DEVICE_BACKEND_
NVIDI
A
cudaEventDestroy
(
mStart
);
cudaEventDestroy
(
mStart
);
cudaEventDestroy
(
mEnd
);
cudaEventDestroy
(
mEnd
);
#endif
#endif
...
@@ -67,19 +67,19 @@ struct KernelTimerImpl
...
@@ -67,19 +67,19 @@ struct KernelTimerImpl
void
Start
()
void
Start
()
{
{
#if DEVICE_BACKEND_
HIP
#if
CK_
DEVICE_BACKEND_
AMD
hipEventRecord
(
mStart
,
0
);
hipEventRecord
(
mStart
,
0
);
#elif DEVICE_BACKEND_
CUD
A
#elif
CK_
DEVICE_BACKEND_
NVIDI
A
cudaEventRecord
(
mStart
,
0
);
cudaEventRecord
(
mStart
,
0
);
#endif
#endif
}
}
void
End
()
void
End
()
{
{
#if DEVICE_BACKEND_
HIP
#if
CK_
DEVICE_BACKEND_
AMD
hipEventRecord
(
mEnd
,
0
);
hipEventRecord
(
mEnd
,
0
);
hipEventSynchronize
(
mEnd
);
hipEventSynchronize
(
mEnd
);
#elif DEVICE_BACKEND_
CUD
A
#elif
CK_
DEVICE_BACKEND_
NVIDI
A
cudaEventRecord
(
mEnd
,
0
);
cudaEventRecord
(
mEnd
,
0
);
cudaEventSynchronize
(
mEnd
);
cudaEventSynchronize
(
mEnd
);
#endif
#endif
...
@@ -88,17 +88,17 @@ struct KernelTimerImpl
...
@@ -88,17 +88,17 @@ struct KernelTimerImpl
float
GetElapsedTime
()
const
float
GetElapsedTime
()
const
{
{
float
time
;
float
time
;
#if DEVICE_BACKEND_
HIP
#if
CK_
DEVICE_BACKEND_
AMD
hipEventElapsedTime
(
&
time
,
mStart
,
mEnd
);
hipEventElapsedTime
(
&
time
,
mStart
,
mEnd
);
#elif DEVICE_BACKEND_
CUD
A
#elif
CK_
DEVICE_BACKEND_
NVIDI
A
cudaEventElapsedTime
(
&
time
,
mStart
,
mEnd
);
cudaEventElapsedTime
(
&
time
,
mStart
,
mEnd
);
#endif
#endif
return
time
;
return
time
;
}
}
#if DEVICE_BACKEND_
HIP
#if
CK_
DEVICE_BACKEND_
AMD
hipEvent_t
mStart
,
mEnd
;
hipEvent_t
mStart
,
mEnd
;
#elif DEVICE_BACKEND_
CUD
A
#elif
CK_
DEVICE_BACKEND_
NVIDI
A
cudaEvent_t
mStart
,
mEnd
;
cudaEvent_t
mStart
,
mEnd
;
#endif
#endif
};
};
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment