Commit 9a7fa123 authored by carlushuang
Browse files

support gcc with cpu only compile

parent ad09ebdb
......@@ -38,6 +38,7 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
}
};
#ifndef CK_NOGPU
// static buffer for vector
template <AddressSpaceEnum AddressSpace,
typename S,
......@@ -151,6 +152,7 @@ struct StaticBufferTupleOfVector
static_for<0, Number<numScalars>{}, 1>{}([&](auto i) { SetAsType(i, S{0}); });
}
};
#endif
template <AddressSpaceEnum AddressSpace, typename T, index_t N>
__host__ __device__ constexpr auto make_static_buffer(Number<N>)
......
#ifndef CK_SYNCHRONIZATION_AMD_HPP
#define CK_SYNCHRONIZATION_AMD_HPP
#ifndef CK_NOGPU
#include "config.hpp"
......@@ -19,3 +20,4 @@ __device__ void block_sync_lds()
} // namespace ck
#endif
#endif
#pragma once
#ifndef CK_NOGPU
#include "get_id.hpp"
namespace ck {
......@@ -16,3 +17,4 @@ struct ThisThreadBlock
};
} // namespace ck
#endif
......@@ -5,6 +5,7 @@
#include "statically_indexed_array.hpp"
#include "data_type.hpp"
#ifndef CK_NOGPU
namespace ck {
template <typename S,
......@@ -166,3 +167,4 @@ struct transpose_vectors<int8_t, NX, NY>
} // namespace ck
#endif
#endif
\ No newline at end of file
......@@ -4,12 +4,14 @@
#include <functional>
#include <thread>
#include <chrono>
#include "ck/options.hpp"
#ifndef CK_NOGPU
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#endif
#include "stream_config.hpp"
#include "ck/options.hpp"
#ifndef CK_NOGPU
inline void hip_check_error(hipError_t x)
{
if(x != hipSuccess)
......@@ -36,22 +38,6 @@ struct DeviceMem
std::size_t mMemSize;
};
// Host-side buffer whose start address is aligned to a caller-chosen
// power-of-two boundary.
//
// The object owns its allocation and releases it in the destructor, so
// copying is disabled: a copied instance would free the same block twice.
struct DeviceAlignedMemCPU
{
    DeviceAlignedMemCPU() = delete;
    // mem_size: buffer size in bytes (0 creates an empty buffer).
    // alignment: required alignment in bytes, a non-zero power of two.
    DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment);
    DeviceAlignedMemCPU(const DeviceAlignedMemCPU&) = delete;
    DeviceAlignedMemCPU& operator=(const DeviceAlignedMemCPU&) = delete;
    // Returns the aligned buffer pointer (nullptr for an empty buffer).
    void* GetDeviceBuffer();
    // Returns the size in bytes given at construction.
    std::size_t GetBufferSize();
    // Copies GetBufferSize() bytes from p into the buffer.
    void ToDevice(const void* p);
    // Copies GetBufferSize() bytes from the buffer into p.
    void FromDevice(void* p);
    // Fills the buffer with zero bytes.
    void SetZero();
    ~DeviceAlignedMemCPU();

    void* mpDeviceBuf;      // aligned start of the usable region
    std::size_t mMemSize;   // usable size in bytes
    std::size_t mAlignment; // alignment requested at construction
};
struct KernelTimerImpl;
struct KernelTimer
......@@ -65,19 +51,6 @@ struct KernelTimer
std::unique_ptr<KernelTimerImpl> impl;
};
// Forward declaration only; the definition of WallTimerImpl is not part of this header excerpt.
struct WallTimerImpl;
// Timer interface with Start()/End()/GetElapsedTime(); its state is held through
// a unique_ptr to the forward-declared WallTimerImpl.
// NOTE(review): the member definitions are not visible here, so the unit of
// GetElapsedTime() is unconfirmed — verify against the implementation.
struct WallTimer
{
WallTimer();
// Declared explicitly because WallTimerImpl is incomplete at this point.
~WallTimer();
void Start();
void End();
float GetElapsedTime() const;
std::unique_ptr<WallTimerImpl> impl;
};
using device_stream_t = hipStream_t;
template <typename... Args, typename F>
......@@ -136,6 +109,36 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
return 0;
#endif
}
#endif
// Host-side buffer whose start address is aligned to a caller-chosen
// power-of-two boundary.
//
// The object owns its allocation and releases it in the destructor, so
// copying is disabled: a copied instance would free the same block twice.
struct DeviceAlignedMemCPU
{
    DeviceAlignedMemCPU() = delete;
    // mem_size: buffer size in bytes (0 creates an empty buffer).
    // alignment: required alignment in bytes, a non-zero power of two.
    DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment);
    DeviceAlignedMemCPU(const DeviceAlignedMemCPU&) = delete;
    DeviceAlignedMemCPU& operator=(const DeviceAlignedMemCPU&) = delete;
    // Returns the aligned buffer pointer (nullptr for an empty buffer).
    void* GetDeviceBuffer();
    // Returns the size in bytes given at construction.
    std::size_t GetBufferSize();
    // Copies GetBufferSize() bytes from p into the buffer.
    void ToDevice(const void* p);
    // Copies GetBufferSize() bytes from the buffer into p.
    void FromDevice(void* p);
    // Fills the buffer with zero bytes.
    void SetZero();
    ~DeviceAlignedMemCPU();

    void* mpDeviceBuf;      // aligned start of the usable region
    std::size_t mMemSize;   // usable size in bytes
    std::size_t mAlignment; // alignment requested at construction
};
// Forward declaration only; the definition of WallTimerImpl is not part of this header excerpt.
struct WallTimerImpl;
// Timer interface with Start()/End()/GetElapsedTime(); its state is held through
// a unique_ptr to the forward-declared WallTimerImpl.
// NOTE(review): the member definitions are not visible here, so the unit of
// GetElapsedTime() is unconfirmed — verify against the implementation.
struct WallTimer
{
WallTimer();
// Declared explicitly because WallTimerImpl is incomplete at this point.
~WallTimer();
void Start();
void End();
float GetElapsedTime() const;
std::unique_ptr<WallTimerImpl> impl;
};
template <typename... Args, typename F>
void launch_cpu_kernel(F kernel, Args... args)
......@@ -162,4 +165,3 @@ float launch_and_time_cpu_kernel(F kernel, int nrepeat, Args... args)
return timer.GetElapsedTime() / nrepeat;
}
......@@ -8,7 +8,7 @@
#include <utility>
#include <cassert>
#include <iostream>
#include "data_type.hpp"
#include "common_header.hpp"
template <typename Range>
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
......
#include <assert.h>
#include <string.h>
#include <chrono>
#include <new>
#include "device.hpp"
#ifndef CK_NOGPU
DeviceMem::DeviceMem(std::size_t mem_size) : mMemSize(mem_size)
{
hip_check_error(hipMalloc(static_cast<void**>(&mpDeviceBuf), mMemSize));
......@@ -24,45 +27,6 @@ void DeviceMem::SetZero() { hip_check_error(hipMemset(mpDeviceBuf, 0, mMemSize))
// Releases the buffer with hipFree and passes the returned status to hip_check_error.
DeviceMem::~DeviceMem()
{
    const hipError_t free_status = hipFree(mpDeviceBuf);
    hip_check_error(free_status);
}
// Allocates a host buffer of mem_size bytes whose start address is a multiple
// of alignment.
//
// mem_size  - requested size in bytes; 0 creates an empty buffer
//             (mpDeviceBuf == nullptr) and performs no allocation.
// alignment - required alignment in bytes; must be a non-zero power of two
//             (checked by assert).
// Throws std::bad_alloc when malloc fails.
DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment)
    : mMemSize(mem_size), mAlignment(alignment)
{
    if(mem_size == 0)
    {
        mpDeviceBuf = nullptr;
        return;
    }

    assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2

    // Over-allocate so an aligned address always exists with room for one
    // pointer directly in front of it. Kept as size_t to avoid narrowing.
    const std::size_t offset = alignment - 1 + sizeof(void*);

    void* raw = malloc(mem_size + offset);
    // assert() is compiled out under NDEBUG, so the failure must be checked explicitly.
    if(raw == nullptr)
        throw std::bad_alloc();

    void** aligned =
        reinterpret_cast<void**>((reinterpret_cast<std::size_t>(raw) + offset) & ~(alignment - 1));
    aligned[-1] = raw; // stash the malloc pointer; the destructor reads it back to free()
    mpDeviceBuf = reinterpret_cast<void*>(aligned);
}
// Returns the aligned buffer pointer; nullptr when the buffer was created with size 0.
void* DeviceAlignedMemCPU::GetDeviceBuffer() { return mpDeviceBuf; }

// Returns the size in bytes given at construction.
std::size_t DeviceAlignedMemCPU::GetBufferSize() { return mMemSize; }

// Copies mMemSize bytes from p into the buffer.
void DeviceAlignedMemCPU::ToDevice(const void* p)
{
    // An empty buffer holds nullptr; memcpy with a null pointer is undefined even for length 0.
    if(mpDeviceBuf == nullptr)
        return;
    memcpy(mpDeviceBuf, p, mMemSize);
}

// Copies mMemSize bytes from the buffer into p.
void DeviceAlignedMemCPU::FromDevice(void* p)
{
    // Same null-pointer rule as ToDevice: nothing to copy from an empty buffer.
    if(mpDeviceBuf == nullptr)
        return;
    memcpy(p, mpDeviceBuf, mMemSize);
}

// Fills the whole buffer with zero bytes.
void DeviceAlignedMemCPU::SetZero()
{
    // memset on a null pointer is undefined; an empty buffer needs no work.
    if(mpDeviceBuf == nullptr)
        return;
    memset(mpDeviceBuf, 0, mMemSize);
}
// Frees the underlying allocation, if there is one.
DeviceAlignedMemCPU::~DeviceAlignedMemCPU()
{
    // Empty buffers never allocated anything.
    if(mpDeviceBuf == nullptr)
        return;

    // The constructor stored the pointer returned by malloc in the slot just
    // before the aligned address; that stored pointer is what must be freed.
    void** aligned_slot = reinterpret_cast<void**>(mpDeviceBuf);
    free(aligned_slot[-1]);
}
struct KernelTimerImpl
{
KernelTimerImpl()
......@@ -108,6 +72,46 @@ void KernelTimer::Start() { impl->Start(); }
// Forwards End() to the implementation object.
void KernelTimer::End()
{
    impl->End();
}

// Returns the elapsed time reported by the implementation object.
float KernelTimer::GetElapsedTime() const
{
    const float elapsed = impl->GetElapsedTime();
    return elapsed;
}
#endif
// Allocates a host buffer of mem_size bytes whose start address is a multiple
// of alignment.
//
// mem_size  - requested size in bytes; 0 creates an empty buffer
//             (mpDeviceBuf == nullptr) and performs no allocation.
// alignment - required alignment in bytes; must be a non-zero power of two
//             (checked by assert).
// Throws std::bad_alloc when malloc fails.
DeviceAlignedMemCPU::DeviceAlignedMemCPU(std::size_t mem_size, std::size_t alignment)
    : mMemSize(mem_size), mAlignment(alignment)
{
    if(mem_size == 0)
    {
        mpDeviceBuf = nullptr;
        return;
    }

    assert(!(alignment == 0 || (alignment & (alignment - 1)))); // check pow of 2

    // Over-allocate so an aligned address always exists with room for one
    // pointer directly in front of it. Kept as size_t to avoid narrowing.
    const std::size_t offset = alignment - 1 + sizeof(void*);

    void* raw = malloc(mem_size + offset);
    // assert() is compiled out under NDEBUG, so the failure must be checked explicitly.
    if(raw == nullptr)
        throw std::bad_alloc();

    void** aligned =
        reinterpret_cast<void**>((reinterpret_cast<std::size_t>(raw) + offset) & ~(alignment - 1));
    aligned[-1] = raw; // stash the malloc pointer; the destructor reads it back to free()
    mpDeviceBuf = reinterpret_cast<void*>(aligned);
}
// Returns the aligned buffer pointer; nullptr when the buffer was created with size 0.
void* DeviceAlignedMemCPU::GetDeviceBuffer() { return mpDeviceBuf; }

// Returns the size in bytes given at construction.
std::size_t DeviceAlignedMemCPU::GetBufferSize() { return mMemSize; }

// Copies mMemSize bytes from p into the buffer.
void DeviceAlignedMemCPU::ToDevice(const void* p)
{
    // An empty buffer holds nullptr; memcpy with a null pointer is undefined even for length 0.
    if(mpDeviceBuf == nullptr)
        return;
    memcpy(mpDeviceBuf, p, mMemSize);
}

// Copies mMemSize bytes from the buffer into p.
void DeviceAlignedMemCPU::FromDevice(void* p)
{
    // Same null-pointer rule as ToDevice: nothing to copy from an empty buffer.
    if(mpDeviceBuf == nullptr)
        return;
    memcpy(p, mpDeviceBuf, mMemSize);
}

// Fills the whole buffer with zero bytes.
void DeviceAlignedMemCPU::SetZero()
{
    // memset on a null pointer is undefined; an empty buffer needs no work.
    if(mpDeviceBuf == nullptr)
        return;
    memset(mpDeviceBuf, 0, mMemSize);
}
// Frees the underlying allocation, if there is one.
DeviceAlignedMemCPU::~DeviceAlignedMemCPU()
{
    // Empty buffers never allocated anything.
    if(mpDeviceBuf == nullptr)
        return;

    // The constructor stored the pointer returned by malloc in the slot just
    // before the aligned address; that stored pointer is what must be freed.
    void** aligned_slot = reinterpret_cast<void**>(mpDeviceBuf);
    free(aligned_slot[-1]);
}
struct WallTimerImpl
{
......
......@@ -6,8 +6,8 @@ set(DEVICE_CONV2D_FWD_CPU_INSTANCE_SOURCE
# Build the CPU conv2d forward instances as a position-independent shared library.
add_library(device_conv2d_fwd_cpu_instance SHARED ${DEVICE_CONV2D_FWD_CPU_INSTANCE_SOURCE})
target_compile_features(device_conv2d_fwd_cpu_instance PUBLIC)
set_target_properties(device_conv2d_fwd_cpu_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
# OpenMP runtime and compile flags hard-coded to the ROCm LLVM toolchain.
# NOTE(review): these two lines look like the pre-change side of the diff — confirm.
target_link_libraries(device_conv2d_fwd_cpu_instance PRIVATE /opt/rocm/llvm/lib/libomp.so)
target_compile_options(device_conv2d_fwd_cpu_instance PRIVATE -fopenmp=libomp -Wno-unused-command-line-argument)
# Same settings taken from OMP_LIBRARY / OMP_CXX_FLAG, which are defined outside this excerpt.
target_link_libraries(device_conv2d_fwd_cpu_instance PRIVATE "${OMP_LIBRARY}")
target_compile_options(device_conv2d_fwd_cpu_instance PRIVATE "${OMP_CXX_FLAG}")
# Install the shared library under lib/ and run the project's clang-tidy check on it.
install(TARGETS device_conv2d_fwd_cpu_instance LIBRARY DESTINATION lib)
clang_tidy_check(device_conv2d_fwd_cpu_instance)
#!/bin/bash
# Configures a CPU-only (CK_NOGPU=ON) Release build with g++ in ./build and
# an install prefix of ./install.dir (relative to the directory the script is
# run from).
#
# Abort on the first failing command, on use of an unset variable, and on a
# failure inside a pipeline. Without this, a failed mkdir/cd would let the
# later `rm -rf` run relative to the wrong directory.
set -euo pipefail

# Remove stale CMake state left in the current directory.
rm -f CMakeCache.txt
rm -f ./*.cmake
rm -rf CMakeFiles

AVX2_FLAGS='-m64 -mavx2 -mf16c -mfma -DHALF_ENABLE_F16C_INTRINSICS=0 '

# Start from a fresh build directory.
rm -rf build/
mkdir build
cd build

MY_PROJECT_SOURCE=..
MY_PROJECT_INSTALL=../install.dir

# Recreate the install directory; quoted so the paths are never word-split.
rm -rf "${MY_PROJECT_INSTALL}"
mkdir "${MY_PROJECT_INSTALL}"

cmake \
-D CMAKE_INSTALL_PREFIX="${MY_PROJECT_INSTALL}" \
-D BUILD_DEV=OFF \
-D CMAKE_BUILD_TYPE=Release \
-D CMAKE_CXX_FLAGS="$AVX2_FLAGS " \
-D CMAKE_CXX_COMPILER=g++ \
-D CMAKE_PREFIX_PATH=/usr/local \
-D CMAKE_VERBOSE_MAKEFILE:BOOL=ON \
-D CK_NOGPU=ON \
"${MY_PROJECT_SOURCE}"
# Test executable for the CPU conv2d forward path; links host_tensor and the CPU instance library.
# NOTE(review): this copy hard-codes the ROCm LLVM OpenMP paths and looks like the
# pre-change side of the diff — confirm.
add_test_executable(test_conv2d_fwd_cpu conv2d_fwd_cpu.cpp)
target_link_libraries(test_conv2d_fwd_cpu PRIVATE host_tensor)
target_link_libraries(test_conv2d_fwd_cpu PRIVATE device_conv2d_fwd_cpu_instance)
# 3.13 introduce target_link_directories, which is better
# The rpath points the runtime loader at the directory holding libomp.so.
set_target_properties(test_conv2d_fwd_cpu PROPERTIES LINK_FLAGS -Wl,-rpath,/opt/rocm/llvm/lib )
target_link_libraries(test_conv2d_fwd_cpu PRIVATE /opt/rocm/llvm/lib/libomp.so)
target_compile_options(test_conv2d_fwd_cpu PRIVATE -fopenmp=libomp -Wno-unused-command-line-argument)
# Test executable for the CPU conv2d forward path; links host_tensor and the CPU instance library.
add_test_executable(test_conv2d_fwd_cpu conv2d_fwd_cpu.cpp)
target_link_libraries(test_conv2d_fwd_cpu PRIVATE host_tensor)
target_link_libraries(test_conv2d_fwd_cpu PRIVATE device_conv2d_fwd_cpu_instance)
# 3.13 introduce target_link_directories, which is better
# OpenMP link flags, library and compile flags come from OMP_LINK_FLAG / OMP_LIBRARY /
# OMP_CXX_FLAG, which are defined outside this excerpt.
set_target_properties(test_conv2d_fwd_cpu PROPERTIES LINK_FLAGS "${OMP_LINK_FLAG}")
target_link_libraries(test_conv2d_fwd_cpu PRIVATE "${OMP_LIBRARY}")
target_compile_options(test_conv2d_fwd_cpu PRIVATE "${OMP_CXX_FLAG}")
# Test executable for the CPU GEMM micro-kernel source cpu_gemm_uk.cpp; links host_tensor.
add_test_executable(test_cpu_gemm_uk cpu_gemm_uk.cpp)
target_link_libraries(test_cpu_gemm_uk PRIVATE host_tensor)
# 3.13 introduce target_link_directories, which is better
# OpenMP settings hard-coded to the ROCm LLVM toolchain.
# NOTE(review): these three lines look like the pre-change side of the diff — confirm.
set_target_properties(test_cpu_gemm_uk PROPERTIES LINK_FLAGS -Wl,-rpath,/opt/rocm/llvm/lib )
target_link_libraries(test_cpu_gemm_uk PRIVATE /opt/rocm/llvm/lib/libomp.so)
target_compile_options(test_cpu_gemm_uk PRIVATE -fopenmp=libomp -Wno-unused-command-line-argument)
# Same settings taken from OMP_LINK_FLAG / OMP_LIBRARY / OMP_CXX_FLAG, defined outside this excerpt.
set_target_properties(test_cpu_gemm_uk PROPERTIES LINK_FLAGS "${OMP_LINK_FLAG}")
target_link_libraries(test_cpu_gemm_uk PRIVATE "${OMP_LIBRARY}")
target_compile_options(test_cpu_gemm_uk PRIVATE "${OMP_CXX_FLAG}")
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment