Commit 9a7fa123 authored by carlushuang

Support gcc with CPU-only compilation

parent ad09ebdb
......@@ -7,6 +7,10 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
enable_testing()
option(CK_NOGPU "build without gpu backend" OFF)
if(NOT CK_NOGPU)
find_package(ROCM REQUIRED PATHS /opt/rocm)
include(ROCMInstallTargets)
......@@ -19,6 +23,7 @@ include(CheckCXXCompilerFlag)
rocm_setup_version(VERSION 1.0.0)
include(TargetFlags)
list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
endif()
## C++
enable_language(CXX)
......@@ -31,25 +36,26 @@ option(CK_TIME_KERNEL "Turning off will disable kernel timing globally" ON)
## OpenMP
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# workaround issue hipcc in rocm3.5 cannot find openmp
set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
set(OMP_CXX_FLAG -fopenmp=libomp -Wno-unused-command-line-argument)
set(OMP_LIBRARY /opt/rocm/llvm/lib/libomp.so)
set(OMP_LINK_FLAG -Wl,-rpath,/opt/rocm/llvm/lib)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
set(OMP_CXX_FLAG -fopenmp)
set(OMP_LIBRARY "")
set(OMP_LINK_FLAG -fopenmp)
else()
find_package(OpenMP REQUIRED)
endif()
message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
# message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
# message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
# message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
# message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
link_libraries(${OpenMP_gomp_LIBRARY})
link_libraries(${OpenMP_pthread_LIBRARY})
# link_libraries(${OpenMP_gomp_LIBRARY})
# link_libraries(${OpenMP_pthread_LIBRARY})
if(NOT CK_NOGPU)
## HIP
find_package(HIP REQUIRED)
# Override HIP version in config.h, if necessary.
......@@ -79,6 +85,7 @@ rocm_create_package(
MAINTAINER "MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
LDCONFIG
)
endif()
## half
set(HALF_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/external/include/half")
......@@ -94,7 +101,8 @@ elseif(CK_BACKEND STREQUAL "HIP" OR CK_BACKEND STREQUAL "HIPNOGPU")
set(CK_TIDY_ERRORS ALL)
endif()
if(NOT CK_NOGPU)
# currently clang-tidy and cppcheck also seem to need parts of the rocm environment
include(ClangTidy)
enable_clang_tidy(
CHECKS
......@@ -224,6 +232,11 @@ enable_cppcheck(
CPPCHECK=1
__linux__=1
)
else()
function(clang_tidy_check TARGET)
# no-op stub so clang_tidy_check() call sites still build when the rocm tooling is unavailable
endfunction()
endif()
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
......
#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP
#ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
#include "ck/options.hpp"
#ifdef CK_NOGPU
#define __host__
#define __device__
#else
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#endif
......@@ -26,6 +31,12 @@
#endif
#endif
#if defined(__GNUC__) && !defined(__clang__) && !defined(__llvm__)
#if __GNUC__ < 9
#error "If use gcc, need make sure use at least gcc-9"
#endif
#endif
// buffer resource
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_BUFFER_RESOURCE_3RD_DWORD -1
......
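For context, the empty __host__ / __device__ definitions added above are what make the CPU-only mode work: with the macros expanding to nothing, HIP-annotated functions become ordinary C++ that plain gcc can compile. A minimal standalone sketch of the technique (illustrative only, not part of the commit; only the CK_NOGPU macro name is taken from the diff):

#ifdef CK_NOGPU
// CPU-only build: strip the HIP attributes so the function is plain C++.
#define __host__
#define __device__
#else
// GPU build: hip_runtime.h supplies the real __host__/__device__ attributes.
#include <hip/hip_runtime.h>
#endif

__host__ __device__ inline int add(int a, int b) { return a + b; }

int main() { return add(1, 2) == 3 ? 0 : 1; } // compiles with g++ -DCK_NOGPU or with hipcc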
#pragma once
#cmakedefine01 CK_TIME_KERNEL
#cmakedefine CK_NOGPU
#pragma once
#ifndef CK_NOGPU
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#endif
struct StreamConfig
{
#ifndef CK_NOGPU
hipStream_t stream_id_ = nullptr;
#endif
bool time_kernel_ = false;
};
......@@ -79,6 +79,7 @@ struct StaticTensor
T ignored_element_scalar_;
};
#ifndef CK_NOGPU
// StaticTensor for vector
template <AddressSpaceEnum AddressSpace,
typename S,
......@@ -244,6 +245,7 @@ struct StaticTensorTupleOfVectorBuffer
const S invalid_element_scalar_value_ = S{0};
S ignored_element_scalar_;
};
#endif
template <AddressSpaceEnum AddressSpace,
typename T,
......
......@@ -277,7 +277,12 @@ struct TensorCoordinateStep
MultiIndex<NTransform> do_transforms_;
// HACK: control UpdateLowerIndex()
#if defined(__GNUC__) && !defined(__clang__) && !defined(__llvm__)
// gcc: "constexpr static data member 'update_lower_index_hack_' must have an initializer"
static constexpr UpdateLowerIndexHack update_lower_index_hack_{};
#else
static constexpr UpdateLowerIndexHack update_lower_index_hack_;
#endif
};
// TODO: How to fix this? It uses an struct instead of lambda because lambda
......
#ifndef CK_THREADWISE_GEMM_AVX2_HPP
#define CK_THREADWISE_GEMM_AVX2_HPP
#include <assert.h>
#if CK_USE_X86_INLINE_ASM == 0
#include <immintrin.h>
#endif
......@@ -122,22 +123,22 @@ struct ThreadwiseGemmAvx2_MxN_6x16
".macro vbroadcast_a%= i_k, i_m, ymm\n" // A in rax(r8, r9), lda in rcx
".if m_ABytes == 4\n"
".if m_TransA == 0\n"
"vbroadcastss_%= %%rax, 0, 0, (\\i_m + \\i_k * m_Mr) * m_ABytes, \\ymm\n"
"vbroadcastss_%= %%rax, 0, 0, ((\\i_m + \\i_k * m_Mr) * m_ABytes), \\ymm\n"
".else\n"
".if (\\i_m == 0) || (\\i_m == 1) || (\\i_m == 2)\n"
"vbroadcastss_%= %%rax, %%rcx, \\i_m, \\i_k * m_ABytes, \\ymm\n"
"vbroadcastss_%= %%rax, %%rcx, \\i_m, (\\i_k * m_ABytes), \\ymm\n"
".else\n"
"vbroadcastss_%= %%r8, %%rcx, \\i_m-3, \\i_k * m_ABytes, \\ymm\n"
"vbroadcastss_%= %%r8, %%rcx, \\i_m-3, (\\i_k * m_ABytes), \\ymm\n"
".endif\n"
".endif\n"
".else\n"
".if m_TransA == 0\n"
"vpbroadcastw_%= %%rax, 0, 0, (\\i_m + \\i_k * m_Mr) * m_ABytes, %%xmm15\n"
"vpbroadcastw_%= %%rax, 0, 0, ((\\i_m + \\i_k * m_Mr) * m_ABytes), %%xmm15\n"
".else\n"
".if (\\i_m == 0) || (\\i_m == 1) || (\\i_m == 2)\n"
"vpbroadcastw_%= %%rax, %%rcx, \\i_m, \\i_k * m_ABytes, %%xmm15\n"
"vpbroadcastw_%= %%rax, %%rcx, \\i_m, (\\i_k * m_ABytes), %%xmm15\n"
".else\n"
"vpbroadcastw_%= %%r8, %%rcx, \\i_m-3, \\i_k * m_ABytes, %%xmm15\n"
"vpbroadcastw_%= %%r8, %%rcx, \\i_m-3, (\\i_k * m_ABytes), %%xmm15\n"
".endif\n"
".endif\n"
"vcvtph2ps %%xmm15, \\ymm\n"
......@@ -147,15 +148,15 @@ struct ThreadwiseGemmAvx2_MxN_6x16
".macro vload_b%= i_k, i_n, ymm\n" // B in rbx, lda in rdx, i_n should be 0, 1
".if m_BBytes == 4\n"
".if m_TransB == 0\n"
"vmovups_%= %%rbx, %%rdx, \\i_n, \\i_k*m_BBytes*8, \\ymm\n"
"vmovups_%= %%rbx, %%rdx, \\i_n, (\\i_k*m_BBytes*8), \\ymm\n"
".else\n"
"vmovups_%= %%rbx, 0, 0, (\\i_k*m_Nr + \\i_n*8)*m_BBytes, \\ymm\n"
"vmovups_%= %%rbx, 0, 0, ((\\i_k*m_Nr + \\i_n*8)*m_BBytes), \\ymm\n"
".endif\n"
".else\n"
".if m_TransB == 0\n"
"vcvtph2ps_%= %%rbx, %%rdx, \\i_n, \\i_k*m_BBytes*8, \\ymm\n"
"vcvtph2ps_%= %%rbx, %%rdx, \\i_n, (\\i_k*m_BBytes*8), \\ymm\n"
".else\n"
"vcvtph2ps_%= %%rbx, 0, 0, (\\i_k*m_Nr + \\i_n*8)*m_BBytes, \\ymm\n"
"vcvtph2ps_%= %%rbx, 0, 0, ((\\i_k*m_Nr + \\i_n*8)*m_BBytes), \\ymm\n"
".endif\n"
".endif\n"
".endm\n"
......@@ -682,22 +683,22 @@ struct ThreadwiseGemmAvx2_MxN_4x24
".macro vbroadcast_a%= i_k, i_m, ymm\n" // A in rax(r8), lda in rcx
".if m_ABytes == 4\n"
".if m_TransA == 0\n"
"vbroadcastss_%= %%rax, 0, 0, (\\i_m + \\i_k * m_Mr) * m_ABytes, \\ymm\n"
"vbroadcastss_%= %%rax, 0, 0, ((\\i_m + \\i_k * m_Mr) * m_ABytes), \\ymm\n"
".else\n"
".if (\\i_m == 0) || (\\i_m == 1)\n"
"vbroadcastss_%= %%rax, %%rcx, \\i_m, \\i_k * m_ABytes, \\ymm\n"
"vbroadcastss_%= %%rax, %%rcx, \\i_m, (\\i_k * m_ABytes), \\ymm\n"
".else\n"
"vbroadcastss_%= %%r8, %%rcx, \\i_m-2, \\i_k * m_ABytes, \\ymm\n"
"vbroadcastss_%= %%r8, %%rcx, \\i_m-2, (\\i_k * m_ABytes), \\ymm\n"
".endif\n"
".endif\n"
".else\n"
".if m_TransA == 0\n"
"vpbroadcastw_%= %%rax, 0, 0, (\\i_m + \\i_k * m_Mr) * m_ABytes, %%xmm15\n"
"vpbroadcastw_%= %%rax, 0, 0, ((\\i_m + \\i_k * m_Mr) * m_ABytes), %%xmm15\n"
".else\n"
".if (\\i_m == 0) || (\\i_m == 1)\n"
"vpbroadcastw_%= %%rax, %%rcx, \\i_m, \\i_k * m_ABytes, %%xmm15\n"
"vpbroadcastw_%= %%rax, %%rcx, \\i_m, (\\i_k * m_ABytes), %%xmm15\n"
".else\n"
"vpbroadcastw_%= %%r8, %%rcx, \\i_m-2, \\i_k * m_ABytes, %%xmm15\n"
"vpbroadcastw_%= %%r8, %%rcx, \\i_m-2, (\\i_k * m_ABytes), %%xmm15\n"
".endif\n"
".endif\n"
"vcvtph2ps %%xmm15, \\ymm\n"
......@@ -707,15 +708,15 @@ struct ThreadwiseGemmAvx2_MxN_4x24
".macro vload_b%= i_k, i_n, ymm\n" // B in rbx, lda in rdx, i_n should be 0, 1, 2
".if m_BBytes == 4\n"
".if m_TransB == 0\n"
"vmovups_%= %%rbx, %%rdx, \\i_n, \\i_k*m_BBytes*8, \\ymm\n"
"vmovups_%= %%rbx, %%rdx, \\i_n, (\\i_k*m_BBytes*8), \\ymm\n"
".else\n"
"vmovups_%= %%rbx, 0, 0, (\\i_k*m_Nr + \\i_n*8)*m_BBytes, \\ymm\n"
"vmovups_%= %%rbx, 0, 0, ((\\i_k*m_Nr + \\i_n*8)*m_BBytes), \\ymm\n"
".endif\n"
".else\n"
".if m_TransB == 0\n"
"vcvtph2ps_%= %%rbx, %%rdx, \\i_n, \\i_k*m_BBytes*8, \\ymm\n"
"vcvtph2ps_%= %%rbx, %%rdx, \\i_n, (\\i_k*m_BBytes*8), \\ymm\n"
".else\n"
"vcvtph2ps_%= %%rbx, 0, 0, (\\i_k*m_Nr + \\i_n*8)*m_BBytes, \\ymm\n"
"vcvtph2ps_%= %%rbx, 0, 0, ((\\i_k*m_Nr + \\i_n*8)*m_BBytes), \\ymm\n"
".endif\n"
".endif\n"
".endm\n"
......
......@@ -46,7 +46,13 @@ void memcpy32_avx2(void* dst, const void* src, const ck::index_t n, const Elemen
}
if(i_n & 2)
{
#if defined(__GNUC__) && !defined(__clang__) && !defined(__llvm__)
__m128i s = _mm_loadu_si64(p_src);
__m128 v = element_op.Apply(*reinterpret_cast<__m128*>(&s));
_mm_storeu_si64(p_dst, *reinterpret_cast<__m128i*>(&v));
#else
_mm_storeu_si64(p_dst, element_op.Apply(_mm_loadu_si64(p_src)));
#endif
p_dst += 2;
p_src += 2;
}
......@@ -82,7 +88,11 @@ inline void memset32_avx2(void* dst, const int32_t value, const ck::index_t n)
}
if(i_n & 2)
{
#if defined(__GNUC__) && !defined(__clang__) && !defined(__llvm__)
_mm_storeu_si64(p_dst, *reinterpret_cast<__m128i*>(&xmm));
#else
_mm_storeu_si64(p_dst, xmm);
#endif
p_dst += 2;
}
if(i_n & 1)
......
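The two gcc-only branches above exist because gcc, unlike clang, does not implicitly convert between the integer vector type __m128i used by _mm_loadu_si64/_mm_storeu_si64 and the float vector type __m128 expected by the element op. A hedged alternative sketch using the no-op cast intrinsics (not the fix this commit uses, just an equivalent formulation of the same bit reinterpretation):

#include <immintrin.h>

// Copy two floats through an XMM register. The casts only reinterpret bits and
// generate no instructions, so this matches the reinterpret_cast workaround above.
static inline void copy2_floats(float* dst, const float* src)
{
    __m128i raw = _mm_loadu_si64(src);          // unaligned 64-bit load (integer view)
    __m128 v    = _mm_castsi128_ps(raw);        // same bits viewed as 2 packed floats
    // ... an element-wise operation on v would go here ...
    _mm_storeu_si64(dst, _mm_castps_si128(v));  // back to the integer view, store 64 bits
}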
#pragma once
#include "data_type.hpp"
#ifndef CK_NOGPU
namespace ck {
template <typename T>
......@@ -1047,3 +1048,5 @@ amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thr
}
} // namespace ck
#endif
......@@ -4,6 +4,8 @@
#include "data_type.hpp"
#include "c_style_pointer_cast.hpp"
#ifndef CK_NOGPU
// TODO: deprecate all amd_assembly_outer_product_xxx
namespace ck {
......@@ -354,3 +356,4 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
} // namespace ck
#endif
#endif
#ifndef CK_AMD_LLVM_INTRINSIC_HPP
#define CK_AMD_LLVM_INTRINSIC_HPP
#ifndef CK_NOGPU
#include "data_type.hpp"
namespace ck {
......@@ -9,3 +10,4 @@ __device__ int32_t llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.r
} // namespace ck
#endif
#endif
#ifndef CK_AMD_XDLOPS_HPP
#define CK_AMD_XDLOPS_HPP
#ifndef CK_NOGPU
#include "data_type.hpp"
namespace ck {
......@@ -296,3 +297,4 @@ struct intrin_mfma_i32_16x16x16i8<16, 16>
} // namespace ck
#endif
#endif
#pragma once
#include "statically_indexed_array.hpp"
#ifdef CK_NOGPU
#include "half.hpp"
#endif
namespace ck {
using bhalf_t = ushort;
using half_t = _Float16;
#ifdef CK_NOGPU
using half_t = half_float::half;
#else
using half_t = _Float16;
#endif
// vector_type
template <typename T, index_t N>
......@@ -14,8 +21,10 @@ struct vector_type;
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
#ifdef __clang__
template <typename T, index_t V, index_t N>
struct vector_type<T __attribute__((ext_vector_type(V))), N>;
#endif
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
......@@ -32,11 +41,13 @@ struct vector_type_maker
using type = vector_type<T, N>;
};
#ifdef __clang__
template <typename T, index_t N0, index_t N1>
struct vector_type_maker<T __attribute__((ext_vector_type(N1))), N0>
{
using type = vector_type<T, N0 * N1>;
};
#endif
template <typename T, index_t N0, index_t N1>
struct vector_type_maker<vector_type<T, N1>, N0>
......@@ -69,12 +80,14 @@ template <typename X, typename Y>
using has_same_scalar_type = is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<Y>>::type>;
#ifdef __clang__
template <typename T, index_t N>
struct scalar_type<T __attribute__((ext_vector_type(N)))>
{
using type = T;
static constexpr index_t vector_size = N;
};
#endif
template <typename T, index_t N>
struct scalar_type<vector_type<T, N>>
......
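The half_t switch above replaces _Float16 (not available in host code under the gcc versions targeted here) with half_float::half from the vendored half library for CK_NOGPU builds. A minimal sketch of why the substitution is workable (illustrative; it assumes the usual half.hpp API of 16-bit storage, float conversion, and float-backed arithmetic):

#include "half.hpp"   // external/include/half, already vendored by the project
#include <cstdio>

int main()
{
    half_float::half h{1.5f};               // 16-bit IEEE binary16 storage
    float sum = static_cast<float>(h + h);  // arithmetic round-trips through float
    std::printf("sizeof(half)=%zu sum=%f\n", sizeof(h), sum); // 2, 3.000000
    return 0;
}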
#pragma once
#include <immintrin.h>
namespace ck {
namespace cpu {
// vector_type
template <typename T, index_t N>
struct vector_type;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<T __attribute__((ext_vector_type(V))), N>;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<vector_type<T, V>, N>;
// vector_type_maker
// This is the right way to handle "vector of vectors": making a bigger vector instead
template <typename T, index_t N>
struct vector_type_maker
{
using type = vector_type<T, N>;
};
template <typename T, index_t N>
using vector_type_maker_t = typename vector_type_maker<T, N>::type;
template <typename T, index_t N>
constexpr auto make_vector_type(Number<N>)
{
return typename vector_type_maker<T, N>::type{};
}
template <>
struct vector_type<float, 1>
{
using d1_t = float;
// SSE
using type = float;
type data_;
vector_type() : data_{0} {}
// vector_type(float x) : data_{x} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{*mem} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = *mem; }
constexpr void Store(float* mem) const { *mem = data_; }
};
template <>
struct vector_type<float, 4>
{
using d1_t = float;
// SSE
using type = __m128;
type data_;
vector_type() : data_{_mm_setzero_ps()} {}
vector_type(float x) : data_{_mm_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = _mm_loadu_ps(mem); }
constexpr void Store(float* mem) const { _mm_storeu_ps(mem, data_); }
};
template <>
struct vector_type<float, 8>
{
using d1_t = float;
// SSE
using type = __m256;
type data_;
vector_type() : data_{_mm256_setzero_ps()} {}
vector_type(float x) : data_{_mm256_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm256_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = _mm256_loadu_ps(mem); }
constexpr void Store(float* mem) const { _mm256_storeu_ps(mem, data_); }
};
template <typename T>
struct to_vector_type
{
using type = T;
};
template <>
struct to_vector_type<__m128>
{
using type = vector_type<float, 4>;
};
template <>
struct to_vector_type<__m256>
{
using type = vector_type<float, 8>;
};
template <typename Tv, typename Tp>
inline void load_vector(Tv& v, const Tp* mem)
{
v = *reinterpret_cast<const Tv*>(mem);
}
template <>
inline void load_vector(__m128& v, const float* mem)
{
v = _mm_loadu_ps(mem);
}
template <>
inline void load_vector(__m256& v, const float* mem)
{
v = _mm256_loadu_ps(mem);
}
template <typename Tv, typename Tp>
inline void store_vector(const Tv& v, Tp* mem)
{
*reinterpret_cast<Tv*>(mem) = v;
}
template <>
inline void store_vector(const __m128& v, float* mem)
{
_mm_storeu_ps(mem, v);
}
template <>
inline void store_vector(const __m256& v, float* mem)
{
_mm256_storeu_ps(mem, v);
}
template <typename Tv, typename Tx>
inline void set_vector(Tv& v, const Tx x)
{
v = static_cast<const Tv>(x);
}
template <>
inline void set_vector(__m128& v, const float x)
{
v = _mm_set1_ps(x);
}
template <>
inline void set_vector(__m256& v, const float x)
{
v = _mm256_set1_ps(x);
}
template <typename Tv>
inline void clear_vector(Tv& v)
{
v = static_cast<Tv>(0);
}
template <>
inline void clear_vector(__m128& v)
{
v = _mm_setzero_ps();
}
template <>
inline void clear_vector(__m256& v)
{
v = _mm256_setzero_ps();
}
using float4_t = typename vector_type<float, 4>::type;
using float8_t = typename vector_type<float, 8>::type;
// scalar_type
template <typename TV>
struct scalar_type;
// is_scalar_type
template <typename TV>
struct is_scalar_type
{
static constexpr bool value = (scalar_type<remove_cvref_t<TV>>::vector_size == 1);
};
// has_same_scalar_type
template <typename X, typename Y>
using has_same_scalar_type = is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<Y>>::type>;
template <typename T, index_t N>
struct scalar_type<vector_type<T, N>>
{
using type = T;
static constexpr index_t vector_size = N;
};
template <>
struct scalar_type<float4_t>
{
using type = float;
static constexpr index_t vector_size = 4;
};
template <>
struct scalar_type<float8_t>
{
using type = float;
static constexpr index_t vector_size = 8;
};
//
template <>
struct scalar_type<float>
{
using type = float;
static constexpr index_t vector_size = 1;
};
} // namespace cpu
} // namespace ck
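A hypothetical usage sketch of the CPU-side vector_type wrappers defined above (the ck::cpu names come from the listing; add_one_8 and the commented include are illustrative, since the header's path is not shown in the diff):

#include <immintrin.h>
// #include "<the cpu vector-type header listed above>"

inline void add_one_8(const float* in, float* out)
{
    ck::cpu::vector_type<float, 8> v(in);            // constructor issues _mm256_loadu_ps
    auto& ymm = v.AsType<ck::cpu::float8_t>();       // access the underlying __m256
    ymm = _mm256_add_ps(ymm, _mm256_set1_ps(1.0f));  // out[i] = in[i] + 1
    v.Store(out);                                    // _mm256_storeu_ps
}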
#pragma once
#include <immintrin.h>
#include "half.hpp"
namespace ck {
namespace cpu {
// vector_type
template <typename T, index_t N>
struct vector_type;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
#ifdef __clang__
template <typename T, index_t V, index_t N>
struct vector_type<T __attribute__((ext_vector_type(V))), N>;
#endif
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<vector_type<T, V>, N>;
// vector_type_maker
// This is the right way to handle "vector of vectors": making a bigger vector instead
template <typename T, index_t N>
struct vector_type_maker
{
using type = vector_type<T, N>;
};
template <typename T, index_t N>
using vector_type_maker_t = typename vector_type_maker<T, N>::type;
template <typename T, index_t N>
constexpr auto make_vector_type(Number<N>)
{
return typename vector_type_maker<T, N>::type{};
}
template <>
struct vector_type<float, 1>
{
using d1_t = float;
// SSE
using type = float;
type data_;
vector_type() : data_{0} {}
// vector_type(float x) : data_{x} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{*mem} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = *mem; }
constexpr void Store(float* mem) const { *mem = data_; }
};
template <>
struct vector_type<float, 4>
{
using d1_t = float;
// SSE
using type = __m128;
type data_;
vector_type() : data_{_mm_setzero_ps()} {}
vector_type(float x) : data_{_mm_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
void Load(const float* mem) { data_ = _mm_loadu_ps(mem); }
void Store(float* mem) const { _mm_storeu_ps(mem, data_); }
};
template <>
struct vector_type<float, 8>
{
using d1_t = float;
// SSE
using type = __m256;
type data_;
vector_type() : data_{_mm256_setzero_ps()} {}
vector_type(float x) : data_{_mm256_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm256_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
void Load(const float* mem) { data_ = _mm256_loadu_ps(mem); }
void Store(float* mem) const { _mm256_storeu_ps(mem, data_); }
};
template <typename T>
struct to_vector_type
{
using type = T;
};
template <>
struct to_vector_type<__m128>
{
using type = vector_type<float, 4>;
};
template <>
struct to_vector_type<__m256>
{
using type = vector_type<float, 8>;
};
template <typename Tv, typename Tp>
inline void load_vector(Tv& v, const Tp* mem)
{
v = *reinterpret_cast<const Tv*>(mem);
}
template <>
inline void load_vector(__m128& v, const float* mem)
{
v = _mm_loadu_ps(mem);
}
template <>
inline void load_vector(__m256& v, const float* mem)
{
v = _mm256_loadu_ps(mem);
}
template <typename Tv, typename Tp>
inline void store_vector(const Tv& v, Tp* mem)
{
*reinterpret_cast<Tv*>(mem) = v;
}
template <>
inline void store_vector(const __m128& v, float* mem)
{
_mm_storeu_ps(mem, v);
}
template <>
inline void store_vector(const __m256& v, float* mem)
{
_mm256_storeu_ps(mem, v);
}
template <typename Tv, typename Tx>
inline void set_vector(Tv& v, const Tx x)
{
v = static_cast<const Tv>(x);
}
template <>
inline void set_vector(__m128& v, const float x)
{
v = _mm_set1_ps(x);
}
template <>
inline void set_vector(__m256& v, const float x)
{
v = _mm256_set1_ps(x);
}
template <typename Tv>
inline void clear_vector(Tv& v)
{
v = static_cast<Tv>(0);
}
template <>
inline void clear_vector(__m128& v)
{
v = _mm_setzero_ps();
}
template <>
inline void clear_vector(__m256& v)
{
v = _mm256_setzero_ps();
}
using float4_t = typename vector_type<float, 4>::type;
using float8_t = typename vector_type<float, 8>::type;
// scalar_type
template <typename TV>
struct scalar_type;
// is_scalar_type
template <typename TV>
struct is_scalar_type
{
static constexpr bool value = (scalar_type<remove_cvref_t<TV>>::vector_size == 1);
};
// has_same_scalar_type
template <typename X, typename Y>
using has_same_scalar_type = is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<Y>>::type>;
template <typename T, index_t N>
struct scalar_type<vector_type<T, N>>
{
using type = T;
static constexpr index_t vector_size = N;
};
template <>
struct scalar_type<float4_t>
{
using type = float;
static constexpr index_t vector_size = 4;
};
template <>
struct scalar_type<float8_t>
{
using type = float;
static constexpr index_t vector_size = 8;
};
//
template <>
struct scalar_type<float>
{
using type = float;
static constexpr index_t vector_size = 1;
};
} // namespace cpu
} // namespace ck
#ifndef UTILITY_DEBUG_HPP
#define UTILITY_DEBUG_HPP
#ifndef CK_NOGPU
namespace ck {
namespace debug {
......@@ -74,4 +74,5 @@ __device__ void print_shared(T const* p_shared, index_t num_elements)
} // namespace debug
} // namespace ck
#endif
#endif // UTILITY_DEBUG_HPP
......@@ -5,6 +5,7 @@
#include "amd_buffer_addressing.hpp"
#include "generic_memory_space_atomic_add.hpp"
#ifndef CK_NOGPU
namespace ck {
// T may be scalar or vector
......@@ -351,3 +352,4 @@ make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element
}
} // namespace ck
#endif
#pragma once
#ifndef CK_NOGPU
#include "data_type.hpp"
namespace ck {
......@@ -42,3 +43,4 @@ __device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x)
}
} // namespace ck
#endif
#pragma once
#include "config.hpp"
#ifndef CK_NOGPU
namespace ck {
__host__ __device__ constexpr index_t get_warp_size()
......@@ -18,3 +19,4 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
__device__ index_t get_grid_size() { return gridDim.x; }
} // namespace ck
#endif
\ No newline at end of file
......@@ -2,7 +2,7 @@
#define CK_INNER_PRODUCT_HPP
#include "data_type.hpp"
#ifndef CK_NOGPU
namespace ck {
template <typename TA, typename TB, typename TC>
......@@ -203,3 +203,4 @@ inner_product<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a, const int8x16_t
} // namespace ck
#endif
#endif
\ No newline at end of file
......@@ -118,7 +118,7 @@ struct MagicDivision
{
return CalculateMagicShift(integral_constant<uint32_t, Divisor>{});
}
#ifndef CK_NOGPU
// magic division for uint32_t
__device__ static constexpr uint32_t
DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
......@@ -126,7 +126,7 @@ struct MagicDivision
uint32_t tmp = __umulhi(dividend, multiplier);
return (tmp + dividend) >> shift;
}
#endif
__host__ static constexpr uint32_t
DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
{
......@@ -138,6 +138,7 @@ struct MagicDivision
// HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be
// non-negative for result to be correct
// TODO: figure out how to do magic number divison for int32_t as dividended
#ifndef CK_NOGPU
__device__ static constexpr int32_t
DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
{
......@@ -145,6 +146,7 @@ struct MagicDivision
uint32_t tmp = __umulhi(dividend_u32, multiplier);
return (tmp + dividend_u32) >> shift;
}
#endif
__host__ static constexpr int32_t
DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
......
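For reference, both DoMagicDivision overloads guarded above rely on the same identity: for a fixed divisor d with a precomputed (multiplier m, shift s), n / d == (umulhi(n, m) + n) >> s. A standalone host-side sketch that derives one valid (m, s) pair and checks the identity (the derivation is a textbook construction and an assumption; the commit does not show CK's own magic-number calculation, and like the int32 hack above it assumes dividends below 2^31):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Assumed construction: smallest s with 2^s >= d, then m = floor(2^32 * (2^s - d) / d) + 1.
static void make_magic(uint32_t d, uint32_t& m, uint32_t& s)
{
    s = 0;
    while((uint32_t{1} << s) < d)
        ++s;
    m = static_cast<uint32_t>(((uint64_t{1} << 32) * ((uint64_t{1} << s) - d)) / d + 1);
}

// Host-side equivalent of: tmp = __umulhi(dividend, multiplier); return (tmp + dividend) >> shift;
static uint32_t do_magic_division(uint32_t n, uint32_t m, uint32_t s)
{
    uint32_t tmp = static_cast<uint32_t>((uint64_t{n} * m) >> 32);
    return (tmp + n) >> s; // (tmp + n) stays below 2^32 as long as n < 2^31
}

int main()
{
    for(uint32_t d = 1; d < 1000; ++d)
    {
        uint32_t m, s;
        make_magic(d, m, s);
        for(uint32_t n : {0u, 1u, 7u, 12345u, 2147483647u})
            assert(do_magic_division(n, m, s) == n / d);
    }
    return 0;
}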