Commit 9a7fa123 authored by carlushuang

Support gcc with CPU-only compilation

parent ad09ebdb
......@@ -7,6 +7,10 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
enable_testing()
option(CK_NOGPU "build without gpu backend" OFF)
if(NOT CK_NOGPU)
find_package(ROCM REQUIRED PATHS /opt/rocm)
include(ROCMInstallTargets)
......@@ -19,6 +23,7 @@ include(CheckCXXCompilerFlag)
rocm_setup_version(VERSION 1.0.0)
include(TargetFlags)
list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
endif()
## C++
enable_language(CXX)
......@@ -31,25 +36,26 @@ option(CK_TIME_KERNEL "Turning off will disable kernel timing globally" ON)
## OpenMP
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# workaround issue hipcc in rocm3.5 cannot find openmp
set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
set(OMP_CXX_FLAG -fopenmp=libomp -Wno-unused-command-line-argument)
set(OMP_LIBRARY /opt/rocm/llvm/lib/libomp.so)
set(OMP_LINK_FLAG -Wl,-rpath,/opt/rocm/llvm/lib)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
set(OMP_CXX_FLAG -fopenmp)
set(OMP_LIBRARY "")
set(OMP_LINK_FLAG -fopenmp)
else()
find_package(OpenMP REQUIRED)
endif()
message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
# message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
# message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
# message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
# message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
link_libraries(${OpenMP_gomp_LIBRARY})
link_libraries(${OpenMP_pthread_LIBRARY})
# link_libraries(${OpenMP_gomp_LIBRARY})
# link_libraries(${OpenMP_pthread_LIBRARY})
if(NOT CK_NOGPU)
## HIP
find_package(HIP REQUIRED)
# Override HIP version in config.h, if necessary.
......@@ -79,6 +85,7 @@ rocm_create_package(
MAINTAINER "MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
LDCONFIG
)
endif()
## half
set(HALF_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/external/include/half")
......@@ -94,7 +101,8 @@ elseif(CK_BACKEND STREQUAL "HIP" OR CK_BACKEND STREQUAL "HIPNOGPU")
set(CK_TIDY_ERRORS ALL)
endif()
if(NOT CK_NOGPU)
# currently clang-tidy and cppcheck also seem to need parts of the rocm environment
include(ClangTidy)
enable_clang_tidy(
CHECKS
......@@ -224,6 +232,11 @@ enable_cppcheck(
CPPCHECK=1
__linux__=1
)
else()
function(clang_tidy_check TARGET)
# no-op stub so clang_tidy_check() call sites still build when the rocm tooling is unavailable
endfunction()
endif()
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
......
#ifndef CK_CONFIG_AMD_HPP
#define CK_CONFIG_AMD_HPP
#ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
#include "ck/options.hpp"
#ifdef CK_NOGPU
#define __host__
#define __device__
#else
#include "hip/hip_runtime.h"
#include "hip/hip_fp16.h"
#endif
......@@ -26,6 +31,12 @@
#endif
#endif
#if defined(__GNUC__) && !defined(__clang__) && !defined(__llvm__)
#if __GNUC__ < 9
#error "If use gcc, need make sure use at least gcc-9"
#endif
#endif
// buffer resource
#ifndef __HIP_DEVICE_COMPILE__ // for host code
#define CK_BUFFER_RESOURCE_3RD_DWORD -1
......
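For context, the empty __host__ / __device__ definitions added above are what make the CPU-only mode work: with the macros expanding to nothing, HIP-annotated functions become ordinary C++ that plain gcc can compile. A minimal standalone sketch of the technique (illustrative only, not part of the commit; only the CK_NOGPU macro name is taken from the diff):

#ifdef CK_NOGPU
// CPU-only build: strip the HIP attributes so the function is plain C++.
#define __host__
#define __device__
#else
// GPU build: hip_runtime.h supplies the real __host__/__device__ attributes.
#include <hip/hip_runtime.h>
#endif

__host__ __device__ inline int add(int a, int b) { return a + b; }

int main() { return add(1, 2) == 3 ? 0 : 1; } // compiles with g++ -DCK_NOGPU or with hipcc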
#pragma once
#cmakedefine01 CK_TIME_KERNEL
#cmakedefine CK_NOGPU
#pragma once
#ifndef CK_NOGPU
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#endif
struct StreamConfig
{
#ifndef CK_NOGPU
hipStream_t stream_id_ = nullptr;
#endif
bool time_kernel_ = false;
};
......@@ -79,6 +79,7 @@ struct StaticTensor
T ignored_element_scalar_;
};
#ifndef CK_NOGPU
// StaticTensor for vector
template <AddressSpaceEnum AddressSpace,
typename S,
......@@ -244,6 +245,7 @@ struct StaticTensorTupleOfVectorBuffer
const S invalid_element_scalar_value_ = S{0};
S ignored_element_scalar_;
};
#endif
template <AddressSpaceEnum AddressSpace,
typename T,
......
......@@ -277,7 +277,12 @@ struct TensorCoordinateStep
MultiIndex<NTransform> do_transforms_;
// HACK: control UpdateLowerIndex()
#if defined(__GNUC__) && !defined(__clang__) && !defined(__llvm__)
// gcc: "constexpr static data member 'update_lower_index_hack_' must have an initializer"
static constexpr UpdateLowerIndexHack update_lower_index_hack_{};
#else
static constexpr UpdateLowerIndexHack update_lower_index_hack_;
#endif
};
// TODO: How to fix this? It uses an struct instead of lambda because lambda
......
#ifndef CK_THREADWISE_GEMM_AVX2_HPP
#define CK_THREADWISE_GEMM_AVX2_HPP
#include <assert.h>
#if CK_USE_X86_INLINE_ASM == 0
#include <immintrin.h>
#endif
......@@ -122,22 +123,22 @@ struct ThreadwiseGemmAvx2_MxN_6x16
".macro vbroadcast_a%= i_k, i_m, ymm\n" // A in rax(r8, r9), lda in rcx
".if m_ABytes == 4\n"
".if m_TransA == 0\n"
"vbroadcastss_%= %%rax, 0, 0, (\\i_m + \\i_k * m_Mr) * m_ABytes, \\ymm\n"
"vbroadcastss_%= %%rax, 0, 0, ((\\i_m + \\i_k * m_Mr) * m_ABytes), \\ymm\n"
".else\n"
".if (\\i_m == 0) || (\\i_m == 1) || (\\i_m == 2)\n"
"vbroadcastss_%= %%rax, %%rcx, \\i_m, \\i_k * m_ABytes, \\ymm\n"
"vbroadcastss_%= %%rax, %%rcx, \\i_m, (\\i_k * m_ABytes), \\ymm\n"
".else\n"
"vbroadcastss_%= %%r8, %%rcx, \\i_m-3, \\i_k * m_ABytes, \\ymm\n"
"vbroadcastss_%= %%r8, %%rcx, \\i_m-3, (\\i_k * m_ABytes), \\ymm\n"
".endif\n"
".endif\n"
".else\n"
".if m_TransA == 0\n"
"vpbroadcastw_%= %%rax, 0, 0, (\\i_m + \\i_k * m_Mr) * m_ABytes, %%xmm15\n"
"vpbroadcastw_%= %%rax, 0, 0, ((\\i_m + \\i_k * m_Mr) * m_ABytes), %%xmm15\n"
".else\n"
".if (\\i_m == 0) || (\\i_m == 1) || (\\i_m == 2)\n"
"vpbroadcastw_%= %%rax, %%rcx, \\i_m, \\i_k * m_ABytes, %%xmm15\n"
"vpbroadcastw_%= %%rax, %%rcx, \\i_m, (\\i_k * m_ABytes), %%xmm15\n"
".else\n"
"vpbroadcastw_%= %%r8, %%rcx, \\i_m-3, \\i_k * m_ABytes, %%xmm15\n"
"vpbroadcastw_%= %%r8, %%rcx, \\i_m-3, (\\i_k * m_ABytes), %%xmm15\n"
".endif\n"
".endif\n"
"vcvtph2ps %%xmm15, \\ymm\n"
......@@ -147,15 +148,15 @@ struct ThreadwiseGemmAvx2_MxN_6x16
".macro vload_b%= i_k, i_n, ymm\n" // B in rbx, lda in rdx, i_n should be 0, 1
".if m_BBytes == 4\n"
".if m_TransB == 0\n"
"vmovups_%= %%rbx, %%rdx, \\i_n, \\i_k*m_BBytes*8, \\ymm\n"
"vmovups_%= %%rbx, %%rdx, \\i_n, (\\i_k*m_BBytes*8), \\ymm\n"
".else\n"
"vmovups_%= %%rbx, 0, 0, (\\i_k*m_Nr + \\i_n*8)*m_BBytes, \\ymm\n"
"vmovups_%= %%rbx, 0, 0, ((\\i_k*m_Nr + \\i_n*8)*m_BBytes), \\ymm\n"
".endif\n"
".else\n"
".if m_TransB == 0\n"
"vcvtph2ps_%= %%rbx, %%rdx, \\i_n, \\i_k*m_BBytes*8, \\ymm\n"
"vcvtph2ps_%= %%rbx, %%rdx, \\i_n, (\\i_k*m_BBytes*8), \\ymm\n"
".else\n"
"vcvtph2ps_%= %%rbx, 0, 0, (\\i_k*m_Nr + \\i_n*8)*m_BBytes, \\ymm\n"
"vcvtph2ps_%= %%rbx, 0, 0, ((\\i_k*m_Nr + \\i_n*8)*m_BBytes), \\ymm\n"
".endif\n"
".endif\n"
".endm\n"
......@@ -682,22 +683,22 @@ struct ThreadwiseGemmAvx2_MxN_4x24
".macro vbroadcast_a%= i_k, i_m, ymm\n" // A in rax(r8), lda in rcx
".if m_ABytes == 4\n"
".if m_TransA == 0\n"
"vbroadcastss_%= %%rax, 0, 0, (\\i_m + \\i_k * m_Mr) * m_ABytes, \\ymm\n"
"vbroadcastss_%= %%rax, 0, 0, ((\\i_m + \\i_k * m_Mr) * m_ABytes), \\ymm\n"
".else\n"
".if (\\i_m == 0) || (\\i_m == 1)\n"
"vbroadcastss_%= %%rax, %%rcx, \\i_m, \\i_k * m_ABytes, \\ymm\n"
"vbroadcastss_%= %%rax, %%rcx, \\i_m, (\\i_k * m_ABytes), \\ymm\n"
".else\n"
"vbroadcastss_%= %%r8, %%rcx, \\i_m-2, \\i_k * m_ABytes, \\ymm\n"
"vbroadcastss_%= %%r8, %%rcx, \\i_m-2, (\\i_k * m_ABytes), \\ymm\n"
".endif\n"
".endif\n"
".else\n"
".if m_TransA == 0\n"
"vpbroadcastw_%= %%rax, 0, 0, (\\i_m + \\i_k * m_Mr) * m_ABytes, %%xmm15\n"
"vpbroadcastw_%= %%rax, 0, 0, ((\\i_m + \\i_k * m_Mr) * m_ABytes), %%xmm15\n"
".else\n"
".if (\\i_m == 0) || (\\i_m == 1)\n"
"vpbroadcastw_%= %%rax, %%rcx, \\i_m, \\i_k * m_ABytes, %%xmm15\n"
"vpbroadcastw_%= %%rax, %%rcx, \\i_m, (\\i_k * m_ABytes), %%xmm15\n"
".else\n"
"vpbroadcastw_%= %%r8, %%rcx, \\i_m-2, \\i_k * m_ABytes, %%xmm15\n"
"vpbroadcastw_%= %%r8, %%rcx, \\i_m-2, (\\i_k * m_ABytes), %%xmm15\n"
".endif\n"
".endif\n"
"vcvtph2ps %%xmm15, \\ymm\n"
......@@ -707,15 +708,15 @@ struct ThreadwiseGemmAvx2_MxN_4x24
".macro vload_b%= i_k, i_n, ymm\n" // B in rbx, lda in rdx, i_n should be 0, 1, 2
".if m_BBytes == 4\n"
".if m_TransB == 0\n"
"vmovups_%= %%rbx, %%rdx, \\i_n, \\i_k*m_BBytes*8, \\ymm\n"
"vmovups_%= %%rbx, %%rdx, \\i_n, (\\i_k*m_BBytes*8), \\ymm\n"
".else\n"
"vmovups_%= %%rbx, 0, 0, (\\i_k*m_Nr + \\i_n*8)*m_BBytes, \\ymm\n"
"vmovups_%= %%rbx, 0, 0, ((\\i_k*m_Nr + \\i_n*8)*m_BBytes), \\ymm\n"
".endif\n"
".else\n"
".if m_TransB == 0\n"
"vcvtph2ps_%= %%rbx, %%rdx, \\i_n, \\i_k*m_BBytes*8, \\ymm\n"
"vcvtph2ps_%= %%rbx, %%rdx, \\i_n, (\\i_k*m_BBytes*8), \\ymm\n"
".else\n"
"vcvtph2ps_%= %%rbx, 0, 0, (\\i_k*m_Nr + \\i_n*8)*m_BBytes, \\ymm\n"
"vcvtph2ps_%= %%rbx, 0, 0, ((\\i_k*m_Nr + \\i_n*8)*m_BBytes), \\ymm\n"
".endif\n"
".endif\n"
".endm\n"
......
......@@ -46,7 +46,13 @@ void memcpy32_avx2(void* dst, const void* src, const ck::index_t n, const Elemen
}
if(i_n & 2)
{
#if defined(__GNUC__) && !defined(__clang__) && !defined(__llvm__)
__m128i s = _mm_loadu_si64(p_src);
__m128 v = element_op.Apply(*reinterpret_cast<__m128*>(&s));
_mm_storeu_si64(p_dst, *reinterpret_cast<__m128i*>(&v));
#else
_mm_storeu_si64(p_dst, element_op.Apply(_mm_loadu_si64(p_src)));
#endif
p_dst += 2;
p_src += 2;
}
......@@ -82,7 +88,11 @@ inline void memset32_avx2(void* dst, const int32_t value, const ck::index_t n)
}
if(i_n & 2)
{
#if defined(__GNUC__) && !defined(__clang__) && !defined(__llvm__)
_mm_storeu_si64(p_dst, *reinterpret_cast<__m128i*>(&xmm));
#else
_mm_storeu_si64(p_dst, xmm);
#endif
p_dst += 2;
}
if(i_n & 1)
......
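The two gcc-only branches above exist because gcc, unlike clang, does not implicitly convert between the integer vector type __m128i used by _mm_loadu_si64/_mm_storeu_si64 and the float vector type __m128 expected by the element op. A hedged alternative sketch using the no-op cast intrinsics (not the fix this commit uses, just an equivalent formulation of the same bit reinterpretation):

#include <immintrin.h>

// Copy two floats through an XMM register. The casts only reinterpret bits and
// generate no instructions, so this matches the reinterpret_cast workaround above.
static inline void copy2_floats(float* dst, const float* src)
{
    __m128i raw = _mm_loadu_si64(src);          // unaligned 64-bit load (integer view)
    __m128 v    = _mm_castsi128_ps(raw);        // same bits viewed as 2 packed floats
    // ... an element-wise operation on v would go here ...
    _mm_storeu_si64(dst, _mm_castps_si128(v));  // back to the integer view, store 64 bits
}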
#pragma once
#include "data_type.hpp"
#ifndef CK_NOGPU
namespace ck {
template <typename T>
......@@ -1047,3 +1048,5 @@ amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thr
}
} // namespace ck
#endif
......@@ -4,6 +4,8 @@
#include "data_type.hpp"
#include "c_style_pointer_cast.hpp"
#ifndef CK_NOGPU
// TODO: deprecate all amd_assembly_outer_product_xxx
namespace ck {
......@@ -354,3 +356,4 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
} // namespace ck
#endif
#endif
#ifndef CK_AMD_LLVM_INTRINSIC_HPP
#define CK_AMD_LLVM_INTRINSIC_HPP
#ifndef CK_NOGPU
#include "data_type.hpp"
namespace ck {
......@@ -9,3 +10,4 @@ __device__ int32_t llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.r
} // namespace ck
#endif
#endif
#ifndef CK_AMD_XDLOPS_HPP
#define CK_AMD_XDLOPS_HPP
#ifndef CK_NOGPU
#include "data_type.hpp"
namespace ck {
......@@ -296,3 +297,4 @@ struct intrin_mfma_i32_16x16x16i8<16, 16>
} // namespace ck
#endif
#endif
#pragma once
#include "statically_indexed_array.hpp"
#ifdef CK_NOGPU
#include "half.hpp"
#endif
namespace ck {
using bhalf_t = ushort;
using half_t = _Float16;
#ifdef CK_NOGPU
using half_t = half_float::half;
#else
using half_t = _Float16;
#endif
// vector_type
template <typename T, index_t N>
......@@ -14,8 +21,10 @@ struct vector_type;
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
#ifdef __clang__
template <typename T, index_t V, index_t N>
struct vector_type<T __attribute__((ext_vector_type(V))), N>;
#endif
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
......@@ -32,11 +41,13 @@ struct vector_type_maker
using type = vector_type<T, N>;
};
#ifdef __clang__
template <typename T, index_t N0, index_t N1>
struct vector_type_maker<T __attribute__((ext_vector_type(N1))), N0>
{
using type = vector_type<T, N0 * N1>;
};
#endif
template <typename T, index_t N0, index_t N1>
struct vector_type_maker<vector_type<T, N1>, N0>
......@@ -69,12 +80,14 @@ template <typename X, typename Y>
using has_same_scalar_type = is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<Y>>::type>;
#ifdef __clang__
template <typename T, index_t N>
struct scalar_type<T __attribute__((ext_vector_type(N)))>
{
using type = T;
static constexpr index_t vector_size = N;
};
#endif
template <typename T, index_t N>
struct scalar_type<vector_type<T, N>>
......
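The half_t switch above replaces _Float16 (not available in host code under the gcc versions targeted here) with half_float::half from the vendored half library for CK_NOGPU builds. A minimal sketch of why the substitution is workable (illustrative; it assumes the usual half.hpp API of 16-bit storage, float conversion, and float-backed arithmetic):

#include "half.hpp"   // external/include/half, already vendored by the project
#include <cstdio>

int main()
{
    half_float::half h{1.5f};               // 16-bit IEEE binary16 storage
    float sum = static_cast<float>(h + h);  // arithmetic round-trips through float
    std::printf("sizeof(half)=%zu sum=%f\n", sizeof(h), sum); // 2, 3.000000
    return 0;
}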
#pragma once
#include <immintrin.h>
namespace ck {
namespace cpu {
// vector_type
template <typename T, index_t N>
struct vector_type;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<T __attribute__((ext_vector_type(V))), N>;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<vector_type<T, V>, N>;
// vector_type_maker
// This is the right way to handle "vector of vectors": making a bigger vector instead
template <typename T, index_t N>
struct vector_type_maker
{
using type = vector_type<T, N>;
};
template <typename T, index_t N>
using vector_type_maker_t = typename vector_type_maker<T, N>::type;
template <typename T, index_t N>
constexpr auto make_vector_type(Number<N>)
{
return typename vector_type_maker<T, N>::type{};
}
template <>
struct vector_type<float, 1>
{
using d1_t = float;
// SSE
using type = float;
type data_;
vector_type() : data_{0} {}
// vector_type(float x) : data_{x} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{*mem} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = *mem; }
constexpr void Store(float* mem) const { *mem = data_; }
};
template <>
struct vector_type<float, 4>
{
using d1_t = float;
// SSE
using type = __m128;
type data_;
vector_type() : data_{_mm_setzero_ps()} {}
vector_type(float x) : data_{_mm_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = _mm_loadu_ps(mem); }
constexpr void Store(float* mem) const { _mm_storeu_ps(mem, data_); }
};
template <>
struct vector_type<float, 8>
{
using d1_t = float;
// SSE
using type = __m256;
type data_;
vector_type() : data_{_mm256_setzero_ps()} {}
vector_type(float x) : data_{_mm256_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm256_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = _mm256_loadu_ps(mem); }
constexpr void Store(float* mem) const { _mm256_storeu_ps(mem, data_); }
};
template <typename T>
struct to_vector_type
{
using type = T;
};
template <>
struct to_vector_type<__m128>
{
using type = vector_type<float, 4>;
};
template <>
struct to_vector_type<__m256>
{
using type = vector_type<float, 8>;
};
template <typename Tv, typename Tp>
inline void load_vector(Tv& v, const Tp* mem)
{
v = *reinterpret_cast<const Tv*>(mem);
}
template <>
inline void load_vector(__m128& v, const float* mem)
{
v = _mm_loadu_ps(mem);
}
template <>
inline void load_vector(__m256& v, const float* mem)
{
v = _mm256_loadu_ps(mem);
}
template <typename Tv, typename Tp>
inline void store_vector(const Tv& v, Tp* mem)
{
*reinterpret_cast<Tv*>(mem) = v;
}
template <>
inline void store_vector(const __m128& v, float* mem)
{
_mm_storeu_ps(mem, v);
}
template <>
inline void store_vector(const __m256& v, float* mem)
{
_mm256_storeu_ps(mem, v);
}
template <typename Tv, typename Tx>
inline void set_vector(Tv& v, const Tx x)
{
v = static_cast<const Tv>(x);
}
template <>
inline void set_vector(__m128& v, const float x)
{
v = _mm_set1_ps(x);
}
template <>
inline void set_vector(__m256& v, const float x)
{
v = _mm256_set1_ps(x);
}
template <typename Tv>
inline void clear_vector(Tv& v)
{
v = static_cast<Tv>(0);
}
template <>
inline void clear_vector(__m128& v)
{
v = _mm_setzero_ps();
}
template <>
inline void clear_vector(__m256& v)
{
v = _mm256_setzero_ps();
}
using float4_t = typename vector_type<float, 4>::type;
using float8_t = typename vector_type<float, 8>::type;
// scalar_type
template <typename TV>
struct scalar_type;
// is_scalar_type
template <typename TV>
struct is_scalar_type
{
static constexpr bool value = (scalar_type<remove_cvref_t<TV>>::vector_size == 1);
};
// has_same_scalar_type
template <typename X, typename Y>
using has_same_scalar_type = is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<Y>>::type>;
template <typename T, index_t N>
struct scalar_type<vector_type<T, N>>
{
using type = T;
static constexpr index_t vector_size = N;
};
template <>
struct scalar_type<float4_t>
{
using type = float;
static constexpr index_t vector_size = 4;
};
template <>
struct scalar_type<float8_t>
{
using type = float;
static constexpr index_t vector_size = 8;
};
//
template <>
struct scalar_type<float>
{
using type = float;
static constexpr index_t vector_size = 1;
};
} // namespace cpu
} // namespace ck
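A hypothetical usage sketch of the CPU-side vector_type wrappers defined above (the ck::cpu names come from the listing; add_one_8 and the commented include are illustrative, since the header's path is not shown in the diff):

#include <immintrin.h>
// #include "<the cpu vector-type header listed above>"

inline void add_one_8(const float* in, float* out)
{
    ck::cpu::vector_type<float, 8> v(in);            // constructor issues _mm256_loadu_ps
    auto& ymm = v.AsType<ck::cpu::float8_t>();       // access the underlying __m256
    ymm = _mm256_add_ps(ymm, _mm256_set1_ps(1.0f));  // out[i] = in[i] + 1
    v.Store(out);                                    // _mm256_storeu_ps
}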
#pragma once
#include <immintrin.h>
#include "half.hpp"
namespace ck {
namespace cpu {
// vector_type
template <typename T, index_t N>
struct vector_type;
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
#ifdef __clang__
template <typename T, index_t V, index_t N>
struct vector_type<T __attribute__((ext_vector_type(V))), N>;
#endif
// Caution: DO NOT REMOVE
// intentionally have only declaration but no definition to cause compilation failure when trying to
// instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
// vectors"
template <typename T, index_t V, index_t N>
struct vector_type<vector_type<T, V>, N>;
// vector_type_maker
// This is the right way to handle "vector of vectors": making a bigger vector instead
template <typename T, index_t N>
struct vector_type_maker
{
using type = vector_type<T, N>;
};
template <typename T, index_t N>
using vector_type_maker_t = typename vector_type_maker<T, N>::type;
template <typename T, index_t N>
constexpr auto make_vector_type(Number<N>)
{
return typename vector_type_maker<T, N>::type{};
}
template <>
struct vector_type<float, 1>
{
using d1_t = float;
// SSE
using type = float;
type data_;
vector_type() : data_{0} {}
// vector_type(float x) : data_{x} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{*mem} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
constexpr void Load(const float* mem) { data_ = *mem; }
constexpr void Store(float* mem) const { *mem = data_; }
};
template <>
struct vector_type<float, 4>
{
using d1_t = float;
// SSE
using type = __m128;
type data_;
vector_type() : data_{_mm_setzero_ps()} {}
vector_type(float x) : data_{_mm_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
void Load(const float* mem) { data_ = _mm_loadu_ps(mem); }
void Store(float* mem) const { _mm_storeu_ps(mem, data_); }
};
template <>
struct vector_type<float, 8>
{
using d1_t = float;
// SSE
using type = __m256;
type data_;
vector_type() : data_{_mm256_setzero_ps()} {}
vector_type(float x) : data_{_mm256_set1_ps(x)} {}
vector_type(type v) : data_{v} {}
vector_type(const float* mem) : data_{_mm256_loadu_ps(mem)} {}
template <typename X>
constexpr const auto& AsType() const
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
template <typename X>
constexpr auto& AsType()
{
static_assert(std::is_same<X, type>::value, "wrong!");
return data_;
}
void Load(const float* mem) { data_ = _mm256_loadu_ps(mem); }
void Store(float* mem) const { _mm256_storeu_ps(mem, data_); }
};
template <typename T>
struct to_vector_type
{
using type = T;
};
template <>
struct to_vector_type<__m128>
{
using type = vector_type<float, 4>;
};
template <>
struct to_vector_type<__m256>
{
using type = vector_type<float, 8>;
};
template <typename Tv, typename Tp>
inline void load_vector(Tv& v, const Tp* mem)
{
v = *reinterpret_cast<const Tv*>(mem);
}
template <>
inline void load_vector(__m128& v, const float* mem)
{
v = _mm_loadu_ps(mem);
}
template <>
inline void load_vector(__m256& v, const float* mem)
{
v = _mm256_loadu_ps(mem);
}
template <typename Tv, typename Tp>
inline void store_vector(const Tv& v, Tp* mem)
{
*reinterpret_cast<Tv*>(mem) = v;
}
template <>
inline void store_vector(const __m128& v, float* mem)
{
_mm_storeu_ps(mem, v);
}
template <>
inline void store_vector(const __m256& v, float* mem)
{
_mm256_storeu_ps(mem, v);
}
template <typename Tv, typename Tx>
inline void set_vector(Tv& v, const Tx x)
{
v = static_cast<const Tv>(x);
}
template <>
inline void set_vector(__m128& v, const float x)
{
v = _mm_set1_ps(x);
}
template <>
inline void set_vector(__m256& v, const float x)
{
v = _mm256_set1_ps(x);
}
template <typename Tv>
inline void clear_vector(Tv& v)
{
v = static_cast<Tv>(0);
}
template <>
inline void clear_vector(__m128& v)
{
v = _mm_setzero_ps();
}
template <>
inline void clear_vector(__m256& v)
{
v = _mm256_setzero_ps();
}
using float4_t = typename vector_type<float, 4>::type;
using float8_t = typename vector_type<float, 8>::type;
// scalar_type
template <typename TV>
struct scalar_type;
// is_scalar_type
template <typename TV>
struct is_scalar_type
{
static constexpr bool value = (scalar_type<remove_cvref_t<TV>>::vector_size == 1);
};
// has_same_scalar_type
template <typename X, typename Y>
using has_same_scalar_type = is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<Y>>::type>;
template <typename T, index_t N>
struct scalar_type<vector_type<T, N>>
{
using type = T;
static constexpr index_t vector_size = N;
};
template <>
struct scalar_type<float4_t>
{
using type = float;
static constexpr index_t vector_size = 4;
};
template <>
struct scalar_type<float8_t>
{
using type = float;
static constexpr index_t vector_size = 8;
};
//
template <>
struct scalar_type<float>
{
using type = float;
static constexpr index_t vector_size = 1;
};
} // namespace cpu
} // namespace ck
#ifndef UTILITY_DEBUG_HPP
#define UTILITY_DEBUG_HPP
#ifndef CK_NOGPU
namespace ck {
namespace debug {
......@@ -74,4 +74,5 @@ __device__ void print_shared(T const* p_shared, index_t num_elements)
} // namespace debug
} // namespace ck
#endif
#endif // UTILITY_DEBUG_HPP
......@@ -5,6 +5,7 @@
#include "amd_buffer_addressing.hpp"
#include "generic_memory_space_atomic_add.hpp"
#ifndef CK_NOGPU
namespace ck {
// T may be scalar or vector
......@@ -351,3 +352,4 @@ make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element
}
} // namespace ck
#endif
#pragma once
#ifndef CK_NOGPU
#include "data_type.hpp"
namespace ck {
......@@ -42,3 +43,4 @@ __device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x)
}
} // namespace ck
#endif
#pragma once
#include "config.hpp"
#ifndef CK_NOGPU
namespace ck {
__host__ __device__ constexpr index_t get_warp_size()
......@@ -18,3 +19,4 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
__device__ index_t get_grid_size() { return gridDim.x; }
} // namespace ck
#endif
\ No newline at end of file
......@@ -2,7 +2,7 @@
#define CK_INNER_PRODUCT_HPP
#include "data_type.hpp"
#ifndef CK_NOGPU
namespace ck {
template <typename TA, typename TB, typename TC>
......@@ -203,3 +203,4 @@ inner_product<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a, const int8x16_t
} // namespace ck
#endif
#endif
\ No newline at end of file
......@@ -118,7 +118,7 @@ struct MagicDivision
{
return CalculateMagicShift(integral_constant<uint32_t, Divisor>{});
}
#ifndef CK_NOGPU
// magic division for uint32_t
__device__ static constexpr uint32_t
DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
......@@ -126,7 +126,7 @@ struct MagicDivision
uint32_t tmp = __umulhi(dividend, multiplier);
return (tmp + dividend) >> shift;
}
#endif
__host__ static constexpr uint32_t
DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
{
......@@ -138,6 +138,7 @@ struct MagicDivision
// HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be
// non-negative for result to be correct
// TODO: figure out how to do magic number divison for int32_t as dividended
#ifndef CK_NOGPU
__device__ static constexpr int32_t
DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
{
......@@ -145,6 +146,7 @@ struct MagicDivision
uint32_t tmp = __umulhi(dividend_u32, multiplier);
return (tmp + dividend_u32) >> shift;
}
#endif
__host__ static constexpr int32_t
DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
......
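For reference, both DoMagicDivision overloads guarded above rely on the same identity: for a fixed divisor d with a precomputed (multiplier m, shift s), n / d == (umulhi(n, m) + n) >> s. A standalone host-side sketch that derives one valid (m, s) pair and checks the identity (the derivation is a textbook construction and an assumption; the commit does not show CK's own magic-number calculation, and like the int32 hack above it assumes dividends below 2^31):

#include <cassert>
#include <cstdint>
#include <initializer_list>

// Assumed construction: smallest s with 2^s >= d, then m = floor(2^32 * (2^s - d) / d) + 1.
static void make_magic(uint32_t d, uint32_t& m, uint32_t& s)
{
    s = 0;
    while((uint32_t{1} << s) < d)
        ++s;
    m = static_cast<uint32_t>(((uint64_t{1} << 32) * ((uint64_t{1} << s) - d)) / d + 1);
}

// Host-side equivalent of: tmp = __umulhi(dividend, multiplier); return (tmp + dividend) >> shift;
static uint32_t do_magic_division(uint32_t n, uint32_t m, uint32_t s)
{
    uint32_t tmp = static_cast<uint32_t>((uint64_t{n} * m) >> 32);
    return (tmp + n) >> s; // (tmp + n) stays below 2^32 as long as n < 2^31
}

int main()
{
    for(uint32_t d = 1; d < 1000; ++d)
    {
        uint32_t m, s;
        make_magic(d, m, s);
        for(uint32_t n : {0u, 1u, 7u, 12345u, 2147483647u})
            assert(do_magic_division(n, m, s) == n / d);
    }
    return 0;
}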