support gcc with cpu only compile

9a7fa123 · carlushuang · ad09ebdb · 9a7fa123 · 9a7fa123 · 9a7fa123
Commit 9a7fa123 authored May 17, 2022 by carlushuang
20 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -7,6 +7,10 @@ list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
 enable_testing()
+option(CK_NOGPU "build without gpu backend" OFF)
+if(NOT CK_NOGPU)
 find_package(ROCM REQUIRED PATHS /opt/rocm)
 include(ROCMInstallTargets)
@@ -19,6 +23,7 @@ include(CheckCXXCompilerFlag)
 rocm_setup_version(VERSION 1.0.0)
 include(TargetFlags)
 list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip)
+endif()
 ## C++
 enable_language(CXX)
@@ -31,25 +36,26 @@ option(CK_TIME_KERNEL "Turning off will disable kernel timing globally" ON)
 ## OpenMP
 if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-	# workaround issue hipcc in rocm3.5 cannot find openmp
+    set(OMP_CXX_FLAG -fopenmp=libomp -Wno-unused-command-line-argument)
-	set(OpenMP_CXX "${CMAKE_CXX_COMPILER}")
+    set(OMP_LIBRARY /opt/rocm/llvm/lib/libomp.so)
-	set(OpenMP_CXX_FLAGS "-fopenmp=libomp -Wno-unused-command-line-argument")
+    set(OMP_LINK_FLAG -Wl,-rpath,/opt/rocm/llvm/lib)
-	set(OpenMP_CXX_LIB_NAMES "libomp" "libgomp" "libiomp5")
+elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU")
-	set(OpenMP_libomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+    set(OMP_CXX_FLAG -fopenmp)
-	set(OpenMP_libgomp_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+    set(OMP_LIBRARY "")
-	set(OpenMP_libiomp5_LIBRARY ${OpenMP_CXX_LIB_NAMES})
+    set(OMP_LINK_FLAG -fopenmp)
 else()
 	find_package(OpenMP REQUIRED)
 endif()
-message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
+# message("OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}")
-message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
+# message("OpenMP_gomp_LIBRARY: ${OpenMP_gomp_LIBRARY}")
-message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
+# message("OpenMP_pthread_LIBRARY: ${OpenMP_pthread_LIBRARY}")
-message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
+# message("OpenMP_CXX_FLAGS: ${OpenMP_CXX_FLAGS}")
-link_libraries(${OpenMP_gomp_LIBRARY})
+# link_libraries(${OpenMP_gomp_LIBRARY})
-link_libraries(${OpenMP_pthread_LIBRARY})
+# link_libraries(${OpenMP_pthread_LIBRARY})
+if(NOT CK_NOGPU)
 ## HIP
 find_package(HIP REQUIRED)
 # Override HIP version in config.h, if necessary.
@@ -79,6 +85,7 @@ rocm_create_package(
    MAINTAINER "MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
    LDCONFIG
 )
+endif()
 ## half
 set(HALF_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/external/include/half")
@@ -94,7 +101,8 @@ elseif(CK_BACKEND STREQUAL "HIP" OR CK_BACKEND STREQUAL "HIPNOGPU")
    set(CK_TIDY_ERRORS ALL)
 endif()
+if(NOT CK_NOGPU)
+# clang-tidy and cppcheck currently also seem to need the ROCm environment, so they are only enabled for GPU builds
 include(ClangTidy)
 enable_clang_tidy(
    CHECKS
@@ -224,6 +232,11 @@ enable_cppcheck(
        CPPCHECK=1
        __linux__=1
 )
+else()
+function(clang_tidy_check TARGET)
+# Intentionally empty stub, defined only when CK_NOGPU is ON and ClangTidy is not included; presumably keeps existing clang_tidy_check() calls valid (TODO confirm).
+endfunction()
+endif()
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)

--- a/include/ck/config.hpp
+++ b/include/ck/config.hpp
 #ifndef CK_CONFIG_AMD_HPP
 #define CK_CONFIG_AMD_HPP
-#ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
+#include "ck/options.hpp"
+#ifdef CK_NOGPU
+#define __host__
+#define __device__
+#else
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
 #endif
@@ -26,6 +31,12 @@
 #endif
 #endif
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__llvm__)
+#if __GNUC__ < 9
+#error "If use gcc, need make sure use at least gcc-9"
+#endif
+#endif
 // buffer resource
 #ifndef __HIP_DEVICE_COMPILE__ // for host code
 #define CK_BUFFER_RESOURCE_3RD_DWORD -1

--- a/include/ck/options.hpp.in
+++ b/include/ck/options.hpp.in
 #pragma once
 #cmakedefine01 CK_TIME_KERNEL
+#cmakedefine CK_NOGPU
--- a/include/ck/stream_config.hpp
+++ b/include/ck/stream_config.hpp
 #pragma once
+// CK_NOGPU is generated into ck/options.hpp; include it here so the guards below
+// see the option even when this header is included before ck/config.hpp.
+#include "ck/options.hpp"
+#ifndef CK_NOGPU
 #include <hip/hip_runtime.h>
 #include <hip/hip_fp16.h>
+#endif
+// Holds the HIP stream (present only when CK_NOGPU is not defined) and the
+// kernel-timing flag.
 struct StreamConfig
 {
+#ifndef CK_NOGPU
     hipStream_t stream_id_ = nullptr;
+#endif
     bool time_kernel_      = false;
 };
--- a/include/ck/tensor/static_tensor.hpp
+++ b/include/ck/tensor/static_tensor.hpp
@@ -79,6 +79,7 @@ struct StaticTensor
    T ignored_element_scalar_;
 };
+#ifndef CK_NOGPU
 // StaticTensor for vector
 template <AddressSpaceEnum AddressSpace,
          typename S,
@@ -244,6 +245,7 @@ struct StaticTensorTupleOfVectorBuffer
    const S invalid_element_scalar_value_ = S{0};
    S ignored_element_scalar_;
 };
+#endif
 template <AddressSpaceEnum AddressSpace,
          typename T,

--- a/include/ck/tensor_description/tensor_descriptor.hpp
+++ b/include/ck/tensor_description/tensor_descriptor.hpp
@@ -277,7 +277,12 @@ struct TensorCoordinateStep
    MultiIndex<NTransform> do_transforms_;
    // HACK: control UpdateLowerIndex()
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__llvm__)
+    // GCC rejects the uninitialized form with: "constexpr static data member ‘update_lower_index_hack_’ must have an initializer"
+    static constexpr UpdateLowerIndexHack update_lower_index_hack_{};
+#else
    static constexpr UpdateLowerIndexHack update_lower_index_hack_;
+#endif
 };
 // TODO: How to fix this? It uses an struct instead of lambda because lambda

--- a/include/ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp
+++ b/include/ck/tensor_operation/cpu/thread/threadwise_gemm_avx2.hpp
 #ifndef CK_THREADWISE_GEMM_AVX2_HPP
 #define CK_THREADWISE_GEMM_AVX2_HPP
+#include <assert.h>
 #if CK_USE_X86_INLINE_ASM == 0
 #include <immintrin.h>
 #endif
@@ -122,22 +123,22 @@ struct ThreadwiseGemmAvx2_MxN_6x16
            ".macro vbroadcast_a%= i_k, i_m, ymm\n" // A in rax(r8, r9), lda in rcx
            ".if m_ABytes == 4\n"
                ".if m_TransA == 0\n"
-                    "vbroadcastss_%= %%rax, 0, 0, (\\i_m + \\i_k * m_Mr) * m_ABytes, \\ymm\n"
+                    "vbroadcastss_%= %%rax, 0, 0, ((\\i_m + \\i_k * m_Mr) * m_ABytes), \\ymm\n"
                ".else\n"
                    ".if (\\i_m == 0) || (\\i_m == 1) || (\\i_m == 2)\n"
-                        "vbroadcastss_%= %%rax, %%rcx, \\i_m, \\i_k * m_ABytes, \\ymm\n"
+                        "vbroadcastss_%= %%rax, %%rcx, \\i_m, (\\i_k * m_ABytes), \\ymm\n"
                    ".else\n"
-                        "vbroadcastss_%= %%r8, %%rcx, \\i_m-3, \\i_k * m_ABytes, \\ymm\n"
+                        "vbroadcastss_%= %%r8, %%rcx, \\i_m-3, (\\i_k * m_ABytes), \\ymm\n"
                    ".endif\n"
                ".endif\n"
            ".else\n"
                ".if m_TransA == 0\n"
-                    "vpbroadcastw_%= %%rax, 0, 0, (\\i_m + \\i_k * m_Mr) * m_ABytes, %%xmm15\n"
+                    "vpbroadcastw_%= %%rax, 0, 0, ((\\i_m + \\i_k * m_Mr) * m_ABytes), %%xmm15\n"
                ".else\n"
                    ".if (\\i_m == 0) || (\\i_m == 1) || (\\i_m == 2)\n"
-                        "vpbroadcastw_%= %%rax, %%rcx, \\i_m, \\i_k * m_ABytes, %%xmm15\n"
+                        "vpbroadcastw_%= %%rax, %%rcx, \\i_m, (\\i_k * m_ABytes), %%xmm15\n"
                    ".else\n"
-                        "vpbroadcastw_%= %%r8, %%rcx, \\i_m-3, \\i_k * m_ABytes, %%xmm15\n"
+                        "vpbroadcastw_%= %%r8, %%rcx, \\i_m-3, (\\i_k * m_ABytes), %%xmm15\n"
                    ".endif\n"
                ".endif\n"
                "vcvtph2ps  %%xmm15, \\ymm\n"
@@ -147,15 +148,15 @@ struct ThreadwiseGemmAvx2_MxN_6x16
            ".macro vload_b%= i_k, i_n, ymm\n" // B in rbx, lda in rdx, i_n should be 0, 1
            ".if m_BBytes == 4\n"
                ".if m_TransB == 0\n"
-                    "vmovups_%= %%rbx, %%rdx, \\i_n, \\i_k*m_BBytes*8, \\ymm\n"
+                    "vmovups_%= %%rbx, %%rdx, \\i_n, (\\i_k*m_BBytes*8), \\ymm\n"
                ".else\n"
-                    "vmovups_%= %%rbx, 0, 0, (\\i_k*m_Nr + \\i_n*8)*m_BBytes, \\ymm\n"
+                    "vmovups_%= %%rbx, 0, 0, ((\\i_k*m_Nr + \\i_n*8)*m_BBytes), \\ymm\n"
                ".endif\n"
            ".else\n"
                ".if m_TransB == 0\n"
-                    "vcvtph2ps_%= %%rbx, %%rdx, \\i_n, \\i_k*m_BBytes*8, \\ymm\n"
+                    "vcvtph2ps_%= %%rbx, %%rdx, \\i_n, (\\i_k*m_BBytes*8), \\ymm\n"
                ".else\n"
-                    "vcvtph2ps_%= %%rbx, 0, 0, (\\i_k*m_Nr + \\i_n*8)*m_BBytes, \\ymm\n"
+                    "vcvtph2ps_%= %%rbx, 0, 0, ((\\i_k*m_Nr + \\i_n*8)*m_BBytes), \\ymm\n"
                ".endif\n"
            ".endif\n"
            ".endm\n"
@@ -682,22 +683,22 @@ struct ThreadwiseGemmAvx2_MxN_4x24
            ".macro vbroadcast_a%= i_k, i_m, ymm\n" // A in rax(r8), lda in rcx
            ".if m_ABytes == 4\n"
                ".if m_TransA == 0\n"
-                    "vbroadcastss_%= %%rax, 0, 0, (\\i_m + \\i_k * m_Mr) * m_ABytes, \\ymm\n"
+                    "vbroadcastss_%= %%rax, 0, 0, ((\\i_m + \\i_k * m_Mr) * m_ABytes), \\ymm\n"
                ".else\n"
                    ".if (\\i_m == 0) || (\\i_m == 1)\n"
-                        "vbroadcastss_%= %%rax, %%rcx, \\i_m, \\i_k * m_ABytes, \\ymm\n"
+                        "vbroadcastss_%= %%rax, %%rcx, \\i_m, (\\i_k * m_ABytes), \\ymm\n"
                    ".else\n"
-                        "vbroadcastss_%= %%r8, %%rcx, \\i_m-2, \\i_k * m_ABytes, \\ymm\n"
+                        "vbroadcastss_%= %%r8, %%rcx, \\i_m-2, (\\i_k * m_ABytes), \\ymm\n"
                    ".endif\n"
                ".endif\n"
            ".else\n"
                ".if m_TransA == 0\n"
-                    "vpbroadcastw_%= %%rax, 0, 0, (\\i_m + \\i_k * m_Mr) * m_ABytes, %%xmm15\n"
+                    "vpbroadcastw_%= %%rax, 0, 0, ((\\i_m + \\i_k * m_Mr) * m_ABytes), %%xmm15\n"
                ".else\n"
                    ".if (\\i_m == 0) || (\\i_m == 1)\n"
-                        "vpbroadcastw_%= %%rax, %%rcx, \\i_m, \\i_k * m_ABytes, %%xmm15\n"
+                        "vpbroadcastw_%= %%rax, %%rcx, \\i_m, (\\i_k * m_ABytes), %%xmm15\n"
                    ".else\n"
-                        "vpbroadcastw_%= %%r8, %%rcx, \\i_m-2, \\i_k * m_ABytes, %%xmm15\n"
+                        "vpbroadcastw_%= %%r8, %%rcx, \\i_m-2, (\\i_k * m_ABytes), %%xmm15\n"
                    ".endif\n"
                ".endif\n"
                "vcvtph2ps  %%xmm15, \\ymm\n"
@@ -707,15 +708,15 @@ struct ThreadwiseGemmAvx2_MxN_4x24
            ".macro vload_b%= i_k, i_n, ymm\n" // B in rbx, lda in rdx, i_n should be 0, 1, 2
            ".if m_BBytes == 4\n"
                ".if m_TransB == 0\n"
-                    "vmovups_%= %%rbx, %%rdx, \\i_n, \\i_k*m_BBytes*8, \\ymm\n"
+                    "vmovups_%= %%rbx, %%rdx, \\i_n, (\\i_k*m_BBytes*8), \\ymm\n"
                ".else\n"
-                    "vmovups_%= %%rbx, 0, 0, (\\i_k*m_Nr + \\i_n*8)*m_BBytes, \\ymm\n"
+                    "vmovups_%= %%rbx, 0, 0, ((\\i_k*m_Nr + \\i_n*8)*m_BBytes), \\ymm\n"
                ".endif\n"
            ".else\n"
                ".if m_TransB == 0\n"
-                    "vcvtph2ps_%= %%rbx, %%rdx, \\i_n, \\i_k*m_BBytes*8, \\ymm\n"
+                    "vcvtph2ps_%= %%rbx, %%rdx, \\i_n, (\\i_k*m_BBytes*8), \\ymm\n"
                ".else\n"
-                    "vcvtph2ps_%= %%rbx, 0, 0, (\\i_k*m_Nr + \\i_n*8)*m_BBytes, \\ymm\n"
+                    "vcvtph2ps_%= %%rbx, 0, 0, ((\\i_k*m_Nr + \\i_n*8)*m_BBytes), \\ymm\n"
                ".endif\n"
            ".endif\n"
            ".endm\n"

--- a/include/ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp
+++ b/include/ck/tensor_operation/cpu/thread/threadwise_tensor_slice_transfer_avx2_specialization.hpp
@@ -46,7 +46,13 @@ void memcpy32_avx2(void* dst, const void* src, const ck::index_t n, const Elemen
    }
    if(i_n & 2)
    {
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__llvm__)
+        __m128i s = _mm_loadu_si64(p_src);
+        __m128 v  = element_op.Apply(*reinterpret_cast<__m128*>(&s));
+        _mm_storeu_si64(p_dst, *reinterpret_cast<__m128i*>(&v));
+#else
        _mm_storeu_si64(p_dst, element_op.Apply(_mm_loadu_si64(p_src)));
+#endif
        p_dst += 2;
        p_src += 2;
    }
@@ -82,7 +88,11 @@ inline void memset32_avx2(void* dst, const int32_t value, const ck::index_t n)
    }
    if(i_n & 2)
    {
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__llvm__)
+        _mm_storeu_si64(p_dst, *reinterpret_cast<__m128i*>(&xmm));
+#else
        _mm_storeu_si64(p_dst, xmm);
+#endif
        p_dst += 2;
    }
    if(i_n & 1)

--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
 #pragma once
 #include "data_type.hpp"
+#ifndef CK_NOGPU
 namespace ck {
 template <typename T>
@@ -1047,3 +1048,5 @@ amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thr
 }
 } // namespace ck
+#endif
--- a/include/ck/utility/amd_inline_asm.hpp
+++ b/include/ck/utility/amd_inline_asm.hpp
@@ -4,6 +4,8 @@
 #include "data_type.hpp"
 #include "c_style_pointer_cast.hpp"
+#ifndef CK_NOGPU
 // TODO: deprecate all amd_assembly_outer_product_xxx
 namespace ck {
@@ -354,3 +356,4 @@ __device__ void amd_assembly_outer_product_1x4(int8x16_t a,
 } // namespace ck
 #endif
+#endif
--- a/include/ck/utility/amd_llvm_intrinsic.hpp
+++ b/include/ck/utility/amd_llvm_intrinsic.hpp
 #ifndef CK_AMD_LLVM_INTRINSIC_HPP
 #define CK_AMD_LLVM_INTRINSIC_HPP
+#ifndef CK_NOGPU
 #include "data_type.hpp"
 namespace ck {
@@ -9,3 +10,4 @@ __device__ int32_t llvm_amdgcn_readfirstlane_i32(int32_t i) __asm("llvm.amdgcn.r
 } // namespace ck
 #endif
+#endif
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
 #ifndef CK_AMD_XDLOPS_HPP
 #define CK_AMD_XDLOPS_HPP
+#ifndef CK_NOGPU
 #include "data_type.hpp"
 namespace ck {
@@ -296,3 +297,4 @@ struct intrin_mfma_i32_16x16x16i8<16, 16>
 } // namespace ck
 #endif
+#endif
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
 #pragma once
 #include "statically_indexed_array.hpp"
+#ifdef CK_NOGPU
+#include "half.hpp"
+#endif
 namespace ck {
 using bhalf_t = ushort;
+#ifdef CK_NOGPU
+using half_t = half_float::half;
+#else
 using half_t = _Float16;
+#endif
 // vector_type
 template <typename T, index_t N>
@@ -14,8 +21,10 @@ struct vector_type;
 // intentionally have only declaration but no definition to cause compilation failure when trying to
 // instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
 // vectors"
+#ifdef __clang__
 template <typename T, index_t V, index_t N>
 struct vector_type<T __attribute__((ext_vector_type(V))), N>;
+#endif
 // Caution: DO NOT REMOVE
 // intentionally have only declaration but no definition to cause compilation failure when trying to
@@ -32,11 +41,13 @@ struct vector_type_maker
    using type = vector_type<T, N>;
 };
+#ifdef __clang__
 template <typename T, index_t N0, index_t N1>
 struct vector_type_maker<T __attribute__((ext_vector_type(N1))), N0>
 {
    using type = vector_type<T, N0 * N1>;
 };
+#endif
 template <typename T, index_t N0, index_t N1>
 struct vector_type_maker<vector_type<T, N1>, N0>
@@ -69,12 +80,14 @@ template <typename X, typename Y>
 using has_same_scalar_type = is_same<typename scalar_type<remove_cvref_t<X>>::type,
                                     typename scalar_type<remove_cvref_t<Y>>::type>;
+#ifdef __clang__
 template <typename T, index_t N>
 struct scalar_type<T __attribute__((ext_vector_type(N)))>
 {
    using type                           = T;
    static constexpr index_t vector_size = N;
 };
+#endif
 template <typename T, index_t N>
 struct scalar_type<vector_type<T, N>>

--- a/include/ck/utility/data_type_cpu.hpp
+++ b/include/ck/utility/data_type_cpu.hpp
 #pragma once
 #include <immintrin.h>
+#include "half.hpp"
 namespace ck {
 namespace cpu {
 // vector_type
@@ -13,8 +15,10 @@ struct vector_type;
 // intentionally have only declaration but no definition to cause compilation failure when trying to
 // instantiate this template. The purpose is to catch user's mistake when trying to make "vector of
 // vectors"
+#ifdef __clang__
 template <typename T, index_t V, index_t N>
 struct vector_type<T __attribute__((ext_vector_type(V))), N>;
+#endif
 // Caution: DO NOT REMOVE
 // intentionally have only declaration but no definition to cause compilation failure when trying to
@@ -111,9 +115,9 @@ struct vector_type<float, 4>
        return data_;
    }
-    constexpr void Load(const float* mem) { data_ = _mm_loadu_ps(mem); }
+    void Load(const float* mem) { data_ = _mm_loadu_ps(mem); }
-    constexpr void Store(float* mem) const { _mm_storeu_ps(mem, data_); }
+    void Store(float* mem) const { _mm_storeu_ps(mem, data_); }
 };
 template <>
@@ -149,9 +153,9 @@ struct vector_type<float, 8>
        return data_;
    }
-    constexpr void Load(const float* mem) { data_ = _mm256_loadu_ps(mem); }
+    void Load(const float* mem) { data_ = _mm256_loadu_ps(mem); }
-    constexpr void Store(float* mem) const { _mm256_storeu_ps(mem, data_); }
+    void Store(float* mem) const { _mm256_storeu_ps(mem, data_); }
 };
 template <typename T>

--- a/include/ck/utility/debug.hpp
+++ b/include/ck/utility/debug.hpp
 #ifndef UTILITY_DEBUG_HPP
 #define UTILITY_DEBUG_HPP
+#ifndef CK_NOGPU
 namespace ck {
 namespace debug {
@@ -74,4 +74,5 @@ __device__ void print_shared(T const* p_shared, index_t num_elements)
 } // namespace debug
 } // namespace ck
+#endif
 #endif // UTILITY_DEBUG_HPP
--- a/include/ck/utility/dynamic_buffer.hpp
+++ b/include/ck/utility/dynamic_buffer.hpp
@@ -5,6 +5,7 @@
 #include "amd_buffer_addressing.hpp"
 #include "generic_memory_space_atomic_add.hpp"
+#ifndef CK_NOGPU
 namespace ck {
 // T may be scalar or vector
@@ -351,3 +352,4 @@ make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element
 }
 } // namespace ck
+#endif
--- a/include/ck/utility/generic_memory_space_atomic_add.hpp
+++ b/include/ck/utility/generic_memory_space_atomic_add.hpp
 #pragma once
+#ifndef CK_NOGPU
 #include "data_type.hpp"
 namespace ck {
@@ -42,3 +43,4 @@ __device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x)
 }
 } // namespace ck
+#endif
--- a/include/ck/utility/get_id.hpp
+++ b/include/ck/utility/get_id.hpp
 #pragma once
 #include "config.hpp"
+#ifndef CK_NOGPU
 namespace ck {
 __host__ __device__ constexpr index_t get_warp_size()
@@ -18,3 +19,4 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
 __device__ index_t get_grid_size() { return gridDim.x; }
 } // namespace ck
+#endif
\ No newline at end of file
--- a/include/ck/utility/inner_product.hpp
+++ b/include/ck/utility/inner_product.hpp
@@ -2,7 +2,7 @@
 #define CK_INNER_PRODUCT_HPP
 #include "data_type.hpp"
+#ifndef CK_NOGPU
 namespace ck {
 template <typename TA, typename TB, typename TC>
@@ -203,3 +203,4 @@ inner_product<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a, const int8x16_t
 } // namespace ck
 #endif
+#endif
\ No newline at end of file
--- a/include/ck/utility/magic_division.hpp
+++ b/include/ck/utility/magic_division.hpp
@@ -118,7 +118,7 @@ struct MagicDivision
    {
        return CalculateMagicShift(integral_constant<uint32_t, Divisor>{});
    }
+#ifndef CK_NOGPU
    // magic division for uint32_t
    __device__ static constexpr uint32_t
    DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
@@ -126,7 +126,7 @@ struct MagicDivision
        uint32_t tmp = __umulhi(dividend, multiplier);
        return (tmp + dividend) >> shift;
    }
+#endif
    __host__ static constexpr uint32_t
    DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
    {
@@ -138,6 +138,7 @@ struct MagicDivision
    // HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be
    // non-negative for result to be correct
    // TODO: figure out how to do magic number division with an int32_t dividend
+#ifndef CK_NOGPU
    __device__ static constexpr int32_t
    DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
    {
@@ -145,6 +146,7 @@ struct MagicDivision
        uint32_t tmp          = __umulhi(dividend_u32, multiplier);
        return (tmp + dividend_u32) >> shift;
    }
+#endif
    __host__ static constexpr int32_t
    DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)