Unverified commit 9e366482, authored by Li Zhang, committed by GitHub

[Fix] Fix building with CUDA 11.3 (#280)

* disable cache hint for CUDA < 11.4

* fix lint

* fix lint

* fix cuda-11.3 build
parent 06327355
@@ -293,10 +293,4 @@ struct Shape {
     }
 };
 
-template<int... Ns>
-Shape(std::integral_constant<int, Ns>...) -> Shape<Ns...>;
-
-template<int... Ns>
-inline constexpr Shape<Ns...> shape_c{};
-
 } // namespace turbomind
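Note: the deleted lines are a C++17 deduction guide plus an inline constexpr variable template, two features that nvcc 11.3 does not handle reliably; removing them is part of what lets this header build. A deduction-free sketch of the same convenience (hypothetical, not part of this commit; it assumes Shape<Ns...> is default-constructible and that <type_traits> is included):

template<int... Ns>
constexpr Shape<Ns...> make_shape(std::integral_constant<int, Ns>...)
{
    return {};
}
// usage: auto s = make_shape(std::integral_constant<int, 4>{}, std::integral_constant<int, 8>{});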
@@ -7,20 +7,28 @@
 namespace turbomind {
 
+#if (__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4))
+#define L2_CACHEHINT(size) ".L2::" #size "B"
+#else
+#define L2_CACHEHINT(size)
+#endif
+
 template<typename T>
 __inline__ __device__ void cp_async_cg_A(uint32_t smem_int_ptr, const T* __restrict__ src, bool mask)
 {
 #if TURBOMIND_ARCH_SM80
     constexpr int cp_size = sizeof(T);
     static_assert(cp_size == 16, "cp.async.cg requires cp_size == 16");
+    // clang-format off
     asm volatile("{\n"
                  "  .reg .pred p;\n"
                  "  setp.ne.b32 p, %0, 0;\n"
-                 "  @p cp.async.cg.shared.global.L2::256B [%1], [%2], %3;\n"
+                 "  @p cp.async.cg.shared.global" L2_CACHEHINT(256) " [%1], [%2], %3;\n"
                  "}\n" ::"r"((int)mask),
                  "r"(smem_int_ptr),
                  "l"(src),
                  "n"(cp_size));
+    // clang-format on
 #else
     assert(TURBOMIND_ARCH_SM80);
 #endif
@@ -32,14 +40,16 @@ __inline__ __device__ void cp_async_cg_B(uint32_t smem_int_ptr, const T* __restrict__ src, bool mask)
 #if TURBOMIND_ARCH_SM80
     constexpr int cp_size = sizeof(T);
     static_assert(cp_size == 16, "cp.async.cg requires cp_size == 16");
+    // clang-format off
     asm volatile("{\n"
                  "  .reg .pred p;\n"
                  "  setp.ne.b32 p, %0, 0;\n"
-                 "  @p cp.async.cg.shared.global.L2::128B [%1], [%2], %3;\n"
+                 "  @p cp.async.cg.shared.global" L2_CACHEHINT(128) " [%1], [%2], %3;\n"
                  "}\n" ::"r"((int)mask),
                  "r"(smem_int_ptr),
                  "l"(src),
                  "n"(cp_size));
+    // clang-format on
 #else
     assert(TURBOMIND_ARCH_SM80);
 #endif
@@ -50,14 +60,16 @@ __inline__ __device__ void cp_async_ca(uint32_t smem_int_ptr, const T* __restrict__ src, bool mask)
 {
 #if TURBOMIND_ARCH_SM80
     constexpr int cp_size = sizeof(T);
+    // clang-format off
     asm volatile("{\n"
                  "  .reg .pred p;\n"
                  "  setp.ne.b32 p, %0, 0;\n"
-                 "  @p cp.async.ca.shared.global.L2::128B [%1], [%2], %3;\n"
+                 "  @p cp.async.ca.shared.global" L2_CACHEHINT(128) " [%1], [%2], %3;\n"
                  "}\n" ::"r"((int)mask),
                  "r"(smem_int_ptr),
                  "l"(src),
                  "n"(cp_size));
+    // clang-format on
 #else
     assert(TURBOMIND_ARCH_SM80);
 #endif
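For reference, ".L2::<N>B" is a cache-prefetch hint on cp.async that ptxas only accepts from CUDA 11.4 on, which is why L2_CACHEHINT(size) expands to the suffix on newer toolkits and to nothing on older ones; string concatenation in the preprocessor keeps a single asm statement working either way. A self-contained sketch of the same pattern (hypothetical kernel name and signature, using the guarded macro defined above):

#include <cstdint>

#if (__CUDACC_VER_MAJOR__ > 11) || ((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4))
#define L2_CACHEHINT(size) ".L2::" #size "B"
#else
#define L2_CACHEHINT(size)
#endif

__global__ void copy16(uint32_t smem_int_ptr, const void* __restrict__ src)
{
#if __CUDA_ARCH__ >= 800
    // Becomes "cp.async.cg.shared.global.L2::128B ..." on CUDA >= 11.4
    // and plain "cp.async.cg.shared.global ..." on CUDA 11.0-11.3.
    asm volatile("cp.async.cg.shared.global" L2_CACHEHINT(128) " [%0], [%1], 16;\n" ::"r"(smem_int_ptr),
                 "l"(src));
#endif
}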
@@ -17,7 +17,7 @@
 #include "src/turbomind/utils/cuda_fp8_utils.h"
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
-#elif (CUDART_VERSION >= 11050)
+#elif (CUDART_VERSION >= 11000)
 #include <cub/cub.cuh>
 #else
 #include "3rdparty/cub/cub.cuh"
@@ -20,7 +20,7 @@
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
-#elif (CUDART_VERSION >= 11050)
+#elif (CUDART_VERSION >= 11000)
 #include <cub/cub.cuh>
 #else
 #include "3rdparty/cub/cub.cuh"
@@ -18,7 +18,7 @@
 #include <stdexcept>
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
-#elif (CUDART_VERSION >= 11050)
+#elif (CUDART_VERSION >= 11000)
 #include <cub/cub.cuh>
 #else
 #include "3rdparty/cub/cub.cuh"
@@ -16,7 +16,7 @@
 #ifndef CUDART_VERSION
 #error CUDART_VERSION Undefined!
-#elif (CUDART_VERSION >= 11050)
+#elif (CUDART_VERSION >= 11000)
 #include <cub/cub.cuh>
 #else
 #include "3rdparty/cub/cub.cuh"
@@ -115,7 +115,7 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
     constexpr int PACK_DIM = sizeof(uint4) / sizeof(T);
 
-    const auto batch_idx = grid.block_rank();
+    const auto batch_idx = block.group_index().x;
 
     uint4* __restrict__       r_ptr = reinterpret_cast<uint4*>(r_data + batch_idx * n_dims);
     uint4* __restrict__       x_ptr = reinterpret_cast<uint4*>(x_data + batch_idx * n_dims);
     const uint4* __restrict__ b_ptr = reinterpret_cast<const uint4*>(bias);
@@ -123,7 +123,7 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
     res_norm_t<T> ops;
 
     float thread_sum{};
-    for (auto i = block.thread_rank(); i < n_dims / PACK_DIM; i += block.num_threads()) {
+    for (auto i = block.thread_rank(); i < n_dims / PACK_DIM; i += block.size()) {
         auto  r = r_ptr[i];
         auto  x = x_ptr[i];
         uint4 b = b_ptr ? b_ptr[i] : uint4{};
@@ -136,7 +136,7 @@ __global__ void fusedAddBiasResidualNorm(T* __restrict__ r_data,
     float s_inv_mean = rsqrt(total_sum / n_dims + eps);
 
     const uint4* __restrict__ s_ptr = reinterpret_cast<const uint4*>(scale);
-    for (uint i = block.thread_rank(); i < n_dims / PACK_DIM; i += block.num_threads()) {
+    for (uint i = block.thread_rank(); i < n_dims / PACK_DIM; i += block.size()) {
         auto r = r_ptr[i];
         auto s = s_ptr[i];
         auto o = ops.normvec(r, s, s_inv_mean);
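These kernel hunks swap newer cooperative-groups accessors for ones that already exist in CUDA 11.3's <cooperative_groups.h>: thread_block::group_index().x instead of grid.block_rank() (equal for a 1-D grid) and thread_block::size() instead of block.num_threads(). A minimal sketch of the portable calls (hypothetical kernel, not from the commit):

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void scaleRows(float* data, unsigned int n_dims)
{
    auto block = cg::this_thread_block();
    const auto batch_idx = block.group_index().x;  // same value as blockIdx.x
    float*     row       = data + static_cast<size_t>(batch_idx) * n_dims;
    // Block-strided loop over one row, using only CUDA 11.3-era CG methods.
    for (auto i = block.thread_rank(); i < n_dims; i += block.size()) {
        row[i] *= 2.0f;
    }
}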
@@ -118,10 +118,13 @@ endif()
 set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
 
-target_compile_definitions(triton-turbomind-backend
-    PUBLIC
-    USE_TRITONSERVER_DATATYPE
-    BUILD_MULTI_GPU)
+target_compile_definitions(triton-turbomind-backend PUBLIC
+    USE_TRITONSERVER_DATATYPE)
+if (BUILD_MULTI_GPU)
+    target_compile_definitions(triton-turbomind-backend PUBLIC
+        BUILD_MULTI_GPU)
+endif ()
 
 target_include_directories(
     triton-turbomind-backend
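Previously BUILD_MULTI_GPU was defined unconditionally for the Triton backend; guarding it with the CMake option of the same name keeps single-GPU builds from compiling multi-GPU-only code paths. On the C++ side the definition is consumed as an ordinary macro; a hedged sketch (hypothetical source file, assuming the macro gates NCCL-dependent code):

#ifdef BUILD_MULTI_GPU
#include <nccl.h>  // only reachable when the CMake option is ON
#endif

inline bool multiGpuEnabled()
{
#ifdef BUILD_MULTI_GPU
    return true;   // multi-GPU communication paths compiled in
#else
    return false;  // single-GPU build
#endif
}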