Unverified commit 8add942d authored by tpoisonooo, committed by GitHub

improvement(build): enable ninja and gold linker (#767)

* feat(build): enable ninja and lld

* fix(.github): add ninja installation

* fix(CI): remove dimsize=256

* fix(CI): add option for generate.sh

* fix(docs): update
parent 8c672a7b
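The diff below assumes both Ninja and the gold linker are already present on the build machine. A minimal setup sketch for a Debian/Ubuntu image (package names are an assumption; adjust for other distributions):

```shell
apt-get update
apt-get install -y ninja-build          # Ninja, picked up by generate.sh below
apt-get install -y binutils             # ships ld.gold, the target of -fuse-ld=gold
ninja --version && ld.gold --version    # sanity-check both tools are on PATH
```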
@@ -52,5 +52,5 @@ jobs:
       source /opt/conda/bin/activate
       conda activate py38
       mkdir build && cd build
-      bash ../generate.sh
+      bash ../generate.sh make
       make -j$(nproc) && make install
@@ -103,7 +103,9 @@ if(USE_TRITONSERVER_DATATYPE)
 endif()
 set(CXX_STD "17" CACHE STRING "C++ standard")
+# enable the gold linker for binaries and shared libraries
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=gold")
 set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
 set(TF_PATH "" CACHE STRING "TensorFlow path")
......
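To confirm that `-fuse-ld=gold` actually took effect, one can rely on gold stamping a `.note.gnu.gold-version` note into every ELF file it links (BFD ld emits no such note). A hedged check, assuming the `./install` prefix set by `generate.sh`:

```shell
# Prints the gold version string for gold-linked libraries; errors for others.
readelf -p .note.gnu.gold-version ./install/lib/*.so
```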
@@ -12,7 +12,7 @@ conda activate $PYTHON_VERSION
 cd lmdeploy
 mkdir -p build && cd build && rm -rf *
-bash ../generate.sh
+bash ../generate.sh make
 make -j$(nproc) && make install
 if [ $? != 0 ]; then
     echo "build failed"
......
@@ -67,10 +67,11 @@ Then, follow the steps below to set up the compilation environment:
     ```
 - build and install lmdeploy libraries:
     ```shell
+    apt install ninja-build # install ninja
     cd lmdeploy # the home folder of lmdeploy
     mkdir build && cd build
     sh ../generate.sh
-    make -j$(nproc) && make install
+    ninja -j$(nproc) && ninja install
     ```
 - install lmdeploy python package:
     ```shell
......
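If `ninja-build` cannot be installed, the updated `generate.sh` keeps a Makefile escape hatch: passing `make` (as the CI and builder hunks above already do) skips the Ninja generator. A sketch of that fallback flow:

```shell
cd lmdeploy
mkdir -p build && cd build
sh ../generate.sh make           # configure with CMake's default Makefile generator
make -j$(nproc) && make install
```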
@@ -67,10 +67,11 @@ The wheel files are stored under `builder/manywheel/cuda11.8_dist`.
     ```
 - build and install lmdeploy:
     ```shell
+    apt install ninja-build # install the faster Ninja build tool
     cd lmdeploy # the root of the lmdeploy source tree
     mkdir build && cd build
     sh ../generate.sh
-    make -j$(nproc) && make install
+    ninja -j$(nproc) && ninja install
     ```
 - install the lmdeploy python package:
     ```shell
......
 #!/bin/sh
-cmake .. \
+# default to the Ninja generator; pass "make" to fall back to Unix Makefiles
+builder="-G Ninja"
+if [ "$1" = "make" ]; then
+    builder=""
+fi
+cmake ${builder} .. \
     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
     -DCMAKE_INSTALL_PREFIX=./install \
......
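The script's first argument now selects the CMake generator: with no argument it configures with `-G Ninja`, while `make` falls back to CMake's default Unix Makefiles generator, so both build flows in this commit keep working:

```shell
bash ../generate.sh        # Ninja:     build with `ninja -j$(nproc) && ninja install`
bash ../generate.sh make   # Makefiles: build with `make -j$(nproc) && make install`
```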
@@ -4,10 +4,10 @@ project(flash_attention2)
 add_library(${PROJECT_NAME} STATIC
     flash_api.cpp
-    flash_fwd_hdim32_fp16_sm80.cu
-    flash_fwd_hdim64_fp16_sm80.cu
+    # flash_fwd_hdim32_fp16_sm80.cu
+    # flash_fwd_hdim64_fp16_sm80.cu
     flash_fwd_hdim128_fp16_sm80.cu
-    flash_fwd_hdim256_fp16_sm80.cu
+    # flash_fwd_hdim256_fp16_sm80.cu
 )
 target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR}/include)
 target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass)
......
@@ -63,6 +63,7 @@ void run_flash_fwd(Flash_fwd_params& params, cudaStream_t stream)
     });
 }
+#if 0
 template<typename T>
 void run_mha_fwd_hdim32(Flash_fwd_params& params, cudaStream_t stream)
 {
@@ -100,6 +101,7 @@ void run_mha_fwd_hdim64(Flash_fwd_params& params, cudaStream_t stream)
     }
     });
 }
+#endif
 template<typename T>
 void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream)
@@ -145,6 +147,7 @@ void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream)
     });
 }
+#if 0
 template<typename T>
 void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream)
 {
@@ -174,3 +177,4 @@ void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream)
     // Is_causal>(params, stream);
     });
 }
+#endif
@@ -38,6 +38,7 @@
     } \
     }()
+#if 0
 #define FWD_HEADDIM_SWITCH(HEADDIM, ...) \
     [&] { \
         if (HEADDIM <= 32) { \
@@ -57,3 +58,10 @@
             return __VA_ARGS__(); \
         } \
     }()
+#else
+#define FWD_HEADDIM_SWITCH(HEADDIM, ...) \
+    [&] { \
+        constexpr static int kHeadDim = 128; \
+        return __VA_ARGS__(); \
+    }()
+#endif
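With `FWD_HEADDIM_SWITCH` pinned to `kHeadDim = 128` and the other launchers compiled out above, only hdim128 kernels should survive in the flash-attention archive. A verification sketch; the library name is an assumption derived from `project(flash_attention2)`:

```shell
# Locate the static library in the build tree (name assumed from project()).
lib=$(find build -name 'libflash_attention2.a' | head -n 1)
# Expect only run_mha_fwd_hdim128 instantiations; hdim32/64/256 should be gone.
nm -C --defined-only "$lib" | grep run_mha_fwd_hdim
```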