Unverified commit 8add942d authored by tpoisonooo, committed by GitHub

improvement(build): enable ninja and gold linker (#767)

* feat(build): enable ninja and lld

* fix(.github): add ninja installation

* fix(CI): remove dimsize=256

* fix(CI): add option for generate.sh

* fix(docs): update
parent 8c672a7b
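The diff below assumes both Ninja and the gold linker are already present on the build machine. A minimal setup sketch for a Debian/Ubuntu image (package names are an assumption; adjust for other distributions):

```shell
apt-get update
apt-get install -y ninja-build          # Ninja, picked up by generate.sh below
apt-get install -y binutils             # ships ld.gold, the target of -fuse-ld=gold
ninja --version && ld.gold --version    # sanity-check both tools are on PATH
```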
@@ -52,5 +52,5 @@ jobs:
       source /opt/conda/bin/activate
       conda activate py38
       mkdir build && cd build
-      bash ../generate.sh
+      bash ../generate.sh make
       make -j$(nproc) && make install
@@ -103,7 +103,9 @@ if(USE_TRITONSERVER_DATATYPE)
 endif()
 set(CXX_STD "17" CACHE STRING "C++ standard")
+# enable the gold linker for binaries and shared libraries
+set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=gold")
+set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=gold")
 set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
 set(TF_PATH "" CACHE STRING "TensorFlow path")
......
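To confirm that `-fuse-ld=gold` actually took effect, one can rely on gold stamping a `.note.gnu.gold-version` note into every ELF file it links (BFD ld emits no such note). A hedged check, assuming the `./install` prefix set by `generate.sh`:

```shell
# Prints the gold version string for gold-linked libraries; errors for others.
readelf -p .note.gnu.gold-version ./install/lib/*.so
```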
@@ -12,7 +12,7 @@ conda activate $PYTHON_VERSION
 cd lmdeploy
 mkdir -p build && cd build && rm -rf *
-bash ../generate.sh
+bash ../generate.sh make
 make -j$(nproc) && make install
 if [ $? != 0 ]; then
     echo "build failed"
......
@@ -67,10 +67,11 @@ Then, follow the steps below to set up the compilation environment:
     ```
 - build and install lmdeploy libraries:
     ```shell
+    apt install ninja-build # install ninja
     cd lmdeploy # the home folder of lmdeploy
     mkdir build && cd build
     sh ../generate.sh
-    make -j$(nproc) && make install
+    ninja -j$(nproc) && ninja install
     ```
 - install lmdeploy python package:
     ```shell
......
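If `ninja-build` cannot be installed, the updated `generate.sh` keeps a Makefile escape hatch: passing `make` (as the CI and builder hunks above already do) skips the Ninja generator. A sketch of that fallback flow:

```shell
cd lmdeploy
mkdir -p build && cd build
sh ../generate.sh make           # configure with CMake's default Makefile generator
make -j$(nproc) && make install
```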
@@ -67,10 +67,11 @@ The wheel files are stored under `builder/manywheel/cuda11.8_dist`.
     ```
 - build and install lmdeploy:
     ```shell
+    apt install ninja-build # install the faster Ninja build tool
     cd lmdeploy # the root of the lmdeploy source tree
     mkdir build && cd build
     sh ../generate.sh
-    make -j$(nproc) && make install
+    ninja -j$(nproc) && ninja install
     ```
 - install the lmdeploy python package:
     ```shell
......
 #!/bin/sh
-cmake .. \
+# default to the Ninja generator; pass "make" to fall back to Unix Makefiles
+builder="-G Ninja"
+if [ "$1" = "make" ]; then
+    builder=""
+fi
+cmake ${builder} .. \
     -DCMAKE_BUILD_TYPE=RelWithDebInfo \
     -DCMAKE_EXPORT_COMPILE_COMMANDS=1 \
     -DCMAKE_INSTALL_PREFIX=./install \
......
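The script's first argument now selects the CMake generator: with no argument it configures with `-G Ninja`, while `make` falls back to CMake's default Unix Makefiles generator, so both build flows in this commit keep working:

```shell
bash ../generate.sh        # Ninja:     build with `ninja -j$(nproc) && ninja install`
bash ../generate.sh make   # Makefiles: build with `make -j$(nproc) && make install`
```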
@@ -4,10 +4,10 @@ project(flash_attention2)
 add_library(${PROJECT_NAME} STATIC
     flash_api.cpp
-    flash_fwd_hdim32_fp16_sm80.cu
-    flash_fwd_hdim64_fp16_sm80.cu
+    # flash_fwd_hdim32_fp16_sm80.cu
+    # flash_fwd_hdim64_fp16_sm80.cu
     flash_fwd_hdim128_fp16_sm80.cu
-    flash_fwd_hdim256_fp16_sm80.cu
+    # flash_fwd_hdim256_fp16_sm80.cu
 )
 target_include_directories(${PROJECT_NAME} PRIVATE ${CUTLASS_DIR}/include)
 target_link_libraries(${PROJECT_NAME} PRIVATE nvidia::cutlass::cutlass)
......
@@ -63,6 +63,7 @@ void run_flash_fwd(Flash_fwd_params& params, cudaStream_t stream)
     });
 }
+#if 0
 template<typename T>
 void run_mha_fwd_hdim32(Flash_fwd_params& params, cudaStream_t stream)
 {
@@ -100,6 +101,7 @@ void run_mha_fwd_hdim64(Flash_fwd_params& params, cudaStream_t stream)
     }
     });
 }
+#endif
 template<typename T>
 void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream)
@@ -145,6 +147,7 @@ void run_mha_fwd_hdim128(Flash_fwd_params& params, cudaStream_t stream)
     });
 }
+#if 0
 template<typename T>
 void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream)
 {
@@ -174,3 +177,4 @@ void run_mha_fwd_hdim256(Flash_fwd_params& params, cudaStream_t stream)
     // Is_causal>(params, stream);
     });
 }
+#endif
@@ -38,6 +38,7 @@
     } \
     }()
+#if 0
 #define FWD_HEADDIM_SWITCH(HEADDIM, ...) \
     [&] { \
         if (HEADDIM <= 32) { \
@@ -57,3 +58,10 @@
             return __VA_ARGS__(); \
         } \
     }()
+#else
+#define FWD_HEADDIM_SWITCH(HEADDIM, ...) \
+    [&] { \
+        constexpr static int kHeadDim = 128; \
+        return __VA_ARGS__(); \
+    }()
+#endif
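With `FWD_HEADDIM_SWITCH` pinned to `kHeadDim = 128` and the other launchers compiled out above, only hdim128 kernels should survive in the flash-attention archive. A verification sketch; the library name is an assumption derived from `project(flash_attention2)`:

```shell
# Locate the static library in the build tree (name assumed from project()).
lib=$(find build -name 'libflash_attention2.a' | head -n 1)
# Expect only run_mha_fwd_hdim128 instantiations; hdim32/64/256 should be gone.
nm -C --defined-only "$lib" | grep run_mha_fwd_hdim
```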