Unverified Commit 70e6ab26 authored by lvhan028, committed by GitHub

Change target tritonfastertransformerbackend to tritonturbomindbackend (#36)

* change target tritonfastertransformerbackend to tritonturbomindbackend

* install targets to backends/turbomind

* change model_dir
parent 35d64462
......@@ -374,8 +374,8 @@ install(
transformer-shared
EXPORT
transformer-shared-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
)
install(
......
......@@ -100,7 +100,7 @@ Run one of the following commands to serve a LLaMA model on NVIDIA GPU server:
```shell
python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```
</details>
......@@ -111,7 +111,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
```shell
python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```
</details>
......@@ -129,7 +129,7 @@ python3 -m fastchat.model.apply_delta \
--delta-path lmsys/vicuna-7b-delta-v1.1
python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```
</details>
......@@ -145,7 +145,7 @@ python3 -m fastchat.model.apply_delta \
--delta-path lmsys/vicuna-13b-delta-v1.1
python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```
</details>
......
......@@ -98,7 +98,7 @@ make -j$(nproc) && make install
```shell
python3 lmdeploy/serve/turbomind/deploy.py llama-7B /path/to/llama-7b llama \
--tokenizer_path /path/to/tokenizer/model
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```
</details>
......@@ -109,7 +109,7 @@ bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fast
```shell
python3 lmdeploy/serve/turbomind/deploy.py llama-13B /path/to/llama-13b llama \
--tokenizer_path /path/to/tokenizer/model --tp 2
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```
</details>
......@@ -127,7 +127,7 @@ python3 -m fastchat.model.apply_delta \
--delta-path lmsys/vicuna-7b-delta-v1.1
python3 lmdeploy/serve/turbomind/deploy.py vicuna-7B /path/to/vicuna-7b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```
</details>
......@@ -143,7 +143,7 @@ python3 -m fastchat.model.apply_delta \
--delta-path lmsys/vicuna-13b-delta-v1.1
python3 lmdeploy/serve/turbomind/deploy.py vicuna-13B /path/to/vicuna-13b hf
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/fastertransformer
bash workspace/service_docker_up.sh --lib-dir $(pwd)/build/install/backends/turbomind
```
</details>
......
......@@ -2,8 +2,8 @@
data_type=fp16
enable_custom_all_reduce=0
pipeline_para_size=1
tensor_para_size=8
model_dir=/shared_data/chatpjlm-0/v0.2.3/fastertransformer/weights/
tensor_para_size=1
model_dir=/workspace/models/triton_models/weights/
[request]
......
......@@ -227,7 +227,7 @@ def deploy_llama(model_name: str, model_path: str, tokenizer_path: str,
del ckpt
for name, param in model_params.items():
# transpose all weights as FasterTransformer is expecting column-major
# transpose all weights as TurboMind is expecting column-major
# weights: (output_dims, input_dims) -> (input_dims, output_dims)
key = name.split('.')[-2]
if key in ['w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo']:
......
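For context, the conversion the comment above describes can be sketched in Python. This is an illustrative rewrite, not the repository's deploy code; it assumes `model_params` maps names such as `layers.0.feed_forward.w1.weight` to 2-D PyTorch tensors:

```python
import torch

# Hypothetical sketch of the column-major conversion described above:
# TurboMind expects (input_dims, output_dims), so weights stored as
# (output_dims, input_dims) are transposed before export.
TRANSPOSED_KEYS = {'w1', 'w3', 'wq', 'wk', 'wv', 'w2', 'wo'}

def to_column_major(model_params: dict) -> dict:
    converted = {}
    for name, param in model_params.items():
        key = name.split('.')[-2]          # e.g. 'w1' from '...feed_forward.w1.weight'
        if key in TRANSPOSED_KEYS:
            converted[name] = param.t().contiguous()  # (out, in) -> (in, out)
        else:
            converted[name] = param
    return converted
```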
......@@ -5,7 +5,7 @@ show_help() {
echo
echo "Options:"
echo " -h, --help Show this help message and exit"
echo " --lib-dir Specify the directory of fastertransformer libraries"
echo " --lib-dir Specify the directory of turbomind libraries"
}
# check if '-h' or '--help' in the arguments
......@@ -64,7 +64,7 @@ for ((i = 1; i <= $#; i++)); do
docker run \
--gpus $DEVICES \
--rm \
-v "${LIB_PATH}":/opt/tritonserver/backends/fastertransformer \
-v "${LIB_PATH}":/opt/tritonserver/backends/turbomind \
-v ""${SCRIPT_ABS_DIR}"":/workspace/models \
--shm-size 16g \
-p 33336:22 \
......
......@@ -25,7 +25,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
name: "turbomind"
backend: "fastertransformer"
backend: "turbomind"
default_model_filename: "weights"
max_batch_size: 1
......
......@@ -148,7 +148,7 @@ size_t curandStateGetSize()
bool isDebug()
{
static const bool is_debug = [] {
const auto level = std::getenv("FT_DEBUG_LEVEL");
const auto level = std::getenv("TM_DEBUG_LEVEL");
if (level && level == std::string("DEBUG")) {
return true;
}
......
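The renamed environment variable gates debug output. Its behavior is roughly equivalent to the following Python sketch; the real check is the C++ shown above, this is only an illustration of the logic:

```python
import os

def is_debug() -> bool:
    # Debug output is enabled only when TM_DEBUG_LEVEL is exactly "DEBUG"
    # (the variable was previously named FT_DEBUG_LEVEL).
    return os.environ.get("TM_DEBUG_LEVEL") == "DEBUG"
```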
......@@ -26,7 +26,7 @@
cmake_minimum_required (VERSION 3.18)
project(tritonfastertransformerbackend LANGUAGES C CXX)
project(tritonturbomindbackend LANGUAGES C CXX)
#
# Options
......@@ -89,12 +89,12 @@ endif() # TRITON_ENABLE_GPU
configure_file(libtriton_fastertransformer.ldscript libtriton_fastertransformer.ldscript COPYONLY)
add_library(
triton-fastertransformer-backend SHARED
triton-turbomind-backend SHARED
libfastertransformer.cc
)
add_library(
TritonFasterTransformerBackend::triton-fastertransformer-backend ALIAS triton-fastertransformer-backend
TritonTurboMindBackend::triton-turbomind-backend ALIAS triton-turbomind-backend
)
find_package(CUDAToolkit REQUIRED)
......@@ -106,13 +106,13 @@ endif()
set(CUDA_PATH ${CUDA_TOOLKIT_ROOT_DIR})
target_compile_definitions(triton-fastertransformer-backend
target_compile_definitions(triton-turbomind-backend
PUBLIC
USE_TRITONSERVER_DATATYPE
BUILD_MULTI_GPU)
target_include_directories(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/src
${TRITON_PYTORCH_INCLUDE_PATHS}
......@@ -123,31 +123,31 @@ target_include_directories(
)
target_link_directories(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
${CUDA_PATH}/lib64
)
target_compile_features(triton-fastertransformer-backend PRIVATE cxx_std_14)
target_compile_features(triton-turbomind-backend PRIVATE cxx_std_14)
target_compile_options(
triton-fastertransformer-backend PRIVATE
triton-turbomind-backend PRIVATE
$<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-Wall -Wextra -Wno-unused-parameter -Wno-type-limits >#-Werror>
)
if(${TRITON_ENABLE_GPU})
target_compile_definitions(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE TRITON_ENABLE_GPU=1
)
endif() # TRITON_ENABLE_GPU
set_target_properties(
triton-fastertransformer-backend
triton-turbomind-backend
PROPERTIES
POSITION_INDEPENDENT_CODE ON
OUTPUT_NAME triton_fastertransformer
OUTPUT_NAME triton_turbomind
SKIP_BUILD_RPATH TRUE
BUILD_WITH_INSTALL_RPATH TRUE
INSTALL_RPATH_USE_LINK_PATH FALSE
......@@ -159,7 +159,7 @@ set_target_properties(
# Need to turn off unused-but-set-variable due to Torchvision
# Need to turn off unknown-pragmas due to ATen OpenMP
set_target_properties(
triton-fastertransformer-backend
triton-turbomind-backend
PROPERTIES COMPILE_FLAGS
"-Wno-unknown-pragmas -Wno-unused-but-set-variable"
)
......@@ -170,7 +170,7 @@ FOREACH(p ${TRITON_PYTORCH_LIB_PATHS})
ENDFOREACH(p)
target_link_libraries(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
triton-core-serverapi # from repo-core
triton-core-backendapi # from repo-core
......@@ -186,23 +186,23 @@ target_link_libraries(
if (BUILD_MULTI_GPU)
target_compile_definitions(
triton-fastertransformer-backend
triton-turbomind-backend
PUBLIC
BUILD_MULTI_GPU
)
target_include_directories(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
${MPI_INCLUDE_PATH}
)
target_link_directories(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
${MPI_Libraries}
/usr/local/mpi/lib
)
target_link_libraries(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
${NCCL_LIBRARIES}
${MPI_LIBRARIES}
......@@ -211,7 +211,7 @@ endif()
if(${TRITON_ENABLE_GPU})
target_link_libraries(
triton-fastertransformer-backend
triton-turbomind-backend
PRIVATE
CUDA::cudart
)
......@@ -221,38 +221,38 @@ endif() # TRITON_ENABLE_GPU
# Install
#
include(GNUInstallDirs)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TritonFasterTransformerBackend)
set(INSTALL_CONFIGDIR ${CMAKE_INSTALL_LIBDIR}/cmake/TurboMindBackend)
install(
TARGETS
triton-fastertransformer-backend
triton-turbomind-backend
EXPORT
triton-fastertransformer-backend-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/fastertransformer
triton-turbomind-backend-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
)
install(
EXPORT
triton-fastertransformer-backend-targets
triton-turbomind-backend-targets
FILE
TritonFasterTransformerBackendTargets.cmake
TritonTurboMindBackendTargets.cmake
NAMESPACE
TritonFasterTransformerBackend::
TritonTurboMindBackend::
DESTINATION
${INSTALL_CONFIGDIR}
)
include(CMakePackageConfigHelpers)
configure_package_config_file(
${CMAKE_SOURCE_DIR}/cmake/TritonFasterTransformerBackendConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendConfig.cmake
${CMAKE_SOURCE_DIR}/cmake/TritonTurboMindBackendConfig.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/TritonTurboMindBackendConfig.cmake
INSTALL_DESTINATION ${INSTALL_CONFIGDIR}
)
install(
FILES
${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendConfig.cmake
${CMAKE_CURRENT_BINARY_DIR}/TritonTurboMindBackendConfig.cmake
DESTINATION ${INSTALL_CONFIGDIR}
)
......@@ -260,12 +260,12 @@ install(
# Export from build tree
#
export(
EXPORT triton-fastertransformer-backend-targets
FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonFasterTransformerBackendTargets.cmake
NAMESPACE TritonFasterTransformerBackend::
EXPORT triton-turbomind-backend-targets
FILE ${CMAKE_CURRENT_BINARY_DIR}/TritonTurboMindBackendTargets.cmake
NAMESPACE TritonTurboMindBackend::
)
export(PACKAGE TritonFasterTransformerBackend)
export(PACKAGE TritonTurboMindBackend)
# Copyright (c) 2021-2023, NVIDIA CORPORATION. All rights reserved.
......
......@@ -511,11 +511,11 @@ TRITONSERVER_Error* ModelState::AutoCompleteConfig()
}
}
else {
// Auto-complete configuration is not supported since fastertransformer does
// Auto-complete configuration is not supported since turbomind does
// not store/capture sufficient model metadata so just log error instead.
LOG_MESSAGE(TRITONSERVER_LOG_WARN,
(std::string("skipping model configuration auto-complete for '") + Name()
+ "': not supported for fastertransformer backend")
+ "': not supported for turbomind backend")
.c_str());
}
......@@ -940,7 +940,7 @@ void ModelInstanceState::ProcessRequests(TRITONBACKEND_Request** requests, const
request_count,
TRITONSERVER_ErrorNew(
TRITONSERVER_ERROR_INTERNAL,
std::string("null request given to FasterTransformer backend for '" + Name() + "'").c_str()));
std::string("null request given to TurboMind backend for '" + Name() + "'").c_str()));
return;
}
......@@ -1115,7 +1115,7 @@ void ModelInstanceState::ProcessRequests(TRITONBACKEND_Request** requests, const
for (auto& response : responses) {
if (response != nullptr) {
LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr),
"failed to send FasterTransformer backend response");
"failed to send TurboMind backend response");
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("response is sent")).c_str());
}
else {
......@@ -1160,7 +1160,7 @@ void streaming_callback(std::shared_ptr<std::unordered_map<std::string, Tensor>>
if (response != nullptr) {
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("start to send streaming response")).c_str());
LOG_IF_ERROR(TRITONBACKEND_ResponseSend(response, 0, nullptr),
"failed to send FasterTransformer backend response");
"failed to send TurboMind backend response");
LOG_MESSAGE(TRITONSERVER_LOG_VERBOSE, (std::string("streaming response is sent")).c_str());
}
else {
......@@ -1358,7 +1358,7 @@ ModelInstanceState::Execute(std::vector<TRITONBACKEND_Response*>*
responses,
response_count,
TRITONSERVER_ErrorNew(TRITONSERVER_ERROR_INTERNAL,
("FasterTransformer execute failure: " + std::string(ex.what())).c_str()));
("TurboMind execute failure: " + std::string(ex.what())).c_str()));
}
auto output_tensors = output_tensors_list[0];
return output_tensors;
......