Unverified Commit bb6f8060 authored by lvhan028's avatar lvhan028 Committed by GitHub
Browse files

install triton_example and TransformerTritonBackend to runtime and lib respectively (#39)

parent 6e58fced
......@@ -376,6 +376,7 @@ install(
transformer-shared-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
)
install(
......
......@@ -72,7 +72,7 @@ def warmup(tritonserver_addr: str,
def main(tritonserver_addr: str,
model_name: str,
concurrency: int = 1,
session_len: int = 2048,
session_len: int = 2056,
input_seqlen: int = 0,
output_seqlen: int = 512,
test_round: int = 10):
......@@ -116,7 +116,7 @@ def main(tritonserver_addr: str,
token_latency_max = np.max(stats[:, 2], axis=0)
token_latency_ave = np.mean(stats[:, 2], axis=0)
throughput = np.sum(stats[:, 1], axis=0) / np.sum(stats[:, 2], axis=0)
print(f'\n{"-" * 50}\ncocurrency: {concurrency}, input_tokens: '
print(f'\n{"-" * 50}\nconcurrency: {concurrency}, input_tokens: '
f'{input_seqlen}, output_tokens: {output_seqlen}\n'
f'elapsed_time: {elapsed_time:.2f}s\n'
f'first_token latency(min, max, ave): '
......
......@@ -4,3 +4,5 @@ add_executable(llama_triton_example llama_triton_example.cc)
target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart
LlamaTritonBackend TransformerTritonBackend mpi_utils nccl_utils
nvtx_utils word_list glog)
install(TARGETS llama_triton_example DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)
......@@ -328,6 +328,7 @@ class Chatbot:
f'#input tokens {input_tokens}, ' \
f'history tokens {session.sequence_length}, ' \
f'request length {request_output_len}'
logger.warning(errmsg)
yield StatusCode.TRITON_SESSION_OUT_OF_LIMIT, errmsg, 0
return
......
......@@ -143,7 +143,7 @@ def export(model_name: str,
# parameters for turbomind
max_batch_size=32,
max_context_token_num=4,
session_len=2048,
session_len=2056,
step_length=1,
cache_max_entry_count=48,
cache_chunk_size=8,
......
......@@ -284,5 +284,6 @@ export(PACKAGE TritonTurboMindBackend)
add_library(TransformerTritonBackend SHARED transformer_triton_backend.cpp)
target_link_libraries(TransformerTritonBackend PRIVATE nccl_utils mpi_utils)
install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_INSTALL_LIBDIR})
add_subdirectory(llama)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment