Commit bb6f8060 (unverified), authored by lvhan028, committed by GitHub
Browse files

install triton_example and TransformerTritonBackend to runtime and lib respectively (#39)

parent 6e58fced
...@@ -376,6 +376,7 @@ install( ...@@ -376,6 +376,7 @@ install(
transformer-shared-targets transformer-shared-targets
LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/backends/turbomind
RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
) )
install( install(
......
...@@ -72,7 +72,7 @@ def warmup(tritonserver_addr: str, ...@@ -72,7 +72,7 @@ def warmup(tritonserver_addr: str,
def main(tritonserver_addr: str, def main(tritonserver_addr: str,
model_name: str, model_name: str,
concurrency: int = 1, concurrency: int = 1,
session_len: int = 2048, session_len: int = 2056,
input_seqlen: int = 0, input_seqlen: int = 0,
output_seqlen: int = 512, output_seqlen: int = 512,
test_round: int = 10): test_round: int = 10):
...@@ -116,7 +116,7 @@ def main(tritonserver_addr: str, ...@@ -116,7 +116,7 @@ def main(tritonserver_addr: str,
token_latency_max = np.max(stats[:, 2], axis=0) token_latency_max = np.max(stats[:, 2], axis=0)
token_latency_ave = np.mean(stats[:, 2], axis=0) token_latency_ave = np.mean(stats[:, 2], axis=0)
throughput = np.sum(stats[:, 1], axis=0) / np.sum(stats[:, 2], axis=0) throughput = np.sum(stats[:, 1], axis=0) / np.sum(stats[:, 2], axis=0)
print(f'\n{"-" * 50}\ncocurrency: {concurrency}, input_tokens: ' print(f'\n{"-" * 50}\nconcurrency: {concurrency}, input_tokens: '
f'{input_seqlen}, output_tokens: {output_seqlen}\n' f'{input_seqlen}, output_tokens: {output_seqlen}\n'
f'elapsed_time: {elapsed_time:.2f}s\n' f'elapsed_time: {elapsed_time:.2f}s\n'
f'first_token latency(min, max, ave): ' f'first_token latency(min, max, ave): '
......
...@@ -4,3 +4,5 @@ add_executable(llama_triton_example llama_triton_example.cc) ...@@ -4,3 +4,5 @@ add_executable(llama_triton_example llama_triton_example.cc)
target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart target_link_libraries(llama_triton_example PUBLIC -lcublas -lcublasLt -lcudart
LlamaTritonBackend TransformerTritonBackend mpi_utils nccl_utils LlamaTritonBackend TransformerTritonBackend mpi_utils nccl_utils
nvtx_utils word_list glog) nvtx_utils word_list glog)
install(TARGETS llama_triton_example DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)
...@@ -328,6 +328,7 @@ class Chatbot: ...@@ -328,6 +328,7 @@ class Chatbot:
f'#input tokens {input_tokens}, ' \ f'#input tokens {input_tokens}, ' \
f'history tokens {session.sequence_length}, ' \ f'history tokens {session.sequence_length}, ' \
f'request length {request_output_len}' f'request length {request_output_len}'
logger.warning(errmsg)
yield StatusCode.TRITON_SESSION_OUT_OF_LIMIT, errmsg, 0 yield StatusCode.TRITON_SESSION_OUT_OF_LIMIT, errmsg, 0
return return
......
...@@ -143,7 +143,7 @@ def export(model_name: str, ...@@ -143,7 +143,7 @@ def export(model_name: str,
# parameters for turbomind # parameters for turbomind
max_batch_size=32, max_batch_size=32,
max_context_token_num=4, max_context_token_num=4,
session_len=2048, session_len=2056,
step_length=1, step_length=1,
cache_max_entry_count=48, cache_max_entry_count=48,
cache_chunk_size=8, cache_chunk_size=8,
......
...@@ -284,5 +284,6 @@ export(PACKAGE TritonTurboMindBackend) ...@@ -284,5 +284,6 @@ export(PACKAGE TritonTurboMindBackend)
add_library(TransformerTritonBackend SHARED transformer_triton_backend.cpp) add_library(TransformerTritonBackend SHARED transformer_triton_backend.cpp)
target_link_libraries(TransformerTritonBackend PRIVATE nccl_utils mpi_utils) target_link_libraries(TransformerTritonBackend PRIVATE nccl_utils mpi_utils)
install(TARGETS TransformerTritonBackend DESTINATION ${CMAKE_INSTALL_LIBDIR})
add_subdirectory(llama) add_subdirectory(llama)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment