"examples/vscode:/vscode.git/clone" did not exist on "75bf7c9b0fb7f6cd160a8babba5e7234f92305bb"
Unverified commit 84454ab4, authored by Tanmay Verma, committed by GitHub
Browse files

fix: Fix message truncation in disagg flow (#1572)

parent 4abab20f
@@ -30,9 +30,7 @@ context_servers:
 max_batch_size: 16
 enable_chunked_prefill: false
 kv_cache_config:
-free_gpu_memory_fraction: 0.40
+free_gpu_memory_fraction: 0.75
-cache_transceiver_config:
-max_num_tokens: 10240
 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
 # Overlap scheduler not currently supported in context-only
@@ -47,9 +45,7 @@ generation_servers:
 max_num_tokens: 256
 max_batch_size: 256
 kv_cache_config:
-free_gpu_memory_fraction: 0.40
+free_gpu_memory_fraction: 0.75
-cache_transceiver_config:
-max_num_tokens: 256
 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
 disable_overlap_scheduler: false
......
@@ -30,11 +30,9 @@ context_servers:
 max_batch_size: 16
 enable_chunked_prefill: false
 kv_cache_config:
-free_gpu_memory_fraction: 0.40
+free_gpu_memory_fraction: 0.75
 event_buffer_max_size: 1024
 enable_block_reuse: true
-cache_transceiver_config:
-max_num_tokens: 10240
 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
 # Overlap scheduler not currently supported in context-only
@@ -50,11 +48,9 @@ generation_servers:
 max_num_tokens: 256
 max_batch_size: 256
 kv_cache_config:
-free_gpu_memory_fraction: 0.40
+free_gpu_memory_fraction: 0.75
 event_buffer_max_size: 1024
 enable_block_reuse: true
-cache_transceiver_config:
-max_num_tokens: 256
 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
 disable_overlap_scheduler: false
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment