Unverified commit b5ee1e32, authored by Peter Pan, committed by GitHub
Browse files

Remove deprecated `PyNcclConnector` (#24151)


Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
parent 36c260da
...@@ -62,7 +62,7 @@ benchmark() { ...@@ -62,7 +62,7 @@ benchmark() {
--max-model-len 10000 \ --max-model-len 10000 \
--gpu-memory-utilization 0.6 \ --gpu-memory-utilization 0.6 \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \ CUDA_VISIBLE_DEVICES=1 python3 \
...@@ -72,7 +72,7 @@ benchmark() { ...@@ -72,7 +72,7 @@ benchmark() {
--max-model-len 10000 \ --max-model-len 10000 \
--gpu-memory-utilization 0.6 \ --gpu-memory-utilization 0.6 \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
wait_for_server 8100 wait_for_server 8100
wait_for_server 8200 wait_for_server 8200
......
...@@ -69,7 +69,7 @@ launch_disagg_prefill() { ...@@ -69,7 +69,7 @@ launch_disagg_prefill() {
--max-model-len 10000 \ --max-model-len 10000 \
--gpu-memory-utilization 0.6 \ --gpu-memory-utilization 0.6 \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
CUDA_VISIBLE_DEVICES=1 python3 \ CUDA_VISIBLE_DEVICES=1 python3 \
-m vllm.entrypoints.openai.api_server \ -m vllm.entrypoints.openai.api_server \
...@@ -78,7 +78,7 @@ launch_disagg_prefill() { ...@@ -78,7 +78,7 @@ launch_disagg_prefill() {
--max-model-len 10000 \ --max-model-len 10000 \
--gpu-memory-utilization 0.6 \ --gpu-memory-utilization 0.6 \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
wait_for_server 8100 wait_for_server 8100
wait_for_server 8200 wait_for_server 8200
......
...@@ -30,12 +30,12 @@ def run_prefill(prefill_done): ...@@ -30,12 +30,12 @@ def run_prefill(prefill_done):
] ]
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1) sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
# Using PyNcclConnector to transmit KV caches between vLLM instances. # Using P2pNcclConnector to transmit KV caches between vLLM instances.
# This instance is the prefill node (kv_producer, rank 0). # This instance is the prefill node (kv_producer, rank 0).
# The number of parallel instances for KV cache transfer is set to 2, # The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector. # as required for P2pNcclConnector.
ktc = KVTransferConfig( ktc = KVTransferConfig(
kv_connector="PyNcclConnector", kv_connector="P2pNcclConnector",
kv_role="kv_producer", kv_role="kv_producer",
kv_rank=0, kv_rank=0,
kv_parallel_size=2, kv_parallel_size=2,
...@@ -74,12 +74,12 @@ def run_decode(prefill_done): ...@@ -74,12 +74,12 @@ def run_decode(prefill_done):
] ]
sampling_params = SamplingParams(temperature=0, top_p=0.95) sampling_params = SamplingParams(temperature=0, top_p=0.95)
# Using PyNcclConnector to transmit KV caches between vLLM instances. # Using P2pNcclConnector to transmit KV caches between vLLM instances.
# This instance is the decode node (kv_consumer, rank 1). # This instance is the decode node (kv_consumer, rank 1).
# The number of parallel instances for KV cache transfer is set to 2, # The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector. # as required for P2pNcclConnector.
ktc = KVTransferConfig( ktc = KVTransferConfig(
kv_connector="PyNcclConnector", kv_connector="P2pNcclConnector",
kv_role="kv_consumer", kv_role="kv_consumer",
kv_rank=1, kv_rank=1,
kv_parallel_size=2, kv_parallel_size=2,
......
...@@ -53,7 +53,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \ ...@@ -53,7 +53,7 @@ CUDA_VISIBLE_DEVICES=0 vllm serve $MODEL_NAME \
--gpu-memory-utilization 0.8 \ --gpu-memory-utilization 0.8 \
--trust-remote-code \ --trust-remote-code \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}' &
# decoding instance, which is the KV consumer # decoding instance, which is the KV consumer
CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
...@@ -62,7 +62,7 @@ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \ ...@@ -62,7 +62,7 @@ CUDA_VISIBLE_DEVICES=1 vllm serve $MODEL_NAME \
--gpu-memory-utilization 0.8 \ --gpu-memory-utilization 0.8 \
--trust-remote-code \ --trust-remote-code \
--kv-transfer-config \ --kv-transfer-config \
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' & '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}' &
# wait until prefill and decode instances are ready # wait until prefill and decode instances are ready
wait_for_server 8100 wait_for_server 8100
......
...@@ -128,7 +128,7 @@ if __name__ == "__main__": ...@@ -128,7 +128,7 @@ if __name__ == "__main__":
print(f"initialized! My rank is {my_rank}") print(f"initialized! My rank is {my_rank}")
config = KVTransferConfig( config = KVTransferConfig(
kv_connector='PyNcclConnector', kv_connector='P2pNcclConnector',
kv_buffer_device='cuda', kv_buffer_device='cuda',
kv_buffer_size=1e9, kv_buffer_size=1e9,
kv_rank=my_rank, kv_rank=my_rank,
......
...@@ -137,7 +137,7 @@ if __name__ == "__main__": ...@@ -137,7 +137,7 @@ if __name__ == "__main__":
) )
config = KVTransferConfig( config = KVTransferConfig(
kv_connector='PyNcclConnector', kv_connector='P2pNcclConnector',
kv_buffer_device='cuda', kv_buffer_device='cuda',
kv_buffer_size=1e9, kv_buffer_size=1e9,
kv_rank=my_rank, kv_rank=my_rank,
......
...@@ -3247,7 +3247,7 @@ class KVTransferConfig: ...@@ -3247,7 +3247,7 @@ class KVTransferConfig:
kv_parallel_size: int = 1 kv_parallel_size: int = 1
"""The number of parallel instances for KV cache transfer. For """The number of parallel instances for KV cache transfer. For
PyNcclConnector, this should be 2.""" P2pNcclConnector, this should be 2."""
kv_ip: str = "127.0.0.1" kv_ip: str = "127.0.0.1"
"""The KV connector ip, used to build distributed connection.""" """The KV connector ip, used to build distributed connection."""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment