Commit fb3c32c6 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.9.2-dev-pd_tbo' into 'v0.9.2-dev'

V0.9.2 dev pd tbo

See merge request dcutoolkit/deeplearing/vllm!204
parents 0627b53a fc0e53fe
...@@ -215,9 +215,18 @@ class P2pNcclConnector(KVConnectorBase_V1): ...@@ -215,9 +215,18 @@ class P2pNcclConnector(KVConnectorBase_V1):
inject_kv_into_layer(kv_cache_layer, kv_cache, inject_kv_into_layer(kv_cache_layer, kv_cache,
request.slot_mapping, request.request_id) request.slot_mapping, request.request_id)
tensor = self.p2p_nccl_engine.recv_store.pop(request.request_id + "#" + layer_name, None) tensor_id = request.request_id + "#" + layer_name
if tensor is not None: if tensor_id in self.p2p_nccl_engine.recv_store:
del tensor tensor = self.p2p_nccl_engine.recv_store.pop(tensor_id, None)
self.p2p_nccl_engine.send_request_id_to_tensor_ids.pop(
request.request_id, None)
self.p2p_nccl_engine.recv_request_id_to_tensor_ids.pop(
request.request_id, None)
addr = 0
if isinstance(tensor, tuple):
addr, _, _ = tensor
self.p2p_nccl_engine.pool.free(addr)
def wait_for_layer_load(self, layer_name: str) -> None: def wait_for_layer_load(self, layer_name: str) -> None:
"""Blocking until the KV for a specific layer is loaded into vLLM's """Blocking until the KV for a specific layer is loaded into vLLM's
......
...@@ -287,18 +287,29 @@ def tbo_split_and_execute_model( ...@@ -287,18 +287,29 @@ def tbo_split_and_execute_model(
attn_metadata_left = prepare_tbo_atten_metadata(runner, input_split.scheduler_output_left, input_split.req_ids_left, 0) attn_metadata_left = prepare_tbo_atten_metadata(runner, input_split.scheduler_output_left, input_split.req_ids_left, 0)
attn_metadata_right = prepare_tbo_atten_metadata(runner, input_split.scheduler_output_right, input_split.req_ids_right, input_split.req_num_left) attn_metadata_right = prepare_tbo_atten_metadata(runner, input_split.scheduler_output_right, input_split.req_ids_right, input_split.req_num_left)
model_output = tbo_model_executable_v1( with set_forward_context(attn_metadata,
runner, runner.vllm_config,
attn_metadata_left, num_tokens=num_input_tokens,
attn_metadata_right, num_tokens_across_dp=num_tokens_across_dp,
num_input_tokens_left, skip_cuda_graphs=True):
num_input_tokens_right, runner.maybe_setup_kv_connector(scheduler_output)
num_tokens_across_dp,
input_ids, model_output = tbo_model_executable_v1(
positions, runner,
intermediate_tensors, attn_metadata_left,
inputs_embeds) attn_metadata_right,
finished_sending, finished_recving = None, None num_input_tokens_left,
num_input_tokens_right,
num_tokens_across_dp,
input_ids,
positions,
intermediate_tensors,
inputs_embeds)
runner.maybe_wait_for_kv_save()
finished_sending, finished_recving = (
runner.get_finished_kv_transfers(scheduler_output))
#finished_sending, finished_recving = None, None
else: else:
# Run the decoder. # Run the decoder.
# Use persistent buffers for CUDA graphs. # Use persistent buffers for CUDA graphs.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment