Commit fc5bfc66 authored by maxiao1's avatar maxiao1 Committed by lizhigong
Browse files

keep same version

parent 12291212
...@@ -414,10 +414,10 @@ def unified_attention( ...@@ -414,10 +414,10 @@ def unified_attention(
output = self.impl.forward(self, query, key, value, kv_cache, output = self.impl.forward(self, query, key, value, kv_cache,
attn_metadata) attn_metadata)
# if envs.VLLM_ENABLE_TBO: if envs.VLLM_ENABLE_TBO:
# tbo_maybe_save_kv_layer_to_connector(layer_name, kv_cache) tbo_maybe_save_kv_layer_to_connector(layer_name, kv_cache)
# else: else:
maybe_save_kv_layer_to_connector(layer_name, kv_cache) maybe_save_kv_layer_to_connector(layer_name, kv_cache)
return output return output
...@@ -462,10 +462,10 @@ def unified_attention_with_output( ...@@ -462,10 +462,10 @@ def unified_attention_with_output(
attn_metadata, attn_metadata,
output=output, output=output,
output_scale=output_scale) output_scale=output_scale)
# if envs.VLLM_ENABLE_TBO: if envs.VLLM_ENABLE_TBO:
# tbo_maybe_save_kv_layer_to_connector(layer_name, kv_cache) tbo_maybe_save_kv_layer_to_connector(layer_name, kv_cache)
# else: else:
maybe_save_kv_layer_to_connector(layer_name, kv_cache) maybe_save_kv_layer_to_connector(layer_name, kv_cache)
def unified_attention_with_output_fake( def unified_attention_with_output_fake(
......
...@@ -18,7 +18,7 @@ from vllm.forward_context import get_forward_context ...@@ -18,7 +18,7 @@ from vllm.forward_context import get_forward_context
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.v1.attention.backends.mla.common import MLACommonMetadata from vllm.v1.attention.backends.mla.common import MLACommonMetadata
from vllm.v1.core.sched.output import SchedulerOutput from vllm.v1.core.sched.output import SchedulerOutput
from vllm.two_batch_overlap.v1.two_batch_overlap_v1 import tbo_get_done_event
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.backends.abstract import AttentionMetadata
from vllm.forward_context import ForwardContext from vllm.forward_context import ForwardContext
...@@ -274,8 +274,8 @@ class P2pNcclConnector(KVConnectorBase_V1): ...@@ -274,8 +274,8 @@ class P2pNcclConnector(KVConnectorBase_V1):
Assume the shape of the layer is (2, num_pages, page_size, xxx) Assume the shape of the layer is (2, num_pages, page_size, xxx)
if MLA is not used, and (num_pages, page_size, xxx) otherwise. if MLA is not used, and (num_pages, page_size, xxx) otherwise.
""" """
if envs.VLLM_ENABLE_TBO: # if envs.VLLM_ENABLE_TBO:
slot_mapping = slot_mapping.pin_memory().to(device=layer.device, non_blocking=True) # slot_mapping = slot_mapping.pin_memory().to(device=layer.device, non_blocking=True)
if isinstance(attn_metadata, MLACommonMetadata): if isinstance(attn_metadata, MLACommonMetadata):
num_pages, page_size = layer.shape[0], layer.shape[1] num_pages, page_size = layer.shape[0], layer.shape[1]
return layer.reshape(num_pages * page_size, -1)[slot_mapping, return layer.reshape(num_pages * page_size, -1)[slot_mapping,
...@@ -286,34 +286,73 @@ class P2pNcclConnector(KVConnectorBase_V1): ...@@ -286,34 +286,73 @@ class P2pNcclConnector(KVConnectorBase_V1):
connector_metadata = self._get_connector_metadata() connector_metadata = self._get_connector_metadata()
assert isinstance(connector_metadata, P2pNcclConnectorMetadata) assert isinstance(connector_metadata, P2pNcclConnectorMetadata)
for request in connector_metadata.requests:
request_id = request.request_id if envs.VLLM_ENABLE_TBO:
ip, port = self.parse_request_id(request_id, True) send_stream = self.p2p_nccl_engine.send_stream
remote_address = ip + ":" + str(port + self._rank) for request in connector_metadata.requests:
kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping) request_id = request.request_id
ip, port = self.parse_request_id(request_id, True)
pp_rank = (self.parallel_config.rank // self.parallel_config.tensor_parallel_size remote_address = ip + ":" + str(port + self._rank)
) % self.parallel_config.pipeline_parallel_size
if (self.pp_size == 1): kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping)
self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, # tbo_evt = torch.cuda.Event(enable_timing=False)
kv_cache, remote_address) # tbo_evt.record()
elif (self.pp_size == 2): # with torch.cuda.stream(send_stream):
if (pp_rank == 0): # send_stream.wait_event(tbo_evt) # 等 TBO all_reduce_stream 完成本轮
self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, # kv_cache.record_stream(send_stream)
kv_cache, remote_address)
pp_rank = (self.parallel_config.rank //
self.parallel_config.tensor_parallel_size) % \
self.parallel_config.pipeline_parallel_size
if (self.pp_size == 1):
self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name,
kv_cache, ip + ":" + str(port + self._rank + 4)) kv_cache, remote_address)
elif (self.pp_size == 2):
if (pp_rank == 0):
self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name,
kv_cache, remote_address)
self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name,
kv_cache, ip + ":" + str(port + self._rank + 4))
else:
self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name,
kv_cache, remote_address)
self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name,
kv_cache, ip + ":" + str(port + self._rank - 4))
elif (self.pp_size == 8):
for i in range(8):
self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name,
kv_cache, ip + ":" + str(port + i))
else: else:
print("Error: only suppprt pp1 pp2 pp8!!!!!!")
else:
for request in connector_metadata.requests:
request_id = request.request_id
ip, port = self.parse_request_id(request_id, True)
remote_address = ip + ":" + str(port + self._rank)
kv_cache = extract_kv_from_layer(kv_layer, request.slot_mapping)
pp_rank = (self.parallel_config.rank // self.parallel_config.tensor_parallel_size
) % self.parallel_config.pipeline_parallel_size
if (self.pp_size == 1):
self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name,
kv_cache, remote_address) kv_cache, remote_address)
self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, elif (self.pp_size == 2):
kv_cache, ip + ":" + str(port + self._rank - 4)) if (pp_rank == 0):
elif (self.pp_size == 8): self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name,
for i in range(8): kv_cache, remote_address)
self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name, self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name,
kv_cache, ip + ":" + str(port + i)) kv_cache, ip + ":" + str(port + self._rank + 4))
else: else:
print("Error: only suppprt pp1 pp2 pp8!!!!!!") self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name,
kv_cache, remote_address)
self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name,
kv_cache, ip + ":" + str(port + self._rank - 4))
elif (self.pp_size == 8):
for i in range(8):
self.p2p_nccl_engine.send_tensor(request_id + "#" + layer_name,
kv_cache, ip + ":" + str(port + i))
else:
print("Error: only suppprt pp1 pp2 pp8!!!!!!")
def wait_for_save(self): def wait_for_save(self):
pass pass
......
...@@ -20,6 +20,7 @@ from vllm.distributed.device_communicators.pynccl_wrapper import ( ...@@ -20,6 +20,7 @@ from vllm.distributed.device_communicators.pynccl_wrapper import (
from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import ( # noqa: E501 from vllm.distributed.kv_transfer.kv_connector.v1.p2p.tensor_memory_pool import ( # noqa: E501
TensorMemoryPool) TensorMemoryPool)
from vllm.utils import current_stream, get_ip from vllm.utils import current_stream, get_ip
from vllm.two_batch_overlap.v1.two_batch_overlap_v1 import all_reduce_stream as tbo_all_reduce_stream
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.forward_context import ForwardContext from vllm.forward_context import ForwardContext
...@@ -110,6 +111,7 @@ class P2pNcclEngine: ...@@ -110,6 +111,7 @@ class P2pNcclEngine:
self.recv_store_cv = threading.Condition() self.recv_store_cv = threading.Condition()
self.send_stream = torch.cuda.Stream() self.send_stream = torch.cuda.Stream()
# self.send_stream = tbo_all_reduce_stream
self.recv_stream = torch.cuda.Stream() self.recv_stream = torch.cuda.Stream()
mem_pool_size_gb = self.config.get_from_extra_config( mem_pool_size_gb = self.config.get_from_extra_config(
......
...@@ -159,7 +159,9 @@ def prepare_tbo_atten_metadata( ...@@ -159,7 +159,9 @@ def prepare_tbo_atten_metadata(
# The block_table for RIGHT starts from (req_offset-1). # The block_table for RIGHT starts from (req_offset-1).
# Align both offsets to that, and re-build the seq_lens for row-0. # Align both offsets to that, and re-build the seq_lens for row-0.
seq_len_offset = req_offset - 1 seq_len_offset = req_offset - 1
query_start_offset = req_offset - 1
# query_start_offset = req_offset - 1
query_start_offset = req_offset
# row-0 is the split request (global row index = req_offset-1): # row-0 is the split request (global row index = req_offset-1):
base_hist = runner.input_batch.num_computed_tokens_cpu[req_offset - 1].item() base_hist = runner.input_batch.num_computed_tokens_cpu[req_offset - 1].item()
...@@ -180,7 +182,8 @@ def prepare_tbo_atten_metadata( ...@@ -180,7 +182,8 @@ def prepare_tbo_atten_metadata(
else: else:
# RIGHT without split-in-req: natural positions # RIGHT without split-in-req: natural positions
seq_len_offset = req_offset seq_len_offset = req_offset
query_start_offset = req_offset # query_start_offset = req_offset
query_start_offset = req_offset + 1
seq_lens_cpu_local = torch.as_tensor(default_seq_lens, device=runner.seq_lens_cpu.device) seq_lens_cpu_local = torch.as_tensor(default_seq_lens, device=runner.seq_lens_cpu.device)
# Copy query_start_loc into global GPU buffer window # Copy query_start_loc into global GPU buffer window
...@@ -201,8 +204,10 @@ def prepare_tbo_atten_metadata( ...@@ -201,8 +204,10 @@ def prepare_tbo_atten_metadata(
runner.seq_lens[seq_len_offset + num_reqs:].fill_(0) runner.seq_lens[seq_len_offset + num_reqs:].fill_(0)
# Build common metadata (pass CLONES to avoid aliasing between threads) # Build common metadata (pass CLONES to avoid aliasing between threads)
query_start_loc = runner.query_start_loc[query_start_offset: query_start_offset + num_reqs + 1].clone() # query_start_loc = runner.query_start_loc[query_start_offset: query_start_offset + num_reqs + 1].clone()
seq_lens = runner.seq_lens[seq_len_offset : seq_len_offset + num_reqs].clone() # seq_lens = runner.seq_lens[seq_len_offset : seq_len_offset + num_reqs].clone()
query_start_loc = runner.query_start_loc[query_start_offset: query_start_offset + num_reqs + 1]
seq_lens = runner.seq_lens[seq_len_offset : seq_len_offset + num_reqs]
common_attn_metadata = CommonAttentionMetadata( common_attn_metadata = CommonAttentionMetadata(
query_start_loc=query_start_loc, query_start_loc=query_start_loc,
seq_lens=seq_lens, seq_lens=seq_lens,
...@@ -319,20 +324,20 @@ def tbo_split_and_execute_model( ...@@ -319,20 +324,20 @@ def tbo_split_and_execute_model(
# === Added: split inputs_embeds & intermediate_tensors per half; setup KV connector === # === Added: split inputs_embeds & intermediate_tensors per half; setup KV connector ===
# 真实 token # 真实 token
real_L = int(input_split.scheduler_output_left.total_num_scheduled_tokens) num_tokens_left = int(input_split.scheduler_output_left.total_num_scheduled_tokens)
real_R = int(input_split.scheduler_output_right.total_num_scheduled_tokens) num_tokens_right = int(input_split.scheduler_output_right.total_num_scheduled_tokens)
# 按左右半批切成两份 # 按左右半批切成两份
def _split_it(it, l, r): def _split_intermediate_tensors(it, l, r):
if it is None: return None, None if it is None: return None, None
lm, rm = {}, {} left_tensor_map, right_tensor_map = {}, {}
for k, v in it.tensors.items(): for name, tensor in it.tensors.items():
vl, vr = torch.split(v[:l + r], [l, r], dim=0) vl, vr = torch.split(tensor[:l + r], [l, r], dim=0)
lm[k], rm[k] = vl, vr left_tensor_map[name], right_tensor_map[name] = vl, vr
return IntermediateTensors(lm), IntermediateTensors(rm) return IntermediateTensors(left_tensor_map), IntermediateTensors(right_tensor_map)
intermediate_tensors_left, intermediate_tensors_right = _split_it( intermediate_tensors_left, intermediate_tensors_right = _split_intermediate_tensors(
intermediate_tensors, real_L, real_R intermediate_tensors, num_tokens_left, num_tokens_right
) )
with set_forward_context(attn_metadata, with set_forward_context(attn_metadata,
......
...@@ -220,6 +220,7 @@ def tbo_all_reduce_v1(obj): ...@@ -220,6 +220,7 @@ def tbo_all_reduce_v1(obj):
all_reduce_stream.wait_event(event_c2t) all_reduce_stream.wait_event(event_c2t)
output = tensor_model_parallel_all_reduce(obj) output = tensor_model_parallel_all_reduce(obj)
event_t2c.record() event_t2c.record()
#tbo_mark_allreduce_done()
tbo_obj_v1.tbo_thread_synchronize(tid) tbo_obj_v1.tbo_thread_synchronize(tid)
tbo_step_stream.wait_event(event_t2c) tbo_step_stream.wait_event(event_t2c)
return output return output
...@@ -280,10 +281,22 @@ def tbo_model_executable_v1( ...@@ -280,10 +281,22 @@ def tbo_model_executable_v1(
return hidden_or_intermediate_states return hidden_or_intermediate_states
_tbo_done_event = torch.cuda.Event(enable_timing=False)
def tbo_mark_allreduce_done():
"""Record completion of all_reduce_stream for external synchronization."""
global all_reduce_stream, _tbo_done_event
_tbo_done_event.record(all_reduce_stream)
def tbo_get_done_event():
"""Return the event recorded by all_reduce_stream."""
return _tbo_done_event
def finalize_two_batch_overlap(): def finalize_two_batch_overlap():
global tbo_obj_v1 global tbo_obj_v1
if tbo_obj_v1 is not None: if tbo_obj_v1 is not None:
try: try:
tbo_obj_v1.shutdown() tbo_obj_v1.shutdown()
finally: finally:
tbo_obj_v1 = None tbo_obj_v1 = None
\ No newline at end of file
...@@ -69,7 +69,7 @@ from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch ...@@ -69,7 +69,7 @@ from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.two_batch_overlap.v1.model_input_split_v1 import tbo_split_and_execute_model from vllm.two_batch_overlap.v1.model_input_split_v1 import tbo_split_and_execute_model
from vllm.profiler.prof import profile
from ..sample.logits_processor import LogitsProcessorManager from ..sample.logits_processor import LogitsProcessorManager
from .utils import (gather_mm_placeholders, initialize_kv_cache_for_kv_sharing, from .utils import (gather_mm_placeholders, initialize_kv_cache_for_kv_sharing,
sanity_check_mm_encoder_outputs, scatter_mm_placeholders) sanity_check_mm_encoder_outputs, scatter_mm_placeholders)
...@@ -1295,6 +1295,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1295,6 +1295,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
scheduler_output: "SchedulerOutput", scheduler_output: "SchedulerOutput",
intermediate_tensors: Optional[IntermediateTensors] = None, intermediate_tensors: Optional[IntermediateTensors] = None,
) -> Union[ModelRunnerOutput, IntermediateTensors]: ) -> Union[ModelRunnerOutput, IntermediateTensors]:
profile.StartTracer()
self._update_states(scheduler_output) self._update_states(scheduler_output)
if not scheduler_output.total_num_scheduled_tokens: if not scheduler_output.total_num_scheduled_tokens:
if not has_kv_transfer_group(): if not has_kv_transfer_group():
...@@ -1573,7 +1574,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): ...@@ -1573,7 +1574,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
get_kv_transfer_group().clear_connector_metadata() get_kv_transfer_group().clear_connector_metadata()
self.eplb_step() self.eplb_step()
print('###valid_sampled_token_ids', valid_sampled_token_ids)
return ModelRunnerOutput( return ModelRunnerOutput(
req_ids=self.input_batch.req_ids, req_ids=self.input_batch.req_ids,
req_id_to_index=self.input_batch.req_id_to_index, req_id_to_index=self.input_batch.req_id_to_index,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
try: try:
from ._version import __version__, __version_tuple__ __version__ = "0.9.2"
__version_tuple__ = (0, 9, 2)
__hcu_version__ = f'0.9.2+das.opt1.rc2.dtk25041'
from vllm.version import __version__, __version_tuple__, __hcu_version__
except Exception as e: except Exception as e:
import warnings import warnings
warnings.warn(f"Failed to read commit hash:\n{e}", warnings.warn(f"Failed to read commit hash:\n + str(e)",
RuntimeWarning, RuntimeWarning,
stacklevel=2) stacklevel=2)
__version__ = "dev" __version__ = "dev"
__version_tuple__ = (0, 0, __version__) __version_tuple__ = (0, 0, __version__)
def _prev_minor_version_was(version_str): def _prev_minor_version_was(version_str):
"""Check whether a given version matches the previous minor version. '''Check whether a given version matches the previous minor version.
Return True if version_str matches the previous minor version. Return True if version_str matches the previous minor version.
...@@ -23,19 +24,19 @@ def _prev_minor_version_was(version_str): ...@@ -23,19 +24,19 @@ def _prev_minor_version_was(version_str):
supplied version_str is '0.6'. supplied version_str is '0.6'.
Used for --show-hidden-metrics-for-version. Used for --show-hidden-metrics-for-version.
""" '''
# Match anything if this is a dev tree # Match anything if this is a dev tree
if __version_tuple__[0:2] == (0, 0): if __version_tuple__[0:2] == (0, 0):
return True return True
# Note - this won't do the right thing when we release 1.0! # Note - this won't do the right thing when we release 1.0!
assert __version_tuple__[0] == 0 # assert __version_tuple__[0] == 0
assert isinstance(__version_tuple__[1], int) assert isinstance(__version_tuple__[1], int)
return version_str == f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}" return version_str == f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}"
def _prev_minor_version(): def _prev_minor_version():
"""For the purpose of testing, return a previous minor version number.""" '''For the purpose of testing, return a previous minor version number.'''
# In dev tree, this will return "0.-1", but that will work fine" # In dev tree, this will return "0.-1", but that will work fine"
assert isinstance(__version_tuple__[1], int) assert isinstance(__version_tuple__[1], int)
return f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}" return f"{__version_tuple__[0]}.{__version_tuple__[1] - 1}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment