Commit cf16c82a authored by zhuwenwen
Browse files

Merge branch 'v0.9.2-dev' into v0.9.2-dev-ds

parents 50bed026 484fcfca
...@@ -55,11 +55,6 @@ class ReqMeta: ...@@ -55,11 +55,6 @@ class ReqMeta:
slot_mapping=slot_mapping, slot_mapping=slot_mapping,
) )
self.parallel_config = vllm_config.parallel_config
self.model_config = vllm_config.model_config
self.total_num_hidden_layers = getattr(self.model_config.hf_text_config,
"num_hidden_layers", 0)
self.pp_size = self.parallel_config.pipeline_parallel_size
@dataclass @dataclass
class P2pNcclConnectorMetadata(KVConnectorMetadata): class P2pNcclConnectorMetadata(KVConnectorMetadata):
...@@ -100,6 +95,12 @@ class P2pNcclConnector(KVConnectorBase_V1): ...@@ -100,6 +95,12 @@ class P2pNcclConnector(KVConnectorBase_V1):
hostname="", hostname="",
port_offset=self._rank, port_offset=self._rank,
) if role == KVConnectorRole.WORKER else None ) if role == KVConnectorRole.WORKER else None
self.parallel_config = vllm_config.parallel_config
self.model_config = vllm_config.model_config
self.total_num_hidden_layers = getattr(self.model_config.hf_text_config,
"num_hidden_layers", 0)
self.pp_size = self.parallel_config.pipeline_parallel_size
# ============================== # ==============================
# Worker-side methods # Worker-side methods
......
...@@ -63,7 +63,7 @@ class TensorMemoryPool: ...@@ -63,7 +63,7 @@ class TensorMemoryPool:
than min_block_size than min_block_size
""" """
def __init__(self, max_block_size: int, min_block_size: int = 512): def __init__(self, max_block_size: int, min_block_size: int = 128):
if max_block_size <= 0 or min_block_size <= 0: if max_block_size <= 0 or min_block_size <= 0:
raise ValueError("Block sizes must be positive") raise ValueError("Block sizes must be positive")
if max_block_size < min_block_size: if max_block_size < min_block_size:
......
...@@ -279,6 +279,7 @@ class RocmPlatform(Platform): ...@@ -279,6 +279,7 @@ class RocmPlatform(Platform):
logger.info_once("Using Flash Attention backend on V1 engine. (only supports block size 64)") logger.info_once("Using Flash Attention backend on V1 engine. (only supports block size 64)")
return FLASH_ATTN_V1 return FLASH_ATTN_V1
else: else:
os.environ['VLLM_USE_FLASH_ATTN_PA'] = '0'
logger.info_once("Using Triton backend on V1 engine.") logger.info_once("Using Triton backend on V1 engine.")
return TRITON_ATTN_VLLM_V1 return TRITON_ATTN_VLLM_V1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment