Commit cf16c82a authored by zhuwenwen
Browse files

Merge branch 'v0.9.2-dev' into v0.9.2-dev-ds

parents 50bed026 484fcfca
......@@ -55,11 +55,6 @@ class ReqMeta:
slot_mapping=slot_mapping,
)
self.parallel_config = vllm_config.parallel_config
self.model_config = vllm_config.model_config
self.total_num_hidden_layers = getattr(self.model_config.hf_text_config,
"num_hidden_layers", 0)
self.pp_size = self.parallel_config.pipeline_parallel_size
@dataclass
class P2pNcclConnectorMetadata(KVConnectorMetadata):
......@@ -100,6 +95,12 @@ class P2pNcclConnector(KVConnectorBase_V1):
hostname="",
port_offset=self._rank,
) if role == KVConnectorRole.WORKER else None
self.parallel_config = vllm_config.parallel_config
self.model_config = vllm_config.model_config
self.total_num_hidden_layers = getattr(self.model_config.hf_text_config,
"num_hidden_layers", 0)
self.pp_size = self.parallel_config.pipeline_parallel_size
# ==============================
# Worker-side methods
......
......@@ -63,7 +63,7 @@ class TensorMemoryPool:
than min_block_size
"""
def __init__(self, max_block_size: int, min_block_size: int = 512):
def __init__(self, max_block_size: int, min_block_size: int = 128):
if max_block_size <= 0 or min_block_size <= 0:
raise ValueError("Block sizes must be positive")
if max_block_size < min_block_size:
......
......@@ -279,6 +279,7 @@ class RocmPlatform(Platform):
logger.info_once("Using Flash Attention backend on V1 engine. (only supports block size 64)")
return FLASH_ATTN_V1
else:
os.environ['VLLM_USE_FLASH_ATTN_PA'] = '0'
logger.info_once("Using Triton backend on V1 engine.")
return TRITON_ATTN_VLLM_V1
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment