Commit 5a9c236d authored by zhuwenwen
Browse files

update version and deps

parent 539aa992
......@@ -83,7 +83,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
+ 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.6.1.post2;
- 执行 `python -c "import vllm; print(vllm.__version__)"` 查询该软件的版本号,版本号与官方版本同步,例如 0.6.2;
## Known Issue
-
......
......@@ -9,6 +9,7 @@ ray >= 2.10.0
peft
pytest-asyncio
tensorizer>=2.9.0
setuptools_scm
torch == 2.3.0
triton == 2.1.0
......
......@@ -399,7 +399,7 @@ def get_version_add(sha: Optional[str] = None) -> str:
try:
__version__ = "0.6.2"
__version_tuple__ = (0, 6, 2)
__dcu_version__ = f'0.6.2+{version}
__dcu_version__ = f'0.6.2+{version}'
from vllm.version import __version__, __version_tuple__, __dcu_version__
except Exception as e:
......
......@@ -572,9 +572,10 @@ class ROCmFlashAttentionImpl(AttentionImpl):
num_seqs, num_heads, head_size = decode_query.shape
block_size = value_cache.shape[3]
gqa_ratio = num_heads // self.num_kv_heads
use_custom = _use_rocm_custom_paged_attention(
decode_query.dtype, head_size, block_size, gqa_ratio,
decode_meta.max_decode_seq_len)
# use_custom = _use_rocm_custom_paged_attention(
# decode_query.dtype, head_size, block_size, gqa_ratio,
# decode_meta.max_decode_seq_len)
use_custom = False
if use_custom:
max_seq_len = decode_meta.max_decode_seq_len
max_num_partitions = (
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment