"tests/models/quantization/test_bitsandbytes.py" did not exist on "1a4f35e2eaa3ebdecb8ef9ff8302b01e289305c9"
Commit 50ed6d0a authored by zhuwenwen's avatar zhuwenwen
Browse files

update version

parent 0640f227
...@@ -82,7 +82,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install ...@@ -82,7 +82,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
+ 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/ + 若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证 ## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.5.5 - python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.6.0
## Known Issue ## Known Issue
- -
......
...@@ -7,6 +7,7 @@ import torch ...@@ -7,6 +7,7 @@ import torch
from vllm import _custom_ops as ops from vllm import _custom_ops as ops
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser, from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
create_kv_caches_with_random) create_kv_caches_with_random)
import vllm.envs as envs
NUM_BLOCKS = 1024 NUM_BLOCKS = 1024
PARTITION_SIZE = 512 PARTITION_SIZE = 512
...@@ -104,6 +105,24 @@ def main( ...@@ -104,6 +105,24 @@ def main(
for _ in range(num_iters): for _ in range(num_iters):
if version == "v1": if version == "v1":
if envs.VLLM_USE_OPT_OP:
ops.paged_attention_v1_opt(
output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
seq_lens,
block_size,
max_seq_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
)
else:
ops.paged_attention_v1( ops.paged_attention_v1(
output, output,
query, query,
...@@ -121,6 +140,7 @@ def main( ...@@ -121,6 +140,7 @@ def main(
v_scale, v_scale,
) )
elif version == "v2": elif version == "v2":
if envs.VLLM_USE_OPT_OP:
ops.paged_attention_v2( ops.paged_attention_v2(
output, output,
exp_sums, exp_sums,
...@@ -140,6 +160,26 @@ def main( ...@@ -140,6 +160,26 @@ def main(
k_scale, k_scale,
v_scale, v_scale,
) )
else:
ops.paged_attention_v2_opt(
output,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
seq_lens,
block_size,
max_seq_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
)
else: else:
raise ValueError(f"Invalid version: {version}") raise ValueError(f"Invalid version: {version}")
torch.cuda.synchronize() torch.cuda.synchronize()
......
...@@ -404,8 +404,8 @@ except Exception as e: ...@@ -404,8 +404,8 @@ except Exception as e:
stacklevel=2) stacklevel=2)
__commit__ = "COMMIT_HASH_PLACEHOLDER" __commit__ = "COMMIT_HASH_PLACEHOLDER"
__version__ = "0.5.5" __version__ = "0.6.0"
__dcu_version__ = f'0.5.5+{version}' __dcu_version__ = f'0.6.0+{version}'
""" """
......
...@@ -166,6 +166,24 @@ class LLMEngine: ...@@ -166,6 +166,24 @@ class LLMEngine:
@classmethod @classmethod
def validate_outputs( def validate_outputs(
cls,
outputs: GenericSequence[object],
output_type: Type[_O],
) -> List[_O]:
do_validate = cls.DO_VALIDATE_OUTPUT
outputs_: List[_O]
if TYPE_CHECKING or do_validate:
outputs_ = []
for output in outputs:
if not isinstance(output, output_type):
raise TypeError(f"Expected output of type {output_type}, "
f"but found type {type(output)}")
outputs_.append(output)
else:
outputs_ = outputs
return outputs_ return outputs_
tokenizer: Optional[BaseTokenizerGroup] tokenizer: Optional[BaseTokenizerGroup]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment