Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
50ed6d0a
Commit
50ed6d0a
authored
Sep 09, 2024
by
zhuwenwen
Browse files
update version
parent
0640f227
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
64 additions
and
6 deletions
+64
-6
README.md
README.md
+1
-1
benchmarks/kernels/benchmark_paged_attention.py
benchmarks/kernels/benchmark_paged_attention.py
+42
-2
setup.py
setup.py
+2
-2
vllm/engine/llm_engine.py
vllm/engine/llm_engine.py
+19
-1
No files found.
README.md
View file @
50ed6d0a
...
...
@@ -82,7 +82,7 @@ VLLM_INSTALL_PUNICA_KERNELS=1 python3 setup.py install
+
若使用 pip install 下载安装过慢,可添加源:-i https://pypi.tuna.tsinghua.edu.cn/simple/
## 验证
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.5.5;
- python -c "import vllm; print(vllm.\_\_version__)",版本号与官方版本同步,查询该软件的版本号,例如0.6.0;
## Known Issue
-
无
...
...
benchmarks/kernels/benchmark_paged_attention.py
View file @
50ed6d0a
...
...
@@ -7,6 +7,7 @@ import torch
from
vllm
import
_custom_ops
as
ops
from
vllm.utils
import
(
STR_DTYPE_TO_TORCH_DTYPE
,
FlexibleArgumentParser
,
create_kv_caches_with_random
)
import
vllm.envs
as
envs
NUM_BLOCKS
=
1024
PARTITION_SIZE
=
512
...
...
@@ -104,6 +105,24 @@ def main(
for
_
in
range
(
num_iters
):
if
version
==
"v1"
:
if
envs
.
VLLM_USE_OPT_OP
:
ops
.
paged_attention_v1_opt
(
output
,
query
,
key_cache
,
value_cache
,
num_kv_heads
,
scale
,
block_tables
,
seq_lens
,
block_size
,
max_seq_len
,
alibi_slopes
,
kv_cache_dtype
,
k_scale
,
v_scale
,
)
else
:
ops
.
paged_attention_v1
(
output
,
query
,
...
...
@@ -121,6 +140,7 @@ def main(
v_scale
,
)
elif
version
==
"v2"
:
if
envs
.
VLLM_USE_OPT_OP
:
ops
.
paged_attention_v2
(
output
,
exp_sums
,
...
...
@@ -140,6 +160,26 @@ def main(
k_scale
,
v_scale
,
)
else
:
ops
.
paged_attention_v2_opt
(
output
,
exp_sums
,
max_logits
,
tmp_output
,
query
,
key_cache
,
value_cache
,
num_kv_heads
,
scale
,
block_tables
,
seq_lens
,
block_size
,
max_seq_len
,
alibi_slopes
,
kv_cache_dtype
,
k_scale
,
v_scale
,
)
else
:
raise
ValueError
(
f
"Invalid version:
{
version
}
"
)
torch
.
cuda
.
synchronize
()
...
...
setup.py
View file @
50ed6d0a
...
...
@@ -404,8 +404,8 @@ except Exception as e:
stacklevel=2)
__commit__ = "COMMIT_HASH_PLACEHOLDER"
__version__ = "0.
5.5
"
__dcu_version__ = f'0.
5.5
+
{
version
}
'
__version__ = "0.
6.0
"
__dcu_version__ = f'0.
6.0
+
{
version
}
'
"""
...
...
vllm/engine/llm_engine.py
View file @
50ed6d0a
...
...
@@ -166,6 +166,24 @@ class LLMEngine:
@classmethod
def validate_outputs(
    cls,
    outputs: GenericSequence[object],
    output_type: Type[_O],
) -> List[_O]:
    """Return ``outputs`` as a list of ``output_type`` items.

    Per-item checking runs when ``TYPE_CHECKING`` is true or the class
    attribute ``DO_VALIDATE_OUTPUT`` is truthy. Otherwise the incoming
    sequence object is returned as-is, without copying or inspection.

    Args:
        outputs: The sequence of objects to validate.
        output_type: The type every element is expected to be an
            instance of.

    Returns:
        A freshly built list of the elements when checking runs, or the
        original ``outputs`` object when it does not.

    Raises:
        TypeError: If checking runs and an element is not an instance
            of ``output_type``; raised at the first offending element.
    """
    # Read the class-level switch once, before deciding which path to take.
    validation_enabled = cls.DO_VALIDATE_OUTPUT

    if not (TYPE_CHECKING or validation_enabled):
        # Fast path: hand the caller's sequence back untouched.
        return outputs

    checked: List[_O] = []
    for item in outputs:
        if isinstance(item, output_type):
            checked.append(item)
            continue
        raise TypeError(f"Expected output of type {output_type}, "
                        f"but found type {type(item)}")
    return checked
tokenizer
:
Optional
[
BaseTokenizerGroup
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment