Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
21063c11
Unverified
Commit
21063c11
authored
Nov 06, 2024
by
Aaron Pham
Committed by
GitHub
Nov 06, 2024
Browse files
[CI/Build] drop support for Python 3.8 EOL (#8464)
Signed-off-by:
Aaron Pham
<
contact@aarnphm.xyz
>
parent
4be3a451
Changes
115
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
105 additions
and
130 deletions
+105
-130
.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
...ly-benchmarks/scripts/convert-results-json-to-markdown.py
+5
-5
.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
...e/nightly-benchmarks/scripts/generate-nightly-markdown.py
+2
-2
.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
...ite/nightly-benchmarks/scripts/summary-nightly-results.py
+2
-2
.github/workflows/mypy.yaml
.github/workflows/mypy.yaml
+1
-1
.github/workflows/publish.yml
.github/workflows/publish.yml
+1
-1
.github/workflows/ruff.yml
.github/workflows/ruff.yml
+16
-16
.github/workflows/yapf.yml
.github/workflows/yapf.yml
+13
-13
.readthedocs.yaml
.readthedocs.yaml
+5
-6
CMakeLists.txt
CMakeLists.txt
+18
-18
benchmarks/backend_request_func.py
benchmarks/backend_request_func.py
+7
-15
benchmarks/kernels/benchmark_machete.py
benchmarks/kernels/benchmark_machete.py
+3
-3
csrc/quantization/machete/generate.py
csrc/quantization/machete/generate.py
+4
-4
docs/source/getting_started/installation.rst
docs/source/getting_started/installation.rst
+5
-5
pyproject.toml
pyproject.toml
+2
-2
setup.py
setup.py
+4
-5
tests/compile/piecewise/test_toy_llama.py
tests/compile/piecewise/test_toy_llama.py
+2
-2
tests/conftest.py
tests/conftest.py
+8
-21
tests/core/block/test_prefix_caching_block.py
tests/core/block/test_prefix_caching_block.py
+5
-7
tests/kernels/test_mamba_ssm.py
tests/kernels/test_mamba_ssm.py
+1
-1
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
...der_only/vision_language/mm_processor_kwargs/test_qwen.py
+1
-1
No files found.
.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
View file @
21063c11
...
...
@@ -56,7 +56,7 @@ serving_column_mapping = {
def
read_markdown
(
file
):
if
os
.
path
.
exists
(
file
):
with
open
(
file
,
"r"
)
as
f
:
with
open
(
file
)
as
f
:
return
f
.
read
()
+
"
\n
"
else
:
return
f
"
{
file
}
not found.
\n
"
...
...
@@ -75,14 +75,14 @@ if __name__ == "__main__":
# collect results
for
test_file
in
results_folder
.
glob
(
"*.json"
):
with
open
(
test_file
,
"r"
)
as
f
:
with
open
(
test_file
)
as
f
:
raw_result
=
json
.
loads
(
f
.
read
())
if
"serving"
in
str
(
test_file
):
# this result is generated via `benchmark_serving.py`
# attach the benchmarking command to raw_result
with
open
(
test_file
.
with_suffix
(
".commands"
)
,
"r"
)
as
f
:
with
open
(
test_file
.
with_suffix
(
".commands"
))
as
f
:
command
=
json
.
loads
(
f
.
read
())
raw_result
.
update
(
command
)
...
...
@@ -97,7 +97,7 @@ if __name__ == "__main__":
# this result is generated via `benchmark_latency.py`
# attach the benchmarking command to raw_result
with
open
(
test_file
.
with_suffix
(
".commands"
)
,
"r"
)
as
f
:
with
open
(
test_file
.
with_suffix
(
".commands"
))
as
f
:
command
=
json
.
loads
(
f
.
read
())
raw_result
.
update
(
command
)
...
...
@@ -119,7 +119,7 @@ if __name__ == "__main__":
# this result is generated via `benchmark_throughput.py`
# attach the benchmarking command to raw_result
with
open
(
test_file
.
with_suffix
(
".commands"
)
,
"r"
)
as
f
:
with
open
(
test_file
.
with_suffix
(
".commands"
))
as
f
:
command
=
json
.
loads
(
f
.
read
())
raw_result
.
update
(
command
)
...
...
.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
View file @
21063c11
...
...
@@ -72,7 +72,7 @@ def main(args):
# collect results
for
test_file
in
results_folder
.
glob
(
"*_nightly_results.json"
):
with
open
(
test_file
,
"r"
)
as
f
:
with
open
(
test_file
)
as
f
:
results
=
results
+
json
.
loads
(
f
.
read
())
# generate markdown table
...
...
@@ -80,7 +80,7 @@ def main(args):
md_table
=
tabulate
(
df
,
headers
=
'keys'
,
tablefmt
=
'pipe'
,
showindex
=
False
)
with
open
(
args
.
description
,
"r"
)
as
f
:
with
open
(
args
.
description
)
as
f
:
description
=
f
.
read
()
description
=
description
.
format
(
...
...
.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
View file @
21063c11
...
...
@@ -36,11 +36,11 @@ if __name__ == "__main__":
# collect results
for
test_file
in
results_folder
.
glob
(
"*.json"
):
with
open
(
test_file
,
"r"
)
as
f
:
with
open
(
test_file
)
as
f
:
raw_result
=
json
.
loads
(
f
.
read
())
# attach the benchmarking command to raw_result
with
open
(
test_file
.
with_suffix
(
".commands"
)
,
"r"
)
as
f
:
with
open
(
test_file
.
with_suffix
(
".commands"
))
as
f
:
command
=
json
.
loads
(
f
.
read
())
raw_result
.
update
(
command
)
...
...
.github/workflows/mypy.yaml
View file @
21063c11
...
...
@@ -25,7 +25,7 @@ jobs:
runs-on
:
ubuntu-latest
strategy
:
matrix
:
python-version
:
[
"
3.8"
,
"
3.9"
,
"
3.10"
,
"
3.11"
,
"
3.12"
]
python-version
:
[
"
3.9"
,
"
3.10"
,
"
3.11"
,
"
3.12"
]
steps
:
-
uses
:
actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871
# v4.2.1
-
name
:
Set up Python ${{ matrix.python-version }}
...
...
.github/workflows/publish.yml
View file @
21063c11
...
...
@@ -48,7 +48,7 @@ jobs:
fail-fast
:
false
matrix
:
os
:
[
'
ubuntu-20.04'
]
python-version
:
[
'
3.8'
,
'
3.9'
,
'
3.10'
,
'
3.11'
,
'
3.12'
]
python-version
:
[
'
3.9'
,
'
3.10'
,
'
3.11'
,
'
3.12'
]
pytorch-version
:
[
'
2.4.0'
]
# Must be the most recent version that meets requirements-cuda.txt.
cuda-version
:
[
'
11.8'
,
'
12.1'
]
...
...
.github/workflows/ruff.yml
View file @
21063c11
.github/workflows/yapf.yml
View file @
21063c11
.readthedocs.yaml
View file @
21063c11
...
...
@@ -6,7 +6,7 @@ version: 2
build
:
os
:
ubuntu-22.04
tools
:
python
:
"
3.
8"
python
:
'
3.
9'
sphinx
:
configuration
:
docs/source/conf.py
...
...
@@ -19,4 +19,3 @@ formats: []
python
:
install
:
-
requirements
:
docs/requirements-docs.txt
CMakeLists.txt
View file @
21063c11
benchmarks/backend_request_func.py
View file @
21063c11
...
...
@@ -79,7 +79,7 @@ async def async_request_tgi(
# any data, we should skip it.
if
chunk_bytes
.
startswith
(
":"
):
continue
chunk
=
remove
_
prefix
(
chunk_bytes
,
"data:"
)
chunk
=
chunk_bytes
.
removeprefix
(
"data:"
)
data
=
json
.
loads
(
chunk
)
timestamp
=
time
.
perf_counter
()
...
...
@@ -144,7 +144,7 @@ async def async_request_trt_llm(
if
not
chunk_bytes
:
continue
chunk
=
remove_prefix
(
chunk_bytes
.
decode
(
"utf-8"
)
,
chunk
=
chunk_bytes
.
decode
(
"utf-8"
)
.
removeprefix
(
"data:"
)
data
=
json
.
loads
(
chunk
)
...
...
@@ -261,7 +261,7 @@ async def async_request_openai_completions(
if
not
chunk_bytes
:
continue
chunk
=
remove_prefix
(
chunk_bytes
.
decode
(
"utf-8"
)
,
chunk
=
chunk_bytes
.
decode
(
"utf-8"
)
.
removeprefix
(
"data: "
)
if
chunk
==
"[DONE]"
:
latency
=
time
.
perf_counter
()
-
st
...
...
@@ -349,7 +349,7 @@ async def async_request_openai_chat_completions(
if
not
chunk_bytes
:
continue
chunk
=
remove_prefix
(
chunk_bytes
.
decode
(
"utf-8"
)
,
chunk
=
chunk_bytes
.
decode
(
"utf-8"
)
.
removeprefix
(
"data: "
)
if
chunk
==
"[DONE]"
:
latency
=
time
.
perf_counter
()
-
st
...
...
@@ -389,14 +389,6 @@ async def async_request_openai_chat_completions(
return
output
# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
# introduced in Python 3.9
def
remove_prefix
(
text
:
str
,
prefix
:
str
)
->
str
:
if
text
.
startswith
(
prefix
):
return
text
[
len
(
prefix
):]
return
text
def
get_model
(
pretrained_model_name_or_path
:
str
)
->
str
:
if
os
.
getenv
(
'VLLM_USE_MODELSCOPE'
,
'False'
).
lower
()
==
'true'
:
from
modelscope
import
snapshot_download
...
...
benchmarks/kernels/benchmark_machete.py
View file @
21063c11
...
...
@@ -269,10 +269,10 @@ def run_square_bench(args):
def
run_range_bench
(
args
):
m_start
,
k_start
,
n_start
=
[
int
(
x
)
for
x
in
args
.
dim_start
.
split
(
","
)
]
m_end
,
k_end
,
n_end
=
[
int
(
x
)
for
x
in
args
.
dim_end
.
split
(
","
)
]
m_start
,
k_start
,
n_start
=
(
int
(
x
)
for
x
in
args
.
dim_start
.
split
(
","
)
)
m_end
,
k_end
,
n_end
=
(
int
(
x
)
for
x
in
args
.
dim_end
.
split
(
","
)
)
m_increment
,
k_increment
,
n_increment
=
\
[
int
(
x
)
for
x
in
args
.
dim_increment
.
split
(
","
)
]
(
int
(
x
)
for
x
in
args
.
dim_increment
.
split
(
","
)
)
Ms
=
list
(
range
(
m_start
,
m_end
+
1
,
m_increment
))
Ks
=
list
(
range
(
k_start
,
k_end
+
1
,
k_increment
))
Ns
=
list
(
range
(
n_start
,
n_end
+
1
,
n_increment
))
...
...
csrc/quantization/machete/generate.py
View file @
21063c11
...
...
@@ -468,7 +468,7 @@ def generate():
impl_configs
=
[]
GPTQ_kernel_type_configs
=
list
(
(
TypeConfig
(
TypeConfig
(
element_a
=
element_a
,
element_b
=
element_b
,
element_b_scale
=
element_a
,
...
...
@@ -476,7 +476,7 @@ def generate():
element_d
=
element_a
,
accumulator
=
DataType
.
f32
,
)
for
element_b
in
(
VLLMDataType
.
u4b8
,
VLLMDataType
.
u8b128
)
for
element_a
in
(
DataType
.
f16
,
DataType
.
bf16
))
)
for
element_a
in
(
DataType
.
f16
,
DataType
.
bf16
))
GPTQ_kernel_specializations
=
[
Specialization
(
with_C
=
False
,
with_zeropoints
=
False
,
with_scales
=
True
)
...
...
@@ -490,7 +490,7 @@ def generate():
]
AWQ_kernel_type_configs
=
list
(
(
TypeConfig
(
TypeConfig
(
element_a
=
element_a
,
element_b
=
element_b
,
element_b_scale
=
element_a
,
...
...
@@ -498,7 +498,7 @@ def generate():
element_d
=
element_a
,
accumulator
=
DataType
.
f32
,
)
for
element_b
in
(
DataType
.
u4
,
DataType
.
u8
)
for
element_a
in
(
DataType
.
f16
,
DataType
.
bf16
))
)
for
element_a
in
(
DataType
.
f16
,
DataType
.
bf16
))
AWQ_kernel_specializations
=
[
Specialization
(
with_C
=
False
,
with_zeropoints
=
True
,
with_scales
=
True
)
...
...
docs/source/getting_started/installation.rst
View file @
21063c11
...
...
@@ -10,7 +10,7 @@ Requirements
============
* OS: Linux
* Python: 3.
8
- 3.12
* Python: 3.
9 -
- 3.12
* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
Install released versions
...
...
pyproject.toml
View file @
21063c11
...
...
@@ -34,7 +34,7 @@ select = [
# Pyflakes
"F"
,
# pyupgrade
#
"UP",
"UP"
,
# flake8-bugbear
"B"
,
# flake8-simplify
...
...
@@ -55,7 +55,7 @@ ignore = [
]
[tool.mypy]
python_version
=
"3.
8
"
python_version
=
"3.
9
"
ignore_missing_imports
=
true
check_untyped_defs
=
true
...
...
setup.py
View file @
21063c11
import
importlib.util
import
io
import
logging
import
os
import
re
...
...
@@ -327,7 +326,7 @@ def get_neuronxcc_version():
"__init__.py"
)
# Check if the command was executed successfully
with
open
(
version_file
,
"rt"
)
as
fp
:
with
open
(
version_file
)
as
fp
:
content
=
fp
.
read
()
# Extract the version using a regular expression
...
...
@@ -404,7 +403,8 @@ def read_readme() -> str:
"""Read the README file if present."""
p
=
get_path
(
"README.md"
)
if
os
.
path
.
isfile
(
p
):
return
io
.
open
(
get_path
(
"README.md"
),
"r"
,
encoding
=
"utf-8"
).
read
()
with
open
(
get_path
(
"README.md"
),
encoding
=
"utf-8"
)
as
f
:
return
f
.
read
()
else
:
return
""
...
...
@@ -498,7 +498,6 @@ setup(
"Documentation"
:
"https://vllm.readthedocs.io/en/latest/"
,
},
classifiers
=
[
"Programming Language :: Python :: 3.8"
,
"Programming Language :: Python :: 3.9"
,
"Programming Language :: Python :: 3.10"
,
"Programming Language :: Python :: 3.11"
,
...
...
@@ -512,7 +511,7 @@ setup(
],
packages
=
find_packages
(
exclude
=
(
"benchmarks"
,
"csrc"
,
"docs"
,
"examples"
,
"tests*"
)),
python_requires
=
">=3.
8
"
,
python_requires
=
">=3.
9
"
,
install_requires
=
get_requirements
(),
ext_modules
=
ext_modules
,
extras_require
=
{
...
...
tests/compile/piecewise/test_toy_llama.py
View file @
21063c11
...
...
@@ -429,8 +429,8 @@ def benchmark():
# print in tabular format
print
(
"batch size
\t
eager mode
\t
full cudagraph
\t
piecewise cudagraph"
)
for
b
in
cudagraph_sizes
:
print
(
(
f
"
{
b
}
\t
{
eager_time
[
b
]:.
3
f
}
\t
{
full_cudagraph_time
[
b
]:.
3
f
}
"
f
"
\t
{
piecewise_cudagraph_time
[
b
]:.
3
f
}
"
)
)
print
(
f
"
{
b
}
\t
{
eager_time
[
b
]:.
3
f
}
\t
{
full_cudagraph_time
[
b
]:.
3
f
}
"
f
"
\t
{
piecewise_cudagraph_time
[
b
]:.
3
f
}
"
)
if
__name__
==
"__main__"
:
...
...
tests/conftest.py
View file @
21063c11
import
json
import
os
import
sys
import
tempfile
from
collections
import
UserList
from
enum
import
Enum
...
...
@@ -52,7 +51,7 @@ PromptVideoInput = _PromptMultiModalInput[np.ndarray]
def
_read_prompts
(
filename
:
str
)
->
List
[
str
]:
with
open
(
filename
,
"r"
)
as
f
:
with
open
(
filename
)
as
f
:
prompts
=
f
.
readlines
()
return
prompts
...
...
@@ -62,13 +61,7 @@ class _ImageAssetPrompts(TypedDict):
cherry_blossom
:
str
if
sys
.
version_info
<
(
3
,
9
):
# UserList cannot be subscripted
class
_ImageAssetsBase
(
UserList
):
pass
else
:
class
_ImageAssetsBase
(
UserList
[
ImageAsset
]):
class
_ImageAssetsBase
(
UserList
[
ImageAsset
]):
pass
...
...
@@ -94,13 +87,7 @@ class _VideoAssetPrompts(TypedDict):
sample_demo_1
:
str
if
sys
.
version_info
<
(
3
,
9
):
# UserList cannot be subscripted
class
_VideoAssetsBase
(
UserList
):
pass
else
:
class
_VideoAssetsBase
(
UserList
[
VideoAsset
]):
class
_VideoAssetsBase
(
UserList
[
VideoAsset
]):
pass
...
...
@@ -958,7 +945,7 @@ def dummy_opt_path():
"*.msgpack"
])
assert
os
.
path
.
exists
(
json_path
)
with
open
(
json_path
,
"r"
)
as
f
:
with
open
(
json_path
)
as
f
:
config
=
json
.
load
(
f
)
config
[
"architectures"
]
=
[
"MyOPTForCausalLM"
]
with
open
(
json_path
,
"w"
)
as
f
:
...
...
@@ -977,7 +964,7 @@ def dummy_llava_path():
"*.msgpack"
])
assert
os
.
path
.
exists
(
json_path
)
with
open
(
json_path
,
"r"
)
as
f
:
with
open
(
json_path
)
as
f
:
config
=
json
.
load
(
f
)
config
[
"architectures"
]
=
[
"MyLlava"
]
with
open
(
json_path
,
"w"
)
as
f
:
...
...
@@ -996,7 +983,7 @@ def dummy_gemma2_embedding_path():
"*.msgpack"
])
assert
os
.
path
.
exists
(
json_path
)
with
open
(
json_path
,
"r"
)
as
f
:
with
open
(
json_path
)
as
f
:
config
=
json
.
load
(
f
)
config
[
"architectures"
]
=
[
"MyGemma2Embedding"
]
with
open
(
json_path
,
"w"
)
as
f
:
...
...
tests/core/block/test_prefix_caching_block.py
View file @
21063c11
...
...
@@ -99,13 +99,11 @@ class TestPrefixCachingBlock:
token_ids
=
[
random
.
randint
(
0
,
50_000
)
for
_
in
range
(
num_tokens
)]
first_chain
,
second_chain
=
[
TestPrefixCachingBlock
.
create_chain
(
first_chain
,
second_chain
=
(
TestPrefixCachingBlock
.
create_chain
(
block_size
=
block_size
,
token_ids
=
token_ids
,
num_empty_trailing_blocks
=
num_empty_trailing_blocks
)
for
_
in
range
(
2
)
]
for
_
in
range
(
2
))
for
first_chain_block
,
second_chain_block
in
zip
(
first_chain
,
second_chain
):
...
...
tests/kernels/test_mamba_ssm.py
View file @
21063c11
...
...
@@ -510,7 +510,7 @@ def test_selective_scan_varlen(with_padding, is_variable_B, is_variable_C,
for
var
in
(
u_ref
,
delta_ref
,
B_ref
,
C_ref
,
z_ref
)
]
for
i
in
range
(
len
(
seqlens
[
0
])):
u_s
,
delta_s
,
B_s
,
C_s
,
z_s
=
[
v
[
i
].
unsqueeze
(
0
)
for
v
in
splits
]
u_s
,
delta_s
,
B_s
,
C_s
,
z_s
=
(
v
[
i
].
unsqueeze
(
0
)
for
v
in
splits
)
if
padded_state_indices
[
i
]
==
PAD_SLOT_ID
:
continue
out_ref_s
,
_
=
selective_scan_ref
(
...
...
tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py
View file @
21063c11
...
...
@@ -104,7 +104,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
# Sad path tests for the multimodal input processor and mapper, respectively
@
pytest
.
mark
.
parametrize
(
"mm_data"
,
[
{
"image"
:
torch
.
rand
(
(
5
)
)
"image"
:
torch
.
rand
(
5
)
},
{
"image"
:
torch
.
rand
((
5
,
5
,
5
,
5
,
5
))
...
...
Prev
1
2
3
4
5
6
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment