Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
081057de
Commit
081057de
authored
Apr 29, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.8.5' into v0.8.5-ori
parents
7cf5d5c4
ba41cc90
Changes
554
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
452 additions
and
61 deletions
+452
-61
examples/online_serving/openai_cross_encoder_score.py
examples/online_serving/openai_cross_encoder_score.py
+15
-8
examples/online_serving/openai_embedding_client.py
examples/online_serving/openai_embedding_client.py
+26
-19
examples/online_serving/openai_embedding_matryoshka_fy.py
examples/online_serving/openai_embedding_matryoshka_fy.py
+36
-0
examples/online_serving/openai_pooling_client.py
examples/online_serving/openai_pooling_client.py
+13
-2
examples/online_serving/openai_transcription_client.py
examples/online_serving/openai_transcription_client.py
+6
-1
examples/online_serving/ray_serve_deepseek.py
examples/online_serving/ray_serve_deepseek.py
+48
-0
examples/tool_chat_template_llama4_json.jinja
examples/tool_chat_template_llama4_json.jinja
+116
-0
pyproject.toml
pyproject.toml
+5
-5
requirements/common.txt
requirements/common.txt
+2
-2
requirements/cpu.txt
requirements/cpu.txt
+2
-2
requirements/docs.txt
requirements/docs.txt
+2
-0
requirements/hpu.txt
requirements/hpu.txt
+1
-1
requirements/nightly_torch_test.txt
requirements/nightly_torch_test.txt
+28
-0
requirements/rocm-build.txt
requirements/rocm-build.txt
+1
-0
requirements/test.in
requirements/test.in
+5
-1
requirements/test.txt
requirements/test.txt
+106
-9
requirements/tpu.txt
requirements/tpu.txt
+2
-3
setup.py
setup.py
+19
-8
tests/benchmarks/__init__.py
tests/benchmarks/__init__.py
+0
-0
tests/benchmarks/test_latency_cli.py
tests/benchmarks/test_latency_cli.py
+19
-0
No files found.
Too many changes to show.
To preserve performance only
554 of 554+
files are displayed.
Plain diff
Email patch
examples/online_serving/openai_cross_encoder_score.py
View file @
081057de
...
@@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
...
@@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
return
response
return
response
if
__name__
==
"__main__"
:
def
parse_args
()
:
parser
=
argparse
.
ArgumentParser
()
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
"localhost"
)
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
"localhost"
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8000
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8000
)
parser
.
add_argument
(
"--model"
,
type
=
str
,
default
=
"BAAI/bge-reranker-v2-m3"
)
parser
.
add_argument
(
"--model"
,
type
=
str
,
default
=
"BAAI/bge-reranker-v2-m3"
)
return
parser
.
parse_args
()
args
=
parser
.
parse_
args
(
)
def
main
(
args
)
:
api_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}
/score"
api_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}
/score"
model_name
=
args
.
model
model_name
=
args
.
model
...
@@ -30,9 +32,9 @@ if __name__ == "__main__":
...
@@ -30,9 +32,9 @@ if __name__ == "__main__":
text_2
=
"The capital of Brazil is Brasilia."
text_2
=
"The capital of Brazil is Brasilia."
prompt
=
{
"model"
:
model_name
,
"text_1"
:
text_1
,
"text_2"
:
text_2
}
prompt
=
{
"model"
:
model_name
,
"text_1"
:
text_1
,
"text_2"
:
text_2
}
score_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
api_url
)
score_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
api_url
)
print
(
"Prompt when text_1 and text_2 are both strings:"
)
print
(
"
\n
Prompt when text_1 and text_2 are both strings:"
)
pprint
.
pprint
(
prompt
)
pprint
.
pprint
(
prompt
)
print
(
"Score Response:"
)
print
(
"
\n
Score Response:"
)
pprint
.
pprint
(
score_response
.
json
())
pprint
.
pprint
(
score_response
.
json
())
text_1
=
"What is the capital of France?"
text_1
=
"What is the capital of France?"
...
@@ -41,9 +43,9 @@ if __name__ == "__main__":
...
@@ -41,9 +43,9 @@ if __name__ == "__main__":
]
]
prompt
=
{
"model"
:
model_name
,
"text_1"
:
text_1
,
"text_2"
:
text_2
}
prompt
=
{
"model"
:
model_name
,
"text_1"
:
text_1
,
"text_2"
:
text_2
}
score_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
api_url
)
score_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
api_url
)
print
(
"Prompt when text_1 is string and text_2 is a list:"
)
print
(
"
\n
Prompt when text_1 is string and text_2 is a list:"
)
pprint
.
pprint
(
prompt
)
pprint
.
pprint
(
prompt
)
print
(
"Score Response:"
)
print
(
"
\n
Score Response:"
)
pprint
.
pprint
(
score_response
.
json
())
pprint
.
pprint
(
score_response
.
json
())
text_1
=
[
text_1
=
[
...
@@ -54,7 +56,12 @@ if __name__ == "__main__":
...
@@ -54,7 +56,12 @@ if __name__ == "__main__":
]
]
prompt
=
{
"model"
:
model_name
,
"text_1"
:
text_1
,
"text_2"
:
text_2
}
prompt
=
{
"model"
:
model_name
,
"text_1"
:
text_1
,
"text_2"
:
text_2
}
score_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
api_url
)
score_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
api_url
)
print
(
"Prompt when text_1 and text_2 are both lists:"
)
print
(
"
\n
Prompt when text_1 and text_2 are both lists:"
)
pprint
.
pprint
(
prompt
)
pprint
.
pprint
(
prompt
)
print
(
"Score Response:"
)
print
(
"
\n
Score Response:"
)
pprint
.
pprint
(
score_response
.
json
())
pprint
.
pprint
(
score_response
.
json
())
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
examples/online_serving/openai_embedding_client.py
View file @
081057de
...
@@ -6,22 +6,29 @@ from openai import OpenAI
...
@@ -6,22 +6,29 @@ from openai import OpenAI
openai_api_key
=
"EMPTY"
openai_api_key
=
"EMPTY"
openai_api_base
=
"http://localhost:8000/v1"
openai_api_base
=
"http://localhost:8000/v1"
client
=
OpenAI
(
# defaults to os.environ.get("OPENAI_API_KEY")
def
main
():
api_key
=
openai_api_key
,
client
=
OpenAI
(
base_url
=
openai_api_base
,
# defaults to os.environ.get("OPENAI_API_KEY")
)
api_key
=
openai_api_key
,
base_url
=
openai_api_base
,
models
=
client
.
models
.
list
()
)
model
=
models
.
data
[
0
].
id
models
=
client
.
models
.
list
()
responses
=
client
.
embeddings
.
create
(
model
=
models
.
data
[
0
].
id
input
=
[
"Hello my name is"
,
responses
=
client
.
embeddings
.
create
(
"The best thing about vLLM is that it supports many different models"
# ruff: noqa: E501
],
input
=
[
model
=
model
,
"Hello my name is"
,
)
"The best thing about vLLM is that it supports many different models"
],
for
data
in
responses
.
data
:
model
=
model
,
print
(
data
.
embedding
)
# List of float of len 4096
)
for
data
in
responses
.
data
:
print
(
data
.
embedding
)
# List of float of len 4096
if
__name__
==
"__main__"
:
main
()
examples/online_serving/openai_embedding_matryoshka_fy.py
0 → 100644
View file @
081057de
# SPDX-License-Identifier: Apache-2.0
"""Example Python client for embedding API dimensions using vLLM API server
NOTE:
start a supported Matryoshka Embeddings model server with `vllm serve`, e.g.
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
"""
from
openai
import
OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"


def main():
    """Request a 32-dimensional embedding from the first listed model and print it."""
    # api_key would otherwise default to os.environ.get("OPENAI_API_KEY")
    api = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

    # Use whichever model the server lists first.
    served_model = api.models.list().data[0].id

    result = api.embeddings.create(
        model=served_model,
        input=["Follow the white rabbit."],
        dimensions=32,
    )

    for item in result.data:
        print(item.embedding)  # List of float of len 32


if __name__ == "__main__":
    main()
examples/online_serving/openai_pooling_client.py
View file @
081057de
...
@@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
...
@@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
return
response
return
response
if
__name__
==
"__main__"
:
def
parse_args
()
:
parser
=
argparse
.
ArgumentParser
()
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
"localhost"
)
parser
.
add_argument
(
"--host"
,
type
=
str
,
default
=
"localhost"
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8000
)
parser
.
add_argument
(
"--port"
,
type
=
int
,
default
=
8000
)
...
@@ -25,15 +25,20 @@ if __name__ == "__main__":
...
@@ -25,15 +25,20 @@ if __name__ == "__main__":
type
=
str
,
type
=
str
,
default
=
"jason9693/Qwen2.5-1.5B-apeach"
)
default
=
"jason9693/Qwen2.5-1.5B-apeach"
)
args
=
parser
.
parse_args
()
return
parser
.
parse_args
()
def
main
(
args
):
api_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}
/pooling"
api_url
=
f
"http://
{
args
.
host
}
:
{
args
.
port
}
/pooling"
model_name
=
args
.
model
model_name
=
args
.
model
# Input like Completions API
# Input like Completions API
prompt
=
{
"model"
:
model_name
,
"input"
:
"vLLM is great!"
}
prompt
=
{
"model"
:
model_name
,
"input"
:
"vLLM is great!"
}
pooling_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
api_url
)
pooling_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
api_url
)
print
(
"-"
*
50
)
print
(
"Pooling Response:"
)
print
(
"Pooling Response:"
)
pprint
.
pprint
(
pooling_response
.
json
())
pprint
.
pprint
(
pooling_response
.
json
())
print
(
"-"
*
50
)
# Input like Chat API
# Input like Chat API
prompt
=
{
prompt
=
{
...
@@ -50,3 +55,9 @@ if __name__ == "__main__":
...
@@ -50,3 +55,9 @@ if __name__ == "__main__":
pooling_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
api_url
)
pooling_response
=
post_http_request
(
prompt
=
prompt
,
api_url
=
api_url
)
print
(
"Pooling Response:"
)
print
(
"Pooling Response:"
)
pprint
.
pprint
(
pooling_response
.
json
())
pprint
.
pprint
(
pooling_response
.
json
())
print
(
"-"
*
50
)
if
__name__
==
"__main__"
:
args
=
parse_args
()
main
(
args
)
examples/online_serving/openai_transcription_client.py
View file @
081057de
...
@@ -26,7 +26,12 @@ def sync_openai():
...
@@ -26,7 +26,12 @@ def sync_openai():
model
=
"openai/whisper-large-v3"
,
model
=
"openai/whisper-large-v3"
,
language
=
"en"
,
language
=
"en"
,
response_format
=
"json"
,
response_format
=
"json"
,
temperature
=
0.0
)
temperature
=
0.0
,
# Additional sampling params not provided by OpenAI API.
extra_body
=
dict
(
seed
=
4419
,
repetition_penalty
=
1.3
,
))
print
(
"transcription result:"
,
transcription
.
text
)
print
(
"transcription result:"
,
transcription
.
text
)
...
...
examples/online_serving/ray_serve_deepseek.py
0 → 100644
View file @
081057de
# SPDX-License-Identifier: Apache-2.0
"""
Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
See Ray Serve LLM documentation at:
https://docs.ray.io/en/latest/serve/llm/serving-llms.html
Run `python3 ray_serve_deepseek.py` to deploy the model.
"""
from
ray
import
serve
from
ray.serve.llm
import
LLMConfig
,
build_openai_app
# Identifier the model is served under and the source it is loaded from.
_model_loading_config = {
    "model_id": "deepseek",
    # Since DeepSeek model is huge, it is recommended to pre-download
    # the model to local disk, say /path/to/the/model and specify:
    # model_source="/path/to/the/model"
    "model_source": "deepseek-ai/DeepSeek-R1",
}

# min_replicas == max_replicas == 1, so the replica count is fixed at one.
_deployment_config = {
    "autoscaling_config": {
        "min_replicas": 1,
        "max_replicas": 1,
    }
}

# Customize engine arguments as needed (e.g. vLLM engine kwargs)
_engine_kwargs = {
    "tensor_parallel_size": 8,
    "pipeline_parallel_size": 2,
    "gpu_memory_utilization": 0.92,
    "dtype": "auto",
    "max_num_seqs": 40,
    "max_model_len": 16384,
    "enable_chunked_prefill": True,
    "enable_prefix_caching": True,
    "trust_remote_code": True,
}

llm_config = LLMConfig(
    model_loading_config=_model_loading_config,
    deployment_config=_deployment_config,
    # Change to the accelerator type of the node
    accelerator_type="H100",
    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
    engine_kwargs=_engine_kwargs,
)

# Deploy the application
llm_app = build_openai_app({"llm_configs": [llm_config]})
serve.run(llm_app)
examples/tool_chat_template_llama4_json.jinja
0 → 100644
View file @
081057de
{%- macro is_array_of_type_objects(var) -%}
    {#- Renders the text 'True' when `var` is a non-string iterable whose items are all mappings containing a 'type' key, and 'False' otherwise. -#}
    {%- if var is iterable and var is not string -%}
        {#- A namespace is needed here: a plain `set` inside the loop is scoped to the loop and would be lost after `endfor`. -#}
        {%- set state = namespace(valid=true) -%}
        {%- for item in var -%}
            {#- Check for a mapping first so that non-container items (e.g. numbers) cannot make the `in` test raise. -#}
            {%- if item is not mapping or 'type' not in item -%}
                {%- set state.valid = false -%}
                {%- break -%}
            {%- endif -%}
        {%- endfor -%}
        {{ state.valid }}
    {%- else -%}
        {{ false }}
    {%- endif -%}
{%- endmacro %}
{%- macro render_message(message) %}
    {#- Renders a message's content: a string is trimmed, a list of typed parts is rendered part by part, and anything else is emitted as JSON. #}
    {%- if message['content'] is string %}
        {{- message['content']|trim }}
    {#- Inspect this message's own content. The macro renders its verdict as text, hence the comparison against the string 'True'. #}
    {%- elif is_array_of_type_objects(message['content']) == 'True' %}
        {%- for content in message['content'] %}
            {%- if content['type'] == 'image' %}
                {{- '<|image|>' }}
            {%- elif content['type'] == 'text' %}
                {{- content['text']|trim }}
            {%- endif %}
        {%- endfor %}
    {%- else %}
        {{- message['content']|tojson }}
    {%- endif %}
{%- endmacro %}
{#- Start the prompt with the beginning-of-sequence token. #}
{{- bos_token }}
{#- When `custom_tools` is defined it replaces `tools`. #}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{#- Unless the caller set `tools_in_user_message`, it defaults to true. #}
{%- if not tools_in_user_message is defined %}
{%- set tools_in_user_message = true %}
{%- endif %}
{#- Bind `tools` to none when it was not supplied, so the later `is none` checks have a value to test. #}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0] %}
{%- set messages = messages[1:] %}
{%- else %}
{#- No system message was supplied: fall back to a default tool-calling system prompt. #}
{%- set system_message = ({ "content": "You are a helpful assistant with tool calling "
"capabilities. Only reply with a tool call if the function exists in the "
"library provided by the user. If it doesn't exist, just reply directly in "
"natural language. When you receive a tool call response, use the output to "
"format an answer to the original user question."}) %}
{%- endif %}
{#- Instruction text emitted ahead of the JSON tool list; it also states the tool call syntax. #}
{%- set tool_lib_preamble = 'Tools: You have access to the following tools. You might need to use one '
'or more function/tool calls to fulfill the task. \n'
'If none are needed, then proceed to the response.\n\n'
'Tool Call Syntax: You can call tools using the following syntax:\n'
'{"name": function name, "parameters": dictionary of argument name and its value}.\n'
'Separate multiple function calls by "; ". Do not use variables.\n'
'Do not include anything else when calling the tools with the syntax above.\n\n'
'Here is a list of functions in JSON format that you can invoke.\n' %}
{#- Open the system turn. #}
{{- "<|header_start|>system<|header_end|>\n\n" }}
{#- Tools are listed in the system turn only when they are not being placed in the user message. #}
{%- if tools is not none and not tools_in_user_message %}
{{- tool_lib_preamble }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{#- Emit the system message content, then close the turn. #}
{{- render_message(system_message) }}
{{ "<|eot|>\n" }}
{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and not tools is none %}
{#- Extract the first user message so we can plug it in here #}
{%- if messages | length != 0 %}
{%- set first_user_message = messages[0] %}
{%- set messages = messages[1:] %}
{%- else %}
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
{#- Open a user turn, list the tools as indented JSON, then append the extracted message and close the turn. #}
{{- '<|header_start|>user<|header_end|>\n\n' }}
{{- tool_lib_preamble }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- render_message(first_user_message) + "\n<|eot|>"}}
{%- endif %}
{#- Render the remaining messages in order; each branch below handles one kind of turn. #}
{%- for message in messages %}
{#- Plain turns: the role is neither `ipython` nor `tool`, and the message has no `tool_calls` key. #}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
{{- render_message(message) }}
{{- "\n<|eot|>" }}
{#- Turns with a non-empty `tool_calls` list: emitted under the assistant header, content first, then each call as a JSON object. #}
{%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}
{{- '\n<|header_start|>assistant<|header_end|>\n\n' -}}
{{- render_message(message) }}
{%- for tool_call in message.tool_calls %}
{{- '{"name": "' + tool_call.function.name + '", ' }}
{{- '"parameters": ' }}
{{- tool_call.function.arguments | tojson }}
{{- "}" }}
{%- endfor %}
{{- "\n<|eot|>" }}
{#- Tool results (role `tool` or `ipython`) are emitted under the `ipython` header and closed with `<|eom|>`. #}
{%- elif message.role == "tool" or message.role == "ipython" %}
{{- "\n<|header_start|>ipython<|header_end|>\n\n" }}
{{- render_message(message) }}
{{- "\n<|eom|>" }}
{%- endif %}
{%- endfor %}
{#- When requested, open an assistant turn for the model to complete. #}
{%- if add_generation_prompt %}
{{- '\n<|header_start|>assistant<|header_end|>\n\n' }}
{%- endif %}
pyproject.toml
View file @
081057de
...
@@ -15,7 +15,8 @@ build-backend = "setuptools.build_meta"
...
@@ -15,7 +15,8 @@ build-backend = "setuptools.build_meta"
[project]
[project]
name
=
"vllm"
name
=
"vllm"
authors
=
[
{name
=
"vLLM Team"
}
]
authors
=
[
{name
=
"vLLM Team"
}
]
license
=
{
"file"
=
"LICENSE"
}
license
=
"Apache-2.0"
license-files
=
["LICENSE"]
readme
=
"README.md"
readme
=
"README.md"
description
=
"A high-throughput and memory-efficient inference and serving engine for LLMs"
description
=
"A high-throughput and memory-efficient inference and serving engine for LLMs"
classifiers
=
[
classifiers
=
[
...
@@ -23,7 +24,6 @@ classifiers = [
...
@@ -23,7 +24,6 @@ classifiers = [
"Programming Language :: Python :: 3.10"
,
"Programming Language :: Python :: 3.10"
,
"Programming Language :: Python :: 3.11"
,
"Programming Language :: Python :: 3.11"
,
"Programming Language :: Python :: 3.12"
,
"Programming Language :: Python :: 3.12"
,
"License :: OSI Approved :: Apache Software License"
,
"Intended Audience :: Developers"
,
"Intended Audience :: Developers"
,
"Intended Audience :: Information Technology"
,
"Intended Audience :: Information Technology"
,
"Intended Audience :: Science/Research"
,
"Intended Audience :: Science/Research"
,
...
@@ -46,8 +46,7 @@ vllm = "vllm.entrypoints.cli.main:main"
...
@@ -46,8 +46,7 @@ vllm = "vllm.entrypoints.cli.main:main"
[tool.setuptools.packages.find]
[tool.setuptools.packages.find]
where
=
["."]
where
=
["."]
exclude
=
[
"benchmarks"
,
"csrc"
,
"docs"
,
"examples"
,
"tests*"
]
include
=
["vllm*"]
namespaces
=
false
[tool.yapfignore]
[tool.yapfignore]
ignore_patterns
=
[
ignore_patterns
=
[
...
@@ -59,7 +58,8 @@ ignore_patterns = [
...
@@ -59,7 +58,8 @@ ignore_patterns = [
line-length
=
80
line-length
=
80
exclude
=
[
exclude
=
[
# External file, leaving license intact
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py"
"examples/other/fp8/quantizer/quantize.py"
,
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
]
[tool.ruff.lint.per-file-ignores]
[tool.ruff.lint.per-file-ignores]
...
...
requirements/common.txt
View file @
081057de
...
@@ -8,7 +8,7 @@ blake3
...
@@ -8,7 +8,7 @@ blake3
py-cpuinfo
py-cpuinfo
transformers >= 4.51.1
transformers >= 4.51.1
huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads.
huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads.
tokenizers >= 0.1
9
.1 # Required for
Llama 3
.
tokenizers >= 0.
2
1.1 # Required for
fast incremental detokenization
.
protobuf # Required by LlamaTokenizer.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp
aiohttp
...
@@ -26,7 +26,7 @@ xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64
...
@@ -26,7 +26,7 @@ xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64
typing_extensions >= 4.10
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
partial-json-parser # used for parsing partial JSON outputs
pyzmq
pyzmq
>= 25.0.0
msgspec
msgspec
gguf >= 0.13.0
gguf >= 0.13.0
importlib_metadata
importlib_metadata
...
...
requirements/cpu.txt
View file @
081057de
...
@@ -12,9 +12,9 @@ torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
...
@@ -12,9 +12,9 @@ torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
torchaudio==2.6.0; platform_machine == "ppc64le"
torchaudio==2.6.0; platform_machine == "ppc64le"
# required for the image processor of phi3v, this must be updated alongside torch
# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le"
and platform_machine != "s390x"
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
torchvision==0.21.0; platform_machine == "ppc64le"
torchvision==0.21.0; platform_machine == "ppc64le"
datasets # for benchmark scripts
datasets # for benchmark scripts
# cpu cannot use triton 3.3.0
# cpu cannot use triton 3.3.0
triton==3.2.0; platform_machine
!
= "
ppc64le
"
triton==3.2.0; platform_machine
=
= "
x86_64
"
requirements/docs.txt
View file @
081057de
...
@@ -7,6 +7,7 @@ sphinx-togglebutton==0.3.2
...
@@ -7,6 +7,7 @@ sphinx-togglebutton==0.3.2
myst-parser==3.0.1
myst-parser==3.0.1
msgspec
msgspec
cloudpickle
cloudpickle
commonmark # Required by sphinx-argparse when using :markdownhelp:
# packages to install to build the documentation
# packages to install to build the documentation
cachetools
cachetools
...
@@ -18,6 +19,7 @@ transformers
...
@@ -18,6 +19,7 @@ transformers
mistral_common >= 1.5.4
mistral_common >= 1.5.4
aiohttp
aiohttp
starlette
starlette
scipy
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
...
...
requirements/hpu.txt
View file @
081057de
...
@@ -9,4 +9,4 @@ numpy==1.26.4
...
@@ -9,4 +9,4 @@ numpy==1.26.4
tabulate
tabulate
setuptools>=61
setuptools>=61
setuptools-scm>=8
setuptools-scm>=8
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@
4312768
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@
f1f6624
requirements/nightly_torch_test.txt
0 → 100644
View file @
081057de
# Dependencies needed to run the entrypoints tests
# pytest and its extensions
pytest
pytest-asyncio
pytest-forked
pytest-mock
pytest-rerunfailures
pytest-shard
pytest-timeout
librosa # required by audio tests in entrypoints/openai
sentence-transformers
numba == 0.61.2; python_version > '3.9'
# testing utils
awscli
boto3
botocore
datasets
ray >= 2.10.0
peft
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
tensorizer>=2.9.0
lm-eval==0.4.8
buildkite-test-collector==0.1.9
lm-eval[api]==0.4.8 # required for model evaluation test
requirements/rocm-build.txt
View file @
081057de
...
@@ -6,6 +6,7 @@ torch==2.6.0
...
@@ -6,6 +6,7 @@ torch==2.6.0
torchvision==0.21.0
torchvision==0.21.0
torchaudio==2.6.0
torchaudio==2.6.0
triton==3.2
cmake>=3.26,<4
cmake>=3.26,<4
packaging
packaging
setuptools>=61
setuptools>=61
...
...
requirements/test.in
View file @
081057de
...
@@ -10,6 +10,7 @@ pytest-timeout
...
@@ -10,6 +10,7 @@ pytest-timeout
# testing utils
# testing utils
awscli
awscli
backoff # required for phi4mm test
backoff # required for phi4mm test
blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl and Mamba
einops # required for MPT, qwen-vl and Mamba
httpx
httpx
librosa # required for audio tests
librosa # required for audio tests
...
@@ -26,14 +27,17 @@ torch==2.6.0
...
@@ -26,14 +27,17 @@ torch==2.6.0
torchaudio==2.6.0
torchaudio==2.6.0
torchvision==0.21.0
torchvision==0.21.0
transformers_stream_generator # required for qwen-vl test
transformers_stream_generator # required for qwen-vl test
mamba_ssm # required for plamo2 test
matplotlib # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.4 # required for pixtral test
mistral_common[opencv] >= 1.5.4 # required for pixtral test
num2words # required for smolvlm test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
lm-eval[api]==0.4.8 # required for model evaluation test
transformers==4.51.1
transformers==4.51.3
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
# quantization
bitsandbytes>=0.45.3
bitsandbytes>=0.45.3
buildkite-test-collector==0.1.9
buildkite-test-collector==0.1.9
...
...
requirements/test.txt
View file @
081057de
...
@@ -20,25 +20,35 @@ aiosignal==1.3.1
...
@@ -20,25 +20,35 @@ aiosignal==1.3.1
annotated-types==0.7.0
annotated-types==0.7.0
# via pydantic
# via pydantic
anyio==4.6.2.post1
anyio==4.6.2.post1
# via httpx
# via
# httpx
# starlette
argcomplete==3.5.1
argcomplete==3.5.1
# via datamodel-code-generator
# via datamodel-code-generator
arrow==1.3.0
# via isoduration
attrs==24.2.0
attrs==24.2.0
# via
# via
# aiohttp
# aiohttp
# hypothesis
# jsonlines
# jsonlines
# jsonschema
# jsonschema
# pytest-subtests
# referencing
# referencing
audioread==3.0.1
audioread==3.0.1
# via librosa
# via librosa
awscli==1.35.23
awscli==1.35.23
# via -r requirements/test.in
# via -r requirements/test.in
backoff==2.2.1
backoff==2.2.1
# via -r requirements/test.in
# via
# -r requirements/test.in
# schemathesis
bitsandbytes==0.45.3
bitsandbytes==0.45.3
# via -r requirements/test.in
# via -r requirements/test.in
black==24.10.0
black==24.10.0
# via datamodel-code-generator
# via datamodel-code-generator
blobfile==3.0.0
# via -r requirements/test.in
boto3==1.35.57
boto3==1.35.57
# via tensorizer
# via tensorizer
botocore==1.35.57
botocore==1.35.57
...
@@ -67,11 +77,13 @@ click==8.1.7
...
@@ -67,11 +77,13 @@ click==8.1.7
# jiwer
# jiwer
# nltk
# nltk
# ray
# ray
# schemathesis
# typer
# typer
colorama==0.4.6
colorama==0.4.6
# via
# via
# awscli
# awscli
# sacrebleu
# sacrebleu
# schemathesis
# tqdm-multiprocess
# tqdm-multiprocess
contourpy==1.3.0
contourpy==1.3.0
# via matplotlib
# via matplotlib
...
@@ -109,6 +121,7 @@ einops==0.8.0
...
@@ -109,6 +121,7 @@ einops==0.8.0
# via
# via
# -r requirements/test.in
# -r requirements/test.in
# encodec
# encodec
# mamba-ssm
# vector-quantize-pytorch
# vector-quantize-pytorch
# vocos
# vocos
einx==0.3.0
einx==0.3.0
...
@@ -127,6 +140,7 @@ fastsafetensors==0.1.10
...
@@ -127,6 +140,7 @@ fastsafetensors==0.1.10
# via -r requirements/test.in
# via -r requirements/test.in
filelock==3.16.1
filelock==3.16.1
# via
# via
# blobfile
# datasets
# datasets
# huggingface-hub
# huggingface-hub
# ray
# ray
...
@@ -134,6 +148,8 @@ filelock==3.16.1
...
@@ -134,6 +148,8 @@ filelock==3.16.1
# transformers
# transformers
fonttools==4.54.1
fonttools==4.54.1
# via matplotlib
# via matplotlib
fqdn==1.5.1
# via jsonschema
frozendict==2.4.6
frozendict==2.4.6
# via einx
# via einx
frozenlist==1.5.0
frozenlist==1.5.0
...
@@ -152,8 +168,12 @@ genai-perf==0.0.8
...
@@ -152,8 +168,12 @@ genai-perf==0.0.8
# via -r requirements/test.in
# via -r requirements/test.in
genson==1.3.0
genson==1.3.0
# via datamodel-code-generator
# via datamodel-code-generator
graphql-core==3.2.6
# via hypothesis-graphql
h11==0.14.0
h11==0.14.0
# via httpcore
# via httpcore
harfile==0.3.0
# via schemathesis
hf-xet==0.1.4
hf-xet==0.1.4
# via huggingface-hub
# via huggingface-hub
hiredis==3.0.0
hiredis==3.0.0
...
@@ -161,7 +181,9 @@ hiredis==3.0.0
...
@@ -161,7 +181,9 @@ hiredis==3.0.0
httpcore==1.0.6
httpcore==1.0.6
# via httpx
# via httpx
httpx==0.27.2
httpx==0.27.2
# via -r requirements/test.in
# via
# -r requirements/test.in
# schemathesis
huggingface-hub==0.30.1
huggingface-hub==0.30.1
# via
# via
# -r requirements/test.in
# -r requirements/test.in
...
@@ -176,17 +198,29 @@ huggingface-hub==0.30.1
...
@@ -176,17 +198,29 @@ huggingface-hub==0.30.1
# vocos
# vocos
humanize==4.11.0
humanize==4.11.0
# via runai-model-streamer
# via runai-model-streamer
hypothesis==6.131.0
# via
# hypothesis-graphql
# hypothesis-jsonschema
# schemathesis
hypothesis-graphql==0.11.1
# via schemathesis
hypothesis-jsonschema==0.23.1
# via schemathesis
idna==3.10
idna==3.10
# via
# via
# anyio
# anyio
# email-validator
# email-validator
# httpx
# httpx
# jsonschema
# requests
# requests
# yarl
# yarl
inflect==5.6.2
inflect==5.6.2
# via datamodel-code-generator
# via datamodel-code-generator
iniconfig==2.0.0
iniconfig==2.0.0
# via pytest
# via pytest
isoduration==20.11.0
# via jsonschema
isort==5.13.2
isort==5.13.2
# via datamodel-code-generator
# via datamodel-code-generator
jinja2==3.1.6
jinja2==3.1.6
...
@@ -206,12 +240,18 @@ joblib==1.4.2
...
@@ -206,12 +240,18 @@ joblib==1.4.2
# scikit-learn
# scikit-learn
jsonlines==4.0.0
jsonlines==4.0.0
# via lm-eval
# via lm-eval
jsonpointer==3.0.0
# via jsonschema
jsonschema==4.23.0
jsonschema==4.23.0
# via
# via
# hypothesis-jsonschema
# mistral-common
# mistral-common
# ray
# ray
# schemathesis
jsonschema-specifications==2024.10.1
jsonschema-specifications==2024.10.1
# via jsonschema
# via jsonschema
junit-xml==1.9
# via schemathesis
kaleido==0.2.1
kaleido==0.2.1
# via genai-perf
# via genai-perf
kiwisolver==1.4.7
kiwisolver==1.4.7
...
@@ -227,11 +267,17 @@ llvmlite==0.44.0
...
@@ -227,11 +267,17 @@ llvmlite==0.44.0
lm-eval==0.4.8
lm-eval==0.4.8
# via -r requirements/test.in
# via -r requirements/test.in
lxml==5.3.0
lxml==5.3.0
# via sacrebleu
# via
# blobfile
# sacrebleu
mamba-ssm==2.2.4
# via -r requirements/test.in
markdown-it-py==3.0.0
markdown-it-py==3.0.0
# via rich
# via rich
markupsafe==3.0.2
markupsafe==3.0.2
# via jinja2
# via
# jinja2
# werkzeug
matplotlib==3.9.2
matplotlib==3.9.2
# via -r requirements/test.in
# via -r requirements/test.in
mbstrdecoder==1.1.3
mbstrdecoder==1.1.3
...
@@ -263,6 +309,8 @@ mypy-extensions==1.0.0
...
@@ -263,6 +309,8 @@ mypy-extensions==1.0.0
# via black
# via black
networkx==3.2.1
networkx==3.2.1
# via torch
# via torch
ninja==1.11.1.3
# via mamba-ssm
nltk==3.9.1
nltk==3.9.1
# via rouge-score
# via rouge-score
num2words==0.5.14
num2words==0.5.14
...
@@ -355,6 +403,7 @@ packaging==24.1
...
@@ -355,6 +403,7 @@ packaging==24.1
# fastparquet
# fastparquet
# huggingface-hub
# huggingface-hub
# lazy-loader
# lazy-loader
# mamba-ssm
# matplotlib
# matplotlib
# peft
# peft
# plotly
# plotly
...
@@ -426,6 +475,8 @@ pybind11==2.13.6
...
@@ -426,6 +475,8 @@ pybind11==2.13.6
# via lm-eval
# via lm-eval
pycparser==2.22
pycparser==2.22
# via cffi
# via cffi
pycryptodomex==3.22.0
# via blobfile
pydantic==2.9.2
pydantic==2.9.2
# via
# via
# datamodel-code-generator
# datamodel-code-generator
...
@@ -436,6 +487,8 @@ pygments==2.18.0
...
@@ -436,6 +487,8 @@ pygments==2.18.0
# via rich
# via rich
pyparsing==3.2.0
pyparsing==3.2.0
# via matplotlib
# via matplotlib
pyrate-limiter==3.7.0
# via schemathesis
pytablewriter==1.2.0
pytablewriter==1.2.0
# via lm-eval
# via lm-eval
pytest==8.3.3
pytest==8.3.3
...
@@ -448,7 +501,9 @@ pytest==8.3.3
...
@@ -448,7 +501,9 @@ pytest==8.3.3
# pytest-mock
# pytest-mock
# pytest-rerunfailures
# pytest-rerunfailures
# pytest-shard
# pytest-shard
# pytest-subtests
# pytest-timeout
# pytest-timeout
# schemathesis
pytest-asyncio==0.24.0
pytest-asyncio==0.24.0
# via -r requirements/test.in
# via -r requirements/test.in
pytest-forked==1.6.0
pytest-forked==1.6.0
...
@@ -459,10 +514,13 @@ pytest-rerunfailures==14.0
...
@@ -459,10 +514,13 @@ pytest-rerunfailures==14.0
# via -r requirements/test.in
# via -r requirements/test.in
pytest-shard==0.1.2
pytest-shard==0.1.2
# via -r requirements/test.in
# via -r requirements/test.in
pytest-subtests==0.14.1
# via schemathesis
pytest-timeout==2.3.1
pytest-timeout==2.3.1
# via -r requirements/test.in
# via -r requirements/test.in
python-dateutil==2.9.0.post0
python-dateutil==2.9.0.post0
# via
# via
# arrow
# botocore
# botocore
# matplotlib
# matplotlib
# pandas
# pandas
...
@@ -484,6 +542,7 @@ pyyaml==6.0.2
...
@@ -484,6 +542,7 @@ pyyaml==6.0.2
# peft
# peft
# ray
# ray
# responses
# responses
# schemathesis
# timm
# timm
# transformers
# transformers
# vocos
# vocos
...
@@ -514,10 +573,16 @@ requests==2.32.3
...
@@ -514,10 +573,16 @@ requests==2.32.3
# pooch
# pooch
# ray
# ray
# responses
# responses
# schemathesis
# starlette-testclient
# tiktoken
# tiktoken
# transformers
# transformers
responses==0.25.3
responses==0.25.3
# via genai-perf
# via genai-perf
rfc3339-validator==0.1.4
# via jsonschema
rfc3987==1.3.8
# via jsonschema
rich==13.9.4
rich==13.9.4
# via
# via
# genai-perf
# genai-perf
...
@@ -546,6 +611,8 @@ safetensors==0.4.5
...
@@ -546,6 +611,8 @@ safetensors==0.4.5
# peft
# peft
# timm
# timm
# transformers
# transformers
schemathesis==3.39.15
# via -r requirements/test.in
scikit-learn==1.5.2
scikit-learn==1.5.2
# via
# via
# librosa
# librosa
...
@@ -564,18 +631,23 @@ sentencepiece==0.2.0
...
@@ -564,18 +631,23 @@ sentencepiece==0.2.0
# via mistral-common
# via mistral-common
setuptools==75.8.0
setuptools==75.8.0
# via
# via
# mamba-ssm
# pytablewriter
# pytablewriter
# torch
# torch
shellingham==1.5.4
shellingham==1.5.4
# via typer
# via typer
six==1.16.0
six==1.16.0
# via
# via
# junit-xml
# python-dateutil
# python-dateutil
# rfc3339-validator
# rouge-score
# rouge-score
sniffio==1.3.1
sniffio==1.3.1
# via
# via
# anyio
# anyio
# httpx
# httpx
sortedcontainers==2.4.0
# via hypothesis
soundfile==0.12.1
soundfile==0.12.1
# via
# via
# -r requirements/test.in
# -r requirements/test.in
...
@@ -584,6 +656,12 @@ soxr==0.5.0.post1
...
@@ -584,6 +656,12 @@ soxr==0.5.0.post1
# via librosa
# via librosa
sqlitedict==2.1.0
sqlitedict==2.1.0
# via lm-eval
# via lm-eval
starlette==0.46.2
# via
# schemathesis
# starlette-testclient
starlette-testclient==0.4.1
# via schemathesis
statsmodels==0.14.4
statsmodels==0.14.4
# via genai-perf
# via genai-perf
sympy==1.13.1
sympy==1.13.1
...
@@ -610,8 +688,14 @@ tiktoken==0.7.0
...
@@ -610,8 +688,14 @@ tiktoken==0.7.0
# mistral-common
# mistral-common
timm==1.0.11
timm==1.0.11
# via -r requirements/test.in
# via -r requirements/test.in
tokenizers==0.21.0
tokenizers==0.21.1
# via transformers
# via
# -r requirements/test.in
# transformers
tomli==2.2.1
# via schemathesis
tomli-w==1.2.0
# via schemathesis
torch==2.6.0
torch==2.6.0
# via
# via
# -r requirements/test.in
# -r requirements/test.in
...
@@ -620,6 +704,7 @@ torch==2.6.0
...
@@ -620,6 +704,7 @@ torch==2.6.0
# encodec
# encodec
# fastsafetensors
# fastsafetensors
# lm-eval
# lm-eval
# mamba-ssm
# peft
# peft
# runai-model-streamer
# runai-model-streamer
# sentence-transformers
# sentence-transformers
...
@@ -652,11 +737,12 @@ tqdm==4.66.6
...
@@ -652,11 +737,12 @@ tqdm==4.66.6
# transformers
# transformers
tqdm-multiprocess==0.0.11
tqdm-multiprocess==0.0.11
# via lm-eval
# via lm-eval
transformers==4.51.
1
transformers==4.51.
3
# via
# via
# -r requirements/test.in
# -r requirements/test.in
# genai-perf
# genai-perf
# lm-eval
# lm-eval
# mamba-ssm
# peft
# peft
# sentence-transformers
# sentence-transformers
# transformers-stream-generator
# transformers-stream-generator
...
@@ -675,6 +761,8 @@ typepy==1.3.2
...
@@ -675,6 +761,8 @@ typepy==1.3.2
# tabledata
# tabledata
typer==0.15.2
typer==0.15.2
# via fastsafetensors
# via fastsafetensors
types-python-dateutil==2.9.0.20241206
# via arrow
typing-extensions==4.12.2
typing-extensions==4.12.2
# via
# via
# huggingface-hub
# huggingface-hub
...
@@ -687,8 +775,11 @@ typing-extensions==4.12.2
...
@@ -687,8 +775,11 @@ typing-extensions==4.12.2
# typer
# typer
tzdata==2024.2
tzdata==2024.2
# via pandas
# via pandas
uri-template==1.3.0
# via jsonschema
urllib3==2.2.3
urllib3==2.2.3
# via
# via
# blobfile
# botocore
# botocore
# requests
# requests
# responses
# responses
...
@@ -697,6 +788,10 @@ vector-quantize-pytorch==1.21.2
...
@@ -697,6 +788,10 @@ vector-quantize-pytorch==1.21.2
# via -r requirements/test.in
# via -r requirements/test.in
vocos==0.1.0
vocos==0.1.0
# via -r requirements/test.in
# via -r requirements/test.in
webcolors==24.11.1
# via jsonschema
werkzeug==3.1.3
# via schemathesis
word2number==1.1
word2number==1.1
# via lm-eval
# via lm-eval
xxhash==3.5.0
xxhash==3.5.0
...
@@ -704,6 +799,8 @@ xxhash==3.5.0
...
@@ -704,6 +799,8 @@ xxhash==3.5.0
# datasets
# datasets
# evaluate
# evaluate
yarl==1.17.1
yarl==1.17.1
# via aiohttp
# via
# aiohttp
# schemathesis
zstandard==0.23.0
zstandard==0.23.0
# via lm-eval
# via lm-eval
requirements/tpu.txt
View file @
081057de
...
@@ -17,9 +17,8 @@ ray[data]
...
@@ -17,9 +17,8 @@ ray[data]
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch==2.8.0.dev20250408
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torchvision==0.22.0.dev20250408
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
...
...
setup.py
View file @
081057de
...
@@ -269,15 +269,17 @@ class cmake_build_ext(build_ext):
...
@@ -269,15 +269,17 @@ class cmake_build_ext(build_ext):
# First, run the standard build_ext command to compile the extensions
# First, run the standard build_ext command to compile the extensions
super
().
run
()
super
().
run
()
# copy vllm/vllm_flash_attn/*.py from self.build_lib to current
# copy vllm/vllm_flash_attn/*
*/*
.py from self.build_lib to current
# directory so that they can be included in the editable build
# directory so that they can be included in the editable build
import
glob
import
glob
files
=
glob
.
glob
(
files
=
glob
.
glob
(
os
.
path
.
join
(
self
.
build_lib
,
"vllm"
,
os
.
path
.
join
(
self
.
build_lib
,
"vllm"
,
"vllm_flash_attn"
,
"*.py"
))
"vllm_flash_attn"
,
"**"
,
"*.py"
),
recursive
=
True
)
for
file
in
files
:
for
file
in
files
:
dst_file
=
os
.
path
.
join
(
"vllm/vllm_flash_attn"
,
dst_file
=
os
.
path
.
join
(
"vllm/vllm_flash_attn"
,
os
.
path
.
basename
(
file
)
)
file
.
split
(
"vllm/vllm_flash_attn/"
)[
-
1
]
)
print
(
f
"Copying
{
file
}
to
{
dst_file
}
"
)
print
(
f
"Copying
{
file
}
to
{
dst_file
}
"
)
os
.
makedirs
(
os
.
path
.
dirname
(
dst_file
),
exist_ok
=
True
)
self
.
copy_file
(
file
,
dst_file
)
self
.
copy_file
(
file
,
dst_file
)
...
@@ -377,13 +379,22 @@ class repackage_wheel(build_ext):
...
@@ -377,13 +379,22 @@ class repackage_wheel(build_ext):
"vllm/_flashmla_C.abi3.so"
,
"vllm/_flashmla_C.abi3.so"
,
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so"
,
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so"
,
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so"
,
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so"
,
"vllm/vllm_flash_attn/flash_attn_interface.py"
,
"vllm/vllm_flash_attn/__init__.py"
,
"vllm/cumem_allocator.abi3.so"
,
"vllm/cumem_allocator.abi3.so"
,
# "vllm/_version.py", # not available in nightly wheels yet
# "vllm/_version.py", # not available in nightly wheels yet
]
]
file_members
=
filter
(
lambda
x
:
x
.
filename
in
files_to_copy
,
wheel
.
filelist
)
file_members
=
list
(
filter
(
lambda
x
:
x
.
filename
in
files_to_copy
,
wheel
.
filelist
))
# vllm_flash_attn python code:
# Regex from
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
import
re
compiled_regex
=
re
.
compile
(
r
"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
file_members
+=
list
(
filter
(
lambda
x
:
compiled_regex
.
match
(
x
.
filename
),
wheel
.
filelist
))
for
file
in
file_members
:
for
file
in
file_members
:
print
(
f
"Extracting and including
{
file
.
filename
}
"
print
(
f
"Extracting and including
{
file
.
filename
}
"
...
...
tests/benchmarks/__init__.py
0 → 100644
View file @
081057de
tests/benchmarks/test_latency_cli.py
0 → 100644
View file @
081057de
# SPDX-License-Identifier: Apache-2.0
import
subprocess
import
pytest
MODEL_NAME
=
"meta-llama/Llama-3.2-1B-Instruct"
@
pytest
.
mark
.
benchmark
def
test_bench_latency
():
command
=
[
"vllm"
,
"bench"
,
"latency"
,
"--model"
,
MODEL_NAME
,
"--input-len"
,
"32"
,
"--output-len"
,
"1"
,
"--enforce-eager"
,
"--load-format"
,
"dummy"
]
result
=
subprocess
.
run
(
command
,
capture_output
=
True
,
text
=
True
)
print
(
result
.
stdout
)
print
(
result
.
stderr
)
assert
result
.
returncode
==
0
,
f
"Benchmark failed:
{
result
.
stderr
}
"
Prev
1
…
6
7
8
9
10
11
12
13
14
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment