Commit 081057de authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.5' into v0.8.5-ori

parents 7cf5d5c4 ba41cc90
......@@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
return response
if __name__ == "__main__":
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3")
return parser.parse_args()
args = parser.parse_args()
def main(args):
api_url = f"http://{args.host}:{args.port}/score"
model_name = args.model
......@@ -30,9 +32,9 @@ if __name__ == "__main__":
text_2 = "The capital of Brazil is Brasilia."
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url)
print("Prompt when text_1 and text_2 are both strings:")
print("\nPrompt when text_1 and text_2 are both strings:")
pprint.pprint(prompt)
print("Score Response:")
print("\nScore Response:")
pprint.pprint(score_response.json())
text_1 = "What is the capital of France?"
......@@ -41,9 +43,9 @@ if __name__ == "__main__":
]
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url)
print("Prompt when text_1 is string and text_2 is a list:")
print("\nPrompt when text_1 is string and text_2 is a list:")
pprint.pprint(prompt)
print("Score Response:")
print("\nScore Response:")
pprint.pprint(score_response.json())
text_1 = [
......@@ -54,7 +56,12 @@ if __name__ == "__main__":
]
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url)
print("Prompt when text_1 and text_2 are both lists:")
print("\nPrompt when text_1 and text_2 are both lists:")
pprint.pprint(prompt)
print("Score Response:")
print("\nScore Response:")
pprint.pprint(score_response.json())
if __name__ == "__main__":
args = parse_args()
main(args)
......@@ -6,22 +6,29 @@ from openai import OpenAI
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
responses = client.embeddings.create(
input=[
"Hello my name is",
"The best thing about vLLM is that it supports many different models"
],
model=model,
)
for data in responses.data:
print(data.embedding) # List of float of len 4096
def main():
    """Embed two sample sentences and print each resulting embedding.

    A client is built from the module-level ``openai_api_key`` and
    ``openai_api_base``; the first model returned by ``client.models.list()``
    is the one used for the request.
    """
    # ruff: noqa: E501
    sentences = [
        "Hello my name is",
        "The best thing about vLLM is that it supports many different models",
    ]

    # The key is passed explicitly; without it the client
    # defaults to os.environ.get("OPENAI_API_KEY").
    client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

    first_model_id = client.models.list().data[0].id
    result = client.embeddings.create(input=sentences, model=first_model_id)

    for item in result.data:
        print(item.embedding)  # List of float of len 4096
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
"""Example Python client for embedding API dimensions using vLLM API server
NOTE:
start a supported Matryoshka Embeddings model server with `vllm serve`, e.g.
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
"""
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
def main():
    """Request a 32-dimension embedding for one sentence and print it.

    A client is built from the module-level ``openai_api_key`` and
    ``openai_api_base``; the first model returned by ``client.models.list()``
    is the one used for the request.
    """
    # The key is passed explicitly; without it the client
    # defaults to os.environ.get("OPENAI_API_KEY").
    client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

    first_model_id = client.models.list().data[0].id

    # `dimensions` asks the server for a shortened embedding.
    result = client.embeddings.create(
        model=first_model_id,
        input=["Follow the white rabbit."],
        dimensions=32,
    )

    for item in result.data:
        print(item.embedding)  # List of float of len 32
if __name__ == "__main__":
main()
......@@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
return response
if __name__ == "__main__":
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
......@@ -25,15 +25,20 @@ if __name__ == "__main__":
type=str,
default="jason9693/Qwen2.5-1.5B-apeach")
args = parser.parse_args()
return parser.parse_args()
def main(args):
api_url = f"http://{args.host}:{args.port}/pooling"
model_name = args.model
# Input like Completions API
prompt = {"model": model_name, "input": "vLLM is great!"}
pooling_response = post_http_request(prompt=prompt, api_url=api_url)
print("-" * 50)
print("Pooling Response:")
pprint.pprint(pooling_response.json())
print("-" * 50)
# Input like Chat API
prompt = {
......@@ -50,3 +55,9 @@ if __name__ == "__main__":
pooling_response = post_http_request(prompt=prompt, api_url=api_url)
print("Pooling Response:")
pprint.pprint(pooling_response.json())
print("-" * 50)
if __name__ == "__main__":
args = parse_args()
main(args)
......@@ -26,7 +26,12 @@ def sync_openai():
model="openai/whisper-large-v3",
language="en",
response_format="json",
temperature=0.0)
temperature=0.0,
# Additional sampling params not provided by OpenAI API.
extra_body=dict(
seed=4419,
repetition_penalty=1.3,
))
print("transcription result:", transcription.text)
......
# SPDX-License-Identifier: Apache-2.0
"""
Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
See Ray Serve LLM documentation at:
https://docs.ray.io/en/latest/serve/llm/serving-llms.html
Run `python3 ray_serve_deepseek.py` to deploy the model.
"""
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app
# Configuration object for the one LLM application built and served below.
llm_config = LLMConfig(
model_loading_config={
# Identifier for this model entry — presumably the name clients pass in
# requests; confirm against the Ray Serve LLM documentation.
"model_id": "deepseek",
# Since DeepSeek model is huge, it is recommended to pre-download
# the model to local disk, say /path/to/the/model and specify:
# model_source="/path/to/the/model"
"model_source": "deepseek-ai/DeepSeek-R1",
},
deployment_config={
# min_replicas == max_replicas == 1, so the replica count is pinned at one.
"autoscaling_config": {
"min_replicas": 1,
"max_replicas": 1,
}
},
# Change to the accelerator type of the node
accelerator_type="H100",
# Environment variables for the runtime environment; VLLM_USE_V1 is set to "1".
runtime_env={"env_vars": {
"VLLM_USE_V1": "1"
}},
# Customize engine arguments as needed (e.g. vLLM engine kwargs)
# NOTE(review): tensor_parallel_size 8 with pipeline_parallel_size 2
# presumably requires 16 accelerators in total — confirm against the cluster.
engine_kwargs={
"tensor_parallel_size": 8,
"pipeline_parallel_size": 2,
"gpu_memory_utilization": 0.92,
"dtype": "auto",
"max_num_seqs": 40,
"max_model_len": 16384,
"enable_chunked_prefill": True,
"enable_prefix_caching": True,
"trust_remote_code": True,
},
)
# Deploy the application
# The app is built from a single-entry list of LLM configs.
llm_app = build_openai_app({"llm_configs": [llm_config]})
# There is no `if __name__ == "__main__"` guard, so this call executes
# whenever the module is run or imported.
serve.run(llm_app)
{#- Emit the text 'True' when `var` is a non-string iterable whose items are
    all mappings containing a 'type' key, and 'False' otherwise. Callers
    compare the rendered text against the string 'True'. -#}
{%- macro is_array_of_type_objects(var) -%}
    {%- if var is iterable and var is not string -%}
        {#- A plain `set` inside a for loop is scoped to the loop body and is
            not visible after `endfor`, so the flag is kept on a namespace
            object instead. -#}
        {%- set ns = namespace(valid=true) -%}
        {%- for item in var -%}
            {#- Non-mapping items (strings, numbers, ...) can never be typed
                content parts, so reject them before the key lookup. -#}
            {%- if item is not mapping or 'type' not in item -%}
                {%- set ns.valid = false -%}
                {#- `break` relies on the loop-controls extension, same as the
                    original template. -#}
                {%- break -%}
            {%- endif -%}
        {%- endfor -%}
        {{ ns.valid }}
    {%- else -%}
        {{ false }}
    {%- endif -%}
{%- endmacro %}
{#- Render the content of one message:
    - a string is emitted trimmed;
    - a list of typed parts emits '<|image|>' for each 'image' part and the
      trimmed text for each 'text' part (other part types emit nothing);
    - anything else is emitted as JSON. -#}
{%- macro render_message(message) %}
    {%- if message['content'] is string %}
        {{- message['content']|trim }}
    {#- The check must be given this message's own content. The name `data`
        is not defined in this template, so the list branch was being chosen
        without ever inspecting the content. -#}
    {%- elif is_array_of_type_objects(message['content']) == 'True' %}
        {%- for content in message['content'] %}
            {%- if content['type'] == 'image' %}
                {{- '<|image|>' }}
            {%- elif content['type'] == 'text' %}
                {{- content['text']|trim }}
            {%- endif %}
        {%- endfor %}
    {%- else %}
        {{- message['content']|tojson }}
    {%- endif %}
{%- endmacro %}
{{- bos_token }}
{#- Resolve inputs: `custom_tools` (when defined) replaces `tools`. #}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{#- When the caller does not say otherwise, tools go in the first user message. #}
{%- if not tools_in_user_message is defined %}
{%- set tools_in_user_message = true %}
{%- endif %}
{#- Normalise a missing `tools` to none so later checks can test `is none`. #}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0] %}
{%- set messages = messages[1:] %}
{%- else %}
{%- set system_message = ({ "content": "You are a helpful assistant with tool calling "
"capabilities. Only reply with a tool call if the function exists in the "
"library provided by the user. If it doesn't exist, just reply directly in "
"natural language. When you receive a tool call response, use the output to "
"format an answer to the original user question."}) %}
{%- endif %}
{#- Instruction text emitted before the JSON tool list, wherever that list is placed. #}
{%- set tool_lib_preamble = 'Tools: You have access to the following tools. You might need to use one '
'or more function/tool calls to fulfill the task. \n'
'If none are needed, then proceed to the response.\n\n'
'Tool Call Syntax: You can call tools using the following syntax:\n'
'{"name": function name, "parameters": dictionary of argument name and its value}.\n'
'Separate multiple function calls by "; ". Do not use variables.\n'
'Do not include anything else when calling the tools with the syntax above.\n\n'
'Here is a list of functions in JSON format that you can invoke.\n' %}
{#- System turn: header, optional tool list, then the system message content. #}
{{- "<|header_start|>system<|header_end|>\n\n" }}
{%- if tools is not none and not tools_in_user_message %}
{{- tool_lib_preamble }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{{- render_message(system_message) }}
{{ "<|eot|>\n" }}
{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and not tools is none %}
{#- Extract the first user message so we can plug it in here #}
{%- if messages | length != 0 %}
{%- set first_user_message = messages[0] %}
{%- set messages = messages[1:] %}
{%- else %}
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
{{- '<|header_start|>user<|header_end|>\n\n' }}
{{- tool_lib_preamble }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- render_message(first_user_message) + "\n<|eot|>"}}
{%- endif %}
{#- Remaining turns. Three cases: ordinary message, assistant tool call, tool result. #}
{%- for message in messages %}
{#- Ordinary message: neither a tool/ipython role nor carrying a `tool_calls` key. #}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
{{- render_message(message) }}
{{- "\n<|eot|>" }}
{#- Tool-call message: always rendered under the assistant header, content first, then each call as JSON. #}
{%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}
{{- '\n<|header_start|>assistant<|header_end|>\n\n' -}}
{{- render_message(message) }}
{#- NOTE(review): calls are emitted back to back with no "; " between them, although the preamble asks for that separator — confirm this is intended. #}
{%- for tool_call in message.tool_calls %}
{{- '{"name": "' + tool_call.function.name + '", ' }}
{{- '"parameters": ' }}
{{- tool_call.function.arguments | tojson }}
{{- "}" }}
{%- endfor %}
{{- "\n<|eot|>" }}
{#- Tool result: both 'tool' and 'ipython' roles are rendered under the ipython header and closed with <|eom|>. #}
{%- elif message.role == "tool" or message.role == "ipython" %}
{{- "\n<|header_start|>ipython<|header_end|>\n\n" }}
{{- render_message(message) }}
{{- "\n<|eom|>" }}
{%- endif %}
{%- endfor %}
{#- Optionally open an assistant turn for the model to complete. #}
{%- if add_generation_prompt %}
{{- '\n<|header_start|>assistant<|header_end|>\n\n' }}
{%- endif %}
......@@ -15,7 +15,8 @@ build-backend = "setuptools.build_meta"
[project]
name = "vllm"
authors = [{name = "vLLM Team"}]
license = { "file"= "LICENSE" }
license = "Apache-2.0"
license-files = ["LICENSE"]
readme = "README.md"
description = "A high-throughput and memory-efficient inference and serving engine for LLMs"
classifiers = [
......@@ -23,7 +24,6 @@ classifiers = [
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"License :: OSI Approved :: Apache Software License",
"Intended Audience :: Developers",
"Intended Audience :: Information Technology",
"Intended Audience :: Science/Research",
......@@ -46,8 +46,7 @@ vllm = "vllm.entrypoints.cli.main:main"
[tool.setuptools.packages.find]
where = ["."]
exclude = ["benchmarks", "csrc", "docs", "examples", "tests*"]
namespaces = false
include = ["vllm*"]
[tool.yapfignore]
ignore_patterns = [
......@@ -59,7 +58,8 @@ ignore_patterns = [
line-length = 80
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py"
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores]
......
......@@ -8,7 +8,7 @@ blake3
py-cpuinfo
transformers >= 4.51.1
huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads.
tokenizers >= 0.19.1 # Required for Llama 3.
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp
......@@ -26,7 +26,7 @@ xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
pyzmq
pyzmq >= 25.0.0
msgspec
gguf >= 0.13.0
importlib_metadata
......
......@@ -12,9 +12,9 @@ torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
torchaudio==2.6.0; platform_machine == "ppc64le"
# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
torchvision==0.21.0; platform_machine == "ppc64le"
datasets # for benchmark scripts
# cpu cannot use triton 3.3.0
triton==3.2.0; platform_machine != "ppc64le"
triton==3.2.0; platform_machine == "x86_64"
......@@ -7,6 +7,7 @@ sphinx-togglebutton==0.3.2
myst-parser==3.0.1
msgspec
cloudpickle
commonmark # Required by sphinx-argparse when using :markdownhelp:
# packages to install to build the documentation
cachetools
......@@ -18,6 +19,7 @@ transformers
mistral_common >= 1.5.4
aiohttp
starlette
scipy
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
......
......@@ -9,4 +9,4 @@ numpy==1.26.4
tabulate
setuptools>=61
setuptools-scm>=8
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@4312768
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624
# Dependencies needed to run the entrypoints tests
# pytest and its extensions
pytest
pytest-asyncio
pytest-forked
pytest-mock
pytest-rerunfailures
pytest-shard
pytest-timeout
librosa # required by audio tests in entrypoints/openai
sentence-transformers
numba == 0.61.2; python_version > '3.9'
# testing utils
awscli
boto3
botocore
datasets
ray >= 2.10.0
peft
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
tensorizer>=2.9.0
lm-eval==0.4.8
buildkite-test-collector==0.1.9
lm-eval[api]==0.4.8 # required for model evaluation test
......@@ -6,6 +6,7 @@ torch==2.6.0
torchvision==0.21.0
torchaudio==2.6.0
triton==3.2
cmake>=3.26,<4
packaging
setuptools>=61
......
......@@ -10,6 +10,7 @@ pytest-timeout
# testing utils
awscli
backoff # required for phi4mm test
blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio tests
......@@ -26,14 +27,17 @@ torch==2.6.0
torchaudio==2.6.0
torchvision==0.21.0
transformers_stream_generator # required for qwen-vl test
mamba_ssm # required for plamo2 test
matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.4 # required for pixtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
transformers==4.51.1
transformers==4.51.3
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes>=0.45.3
buildkite-test-collector==0.1.9
......
......@@ -20,25 +20,35 @@ aiosignal==1.3.1
annotated-types==0.7.0
# via pydantic
anyio==4.6.2.post1
# via httpx
# via
# httpx
# starlette
argcomplete==3.5.1
# via datamodel-code-generator
arrow==1.3.0
# via isoduration
attrs==24.2.0
# via
# aiohttp
# hypothesis
# jsonlines
# jsonschema
# pytest-subtests
# referencing
audioread==3.0.1
# via librosa
awscli==1.35.23
# via -r requirements/test.in
backoff==2.2.1
# via -r requirements/test.in
# via
# -r requirements/test.in
# schemathesis
bitsandbytes==0.45.3
# via -r requirements/test.in
black==24.10.0
# via datamodel-code-generator
blobfile==3.0.0
# via -r requirements/test.in
boto3==1.35.57
# via tensorizer
botocore==1.35.57
......@@ -67,11 +77,13 @@ click==8.1.7
# jiwer
# nltk
# ray
# schemathesis
# typer
colorama==0.4.6
# via
# awscli
# sacrebleu
# schemathesis
# tqdm-multiprocess
contourpy==1.3.0
# via matplotlib
......@@ -109,6 +121,7 @@ einops==0.8.0
# via
# -r requirements/test.in
# encodec
# mamba-ssm
# vector-quantize-pytorch
# vocos
einx==0.3.0
......@@ -127,6 +140,7 @@ fastsafetensors==0.1.10
# via -r requirements/test.in
filelock==3.16.1
# via
# blobfile
# datasets
# huggingface-hub
# ray
......@@ -134,6 +148,8 @@ filelock==3.16.1
# transformers
fonttools==4.54.1
# via matplotlib
fqdn==1.5.1
# via jsonschema
frozendict==2.4.6
# via einx
frozenlist==1.5.0
......@@ -152,8 +168,12 @@ genai-perf==0.0.8
# via -r requirements/test.in
genson==1.3.0
# via datamodel-code-generator
graphql-core==3.2.6
# via hypothesis-graphql
h11==0.14.0
# via httpcore
harfile==0.3.0
# via schemathesis
hf-xet==0.1.4
# via huggingface-hub
hiredis==3.0.0
......@@ -161,7 +181,9 @@ hiredis==3.0.0
httpcore==1.0.6
# via httpx
httpx==0.27.2
# via -r requirements/test.in
# via
# -r requirements/test.in
# schemathesis
huggingface-hub==0.30.1
# via
# -r requirements/test.in
......@@ -176,17 +198,29 @@ huggingface-hub==0.30.1
# vocos
humanize==4.11.0
# via runai-model-streamer
hypothesis==6.131.0
# via
# hypothesis-graphql
# hypothesis-jsonschema
# schemathesis
hypothesis-graphql==0.11.1
# via schemathesis
hypothesis-jsonschema==0.23.1
# via schemathesis
idna==3.10
# via
# anyio
# email-validator
# httpx
# jsonschema
# requests
# yarl
inflect==5.6.2
# via datamodel-code-generator
iniconfig==2.0.0
# via pytest
isoduration==20.11.0
# via jsonschema
isort==5.13.2
# via datamodel-code-generator
jinja2==3.1.6
......@@ -206,12 +240,18 @@ joblib==1.4.2
# scikit-learn
jsonlines==4.0.0
# via lm-eval
jsonpointer==3.0.0
# via jsonschema
jsonschema==4.23.0
# via
# hypothesis-jsonschema
# mistral-common
# ray
# schemathesis
jsonschema-specifications==2024.10.1
# via jsonschema
junit-xml==1.9
# via schemathesis
kaleido==0.2.1
# via genai-perf
kiwisolver==1.4.7
......@@ -227,11 +267,17 @@ llvmlite==0.44.0
lm-eval==0.4.8
# via -r requirements/test.in
lxml==5.3.0
# via sacrebleu
# via
# blobfile
# sacrebleu
mamba-ssm==2.2.4
# via -r requirements/test.in
markdown-it-py==3.0.0
# via rich
markupsafe==3.0.2
# via jinja2
# via
# jinja2
# werkzeug
matplotlib==3.9.2
# via -r requirements/test.in
mbstrdecoder==1.1.3
......@@ -263,6 +309,8 @@ mypy-extensions==1.0.0
# via black
networkx==3.2.1
# via torch
ninja==1.11.1.3
# via mamba-ssm
nltk==3.9.1
# via rouge-score
num2words==0.5.14
......@@ -355,6 +403,7 @@ packaging==24.1
# fastparquet
# huggingface-hub
# lazy-loader
# mamba-ssm
# matplotlib
# peft
# plotly
......@@ -426,6 +475,8 @@ pybind11==2.13.6
# via lm-eval
pycparser==2.22
# via cffi
pycryptodomex==3.22.0
# via blobfile
pydantic==2.9.2
# via
# datamodel-code-generator
......@@ -436,6 +487,8 @@ pygments==2.18.0
# via rich
pyparsing==3.2.0
# via matplotlib
pyrate-limiter==3.7.0
# via schemathesis
pytablewriter==1.2.0
# via lm-eval
pytest==8.3.3
......@@ -448,7 +501,9 @@ pytest==8.3.3
# pytest-mock
# pytest-rerunfailures
# pytest-shard
# pytest-subtests
# pytest-timeout
# schemathesis
pytest-asyncio==0.24.0
# via -r requirements/test.in
pytest-forked==1.6.0
......@@ -459,10 +514,13 @@ pytest-rerunfailures==14.0
# via -r requirements/test.in
pytest-shard==0.1.2
# via -r requirements/test.in
pytest-subtests==0.14.1
# via schemathesis
pytest-timeout==2.3.1
# via -r requirements/test.in
python-dateutil==2.9.0.post0
# via
# arrow
# botocore
# matplotlib
# pandas
......@@ -484,6 +542,7 @@ pyyaml==6.0.2
# peft
# ray
# responses
# schemathesis
# timm
# transformers
# vocos
......@@ -514,10 +573,16 @@ requests==2.32.3
# pooch
# ray
# responses
# schemathesis
# starlette-testclient
# tiktoken
# transformers
responses==0.25.3
# via genai-perf
rfc3339-validator==0.1.4
# via jsonschema
rfc3987==1.3.8
# via jsonschema
rich==13.9.4
# via
# genai-perf
......@@ -546,6 +611,8 @@ safetensors==0.4.5
# peft
# timm
# transformers
schemathesis==3.39.15
# via -r requirements/test.in
scikit-learn==1.5.2
# via
# librosa
......@@ -564,18 +631,23 @@ sentencepiece==0.2.0
# via mistral-common
setuptools==75.8.0
# via
# mamba-ssm
# pytablewriter
# torch
shellingham==1.5.4
# via typer
six==1.16.0
# via
# junit-xml
# python-dateutil
# rfc3339-validator
# rouge-score
sniffio==1.3.1
# via
# anyio
# httpx
sortedcontainers==2.4.0
# via hypothesis
soundfile==0.12.1
# via
# -r requirements/test.in
......@@ -584,6 +656,12 @@ soxr==0.5.0.post1
# via librosa
sqlitedict==2.1.0
# via lm-eval
starlette==0.46.2
# via
# schemathesis
# starlette-testclient
starlette-testclient==0.4.1
# via schemathesis
statsmodels==0.14.4
# via genai-perf
sympy==1.13.1
......@@ -610,8 +688,14 @@ tiktoken==0.7.0
# mistral-common
timm==1.0.11
# via -r requirements/test.in
tokenizers==0.21.0
# via transformers
tokenizers==0.21.1
# via
# -r requirements/test.in
# transformers
tomli==2.2.1
# via schemathesis
tomli-w==1.2.0
# via schemathesis
torch==2.6.0
# via
# -r requirements/test.in
......@@ -620,6 +704,7 @@ torch==2.6.0
# encodec
# fastsafetensors
# lm-eval
# mamba-ssm
# peft
# runai-model-streamer
# sentence-transformers
......@@ -652,11 +737,12 @@ tqdm==4.66.6
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
transformers==4.51.1
transformers==4.51.3
# via
# -r requirements/test.in
# genai-perf
# lm-eval
# mamba-ssm
# peft
# sentence-transformers
# transformers-stream-generator
......@@ -675,6 +761,8 @@ typepy==1.3.2
# tabledata
typer==0.15.2
# via fastsafetensors
types-python-dateutil==2.9.0.20241206
# via arrow
typing-extensions==4.12.2
# via
# huggingface-hub
......@@ -687,8 +775,11 @@ typing-extensions==4.12.2
# typer
tzdata==2024.2
# via pandas
uri-template==1.3.0
# via jsonschema
urllib3==2.2.3
# via
# blobfile
# botocore
# requests
# responses
......@@ -697,6 +788,10 @@ vector-quantize-pytorch==1.21.2
# via -r requirements/test.in
vocos==0.1.0
# via -r requirements/test.in
webcolors==24.11.1
# via jsonschema
werkzeug==3.1.3
# via schemathesis
word2number==1.1
# via lm-eval
xxhash==3.5.0
......@@ -704,6 +799,8 @@ xxhash==3.5.0
# datasets
# evaluate
yarl==1.17.1
# via aiohttp
# via
# aiohttp
# schemathesis
zstandard==0.23.0
# via lm-eval
......@@ -17,9 +17,8 @@ ray[data]
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch==2.8.0.dev20250408
torchvision==0.22.0.dev20250408
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
......
......@@ -269,15 +269,17 @@ class cmake_build_ext(build_ext):
# First, run the standard build_ext command to compile the extensions
super().run()
# copy vllm/vllm_flash_attn/*.py from self.build_lib to current
# copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
# directory so that they can be included in the editable build
import glob
files = glob.glob(
os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "*.py"))
files = glob.glob(os.path.join(self.build_lib, "vllm",
"vllm_flash_attn", "**", "*.py"),
recursive=True)
for file in files:
dst_file = os.path.join("vllm/vllm_flash_attn",
os.path.basename(file))
file.split("vllm/vllm_flash_attn/")[-1])
print(f"Copying {file} to {dst_file}")
os.makedirs(os.path.dirname(dst_file), exist_ok=True)
self.copy_file(file, dst_file)
......@@ -377,13 +379,22 @@ class repackage_wheel(build_ext):
"vllm/_flashmla_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/vllm_flash_attn/flash_attn_interface.py",
"vllm/vllm_flash_attn/__init__.py",
"vllm/cumem_allocator.abi3.so",
# "vllm/_version.py", # not available in nightly wheels yet
]
file_members = filter(lambda x: x.filename in files_to_copy,
wheel.filelist)
file_members = list(
filter(lambda x: x.filename in files_to_copy, wheel.filelist))
# vllm_flash_attn python code:
# Regex from
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
import re
compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
file_members += list(
filter(lambda x: compiled_regex.match(x.filename),
wheel.filelist))
for file in file_members:
print(f"Extracting and including {file.filename} "
......
# SPDX-License-Identifier: Apache-2.0
import subprocess
import pytest
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
@pytest.mark.benchmark
def test_bench_latency():
    """Run ``vllm bench latency`` once and require a zero exit status.

    The command uses ``MODEL_NAME`` with dummy weights, eager mode, a
    32-token input and a 1-token output. The captured stdout and stderr are
    printed before the assertion.
    """
    cli = ["vllm", "bench", "latency"]
    options = [
        "--model", MODEL_NAME,
        "--input-len", "32",
        "--output-len", "1",
        "--enforce-eager",
        "--load-format", "dummy",
    ]
    completed = subprocess.run(cli + options, capture_output=True, text=True)

    for stream in (completed.stdout, completed.stderr):
        print(stream)

    assert completed.returncode == 0, f"Benchmark failed: {completed.stderr}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment