"vllm/vscode:/vscode.git/clone" did not exist on "9a5b1554b4f049aad6398bb29d3064138ac9a039"
Commit dcb5624a authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.5' into v0.8.5-dev

parents 55880ca2 ba41cc90
...@@ -6,28 +6,36 @@ from openai import OpenAI ...@@ -6,28 +6,36 @@ from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY") def main():
api_key=openai_api_key, client = OpenAI(
base_url=openai_api_base, # defaults to os.environ.get("OPENAI_API_KEY")
) api_key=openai_api_key,
base_url=openai_api_base,
models = client.models.list() )
model = models.data[0].id
models = client.models.list()
# Completion API model = models.data[0].id
stream = False
completion = client.completions.create( # Completion API
model=model, stream = False
prompt="A robot may not injure a human being", completion = client.completions.create(
echo=False, model=model,
n=2, prompt="A robot may not injure a human being",
stream=stream, echo=False,
logprobs=3) n=2,
stream=stream,
print("Completion results:") logprobs=3)
if stream:
for c in completion: print("-" * 50)
print(c) print("Completion results:")
else: if stream:
print(completion) for c in completion:
print(c)
else:
print(completion)
print("-" * 50)
if __name__ == "__main__":
main()
...@@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: ...@@ -16,13 +16,15 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
return response return response
if __name__ == "__main__": def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000) parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3") parser.add_argument("--model", type=str, default="BAAI/bge-reranker-v2-m3")
return parser.parse_args()
args = parser.parse_args() def main(args):
api_url = f"http://{args.host}:{args.port}/score" api_url = f"http://{args.host}:{args.port}/score"
model_name = args.model model_name = args.model
...@@ -30,9 +32,9 @@ if __name__ == "__main__": ...@@ -30,9 +32,9 @@ if __name__ == "__main__":
text_2 = "The capital of Brazil is Brasilia." text_2 = "The capital of Brazil is Brasilia."
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url) score_response = post_http_request(prompt=prompt, api_url=api_url)
print("Prompt when text_1 and text_2 are both strings:") print("\nPrompt when text_1 and text_2 are both strings:")
pprint.pprint(prompt) pprint.pprint(prompt)
print("Score Response:") print("\nScore Response:")
pprint.pprint(score_response.json()) pprint.pprint(score_response.json())
text_1 = "What is the capital of France?" text_1 = "What is the capital of France?"
...@@ -41,9 +43,9 @@ if __name__ == "__main__": ...@@ -41,9 +43,9 @@ if __name__ == "__main__":
] ]
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url) score_response = post_http_request(prompt=prompt, api_url=api_url)
print("Prompt when text_1 is string and text_2 is a list:") print("\nPrompt when text_1 is string and text_2 is a list:")
pprint.pprint(prompt) pprint.pprint(prompt)
print("Score Response:") print("\nScore Response:")
pprint.pprint(score_response.json()) pprint.pprint(score_response.json())
text_1 = [ text_1 = [
...@@ -54,7 +56,12 @@ if __name__ == "__main__": ...@@ -54,7 +56,12 @@ if __name__ == "__main__":
] ]
prompt = {"model": model_name, "text_1": text_1, "text_2": text_2} prompt = {"model": model_name, "text_1": text_1, "text_2": text_2}
score_response = post_http_request(prompt=prompt, api_url=api_url) score_response = post_http_request(prompt=prompt, api_url=api_url)
print("Prompt when text_1 and text_2 are both lists:") print("\nPrompt when text_1 and text_2 are both lists:")
pprint.pprint(prompt) pprint.pprint(prompt)
print("Score Response:") print("\nScore Response:")
pprint.pprint(score_response.json()) pprint.pprint(score_response.json())
if __name__ == "__main__":
args = parse_args()
main(args)
...@@ -6,22 +6,29 @@ from openai import OpenAI ...@@ -6,22 +6,29 @@ from openai import OpenAI
openai_api_key = "EMPTY" openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1" openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY") def main():
api_key=openai_api_key, client = OpenAI(
base_url=openai_api_base, # defaults to os.environ.get("OPENAI_API_KEY")
) api_key=openai_api_key,
base_url=openai_api_base,
models = client.models.list() )
model = models.data[0].id
models = client.models.list()
responses = client.embeddings.create( model = models.data[0].id
input=[
"Hello my name is", responses = client.embeddings.create(
"The best thing about vLLM is that it supports many different models" # ruff: noqa: E501
], input=[
model=model, "Hello my name is",
) "The best thing about vLLM is that it supports many different models"
],
for data in responses.data: model=model,
print(data.embedding) # List of float of len 4096 )
for data in responses.data:
print(data.embedding) # List of float of len 4096
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
"""Example Python client for embedding API dimensions using vLLM API server
NOTE:
start a supported Matryoshka Embeddings model server with `vllm serve`, e.g.
vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
"""
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
def main():
    """Request a reduced-dimension embedding from the server and print it.

    Uses the first model the server lists and asks the embeddings endpoint
    for 32-dimensional vectors.
    """
    # If api_key were omitted, the client would fall back to
    # os.environ.get("OPENAI_API_KEY").
    api = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

    # Pick whichever model the server reports first.
    served_model = api.models.list().data[0].id

    result = api.embeddings.create(
        input=["Follow the white rabbit."],
        model=served_model,
        dimensions=32,
    )

    for item in result.data:
        print(item.embedding)  # List of float of len 32
if __name__ == "__main__":
main()
...@@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response: ...@@ -17,7 +17,7 @@ def post_http_request(prompt: dict, api_url: str) -> requests.Response:
return response return response
if __name__ == "__main__": def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000) parser.add_argument("--port", type=int, default=8000)
...@@ -25,15 +25,20 @@ if __name__ == "__main__": ...@@ -25,15 +25,20 @@ if __name__ == "__main__":
type=str, type=str,
default="jason9693/Qwen2.5-1.5B-apeach") default="jason9693/Qwen2.5-1.5B-apeach")
args = parser.parse_args() return parser.parse_args()
def main(args):
api_url = f"http://{args.host}:{args.port}/pooling" api_url = f"http://{args.host}:{args.port}/pooling"
model_name = args.model model_name = args.model
# Input like Completions API # Input like Completions API
prompt = {"model": model_name, "input": "vLLM is great!"} prompt = {"model": model_name, "input": "vLLM is great!"}
pooling_response = post_http_request(prompt=prompt, api_url=api_url) pooling_response = post_http_request(prompt=prompt, api_url=api_url)
print("-" * 50)
print("Pooling Response:") print("Pooling Response:")
pprint.pprint(pooling_response.json()) pprint.pprint(pooling_response.json())
print("-" * 50)
# Input like Chat API # Input like Chat API
prompt = { prompt = {
...@@ -50,3 +55,9 @@ if __name__ == "__main__": ...@@ -50,3 +55,9 @@ if __name__ == "__main__":
pooling_response = post_http_request(prompt=prompt, api_url=api_url) pooling_response = post_http_request(prompt=prompt, api_url=api_url)
print("Pooling Response:") print("Pooling Response:")
pprint.pprint(pooling_response.json()) pprint.pprint(pooling_response.json())
print("-" * 50)
if __name__ == "__main__":
args = parse_args()
main(args)
...@@ -26,7 +26,12 @@ def sync_openai(): ...@@ -26,7 +26,12 @@ def sync_openai():
model="openai/whisper-large-v3", model="openai/whisper-large-v3",
language="en", language="en",
response_format="json", response_format="json",
temperature=0.0) temperature=0.0,
# Additional sampling params not provided by OpenAI API.
extra_body=dict(
seed=4419,
repetition_penalty=1.3,
))
print("transcription result:", transcription.text) print("transcription result:", transcription.text)
......
# SPDX-License-Identifier: Apache-2.0
"""
Example to deploy DeepSeek R1 or V3 with Ray Serve LLM.
See Ray Serve LLM documentation at:
https://docs.ray.io/en/latest/serve/llm/serving-llms.html
Run `python3 ray_serve_deepseek.py` to deploy the model.
"""
from ray import serve
from ray.serve.llm import LLMConfig, build_openai_app
# Served model name and where its weights are loaded from.
# Since DeepSeek model is huge, it is recommended to pre-download
# the model to local disk, say /path/to/the/model and specify:
# model_source="/path/to/the/model"
_model_loading_config = dict(
    model_id="deepseek",
    model_source="deepseek-ai/DeepSeek-R1",
)

# Autoscaling bounds: minimum and maximum are both one replica.
_deployment_config = {
    "autoscaling_config": dict(min_replicas=1, max_replicas=1),
}

# Customize engine arguments as needed (e.g. vLLM engine kwargs)
_engine_kwargs = dict(
    tensor_parallel_size=8,
    pipeline_parallel_size=2,
    gpu_memory_utilization=0.92,
    dtype="auto",
    max_num_seqs=40,
    max_model_len=16384,
    enable_chunked_prefill=True,
    enable_prefix_caching=True,
    trust_remote_code=True,
)

llm_config = LLMConfig(
    model_loading_config=_model_loading_config,
    deployment_config=_deployment_config,
    # Change to the accelerator type of the node
    accelerator_type="H100",
    runtime_env={"env_vars": {"VLLM_USE_V1": "1"}},
    engine_kwargs=_engine_kwargs,
)

# Build the OpenAI-compatible app from the config and deploy it.
llm_app = build_openai_app({"llm_configs": [llm_config]})
serve.run(llm_app)
{%- macro is_array_of_type_objects(var) -%}
    {#- Renders the text 'True' when `var` is a non-string iterable whose
        every item contains a 'type' key, and 'False' otherwise.
        A namespace object holds the flag because a plain `set` inside a
        `for` loop is scoped to the loop body and would never change the
        value rendered after the loop. -#}
    {%- if var is iterable and var is not string -%}
        {%- set ns = namespace(valid=true) -%}
        {%- for item in var -%}
            {%- if 'type' not in item -%}
                {%- set ns.valid = false -%}
                {%- break -%}
            {%- endif -%}
        {%- endfor -%}
        {{ ns.valid }}
    {%- else -%}
        {{ false }}
    {%- endif -%}
{%- endmacro %}
{%- macro render_message(message) %}
    {#- Renders the content of `message`:
        - a string is emitted trimmed;
        - a list of typed parts emits '<|image|>' for image parts and the
          trimmed text for text parts (other part types emit nothing);
        - anything else is emitted as JSON.
        The type check must be given message['content']; no variable named
        `data` exists inside this macro. #}
    {%- if message['content'] is string %}
        {{- message['content']|trim }}
    {%- elif is_array_of_type_objects(message['content']) == 'True' %}
        {%- for content in message['content'] %}
            {%- if content['type'] == 'image' %}
                {{- '<|image|>' }}
            {%- elif content['type'] == 'text' %}
                {{- content['text']|trim }}
            {%- endif %}
        {%- endfor %}
    {%- else %}
        {{- message['content']|tojson }}
    {%- endif %}
{%- endmacro %}
{{- bos_token }}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
{%- set tools_in_user_message = true %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0] %}
{%- set messages = messages[1:] %}
{%- else %}
{%- set system_message = ({ "content": "You are a helpful assistant with tool calling "
"capabilities. Only reply with a tool call if the function exists in the "
"library provided by the user. If it doesn't exist, just reply directly in "
"natural language. When you receive a tool call response, use the output to "
"format an answer to the original user question."}) %}
{%- endif %}
{%- set tool_lib_preamble = 'Tools: You have access to the following tools. You might need to use one '
'or more function/tool calls to fulfill the task. \n'
'If none are needed, then proceed to the response.\n\n'
'Tool Call Syntax: You can call tools using the following syntax:\n'
'{"name": function name, "parameters": dictionary of argument name and its value}.\n'
'Separate multiple function calls by "; ". Do not use variables.\n'
'Do not include anything else when calling the tools with the syntax above.\n\n'
'Here is a list of functions in JSON format that you can invoke.\n' %}
{{- "<|header_start|>system<|header_end|>\n\n" }}
{%- if tools is not none and not tools_in_user_message %}
{{- tool_lib_preamble }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{{- render_message(system_message) }}
{{ "<|eot|>\n" }}
{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and not tools is none %}
{#- Extract the first user message so we can plug it in here #}
{%- if messages | length != 0 %}
{%- set first_user_message = messages[0] %}
{%- set messages = messages[1:] %}
{%- else %}
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
{{- '<|header_start|>user<|header_end|>\n\n' }}
{{- tool_lib_preamble }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- render_message(first_user_message) + "\n<|eot|>"}}
{%- endif %}
{%- for message in messages %}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
{{- render_message(message) }}
{{- "\n<|eot|>" }}
{%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}
{{- '\n<|header_start|>assistant<|header_end|>\n\n' -}}
{{- render_message(message) }}
{%- for tool_call in message.tool_calls %}
{{- '{"name": "' + tool_call.function.name + '", ' }}
{{- '"parameters": ' }}
{{- tool_call.function.arguments | tojson }}
{{- "}" }}
{%- endfor %}
{{- "\n<|eot|>" }}
{%- elif message.role == "tool" or message.role == "ipython" %}
{{- "\n<|header_start|>ipython<|header_end|>\n\n" }}
{{- render_message(message) }}
{{- "\n<|eom|>" }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '\n<|header_start|>assistant<|header_end|>\n\n' }}
{%- endif %}
...@@ -15,7 +15,8 @@ build-backend = "setuptools.build_meta" ...@@ -15,7 +15,8 @@ build-backend = "setuptools.build_meta"
[project] [project]
name = "vllm" name = "vllm"
authors = [{name = "vLLM Team"}] authors = [{name = "vLLM Team"}]
license = { "file"= "LICENSE" } license = "Apache-2.0"
license-files = ["LICENSE"]
readme = "README.md" readme = "README.md"
description = "A high-throughput and memory-efficient inference and serving engine for LLMs" description = "A high-throughput and memory-efficient inference and serving engine for LLMs"
classifiers = [ classifiers = [
...@@ -23,7 +24,6 @@ classifiers = [ ...@@ -23,7 +24,6 @@ classifiers = [
"Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.12",
"License :: OSI Approved :: Apache Software License",
"Intended Audience :: Developers", "Intended Audience :: Developers",
"Intended Audience :: Information Technology", "Intended Audience :: Information Technology",
"Intended Audience :: Science/Research", "Intended Audience :: Science/Research",
...@@ -46,8 +46,7 @@ vllm = "vllm.entrypoints.cli.main:main" ...@@ -46,8 +46,7 @@ vllm = "vllm.entrypoints.cli.main:main"
[tool.setuptools.packages.find] [tool.setuptools.packages.find]
where = ["."] where = ["."]
exclude = ["benchmarks", "csrc", "docs", "examples", "tests*"] include = ["vllm*"]
namespaces = false
[tool.yapfignore] [tool.yapfignore]
ignore_patterns = [ ignore_patterns = [
...@@ -59,7 +58,8 @@ ignore_patterns = [ ...@@ -59,7 +58,8 @@ ignore_patterns = [
line-length = 80 line-length = 80
exclude = [ exclude = [
# External file, leaving license intact # External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py" "examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
] ]
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
......
...@@ -8,7 +8,7 @@ blake3 ...@@ -8,7 +8,7 @@ blake3
py-cpuinfo py-cpuinfo
transformers >= 4.51.1 transformers >= 4.51.1
huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads. huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads.
tokenizers >= 0.19.1 # Required for Llama 3. tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer. protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp aiohttp
...@@ -26,7 +26,7 @@ xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64 ...@@ -26,7 +26,7 @@ xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64
typing_extensions >= 4.10 typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs partial-json-parser # used for parsing partial JSON outputs
pyzmq pyzmq >= 25.0.0
msgspec msgspec
gguf >= 0.13.0 gguf >= 0.13.0
importlib_metadata importlib_metadata
......
...@@ -12,9 +12,9 @@ torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x" ...@@ -12,9 +12,9 @@ torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
torchaudio==2.6.0; platform_machine == "ppc64le" torchaudio==2.6.0; platform_machine == "ppc64le"
# required for the image processor of phi3v, this must be updated alongside torch # required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x" torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
torchvision==0.21.0; platform_machine == "ppc64le" torchvision==0.21.0; platform_machine == "ppc64le"
datasets # for benchmark scripts datasets # for benchmark scripts
# cpu cannot use triton 3.3.0 # cpu cannot use triton 3.3.0
triton==3.2.0; platform_machine != "ppc64le" triton==3.2.0; platform_machine == "x86_64"
...@@ -7,6 +7,7 @@ sphinx-togglebutton==0.3.2 ...@@ -7,6 +7,7 @@ sphinx-togglebutton==0.3.2
myst-parser==3.0.1 myst-parser==3.0.1
msgspec msgspec
cloudpickle cloudpickle
commonmark # Required by sphinx-argparse when using :markdownhelp:
# packages to install to build the documentation # packages to install to build the documentation
cachetools cachetools
...@@ -18,6 +19,7 @@ transformers ...@@ -18,6 +19,7 @@ transformers
mistral_common >= 1.5.4 mistral_common >= 1.5.4
aiohttp aiohttp
starlette starlette
scipy
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
......
...@@ -9,4 +9,4 @@ numpy==1.26.4 ...@@ -9,4 +9,4 @@ numpy==1.26.4
tabulate tabulate
setuptools>=61 setuptools>=61
setuptools-scm>=8 setuptools-scm>=8
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@4312768 vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624
# Dependencies required to run the entrypoints tests
# pytest and its extensions
pytest
pytest-asyncio
pytest-forked
pytest-mock
pytest-rerunfailures
pytest-shard
pytest-timeout
librosa # required by audio tests in entrypoints/openai
sentence-transformers
numba == 0.61.2; python_version > '3.9'
# testing utils
awscli
boto3
botocore
datasets
ray >= 2.10.0
peft
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
tensorizer>=2.9.0
lm-eval==0.4.8
buildkite-test-collector==0.1.9
lm-eval[api]==0.4.8 # required for model evaluation test
...@@ -6,6 +6,7 @@ torch==2.6.0 ...@@ -6,6 +6,7 @@ torch==2.6.0
torchvision==0.21.0 torchvision==0.21.0
torchaudio==2.6.0 torchaudio==2.6.0
triton==3.2
cmake>=3.26,<4 cmake>=3.26,<4
packaging packaging
setuptools>=61 setuptools>=61
......
...@@ -10,6 +10,7 @@ pytest-timeout ...@@ -10,6 +10,7 @@ pytest-timeout
# testing utils # testing utils
awscli awscli
backoff # required for phi4mm test backoff # required for phi4mm test
blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl and Mamba einops # required for MPT, qwen-vl and Mamba
httpx httpx
librosa # required for audio tests librosa # required for audio tests
...@@ -26,14 +27,17 @@ torch==2.6.0 ...@@ -26,14 +27,17 @@ torch==2.6.0
torchaudio==2.6.0 torchaudio==2.6.0
torchvision==0.21.0 torchvision==0.21.0
transformers_stream_generator # required for qwen-vl test transformers_stream_generator # required for qwen-vl test
mamba_ssm # required for plamo2 test
matplotlib # required for qwen-vl test matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.4 # required for pixtral test mistral_common[opencv] >= 1.5.4 # required for pixtral test
num2words # required for smolvlm test num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test lm-eval[api]==0.4.8 # required for model evaluation test
transformers==4.51.1 transformers==4.51.3
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads. huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
schemathesis>=3.39.15 # Required for openai schema test.
# quantization # quantization
bitsandbytes>=0.45.3 bitsandbytes>=0.45.3
buildkite-test-collector==0.1.9 buildkite-test-collector==0.1.9
......
...@@ -20,25 +20,35 @@ aiosignal==1.3.1 ...@@ -20,25 +20,35 @@ aiosignal==1.3.1
annotated-types==0.7.0 annotated-types==0.7.0
# via pydantic # via pydantic
anyio==4.6.2.post1 anyio==4.6.2.post1
# via httpx # via
# httpx
# starlette
argcomplete==3.5.1 argcomplete==3.5.1
# via datamodel-code-generator # via datamodel-code-generator
arrow==1.3.0
# via isoduration
attrs==24.2.0 attrs==24.2.0
# via # via
# aiohttp # aiohttp
# hypothesis
# jsonlines # jsonlines
# jsonschema # jsonschema
# pytest-subtests
# referencing # referencing
audioread==3.0.1 audioread==3.0.1
# via librosa # via librosa
awscli==1.35.23 awscli==1.35.23
# via -r requirements/test.in # via -r requirements/test.in
backoff==2.2.1 backoff==2.2.1
# via -r requirements/test.in # via
# -r requirements/test.in
# schemathesis
bitsandbytes==0.45.3 bitsandbytes==0.45.3
# via -r requirements/test.in # via -r requirements/test.in
black==24.10.0 black==24.10.0
# via datamodel-code-generator # via datamodel-code-generator
blobfile==3.0.0
# via -r requirements/test.in
boto3==1.35.57 boto3==1.35.57
# via tensorizer # via tensorizer
botocore==1.35.57 botocore==1.35.57
...@@ -67,11 +77,13 @@ click==8.1.7 ...@@ -67,11 +77,13 @@ click==8.1.7
# jiwer # jiwer
# nltk # nltk
# ray # ray
# schemathesis
# typer # typer
colorama==0.4.6 colorama==0.4.6
# via # via
# awscli # awscli
# sacrebleu # sacrebleu
# schemathesis
# tqdm-multiprocess # tqdm-multiprocess
contourpy==1.3.0 contourpy==1.3.0
# via matplotlib # via matplotlib
...@@ -109,6 +121,7 @@ einops==0.8.0 ...@@ -109,6 +121,7 @@ einops==0.8.0
# via # via
# -r requirements/test.in # -r requirements/test.in
# encodec # encodec
# mamba-ssm
# vector-quantize-pytorch # vector-quantize-pytorch
# vocos # vocos
einx==0.3.0 einx==0.3.0
...@@ -127,6 +140,7 @@ fastsafetensors==0.1.10 ...@@ -127,6 +140,7 @@ fastsafetensors==0.1.10
# via -r requirements/test.in # via -r requirements/test.in
filelock==3.16.1 filelock==3.16.1
# via # via
# blobfile
# datasets # datasets
# huggingface-hub # huggingface-hub
# ray # ray
...@@ -134,6 +148,8 @@ filelock==3.16.1 ...@@ -134,6 +148,8 @@ filelock==3.16.1
# transformers # transformers
fonttools==4.54.1 fonttools==4.54.1
# via matplotlib # via matplotlib
fqdn==1.5.1
# via jsonschema
frozendict==2.4.6 frozendict==2.4.6
# via einx # via einx
frozenlist==1.5.0 frozenlist==1.5.0
...@@ -152,8 +168,12 @@ genai-perf==0.0.8 ...@@ -152,8 +168,12 @@ genai-perf==0.0.8
# via -r requirements/test.in # via -r requirements/test.in
genson==1.3.0 genson==1.3.0
# via datamodel-code-generator # via datamodel-code-generator
graphql-core==3.2.6
# via hypothesis-graphql
h11==0.14.0 h11==0.14.0
# via httpcore # via httpcore
harfile==0.3.0
# via schemathesis
hf-xet==0.1.4 hf-xet==0.1.4
# via huggingface-hub # via huggingface-hub
hiredis==3.0.0 hiredis==3.0.0
...@@ -161,7 +181,9 @@ hiredis==3.0.0 ...@@ -161,7 +181,9 @@ hiredis==3.0.0
httpcore==1.0.6 httpcore==1.0.6
# via httpx # via httpx
httpx==0.27.2 httpx==0.27.2
# via -r requirements/test.in # via
# -r requirements/test.in
# schemathesis
huggingface-hub==0.30.1 huggingface-hub==0.30.1
# via # via
# -r requirements/test.in # -r requirements/test.in
...@@ -176,17 +198,29 @@ huggingface-hub==0.30.1 ...@@ -176,17 +198,29 @@ huggingface-hub==0.30.1
# vocos # vocos
humanize==4.11.0 humanize==4.11.0
# via runai-model-streamer # via runai-model-streamer
hypothesis==6.131.0
# via
# hypothesis-graphql
# hypothesis-jsonschema
# schemathesis
hypothesis-graphql==0.11.1
# via schemathesis
hypothesis-jsonschema==0.23.1
# via schemathesis
idna==3.10 idna==3.10
# via # via
# anyio # anyio
# email-validator # email-validator
# httpx # httpx
# jsonschema
# requests # requests
# yarl # yarl
inflect==5.6.2 inflect==5.6.2
# via datamodel-code-generator # via datamodel-code-generator
iniconfig==2.0.0 iniconfig==2.0.0
# via pytest # via pytest
isoduration==20.11.0
# via jsonschema
isort==5.13.2 isort==5.13.2
# via datamodel-code-generator # via datamodel-code-generator
jinja2==3.1.6 jinja2==3.1.6
...@@ -206,12 +240,18 @@ joblib==1.4.2 ...@@ -206,12 +240,18 @@ joblib==1.4.2
# scikit-learn # scikit-learn
jsonlines==4.0.0 jsonlines==4.0.0
# via lm-eval # via lm-eval
jsonpointer==3.0.0
# via jsonschema
jsonschema==4.23.0 jsonschema==4.23.0
# via # via
# hypothesis-jsonschema
# mistral-common # mistral-common
# ray # ray
# schemathesis
jsonschema-specifications==2024.10.1 jsonschema-specifications==2024.10.1
# via jsonschema # via jsonschema
junit-xml==1.9
# via schemathesis
kaleido==0.2.1 kaleido==0.2.1
# via genai-perf # via genai-perf
kiwisolver==1.4.7 kiwisolver==1.4.7
...@@ -227,11 +267,17 @@ llvmlite==0.44.0 ...@@ -227,11 +267,17 @@ llvmlite==0.44.0
lm-eval==0.4.8 lm-eval==0.4.8
# via -r requirements/test.in # via -r requirements/test.in
lxml==5.3.0 lxml==5.3.0
# via sacrebleu # via
# blobfile
# sacrebleu
mamba-ssm==2.2.4
# via -r requirements/test.in
markdown-it-py==3.0.0 markdown-it-py==3.0.0
# via rich # via rich
markupsafe==3.0.2 markupsafe==3.0.2
# via jinja2 # via
# jinja2
# werkzeug
matplotlib==3.9.2 matplotlib==3.9.2
# via -r requirements/test.in # via -r requirements/test.in
mbstrdecoder==1.1.3 mbstrdecoder==1.1.3
...@@ -263,6 +309,8 @@ mypy-extensions==1.0.0 ...@@ -263,6 +309,8 @@ mypy-extensions==1.0.0
# via black # via black
networkx==3.2.1 networkx==3.2.1
# via torch # via torch
ninja==1.11.1.3
# via mamba-ssm
nltk==3.9.1 nltk==3.9.1
# via rouge-score # via rouge-score
num2words==0.5.14 num2words==0.5.14
...@@ -355,6 +403,7 @@ packaging==24.1 ...@@ -355,6 +403,7 @@ packaging==24.1
# fastparquet # fastparquet
# huggingface-hub # huggingface-hub
# lazy-loader # lazy-loader
# mamba-ssm
# matplotlib # matplotlib
# peft # peft
# plotly # plotly
...@@ -426,6 +475,8 @@ pybind11==2.13.6 ...@@ -426,6 +475,8 @@ pybind11==2.13.6
# via lm-eval # via lm-eval
pycparser==2.22 pycparser==2.22
# via cffi # via cffi
pycryptodomex==3.22.0
# via blobfile
pydantic==2.9.2 pydantic==2.9.2
# via # via
# datamodel-code-generator # datamodel-code-generator
...@@ -436,6 +487,8 @@ pygments==2.18.0 ...@@ -436,6 +487,8 @@ pygments==2.18.0
# via rich # via rich
pyparsing==3.2.0 pyparsing==3.2.0
# via matplotlib # via matplotlib
pyrate-limiter==3.7.0
# via schemathesis
pytablewriter==1.2.0 pytablewriter==1.2.0
# via lm-eval # via lm-eval
pytest==8.3.3 pytest==8.3.3
...@@ -448,7 +501,9 @@ pytest==8.3.3 ...@@ -448,7 +501,9 @@ pytest==8.3.3
# pytest-mock # pytest-mock
# pytest-rerunfailures # pytest-rerunfailures
# pytest-shard # pytest-shard
# pytest-subtests
# pytest-timeout # pytest-timeout
# schemathesis
pytest-asyncio==0.24.0 pytest-asyncio==0.24.0
# via -r requirements/test.in # via -r requirements/test.in
pytest-forked==1.6.0 pytest-forked==1.6.0
...@@ -459,10 +514,13 @@ pytest-rerunfailures==14.0 ...@@ -459,10 +514,13 @@ pytest-rerunfailures==14.0
# via -r requirements/test.in # via -r requirements/test.in
pytest-shard==0.1.2 pytest-shard==0.1.2
# via -r requirements/test.in # via -r requirements/test.in
pytest-subtests==0.14.1
# via schemathesis
pytest-timeout==2.3.1 pytest-timeout==2.3.1
# via -r requirements/test.in # via -r requirements/test.in
python-dateutil==2.9.0.post0 python-dateutil==2.9.0.post0
# via # via
# arrow
# botocore # botocore
# matplotlib # matplotlib
# pandas # pandas
...@@ -484,6 +542,7 @@ pyyaml==6.0.2 ...@@ -484,6 +542,7 @@ pyyaml==6.0.2
# peft # peft
# ray # ray
# responses # responses
# schemathesis
# timm # timm
# transformers # transformers
# vocos # vocos
...@@ -514,10 +573,16 @@ requests==2.32.3 ...@@ -514,10 +573,16 @@ requests==2.32.3
# pooch # pooch
# ray # ray
# responses # responses
# schemathesis
# starlette-testclient
# tiktoken # tiktoken
# transformers # transformers
responses==0.25.3 responses==0.25.3
# via genai-perf # via genai-perf
rfc3339-validator==0.1.4
# via jsonschema
rfc3987==1.3.8
# via jsonschema
rich==13.9.4 rich==13.9.4
# via # via
# genai-perf # genai-perf
...@@ -546,6 +611,8 @@ safetensors==0.4.5 ...@@ -546,6 +611,8 @@ safetensors==0.4.5
# peft # peft
# timm # timm
# transformers # transformers
schemathesis==3.39.15
# via -r requirements/test.in
scikit-learn==1.5.2 scikit-learn==1.5.2
# via # via
# librosa # librosa
...@@ -564,18 +631,23 @@ sentencepiece==0.2.0 ...@@ -564,18 +631,23 @@ sentencepiece==0.2.0
# via mistral-common # via mistral-common
setuptools==75.8.0 setuptools==75.8.0
# via # via
# mamba-ssm
# pytablewriter # pytablewriter
# torch # torch
shellingham==1.5.4 shellingham==1.5.4
# via typer # via typer
six==1.16.0 six==1.16.0
# via # via
# junit-xml
# python-dateutil # python-dateutil
# rfc3339-validator
# rouge-score # rouge-score
sniffio==1.3.1 sniffio==1.3.1
# via # via
# anyio # anyio
# httpx # httpx
sortedcontainers==2.4.0
# via hypothesis
soundfile==0.12.1 soundfile==0.12.1
# via # via
# -r requirements/test.in # -r requirements/test.in
...@@ -584,6 +656,12 @@ soxr==0.5.0.post1 ...@@ -584,6 +656,12 @@ soxr==0.5.0.post1
# via librosa # via librosa
sqlitedict==2.1.0 sqlitedict==2.1.0
# via lm-eval # via lm-eval
starlette==0.46.2
# via
# schemathesis
# starlette-testclient
starlette-testclient==0.4.1
# via schemathesis
statsmodels==0.14.4 statsmodels==0.14.4
# via genai-perf # via genai-perf
sympy==1.13.1 sympy==1.13.1
...@@ -610,8 +688,14 @@ tiktoken==0.7.0 ...@@ -610,8 +688,14 @@ tiktoken==0.7.0
# mistral-common # mistral-common
timm==1.0.11 timm==1.0.11
# via -r requirements/test.in # via -r requirements/test.in
tokenizers==0.21.0 tokenizers==0.21.1
# via transformers # via
# -r requirements/test.in
# transformers
tomli==2.2.1
# via schemathesis
tomli-w==1.2.0
# via schemathesis
torch==2.6.0 torch==2.6.0
# via # via
# -r requirements/test.in # -r requirements/test.in
...@@ -620,6 +704,7 @@ torch==2.6.0 ...@@ -620,6 +704,7 @@ torch==2.6.0
# encodec # encodec
# fastsafetensors # fastsafetensors
# lm-eval # lm-eval
# mamba-ssm
# peft # peft
# runai-model-streamer # runai-model-streamer
# sentence-transformers # sentence-transformers
...@@ -652,11 +737,12 @@ tqdm==4.66.6 ...@@ -652,11 +737,12 @@ tqdm==4.66.6
# transformers # transformers
tqdm-multiprocess==0.0.11 tqdm-multiprocess==0.0.11
# via lm-eval # via lm-eval
transformers==4.51.1 transformers==4.51.3
# via # via
# -r requirements/test.in # -r requirements/test.in
# genai-perf # genai-perf
# lm-eval # lm-eval
# mamba-ssm
# peft # peft
# sentence-transformers # sentence-transformers
# transformers-stream-generator # transformers-stream-generator
...@@ -675,6 +761,8 @@ typepy==1.3.2 ...@@ -675,6 +761,8 @@ typepy==1.3.2
# tabledata # tabledata
typer==0.15.2 typer==0.15.2
# via fastsafetensors # via fastsafetensors
types-python-dateutil==2.9.0.20241206
# via arrow
typing-extensions==4.12.2 typing-extensions==4.12.2
# via # via
# huggingface-hub # huggingface-hub
...@@ -687,8 +775,11 @@ typing-extensions==4.12.2 ...@@ -687,8 +775,11 @@ typing-extensions==4.12.2
# typer # typer
tzdata==2024.2 tzdata==2024.2
# via pandas # via pandas
uri-template==1.3.0
# via jsonschema
urllib3==2.2.3 urllib3==2.2.3
# via # via
# blobfile
# botocore # botocore
# requests # requests
# responses # responses
...@@ -697,6 +788,10 @@ vector-quantize-pytorch==1.21.2 ...@@ -697,6 +788,10 @@ vector-quantize-pytorch==1.21.2
# via -r requirements/test.in # via -r requirements/test.in
vocos==0.1.0 vocos==0.1.0
# via -r requirements/test.in # via -r requirements/test.in
webcolors==24.11.1
# via jsonschema
werkzeug==3.1.3
# via schemathesis
word2number==1.1 word2number==1.1
# via lm-eval # via lm-eval
xxhash==3.5.0 xxhash==3.5.0
...@@ -704,6 +799,8 @@ xxhash==3.5.0 ...@@ -704,6 +799,8 @@ xxhash==3.5.0
# datasets # datasets
# evaluate # evaluate
yarl==1.17.1 yarl==1.17.1
# via aiohttp # via
# aiohttp
# schemathesis
zstandard==0.23.0 zstandard==0.23.0
# via lm-eval # via lm-eval
...@@ -17,9 +17,8 @@ ray[data] ...@@ -17,9 +17,8 @@ ray[data]
--find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" torch==2.8.0.dev20250408
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" torchvision==0.22.0.dev20250408
torch @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11" torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250408-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
......
...@@ -276,15 +276,17 @@ class cmake_build_ext(build_ext): ...@@ -276,15 +276,17 @@ class cmake_build_ext(build_ext):
# First, run the standard build_ext command to compile the extensions # First, run the standard build_ext command to compile the extensions
super().run() super().run()
# copy vllm/vllm_flash_attn/*.py from self.build_lib to current # copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
# directory so that they can be included in the editable build # directory so that they can be included in the editable build
import glob import glob
files = glob.glob( files = glob.glob(os.path.join(self.build_lib, "vllm",
os.path.join(self.build_lib, "vllm", "vllm_flash_attn", "*.py")) "vllm_flash_attn", "**", "*.py"),
recursive=True)
for file in files: for file in files:
dst_file = os.path.join("vllm/vllm_flash_attn", dst_file = os.path.join("vllm/vllm_flash_attn",
os.path.basename(file)) file.split("vllm/vllm_flash_attn/")[-1])
print(f"Copying {file} to {dst_file}") print(f"Copying {file} to {dst_file}")
os.makedirs(os.path.dirname(dst_file), exist_ok=True)
self.copy_file(file, dst_file) self.copy_file(file, dst_file)
...@@ -384,13 +386,22 @@ class repackage_wheel(build_ext): ...@@ -384,13 +386,22 @@ class repackage_wheel(build_ext):
"vllm/_flashmla_C.abi3.so", "vllm/_flashmla_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/vllm_flash_attn/flash_attn_interface.py",
"vllm/vllm_flash_attn/__init__.py",
"vllm/cumem_allocator.abi3.so", "vllm/cumem_allocator.abi3.so",
# "vllm/_version.py", # not available in nightly wheels yet # "vllm/_version.py", # not available in nightly wheels yet
] ]
file_members = filter(lambda x: x.filename in files_to_copy,
wheel.filelist) file_members = list(
filter(lambda x: x.filename in files_to_copy, wheel.filelist))
# vllm_flash_attn python code:
# Regex from
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
import re
compiled_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
file_members += list(
filter(lambda x: compiled_regex.match(x.filename),
wheel.filelist))
for file in file_members: for file in file_members:
print(f"Extracting and including {file.filename} " print(f"Extracting and including {file.filename} "
...@@ -563,9 +574,9 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -563,9 +574,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
new_version_content = f""" new_version_content = f"""
try: try:
__version__ = "0.8.4" __version__ = "0.8.5"
__version_tuple__ = (0, 8, 4) __version_tuple__ = (0, 8, 5)
__hcu_version__ = f'0.8.4+{version}' __hcu_version__ = f'0.8.5+{version}'
from vllm.version import __version__, __version_tuple__, __hcu_version__ from vllm.version import __version__, __version_tuple__, __hcu_version__
except Exception as e: except Exception as e:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment