Commit 99324e25 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.2' into v0.9.2-ori

parents cc7f22a8 a5dd03c1
......@@ -202,7 +202,7 @@ def parse_args():
def deserialize():
def deserialize(args, tensorizer_config):
if args.lora_path:
tensorizer_config.lora_dir = tensorizer_config.tensorizer_dir
llm = LLM(model=args.model,
......@@ -242,7 +242,7 @@ def deserialize():
return llm
if __name__ == '__main__':
def main():
args = parse_args()
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
......@@ -260,8 +260,6 @@ if __name__ == '__main__':
model_ref = args.model
model_name = model_ref.split("/")[1]
if args.command == "serialize" or args.command == "deserialize":
keyfile = args.keyfile
else:
......@@ -309,6 +307,10 @@ if __name__ == '__main__':
encryption_keyfile = keyfile,
**credentials
)
deserialize()
deserialize(args, tensorizer_config)
else:
raise ValueError("Either serialize or deserialize must be specified.")
if __name__ == "__main__":
main()
{#- Chat template that renders a conversation as a sequence of
    <beginning_of_sentence>ROLE ...<end_of_sentence> turns after a single
    <begin_of_document> marker.
    Variables read: messages, tools, custom_tools (optional override for
    tools) and add_generation_prompt. -#}
{{ '<begin_of_document>' -}}
{#- custom_tools, when defined, replaces tools; an undefined tools becomes none #}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{#- Extract system message into ns.system_prompt #}
{% set ns = namespace(system_prompt='') -%}
{#- A leading system message may be a plain string or a list of content
    parts; for a list only the text of the first part is used. The message
    is then removed from messages so the loop below does not see it. #}
{%- if messages[0]['role'] == 'system' %}
{%- if messages[0]['content'] is string %}
{%- set ns.system_prompt = messages[0]['content']|trim %}
{%- else %}
{%- set ns.system_prompt = messages[0]['content'][0]['text']|trim %}
{%- endif %}
{%- set messages = messages[1:] %}
{%- else %}
{#- NOTE(review): both branches below assign the same default prompt, so the
    check on tools currently has no effect - confirm whether a tool-specific
    default was intended. #}
{%- if tools is not none %}
{%- set ns.system_prompt = "You are a helpful assistant created by Minimax based on MiniMax-M1 model." %}
{%- else %}
{%- set ns.system_prompt = "You are a helpful assistant created by Minimax based on MiniMax-M1 model." %}
{%- endif %}
{%- endif %}
{#- System message: emitted only when the resolved prompt is non-empty #}
{%- if ns.system_prompt != '' %}
{{ '<beginning_of_sentence>system ai_setting=assistant\n' + ns.system_prompt + '<end_of_sentence>\n' -}}
{%- endif %}
{#- Tools configuration: each tool is written as one JSON line inside <tools>,
    followed by the instructions describing the <tool_calls> reply format #}
{%- if tools is not none %}
{{ '<beginning_of_sentence>system tool_setting=tools\nYou are provided with these tools:\n<tools>\n' -}}
{%- for tool in tools %}
{{ tool | tojson ~ '\n' -}}
{%- endfor %}
{{ '</tools>\n\nIf you need to call tools, please respond with <tool_calls></tool_calls> XML tags, and provide tool-name and json-object of arguments, following the format below:\n<tool_calls>\n{"name": <tool-name>, "arguments": <args-json-object>}\n...\n</tool_calls><end_of_sentence>\n' -}}
{%- endif %}
{#- Process messages #}
{%- for message in messages %}
{#- Plain turns: no tool_calls key and role is neither tool nor ipython.
    Only the user and assistant roles produce output in this branch; for
    list-valued content only parts whose type is text are rendered. #}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{%- if message['role'] == 'user' %}
{{ '<beginning_of_sentence>user name=user\n' -}}
{%- if message['content'] is string %}
{{ message['content']|trim -}}
{%- else %}
{%- for content in message['content'] %}
{%- if content['type'] == 'text' %}
{{ content['text']|trim -}}
{%- endif %}
{%- endfor %}
{%- endif %}
{{ '<end_of_sentence>\n' -}}
{%- elif message['role'] == 'assistant' %}
{{ '<beginning_of_sentence>ai name=assistant\n' -}}
{%- if message['content'] is string %}
{{ message['content']|trim -}}
{%- else %}
{%- for content in message['content'] | selectattr('type', 'equalto', 'text') %}
{{ content['text']|trim -}}
{%- endfor %}
{%- endif %}
{{ '<end_of_sentence>\n' -}}
{%- endif %}
{#- Assistant turn carrying tool calls: one JSON object per call inside
    <tool_calls>. Any content on such a message is not rendered. #}
{%- elif 'tool_calls' in message %}
{{ '<beginning_of_sentence>ai name=assistant\n<tool_calls>\n' -}}
{#- The tojson filter binds tighter than +, so it applies to the arguments
    only. NOTE(review): if callers pass arguments as an already-encoded JSON
    string it will be emitted as a quoted string - confirm the expected type. #}
{%- for tool_call in message.tool_calls %}
{{ '{"name": "' + tool_call.function.name + '", "arguments": ' + tool_call.function.arguments | tojson + '}\n' -}}
{%- endfor %}
{{ '</tool_calls><end_of_sentence>\n' -}}
{#- Tool results (role tool or ipython): a string is emitted as one result;
    for a list, text parts are emitted as results and other parts that have
    a truthy name are emitted with both the tool name and the text. #}
{%- elif message.role == "tool" or message.role == "ipython" %}
{{ '<beginning_of_sentence>tool name=tools\n' -}}
{%- if message.content is string %}
{{ 'tool result: ' + message.content + '\n\n' -}}
{%- else %}
{%- for content in message['content'] %}
{%- if content['type'] == 'text' %}
{{ 'tool result: ' + content['text'] + '\n\n' -}}
{%- elif content.get('name') %}
{{ 'tool name: ' + content['name'] + '\ntool result: ' + content['text'] + '\n\n' -}}
{%- endif %}
{%- endfor %}
{%- endif %}
{{ '<end_of_sentence>\n' -}}
{%- endif %}
{%- endfor %}
{#- Open an assistant turn so generation continues from here #}
{%- if add_generation_prompt %}
{{ '<beginning_of_sentence>ai name=assistant\n' -}}
{%- endif %}
\ No newline at end of file
{#- Chat template that wraps each turn as
    <|start_header_id|>ROLE<|end_header_id|> ... <|eot_id|>.
    Variables read: bos_token, messages, tools, custom_tools (optional
    override for tools) and add_generation_prompt. #}
{{- bos_token }}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{#- NOTE(review): tools_in_user_message is defaulted here but not read anywhere
    else in this template. #}
{%- if not tools_in_user_message is defined %}
{%- set tools_in_user_message = true %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{#- Extract system message. The system header is always emitted. A leading
    system message supplies the text and is removed from messages; in that
    case the tool-calling format instruction is NOT added, although the tool
    definitions are still listed further down. #}
{{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0]['content'] | trim %}
{%- set messages = messages[1:] %}
{{- system_message + "\n" }}
{#- No system message supplied: use the default prompt and, when tools are
    given, append the tool-calling format instruction defined below. #}
{%- else %}
{%- set system_message = "You are a helpful assistant. You are developed by Salesforce xLAM team." %}
{% set format_instruction %}You have access to a set of tools. When using tools, make calls in a single JSON array:
[{"name": "tool_call_name", "arguments": {"arg1": "value1", "arg2": "value2"}}, ... (additional parallel tool calls as needed)]
If no tool is suitable, state that explicitly. If the user's input lacks required parameters, ask for clarification. Do not interpret or respond until tool results are returned. Once they are available, process them or make additional calls if needed. For tasks that don't require tools, such as casual conversation or general advice, respond directly in plain text. The available tools are:{% endset %}
{{- system_message + "\n" }}
{%- if tools is not none %}
{{- format_instruction + "\n\n" }}
{%- endif %}
{%- endif %}
{#- Tool definitions: each tool is pretty-printed as JSON, separated by a
    blank line, inside the system turn. #}
{%- if tools is not none %}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{{- "<|eot_id|>" }}
{%- for message in messages %}
{#- Plain turns: the message role is used verbatim as the header. The trim
    filter binds tighter than +, so it applies to the content only. #}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
{#- Messages with a tool_calls key are rendered as an assistant turn: a JSON
    array of calls when the list is non-empty, otherwise the content if any,
    otherwise an empty array. #}
{%- elif 'tool_calls' in message %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
{%- if message['tool_calls'] %}
{{- "[" }}
{%- for tool_call_function in message.tool_calls %}
{%- set tool_call = tool_call_function.function %}
{{- '{"name": "' + tool_call.name + '", ' }}
{{- '"arguments": ' }}
{{- tool_call.arguments | tojson }}
{{- "}" }}
{%- if not loop.last %}
{{- ", " }}
{%- endif %}
{%- endfor %}
{{- "]" }}
{{- "<|eot_id|>" }}
{%- elif message['content'] %}
{{- message['content'] | trim + '<|eot_id|>' }}
{%- else %}
{{- "[]\n" + '<|eot_id|>' }}
{%- endif %}
{#- Tool results (role tool or ipython) are always emitted under the ipython
    header; mappings and non-string iterables are serialised as JSON, any
    other content is emitted as is. #}
{%- elif message.role == "tool" or message.role == "ipython" %}
{{- "<|start_header_id|>" + "ipython" + "<|end_header_id|>\n\n" }}
{%- set content = message["content"] %}
{%- if content is mapping or (content is iterable and content is not string) %}
{{- content | tojson }}
{%- else %}
{{- content }}
{%- endif %}
{{- "<|eot_id|>" }}
{%- endif %}
{%- endfor %}
{#- Open an assistant turn so generation continues from here #}
{%- if add_generation_prompt %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
\ No newline at end of file
{# System message.
   This template wraps each turn as <|im_start|>ROLE ... <|im_end|>.
   Variables read: messages, tools and add_generation_prompt.
   NOTE(review): tools is tested with "is not none" below without being given a
   default first; this assumes the caller always passes tools (possibly as
   none) - confirm. #}
{{- "<|im_start|>system\n" }}
{#- A leading system message supplies the text and is removed from messages;
    in that case the tool-calling format instruction is NOT added, although
    the tool definitions are still listed further down. #}
{%- if messages[0]['role'] == 'system' %}
{%- set system_message = messages[0]['content'] | trim %}
{%- set messages = messages[1:] %}
{{- system_message + "\n" }}
{#- No system message supplied: use the default prompt and, when tools are
    given, append the tool-calling format instruction defined below. #}
{%- else %}
{%- set system_message = "You are a helpful assistant. You are developed by Salesforce xLAM team." %}
{% set format_instruction %}You have access to a set of tools. When using tools, make calls in a single JSON array:
[{"name": "tool_call_name", "arguments": {"arg1": "value1", "arg2": "value2"}}, ... (additional parallel tool calls as needed)]
If no tool is suitable, state that explicitly. If the user's input lacks required parameters, ask for clarification. Do not interpret or respond until tool results are returned. Once they are available, process them or make additional calls if needed. For tasks that don't require tools, such as casual conversation or general advice, respond directly in plain text. The available tools are:{% endset %}
{{- system_message + "\n" }}
{%- if tools is not none %}
{{- format_instruction + "\n\n" }}
{%- endif %}
{%- endif %}
{#- Tool definitions: each tool is pretty-printed as JSON, separated by a
    blank line, inside the system turn. #}
{%- if tools is not none %}
{%- for func in tools %}
{{- func | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{{- "<|im_end|>\n" }}
{%- for message in messages %}
{#- Tool results: a nested message.content.content, when defined, is used in
    place of message.content. Mappings and non-string iterables are
    serialised as JSON ("and" binds tighter than "or" in the test below). #}
{%- if message['role'] == 'tool' %}
{{- "<|im_start|>tool\n" }}
{%- if message.content is defined and message.content.content is defined %}
{%- set content = message.content.content %}
{%- else %}
{%- set content = message.content %}
{%- endif %}
{%- if content is mapping or content is iterable and content is not string %}
{{- content | tojson }}
{%- else %}
{{- content }}
{%- endif %}
{{- "<|im_end|>\n" }}
{#- Messages with a tool_calls key are rendered as an assistant turn: a JSON
    array holding each call's whole function object when the list is
    non-empty, otherwise the content if any, otherwise an empty array. #}
{%- elif 'tool_calls' in message %}
{{- "<|im_start|>assistant\n" }}
{%- if message['tool_calls'] %}
{{- "[" }}
{%- for tool_call in message.tool_calls %}
{%- set out = tool_call.function | tojson %}
{{- out }}
{%- if not loop.last %}
{{- ", " }}
{%- endif %}
{%- endfor %}
{{- "]"}}
{%- elif message['content'] %}
{{- message['content'] | trim }}
{%- else %}
{{- "[]\n" }}
{%- endif %}
{{- "<|im_end|>\n" }}
{#- Any other message: the role is used verbatim. The trim filter binds
    tighter than +, so it applies to the content only. #}
{%- else %}
{{- "<|im_start|>" + message['role'] + "\n" + message['content'] | trim + "<|im_end|>\n" }}
{%- endif %}
{%- endfor %}
{#- Open an assistant turn so generation continues from here #}
{%- if add_generation_prompt %}
{{- "<|im_start|>assistant\n" }}
{%- endif %}
site_name: vLLM
site_url: https://docs.vllm.ai
repo_url: https://github.com/vllm-project/vllm
edit_uri: edit/main/docs/
exclude_docs: |
*.inc.md
*.template.md
......@@ -29,10 +30,12 @@ theme:
icon: material/brightness-2
name: Switch to system preference
features:
- content.action.edit
- content.code.copy
- content.tabs.link
- navigation.tracking
- navigation.tabs
- navigation.tabs.sticky
- navigation.sections
- navigation.prune
- navigation.top
......@@ -123,6 +126,8 @@ extra_css:
extra_javascript:
- mkdocs/javascript/run_llm_widget.js
- https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML
- mkdocs/javascript/edit_and_feedback.js
- mkdocs/javascript/slack_and_forum.js
# Makes the url format end in .html rather than act as a dir
# So index.md generates as index.html and is available under URL /index.html
......
......@@ -76,7 +76,7 @@ line-length = 80
"vllm/spec_decode/**/*.py" = ["UP006", "UP035"]
"vllm/worker/**/*.py" = ["UP006", "UP035"]
# Python 3.8 typing - skip utils for ROCm
"vllm/utils.py" = ["UP006", "UP035"]
"vllm/utils/__init__.py" = ["UP006", "UP035"]
[tool.ruff.lint]
select = [
......@@ -137,10 +137,6 @@ exclude = [
'vllm/attention/ops/.*\.py$'
]
[tool.codespell]
ignore-words-list = "dout, te, indicies, subtile, ElementE"
skip = "tests/models/fixtures/*,tests/prompts/*,benchmarks/sonnet.txt,tests/lora/data/*,build/*,vllm/third_party/*"
[tool.isort]
skip_glob = [
".buildkite/*",
......@@ -154,6 +150,7 @@ skip_gitignore = true
markers = [
"skip_global_cleanup",
"core_model: enable this model test in each PR instead of only nightly",
"hybrid_model: models that contain mamba layers (including pure SSM and hybrid architectures)",
"cpu_model: enable this model test in CPU tests",
"split: run this test as part of a split",
"distributed: run this test only in distributed GPU tests",
......
......@@ -8,12 +8,12 @@ tqdm
blake3
py-cpuinfo
transformers >= 4.51.1
huggingface-hub[hf_xet] >= 0.32.0 # Required for Xet downloads.
huggingface-hub[hf_xet] >= 0.33.0 # Required for Xet downloads.
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp
openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
openai >= 1.52.0, <= 1.90.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
pydantic >= 2.10
prometheus_client >= 0.18.0
pillow # Required for image processing
......@@ -23,7 +23,7 @@ lm-format-enforcer >= 0.10.11, < 0.11
llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines == 0.1.11
lark == 1.2.2
xgrammar == 0.1.19; platform_machine == "x86_64" or platform_machine == "aarch64"
xgrammar == 0.1.19; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64"
typing_extensions >= 4.10
filelock >= 3.16.1 # needs to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
......@@ -31,20 +31,17 @@ pyzmq >= 25.0.0
msgspec
gguf >= 0.13.0
importlib_metadata; python_version < '3.10'
mistral_common[opencv] >= 1.5.4
mistral_common[opencv] >= 1.6.2
opencv-python-headless >= 4.11.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
compressed-tensors == 0.10.1 # required for compressed-tensors
compressed-tensors == 0.10.2 # required for compressed-tensors
depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/others/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu
opentelemetry-sdk>=1.26.0 # vllm.tracing
opentelemetry-api>=1.26.0 # vllm.tracing
opentelemetry-exporter-otlp>=1.26.0 # vllm.tracing
opentelemetry-semantic-conventions-ai>=0.4.1 # vllm.tracing
pybase64 # fast base64 implementation
# Temporarily used for x86 CPU backend to avoid performance regression of torch>2.6.0+cpu,
# see https://github.com/pytorch/pytorch/pull/151218
cmake>=3.26.1
ninja
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu
wheel
jinja2>=3.1.6
regex
......@@ -8,7 +8,7 @@ numba == 0.61.2; python_version > '3.9'
packaging>=24.2
setuptools>=77.0.3,<80.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.7.0+cpu; platform_machine == "x86_64"
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
torch==2.7.0; platform_system == "Darwin"
torch==2.7.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
......@@ -21,11 +21,9 @@ torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
torchvision==0.22.0; platform_machine == "ppc64le"
datasets # for benchmark scripts
# cpu cannot use triton 3.3.0
triton==3.2.0; platform_machine == "x86_64"
# Intel Extension for PyTorch, only for x86_64 CPUs
intel-openmp==2024.2.1; platform_machine == "x86_64"
intel_extension_for_pytorch==2.7.0; platform_machine == "x86_64"
intel_extension_for_pytorch==2.6.0; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
py-libnuma; platform_system != "Darwin"
psutil; platform_system != "Darwin"
triton==3.2.0; platform_machine == "x86_64" # Triton is required for torch 2.6+cpu, as it is imported in torch.compile.
lmcache
\ No newline at end of file
# Dependencies required to run the entrypoints tests
# pytest and its extensions
# testing
pytest
pytest-asyncio
tensorizer>=2.9.0
pytest-forked
pytest-mock
pytest-asyncio
pytest-rerunfailures
pytest-shard
pytest-timeout
librosa # required by audio tests in entrypoints/openai
sentence-transformers # required for embedding tests
transformers==4.51.3
transformers_stream_generator # required for qwen-vl test
numba == 0.61.2; python_version > '3.9'
# testing utils
boto3
botocore
datasets
ray >= 2.10.0
backoff # required for phi4mm test
blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio tests
vocos # required for minicpmo_26 test
peft
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
tensorizer>=2.9.0
lm-eval==0.4.8
buildkite-test-collector==0.1.9
pqdm
ray[cgraph,default]>=2.43.0, !=2.44.* # Ray Compiled Graph, required by pipeline parallelism tests
sentence-transformers # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.6.2 # required for pixtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
transformers==4.52.4
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes>=0.46.1
buildkite-test-collector==0.1.9
# required for quantization test
bitsandbytes>=0.45.3
# required for minicpmo_26 test
vector_quantize_pytorch
vocos
# required for Basic Models Test
blobfile # required for kimi-vl test
matplotlib # required for qwen-vl test
genai_perf==0.0.8
tritonclient==2.51.0
# required for Multi-Modal Models Test (Standard)
num2words # required for smolvlm test
pqdm
timm # required for internvl test
schemathesis>=3.39.15 # Required for openai schema test.
mteb>=1.38.11, <2 # required for mteb test
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'
numpy
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10
......@@ -28,20 +28,21 @@ torchvision==0.22.0
transformers_stream_generator # required for qwen-vl test
mamba_ssm # required for plamo2 test
matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.4 # required for pixtral test
mistral_common[opencv] >= 1.6.2 # required for pixtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
mteb[bm25s]>=1.38.11, <2 # required for mteb test
transformers==4.52.4
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
huggingface-hub[hf_xet]>=0.33.0 # Required for Xet downloads.
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes>=0.45.3
bitsandbytes==0.46.1
buildkite-test-collector==0.1.9
genai_perf==0.0.8
tritonclient==2.51.0
......@@ -51,4 +52,4 @@ numpy
runai-model-streamer==0.11.0
runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10
\ No newline at end of file
pydantic>=2.10 # 2.9 leads to error on python 3.10
......@@ -45,12 +45,14 @@ backoff==2.2.1
# via
# -r requirements/test.in
# schemathesis
bitsandbytes==0.45.3
bitsandbytes==0.46.1
# via -r requirements/test.in
black==24.10.0
# via datamodel-code-generator
blobfile==3.0.0
# via -r requirements/test.in
bm25s==0.2.13
# via mteb
boto3==1.35.57
# via tensorizer
botocore==1.35.57
......@@ -190,7 +192,7 @@ h11==0.14.0
# via httpcore
harfile==0.3.0
# via schemathesis
hf-xet==0.1.4
hf-xet==1.1.3
# via huggingface-hub
hiredis==3.0.0
# via tensorizer
......@@ -200,7 +202,7 @@ httpx==0.27.2
# via
# -r requirements/test.in
# schemathesis
huggingface-hub==0.30.1
huggingface-hub==0.33.0
# via
# -r requirements/test.in
# accelerate
......@@ -303,7 +305,7 @@ mbstrdecoder==1.1.3
# typepy
mdurl==0.1.2
# via markdown-it-py
mistral-common==1.5.4
mistral-common==1.6.2
# via -r requirements/test.in
more-itertools==10.5.0
# via lm-eval
......@@ -344,6 +346,7 @@ numpy==1.26.4
# -r requirements/test.in
# accelerate
# bitsandbytes
# bm25s
# contourpy
# cupy-cuda12x
# datasets
......@@ -534,6 +537,8 @@ pyparsing==3.2.0
# via matplotlib
pyrate-limiter==3.7.0
# via schemathesis
pystemmer==3.0.0
# via mteb
pytablewriter==1.2.0
# via lm-eval
pytest==8.3.3
......@@ -668,6 +673,7 @@ scikit-learn==1.5.2
# sentence-transformers
scipy==1.13.1
# via
# bm25s
# librosa
# mteb
# scikit-learn
......
......@@ -18,9 +18,9 @@ setuptools==78.1.0
--find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch==2.8.0.dev20250605
torchvision==0.23.0.dev20250605
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250605-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250605-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250605-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
torch==2.8.0.dev20250618
torchvision==0.23.0.dev20250618
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.8.0.dev20250618-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
......@@ -9,6 +9,7 @@ setuptools>=77.0.3,<80.0.0
wheel
jinja2>=3.1.6
datasets # for benchmark scripts
numba == 0.60.0 # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
torch==2.7.0+xpu
torchaudio
......
......@@ -6,8 +6,8 @@ import os
import uuid
from asyncio import CancelledError
from copy import copy
from dataclasses import dataclass
from typing import Optional
from dataclasses import dataclass, field
from typing import Any, Optional
import pytest
import pytest_asyncio
......@@ -32,6 +32,7 @@ class RequestOutput:
@dataclass
class MockModelConfig:
    """Mock model configuration exposing the attributes listed below.

    NOTE(review): presumably stands in for the real model config wherever
    the tests in this file need one - confirm against the callers.
    """
    # Has no type annotation, so the dataclass decorator leaves it as a plain
    # class attribute: it is not a field and not an __init__ parameter.
    use_async_output_proc = True
    # Dataclass field; default_factory gives every instance its own empty dict
    # instead of sharing one mutable default.
    media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
class MockEngine:
......
......@@ -49,7 +49,13 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch):
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset the distributed environment properly. Use a value > 1 only for local testing.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
@pytest.mark.parametrize("attention_backend", [
pytest.param("FLASHINFER",
marks=pytest.mark.skipif(
current_platform.is_rocm(),
reason="FLASHINFER isn't supported on ROCm")),
"FLASH_ATTN"
])
def test_models(
hf_runner: HfRunner,
vllm_runner: VllmRunner,
......@@ -99,7 +105,13 @@ def test_models(
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
@pytest.mark.parametrize("attention_backend", [
pytest.param("FLASHINFER",
marks=pytest.mark.skipif(
current_platform.is_rocm(),
reason="FLASHINFER isn't supported on ROCm")),
"FLASH_ATTN"
])
def test_models_distributed(
hf_runner: HfRunner,
vllm_runner: VllmRunner,
......@@ -172,6 +184,8 @@ def test_models_distributed(
# Due to low-precision numerical divergence, this test is too sensitive to
# the async postprocessor
@pytest.mark.parametrize("disable_async_output_proc", [True])
@pytest.mark.skipif(current_platform.is_rocm(),
reason="machete_prepack_B isn't supported on ROCm")
def test_models_with_fp8_kv_cache(
vllm_runner: VllmRunner,
example_prompts,
......
......@@ -31,6 +31,8 @@ def test_bench_serve(server):
server.host,
"--port",
str(server.port),
"--dataset-name",
"random",
"--random-input-len",
"32",
"--random-output-len",
......
......@@ -25,7 +25,7 @@ infiles += [
infiles += [
"vllm/model_executor/layers/sampler.py",
"vllm/sampling_params.py",
"vllm/utils.py",
"vllm/utils/__init__.py",
]
setup(ext_modules=cythonize(infiles,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
from copy import deepcopy
from typing import Callable, Union
from torch import fx
from torch._ops import OpOverload
from vllm.compilation.fx_utils import (find_specified_fn,
find_specified_fn_maybe)
from vllm.compilation.fx_utils import find_op_nodes
from vllm.compilation.inductor_pass import InductorPass
from vllm.config import get_current_vllm_config
......@@ -48,18 +49,19 @@ class TestBackend:
# assign by reference, will reflect the final state of the graph
self.final_graph = graph
def check_before_ops(self, ops,
find_fn=find_specified_fn, \
find_fn_maybe=find_specified_fn_maybe, \
ops_fully_replaced=True):
def check_before_ops(self, ops: Sequence[OpOverload], fully_replaced=True):
for op in ops:
find_fn(self.graph_pre_pass.nodes, op)
if ops_fully_replaced:
assert find_fn_maybe(self.graph_post_pass.nodes, op) is None
num_pre = len(list(find_op_nodes(op, self.graph_pre_pass)))
num_post = len(list(find_op_nodes(op, self.graph_post_pass)))
assert num_pre > 0, f"Op {op.name()} not found in pre-pass graph"
assert num_pre > num_post, f"All nodes remain for op {op.name()}"
if fully_replaced:
assert num_post == 0, \
f"Unexpected op {op.name()} in post-pass graph"
def check_after_ops(self, ops,
find_fn=find_specified_fn, \
find_fn_maybe=find_specified_fn_maybe):
def check_after_ops(self, ops: Sequence[OpOverload]):
for op in ops:
find_fn(self.graph_post_pass.nodes, op)
assert find_fn_maybe(self.graph_pre_pass.nodes, op) is None
num_pre = len(list(find_op_nodes(op, self.graph_pre_pass)))
num_post = len(list(find_op_nodes(op, self.graph_post_pass)))
assert num_pre == 0, f"Unexpected op {op.name()} in pre-pass graph"
assert num_post > 0, f"Op {op.name()} not found in post-pass graph"
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment