Commit afd0da21 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.1' into v0.7.1-dev

parents 1a11f127 4f4d427a
...@@ -5,7 +5,7 @@ requests >= 2.26.0 ...@@ -5,7 +5,7 @@ requests >= 2.26.0
tqdm tqdm
blake3 blake3
py-cpuinfo py-cpuinfo
transformers == 4.47.0 # Required for Llama 3.2 and Qwen2-VL. transformers >= 4.48.2 # Required for Bamba.
tokenizers >= 0.19.1 # Required for Llama 3. tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer. protobuf # Required by LlamaTokenizer.
fastapi >= 0.107.0, < 0.113.0; python_version < '3.9' fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
...@@ -19,7 +19,7 @@ pillow # Required for image processing ...@@ -19,7 +19,7 @@ pillow # Required for image processing
prometheus-fastapi-instrumentator >= 7.0.0 prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.9, < 0.11 lm-format-enforcer >= 0.10.9, < 0.11
outlines == 0.1.11 # Requires pytorch outlines == 0.1.11
lark == 1.2.2 lark == 1.2.2
xgrammar >= 0.1.6; platform_machine == "x86_64" xgrammar >= 0.1.6; platform_machine == "x86_64"
typing_extensions >= 4.10 typing_extensions >= 4.10
...@@ -34,6 +34,6 @@ pyyaml ...@@ -34,6 +34,6 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL. einops # Required for Qwen2-VL.
compressed-tensors == 0.8.1 # required for compressed-tensors, requires pytorch compressed-tensors == 0.9.0 # required for compressed-tensors
depyf==0.18.0 # required for profiling and debugging with compilation config depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
...@@ -2,7 +2,14 @@ ...@@ -2,7 +2,14 @@
-r requirements-common.txt -r requirements-common.txt
# Dependencies for CPUs # Dependencies for CPUs
torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" torch==2.5.1+cpu; platform_machine != "ppc64le" and platform_machine != "aarch64" and platform_system != "Darwin"
torch==2.5.1; platform_machine == "aarch64" torch==2.5.1; platform_machine == "ppc64le" or platform_machine == "aarch64" or platform_system == "Darwin"
torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch
datasets # for benchmark scripts # required for the image processor of minicpm-o-2_6, this must be updated alongside torch
\ No newline at end of file torchaudio; platform_machine != "ppc64le"
torchaudio==2.5.1; platform_machine == "ppc64le"
# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le"
torchvision==0.20.1; platform_machine == "ppc64le"
datasets # for benchmark scripts
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
ray[default] >= 2.9 ray[default] >= 2.9
nvidia-ml-py >= 12.560.30 # for pynvml package nvidia-ml-py >= 12.560.30 # for pynvml package
torch == 2.5.1 torch == 2.5.1
torchaudio==2.5.1
# These must be updated alongside torch # These must be updated alongside torch
torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1 xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch 2.5.1
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
# Dependencies for HPU code # Dependencies for HPU code
ray ray
triton triton==3.1.0
pandas pandas
tabulate tabulate
setuptools>=61 setuptools>=61
......
# formatting # formatting
yapf==0.32.0 pre-commit==4.0.1
toml==0.10.2
tomli==2.0.2
ruff==0.6.5
codespell==2.3.0
isort==5.13.2
clang-format==18.1.5
sphinx-lint==1.0.0
# type checking
mypy==1.11.1
types-PyYAML
types-requests
types-setuptools
...@@ -2,6 +2,6 @@ ...@@ -2,6 +2,6 @@
-r requirements-common.txt -r requirements-common.txt
# Dependencies for Neuron devices # Dependencies for Neuron devices
transformers-neuronx >= 0.12.0 transformers-neuronx >= 0.13.0
torch-neuronx >= 2.1.2 torch-neuronx >= 2.5.0
neuronx-cc neuronx-cc
...@@ -12,20 +12,27 @@ decord # required for video tests ...@@ -12,20 +12,27 @@ decord # required for video tests
einops # required for MPT, qwen-vl and Mamba einops # required for MPT, qwen-vl and Mamba
httpx httpx
librosa # required for audio tests librosa # required for audio tests
vector_quantize_pytorch # required for minicpmo_26 test
vocos # required for minicpmo_26 test
peft peft
pqdm
ray[adag]==2.40.0 ray[adag]==2.40.0
sentence-transformers # required for embedding tests sentence-transformers # required for embedding tests
soundfile # required for audio tests soundfile # required for audio tests
timm # required for internvl test timm # required for internvl test
torch==2.5.1 torch==2.5.1
torchaudio==2.5.1
transformers_stream_generator # required for qwen-vl test transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test matplotlib # required for qwen-vl test
mistral_common[opencv] >= 1.5.0 # required for pixtral test mistral_common[opencv] >= 1.5.0 # required for pixtral test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.4 # required for model evaluation test lm-eval[api]==0.4.4 # required for model evaluation test
transformers==4.48.2
# quantization # quantization
bitsandbytes>=0.45.0 bitsandbytes>=0.45.0
buildkite-test-collector==0.1.9 buildkite-test-collector==0.1.9
genai_perf==0.0.8
tritonclient==2.51.0
numpy < 2.0.0 numpy < 2.0.0
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.12 # This file is autogenerated by pip-compile with Python 3.12
# by the following command: # by the following command:
# #
# python3.12 -m piptools compile requirements-test.in -o requirements-test.txt # python3.12 -m piptools compile requirements-test.in -o requirements-test.txt
# #
absl-py==2.1.0 absl-py==2.1.0
# via rouge-score # via rouge-score
...@@ -37,7 +37,7 @@ audioread==3.0.1 ...@@ -37,7 +37,7 @@ audioread==3.0.1
# via librosa # via librosa
awscli==1.35.23 awscli==1.35.23
# via -r requirements-test.in # via -r requirements-test.in
bitsandbytes>=0.45.0 bitsandbytes==0.45.0
# via -r requirements-test.in # via -r requirements-test.in
black==24.10.0 black==24.10.0
# via datamodel-code-generator # via datamodel-code-generator
...@@ -48,6 +48,8 @@ botocore==1.35.57 ...@@ -48,6 +48,8 @@ botocore==1.35.57
# awscli # awscli
# boto3 # boto3
# s3transfer # s3transfer
bounded-pool-executor==0.0.3
# via pqdm
buildkite-test-collector==0.1.9 buildkite-test-collector==0.1.9
# via -r requirements-test.in # via -r requirements-test.in
certifi==2024.8.30 certifi==2024.8.30
...@@ -73,6 +75,8 @@ colorama==0.4.6 ...@@ -73,6 +75,8 @@ colorama==0.4.6
# tqdm-multiprocess # tqdm-multiprocess
contourpy==1.3.0 contourpy==1.3.0
# via matplotlib # via matplotlib
cramjam==2.9.0
# via fastparquet
cupy-cuda12x==13.3.0 cupy-cuda12x==13.3.0
# via ray # via ray
cycler==0.12.1 cycler==0.12.1
...@@ -102,11 +106,21 @@ dnspython==2.7.0 ...@@ -102,11 +106,21 @@ dnspython==2.7.0
docutils==0.16 docutils==0.16
# via awscli # via awscli
einops==0.8.0 einops==0.8.0
# via -r requirements-test.in # via
# -r requirements-test.in
# encodec
# vector-quantize-pytorch
# vocos
einx==0.3.0
# via vector-quantize-pytorch
email-validator==2.2.0 email-validator==2.2.0
# via pydantic # via pydantic
encodec==0.1.1
# via vocos
evaluate==0.4.3 evaluate==0.4.3
# via lm-eval # via lm-eval
fastparquet==2024.11.0
# via genai-perf
fastrlock==0.8.2 fastrlock==0.8.2
# via cupy-cuda12x # via cupy-cuda12x
filelock==3.16.1 filelock==3.16.1
...@@ -119,6 +133,8 @@ filelock==3.16.1 ...@@ -119,6 +133,8 @@ filelock==3.16.1
# triton # triton
fonttools==4.54.1 fonttools==4.54.1
# via matplotlib # via matplotlib
frozendict==2.4.6
# via einx
frozenlist==1.5.0 frozenlist==1.5.0
# via # via
# aiohttp # aiohttp
...@@ -128,8 +144,11 @@ fsspec[http]==2024.9.0 ...@@ -128,8 +144,11 @@ fsspec[http]==2024.9.0
# via # via
# datasets # datasets
# evaluate # evaluate
# fastparquet
# huggingface-hub # huggingface-hub
# torch # torch
genai-perf==0.0.8
# via -r requirements-test.in
genson==1.3.0 genson==1.3.0
# via datamodel-code-generator # via datamodel-code-generator
h11==0.14.0 h11==0.14.0
...@@ -150,6 +169,7 @@ huggingface-hub==0.26.2 ...@@ -150,6 +169,7 @@ huggingface-hub==0.26.2
# timm # timm
# tokenizers # tokenizers
# transformers # transformers
# vocos
idna==3.10 idna==3.10
# via # via
# anyio # anyio
...@@ -184,6 +204,8 @@ jsonschema==4.23.0 ...@@ -184,6 +204,8 @@ jsonschema==4.23.0
# ray # ray
jsonschema-specifications==2024.10.1 jsonschema-specifications==2024.10.1
# via jsonschema # via jsonschema
kaleido==0.2.1
# via genai-perf
kiwisolver==1.4.7 kiwisolver==1.4.7
# via matplotlib # via matplotlib
lazy-loader==0.4 lazy-loader==0.4
...@@ -198,6 +220,8 @@ lm-eval[api]==0.4.4 ...@@ -198,6 +220,8 @@ lm-eval[api]==0.4.4
# via -r requirements-test.in # via -r requirements-test.in
lxml==5.3.0 lxml==5.3.0
# via sacrebleu # via sacrebleu
markdown-it-py==3.0.0
# via rich
markupsafe==3.0.2 markupsafe==3.0.2
# via jinja2 # via jinja2
matplotlib==3.9.2 matplotlib==3.9.2
...@@ -207,6 +231,8 @@ mbstrdecoder==1.1.3 ...@@ -207,6 +231,8 @@ mbstrdecoder==1.1.3
# dataproperty # dataproperty
# pytablewriter # pytablewriter
# typepy # typepy
mdurl==0.1.2
# via markdown-it-py
mistral-common[opencv]==1.5.1 mistral-common[opencv]==1.5.1
# via # via
# -r requirements-test.in # -r requirements-test.in
...@@ -246,7 +272,11 @@ numpy==1.26.4 ...@@ -246,7 +272,11 @@ numpy==1.26.4
# cupy-cuda12x # cupy-cuda12x
# datasets # datasets
# decord # decord
# einx
# encodec
# evaluate # evaluate
# fastparquet
# genai-perf
# librosa # librosa
# matplotlib # matplotlib
# mistral-common # mistral-common
...@@ -254,15 +284,19 @@ numpy==1.26.4 ...@@ -254,15 +284,19 @@ numpy==1.26.4
# numexpr # numexpr
# opencv-python-headless # opencv-python-headless
# pandas # pandas
# patsy
# peft # peft
# rouge-score # rouge-score
# sacrebleu # sacrebleu
# scikit-learn # scikit-learn
# scipy # scipy
# soxr # soxr
# statsmodels
# tensorizer # tensorizer
# torchvision # torchvision
# transformers # transformers
# tritonclient
# vocos
nvidia-cublas-cu12==12.4.5.8 nvidia-cublas-cu12==12.4.5.8
# via # via
# nvidia-cudnn-cu12 # nvidia-cudnn-cu12
...@@ -304,30 +338,39 @@ packaging==24.1 ...@@ -304,30 +338,39 @@ packaging==24.1
# datamodel-code-generator # datamodel-code-generator
# datasets # datasets
# evaluate # evaluate
# fastparquet
# huggingface-hub # huggingface-hub
# lazy-loader # lazy-loader
# matplotlib # matplotlib
# peft # peft
# plotly
# pooch # pooch
# pytest # pytest
# pytest-rerunfailures # pytest-rerunfailures
# ray # ray
# statsmodels
# transformers # transformers
# typepy # typepy
pandas==2.2.3 pandas==2.2.3
# via # via
# datasets # datasets
# evaluate # evaluate
# fastparquet
# genai-perf
# statsmodels
pathspec==0.12.1 pathspec==0.12.1
# via black # via black
pathvalidate==3.2.1 pathvalidate==3.2.1
# via pytablewriter # via pytablewriter
patsy==1.0.1
# via statsmodels
peft==0.13.2 peft==0.13.2
# via # via
# -r requirements-test.in # -r requirements-test.in
# lm-eval # lm-eval
pillow==10.4.0 pillow==10.4.0
# via # via
# genai-perf
# matplotlib # matplotlib
# mistral-common # mistral-common
# sentence-transformers # sentence-transformers
...@@ -336,12 +379,16 @@ platformdirs==4.3.6 ...@@ -336,12 +379,16 @@ platformdirs==4.3.6
# via # via
# black # black
# pooch # pooch
plotly==5.24.1
# via genai-perf
pluggy==1.5.0 pluggy==1.5.0
# via pytest # via pytest
pooch==1.8.2 pooch==1.8.2
# via librosa # via librosa
portalocker==2.10.1 portalocker==2.10.1
# via sacrebleu # via sacrebleu
pqdm==0.2.0
# via -r requirements-test.in
propcache==0.2.0 propcache==0.2.0
# via yarl # via yarl
protobuf==5.28.3 protobuf==5.28.3
...@@ -356,7 +403,9 @@ psutil==6.1.0 ...@@ -356,7 +403,9 @@ psutil==6.1.0
py==1.11.0 py==1.11.0
# via pytest-forked # via pytest-forked
pyarrow==18.0.0 pyarrow==18.0.0
# via datasets # via
# datasets
# genai-perf
pyasn1==0.6.1 pyasn1==0.6.1
# via rsa # via rsa
pybind11==2.13.6 pybind11==2.13.6
...@@ -369,6 +418,8 @@ pydantic[email]==2.9.2 ...@@ -369,6 +418,8 @@ pydantic[email]==2.9.2
# mistral-common # mistral-common
pydantic-core==2.23.4 pydantic-core==2.23.4
# via pydantic # via pydantic
pygments==2.18.0
# via rich
pyparsing==3.2.0 pyparsing==3.2.0
# via matplotlib # via matplotlib
pytablewriter==1.2.0 pytablewriter==1.2.0
...@@ -377,14 +428,18 @@ pytest==8.3.3 ...@@ -377,14 +428,18 @@ pytest==8.3.3
# via # via
# -r requirements-test.in # -r requirements-test.in
# buildkite-test-collector # buildkite-test-collector
# genai-perf
# pytest-asyncio # pytest-asyncio
# pytest-forked # pytest-forked
# pytest-mock
# pytest-rerunfailures # pytest-rerunfailures
# pytest-shard # pytest-shard
pytest-asyncio==0.24.0 pytest-asyncio==0.24.0
# via -r requirements-test.in # via -r requirements-test.in
pytest-forked==1.6.0 pytest-forked==1.6.0
# via -r requirements-test.in # via -r requirements-test.in
pytest-mock==3.14.0
# via genai-perf
pytest-rerunfailures==14.0 pytest-rerunfailures==14.0
# via -r requirements-test.in # via -r requirements-test.in
pytest-shard==0.1.2 pytest-shard==0.1.2
...@@ -395,6 +450,8 @@ python-dateutil==2.9.0.post0 ...@@ -395,6 +450,8 @@ python-dateutil==2.9.0.post0
# matplotlib # matplotlib
# pandas # pandas
# typepy # typepy
python-rapidjson==1.20
# via tritonclient
pytz==2024.2 pytz==2024.2
# via # via
# pandas # pandas
...@@ -405,11 +462,14 @@ pyyaml==6.0.2 ...@@ -405,11 +462,14 @@ pyyaml==6.0.2
# awscli # awscli
# datamodel-code-generator # datamodel-code-generator
# datasets # datasets
# genai-perf
# huggingface-hub # huggingface-hub
# peft # peft
# ray # ray
# responses
# timm # timm
# transformers # transformers
# vocos
ray[adag]==2.40.0 ray[adag]==2.40.0
# via -r requirements-test.in # via -r requirements-test.in
redis==5.2.0 redis==5.2.0
...@@ -434,8 +494,13 @@ requests==2.32.3 ...@@ -434,8 +494,13 @@ requests==2.32.3
# mistral-common # mistral-common
# pooch # pooch
# ray # ray
# responses
# tiktoken # tiktoken
# transformers # transformers
responses==0.25.3
# via genai-perf
rich==13.9.4
# via genai-perf
rouge-score==0.1.2 rouge-score==0.1.2
# via lm-eval # via lm-eval
rpds-py==0.20.1 rpds-py==0.20.1
...@@ -466,6 +531,8 @@ scipy==1.13.1 ...@@ -466,6 +531,8 @@ scipy==1.13.1
# librosa # librosa
# scikit-learn # scikit-learn
# sentence-transformers # sentence-transformers
# statsmodels
# vocos
sentence-transformers==3.2.1 sentence-transformers==3.2.1
# via -r requirements-test.in # via -r requirements-test.in
sentencepiece==0.2.0 sentencepiece==0.2.0
...@@ -486,8 +553,12 @@ soxr==0.5.0.post1 ...@@ -486,8 +553,12 @@ soxr==0.5.0.post1
# via librosa # via librosa
sqlitedict==2.1.0 sqlitedict==2.1.0
# via lm-eval # via lm-eval
statsmodels==0.14.4
# via genai-perf
sympy==1.13.1 sympy==1.13.1
# via torch # via
# einx
# torch
tabledata==1.3.3 tabledata==1.3.3
# via pytablewriter # via pytablewriter
tabulate==0.9.0 tabulate==0.9.0
...@@ -495,7 +566,9 @@ tabulate==0.9.0 ...@@ -495,7 +566,9 @@ tabulate==0.9.0
tcolorpy==0.1.6 tcolorpy==0.1.6
# via pytablewriter # via pytablewriter
tenacity==9.0.0 tenacity==9.0.0
# via lm-eval # via
# lm-eval
# plotly
tensorizer==2.9.0 tensorizer==2.9.0
# via -r requirements-test.in # via -r requirements-test.in
threadpoolctl==3.5.0 threadpoolctl==3.5.0
...@@ -513,12 +586,21 @@ torch==2.5.1 ...@@ -513,12 +586,21 @@ torch==2.5.1
# -r requirements-test.in # -r requirements-test.in
# accelerate # accelerate
# bitsandbytes # bitsandbytes
# encodec
# lm-eval # lm-eval
# peft # peft
# sentence-transformers # sentence-transformers
# tensorizer # tensorizer
# timm # timm
# torchaudio
# torchvision # torchvision
# vector-quantize-pytorch
# vocos
torchaudio==2.5.1
# via
# -r requirements-test.in
# encodec
# vocos
torchvision==0.20.1 torchvision==0.20.1
# via timm # via timm
tqdm==4.66.6 tqdm==4.66.6
...@@ -529,13 +611,16 @@ tqdm==4.66.6 ...@@ -529,13 +611,16 @@ tqdm==4.66.6
# lm-eval # lm-eval
# nltk # nltk
# peft # peft
# pqdm
# sentence-transformers # sentence-transformers
# tqdm-multiprocess # tqdm-multiprocess
# transformers # transformers
tqdm-multiprocess==0.0.11 tqdm-multiprocess==0.0.11
# via lm-eval # via lm-eval
transformers==4.47.0 transformers==4.48.2
# via # via
# -r requirements-test.in
# genai-perf
# lm-eval # lm-eval
# peft # peft
# sentence-transformers # sentence-transformers
...@@ -544,6 +629,10 @@ transformers-stream-generator==0.0.5 ...@@ -544,6 +629,10 @@ transformers-stream-generator==0.0.5
# via -r requirements-test.in # via -r requirements-test.in
triton==3.1.0 triton==3.1.0
# via torch # via torch
tritonclient==2.51.0
# via
# -r requirements-test.in
# genai-perf
typepy[datetime]==1.3.2 typepy[datetime]==1.3.2
# via # via
# dataproperty # dataproperty
...@@ -551,18 +640,26 @@ typepy[datetime]==1.3.2 ...@@ -551,18 +640,26 @@ typepy[datetime]==1.3.2
# tabledata # tabledata
typing-extensions==4.12.2 typing-extensions==4.12.2
# via # via
# bitsandbytes
# huggingface-hub # huggingface-hub
# librosa # librosa
# mistral-common # mistral-common
# pqdm
# pydantic # pydantic
# pydantic-core # pydantic-core
# torch # torch
tzdata==2024.2 tzdata==2024.2
# via pandas # via pandas
urllib3==1.26.20 urllib3==2.2.3
# via # via
# botocore # botocore
# requests # requests
# responses
# tritonclient
vector-quantize-pytorch==1.21.2
# via -r requirements-test.in
vocos==0.1.0
# via -r requirements-test.in
word2number==1.1 word2number==1.1
# via lm-eval # via lm-eval
xxhash==3.5.0 xxhash==3.5.0
......
...@@ -13,11 +13,11 @@ ray[default] ...@@ -13,11 +13,11 @@ ray[default]
# Install torch_xla # Install torch_xla
--pre --pre
--extra-index-url https://download.pytorch.org/whl/nightly/cpu --extra-index-url https://download.pytorch.org/whl/nightly/cpu
--find-links https://storage.googleapis.com/libtpu-wheels/index.html
--find-links https://storage.googleapis.com/libtpu-releases/index.html --find-links https://storage.googleapis.com/libtpu-releases/index.html
--find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jax_nightly_releases.html
--find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html --find-links https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
torch==2.6.0.dev20241126+cpu torch==2.6.0.dev20241216+cpu
torchvision==0.20.0.dev20241126+cpu torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp39-cp39-linux_x86_64.whl ; python_version == "3.9"
torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241126-cp310-cp310-linux_x86_64.whl torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp310-cp310-linux_x86_64.whl ; python_version == "3.10"
jaxlib==0.4.36.dev20241122 torch_xla[tpu, pallas] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.7.0.dev20250124-cp311-cp311-linux_x86_64.whl ; python_version == "3.11"
jax==0.4.36.dev20241122
import ctypes
import importlib.util import importlib.util
import logging import logging
import os import os
...@@ -13,7 +14,7 @@ from packaging.version import Version, parse ...@@ -13,7 +14,7 @@ from packaging.version import Version, parse
from setuptools import Extension, find_packages, setup from setuptools import Extension, find_packages, setup
from setuptools.command.build_ext import build_ext from setuptools.command.build_ext import build_ext
from setuptools_scm import get_version from setuptools_scm import get_version
from torch.utils.cpp_extension import CUDA_HOME from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
from typing import Optional, Union from typing import Optional, Union
import subprocess import subprocess
...@@ -40,9 +41,14 @@ envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py')) ...@@ -40,9 +41,14 @@ envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))
VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
if not sys.platform.startswith("linux"): if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
logger.warning( logger.warning(
"vLLM only supports Linux platform (including WSL). " "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
VLLM_TARGET_DEVICE = "cpu"
elif not (sys.platform.startswith("linux")
or sys.platform.startswith("darwin")):
logger.warning(
"vLLM only supports Linux platform (including WSL) and MacOS."
"Building on %s, " "Building on %s, "
"so vLLM may not be able to run correctly", sys.platform) "so vLLM may not be able to run correctly", sys.platform)
VLLM_TARGET_DEVICE = "empty" VLLM_TARGET_DEVICE = "empty"
...@@ -229,8 +235,11 @@ class cmake_build_ext(build_ext): ...@@ -229,8 +235,11 @@ class cmake_build_ext(build_ext):
# CMake appends the extension prefix to the install path, # CMake appends the extension prefix to the install path,
# and outdir already contains that prefix, so we need to remove it. # and outdir already contains that prefix, so we need to remove it.
# We assume only the final component of extension prefix is added by
# CMake, this is currently true for current extensions but may not
# always be the case.
prefix = outdir prefix = outdir
for i in range(ext.name.count('.')): if '.' in ext.name:
prefix = prefix.parent prefix = prefix.parent
# prefix here should actually be the same for all components # prefix here should actually be the same for all components
...@@ -258,7 +267,7 @@ class cmake_build_ext(build_ext): ...@@ -258,7 +267,7 @@ class cmake_build_ext(build_ext):
class repackage_wheel(build_ext): class repackage_wheel(build_ext):
"""Extracts libraries and other files from an existing wheel.""" """Extracts libraries and other files from an existing wheel."""
default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" default_wheel = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
def run(self) -> None: def run(self) -> None:
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION",
...@@ -299,9 +308,11 @@ class repackage_wheel(build_ext): ...@@ -299,9 +308,11 @@ class repackage_wheel(build_ext):
files_to_copy = [ files_to_copy = [
"vllm/_C.abi3.so", "vllm/_C.abi3.so",
"vllm/_moe_C.abi3.so", "vllm/_moe_C.abi3.so",
"vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so", "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/vllm_flash_attn/flash_attn_interface.py", "vllm/vllm_flash_attn/flash_attn_interface.py",
"vllm/vllm_flash_attn/__init__.py", "vllm/vllm_flash_attn/__init__.py",
"vllm/cumem_allocator.abi3.so",
# "vllm/_version.py", # not available in nightly wheels yet # "vllm/_version.py", # not available in nightly wheels yet
] ]
file_members = filter(lambda x: x.filename in files_to_copy, file_members = filter(lambda x: x.filename in files_to_copy,
...@@ -325,21 +336,26 @@ class repackage_wheel(build_ext): ...@@ -325,21 +336,26 @@ class repackage_wheel(build_ext):
def _is_hpu() -> bool: def _is_hpu() -> bool:
is_hpu_available = True # if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection
if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE:
return VLLM_TARGET_DEVICE == "hpu"
# if VLLM_TARGET_DEVICE was not set explicitly, check if hl-smi succeeds,
# and if it doesn't, check if habanalabs driver is loaded
is_hpu_available = False
try: try:
subprocess.run(["hl-smi"], capture_output=True, check=True) out = subprocess.run(["hl-smi"], capture_output=True, check=True)
is_hpu_available = out.returncode == 0
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError): except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
if not os.path.exists('/dev/accel/accel0') and not os.path.exists( if sys.platform.startswith("linux"):
'/dev/accel/accel_controlD0'):
# last resort...
try: try:
output = subprocess.check_output( output = subprocess.check_output(
'lsmod | grep habanalabs | wc -l', shell=True) 'lsmod | grep habanalabs | wc -l', shell=True)
is_hpu_available = int(output) > 0 is_hpu_available = int(output) > 0
except (ValueError, FileNotFoundError, PermissionError, except (ValueError, FileNotFoundError, PermissionError,
subprocess.CalledProcessError): subprocess.CalledProcessError):
is_hpu_available = False pass
return is_hpu_available or VLLM_TARGET_DEVICE == "hpu" return is_hpu_available
def _no_device() -> bool: def _no_device() -> bool:
...@@ -386,25 +402,31 @@ def _build_custom_ops() -> bool: ...@@ -386,25 +402,31 @@ def _build_custom_ops() -> bool:
return _is_cuda() or _is_hip() or _is_cpu() return _is_cuda() or _is_hip() or _is_cpu()
def get_hipcc_rocm_version(): def get_rocm_version():
# Run the hipcc --version command # Get the Rocm version from the ROCM_HOME/bin/librocm-core.so
result = subprocess.run(['hipcc', '--version'], # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
stdout=subprocess.PIPE, try:
stderr=subprocess.STDOUT, librocm_core_file = Path(ROCM_HOME) / "lib" / "librocm-core.so"
text=True) if not librocm_core_file.is_file():
return None
librocm_core = ctypes.CDLL(librocm_core_file)
VerErrors = ctypes.c_uint32
get_rocm_core_version = librocm_core.getROCmVersion
get_rocm_core_version.restype = VerErrors
get_rocm_core_version.argtypes = [
ctypes.POINTER(ctypes.c_uint32),
ctypes.POINTER(ctypes.c_uint32),
ctypes.POINTER(ctypes.c_uint32),
]
major = ctypes.c_uint32()
minor = ctypes.c_uint32()
patch = ctypes.c_uint32()
# Check if the command was executed successfully if (get_rocm_core_version(ctypes.byref(major), ctypes.byref(minor),
if result.returncode != 0: ctypes.byref(patch)) == 0):
print("Error running 'hipcc --version'") return f"{major.value}.{minor.value}.{patch.value}"
return None return None
except Exception:
# Extract the version using a regular expression
match = re.search(r'HIP version: (\S+)', result.stdout)
if match:
# Return the version string
return match.group(1)
else:
print("Could not find HIP version in the output")
return None return None
...@@ -482,9 +504,9 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -482,9 +504,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
new_version_content = f""" new_version_content = f"""
try: try:
__version__ = "0.6.6.post1" __version__ = "0.7.1"
__version_tuple__ = (0, 6, 6) __version_tuple__ = (0, 7, 1)
__hcu_version__ = f'0.6.6.post1+{version}' __hcu_version__ = f'0.7.1+{version}'
from vllm.version import __version__, __version_tuple__, __hcu_version__ from vllm.version import __version__, __version_tuple__, __hcu_version__
except Exception as e: except Exception as e:
...@@ -527,14 +549,10 @@ def get_gaudi_sw_version(): ...@@ -527,14 +549,10 @@ def get_gaudi_sw_version():
def get_vllm_version() -> str: def get_vllm_version() -> str:
# TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236 if not _is_hip():
try: version = get_version(
if not _is_hip(): write_to="vllm/_version.py", # TODO: move this to pyproject.toml
version = get_version( )
write_to="vllm/_version.py", # TODO: move this to pyproject.toml
)
except LookupError:
version = "0.0.0"
sep = "+" if "+" not in version else "." # dev versions might contain + sep = "+" if "+" not in version else "." # dev versions might contain +
...@@ -552,11 +570,10 @@ def get_vllm_version() -> str: ...@@ -552,11 +570,10 @@ def get_vllm_version() -> str:
if "sdist" not in sys.argv: if "sdist" not in sys.argv:
version += f"{sep}cu{cuda_version_str}" version += f"{sep}cu{cuda_version_str}"
elif _is_hip(): elif _is_hip():
# Get the HIP version # Get the Rocm Version
# hipcc_version = get_hipcc_rocm_version() # rocm_version = get_rocm_version() or torch.version.hip
# if hipcc_version != MAIN_CUDA_VERSION: # if rocm_version and rocm_version != MAIN_CUDA_VERSION:
# rocm_version_str = hipcc_version.replace(".", "")[:3] # version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
# version += f"{sep}rocm{rocm_version_str}"
version = get_version() version = get_version()
elif _is_neuron(): elif _is_neuron():
# Get the Neuron version # Get the Neuron version
...@@ -611,7 +628,7 @@ def get_requirements() -> List[str]: ...@@ -611,7 +628,7 @@ def get_requirements() -> List[str]:
return resolved_requirements return resolved_requirements
if _no_device(): if _no_device():
requirements = _read_requirements("requirements-cuda.txt") requirements = _read_requirements("requirements-cpu.txt")
elif _is_cuda(): elif _is_cuda():
requirements = _read_requirements("requirements-cuda.txt") requirements = _read_requirements("requirements-cuda.txt")
cuda_major, cuda_minor = torch.version.cuda.split(".") cuda_major, cuda_minor = torch.version.cuda.split(".")
...@@ -654,14 +671,24 @@ if _is_cuda() or _is_hip(): ...@@ -654,14 +671,24 @@ if _is_cuda() or _is_hip():
# ext_modules.append(CMakeExtension(name="vllm._rocm_C")) # ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
if _is_cuda(): if _is_cuda():
ext_modules.append( ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c")) if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.0"):
# FA3 requires CUDA 12.0 or later
ext_modules.append(
CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
ext_modules.append(CMakeExtension(name="vllm.cumem_allocator"))
if _build_custom_ops(): if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C")) ext_modules.append(CMakeExtension(name="vllm._C"))
package_data = { package_data = {
"vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json", "benchmarks/*.py","model_executor/layers/quantization/configs/w8a8/*.json"] "vllm": [
"py.typed",
"model_executor/layers/fused_moe/configs/*.json",
"model_executor/layers/quantization/utils/configs/*.json",
"benchmarks/*.py",
"model_executor/layers/quantization/configs/w8a8/*.json"
]
} }
if _no_device(): if _no_device():
......
...@@ -27,27 +27,32 @@ def _query_server_long(prompt: str) -> dict: ...@@ -27,27 +27,32 @@ def _query_server_long(prompt: str) -> dict:
@pytest.fixture @pytest.fixture
def api_server(tokenizer_pool_size: int, worker_use_ray: bool): def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
script_path = Path(__file__).parent.joinpath( script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute() "api_server_async_engine.py").absolute()
commands = [ commands = [
sys.executable, "-u", sys.executable,
str(script_path), "--model", os.path.join(models_path_prefix, "facebook/opt-125m"), "--host", "-u",
"127.0.0.1", "--tokenizer-pool-size", str(script_path),
str(tokenizer_pool_size) "--model",
os.path.join(models_path_prefix, "facebook/opt-125m"),
"--host",
"127.0.0.1",
"--tokenizer-pool-size",
str(tokenizer_pool_size),
"--distributed-executor-backend",
distributed_executor_backend,
] ]
if worker_use_ray:
commands.append("--worker-use-ray")
uvicorn_process = subprocess.Popen(commands) uvicorn_process = subprocess.Popen(commands)
yield yield
uvicorn_process.terminate() uvicorn_process.terminate()
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2]) @pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
@pytest.mark.parametrize("worker_use_ray", [False, True]) @pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
def test_api_server(api_server, tokenizer_pool_size: int, def test_api_server(api_server, tokenizer_pool_size: int,
worker_use_ray: bool): distributed_executor_backend: str):
""" """
Run the API server and test it. Run the API server and test it.
......
...@@ -46,7 +46,6 @@ def test_vllm_gc_ed(): ...@@ -46,7 +46,6 @@ def test_vllm_gc_ed():
assert weak_llm() is None assert weak_llm() is None
@pytest.mark.skip_v1
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
# @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) # @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
@pytest.mark.parametrize("backend", ["FLASH_ATTN"]) @pytest.mark.parametrize("backend", ["FLASH_ATTN"])
...@@ -65,9 +64,10 @@ def test_models( ...@@ -65,9 +64,10 @@ def test_models(
if backend == "FLASHINFER" and current_platform.is_rocm(): if backend == "FLASHINFER" and current_platform.is_rocm():
pytest.skip("Flashinfer does not support ROCm/HIP.") pytest.skip("Flashinfer does not support ROCm/HIP.")
if backend == "XFORMERS" and model == "google/gemma-2-2b-it": if backend in ("XFORMERS",
"FLASHINFER") and model == "google/gemma-2-2b-it":
pytest.skip( pytest.skip(
"XFORMERS does not support gemma2 with full context length.") f"{backend} does not support gemma2 with full context length.")
os.environ["VLLM_ATTENTION_BACKEND"] = backend os.environ["VLLM_ATTENTION_BACKEND"] = backend
......
import torch
from vllm import LLM, SamplingParams
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes
from ..utils import fork_new_process_for_each_test
@fork_new_process_for_each_test
def test_basic_cumem():
    """Check that pooled tensors keep their contents across sleep/wake-up.

    Tensors allocated from the default pool and from the ``CuMemAllocator``
    pool are summed before and after a ``sleep()``/``wake_up()`` cycle; the
    result must be identical both times, and ``sleep()`` must increase the
    free GPU memory reported by ``torch.cuda.mem_get_info()``.
    """
    # some tensors from default memory pool
    shape = (1024, 1024)
    x = torch.empty(shape, device='cuda')
    x.zero_()

    # some tensors from custom memory pool
    allocator = CuMemAllocator.get_instance()
    with allocator.use_memory_pool():
        # custom memory pool
        y = torch.empty(shape, device='cuda')
        y.zero_()
        y += 1
        z = torch.empty(shape, device='cuda')
        z.zero_()
        z += 2

    # they can be used together
    output = x + y + z
    assert torch.allclose(output, torch.ones_like(output) * 3)

    # sleeping must hand memory back to the device
    free_bytes = torch.cuda.mem_get_info()[0]
    allocator.sleep()
    free_bytes_after_sleep = torch.cuda.mem_get_info()[0]
    assert free_bytes_after_sleep > free_bytes
    allocator.wake_up()

    # they can be used together
    output = x + y + z
    assert torch.allclose(output, torch.ones_like(output) * 3)
@fork_new_process_for_each_test
def test_cumem_with_cudagraph():
    """Check that a captured CUDA graph still replays after sleep/wake-up.

    ``weight`` is allocated in the allocator's default pool and ``cache`` in
    the pool tagged ``"discard"``. A graph running ``model`` is captured, the
    allocator is put to sleep and woken up, and the graph is replayed on new
    input; both the cache contents and the graph output are then verified.
    """
    allocator = CuMemAllocator.get_instance()
    with allocator.use_memory_pool():
        weight = torch.eye(1024, device='cuda')
    with allocator.use_memory_pool(tag="discard"):
        cache = torch.empty(1024, 1024, device='cuda')

    def model(x):
        # weight is the identity, so `out` equals `x`; the cache keeps a copy
        out = x @ weight
        cache[:out.size(0)].copy_(out)
        return out + 1

    x = torch.empty(128, 1024, device='cuda')

    # warmup
    model(x)

    # capture cudagraph
    model_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(model_graph):
        y = model(x)

    free_bytes = torch.cuda.mem_get_info()[0]
    allocator.sleep()
    free_bytes_after_sleep = torch.cuda.mem_get_info()[0]
    assert free_bytes_after_sleep > free_bytes
    allocator.wake_up()

    # after waking up, the content in the weight tensor
    # should be restored, but the content in the cache tensor
    # should be discarded

    # this operation is also compatible with cudagraph

    x.random_()
    model_graph.replay()

    # cache content is as expected
    assert torch.allclose(x, cache[:x.size(0)])

    # output content is as expected
    assert torch.allclose(y, x + 1)
@fork_new_process_for_each_test
def test_end_to_end():
    """Check sleep mode on a real ``LLM``: memory is freed, output is stable.

    Generates once, puts the engine to sleep at level 1, asserts that the GPU
    memory still in use (relative to the baseline measured before the model
    was created) is below 2 GiB, wakes the engine up, and asserts that a
    second greedy generation produces the same text as the first.
    """
    free, total = torch.cuda.mem_get_info()
    used_bytes_baseline = total - free  # in case other process is running
    llm = LLM("meta-llama/Llama-3.2-1B", enable_sleep_mode=True)
    prompt = "How are you?"
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
    output = llm.generate(prompt, sampling_params)

    # the benefit of `llm.sleep(level=2)` is mainly CPU memory usage,
    # which is difficult to measure in the test. therefore, we only
    # test sleep level 1 here.
    llm.sleep(level=1)

    free_gpu_bytes_after_sleep, total = torch.cuda.mem_get_info()
    used_bytes = total - free_gpu_bytes_after_sleep - used_bytes_baseline
    # now the memory usage is mostly cudagraph memory pool,
    # and it should be less than the model weights (1B model, 2GiB weights)
    assert used_bytes < 2 * GiB_bytes

    llm.wake_up()
    output2 = llm.generate(prompt, sampling_params)

    # cmp output
    assert output[0].outputs[0].text == output2[0].outputs[0].text
...@@ -32,10 +32,10 @@ def check_settings(): ...@@ -32,10 +32,10 @@ def check_settings():
@pytest.fixture @pytest.fixture
def worker_use_ray() -> bool: def distributed_executor_backend() -> str:
# When SPMD worker is used, use ray_use_worker=True # When SPMD worker is used, use distributed_executor_backend="ray"
# to test delta input optimization works with preemption. # to test delta input optimization works with preemption.
return envs.VLLM_USE_RAY_SPMD_WORKER return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp"
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
...@@ -50,7 +50,7 @@ def test_chunked_prefill_recompute( ...@@ -50,7 +50,7 @@ def test_chunked_prefill_recompute(
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
chunked_prefill_token_size: int, chunked_prefill_token_size: int,
worker_use_ray: bool, distributed_executor_backend: str,
) -> None: ) -> None:
"""Ensure that chunked prefill works with preemption.""" """Ensure that chunked prefill works with preemption."""
max_num_seqs = min(chunked_prefill_token_size, 256) max_num_seqs = min(chunked_prefill_token_size, 256)
...@@ -69,7 +69,7 @@ def test_chunked_prefill_recompute( ...@@ -69,7 +69,7 @@ def test_chunked_prefill_recompute(
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
worker_use_ray=worker_use_ray, distributed_executor_backend=distributed_executor_backend,
disable_log_stats=False, disable_log_stats=False,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
...@@ -97,7 +97,7 @@ def test_preemption( ...@@ -97,7 +97,7 @@ def test_preemption(
model: str, model: str,
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
worker_use_ray: bool, distributed_executor_backend: str,
) -> None: ) -> None:
"""By default, recompute preemption is enabled""" """By default, recompute preemption is enabled"""
...@@ -108,7 +108,7 @@ def test_preemption( ...@@ -108,7 +108,7 @@ def test_preemption(
model, model,
dtype=dtype, dtype=dtype,
disable_log_stats=False, disable_log_stats=False,
worker_use_ray=worker_use_ray, distributed_executor_backend=distributed_executor_backend,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
...@@ -149,7 +149,7 @@ def test_preemption_infeasible( ...@@ -149,7 +149,7 @@ def test_preemption_infeasible(
model: str, model: str,
dtype: str, dtype: str,
max_tokens: int, max_tokens: int,
worker_use_ray: bool, distributed_executor_backend: str,
) -> None: ) -> None:
"""Verify infeasible preemption request will be ignored.""" """Verify infeasible preemption request will be ignored."""
BLOCK_SIZE = 16 BLOCK_SIZE = 16
...@@ -164,7 +164,7 @@ def test_preemption_infeasible( ...@@ -164,7 +164,7 @@ def test_preemption_infeasible(
# ignored instead of hanging forever. # ignored instead of hanging forever.
num_gpu_blocks_override=prefill_blocks + decode_blocks // 2, num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE), max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
worker_use_ray=worker_use_ray, distributed_executor_backend=distributed_executor_backend,
) as vllm_model: ) as vllm_model:
sampling_params = SamplingParams(max_tokens=max_tokens, sampling_params = SamplingParams(max_tokens=max_tokens,
ignore_eos=True) ignore_eos=True)
......
...@@ -7,7 +7,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are ...@@ -7,7 +7,7 @@ if the config `tractable_init` is set to True. Otherwise, the weights are
initialized randomly with a fixed seed. initialized randomly with a fixed seed.
""" """
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional, Tuple from typing import Any, List, Optional, Tuple
import torch import torch
from torch import nn from torch import nn
...@@ -54,6 +54,16 @@ class LlamaConfig: ...@@ -54,6 +54,16 @@ class LlamaConfig:
tractable_init: bool = False tractable_init: bool = False
random_seed: int = 0 random_seed: int = 0
def compute_hash(self) -> str:
    """Return an MD5 hex digest of this config's fields.

    Every instance attribute except ``random_seed`` contributes to the
    digest, so two configs that differ only in their seed hash identically.
    Attributes are ordered by name before hashing, which makes the digest
    independent of attribute insertion order.
    """
    import hashlib

    # (name, value) pairs, sorted so the string form is deterministic.
    factors = sorted((name, value) for name, value in self.__dict__.items()
                     if name != "random_seed")
    return hashlib.md5(str(factors).encode()).hexdigest()
def __post_init__(self): def __post_init__(self):
assert self.mlp_size >= self.hidden_size assert self.mlp_size >= self.hidden_size
...@@ -263,7 +273,8 @@ def run_model(llama_config, ...@@ -263,7 +273,8 @@ def run_model(llama_config,
compilation_config = CompilationConfig( compilation_config = CompilationConfig(
level=CompilationLevel.NO_COMPILATION, ) level=CompilationLevel.NO_COMPILATION, )
vllm_config = VllmConfig(compilation_config=compilation_config) vllm_config = VllmConfig(compilation_config=compilation_config,
additional_config=llama_config)
with set_current_vllm_config(vllm_config): with set_current_vllm_config(vllm_config):
model = LlamaModel(config=llama_config, model = LlamaModel(config=llama_config,
vllm_config=vllm_config, vllm_config=vllm_config,
......
...@@ -59,7 +59,7 @@ test_settings = [ ...@@ -59,7 +59,7 @@ test_settings = [
model_args=["--task", "embed"], model_args=["--task", "embed"],
pp_size=1, pp_size=1,
tp_size=1, tp_size=1,
attn_backend="FLASHINFER", attn_backend="FLASH_ATTN",
method="encode", method="encode",
fullgraph=True, fullgraph=True,
), ),
......
...@@ -30,13 +30,13 @@ from vllm.distributed import (cleanup_dist_env_and_memory, ...@@ -30,13 +30,13 @@ from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment, init_distributed_environment,
initialize_model_parallel) initialize_model_parallel)
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
to_enc_dec_tuple_list, zip_enc_dec_prompts) TokensPrompt, to_enc_dec_tuple_list,
zip_enc_dec_prompts)
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.outputs import RequestOutput from vllm.outputs import RequestOutput
from vllm.platforms import current_platform
from vllm.sampling_params import BeamSearchParams from vllm.sampling_params import BeamSearchParams
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless, from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, cuda_device_count_stateless,
identity) identity, is_list_of)
from .utils import models_path_prefix from .utils import models_path_prefix
logger = init_logger(__name__) logger = init_logger(__name__)
...@@ -44,6 +44,7 @@ logger = init_logger(__name__) ...@@ -44,6 +44,7 @@ logger = init_logger(__name__)
_TEST_DIR = os.path.dirname(__file__) _TEST_DIR = os.path.dirname(__file__)
_TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")] _TEST_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "example.txt")]
_LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")] _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
_SYS_MSG = os.path.join(_TEST_DIR, "system_messages", "sonnet3.5_nov2024.txt")
_M = TypeVar("_M") _M = TypeVar("_M")
_PromptMultiModalInput = Union[List[_M], List[List[_M]]] _PromptMultiModalInput = Union[List[_M], List[List[_M]]]
...@@ -181,6 +182,12 @@ def example_prompts() -> List[str]: ...@@ -181,6 +182,12 @@ def example_prompts() -> List[str]:
return prompts return prompts
@pytest.fixture
def example_system_message() -> str:
    """Return the full text of the sample system message at ``_SYS_MSG``."""
    # Explicit encoding: do not depend on the platform's locale default.
    with open(_SYS_MSG, encoding="utf-8") as f:
        return f.read()
class DecoderPromptType(Enum): class DecoderPromptType(Enum):
"""For encoder/decoder models only.""" """For encoder/decoder models only."""
CUSTOM = 1 CUSTOM = 1
...@@ -240,11 +247,13 @@ def video_assets() -> _VideoAssets: ...@@ -240,11 +247,13 @@ def video_assets() -> _VideoAssets:
_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict) _T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding, BatchFeature, dict)
_R = TypeVar("_R")
class HfRunner: class HfRunner:
def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: def wrap_device(self, x: _T, device: Optional[str] = None) -> _T:
from vllm.platforms import current_platform
if x is None or isinstance(x, (bool, )): if x is None or isinstance(x, (bool, )):
return x return x
...@@ -882,6 +891,12 @@ class VllmRunner: ...@@ -882,6 +891,12 @@ class VllmRunner:
beam_width: int, beam_width: int,
max_tokens: int, max_tokens: int,
) -> List[Tuple[List[List[int]], List[str]]]: ) -> List[Tuple[List[List[int]], List[str]]]:
if is_list_of(prompts, str, check="all"):
prompts = [TextPrompt(prompt=prompt) for prompt in prompts]
else:
prompts = [
TokensPrompt(prompt_token_ids=tokens) for tokens in prompts
]
outputs = self.model.beam_search( outputs = self.model.beam_search(
prompts, prompts,
BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
...@@ -919,6 +934,10 @@ class VllmRunner: ...@@ -919,6 +934,10 @@ class VllmRunner:
req_outputs = self.model.score(text_1, text_2) req_outputs = self.model.score(text_1, text_2)
return [req_output.outputs.score for req_output in req_outputs] return [req_output.outputs.score for req_output in req_outputs]
def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
    """Forward ``func`` to the engine's model executor.

    Args:
        func: Callable that receives an ``nn.Module`` and returns a value.

    Returns:
        Whatever ``model_executor.apply_model(func)`` returns (annotated as
        a list of ``func`` results).
    """
    executor = self.model.llm_engine.model_executor
    return executor.apply_model(func)
def __enter__(self): def __enter__(self):
return self return self
......
...@@ -796,6 +796,44 @@ class TestPrefixCachingBlockAllocator: ...@@ -796,6 +796,44 @@ class TestPrefixCachingBlockAllocator:
block_hashes=block_hashes_seq1) block_hashes=block_hashes_seq1)
assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks assert len(cached_blocks) == len(blocks_seq1) - num_evicted_blocks
# Test reset prefix cache
@staticmethod
@pytest.mark.parametrize("num_blocks", [10])
@pytest.mark.parametrize("block_size", [16])
def test_reset_prefix_cache(num_blocks: int, block_size: int):
    """This test case simulates the case of resetting the prefix cache.

    Two immutable chains are built from the same token ids. The reset must
    be refused while any block is still allocated, and must succeed (with
    the hit rate back at 0.0) once every block has been freed.
    """
    allocator = PrefixCachingBlockAllocator(num_blocks=num_blocks,
                                            block_size=block_size)
    token_ids = list(range(3 * block_size))

    first_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids,
        allocator=allocator,
    )
    second_chain = TestPrefixCachingBlockAllocator.create_immutable_chain(
        block_size=block_size,
        token_ids=token_ids,
        allocator=allocator,
    )

    # Free each block in the first chain.
    for block in first_chain:
        allocator.free(block)

    # Failed to reset prefix cache because some blocks are not freed yet.
    assert not allocator.reset_prefix_cache()
    assert allocator.get_prefix_cache_hit_rate() > 0.0

    # Free each block in the second chain.
    for block in second_chain:
        allocator.free(block)

    # Reset prefix cache.
    assert allocator.reset_prefix_cache()
    assert allocator.get_prefix_cache_hit_rate() == 0.0
@staticmethod @staticmethod
def create_immutable_chain( def create_immutable_chain(
block_size: int, block_size: int,
......
...@@ -50,7 +50,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port): ...@@ -50,7 +50,7 @@ def graph_allreduce(tp_size, pp_size, rank, distributed_init_port):
for sz in test_sizes: for sz in test_sizes:
for dtype in [torch.float32, torch.float16, torch.bfloat16]: for dtype in [torch.float32, torch.float16, torch.bfloat16]:
with graph_capture() as graph_capture_context: with graph_capture(device=device) as graph_capture_context:
# use integers so result matches NCCL exactly # use integers so result matches NCCL exactly
inp1 = torch.randint(1, inp1 = torch.randint(1,
16, (sz, ), 16, (sz, ),
......
...@@ -59,8 +59,7 @@ def worker_fn(): ...@@ -59,8 +59,7 @@ def worker_fn():
device=get_world_group().device) device=get_world_group().device)
tensor = torch.ones(16, 1024, 1024, tensor = torch.ones(16, 1024, 1024,
dtype=torch.float32).cuda(pynccl_comm.rank) dtype=torch.float32).cuda(pynccl_comm.rank)
with pynccl_comm.change_state(enable=True): tensor = pynccl_comm.all_reduce(tensor)
tensor = pynccl_comm.all_reduce(tensor)
torch.cuda.synchronize() torch.cuda.synchronize()
assert torch.all(tensor == pynccl_comm.world_size).cpu().item() assert torch.all(tensor == pynccl_comm.world_size).cpu().item()
...@@ -81,17 +80,16 @@ def multiple_allreduce_worker_fn(): ...@@ -81,17 +80,16 @@ def multiple_allreduce_worker_fn():
group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1] group = groups[0] if torch.distributed.get_rank() in [0, 1] else groups[1]
pynccl_comm = PyNcclCommunicator(group=group, device=device) pynccl_comm = PyNcclCommunicator(group=group, device=device)
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
with pynccl_comm.change_state(enable=True): # two groups can communicate independently
# two groups can communicate independently if torch.distributed.get_rank() in [0, 1]:
if torch.distributed.get_rank() in [0, 1]: tensor = pynccl_comm.all_reduce(tensor)
tensor = pynccl_comm.all_reduce(tensor) tensor = pynccl_comm.all_reduce(tensor)
tensor = pynccl_comm.all_reduce(tensor) torch.cuda.synchronize()
torch.cuda.synchronize() assert torch.all(tensor == 4).cpu().item()
assert torch.all(tensor == 4).cpu().item() else:
else: tensor = pynccl_comm.all_reduce(tensor)
tensor = pynccl_comm.all_reduce(tensor) torch.cuda.synchronize()
torch.cuda.synchronize() assert torch.all(tensor == 2).cpu().item()
assert torch.all(tensor == 2).cpu().item()
@pytest.mark.skipif(torch.cuda.device_count() < 4, @pytest.mark.skipif(torch.cuda.device_count() < 4,
...@@ -107,7 +105,7 @@ def multiple_allreduce_with_vllm_worker_fn(): ...@@ -107,7 +105,7 @@ def multiple_allreduce_with_vllm_worker_fn():
device = torch.device(f"cuda:{torch.distributed.get_rank()}") device = torch.device(f"cuda:{torch.distributed.get_rank()}")
ensure_model_parallel_initialized(2, 2) ensure_model_parallel_initialized(2, 2)
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device) tensor = torch.ones(16, 1024, 1024, dtype=torch.float32, device=device)
with graph_capture(): with graph_capture(device=device):
# two tp groups can communicate independently # two tp groups can communicate independently
if torch.distributed.get_rank() in [0, 1]: if torch.distributed.get_rank() in [0, 1]:
tensor = tensor_model_parallel_all_reduce(tensor) tensor = tensor_model_parallel_all_reduce(tensor)
...@@ -137,9 +135,7 @@ def worker_fn_with_cudagraph(): ...@@ -137,9 +135,7 @@ def worker_fn_with_cudagraph():
# run something in the default stream to initialize torch engine # run something in the default stream to initialize torch engine
a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}') a = torch.ones((4, 4), device=f'cuda:{pynccl_comm.rank}')
torch.cuda.synchronize() torch.cuda.synchronize()
with torch.cuda.graph( with torch.cuda.graph(graph):
graph, stream=pynccl_comm.stream), pynccl_comm.change_state(
enable=True):
a_out = pynccl_comm.all_reduce(a) a_out = pynccl_comm.all_reduce(a)
torch.cuda.synchronize() torch.cuda.synchronize()
graph.replay() graph.replay()
...@@ -168,8 +164,7 @@ def all_gather_worker_fn(): ...@@ -168,8 +164,7 @@ def all_gather_worker_fn():
for r in range(world_size) for r in range(world_size)
]).to(device) ]).to(device)
with pynccl_comm.change_state(enable=True): pynccl_comm.all_gather(result, tensor)
pynccl_comm.all_gather(result, tensor)
torch.cuda.synchronize() torch.cuda.synchronize()
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
...@@ -206,8 +201,7 @@ def reduce_scatter_worker_fn(): ...@@ -206,8 +201,7 @@ def reduce_scatter_worker_fn():
expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size] expected = sum(tensor[rank * scattered_size:(rank + 1) * scattered_size]
for tensor in all_tensors).to(device) for tensor in all_tensors).to(device)
with pynccl_comm.change_state(enable=True): pynccl_comm.reduce_scatter(result, tensor)
pynccl_comm.reduce_scatter(result, tensor)
torch.cuda.synchronize() torch.cuda.synchronize()
torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8) torch.testing.assert_close(result, expected, rtol=1e-5, atol=1e-8)
...@@ -234,15 +228,13 @@ def send_recv_worker_fn(): ...@@ -234,15 +228,13 @@ def send_recv_worker_fn():
else: else:
tensor = torch.empty(16, 1024, 1024, tensor = torch.empty(16, 1024, 1024,
dtype=torch.float32).cuda(pynccl_comm.rank) dtype=torch.float32).cuda(pynccl_comm.rank)
with pynccl_comm.change_state(enable=True):
if pynccl_comm.rank == 0: if pynccl_comm.rank == 0:
pynccl_comm.send(tensor, pynccl_comm.send(tensor,
dst=(pynccl_comm.rank + 1) % dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
pynccl_comm.world_size) else:
else: pynccl_comm.recv(tensor,
pynccl_comm.recv(tensor, src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
src=(pynccl_comm.rank - 1) %
pynccl_comm.world_size)
torch.cuda.synchronize() torch.cuda.synchronize()
assert torch.all(tensor == 1).cpu().item() assert torch.all(tensor == 1).cpu().item()
...@@ -273,15 +265,12 @@ def multiple_send_recv_worker_fn(): ...@@ -273,15 +265,12 @@ def multiple_send_recv_worker_fn():
1024, 1024,
dtype=torch.float32, dtype=torch.float32,
device=device) device=device)
with pynccl_comm.change_state(enable=True): if torch.distributed.get_rank() in [0, 1]:
if torch.distributed.get_rank() in [0, 1]: pynccl_comm.send(tensor,
pynccl_comm.send(tensor, dst=(pynccl_comm.rank + 1) % pynccl_comm.world_size)
dst=(pynccl_comm.rank + 1) % else:
pynccl_comm.world_size) pynccl_comm.recv(tensor,
else: src=(pynccl_comm.rank - 1) % pynccl_comm.world_size)
pynccl_comm.recv(tensor,
src=(pynccl_comm.rank - 1) %
pynccl_comm.world_size)
torch.cuda.synchronize() torch.cuda.synchronize()
if torch.distributed.get_rank() in [0, 2]: if torch.distributed.get_rank() in [0, 2]:
assert torch.all(tensor == 1).cpu().item() assert torch.all(tensor == 1).cpu().item()
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment