Merge tag 'v0.19.1' into v0.19.0

fc67613a · zhuwenwen · 31aec25b · b1388b1f · fc67613a · fc67613a
Commit fc67613a authored Apr 18, 2026 by zhuwenwen
20 changed files
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -4,7 +4,7 @@ absl-py==2.1.0
    # via
    #   rouge-score
    #   tensorboard
-accelerate==1.0.1
+accelerate==1.13.0
    # via peft
 aenum==3.1.16
    # via lightly
@@ -240,7 +240,6 @@ filelock==3.16.1
    #   huggingface-hub
    #   ray
    #   torch
-    #   transformers
    #   virtualenv
 fiona==1.10.1
    # via torchgeo
@@ -323,7 +322,7 @@ h5py==3.13.0
    # via terratorch
 harfile==0.3.0
    # via schemathesis
-hf-xet==1.1.7
+hf-xet==1.4.3
    # via huggingface-hub
 hiredis==3.0.0
    # via tensorizer
@@ -337,9 +336,10 @@ httpx==0.27.2
    # via
    #   -r requirements/test.in
    #   diffusers
+    #   huggingface-hub
    #   perceptron
    #   schemathesis
-huggingface-hub==0.36.2
+huggingface-hub==1.10.2
    # via
    #   accelerate
    #   datasets
@@ -740,7 +740,7 @@ pathvalidate==3.2.1
    # via pytablewriter
 patsy==1.0.1
    # via statsmodels
-peft==0.16.0
+peft==0.18.1
    # via -r requirements/test.in
 perceptron==0.1.4
    # via -r requirements/test.in
@@ -963,7 +963,7 @@ referencing==0.35.1
    # via
    #   jsonschema
    #   jsonschema-specifications
-regex==2024.9.11
+regex==2026.2.28
    # via
    #   diffusers
    #   nltk
@@ -982,7 +982,6 @@ requests==2.32.3
    #   google-api-core
    #   google-cloud-storage
    #   gpt-oss
-    #   huggingface-hub
    #   lightly
    #   lm-eval
    #   mistral-common
@@ -995,7 +994,6 @@ requests==2.32.3
    #   starlette-testclient
    #   tacoreader
    #   tiktoken
-    #   transformers
    #   wandb
 resampy==0.4.3
    # via -r requirements/test.in
@@ -1193,7 +1191,7 @@ timm==1.0.17
    #   segmentation-models-pytorch
    #   terratorch
    #   torchgeo
-tokenizers==0.22.0
+tokenizers==0.22.2
    # via
    #   -r requirements/test.in
    #   transformers
@@ -1269,7 +1267,7 @@ tqdm==4.67.3
    #   tacoreader
    #   terratorch
    #   transformers
-transformers==4.57.5
+transformers==5.5.3
    # via
    #   -r requirements/test.in
    #   genai-perf
@@ -1290,7 +1288,9 @@ typepy==1.3.2
 typer==0.15.2
    # via
    #   fastsafetensors
+    #   huggingface-hub
    #   perceptron
+    #   transformers
 types-python-dateutil==2.9.0.20241206
    # via arrow
 typeshed-client==2.8.2

--- a/requirements/test/xpu.txt
+++ b/requirements/test/xpu.txt
+# This file was autogenerated by uv via the following command:
+#    uv pip compile requirements/test/xpu.in -c requirements/xpu.txt -o requirements/test/xpu.txt --index-strategy unsafe-best-match --torch-backend xpu --python-platform x86_64-manylinux_2_39 --python-version 3.12
+absl-py==2.4.0
+    # via
+    #   -r requirements/test/xpu.in
+    #   rouge-score
+accelerate==1.13.0
+    # via -r requirements/test/xpu.in
+aiohappyeyeballs==2.6.1
+    # via aiohttp
+aiohttp==3.13.4
+    # via
+    #   -c requirements/common.txt
+    #   fsspec
+    #   gpt-oss
+    #   lm-eval
+aiosignal==1.4.0
+    # via aiohttp
+albumentations==1.4.6
+    # via -r requirements/test/xpu.in
+annotated-doc==0.0.4
+    # via
+    #   fastapi
+    #   typer
+annotated-types==0.7.0
+    # via pydantic
+anyio==4.13.0
+    # via
+    #   httpx
+    #   starlette
+arctic-inference==0.1.1
+    # via -r requirements/test/xpu.in
+attrs==26.1.0
+    # via
+    #   aiohttp
+    #   jsonlines
+    #   jsonschema
+    #   referencing
+audioread==3.0.1
+    # via
+    #   -r requirements/test/xpu.in
+    #   librosa
+blobfile==3.0.0
+    # via -r requirements/test/xpu.in
+bm25s==0.2.13
+    # via
+    #   -r requirements/test/xpu.in
+    #   mteb
+bounded-pool-executor==0.0.3
+    # via pqdm
+certifi==2026.2.25
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+cffi==2.0.0
+    # via soundfile
+chardet==5.2.0
+    # via mbstrdecoder
+charset-normalizer==3.4.6
+    # via requests
+chz==0.4.0
+    # via gpt-oss
+click==8.3.1
+    # via
+    #   jiwer
+    #   nltk
+    #   schemathesis
+    #   typer
+    #   uvicorn
+colorama==0.4.6
+    # via sacrebleu
+coverage==7.13.5
+    # via pytest-cov
+dataproperty==1.1.0
+    # via
+    #   pytablewriter
+    #   tabledata
+datasets==4.8.4
+    # via
+    #   evaluate
+    #   lm-eval
+    #   mteb
+decorator==5.2.1
+    # via librosa
+dill==0.4.1
+    # via
+    #   datasets
+    #   evaluate
+    #   lm-eval
+    #   multiprocess
+docker==7.1.0
+    # via gpt-oss
+docopt==0.6.2
+    # via num2words
+dpcpp-cpp-rt==2025.3.1
+    # via
+    #   onemkl-sycl-blas
+    #   onemkl-sycl-dft
+    #   onemkl-sycl-lapack
+    #   onemkl-sycl-rng
+    #   onemkl-sycl-sparse
+    #   torch
+evaluate==0.4.6
+    # via lm-eval
+fastapi==0.135.2
+    # via
+    #   -c requirements/common.txt
+    #   gpt-oss
+filelock==3.25.2
+    # via
+    #   -c requirements/common.txt
+    #   blobfile
+    #   datasets
+    #   huggingface-hub
+    #   modelscope
+    #   torch
+frozenlist==1.8.0
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2026.2.0
+    # via
+    #   datasets
+    #   evaluate
+    #   huggingface-hub
+    #   torch
+gpt-oss==0.0.8
+    # via -r requirements/test/xpu.in
+graphql-core==3.2.8
+    # via hypothesis-graphql
+h11==0.16.0
+    # via
+    #   httpcore
+    #   uvicorn
+harfile==0.4.0
+    # via schemathesis
+hf-xet==1.4.3
+    # via huggingface-hub
+html2text==2025.4.15
+    # via gpt-oss
+httpcore==1.0.9
+    # via httpx
+httpx==0.28.1
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   schemathesis
+huggingface-hub==1.10.2
+    # via
+    #   accelerate
+    #   datasets
+    #   evaluate
+    #   sentence-transformers
+    #   timm
+    #   tokenizers
+    #   transformers
+hypothesis==6.151.10
+    # via
+    #   hypothesis-graphql
+    #   hypothesis-jsonschema
+    #   schemathesis
+hypothesis-graphql==0.12.0
+    # via schemathesis
+hypothesis-jsonschema==0.23.1
+    # via schemathesis
+idna==3.11
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+    #   yarl
+imageio==2.37.3
+    # via scikit-image
+impi-rt==2021.17.0
+    # via
+    #   oneccl
+    #   torch
+iniconfig==2.3.0
+    # via pytest
+intel-cmplr-lib-rt==2025.3.1
+    # via
+    #   intel-sycl-rt
+    #   torch
+intel-cmplr-lib-ur==2025.3.1
+    # via
+    #   intel-openmp
+    #   intel-sycl-rt
+    #   torch
+intel-cmplr-lic-rt==2025.3.1
+    # via
+    #   intel-opencl-rt
+    #   intel-sycl-rt
+    #   torch
+intel-opencl-rt==2025.3.1
+    # via
+    #   dpcpp-cpp-rt
+    #   onemkl-sycl-blas
+    #   onemkl-sycl-dft
+    #   onemkl-sycl-lapack
+    #   onemkl-sycl-rng
+    #   onemkl-sycl-sparse
+    #   torch
+intel-openmp==2025.3.1
+    # via
+    #   dpcpp-cpp-rt
+    #   mkl
+    #   torch
+intel-pti==0.15.0
+    # via torch
+intel-sycl-rt==2025.3.1
+    # via
+    #   dpcpp-cpp-rt
+    #   oneccl
+    #   torch
+jinja2==3.1.6
+    # via
+    #   -c requirements/xpu.txt
+    #   lm-eval
+    #   torch
+jiwer==4.0.0
+    # via -r requirements/test/xpu.in
+joblib==1.5.3
+    # via
+    #   librosa
+    #   nltk
+    #   scikit-learn
+jsonlines==4.0.0
+    # via lm-eval
+jsonschema==4.26.0
+    # via
+    #   hypothesis-jsonschema
+    #   mistral-common
+    #   schemathesis
+jsonschema-rs==0.45.0
+    # via schemathesis
+jsonschema-specifications==2025.9.1
+    # via jsonschema
+junit-xml==1.9
+    # via schemathesis
+lazy-loader==0.5
+    # via
+    #   librosa
+    #   scikit-image
+librosa==0.10.2.post1
+    # via -r requirements/test/xpu.in
+llvmlite==0.44.0
+    # via numba
+lm-eval==0.4.11
+    # via -r requirements/test/xpu.in
+lxml==6.0.2
+    # via
+    #   blobfile
+    #   gpt-oss
+    #   sacrebleu
+markdown-it-py==4.0.0
+    # via rich
+markupsafe==3.0.3
+    # via
+    #   jinja2
+    #   werkzeug
+mbstrdecoder==1.1.4
+    # via
+    #   dataproperty
+    #   pytablewriter
+    #   typepy
+mdurl==0.1.2
+    # via markdown-it-py
+mistral-common==1.11.0
+    # via
+    #   -c requirements/common.txt
+    #   -r requirements/test/xpu.in
+mkl==2025.3.0
+    # via
+    #   onemkl-sycl-blas
+    #   onemkl-sycl-dft
+    #   onemkl-sycl-lapack
+    #   onemkl-sycl-rng
+    #   onemkl-sycl-sparse
+    #   torch
+modelscope==1.35.3
+    # via -r requirements/test/xpu.in
+more-itertools==10.8.0
+    # via lm-eval
+mpmath==1.3.0
+    # via sympy
+msgpack==1.1.2
+    # via librosa
+mteb==2.12.7
+    # via -r requirements/test/xpu.in
+multidict==6.7.1
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.19
+    # via
+    #   datasets
+    #   evaluate
+networkx==3.6.1
+    # via
+    #   scikit-image
+    #   torch
+nltk==3.9.4
+    # via rouge-score
+num2words==0.5.14
+    # via -r requirements/test/xpu.in
+numba==0.61.2
+    # via
+    #   -c requirements/xpu.txt
+    #   librosa
+numpy==2.2.6
+    # via
+    #   accelerate
+    #   albumentations
+    #   bm25s
+    #   datasets
+    #   evaluate
+    #   imageio
+    #   librosa
+    #   lm-eval
+    #   mistral-common
+    #   mteb
+    #   numba
+    #   opencv-python-headless
+    #   pandas
+    #   pytrec-eval-terrier
+    #   rouge-score
+    #   sacrebleu
+    #   scikit-image
+    #   scikit-learn
+    #   scipy
+    #   sentence-transformers
+    #   soundfile
+    #   soxr
+    #   tifffile
+    #   torchvision
+    #   transformers
+oneccl==2021.17.1
+    # via
+    #   oneccl-devel
+    #   torch
+oneccl-devel==2021.17.1
+    # via torch
+onemkl-license==2025.3.0
+    # via
+    #   mkl
+    #   torch
+onemkl-sycl-blas==2025.3.0
+    # via
+    #   onemkl-sycl-lapack
+    #   onemkl-sycl-sparse
+    #   torch
+onemkl-sycl-dft==2025.3.0
+    # via torch
+onemkl-sycl-lapack==2025.3.0
+    # via torch
+onemkl-sycl-rng==2025.3.0
+    # via torch
+onemkl-sycl-sparse==2025.3.0
+    # via torch
+openai-harmony==0.0.8
+    # via
+    #   -c requirements/common.txt
+    #   gpt-oss
+opencv-python-headless==4.13.0.92
+    # via
+    #   -c requirements/common.txt
+    #   albumentations
+    #   mistral-common
+packaging==26.0
+    # via
+    #   -c requirements/xpu.txt
+    #   accelerate
+    #   datasets
+    #   evaluate
+    #   huggingface-hub
+    #   lazy-loader
+    #   modelscope
+    #   pooch
+    #   pytest
+    #   pytest-rerunfailures
+    #   scikit-image
+    #   transformers
+    #   typepy
+pandas==3.0.1
+    # via
+    #   datasets
+    #   evaluate
+pathvalidate==3.3.1
+    # via pytablewriter
+pillow==12.1.1
+    # via
+    #   imageio
+    #   mistral-common
+    #   scikit-image
+    #   torchvision
+platformdirs==4.9.4
+    # via pooch
+pluggy==1.6.0
+    # via
+    #   pytest
+    #   pytest-cov
+polars==1.39.3
+    # via mteb
+polars-runtime-32==1.39.3
+    # via polars
+pooch==1.8.2
+    # via
+    #   -r requirements/test/xpu.in
+    #   librosa
+portalocker==3.2.0
+    # via sacrebleu
+pqdm==0.2.0
+    # via -r requirements/test/xpu.in
+propcache==0.4.1
+    # via
+    #   aiohttp
+    #   yarl
+psutil==7.2.2
+    # via accelerate
+py==1.11.0
+    # via pytest-forked
+pyarrow==23.0.1
+    # via datasets
+pycountry==26.2.16
+    # via pydantic-extra-types
+pycparser==3.0
+    # via cffi
+pycryptodomex==3.23.0
+    # via blobfile
+pydantic==2.12.5
+    # via
+    #   -c requirements/common.txt
+    #   albumentations
+    #   fastapi
+    #   gpt-oss
+    #   mistral-common
+    #   mteb
+    #   openai-harmony
+    #   pydantic-extra-types
+pydantic-core==2.41.5
+    # via pydantic
+pydantic-extra-types==2.11.1
+    # via mistral-common
+pyelftools==0.32
+    # via triton-xpu
+pygments==2.20.0
+    # via
+    #   pytest
+    #   rich
+pyrate-limiter==4.1.0
+    # via schemathesis
+pystemmer==3.0.0
+    # via
+    #   -r requirements/test/xpu.in
+    #   mteb
+pytablewriter==1.2.1
+    # via lm-eval
+pytest==9.0.2
+    # via
+    #   -r requirements/test/xpu.in
+    #   pytest-asyncio
+    #   pytest-cov
+    #   pytest-forked
+    #   pytest-rerunfailures
+    #   pytest-shard
+    #   pytest-timeout
+    #   schemathesis
+pytest-asyncio==1.3.0
+    # via -r requirements/test/xpu.in
+pytest-cov==6.3.0
+    # via -r requirements/test/xpu.in
+pytest-forked==1.6.0
+    # via -r requirements/test/xpu.in
+pytest-rerunfailures==14.0
+    # via -r requirements/test/xpu.in
+pytest-shard==0.1.2
+    # via -r requirements/test/xpu.in
+pytest-timeout==2.3.1
+    # via -r requirements/test/xpu.in
+python-dateutil==2.9.0.post0
+    # via
+    #   pandas
+    #   typepy
+pytrec-eval-terrier==0.5.10
+    # via mteb
+pytz==2026.1.post1
+    # via typepy
+pyyaml==6.0.3
+    # via
+    #   accelerate
+    #   albumentations
+    #   datasets
+    #   huggingface-hub
+    #   schemathesis
+    #   timm
+    #   transformers
+rapidfuzz==3.12.1
+    # via
+    #   -r requirements/test/xpu.in
+    #   jiwer
+referencing==0.37.0
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+regex==2026.3.32
+    # via
+    #   nltk
+    #   sacrebleu
+    #   tiktoken
+    #   transformers
+requests==2.33.1
+    # via
+    #   -c requirements/common.txt
+    #   datasets
+    #   docker
+    #   evaluate
+    #   gpt-oss
+    #   lm-eval
+    #   mistral-common
+    #   modelscope
+    #   mteb
+    #   pooch
+    #   schemathesis
+    #   starlette-testclient
+    #   tiktoken
+rich==14.3.3
+    # via
+    #   mteb
+    #   schemathesis
+    #   typer
+rouge-score==0.1.2
+    # via lm-eval
+rpds-py==0.30.0
+    # via
+    #   jsonschema
+    #   referencing
+sacrebleu==2.6.0
+    # via lm-eval
+safetensors==0.7.0
+    # via
+    #   accelerate
+    #   timm
+    #   transformers
+schemathesis==4.14.2
+    # via -r requirements/test/xpu.in
+scikit-image==0.26.0
+    # via albumentations
+scikit-learn==1.8.0
+    # via
+    #   albumentations
+    #   librosa
+    #   lm-eval
+    #   mteb
+    #   sentence-transformers
+scipy==1.17.1
+    # via
+    #   albumentations
+    #   bm25s
+    #   librosa
+    #   mteb
+    #   pytrec-eval-terrier
+    #   scikit-image
+    #   scikit-learn
+    #   sentence-transformers
+sentence-transformers==5.3.0
+    # via mteb
+setuptools==80.10.2
+    # via
+    #   -c requirements/common.txt
+    #   -c requirements/xpu.txt
+    #   modelscope
+    #   pytablewriter
+    #   torch
+shellingham==1.5.4
+    # via typer
+six==1.17.0
+    # via
+    #   -c requirements/common.txt
+    #   junit-xml
+    #   python-dateutil
+    #   rouge-score
+sortedcontainers==2.4.0
+    # via hypothesis
+soundfile==0.13.1
+    # via
+    #   -r requirements/test/xpu.in
+    #   librosa
+    #   mistral-common
+soxr==0.5.0.post1
+    # via
+    #   -r requirements/test/xpu.in
+    #   librosa
+    #   mistral-common
+sqlitedict==2.1.0
+    # via lm-eval
+starlette==1.0.0
+    # via
+    #   fastapi
+    #   starlette-testclient
+starlette-testclient==0.4.1
+    # via schemathesis
+structlog==25.5.0
+    # via gpt-oss
+sympy==1.14.0
+    # via torch
+tabledata==1.3.4
+    # via pytablewriter
+tabulate==0.10.0
+    # via sacrebleu
+tbb==2022.3.0
+    # via
+    #   intel-opencl-rt
+    #   mkl
+    #   torch
+tblib==3.1.0
+    # via -r requirements/test/xpu.in
+tcmlib==1.4.1
+    # via
+    #   tbb
+    #   torch
+    #   umf
+tcolorpy==0.1.7
+    # via pytablewriter
+tenacity==9.1.4
+    # via
+    #   gpt-oss
+    #   lm-eval
+    #   schemathesis
+termcolor==3.3.0
+    # via gpt-oss
+threadpoolctl==3.6.0
+    # via scikit-learn
+tifffile==2026.3.3
+    # via scikit-image
+tiktoken==0.12.0
+    # via
+    #   -c requirements/common.txt
+    #   gpt-oss
+    #   lm-eval
+    #   mistral-common
+timm==1.0.17
+    # via -r requirements/test/xpu.in
+tokenizers==0.22.2
+    # via
+    #   -c requirements/common.txt
+    #   transformers
+torch==2.10.0+xpu
+    # via
+    #   -c requirements/xpu.txt
+    #   accelerate
+    #   mteb
+    #   sentence-transformers
+    #   timm
+    #   torchvision
+torchvision==0.25.0+xpu
+    # via timm
+tqdm==4.67.3
+    # via
+    #   datasets
+    #   evaluate
+    #   huggingface-hub
+    #   lm-eval
+    #   modelscope
+    #   mteb
+    #   nltk
+    #   pqdm
+    #   sentence-transformers
+    #   transformers
+transformers==5.5.3
+    # via
+    #   -c requirements/common.txt
+    #   sentence-transformers
+triton-xpu==3.6.0
+    # via torch
+typepy==1.3.4
+    # via
+    #   dataproperty
+    #   pytablewriter
+    #   tabledata
+typer==0.24.1
+    # via
+    #   huggingface-hub
+    #   transformers
+typing-extensions==4.15.0
+    # via
+    #   -c requirements/common.txt
+    #   aiosignal
+    #   albumentations
+    #   anyio
+    #   chz
+    #   fastapi
+    #   huggingface-hub
+    #   librosa
+    #   lm-eval
+    #   mistral-common
+    #   mteb
+    #   pqdm
+    #   pydantic
+    #   pydantic-core
+    #   pydantic-extra-types
+    #   pytest-asyncio
+    #   referencing
+    #   schemathesis
+    #   sentence-transformers
+    #   starlette
+    #   torch
+    #   typing-inspection
+typing-inspection==0.4.2
+    # via
+    #   fastapi
+    #   pydantic
+umf==1.0.2
+    # via
+    #   intel-cmplr-lib-ur
+    #   torch
+urllib3==2.6.3
+    # via
+    #   blobfile
+    #   docker
+    #   modelscope
+    #   requests
+uvicorn==0.42.0
+    # via gpt-oss
+werkzeug==3.1.7
+    # via schemathesis
+word2number==1.1
+    # via lm-eval
+xxhash==3.6.0
+    # via
+    #   datasets
+    #   evaluate
+yarl==1.23.0
+    # via aiohttp
+zstandard==0.25.0
+    # via lm-eval
--- a/requirements/xpu-test.in
+++ b/requirements/xpu-test.in
@@ -9,6 +9,8 @@ pytest-shard
 # --- Core Tools & Bindings ---
 absl-py
 arctic-inference
+lm_eval[api]
+modelscope

 # --- Audio Processing ---
 librosa

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -409,6 +409,15 @@ class HfRunner:
            model_name,
            trust_remote_code=trust_remote_code,
        )
+        # HF runner should use the HF config so that it's consistent with the HF model
+        if self.config.__module__.startswith("vllm.transformers_utils.configs"):
+            from transformers.models.auto.configuration_auto import CONFIG_MAPPING
+
+            del CONFIG_MAPPING._extra_content[self.config.model_type]
+            self.config = AutoConfig.from_pretrained(
+                model_name,
+                trust_remote_code=trust_remote_code,
+            )
        self.device = self.get_default_device()
        self.dtype = dtype = _get_and_verify_dtype(
            self.model_name,

--- a/tests/kernels/core/test_minimax_reduce_rms.py
+++ b/tests/kernels/core/test_minimax_reduce_rms.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for MiniMax QK RMS-norm: NCCL reference vs Lamport fused kernel."""
+
+import pytest
+import torch
+import torch.nn as nn
+from torch.multiprocessing import spawn
+
+from tests.kernels.utils import opcheck
+from tests.utils import ensure_current_vllm_config, init_test_distributed_environment
+from vllm.distributed import cleanup_dist_env_and_memory
+from vllm.model_executor.layers.mamba.linear_attn import MiniMaxText01RMSNormTP
+from vllm.platforms import current_platform
+from vllm.utils.network_utils import get_open_port
+from vllm.utils.torch_utils import set_random_seed
+
+
+@ensure_current_vllm_config()
+def _worker_forward_qk(
+    local_rank,
+    world_size,
+    port,
+    num_tokens,
+    hidden_q_full,
+    hidden_k_full,
+    dtype,
+    seed,
+    eps,
+):
+    """Per-rank worker: compare NCCL allreduce path vs Lamport fused kernel.
+
+    Runs once in every spawned process. It computes a reference result with
+    ``MiniMaxText01RMSNormTP.forward_qk`` and checks that the fused custom op
+    ``torch.ops._C.minimax_allreduce_rms_qk`` produces matching Q and K
+    outputs for the same input.
+
+    Args:
+        local_rank: Index of this process; also used as the CUDA device index
+            and as the rank passed to the distributed init helper.
+        world_size: Number of participating processes; the full hidden sizes
+            are divided by it to get the per-rank widths.
+        port: Port forwarded to ``init_test_distributed_environment``.
+        num_tokens: Number of rows in the random ``qkv`` input.
+        hidden_q_full: Un-sharded Q hidden size.
+        hidden_k_full: Un-sharded K hidden size (also used as the V width).
+        dtype: Torch dtype of the random weights and activations.
+        seed: Base seed for the weights; activations use a per-rank offset.
+        eps: Epsilon forwarded to the norm layers and to the fused op.
+
+    Raises:
+        AssertionError: From ``torch.testing.assert_close`` when the fused
+            Q or K output differs from the reference beyond the tolerance.
+    """
+
+    # NOTE(review): when the fused op is not present in this build the worker
+    # returns without asserting anything, so the test passes vacuously.
+    if not hasattr(torch.ops._C, "minimax_allreduce_rms_qk"):
+        cleanup_dist_env_and_memory()
+        return
+    device = torch.device(f"cuda:{local_rank}")
+    torch.accelerator.set_device_index(device)
+    # presumably (tp_size=world_size, pp_size=1, rank=local_rank, port) --
+    # TODO confirm against tests.utils.init_test_distributed_environment.
+    init_test_distributed_environment(
+        world_size, 1, local_rank, port, local_rank=local_rank
+    )
+
+    # Per-rank slice widths of the Q and K hidden dimensions.
+    hq = hidden_q_full // world_size
+    hk = hidden_k_full // world_size
+
+    q_norm = MiniMaxText01RMSNormTP(hidden_q_full, eps=eps).cuda()
+    k_norm = MiniMaxText01RMSNormTP(hidden_k_full, eps=eps).cuda()
+
+    # Every rank seeds with the same value before drawing the full-size
+    # weights, then keeps only its own contiguous slice as the layer weight.
+    set_random_seed(seed)
+    qw = torch.randn(hidden_q_full, dtype=dtype, device="cuda")
+    kw = torch.randn(hidden_k_full, dtype=dtype, device="cuda")
+    q_norm.weight = nn.Parameter(qw[local_rank * hq : (local_rank + 1) * hq])
+    k_norm.weight = nn.Parameter(kw[local_rank * hk : (local_rank + 1) * hk])
+
+    # Re-seed with a rank-dependent value so each rank draws different
+    # activations; columns are laid out as [q (hq) | k (hk) | v (hk)].
+    torch.manual_seed(seed + 1000 + local_rank)
+    qkv = torch.randn(num_tokens, hq + hk + hk, dtype=dtype, device="cuda")
+
+    # Reference result, computed on a clone so `qkv` itself is not touched.
+    # `v_ref` is not used afterwards.
+    q_ref, k_ref, v_ref = qkv.clone().split([hq, hk, hk], dim=-1)
+    ref_q, ref_k = MiniMaxText01RMSNormTP.forward_qk(q_norm, k_norm, q_ref, k_ref)
+
+    # Set up Lamport workspace.
+    from vllm.distributed.parallel_state import get_tp_group
+    from vllm.model_executor.layers.mamba.lamport_workspace import (
+        get_allreduce_workspace,
+    )
+
+    workspace = get_allreduce_workspace(
+        rank=local_rank,
+        world_size=world_size,
+        max_tokens=num_tokens,
+        process_group=get_tp_group().cpu_group,
+    )
+
+    # Run the op through the shared `opcheck` helper first (presumably a
+    # registration/schema check -- see tests.kernels.utils), then call it
+    # directly. Each invocation receives its own fresh clone of `qkv`.
+    opcheck(
+        torch.ops._C.minimax_allreduce_rms_qk,
+        (
+            qkv.clone(),
+            q_norm.weight,
+            k_norm.weight,
+            workspace,
+            hq,
+            hk,
+            local_rank,
+            world_size,
+            eps,
+        ),
+    )
+    fused_q, fused_k = torch.ops._C.minimax_allreduce_rms_qk(
+        qkv.clone(),
+        q_norm.weight,
+        k_norm.weight,
+        workspace,
+        hq,
+        hk,
+        local_rank,
+        world_size,
+        eps,
+    )
+    # `fused_v` is split out of the original input but never compared.
+    _, _, fused_v = qkv.split([hq, hk, hk], dim=-1)
+    torch.accelerator.synchronize()
+
+    # Only Q and K are checked, with the same absolute/relative tolerance.
+    torch.testing.assert_close(
+        fused_q,
+        ref_q,
+        atol=3e-2,
+        rtol=3e-2,
+    )
+    torch.testing.assert_close(fused_k, ref_k, atol=3e-2, rtol=3e-2)
+
+    cleanup_dist_env_and_memory()
+
+
+@pytest.mark.skipif(
+    not current_platform.is_cuda(),
+    reason="CUDA required",
+)
+@pytest.mark.parametrize("world_size", [2, 4, 8])
+@pytest.mark.parametrize("num_tokens", [1, 128, 333])
+@pytest.mark.parametrize(
+    "hidden_dims",
+    [(6144, 1024)],
+)
+@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+@pytest.mark.parametrize("eps", [1e-6])
+@pytest.mark.parametrize("seed", [42])
+def test_minimax_reduce_rms_qk(
+    world_size,
+    num_tokens,
+    hidden_dims,
+    dtype,
+    eps,
+    seed,
+):
+    """Launch one `_worker_forward_qk` process per rank and wait for them.
+
+    The test is skipped on non-CUDA platforms and when fewer than
+    ``world_size`` devices are available. ``hidden_dims`` is a
+    ``(hidden_q_full, hidden_k_full)`` pair. The actual comparisons run
+    inside the worker processes.
+    """
+    num_gpus = current_platform.device_count()
+    if num_gpus < world_size:
+        pytest.skip(f"Need >= {world_size} GPUs, have {num_gpus}")
+    hidden_q_full, hidden_k_full = hidden_dims
+    # The port is passed to the workers as a string.
+    port = str(get_open_port())
+    # `spawn` calls the worker as fn(process_index, *args), so the worker's
+    # leading `local_rank` parameter is intentionally absent from `args`.
+    spawn(
+        _worker_forward_qk,
+        args=(
+            world_size,
+            port,
+            num_tokens,
+            hidden_q_full,
+            hidden_k_full,
+            dtype,
+            seed,
+            eps,
+        ),
+        nprocs=world_size,
+        join=True,
+    )
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -3,6 +3,7 @@

 import tempfile
 from collections import OrderedDict
+from importlib import reload
 from unittest.mock import MagicMock

 import pytest
@@ -43,6 +44,18 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
        cleanup_dist_env_and_memory(shutdown_ray=True)


+@pytest.fixture
+def maybe_enable_lora_dual_stream(monkeypatch: pytest.MonkeyPatch):
+    if current_platform.is_cuda():
+        monkeypatch.setenv("VLLM_LORA_ENABLE_DUAL_STREAM", "1")
+        import vllm.lora.layers.base_linear
+
+        if not hasattr(vllm.lora.layers.base_linear, "lora_linear_async"):
+            # Reload the module to ensure the environment variable takes effect.
+            reload(vllm.lora.layers.base_linear)
+    yield
+
+
 @pytest.fixture
 def dist_init():
    from tests.utils import ensure_current_vllm_config

--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -5,7 +5,9 @@ import pytest

 from vllm.lora.lora_model import LoRAModel
 from vllm.lora.peft_helper import PEFTHelper
+from vllm.lora.utils import parse_fine_tuned_lora_name
 from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
+from vllm.model_executor.models.gemma4 import Gemma4ForCausalLM
 from vllm.model_executor.models.utils import WeightsMapper

 lora_lst = ["baichuan7B", "baichuan7B-zero", "baichuan7B-zero-regex", "chatglm3-6b"]
@@ -128,3 +130,24 @@ def test_lora_weights_mapping(baichuan_lora_files):
    for name in lora_model.loras:
        assert name.startswith(hf_to_vllm_mapper.orig_to_new_prefix["model."])
        assert ".baichuan_layers." in name
+
+
+def test_gemma4_lora_weights_mapping():
+    mapper = Gemma4ForCausalLM.hf_to_vllm_mapper
+    name = "base_model.model.model.language_model.layers.9.mlp.down_proj.lora_A.weight"
+    assert parse_fine_tuned_lora_name(name, mapper) == (
+        "model.layers.9.mlp.down_proj",
+        True,
+    )
+
+
+def test_gemma4_moe_lora_weights_mapping():
+    mapper = Gemma4ForCausalLM.hf_to_vllm_mapper
+    name = (
+        "base_model.model.model.language_model.layers.9.moe.experts."
+        "gate_up_proj.lora_B.weight"
+    )
+    assert parse_fine_tuned_lora_name(name, mapper) == (
+        "model.layers.9.moe.gate_up_proj",
+        False,
+    )
--- a/tests/lora/test_minicpmv_tp.py
+++ b/tests/lora/test_minicpmv_tp.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

+from importlib.metadata import version
+
 import pytest
+from packaging.version import Version

 import vllm
 from vllm.assets.image import ImageAsset
@@ -10,6 +13,14 @@ from vllm.platforms import current_platform

 from ..utils import multi_gpu_test

+pytestmark = pytest.mark.skipif(
+    Version("5.0") <= Version(version("transformers")),
+    reason=(
+        "MiniCPMV custom processor uses tokenizer.im_start_id which is not "
+        "available on TokenizersBackend in transformers v5.0+"
+    ),
+)
+
 MODEL_PATH = "openbmb/MiniCPM-Llama3-V-2_5"

 PROMPT_TEMPLATE = (

--- a/tests/model_executor/test_weight_utils.py
+++ b/tests/model_executor/test_weight_utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import os
 import tempfile

 import huggingface_hub.constants
@@ -10,26 +9,10 @@ from huggingface_hub.utils import LocalEntryNotFoundError

 from vllm.model_executor.model_loader.weight_utils import (
    download_weights_from_hf,
-    enable_hf_transfer,
    maybe_remap_kv_scale_name,
 )


-def test_hf_transfer_auto_activation():
-    if "HF_HUB_ENABLE_HF_TRANSFER" in os.environ:
-        # in case it is already set, we can't test the auto activation
-        pytest.skip("HF_HUB_ENABLE_HF_TRANSFER is set, can't test auto activation")
-    enable_hf_transfer()
-    try:
-        # enable hf hub transfer if available
-        import hf_transfer  # type: ignore # noqa
-
-        HF_TRANSFER_ACTIVE = True
-    except ImportError:
-        HF_TRANSFER_ACTIVE = False
-    assert huggingface_hub.constants.HF_HUB_ENABLE_HF_TRANSFER == HF_TRANSFER_ACTIVE
-
-
 def test_download_weights_from_hf():
    with tempfile.TemporaryDirectory() as tmpdir:
        # assert LocalEntryNotFoundError error is thrown
@@ -178,5 +161,4 @@ class TestMaybeRemapKvScaleName:


 if __name__ == "__main__":
-    test_hf_transfer_auto_activation()
    test_download_weights_from_hf()
--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -143,6 +143,11 @@ def test_models(
        # in parts of the operators
        pytest.skip(f"Skipping '{model}' model test with AITER kernel.")

+    if current_platform.is_cpu() and model == "TitanML/tiny-mixtral":
+        # This untrained model is sensitive to rounding error, so
+        # fuse ops to reduce bfloat16 rounding.
+        monkeypatch.setenv("VLLM_CPU_CI_ENV", "0")
+
    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs

--- a/tests/models/language/pooling/test_colbert.py
+++ b/tests/models/language/pooling/test_colbert.py
@@ -109,6 +109,14 @@ def _load_hf_model(model_name: str, hf_spec: dict, device: torch.device):
        **extra,
    ).to(device)
    model.eval()
+
+    # Transformers 5.0 weight materialization can clear non-persistent
+    # buffers (e.g. rotary inv_freq) that were registered with
+    # persistent=False.  Re-compute them so the model produces valid output.
+    for mod in model.modules():
+        if hasattr(mod, "_compute_inv_freq") and hasattr(mod, "inv_freq"):
+            mod.inv_freq = mod._compute_inv_freq(device=device)
+
    return model



--- a/tests/models/language/pooling/test_nomic_max_model_len.py
+++ b/tests/models/language/pooling/test_nomic_max_model_len.py
@@ -8,7 +8,13 @@ import pytest
 from ...utils import EmbedModelInfo

 MODELS = [
-    EmbedModelInfo("nomic-ai/nomic-embed-text-v1"),
+    EmbedModelInfo(
+        "nomic-ai/nomic-embed-text-v1",
+        # FIXME:
+        #  Update the nomic-embed code to support the latest
+        #  HF version, then remove this pinned revision.
+        revision="720244025c1a7e15661a174c63cce63c8218e52b",
+    ),
    # EmbedModelInfo("nomic-ai/nomic-embed-text-v1.5"),
    # EmbedModelInfo("nomic-ai/CodeRankEmbed"),
    EmbedModelInfo("nomic-ai/nomic-embed-text-v2-moe"),
@@ -24,7 +30,10 @@ max_model_len = int(original_max_position_embeddings * factor)
 @pytest.mark.parametrize("model_info", MODELS)
 def test_default(model_info, vllm_runner):
    with vllm_runner(
-        model_info.name, runner="pooling", max_model_len=None
+        model_info.name,
+        revision=model_info.revision,
+        runner="pooling",
+        max_model_len=None,
    ) as vllm_model:
        model_config = vllm_model.llm.llm_engine.model_config
        if model_info.name == "nomic-ai/nomic-embed-text-v2-moe":
@@ -39,7 +48,10 @@ def test_default(model_info, vllm_runner):
 def test_set_max_model_len_legal(model_info, vllm_runner):
    # set max_model_len <= 512
    with vllm_runner(
-        model_info.name, runner="pooling", max_model_len=256
+        model_info.name,
+        revision=model_info.revision,
+        runner="pooling",
+        max_model_len=256,
    ) as vllm_model:
        model_config = vllm_model.llm.llm_engine.model_config
        assert model_config.max_model_len == 256
@@ -49,11 +61,19 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
        # For nomic-embed-text-v2-moe the length is set to 512
        # by sentence_bert_config.json.
        with pytest.raises(ValueError):
-            with vllm_runner(model_info.name, runner="pooling", max_model_len=1024):
+            with vllm_runner(
+                model_info.name,
+                revision=model_info.revision,
+                runner="pooling",
+                max_model_len=1024,
+            ):
                pass
    else:
        with vllm_runner(
-            model_info.name, runner="pooling", max_model_len=1024
+            model_info.name,
+            revision=model_info.revision,
+            runner="pooling",
+            max_model_len=1024,
        ) as vllm_model:
            model_config = vllm_model.llm.llm_engine.model_config
            assert model_config.max_model_len == 1024
@@ -63,7 +83,12 @@ def test_set_max_model_len_legal(model_info, vllm_runner):
 def test_set_max_model_len_illegal(model_info, vllm_runner):
    # set max_model_len > 2048
    with pytest.raises(ValueError):
-        with vllm_runner(model_info.name, runner="pooling", max_model_len=4096):
+        with vllm_runner(
+            model_info.name,
+            revision=model_info.revision,
+            runner="pooling",
+            max_model_len=4096,
+        ):
            pass

    # set max_model_len > 2048 by hf_overrides
@@ -71,6 +96,7 @@ def test_set_max_model_len_illegal(model_info, vllm_runner):
    with pytest.raises(ValueError):
        with vllm_runner(
            model_info.name,
+            revision=model_info.revision,
            runner="pooling",
            max_model_len=None,
            hf_overrides=hf_overrides,
@@ -91,7 +117,11 @@ def test_use_rope_scaling_legal(model_info, vllm_runner):
    }

    with vllm_runner(
-        model_info.name, runner="pooling", max_model_len=None, hf_overrides=hf_overrides
+        model_info.name,
+        revision=model_info.revision,
+        runner="pooling",
+        max_model_len=None,
+        hf_overrides=hf_overrides,
    ):
        pass

@@ -110,6 +140,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
    with pytest.raises(ValueError):
        with vllm_runner(
            model_info.name,
+            revision=model_info.revision,
            runner="pooling",
            max_model_len=max_model_len + 1,
            hf_overrides=hf_overrides,
@@ -129,6 +160,7 @@ def test_use_rope_scaling_illegal(model_info, vllm_runner):
    with pytest.raises(ValueError):
        with vllm_runner(
            model_info.name,
+            revision=model_info.revision,
            runner="pooling",
            max_model_len=None,
            hf_overrides=hf_overrides,

--- a/tests/models/language/pooling_mteb_test/mteb_embed_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py
@@ -151,6 +151,7 @@ def mteb_test_embed_models(

    with vllm_runner(
        model_info.name,
+        revision=model_info.revision,
        runner="pooling",
        max_model_len=model_info.max_model_len,
        **vllm_extra_kwargs,
@@ -201,6 +202,7 @@ def mteb_test_embed_models(
    if model_info.mteb_score is None:
        with hf_runner(
            model_info.name,
+            revision=model_info.revision,
            is_sentence_transformer=True,
            dtype=ci_envs.VLLM_CI_HF_DTYPE or model_info.hf_dtype,
        ) as hf_model:

--- a/tests/models/language/pooling_mteb_test/mteb_score_utils.py
+++ b/tests/models/language/pooling_mteb_test/mteb_score_utils.py
@@ -241,6 +241,7 @@ def mteb_test_rerank_models(

    with vllm_runner(
        model_info.name,
+        revision=model_info.revision,
        runner="pooling",
        max_model_len=None,
        max_num_seqs=8,
@@ -286,7 +287,9 @@ def mteb_test_rerank_models(
    # Accelerate mteb test by setting
    # SentenceTransformers mteb score to a constant
    if model_info.mteb_score is None:
-        with hf_runner(model_info.name, dtype=model_info.hf_dtype) as hf_model:
+        with hf_runner(
+            model_info.name, revision=model_info.revision, dtype=model_info.hf_dtype
+        ) as hf_model:
            hf_model.chat_template = chat_template
            st_main_score = run_mteb_rerank(
                hf_model,

--- a/tests/models/language/pooling_mteb_test/test_baai.py
+++ b/tests/models/language/pooling_mteb_test/test_baai.py
@@ -69,7 +69,10 @@ MODELS = [
        attn_type="decoder",
        is_prefix_caching_supported=True,
        is_chunked_prefill_supported=True,
-        enable_test=True,
+        # Skip: model's custom tokenizer on HF hub is incompatible with
+        # transformers v5 (sets attrs before super().__init__, triggering
+        # AttributeError on 'verbose' in __getattr__).
+        enable_test=False,
    ),
 ]


--- a/tests/models/language/pooling_mteb_test/test_gte.py
+++ b/tests/models/language/pooling_mteb_test/test_gte.py
@@ -72,7 +72,8 @@ MODELS = [
        attn_type="encoder_only",
        is_prefix_caching_supported=False,
        is_chunked_prefill_supported=False,
-        enable_test=True,
+        # Skip: numerical regression with transformers v5.
+        enable_test=False,
    ),
    ########## ModernBertModel
    EmbedModelInfo(

--- a/tests/models/language/pooling_mteb_test/test_jina.py
+++ b/tests/models/language/pooling_mteb_test/test_jina.py
@@ -75,6 +75,10 @@ def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None:
    mteb_test_rerank_models(vllm_runner, model_info)


+@pytest.mark.skip(
+    reason="jinaai/jina-embeddings-v3 custom XLMRobertaLoRA model on HF hub "
+    "is incompatible with transformers v5 (missing all_tied_weights_keys)"
+)
 @pytest.mark.parametrize("model_info", EMBEDDING_MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("dimensions", [16, 32])

--- a/tests/models/language/pooling_mteb_test/test_nomic.py
+++ b/tests/models/language/pooling_mteb_test/test_nomic.py
@@ -12,6 +12,10 @@ MODELS = [
    EmbedModelInfo(
        "nomic-ai/nomic-embed-text-v1",
        architecture="NomicBertModel",
+        # FIXME:
+        #  Update the nomic-embed code to support the latest
+        #  HF version, then remove this pinned revision.
+        revision="720244025c1a7e15661a174c63cce63c8218e52b",
        mteb_score=0.737568559,
        enable_test=True,
        seq_pooling_type="MEAN",

--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -186,7 +186,14 @@ VLM_TEST_SETTINGS = {
        max_num_seqs=2,
        auto_cls=AutoModel,
        hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
-        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
+        marks=[
+            pytest.mark.core_model,
+            pytest.mark.cpu_model,
+            # TODO: Remove skip once model has been upstreamed to Transformers
+            pytest.mark.skip(
+                reason="Custom model code is not compatible with Transformers v5"
+            ),
+        ],
    ),
    #### Transformers fallback to test
    ## To reduce test burden, we only test batching arbitrary image size
@@ -397,14 +404,14 @@ VLM_TEST_SETTINGS = {
    "gemma4": VLMTestInfo(
        models=["google/gemma-4-E2B-it"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
-        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
+        prompt_formatter=lambda img_prompt: f"<bos><|turn>user\n{img_prompt}<turn|>\n<|turn>model\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
-                "stop_sign": "What's the content in the center of the image?",
-                "cherry_blossom": "What is the season?",
+                "stop_sign": "<|image|>What's the content in the center of the image?",  # noqa: E501
+                "cherry_blossom": "<|image|>What is the season?",
            }
        ),
-        multi_image_prompt="Describe the two images in detail.",
+        multi_image_prompt="<|image|><|image|>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
@@ -533,6 +540,12 @@ VLM_TEST_SETTINGS = {
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
+        # TODO: Remove skip once model has been upstreamed to Transformers
+        marks=[
+            pytest.mark.skip(
+                reason="Custom model code tries to access data from meta-tensor"
+            )
+        ],
    ),
    "intern_vl-video": VLMTestInfo(
        models=[
@@ -545,6 +558,12 @@ VLM_TEST_SETTINGS = {
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
        num_logprobs=10 if current_platform.is_rocm() else 5,
+        # TODO: Remove skip once model has been upstreamed to Transformers
+        marks=[
+            pytest.mark.skip(
+                reason="Custom model code tries to access data from meta-tensor"
+            )
+        ],
    ),
    "intern_vl-hf": VLMTestInfo(
        models=["OpenGVLab/InternVL3-1B-hf"],
@@ -591,6 +610,8 @@ VLM_TEST_SETTINGS = {
        hf_model_kwargs={"device_map": "auto"},
        patch_hf_runner=model_utils.isaac_patch_hf_runner,
        image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
+        # TODO: Remove skip once model has been upstreamed to Transformers
+        marks=[pytest.mark.skip(reason="Custom model imports deleted object")],  # noqa: E501
    ),
    "kimi_vl": VLMTestInfo(
        models=["moonshotai/Kimi-VL-A3B-Instruct"],
@@ -806,7 +827,12 @@ VLM_TEST_SETTINGS = {
            pytest.mark.skipif(
                Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
                reason="This model is broken in Transformers v4.57.3",
-            )
+            ),
+            pytest.mark.skipif(
+                Version(TRANSFORMERS_VERSION) >= Version("5.0.0"),
+                reason="Model's custom code uses ROPE_INIT_FUNCTIONS"
+                "['default'] which was removed in transformers v5",
+            ),
        ],
    ),
    "phi3v": VLMTestInfo(
@@ -960,6 +986,12 @@ VLM_TEST_SETTINGS = {
            )
            for inp in custom_inputs.different_patch_input_cases_internvl()
        ],
+        # TODO: Remove skip once model has been upstreamed to Transformers
+        marks=[
+            pytest.mark.skip(
+                reason="Custom model code tries to access data from meta-tensor"
+            )
+        ],
    ),
    "llava_onevision-multiple-images": VLMTestInfo(
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],

--- a/tests/models/multimodal/generation/test_nemotron_parse.py
+++ b/tests/models/multimodal/generation/test_nemotron_parse.py
@@ -103,6 +103,10 @@ def run_test(
        )


+@pytest.mark.skip(
+    reason="Model's custom MBart decoder has head count mismatch with "
+    "transformers v5's GQA-aware cross-attention (8 vs 16 heads)"
+)
 @pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"])
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("num_logprobs", [5])